Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig | 27
-rw-r--r--  net/ipv4/Makefile | 2
-rw-r--r--  net/ipv4/af_inet.c | 148
-rw-r--r--  net/ipv4/ah4.c | 14
-rw-r--r--  net/ipv4/arp.c | 16
-rw-r--r--  net/ipv4/cipso_ipv4.c | 4
-rw-r--r--  net/ipv4/devinet.c | 37
-rw-r--r--  net/ipv4/esp4.c | 59
-rw-r--r--  net/ipv4/fib_frontend.c | 21
-rw-r--r--  net/ipv4/fib_hash.c | 2
-rw-r--r--  net/ipv4/fib_rules.c | 11
-rw-r--r--  net/ipv4/fib_semantics.c | 2
-rw-r--r--  net/ipv4/fib_trie.c | 51
-rw-r--r--  net/ipv4/icmp.c | 31
-rw-r--r--  net/ipv4/igmp.c | 43
-rw-r--r--  net/ipv4/inet_diag.c | 90
-rw-r--r--  net/ipv4/inetpeer.c | 38
-rw-r--r--  net/ipv4/ip_forward.c | 14
-rw-r--r--  net/ipv4/ip_fragment.c | 47
-rw-r--r--  net/ipv4/ip_gre.c | 63
-rw-r--r--  net/ipv4/ip_input.c | 24
-rw-r--r--  net/ipv4/ip_options.c | 26
-rw-r--r--  net/ipv4/ip_output.c | 123
-rw-r--r--  net/ipv4/ip_sockglue.c | 1169
-rw-r--r--  net/ipv4/ipcomp.c | 58
-rw-r--r--  net/ipv4/ipconfig.c | 19
-rw-r--r--  net/ipv4/ipip.c | 60
-rw-r--r--  net/ipv4/ipmr.c | 418
-rw-r--r--  net/ipv4/ipvs/ip_vs_app.c | 14
-rw-r--r--  net/ipv4/ipvs/ip_vs_core.c | 56
-rw-r--r--  net/ipv4/ipvs/ip_vs_dh.c | 2
-rw-r--r--  net/ipv4/ipvs/ip_vs_ftp.c | 8
-rw-r--r--  net/ipv4/ipvs/ip_vs_lblc.c | 2
-rw-r--r--  net/ipv4/ipvs/ip_vs_lblcr.c | 2
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_ah.c | 16
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_tcp.c | 24
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_udp.c | 26
-rw-r--r--  net/ipv4/ipvs/ip_vs_sh.c | 2
-rw-r--r--  net/ipv4/ipvs/ip_vs_xmit.c | 44
-rw-r--r--  net/ipv4/multipath_drr.c | 2
-rw-r--r--  net/ipv4/netfilter.c | 8
-rw-r--r--  net/ipv4/netfilter/Kconfig | 267
-rw-r--r--  net/ipv4/netfilter/Makefile | 45
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 4
-rw-r--r--  net/ipv4/netfilter/arpt_mangle.c | 12
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_amanda.c | 229
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_core.c | 1550
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_ftp.c | 520
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_helper_h323.c | 1841
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_helper_pptp.c | 684
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_irc.c | 314
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_netbios_ns.c | 143
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_netlink.c | 1577
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_generic.c | 74
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_gre.c | 328
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 315
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_sctp.c | 659
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 1164
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_udp.c | 148
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_sip.c | 520
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_standalone.c | 962
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_tftp.c | 161
-rw-r--r--  net/ipv4/netfilter/ip_nat_amanda.c | 85
-rw-r--r--  net/ipv4/netfilter/ip_nat_core.c | 634
-rw-r--r--  net/ipv4/netfilter/ip_nat_ftp.c | 180
-rw-r--r--  net/ipv4/netfilter/ip_nat_helper.c | 436
-rw-r--r--  net/ipv4/netfilter/ip_nat_helper_h323.c | 611
-rw-r--r--  net/ipv4/netfilter/ip_nat_helper_pptp.c | 350
-rw-r--r--  net/ipv4/netfilter/ip_nat_irc.c | 122
-rw-r--r--  net/ipv4/netfilter/ip_nat_proto_gre.c | 174
-rw-r--r--  net/ipv4/netfilter/ip_nat_proto_icmp.c | 87
-rw-r--r--  net/ipv4/netfilter/ip_nat_proto_tcp.c | 154
-rw-r--r--  net/ipv4/netfilter/ip_nat_proto_udp.c | 144
-rw-r--r--  net/ipv4/netfilter/ip_nat_proto_unknown.c | 55
-rw-r--r--  net/ipv4/netfilter/ip_nat_rule.c | 314
-rw-r--r--  net/ipv4/netfilter/ip_nat_sip.c | 282
-rw-r--r--  net/ipv4/netfilter/ip_nat_snmp_basic.c | 1333
-rw-r--r--  net/ipv4/netfilter/ip_nat_standalone.c | 388
-rw-r--r--  net/ipv4/netfilter/ip_nat_tftp.c | 70
-rw-r--r--  net/ipv4/netfilter/ip_queue.c | 28
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 12
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 24
-rw-r--r--  net/ipv4/netfilter/ipt_ECN.c | 15
-rw-r--r--  net/ipv4/netfilter/ipt_LOG.c | 16
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c | 57
-rw-r--r--  net/ipv4/netfilter/ipt_NETMAP.c | 26
-rw-r--r--  net/ipv4/netfilter/ipt_REDIRECT.c | 24
-rw-r--r--  net/ipv4/netfilter/ipt_REJECT.c | 45
-rw-r--r--  net/ipv4/netfilter/ipt_SAME.c | 40
-rw-r--r--  net/ipv4/netfilter/ipt_TOS.c | 4
-rw-r--r--  net/ipv4/netfilter/ipt_TTL.c | 2
-rw-r--r--  net/ipv4/netfilter/ipt_ULOG.c | 77
-rw-r--r--  net/ipv4/netfilter/ipt_addrtype.c | 2
-rw-r--r--  net/ipv4/netfilter/ipt_ecn.c | 10
-rw-r--r--  net/ipv4/netfilter/ipt_iprange.c | 2
-rw-r--r--  net/ipv4/netfilter/ipt_recent.c | 6
-rw-r--r--  net/ipv4/netfilter/ipt_tos.c | 2
-rw-r--r--  net/ipv4/netfilter/ipt_ttl.c | 11
-rw-r--r--  net/ipv4/netfilter/iptable_filter.c | 3
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c | 30
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 27
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 11
-rw-r--r--  net/ipv4/netfilter/nf_nat_core.c | 14
-rw-r--r--  net/ipv4/netfilter/nf_nat_h323.c | 14
-rw-r--r--  net/ipv4/netfilter/nf_nat_helper.c | 76
-rw-r--r--  net/ipv4/netfilter/nf_nat_pptp.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_nat_rule.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_nat_sip.c | 11
-rw-r--r--  net/ipv4/netfilter/nf_nat_snmp_basic.c | 8
-rw-r--r--  net/ipv4/netfilter/nf_nat_standalone.c | 18
-rw-r--r--  net/ipv4/proc.c | 41
-rw-r--r--  net/ipv4/protocol.c | 2
-rw-r--r--  net/ipv4/raw.c | 18
-rw-r--r--  net/ipv4/route.c | 29
-rw-r--r--  net/ipv4/syncookies.c | 40
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 16
-rw-r--r--  net/ipv4/tcp.c | 131
-rw-r--r--  net/ipv4/tcp_bic.c | 2
-rw-r--r--  net/ipv4/tcp_cong.c | 45
-rw-r--r--  net/ipv4/tcp_cubic.c | 81
-rw-r--r--  net/ipv4/tcp_htcp.c | 2
-rw-r--r--  net/ipv4/tcp_hybla.c | 2
-rw-r--r--  net/ipv4/tcp_illinois.c | 356
-rw-r--r--  net/ipv4/tcp_input.c | 642
-rw-r--r--  net/ipv4/tcp_ipv4.c | 143
-rw-r--r--  net/ipv4/tcp_lp.c | 8
-rw-r--r--  net/ipv4/tcp_minisocks.c | 29
-rw-r--r--  net/ipv4/tcp_output.c | 198
-rw-r--r--  net/ipv4/tcp_probe.c | 68
-rw-r--r--  net/ipv4/tcp_timer.c | 10
-rw-r--r--  net/ipv4/tcp_vegas.c | 57
-rw-r--r--  net/ipv4/tcp_vegas.h | 24
-rw-r--r--  net/ipv4/tcp_veno.c | 10
-rw-r--r--  net/ipv4/tcp_westwood.c | 21
-rw-r--r--  net/ipv4/tcp_yeah.c | 268
-rw-r--r--  net/ipv4/tcp_yeah.h | 7
-rw-r--r--  net/ipv4/udp.c | 238
-rw-r--r--  net/ipv4/udplite.c | 2
-rw-r--r--  net/ipv4/xfrm4_input.c | 23
-rw-r--r--  net/ipv4/xfrm4_mode_beet.c | 37
-rw-r--r--  net/ipv4/xfrm4_mode_transport.c | 28
-rw-r--r--  net/ipv4/xfrm4_mode_tunnel.c | 31
-rw-r--r--  net/ipv4/xfrm4_output.c | 3
-rw-r--r--  net/ipv4/xfrm4_policy.c | 8
-rw-r--r--  net/ipv4/xfrm4_tunnel.c | 3
145 files changed, 3565 insertions(+), 19436 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 9e8ef509c51d..e62aee0ec4c5 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -574,6 +574,33 @@ config TCP_CONG_VENO
 	  loss packets.
 	  See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
 
+config TCP_CONG_YEAH
+	tristate "YeAH TCP"
+	depends on EXPERIMENTAL
+	default n
+	---help---
+	YeAH-TCP is a sender-side high-speed enabled TCP congestion control
+	algorithm, which uses a mixed loss/delay approach to compute the
+	congestion window. Its design goals target high efficiency,
+	internal, RTT and Reno fairness, resilience to link loss while
+	keeping network elements load as low as possible.
+
+	For further details look here:
+	  http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
+
+config TCP_CONG_ILLINOIS
+	tristate "TCP Illinois"
+	depends on EXPERIMENTAL
+	default n
+	---help---
+	TCP-Illinois is a sender-side modification of TCP Reno for
+	high speed long delay links. It uses round-trip-time to
+	adjust the alpha and beta parameters to achieve a higher average
+	throughput and maintain fairness.
+
+	For further details see:
+	  http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+
 choice
 	prompt "Default TCP congestion control"
 	default DEFAULT_CUBIC
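
Aside: with either option enabled, the new modules can be exercised right away. System-wide selection goes through the net.ipv4.tcp_congestion_control sysctl; per socket, the long-standing TCP_CONGESTION socket option applies. A minimal userspace sketch, assuming the modules are loaded and registered under the names "yeah" and "illinois" (the names used by tcp_yeah.c and tcp_illinois.c):

    #include <stdio.h>
    #include <string.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>

    /* Select a congestion control algorithm for one TCP socket. */
    static int set_cc(int fd, const char *name)
    {
        if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
                       name, strlen(name)) < 0) {
            perror("setsockopt(TCP_CONGESTION)");
            return -1;
        }
        return 0;
    }

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        return fd >= 0 ? set_cc(fd, "illinois") : 1;
    }
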
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 7a068626feea..4ff6c151d7f3 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -49,6 +49,8 @@ obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
 obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
 obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
 obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
+obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
+obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index cf358c84c440..16aae8ef5555 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -87,6 +87,7 @@
 #include <linux/init.h>
 #include <linux/poll.h>
 #include <linux/netfilter_ipv4.h>
+#include <linux/random.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -217,6 +218,26 @@ out:
 	return err;
 }
 
+u32 inet_ehash_secret __read_mostly;
+EXPORT_SYMBOL(inet_ehash_secret);
+
+/*
+ * inet_ehash_secret must be set exactly once
+ * Instead of using a dedicated spinlock, we (ab)use inetsw_lock
+ */
+void build_ehash_secret(void)
+{
+	u32 rnd;
+	do {
+		get_random_bytes(&rnd, sizeof(rnd));
+	} while (rnd == 0);
+	spin_lock_bh(&inetsw_lock);
+	if (!inet_ehash_secret)
+		inet_ehash_secret = rnd;
+	spin_unlock_bh(&inetsw_lock);
+}
+EXPORT_SYMBOL(build_ehash_secret);
+
 /*
  *	Create an inet socket.
  */
@@ -233,6 +254,11 @@ static int inet_create(struct socket *sock, int protocol)
 	int try_loading_module = 0;
 	int err;
 
+	if (sock->type != SOCK_RAW &&
+	    sock->type != SOCK_DGRAM &&
+	    !inet_ehash_secret)
+		build_ehash_secret();
+
 	sock->state = SS_UNCONNECTED;
 
 	/* Look for the requested type/protocol pair. */
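
Aside: the hunk above is a lazy, set-exactly-once initialization - the secret stays 0 until the first connection-oriented socket is created, and the recheck under inetsw_lock makes a racing initializer a harmless no-op. A self-contained sketch of the same idiom, with a pthread mutex standing in for inetsw_lock and random() for get_random_bytes():

    #include <pthread.h>
    #include <stdint.h>
    #include <stdlib.h>

    static uint32_t ehash_secret;	/* 0 means "not yet initialized" */
    static pthread_mutex_t secret_lock = PTHREAD_MUTEX_INITIALIZER;

    static void build_secret(void)
    {
        uint32_t rnd;

        do {
            rnd = (uint32_t)random();	/* 0 is the sentinel, never store it */
        } while (rnd == 0);

        pthread_mutex_lock(&secret_lock);
        if (!ehash_secret)		/* a racing caller may have won */
            ehash_secret = rnd;
        pthread_mutex_unlock(&secret_lock);
    }
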
@@ -755,6 +781,9 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	case SIOCGSTAMP:
 		err = sock_get_timestamp(sk, (struct timeval __user *)arg);
 		break;
+	case SIOCGSTAMPNS:
+		err = sock_get_timestampns(sk, (struct timespec __user *)arg);
+		break;
 	case SIOCADDRT:
 	case SIOCDELRT:
 	case SIOCRTMSG:
@@ -1109,7 +1138,7 @@ static int inet_gso_send_check(struct sk_buff *skb)
 	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
 		goto out;
 
-	iph = skb->nh.iph;
+	iph = ip_hdr(skb);
 	ihl = iph->ihl * 4;
 	if (ihl < sizeof(*iph))
 		goto out;
@@ -1117,8 +1146,9 @@ static int inet_gso_send_check(struct sk_buff *skb)
 	if (unlikely(!pskb_may_pull(skb, ihl)))
 		goto out;
 
-	skb->h.raw = __skb_pull(skb, ihl);
-	iph = skb->nh.iph;
+	__skb_pull(skb, ihl);
+	skb_reset_transport_header(skb);
+	iph = ip_hdr(skb);
 	proto = iph->protocol & (MAX_INET_PROTOS - 1);
 	err = -EPROTONOSUPPORT;
 
@@ -1152,7 +1182,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
 	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
 		goto out;
 
-	iph = skb->nh.iph;
+	iph = ip_hdr(skb);
 	ihl = iph->ihl * 4;
 	if (ihl < sizeof(*iph))
 		goto out;
@@ -1160,8 +1190,9 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
 	if (unlikely(!pskb_may_pull(skb, ihl)))
 		goto out;
 
-	skb->h.raw = __skb_pull(skb, ihl);
-	iph = skb->nh.iph;
+	__skb_pull(skb, ihl);
+	skb_reset_transport_header(skb);
+	iph = ip_hdr(skb);
 	id = ntohs(iph->id);
 	proto = iph->protocol & (MAX_INET_PROTOS - 1);
 	segs = ERR_PTR(-EPROTONOSUPPORT);
@@ -1177,17 +1208,57 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
 
 	skb = segs;
 	do {
-		iph = skb->nh.iph;
+		iph = ip_hdr(skb);
 		iph->id = htons(id++);
 		iph->tot_len = htons(skb->len - skb->mac_len);
 		iph->check = 0;
-		iph->check = ip_fast_csum(skb->nh.raw, iph->ihl);
+		iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
 	} while ((skb = skb->next));
 
 out:
 	return segs;
 }
 
+unsigned long snmp_fold_field(void *mib[], int offt)
+{
+	unsigned long res = 0;
+	int i;
+
+	for_each_possible_cpu(i) {
+		res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt);
+		res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt);
+	}
+	return res;
+}
+EXPORT_SYMBOL_GPL(snmp_fold_field);
+
+int snmp_mib_init(void *ptr[2], size_t mibsize, size_t mibalign)
+{
+	BUG_ON(ptr == NULL);
+	ptr[0] = __alloc_percpu(mibsize);
+	if (!ptr[0])
+		goto err0;
+	ptr[1] = __alloc_percpu(mibsize);
+	if (!ptr[1])
+		goto err1;
+	return 0;
+err1:
+	free_percpu(ptr[0]);
+	ptr[0] = NULL;
+err0:
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(snmp_mib_init);
+
+void snmp_mib_free(void *ptr[2])
+{
+	BUG_ON(ptr == NULL);
+	free_percpu(ptr[0]);
+	free_percpu(ptr[1]);
+	ptr[0] = ptr[1] = NULL;
+}
+EXPORT_SYMBOL_GPL(snmp_mib_free);
+
 #ifdef CONFIG_IP_MULTICAST
 static struct net_protocol igmp_protocol = {
 	.handler =	igmp_rcv,
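
Aside: each SNMP MIB here is allocated as two per-CPU copies (by convention one updated in BH/softirq context, one in process context), and snmp_fold_field() produces the total for one counter by summing that field across every possible CPU in both copies. A self-contained sketch of the same fold, with plain arrays standing in for the per-CPU allocations:

    #include <stdio.h>

    #define NCPUS   4
    #define NFIELDS 8

    /* [copy][cpu][field]: copy 0 ~ BH context, copy 1 ~ process context */
    static unsigned long mib[2][NCPUS][NFIELDS];

    static unsigned long fold_field(int field)
    {
        unsigned long res = 0;

        for (int cpu = 0; cpu < NCPUS; cpu++) {
            res += mib[0][cpu][field];
            res += mib[1][cpu][field];
        }
        return res;
    }

    int main(void)
    {
        mib[0][1][3] = 5;
        mib[1][2][3] = 7;
        printf("field 3 total: %lu\n", fold_field(3));	/* prints 12 */
        return 0;
    }
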
@@ -1214,28 +1285,47 @@ static struct net_protocol icmp_protocol = {
 
 static int __init init_ipv4_mibs(void)
 {
-	net_statistics[0] = alloc_percpu(struct linux_mib);
-	net_statistics[1] = alloc_percpu(struct linux_mib);
-	ip_statistics[0] = alloc_percpu(struct ipstats_mib);
-	ip_statistics[1] = alloc_percpu(struct ipstats_mib);
-	icmp_statistics[0] = alloc_percpu(struct icmp_mib);
-	icmp_statistics[1] = alloc_percpu(struct icmp_mib);
-	tcp_statistics[0] = alloc_percpu(struct tcp_mib);
-	tcp_statistics[1] = alloc_percpu(struct tcp_mib);
-	udp_statistics[0] = alloc_percpu(struct udp_mib);
-	udp_statistics[1] = alloc_percpu(struct udp_mib);
-	udplite_statistics[0] = alloc_percpu(struct udp_mib);
-	udplite_statistics[1] = alloc_percpu(struct udp_mib);
-	if (!
-	    (net_statistics[0] && net_statistics[1] && ip_statistics[0]
-	     && ip_statistics[1] && tcp_statistics[0] && tcp_statistics[1]
-	     && udp_statistics[0] && udp_statistics[1]
-	     && udplite_statistics[0] && udplite_statistics[1] ) )
-		return -ENOMEM;
-
-	(void) tcp_mib_init();
+	if (snmp_mib_init((void **)net_statistics,
+			  sizeof(struct linux_mib),
+			  __alignof__(struct linux_mib)) < 0)
+		goto err_net_mib;
+	if (snmp_mib_init((void **)ip_statistics,
+			  sizeof(struct ipstats_mib),
+			  __alignof__(struct ipstats_mib)) < 0)
+		goto err_ip_mib;
+	if (snmp_mib_init((void **)icmp_statistics,
+			  sizeof(struct icmp_mib),
+			  __alignof__(struct icmp_mib)) < 0)
+		goto err_icmp_mib;
+	if (snmp_mib_init((void **)tcp_statistics,
+			  sizeof(struct tcp_mib),
+			  __alignof__(struct tcp_mib)) < 0)
+		goto err_tcp_mib;
+	if (snmp_mib_init((void **)udp_statistics,
+			  sizeof(struct udp_mib),
+			  __alignof__(struct udp_mib)) < 0)
+		goto err_udp_mib;
+	if (snmp_mib_init((void **)udplite_statistics,
+			  sizeof(struct udp_mib),
+			  __alignof__(struct udp_mib)) < 0)
+		goto err_udplite_mib;
+
+	tcp_mib_init();
 
 	return 0;
+
+err_udplite_mib:
+	snmp_mib_free((void **)udp_statistics);
+err_udp_mib:
+	snmp_mib_free((void **)tcp_statistics);
+err_tcp_mib:
+	snmp_mib_free((void **)icmp_statistics);
+err_icmp_mib:
+	snmp_mib_free((void **)ip_statistics);
+err_ip_mib:
+	snmp_mib_free((void **)net_statistics);
+err_net_mib:
+	return -ENOMEM;
 }
 
 static int ipv4_proc_init(void);
@@ -1336,7 +1426,7 @@ static int __init inet_init(void)
 	 *	Initialise per-cpu ipv4 mibs
 	 */
 
-	if(init_ipv4_mibs())
+	if (init_ipv4_mibs())
 		printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); ;
 
 	ipv4_proc_init();
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 7194eb40b6d0..6da8ff597ad3 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -65,7 +65,7 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 		char		buf[60];
 	} tmp_iph;
 
-	top_iph = skb->nh.iph;
+	top_iph = ip_hdr(skb);
 	iph = &tmp_iph.iph;
 
 	iph->tos = top_iph->tos;
@@ -152,9 +152,9 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	skb->ip_summed = CHECKSUM_NONE;
 
 	ah = (struct ip_auth_hdr*)skb->data;
-	iph = skb->nh.iph;
+	iph = ip_hdr(skb);
 
-	ihl = skb->data - skb->nh.raw;
+	ihl = skb->data - skb_network_header(skb);
 	memcpy(work_buf, iph, ihl);
 
 	iph->ttl = 0;
@@ -181,7 +181,9 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 		}
 	}
 	((struct iphdr*)work_buf)->protocol = ah->nexthdr;
-	skb->h.raw = memcpy(skb->nh.raw += ah_hlen, work_buf, ihl);
+	skb->network_header += ah_hlen;
+	memcpy(skb_network_header(skb), work_buf, ihl);
+	skb->transport_header = skb->network_header;
 	__skb_pull(skb, ah_hlen + ihl);
 
 	return 0;
@@ -196,8 +198,8 @@ static void ah4_err(struct sk_buff *skb, u32 info)
 	struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+(iph->ihl<<2));
 	struct xfrm_state *x;
 
-	if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
-	    skb->h.icmph->code != ICMP_FRAG_NEEDED)
+	if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
+	    icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
 		return;
 
 	x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
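
Aside: ip_hdr(), icmp_hdr(), skb_network_header() and friends are the accessor style this whole series converts to - instead of caching raw header pointers in the old skb->nh/skb->h unions, the sk_buff records where each header starts and the helpers recompute the pointer on demand. A simplified sketch of the idea (illustrative types, not the kernel's actual sk_buff layout):

    #include <stddef.h>

    struct sketch_skb {
        unsigned char *head;		/* start of the data buffer */
        unsigned char *data;		/* current payload start    */
        size_t network_header;		/* header offset from head  */
    };

    static inline unsigned char *network_header(const struct sketch_skb *skb)
    {
        return skb->head + skb->network_header;
    }

    /* what skb_reset_network_header() does: mark "headers start here" */
    static inline void reset_network_header(struct sketch_skb *skb)
    {
        skb->network_header = (size_t)(skb->data - skb->head);
    }
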
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 1a3488a83f49..7110779a0244 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -342,13 +342,13 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 	switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
 	default:
 	case 0:	/* By default announce any local IP */
-		if (skb && inet_addr_type(skb->nh.iph->saddr) == RTN_LOCAL)
-			saddr = skb->nh.iph->saddr;
+		if (skb && inet_addr_type(ip_hdr(skb)->saddr) == RTN_LOCAL)
+			saddr = ip_hdr(skb)->saddr;
 		break;
 	case 1:	/* Restrict announcements of saddr in same subnet */
 		if (!skb)
 			break;
-		saddr = skb->nh.iph->saddr;
+		saddr = ip_hdr(skb)->saddr;
 		if (inet_addr_type(saddr) == RTN_LOCAL) {
 			/* saddr should be known to target */
 			if (inet_addr_onlink(in_dev, target, saddr))
@@ -578,7 +578,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
 		return NULL;
 
 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
-	skb->nh.raw = skb->data;
+	skb_reset_network_header(skb);
 	arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4));
 	skb->dev = dev;
 	skb->protocol = htons(ETH_P_ARP);
@@ -721,7 +721,7 @@ static int arp_process(struct sk_buff *skb)
 	if (in_dev == NULL)
 		goto out;
 
-	arp = skb->nh.arph;
+	arp = arp_hdr(skb);
 
 	switch (dev_type) {
 	default:
@@ -937,7 +937,7 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
 					   (2 * sizeof(u32)))))
 		goto freeskb;
 
-	arp = skb->nh.arph;
+	arp = arp_hdr(skb);
 	if (arp->ar_hln != dev->addr_len ||
 	    dev->flags & IFF_NOARP ||
 	    skb->pkt_type == PACKET_OTHERHOST ||
@@ -1178,7 +1178,7 @@ int arp_ioctl(unsigned int cmd, void __user *arg)
 		goto out;
 	}
 
-	switch(cmd) {
+	switch (cmd) {
 	case SIOCDARP:
 		err = arp_req_delete(&r, dev);
 		break;
@@ -1360,7 +1360,7 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
 
 /* ------------------------------------------------------------------------ */
 
-static struct seq_operations arp_seq_ops = {
+static const struct seq_operations arp_seq_ops = {
 	.start	= arp_seq_start,
 	.next	= neigh_seq_next,
 	.stop	= neigh_seq_stop,
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 2ce5b693a8bd..11a3404d65af 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1174,7 +1174,7 @@ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def,
 	u16 cat_low;
 	u16 cat_high;
 
-	for(net_iter = 0; net_iter < net_cat_len; net_iter += 4) {
+	for (net_iter = 0; net_iter < net_cat_len; net_iter += 4) {
 		cat_high = ntohs(*((__be16 *)&net_cat[net_iter]));
 		if ((net_iter + 4) <= net_cat_len)
 			cat_low = ntohs(*((__be16 *)&net_cat[net_iter + 2]));
@@ -1676,7 +1676,7 @@ validate_return:
  */
 void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
 {
-	if (skb->nh.iph->protocol == IPPROTO_ICMP || error != -EACCES)
+	if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES)
 		return;
 
 	if (gateway)
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 98a00d0edc76..088888db8b3d 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -48,7 +48,6 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/skbuff.h>
-#include <linux/rtnetlink.h>
 #include <linux/init.h>
 #include <linux/notifier.h>
 #include <linux/inetdevice.h>
@@ -62,7 +61,7 @@
 #include <net/ip.h>
 #include <net/route.h>
 #include <net/ip_fib.h>
-#include <net/netlink.h>
+#include <net/rtnetlink.h>
 
 struct ipv4_devconf ipv4_devconf = {
 	.accept_redirects = 1,
@@ -633,7 +632,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
 	dev_load(ifr.ifr_name);
 #endif
 
-	switch(cmd) {
+	switch (cmd) {
 	case SIOCGIFADDR:	/* Get interface address */
 	case SIOCGIFBRDADDR:	/* Get the broadcast address */
 	case SIOCGIFDSTADDR:	/* Get the destination address */
@@ -708,7 +707,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
 	if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS)
 		goto done;
 
-	switch(cmd) {
+	switch (cmd) {
 	case SIOCGIFADDR:	/* Get interface address */
 		sin->sin_addr.s_addr = ifa->ifa_local;
 		goto rarok;
@@ -1183,17 +1182,13 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 	int s_ip_idx, s_idx = cb->args[0];
 
 	s_ip_idx = ip_idx = cb->args[1];
-	read_lock(&dev_base_lock);
 	for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
 		if (idx < s_idx)
 			continue;
 		if (idx > s_idx)
 			s_ip_idx = 0;
-		rcu_read_lock();
-		if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
-			rcu_read_unlock();
+		if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
 			continue;
-		}
 
 		for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
 		     ifa = ifa->ifa_next, ip_idx++) {
@@ -1201,16 +1196,12 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 				continue;
 			if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
 					     cb->nlh->nlmsg_seq,
-					     RTM_NEWADDR, NLM_F_MULTI) <= 0) {
-				rcu_read_unlock();
+					     RTM_NEWADDR, NLM_F_MULTI) <= 0)
 				goto done;
-			}
 		}
-		rcu_read_unlock();
 	}
 
 done:
-	read_unlock(&dev_base_lock);
 	cb->args[0] = idx;
 	cb->args[1] = ip_idx;
 
@@ -1241,19 +1232,6 @@ errout:
 	rtnl_set_sk_err(RTNLGRP_IPV4_IFADDR, err);
 }
 
-static struct rtnetlink_link inet_rtnetlink_table[RTM_NR_MSGTYPES] = {
-	[RTM_NEWADDR  - RTM_BASE] = { .doit	= inet_rtm_newaddr, },
-	[RTM_DELADDR  - RTM_BASE] = { .doit	= inet_rtm_deladdr, },
-	[RTM_GETADDR  - RTM_BASE] = { .dumpit	= inet_dump_ifaddr, },
-	[RTM_NEWROUTE - RTM_BASE] = { .doit	= inet_rtm_newroute, },
-	[RTM_DELROUTE - RTM_BASE] = { .doit	= inet_rtm_delroute, },
-	[RTM_GETROUTE - RTM_BASE] = { .doit	= inet_rtm_getroute,
-				      .dumpit	= inet_dump_fib, },
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-	[RTM_GETRULE  - RTM_BASE] = { .dumpit	= fib4_rules_dump, },
-#endif
-};
-
 #ifdef CONFIG_SYSCTL
 
 void inet_forward_change(void)
@@ -1636,7 +1614,10 @@ void __init devinet_init(void)
 {
 	register_gifconf(PF_INET, inet_gifconf);
 	register_netdevice_notifier(&ip_netdev_notifier);
-	rtnetlink_links[PF_INET] = inet_rtnetlink_table;
+
+	rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL);
+	rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL);
+	rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr);
 #ifdef CONFIG_SYSCTL
 	devinet_sysctl.sysctl_header =
 		register_sysctl_table(devinet_sysctl.devinet_root_dir);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 31041127eeb8..47c95e8ef045 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -21,13 +21,14 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	struct blkcipher_desc desc;
 	struct esp_data *esp;
 	struct sk_buff *trailer;
+	u8 *tail;
 	int blksize;
 	int clen;
 	int alen;
 	int nfrags;
 
 	/* Strip IP+ESP header. */
-	__skb_pull(skb, skb->h.raw - skb->data);
+	__skb_pull(skb, skb_transport_offset(skb));
 	/* Now skb is pure payload to encrypt */
 
 	err = -ENOMEM;
@@ -49,19 +50,21 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 		goto error;
 
 	/* Fill padding... */
+	tail = skb_tail_pointer(trailer);
 	do {
 		int i;
 		for (i=0; i<clen-skb->len - 2; i++)
-			*(u8*)(trailer->tail + i) = i+1;
+			tail[i] = i + 1;
 	} while (0);
-	*(u8*)(trailer->tail + clen-skb->len - 2) = (clen - skb->len)-2;
+	tail[clen - skb->len - 2] = (clen - skb->len) - 2;
 	pskb_put(skb, trailer, clen - skb->len);
 
-	__skb_push(skb, skb->data - skb->nh.raw);
-	top_iph = skb->nh.iph;
-	esph = (struct ip_esp_hdr *)(skb->nh.raw + top_iph->ihl*4);
+	__skb_push(skb, skb->data - skb_network_header(skb));
+	top_iph = ip_hdr(skb);
+	esph = (struct ip_esp_hdr *)(skb_network_header(skb) +
+				     top_iph->ihl * 4);
 	top_iph->tot_len = htons(skb->len + alen);
-	*(u8*)(trailer->tail - 1) = top_iph->protocol;
+	*(skb_tail_pointer(trailer) - 1) = top_iph->protocol;
 
 	/* this is non-NULL only with UDP Encapsulation */
 	if (x->encap) {
@@ -217,12 +220,12 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 
 	/* ... check padding bits here. Silly. :-) */
 
-	iph = skb->nh.iph;
+	iph = ip_hdr(skb);
 	ihl = iph->ihl * 4;
 
 	if (x->encap) {
 		struct xfrm_encap_tmpl *encap = x->encap;
-		struct udphdr *uh = (void *)(skb->nh.raw + ihl);
+		struct udphdr *uh = (void *)(skb_network_header(skb) + ihl);
 
 		/*
 		 * 1) if the NAT-T peer's IP or port changed then
@@ -260,7 +263,8 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 
 	iph->protocol = nexthdr[1];
 	pskb_trim(skb, skb->len - alen - padlen - 2);
-	skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - ihl;
+	__skb_pull(skb, sizeof(*esph) + esp->conf.ivlen);
+	skb_set_transport_header(skb, -ihl);
 
 	return 0;
 
@@ -268,32 +272,33 @@ out:
 	return -EINVAL;
 }
 
-static u32 esp4_get_max_size(struct xfrm_state *x, int mtu)
+static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
 {
 	struct esp_data *esp = x->data;
 	u32 blksize = ALIGN(crypto_blkcipher_blocksize(esp->conf.tfm), 4);
-	int enclen = 0;
+	u32 align = max_t(u32, blksize, esp->conf.padlen);
+	u32 rem;
+
+	mtu -= x->props.header_len + esp->auth.icv_trunc_len;
+	rem = mtu & (align - 1);
+	mtu &= ~(align - 1);
 
 	switch (x->props.mode) {
 	case XFRM_MODE_TUNNEL:
-		mtu = ALIGN(mtu +2, blksize);
 		break;
 	default:
 	case XFRM_MODE_TRANSPORT:
 		/* The worst case */
-		mtu = ALIGN(mtu + 2, 4) + blksize - 4;
+		mtu -= blksize - 4;
+		mtu += min_t(u32, blksize - 4, rem);
 		break;
 	case XFRM_MODE_BEET:
 		/* The worst case. */
-		enclen = IPV4_BEET_PHMAXLEN;
-		mtu = ALIGN(mtu + enclen + 2, blksize);
+		mtu += min_t(u32, IPV4_BEET_PHMAXLEN, rem);
 		break;
 	}
 
-	if (esp->conf.padlen)
-		mtu = ALIGN(mtu, esp->conf.padlen);
-
-	return mtu + x->props.header_len + esp->auth.icv_trunc_len - enclen;
+	return mtu - 2;
 }
 
 static void esp4_err(struct sk_buff *skb, u32 info)
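
Aside: the rewritten esp4_get_mtu() subtracts the fixed ESP overhead first, then rounds what is left down to the cipher/padding alignment: treating align as a power of two (as the mask arithmetic assumes), mtu & ~(align - 1) clears the low bits and mtu & (align - 1) is the part that was cut off. A worked example with assumed numbers (align 16, 32 bytes of header plus ICV overhead):

    #include <stdio.h>

    int main(void)
    {
        unsigned int mtu = 1500, align = 16, overhead = 32;
        unsigned int rem;

        mtu -= overhead;		/* 1468 bytes left for ESP payload */
        rem  = mtu & (align - 1);	/* 1468 % 16 = 12                  */
        mtu &= ~(align - 1);		/* 1456, largest multiple of 16    */
        printf("aligned %u, remainder %u\n", mtu, rem);
        return 0;
    }
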
@@ -302,8 +307,8 @@ static void esp4_err(struct sk_buff *skb, u32 info)
 	struct ip_esp_hdr *esph = (struct ip_esp_hdr*)(skb->data+(iph->ihl<<2));
 	struct xfrm_state *x;
 
-	if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
-	    skb->h.icmph->code != ICMP_FRAG_NEEDED)
+	if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
+	    icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
 		return;
 
 	x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
@@ -336,6 +341,7 @@ static int esp_init_state(struct xfrm_state *x)
 {
 	struct esp_data *esp = NULL;
 	struct crypto_blkcipher *tfm;
+	u32 align;
 
 	/* null auth and encryption can have zero length keys */
 	if (x->aalg) {
@@ -402,6 +408,8 @@ static int esp_init_state(struct xfrm_state *x)
 	x->props.header_len = sizeof(struct ip_esp_hdr) + esp->conf.ivlen;
 	if (x->props.mode == XFRM_MODE_TUNNEL)
 		x->props.header_len += sizeof(struct iphdr);
+	else if (x->props.mode == XFRM_MODE_BEET)
+		x->props.header_len += IPV4_BEET_PHMAXLEN;
 	if (x->encap) {
 		struct xfrm_encap_tmpl *encap = x->encap;
 
@@ -417,7 +425,10 @@ static int esp_init_state(struct xfrm_state *x)
 		}
 	}
 	x->data = esp;
-	x->props.trailer_len = esp4_get_max_size(x, 0) - x->props.header_len;
+	align = ALIGN(crypto_blkcipher_blocksize(esp->conf.tfm), 4);
+	if (esp->conf.padlen)
+		align = max_t(u32, align, esp->conf.padlen);
+	x->props.trailer_len = align + 1 + esp->auth.icv_trunc_len;
 	return 0;
 
 error:
@@ -434,7 +445,7 @@ static struct xfrm_type esp_type =
 	.proto		= IPPROTO_ESP,
 	.init_state	= esp_init_state,
 	.destructor	= esp_destroy,
-	.get_max_size	= esp4_get_max_size,
+	.get_mtu	= esp4_get_mtu,
 	.input		= esp_input,
 	.output		= esp_output
 };
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index cac06c43f004..837f2957fa83 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -34,7 +34,6 @@
 #include <linux/if_addr.h>
 #include <linux/if_arp.h>
 #include <linux/skbuff.h>
-#include <linux/netlink.h>
 #include <linux/init.h>
 #include <linux/list.h>
 
@@ -46,6 +45,7 @@
 #include <net/icmp.h>
 #include <net/arp.h>
 #include <net/ip_fib.h>
+#include <net/rtnetlink.h>
 
 #define FFprint(a...)	printk(KERN_DEBUG a)
 
@@ -540,7 +540,7 @@ errout:
 	return err;
 }
 
-int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 {
 	struct fib_config cfg;
 	struct fib_table *tb;
@@ -561,7 +561,7 @@ errout:
 	return err;
 }
 
-int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 {
 	struct fib_config cfg;
 	struct fib_table *tb;
@@ -582,7 +582,7 @@ errout:
 	return err;
 }
 
-int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
+static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	unsigned int h, s_h;
 	unsigned int e = 0, s_e;
@@ -777,6 +777,10 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
 					    .tos = frn->fl_tos,
 					    .scope = frn->fl_scope } } };
 
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	res.r = NULL;
+#endif
+
 	frn->err = -ENOENT;
 	if (tb) {
 		local_bh_disable();
@@ -807,7 +811,7 @@ static void nl_fib_input(struct sock *sk, int len)
 	if (skb == NULL)
 		return;
 
-	nlh = (struct nlmsghdr *)skb->data;
+	nlh = nlmsg_hdr(skb);
 	if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len ||
 	    nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn))) {
 		kfree_skb(skb);
@@ -827,7 +831,8 @@ static void nl_fib_input(struct sock *sk, int len)
 
 static void nl_fib_lookup_init(void)
 {
-	netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, THIS_MODULE);
+	netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, NULL,
+			      THIS_MODULE);
 }
 
 static void fib_disable_ip(struct net_device *dev, int force)
@@ -925,6 +930,10 @@ void __init ip_fib_init(void)
 	register_netdevice_notifier(&fib_netdev_notifier);
 	register_inetaddr_notifier(&fib_inetaddr_notifier);
 	nl_fib_lookup_init();
+
+	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL);
+	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL);
+	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib);
 }
 
 EXPORT_SYMBOL(inet_addr_type);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index a4949f957ab5..9cfecf1215c9 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -1027,7 +1027,7 @@ out:
 	return 0;
 }
 
-static struct seq_operations fib_seq_ops = {
+static const struct seq_operations fib_seq_ops = {
 	.start	= fib_seq_start,
 	.next	= fib_seq_next,
 	.stop	= fib_seq_stop,
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index c660c074c76c..33083ad52e9f 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -274,11 +274,6 @@ nla_put_failure:
 	return -ENOBUFS;
 }
 
-int fib4_rules_dump(struct sk_buff *skb, struct netlink_callback *cb)
-{
-	return fib_rules_dump(skb, cb, AF_INET);
-}
-
 static u32 fib4_rule_default_pref(void)
 {
 	struct list_head *pos;
@@ -303,6 +298,11 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
 	       + nla_total_size(4); /* flow */
 }
 
+static void fib4_rule_flush_cache(void)
+{
+	rt_cache_flush(-1);
+}
+
 static struct fib_rules_ops fib4_rules_ops = {
 	.family		= AF_INET,
 	.rule_size	= sizeof(struct fib4_rule),
@@ -314,6 +314,7 @@ static struct fib_rules_ops fib4_rules_ops = {
 	.fill		= fib4_rule_fill,
 	.default_pref	= fib4_rule_default_pref,
 	.nlmsg_payload	= fib4_rule_nlmsg_payload,
+	.flush_cache	= fib4_rule_flush_cache,
 	.nlgroup	= RTNLGRP_IPV4_RULE,
 	.policy		= fib4_rule_policy,
 	.rules_list	= &fib4_rules,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 3dad12ee76c3..406ea7050aed 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -927,7 +927,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
 			default:
 				printk(KERN_DEBUG "impossible 102\n");
 				return -EINVAL;
-			};
+			}
 		}
 		return err;
 	}
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 214c34732e84..9be7da7c3a8f 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -50,7 +50,7 @@
  *		Patrick McHardy <kaber@trash.net>
  */
 
-#define VERSION "0.407"
+#define VERSION "0.408"
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -292,8 +292,8 @@ static inline void check_tnode(const struct tnode *tn)
 
 static int halve_threshold = 25;
 static int inflate_threshold = 50;
-static int halve_threshold_root = 15;
-static int inflate_threshold_root = 25;
+static int halve_threshold_root = 8;
+static int inflate_threshold_root = 15;
 
 
 static void __alias_free_mem(struct rcu_head *head)
@@ -350,11 +350,10 @@ static void __tnode_free_rcu(struct rcu_head *head)
 
 static inline void tnode_free(struct tnode *tn)
 {
-	if(IS_LEAF(tn)) {
+	if (IS_LEAF(tn)) {
 		struct leaf *l = (struct leaf *) tn;
 		call_rcu_bh(&l->rcu, __leaf_free_rcu);
-	}
-	else
+	} else
 		call_rcu(&tn->rcu, __tnode_free_rcu);
 }
 
@@ -459,6 +458,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	struct tnode *old_tn;
 	int inflate_threshold_use;
 	int halve_threshold_use;
+	int max_resize;
 
 	if (!tn)
 		return NULL;
@@ -553,13 +553,14 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 
 	/* Keep root node larger  */
 
-	if(!tn->parent)
+	if (!tn->parent)
 		inflate_threshold_use = inflate_threshold_root;
 	else
 		inflate_threshold_use = inflate_threshold;
 
 	err = 0;
-	while ((tn->full_children > 0 &&
+	max_resize = 10;
+	while ((tn->full_children > 0 && max_resize-- &&
 	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
 				inflate_threshold_use * tnode_child_length(tn))) {
 
@@ -574,6 +575,15 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 		}
 	}
 
+	if (max_resize < 0) {
+		if (!tn->parent)
+			printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n",
+			       inflate_threshold_root, tn->bits);
+		else
+			printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n",
+			       inflate_threshold, tn->bits);
+	}
+
 	check_tnode(tn);
 
 	/*
@@ -584,13 +594,14 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 
 	/* Keep root node larger  */
 
-	if(!tn->parent)
+	if (!tn->parent)
 		halve_threshold_use = halve_threshold_root;
 	else
 		halve_threshold_use = halve_threshold;
 
 	err = 0;
-	while (tn->bits > 1 &&
+	max_resize = 10;
+	while (tn->bits > 1 && max_resize-- &&
 	       100 * (tnode_child_length(tn) - tn->empty_children) <
 				halve_threshold_use * tnode_child_length(tn)) {
 
@@ -605,6 +616,14 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 		}
 	}
 
+	if (max_resize < 0) {
+		if (!tn->parent)
+			printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n",
+			       halve_threshold_root, tn->bits);
+		else
+			printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n",
+			       halve_threshold, tn->bits);
+	}
 
 	/* Only one child remains */
 	if (tn->empty_children == tnode_child_length(tn) - 1)
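
Aside: both convergence loops in resize() are now capped at ten passes; when the cap trips, max_resize has gone negative and the warning names the tunable that failed to converge, instead of letting a bad threshold spin the kernel. A self-contained sketch of this guard pattern:

    #include <stdio.h>

    static int pressure = 13;			/* stand-in for trie state */
    static int should_inflate(void) { return pressure > 0; }
    static int inflate(void)        { pressure--; return 0; }

    int main(void)
    {
        int max_resize = 10;			/* hard cap, as in resize() */

        while (should_inflate() && max_resize-- > 0) {
            if (inflate() < 0)
                break;			/* e.g. allocation failure */
        }
        if (max_resize < 0)
            printf("cap hit: threshold never converged\n");
        return 0;
    }
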
@@ -2039,12 +2058,12 @@ static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
 {
 	struct node *n ;
 
-	if(!t)
+	if (!t)
 		return NULL;
 
 	n = rcu_dereference(t->trie);
 
-	if(!iter)
+	if (!iter)
 		return NULL;
 
 	if (n) {
@@ -2084,7 +2103,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
 		int i;
 
 		s->tnodes++;
-		if(tn->bits < MAX_STAT_DEPTH)
+		if (tn->bits < MAX_STAT_DEPTH)
 			s->nodesizes[tn->bits]++;
 
 		for (i = 0; i < (1<<tn->bits); i++)
@@ -2250,7 +2269,7 @@ static inline const char *rtn_scope(enum rt_scope_t s)
 {
 	static char buf[32];
 
-	switch(s) {
+	switch (s) {
 	case RT_SCOPE_UNIVERSE: return "universe";
 	case RT_SCOPE_SITE: return "site";
 	case RT_SCOPE_LINK: return "link";
@@ -2340,7 +2359,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static struct seq_operations fib_trie_seq_ops = {
+static const struct seq_operations fib_trie_seq_ops = {
 	.start	= fib_trie_seq_start,
 	.next	= fib_trie_seq_next,
 	.stop	= fib_trie_seq_stop,
@@ -2461,7 +2480,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static struct seq_operations fib_route_seq_ops = {
+static const struct seq_operations fib_route_seq_ops = {
 	.start	= fib_trie_seq_start,
 	.next	= fib_trie_seq_next,
 	.stop	= fib_trie_seq_stop,
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4b7a0d946a0d..d38cbba92a4d 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -355,7 +355,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
 			       ipc, rt, MSG_DONTWAIT) < 0)
 		ip_flush_pending_frames(icmp_socket->sk);
 	else if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) {
-		struct icmphdr *icmph = skb->h.icmph;
+		struct icmphdr *icmph = icmp_hdr(skb);
 		__wsum csum = 0;
 		struct sk_buff *skb1;
 
@@ -392,7 +392,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	icmp_param->data.icmph.checksum = 0;
 	icmp_out_count(icmp_param->data.icmph.type);
 
-	inet->tos = skb->nh.iph->tos;
+	inet->tos = ip_hdr(skb)->tos;
 	daddr = ipc.addr = rt->rt_src;
 	ipc.opt = NULL;
 	if (icmp_param->replyopts.optlen) {
@@ -404,7 +404,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 		struct flowi fl = { .nl_u = { .ip4_u =
 					      { .daddr = daddr,
 						.saddr = rt->rt_spec_dst,
-						.tos = RT_TOS(skb->nh.iph->tos) } },
+						.tos = RT_TOS(ip_hdr(skb)->tos) } },
 				    .proto = IPPROTO_ICMP };
 		security_skb_classify_flow(skb, &fl);
 		if (ip_route_output_key(&rt, &fl))
@@ -448,9 +448,10 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	 *	Check this, icmp_send is called from the most obscure devices
 	 *	sometimes.
 	 */
-	iph = skb_in->nh.iph;
+	iph = ip_hdr(skb_in);
 
-	if ((u8 *)iph < skb_in->head || (u8 *)(iph + 1) > skb_in->tail)
+	if ((u8 *)iph < skb_in->head ||
+	    (skb_in->network_header + sizeof(*iph)) > skb_in->tail)
 		goto out;
 
 	/*
@@ -484,7 +485,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 			u8 _inner_type, *itp;
 
 			itp = skb_header_pointer(skb_in,
-						 skb_in->nh.raw +
+						 skb_network_header(skb_in) +
 						 (iph->ihl << 2) +
 						 offsetof(struct icmphdr,
 							  type) -
@@ -536,7 +537,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	icmp_param.data.icmph.un.gateway = info;
 	icmp_param.data.icmph.checksum	 = 0;
 	icmp_param.skb	  = skb_in;
-	icmp_param.offset = skb_in->nh.raw - skb_in->data;
+	icmp_param.offset = skb_network_offset(skb_in);
 	icmp_out_count(icmp_param.data.icmph.type);
 	inet_sk(icmp_socket->sk)->tos = tos;
 	ipc.addr = iph->saddr;
@@ -613,7 +614,7 @@ static void icmp_unreach(struct sk_buff *skb)
 	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
 		goto out_err;
 
-	icmph = skb->h.icmph;
+	icmph = icmp_hdr(skb);
 	iph   = (struct iphdr *)skb->data;
 
 	if (iph->ihl < 5) /* Mangled header, drop. */
@@ -676,7 +677,7 @@ static void icmp_unreach(struct sk_buff *skb)
 		printk(KERN_WARNING "%u.%u.%u.%u sent an invalid ICMP "
 				    "type %u, code %u "
 				    "error to a broadcast: %u.%u.%u.%u on %s\n",
-			NIPQUAD(skb->nh.iph->saddr),
+			NIPQUAD(ip_hdr(skb)->saddr),
 			icmph->type, icmph->code,
 			NIPQUAD(iph->daddr),
 			skb->dev->name);
@@ -743,7 +744,7 @@ static void icmp_redirect(struct sk_buff *skb)
 
 	iph = (struct iphdr *)skb->data;
 
-	switch (skb->h.icmph->code & 7) {
+	switch (icmp_hdr(skb)->code & 7) {
 	case ICMP_REDIR_NET:
 	case ICMP_REDIR_NETTOS:
 		/*
@@ -751,8 +752,8 @@ static void icmp_redirect(struct sk_buff *skb)
 		 */
 	case ICMP_REDIR_HOST:
 	case ICMP_REDIR_HOSTTOS:
-		ip_rt_redirect(skb->nh.iph->saddr, iph->daddr,
-			       skb->h.icmph->un.gateway,
+		ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr,
+			       icmp_hdr(skb)->un.gateway,
 			       iph->saddr, skb->dev);
 		break;
 	}
@@ -780,7 +781,7 @@ static void icmp_echo(struct sk_buff *skb)
 	if (!sysctl_icmp_echo_ignore_all) {
 		struct icmp_bxm icmp_param;
 
-		icmp_param.data.icmph	   = *skb->h.icmph;
+		icmp_param.data.icmph	   = *icmp_hdr(skb);
 		icmp_param.data.icmph.type = ICMP_ECHOREPLY;
 		icmp_param.skb		   = skb;
 		icmp_param.offset	   = 0;
@@ -816,7 +817,7 @@ static void icmp_timestamp(struct sk_buff *skb)
 	icmp_param.data.times[2] = icmp_param.data.times[1];
 	if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4))
 		BUG();
-	icmp_param.data.icmph	   = *skb->h.icmph;
+	icmp_param.data.icmph	   = *icmp_hdr(skb);
 	icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY;
 	icmp_param.data.icmph.code = 0;
 	icmp_param.skb		   = skb;
@@ -943,7 +944,7 @@ int icmp_rcv(struct sk_buff *skb)
 	if (!pskb_pull(skb, sizeof(struct icmphdr)))
 		goto error;
 
-	icmph = skb->h.icmph;
+	icmph = icmp_hdr(skb);
 
 	/*
 	 *	18 is the highest 'known' ICMP type. Anything else is a mystery
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 8cedb2a2c9df..2506021c2935 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -314,7 +314,9 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 
 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
 
-	skb->nh.iph = pip =(struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4);
+	skb_reset_network_header(skb);
+	pip = ip_hdr(skb);
+	skb_put(skb, sizeof(struct iphdr) + 4);
 
 	pip->version  = 4;
 	pip->ihl      = (sizeof(struct iphdr)+4)>>2;
@@ -331,8 +333,9 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
331 ((u8*)&pip[1])[2] = 0; 333 ((u8*)&pip[1])[2] = 0;
332 ((u8*)&pip[1])[3] = 0; 334 ((u8*)&pip[1])[3] = 0;
333 335
334 pig =(struct igmpv3_report *)skb_put(skb, sizeof(*pig)); 336 skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4;
335 skb->h.igmph = (struct igmphdr *)pig; 337 skb_put(skb, sizeof(*pig));
338 pig = igmpv3_report_hdr(skb);
336 pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT; 339 pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT;
337 pig->resv1 = 0; 340 pig->resv1 = 0;
338 pig->csum = 0; 341 pig->csum = 0;
@@ -343,16 +346,14 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
343 346
344static int igmpv3_sendpack(struct sk_buff *skb) 347static int igmpv3_sendpack(struct sk_buff *skb)
345{ 348{
346 struct iphdr *pip = skb->nh.iph; 349 struct iphdr *pip = ip_hdr(skb);
347 struct igmphdr *pig = skb->h.igmph; 350 struct igmphdr *pig = igmp_hdr(skb);
348 int iplen, igmplen; 351 const int iplen = skb->tail - skb->network_header;
352 const int igmplen = skb->tail - skb->transport_header;
349 353
350 iplen = skb->tail - (unsigned char *)skb->nh.iph;
351 pip->tot_len = htons(iplen); 354 pip->tot_len = htons(iplen);
352 ip_send_check(pip); 355 ip_send_check(pip);
353 356 pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen);
354 igmplen = skb->tail - (unsigned char *)skb->h.igmph;
355 pig->csum = ip_compute_csum((void *)skb->h.igmph, igmplen);
356 357
357 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev, 358 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev,
358 dst_output); 359 dst_output);
@@ -379,7 +380,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
379 pgr->grec_auxwords = 0; 380 pgr->grec_auxwords = 0;
380 pgr->grec_nsrcs = 0; 381 pgr->grec_nsrcs = 0;
381 pgr->grec_mca = pmc->multiaddr; 382 pgr->grec_mca = pmc->multiaddr;
382 pih = (struct igmpv3_report *)skb->h.igmph; 383 pih = igmpv3_report_hdr(skb);
383 pih->ngrec = htons(ntohs(pih->ngrec)+1); 384 pih->ngrec = htons(ntohs(pih->ngrec)+1);
384 *ppgr = pgr; 385 *ppgr = pgr;
385 return skb; 386 return skb;
@@ -412,7 +413,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
412 if (!*psf_list) 413 if (!*psf_list)
413 goto empty_source; 414 goto empty_source;
414 415
415 pih = skb ? (struct igmpv3_report *)skb->h.igmph : NULL; 416 pih = skb ? igmpv3_report_hdr(skb) : NULL;
416 417
417 /* EX and TO_EX get a fresh packet, if needed */ 418 /* EX and TO_EX get a fresh packet, if needed */
418 if (truncate) { 419 if (truncate) {
@@ -664,7 +665,9 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
664 665
665 skb_reserve(skb, LL_RESERVED_SPACE(dev)); 666 skb_reserve(skb, LL_RESERVED_SPACE(dev));
666 667
667 skb->nh.iph = iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4); 668 skb_reset_network_header(skb);
669 iph = ip_hdr(skb);
670 skb_put(skb, sizeof(struct iphdr) + 4);
668 671
669 iph->version = 4; 672 iph->version = 4;
670 iph->ihl = (sizeof(struct iphdr)+4)>>2; 673 iph->ihl = (sizeof(struct iphdr)+4)>>2;
@@ -827,8 +830,8 @@ static void igmp_heard_report(struct in_device *in_dev, __be32 group)
827static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, 830static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
828 int len) 831 int len)
829{ 832{
830 struct igmphdr *ih = skb->h.igmph; 833 struct igmphdr *ih = igmp_hdr(skb);
831 struct igmpv3_query *ih3 = (struct igmpv3_query *)ih; 834 struct igmpv3_query *ih3 = igmpv3_query_hdr(skb);
832 struct ip_mc_list *im; 835 struct ip_mc_list *im;
833 __be32 group = ih->group; 836 __be32 group = ih->group;
834 int max_delay; 837 int max_delay;
@@ -861,12 +864,12 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
861 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) 864 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
862 return; 865 return;
863 866
864 ih3 = (struct igmpv3_query *) skb->h.raw; 867 ih3 = igmpv3_query_hdr(skb);
865 if (ih3->nsrcs) { 868 if (ih3->nsrcs) {
866 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query) 869 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
867 + ntohs(ih3->nsrcs)*sizeof(__be32))) 870 + ntohs(ih3->nsrcs)*sizeof(__be32)))
868 return; 871 return;
869 ih3 = (struct igmpv3_query *) skb->h.raw; 872 ih3 = igmpv3_query_hdr(skb);
870 } 873 }
871 874
872 max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); 875 max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
@@ -943,7 +946,7 @@ int igmp_rcv(struct sk_buff *skb)
943 goto drop; 946 goto drop;
944 } 947 }
945 948
946 ih = skb->h.igmph; 949 ih = igmp_hdr(skb);
947 switch (ih->type) { 950 switch (ih->type) {
948 case IGMP_HOST_MEMBERSHIP_QUERY: 951 case IGMP_HOST_MEMBERSHIP_QUERY:
949 igmp_heard_query(in_dev, skb, len); 952 igmp_heard_query(in_dev, skb, len);
@@ -2397,7 +2400,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
2397 return 0; 2400 return 0;
2398} 2401}
2399 2402
2400static struct seq_operations igmp_mc_seq_ops = { 2403static const struct seq_operations igmp_mc_seq_ops = {
2401 .start = igmp_mc_seq_start, 2404 .start = igmp_mc_seq_start,
2402 .next = igmp_mc_seq_next, 2405 .next = igmp_mc_seq_next,
2403 .stop = igmp_mc_seq_stop, 2406 .stop = igmp_mc_seq_stop,
@@ -2571,7 +2574,7 @@ static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
2571 return 0; 2574 return 0;
2572} 2575}
2573 2576
2574static struct seq_operations igmp_mcf_seq_ops = { 2577static const struct seq_operations igmp_mcf_seq_ops = {
2575 .start = igmp_mcf_seq_start, 2578 .start = igmp_mcf_seq_start,
2576 .next = igmp_mcf_seq_next, 2579 .next = igmp_mcf_seq_next,
2577 .stop = igmp_mcf_seq_stop, 2580 .stop = igmp_mcf_seq_stop,
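Note: in igmpv3_newpack() and igmp_send_report() the single write through skb->nh.iph is split into skb_reset_network_header() plus skb_put(), with ip_hdr() providing the typed view afterwards. A hedged sketch of that construction pattern; example_start_ip_header is an invented name, not from the patch:

    #include <linux/ip.h>
    #include <linux/netdevice.h>
    #include <linux/skbuff.h>

    /* Reserve link-layer headroom, mark where the network header will
     * start, then reserve room for the header itself. */
    static struct iphdr *example_start_ip_header(struct sk_buff *skb,
                                                 struct net_device *dev,
                                                 int optlen)
    {
            skb_reserve(skb, LL_RESERVED_SPACE(dev));
            skb_reset_network_header(skb);   /* network header = skb->data */
            skb_put(skb, sizeof(struct iphdr) + optlen);
            return ip_hdr(skb);
    }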
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 5df71cd08da8..dbeacd8b0f90 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -27,6 +27,7 @@
27#include <net/inet_hashtables.h> 27#include <net/inet_hashtables.h>
28#include <net/inet_timewait_sock.h> 28#include <net/inet_timewait_sock.h>
29#include <net/inet6_hashtables.h> 29#include <net/inet6_hashtables.h>
30#include <net/netlink.h>
30 31
31#include <linux/inet.h> 32#include <linux/inet.h>
32#include <linux/stddef.h> 33#include <linux/stddef.h>
@@ -60,7 +61,7 @@ static int inet_csk_diag_fill(struct sock *sk,
60 struct nlmsghdr *nlh; 61 struct nlmsghdr *nlh;
61 void *info = NULL; 62 void *info = NULL;
62 struct inet_diag_meminfo *minfo = NULL; 63 struct inet_diag_meminfo *minfo = NULL;
63 unsigned char *b = skb->tail; 64 unsigned char *b = skb_tail_pointer(skb);
64 const struct inet_diag_handler *handler; 65 const struct inet_diag_handler *handler;
65 66
66 handler = inet_diag_table[unlh->nlmsg_type]; 67 handler = inet_diag_table[unlh->nlmsg_type];
@@ -147,12 +148,12 @@ static int inet_csk_diag_fill(struct sock *sk,
147 icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) 148 icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info)
148 icsk->icsk_ca_ops->get_info(sk, ext, skb); 149 icsk->icsk_ca_ops->get_info(sk, ext, skb);
149 150
150 nlh->nlmsg_len = skb->tail - b; 151 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
151 return skb->len; 152 return skb->len;
152 153
153rtattr_failure: 154rtattr_failure:
154nlmsg_failure: 155nlmsg_failure:
155 skb_trim(skb, b - skb->data); 156 nlmsg_trim(skb, b);
156 return -EMSGSIZE; 157 return -EMSGSIZE;
157} 158}
158 159
@@ -163,7 +164,7 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
163{ 164{
164 long tmo; 165 long tmo;
165 struct inet_diag_msg *r; 166 struct inet_diag_msg *r;
166 const unsigned char *previous_tail = skb->tail; 167 const unsigned char *previous_tail = skb_tail_pointer(skb);
167 struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq, 168 struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq,
168 unlh->nlmsg_type, sizeof(*r)); 169 unlh->nlmsg_type, sizeof(*r));
169 170
@@ -205,10 +206,10 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
205 &tw6->tw_v6_daddr); 206 &tw6->tw_v6_daddr);
206 } 207 }
207#endif 208#endif
208 nlh->nlmsg_len = skb->tail - previous_tail; 209 nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail;
209 return skb->len; 210 return skb->len;
210nlmsg_failure: 211nlmsg_failure:
211 skb_trim(skb, previous_tail - skb->data); 212 nlmsg_trim(skb, previous_tail);
212 return -EMSGSIZE; 213 return -EMSGSIZE;
213} 214}
214 215
@@ -535,7 +536,7 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
535{ 536{
536 const struct inet_request_sock *ireq = inet_rsk(req); 537 const struct inet_request_sock *ireq = inet_rsk(req);
537 struct inet_sock *inet = inet_sk(sk); 538 struct inet_sock *inet = inet_sk(sk);
538 unsigned char *b = skb->tail; 539 unsigned char *b = skb_tail_pointer(skb);
539 struct inet_diag_msg *r; 540 struct inet_diag_msg *r;
540 struct nlmsghdr *nlh; 541 struct nlmsghdr *nlh;
541 long tmo; 542 long tmo;
@@ -574,12 +575,12 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
574 &inet6_rsk(req)->rmt_addr); 575 &inet6_rsk(req)->rmt_addr);
575 } 576 }
576#endif 577#endif
577 nlh->nlmsg_len = skb->tail - b; 578 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
578 579
579 return skb->len; 580 return skb->len;
580 581
581nlmsg_failure: 582nlmsg_failure:
582 skb_trim(skb, b - skb->data); 583 nlmsg_trim(skb, b);
583 return -1; 584 return -1;
584} 585}
585 586
@@ -805,68 +806,43 @@ done:
805 return skb->len; 806 return skb->len;
806} 807}
807 808
808static inline int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) 809static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
809{ 810{
810 if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) 811 int hdrlen = sizeof(struct inet_diag_req);
811 return 0;
812 812
813 if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX) 813 if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX ||
814 goto err_inval; 814 nlmsg_len(nlh) < hdrlen)
815 return -EINVAL;
815 816
816 if (inet_diag_table[nlh->nlmsg_type] == NULL) 817 if (inet_diag_table[nlh->nlmsg_type] == NULL)
817 return -ENOENT; 818 return -ENOENT;
818 819
819 if (NLMSG_LENGTH(sizeof(struct inet_diag_req)) > skb->len) 820 if (nlh->nlmsg_flags & NLM_F_DUMP) {
820 goto err_inval; 821 if (nlmsg_attrlen(nlh, hdrlen)) {
821 822 struct nlattr *attr;
822 if (nlh->nlmsg_flags&NLM_F_DUMP) { 823
823 if (nlh->nlmsg_len > 824 attr = nlmsg_find_attr(nlh, hdrlen,
824 (4 + NLMSG_SPACE(sizeof(struct inet_diag_req)))) { 825 INET_DIAG_REQ_BYTECODE);
825 struct rtattr *rta = (void *)(NLMSG_DATA(nlh) + 826 if (attr == NULL ||
826 sizeof(struct inet_diag_req)); 827 nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
827 if (rta->rta_type != INET_DIAG_REQ_BYTECODE || 828 inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
828 rta->rta_len < 8 || 829 return -EINVAL;
829 rta->rta_len >
830 (nlh->nlmsg_len -
831 NLMSG_SPACE(sizeof(struct inet_diag_req))))
832 goto err_inval;
833 if (inet_diag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta)))
834 goto err_inval;
835 } 830 }
831
836 return netlink_dump_start(idiagnl, skb, nlh, 832 return netlink_dump_start(idiagnl, skb, nlh,
837 inet_diag_dump, NULL); 833 inet_diag_dump, NULL);
838 } else
839 return inet_diag_get_exact(skb, nlh);
840
841err_inval:
842 return -EINVAL;
843}
844
845
846static inline void inet_diag_rcv_skb(struct sk_buff *skb)
847{
848 if (skb->len >= NLMSG_SPACE(0)) {
849 int err;
850 struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data;
851
852 if (nlh->nlmsg_len < sizeof(*nlh) ||
853 skb->len < nlh->nlmsg_len)
854 return;
855 err = inet_diag_rcv_msg(skb, nlh);
856 if (err || nlh->nlmsg_flags & NLM_F_ACK)
857 netlink_ack(skb, nlh, err);
858 } 834 }
835
836 return inet_diag_get_exact(skb, nlh);
859} 837}
860 838
861static void inet_diag_rcv(struct sock *sk, int len) 839static void inet_diag_rcv(struct sock *sk, int len)
862{ 840{
863 struct sk_buff *skb; 841 unsigned int qlen = 0;
864 unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
865 842
866 while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) { 843 do {
867 inet_diag_rcv_skb(skb); 844 netlink_run_queue(sk, &qlen, &inet_diag_rcv_msg);
868 kfree_skb(skb); 845 } while (qlen);
869 }
870} 846}
871 847
872static DEFINE_SPINLOCK(inet_diag_register_lock); 848static DEFINE_SPINLOCK(inet_diag_register_lock);
@@ -917,7 +893,7 @@ static int __init inet_diag_init(void)
917 goto out; 893 goto out;
918 894
919 idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, 0, inet_diag_rcv, 895 idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, 0, inet_diag_rcv,
920 THIS_MODULE); 896 NULL, THIS_MODULE);
921 if (idiagnl == NULL) 897 if (idiagnl == NULL)
922 goto out_free_table; 898 goto out_free_table;
923 err = 0; 899 err = 0;
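Note: inet_diag.c moves from raw skb->tail arithmetic to skb_tail_pointer()/nlmsg_trim() and lets netlink_run_queue() drive inet_diag_rcv_msg() instead of a hand-rolled dequeue loop. A sketch of the fill/rollback idiom the *_fill() functions now follow; example_fill and its empty payload are illustrative:

    #include <net/netlink.h>

    static int example_fill(struct sk_buff *skb, u32 pid, u32 seq, int type)
    {
            unsigned char *b = skb_tail_pointer(skb);  /* rollback mark */
            struct nlmsghdr *nlh;

            nlh = NLMSG_PUT(skb, pid, seq, type, 0);
            /* ... append payload and attributes here ... */
            nlh->nlmsg_len = skb_tail_pointer(skb) - b;
            return skb->len;

    nlmsg_failure:                  /* NLMSG_PUT jumps here on overrun */
            nlmsg_trim(skb, b);     /* drop the partially built message */
            return -EMSGSIZE;
    }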
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index db3ef96bdfd9..2f44e6128068 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -87,10 +87,12 @@ static DEFINE_RWLOCK(peer_pool_lock);
87 87
88static int peer_total; 88static int peer_total;
89/* Exported for sysctl_net_ipv4. */ 89/* Exported for sysctl_net_ipv4. */
90int inet_peer_threshold = 65536 + 128; /* start to throw entries more 90int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more
91 * aggressively at this stage */ 91 * aggressively at this stage */
92int inet_peer_minttl = 120 * HZ; /* TTL under high load: 120 sec */ 92int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */
93int inet_peer_maxttl = 10 * 60 * HZ; /* usual time to live: 10 min */ 93int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */
94int inet_peer_gc_mintime __read_mostly = 10 * HZ;
95int inet_peer_gc_maxtime __read_mostly = 120 * HZ;
94 96
95static struct inet_peer *inet_peer_unused_head; 97static struct inet_peer *inet_peer_unused_head;
96static struct inet_peer **inet_peer_unused_tailp = &inet_peer_unused_head; 98static struct inet_peer **inet_peer_unused_tailp = &inet_peer_unused_head;
@@ -99,9 +101,6 @@ static DEFINE_SPINLOCK(inet_peer_unused_lock);
99static void peer_check_expire(unsigned long dummy); 101static void peer_check_expire(unsigned long dummy);
100static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0); 102static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0);
101 103
102/* Exported for sysctl_net_ipv4. */
103int inet_peer_gc_mintime = 10 * HZ,
104 inet_peer_gc_maxtime = 120 * HZ;
105 104
106/* Called from ip_output.c:ip_init */ 105/* Called from ip_output.c:ip_init */
107void __init inet_initpeers(void) 106void __init inet_initpeers(void)
@@ -151,20 +150,27 @@ static void unlink_from_unused(struct inet_peer *p)
151 spin_unlock_bh(&inet_peer_unused_lock); 150 spin_unlock_bh(&inet_peer_unused_lock);
152} 151}
153 152
154/* Called with local BH disabled and the pool lock held. */ 153/*
155#define lookup(daddr) \ 154 * Called with local BH disabled and the pool lock held.
155 * _stack is known to be NULL or not at compile time,
156 * so compiler will optimize the if (_stack) tests.
157 */
158#define lookup(_daddr,_stack) \
156({ \ 159({ \
157 struct inet_peer *u, **v; \ 160 struct inet_peer *u, **v; \
158 stackptr = stack; \ 161 if (_stack) { \
159 *stackptr++ = &peer_root; \ 162 stackptr = _stack; \
163 *stackptr++ = &peer_root; \
164 } \
160 for (u = peer_root; u != peer_avl_empty; ) { \ 165 for (u = peer_root; u != peer_avl_empty; ) { \
161 if (daddr == u->v4daddr) \ 166 if (_daddr == u->v4daddr) \
162 break; \ 167 break; \
163 if ((__force __u32)daddr < (__force __u32)u->v4daddr) \ 168 if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \
164 v = &u->avl_left; \ 169 v = &u->avl_left; \
165 else \ 170 else \
166 v = &u->avl_right; \ 171 v = &u->avl_right; \
167 *stackptr++ = v; \ 172 if (_stack) \
173 *stackptr++ = v; \
168 u = *v; \ 174 u = *v; \
169 } \ 175 } \
170 u; \ 176 u; \
@@ -288,7 +294,7 @@ static void unlink_from_pool(struct inet_peer *p)
288 if (atomic_read(&p->refcnt) == 1) { 294 if (atomic_read(&p->refcnt) == 1) {
289 struct inet_peer **stack[PEER_MAXDEPTH]; 295 struct inet_peer **stack[PEER_MAXDEPTH];
290 struct inet_peer ***stackptr, ***delp; 296 struct inet_peer ***stackptr, ***delp;
291 if (lookup(p->v4daddr) != p) 297 if (lookup(p->v4daddr, stack) != p)
292 BUG(); 298 BUG();
293 delp = stackptr - 1; /* *delp[0] == p */ 299 delp = stackptr - 1; /* *delp[0] == p */
294 if (p->avl_left == peer_avl_empty) { 300 if (p->avl_left == peer_avl_empty) {
@@ -373,7 +379,7 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create)
373 379
374 /* Look up for the address quickly. */ 380 /* Look up for the address quickly. */
375 read_lock_bh(&peer_pool_lock); 381 read_lock_bh(&peer_pool_lock);
376 p = lookup(daddr); 382 p = lookup(daddr, NULL);
377 if (p != peer_avl_empty) 383 if (p != peer_avl_empty)
378 atomic_inc(&p->refcnt); 384 atomic_inc(&p->refcnt);
379 read_unlock_bh(&peer_pool_lock); 385 read_unlock_bh(&peer_pool_lock);
@@ -400,7 +406,7 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create)
400 406
401 write_lock_bh(&peer_pool_lock); 407 write_lock_bh(&peer_pool_lock);
402 /* Check if an entry has suddenly appeared. */ 408 /* Check if an entry has suddenly appeared. */
403 p = lookup(daddr); 409 p = lookup(daddr, stack);
404 if (p != peer_avl_empty) 410 if (p != peer_avl_empty)
405 goto out_free; 411 goto out_free;
406 412
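Note: the inetpeer lookup() macro now takes the AVL stack as an argument; readers pass NULL so the compiler can fold away the stack-recording branches, while writers pass a real stack for later rebalancing. A toy, stand-alone illustration of that constant-folding trick (GCC statement expressions, as in the kernel macro; nothing below is kernel code):

    #include <stdio.h>

    /* When _stack is a literal NULL, the if (_stack) tests are dead
     * code, so the read path pays nothing for write-side bookkeeping. */
    #define find(_key, _stack)                                      \
    ({                                                              \
            int _i, _hit = -1;                                      \
            for (_i = 0; _i < 5; _i++) {                            \
                    if (_stack)                                     \
                            (_stack)[_i] = _i;  /* record path */   \
                    if (tab[_i] == (_key)) { _hit = _i; break; }    \
            }                                                       \
            _hit;                                                   \
    })

    static const int tab[5] = { 3, 1, 4, 1, 5 };

    int main(void)
    {
            int path[5];
            printf("%d %d\n", find(4, (int *)0), find(5, path));
            return 0;
    }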
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 369e721c4bab..9cb04df0054b 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -67,14 +67,14 @@ int ip_forward(struct sk_buff *skb)
67 if (skb->pkt_type != PACKET_HOST) 67 if (skb->pkt_type != PACKET_HOST)
68 goto drop; 68 goto drop;
69 69
70 skb->ip_summed = CHECKSUM_NONE; 70 skb_forward_csum(skb);
71 71
72 /* 72 /*
73 * According to the RFC, we must first decrease the TTL field. If 73 * According to the RFC, we must first decrease the TTL field. If
74 * that reaches zero, we must reply an ICMP control message telling 74 * that reaches zero, we must reply an ICMP control message telling
75 * that the packet's lifetime expired. 75 * that the packet's lifetime expired.
76 */ 76 */
77 if (skb->nh.iph->ttl <= 1) 77 if (ip_hdr(skb)->ttl <= 1)
78 goto too_many_hops; 78 goto too_many_hops;
79 79
80 if (!xfrm4_route_forward(skb)) 80 if (!xfrm4_route_forward(skb))
@@ -85,10 +85,18 @@ int ip_forward(struct sk_buff *skb)
85 if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 85 if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
86 goto sr_failed; 86 goto sr_failed;
87 87
88 if (unlikely(skb->len > dst_mtu(&rt->u.dst) &&
89 (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
90 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
91 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
92 htonl(dst_mtu(&rt->u.dst)));
93 goto drop;
94 }
95
88 /* We are about to mangle packet. Copy it! */ 96 /* We are about to mangle packet. Copy it! */
89 if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) 97 if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
90 goto drop; 98 goto drop;
91 iph = skb->nh.iph; 99 iph = ip_hdr(skb);
92 100
93 /* Decrease ttl after skb cow done */ 101 /* Decrease ttl after skb cow done */
94 ip_decrease_ttl(iph); 102 ip_decrease_ttl(iph);
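Note: ip_forward() now rejects oversized DF packets before the skb_cow() copy, and skb_forward_csum() replaces the unconditional checksum reset. A sketch of the PMTU rule the new check enforces; example_mtu_check is illustrative, not kernel code:

    /* A packet longer than the egress MTU with DF set cannot be
     * forwarded; the caller answers with ICMP_DEST_UNREACH /
     * ICMP_FRAG_NEEDED carrying that MTU, then drops. */
    static int example_mtu_check(unsigned int len, unsigned int mtu,
                                 int df_set, int local_df)
    {
            if (len > mtu && df_set && !local_df)
                    return -1;   /* send ICMP_FRAG_NEEDED, drop */
            return 0;            /* forward (fragmenting later if allowed) */
    }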
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b6f055380373..0231bdcb2ab7 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -92,7 +92,7 @@ struct ipq {
92 spinlock_t lock; 92 spinlock_t lock;
93 atomic_t refcnt; 93 atomic_t refcnt;
94 struct timer_list timer; /* when will this queue expire? */ 94 struct timer_list timer; /* when will this queue expire? */
95 struct timeval stamp; 95 ktime_t stamp;
96 int iif; 96 int iif;
97 unsigned int rid; 97 unsigned int rid;
98 struct inet_peer *peer; 98 struct inet_peer *peer;
@@ -184,7 +184,7 @@ static __inline__ struct ipq *frag_alloc_queue(void)
184{ 184{
185 struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC); 185 struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC);
186 186
187 if(!qp) 187 if (!qp)
188 return NULL; 188 return NULL;
189 atomic_add(sizeof(struct ipq), &ip_frag_mem); 189 atomic_add(sizeof(struct ipq), &ip_frag_mem);
190 return qp; 190 return qp;
@@ -321,11 +321,11 @@ static struct ipq *ip_frag_intern(struct ipq *qp_in)
321 * promoted read lock to write lock. 321 * promoted read lock to write lock.
322 */ 322 */
323 hlist_for_each_entry(qp, n, &ipq_hash[hash], list) { 323 hlist_for_each_entry(qp, n, &ipq_hash[hash], list) {
324 if(qp->id == qp_in->id && 324 if (qp->id == qp_in->id &&
325 qp->saddr == qp_in->saddr && 325 qp->saddr == qp_in->saddr &&
326 qp->daddr == qp_in->daddr && 326 qp->daddr == qp_in->daddr &&
327 qp->protocol == qp_in->protocol && 327 qp->protocol == qp_in->protocol &&
328 qp->user == qp_in->user) { 328 qp->user == qp_in->user) {
329 atomic_inc(&qp->refcnt); 329 atomic_inc(&qp->refcnt);
330 write_unlock(&ipfrag_lock); 330 write_unlock(&ipfrag_lock);
331 qp_in->last_in |= COMPLETE; 331 qp_in->last_in |= COMPLETE;
@@ -398,11 +398,11 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
398 read_lock(&ipfrag_lock); 398 read_lock(&ipfrag_lock);
399 hash = ipqhashfn(id, saddr, daddr, protocol); 399 hash = ipqhashfn(id, saddr, daddr, protocol);
400 hlist_for_each_entry(qp, n, &ipq_hash[hash], list) { 400 hlist_for_each_entry(qp, n, &ipq_hash[hash], list) {
401 if(qp->id == id && 401 if (qp->id == id &&
402 qp->saddr == saddr && 402 qp->saddr == saddr &&
403 qp->daddr == daddr && 403 qp->daddr == daddr &&
404 qp->protocol == protocol && 404 qp->protocol == protocol &&
405 qp->user == user) { 405 qp->user == user) {
406 atomic_inc(&qp->refcnt); 406 atomic_inc(&qp->refcnt);
407 read_unlock(&ipfrag_lock); 407 read_unlock(&ipfrag_lock);
408 return qp; 408 return qp;
@@ -479,11 +479,11 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
479 goto err; 479 goto err;
480 } 480 }
481 481
482 offset = ntohs(skb->nh.iph->frag_off); 482 offset = ntohs(ip_hdr(skb)->frag_off);
483 flags = offset & ~IP_OFFSET; 483 flags = offset & ~IP_OFFSET;
484 offset &= IP_OFFSET; 484 offset &= IP_OFFSET;
485 offset <<= 3; /* offset is in 8-byte chunks */ 485 offset <<= 3; /* offset is in 8-byte chunks */
486 ihl = skb->nh.iph->ihl * 4; 486 ihl = ip_hdrlen(skb);
487 487
488 /* Determine the position of this fragment. */ 488 /* Determine the position of this fragment. */
489 end = offset + skb->len - ihl; 489 end = offset + skb->len - ihl;
@@ -524,7 +524,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
524 * this fragment, right? 524 * this fragment, right?
525 */ 525 */
526 prev = NULL; 526 prev = NULL;
527 for(next = qp->fragments; next != NULL; next = next->next) { 527 for (next = qp->fragments; next != NULL; next = next->next) {
528 if (FRAG_CB(next)->offset >= offset) 528 if (FRAG_CB(next)->offset >= offset)
529 break; /* bingo! */ 529 break; /* bingo! */
530 prev = next; 530 prev = next;
@@ -592,7 +592,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
592 if (skb->dev) 592 if (skb->dev)
593 qp->iif = skb->dev->ifindex; 593 qp->iif = skb->dev->ifindex;
594 skb->dev = NULL; 594 skb->dev = NULL;
595 skb_get_timestamp(skb, &qp->stamp); 595 qp->stamp = skb->tstamp;
596 qp->meat += skb->len; 596 qp->meat += skb->len;
597 atomic_add(skb->truesize, &ip_frag_mem); 597 atomic_add(skb->truesize, &ip_frag_mem);
598 if (offset == 0) 598 if (offset == 0)
@@ -624,10 +624,10 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
624 BUG_TRAP(FRAG_CB(head)->offset == 0); 624 BUG_TRAP(FRAG_CB(head)->offset == 0);
625 625
626 /* Allocate a new buffer for the datagram. */ 626 /* Allocate a new buffer for the datagram. */
627 ihlen = head->nh.iph->ihl*4; 627 ihlen = ip_hdrlen(head);
628 len = ihlen + qp->len; 628 len = ihlen + qp->len;
629 629
630 if(len > 65535) 630 if (len > 65535)
631 goto out_oversize; 631 goto out_oversize;
632 632
633 /* Head of list must not be cloned. */ 633 /* Head of list must not be cloned. */
@@ -658,7 +658,7 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
658 } 658 }
659 659
660 skb_shinfo(head)->frag_list = head->next; 660 skb_shinfo(head)->frag_list = head->next;
661 skb_push(head, head->data - head->nh.raw); 661 skb_push(head, head->data - skb_network_header(head));
662 atomic_sub(head->truesize, &ip_frag_mem); 662 atomic_sub(head->truesize, &ip_frag_mem);
663 663
664 for (fp=head->next; fp; fp = fp->next) { 664 for (fp=head->next; fp; fp = fp->next) {
@@ -674,9 +674,9 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
674 674
675 head->next = NULL; 675 head->next = NULL;
676 head->dev = dev; 676 head->dev = dev;
677 skb_set_timestamp(head, &qp->stamp); 677 head->tstamp = qp->stamp;
678 678
679 iph = head->nh.iph; 679 iph = ip_hdr(head);
680 iph->frag_off = 0; 680 iph->frag_off = 0;
681 iph->tot_len = htons(len); 681 iph->tot_len = htons(len);
682 IP_INC_STATS_BH(IPSTATS_MIB_REASMOKS); 682 IP_INC_STATS_BH(IPSTATS_MIB_REASMOKS);
@@ -700,7 +700,6 @@ out_fail:
700/* Process an incoming IP datagram fragment. */ 700/* Process an incoming IP datagram fragment. */
701struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user) 701struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user)
702{ 702{
703 struct iphdr *iph = skb->nh.iph;
704 struct ipq *qp; 703 struct ipq *qp;
705 struct net_device *dev; 704 struct net_device *dev;
706 705
@@ -713,7 +712,7 @@ struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user)
713 dev = skb->dev; 712 dev = skb->dev;
714 713
715 /* Lookup (or create) queue header */ 714 /* Lookup (or create) queue header */
716 if ((qp = ip_find(iph, user)) != NULL) { 715 if ((qp = ip_find(ip_hdr(skb), user)) != NULL) {
717 struct sk_buff *ret = NULL; 716 struct sk_buff *ret = NULL;
718 717
719 spin_lock(&qp->lock); 718 spin_lock(&qp->lock);
@@ -734,7 +733,7 @@ struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user)
734 return NULL; 733 return NULL;
735} 734}
736 735
737void ipfrag_init(void) 736void __init ipfrag_init(void)
738{ 737{
739 ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ 738 ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
740 (jiffies ^ (jiffies >> 6))); 739 (jiffies ^ (jiffies >> 6)));
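Note: ip_fragment.c switches the reassembly-queue timestamp from struct timeval to ktime_t (a plain assignment instead of skb_get_timestamp()/skb_set_timestamp()) and replaces open-coded "ihl * 4" with ip_hdrlen(). That helper is equivalent to this sketch, assuming a linear IPv4 header:

    #include <linux/ip.h>

    /* What ip_hdrlen(skb) computes: header length in bytes from the
     * 4-bit IHL field (counted in 32-bit words). */
    static unsigned int example_ip_hdrlen(const struct sk_buff *skb)
    {
            return ip_hdr(skb)->ihl * 4;
    }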
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 9151da642318..63282934725e 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -191,11 +191,11 @@ static struct ip_tunnel * ipgre_tunnel_lookup(__be32 remote, __be32 local, __be3
191 return NULL; 191 return NULL;
192} 192}
193 193
194static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t) 194static struct ip_tunnel **__ipgre_bucket(struct ip_tunnel_parm *parms)
195{ 195{
196 __be32 remote = t->parms.iph.daddr; 196 __be32 remote = parms->iph.daddr;
197 __be32 local = t->parms.iph.saddr; 197 __be32 local = parms->iph.saddr;
198 __be32 key = t->parms.i_key; 198 __be32 key = parms->i_key;
199 unsigned h = HASH(key); 199 unsigned h = HASH(key);
200 int prio = 0; 200 int prio = 0;
201 201
@@ -209,6 +209,11 @@ static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t)
209 return &tunnels[prio][h]; 209 return &tunnels[prio][h];
210} 210}
211 211
212static inline struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t)
213{
214 return __ipgre_bucket(&t->parms);
215}
216
212static void ipgre_tunnel_link(struct ip_tunnel *t) 217static void ipgre_tunnel_link(struct ip_tunnel *t)
213{ 218{
214 struct ip_tunnel **tp = ipgre_bucket(t); 219 struct ip_tunnel **tp = ipgre_bucket(t);
@@ -240,17 +245,9 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int
240 __be32 key = parms->i_key; 245 __be32 key = parms->i_key;
241 struct ip_tunnel *t, **tp, *nt; 246 struct ip_tunnel *t, **tp, *nt;
242 struct net_device *dev; 247 struct net_device *dev;
243 unsigned h = HASH(key);
244 int prio = 0;
245 char name[IFNAMSIZ]; 248 char name[IFNAMSIZ];
246 249
247 if (local) 250 for (tp = __ipgre_bucket(parms); (t = *tp) != NULL; tp = &t->next) {
248 prio |= 1;
249 if (remote && !MULTICAST(remote)) {
250 prio |= 2;
251 h ^= HASH(remote);
252 }
253 for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
254 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { 251 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
255 if (key == t->parms.i_key) 252 if (key == t->parms.i_key)
256 return t; 253 return t;
@@ -320,8 +317,8 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
320 struct iphdr *iph = (struct iphdr*)skb->data; 317 struct iphdr *iph = (struct iphdr*)skb->data;
321 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2)); 318 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
322 int grehlen = (iph->ihl<<2) + 4; 319 int grehlen = (iph->ihl<<2) + 4;
323 int type = skb->h.icmph->type; 320 const int type = icmp_hdr(skb)->type;
324 int code = skb->h.icmph->code; 321 const int code = icmp_hdr(skb)->code;
325 struct ip_tunnel *t; 322 struct ip_tunnel *t;
326 __be16 flags; 323 __be16 flags;
327 324
@@ -388,8 +385,8 @@ out:
388 struct iphdr *iph = (struct iphdr*)dp; 385 struct iphdr *iph = (struct iphdr*)dp;
389 struct iphdr *eiph; 386 struct iphdr *eiph;
390 __be16 *p = (__be16*)(dp+(iph->ihl<<2)); 387 __be16 *p = (__be16*)(dp+(iph->ihl<<2));
391 int type = skb->h.icmph->type; 388 const int type = icmp_hdr(skb)->type;
392 int code = skb->h.icmph->code; 389 const int code = icmp_hdr(skb)->code;
393 int rel_type = 0; 390 int rel_type = 0;
394 int rel_code = 0; 391 int rel_code = 0;
395 __be32 rel_info = 0; 392 __be32 rel_info = 0;
@@ -422,7 +419,7 @@ out:
422 default: 419 default:
423 return; 420 return;
424 case ICMP_PARAMETERPROB: 421 case ICMP_PARAMETERPROB:
425 n = ntohl(skb->h.icmph->un.gateway) >> 24; 422 n = ntohl(icmp_hdr(skb)->un.gateway) >> 24;
426 if (n < (iph->ihl<<2)) 423 if (n < (iph->ihl<<2))
427 return; 424 return;
428 425
@@ -442,7 +439,7 @@ out:
442 return; 439 return;
443 case ICMP_FRAG_NEEDED: 440 case ICMP_FRAG_NEEDED:
444 /* And it is the only really necessary thing :-) */ 441 /* And it is the only really necessary thing :-) */
445 n = ntohs(skb->h.icmph->un.frag.mtu); 442 n = ntohs(icmp_hdr(skb)->un.frag.mtu);
446 if (n < grehlen+68) 443 if (n < grehlen+68)
447 return; 444 return;
448 n -= grehlen; 445 n -= grehlen;
@@ -474,7 +471,7 @@ out:
474 dst_release(skb2->dst); 471 dst_release(skb2->dst);
475 skb2->dst = NULL; 472 skb2->dst = NULL;
476 skb_pull(skb2, skb->data - (u8*)eiph); 473 skb_pull(skb2, skb->data - (u8*)eiph);
477 skb2->nh.raw = skb2->data; 474 skb_reset_network_header(skb2);
478 475
479 /* Try to guess incoming interface */ 476 /* Try to guess incoming interface */
480 memset(&fl, 0, sizeof(fl)); 477 memset(&fl, 0, sizeof(fl));
@@ -533,9 +530,9 @@ static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
533{ 530{
534 if (INET_ECN_is_ce(iph->tos)) { 531 if (INET_ECN_is_ce(iph->tos)) {
535 if (skb->protocol == htons(ETH_P_IP)) { 532 if (skb->protocol == htons(ETH_P_IP)) {
536 IP_ECN_set_ce(skb->nh.iph); 533 IP_ECN_set_ce(ip_hdr(skb));
537 } else if (skb->protocol == htons(ETH_P_IPV6)) { 534 } else if (skb->protocol == htons(ETH_P_IPV6)) {
538 IP6_ECN_set_ce(skb->nh.ipv6h); 535 IP6_ECN_set_ce(ipv6_hdr(skb));
539 } 536 }
540 } 537 }
541} 538}
@@ -565,7 +562,7 @@ static int ipgre_rcv(struct sk_buff *skb)
565 if (!pskb_may_pull(skb, 16)) 562 if (!pskb_may_pull(skb, 16))
566 goto drop_nolock; 563 goto drop_nolock;
567 564
568 iph = skb->nh.iph; 565 iph = ip_hdr(skb);
569 h = skb->data; 566 h = skb->data;
570 flags = *(__be16*)h; 567 flags = *(__be16*)h;
571 568
@@ -616,9 +613,10 @@ static int ipgre_rcv(struct sk_buff *skb)
616 offset += 4; 613 offset += 4;
617 } 614 }
618 615
619 skb->mac.raw = skb->nh.raw; 616 skb_reset_mac_header(skb);
620 skb->nh.raw = __pskb_pull(skb, offset); 617 __pskb_pull(skb, offset);
621 skb_postpull_rcsum(skb, skb->h.raw, offset); 618 skb_reset_network_header(skb);
619 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
622 skb->pkt_type = PACKET_HOST; 620 skb->pkt_type = PACKET_HOST;
623#ifdef CONFIG_NET_IPGRE_BROADCAST 621#ifdef CONFIG_NET_IPGRE_BROADCAST
624 if (MULTICAST(iph->daddr)) { 622 if (MULTICAST(iph->daddr)) {
@@ -669,7 +667,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
669{ 667{
670 struct ip_tunnel *tunnel = netdev_priv(dev); 668 struct ip_tunnel *tunnel = netdev_priv(dev);
671 struct net_device_stats *stats = &tunnel->stat; 669 struct net_device_stats *stats = &tunnel->stat;
672 struct iphdr *old_iph = skb->nh.iph; 670 struct iphdr *old_iph = ip_hdr(skb);
673 struct iphdr *tiph; 671 struct iphdr *tiph;
674 u8 tos; 672 u8 tos;
675 __be16 df; 673 __be16 df;
@@ -720,7 +718,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
720 addr_type = ipv6_addr_type(addr6); 718 addr_type = ipv6_addr_type(addr6);
721 719
722 if (addr_type == IPV6_ADDR_ANY) { 720 if (addr_type == IPV6_ADDR_ANY) {
723 addr6 = &skb->nh.ipv6h->daddr; 721 addr6 = &ipv6_hdr(skb)->daddr;
724 addr_type = ipv6_addr_type(addr6); 722 addr_type = ipv6_addr_type(addr6);
725 } 723 }
726 724
@@ -824,11 +822,12 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
824 skb_set_owner_w(new_skb, skb->sk); 822 skb_set_owner_w(new_skb, skb->sk);
825 dev_kfree_skb(skb); 823 dev_kfree_skb(skb);
826 skb = new_skb; 824 skb = new_skb;
827 old_iph = skb->nh.iph; 825 old_iph = ip_hdr(skb);
828 } 826 }
829 827
830 skb->h.raw = skb->nh.raw; 828 skb->transport_header = skb->network_header;
831 skb->nh.raw = skb_push(skb, gre_hlen); 829 skb_push(skb, gre_hlen);
830 skb_reset_network_header(skb);
832 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 831 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
833 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | 832 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
834 IPSKB_REROUTED); 833 IPSKB_REROUTED);
@@ -839,7 +838,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
839 * Push down and install the IPIP header. 838 * Push down and install the IPIP header.
840 */ 839 */
841 840
842 iph = skb->nh.iph; 841 iph = ip_hdr(skb);
843 iph->version = 4; 842 iph->version = 4;
844 iph->ihl = sizeof(struct iphdr) >> 2; 843 iph->ihl = sizeof(struct iphdr) >> 2;
845 iph->frag_off = df; 844 iph->frag_off = df;
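Note: ipgre_tunnel_locate() previously open-coded the bucket computation that ipgre_bucket() already performed on a full tunnel; the patch factors it into __ipgre_bucket(), keyed on the ip_tunnel_parm alone. A sketch of the priority rule both callers share; example_prio is an invented name:

    #include <linux/types.h>
    #include <linux/in.h>

    /* prio encodes which endpoints are set: bit 0 for a local address,
     * bit 1 for a unicast remote; multicast remotes hash by key only. */
    static unsigned int example_prio(__be32 local, __be32 remote)
    {
            unsigned int prio = 0;

            if (local)
                    prio |= 1;
            if (remote && !MULTICAST(remote))
                    prio |= 2;
            return prio;    /* index into tunnels[prio][hash] */
    }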
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index f38e97647ac0..324e7e0fdb2a 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -158,7 +158,7 @@ DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics) __read_mostly;
158int ip_call_ra_chain(struct sk_buff *skb) 158int ip_call_ra_chain(struct sk_buff *skb)
159{ 159{
160 struct ip_ra_chain *ra; 160 struct ip_ra_chain *ra;
161 u8 protocol = skb->nh.iph->protocol; 161 u8 protocol = ip_hdr(skb)->protocol;
162 struct sock *last = NULL; 162 struct sock *last = NULL;
163 163
164 read_lock(&ip_ra_lock); 164 read_lock(&ip_ra_lock);
@@ -171,7 +171,7 @@ int ip_call_ra_chain(struct sk_buff *skb)
171 if (sk && inet_sk(sk)->num == protocol && 171 if (sk && inet_sk(sk)->num == protocol &&
172 (!sk->sk_bound_dev_if || 172 (!sk->sk_bound_dev_if ||
173 sk->sk_bound_dev_if == skb->dev->ifindex)) { 173 sk->sk_bound_dev_if == skb->dev->ifindex)) {
174 if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { 174 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
175 skb = ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN); 175 skb = ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN);
176 if (skb == NULL) { 176 if (skb == NULL) {
177 read_unlock(&ip_ra_lock); 177 read_unlock(&ip_ra_lock);
@@ -198,17 +198,15 @@ int ip_call_ra_chain(struct sk_buff *skb)
198 198
199static inline int ip_local_deliver_finish(struct sk_buff *skb) 199static inline int ip_local_deliver_finish(struct sk_buff *skb)
200{ 200{
201 int ihl = skb->nh.iph->ihl*4; 201 __skb_pull(skb, ip_hdrlen(skb));
202
203 __skb_pull(skb, ihl);
204 202
205 /* Point into the IP datagram, just past the header. */ 203 /* Point into the IP datagram, just past the header. */
206 skb->h.raw = skb->data; 204 skb_reset_transport_header(skb);
207 205
208 rcu_read_lock(); 206 rcu_read_lock();
209 { 207 {
210 /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */ 208 /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
211 int protocol = skb->nh.iph->protocol; 209 int protocol = ip_hdr(skb)->protocol;
212 int hash; 210 int hash;
213 struct sock *raw_sk; 211 struct sock *raw_sk;
214 struct net_protocol *ipprot; 212 struct net_protocol *ipprot;
@@ -220,7 +218,7 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb)
220 /* If there maybe a raw socket we must check - if not we 218 /* If there maybe a raw socket we must check - if not we
221 * don't care less 219 * don't care less
222 */ 220 */
223 if (raw_sk && !raw_v4_input(skb, skb->nh.iph, hash)) 221 if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash))
224 raw_sk = NULL; 222 raw_sk = NULL;
225 223
226 if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) { 224 if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) {
@@ -266,7 +264,7 @@ int ip_local_deliver(struct sk_buff *skb)
266 * Reassemble IP fragments. 264 * Reassemble IP fragments.
267 */ 265 */
268 266
269 if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { 267 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
270 skb = ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER); 268 skb = ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER);
271 if (!skb) 269 if (!skb)
272 return 0; 270 return 0;
@@ -294,7 +292,7 @@ static inline int ip_rcv_options(struct sk_buff *skb)
294 goto drop; 292 goto drop;
295 } 293 }
296 294
297 iph = skb->nh.iph; 295 iph = ip_hdr(skb);
298 296
299 if (ip_options_compile(NULL, skb)) { 297 if (ip_options_compile(NULL, skb)) {
300 IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); 298 IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
@@ -330,7 +328,7 @@ drop:
330 328
331static inline int ip_rcv_finish(struct sk_buff *skb) 329static inline int ip_rcv_finish(struct sk_buff *skb)
332{ 330{
333 struct iphdr *iph = skb->nh.iph; 331 const struct iphdr *iph = ip_hdr(skb);
334 332
335 /* 333 /*
336 * Initialise the virtual path cache for the packet. It describes 334 * Initialise the virtual path cache for the packet. It describes
@@ -391,7 +389,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
391 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 389 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
392 goto inhdr_error; 390 goto inhdr_error;
393 391
394 iph = skb->nh.iph; 392 iph = ip_hdr(skb);
395 393
396 /* 394 /*
397 * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum. 395 * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum.
@@ -410,7 +408,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
410 if (!pskb_may_pull(skb, iph->ihl*4)) 408 if (!pskb_may_pull(skb, iph->ihl*4))
411 goto inhdr_error; 409 goto inhdr_error;
412 410
413 iph = skb->nh.iph; 411 iph = ip_hdr(skb);
414 412
415 if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) 413 if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
416 goto inhdr_error; 414 goto inhdr_error;
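Note: ip_local_deliver_finish() now pulls the header with ip_hdrlen() and marks the transport header via skb_reset_transport_header() rather than writing skb->h.raw. The delivery prologue reduces to this sketch; example_deliver_prologue is illustrative:

    #include <net/ip.h>

    /* Strip the (possibly option-bearing) IP header and point the
     * transport header at the new skb->data. */
    static void example_deliver_prologue(struct sk_buff *skb)
    {
            __skb_pull(skb, ip_hdrlen(skb));   /* skip ihl * 4 bytes   */
            skb_reset_transport_header(skb);   /* transport = skb->data */
    }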
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index f906a80d5a87..251346828cb4 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -40,7 +40,7 @@
40void ip_options_build(struct sk_buff * skb, struct ip_options * opt, 40void ip_options_build(struct sk_buff * skb, struct ip_options * opt,
41 __be32 daddr, struct rtable *rt, int is_frag) 41 __be32 daddr, struct rtable *rt, int is_frag)
42{ 42{
43 unsigned char * iph = skb->nh.raw; 43 unsigned char *iph = skb_network_header(skb);
44 44
45 memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options)); 45 memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options));
46 memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen); 46 memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen);
@@ -104,13 +104,13 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
104 return 0; 104 return 0;
105 } 105 }
106 106
107 sptr = skb->nh.raw; 107 sptr = skb_network_header(skb);
108 dptr = dopt->__data; 108 dptr = dopt->__data;
109 109
110 if (skb->dst) 110 if (skb->dst)
111 daddr = ((struct rtable*)skb->dst)->rt_spec_dst; 111 daddr = ((struct rtable*)skb->dst)->rt_spec_dst;
112 else 112 else
113 daddr = skb->nh.iph->daddr; 113 daddr = ip_hdr(skb)->daddr;
114 114
115 if (sopt->rr) { 115 if (sopt->rr) {
116 optlen = sptr[sopt->rr+1]; 116 optlen = sptr[sopt->rr+1];
@@ -180,7 +180,8 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
180 /* 180 /*
181 * RFC1812 requires to fix illegal source routes. 181 * RFC1812 requires to fix illegal source routes.
182 */ 182 */
183 if (memcmp(&skb->nh.iph->saddr, &start[soffset+3], 4) == 0) 183 if (memcmp(&ip_hdr(skb)->saddr,
184 &start[soffset + 3], 4) == 0)
184 doffset -= 4; 185 doffset -= 4;
185 } 186 }
186 if (doffset > 3) { 187 if (doffset > 3) {
@@ -217,7 +218,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
217 218
218void ip_options_fragment(struct sk_buff * skb) 219void ip_options_fragment(struct sk_buff * skb)
219{ 220{
220 unsigned char * optptr = skb->nh.raw + sizeof(struct iphdr); 221 unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr);
221 struct ip_options * opt = &(IPCB(skb)->opt); 222 struct ip_options * opt = &(IPCB(skb)->opt);
222 int l = opt->optlen; 223 int l = opt->optlen;
223 int optlen; 224 int optlen;
@@ -264,12 +265,13 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb)
264 265
265 if (!opt) { 266 if (!opt) {
266 opt = &(IPCB(skb)->opt); 267 opt = &(IPCB(skb)->opt);
267 iph = skb->nh.raw; 268 iph = skb_network_header(skb);
268 opt->optlen = ((struct iphdr *)iph)->ihl*4 - sizeof(struct iphdr); 269 opt->optlen = ((struct iphdr *)iph)->ihl*4 - sizeof(struct iphdr);
269 optptr = iph + sizeof(struct iphdr); 270 optptr = iph + sizeof(struct iphdr);
270 opt->is_data = 0; 271 opt->is_data = 0;
271 } else { 272 } else {
272 optptr = opt->is_data ? opt->__data : (unsigned char*)&(skb->nh.iph[1]); 273 optptr = opt->is_data ? opt->__data :
274 (unsigned char *)&(ip_hdr(skb)[1]);
273 iph = optptr - sizeof(struct iphdr); 275 iph = optptr - sizeof(struct iphdr);
274 } 276 }
275 277
@@ -563,7 +565,7 @@ void ip_forward_options(struct sk_buff *skb)
563 struct ip_options * opt = &(IPCB(skb)->opt); 565 struct ip_options * opt = &(IPCB(skb)->opt);
564 unsigned char * optptr; 566 unsigned char * optptr;
565 struct rtable *rt = (struct rtable*)skb->dst; 567 struct rtable *rt = (struct rtable*)skb->dst;
566 unsigned char *raw = skb->nh.raw; 568 unsigned char *raw = skb_network_header(skb);
567 569
568 if (opt->rr_needaddr) { 570 if (opt->rr_needaddr) {
569 optptr = (unsigned char *)raw + opt->rr; 571 optptr = (unsigned char *)raw + opt->rr;
@@ -587,7 +589,7 @@ void ip_forward_options(struct sk_buff *skb)
587 if (srrptr + 3 <= srrspace) { 589 if (srrptr + 3 <= srrspace) {
588 opt->is_changed = 1; 590 opt->is_changed = 1;
589 ip_rt_get_source(&optptr[srrptr-1], rt); 591 ip_rt_get_source(&optptr[srrptr-1], rt);
590 skb->nh.iph->daddr = rt->rt_dst; 592 ip_hdr(skb)->daddr = rt->rt_dst;
591 optptr[2] = srrptr+4; 593 optptr[2] = srrptr+4;
592 } else if (net_ratelimit()) 594 } else if (net_ratelimit())
593 printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); 595 printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n");
@@ -599,7 +601,7 @@ void ip_forward_options(struct sk_buff *skb)
599 } 601 }
600 if (opt->is_changed) { 602 if (opt->is_changed) {
601 opt->is_changed = 0; 603 opt->is_changed = 0;
602 ip_send_check(skb->nh.iph); 604 ip_send_check(ip_hdr(skb));
603 } 605 }
604} 606}
605 607
@@ -608,8 +610,8 @@ int ip_options_rcv_srr(struct sk_buff *skb)
608 struct ip_options *opt = &(IPCB(skb)->opt); 610 struct ip_options *opt = &(IPCB(skb)->opt);
609 int srrspace, srrptr; 611 int srrspace, srrptr;
610 __be32 nexthop; 612 __be32 nexthop;
611 struct iphdr *iph = skb->nh.iph; 613 struct iphdr *iph = ip_hdr(skb);
612 unsigned char * optptr = skb->nh.raw + opt->srr; 614 unsigned char *optptr = skb_network_header(skb) + opt->srr;
613 struct rtable *rt = (struct rtable*)skb->dst; 615 struct rtable *rt = (struct rtable*)skb->dst;
614 struct rtable *rt2; 616 struct rtable *rt2;
615 int err; 617 int err;
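Note: every option-parsing site in ip_options.c now derives pointers from skb_network_header(skb) instead of skb->nh.raw; recorded offsets such as opt->rr and opt->srr stay relative to the start of the IP header. An illustrative addressing sketch (example_option_ptr is not from the patch):

    #include <linux/ip.h>
    #include <linux/skbuff.h>

    /* Option offsets count from the network header, so the option byte
     * recorded at offset off lives here: */
    static unsigned char *example_option_ptr(struct sk_buff *skb, int off)
    {
            return skb_network_header(skb) + off;
    }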
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index d096332f6c6d..534650cad3a8 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -95,8 +95,8 @@ __inline__ void ip_send_check(struct iphdr *iph)
95/* dev_loopback_xmit for use with netfilter. */ 95/* dev_loopback_xmit for use with netfilter. */
96static int ip_dev_loopback_xmit(struct sk_buff *newskb) 96static int ip_dev_loopback_xmit(struct sk_buff *newskb)
97{ 97{
98 newskb->mac.raw = newskb->data; 98 skb_reset_mac_header(newskb);
99 __skb_pull(newskb, newskb->nh.raw - newskb->data); 99 __skb_pull(newskb, skb_network_offset(newskb));
100 newskb->pkt_type = PACKET_LOOPBACK; 100 newskb->pkt_type = PACKET_LOOPBACK;
101 newskb->ip_summed = CHECKSUM_UNNECESSARY; 101 newskb->ip_summed = CHECKSUM_UNNECESSARY;
102 BUG_TRAP(newskb->dst); 102 BUG_TRAP(newskb->dst);
@@ -125,11 +125,9 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
125 struct iphdr *iph; 125 struct iphdr *iph;
126 126
127 /* Build the IP header. */ 127 /* Build the IP header. */
128 if (opt) 128 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
129 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen); 129 skb_reset_network_header(skb);
130 else 130 iph = ip_hdr(skb);
131 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
132
133 iph->version = 4; 131 iph->version = 4;
134 iph->ihl = 5; 132 iph->ihl = 5;
135 iph->tos = inet->tos; 133 iph->tos = inet->tos;
@@ -143,7 +141,6 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
143 iph->protocol = sk->sk_protocol; 141 iph->protocol = sk->sk_protocol;
144 iph->tot_len = htons(skb->len); 142 iph->tot_len = htons(skb->len);
145 ip_select_ident(iph, &rt->u.dst, sk); 143 ip_select_ident(iph, &rt->u.dst, sk);
146 skb->nh.iph = iph;
147 144
148 if (opt && opt->optlen) { 145 if (opt && opt->optlen) {
149 iph->ihl += opt->optlen>>2; 146 iph->ihl += opt->optlen>>2;
@@ -192,6 +189,14 @@ static inline int ip_finish_output2(struct sk_buff *skb)
192 return -EINVAL; 189 return -EINVAL;
193} 190}
194 191
192static inline int ip_skb_dst_mtu(struct sk_buff *skb)
193{
194 struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
195
196 return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
197 skb->dst->dev->mtu : dst_mtu(skb->dst);
198}
199
195static inline int ip_finish_output(struct sk_buff *skb) 200static inline int ip_finish_output(struct sk_buff *skb)
196{ 201{
197#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) 202#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
@@ -201,7 +206,7 @@ static inline int ip_finish_output(struct sk_buff *skb)
201 return dst_output(skb); 206 return dst_output(skb);
202 } 207 }
203#endif 208#endif
204 if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb)) 209 if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
205 return ip_fragment(skb, ip_finish_output2); 210 return ip_fragment(skb, ip_finish_output2);
206 else 211 else
207 return ip_finish_output2(skb); 212 return ip_finish_output2(skb);
@@ -248,7 +253,7 @@ int ip_mc_output(struct sk_buff *skb)
248 253
249 /* Multicasts with ttl 0 must not go beyond the host */ 254 /* Multicasts with ttl 0 must not go beyond the host */
250 255
251 if (skb->nh.iph->ttl == 0) { 256 if (ip_hdr(skb)->ttl == 0) {
252 kfree_skb(skb); 257 kfree_skb(skb);
253 return 0; 258 return 0;
254 } 259 }
@@ -333,7 +338,9 @@ packet_routed:
333 goto no_route; 338 goto no_route;
334 339
335 /* OK, we know where to send it, allocate and build IP header. */ 340 /* OK, we know where to send it, allocate and build IP header. */
336 iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); 341 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
342 skb_reset_network_header(skb);
343 iph = ip_hdr(skb);
337 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); 344 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
338 iph->tot_len = htons(skb->len); 345 iph->tot_len = htons(skb->len);
339 if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok) 346 if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
@@ -344,7 +351,6 @@ packet_routed:
344 iph->protocol = sk->sk_protocol; 351 iph->protocol = sk->sk_protocol;
345 iph->saddr = rt->rt_src; 352 iph->saddr = rt->rt_src;
346 iph->daddr = rt->rt_dst; 353 iph->daddr = rt->rt_dst;
347 skb->nh.iph = iph;
348 /* Transport layer set skb->h.foo itself. */ 354 /* Transport layer set skb->h.foo itself. */
349 355
350 if (opt && opt->optlen) { 356 if (opt && opt->optlen) {
@@ -386,21 +392,10 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
386#ifdef CONFIG_NET_SCHED 392#ifdef CONFIG_NET_SCHED
387 to->tc_index = from->tc_index; 393 to->tc_index = from->tc_index;
388#endif 394#endif
389#ifdef CONFIG_NETFILTER 395 nf_copy(to, from);
390 /* Connection association is same as pre-frag packet */
391 nf_conntrack_put(to->nfct);
392 to->nfct = from->nfct;
393 nf_conntrack_get(to->nfct);
394 to->nfctinfo = from->nfctinfo;
395#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) 396#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
396 to->ipvs_property = from->ipvs_property; 397 to->ipvs_property = from->ipvs_property;
397#endif 398#endif
398#ifdef CONFIG_BRIDGE_NETFILTER
399 nf_bridge_put(to->nf_bridge);
400 to->nf_bridge = from->nf_bridge;
401 nf_bridge_get(to->nf_bridge);
402#endif
403#endif
404 skb_copy_secmark(to, from); 399 skb_copy_secmark(to, from);
405} 400}
406 401
@@ -430,12 +425,12 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
430 * Point into the IP datagram header. 425 * Point into the IP datagram header.
431 */ 426 */
432 427
433 iph = skb->nh.iph; 428 iph = ip_hdr(skb);
434 429
435 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { 430 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
436 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); 431 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
437 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, 432 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
438 htonl(dst_mtu(&rt->u.dst))); 433 htonl(ip_skb_dst_mtu(skb)));
439 kfree_skb(skb); 434 kfree_skb(skb);
440 return -EMSGSIZE; 435 return -EMSGSIZE;
441 } 436 }
@@ -502,10 +497,11 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
502 * before previous one went down. */ 497 * before previous one went down. */
503 if (frag) { 498 if (frag) {
504 frag->ip_summed = CHECKSUM_NONE; 499 frag->ip_summed = CHECKSUM_NONE;
505 frag->h.raw = frag->data; 500 skb_reset_transport_header(frag);
506 frag->nh.raw = __skb_push(frag, hlen); 501 __skb_push(frag, hlen);
507 memcpy(frag->nh.raw, iph, hlen); 502 skb_reset_network_header(frag);
508 iph = frag->nh.iph; 503 memcpy(skb_network_header(frag), iph, hlen);
504 iph = ip_hdr(frag);
509 iph->tot_len = htons(frag->len); 505 iph->tot_len = htons(frag->len);
510 ip_copy_metadata(frag, skb); 506 ip_copy_metadata(frag, skb);
511 if (offset == 0) 507 if (offset == 0)
@@ -566,7 +562,7 @@ slow_path:
566 * Keep copying data until we run out. 562 * Keep copying data until we run out.
567 */ 563 */
568 564
569 while(left > 0) { 565 while (left > 0) {
570 len = left; 566 len = left;
571 /* IF: it doesn't fit, use 'mtu' - the data space left */ 567 /* IF: it doesn't fit, use 'mtu' - the data space left */
572 if (len > mtu) 568 if (len > mtu)
@@ -593,8 +589,8 @@ slow_path:
593 ip_copy_metadata(skb2, skb); 589 ip_copy_metadata(skb2, skb);
594 skb_reserve(skb2, ll_rs); 590 skb_reserve(skb2, ll_rs);
595 skb_put(skb2, len + hlen); 591 skb_put(skb2, len + hlen);
596 skb2->nh.raw = skb2->data; 592 skb_reset_network_header(skb2);
597 skb2->h.raw = skb2->data + hlen; 593 skb2->transport_header = skb2->network_header + hlen;
598 594
599 /* 595 /*
600 * Charge the memory for the fragment to any owner 596 * Charge the memory for the fragment to any owner
@@ -608,19 +604,19 @@ slow_path:
608 * Copy the packet header into the new buffer. 604 * Copy the packet header into the new buffer.
609 */ 605 */
610 606
611 memcpy(skb2->nh.raw, skb->data, hlen); 607 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
612 608
613 /* 609 /*
614 * Copy a block of the IP datagram. 610 * Copy a block of the IP datagram.
615 */ 611 */
616 if (skb_copy_bits(skb, ptr, skb2->h.raw, len)) 612 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
617 BUG(); 613 BUG();
618 left -= len; 614 left -= len;
619 615
620 /* 616 /*
621 * Fill in the new header fields. 617 * Fill in the new header fields.
622 */ 618 */
623 iph = skb2->nh.iph; 619 iph = ip_hdr(skb2);
624 iph->frag_off = htons((offset >> 3)); 620 iph->frag_off = htons((offset >> 3));
625 621
626 /* ANK: dirty, but effective trick. Upgrade options only if 622 /* ANK: dirty, but effective trick. Upgrade options only if
@@ -722,10 +718,10 @@ static inline int ip_ufo_append_data(struct sock *sk,
722 skb_put(skb,fragheaderlen + transhdrlen); 718 skb_put(skb,fragheaderlen + transhdrlen);
723 719
724 /* initialize network header pointer */ 720 /* initialize network header pointer */
725 skb->nh.raw = skb->data; 721 skb_reset_network_header(skb);
726 722
727 /* initialize protocol header pointer */ 723 /* initialize protocol header pointer */
728 skb->h.raw = skb->data + fragheaderlen; 724 skb->transport_header = skb->network_header + fragheaderlen;
729 725
730 skb->ip_summed = CHECKSUM_PARTIAL; 726 skb->ip_summed = CHECKSUM_PARTIAL;
731 skb->csum = 0; 727 skb->csum = 0;
@@ -799,7 +795,9 @@ int ip_append_data(struct sock *sk,
799 inet->cork.addr = ipc->addr; 795 inet->cork.addr = ipc->addr;
800 } 796 }
801 dst_hold(&rt->u.dst); 797 dst_hold(&rt->u.dst);
802 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path); 798 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
799 rt->u.dst.dev->mtu :
800 dst_mtu(rt->u.dst.path);
803 inet->cork.rt = rt; 801 inet->cork.rt = rt;
804 inet->cork.length = 0; 802 inet->cork.length = 0;
805 sk->sk_sndmsg_page = NULL; 803 sk->sk_sndmsg_page = NULL;
@@ -929,9 +927,10 @@ alloc_new_skb:
929 * Find where to start putting bytes. 927 * Find where to start putting bytes.
930 */ 928 */
931 data = skb_put(skb, fraglen); 929 data = skb_put(skb, fraglen);
932 skb->nh.raw = data + exthdrlen; 930 skb_set_network_header(skb, exthdrlen);
931 skb->transport_header = (skb->network_header +
932 fragheaderlen);
933 data += fragheaderlen; 933 data += fragheaderlen;
934 skb->h.raw = data + exthdrlen;
935 934
936 if (fraggap) { 935 if (fraggap) {
937 skb->csum = skb_copy_and_csum_bits( 936 skb->csum = skb_copy_and_csum_bits(
@@ -1100,8 +1099,6 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
1100 } 1099 }
1101 if (len <= 0) { 1100 if (len <= 0) {
1102 struct sk_buff *skb_prev; 1101 struct sk_buff *skb_prev;
1103 char *data;
1104 struct iphdr *iph;
1105 int alloclen; 1102 int alloclen;
1106 1103
1107 skb_prev = skb; 1104 skb_prev = skb;
@@ -1124,15 +1121,15 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
1124 /* 1121 /*
1125 * Find where to start putting bytes. 1122 * Find where to start putting bytes.
1126 */ 1123 */
1127 data = skb_put(skb, fragheaderlen + fraggap); 1124 skb_put(skb, fragheaderlen + fraggap);
1128 skb->nh.iph = iph = (struct iphdr *)data; 1125 skb_reset_network_header(skb);
1129 data += fragheaderlen; 1126 skb->transport_header = (skb->network_header +
1130 skb->h.raw = data; 1127 fragheaderlen);
1131
1132 if (fraggap) { 1128 if (fraggap) {
1133 skb->csum = skb_copy_and_csum_bits( 1129 skb->csum = skb_copy_and_csum_bits(skb_prev,
1134 skb_prev, maxfraglen, 1130 maxfraglen,
1135 data, fraggap, 0); 1131 skb_transport_header(skb),
1132 fraggap, 0);
1136 skb_prev->csum = csum_sub(skb_prev->csum, 1133 skb_prev->csum = csum_sub(skb_prev->csum,
1137 skb->csum); 1134 skb->csum);
1138 pskb_trim_unique(skb_prev, maxfraglen); 1135 pskb_trim_unique(skb_prev, maxfraglen);
@@ -1198,10 +1195,10 @@ int ip_push_pending_frames(struct sock *sk)
 	tail_skb = &(skb_shinfo(skb)->frag_list);
 
 	/* move skb->data to ip header from ext header */
-	if (skb->data < skb->nh.raw)
-		__skb_pull(skb, skb->nh.raw - skb->data);
+	if (skb->data < skb_network_header(skb))
+		__skb_pull(skb, skb_network_offset(skb));
 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
-		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
+		__skb_pull(tmp_skb, skb_network_header_len(skb));
 		*tail_skb = tmp_skb;
 		tail_skb = &(tmp_skb->next);
 		skb->len += tmp_skb->len;
@@ -1216,13 +1213,13 @@ int ip_push_pending_frames(struct sock *sk)
 	 * to fragment the frame generated here. No matter, what transforms
 	 * how transforms change size of the packet, it will come out.
 	 */
-	if (inet->pmtudisc != IP_PMTUDISC_DO)
+	if (inet->pmtudisc < IP_PMTUDISC_DO)
 		skb->local_df = 1;
 
 	/* DF bit is set when we want to see DF on outgoing frames.
 	 * If local_df is set too, we still allow to fragment this frame
 	 * locally. */
-	if (inet->pmtudisc == IP_PMTUDISC_DO ||
+	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
 	    (skb->len <= dst_mtu(&rt->u.dst) &&
 	     ip_dont_fragment(sk, &rt->u.dst)))
 		df = htons(IP_DF);
@@ -1352,11 +1349,11 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
 		struct flowi fl = { .nl_u = { .ip4_u =
 					      { .daddr = daddr,
 						.saddr = rt->rt_spec_dst,
-						.tos = RT_TOS(skb->nh.iph->tos) } },
+						.tos = RT_TOS(ip_hdr(skb)->tos) } },
 				    /* Not quite clean, but right. */
 				    .uli_u = { .ports =
-					       { .sport = skb->h.th->dest,
-						 .dport = skb->h.th->source } },
+					       { .sport = tcp_hdr(skb)->dest,
+						 .dport = tcp_hdr(skb)->source } },
 				    .proto = sk->sk_protocol };
 		security_skb_classify_flow(skb, &fl);
 		if (ip_route_output_key(&rt, &fl))
@@ -1370,14 +1367,16 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
 	   with locally disabled BH and that sk cannot be already spinlocked.
 	 */
 	bh_lock_sock(sk);
-	inet->tos = skb->nh.iph->tos;
+	inet->tos = ip_hdr(skb)->tos;
 	sk->sk_priority = skb->priority;
-	sk->sk_protocol = skb->nh.iph->protocol;
+	sk->sk_protocol = ip_hdr(skb)->protocol;
 	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
 		       &ipc, rt, MSG_DONTWAIT);
 	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
 		if (arg->csumoffset >= 0)
-			*((__sum16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
+			*((__sum16 *)skb_transport_header(skb) +
+			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
+								arg->csum));
 		skb->ip_summed = CHECKSUM_NONE;
 		ip_push_pending_frames(sk);
 	}
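
The ip_output.c hunks above all follow one idiom: instead of writing skb->nh.raw and skb->h.raw by hand, header offsets are set through helpers and read back through typed accessors. An illustrative sketch of that idiom, using only the helpers the series itself introduces (the function is a standalone example, not part of the patch):

	/* Reserve room for an IPv4 header and point the offsets at it. */
	static void example_set_headers(struct sk_buff *skb)
	{
		struct iphdr *iph;

		skb_push(skb, sizeof(struct iphdr));	/* grow head into reserved headroom */
		skb_reset_network_header(skb);		/* network header = skb->data */
		skb->transport_header = skb->network_header +
					sizeof(struct iphdr);

		iph = ip_hdr(skb);			/* typed view of the network header */
		iph->version = 4;
		iph->ihl = sizeof(struct iphdr) >> 2;
	}
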
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 23048d9f3584..4d544573f48a 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -59,7 +59,7 @@ static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
 	struct in_pktinfo info;
 	struct rtable *rt = (struct rtable *)skb->dst;
 
-	info.ipi_addr.s_addr = skb->nh.iph->daddr;
+	info.ipi_addr.s_addr = ip_hdr(skb)->daddr;
 	if (rt) {
 		info.ipi_ifindex = rt->rt_iif;
 		info.ipi_spec_dst.s_addr = rt->rt_spec_dst;
@@ -73,13 +73,13 @@ static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
 
 static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb)
 {
-	int ttl = skb->nh.iph->ttl;
+	int ttl = ip_hdr(skb)->ttl;
 	put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl);
 }
 
 static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb)
 {
-	put_cmsg(msg, SOL_IP, IP_TOS, 1, &skb->nh.iph->tos);
+	put_cmsg(msg, SOL_IP, IP_TOS, 1, &ip_hdr(skb)->tos);
 }
 
 static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
@@ -87,7 +87,8 @@ static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
 	if (IPCB(skb)->opt.optlen == 0)
 		return;
 
-	put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen, skb->nh.iph+1);
+	put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen,
+		 ip_hdr(skb) + 1);
 }
 
 
@@ -268,18 +269,21 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
 	serr = SKB_EXT_ERR(skb);
 	serr->ee.ee_errno = err;
 	serr->ee.ee_origin = SO_EE_ORIGIN_ICMP;
-	serr->ee.ee_type = skb->h.icmph->type;
-	serr->ee.ee_code = skb->h.icmph->code;
+	serr->ee.ee_type = icmp_hdr(skb)->type;
+	serr->ee.ee_code = icmp_hdr(skb)->code;
 	serr->ee.ee_pad = 0;
 	serr->ee.ee_info = info;
 	serr->ee.ee_data = 0;
-	serr->addr_offset = (u8*)&(((struct iphdr*)(skb->h.icmph+1))->daddr) - skb->nh.raw;
+	serr->addr_offset = (u8 *)&(((struct iphdr *)(icmp_hdr(skb) + 1))->daddr) -
+			    skb_network_header(skb);
 	serr->port = port;
 
-	skb->h.raw = payload;
-	if (!skb_pull(skb, payload - skb->data) ||
-	    sock_queue_err_skb(sk, skb))
-		kfree_skb(skb);
+	if (skb_pull(skb, payload - skb->data) != NULL) {
+		skb_reset_transport_header(skb);
+		if (sock_queue_err_skb(sk, skb) == 0)
+			return;
+	}
+	kfree_skb(skb);
 }
 
 void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info)
@@ -296,8 +300,9 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf
 	if (!skb)
 		return;
 
-	iph = (struct iphdr*)skb_put(skb, sizeof(struct iphdr));
-	skb->nh.iph = iph;
+	skb_put(skb, sizeof(struct iphdr));
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
 	iph->daddr = daddr;
 
 	serr = SKB_EXT_ERR(skb);
@@ -308,11 +313,11 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf
 	serr->ee.ee_pad = 0;
 	serr->ee.ee_info = info;
 	serr->ee.ee_data = 0;
-	serr->addr_offset = (u8*)&iph->daddr - skb->nh.raw;
+	serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb);
 	serr->port = port;
 
-	skb->h.raw = skb->tail;
-	__skb_pull(skb, skb->tail - skb->data);
+	__skb_pull(skb, skb_tail_pointer(skb) - skb->data);
+	skb_reset_transport_header(skb);
 
 	if (sock_queue_err_skb(sk, skb))
 		kfree_skb(skb);
@@ -354,7 +359,8 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
 	sin = (struct sockaddr_in *)msg->msg_name;
 	if (sin) {
 		sin->sin_family = AF_INET;
-		sin->sin_addr.s_addr = *(__be32*)(skb->nh.raw + serr->addr_offset);
+		sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) +
+						   serr->addr_offset);
 		sin->sin_port = serr->port;
 		memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
 	}
@@ -366,7 +372,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
 		struct inet_sock *inet = inet_sk(sk);
 
 		sin->sin_family = AF_INET;
-		sin->sin_addr.s_addr = skb->nh.iph->saddr;
+		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
 		sin->sin_port = 0;
 		memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
 		if (inet->cmsg_flags)
@@ -403,20 +409,20 @@ out:
  */
 
 static int do_ip_setsockopt(struct sock *sk, int level,
-		int optname, char __user *optval, int optlen)
+			    int optname, char __user *optval, int optlen)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	int val=0,err;
 
 	if (((1<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) |
-			    (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) |
-			    (1<<IP_RETOPTS) | (1<<IP_TOS) |
-			    (1<<IP_TTL) | (1<<IP_HDRINCL) |
-			    (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
-			    (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
-			    (1<<IP_PASSSEC))) ||
-				optname == IP_MULTICAST_TTL ||
-				optname == IP_MULTICAST_LOOP) {
+			     (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) |
+			     (1<<IP_RETOPTS) | (1<<IP_TOS) |
+			     (1<<IP_TTL) | (1<<IP_HDRINCL) |
+			     (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
+			     (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
+			     (1<<IP_PASSSEC))) ||
+	    optname == IP_MULTICAST_TTL ||
+	    optname == IP_MULTICAST_LOOP) {
 		if (optlen >= sizeof(int)) {
 			if (get_user(val, (int __user *) optval))
 				return -EFAULT;
@@ -440,444 +446,444 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 	lock_sock(sk);
 
 	switch (optname) {
-		case IP_OPTIONS:
-		{
-			struct ip_options * opt = NULL;
-			if (optlen > 40 || optlen < 0)
-				goto e_inval;
-			err = ip_options_get_from_user(&opt, optval, optlen);
-			if (err)
-				break;
-			if (inet->is_icsk) {
-				struct inet_connection_sock *icsk = inet_csk(sk);
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-				if (sk->sk_family == PF_INET ||
-				    (!((1 << sk->sk_state) &
-				       (TCPF_LISTEN | TCPF_CLOSE)) &&
-				     inet->daddr != LOOPBACK4_IPV6)) {
-#endif
-					if (inet->opt)
-						icsk->icsk_ext_hdr_len -= inet->opt->optlen;
-					if (opt)
-						icsk->icsk_ext_hdr_len += opt->optlen;
-					icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-				}
-#endif
-			}
-			opt = xchg(&inet->opt, opt);
-			kfree(opt);
-			break;
-		}
-		case IP_PKTINFO:
-			if (val)
-				inet->cmsg_flags |= IP_CMSG_PKTINFO;
-			else
-				inet->cmsg_flags &= ~IP_CMSG_PKTINFO;
-			break;
-		case IP_RECVTTL:
-			if (val)
-				inet->cmsg_flags |= IP_CMSG_TTL;
-			else
-				inet->cmsg_flags &= ~IP_CMSG_TTL;
-			break;
-		case IP_RECVTOS:
-			if (val)
-				inet->cmsg_flags |= IP_CMSG_TOS;
-			else
-				inet->cmsg_flags &= ~IP_CMSG_TOS;
-			break;
-		case IP_RECVOPTS:
-			if (val)
-				inet->cmsg_flags |= IP_CMSG_RECVOPTS;
-			else
-				inet->cmsg_flags &= ~IP_CMSG_RECVOPTS;
-			break;
-		case IP_RETOPTS:
-			if (val)
-				inet->cmsg_flags |= IP_CMSG_RETOPTS;
-			else
-				inet->cmsg_flags &= ~IP_CMSG_RETOPTS;
-			break;
-		case IP_PASSSEC:
-			if (val)
-				inet->cmsg_flags |= IP_CMSG_PASSSEC;
-			else
-				inet->cmsg_flags &= ~IP_CMSG_PASSSEC;
-			break;
-		case IP_TOS:	/* This sets both TOS and Precedence */
-			if (sk->sk_type == SOCK_STREAM) {
-				val &= ~3;
-				val |= inet->tos & 3;
-			}
-			if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP &&
-			    !capable(CAP_NET_ADMIN)) {
-				err = -EPERM;
-				break;
-			}
-			if (inet->tos != val) {
-				inet->tos = val;
-				sk->sk_priority = rt_tos2priority(val);
-				sk_dst_reset(sk);
-			}
-			break;
-		case IP_TTL:
-			if (optlen<1)
-				goto e_inval;
-			if (val != -1 && (val < 1 || val>255))
-				goto e_inval;
-			inet->uc_ttl = val;
-			break;
-		case IP_HDRINCL:
-			if (sk->sk_type != SOCK_RAW) {
-				err = -ENOPROTOOPT;
-				break;
-			}
-			inet->hdrincl = val ? 1 : 0;
-			break;
-		case IP_MTU_DISCOVER:
-			if (val<0 || val>2)
-				goto e_inval;
-			inet->pmtudisc = val;
-			break;
-		case IP_RECVERR:
-			inet->recverr = !!val;
-			if (!val)
-				skb_queue_purge(&sk->sk_error_queue);
-			break;
-		case IP_MULTICAST_TTL:
-			if (sk->sk_type == SOCK_STREAM)
-				goto e_inval;
-			if (optlen<1)
-				goto e_inval;
-			if (val==-1)
-				val = 1;
-			if (val < 0 || val > 255)
-				goto e_inval;
-			inet->mc_ttl = val;
-			break;
-		case IP_MULTICAST_LOOP:
-			if (optlen<1)
-				goto e_inval;
-			inet->mc_loop = !!val;
-			break;
-		case IP_MULTICAST_IF:
-		{
-			struct ip_mreqn mreq;
-			struct net_device *dev = NULL;
-
-			if (sk->sk_type == SOCK_STREAM)
-				goto e_inval;
-			/*
-			 *	Check the arguments are allowable
-			 */
-
-			err = -EFAULT;
-			if (optlen >= sizeof(struct ip_mreqn)) {
-				if (copy_from_user(&mreq,optval,sizeof(mreq)))
-					break;
-			} else {
-				memset(&mreq, 0, sizeof(mreq));
-				if (optlen >= sizeof(struct in_addr) &&
-				    copy_from_user(&mreq.imr_address,optval,sizeof(struct in_addr)))
-					break;
-			}
-
-			if (!mreq.imr_ifindex) {
-				if (mreq.imr_address.s_addr == INADDR_ANY) {
-					inet->mc_index = 0;
-					inet->mc_addr = 0;
-					err = 0;
-					break;
-				}
-				dev = ip_dev_find(mreq.imr_address.s_addr);
-				if (dev) {
-					mreq.imr_ifindex = dev->ifindex;
-					dev_put(dev);
-				}
-			} else
-				dev = __dev_get_by_index(mreq.imr_ifindex);
-
-
-			err = -EADDRNOTAVAIL;
-			if (!dev)
-				break;
-
-			err = -EINVAL;
-			if (sk->sk_bound_dev_if &&
-			    mreq.imr_ifindex != sk->sk_bound_dev_if)
-				break;
-
-			inet->mc_index = mreq.imr_ifindex;
-			inet->mc_addr = mreq.imr_address.s_addr;
-			err = 0;
-			break;
-		}
-
-		case IP_ADD_MEMBERSHIP:
-		case IP_DROP_MEMBERSHIP:
-		{
-			struct ip_mreqn mreq;
-
-			if (optlen < sizeof(struct ip_mreq))
-				goto e_inval;
-			err = -EFAULT;
-			if (optlen >= sizeof(struct ip_mreqn)) {
-				if(copy_from_user(&mreq,optval,sizeof(mreq)))
-					break;
-			} else {
-				memset(&mreq, 0, sizeof(mreq));
-				if (copy_from_user(&mreq,optval,sizeof(struct ip_mreq)))
-					break;
-			}
-
-			if (optname == IP_ADD_MEMBERSHIP)
-				err = ip_mc_join_group(sk, &mreq);
-			else
-				err = ip_mc_leave_group(sk, &mreq);
-			break;
-		}
-		case IP_MSFILTER:
-		{
-			extern int sysctl_igmp_max_msf;
-			struct ip_msfilter *msf;
-
-			if (optlen < IP_MSFILTER_SIZE(0))
-				goto e_inval;
-			if (optlen > sysctl_optmem_max) {
-				err = -ENOBUFS;
-				break;
-			}
-			msf = kmalloc(optlen, GFP_KERNEL);
-			if (msf == 0) {
-				err = -ENOBUFS;
-				break;
-			}
-			err = -EFAULT;
-			if (copy_from_user(msf, optval, optlen)) {
-				kfree(msf);
-				break;
-			}
-			/* numsrc >= (1G-4) overflow in 32 bits */
-			if (msf->imsf_numsrc >= 0x3ffffffcU ||
-			    msf->imsf_numsrc > sysctl_igmp_max_msf) {
-				kfree(msf);
-				err = -ENOBUFS;
-				break;
-			}
-			if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) {
-				kfree(msf);
-				err = -EINVAL;
-				break;
-			}
-			err = ip_mc_msfilter(sk, msf, 0);
-			kfree(msf);
-			break;
-		}
-		case IP_BLOCK_SOURCE:
-		case IP_UNBLOCK_SOURCE:
-		case IP_ADD_SOURCE_MEMBERSHIP:
-		case IP_DROP_SOURCE_MEMBERSHIP:
-		{
-			struct ip_mreq_source mreqs;
-			int omode, add;
-
-			if (optlen != sizeof(struct ip_mreq_source))
-				goto e_inval;
-			if (copy_from_user(&mreqs, optval, sizeof(mreqs))) {
-				err = -EFAULT;
-				break;
-			}
-			if (optname == IP_BLOCK_SOURCE) {
-				omode = MCAST_EXCLUDE;
-				add = 1;
-			} else if (optname == IP_UNBLOCK_SOURCE) {
-				omode = MCAST_EXCLUDE;
-				add = 0;
-			} else if (optname == IP_ADD_SOURCE_MEMBERSHIP) {
-				struct ip_mreqn mreq;
-
-				mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr;
-				mreq.imr_address.s_addr = mreqs.imr_interface;
-				mreq.imr_ifindex = 0;
-				err = ip_mc_join_group(sk, &mreq);
-				if (err && err != -EADDRINUSE)
-					break;
-				omode = MCAST_INCLUDE;
-				add = 1;
-			} else /* IP_DROP_SOURCE_MEMBERSHIP */ {
-				omode = MCAST_INCLUDE;
-				add = 0;
-			}
-			err = ip_mc_source(add, omode, sk, &mreqs, 0);
-			break;
-		}
-		case MCAST_JOIN_GROUP:
-		case MCAST_LEAVE_GROUP:
-		{
-			struct group_req greq;
-			struct sockaddr_in *psin;
-			struct ip_mreqn mreq;
-
-			if (optlen < sizeof(struct group_req))
-				goto e_inval;
-			err = -EFAULT;
-			if(copy_from_user(&greq, optval, sizeof(greq)))
-				break;
-			psin = (struct sockaddr_in *)&greq.gr_group;
-			if (psin->sin_family != AF_INET)
-				goto e_inval;
-			memset(&mreq, 0, sizeof(mreq));
-			mreq.imr_multiaddr = psin->sin_addr;
-			mreq.imr_ifindex = greq.gr_interface;
-
-			if (optname == MCAST_JOIN_GROUP)
-				err = ip_mc_join_group(sk, &mreq);
-			else
-				err = ip_mc_leave_group(sk, &mreq);
-			break;
-		}
-		case MCAST_JOIN_SOURCE_GROUP:
-		case MCAST_LEAVE_SOURCE_GROUP:
-		case MCAST_BLOCK_SOURCE:
-		case MCAST_UNBLOCK_SOURCE:
-		{
-			struct group_source_req greqs;
-			struct ip_mreq_source mreqs;
-			struct sockaddr_in *psin;
-			int omode, add;
-
-			if (optlen != sizeof(struct group_source_req))
-				goto e_inval;
-			if (copy_from_user(&greqs, optval, sizeof(greqs))) {
-				err = -EFAULT;
-				break;
-			}
-			if (greqs.gsr_group.ss_family != AF_INET ||
-			    greqs.gsr_source.ss_family != AF_INET) {
-				err = -EADDRNOTAVAIL;
-				break;
-			}
-			psin = (struct sockaddr_in *)&greqs.gsr_group;
-			mreqs.imr_multiaddr = psin->sin_addr.s_addr;
-			psin = (struct sockaddr_in *)&greqs.gsr_source;
-			mreqs.imr_sourceaddr = psin->sin_addr.s_addr;
-			mreqs.imr_interface = 0; /* use index for mc_source */
-
-			if (optname == MCAST_BLOCK_SOURCE) {
-				omode = MCAST_EXCLUDE;
-				add = 1;
-			} else if (optname == MCAST_UNBLOCK_SOURCE) {
-				omode = MCAST_EXCLUDE;
-				add = 0;
-			} else if (optname == MCAST_JOIN_SOURCE_GROUP) {
-				struct ip_mreqn mreq;
-
-				psin = (struct sockaddr_in *)&greqs.gsr_group;
-				mreq.imr_multiaddr = psin->sin_addr;
-				mreq.imr_address.s_addr = 0;
-				mreq.imr_ifindex = greqs.gsr_interface;
-				err = ip_mc_join_group(sk, &mreq);
-				if (err && err != -EADDRINUSE)
-					break;
-				greqs.gsr_interface = mreq.imr_ifindex;
-				omode = MCAST_INCLUDE;
-				add = 1;
-			} else /* MCAST_LEAVE_SOURCE_GROUP */ {
-				omode = MCAST_INCLUDE;
-				add = 0;
-			}
-			err = ip_mc_source(add, omode, sk, &mreqs,
-					   greqs.gsr_interface);
-			break;
-		}
-		case MCAST_MSFILTER:
-		{
-			extern int sysctl_igmp_max_msf;
-			struct sockaddr_in *psin;
-			struct ip_msfilter *msf = NULL;
-			struct group_filter *gsf = NULL;
-			int msize, i, ifindex;
-
-			if (optlen < GROUP_FILTER_SIZE(0))
-				goto e_inval;
-			if (optlen > sysctl_optmem_max) {
-				err = -ENOBUFS;
-				break;
-			}
-			gsf = kmalloc(optlen,GFP_KERNEL);
-			if (gsf == 0) {
-				err = -ENOBUFS;
-				break;
-			}
-			err = -EFAULT;
-			if (copy_from_user(gsf, optval, optlen)) {
-				goto mc_msf_out;
-			}
-			/* numsrc >= (4G-140)/128 overflow in 32 bits */
-			if (gsf->gf_numsrc >= 0x1ffffff ||
-			    gsf->gf_numsrc > sysctl_igmp_max_msf) {
-				err = -ENOBUFS;
-				goto mc_msf_out;
-			}
-			if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) {
-				err = -EINVAL;
-				goto mc_msf_out;
-			}
-			msize = IP_MSFILTER_SIZE(gsf->gf_numsrc);
-			msf = kmalloc(msize,GFP_KERNEL);
-			if (msf == 0) {
-				err = -ENOBUFS;
-				goto mc_msf_out;
-			}
-			ifindex = gsf->gf_interface;
-			psin = (struct sockaddr_in *)&gsf->gf_group;
-			if (psin->sin_family != AF_INET) {
-				err = -EADDRNOTAVAIL;
-				goto mc_msf_out;
-			}
-			msf->imsf_multiaddr = psin->sin_addr.s_addr;
-			msf->imsf_interface = 0;
-			msf->imsf_fmode = gsf->gf_fmode;
-			msf->imsf_numsrc = gsf->gf_numsrc;
-			err = -EADDRNOTAVAIL;
-			for (i=0; i<gsf->gf_numsrc; ++i) {
-				psin = (struct sockaddr_in *)&gsf->gf_slist[i];
-
-				if (psin->sin_family != AF_INET)
-					goto mc_msf_out;
-				msf->imsf_slist[i] = psin->sin_addr.s_addr;
-			}
-			kfree(gsf);
-			gsf = NULL;
-
-			err = ip_mc_msfilter(sk, msf, ifindex);
-mc_msf_out:
-			kfree(msf);
-			kfree(gsf);
-			break;
-		}
-		case IP_ROUTER_ALERT:
-			err = ip_ra_control(sk, val ? 1 : 0, NULL);
-			break;
-
-		case IP_FREEBIND:
-			if (optlen<1)
-				goto e_inval;
-			inet->freebind = !!val;
-			break;
-
-		case IP_IPSEC_POLICY:
-		case IP_XFRM_POLICY:
-			err = -EPERM;
-			if (!capable(CAP_NET_ADMIN))
-				break;
-			err = xfrm_user_policy(sk, optname, optval, optlen);
-			break;
-
-		default:
-			err = -ENOPROTOOPT;
-			break;
+	case IP_OPTIONS:
+	{
+		struct ip_options * opt = NULL;
+		if (optlen > 40 || optlen < 0)
+			goto e_inval;
+		err = ip_options_get_from_user(&opt, optval, optlen);
+		if (err)
+			break;
+		if (inet->is_icsk) {
+			struct inet_connection_sock *icsk = inet_csk(sk);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+			if (sk->sk_family == PF_INET ||
+			    (!((1 << sk->sk_state) &
+			       (TCPF_LISTEN | TCPF_CLOSE)) &&
+			     inet->daddr != LOOPBACK4_IPV6)) {
+#endif
+				if (inet->opt)
+					icsk->icsk_ext_hdr_len -= inet->opt->optlen;
+				if (opt)
+					icsk->icsk_ext_hdr_len += opt->optlen;
+				icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+			}
+#endif
+		}
+		opt = xchg(&inet->opt, opt);
+		kfree(opt);
+		break;
+	}
+	case IP_PKTINFO:
+		if (val)
+			inet->cmsg_flags |= IP_CMSG_PKTINFO;
+		else
+			inet->cmsg_flags &= ~IP_CMSG_PKTINFO;
+		break;
+	case IP_RECVTTL:
+		if (val)
+			inet->cmsg_flags |= IP_CMSG_TTL;
+		else
+			inet->cmsg_flags &= ~IP_CMSG_TTL;
+		break;
+	case IP_RECVTOS:
+		if (val)
+			inet->cmsg_flags |= IP_CMSG_TOS;
+		else
+			inet->cmsg_flags &= ~IP_CMSG_TOS;
+		break;
+	case IP_RECVOPTS:
+		if (val)
+			inet->cmsg_flags |= IP_CMSG_RECVOPTS;
+		else
+			inet->cmsg_flags &= ~IP_CMSG_RECVOPTS;
+		break;
+	case IP_RETOPTS:
+		if (val)
+			inet->cmsg_flags |= IP_CMSG_RETOPTS;
+		else
+			inet->cmsg_flags &= ~IP_CMSG_RETOPTS;
+		break;
+	case IP_PASSSEC:
+		if (val)
+			inet->cmsg_flags |= IP_CMSG_PASSSEC;
+		else
+			inet->cmsg_flags &= ~IP_CMSG_PASSSEC;
+		break;
+	case IP_TOS:	/* This sets both TOS and Precedence */
+		if (sk->sk_type == SOCK_STREAM) {
+			val &= ~3;
+			val |= inet->tos & 3;
+		}
+		if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP &&
+		    !capable(CAP_NET_ADMIN)) {
+			err = -EPERM;
+			break;
+		}
+		if (inet->tos != val) {
+			inet->tos = val;
+			sk->sk_priority = rt_tos2priority(val);
+			sk_dst_reset(sk);
+		}
+		break;
+	case IP_TTL:
+		if (optlen<1)
+			goto e_inval;
+		if (val != -1 && (val < 1 || val>255))
+			goto e_inval;
+		inet->uc_ttl = val;
+		break;
+	case IP_HDRINCL:
+		if (sk->sk_type != SOCK_RAW) {
+			err = -ENOPROTOOPT;
+			break;
+		}
+		inet->hdrincl = val ? 1 : 0;
+		break;
+	case IP_MTU_DISCOVER:
+		if (val<0 || val>3)
+			goto e_inval;
+		inet->pmtudisc = val;
+		break;
+	case IP_RECVERR:
+		inet->recverr = !!val;
+		if (!val)
+			skb_queue_purge(&sk->sk_error_queue);
+		break;
+	case IP_MULTICAST_TTL:
+		if (sk->sk_type == SOCK_STREAM)
+			goto e_inval;
+		if (optlen<1)
+			goto e_inval;
+		if (val==-1)
+			val = 1;
+		if (val < 0 || val > 255)
+			goto e_inval;
+		inet->mc_ttl = val;
+		break;
+	case IP_MULTICAST_LOOP:
+		if (optlen<1)
+			goto e_inval;
+		inet->mc_loop = !!val;
+		break;
+	case IP_MULTICAST_IF:
+	{
+		struct ip_mreqn mreq;
+		struct net_device *dev = NULL;
+
+		if (sk->sk_type == SOCK_STREAM)
+			goto e_inval;
+		/*
+		 *	Check the arguments are allowable
+		 */
+
+		err = -EFAULT;
+		if (optlen >= sizeof(struct ip_mreqn)) {
+			if (copy_from_user(&mreq,optval,sizeof(mreq)))
+				break;
+		} else {
+			memset(&mreq, 0, sizeof(mreq));
+			if (optlen >= sizeof(struct in_addr) &&
+			    copy_from_user(&mreq.imr_address,optval,sizeof(struct in_addr)))
+				break;
+		}
+
+		if (!mreq.imr_ifindex) {
+			if (mreq.imr_address.s_addr == INADDR_ANY) {
+				inet->mc_index = 0;
+				inet->mc_addr = 0;
+				err = 0;
+				break;
+			}
+			dev = ip_dev_find(mreq.imr_address.s_addr);
+			if (dev) {
+				mreq.imr_ifindex = dev->ifindex;
+				dev_put(dev);
+			}
+		} else
+			dev = __dev_get_by_index(mreq.imr_ifindex);
+
+
+		err = -EADDRNOTAVAIL;
+		if (!dev)
+			break;
+
+		err = -EINVAL;
+		if (sk->sk_bound_dev_if &&
+		    mreq.imr_ifindex != sk->sk_bound_dev_if)
+			break;
+
+		inet->mc_index = mreq.imr_ifindex;
+		inet->mc_addr = mreq.imr_address.s_addr;
+		err = 0;
+		break;
+	}
+
+	case IP_ADD_MEMBERSHIP:
+	case IP_DROP_MEMBERSHIP:
+	{
+		struct ip_mreqn mreq;
+
+		if (optlen < sizeof(struct ip_mreq))
+			goto e_inval;
+		err = -EFAULT;
+		if (optlen >= sizeof(struct ip_mreqn)) {
+			if (copy_from_user(&mreq,optval,sizeof(mreq)))
+				break;
+		} else {
+			memset(&mreq, 0, sizeof(mreq));
+			if (copy_from_user(&mreq,optval,sizeof(struct ip_mreq)))
+				break;
+		}
+
+		if (optname == IP_ADD_MEMBERSHIP)
+			err = ip_mc_join_group(sk, &mreq);
+		else
+			err = ip_mc_leave_group(sk, &mreq);
+		break;
+	}
+	case IP_MSFILTER:
+	{
+		extern int sysctl_igmp_max_msf;
+		struct ip_msfilter *msf;
+
+		if (optlen < IP_MSFILTER_SIZE(0))
+			goto e_inval;
+		if (optlen > sysctl_optmem_max) {
+			err = -ENOBUFS;
+			break;
+		}
+		msf = kmalloc(optlen, GFP_KERNEL);
+		if (msf == 0) {
+			err = -ENOBUFS;
+			break;
+		}
+		err = -EFAULT;
+		if (copy_from_user(msf, optval, optlen)) {
+			kfree(msf);
+			break;
+		}
+		/* numsrc >= (1G-4) overflow in 32 bits */
+		if (msf->imsf_numsrc >= 0x3ffffffcU ||
+		    msf->imsf_numsrc > sysctl_igmp_max_msf) {
+			kfree(msf);
+			err = -ENOBUFS;
+			break;
+		}
+		if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) {
+			kfree(msf);
+			err = -EINVAL;
+			break;
+		}
+		err = ip_mc_msfilter(sk, msf, 0);
+		kfree(msf);
+		break;
+	}
+	case IP_BLOCK_SOURCE:
+	case IP_UNBLOCK_SOURCE:
+	case IP_ADD_SOURCE_MEMBERSHIP:
+	case IP_DROP_SOURCE_MEMBERSHIP:
+	{
+		struct ip_mreq_source mreqs;
+		int omode, add;
+
+		if (optlen != sizeof(struct ip_mreq_source))
+			goto e_inval;
+		if (copy_from_user(&mreqs, optval, sizeof(mreqs))) {
+			err = -EFAULT;
+			break;
+		}
+		if (optname == IP_BLOCK_SOURCE) {
+			omode = MCAST_EXCLUDE;
+			add = 1;
+		} else if (optname == IP_UNBLOCK_SOURCE) {
+			omode = MCAST_EXCLUDE;
+			add = 0;
+		} else if (optname == IP_ADD_SOURCE_MEMBERSHIP) {
+			struct ip_mreqn mreq;
+
+			mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr;
+			mreq.imr_address.s_addr = mreqs.imr_interface;
+			mreq.imr_ifindex = 0;
+			err = ip_mc_join_group(sk, &mreq);
+			if (err && err != -EADDRINUSE)
+				break;
+			omode = MCAST_INCLUDE;
+			add = 1;
+		} else /* IP_DROP_SOURCE_MEMBERSHIP */ {
+			omode = MCAST_INCLUDE;
+			add = 0;
+		}
+		err = ip_mc_source(add, omode, sk, &mreqs, 0);
+		break;
+	}
+	case MCAST_JOIN_GROUP:
+	case MCAST_LEAVE_GROUP:
+	{
+		struct group_req greq;
+		struct sockaddr_in *psin;
+		struct ip_mreqn mreq;
+
+		if (optlen < sizeof(struct group_req))
+			goto e_inval;
+		err = -EFAULT;
+		if (copy_from_user(&greq, optval, sizeof(greq)))
+			break;
+		psin = (struct sockaddr_in *)&greq.gr_group;
+		if (psin->sin_family != AF_INET)
+			goto e_inval;
+		memset(&mreq, 0, sizeof(mreq));
+		mreq.imr_multiaddr = psin->sin_addr;
+		mreq.imr_ifindex = greq.gr_interface;
+
+		if (optname == MCAST_JOIN_GROUP)
+			err = ip_mc_join_group(sk, &mreq);
+		else
+			err = ip_mc_leave_group(sk, &mreq);
+		break;
+	}
+	case MCAST_JOIN_SOURCE_GROUP:
+	case MCAST_LEAVE_SOURCE_GROUP:
+	case MCAST_BLOCK_SOURCE:
+	case MCAST_UNBLOCK_SOURCE:
+	{
+		struct group_source_req greqs;
+		struct ip_mreq_source mreqs;
+		struct sockaddr_in *psin;
+		int omode, add;
+
+		if (optlen != sizeof(struct group_source_req))
+			goto e_inval;
+		if (copy_from_user(&greqs, optval, sizeof(greqs))) {
+			err = -EFAULT;
+			break;
+		}
+		if (greqs.gsr_group.ss_family != AF_INET ||
+		    greqs.gsr_source.ss_family != AF_INET) {
+			err = -EADDRNOTAVAIL;
+			break;
+		}
+		psin = (struct sockaddr_in *)&greqs.gsr_group;
+		mreqs.imr_multiaddr = psin->sin_addr.s_addr;
+		psin = (struct sockaddr_in *)&greqs.gsr_source;
+		mreqs.imr_sourceaddr = psin->sin_addr.s_addr;
+		mreqs.imr_interface = 0; /* use index for mc_source */
+
+		if (optname == MCAST_BLOCK_SOURCE) {
+			omode = MCAST_EXCLUDE;
+			add = 1;
+		} else if (optname == MCAST_UNBLOCK_SOURCE) {
+			omode = MCAST_EXCLUDE;
+			add = 0;
+		} else if (optname == MCAST_JOIN_SOURCE_GROUP) {
+			struct ip_mreqn mreq;
+
+			psin = (struct sockaddr_in *)&greqs.gsr_group;
+			mreq.imr_multiaddr = psin->sin_addr;
+			mreq.imr_address.s_addr = 0;
+			mreq.imr_ifindex = greqs.gsr_interface;
+			err = ip_mc_join_group(sk, &mreq);
+			if (err && err != -EADDRINUSE)
+				break;
+			greqs.gsr_interface = mreq.imr_ifindex;
+			omode = MCAST_INCLUDE;
+			add = 1;
+		} else /* MCAST_LEAVE_SOURCE_GROUP */ {
+			omode = MCAST_INCLUDE;
+			add = 0;
+		}
+		err = ip_mc_source(add, omode, sk, &mreqs,
+				   greqs.gsr_interface);
+		break;
+	}
+	case MCAST_MSFILTER:
+	{
+		extern int sysctl_igmp_max_msf;
+		struct sockaddr_in *psin;
+		struct ip_msfilter *msf = NULL;
+		struct group_filter *gsf = NULL;
+		int msize, i, ifindex;
+
+		if (optlen < GROUP_FILTER_SIZE(0))
+			goto e_inval;
+		if (optlen > sysctl_optmem_max) {
+			err = -ENOBUFS;
+			break;
+		}
+		gsf = kmalloc(optlen,GFP_KERNEL);
+		if (gsf == 0) {
+			err = -ENOBUFS;
+			break;
+		}
+		err = -EFAULT;
+		if (copy_from_user(gsf, optval, optlen)) {
+			goto mc_msf_out;
+		}
+		/* numsrc >= (4G-140)/128 overflow in 32 bits */
+		if (gsf->gf_numsrc >= 0x1ffffff ||
+		    gsf->gf_numsrc > sysctl_igmp_max_msf) {
+			err = -ENOBUFS;
+			goto mc_msf_out;
+		}
+		if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) {
+			err = -EINVAL;
+			goto mc_msf_out;
+		}
+		msize = IP_MSFILTER_SIZE(gsf->gf_numsrc);
+		msf = kmalloc(msize,GFP_KERNEL);
+		if (msf == 0) {
+			err = -ENOBUFS;
+			goto mc_msf_out;
+		}
+		ifindex = gsf->gf_interface;
+		psin = (struct sockaddr_in *)&gsf->gf_group;
+		if (psin->sin_family != AF_INET) {
+			err = -EADDRNOTAVAIL;
+			goto mc_msf_out;
+		}
+		msf->imsf_multiaddr = psin->sin_addr.s_addr;
+		msf->imsf_interface = 0;
+		msf->imsf_fmode = gsf->gf_fmode;
+		msf->imsf_numsrc = gsf->gf_numsrc;
+		err = -EADDRNOTAVAIL;
+		for (i=0; i<gsf->gf_numsrc; ++i) {
+			psin = (struct sockaddr_in *)&gsf->gf_slist[i];
+
+			if (psin->sin_family != AF_INET)
+				goto mc_msf_out;
+			msf->imsf_slist[i] = psin->sin_addr.s_addr;
+		}
+		kfree(gsf);
+		gsf = NULL;
+
+		err = ip_mc_msfilter(sk, msf, ifindex);
+	mc_msf_out:
+		kfree(msf);
+		kfree(gsf);
+		break;
+	}
+	case IP_ROUTER_ALERT:
+		err = ip_ra_control(sk, val ? 1 : 0, NULL);
+		break;
+
+	case IP_FREEBIND:
+		if (optlen<1)
+			goto e_inval;
+		inet->freebind = !!val;
+		break;
+
+	case IP_IPSEC_POLICY:
+	case IP_XFRM_POLICY:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			break;
+		err = xfrm_user_policy(sk, optname, optval, optlen);
+		break;
+
+	default:
+		err = -ENOPROTOOPT;
+		break;
 	}
 	release_sock(sk);
 	return err;
@@ -948,214 +954,213 @@ EXPORT_SYMBOL(compat_ip_setsockopt);
  */
 
 static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 			    char __user *optval, int __user *optlen)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	int val;
 	int len;
 
-	if(level!=SOL_IP)
+	if (level != SOL_IP)
 		return -EOPNOTSUPP;
 
 #ifdef CONFIG_IP_MROUTE
-	if(optname>=MRT_BASE && optname <=MRT_BASE+10)
-	{
+	if (optname >= MRT_BASE && optname <= MRT_BASE+10) {
 		return ip_mroute_getsockopt(sk,optname,optval,optlen);
 	}
 #endif
 
-	if(get_user(len,optlen))
+	if (get_user(len,optlen))
 		return -EFAULT;
-	if(len < 0)
+	if (len < 0)
 		return -EINVAL;
 
 	lock_sock(sk);
 
-	switch(optname) {
-		case IP_OPTIONS:
-		{
-			unsigned char optbuf[sizeof(struct ip_options)+40];
-			struct ip_options * opt = (struct ip_options*)optbuf;
-			opt->optlen = 0;
-			if (inet->opt)
-				memcpy(optbuf, inet->opt,
-				       sizeof(struct ip_options)+
-				       inet->opt->optlen);
-			release_sock(sk);
-
-			if (opt->optlen == 0)
-				return put_user(0, optlen);
-
-			ip_options_undo(opt);
-
-			len = min_t(unsigned int, len, opt->optlen);
-			if(put_user(len, optlen))
-				return -EFAULT;
-			if(copy_to_user(optval, opt->__data, len))
-				return -EFAULT;
-			return 0;
-		}
-		case IP_PKTINFO:
-			val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0;
-			break;
-		case IP_RECVTTL:
-			val = (inet->cmsg_flags & IP_CMSG_TTL) != 0;
-			break;
-		case IP_RECVTOS:
-			val = (inet->cmsg_flags & IP_CMSG_TOS) != 0;
-			break;
-		case IP_RECVOPTS:
-			val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0;
-			break;
-		case IP_RETOPTS:
-			val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0;
-			break;
-		case IP_PASSSEC:
-			val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0;
-			break;
-		case IP_TOS:
-			val = inet->tos;
-			break;
-		case IP_TTL:
-			val = (inet->uc_ttl == -1 ?
-			       sysctl_ip_default_ttl :
-			       inet->uc_ttl);
-			break;
-		case IP_HDRINCL:
-			val = inet->hdrincl;
-			break;
-		case IP_MTU_DISCOVER:
-			val = inet->pmtudisc;
-			break;
-		case IP_MTU:
-		{
-			struct dst_entry *dst;
-			val = 0;
-			dst = sk_dst_get(sk);
-			if (dst) {
-				val = dst_mtu(dst);
-				dst_release(dst);
-			}
-			if (!val) {
-				release_sock(sk);
-				return -ENOTCONN;
-			}
-			break;
-		}
-		case IP_RECVERR:
-			val = inet->recverr;
-			break;
-		case IP_MULTICAST_TTL:
-			val = inet->mc_ttl;
-			break;
-		case IP_MULTICAST_LOOP:
-			val = inet->mc_loop;
-			break;
-		case IP_MULTICAST_IF:
-		{
-			struct in_addr addr;
-			len = min_t(unsigned int, len, sizeof(struct in_addr));
-			addr.s_addr = inet->mc_addr;
-			release_sock(sk);
-
-			if(put_user(len, optlen))
-				return -EFAULT;
-			if(copy_to_user(optval, &addr, len))
-				return -EFAULT;
-			return 0;
-		}
-		case IP_MSFILTER:
-		{
-			struct ip_msfilter msf;
-			int err;
-
-			if (len < IP_MSFILTER_SIZE(0)) {
-				release_sock(sk);
-				return -EINVAL;
-			}
-			if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
-				release_sock(sk);
-				return -EFAULT;
-			}
-			err = ip_mc_msfget(sk, &msf,
-				(struct ip_msfilter __user *)optval, optlen);
-			release_sock(sk);
-			return err;
-		}
-		case MCAST_MSFILTER:
-		{
-			struct group_filter gsf;
-			int err;
-
-			if (len < GROUP_FILTER_SIZE(0)) {
-				release_sock(sk);
-				return -EINVAL;
-			}
-			if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) {
-				release_sock(sk);
-				return -EFAULT;
-			}
-			err = ip_mc_gsfget(sk, &gsf,
-				(struct group_filter __user *)optval, optlen);
-			release_sock(sk);
-			return err;
-		}
-		case IP_PKTOPTIONS:
-		{
-			struct msghdr msg;
-
-			release_sock(sk);
-
-			if (sk->sk_type != SOCK_STREAM)
-				return -ENOPROTOOPT;
-
-			msg.msg_control = optval;
-			msg.msg_controllen = len;
-			msg.msg_flags = 0;
-
-			if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
-				struct in_pktinfo info;
-
-				info.ipi_addr.s_addr = inet->rcv_saddr;
-				info.ipi_spec_dst.s_addr = inet->rcv_saddr;
-				info.ipi_ifindex = inet->mc_index;
-				put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
-			}
-			if (inet->cmsg_flags & IP_CMSG_TTL) {
-				int hlim = inet->mc_ttl;
-				put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
-			}
-			len -= msg.msg_controllen;
-			return put_user(len, optlen);
-		}
-		case IP_FREEBIND:
-			val = inet->freebind;
-			break;
-		default:
-			release_sock(sk);
-			return -ENOPROTOOPT;
+	switch (optname) {
+	case IP_OPTIONS:
+	{
+		unsigned char optbuf[sizeof(struct ip_options)+40];
+		struct ip_options * opt = (struct ip_options*)optbuf;
+		opt->optlen = 0;
+		if (inet->opt)
+			memcpy(optbuf, inet->opt,
+			       sizeof(struct ip_options)+
+			       inet->opt->optlen);
+		release_sock(sk);
+
+		if (opt->optlen == 0)
+			return put_user(0, optlen);
+
+		ip_options_undo(opt);
+
+		len = min_t(unsigned int, len, opt->optlen);
+		if (put_user(len, optlen))
+			return -EFAULT;
+		if (copy_to_user(optval, opt->__data, len))
+			return -EFAULT;
+		return 0;
+	}
+	case IP_PKTINFO:
+		val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0;
+		break;
+	case IP_RECVTTL:
+		val = (inet->cmsg_flags & IP_CMSG_TTL) != 0;
+		break;
+	case IP_RECVTOS:
+		val = (inet->cmsg_flags & IP_CMSG_TOS) != 0;
+		break;
+	case IP_RECVOPTS:
+		val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0;
+		break;
+	case IP_RETOPTS:
+		val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0;
+		break;
+	case IP_PASSSEC:
+		val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0;
+		break;
+	case IP_TOS:
+		val = inet->tos;
+		break;
+	case IP_TTL:
+		val = (inet->uc_ttl == -1 ?
+		       sysctl_ip_default_ttl :
+		       inet->uc_ttl);
+		break;
+	case IP_HDRINCL:
+		val = inet->hdrincl;
+		break;
+	case IP_MTU_DISCOVER:
+		val = inet->pmtudisc;
+		break;
+	case IP_MTU:
+	{
+		struct dst_entry *dst;
+		val = 0;
+		dst = sk_dst_get(sk);
+		if (dst) {
+			val = dst_mtu(dst);
+			dst_release(dst);
+		}
+		if (!val) {
+			release_sock(sk);
+			return -ENOTCONN;
+		}
+		break;
+	}
+	case IP_RECVERR:
+		val = inet->recverr;
+		break;
+	case IP_MULTICAST_TTL:
+		val = inet->mc_ttl;
+		break;
+	case IP_MULTICAST_LOOP:
+		val = inet->mc_loop;
+		break;
+	case IP_MULTICAST_IF:
+	{
+		struct in_addr addr;
+		len = min_t(unsigned int, len, sizeof(struct in_addr));
+		addr.s_addr = inet->mc_addr;
+		release_sock(sk);
+
+		if (put_user(len, optlen))
+			return -EFAULT;
+		if (copy_to_user(optval, &addr, len))
+			return -EFAULT;
+		return 0;
+	}
+	case IP_MSFILTER:
+	{
+		struct ip_msfilter msf;
+		int err;
+
+		if (len < IP_MSFILTER_SIZE(0)) {
+			release_sock(sk);
+			return -EINVAL;
+		}
+		if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
+			release_sock(sk);
+			return -EFAULT;
+		}
+		err = ip_mc_msfget(sk, &msf,
+				   (struct ip_msfilter __user *)optval, optlen);
+		release_sock(sk);
+		return err;
+	}
+	case MCAST_MSFILTER:
+	{
+		struct group_filter gsf;
+		int err;
+
+		if (len < GROUP_FILTER_SIZE(0)) {
+			release_sock(sk);
+			return -EINVAL;
+		}
+		if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) {
+			release_sock(sk);
+			return -EFAULT;
+		}
+		err = ip_mc_gsfget(sk, &gsf,
+				   (struct group_filter __user *)optval, optlen);
+		release_sock(sk);
+		return err;
+	}
+	case IP_PKTOPTIONS:
+	{
+		struct msghdr msg;
+
+		release_sock(sk);
+
+		if (sk->sk_type != SOCK_STREAM)
+			return -ENOPROTOOPT;
+
+		msg.msg_control = optval;
+		msg.msg_controllen = len;
+		msg.msg_flags = 0;
+
+		if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
+			struct in_pktinfo info;
+
+			info.ipi_addr.s_addr = inet->rcv_saddr;
+			info.ipi_spec_dst.s_addr = inet->rcv_saddr;
+			info.ipi_ifindex = inet->mc_index;
+			put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
+		}
+		if (inet->cmsg_flags & IP_CMSG_TTL) {
+			int hlim = inet->mc_ttl;
+			put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
+		}
+		len -= msg.msg_controllen;
+		return put_user(len, optlen);
+	}
+	case IP_FREEBIND:
+		val = inet->freebind;
+		break;
+	default:
+		release_sock(sk);
+		return -ENOPROTOOPT;
 	}
 	release_sock(sk);
 
 	if (len < sizeof(int) && len > 0 && val>=0 && val<255) {
 		unsigned char ucval = (unsigned char)val;
 		len = 1;
-		if(put_user(len, optlen))
+		if (put_user(len, optlen))
 			return -EFAULT;
-		if(copy_to_user(optval,&ucval,1))
+		if (copy_to_user(optval,&ucval,1))
 			return -EFAULT;
 	} else {
 		len = min_t(unsigned int, sizeof(int), len);
-		if(put_user(len, optlen))
+		if (put_user(len, optlen))
 			return -EFAULT;
-		if(copy_to_user(optval,&val,len))
+		if (copy_to_user(optval,&val,len))
 			return -EFAULT;
 	}
 	return 0;
 }
 
 int ip_getsockopt(struct sock *sk, int level,
 		   int optname, char __user *optval, int __user *optlen)
 {
 	int err;
 
@@ -1169,7 +1174,7 @@ int ip_getsockopt(struct sock *sk, int level,
 	    ) {
 		int len;
 
-		if(get_user(len,optlen))
+		if (get_user(len,optlen))
 			return -EFAULT;
 
 		lock_sock(sk);
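
The widened bound in IP_MTU_DISCOVER above (val>2 becoming val>3) admits the new probing mode that pairs with the IP_PMTUDISC_PROBE handling added to ip_append_data(): DF stays set, but frames are sized against the device MTU rather than the cached path MTU. A hedged userspace sketch; IP_PMTUDISC_PROBE is assumed to be 3, matching the new upper bound, and is guarded in case the installed headers predate this change:

	#include <stdio.h>
	#include <netinet/in.h>
	#include <sys/socket.h>

	#ifndef IP_PMTUDISC_PROBE
	#define IP_PMTUDISC_PROBE 3	/* assumed value, per the val>3 check above */
	#endif

	/* Put a socket into PMTU probing mode. */
	int set_pmtu_probe(int fd)
	{
		int val = IP_PMTUDISC_PROBE;

		if (setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER,
			       &val, sizeof(val)) < 0) {
			perror("setsockopt(IP_MTU_DISCOVER)");
			return -1;
		}
		return 0;
	}
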
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index aa704b88f014..ab86137c71d2 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -43,21 +43,15 @@ static LIST_HEAD(ipcomp_tfms_list);
 
 static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
 {
-	int err, plen, dlen;
 	struct ipcomp_data *ipcd = x->data;
-	u8 *start, *scratch;
-	struct crypto_comp *tfm;
-	int cpu;
-
-	plen = skb->len;
-	dlen = IPCOMP_SCRATCH_SIZE;
-	start = skb->data;
-
-	cpu = get_cpu();
-	scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
-	tfm = *per_cpu_ptr(ipcd->tfms, cpu);
-
-	err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen);
+	const int plen = skb->len;
+	int dlen = IPCOMP_SCRATCH_SIZE;
+	const u8 *start = skb->data;
+	const int cpu = get_cpu();
+	u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
+	struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu);
+	int err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen);
+
 	if (err)
 		goto out;
 
@@ -72,7 +66,7 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
 
 	skb->truesize += dlen - plen;
 	__skb_put(skb, dlen - plen);
-	memcpy(skb->data, scratch, dlen);
+	skb_copy_to_linear_data(skb, scratch, dlen);
 out:
 	put_cpu();
 	return err;
@@ -90,10 +84,10 @@ static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb)
 	skb->ip_summed = CHECKSUM_NONE;
 
 	/* Remove ipcomp header and decompress original payload */
-	iph = skb->nh.iph;
+	iph = ip_hdr(skb);
 	ipch = (void *)skb->data;
 	iph->protocol = ipch->nexthdr;
-	skb->h.raw = skb->nh.raw + sizeof(*ipch);
+	skb->transport_header = skb->network_header + sizeof(*ipch);
 	__skb_pull(skb, sizeof(*ipch));
 	err = ipcomp_decompress(x, skb);
 
@@ -103,23 +97,16 @@ out:
 
 static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb)
 {
-	int err, plen, dlen, ihlen;
-	struct iphdr *iph = skb->nh.iph;
 	struct ipcomp_data *ipcd = x->data;
-	u8 *start, *scratch;
-	struct crypto_comp *tfm;
-	int cpu;
+	const int ihlen = ip_hdrlen(skb);
+	const int plen = skb->len - ihlen;
+	int dlen = IPCOMP_SCRATCH_SIZE;
+	u8 *start = skb->data + ihlen;
+	const int cpu = get_cpu();
+	u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
+	struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu);
+	int err = crypto_comp_compress(tfm, start, plen, scratch, &dlen);
 
-	ihlen = iph->ihl * 4;
-	plen = skb->len - ihlen;
-	dlen = IPCOMP_SCRATCH_SIZE;
-	start = skb->data + ihlen;
-
-	cpu = get_cpu();
-	scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
-	tfm = *per_cpu_ptr(ipcd->tfms, cpu);
-
-	err = crypto_comp_compress(tfm, start, plen, scratch, &dlen);
 	if (err)
 		goto out;
 
@@ -142,12 +129,11 @@ out:
 static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb)
 {
 	int err;
-	struct iphdr *iph;
 	struct ip_comp_hdr *ipch;
 	struct ipcomp_data *ipcd = x->data;
 	int hdr_len = 0;
+	struct iphdr *iph = ip_hdr(skb);
 
-	iph = skb->nh.iph;
 	iph->tot_len = htons(skb->len);
 	hdr_len = iph->ihl * 4;
 	if ((skb->len - hdr_len) < ipcd->threshold) {
@@ -159,7 +145,7 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb)
 		goto out_ok;
 
 	err = ipcomp_compress(x, skb);
-	iph = skb->nh.iph;
+	iph = ip_hdr(skb);
 
 	if (err) {
 		goto out_ok;
@@ -188,8 +174,8 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
 	struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
 	struct xfrm_state *x;
 
-	if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
-	    skb->h.icmph->code != ICMP_FRAG_NEEDED)
+	if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
+	    icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
 		return;
 
 	spi = htonl(ntohs(ipch->cpi));
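
The rewritten ipcomp_decompress() and ipcomp_compress() above both reduce to the same shape: pin the current CPU, use that CPU's preallocated scratch buffer and crypto_comp transform, run the transform, then unpin. An illustrative reduction of that shape; the names are taken from the ipcomp code above, but the helper itself is not a function in the patch:

	/* Decompress src into this CPU's scratch buffer; returns the
	 * decompressed length, or a negative error. */
	static int percpu_decompress(struct ipcomp_data *ipcd,
				     const u8 *src, int slen)
	{
		const int cpu = get_cpu();	/* disables preemption: scratch is per CPU */
		u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
		struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu);
		int dlen = IPCOMP_SCRATCH_SIZE;
		int err = crypto_comp_decompress(tfm, src, slen, scratch, &dlen);

		put_cpu();			/* re-enable preemption */
		return err ? err : dlen;
	}
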
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index cf49de1a4983..597c800b2fdc 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -432,7 +432,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
 		goto drop;
 
 	/* Basic sanity checks can be done without the lock. */
-	rarp = (struct arphdr *)skb->h.raw;
+	rarp = (struct arphdr *)skb_transport_header(skb);
 
 	/* If this test doesn't pass, it's not IP, or we should
 	 * ignore it anyway.
@@ -455,7 +455,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
 		goto drop;
 
 	/* OK, it is all there and looks valid, process... */
-	rarp = (struct arphdr *)skb->h.raw;
+	rarp = (struct arphdr *)skb_transport_header(skb);
 	rarp_ptr = (unsigned char *) (rarp + 1);
 
 	/* One reply at a time, please. */
@@ -702,7 +702,8 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
 	memset(b, 0, sizeof(struct bootp_pkt));
 
 	/* Construct IP header */
-	skb->nh.iph = h = &b->iph;
+	skb_reset_network_header(skb);
+	h = ip_hdr(skb);
 	h->version = 4;
 	h->ihl = 5;
 	h->tot_len = htons(sizeof(struct bootp_pkt));
@@ -782,7 +783,7 @@ static void __init ic_do_bootp_ext(u8 *ext)
 	u8 *c;
 
 	printk("DHCP/BOOTP: Got extension %d:",*ext);
-	for(c=ext+2; c<ext+2+ext[1]; c++)
+	for (c=ext+2; c<ext+2+ext[1]; c++)
 		printk(" %02x", *c);
 	printk("\n");
 #endif
@@ -845,7 +846,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
 		     sizeof(struct udphdr)))
 		goto drop;
 
-	b = (struct bootp_pkt *) skb->nh.iph;
+	b = (struct bootp_pkt *)skb_network_header(skb);
 	h = &b->iph;
 
 	if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP)
@@ -883,7 +884,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
 	if (!pskb_may_pull(skb, skb->len))
 		goto drop;
 
-	b = (struct bootp_pkt *) skb->nh.iph;
+	b = (struct bootp_pkt *)skb_network_header(skb);
 	h = &b->iph;
 
 	/* One reply at a time, please. */
@@ -938,7 +939,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
 			if (opt[1] >= 4)
 				memcpy(&server_id, opt + 2, 4);
 			break;
-		};
+		}
 	}
 
 #ifdef IPCONFIG_DEBUG
@@ -983,7 +984,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
 		ic_myaddr = NONE;
 		ic_servaddr = NONE;
 		goto drop_unlock;
-	};
+	}
 
 	ic_dhcp_msgtype = mt;
 
@@ -1094,7 +1095,7 @@ static int __init ic_dynamic(void)
 	retries = CONF_SEND_RETRIES;
 	get_random_bytes(&timeout, sizeof(timeout));
 	timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM);
-	for(;;) {
+	for (;;) {
 #ifdef IPCONFIG_BOOTP
 		if (do_bootp && (d->able & IC_BOOTP))
 			ic_bootp_send_if(d, jiffies - start_jiffies);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 3ec5ce0f5498..ebd2f2d532f6 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -157,10 +157,10 @@ static struct ip_tunnel * ipip_tunnel_lookup(__be32 remote, __be32 local)
 	return NULL;
 }
 
-static struct ip_tunnel **ipip_bucket(struct ip_tunnel *t)
+static struct ip_tunnel **__ipip_bucket(struct ip_tunnel_parm *parms)
 {
-	__be32 remote = t->parms.iph.daddr;
-	__be32 local = t->parms.iph.saddr;
+	__be32 remote = parms->iph.daddr;
+	__be32 local = parms->iph.saddr;
 	unsigned h = 0;
 	int prio = 0;
 
@@ -175,6 +175,10 @@ static struct ip_tunnel **ipip_bucket(struct ip_tunnel *t)
 	return &tunnels[prio][h];
 }
 
+static inline struct ip_tunnel **ipip_bucket(struct ip_tunnel *t)
+{
+	return __ipip_bucket(&t->parms);
+}
 
 static void ipip_tunnel_unlink(struct ip_tunnel *t)
 {
@@ -206,19 +210,9 @@ static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int c
 	__be32 local = parms->iph.saddr;
 	struct ip_tunnel *t, **tp, *nt;
 	struct net_device *dev;
-	unsigned h = 0;
-	int prio = 0;
 	char name[IFNAMSIZ];
 
-	if (remote) {
-		prio |= 2;
-		h ^= HASH(remote);
-	}
-	if (local) {
-		prio |= 1;
-		h ^= HASH(local);
-	}
-	for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
+	for (tp = __ipip_bucket(parms); (t = *tp) != NULL; tp = &t->next) {
 		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
 			return t;
 	}
@@ -280,8 +274,8 @@ static int ipip_err(struct sk_buff *skb, u32 info)
 	   ICMP in the real Internet is absolutely infeasible.
 	 */
 	struct iphdr *iph = (struct iphdr*)skb->data;
-	int type = skb->h.icmph->type;
-	int code = skb->h.icmph->code;
+	const int type = icmp_hdr(skb)->type;
+	const int code = icmp_hdr(skb)->code;
 	struct ip_tunnel *t;
 	int err;
 
@@ -336,8 +330,8 @@ out:
 	struct iphdr *iph = (struct iphdr*)dp;
 	int hlen = iph->ihl<<2;
 	struct iphdr *eiph;
-	int type = skb->h.icmph->type;
-	int code = skb->h.icmph->code;
+	const int type = icmp_hdr(skb)->type;
+	const int code = icmp_hdr(skb)->code;
 	int rel_type = 0;
 	int rel_code = 0;
 	__be32 rel_info = 0;
@@ -354,7 +348,7 @@ out:
 	default:
 		return 0;
 	case ICMP_PARAMETERPROB:
-		n = ntohl(skb->h.icmph->un.gateway) >> 24;
+		n = ntohl(icmp_hdr(skb)->un.gateway) >> 24;
 		if (n < hlen)
 			return 0;
 
@@ -373,7 +367,7 @@ out:
 		return 0;
 	case ICMP_FRAG_NEEDED:
 		/* And it is the only really necessary thing :-) */
-		n = ntohs(skb->h.icmph->un.frag.mtu);
+		n = ntohs(icmp_hdr(skb)->un.frag.mtu);
 		if (n < hlen+68)
 			return 0;
 		n -= hlen;
@@ -405,7 +399,7 @@ out:
 	dst_release(skb2->dst);
 	skb2->dst = NULL;
 	skb_pull(skb2, skb->data - (u8*)eiph);
-	skb2->nh.raw = skb2->data;
+	skb_reset_network_header(skb2);
 
 	/* Try to guess incoming interface */
 	memset(&fl, 0, sizeof(fl));
@@ -461,9 +455,10 @@ out:
 #endif
 }
 
-static inline void ipip_ecn_decapsulate(struct iphdr *outer_iph, struct sk_buff *skb)
+static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph,
+					struct sk_buff *skb)
 {
-	struct iphdr *inner_iph = skb->nh.iph;
+	struct iphdr *inner_iph = ip_hdr(skb);
 
 	if (INET_ECN_is_ce(outer_iph->tos))
 		IP_ECN_set_ce(inner_iph);
@@ -471,10 +466,8 @@ static inline void ipip_ecn_decapsulate(struct iphdr *outer_iph, struct sk_buff
 
 static int ipip_rcv(struct sk_buff *skb)
 {
-	struct iphdr *iph;
 	struct ip_tunnel *tunnel;
-
-	iph = skb->nh.iph;
+	const struct iphdr *iph = ip_hdr(skb);
 
 	read_lock(&ipip_lock);
 	if ((tunnel = ipip_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) {
@@ -486,8 +479,8 @@ static int ipip_rcv(struct sk_buff *skb)
486 479
487 secpath_reset(skb); 480 secpath_reset(skb);
488 481
489 skb->mac.raw = skb->nh.raw; 482 skb->mac_header = skb->network_header;
490 skb->nh.raw = skb->data; 483 skb_reset_network_header(skb);
491 skb->protocol = htons(ETH_P_IP); 484 skb->protocol = htons(ETH_P_IP);
492 skb->pkt_type = PACKET_HOST; 485 skb->pkt_type = PACKET_HOST;
493 486
@@ -521,7 +514,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
521 __be16 df = tiph->frag_off; 514 __be16 df = tiph->frag_off;
522 struct rtable *rt; /* Route to the other host */ 515 struct rtable *rt; /* Route to the other host */
523 struct net_device *tdev; /* Device to other host */ 516 struct net_device *tdev; /* Device to other host */
524 struct iphdr *old_iph = skb->nh.iph; 517 struct iphdr *old_iph = ip_hdr(skb);
525 struct iphdr *iph; /* Our new IP header */ 518 struct iphdr *iph; /* Our new IP header */
526 int max_headroom; /* The extra header space needed */ 519 int max_headroom; /* The extra header space needed */
527 __be32 dst = tiph->daddr; 520 __be32 dst = tiph->daddr;
@@ -615,11 +608,12 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
615 skb_set_owner_w(new_skb, skb->sk); 608 skb_set_owner_w(new_skb, skb->sk);
616 dev_kfree_skb(skb); 609 dev_kfree_skb(skb);
617 skb = new_skb; 610 skb = new_skb;
618 old_iph = skb->nh.iph; 611 old_iph = ip_hdr(skb);
619 } 612 }
620 613
621 skb->h.raw = skb->nh.raw; 614 skb->transport_header = skb->network_header;
622 skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); 615 skb_push(skb, sizeof(struct iphdr));
616 skb_reset_network_header(skb);
623 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 617 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
624 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | 618 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
625 IPSKB_REROUTED); 619 IPSKB_REROUTED);
@@ -630,7 +624,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
630 * Push down and install the IPIP header. 624 * Push down and install the IPIP header.
631 */ 625 */
632 626
633 iph = skb->nh.iph; 627 iph = ip_hdr(skb);
634 iph->version = 4; 628 iph->version = 4;
635 iph->ihl = sizeof(struct iphdr)>>2; 629 iph->ihl = sizeof(struct iphdr)>>2;
636 iph->frag_off = df; 630 iph->frag_off = df;
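
The ipip.c hunks above belong to the sk_buff header cleanup from the 2.6.22 development cycle: open-coded dereferences of the old skb->nh and skb->h unions are replaced by typed accessors such as ip_hdr(), icmp_hdr(), skb_network_header() and skb_reset_network_header(). The sketch below is a minimal userspace model of that pattern, not kernel code; the struct layout and the _sk names are simplified stand-ins chosen for illustration.

/* Userspace sketch only: cut-down stand-ins for sk_buff and the
 * accessors the diff converts to.  Field names mirror the kernel,
 * every type here is an assumption made for illustration. */
#include <stdint.h>
#include <stdio.h>

struct skb_sk {
	unsigned char *data;             /* current start of packet data   */
	unsigned char *network_header;   /* replaces the old nh.raw member */
	unsigned char *transport_header; /* replaces the old h.raw member  */
};

struct iphdr_sk { uint8_t version_ihl; uint8_t tos; };

/* ip_hdr(skb): typed view of the network header */
static struct iphdr_sk *ip_hdr_sk(const struct skb_sk *skb)
{
	return (struct iphdr_sk *)skb->network_header;
}

/* skb_reset_network_header(skb): old spelling was skb->nh.raw = skb->data */
static void skb_reset_network_header_sk(struct skb_sk *skb)
{
	skb->network_header = skb->data;
}

int main(void)
{
	unsigned char pkt[64] = { 0x45 };        /* IPv4, IHL = 5 words */
	struct skb_sk skb = { .data = pkt };

	skb_reset_network_header_sk(&skb);
	printf("ip version = %d\n", ip_hdr_sk(&skb)->version_ihl >> 4);
	return 0;
}

Keeping every header access behind a helper is what later let sk_buff store these fields as offsets instead of pointers on 64-bit builds without touching the callers again.
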
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 601e3df69258..0ebae413ae87 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -62,6 +62,7 @@
62#include <linux/netfilter_ipv4.h> 62#include <linux/netfilter_ipv4.h>
63#include <net/ipip.h> 63#include <net/ipip.h>
64#include <net/checksum.h> 64#include <net/checksum.h>
65#include <net/netlink.h>
65 66
66#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) 67#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
67#define CONFIG_IP_PIMSM 1 68#define CONFIG_IP_PIMSM 1
@@ -302,8 +303,8 @@ static void ipmr_destroy_unres(struct mfc_cache *c)
302 303
303 atomic_dec(&cache_resolve_queue_len); 304 atomic_dec(&cache_resolve_queue_len);
304 305
305 while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) { 306 while ((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
306 if (skb->nh.iph->version == 0) { 307 if (ip_hdr(skb)->version == 0) {
307 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); 308 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
308 nlh->nlmsg_type = NLMSG_ERROR; 309 nlh->nlmsg_type = NLMSG_ERROR;
309 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); 310 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
@@ -479,7 +480,7 @@ static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
479static struct mfc_cache *ipmr_cache_alloc(void) 480static struct mfc_cache *ipmr_cache_alloc(void)
480{ 481{
481 struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); 482 struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
482 if(c==NULL) 483 if (c==NULL)
483 return NULL; 484 return NULL;
484 c->mfc_un.res.minvif = MAXVIFS; 485 c->mfc_un.res.minvif = MAXVIFS;
485 return c; 486 return c;
@@ -488,7 +489,7 @@ static struct mfc_cache *ipmr_cache_alloc(void)
488static struct mfc_cache *ipmr_cache_alloc_unres(void) 489static struct mfc_cache *ipmr_cache_alloc_unres(void)
489{ 490{
490 struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); 491 struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
491 if(c==NULL) 492 if (c==NULL)
492 return NULL; 493 return NULL;
493 skb_queue_head_init(&c->mfc_un.unres.unresolved); 494 skb_queue_head_init(&c->mfc_un.unres.unresolved);
494 c->mfc_un.unres.expires = jiffies + 10*HZ; 495 c->mfc_un.unres.expires = jiffies + 10*HZ;
@@ -508,12 +509,13 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
508 * Play the pending entries through our router 509 * Play the pending entries through our router
509 */ 510 */
510 511
511 while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) { 512 while ((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
512 if (skb->nh.iph->version == 0) { 513 if (ip_hdr(skb)->version == 0) {
513 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); 514 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
514 515
515 if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) { 516 if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
516 nlh->nlmsg_len = skb->tail - (u8*)nlh; 517 nlh->nlmsg_len = (skb_tail_pointer(skb) -
518 (u8 *)nlh);
517 } else { 519 } else {
518 nlh->nlmsg_type = NLMSG_ERROR; 520 nlh->nlmsg_type = NLMSG_ERROR;
519 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); 521 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
@@ -539,7 +541,7 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
539static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) 541static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
540{ 542{
541 struct sk_buff *skb; 543 struct sk_buff *skb;
542 int ihl = pkt->nh.iph->ihl<<2; 544 const int ihl = ip_hdrlen(pkt);
543 struct igmphdr *igmp; 545 struct igmphdr *igmp;
544 struct igmpmsg *msg; 546 struct igmpmsg *msg;
545 int ret; 547 int ret;
@@ -551,7 +553,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
551#endif 553#endif
552 skb = alloc_skb(128, GFP_ATOMIC); 554 skb = alloc_skb(128, GFP_ATOMIC);
553 555
554 if(!skb) 556 if (!skb)
555 return -ENOBUFS; 557 return -ENOBUFS;
556 558
557#ifdef CONFIG_IP_PIMSM 559#ifdef CONFIG_IP_PIMSM
@@ -561,14 +563,17 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
561 And all this only to mangle msg->im_msgtype and 563 And all this only to mangle msg->im_msgtype and
562 to set msg->im_mbz to "mbz" :-) 564 to set msg->im_mbz to "mbz" :-)
563 */ 565 */
564 msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr)); 566 skb_push(skb, sizeof(struct iphdr));
565 skb->nh.raw = skb->h.raw = (u8*)msg; 567 skb_reset_network_header(skb);
566 memcpy(msg, pkt->nh.raw, sizeof(struct iphdr)); 568 skb_reset_transport_header(skb);
569 msg = (struct igmpmsg *)skb_network_header(skb);
570 memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
567 msg->im_msgtype = IGMPMSG_WHOLEPKT; 571 msg->im_msgtype = IGMPMSG_WHOLEPKT;
568 msg->im_mbz = 0; 572 msg->im_mbz = 0;
569 msg->im_vif = reg_vif_num; 573 msg->im_vif = reg_vif_num;
570 skb->nh.iph->ihl = sizeof(struct iphdr) >> 2; 574 ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
571 skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr)); 575 ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
576 sizeof(struct iphdr));
572 } else 577 } else
573#endif 578#endif
574 { 579 {
@@ -577,10 +582,11 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
577 * Copy the IP header 582 * Copy the IP header
578 */ 583 */
579 584
580 skb->nh.iph = (struct iphdr *)skb_put(skb, ihl); 585 skb->network_header = skb->tail;
581 memcpy(skb->data,pkt->data,ihl); 586 skb_put(skb, ihl);
582 skb->nh.iph->protocol = 0; /* Flag to the kernel this is a route add */ 587 skb_copy_to_linear_data(skb, pkt->data, ihl);
583 msg = (struct igmpmsg*)skb->nh.iph; 588 ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */
589 msg = (struct igmpmsg *)skb_network_header(skb);
584 msg->im_vif = vifi; 590 msg->im_vif = vifi;
585 skb->dst = dst_clone(pkt->dst); 591 skb->dst = dst_clone(pkt->dst);
586 592
@@ -592,8 +598,8 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
592 igmp->type = 598 igmp->type =
593 msg->im_msgtype = assert; 599 msg->im_msgtype = assert;
594 igmp->code = 0; 600 igmp->code = 0;
595 skb->nh.iph->tot_len=htons(skb->len); /* Fix the length */ 601 ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
596 skb->h.raw = skb->nh.raw; 602 skb->transport_header = skb->network_header;
597 } 603 }
598 604
599 if (mroute_socket == NULL) { 605 if (mroute_socket == NULL) {
@@ -622,11 +628,12 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
622{ 628{
623 int err; 629 int err;
624 struct mfc_cache *c; 630 struct mfc_cache *c;
631 const struct iphdr *iph = ip_hdr(skb);
625 632
626 spin_lock_bh(&mfc_unres_lock); 633 spin_lock_bh(&mfc_unres_lock);
627 for (c=mfc_unres_queue; c; c=c->next) { 634 for (c=mfc_unres_queue; c; c=c->next) {
628 if (c->mfc_mcastgrp == skb->nh.iph->daddr && 635 if (c->mfc_mcastgrp == iph->daddr &&
629 c->mfc_origin == skb->nh.iph->saddr) 636 c->mfc_origin == iph->saddr)
630 break; 637 break;
631 } 638 }
632 639
@@ -646,9 +653,9 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
646 /* 653 /*
647 * Fill in the new cache entry 654 * Fill in the new cache entry
648 */ 655 */
649 c->mfc_parent=-1; 656 c->mfc_parent = -1;
650 c->mfc_origin=skb->nh.iph->saddr; 657 c->mfc_origin = iph->saddr;
651 c->mfc_mcastgrp=skb->nh.iph->daddr; 658 c->mfc_mcastgrp = iph->daddr;
652 659
653 /* 660 /*
654 * Reflect first query at mrouted. 661 * Reflect first query at mrouted.
@@ -734,7 +741,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
734 return 0; 741 return 0;
735 } 742 }
736 743
737 if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr)) 744 if (!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
738 return -EINVAL; 745 return -EINVAL;
739 746
740 c=ipmr_cache_alloc(); 747 c=ipmr_cache_alloc();
@@ -788,7 +795,7 @@ static void mroute_clean_tables(struct sock *sk)
788 /* 795 /*
789 * Shut down all active vif entries 796 * Shut down all active vif entries
790 */ 797 */
791 for(i=0; i<maxvif; i++) { 798 for (i=0; i<maxvif; i++) {
792 if (!(vif_table[i].flags&VIFF_STATIC)) 799 if (!(vif_table[i].flags&VIFF_STATIC))
793 vif_delete(i); 800 vif_delete(i);
794 } 801 }
@@ -858,119 +865,117 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt
858 struct vifctl vif; 865 struct vifctl vif;
859 struct mfcctl mfc; 866 struct mfcctl mfc;
860 867
861 if(optname!=MRT_INIT) 868 if (optname != MRT_INIT) {
862 { 869 if (sk != mroute_socket && !capable(CAP_NET_ADMIN))
863 if(sk!=mroute_socket && !capable(CAP_NET_ADMIN))
864 return -EACCES; 870 return -EACCES;
865 } 871 }
866 872
867 switch(optname) 873 switch (optname) {
868 { 874 case MRT_INIT:
869 case MRT_INIT: 875 if (sk->sk_type != SOCK_RAW ||
870 if (sk->sk_type != SOCK_RAW || 876 inet_sk(sk)->num != IPPROTO_IGMP)
871 inet_sk(sk)->num != IPPROTO_IGMP) 877 return -EOPNOTSUPP;
872 return -EOPNOTSUPP; 878 if (optlen!=sizeof(int))
873 if(optlen!=sizeof(int)) 879 return -ENOPROTOOPT;
874 return -ENOPROTOOPT;
875
876 rtnl_lock();
877 if (mroute_socket) {
878 rtnl_unlock();
879 return -EADDRINUSE;
880 }
881
882 ret = ip_ra_control(sk, 1, mrtsock_destruct);
883 if (ret == 0) {
884 write_lock_bh(&mrt_lock);
885 mroute_socket=sk;
886 write_unlock_bh(&mrt_lock);
887 880
888 ipv4_devconf.mc_forwarding++; 881 rtnl_lock();
889 } 882 if (mroute_socket) {
890 rtnl_unlock(); 883 rtnl_unlock();
891 return ret; 884 return -EADDRINUSE;
892 case MRT_DONE: 885 }
893 if (sk!=mroute_socket) 886
894 return -EACCES; 887 ret = ip_ra_control(sk, 1, mrtsock_destruct);
895 return ip_ra_control(sk, 0, NULL); 888 if (ret == 0) {
896 case MRT_ADD_VIF: 889 write_lock_bh(&mrt_lock);
897 case MRT_DEL_VIF: 890 mroute_socket=sk;
898 if(optlen!=sizeof(vif)) 891 write_unlock_bh(&mrt_lock);
899 return -EINVAL; 892
900 if (copy_from_user(&vif,optval,sizeof(vif))) 893 ipv4_devconf.mc_forwarding++;
901 return -EFAULT; 894 }
902 if(vif.vifc_vifi >= MAXVIFS) 895 rtnl_unlock();
903 return -ENFILE; 896 return ret;
904 rtnl_lock(); 897 case MRT_DONE:
905 if (optname==MRT_ADD_VIF) { 898 if (sk!=mroute_socket)
906 ret = vif_add(&vif, sk==mroute_socket); 899 return -EACCES;
907 } else { 900 return ip_ra_control(sk, 0, NULL);
908 ret = vif_delete(vif.vifc_vifi); 901 case MRT_ADD_VIF:
909 } 902 case MRT_DEL_VIF:
910 rtnl_unlock(); 903 if (optlen!=sizeof(vif))
911 return ret; 904 return -EINVAL;
905 if (copy_from_user(&vif,optval,sizeof(vif)))
906 return -EFAULT;
907 if (vif.vifc_vifi >= MAXVIFS)
908 return -ENFILE;
909 rtnl_lock();
910 if (optname==MRT_ADD_VIF) {
911 ret = vif_add(&vif, sk==mroute_socket);
912 } else {
913 ret = vif_delete(vif.vifc_vifi);
914 }
915 rtnl_unlock();
916 return ret;
912 917
913 /* 918 /*
914 * Manipulate the forwarding caches. These live 919 * Manipulate the forwarding caches. These live
915 * in a sort of kernel/user symbiosis. 920 * in a sort of kernel/user symbiosis.
916 */ 921 */
917 case MRT_ADD_MFC: 922 case MRT_ADD_MFC:
918 case MRT_DEL_MFC: 923 case MRT_DEL_MFC:
919 if(optlen!=sizeof(mfc)) 924 if (optlen!=sizeof(mfc))
920 return -EINVAL; 925 return -EINVAL;
921 if (copy_from_user(&mfc,optval, sizeof(mfc))) 926 if (copy_from_user(&mfc,optval, sizeof(mfc)))
922 return -EFAULT; 927 return -EFAULT;
923 rtnl_lock(); 928 rtnl_lock();
924 if (optname==MRT_DEL_MFC) 929 if (optname==MRT_DEL_MFC)
925 ret = ipmr_mfc_delete(&mfc); 930 ret = ipmr_mfc_delete(&mfc);
926 else 931 else
927 ret = ipmr_mfc_add(&mfc, sk==mroute_socket); 932 ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
928 rtnl_unlock(); 933 rtnl_unlock();
929 return ret; 934 return ret;
930 /* 935 /*
931 * Control PIM assert. 936 * Control PIM assert.
932 */ 937 */
933 case MRT_ASSERT: 938 case MRT_ASSERT:
934 { 939 {
935 int v; 940 int v;
936 if(get_user(v,(int __user *)optval)) 941 if (get_user(v,(int __user *)optval))
937 return -EFAULT; 942 return -EFAULT;
938 mroute_do_assert=(v)?1:0; 943 mroute_do_assert=(v)?1:0;
939 return 0; 944 return 0;
940 } 945 }
941#ifdef CONFIG_IP_PIMSM 946#ifdef CONFIG_IP_PIMSM
942 case MRT_PIM: 947 case MRT_PIM:
943 { 948 {
944 int v, ret; 949 int v, ret;
945 if(get_user(v,(int __user *)optval)) 950 if (get_user(v,(int __user *)optval))
946 return -EFAULT; 951 return -EFAULT;
947 v = (v)?1:0; 952 v = (v)?1:0;
948 rtnl_lock(); 953 rtnl_lock();
949 ret = 0; 954 ret = 0;
950 if (v != mroute_do_pim) { 955 if (v != mroute_do_pim) {
951 mroute_do_pim = v; 956 mroute_do_pim = v;
952 mroute_do_assert = v; 957 mroute_do_assert = v;
953#ifdef CONFIG_IP_PIMSM_V2 958#ifdef CONFIG_IP_PIMSM_V2
954 if (mroute_do_pim) 959 if (mroute_do_pim)
955 ret = inet_add_protocol(&pim_protocol, 960 ret = inet_add_protocol(&pim_protocol,
956 IPPROTO_PIM); 961 IPPROTO_PIM);
957 else 962 else
958 ret = inet_del_protocol(&pim_protocol, 963 ret = inet_del_protocol(&pim_protocol,
959 IPPROTO_PIM); 964 IPPROTO_PIM);
960 if (ret < 0) 965 if (ret < 0)
961 ret = -EAGAIN; 966 ret = -EAGAIN;
962#endif 967#endif
963 }
964 rtnl_unlock();
965 return ret;
966 } 968 }
969 rtnl_unlock();
970 return ret;
971 }
967#endif 972#endif
968 /* 973 /*
969 * Spurious command, or MRT_VERSION which you cannot 974 * Spurious command, or MRT_VERSION which you cannot
970 * set. 975 * set.
971 */ 976 */
972 default: 977 default:
973 return -ENOPROTOOPT; 978 return -ENOPROTOOPT;
974 } 979 }
975} 980}
976 981
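
Beyond the if(x) to if (x) whitespace fixes, this hunk re-indents ip_mroute_setsockopt() into the kernel's preferred switch layout: case labels flush with the switch keyword and the body indented one level. A compile-identical sketch of both layouts; the option value 0 is a placeholder, not the real MRT_INIT constant.

/* Style sketch only: both layouts generate the same code; the hunk
 * converts the old_layout form into the new_layout form. */
#include <stdio.h>

static const char *old_layout(int opt)
{
	switch(opt)
	{
		case 0:
			return "MRT_INIT";
		default:
			return "other";
	}
}

static const char *new_layout(int opt)
{
	switch (opt) {
	case 0:
		return "MRT_INIT";
	default:
		return "other";
	}
}

int main(void)
{
	printf("%s %s\n", old_layout(0), new_layout(0));
	return 0;
}
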
@@ -983,7 +988,7 @@ int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __u
983 int olr; 988 int olr;
984 int val; 989 int val;
985 990
986 if(optname!=MRT_VERSION && 991 if (optname!=MRT_VERSION &&
987#ifdef CONFIG_IP_PIMSM 992#ifdef CONFIG_IP_PIMSM
988 optname!=MRT_PIM && 993 optname!=MRT_PIM &&
989#endif 994#endif
@@ -997,17 +1002,17 @@ int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __u
997 if (olr < 0) 1002 if (olr < 0)
998 return -EINVAL; 1003 return -EINVAL;
999 1004
1000 if(put_user(olr,optlen)) 1005 if (put_user(olr,optlen))
1001 return -EFAULT; 1006 return -EFAULT;
1002 if(optname==MRT_VERSION) 1007 if (optname==MRT_VERSION)
1003 val=0x0305; 1008 val=0x0305;
1004#ifdef CONFIG_IP_PIMSM 1009#ifdef CONFIG_IP_PIMSM
1005 else if(optname==MRT_PIM) 1010 else if (optname==MRT_PIM)
1006 val=mroute_do_pim; 1011 val=mroute_do_pim;
1007#endif 1012#endif
1008 else 1013 else
1009 val=mroute_do_assert; 1014 val=mroute_do_assert;
1010 if(copy_to_user(optval,&val,olr)) 1015 if (copy_to_user(optval,&val,olr))
1011 return -EFAULT; 1016 return -EFAULT;
1012 return 0; 1017 return 0;
1013} 1018}
@@ -1023,48 +1028,47 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1023 struct vif_device *vif; 1028 struct vif_device *vif;
1024 struct mfc_cache *c; 1029 struct mfc_cache *c;
1025 1030
1026 switch(cmd) 1031 switch (cmd) {
1027 { 1032 case SIOCGETVIFCNT:
1028 case SIOCGETVIFCNT: 1033 if (copy_from_user(&vr,arg,sizeof(vr)))
1029 if (copy_from_user(&vr,arg,sizeof(vr))) 1034 return -EFAULT;
1030 return -EFAULT; 1035 if (vr.vifi>=maxvif)
1031 if(vr.vifi>=maxvif) 1036 return -EINVAL;
1032 return -EINVAL; 1037 read_lock(&mrt_lock);
1033 read_lock(&mrt_lock); 1038 vif=&vif_table[vr.vifi];
1034 vif=&vif_table[vr.vifi]; 1039 if (VIF_EXISTS(vr.vifi)) {
1035 if(VIF_EXISTS(vr.vifi)) { 1040 vr.icount=vif->pkt_in;
1036 vr.icount=vif->pkt_in; 1041 vr.ocount=vif->pkt_out;
1037 vr.ocount=vif->pkt_out; 1042 vr.ibytes=vif->bytes_in;
1038 vr.ibytes=vif->bytes_in; 1043 vr.obytes=vif->bytes_out;
1039 vr.obytes=vif->bytes_out;
1040 read_unlock(&mrt_lock);
1041
1042 if (copy_to_user(arg,&vr,sizeof(vr)))
1043 return -EFAULT;
1044 return 0;
1045 }
1046 read_unlock(&mrt_lock); 1044 read_unlock(&mrt_lock);
1047 return -EADDRNOTAVAIL;
1048 case SIOCGETSGCNT:
1049 if (copy_from_user(&sr,arg,sizeof(sr)))
1050 return -EFAULT;
1051 1045
1052 read_lock(&mrt_lock); 1046 if (copy_to_user(arg,&vr,sizeof(vr)))
1053 c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr); 1047 return -EFAULT;
1054 if (c) { 1048 return 0;
1055 sr.pktcnt = c->mfc_un.res.pkt; 1049 }
1056 sr.bytecnt = c->mfc_un.res.bytes; 1050 read_unlock(&mrt_lock);
1057 sr.wrong_if = c->mfc_un.res.wrong_if; 1051 return -EADDRNOTAVAIL;
1058 read_unlock(&mrt_lock); 1052 case SIOCGETSGCNT:
1059 1053 if (copy_from_user(&sr,arg,sizeof(sr)))
1060 if (copy_to_user(arg,&sr,sizeof(sr))) 1054 return -EFAULT;
1061 return -EFAULT; 1055
1062 return 0; 1056 read_lock(&mrt_lock);
1063 } 1057 c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1058 if (c) {
1059 sr.pktcnt = c->mfc_un.res.pkt;
1060 sr.bytecnt = c->mfc_un.res.bytes;
1061 sr.wrong_if = c->mfc_un.res.wrong_if;
1064 read_unlock(&mrt_lock); 1062 read_unlock(&mrt_lock);
1065 return -EADDRNOTAVAIL; 1063
1066 default: 1064 if (copy_to_user(arg,&sr,sizeof(sr)))
1067 return -ENOIOCTLCMD; 1065 return -EFAULT;
1066 return 0;
1067 }
1068 read_unlock(&mrt_lock);
1069 return -EADDRNOTAVAIL;
1070 default:
1071 return -ENOIOCTLCMD;
1068 } 1072 }
1069} 1073}
1070 1074
@@ -1076,7 +1080,7 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
1076 if (event != NETDEV_UNREGISTER) 1080 if (event != NETDEV_UNREGISTER)
1077 return NOTIFY_DONE; 1081 return NOTIFY_DONE;
1078 v=&vif_table[0]; 1082 v=&vif_table[0];
1079 for(ct=0;ct<maxvif;ct++,v++) { 1083 for (ct=0;ct<maxvif;ct++,v++) {
1080 if (v->dev==ptr) 1084 if (v->dev==ptr)
1081 vif_delete(ct); 1085 vif_delete(ct);
1082 } 1086 }
@@ -1096,11 +1100,17 @@ static struct notifier_block ip_mr_notifier={
1096 1100
1097static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) 1101static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1098{ 1102{
1099 struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr)); 1103 struct iphdr *iph;
1104 struct iphdr *old_iph = ip_hdr(skb);
1105
1106 skb_push(skb, sizeof(struct iphdr));
1107 skb->transport_header = skb->network_header;
1108 skb_reset_network_header(skb);
1109 iph = ip_hdr(skb);
1100 1110
1101 iph->version = 4; 1111 iph->version = 4;
1102 iph->tos = skb->nh.iph->tos; 1112 iph->tos = old_iph->tos;
1103 iph->ttl = skb->nh.iph->ttl; 1113 iph->ttl = old_iph->ttl;
1104 iph->frag_off = 0; 1114 iph->frag_off = 0;
1105 iph->daddr = daddr; 1115 iph->daddr = daddr;
1106 iph->saddr = saddr; 1116 iph->saddr = saddr;
@@ -1110,8 +1120,6 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1110 ip_select_ident(iph, skb->dst, NULL); 1120 ip_select_ident(iph, skb->dst, NULL);
1111 ip_send_check(iph); 1121 ip_send_check(iph);
1112 1122
1113 skb->h.ipiph = skb->nh.iph;
1114 skb->nh.iph = iph;
1115 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 1123 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1116 nf_reset(skb); 1124 nf_reset(skb);
1117} 1125}
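
The rewritten ip_encap() has to read the inner header before skb_push() and skb_reset_network_header() re-point everything, whereas the old code saved it afterwards through skb->h.ipiph. Below is a userspace sketch of that prepend-into-headroom ordering; the struct and the _sk helper are simplified stand-ins, not the real sk_buff API.

/* Sketch of prepending an outer header in reserved headroom.  The
 * ordering mirrors the new ip_encap(): capture the inner header
 * first, then push and re-point.  Types simplified for illustration. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define HDR_LEN 20                       /* sizeof(struct iphdr) */

struct skb_sk {
	unsigned char *data;
	unsigned char *network_header;
	unsigned char *transport_header;
};

static unsigned char *skb_push_sk(struct skb_sk *skb, size_t len)
{
	skb->data -= len;                /* grow the frame into headroom */
	return skb->data;
}

int main(void)
{
	unsigned char buf[128];
	struct skb_sk skb = { .data = buf + HDR_LEN };

	memset(buf, 0, sizeof(buf));
	skb.network_header = skb.data;           /* inner IP header      */
	unsigned char *inner = skb.network_header; /* capture before push */

	skb_push_sk(&skb, HDR_LEN);
	skb.transport_header = skb.network_header; /* inner becomes payload */
	skb.network_header = skb.data;              /* outer header         */

	buf[0] = 0x45;                              /* outer: IPv4, IHL 5   */
	printf("outer at %td, inner at %td\n",
	       skb.network_header - buf, inner - buf);
	return 0;
}
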
@@ -1134,7 +1142,7 @@ static inline int ipmr_forward_finish(struct sk_buff *skb)
1134 1142
1135static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) 1143static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1136{ 1144{
1137 struct iphdr *iph = skb->nh.iph; 1145 const struct iphdr *iph = ip_hdr(skb);
1138 struct vif_device *vif = &vif_table[vifi]; 1146 struct vif_device *vif = &vif_table[vifi];
1139 struct net_device *dev; 1147 struct net_device *dev;
1140 struct rtable *rt; 1148 struct rtable *rt;
@@ -1200,8 +1208,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1200 1208
1201 dst_release(skb->dst); 1209 dst_release(skb->dst);
1202 skb->dst = &rt->u.dst; 1210 skb->dst = &rt->u.dst;
1203 iph = skb->nh.iph; 1211 ip_decrease_ttl(ip_hdr(skb));
1204 ip_decrease_ttl(iph);
1205 1212
1206 /* FIXME: forward and output firewalls used to be called here. 1213 /* FIXME: forward and output firewalls used to be called here.
1207 * What do we do with netfilter? -- RR */ 1214 * What do we do with netfilter? -- RR */
@@ -1301,7 +1308,7 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
1301 * Forward the frame 1308 * Forward the frame
1302 */ 1309 */
1303 for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) { 1310 for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1304 if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) { 1311 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1305 if (psend != -1) { 1312 if (psend != -1) {
1306 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1313 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1307 if (skb2) 1314 if (skb2)
@@ -1347,7 +1354,7 @@ int ip_mr_input(struct sk_buff *skb)
1347 if (IPCB(skb)->opt.router_alert) { 1354 if (IPCB(skb)->opt.router_alert) {
1348 if (ip_call_ra_chain(skb)) 1355 if (ip_call_ra_chain(skb))
1349 return 0; 1356 return 0;
1350 } else if (skb->nh.iph->protocol == IPPROTO_IGMP){ 1357 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
1351 /* IGMPv1 (and broken IGMPv2 implementations sort of 1358 /* IGMPv1 (and broken IGMPv2 implementations sort of
1352 Cisco IOS <= 11.2(8)) do not put router alert 1359 Cisco IOS <= 11.2(8)) do not put router alert
1353 option to IGMP packets destined to routable 1360 option to IGMP packets destined to routable
@@ -1366,7 +1373,7 @@ int ip_mr_input(struct sk_buff *skb)
1366 } 1373 }
1367 1374
1368 read_lock(&mrt_lock); 1375 read_lock(&mrt_lock);
1369 cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr); 1376 cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1370 1377
1371 /* 1378 /*
1372 * No usable cache entry 1379 * No usable cache entry
@@ -1426,14 +1433,15 @@ int pim_rcv_v1(struct sk_buff * skb)
1426 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) 1433 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1427 goto drop; 1434 goto drop;
1428 1435
1429 pim = (struct igmphdr*)skb->h.raw; 1436 pim = igmp_hdr(skb);
1430 1437
1431 if (!mroute_do_pim || 1438 if (!mroute_do_pim ||
1432 skb->len < sizeof(*pim) + sizeof(*encap) || 1439 skb->len < sizeof(*pim) + sizeof(*encap) ||
1433 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) 1440 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1434 goto drop; 1441 goto drop;
1435 1442
1436 encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr)); 1443 encap = (struct iphdr *)(skb_transport_header(skb) +
1444 sizeof(struct igmphdr));
1437 /* 1445 /*
1438 Check that: 1446 Check that:
1439 a. packet is really destinted to a multicast group 1447 a. packet is really destinted to a multicast group
@@ -1455,9 +1463,9 @@ int pim_rcv_v1(struct sk_buff * skb)
1455 if (reg_dev == NULL) 1463 if (reg_dev == NULL)
1456 goto drop; 1464 goto drop;
1457 1465
1458 skb->mac.raw = skb->nh.raw; 1466 skb->mac_header = skb->network_header;
1459 skb_pull(skb, (u8*)encap - skb->data); 1467 skb_pull(skb, (u8*)encap - skb->data);
1460 skb->nh.iph = (struct iphdr *)skb->data; 1468 skb_reset_network_header(skb);
1461 skb->dev = reg_dev; 1469 skb->dev = reg_dev;
1462 skb->protocol = htons(ETH_P_IP); 1470 skb->protocol = htons(ETH_P_IP);
1463 skb->ip_summed = 0; 1471 skb->ip_summed = 0;
@@ -1486,7 +1494,7 @@ static int pim_rcv(struct sk_buff * skb)
1486 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) 1494 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1487 goto drop; 1495 goto drop;
1488 1496
1489 pim = (struct pimreghdr*)skb->h.raw; 1497 pim = (struct pimreghdr *)skb_transport_header(skb);
1490 if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || 1498 if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1491 (pim->flags&PIM_NULL_REGISTER) || 1499 (pim->flags&PIM_NULL_REGISTER) ||
1492 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && 1500 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
@@ -1494,7 +1502,8 @@ static int pim_rcv(struct sk_buff * skb)
1494 goto drop; 1502 goto drop;
1495 1503
1496 /* check if the inner packet is destined to mcast group */ 1504 /* check if the inner packet is destined to mcast group */
1497 encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr)); 1505 encap = (struct iphdr *)(skb_transport_header(skb) +
1506 sizeof(struct pimreghdr));
1498 if (!MULTICAST(encap->daddr) || 1507 if (!MULTICAST(encap->daddr) ||
1499 encap->tot_len == 0 || 1508 encap->tot_len == 0 ||
1500 ntohs(encap->tot_len) + sizeof(*pim) > skb->len) 1509 ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
@@ -1510,9 +1519,9 @@ static int pim_rcv(struct sk_buff * skb)
1510 if (reg_dev == NULL) 1519 if (reg_dev == NULL)
1511 goto drop; 1520 goto drop;
1512 1521
1513 skb->mac.raw = skb->nh.raw; 1522 skb->mac_header = skb->network_header;
1514 skb_pull(skb, (u8*)encap - skb->data); 1523 skb_pull(skb, (u8*)encap - skb->data);
1515 skb->nh.iph = (struct iphdr *)skb->data; 1524 skb_reset_network_header(skb);
1516 skb->dev = reg_dev; 1525 skb->dev = reg_dev;
1517 skb->protocol = htons(ETH_P_IP); 1526 skb->protocol = htons(ETH_P_IP);
1518 skb->ip_summed = 0; 1527 skb->ip_summed = 0;
@@ -1537,7 +1546,7 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1537 int ct; 1546 int ct;
1538 struct rtnexthop *nhp; 1547 struct rtnexthop *nhp;
1539 struct net_device *dev = vif_table[c->mfc_parent].dev; 1548 struct net_device *dev = vif_table[c->mfc_parent].dev;
1540 u8 *b = skb->tail; 1549 u8 *b = skb_tail_pointer(skb);
1541 struct rtattr *mp_head; 1550 struct rtattr *mp_head;
1542 1551
1543 if (dev) 1552 if (dev)
@@ -1557,12 +1566,12 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1557 } 1566 }
1558 } 1567 }
1559 mp_head->rta_type = RTA_MULTIPATH; 1568 mp_head->rta_type = RTA_MULTIPATH;
1560 mp_head->rta_len = skb->tail - (u8*)mp_head; 1569 mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1561 rtm->rtm_type = RTN_MULTICAST; 1570 rtm->rtm_type = RTN_MULTICAST;
1562 return 1; 1571 return 1;
1563 1572
1564rtattr_failure: 1573rtattr_failure:
1565 skb_trim(skb, b - skb->data); 1574 nlmsg_trim(skb, b);
1566 return -EMSGSIZE; 1575 return -EMSGSIZE;
1567} 1576}
1568 1577
@@ -1577,6 +1586,7 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1577 1586
1578 if (cache==NULL) { 1587 if (cache==NULL) {
1579 struct sk_buff *skb2; 1588 struct sk_buff *skb2;
1589 struct iphdr *iph;
1580 struct net_device *dev; 1590 struct net_device *dev;
1581 int vif; 1591 int vif;
1582 1592
@@ -1596,11 +1606,13 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1596 return -ENOMEM; 1606 return -ENOMEM;
1597 } 1607 }
1598 1608
1599 skb2->nh.raw = skb_push(skb2, sizeof(struct iphdr)); 1609 skb_push(skb2, sizeof(struct iphdr));
1600 skb2->nh.iph->ihl = sizeof(struct iphdr)>>2; 1610 skb_reset_network_header(skb2);
1601 skb2->nh.iph->saddr = rt->rt_src; 1611 iph = ip_hdr(skb2);
1602 skb2->nh.iph->daddr = rt->rt_dst; 1612 iph->ihl = sizeof(struct iphdr) >> 2;
1603 skb2->nh.iph->version = 0; 1613 iph->saddr = rt->rt_src;
1614 iph->daddr = rt->rt_dst;
1615 iph->version = 0;
1604 err = ipmr_cache_unresolved(vif, skb2); 1616 err = ipmr_cache_unresolved(vif, skb2);
1605 read_unlock(&mrt_lock); 1617 read_unlock(&mrt_lock);
1606 return err; 1618 return err;
@@ -1625,7 +1637,7 @@ static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1625 loff_t pos) 1637 loff_t pos)
1626{ 1638{
1627 for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) { 1639 for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1628 if(!VIF_EXISTS(iter->ct)) 1640 if (!VIF_EXISTS(iter->ct))
1629 continue; 1641 continue;
1630 if (pos-- == 0) 1642 if (pos-- == 0)
1631 return &vif_table[iter->ct]; 1643 return &vif_table[iter->ct];
@@ -1649,7 +1661,7 @@ static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1649 return ipmr_vif_seq_idx(iter, 0); 1661 return ipmr_vif_seq_idx(iter, 0);
1650 1662
1651 while (++iter->ct < maxvif) { 1663 while (++iter->ct < maxvif) {
1652 if(!VIF_EXISTS(iter->ct)) 1664 if (!VIF_EXISTS(iter->ct))
1653 continue; 1665 continue;
1654 return &vif_table[iter->ct]; 1666 return &vif_table[iter->ct];
1655 } 1667 }
@@ -1680,7 +1692,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1680 return 0; 1692 return 0;
1681} 1693}
1682 1694
1683static struct seq_operations ipmr_vif_seq_ops = { 1695static const struct seq_operations ipmr_vif_seq_ops = {
1684 .start = ipmr_vif_seq_start, 1696 .start = ipmr_vif_seq_start,
1685 .next = ipmr_vif_seq_next, 1697 .next = ipmr_vif_seq_next,
1686 .stop = ipmr_vif_seq_stop, 1698 .stop = ipmr_vif_seq_stop,
@@ -1732,14 +1744,14 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1732 it->cache = mfc_cache_array; 1744 it->cache = mfc_cache_array;
1733 read_lock(&mrt_lock); 1745 read_lock(&mrt_lock);
1734 for (it->ct = 0; it->ct < MFC_LINES; it->ct++) 1746 for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1735 for(mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next) 1747 for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1736 if (pos-- == 0) 1748 if (pos-- == 0)
1737 return mfc; 1749 return mfc;
1738 read_unlock(&mrt_lock); 1750 read_unlock(&mrt_lock);
1739 1751
1740 it->cache = &mfc_unres_queue; 1752 it->cache = &mfc_unres_queue;
1741 spin_lock_bh(&mfc_unres_lock); 1753 spin_lock_bh(&mfc_unres_lock);
1742 for(mfc = mfc_unres_queue; mfc; mfc = mfc->next) 1754 for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1743 if (pos-- == 0) 1755 if (pos-- == 0)
1744 return mfc; 1756 return mfc;
1745 spin_unlock_bh(&mfc_unres_lock); 1757 spin_unlock_bh(&mfc_unres_lock);
@@ -1829,9 +1841,9 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1829 mfc->mfc_un.res.wrong_if); 1841 mfc->mfc_un.res.wrong_if);
1830 1842
1831 if (it->cache != &mfc_unres_queue) { 1843 if (it->cache != &mfc_unres_queue) {
1832 for(n = mfc->mfc_un.res.minvif; 1844 for (n = mfc->mfc_un.res.minvif;
1833 n < mfc->mfc_un.res.maxvif; n++ ) { 1845 n < mfc->mfc_un.res.maxvif; n++ ) {
1834 if(VIF_EXISTS(n) 1846 if (VIF_EXISTS(n)
1835 && mfc->mfc_un.res.ttls[n] < 255) 1847 && mfc->mfc_un.res.ttls[n] < 255)
1836 seq_printf(seq, 1848 seq_printf(seq,
1837 " %2d:%-3d", 1849 " %2d:%-3d",
@@ -1843,7 +1855,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1843 return 0; 1855 return 0;
1844} 1856}
1845 1857
1846static struct seq_operations ipmr_mfc_seq_ops = { 1858static const struct seq_operations ipmr_mfc_seq_ops = {
1847 .start = ipmr_mfc_seq_start, 1859 .start = ipmr_mfc_seq_start,
1848 .next = ipmr_mfc_seq_next, 1860 .next = ipmr_mfc_seq_next,
1849 .stop = ipmr_mfc_seq_stop, 1861 .stop = ipmr_mfc_seq_stop,
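
Several ipmr.c hunks (the nlmsg_len computations and ipmr_fill_mroute()) stop touching skb->tail directly and go through skb_tail_pointer() instead. The helper exists because, in the same patch series, tail can be stored as a byte offset from head rather than as a raw pointer on 64-bit builds. A simplified sketch of the offset form, with illustrative types:

/* Sketch of the representation skb_tail_pointer() hides: here tail is
 * an offset relative to head, and the helper turns it back into a
 * pointer so callers never see the difference. */
#include <stddef.h>
#include <stdio.h>

struct skb_off {
	unsigned char *head;
	size_t         tail;     /* offset from head, not a raw pointer */
};

static unsigned char *skb_tail_pointer_off(const struct skb_off *skb)
{
	return skb->head + skb->tail;
}

int main(void)
{
	unsigned char buf[256];
	struct skb_off skb = { .head = buf, .tail = 64 };

	/* e.g. nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; */
	unsigned char *nlh = buf + 16;
	printf("nlmsg_len = %td\n", skb_tail_pointer_off(&skb) - nlh);
	return 0;
}
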
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
index 22e104c6a493..15ad5dd2d984 100644
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -331,14 +331,14 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb,
331 struct ip_vs_app *app) 331 struct ip_vs_app *app)
332{ 332{
333 int diff; 333 int diff;
334 unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4; 334 const unsigned int tcp_offset = ip_hdrlen(*pskb);
335 struct tcphdr *th; 335 struct tcphdr *th;
336 __u32 seq; 336 __u32 seq;
337 337
338 if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th))) 338 if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th)))
339 return 0; 339 return 0;
340 340
341 th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset); 341 th = (struct tcphdr *)(skb_network_header(*pskb) + tcp_offset);
342 342
343 /* 343 /*
344 * Remember seq number in case this pkt gets resized 344 * Remember seq number in case this pkt gets resized
@@ -406,14 +406,14 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb,
406 struct ip_vs_app *app) 406 struct ip_vs_app *app)
407{ 407{
408 int diff; 408 int diff;
409 unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4; 409 const unsigned int tcp_offset = ip_hdrlen(*pskb);
410 struct tcphdr *th; 410 struct tcphdr *th;
411 __u32 seq; 411 __u32 seq;
412 412
413 if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th))) 413 if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th)))
414 return 0; 414 return 0;
415 415
416 th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset); 416 th = (struct tcphdr *)(skb_network_header(*pskb) + tcp_offset);
417 417
418 /* 418 /*
419 * Remember seq number in case this pkt gets resized 419 * Remember seq number in case this pkt gets resized
@@ -577,7 +577,6 @@ static const struct file_operations ip_vs_app_fops = {
577int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri, 577int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
578 char *o_buf, int o_len, char *n_buf, int n_len) 578 char *o_buf, int o_len, char *n_buf, int n_len)
579{ 579{
580 struct iphdr *iph;
581 int diff; 580 int diff;
582 int o_offset; 581 int o_offset;
583 int o_left; 582 int o_left;
@@ -603,12 +602,11 @@ int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
603 skb_put(skb, diff); 602 skb_put(skb, diff);
604 memmove(skb->data + o_offset + n_len, 603 memmove(skb->data + o_offset + n_len,
605 skb->data + o_offset + o_len, o_left); 604 skb->data + o_offset + o_len, o_left);
606 memcpy(skb->data + o_offset, n_buf, n_len); 605 skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len);
607 } 606 }
608 607
609 /* must update the iph total length here */ 608 /* must update the iph total length here */
610 iph = skb->nh.iph; 609 ip_hdr(skb)->tot_len = htons(skb->len);
611 iph->tot_len = htons(skb->len);
612 610
613 LeaveFunction(9); 611 LeaveFunction(9);
614 return 0; 612 return 0;
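
ip_hdrlen(skb), used here and throughout the IPVS conversions, is simply the IP header's IHL field scaled to bytes, replacing the open-coded skb->nh.iph->ihl*4. A standalone sketch over a raw header byte (the real iphdr uses a bitfield; the first byte is split by hand here for portability):

/* Sketch: header length from the IHL nibble, in 32-bit words. */
#include <stdint.h>
#include <stdio.h>

static unsigned int ip_hdrlen_raw(const uint8_t *iph)
{
	return (iph[0] & 0x0f) * 4;
}

int main(void)
{
	uint8_t hdr[60] = { 0x46 };   /* IPv4, IHL = 6: 24 bytes, has options */
	printf("header length = %u bytes\n", ip_hdrlen_raw(hdr));
	return 0;
}
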
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 24d7b66eb6d2..f005a2f929f4 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -212,7 +212,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
212 __be16 ports[2]) 212 __be16 ports[2])
213{ 213{
214 struct ip_vs_conn *cp = NULL; 214 struct ip_vs_conn *cp = NULL;
215 struct iphdr *iph = skb->nh.iph; 215 struct iphdr *iph = ip_hdr(skb);
216 struct ip_vs_dest *dest; 216 struct ip_vs_dest *dest;
217 struct ip_vs_conn *ct; 217 struct ip_vs_conn *ct;
218 __be16 dport; /* destination port to forward */ 218 __be16 dport; /* destination port to forward */
@@ -381,7 +381,7 @@ struct ip_vs_conn *
381ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) 381ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
382{ 382{
383 struct ip_vs_conn *cp = NULL; 383 struct ip_vs_conn *cp = NULL;
384 struct iphdr *iph = skb->nh.iph; 384 struct iphdr *iph = ip_hdr(skb);
385 struct ip_vs_dest *dest; 385 struct ip_vs_dest *dest;
386 __be16 _ports[2], *pptr; 386 __be16 _ports[2], *pptr;
387 387
@@ -447,7 +447,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
447 struct ip_vs_protocol *pp) 447 struct ip_vs_protocol *pp)
448{ 448{
449 __be16 _ports[2], *pptr; 449 __be16 _ports[2], *pptr;
450 struct iphdr *iph = skb->nh.iph; 450 struct iphdr *iph = ip_hdr(skb);
451 451
452 pptr = skb_header_pointer(skb, iph->ihl*4, 452 pptr = skb_header_pointer(skb, iph->ihl*4,
453 sizeof(_ports), _ports); 453 sizeof(_ports), _ports);
@@ -546,7 +546,7 @@ ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
546{ 546{
547 skb = ip_defrag(skb, user); 547 skb = ip_defrag(skb, user);
548 if (skb) 548 if (skb)
549 ip_send_check(skb->nh.iph); 549 ip_send_check(ip_hdr(skb));
550 return skb; 550 return skb;
551} 551}
552 552
@@ -557,9 +557,10 @@ ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
557void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, 557void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
558 struct ip_vs_conn *cp, int inout) 558 struct ip_vs_conn *cp, int inout)
559{ 559{
560 struct iphdr *iph = skb->nh.iph; 560 struct iphdr *iph = ip_hdr(skb);
561 unsigned int icmp_offset = iph->ihl*4; 561 unsigned int icmp_offset = iph->ihl*4;
562 struct icmphdr *icmph = (struct icmphdr *)(skb->nh.raw + icmp_offset); 562 struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) +
563 icmp_offset);
563 struct iphdr *ciph = (struct iphdr *)(icmph + 1); 564 struct iphdr *ciph = (struct iphdr *)(icmph + 1);
564 565
565 if (inout) { 566 if (inout) {
@@ -617,14 +618,14 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
617 *related = 1; 618 *related = 1;
618 619
619 /* reassemble IP fragments */ 620 /* reassemble IP fragments */
620 if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) { 621 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
621 skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT); 622 skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
622 if (!skb) 623 if (!skb)
623 return NF_STOLEN; 624 return NF_STOLEN;
624 *pskb = skb; 625 *pskb = skb;
625 } 626 }
626 627
627 iph = skb->nh.iph; 628 iph = ip_hdr(skb);
628 offset = ihl = iph->ihl * 4; 629 offset = ihl = iph->ihl * 4;
629 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); 630 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
630 if (ic == NULL) 631 if (ic == NULL)
@@ -659,7 +660,7 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
659 return NF_ACCEPT; 660 return NF_ACCEPT;
660 661
661 /* Is the embedded protocol header present? */ 662 /* Is the embedded protocol header present? */
662 if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) && 663 if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
663 pp->dont_defrag)) 664 pp->dont_defrag))
664 return NF_ACCEPT; 665 return NF_ACCEPT;
665 666
@@ -680,8 +681,7 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
680 } 681 }
681 682
682 /* Ensure the checksum is correct */ 683 /* Ensure the checksum is correct */
683 if (skb->ip_summed != CHECKSUM_UNNECESSARY && 684 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
684 ip_vs_checksum_complete(skb, ihl)) {
685 /* Failed checksum! */ 685 /* Failed checksum! */
686 IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n", 686 IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n",
687 NIPQUAD(iph->saddr)); 687 NIPQUAD(iph->saddr));
@@ -712,8 +712,7 @@ static inline int is_tcp_reset(const struct sk_buff *skb)
712{ 712{
713 struct tcphdr _tcph, *th; 713 struct tcphdr _tcph, *th;
714 714
715 th = skb_header_pointer(skb, skb->nh.iph->ihl * 4, 715 th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
716 sizeof(_tcph), &_tcph);
717 if (th == NULL) 716 if (th == NULL)
718 return 0; 717 return 0;
719 return th->rst; 718 return th->rst;
@@ -740,14 +739,14 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
740 if (skb->ipvs_property) 739 if (skb->ipvs_property)
741 return NF_ACCEPT; 740 return NF_ACCEPT;
742 741
743 iph = skb->nh.iph; 742 iph = ip_hdr(skb);
744 if (unlikely(iph->protocol == IPPROTO_ICMP)) { 743 if (unlikely(iph->protocol == IPPROTO_ICMP)) {
745 int related, verdict = ip_vs_out_icmp(pskb, &related); 744 int related, verdict = ip_vs_out_icmp(pskb, &related);
746 745
747 if (related) 746 if (related)
748 return verdict; 747 return verdict;
749 skb = *pskb; 748 skb = *pskb;
750 iph = skb->nh.iph; 749 iph = ip_hdr(skb);
751 } 750 }
752 751
753 pp = ip_vs_proto_get(iph->protocol); 752 pp = ip_vs_proto_get(iph->protocol);
@@ -755,12 +754,12 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
755 return NF_ACCEPT; 754 return NF_ACCEPT;
756 755
757 /* reassemble IP fragments */ 756 /* reassemble IP fragments */
758 if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) && 757 if (unlikely(iph->frag_off & htons(IP_MF|IP_OFFSET) &&
759 !pp->dont_defrag)) { 758 !pp->dont_defrag)) {
760 skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT); 759 skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
761 if (!skb) 760 if (!skb)
762 return NF_STOLEN; 761 return NF_STOLEN;
763 iph = skb->nh.iph; 762 iph = ip_hdr(skb);
764 *pskb = skb; 763 *pskb = skb;
765 } 764 }
766 765
@@ -810,8 +809,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
810 if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp)) 809 if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp))
811 goto drop; 810 goto drop;
812 skb = *pskb; 811 skb = *pskb;
813 skb->nh.iph->saddr = cp->vaddr; 812 ip_hdr(skb)->saddr = cp->vaddr;
814 ip_send_check(skb->nh.iph); 813 ip_send_check(ip_hdr(skb));
815 814
816 /* For policy routing, packets originating from this 815 /* For policy routing, packets originating from this
817 * machine itself may be routed differently to packets 816 * machine itself may be routed differently to packets
@@ -861,7 +860,7 @@ ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum)
861 *related = 1; 860 *related = 1;
862 861
863 /* reassemble IP fragments */ 862 /* reassemble IP fragments */
864 if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) { 863 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
865 skb = ip_vs_gather_frags(skb, 864 skb = ip_vs_gather_frags(skb,
866 hooknum == NF_IP_LOCAL_IN ? 865 hooknum == NF_IP_LOCAL_IN ?
867 IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD); 866 IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD);
@@ -870,7 +869,7 @@ ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum)
870 *pskb = skb; 869 *pskb = skb;
871 } 870 }
872 871
873 iph = skb->nh.iph; 872 iph = ip_hdr(skb);
874 offset = ihl = iph->ihl * 4; 873 offset = ihl = iph->ihl * 4;
875 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); 874 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
876 if (ic == NULL) 875 if (ic == NULL)
@@ -905,7 +904,7 @@ ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum)
905 return NF_ACCEPT; 904 return NF_ACCEPT;
906 905
907 /* Is the embedded protocol header present? */ 906 /* Is the embedded protocol header present? */
908 if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) && 907 if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
909 pp->dont_defrag)) 908 pp->dont_defrag))
910 return NF_ACCEPT; 909 return NF_ACCEPT;
911 910
@@ -921,8 +920,7 @@ ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum)
921 verdict = NF_DROP; 920 verdict = NF_DROP;
922 921
923 /* Ensure the checksum is correct */ 922 /* Ensure the checksum is correct */
924 if (skb->ip_summed != CHECKSUM_UNNECESSARY && 923 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
925 ip_vs_checksum_complete(skb, ihl)) {
926 /* Failed checksum! */ 924 /* Failed checksum! */
927 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n", 925 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
928 NIPQUAD(iph->saddr)); 926 NIPQUAD(iph->saddr));
@@ -966,19 +964,19 @@ ip_vs_in(unsigned int hooknum, struct sk_buff **pskb,
966 || skb->dev == &loopback_dev || skb->sk)) { 964 || skb->dev == &loopback_dev || skb->sk)) {
967 IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", 965 IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
968 skb->pkt_type, 966 skb->pkt_type,
969 skb->nh.iph->protocol, 967 ip_hdr(skb)->protocol,
970 NIPQUAD(skb->nh.iph->daddr)); 968 NIPQUAD(ip_hdr(skb)->daddr));
971 return NF_ACCEPT; 969 return NF_ACCEPT;
972 } 970 }
973 971
974 iph = skb->nh.iph; 972 iph = ip_hdr(skb);
975 if (unlikely(iph->protocol == IPPROTO_ICMP)) { 973 if (unlikely(iph->protocol == IPPROTO_ICMP)) {
976 int related, verdict = ip_vs_in_icmp(pskb, &related, hooknum); 974 int related, verdict = ip_vs_in_icmp(pskb, &related, hooknum);
977 975
978 if (related) 976 if (related)
979 return verdict; 977 return verdict;
980 skb = *pskb; 978 skb = *pskb;
981 iph = skb->nh.iph; 979 iph = ip_hdr(skb);
982 } 980 }
983 981
984 /* Protocol supported? */ 982 /* Protocol supported? */
@@ -1064,7 +1062,7 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **pskb,
1064{ 1062{
1065 int r; 1063 int r;
1066 1064
1067 if ((*pskb)->nh.iph->protocol != IPPROTO_ICMP) 1065 if (ip_hdr(*pskb)->protocol != IPPROTO_ICMP)
1068 return NF_ACCEPT; 1066 return NF_ACCEPT;
1069 1067
1070 return ip_vs_in_icmp(pskb, &r, hooknum); 1068 return ip_vs_in_icmp(pskb, &r, hooknum);
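
The __constant_htons() to htons() changes in ip_vs_core.c are cosmetic: in these kernels htons() already folds to a constant when its argument is constant, so the __constant_ spelling is only required where a constant expression is syntactically mandatory, such as case labels and static initializers. Below is a userspace sketch of the fragment test the hunks keep using, with the kernel's IP_MF/IP_OFFSET values copied in:

/* Sketch of the "is this a fragment?" test: more-fragments flag set,
 * or a nonzero fragment offset.  frag_off stays in network order. */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define IP_MF     0x2000   /* more fragments flag  */
#define IP_OFFSET 0x1fff   /* fragment offset mask */

static int is_fragment(uint16_t frag_off_net)
{
	return (frag_off_net & htons(IP_MF | IP_OFFSET)) != 0;
}

int main(void)
{
	printf("%d %d\n", is_fragment(htons(0)), is_fragment(htons(IP_MF | 3)));
	return 0;
}
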
diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c
index 502111fba872..dcf5d46aaa5e 100644
--- a/net/ipv4/ipvs/ip_vs_dh.c
+++ b/net/ipv4/ipvs/ip_vs_dh.c
@@ -204,7 +204,7 @@ ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
204{ 204{
205 struct ip_vs_dest *dest; 205 struct ip_vs_dest *dest;
206 struct ip_vs_dh_bucket *tbl; 206 struct ip_vs_dh_bucket *tbl;
207 struct iphdr *iph = skb->nh.iph; 207 struct iphdr *iph = ip_hdr(skb);
208 208
209 IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n"); 209 IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n");
210 210
diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c
index 847c47af040c..344ddbbdc756 100644
--- a/net/ipv4/ipvs/ip_vs_ftp.c
+++ b/net/ipv4/ipvs/ip_vs_ftp.c
@@ -159,10 +159,10 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
159 return 0; 159 return 0;
160 160
161 if (cp->app_data == &ip_vs_ftp_pasv) { 161 if (cp->app_data == &ip_vs_ftp_pasv) {
162 iph = (*pskb)->nh.iph; 162 iph = ip_hdr(*pskb);
163 th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); 163 th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
164 data = (char *)th + (th->doff << 2); 164 data = (char *)th + (th->doff << 2);
165 data_limit = (*pskb)->tail; 165 data_limit = skb_tail_pointer(*pskb);
166 166
167 if (ip_vs_ftp_get_addrport(data, data_limit, 167 if (ip_vs_ftp_get_addrport(data, data_limit,
168 SERVER_STRING, 168 SERVER_STRING,
@@ -262,14 +262,14 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
262 /* 262 /*
263 * Detecting whether it is passive 263 * Detecting whether it is passive
264 */ 264 */
265 iph = (*pskb)->nh.iph; 265 iph = ip_hdr(*pskb);
266 th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); 266 th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
267 267
268 /* Since there may be OPTIONS in the TCP packet and the HLEN is 268 /* Since there may be OPTIONS in the TCP packet and the HLEN is
269 the length of the header in 32-bit multiples, it is accurate 269 the length of the header in 32-bit multiples, it is accurate
270 to calculate data address by th+HLEN*4 */ 270 to calculate data address by th+HLEN*4 */
271 data = data_start = (char *)th + (th->doff << 2); 271 data = data_start = (char *)th + (th->doff << 2);
272 data_limit = (*pskb)->tail; 272 data_limit = skb_tail_pointer(*pskb);
273 273
274 while (data <= data_limit - 6) { 274 while (data <= data_limit - 6) {
275 if (strnicmp(data, "PASV\r\n", 6) == 0) { 275 if (strnicmp(data, "PASV\r\n", 6) == 0) {
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index c801273cb881..052f4ed59174 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -521,7 +521,7 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
521 struct ip_vs_dest *dest; 521 struct ip_vs_dest *dest;
522 struct ip_vs_lblc_table *tbl; 522 struct ip_vs_lblc_table *tbl;
523 struct ip_vs_lblc_entry *en; 523 struct ip_vs_lblc_entry *en;
524 struct iphdr *iph = skb->nh.iph; 524 struct iphdr *iph = ip_hdr(skb);
525 525
526 IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); 526 IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
527 527
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index 23f9b9e73c85..6225acac7a3b 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -775,7 +775,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
775 struct ip_vs_dest *dest; 775 struct ip_vs_dest *dest;
776 struct ip_vs_lblcr_table *tbl; 776 struct ip_vs_lblcr_table *tbl;
777 struct ip_vs_lblcr_entry *en; 777 struct ip_vs_lblcr_entry *en;
778 struct iphdr *iph = skb->nh.iph; 778 struct iphdr *iph = ip_hdr(skb);
779 779
780 IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); 780 IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
781 781
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c
index 8b0505b09317..a842676e1c69 100644
--- a/net/ipv4/ipvs/ip_vs_proto_ah.c
+++ b/net/ipv4/ipvs/ip_vs_proto_ah.c
@@ -52,15 +52,15 @@ ah_conn_in_get(const struct sk_buff *skb,
52 if (likely(!inverse)) { 52 if (likely(!inverse)) {
53 cp = ip_vs_conn_in_get(IPPROTO_UDP, 53 cp = ip_vs_conn_in_get(IPPROTO_UDP,
54 iph->saddr, 54 iph->saddr,
55 __constant_htons(PORT_ISAKMP), 55 htons(PORT_ISAKMP),
56 iph->daddr, 56 iph->daddr,
57 __constant_htons(PORT_ISAKMP)); 57 htons(PORT_ISAKMP));
58 } else { 58 } else {
59 cp = ip_vs_conn_in_get(IPPROTO_UDP, 59 cp = ip_vs_conn_in_get(IPPROTO_UDP,
60 iph->daddr, 60 iph->daddr,
61 __constant_htons(PORT_ISAKMP), 61 htons(PORT_ISAKMP),
62 iph->saddr, 62 iph->saddr,
63 __constant_htons(PORT_ISAKMP)); 63 htons(PORT_ISAKMP));
64 } 64 }
65 65
66 if (!cp) { 66 if (!cp) {
@@ -89,15 +89,15 @@ ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
89 if (likely(!inverse)) { 89 if (likely(!inverse)) {
90 cp = ip_vs_conn_out_get(IPPROTO_UDP, 90 cp = ip_vs_conn_out_get(IPPROTO_UDP,
91 iph->saddr, 91 iph->saddr,
92 __constant_htons(PORT_ISAKMP), 92 htons(PORT_ISAKMP),
93 iph->daddr, 93 iph->daddr,
94 __constant_htons(PORT_ISAKMP)); 94 htons(PORT_ISAKMP));
95 } else { 95 } else {
96 cp = ip_vs_conn_out_get(IPPROTO_UDP, 96 cp = ip_vs_conn_out_get(IPPROTO_UDP,
97 iph->daddr, 97 iph->daddr,
98 __constant_htons(PORT_ISAKMP), 98 htons(PORT_ISAKMP),
99 iph->saddr, 99 iph->saddr,
100 __constant_htons(PORT_ISAKMP)); 100 htons(PORT_ISAKMP));
101 } 101 }
102 102
103 if (!cp) { 103 if (!cp) {
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index 16a9ebee2fe6..e65577a77006 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -76,16 +76,15 @@ tcp_conn_schedule(struct sk_buff *skb,
76 struct ip_vs_service *svc; 76 struct ip_vs_service *svc;
77 struct tcphdr _tcph, *th; 77 struct tcphdr _tcph, *th;
78 78
79 th = skb_header_pointer(skb, skb->nh.iph->ihl*4, 79 th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
80 sizeof(_tcph), &_tcph);
81 if (th == NULL) { 80 if (th == NULL) {
82 *verdict = NF_DROP; 81 *verdict = NF_DROP;
83 return 0; 82 return 0;
84 } 83 }
85 84
86 if (th->syn && 85 if (th->syn &&
87 (svc = ip_vs_service_get(skb->mark, skb->nh.iph->protocol, 86 (svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol,
88 skb->nh.iph->daddr, th->dest))) { 87 ip_hdr(skb)->daddr, th->dest))) {
89 if (ip_vs_todrop()) { 88 if (ip_vs_todrop()) {
90 /* 89 /*
91 * It seems that we are very loaded. 90 * It seems that we are very loaded.
@@ -127,7 +126,7 @@ tcp_snat_handler(struct sk_buff **pskb,
127 struct ip_vs_protocol *pp, struct ip_vs_conn *cp) 126 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
128{ 127{
129 struct tcphdr *tcph; 128 struct tcphdr *tcph;
130 unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4; 129 const unsigned int tcphoff = ip_hdrlen(*pskb);
131 130
132 /* csum_check requires unshared skb */ 131 /* csum_check requires unshared skb */
133 if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph))) 132 if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
@@ -143,7 +142,7 @@ tcp_snat_handler(struct sk_buff **pskb,
143 return 0; 142 return 0;
144 } 143 }
145 144
146 tcph = (void *)(*pskb)->nh.iph + tcphoff; 145 tcph = (void *)ip_hdr(*pskb) + tcphoff;
147 tcph->source = cp->vport; 146 tcph->source = cp->vport;
148 147
149 /* Adjust TCP checksums */ 148 /* Adjust TCP checksums */
@@ -175,7 +174,7 @@ tcp_dnat_handler(struct sk_buff **pskb,
175 struct ip_vs_protocol *pp, struct ip_vs_conn *cp) 174 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
176{ 175{
177 struct tcphdr *tcph; 176 struct tcphdr *tcph;
178 unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4; 177 const unsigned int tcphoff = ip_hdrlen(*pskb);
179 178
180 /* csum_check requires unshared skb */ 179 /* csum_check requires unshared skb */
181 if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph))) 180 if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
@@ -194,7 +193,7 @@ tcp_dnat_handler(struct sk_buff **pskb,
194 return 0; 193 return 0;
195 } 194 }
196 195
197 tcph = (void *)(*pskb)->nh.iph + tcphoff; 196 tcph = (void *)ip_hdr(*pskb) + tcphoff;
198 tcph->dest = cp->dport; 197 tcph->dest = cp->dport;
199 198
200 /* 199 /*
@@ -224,15 +223,15 @@ tcp_dnat_handler(struct sk_buff **pskb,
224static int 223static int
225tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) 224tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
226{ 225{
227 unsigned int tcphoff = skb->nh.iph->ihl*4; 226 const unsigned int tcphoff = ip_hdrlen(skb);
228 227
229 switch (skb->ip_summed) { 228 switch (skb->ip_summed) {
230 case CHECKSUM_NONE: 229 case CHECKSUM_NONE:
231 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); 230 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
232 case CHECKSUM_COMPLETE: 231 case CHECKSUM_COMPLETE:
233 if (csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr, 232 if (csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
234 skb->len - tcphoff, 233 skb->len - tcphoff,
235 skb->nh.iph->protocol, skb->csum)) { 234 ip_hdr(skb)->protocol, skb->csum)) {
236 IP_VS_DBG_RL_PKT(0, pp, skb, 0, 235 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
237 "Failed checksum for"); 236 "Failed checksum for");
238 return 0; 237 return 0;
@@ -467,8 +466,7 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction,
467{ 466{
468 struct tcphdr _tcph, *th; 467 struct tcphdr _tcph, *th;
469 468
470 th = skb_header_pointer(skb, skb->nh.iph->ihl*4, 469 th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
471 sizeof(_tcph), &_tcph);
472 if (th == NULL) 470 if (th == NULL)
473 return 0; 471 return 0;
474 472
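
The skb_header_pointer(skb, offset, len, buf) calls that replace the open-coded header offsets in these hunks follow a copy-or-point contract: when the requested bytes lie in the skb's linear area, the returned pointer aims straight into the packet; otherwise the bytes are gathered into the caller-supplied buffer. A sketch of that contract over a flat buffer; the real helper also walks paged fragments, which this stand-in only simulates with a linear/non-linear split.

/* Sketch of the copy-or-point contract.  "Linear" is the first
 * linear_len bytes; anything beyond stands in for paged fragments. */
#include <string.h>
#include <stdio.h>

static void *header_pointer(const unsigned char *pkt, size_t linear_len,
			    size_t total_len, size_t offset, size_t len,
			    void *buf)
{
	if (offset + len > total_len)
		return NULL;                   /* out of bounds        */
	if (offset + len <= linear_len)
		return (void *)(pkt + offset); /* zero-copy fast path  */
	memcpy(buf, pkt + offset, len);        /* gather into caller storage */
	return buf;
}

int main(void)
{
	unsigned char pkt[40] = "0123456789abcdefghijklmnopqrstuvwxyz";
	unsigned char tmp[8];
	void *p = header_pointer(pkt, 20, 40, 24, 4, tmp);
	printf("copied? %s\n", p == tmp ? "yes" : "no");
	return 0;
}

Either way the caller gets one stable pointer to parse, which is why the TCP and UDP handlers above can drop their manual ihl*4 arithmetic.
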
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
index 03f0a414cfa4..8ee5fe6a101d 100644
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -22,7 +22,7 @@
22#include <linux/udp.h> 22#include <linux/udp.h>
23 23
24#include <net/ip_vs.h> 24#include <net/ip_vs.h>
25 25#include <net/ip.h>
26 26
27static struct ip_vs_conn * 27static struct ip_vs_conn *
28udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, 28udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
@@ -56,7 +56,7 @@ udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
56 struct ip_vs_conn *cp; 56 struct ip_vs_conn *cp;
57 __be16 _ports[2], *pptr; 57 __be16 _ports[2], *pptr;
58 58
59 pptr = skb_header_pointer(skb, skb->nh.iph->ihl*4, 59 pptr = skb_header_pointer(skb, ip_hdrlen(skb),
60 sizeof(_ports), _ports); 60 sizeof(_ports), _ports);
61 if (pptr == NULL) 61 if (pptr == NULL)
62 return NULL; 62 return NULL;
@@ -82,15 +82,15 @@ udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
82 struct ip_vs_service *svc; 82 struct ip_vs_service *svc;
83 struct udphdr _udph, *uh; 83 struct udphdr _udph, *uh;
84 84
85 uh = skb_header_pointer(skb, skb->nh.iph->ihl*4, 85 uh = skb_header_pointer(skb, ip_hdrlen(skb),
86 sizeof(_udph), &_udph); 86 sizeof(_udph), &_udph);
87 if (uh == NULL) { 87 if (uh == NULL) {
88 *verdict = NF_DROP; 88 *verdict = NF_DROP;
89 return 0; 89 return 0;
90 } 90 }
91 91
92 if ((svc = ip_vs_service_get(skb->mark, skb->nh.iph->protocol, 92 if ((svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol,
93 skb->nh.iph->daddr, uh->dest))) { 93 ip_hdr(skb)->daddr, uh->dest))) {
94 if (ip_vs_todrop()) { 94 if (ip_vs_todrop()) {
95 /* 95 /*
96 * It seems that we are very loaded. 96 * It seems that we are very loaded.
@@ -133,7 +133,7 @@ udp_snat_handler(struct sk_buff **pskb,
133 struct ip_vs_protocol *pp, struct ip_vs_conn *cp) 133 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
134{ 134{
135 struct udphdr *udph; 135 struct udphdr *udph;
136 unsigned int udphoff = (*pskb)->nh.iph->ihl * 4; 136 const unsigned int udphoff = ip_hdrlen(*pskb);
137 137
138 /* csum_check requires unshared skb */ 138 /* csum_check requires unshared skb */
139 if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph))) 139 if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
@@ -151,7 +151,7 @@ udp_snat_handler(struct sk_buff **pskb,
151 return 0; 151 return 0;
152 } 152 }
153 153
154 udph = (void *)(*pskb)->nh.iph + udphoff; 154 udph = (void *)ip_hdr(*pskb) + udphoff;
155 udph->source = cp->vport; 155 udph->source = cp->vport;
156 156
157 /* 157 /*
@@ -187,7 +187,7 @@ udp_dnat_handler(struct sk_buff **pskb,
187 struct ip_vs_protocol *pp, struct ip_vs_conn *cp) 187 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
188{ 188{
189 struct udphdr *udph; 189 struct udphdr *udph;
190 unsigned int udphoff = (*pskb)->nh.iph->ihl * 4; 190 unsigned int udphoff = ip_hdrlen(*pskb);
191 191
192 /* csum_check requires unshared skb */ 192 /* csum_check requires unshared skb */
193 if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph))) 193 if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
@@ -206,7 +206,7 @@ udp_dnat_handler(struct sk_buff **pskb,
206 return 0; 206 return 0;
207 } 207 }
208 208
209 udph = (void *)(*pskb)->nh.iph + udphoff; 209 udph = (void *)ip_hdr(*pskb) + udphoff;
210 udph->dest = cp->dport; 210 udph->dest = cp->dport;
211 211
212 /* 212 /*
@@ -239,7 +239,7 @@ static int
239udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) 239udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
240{ 240{
241 struct udphdr _udph, *uh; 241 struct udphdr _udph, *uh;
242 unsigned int udphoff = skb->nh.iph->ihl*4; 242 const unsigned int udphoff = ip_hdrlen(skb);
243 243
244 uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); 244 uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
245 if (uh == NULL) 245 if (uh == NULL)
@@ -251,10 +251,10 @@ udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
251 skb->csum = skb_checksum(skb, udphoff, 251 skb->csum = skb_checksum(skb, udphoff,
252 skb->len - udphoff, 0); 252 skb->len - udphoff, 0);
253 case CHECKSUM_COMPLETE: 253 case CHECKSUM_COMPLETE:
254 if (csum_tcpudp_magic(skb->nh.iph->saddr, 254 if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
255 skb->nh.iph->daddr, 255 ip_hdr(skb)->daddr,
256 skb->len - udphoff, 256 skb->len - udphoff,
257 skb->nh.iph->protocol, 257 ip_hdr(skb)->protocol,
258 skb->csum)) { 258 skb->csum)) {
259 IP_VS_DBG_RL_PKT(0, pp, skb, 0, 259 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
260 "Failed checksum for"); 260 "Failed checksum for");
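
Note that the CHECKSUM_NONE case in both tcp_csum_check() and udp_csum_check() deliberately falls through to CHECKSUM_COMPLETE: it first computes the one's-complement sum of the transport data into skb->csum, and csum_tcpudp_magic() then folds in the IPv4 pseudo-header, yielding 0 for a consistent packet. A self-contained model of that verification, with hypothetical helper names and byte order simplified to big-endian input:

	#include <stdint.h>
	#include <stddef.h>

	/* fold a 32-bit one's-complement accumulator down to 16 bits */
	static uint16_t csum_fold(uint32_t sum)
	{
		while (sum >> 16)
			sum = (sum & 0xffff) + (sum >> 16);
		return (uint16_t)~sum;
	}

	/* returns 0 when the segment's embedded checksum is consistent */
	static uint16_t pseudo_csum_check(uint32_t saddr, uint32_t daddr,
					  uint8_t proto, const uint8_t *seg,
					  size_t len)
	{
		uint32_t sum = 0;
		size_t i;

		/* IPv4 pseudo-header: addresses, protocol, transport length */
		sum += (saddr >> 16) + (saddr & 0xffff);
		sum += (daddr >> 16) + (daddr & 0xffff);
		sum += proto;
		sum += (uint32_t)len;

		/* transport header plus payload, 16 bits at a time */
		for (i = 0; i + 1 < len; i += 2)
			sum += ((uint32_t)seg[i] << 8) | seg[i + 1];
		if (len & 1)
			sum += (uint32_t)seg[len - 1] << 8;

		return csum_fold(sum);
	}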
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
index 338668f88fe2..1b25b00ef1e1 100644
--- a/net/ipv4/ipvs/ip_vs_sh.c
+++ b/net/ipv4/ipvs/ip_vs_sh.c
@@ -201,7 +201,7 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
201{ 201{
202 struct ip_vs_dest *dest; 202 struct ip_vs_dest *dest;
203 struct ip_vs_sh_bucket *tbl; 203 struct ip_vs_sh_bucket *tbl;
204 struct iphdr *iph = skb->nh.iph; 204 struct iphdr *iph = ip_hdr(skb);
205 205
206 IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); 206 IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
207 207
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
index e1f77bd7c9a5..900ce29db382 100644
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -156,7 +156,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
156 struct ip_vs_protocol *pp) 156 struct ip_vs_protocol *pp)
157{ 157{
158 struct rtable *rt; /* Route to the other host */ 158 struct rtable *rt; /* Route to the other host */
159 struct iphdr *iph = skb->nh.iph; 159 struct iphdr *iph = ip_hdr(skb);
160 u8 tos = iph->tos; 160 u8 tos = iph->tos;
161 int mtu; 161 int mtu;
162 struct flowi fl = { 162 struct flowi fl = {
@@ -178,7 +178,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
178 178
179 /* MTU checking */ 179 /* MTU checking */
180 mtu = dst_mtu(&rt->u.dst); 180 mtu = dst_mtu(&rt->u.dst);
181 if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { 181 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
182 ip_rt_put(rt); 182 ip_rt_put(rt);
183 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 183 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
184 IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); 184 IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
@@ -193,7 +193,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
193 ip_rt_put(rt); 193 ip_rt_put(rt);
194 return NF_STOLEN; 194 return NF_STOLEN;
195 } 195 }
196 ip_send_check(skb->nh.iph); 196 ip_send_check(ip_hdr(skb));
197 197
198 /* drop old route */ 198 /* drop old route */
199 dst_release(skb->dst); 199 dst_release(skb->dst);
@@ -226,7 +226,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
226{ 226{
227 struct rtable *rt; /* Route to the other host */ 227 struct rtable *rt; /* Route to the other host */
228 int mtu; 228 int mtu;
229 struct iphdr *iph = skb->nh.iph; 229 struct iphdr *iph = ip_hdr(skb);
230 230
231 EnterFunction(10); 231 EnterFunction(10);
232 232
@@ -245,7 +245,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
245 245
246 /* MTU checking */ 246 /* MTU checking */
247 mtu = dst_mtu(&rt->u.dst); 247 mtu = dst_mtu(&rt->u.dst);
248 if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { 248 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
249 ip_rt_put(rt); 249 ip_rt_put(rt);
250 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 250 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
251 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); 251 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
@@ -266,8 +266,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
266 /* mangle the packet */ 266 /* mangle the packet */
267 if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp)) 267 if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp))
268 goto tx_error; 268 goto tx_error;
269 skb->nh.iph->daddr = cp->daddr; 269 ip_hdr(skb)->daddr = cp->daddr;
270 ip_send_check(skb->nh.iph); 270 ip_send_check(ip_hdr(skb));
271 271
272 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); 272 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
273 273
@@ -320,19 +320,20 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
320{ 320{
321 struct rtable *rt; /* Route to the other host */ 321 struct rtable *rt; /* Route to the other host */
322 struct net_device *tdev; /* Device to other host */ 322 struct net_device *tdev; /* Device to other host */
323 struct iphdr *old_iph = skb->nh.iph; 323 struct iphdr *old_iph = ip_hdr(skb);
324 u8 tos = old_iph->tos; 324 u8 tos = old_iph->tos;
325 __be16 df = old_iph->frag_off; 325 __be16 df = old_iph->frag_off;
326 sk_buff_data_t old_transport_header = skb->transport_header;
326 struct iphdr *iph; /* Our new IP header */ 327 struct iphdr *iph; /* Our new IP header */
327 int max_headroom; /* The extra header space needed */ 328 int max_headroom; /* The extra header space needed */
328 int mtu; 329 int mtu;
329 330
330 EnterFunction(10); 331 EnterFunction(10);
331 332
332 if (skb->protocol != __constant_htons(ETH_P_IP)) { 333 if (skb->protocol != htons(ETH_P_IP)) {
333 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " 334 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
334 "ETH_P_IP: %d, skb protocol: %d\n", 335 "ETH_P_IP: %d, skb protocol: %d\n",
335 __constant_htons(ETH_P_IP), skb->protocol); 336 htons(ETH_P_IP), skb->protocol);
336 goto tx_error; 337 goto tx_error;
337 } 338 }
338 339
@@ -350,9 +351,9 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
350 if (skb->dst) 351 if (skb->dst)
351 skb->dst->ops->update_pmtu(skb->dst, mtu); 352 skb->dst->ops->update_pmtu(skb->dst, mtu);
352 353
353 df |= (old_iph->frag_off&__constant_htons(IP_DF)); 354 df |= (old_iph->frag_off & htons(IP_DF));
354 355
355 if ((old_iph->frag_off&__constant_htons(IP_DF)) 356 if ((old_iph->frag_off & htons(IP_DF))
356 && mtu < ntohs(old_iph->tot_len)) { 357 && mtu < ntohs(old_iph->tot_len)) {
357 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 358 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
358 ip_rt_put(rt); 359 ip_rt_put(rt);
@@ -377,15 +378,16 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
377 } 378 }
378 kfree_skb(skb); 379 kfree_skb(skb);
379 skb = new_skb; 380 skb = new_skb;
380 old_iph = skb->nh.iph; 381 old_iph = ip_hdr(skb);
381 } 382 }
382 383
383 skb->h.raw = (void *) old_iph; 384 skb->transport_header = old_transport_header;
384 385
385 /* fix old IP header checksum */ 386 /* fix old IP header checksum */
386 ip_send_check(old_iph); 387 ip_send_check(old_iph);
387 388
388 skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); 389 skb_push(skb, sizeof(struct iphdr));
390 skb_reset_network_header(skb);
389 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 391 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
390 392
391 /* drop old route */ 393 /* drop old route */
@@ -395,7 +397,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
395 /* 397 /*
396 * Push down and install the IPIP header. 398 * Push down and install the IPIP header.
397 */ 399 */
398 iph = skb->nh.iph; 400 iph = ip_hdr(skb);
399 iph->version = 4; 401 iph->version = 4;
400 iph->ihl = sizeof(struct iphdr)>>2; 402 iph->ihl = sizeof(struct iphdr)>>2;
401 iph->frag_off = df; 403 iph->frag_off = df;
@@ -435,7 +437,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
435 struct ip_vs_protocol *pp) 437 struct ip_vs_protocol *pp)
436{ 438{
437 struct rtable *rt; /* Route to the other host */ 439 struct rtable *rt; /* Route to the other host */
438 struct iphdr *iph = skb->nh.iph; 440 struct iphdr *iph = ip_hdr(skb);
439 int mtu; 441 int mtu;
440 442
441 EnterFunction(10); 443 EnterFunction(10);
@@ -445,7 +447,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
445 447
446 /* MTU checking */ 448 /* MTU checking */
447 mtu = dst_mtu(&rt->u.dst); 449 mtu = dst_mtu(&rt->u.dst);
448 if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) { 450 if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
449 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 451 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
450 ip_rt_put(rt); 452 ip_rt_put(rt);
451 IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n"); 453 IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
@@ -460,7 +462,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
460 ip_rt_put(rt); 462 ip_rt_put(rt);
461 return NF_STOLEN; 463 return NF_STOLEN;
462 } 464 }
463 ip_send_check(skb->nh.iph); 465 ip_send_check(ip_hdr(skb));
464 466
465 /* drop old route */ 467 /* drop old route */
466 dst_release(skb->dst); 468 dst_release(skb->dst);
@@ -514,12 +516,12 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
514 * mangle and send the packet here (only for VS/NAT) 516 * mangle and send the packet here (only for VS/NAT)
515 */ 517 */
516 518
517 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos)))) 519 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
518 goto tx_error_icmp; 520 goto tx_error_icmp;
519 521
520 /* MTU checking */ 522 /* MTU checking */
521 mtu = dst_mtu(&rt->u.dst); 523 mtu = dst_mtu(&rt->u.dst);
522 if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) { 524 if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
523 ip_rt_put(rt); 525 ip_rt_put(rt);
524 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); 526 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
525 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); 527 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
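
The tunnel path above is the one place where the conversion is more than a substitution: the old code parked the inner header with skb->h.raw = (void *)old_iph and later did skb->nh.raw = skb_push(...). The new code saves skb->transport_header up front (the skb may be reallocated for extra headroom) and splits the push into two explicit steps. A minimal sketch of the encapsulation sequence, assuming the skb layout of this series:

	skb_push(skb, sizeof(struct iphdr));	/* claim room for the outer header */
	skb_reset_network_header(skb);		/* record skb->data as the network header */
	iph = ip_hdr(skb);			/* typed access to the outer header */
	iph->version = 4;
	iph->ihl = sizeof(struct iphdr) >> 2;

skb_reset_network_header() records exactly what the removed direct assignment wrote by hand, so behaviour is unchanged.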
diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c
index 574c735836fc..b03c5ca2c823 100644
--- a/net/ipv4/multipath_drr.c
+++ b/net/ipv4/multipath_drr.c
@@ -100,7 +100,7 @@ static int drr_dev_event(struct notifier_block *this,
100 100
101 spin_unlock_bh(&state_lock); 101 spin_unlock_bh(&state_lock);
102 break; 102 break;
103 }; 103 }
104 104
105 return NOTIFY_DONE; 105 return NOTIFY_DONE;
106} 106}
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 6069a11514f6..b44192924f95 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -10,7 +10,7 @@
10/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ 10/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
11int ip_route_me_harder(struct sk_buff **pskb, unsigned addr_type) 11int ip_route_me_harder(struct sk_buff **pskb, unsigned addr_type)
12{ 12{
13 struct iphdr *iph = (*pskb)->nh.iph; 13 const struct iphdr *iph = ip_hdr(*pskb);
14 struct rtable *rt; 14 struct rtable *rt;
15 struct flowi fl = {}; 15 struct flowi fl = {};
16 struct dst_entry *odst; 16 struct dst_entry *odst;
@@ -142,7 +142,7 @@ static void nf_ip_saveroute(const struct sk_buff *skb, struct nf_info *info)
142 struct ip_rt_info *rt_info = nf_info_reroute(info); 142 struct ip_rt_info *rt_info = nf_info_reroute(info);
143 143
144 if (info->hook == NF_IP_LOCAL_OUT) { 144 if (info->hook == NF_IP_LOCAL_OUT) {
145 const struct iphdr *iph = skb->nh.iph; 145 const struct iphdr *iph = ip_hdr(skb);
146 146
147 rt_info->tos = iph->tos; 147 rt_info->tos = iph->tos;
148 rt_info->daddr = iph->daddr; 148 rt_info->daddr = iph->daddr;
@@ -155,7 +155,7 @@ static int nf_ip_reroute(struct sk_buff **pskb, const struct nf_info *info)
155 const struct ip_rt_info *rt_info = nf_info_reroute(info); 155 const struct ip_rt_info *rt_info = nf_info_reroute(info);
156 156
157 if (info->hook == NF_IP_LOCAL_OUT) { 157 if (info->hook == NF_IP_LOCAL_OUT) {
158 struct iphdr *iph = (*pskb)->nh.iph; 158 const struct iphdr *iph = ip_hdr(*pskb);
159 159
160 if (!(iph->tos == rt_info->tos 160 if (!(iph->tos == rt_info->tos
161 && iph->daddr == rt_info->daddr 161 && iph->daddr == rt_info->daddr
@@ -168,7 +168,7 @@ static int nf_ip_reroute(struct sk_buff **pskb, const struct nf_info *info)
168__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, 168__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
169 unsigned int dataoff, u_int8_t protocol) 169 unsigned int dataoff, u_int8_t protocol)
170{ 170{
171 struct iphdr *iph = skb->nh.iph; 171 const struct iphdr *iph = ip_hdr(skb);
172 __sum16 csum = 0; 172 __sum16 csum = 0;
173 173
174 switch (skb->ip_summed) { 174 switch (skb->ip_summed) {
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 601808c796ec..46509fae9fd8 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -30,188 +30,6 @@ config NF_CONNTRACK_PROC_COMPAT
30 30
31 If unsure, say Y. 31 If unsure, say Y.
32 32
33# connection tracking, helpers and protocols
34config IP_NF_CT_ACCT
35 bool "Connection tracking flow accounting"
36 depends on IP_NF_CONNTRACK
37 help
38 If this option is enabled, the connection tracking code will
39 keep per-flow packet and byte counters.
40
41 Those counters can be used for flow-based accounting or the
42 `connbytes' match.
43
44 If unsure, say `N'.
45
46config IP_NF_CONNTRACK_MARK
47 bool 'Connection mark tracking support'
48 depends on IP_NF_CONNTRACK
49 help
50 This option enables support for connection marks, used by the
51 `CONNMARK' target and `connmark' match. Similar to the mark value
52 of packets, but this mark value is kept in the conntrack session
53 instead of the individual packets.
54
55config IP_NF_CONNTRACK_SECMARK
56 bool 'Connection tracking security mark support'
57 depends on IP_NF_CONNTRACK && NETWORK_SECMARK
58 help
59 This option enables security markings to be applied to
60 connections. Typically they are copied to connections from
61 packets using the CONNSECMARK target and copied back from
62 connections to packets with the same target, with the packets
63 being originally labeled via SECMARK.
64
65 If unsure, say 'N'.
66
67config IP_NF_CONNTRACK_EVENTS
68 bool "Connection tracking events (EXPERIMENTAL)"
69 depends on EXPERIMENTAL && IP_NF_CONNTRACK
70 help
71 If this option is enabled, the connection tracking code will
72 provide a notifier chain that can be used by other kernel code
73 to get notified about changes in the connection tracking state.
74
 75	  If unsure, say `N'.
76
77config IP_NF_CONNTRACK_NETLINK
78 tristate 'Connection tracking netlink interface (EXPERIMENTAL)'
79 depends on EXPERIMENTAL && IP_NF_CONNTRACK && NETFILTER_NETLINK
80 depends on IP_NF_CONNTRACK!=y || NETFILTER_NETLINK!=m
81 depends on IP_NF_NAT=n || IP_NF_NAT
82 help
 83	  This option enables support for a netlink-based userspace interface.
84
85
86config IP_NF_CT_PROTO_SCTP
87 tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)'
88 depends on IP_NF_CONNTRACK && EXPERIMENTAL
89 help
90 With this option enabled, the connection tracking code will
91 be able to do state tracking on SCTP connections.
92
93 If you want to compile it as a module, say M here and read
94 <file:Documentation/modules.txt>. If unsure, say `N'.
95
96config IP_NF_FTP
97 tristate "FTP protocol support"
98 depends on IP_NF_CONNTRACK
99 help
100 Tracking FTP connections is problematic: special helpers are
 101	  required for tracking them and for doing masquerading and other forms
102 of Network Address Translation on them.
103
104 To compile it as a module, choose M here. If unsure, say Y.
105
106config IP_NF_IRC
107 tristate "IRC protocol support"
108 depends on IP_NF_CONNTRACK
109 ---help---
110 There is a commonly-used extension to IRC called
111 Direct Client-to-Client Protocol (DCC). This enables users to send
112 files to each other, and also chat to each other without the need
 113	  for a server. DCC Sending is used anywhere you send files over IRC,
114 and DCC Chat is most commonly used by Eggdrop bots. If you are
115 using NAT, this extension will enable you to send files and initiate
116 chats. Note that you do NOT need this extension to get files or
 117	  have others initiate chats, or for anything else in IRC.
118
119 To compile it as a module, choose M here. If unsure, say Y.
120
121config IP_NF_NETBIOS_NS
122 tristate "NetBIOS name service protocol support (EXPERIMENTAL)"
123 depends on IP_NF_CONNTRACK && EXPERIMENTAL
124 help
125 NetBIOS name service requests are sent as broadcast messages from an
126 unprivileged port and responded to with unicast messages to the
 127	  same port. This makes them hard to firewall properly because connection
128 tracking doesn't deal with broadcasts. This helper tracks locally
129 originating NetBIOS name service requests and the corresponding
130 responses. It relies on correct IP address configuration, specifically
131 netmask and broadcast address. When properly configured, the output
132 of "ip address show" should look similar to this:
133
134 $ ip -4 address show eth0
135 4: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc pfifo_fast qlen 1000
136 inet 172.16.2.252/24 brd 172.16.2.255 scope global eth0
137
138 To compile it as a module, choose M here. If unsure, say N.
139
140config IP_NF_TFTP
141 tristate "TFTP protocol support"
142 depends on IP_NF_CONNTRACK
143 help
 144	  TFTP connection tracking helper; whether it is required depends
 145	  on how restrictive your ruleset is.
 146	  If you are using a tftp client behind -j SNAT or -j MASQUERADE,
 147	  you will need this.
148
149 To compile it as a module, choose M here. If unsure, say Y.
150
151config IP_NF_AMANDA
152 tristate "Amanda backup protocol support"
153 depends on IP_NF_CONNTRACK
154 select TEXTSEARCH
155 select TEXTSEARCH_KMP
156 help
157 If you are running the Amanda backup package <http://www.amanda.org/>
158 on this machine or machines that will be MASQUERADED through this
159 machine, then you may want to enable this feature. This allows the
 160	  connection tracking and NAT code to permit the sub-channels that
161 Amanda requires for communication of the backup data, messages and
162 index.
163
164 To compile it as a module, choose M here. If unsure, say Y.
165
166config IP_NF_PPTP
167 tristate 'PPTP protocol support'
168 depends on IP_NF_CONNTRACK
169 help
170 This module adds support for PPTP (Point to Point Tunnelling
171 Protocol, RFC2637) connection tracking and NAT.
172
173 If you are running PPTP sessions over a stateful firewall or NAT
174 box, you may want to enable this feature.
175
176 Please note that not all PPTP modes of operation are supported yet.
 177	  For more info, read the top of the file
178 net/ipv4/netfilter/ip_conntrack_pptp.c
179
180 If you want to compile it as a module, say M here and read
181 Documentation/modules.txt. If unsure, say `N'.
182
183config IP_NF_H323
184 tristate 'H.323 protocol support (EXPERIMENTAL)'
185 depends on IP_NF_CONNTRACK && EXPERIMENTAL
186 help
187 H.323 is a VoIP signalling protocol from ITU-T. As one of the most
188 important VoIP protocols, it is widely used by voice hardware and
189 software including voice gateways, IP phones, Netmeeting, OpenPhone,
190 Gnomemeeting, etc.
191
192 With this module you can support H.323 on a connection tracking/NAT
193 firewall.
194
195 This module supports RAS, Fast Start, H.245 Tunnelling, Call
196 Forwarding, RTP/RTCP and T.120 based audio, video, fax, chat,
197 whiteboard, file transfer, etc. For more information, please
198 visit http://nath323.sourceforge.net/.
199
200 If you want to compile it as a module, say 'M' here and read
201 Documentation/modules.txt. If unsure, say 'N'.
202
203config IP_NF_SIP
204 tristate "SIP protocol support (EXPERIMENTAL)"
205 depends on IP_NF_CONNTRACK && EXPERIMENTAL
206 help
207 SIP is an application-layer control protocol that can establish,
208 modify, and terminate multimedia sessions (conferences) such as
209 Internet telephony calls. With the ip_conntrack_sip and
210 the ip_nat_sip modules you can support the protocol on a connection
211 tracking/NATing firewall.
212
213 To compile it as a module, choose M here. If unsure, say Y.
214
215config IP_NF_QUEUE 33config IP_NF_QUEUE
216 tristate "IP Userspace queueing via NETLINK (OBSOLETE)" 34 tristate "IP Userspace queueing via NETLINK (OBSOLETE)"
217 help 35 help
@@ -361,17 +179,6 @@ config IP_NF_TARGET_ULOG
361 179
362 To compile it as a module, choose M here. If unsure, say N. 180 To compile it as a module, choose M here. If unsure, say N.
363 181
364# NAT + specific targets: ip_conntrack
365config IP_NF_NAT
366 tristate "Full NAT"
367 depends on IP_NF_IPTABLES && IP_NF_CONNTRACK
368 help
369 The Full NAT option allows masquerading, port forwarding and other
370 forms of full Network Address Port Translation. It is controlled by
371 the `nat' table in iptables: see the man page for iptables(8).
372
373 To compile it as a module, choose M here. If unsure, say N.
374
375# NAT + specific targets: nf_conntrack 182# NAT + specific targets: nf_conntrack
376config NF_NAT 183config NF_NAT
377 tristate "Full NAT" 184 tristate "Full NAT"
@@ -383,11 +190,6 @@ config NF_NAT
383 190
384 To compile it as a module, choose M here. If unsure, say N. 191 To compile it as a module, choose M here. If unsure, say N.
385 192
386config IP_NF_NAT_NEEDED
387 bool
388 depends on IP_NF_NAT
389 default y
390
391config NF_NAT_NEEDED 193config NF_NAT_NEEDED
392 bool 194 bool
393 depends on NF_NAT 195 depends on NF_NAT
@@ -395,7 +197,7 @@ config NF_NAT_NEEDED
395 197
396config IP_NF_TARGET_MASQUERADE 198config IP_NF_TARGET_MASQUERADE
397 tristate "MASQUERADE target support" 199 tristate "MASQUERADE target support"
398 depends on (NF_NAT || IP_NF_NAT) 200 depends on NF_NAT
399 help 201 help
400 Masquerading is a special case of NAT: all outgoing connections are 202 Masquerading is a special case of NAT: all outgoing connections are
401 changed to seem to come from a particular interface's address, and 203 changed to seem to come from a particular interface's address, and
@@ -407,7 +209,7 @@ config IP_NF_TARGET_MASQUERADE
407 209
408config IP_NF_TARGET_REDIRECT 210config IP_NF_TARGET_REDIRECT
409 tristate "REDIRECT target support" 211 tristate "REDIRECT target support"
410 depends on (NF_NAT || IP_NF_NAT) 212 depends on NF_NAT
411 help 213 help
412 REDIRECT is a special case of NAT: all incoming connections are 214 REDIRECT is a special case of NAT: all incoming connections are
413 mapped onto the incoming interface's address, causing the packets to 215 mapped onto the incoming interface's address, causing the packets to
@@ -418,7 +220,7 @@ config IP_NF_TARGET_REDIRECT
418 220
419config IP_NF_TARGET_NETMAP 221config IP_NF_TARGET_NETMAP
420 tristate "NETMAP target support" 222 tristate "NETMAP target support"
421 depends on (NF_NAT || IP_NF_NAT) 223 depends on NF_NAT
422 help 224 help
423 NETMAP is an implementation of static 1:1 NAT mapping of network 225 NETMAP is an implementation of static 1:1 NAT mapping of network
424 addresses. It maps the network address part, while keeping the host 226 addresses. It maps the network address part, while keeping the host
@@ -429,28 +231,13 @@ config IP_NF_TARGET_NETMAP
429 231
430config IP_NF_TARGET_SAME 232config IP_NF_TARGET_SAME
431 tristate "SAME target support" 233 tristate "SAME target support"
432 depends on (NF_NAT || IP_NF_NAT) 234 depends on NF_NAT
433 help 235 help
434 This option adds a `SAME' target, which works like the standard SNAT 236 This option adds a `SAME' target, which works like the standard SNAT
435 target, but attempts to give clients the same IP for all connections. 237 target, but attempts to give clients the same IP for all connections.
436 238
437 To compile it as a module, choose M here. If unsure, say N. 239 To compile it as a module, choose M here. If unsure, say N.
438 240
439config IP_NF_NAT_SNMP_BASIC
440 tristate "Basic SNMP-ALG support (EXPERIMENTAL)"
441 depends on EXPERIMENTAL && IP_NF_NAT
442 ---help---
443
444 This module implements an Application Layer Gateway (ALG) for
445 SNMP payloads. In conjunction with NAT, it allows a network
446 management system to access multiple private networks with
447 conflicting addresses. It works by modifying IP addresses
448 inside SNMP payloads to match IP-layer NAT mapping.
449
 450	  This is the "basic" form of SNMP-ALG, as described in RFC 2962.
451
452 To compile it as a module, choose M here. If unsure, say N.
453
454config NF_NAT_SNMP_BASIC 241config NF_NAT_SNMP_BASIC
455 tristate "Basic SNMP-ALG support (EXPERIMENTAL)" 242 tristate "Basic SNMP-ALG support (EXPERIMENTAL)"
456 depends on EXPERIMENTAL && NF_NAT 243 depends on EXPERIMENTAL && NF_NAT
@@ -477,78 +264,37 @@ config NF_NAT_PROTO_GRE
477 tristate 264 tristate
478 depends on NF_NAT && NF_CT_PROTO_GRE 265 depends on NF_NAT && NF_CT_PROTO_GRE
479 266
480config IP_NF_NAT_FTP
481 tristate
482 depends on IP_NF_IPTABLES && IP_NF_CONNTRACK && IP_NF_NAT
483 default IP_NF_NAT && IP_NF_FTP
484
485config NF_NAT_FTP 267config NF_NAT_FTP
486 tristate 268 tristate
487 depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT 269 depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT
488 default NF_NAT && NF_CONNTRACK_FTP 270 default NF_NAT && NF_CONNTRACK_FTP
489 271
490config IP_NF_NAT_IRC
491 tristate
492 depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
493 default IP_NF_NAT if IP_NF_IRC=y
494 default m if IP_NF_IRC=m
495
496config NF_NAT_IRC 272config NF_NAT_IRC
497 tristate 273 tristate
498 depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT 274 depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT
499 default NF_NAT && NF_CONNTRACK_IRC 275 default NF_NAT && NF_CONNTRACK_IRC
500 276
501config IP_NF_NAT_TFTP
502 tristate
503 depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
504 default IP_NF_NAT if IP_NF_TFTP=y
505 default m if IP_NF_TFTP=m
506
507config NF_NAT_TFTP 277config NF_NAT_TFTP
508 tristate 278 tristate
509 depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT 279 depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT
510 default NF_NAT && NF_CONNTRACK_TFTP 280 default NF_NAT && NF_CONNTRACK_TFTP
511 281
512config IP_NF_NAT_AMANDA
513 tristate
514 depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
515 default IP_NF_NAT if IP_NF_AMANDA=y
516 default m if IP_NF_AMANDA=m
517
518config NF_NAT_AMANDA 282config NF_NAT_AMANDA
519 tristate 283 tristate
520 depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT 284 depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT
521 default NF_NAT && NF_CONNTRACK_AMANDA 285 default NF_NAT && NF_CONNTRACK_AMANDA
522 286
523config IP_NF_NAT_PPTP
524 tristate
525 depends on IP_NF_NAT!=n && IP_NF_PPTP!=n
526 default IP_NF_NAT if IP_NF_PPTP=y
527 default m if IP_NF_PPTP=m
528
529config NF_NAT_PPTP 287config NF_NAT_PPTP
530 tristate 288 tristate
531 depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT 289 depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT
532 default NF_NAT && NF_CONNTRACK_PPTP 290 default NF_NAT && NF_CONNTRACK_PPTP
533 select NF_NAT_PROTO_GRE 291 select NF_NAT_PROTO_GRE
534 292
535config IP_NF_NAT_H323
536 tristate
537 depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
538 default IP_NF_NAT if IP_NF_H323=y
539 default m if IP_NF_H323=m
540
541config NF_NAT_H323 293config NF_NAT_H323
542 tristate 294 tristate
543 depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT 295 depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT
544 default NF_NAT && NF_CONNTRACK_H323 296 default NF_NAT && NF_CONNTRACK_H323
545 297
546config IP_NF_NAT_SIP
547 tristate
548 depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
549 default IP_NF_NAT if IP_NF_SIP=y
550 default m if IP_NF_SIP=m
551
552config NF_NAT_SIP 298config NF_NAT_SIP
553 tristate 299 tristate
554 depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT 300 depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT
@@ -606,9 +352,8 @@ config IP_NF_TARGET_TTL
606config IP_NF_TARGET_CLUSTERIP 352config IP_NF_TARGET_CLUSTERIP
607 tristate "CLUSTERIP target support (EXPERIMENTAL)" 353 tristate "CLUSTERIP target support (EXPERIMENTAL)"
608 depends on IP_NF_MANGLE && EXPERIMENTAL 354 depends on IP_NF_MANGLE && EXPERIMENTAL
609 depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 355 depends on NF_CONNTRACK_IPV4
610 select IP_NF_CONNTRACK_MARK if IP_NF_CONNTRACK 356 select NF_CONNTRACK_MARK
611 select NF_CONNTRACK_MARK if NF_CONNTRACK_IPV4
612 help 357 help
613 The CLUSTERIP target allows you to build load-balancing clusters of 358 The CLUSTERIP target allows you to build load-balancing clusters of
614 network servers without having a dedicated load-balancing 359 network servers without having a dedicated load-balancing
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 6625ec68180c..409d273f6f82 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -2,8 +2,6 @@
2# Makefile for the netfilter modules on top of IPv4. 2# Makefile for the netfilter modules on top of IPv4.
3# 3#
4 4
5# objects for the standalone - connection tracking / NAT
6ip_conntrack-objs := ip_conntrack_standalone.o ip_conntrack_core.o ip_conntrack_proto_generic.o ip_conntrack_proto_tcp.o ip_conntrack_proto_udp.o ip_conntrack_proto_icmp.o
7# objects for l3 independent conntrack 5# objects for l3 independent conntrack
8nf_conntrack_ipv4-objs := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o 6nf_conntrack_ipv4-objs := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
9ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y) 7ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y)
@@ -12,53 +10,14 @@ nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o
12endif 10endif
13endif 11endif
14 12
15ip_nat-objs := ip_nat_core.o ip_nat_helper.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o 13nf_nat-objs := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o
16nf_nat-objs := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o
17ifneq ($(CONFIG_NF_NAT),)
18iptable_nat-objs := nf_nat_rule.o nf_nat_standalone.o 14iptable_nat-objs := nf_nat_rule.o nf_nat_standalone.o
19else
20iptable_nat-objs := ip_nat_rule.o ip_nat_standalone.o
21endif
22
23ip_conntrack_pptp-objs := ip_conntrack_helper_pptp.o ip_conntrack_proto_gre.o
24ip_nat_pptp-objs := ip_nat_helper_pptp.o ip_nat_proto_gre.o
25
26ip_conntrack_h323-objs := ip_conntrack_helper_h323.o ../../netfilter/nf_conntrack_h323_asn1.o
27ip_nat_h323-objs := ip_nat_helper_h323.o
28 15
29# connection tracking 16# connection tracking
30obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
31obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o 17obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
32 18
33obj-$(CONFIG_IP_NF_NAT) += ip_nat.o
34obj-$(CONFIG_NF_NAT) += nf_nat.o 19obj-$(CONFIG_NF_NAT) += nf_nat.o
35 20
36# conntrack netlink interface
37obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o
38
39
40# SCTP protocol connection tracking
41obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o
42
43# connection tracking helpers
44obj-$(CONFIG_IP_NF_H323) += ip_conntrack_h323.o
45obj-$(CONFIG_IP_NF_PPTP) += ip_conntrack_pptp.o
46obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
47obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
48obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
49obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
50obj-$(CONFIG_IP_NF_SIP) += ip_conntrack_sip.o
51obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o
52
53# NAT helpers (ip_conntrack)
54obj-$(CONFIG_IP_NF_NAT_H323) += ip_nat_h323.o
55obj-$(CONFIG_IP_NF_NAT_PPTP) += ip_nat_pptp.o
56obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
57obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
58obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o
59obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o
60obj-$(CONFIG_IP_NF_NAT_SIP) += ip_nat_sip.o
61
62# NAT helpers (nf_conntrack) 21# NAT helpers (nf_conntrack)
63obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o 22obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
64obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o 23obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
@@ -78,7 +37,6 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
78# the three instances of ip_tables 37# the three instances of ip_tables
79obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o 38obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
80obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o 39obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
81obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
82obj-$(CONFIG_NF_NAT) += iptable_nat.o 40obj-$(CONFIG_NF_NAT) += iptable_nat.o
83obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o 41obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
84 42
@@ -100,7 +58,6 @@ obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
100obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o 58obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
101obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o 59obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
102obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o 60obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o
103obj-$(CONFIG_IP_NF_NAT_SNMP_BASIC) += ip_nat_snmp_basic.o
104obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o 61obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o
105obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o 62obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
106obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o 63obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 57b0221f9e24..cae41215e3c7 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -245,7 +245,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb,
245 e = get_entry(table_base, private->hook_entry[hook]); 245 e = get_entry(table_base, private->hook_entry[hook]);
246 back = get_entry(table_base, private->underflow[hook]); 246 back = get_entry(table_base, private->underflow[hook]);
247 247
248 arp = (*pskb)->nh.arph; 248 arp = arp_hdr(*pskb);
249 do { 249 do {
250 if (arp_packet_match(arp, (*pskb)->dev, indev, outdev, &e->arp)) { 250 if (arp_packet_match(arp, (*pskb)->dev, indev, outdev, &e->arp)) {
251 struct arpt_entry_target *t; 251 struct arpt_entry_target *t;
@@ -297,7 +297,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb,
297 t->data); 297 t->data);
298 298
299 /* Target might have changed stuff. */ 299 /* Target might have changed stuff. */
300 arp = (*pskb)->nh.arph; 300 arp = arp_hdr(*pskb);
301 301
302 if (verdict == ARPT_CONTINUE) 302 if (verdict == ARPT_CONTINUE)
303 e = (void *)e + e->next_offset; 303 e = (void *)e + e->next_offset;
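
The same accessor pattern covers ARP: arp_hdr() is the analogue of ip_hdr(). A minimal sketch of its definition (see <linux/if_arp.h>):

	static inline struct arphdr *arp_hdr(const struct sk_buff *skb)
	{
		return (struct arphdr *)skb_network_header(skb);
	}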
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index 709db4d3f48f..6298d404e7c7 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -30,35 +30,35 @@ target(struct sk_buff **pskb,
30 *pskb = nskb; 30 *pskb = nskb;
31 } 31 }
32 32
33 arp = (*pskb)->nh.arph; 33 arp = arp_hdr(*pskb);
34 arpptr = (*pskb)->nh.raw + sizeof(*arp); 34 arpptr = skb_network_header(*pskb) + sizeof(*arp);
35 pln = arp->ar_pln; 35 pln = arp->ar_pln;
36 hln = arp->ar_hln; 36 hln = arp->ar_hln;
37 /* We assume that pln and hln were checked in the match */ 37 /* We assume that pln and hln were checked in the match */
38 if (mangle->flags & ARPT_MANGLE_SDEV) { 38 if (mangle->flags & ARPT_MANGLE_SDEV) {
39 if (ARPT_DEV_ADDR_LEN_MAX < hln || 39 if (ARPT_DEV_ADDR_LEN_MAX < hln ||
40 (arpptr + hln > (**pskb).tail)) 40 (arpptr + hln > skb_tail_pointer(*pskb)))
41 return NF_DROP; 41 return NF_DROP;
42 memcpy(arpptr, mangle->src_devaddr, hln); 42 memcpy(arpptr, mangle->src_devaddr, hln);
43 } 43 }
44 arpptr += hln; 44 arpptr += hln;
45 if (mangle->flags & ARPT_MANGLE_SIP) { 45 if (mangle->flags & ARPT_MANGLE_SIP) {
46 if (ARPT_MANGLE_ADDR_LEN_MAX < pln || 46 if (ARPT_MANGLE_ADDR_LEN_MAX < pln ||
47 (arpptr + pln > (**pskb).tail)) 47 (arpptr + pln > skb_tail_pointer(*pskb)))
48 return NF_DROP; 48 return NF_DROP;
49 memcpy(arpptr, &mangle->u_s.src_ip, pln); 49 memcpy(arpptr, &mangle->u_s.src_ip, pln);
50 } 50 }
51 arpptr += pln; 51 arpptr += pln;
52 if (mangle->flags & ARPT_MANGLE_TDEV) { 52 if (mangle->flags & ARPT_MANGLE_TDEV) {
53 if (ARPT_DEV_ADDR_LEN_MAX < hln || 53 if (ARPT_DEV_ADDR_LEN_MAX < hln ||
54 (arpptr + hln > (**pskb).tail)) 54 (arpptr + hln > skb_tail_pointer(*pskb)))
55 return NF_DROP; 55 return NF_DROP;
56 memcpy(arpptr, mangle->tgt_devaddr, hln); 56 memcpy(arpptr, mangle->tgt_devaddr, hln);
57 } 57 }
58 arpptr += hln; 58 arpptr += hln;
59 if (mangle->flags & ARPT_MANGLE_TIP) { 59 if (mangle->flags & ARPT_MANGLE_TIP) {
60 if (ARPT_MANGLE_ADDR_LEN_MAX < pln || 60 if (ARPT_MANGLE_ADDR_LEN_MAX < pln ||
61 (arpptr + pln > (**pskb).tail)) 61 (arpptr + pln > skb_tail_pointer(*pskb)))
62 return NF_DROP; 62 return NF_DROP;
63 memcpy(arpptr, &mangle->u_t.tgt_ip, pln); 63 memcpy(arpptr, &mangle->u_t.tgt_ip, pln);
64 } 64 }
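
Every write in the mangle target is re-checked against the end of the skb's linear data. skb_tail_pointer() hides whether skb->tail is a raw pointer or, with NET_SKBUFF_DATA_USES_OFFSET, an offset from skb->head. A minimal sketch of the check, using a hypothetical helper name:

	/* does [p, p + len) lie entirely within the linear data area? */
	static int in_linear_data(const struct sk_buff *skb,
				  const unsigned char *p, unsigned int len)
	{
		return p + len <= skb_tail_pointer(skb);
	}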
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
deleted file mode 100644
index 4f561f52c83a..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ /dev/null
@@ -1,229 +0,0 @@
1/* Amanda extension for IP connection tracking, Version 0.2
2 * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
3 * based on HW's ip_conntrack_irc.c as well as other modules
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version
8 * 2 of the License, or (at your option) any later version.
9 *
10 * Module load syntax:
11 * insmod ip_conntrack_amanda.o [master_timeout=n]
12 *
13 * Where master_timeout is the timeout (in seconds) of the master
 14 * connection (port 10080). This defaults to 5 minutes, but if
15 * your clients take longer than 5 minutes to do their work
16 * before getting back to the Amanda server, you can increase
17 * this value.
18 *
19 */
20#include <linux/kernel.h>
21#include <linux/module.h>
22#include <linux/moduleparam.h>
23#include <linux/textsearch.h>
24#include <linux/skbuff.h>
25#include <linux/in.h>
26#include <linux/ip.h>
27#include <linux/udp.h>
28
29#include <linux/netfilter.h>
30#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
31#include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
32
33static unsigned int master_timeout = 300;
34static char *ts_algo = "kmp";
35
36MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
37MODULE_DESCRIPTION("Amanda connection tracking module");
38MODULE_LICENSE("GPL");
39module_param(master_timeout, uint, 0600);
40MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
41module_param(ts_algo, charp, 0400);
42MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)");
43
44unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
45 enum ip_conntrack_info ctinfo,
46 unsigned int matchoff,
47 unsigned int matchlen,
48 struct ip_conntrack_expect *exp);
49EXPORT_SYMBOL_GPL(ip_nat_amanda_hook);
50
51enum amanda_strings {
52 SEARCH_CONNECT,
53 SEARCH_NEWLINE,
54 SEARCH_DATA,
55 SEARCH_MESG,
56 SEARCH_INDEX,
57};
58
59static struct {
60 char *string;
61 size_t len;
62 struct ts_config *ts;
63} search[] = {
64 [SEARCH_CONNECT] = {
65 .string = "CONNECT ",
66 .len = 8,
67 },
68 [SEARCH_NEWLINE] = {
69 .string = "\n",
70 .len = 1,
71 },
72 [SEARCH_DATA] = {
73 .string = "DATA ",
74 .len = 5,
75 },
76 [SEARCH_MESG] = {
77 .string = "MESG ",
78 .len = 5,
79 },
80 [SEARCH_INDEX] = {
81 .string = "INDEX ",
82 .len = 6,
83 },
84};
85
86static int help(struct sk_buff **pskb,
87 struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
88{
89 struct ts_state ts;
90 struct ip_conntrack_expect *exp;
91 unsigned int dataoff, start, stop, off, i;
92 char pbuf[sizeof("65535")], *tmp;
93 u_int16_t port, len;
94 int ret = NF_ACCEPT;
95 typeof(ip_nat_amanda_hook) ip_nat_amanda;
96
97 /* Only look at packets from the Amanda server */
98 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
99 return NF_ACCEPT;
100
101 /* increase the UDP timeout of the master connection as replies from
102 * Amanda clients to the server can be quite delayed */
103 ip_ct_refresh(ct, *pskb, master_timeout * HZ);
104
105 /* No data? */
106 dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
107 if (dataoff >= (*pskb)->len) {
108 if (net_ratelimit())
109 printk("amanda_help: skblen = %u\n", (*pskb)->len);
110 return NF_ACCEPT;
111 }
112
113 memset(&ts, 0, sizeof(ts));
114 start = skb_find_text(*pskb, dataoff, (*pskb)->len,
115 search[SEARCH_CONNECT].ts, &ts);
116 if (start == UINT_MAX)
117 goto out;
118 start += dataoff + search[SEARCH_CONNECT].len;
119
120 memset(&ts, 0, sizeof(ts));
121 stop = skb_find_text(*pskb, start, (*pskb)->len,
122 search[SEARCH_NEWLINE].ts, &ts);
123 if (stop == UINT_MAX)
124 goto out;
125 stop += start;
126
127 for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) {
128 memset(&ts, 0, sizeof(ts));
129 off = skb_find_text(*pskb, start, stop, search[i].ts, &ts);
130 if (off == UINT_MAX)
131 continue;
132 off += start + search[i].len;
133
134 len = min_t(unsigned int, sizeof(pbuf) - 1, stop - off);
135 if (skb_copy_bits(*pskb, off, pbuf, len))
136 break;
137 pbuf[len] = '\0';
138
139 port = simple_strtoul(pbuf, &tmp, 10);
140 len = tmp - pbuf;
141 if (port == 0 || len > 5)
142 break;
143
144 exp = ip_conntrack_expect_alloc(ct);
145 if (exp == NULL) {
146 ret = NF_DROP;
147 goto out;
148 }
149
150 exp->expectfn = NULL;
151 exp->flags = 0;
152
153 exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
154 exp->tuple.src.u.tcp.port = 0;
155 exp->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip;
156 exp->tuple.dst.protonum = IPPROTO_TCP;
157 exp->tuple.dst.u.tcp.port = htons(port);
158
159 exp->mask.src.ip = htonl(0xFFFFFFFF);
160 exp->mask.src.u.tcp.port = 0;
161 exp->mask.dst.ip = htonl(0xFFFFFFFF);
162 exp->mask.dst.protonum = 0xFF;
163 exp->mask.dst.u.tcp.port = htons(0xFFFF);
164
165 /* RCU read locked by nf_hook_slow */
166 ip_nat_amanda = rcu_dereference(ip_nat_amanda_hook);
167 if (ip_nat_amanda)
168 ret = ip_nat_amanda(pskb, ctinfo, off - dataoff,
169 len, exp);
170 else if (ip_conntrack_expect_related(exp) != 0)
171 ret = NF_DROP;
172 ip_conntrack_expect_put(exp);
173 }
174
175out:
176 return ret;
177}
178
179static struct ip_conntrack_helper amanda_helper = {
180 .max_expected = 3,
181 .timeout = 180,
182 .me = THIS_MODULE,
183 .help = help,
184 .name = "amanda",
185
186 .tuple = { .src = { .u = { .udp = {.port = __constant_htons(10080) } } },
187 .dst = { .protonum = IPPROTO_UDP },
188 },
189 .mask = { .src = { .u = { 0xFFFF } },
190 .dst = { .protonum = 0xFF },
191 },
192};
193
194static void __exit ip_conntrack_amanda_fini(void)
195{
196 int i;
197
198 ip_conntrack_helper_unregister(&amanda_helper);
199 for (i = 0; i < ARRAY_SIZE(search); i++)
200 textsearch_destroy(search[i].ts);
201}
202
203static int __init ip_conntrack_amanda_init(void)
204{
205 int ret, i;
206
207 ret = -ENOMEM;
208 for (i = 0; i < ARRAY_SIZE(search); i++) {
209 search[i].ts = textsearch_prepare(ts_algo, search[i].string,
210 search[i].len,
211 GFP_KERNEL, TS_AUTOLOAD);
212 if (search[i].ts == NULL)
213 goto err;
214 }
215 ret = ip_conntrack_helper_register(&amanda_helper);
216 if (ret < 0)
217 goto err;
218 return 0;
219
220err:
221 for (; i >= 0; i--) {
222 if (search[i].ts)
223 textsearch_destroy(search[i].ts);
224 }
225 return ret;
226}
227
228module_init(ip_conntrack_amanda_init);
229module_exit(ip_conntrack_amanda_fini);
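
The removed helper is a complete worked example of the kernel textsearch API: textsearch_prepare() compiles a pattern for a named algorithm, skb_find_text() scans the (possibly non-linear) skb, and textsearch_destroy() releases the pattern. A minimal sketch of the same calls against a flat buffer, error handling reduced to the essentials:

	#include <linux/err.h>
	#include <linux/textsearch.h>

	static unsigned int find_connect(const void *buf, unsigned int len)
	{
		struct ts_config *ts;
		struct ts_state state;
		unsigned int pos;

		ts = textsearch_prepare("kmp", "CONNECT ", 8,
					GFP_KERNEL, TS_AUTOLOAD);
		if (IS_ERR(ts))
			return UINT_MAX;
		pos = textsearch_find_continuous(ts, &state, buf, len);
		textsearch_destroy(ts);
		return pos;	/* UINT_MAX when the pattern is absent */
	}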
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
deleted file mode 100644
index 23b99ae2cc37..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ /dev/null
@@ -1,1550 +0,0 @@
1/* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
4
5/* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 *
12 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13 * - new API and handling of conntrack/nat helpers
14 * - now capable of multiple expectations for one master
15 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16 * - add usage/reference counts to ip_conntrack_expect
17 * - export ip_conntrack[_expect]_{find_get,put} functions
18 * */
19
20#include <linux/types.h>
21#include <linux/icmp.h>
22#include <linux/ip.h>
23#include <linux/netfilter.h>
24#include <linux/netfilter_ipv4.h>
25#include <linux/module.h>
26#include <linux/skbuff.h>
27#include <linux/proc_fs.h>
28#include <linux/vmalloc.h>
29#include <net/checksum.h>
30#include <net/ip.h>
31#include <linux/stddef.h>
32#include <linux/sysctl.h>
33#include <linux/slab.h>
34#include <linux/random.h>
35#include <linux/jhash.h>
36#include <linux/err.h>
37#include <linux/percpu.h>
38#include <linux/moduleparam.h>
39#include <linux/notifier.h>
40
41/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
 42 registrations, conntrack timers */
43#include <linux/netfilter_ipv4/ip_conntrack.h>
44#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
45#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
46#include <linux/netfilter_ipv4/ip_conntrack_core.h>
47
48#define IP_CONNTRACK_VERSION "2.4"
49
50#if 0
51#define DEBUGP printk
52#else
53#define DEBUGP(format, args...)
54#endif
55
56DEFINE_RWLOCK(ip_conntrack_lock);
57
58/* ip_conntrack_standalone needs this */
59atomic_t ip_conntrack_count = ATOMIC_INIT(0);
60
61void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
62LIST_HEAD(ip_conntrack_expect_list);
63struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO] __read_mostly;
64static LIST_HEAD(helpers);
65unsigned int ip_conntrack_htable_size __read_mostly = 0;
66int ip_conntrack_max __read_mostly;
67struct list_head *ip_conntrack_hash __read_mostly;
68static struct kmem_cache *ip_conntrack_cachep __read_mostly;
69static struct kmem_cache *ip_conntrack_expect_cachep __read_mostly;
70struct ip_conntrack ip_conntrack_untracked;
71unsigned int ip_ct_log_invalid __read_mostly;
72static LIST_HEAD(unconfirmed);
73static int ip_conntrack_vmalloc __read_mostly;
74
75static unsigned int ip_conntrack_next_id;
76static unsigned int ip_conntrack_expect_next_id;
77#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
78ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
79ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
80
81DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
82
83/* deliver cached events and clear cache entry - must be called with locally
84 * disabled softirqs */
85static inline void
86__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
87{
88 DEBUGP("ecache: delivering events for %p\n", ecache->ct);
89 if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
90 atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
91 ecache->ct);
92 ecache->events = 0;
93 ip_conntrack_put(ecache->ct);
94 ecache->ct = NULL;
95}
96
97/* Deliver all cached events for a particular conntrack. This is called
98 * by code prior to async packet handling or freeing the skb */
99void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
100{
101 struct ip_conntrack_ecache *ecache;
102
103 local_bh_disable();
104 ecache = &__get_cpu_var(ip_conntrack_ecache);
105 if (ecache->ct == ct)
106 __ip_ct_deliver_cached_events(ecache);
107 local_bh_enable();
108}
109
110void __ip_ct_event_cache_init(struct ip_conntrack *ct)
111{
112 struct ip_conntrack_ecache *ecache;
113
114 /* take care of delivering potentially old events */
115 ecache = &__get_cpu_var(ip_conntrack_ecache);
116 BUG_ON(ecache->ct == ct);
117 if (ecache->ct)
118 __ip_ct_deliver_cached_events(ecache);
119 /* initialize for this conntrack/packet */
120 ecache->ct = ct;
121 nf_conntrack_get(&ct->ct_general);
122}
123
 124/* flush the event cache - touches other CPUs' data and must not be called while
125 * packets are still passing through the code */
126static void ip_ct_event_cache_flush(void)
127{
128 struct ip_conntrack_ecache *ecache;
129 int cpu;
130
131 for_each_possible_cpu(cpu) {
132 ecache = &per_cpu(ip_conntrack_ecache, cpu);
133 if (ecache->ct)
134 ip_conntrack_put(ecache->ct);
135 }
136}
137#else
138static inline void ip_ct_event_cache_flush(void) {}
139#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
140
141DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
142
143static int ip_conntrack_hash_rnd_initted;
144static unsigned int ip_conntrack_hash_rnd;
145
146static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
147 unsigned int size, unsigned int rnd)
148{
149 return (jhash_3words((__force u32)tuple->src.ip,
150 ((__force u32)tuple->dst.ip ^ tuple->dst.protonum),
151 (tuple->src.u.all | (tuple->dst.u.all << 16)),
152 rnd) % size);
153}
154
155static u_int32_t
156hash_conntrack(const struct ip_conntrack_tuple *tuple)
157{
158 return __hash_conntrack(tuple, ip_conntrack_htable_size,
159 ip_conntrack_hash_rnd);
160}
161
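
__hash_conntrack() packs the whole tuple into three 32-bit words for jhash_3words(): the source address, the destination address xor'ed with the protocol number, and both ports combined into one word. A condensed model of the bucket selection (bucket_of is a hypothetical helper; jhash_3words() itself comes from <linux/jhash.h>):

	static unsigned int bucket_of(u32 saddr, u32 daddr, u8 proto,
				      u16 sport, u16 dport,
				      unsigned int size, u32 rnd)
	{
		return jhash_3words(saddr, daddr ^ proto,
				    sport | ((u32)dport << 16), rnd) % size;
	}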
162int
163ip_ct_get_tuple(const struct iphdr *iph,
164 const struct sk_buff *skb,
165 unsigned int dataoff,
166 struct ip_conntrack_tuple *tuple,
167 const struct ip_conntrack_protocol *protocol)
168{
169 /* Never happen */
170 if (iph->frag_off & htons(IP_OFFSET)) {
171 printk("ip_conntrack_core: Frag of proto %u.\n",
172 iph->protocol);
173 return 0;
174 }
175
176 tuple->src.ip = iph->saddr;
177 tuple->dst.ip = iph->daddr;
178 tuple->dst.protonum = iph->protocol;
179 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
180
181 return protocol->pkt_to_tuple(skb, dataoff, tuple);
182}
183
184int
185ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
186 const struct ip_conntrack_tuple *orig,
187 const struct ip_conntrack_protocol *protocol)
188{
189 inverse->src.ip = orig->dst.ip;
190 inverse->dst.ip = orig->src.ip;
191 inverse->dst.protonum = orig->dst.protonum;
192 inverse->dst.dir = !orig->dst.dir;
193
194 return protocol->invert_tuple(inverse, orig);
195}
196
197
198/* ip_conntrack_expect helper functions */
199void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
200{
201 IP_NF_ASSERT(!timer_pending(&exp->timeout));
202 list_del(&exp->list);
203 CONNTRACK_STAT_INC(expect_delete);
204 exp->master->expecting--;
205 ip_conntrack_expect_put(exp);
206}
207
208static void expectation_timed_out(unsigned long ul_expect)
209{
210 struct ip_conntrack_expect *exp = (void *)ul_expect;
211
212 write_lock_bh(&ip_conntrack_lock);
213 ip_ct_unlink_expect(exp);
214 write_unlock_bh(&ip_conntrack_lock);
215 ip_conntrack_expect_put(exp);
216}
217
218struct ip_conntrack_expect *
219__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
220{
221 struct ip_conntrack_expect *i;
222
223 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
224 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask))
225 return i;
226 }
227 return NULL;
228}
229
 230/* Just find an expectation corresponding to a tuple. */
231struct ip_conntrack_expect *
232ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
233{
234 struct ip_conntrack_expect *i;
235
236 read_lock_bh(&ip_conntrack_lock);
237 i = __ip_conntrack_expect_find(tuple);
238 if (i)
239 atomic_inc(&i->use);
240 read_unlock_bh(&ip_conntrack_lock);
241
242 return i;
243}
244
 245/* If an expectation for this connection is found, it gets deleted from
 246 * the global list, then returned. */
247static struct ip_conntrack_expect *
248find_expectation(const struct ip_conntrack_tuple *tuple)
249{
250 struct ip_conntrack_expect *i;
251
252 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
253 /* If master is not in hash table yet (ie. packet hasn't left
254 this machine yet), how can other end know about expected?
255 Hence these are not the droids you are looking for (if
256 master ct never got confirmed, we'd hold a reference to it
257 and weird things would happen to future packets). */
258 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
259 && is_confirmed(i->master)) {
260 if (i->flags & IP_CT_EXPECT_PERMANENT) {
261 atomic_inc(&i->use);
262 return i;
263 } else if (del_timer(&i->timeout)) {
264 ip_ct_unlink_expect(i);
265 return i;
266 }
267 }
268 }
269 return NULL;
270}
271
272/* delete all expectations for this conntrack */
273void ip_ct_remove_expectations(struct ip_conntrack *ct)
274{
275 struct ip_conntrack_expect *i, *tmp;
276
 277	/* Optimization: most connections never expect any others. */
278 if (ct->expecting == 0)
279 return;
280
281 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
282 if (i->master == ct && del_timer(&i->timeout)) {
283 ip_ct_unlink_expect(i);
284 ip_conntrack_expect_put(i);
285 }
286 }
287}
288
289static void
290clean_from_lists(struct ip_conntrack *ct)
291{
292 DEBUGP("clean_from_lists(%p)\n", ct);
293 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
294 list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);
295
296 /* Destroy all pending expectations */
297 ip_ct_remove_expectations(ct);
298}
299
300static void
301destroy_conntrack(struct nf_conntrack *nfct)
302{
303 struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
304 struct ip_conntrack_protocol *proto;
305 struct ip_conntrack_helper *helper;
306 typeof(ip_conntrack_destroyed) destroyed;
307
308 DEBUGP("destroy_conntrack(%p)\n", ct);
309 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
310 IP_NF_ASSERT(!timer_pending(&ct->timeout));
311
312 ip_conntrack_event(IPCT_DESTROY, ct);
313 set_bit(IPS_DYING_BIT, &ct->status);
314
315 helper = ct->helper;
316 if (helper && helper->destroy)
317 helper->destroy(ct);
318
319 /* To make sure we don't get any weird locking issues here:
320 * destroy_conntrack() MUST NOT be called with a write lock
321 * to ip_conntrack_lock!!! -HW */
322 rcu_read_lock();
323 proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
324 if (proto && proto->destroy)
325 proto->destroy(ct);
326
327 destroyed = rcu_dereference(ip_conntrack_destroyed);
328 if (destroyed)
329 destroyed(ct);
330
331 rcu_read_unlock();
332
333 write_lock_bh(&ip_conntrack_lock);
334	/* Expectations will have been removed in clean_from_lists,
335	 * except that TFTP can create an expectation on the first packet,
336	 * before the connection is in the list, so we need to clean
337	 * here, too. */
338 ip_ct_remove_expectations(ct);
339
340 /* We overload first tuple to link into unconfirmed list. */
341 if (!is_confirmed(ct)) {
342 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
343 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
344 }
345
346 CONNTRACK_STAT_INC(delete);
347 write_unlock_bh(&ip_conntrack_lock);
348
349 if (ct->master)
350 ip_conntrack_put(ct->master);
351
352 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
353 ip_conntrack_free(ct);
354}
355
356static void death_by_timeout(unsigned long ul_conntrack)
357{
358 struct ip_conntrack *ct = (void *)ul_conntrack;
359
360 write_lock_bh(&ip_conntrack_lock);
361 /* Inside lock so preempt is disabled on module removal path.
362 * Otherwise we can get spurious warnings. */
363 CONNTRACK_STAT_INC(delete_list);
364 clean_from_lists(ct);
365 write_unlock_bh(&ip_conntrack_lock);
366 ip_conntrack_put(ct);
367}
368
369struct ip_conntrack_tuple_hash *
370__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
371 const struct ip_conntrack *ignored_conntrack)
372{
373 struct ip_conntrack_tuple_hash *h;
374 unsigned int hash = hash_conntrack(tuple);
375
376 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
377 if (tuplehash_to_ctrack(h) != ignored_conntrack &&
378 ip_ct_tuple_equal(tuple, &h->tuple)) {
379 CONNTRACK_STAT_INC(found);
380 return h;
381 }
382 CONNTRACK_STAT_INC(searched);
383 }
384
385 return NULL;
386}
387
388/* Find a connection corresponding to a tuple. */
389struct ip_conntrack_tuple_hash *
390ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
391 const struct ip_conntrack *ignored_conntrack)
392{
393 struct ip_conntrack_tuple_hash *h;
394
395 read_lock_bh(&ip_conntrack_lock);
396 h = __ip_conntrack_find(tuple, ignored_conntrack);
397 if (h)
398 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
399 read_unlock_bh(&ip_conntrack_lock);
400
401 return h;
402}
403
404static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
405 unsigned int hash,
406 unsigned int repl_hash)
407{
408 ct->id = ++ip_conntrack_next_id;
409 list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
410 &ip_conntrack_hash[hash]);
411 list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
412 &ip_conntrack_hash[repl_hash]);
413}
414
415void ip_conntrack_hash_insert(struct ip_conntrack *ct)
416{
417 unsigned int hash, repl_hash;
418
419 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
420 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
421
422 write_lock_bh(&ip_conntrack_lock);
423 __ip_conntrack_hash_insert(ct, hash, repl_hash);
424 write_unlock_bh(&ip_conntrack_lock);
425}
426
427/* Confirm a connection given skb; places it in hash table */
428int
429__ip_conntrack_confirm(struct sk_buff **pskb)
430{
431 unsigned int hash, repl_hash;
432 struct ip_conntrack_tuple_hash *h;
433 struct ip_conntrack *ct;
434 enum ip_conntrack_info ctinfo;
435
436 ct = ip_conntrack_get(*pskb, &ctinfo);
437
438	/* ipt_REJECT uses ip_conntrack_attach to attach related
439	   ICMP/TCP RST packets in the other direction. The actual
440	   packet which created the connection will be IP_CT_NEW or,
441	   for an expected connection, IP_CT_RELATED. */
442 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
443 return NF_ACCEPT;
444
445 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
446 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
447
448	/* We're not in the hash table, and we refuse to set up related
449	   connections for unconfirmed conns. But packet copies and
450	   REJECT will give spurious warnings here. */
451	/* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
452
453	/* No external references means no one else could have
454	   confirmed us. */
455 IP_NF_ASSERT(!is_confirmed(ct));
456 DEBUGP("Confirming conntrack %p\n", ct);
457
458 write_lock_bh(&ip_conntrack_lock);
459
460	/* See if there's one in the list already, including reverse:
461	   NAT could have grabbed it without realizing, since we're
462	   not in the hash. If there is, we lost the race. */
463 list_for_each_entry(h, &ip_conntrack_hash[hash], list)
464 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
465 &h->tuple))
466 goto out;
467 list_for_each_entry(h, &ip_conntrack_hash[repl_hash], list)
468 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
469 &h->tuple))
470 goto out;
471
472 /* Remove from unconfirmed list */
473 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
474
475 __ip_conntrack_hash_insert(ct, hash, repl_hash);
476 /* Timer relative to confirmation time, not original
477 setting time, otherwise we'd get timer wrap in
478 weird delay cases. */
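	/* (Before confirmation, protocol/helper code stores a relative
	 * interval in timeout.expires; see __ip_ct_refresh_acct() for the
	 * unconfirmed case. Adding jiffies here turns it into an absolute
	 * expiry.) */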
479 ct->timeout.expires += jiffies;
480 add_timer(&ct->timeout);
481 atomic_inc(&ct->ct_general.use);
482 set_bit(IPS_CONFIRMED_BIT, &ct->status);
483 CONNTRACK_STAT_INC(insert);
484 write_unlock_bh(&ip_conntrack_lock);
485 if (ct->helper)
486 ip_conntrack_event_cache(IPCT_HELPER, *pskb);
487#ifdef CONFIG_IP_NF_NAT_NEEDED
488 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
489 test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
490 ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
491#endif
492 ip_conntrack_event_cache(master_ct(ct) ?
493 IPCT_RELATED : IPCT_NEW, *pskb);
494
495 return NF_ACCEPT;
496
497out:
498 CONNTRACK_STAT_INC(insert_failed);
499 write_unlock_bh(&ip_conntrack_lock);
500 return NF_DROP;
501}
502
503/* Returns true if a connection corresponds to the tuple (required
504   for NAT). */
505int
506ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
507 const struct ip_conntrack *ignored_conntrack)
508{
509 struct ip_conntrack_tuple_hash *h;
510
511 read_lock_bh(&ip_conntrack_lock);
512 h = __ip_conntrack_find(tuple, ignored_conntrack);
513 read_unlock_bh(&ip_conntrack_lock);
514
515 return h != NULL;
516}
517
518/* There's a small race here where we may free a just-assured
519 connection. Too bad: we're in trouble anyway. */
520static int early_drop(struct list_head *chain)
521{
522 /* Traverse backwards: gives us oldest, which is roughly LRU */
523 struct ip_conntrack_tuple_hash *h;
524 struct ip_conntrack *ct = NULL, *tmp;
525 int dropped = 0;
526
527 read_lock_bh(&ip_conntrack_lock);
528 list_for_each_entry_reverse(h, chain, list) {
529 tmp = tuplehash_to_ctrack(h);
530 if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) {
531 ct = tmp;
532 atomic_inc(&ct->ct_general.use);
533 break;
534 }
535 }
536 read_unlock_bh(&ip_conntrack_lock);
537
538 if (!ct)
539 return dropped;
540
541 if (del_timer(&ct->timeout)) {
542 death_by_timeout((unsigned long)ct);
543 dropped = 1;
544 CONNTRACK_STAT_INC_ATOMIC(early_drop);
545 }
546 ip_conntrack_put(ct);
547 return dropped;
548}
549
550static struct ip_conntrack_helper *
551__ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
552{
553 struct ip_conntrack_helper *h;
554
555 list_for_each_entry(h, &helpers, list) {
556 if (ip_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask))
557 return h;
558 }
559 return NULL;
560}
561
562struct ip_conntrack_helper *
563ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
564{
565 struct ip_conntrack_helper *helper;
566
567	/* need ip_conntrack_lock to ensure that the helper exists until
568	 * try_module_get() is called */
569 read_lock_bh(&ip_conntrack_lock);
570
571 helper = __ip_conntrack_helper_find(tuple);
572 if (helper) {
573		/* need to increase the module usage count to ensure the
574		 * helper will not go away while the caller is e.g. busy
575		 * putting a conntrack that uses the helper into the hash */
576 if (!try_module_get(helper->me))
577 helper = NULL;
578 }
579
580 read_unlock_bh(&ip_conntrack_lock);
581
582 return helper;
583}
584
585void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
586{
587 module_put(helper->me);
588}
589
590struct ip_conntrack_protocol *
591__ip_conntrack_proto_find(u_int8_t protocol)
592{
593 return ip_ct_protos[protocol];
594}
595
596/* this is guaranteed to always return a valid protocol helper, since
597 * it falls back to generic_protocol */
598struct ip_conntrack_protocol *
599ip_conntrack_proto_find_get(u_int8_t protocol)
600{
601 struct ip_conntrack_protocol *p;
602
603 rcu_read_lock();
604 p = __ip_conntrack_proto_find(protocol);
605 if (p) {
606 if (!try_module_get(p->me))
607 p = &ip_conntrack_generic_protocol;
608 }
609 rcu_read_unlock();
610
611 return p;
612}
613
614void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
615{
616 module_put(p->me);
617}
618
619struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
620 struct ip_conntrack_tuple *repl)
621{
622 struct ip_conntrack *conntrack;
623
624 if (!ip_conntrack_hash_rnd_initted) {
625 get_random_bytes(&ip_conntrack_hash_rnd, 4);
626 ip_conntrack_hash_rnd_initted = 1;
627 }
628
629 /* We don't want any race condition at early drop stage */
630 atomic_inc(&ip_conntrack_count);
631
632 if (ip_conntrack_max
633 && atomic_read(&ip_conntrack_count) > ip_conntrack_max) {
634 unsigned int hash = hash_conntrack(orig);
635 /* Try dropping from this hash chain. */
636 if (!early_drop(&ip_conntrack_hash[hash])) {
637 atomic_dec(&ip_conntrack_count);
638 if (net_ratelimit())
639 printk(KERN_WARNING
640 "ip_conntrack: table full, dropping"
641 " packet.\n");
642 return ERR_PTR(-ENOMEM);
643 }
644 }
645
646 conntrack = kmem_cache_zalloc(ip_conntrack_cachep, GFP_ATOMIC);
647 if (!conntrack) {
648 DEBUGP("Can't allocate conntrack.\n");
649 atomic_dec(&ip_conntrack_count);
650 return ERR_PTR(-ENOMEM);
651 }
652
653 atomic_set(&conntrack->ct_general.use, 1);
654 conntrack->ct_general.destroy = destroy_conntrack;
655 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
656 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
657 /* Don't set timer yet: wait for confirmation */
658 init_timer(&conntrack->timeout);
659 conntrack->timeout.data = (unsigned long)conntrack;
660 conntrack->timeout.function = death_by_timeout;
661
662 return conntrack;
663}
664
665void
666ip_conntrack_free(struct ip_conntrack *conntrack)
667{
668 atomic_dec(&ip_conntrack_count);
669 kmem_cache_free(ip_conntrack_cachep, conntrack);
670}
671
672/* Allocate a new conntrack: we return ERR_PTR(-ENOMEM) if classification
673 * failed due to stress; NULL means it really is unclassifiable. */
674static struct ip_conntrack_tuple_hash *
675init_conntrack(struct ip_conntrack_tuple *tuple,
676 struct ip_conntrack_protocol *protocol,
677 struct sk_buff *skb)
678{
679 struct ip_conntrack *conntrack;
680 struct ip_conntrack_tuple repl_tuple;
681 struct ip_conntrack_expect *exp;
682
683 if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
684 DEBUGP("Can't invert tuple.\n");
685 return NULL;
686 }
687
688 conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
689 if (conntrack == NULL || IS_ERR(conntrack))
690 return (struct ip_conntrack_tuple_hash *)conntrack;
691
692 if (!protocol->new(conntrack, skb)) {
693 ip_conntrack_free(conntrack);
694 return NULL;
695 }
696
697 write_lock_bh(&ip_conntrack_lock);
698 exp = find_expectation(tuple);
699
700 if (exp) {
701 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
702 conntrack, exp);
703 /* Welcome, Mr. Bond. We've been expecting you... */
704 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
705 conntrack->master = exp->master;
706#ifdef CONFIG_IP_NF_CONNTRACK_MARK
707 conntrack->mark = exp->master->mark;
708#endif
709#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
710 defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
711	/* this is ugly, but there is no other place to put it */
712 conntrack->nat.masq_index = exp->master->nat.masq_index;
713#endif
714#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
715 conntrack->secmark = exp->master->secmark;
716#endif
717 nf_conntrack_get(&conntrack->master->ct_general);
718 CONNTRACK_STAT_INC(expect_new);
719 } else {
720 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
721
722 CONNTRACK_STAT_INC(new);
723 }
724
725 /* Overload tuple linked list to put us in unconfirmed list. */
726 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
727
728 write_unlock_bh(&ip_conntrack_lock);
729
730 if (exp) {
731 if (exp->expectfn)
732 exp->expectfn(conntrack, exp);
733 ip_conntrack_expect_put(exp);
734 }
735
736 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
737}
738
739/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
740static inline struct ip_conntrack *
741resolve_normal_ct(struct sk_buff *skb,
742 struct ip_conntrack_protocol *proto,
743 int *set_reply,
744 unsigned int hooknum,
745 enum ip_conntrack_info *ctinfo)
746{
747 struct ip_conntrack_tuple tuple;
748 struct ip_conntrack_tuple_hash *h;
749 struct ip_conntrack *ct;
750
751 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
752
753	if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
754			     &tuple, proto))
755 return NULL;
756
757 /* look for tuple match */
758 h = ip_conntrack_find_get(&tuple, NULL);
759 if (!h) {
760 h = init_conntrack(&tuple, proto, skb);
761 if (!h)
762 return NULL;
763 if (IS_ERR(h))
764 return (void *)h;
765 }
766 ct = tuplehash_to_ctrack(h);
767
768 /* It exists; we have (non-exclusive) reference. */
769 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
770 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
771		/* Please set the reply bit if this packet is OK */
772 *set_reply = 1;
773 } else {
774		/* Once we've had two-way comms, always ESTABLISHED. */
775 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
776 DEBUGP("ip_conntrack_in: normal packet for %p\n",
777 ct);
778 *ctinfo = IP_CT_ESTABLISHED;
779 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
780 DEBUGP("ip_conntrack_in: related packet for %p\n",
781 ct);
782 *ctinfo = IP_CT_RELATED;
783 } else {
784 DEBUGP("ip_conntrack_in: new packet for %p\n",
785 ct);
786 *ctinfo = IP_CT_NEW;
787 }
788 *set_reply = 0;
789 }
790 skb->nfct = &ct->ct_general;
791 skb->nfctinfo = *ctinfo;
792 return ct;
793}
794
795/* Netfilter hook itself. */
796unsigned int ip_conntrack_in(unsigned int hooknum,
797 struct sk_buff **pskb,
798 const struct net_device *in,
799 const struct net_device *out,
800 int (*okfn)(struct sk_buff *))
801{
802 struct ip_conntrack *ct;
803 enum ip_conntrack_info ctinfo;
804 struct ip_conntrack_protocol *proto;
805 int set_reply = 0;
806 int ret;
807
808 /* Previously seen (loopback or untracked)? Ignore. */
809 if ((*pskb)->nfct) {
810 CONNTRACK_STAT_INC_ATOMIC(ignore);
811 return NF_ACCEPT;
812 }
813
814	/* Should never happen */
815 if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
816 if (net_ratelimit()) {
817 printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
818 (*pskb)->nh.iph->protocol, hooknum);
819 }
820 return NF_DROP;
821 }
822
823/* Doesn't cover locally-generated broadcast, so not worth it. */
824#if 0
825 /* Ignore broadcast: no `connection'. */
826 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
827 printk("Broadcast packet!\n");
828 return NF_ACCEPT;
829 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
830 == htonl(0x000000FF)) {
831 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
832 NIPQUAD((*pskb)->nh.iph->saddr),
833 NIPQUAD((*pskb)->nh.iph->daddr),
834 (*pskb)->sk, (*pskb)->pkt_type);
835 }
836#endif
837
838 /* rcu_read_lock()ed by nf_hook_slow */
839 proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
840
841	/* It may be a special packet: error, unclean...
842	 * The inverse of the return code tells the netfilter
843	 * core what to do with the packet. */
844 if (proto->error != NULL
845 && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
846 CONNTRACK_STAT_INC_ATOMIC(error);
847 CONNTRACK_STAT_INC_ATOMIC(invalid);
848 return -ret;
849 }
850
851	if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
852 /* Not valid part of a connection */
853 CONNTRACK_STAT_INC_ATOMIC(invalid);
854 return NF_ACCEPT;
855 }
856
857 if (IS_ERR(ct)) {
858 /* Too stressed to deal. */
859 CONNTRACK_STAT_INC_ATOMIC(drop);
860 return NF_DROP;
861 }
862
863 IP_NF_ASSERT((*pskb)->nfct);
864
865 ret = proto->packet(ct, *pskb, ctinfo);
866 if (ret < 0) {
867		/* Invalid: the inverse of the return code tells
868		 * the netfilter core what to do */
869 nf_conntrack_put((*pskb)->nfct);
870 (*pskb)->nfct = NULL;
871 CONNTRACK_STAT_INC_ATOMIC(invalid);
872 return -ret;
873 }
874
875 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
876 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
877
878 return ret;
879}
880
881int invert_tuplepr(struct ip_conntrack_tuple *inverse,
882 const struct ip_conntrack_tuple *orig)
883{
884 struct ip_conntrack_protocol *proto;
885 int ret;
886
887 rcu_read_lock();
888 proto = __ip_conntrack_proto_find(orig->dst.protonum);
889 ret = ip_ct_invert_tuple(inverse, orig, proto);
890 rcu_read_unlock();
891
892 return ret;
893}
894
895/* Would two expectations clash? */
896static inline int expect_clash(const struct ip_conntrack_expect *a,
897 const struct ip_conntrack_expect *b)
898{
899 /* Part covered by intersection of masks must be unequal,
900 otherwise they clash */
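	/* (Example: two expectations to the same server with wildcarded
	 * source ports but distinct, fully-masked destination ports do
	 * not clash: the intersected mask still covers the dst port and
	 * the ports differ. If either mask wildcarded the dst port,
	 * they would clash.) */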
901 struct ip_conntrack_tuple intersect_mask
902 = { { a->mask.src.ip & b->mask.src.ip,
903 { a->mask.src.u.all & b->mask.src.u.all } },
904 { a->mask.dst.ip & b->mask.dst.ip,
905 { a->mask.dst.u.all & b->mask.dst.u.all },
906 a->mask.dst.protonum & b->mask.dst.protonum } };
907
908 return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
909}
910
911static inline int expect_matches(const struct ip_conntrack_expect *a,
912 const struct ip_conntrack_expect *b)
913{
914 return a->master == b->master
915 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
916 && ip_ct_tuple_equal(&a->mask, &b->mask);
917}
918
919/* Generally a bad idea to call this: could have matched already. */
920void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
921{
922 struct ip_conntrack_expect *i;
923
924 write_lock_bh(&ip_conntrack_lock);
925	/* choose the oldest expectation to evict */
926 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
927 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
928 ip_ct_unlink_expect(i);
929 write_unlock_bh(&ip_conntrack_lock);
930 ip_conntrack_expect_put(i);
931 return;
932 }
933 }
934 write_unlock_bh(&ip_conntrack_lock);
935}
936
937/* We don't increase the master conntrack refcount for non-fulfilled
938 * expectations. During conntrack destruction, the expectations are
939 * always killed before the conntrack itself. */
940struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
941{
942 struct ip_conntrack_expect *new;
943
944 new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
945 if (!new) {
946 DEBUGP("expect_related: OOM allocating expect\n");
947 return NULL;
948 }
949 new->master = me;
950 atomic_set(&new->use, 1);
951 return new;
952}
953
954void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
955{
956 if (atomic_dec_and_test(&exp->use))
957 kmem_cache_free(ip_conntrack_expect_cachep, exp);
958}
959
960static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
961{
962 atomic_inc(&exp->use);
963 exp->master->expecting++;
964 list_add(&exp->list, &ip_conntrack_expect_list);
965
966 init_timer(&exp->timeout);
967 exp->timeout.data = (unsigned long)exp;
968 exp->timeout.function = expectation_timed_out;
969 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
970 add_timer(&exp->timeout);
971
972 exp->id = ++ip_conntrack_expect_next_id;
973 atomic_inc(&exp->use);
974 CONNTRACK_STAT_INC(expect_create);
975}
976
977/* Race with expectations being used means we could have none to find; OK. */
978static void evict_oldest_expect(struct ip_conntrack *master)
979{
980 struct ip_conntrack_expect *i;
981
982 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
983 if (i->master == master) {
984 if (del_timer(&i->timeout)) {
985 ip_ct_unlink_expect(i);
986 ip_conntrack_expect_put(i);
987 }
988 break;
989 }
990 }
991}
992
993static inline int refresh_timer(struct ip_conntrack_expect *i)
994{
995 if (!del_timer(&i->timeout))
996 return 0;
997
998 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
999 add_timer(&i->timeout);
1000 return 1;
1001}
1002
1003int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1004{
1005 struct ip_conntrack_expect *i;
1006 int ret;
1007
1008	DEBUGP("ip_conntrack_expect_related %p\n", expect);
1009 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1010 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
1011
1012 write_lock_bh(&ip_conntrack_lock);
1013 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1014 if (expect_matches(i, expect)) {
1015			/* Refresh the timer: if it's dying, ignore. */
1016 if (refresh_timer(i)) {
1017 ret = 0;
1018 goto out;
1019 }
1020 } else if (expect_clash(i, expect)) {
1021 ret = -EBUSY;
1022 goto out;
1023 }
1024 }
1025
1026	/* Will we be over the limit? */
1027 if (expect->master->helper->max_expected &&
1028 expect->master->expecting >= expect->master->helper->max_expected)
1029 evict_oldest_expect(expect->master);
1030
1031 ip_conntrack_expect_insert(expect);
1032 ip_conntrack_expect_event(IPEXP_NEW, expect);
1033 ret = 0;
1034out:
1035 write_unlock_bh(&ip_conntrack_lock);
1036 return ret;
1037}
1038
1039/* Alter reply tuple (maybe alter helper). This is for NAT, and is
1040 implicitly racy: see __ip_conntrack_confirm */
1041void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1042 const struct ip_conntrack_tuple *newreply)
1043{
1044 write_lock_bh(&ip_conntrack_lock);
1045 /* Should be unconfirmed, so not in hash table yet */
1046 IP_NF_ASSERT(!is_confirmed(conntrack));
1047
1048 DEBUGP("Altering reply tuple of %p to ", conntrack);
1049 DUMP_TUPLE(newreply);
1050
1051 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1052 if (!conntrack->master && conntrack->expecting == 0)
1053 conntrack->helper = __ip_conntrack_helper_find(newreply);
1054 write_unlock_bh(&ip_conntrack_lock);
1055}
1056
1057int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1058{
1059 BUG_ON(me->timeout == 0);
1060 write_lock_bh(&ip_conntrack_lock);
1061 list_add(&me->list, &helpers);
1062 write_unlock_bh(&ip_conntrack_lock);
1063
1064 return 0;
1065}
1066
1067struct ip_conntrack_helper *
1068__ip_conntrack_helper_find_byname(const char *name)
1069{
1070 struct ip_conntrack_helper *h;
1071
1072 list_for_each_entry(h, &helpers, list) {
1073 if (!strcmp(h->name, name))
1074 return h;
1075 }
1076
1077 return NULL;
1078}
1079
1080static inline void unhelp(struct ip_conntrack_tuple_hash *i,
1081 const struct ip_conntrack_helper *me)
1082{
1083 if (tuplehash_to_ctrack(i)->helper == me) {
1084 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1085 tuplehash_to_ctrack(i)->helper = NULL;
1086 }
1087}
1088
1089void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1090{
1091 unsigned int i;
1092 struct ip_conntrack_tuple_hash *h;
1093 struct ip_conntrack_expect *exp, *tmp;
1094
1095 /* Need write lock here, to delete helper. */
1096 write_lock_bh(&ip_conntrack_lock);
1097 list_del(&me->list);
1098
1099 /* Get rid of expectations */
1100 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1101 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1102 ip_ct_unlink_expect(exp);
1103 ip_conntrack_expect_put(exp);
1104 }
1105 }
1106	/* Set the helper to NULL on all conntracks that used it. */
1107 list_for_each_entry(h, &unconfirmed, list)
1108 unhelp(h, me);
1109 for (i = 0; i < ip_conntrack_htable_size; i++) {
1110 list_for_each_entry(h, &ip_conntrack_hash[i], list)
1111 unhelp(h, me);
1112 }
1113 write_unlock_bh(&ip_conntrack_lock);
1114
1115	/* Someone could still be looking at the helper in a bh. */
1116 synchronize_net();
1117}
1118
1119/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1120void __ip_ct_refresh_acct(struct ip_conntrack *ct,
1121 enum ip_conntrack_info ctinfo,
1122 const struct sk_buff *skb,
1123 unsigned long extra_jiffies,
1124 int do_acct)
1125{
1126 int event = 0;
1127
1128 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1129 IP_NF_ASSERT(skb);
1130
1131 write_lock_bh(&ip_conntrack_lock);
1132
1133 /* Only update if this is not a fixed timeout */
1134 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1135 write_unlock_bh(&ip_conntrack_lock);
1136 return;
1137 }
1138
1139 /* If not in hash table, timer will not be active yet */
1140 if (!is_confirmed(ct)) {
1141 ct->timeout.expires = extra_jiffies;
1142 event = IPCT_REFRESH;
1143 } else {
1144 /* Need del_timer for race avoidance (may already be dying). */
1145 if (del_timer(&ct->timeout)) {
1146 ct->timeout.expires = jiffies + extra_jiffies;
1147 add_timer(&ct->timeout);
1148 event = IPCT_REFRESH;
1149 }
1150 }
1151
1152#ifdef CONFIG_IP_NF_CT_ACCT
1153 if (do_acct) {
1154 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1155 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1156 ntohs(skb->nh.iph->tot_len);
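		/* Flag counters whose top bit is set so listeners (e.g.
		 * ctnetlink) can snapshot them before they wrap. */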
1157 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1158 || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1159 event |= IPCT_COUNTER_FILLING;
1160 }
1161#endif
1162
1163 write_unlock_bh(&ip_conntrack_lock);
1164
1165 /* must be unlocked when calling event cache */
1166 if (event)
1167 ip_conntrack_event_cache(event, skb);
1168}
1169
1170#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1171 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1172/* Generic function for tcp/udp/sctp/dccp and the like. This needs
1173 * to be in ip_conntrack_core, since we don't want the protocols to
1174 * autoload or depend on ctnetlink */
1175int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1176 const struct ip_conntrack_tuple *tuple)
1177{
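	/* (NFA_PUT is a macro that jumps to the nfattr_failure label
	 * below if the skb runs out of tailroom.) */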
1178 NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(__be16),
1179 &tuple->src.u.tcp.port);
1180 NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(__be16),
1181 &tuple->dst.u.tcp.port);
1182 return 0;
1183
1184nfattr_failure:
1185 return -1;
1186}
1187
1188int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1189 struct ip_conntrack_tuple *t)
1190{
1191 if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1192 return -EINVAL;
1193
1194 t->src.u.tcp.port =
1195 *(__be16 *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1196 t->dst.u.tcp.port =
1197 *(__be16 *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1198
1199 return 0;
1200}
1201#endif
1202
1203/* Returns new sk_buff, or NULL */
1204struct sk_buff *
1205ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1206{
1207 skb_orphan(skb);
1208
1209 local_bh_disable();
1210 skb = ip_defrag(skb, user);
1211 local_bh_enable();
1212
1213 if (skb)
1214 ip_send_check(skb->nh.iph);
1215 return skb;
1216}
1217
1218/* Used by ipt_REJECT. */
1219static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1220{
1221 struct ip_conntrack *ct;
1222 enum ip_conntrack_info ctinfo;
1223
1224	/* This ICMP is in the reverse direction to the packet which caused it */
1225 ct = ip_conntrack_get(skb, &ctinfo);
1226
1227 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1228 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1229 else
1230 ctinfo = IP_CT_RELATED;
1231
1232 /* Attach to new skbuff, and increment count */
1233 nskb->nfct = &ct->ct_general;
1234 nskb->nfctinfo = ctinfo;
1235 nf_conntrack_get(nskb->nfct);
1236}
1237
1238/* Bring out ya dead! */
1239static struct ip_conntrack *
1240get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1241 void *data, unsigned int *bucket)
1242{
1243 struct ip_conntrack_tuple_hash *h;
1244 struct ip_conntrack *ct;
1245
1246 write_lock_bh(&ip_conntrack_lock);
1247 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1248 list_for_each_entry(h, &ip_conntrack_hash[*bucket], list) {
1249 ct = tuplehash_to_ctrack(h);
1250 if (iter(ct, data))
1251 goto found;
1252 }
1253 }
1254 list_for_each_entry(h, &unconfirmed, list) {
1255 ct = tuplehash_to_ctrack(h);
1256 if (iter(ct, data))
1257 set_bit(IPS_DYING_BIT, &ct->status);
1258 }
1259 write_unlock_bh(&ip_conntrack_lock);
1260 return NULL;
1261
1262found:
1263 atomic_inc(&ct->ct_general.use);
1264 write_unlock_bh(&ip_conntrack_lock);
1265 return ct;
1266}
1267
1268void
1269ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1270{
1271 struct ip_conntrack *ct;
1272 unsigned int bucket = 0;
1273
1274 while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
1275		/* Time to push up daisies... */
1276 if (del_timer(&ct->timeout))
1277 death_by_timeout((unsigned long)ct);
1278 /* ... else the timer will get him soon. */
1279
1280 ip_conntrack_put(ct);
1281 }
1282}
1283
1284/* Fast function for those who don't want to parse /proc (and I don't
1285 blame them). */
1286/* Reversing the socket's dst/src point of view gives us the reply
1287 mapping. */
1288static int
1289getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1290{
1291 struct inet_sock *inet = inet_sk(sk);
1292 struct ip_conntrack_tuple_hash *h;
1293 struct ip_conntrack_tuple tuple;
1294
1295 IP_CT_TUPLE_U_BLANK(&tuple);
1296 tuple.src.ip = inet->rcv_saddr;
1297 tuple.src.u.tcp.port = inet->sport;
1298 tuple.dst.ip = inet->daddr;
1299 tuple.dst.u.tcp.port = inet->dport;
1300 tuple.dst.protonum = IPPROTO_TCP;
1301
1302 /* We only do TCP at the moment: is there a better way? */
1303 if (strcmp(sk->sk_prot->name, "TCP")) {
1304 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1305 return -ENOPROTOOPT;
1306 }
1307
1308 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1309 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1310 *len, sizeof(struct sockaddr_in));
1311 return -EINVAL;
1312 }
1313
1314 h = ip_conntrack_find_get(&tuple, NULL);
1315 if (h) {
1316 struct sockaddr_in sin;
1317 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1318
1319 sin.sin_family = AF_INET;
1320 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1321 .tuple.dst.u.tcp.port;
1322 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1323 .tuple.dst.ip;
1324 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
1325
1326 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1327 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1328 ip_conntrack_put(ct);
1329 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1330 return -EFAULT;
1331 else
1332 return 0;
1333 }
1334 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1335 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1336 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1337 return -ENOENT;
1338}
1339
1340static struct nf_sockopt_ops so_getorigdst = {
1341 .pf = PF_INET,
1342 .get_optmin = SO_ORIGINAL_DST,
1343 .get_optmax = SO_ORIGINAL_DST+1,
1344 .get = &getorigdst,
1345};
1346
1347static int kill_all(struct ip_conntrack *i, void *data)
1348{
1349 return 1;
1350}
1351
1352void ip_conntrack_flush(void)
1353{
1354 ip_ct_iterate_cleanup(kill_all, NULL);
1355}
1356
1357static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
1358{
1359 if (vmalloced)
1360 vfree(hash);
1361 else
1362 free_pages((unsigned long)hash,
1363 get_order(sizeof(struct list_head) * size));
1364}
1365
1366/* Mishearing the voices in his head, our hero wonders how he's
1367 supposed to kill the mall. */
1368void ip_conntrack_cleanup(void)
1369{
1370 rcu_assign_pointer(ip_ct_attach, NULL);
1371
1372	/* This makes sure all current packets have passed through
1373	   the netfilter framework. Roll on, two-stage module
1374	   delete... */
1375 synchronize_net();
1376
1377 ip_ct_event_cache_flush();
1378 i_see_dead_people:
1379 ip_conntrack_flush();
1380 if (atomic_read(&ip_conntrack_count) != 0) {
1381 schedule();
1382 goto i_see_dead_people;
1383 }
1384 /* wait until all references to ip_conntrack_untracked are dropped */
1385 while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1386 schedule();
1387
1388 kmem_cache_destroy(ip_conntrack_cachep);
1389 kmem_cache_destroy(ip_conntrack_expect_cachep);
1390 free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1391 ip_conntrack_htable_size);
1392 nf_unregister_sockopt(&so_getorigdst);
1393}
1394
1395static struct list_head *alloc_hashtable(int size, int *vmalloced)
1396{
1397 struct list_head *hash;
1398 unsigned int i;
1399
1400 *vmalloced = 0;
1401	hash = (void *)__get_free_pages(GFP_KERNEL,
1402 get_order(sizeof(struct list_head)
1403 * size));
1404 if (!hash) {
1405 *vmalloced = 1;
1406		printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1407 hash = vmalloc(sizeof(struct list_head) * size);
1408 }
1409
1410 if (hash)
1411 for (i = 0; i < size; i++)
1412 INIT_LIST_HEAD(&hash[i]);
1413
1414 return hash;
1415}
1416
1417static int set_hashsize(const char *val, struct kernel_param *kp)
1418{
1419 int i, bucket, hashsize, vmalloced;
1420 int old_vmalloced, old_size;
1421 int rnd;
1422 struct list_head *hash, *old_hash;
1423 struct ip_conntrack_tuple_hash *h;
1424
1425 /* On boot, we can set this without any fancy locking. */
1426 if (!ip_conntrack_htable_size)
1427 return param_set_int(val, kp);
1428
1429 hashsize = simple_strtol(val, NULL, 0);
1430 if (!hashsize)
1431 return -EINVAL;
1432
1433 hash = alloc_hashtable(hashsize, &vmalloced);
1434 if (!hash)
1435 return -ENOMEM;
1436
1437	/* We have to rehash for the new table anyway, so we can also
1438	 * use a new random seed */
1439 get_random_bytes(&rnd, 4);
1440
1441 write_lock_bh(&ip_conntrack_lock);
1442 for (i = 0; i < ip_conntrack_htable_size; i++) {
1443 while (!list_empty(&ip_conntrack_hash[i])) {
1444 h = list_entry(ip_conntrack_hash[i].next,
1445 struct ip_conntrack_tuple_hash, list);
1446 list_del(&h->list);
1447 bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1448 list_add_tail(&h->list, &hash[bucket]);
1449 }
1450 }
1451 old_size = ip_conntrack_htable_size;
1452 old_vmalloced = ip_conntrack_vmalloc;
1453 old_hash = ip_conntrack_hash;
1454
1455 ip_conntrack_htable_size = hashsize;
1456 ip_conntrack_vmalloc = vmalloced;
1457 ip_conntrack_hash = hash;
1458 ip_conntrack_hash_rnd = rnd;
1459 write_unlock_bh(&ip_conntrack_lock);
1460
1461 free_conntrack_hash(old_hash, old_vmalloced, old_size);
1462 return 0;
1463}
1464
1465module_param_call(hashsize, set_hashsize, param_get_uint,
1466 &ip_conntrack_htable_size, 0600);
1467
1468int __init ip_conntrack_init(void)
1469{
1470 unsigned int i;
1471 int ret;
1472
1473 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1474 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
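	/* (Worked example, assuming 32-bit with sizeof(struct list_head)
	 * == 8: a 32MB machine gives (32MB / 16384) / 8 = 256 buckets,
	 * and ip_conntrack_max = 8 * buckets below then allows 2048
	 * tracked connections before early_drop kicks in.) */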
1475 if (!ip_conntrack_htable_size) {
1476 ip_conntrack_htable_size
1477 = (((num_physpages << PAGE_SHIFT) / 16384)
1478 / sizeof(struct list_head));
1479 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1480 ip_conntrack_htable_size = 8192;
1481 if (ip_conntrack_htable_size < 16)
1482 ip_conntrack_htable_size = 16;
1483 }
1484 ip_conntrack_max = 8 * ip_conntrack_htable_size;
1485
1486	printk(KERN_INFO "ip_conntrack version %s (%u buckets, %d max)"
1487 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1488 ip_conntrack_htable_size, ip_conntrack_max,
1489 sizeof(struct ip_conntrack));
1490
1491 ret = nf_register_sockopt(&so_getorigdst);
1492 if (ret != 0) {
1493 printk(KERN_ERR "Unable to register netfilter socket option\n");
1494 return ret;
1495 }
1496
1497 ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
1498 &ip_conntrack_vmalloc);
1499 if (!ip_conntrack_hash) {
1500 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1501 goto err_unreg_sockopt;
1502 }
1503
1504 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1505 sizeof(struct ip_conntrack), 0,
1506 0, NULL, NULL);
1507 if (!ip_conntrack_cachep) {
1508 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1509 goto err_free_hash;
1510 }
1511
1512 ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1513 sizeof(struct ip_conntrack_expect),
1514 0, 0, NULL, NULL);
1515 if (!ip_conntrack_expect_cachep) {
1516 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1517 goto err_free_conntrack_slab;
1518 }
1519
1520 /* Don't NEED lock here, but good form anyway. */
1521 write_lock_bh(&ip_conntrack_lock);
1522 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1523 rcu_assign_pointer(ip_ct_protos[i], &ip_conntrack_generic_protocol);
1524 /* Sew in builtin protocols. */
1525 rcu_assign_pointer(ip_ct_protos[IPPROTO_TCP], &ip_conntrack_protocol_tcp);
1526 rcu_assign_pointer(ip_ct_protos[IPPROTO_UDP], &ip_conntrack_protocol_udp);
1527 rcu_assign_pointer(ip_ct_protos[IPPROTO_ICMP], &ip_conntrack_protocol_icmp);
1528 write_unlock_bh(&ip_conntrack_lock);
1529
1530 /* For use by ipt_REJECT */
1531 rcu_assign_pointer(ip_ct_attach, ip_conntrack_attach);
1532
1533 /* Set up fake conntrack:
1534 - to never be deleted, not in any hashes */
1535 atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1536	/* - and make it look like a confirmed connection */
1537 set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1538
1539 return ret;
1540
1541err_free_conntrack_slab:
1542 kmem_cache_destroy(ip_conntrack_cachep);
1543err_free_hash:
1544 free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1545 ip_conntrack_htable_size);
1546err_unreg_sockopt:
1547 nf_unregister_sockopt(&so_getorigdst);
1548
1549 return -ENOMEM;
1550}
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
deleted file mode 100644
index 1faa68ab9432..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ /dev/null
@@ -1,520 +0,0 @@
1/* FTP extension for IP connection tracking. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/netfilter.h>
13#include <linux/ip.h>
14#include <linux/ctype.h>
15#include <net/checksum.h>
16#include <net/tcp.h>
17
18#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
19#include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
20#include <linux/moduleparam.h>
21
22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
24MODULE_DESCRIPTION("ftp connection tracking helper");
25
26/* This is slow, but it's simple. --RR */
27static char *ftp_buffer;
28static DEFINE_SPINLOCK(ip_ftp_lock);
29
30#define MAX_PORTS 8
31static unsigned short ports[MAX_PORTS];
32static int ports_c;
33module_param_array(ports, ushort, &ports_c, 0400);
34
35static int loose;
36module_param(loose, bool, 0600);
37
38unsigned int (*ip_nat_ftp_hook)(struct sk_buff **pskb,
39 enum ip_conntrack_info ctinfo,
40 enum ip_ct_ftp_type type,
41 unsigned int matchoff,
42 unsigned int matchlen,
43 struct ip_conntrack_expect *exp,
44 u32 *seq);
45EXPORT_SYMBOL_GPL(ip_nat_ftp_hook);
46
47#if 0
48#define DEBUGP printk
49#else
50#define DEBUGP(format, args...)
51#endif
52
53static int try_rfc959(const char *, size_t, u_int32_t [], char);
54static int try_eprt(const char *, size_t, u_int32_t [], char);
55static int try_epsv_response(const char *, size_t, u_int32_t [], char);
56
57static const struct ftp_search {
58 const char *pattern;
59 size_t plen;
60 char skip;
61 char term;
62 enum ip_ct_ftp_type ftptype;
63 int (*getnum)(const char *, size_t, u_int32_t[], char);
64} search[IP_CT_DIR_MAX][2] = {
65 [IP_CT_DIR_ORIGINAL] = {
66 {
67 .pattern = "PORT",
68 .plen = sizeof("PORT") - 1,
69 .skip = ' ',
70 .term = '\r',
71 .ftptype = IP_CT_FTP_PORT,
72 .getnum = try_rfc959,
73 },
74 {
75 .pattern = "EPRT",
76 .plen = sizeof("EPRT") - 1,
77 .skip = ' ',
78 .term = '\r',
79 .ftptype = IP_CT_FTP_EPRT,
80 .getnum = try_eprt,
81 },
82 },
83 [IP_CT_DIR_REPLY] = {
84 {
85 .pattern = "227 ",
86 .plen = sizeof("227 ") - 1,
87 .skip = '(',
88 .term = ')',
89 .ftptype = IP_CT_FTP_PASV,
90 .getnum = try_rfc959,
91 },
92 {
93 .pattern = "229 ",
94 .plen = sizeof("229 ") - 1,
95 .skip = '(',
96 .term = ')',
97 .ftptype = IP_CT_FTP_EPSV,
98 .getnum = try_epsv_response,
99 },
100 },
101};
102
103static int try_number(const char *data, size_t dlen, u_int32_t array[],
104 int array_size, char sep, char term)
105{
106 u_int32_t i, len;
107
108 memset(array, 0, sizeof(array[0])*array_size);
109
110 /* Keep data pointing at next char. */
111 for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) {
112 if (*data >= '0' && *data <= '9') {
113 array[i] = array[i]*10 + *data - '0';
114 }
115 else if (*data == sep)
116 i++;
117 else {
118 /* Unexpected character; true if it's the
119 terminator and we're finished. */
120 if (*data == term && i == array_size - 1)
121 return len;
122
123 DEBUGP("Char %u (got %u nums) `%u' unexpected\n",
124 len, i, *data);
125 return 0;
126 }
127 }
128 DEBUGP("Failed to fill %u numbers separated by %c\n", array_size, sep);
129
130 return 0;
131}
132
133/* Returns 0, or length of numbers: 192,168,1,1,5,6 */
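/* (RFC 959 encoding: h1,h2,h3,h4 are the address bytes and the port is
   p1 * 256 + p2, so 192,168,1,1,5,6 means 192.168.1.1, port
   5 * 256 + 6 = 1286.) */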
134static int try_rfc959(const char *data, size_t dlen, u_int32_t array[6],
135 char term)
136{
137 return try_number(data, dlen, array, 6, ',', term);
138}
139
140/* Grab port: number up to delimiter */
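/* (The parsed port is returned split into high and low bytes so it fills
   the same p1/p2 slots that try_rfc959 produces; callers pass array+4.) */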
141static int get_port(const char *data, int start, size_t dlen, char delim,
142 u_int32_t array[2])
143{
144 u_int16_t port = 0;
145 int i;
146
147 for (i = start; i < dlen; i++) {
148 /* Finished? */
149 if (data[i] == delim) {
150 if (port == 0)
151 break;
152 array[0] = port >> 8;
153 array[1] = port;
154 return i + 1;
155 }
156 else if (data[i] >= '0' && data[i] <= '9')
157 port = port*10 + data[i] - '0';
158 else /* Some other crap */
159 break;
160 }
161 return 0;
162}
163
164/* Returns 0, or length of numbers: |1|132.235.1.2|6275| */
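/* (EPRT argument per RFC 2428: <d><net-prt><d><net-addr><d><tcp-port><d>,
   where <d> is any printable non-digit delimiter and net-prt "1" means
   IPv4.) */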
165static int try_eprt(const char *data, size_t dlen, u_int32_t array[6],
166 char term)
167{
168 char delim;
169 int length;
170
171 /* First character is delimiter, then "1" for IPv4, then
172 delimiter again. */
173 if (dlen <= 3) return 0;
174 delim = data[0];
175 if (isdigit(delim) || delim < 33 || delim > 126
176 || data[1] != '1' || data[2] != delim)
177 return 0;
178
179 DEBUGP("EPRT: Got |1|!\n");
180 /* Now we have IP address. */
181 length = try_number(data + 3, dlen - 3, array, 4, '.', delim);
182 if (length == 0)
183 return 0;
184
185 DEBUGP("EPRT: Got IP address!\n");
186 /* Start offset includes initial "|1|", and trailing delimiter */
187 return get_port(data, 3 + length + 1, dlen, delim, array+4);
188}
189
190/* Returns 0, or length of numbers: |||6446| */
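/* (An RFC 2428 EPSV 229 response carries only the port; the address is
   implied, which is why help() pre-seeds array[0..3] with the expected
   source address.) */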
191static int try_epsv_response(const char *data, size_t dlen, u_int32_t array[6],
192 char term)
193{
194 char delim;
195
196 /* Three delimiters. */
197 if (dlen <= 3) return 0;
198 delim = data[0];
199 if (isdigit(delim) || delim < 33 || delim > 126
200 || data[1] != delim || data[2] != delim)
201 return 0;
202
203 return get_port(data, 3, dlen, delim, array+4);
204}
205
206/* Return 1 for match, 0 for accept, -1 for partial. */
207static int find_pattern(const char *data, size_t dlen,
208 const char *pattern, size_t plen,
209 char skip, char term,
210 unsigned int *numoff,
211 unsigned int *numlen,
212 u_int32_t array[6],
213 int (*getnum)(const char *, size_t, u_int32_t[], char))
214{
215 size_t i;
216
217 DEBUGP("find_pattern `%s': dlen = %u\n", pattern, dlen);
218 if (dlen == 0)
219 return 0;
220
221 if (dlen <= plen) {
222 /* Short packet: try for partial? */
223 if (strnicmp(data, pattern, dlen) == 0)
224 return -1;
225 else return 0;
226 }
227
228 if (strnicmp(data, pattern, plen) != 0) {
229#if 0
230 size_t i;
231
232 DEBUGP("ftp: string mismatch\n");
233 for (i = 0; i < plen; i++) {
234 DEBUGP("ftp:char %u `%c'(%u) vs `%c'(%u)\n",
235 i, data[i], data[i],
236 pattern[i], pattern[i]);
237 }
238#endif
239 return 0;
240 }
241
242 DEBUGP("Pattern matches!\n");
243 /* Now we've found the constant string, try to skip
244 to the 'skip' character */
245 for (i = plen; data[i] != skip; i++)
246 if (i == dlen - 1) return -1;
247
248	/* Step past the skip character itself */
249 i++;
250
251 DEBUGP("Skipped up to `%c'!\n", skip);
252
253 *numoff = i;
254 *numlen = getnum(data + i, dlen - i, array, term);
255 if (!*numlen)
256 return -1;
257
258 DEBUGP("Match succeeded!\n");
259 return 1;
260}
261
262/* Look up to see if we're just after a \n. */
263static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir)
264{
265 unsigned int i;
266
267 for (i = 0; i < info->seq_aft_nl_num[dir]; i++)
268 if (info->seq_aft_nl[dir][i] == seq)
269 return 1;
270 return 0;
271}
272
273/* We don't update if it's older than what we have. */
274static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir,
275 struct sk_buff *skb)
276{
277 unsigned int i, oldest = NUM_SEQ_TO_REMEMBER;
278
279 /* Look for oldest: if we find exact match, we're done. */
280 for (i = 0; i < info->seq_aft_nl_num[dir]; i++) {
281 if (info->seq_aft_nl[dir][i] == nl_seq)
282 return;
283
284		if (oldest == NUM_SEQ_TO_REMEMBER ||
285		    before(info->seq_aft_nl[dir][i], info->seq_aft_nl[dir][oldest]))
286			oldest = i;
287 }
288
289 if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) {
290 info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq;
291 ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
292 } else if (oldest != NUM_SEQ_TO_REMEMBER) {
293 info->seq_aft_nl[dir][oldest] = nl_seq;
294 ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
295 }
296}
297
298static int help(struct sk_buff **pskb,
299 struct ip_conntrack *ct,
300 enum ip_conntrack_info ctinfo)
301{
302 unsigned int dataoff, datalen;
303 struct tcphdr _tcph, *th;
304 char *fb_ptr;
305 int ret;
306 u32 seq, array[6] = { 0 };
307 int dir = CTINFO2DIR(ctinfo);
308 unsigned int matchlen, matchoff;
309 struct ip_ct_ftp_master *ct_ftp_info = &ct->help.ct_ftp_info;
310 struct ip_conntrack_expect *exp;
311 unsigned int i;
312 int found = 0, ends_in_nl;
313 typeof(ip_nat_ftp_hook) ip_nat_ftp;
314
315 /* Until there's been traffic both ways, don't look in packets. */
316 if (ctinfo != IP_CT_ESTABLISHED
317 && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) {
318 DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo);
319 return NF_ACCEPT;
320 }
321
322 th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4,
323 sizeof(_tcph), &_tcph);
324 if (th == NULL)
325 return NF_ACCEPT;
326
327 dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4;
328 /* No data? */
329 if (dataoff >= (*pskb)->len) {
330 DEBUGP("ftp: pskblen = %u\n", (*pskb)->len);
331 return NF_ACCEPT;
332 }
333 datalen = (*pskb)->len - dataoff;
334
335 spin_lock_bh(&ip_ftp_lock);
336 fb_ptr = skb_header_pointer(*pskb, dataoff,
337 (*pskb)->len - dataoff, ftp_buffer);
338 BUG_ON(fb_ptr == NULL);
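	/* (skb_header_pointer() either points into a linear skb or copies
	 * the data into ftp_buffer; that single shared buffer is why
	 * ip_ftp_lock stays held for the rest of this function.) */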
339
340 ends_in_nl = (fb_ptr[datalen - 1] == '\n');
341 seq = ntohl(th->seq) + datalen;
342
343 /* Look up to see if we're just after a \n. */
344 if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) {
345 /* Now if this ends in \n, update ftp info. */
346		/* The data does not start right after a newline we saw. */
347		DEBUGP("ip_conntrack_ftp_help: wrong seq pos %u\n",
348		       ntohl(th->seq));
349 ret = NF_ACCEPT;
350 goto out_update_nl;
351 }
352
353 /* Initialize IP array to expected address (it's not mentioned
354 in EPSV responses) */
355 array[0] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 24) & 0xFF;
356 array[1] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 16) & 0xFF;
357 array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF;
358 array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF;
359
360 for (i = 0; i < ARRAY_SIZE(search[dir]); i++) {
361 found = find_pattern(fb_ptr, (*pskb)->len - dataoff,
362 search[dir][i].pattern,
363 search[dir][i].plen,
364 search[dir][i].skip,
365 search[dir][i].term,
366 &matchoff, &matchlen,
367 array,
368 search[dir][i].getnum);
369 if (found) break;
370 }
371 if (found == -1) {
372 /* We don't usually drop packets. After all, this is
373 connection tracking, not packet filtering.
374 However, it is necessary for accurate tracking in
375 this case. */
376 if (net_ratelimit())
377 printk("conntrack_ftp: partial %s %u+%u\n",
378 search[dir][i].pattern,
379 ntohl(th->seq), datalen);
380 ret = NF_DROP;
381 goto out;
382 } else if (found == 0) { /* No match */
383 ret = NF_ACCEPT;
384 goto out_update_nl;
385 }
386
387 DEBUGP("conntrack_ftp: match `%s' (%u bytes at %u)\n",
388 fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff);
389
390 /* Allocate expectation which will be inserted */
391 exp = ip_conntrack_expect_alloc(ct);
392 if (exp == NULL) {
393 ret = NF_DROP;
394 goto out;
395 }
396
397 /* We refer to the reverse direction ("!dir") tuples here,
398 * because we're expecting something in the other direction.
399 * Doesn't matter unless NAT is happening. */
400 exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
401
402 if (htonl((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3])
403 != ct->tuplehash[dir].tuple.src.ip) {
404 /* Enrico Scholz's passive FTP to partially RNAT'd ftp
405 server: it really wants us to connect to a
406 different IP address. Simply don't record it for
407 NAT. */
408 DEBUGP("conntrack_ftp: NOT RECORDING: %u,%u,%u,%u != %u.%u.%u.%u\n",
409 array[0], array[1], array[2], array[3],
410 NIPQUAD(ct->tuplehash[dir].tuple.src.ip));
411
412 /* Thanks to Cristiano Lincoln Mattos
413 <lincoln@cesar.org.br> for reporting this potential
414 problem (DMZ machines opening holes to internal
415 networks, or the packet filter itself). */
416 if (!loose) {
417 ret = NF_ACCEPT;
418 goto out_put_expect;
419 }
420 exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16)
421 | (array[2] << 8) | array[3]);
422 }
423
424 exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
425 exp->tuple.dst.u.tcp.port = htons(array[4] << 8 | array[5]);
426 exp->tuple.src.u.tcp.port = 0; /* Don't care. */
427 exp->tuple.dst.protonum = IPPROTO_TCP;
428 exp->mask = ((struct ip_conntrack_tuple)
429 { { htonl(0xFFFFFFFF), { 0 } },
430 { htonl(0xFFFFFFFF), { .tcp = { htons(0xFFFF) } }, 0xFF }});
431
432 exp->expectfn = NULL;
433 exp->flags = 0;
434
435 /* Now, NAT might want to mangle the packet, and register the
436 * (possibly changed) expectation itself. */
437 ip_nat_ftp = rcu_dereference(ip_nat_ftp_hook);
438 if (ip_nat_ftp)
439 ret = ip_nat_ftp(pskb, ctinfo, search[dir][i].ftptype,
440 matchoff, matchlen, exp, &seq);
441 else {
442 /* Can't expect this? Best to drop packet now. */
443 if (ip_conntrack_expect_related(exp) != 0)
444 ret = NF_DROP;
445 else
446 ret = NF_ACCEPT;
447 }
448
449out_put_expect:
450 ip_conntrack_expect_put(exp);
451
452out_update_nl:
453 /* Now if this ends in \n, update ftp info. Seq may have been
454 * adjusted by NAT code. */
455 if (ends_in_nl)
456 update_nl_seq(seq, ct_ftp_info,dir, *pskb);
457 out:
458 spin_unlock_bh(&ip_ftp_lock);
459 return ret;
460}
461
462static struct ip_conntrack_helper ftp[MAX_PORTS];
463static char ftp_names[MAX_PORTS][sizeof("ftp-65535")];
464
465/* Not __exit: called from init() */
466static void ip_conntrack_ftp_fini(void)
467{
468 int i;
469 for (i = 0; i < ports_c; i++) {
470 DEBUGP("ip_ct_ftp: unregistering helper for port %d\n",
471 ports[i]);
472 ip_conntrack_helper_unregister(&ftp[i]);
473 }
474
475 kfree(ftp_buffer);
476}
477
478static int __init ip_conntrack_ftp_init(void)
479{
480 int i, ret;
481 char *tmpname;
482
483 ftp_buffer = kmalloc(65536, GFP_KERNEL);
484 if (!ftp_buffer)
485 return -ENOMEM;
486
487 if (ports_c == 0)
488 ports[ports_c++] = FTP_PORT;
489
490 for (i = 0; i < ports_c; i++) {
491 ftp[i].tuple.src.u.tcp.port = htons(ports[i]);
492 ftp[i].tuple.dst.protonum = IPPROTO_TCP;
493 ftp[i].mask.src.u.tcp.port = htons(0xFFFF);
494 ftp[i].mask.dst.protonum = 0xFF;
495 ftp[i].max_expected = 1;
496 ftp[i].timeout = 5 * 60; /* 5 minutes */
497 ftp[i].me = THIS_MODULE;
498 ftp[i].help = help;
499
500 tmpname = &ftp_names[i][0];
501 if (ports[i] == FTP_PORT)
502 sprintf(tmpname, "ftp");
503 else
504 sprintf(tmpname, "ftp-%d", ports[i]);
505 ftp[i].name = tmpname;
506
507 DEBUGP("ip_ct_ftp: registering helper for port %d\n",
508 ports[i]);
509 ret = ip_conntrack_helper_register(&ftp[i]);
510
511 if (ret) {
512 ip_conntrack_ftp_fini();
513 return ret;
514 }
515 }
516 return 0;
517}
518
519module_init(ip_conntrack_ftp_init);
520module_exit(ip_conntrack_ftp_fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c
deleted file mode 100644
index 53eb365ccc7e..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c
+++ /dev/null
@@ -1,1841 +0,0 @@
1/*
2 * H.323 connection tracking helper
3 *
4 * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
5 *
6 * This source code is licensed under General Public License version 2.
7 *
8 * Based on the 'brute force' H.323 connection tracking module by
9 * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
10 *
11 * For more information, please see http://nath323.sourceforge.net/
12 */
13
14#include <linux/module.h>
15#include <linux/netfilter.h>
16#include <linux/ip.h>
17#include <net/tcp.h>
18#include <linux/netfilter_ipv4/ip_conntrack.h>
19#include <linux/netfilter_ipv4/ip_conntrack_core.h>
20#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
21#include <linux/netfilter_ipv4/ip_conntrack_tuple.h>
22#include <linux/netfilter_ipv4/ip_conntrack_h323.h>
23#include <linux/moduleparam.h>
24#include <linux/ctype.h>
25#include <linux/inet.h>
26
27#if 0
28#define DEBUGP printk
29#else
30#define DEBUGP(format, args...)
31#endif
32
33/* Parameters */
34static unsigned int default_rrq_ttl = 300;
35module_param(default_rrq_ttl, uint, 0600);
36MODULE_PARM_DESC(default_rrq_ttl, "use this TTL if it's missing in RRQ");
37
38static int gkrouted_only = 1;
39module_param(gkrouted_only, int, 0600);
40MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper");
41
42static int callforward_filter = 1;
43module_param(callforward_filter, bool, 0600);
44MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations "
45 "if both endpoints are on different sides "
46 "(determined by routing information)");
47
48/* Hooks for NAT */
49int (*set_h245_addr_hook) (struct sk_buff ** pskb,
50 unsigned char **data, int dataoff,
51 H245_TransportAddress * addr,
52 __be32 ip, u_int16_t port);
53int (*set_h225_addr_hook) (struct sk_buff ** pskb,
54 unsigned char **data, int dataoff,
55 TransportAddress * addr,
56 __be32 ip, u_int16_t port);
57int (*set_sig_addr_hook) (struct sk_buff ** pskb,
58 struct ip_conntrack * ct,
59 enum ip_conntrack_info ctinfo,
60 unsigned char **data,
61 TransportAddress * addr, int count);
62int (*set_ras_addr_hook) (struct sk_buff ** pskb,
63 struct ip_conntrack * ct,
64 enum ip_conntrack_info ctinfo,
65 unsigned char **data,
66 TransportAddress * addr, int count);
67int (*nat_rtp_rtcp_hook) (struct sk_buff ** pskb,
68 struct ip_conntrack * ct,
69 enum ip_conntrack_info ctinfo,
70 unsigned char **data, int dataoff,
71 H245_TransportAddress * addr,
72 u_int16_t port, u_int16_t rtp_port,
73 struct ip_conntrack_expect * rtp_exp,
74 struct ip_conntrack_expect * rtcp_exp);
75int (*nat_t120_hook) (struct sk_buff ** pskb,
76 struct ip_conntrack * ct,
77 enum ip_conntrack_info ctinfo,
78 unsigned char **data, int dataoff,
79 H245_TransportAddress * addr, u_int16_t port,
80 struct ip_conntrack_expect * exp);
81int (*nat_h245_hook) (struct sk_buff ** pskb,
82 struct ip_conntrack * ct,
83 enum ip_conntrack_info ctinfo,
84 unsigned char **data, int dataoff,
85 TransportAddress * addr, u_int16_t port,
86 struct ip_conntrack_expect * exp);
87int (*nat_callforwarding_hook) (struct sk_buff ** pskb,
88 struct ip_conntrack * ct,
89 enum ip_conntrack_info ctinfo,
90 unsigned char **data, int dataoff,
91 TransportAddress * addr, u_int16_t port,
92 struct ip_conntrack_expect * exp);
93int (*nat_q931_hook) (struct sk_buff ** pskb,
94 struct ip_conntrack * ct,
95 enum ip_conntrack_info ctinfo,
96 unsigned char **data, TransportAddress * addr, int idx,
97 u_int16_t port, struct ip_conntrack_expect * exp);
98
99
100static DEFINE_SPINLOCK(ip_h323_lock);
101static char *h323_buffer;
102
103/****************************************************************************/
104static int get_tpkt_data(struct sk_buff **pskb, struct ip_conntrack *ct,
105 enum ip_conntrack_info ctinfo,
106 unsigned char **data, int *datalen, int *dataoff)
107{
108 struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
109 int dir = CTINFO2DIR(ctinfo);
110 struct tcphdr _tcph, *th;
111 int tcpdatalen;
112 int tcpdataoff;
113 unsigned char *tpkt;
114 int tpktlen;
115 int tpktoff;
116
117 /* Get TCP header */
118 th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl * 4,
119 sizeof(_tcph), &_tcph);
120 if (th == NULL)
121 return 0;
122
123 /* Get TCP data offset */
124 tcpdataoff = (*pskb)->nh.iph->ihl * 4 + th->doff * 4;
125
126 /* Get TCP data length */
127 tcpdatalen = (*pskb)->len - tcpdataoff;
128 if (tcpdatalen <= 0) /* No TCP data */
129 goto clear_out;
130
131 if (*data == NULL) { /* first TPKT */
132 /* Get first TPKT pointer */
133 tpkt = skb_header_pointer(*pskb, tcpdataoff, tcpdatalen,
134 h323_buffer);
135 BUG_ON(tpkt == NULL);
136
137 /* Validate TPKT identifier */
138 if (tcpdatalen < 4 || tpkt[0] != 0x03 || tpkt[1] != 0) {
139 /* Netmeeting sends TPKT header and data separately */
140 if (info->tpkt_len[dir] > 0) {
141 DEBUGP("ip_ct_h323: previous packet "
142 "indicated separate TPKT data of %hu "
143 "bytes\n", info->tpkt_len[dir]);
144 if (info->tpkt_len[dir] <= tcpdatalen) {
145 /* Yes, there was a TPKT header
146 * received */
147 *data = tpkt;
148 *datalen = info->tpkt_len[dir];
149 *dataoff = 0;
150 goto out;
151 }
152
153 /* Fragmented TPKT */
154 if (net_ratelimit())
155 printk("ip_ct_h323: "
156 "fragmented TPKT\n");
157 goto clear_out;
158 }
159
160 /* It is not even a TPKT */
161 return 0;
162 }
163 tpktoff = 0;
164 } else { /* Next TPKT */
165 tpktoff = *dataoff + *datalen;
166 tcpdatalen -= tpktoff;
167 if (tcpdatalen <= 4) /* No more TPKT */
168 goto clear_out;
169 tpkt = *data + *datalen;
170
171 /* Validate TPKT identifier */
172 if (tpkt[0] != 0x03 || tpkt[1] != 0)
173 goto clear_out;
174 }
175
176 /* Validate TPKT length */
177 tpktlen = tpkt[2] * 256 + tpkt[3];
178 if (tpktlen < 4)
179 goto clear_out;
180 if (tpktlen > tcpdatalen) {
181 if (tcpdatalen == 4) { /* Separate TPKT header */
182 /* Netmeeting sends TPKT header and data separately */
183 DEBUGP("ip_ct_h323: separate TPKT header indicates "
184 "there will be TPKT data of %hu bytes\n",
185 tpktlen - 4);
186 info->tpkt_len[dir] = tpktlen - 4;
187 return 0;
188 }
189
190 if (net_ratelimit())
191 printk("ip_ct_h323: incomplete TPKT (fragmented?)\n");
192 goto clear_out;
193 }
194
195 /* This is the encapsulated data */
196 *data = tpkt + 4;
197 *datalen = tpktlen - 4;
198 *dataoff = tpktoff + 4;
199
200 out:
201 /* Clear TPKT length */
202 info->tpkt_len[dir] = 0;
203 return 1;
204
205 clear_out:
206 info->tpkt_len[dir] = 0;
207 return 0;
208}
209
210/****************************************************************************/
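/* Decode a unicast H.245 transport address. The PER decoder leaves an
 * offset into the message that points at 4 bytes of IPv4 address
 * followed by a 2-byte port; the address is returned as a __be32, the
 * port in host byte order (callers apply htons() themselves). */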
211static int get_h245_addr(unsigned char *data, H245_TransportAddress * addr,
212 __be32 * ip, u_int16_t * port)
213{
214 unsigned char *p;
215
216 if (addr->choice != eH245_TransportAddress_unicastAddress ||
217 addr->unicastAddress.choice != eUnicastAddress_iPAddress)
218 return 0;
219
220 p = data + addr->unicastAddress.iPAddress.network;
221 *ip = htonl((p[0] << 24) | (p[1] << 16) | (p[2] << 8) | (p[3]));
222 *port = (p[4] << 8) | (p[5]);
223
224 return 1;
225}
226
227/****************************************************************************/
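/* One signalled media address yields two expectations, following the
 * usual RTP convention (RFC 3550): RTP on the even port (port & ~1),
 * RTCP on the next odd port (rtp_port + 1). */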
228static int expect_rtp_rtcp(struct sk_buff **pskb, struct ip_conntrack *ct,
229 enum ip_conntrack_info ctinfo,
230 unsigned char **data, int dataoff,
231 H245_TransportAddress * addr)
232{
233 int dir = CTINFO2DIR(ctinfo);
234 int ret = 0;
235 __be32 ip;
236 u_int16_t port;
237 u_int16_t rtp_port;
238 struct ip_conntrack_expect *rtp_exp;
239 struct ip_conntrack_expect *rtcp_exp;
240 typeof(nat_rtp_rtcp_hook) nat_rtp_rtcp;
241
242 /* Read RTP or RTCP address */
243 if (!get_h245_addr(*data, addr, &ip, &port) ||
244 ip != ct->tuplehash[dir].tuple.src.ip || port == 0)
245 return 0;
246
247 /* RTP port is even */
248 rtp_port = port & (~1);
249
250 /* Create expect for RTP */
251 if ((rtp_exp = ip_conntrack_expect_alloc(ct)) == NULL)
252 return -1;
253 rtp_exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
254 rtp_exp->tuple.src.u.udp.port = 0;
255 rtp_exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
256 rtp_exp->tuple.dst.u.udp.port = htons(rtp_port);
257 rtp_exp->tuple.dst.protonum = IPPROTO_UDP;
258 rtp_exp->mask.src.ip = htonl(0xFFFFFFFF);
259 rtp_exp->mask.src.u.udp.port = 0;
260 rtp_exp->mask.dst.ip = htonl(0xFFFFFFFF);
261 rtp_exp->mask.dst.u.udp.port = htons(0xFFFF);
262 rtp_exp->mask.dst.protonum = 0xFF;
263 rtp_exp->flags = 0;
264
265 /* Create expect for RTCP */
266 if ((rtcp_exp = ip_conntrack_expect_alloc(ct)) == NULL) {
267 ip_conntrack_expect_put(rtp_exp);
268 return -1;
269 }
270 rtcp_exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
271 rtcp_exp->tuple.src.u.udp.port = 0;
272 rtcp_exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
273 rtcp_exp->tuple.dst.u.udp.port = htons(rtp_port + 1);
274 rtcp_exp->tuple.dst.protonum = IPPROTO_UDP;
275 rtcp_exp->mask.src.ip = htonl(0xFFFFFFFF);
276 rtcp_exp->mask.src.u.udp.port = 0;
277 rtcp_exp->mask.dst.ip = htonl(0xFFFFFFFF);
278 rtcp_exp->mask.dst.u.udp.port = htons(0xFFFF);
279 rtcp_exp->mask.dst.protonum = 0xFF;
280 rtcp_exp->flags = 0;
281
282 if (ct->tuplehash[dir].tuple.src.ip !=
283 ct->tuplehash[!dir].tuple.dst.ip &&
284 (nat_rtp_rtcp = rcu_dereference(nat_rtp_rtcp_hook))) {
285 /* NAT needed */
286 ret = nat_rtp_rtcp(pskb, ct, ctinfo, data, dataoff,
287 addr, port, rtp_port, rtp_exp, rtcp_exp);
288 } else { /* Conntrack only */
289 rtp_exp->expectfn = NULL;
290 rtcp_exp->expectfn = NULL;
291
292 if (ip_conntrack_expect_related(rtp_exp) == 0) {
293 if (ip_conntrack_expect_related(rtcp_exp) == 0) {
294 DEBUGP("ip_ct_h323: expect RTP "
295 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
296 NIPQUAD(rtp_exp->tuple.src.ip),
297 ntohs(rtp_exp->tuple.src.u.udp.port),
298 NIPQUAD(rtp_exp->tuple.dst.ip),
299 ntohs(rtp_exp->tuple.dst.u.udp.port));
300 DEBUGP("ip_ct_h323: expect RTCP "
301 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
302 NIPQUAD(rtcp_exp->tuple.src.ip),
303 ntohs(rtcp_exp->tuple.src.u.udp.port),
304 NIPQUAD(rtcp_exp->tuple.dst.ip),
305 ntohs(rtcp_exp->tuple.dst.u.udp.port));
306 } else {
307 ip_conntrack_unexpect_related(rtp_exp);
308 ret = -1;
309 }
310 } else
311 ret = -1;
312 }
313
314 ip_conntrack_expect_put(rtp_exp);
315 ip_conntrack_expect_put(rtcp_exp);
316
317 return ret;
318}
319
320/****************************************************************************/
321static int expect_t120(struct sk_buff **pskb,
322 struct ip_conntrack *ct,
323 enum ip_conntrack_info ctinfo,
324 unsigned char **data, int dataoff,
325 H245_TransportAddress * addr)
326{
327 int dir = CTINFO2DIR(ctinfo);
328 int ret = 0;
329 __be32 ip;
330 u_int16_t port;
331 struct ip_conntrack_expect *exp = NULL;
332 typeof(nat_t120_hook) nat_t120;
333
334 /* Read T.120 address */
335 if (!get_h245_addr(*data, addr, &ip, &port) ||
336 ip != ct->tuplehash[dir].tuple.src.ip || port == 0)
337 return 0;
338
339 /* Create expect for T.120 connections */
340 if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
341 return -1;
342 exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
343 exp->tuple.src.u.tcp.port = 0;
344 exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
345 exp->tuple.dst.u.tcp.port = htons(port);
346 exp->tuple.dst.protonum = IPPROTO_TCP;
347 exp->mask.src.ip = htonl(0xFFFFFFFF);
348 exp->mask.src.u.tcp.port = 0;
349 exp->mask.dst.ip = htonl(0xFFFFFFFF);
350 exp->mask.dst.u.tcp.port = htons(0xFFFF);
351 exp->mask.dst.protonum = 0xFF;
352 exp->flags = IP_CT_EXPECT_PERMANENT; /* Accept multiple channels */
353
354 if (ct->tuplehash[dir].tuple.src.ip !=
355 ct->tuplehash[!dir].tuple.dst.ip &&
356 (nat_t120 = rcu_dereference(nat_t120_hook))) {
357 /* NAT needed */
358 ret = nat_t120(pskb, ct, ctinfo, data, dataoff, addr,
359 port, exp);
360 } else { /* Conntrack only */
361 exp->expectfn = NULL;
362 if (ip_conntrack_expect_related(exp) == 0) {
363 DEBUGP("ip_ct_h323: expect T.120 "
364 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
365 NIPQUAD(exp->tuple.src.ip),
366 ntohs(exp->tuple.src.u.tcp.port),
367 NIPQUAD(exp->tuple.dst.ip),
368 ntohs(exp->tuple.dst.u.tcp.port));
369 } else
370 ret = -1;
371 }
372
373 ip_conntrack_expect_put(exp);
374
375 return ret;
376}
377
378/****************************************************************************/
379static int process_h245_channel(struct sk_buff **pskb,
380 struct ip_conntrack *ct,
381 enum ip_conntrack_info ctinfo,
382 unsigned char **data, int dataoff,
383 H2250LogicalChannelParameters * channel)
384{
385 int ret;
386
387 if (channel->options & eH2250LogicalChannelParameters_mediaChannel) {
388 /* RTP */
389 ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff,
390 &channel->mediaChannel);
391 if (ret < 0)
392 return -1;
393 }
394
395	if (channel->options &
396	    eH2250LogicalChannelParameters_mediaControlChannel) {
397 /* RTCP */
398 ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff,
399 &channel->mediaControlChannel);
400 if (ret < 0)
401 return -1;
402 }
403
404 return 0;
405}
406
407/****************************************************************************/
408static int process_olc(struct sk_buff **pskb, struct ip_conntrack *ct,
409 enum ip_conntrack_info ctinfo,
410 unsigned char **data, int dataoff,
411 OpenLogicalChannel * olc)
412{
413 int ret;
414
415 DEBUGP("ip_ct_h323: OpenLogicalChannel\n");
416
417 if (olc->forwardLogicalChannelParameters.multiplexParameters.choice ==
418 eOpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters)
419 {
420 ret = process_h245_channel(pskb, ct, ctinfo, data, dataoff,
421 &olc->
422 forwardLogicalChannelParameters.
423 multiplexParameters.
424 h2250LogicalChannelParameters);
425 if (ret < 0)
426 return -1;
427 }
428
429 if ((olc->options &
430 eOpenLogicalChannel_reverseLogicalChannelParameters) &&
431 (olc->reverseLogicalChannelParameters.options &
432 eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters)
433 && (olc->reverseLogicalChannelParameters.multiplexParameters.
434 choice ==
435 eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters))
436 {
437 ret =
438 process_h245_channel(pskb, ct, ctinfo, data, dataoff,
439 &olc->
440 reverseLogicalChannelParameters.
441 multiplexParameters.
442 h2250LogicalChannelParameters);
443 if (ret < 0)
444 return -1;
445 }
446
447 if ((olc->options & eOpenLogicalChannel_separateStack) &&
448 olc->forwardLogicalChannelParameters.dataType.choice ==
449 eDataType_data &&
450 olc->forwardLogicalChannelParameters.dataType.data.application.
451 choice == eDataApplicationCapability_application_t120 &&
452 olc->forwardLogicalChannelParameters.dataType.data.application.
453 t120.choice == eDataProtocolCapability_separateLANStack &&
454 olc->separateStack.networkAddress.choice ==
455 eNetworkAccessParameters_networkAddress_localAreaAddress) {
456 ret = expect_t120(pskb, ct, ctinfo, data, dataoff,
457 &olc->separateStack.networkAddress.
458 localAreaAddress);
459 if (ret < 0)
460 return -1;
461 }
462
463 return 0;
464}
465
466/****************************************************************************/
467static int process_olca(struct sk_buff **pskb, struct ip_conntrack *ct,
468 enum ip_conntrack_info ctinfo,
469 unsigned char **data, int dataoff,
470 OpenLogicalChannelAck * olca)
471{
472 H2250LogicalChannelAckParameters *ack;
473 int ret;
474
475 DEBUGP("ip_ct_h323: OpenLogicalChannelAck\n");
476
477 if ((olca->options &
478 eOpenLogicalChannelAck_reverseLogicalChannelParameters) &&
479 (olca->reverseLogicalChannelParameters.options &
480 eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters)
481 && (olca->reverseLogicalChannelParameters.multiplexParameters.
482 choice ==
483 eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters))
484 {
485 ret = process_h245_channel(pskb, ct, ctinfo, data, dataoff,
486 &olca->
487 reverseLogicalChannelParameters.
488 multiplexParameters.
489 h2250LogicalChannelParameters);
490 if (ret < 0)
491 return -1;
492 }
493
494 if ((olca->options &
495 eOpenLogicalChannelAck_forwardMultiplexAckParameters) &&
496 (olca->forwardMultiplexAckParameters.choice ==
497 eOpenLogicalChannelAck_forwardMultiplexAckParameters_h2250LogicalChannelAckParameters))
498 {
499 ack = &olca->forwardMultiplexAckParameters.
500 h2250LogicalChannelAckParameters;
501 if (ack->options &
502 eH2250LogicalChannelAckParameters_mediaChannel) {
503 /* RTP */
504 ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff,
505 &ack->mediaChannel);
506 if (ret < 0)
507 return -1;
508 }
509
510 if (ack->options &
511 eH2250LogicalChannelAckParameters_mediaControlChannel) {
512 /* RTCP */
513 ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff,
514 &ack->mediaControlChannel);
515 if (ret < 0)
516 return -1;
517 }
518 }
519
520 return 0;
521}
522
523/****************************************************************************/
524static int process_h245(struct sk_buff **pskb, struct ip_conntrack *ct,
525 enum ip_conntrack_info ctinfo,
526 unsigned char **data, int dataoff,
527 MultimediaSystemControlMessage * mscm)
528{
529 switch (mscm->choice) {
530 case eMultimediaSystemControlMessage_request:
531 if (mscm->request.choice ==
532 eRequestMessage_openLogicalChannel) {
533 return process_olc(pskb, ct, ctinfo, data, dataoff,
534 &mscm->request.openLogicalChannel);
535 }
536 DEBUGP("ip_ct_h323: H.245 Request %d\n",
537 mscm->request.choice);
538 break;
539 case eMultimediaSystemControlMessage_response:
540 if (mscm->response.choice ==
541 eResponseMessage_openLogicalChannelAck) {
542 return process_olca(pskb, ct, ctinfo, data, dataoff,
543 &mscm->response.
544 openLogicalChannelAck);
545 }
546 DEBUGP("ip_ct_h323: H.245 Response %d\n",
547 mscm->response.choice);
548 break;
549 default:
550 DEBUGP("ip_ct_h323: H.245 signal %d\n", mscm->choice);
551 break;
552 }
553
554 return 0;
555}
556
557/****************************************************************************/
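/* Helper entry point for the H.245 channel. Decoding uses a static
 * message structure and the shared h323_buffer, so the whole loop runs
 * under ip_h323_lock; decoding errors are tolerated (NF_ACCEPT), while
 * a failed expectation setup drops the packet. */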
558static int h245_help(struct sk_buff **pskb, struct ip_conntrack *ct,
559 enum ip_conntrack_info ctinfo)
560{
561 static MultimediaSystemControlMessage mscm;
562 unsigned char *data = NULL;
563 int datalen;
564 int dataoff;
565 int ret;
566
567 /* Until there's been traffic both ways, don't look in packets. */
568 if (ctinfo != IP_CT_ESTABLISHED
569 && ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) {
570 return NF_ACCEPT;
571 }
572 DEBUGP("ip_ct_h245: skblen = %u\n", (*pskb)->len);
573
574 spin_lock_bh(&ip_h323_lock);
575
576 /* Process each TPKT */
577 while (get_tpkt_data(pskb, ct, ctinfo, &data, &datalen, &dataoff)) {
578 DEBUGP("ip_ct_h245: TPKT %u.%u.%u.%u->%u.%u.%u.%u, len=%d\n",
579 NIPQUAD((*pskb)->nh.iph->saddr),
580 NIPQUAD((*pskb)->nh.iph->daddr), datalen);
581
582 /* Decode H.245 signal */
583 ret = DecodeMultimediaSystemControlMessage(data, datalen,
584 &mscm);
585 if (ret < 0) {
586 if (net_ratelimit())
587 printk("ip_ct_h245: decoding error: %s\n",
588 ret == H323_ERROR_BOUND ?
589 "out of bound" : "out of range");
590 /* We don't drop when decoding error */
591 break;
592 }
593
594 /* Process H.245 signal */
595 if (process_h245(pskb, ct, ctinfo, &data, dataoff, &mscm) < 0)
596 goto drop;
597 }
598
599 spin_unlock_bh(&ip_h323_lock);
600 return NF_ACCEPT;
601
602 drop:
603 spin_unlock_bh(&ip_h323_lock);
604 if (net_ratelimit())
605 printk("ip_ct_h245: packet dropped\n");
606 return NF_DROP;
607}
608
609/****************************************************************************/
610static struct ip_conntrack_helper ip_conntrack_helper_h245 = {
611 .name = "H.245",
612 .me = THIS_MODULE,
613 .max_expected = H323_RTP_CHANNEL_MAX * 4 + 2 /* T.120 */ ,
614 .timeout = 240,
615 .tuple = {.dst = {.protonum = IPPROTO_TCP}},
616 .mask = {.src = {.u = {0xFFFF}},
617 .dst = {.protonum = 0xFF}},
618 .help = h245_help
619};
620
621/****************************************************************************/
622void ip_conntrack_h245_expect(struct ip_conntrack *new,
623 struct ip_conntrack_expect *this)
624{
625 write_lock_bh(&ip_conntrack_lock);
626 new->helper = &ip_conntrack_helper_h245;
627 write_unlock_bh(&ip_conntrack_lock);
628}
629
630/****************************************************************************/
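/* Like get_h245_addr() above, but for H.225.0 transport addresses.
 * Exported for use by the H.323 NAT module. */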
631int get_h225_addr(unsigned char *data, TransportAddress * addr,
632 __be32 * ip, u_int16_t * port)
633{
634 unsigned char *p;
635
636 if (addr->choice != eTransportAddress_ipAddress)
637 return 0;
638
639 p = data + addr->ipAddress.ip;
640 *ip = htonl((p[0] << 24) | (p[1] << 16) | (p[2] << 8) | (p[3]));
641 *port = (p[4] << 8) | (p[5]);
642
643 return 1;
644}
645
646/****************************************************************************/
647static int expect_h245(struct sk_buff **pskb, struct ip_conntrack *ct,
648 enum ip_conntrack_info ctinfo,
649 unsigned char **data, int dataoff,
650 TransportAddress * addr)
651{
652 int dir = CTINFO2DIR(ctinfo);
653 int ret = 0;
654 __be32 ip;
655 u_int16_t port;
656 struct ip_conntrack_expect *exp = NULL;
657 typeof(nat_h245_hook) nat_h245;
658
659 /* Read h245Address */
660 if (!get_h225_addr(*data, addr, &ip, &port) ||
661 ip != ct->tuplehash[dir].tuple.src.ip || port == 0)
662 return 0;
663
664 /* Create expect for h245 connection */
665 if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
666 return -1;
667 exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
668 exp->tuple.src.u.tcp.port = 0;
669 exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
670 exp->tuple.dst.u.tcp.port = htons(port);
671 exp->tuple.dst.protonum = IPPROTO_TCP;
672 exp->mask.src.ip = htonl(0xFFFFFFFF);
673 exp->mask.src.u.tcp.port = 0;
674 exp->mask.dst.ip = htonl(0xFFFFFFFF);
675 exp->mask.dst.u.tcp.port = htons(0xFFFF);
676 exp->mask.dst.protonum = 0xFF;
677 exp->flags = 0;
678
679 if (ct->tuplehash[dir].tuple.src.ip !=
680 ct->tuplehash[!dir].tuple.dst.ip &&
681 (nat_h245 = rcu_dereference(nat_h245_hook))) {
682 /* NAT needed */
683 ret = nat_h245(pskb, ct, ctinfo, data, dataoff, addr,
684 port, exp);
685 } else { /* Conntrack only */
686 exp->expectfn = ip_conntrack_h245_expect;
687
688 if (ip_conntrack_expect_related(exp) == 0) {
689 DEBUGP("ip_ct_q931: expect H.245 "
690 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
691 NIPQUAD(exp->tuple.src.ip),
692 ntohs(exp->tuple.src.u.tcp.port),
693 NIPQUAD(exp->tuple.dst.ip),
694 ntohs(exp->tuple.dst.u.tcp.port));
695 } else
696 ret = -1;
697 }
698
699 ip_conntrack_expect_put(exp);
700
701 return ret;
702}
703
704/* Forward declaration */
705void ip_conntrack_q931_expect(struct ip_conntrack *new,
706 struct ip_conntrack_expect *this);
707
708/****************************************************************************/
709static int expect_callforwarding(struct sk_buff **pskb,
710 struct ip_conntrack *ct,
711 enum ip_conntrack_info ctinfo,
712 unsigned char **data, int dataoff,
713 TransportAddress * addr)
714{
715 int dir = CTINFO2DIR(ctinfo);
716 int ret = 0;
717 __be32 ip;
718 u_int16_t port;
719 struct ip_conntrack_expect *exp = NULL;
720 typeof(nat_callforwarding_hook) nat_callforwarding;
721
722 /* Read alternativeAddress */
723 if (!get_h225_addr(*data, addr, &ip, &port) || port == 0)
724 return 0;
725
726	/* If the calling party is on the same side as the forward-to party,
727	 * we don't need to track the second call */
728 if (callforward_filter) {
729 struct rtable *rt1, *rt2;
730 struct flowi fl1 = {
731 .fl4_dst = ip,
732 };
733 struct flowi fl2 = {
734 .fl4_dst = ct->tuplehash[!dir].tuple.src.ip,
735 };
736
737 if (ip_route_output_key(&rt1, &fl1) == 0) {
738 if (ip_route_output_key(&rt2, &fl2) == 0) {
739 if (rt1->rt_gateway == rt2->rt_gateway &&
740 rt1->u.dst.dev == rt2->u.dst.dev)
741 ret = 1;
742 dst_release(&rt2->u.dst);
743 }
744 dst_release(&rt1->u.dst);
745 }
746 if (ret) {
747 DEBUGP("ip_ct_q931: Call Forwarding not tracked\n");
748 return 0;
749 }
750 }
751
752 /* Create expect for the second call leg */
753 if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
754 return -1;
755 exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
756 exp->tuple.src.u.tcp.port = 0;
757 exp->tuple.dst.ip = ip;
758 exp->tuple.dst.u.tcp.port = htons(port);
759 exp->tuple.dst.protonum = IPPROTO_TCP;
760 exp->mask.src.ip = htonl(0xFFFFFFFF);
761 exp->mask.src.u.tcp.port = 0;
762 exp->mask.dst.ip = htonl(0xFFFFFFFF);
763 exp->mask.dst.u.tcp.port = htons(0xFFFF);
764 exp->mask.dst.protonum = 0xFF;
765 exp->flags = 0;
766
767 if (ct->tuplehash[dir].tuple.src.ip !=
768 ct->tuplehash[!dir].tuple.dst.ip &&
769 (nat_callforwarding = rcu_dereference(nat_callforwarding_hook))) {
770 /* Need NAT */
771 ret = nat_callforwarding(pskb, ct, ctinfo, data, dataoff,
772 addr, port, exp);
773 } else { /* Conntrack only */
774 exp->expectfn = ip_conntrack_q931_expect;
775
776 if (ip_conntrack_expect_related(exp) == 0) {
777 DEBUGP("ip_ct_q931: expect Call Forwarding "
778 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
779 NIPQUAD(exp->tuple.src.ip),
780 ntohs(exp->tuple.src.u.tcp.port),
781 NIPQUAD(exp->tuple.dst.ip),
782 ntohs(exp->tuple.dst.u.tcp.port));
783 } else
784 ret = -1;
785 }
786
787 ip_conntrack_expect_put(exp);
788
789 return ret;
790}
791
792/****************************************************************************/
793static int process_setup(struct sk_buff **pskb, struct ip_conntrack *ct,
794 enum ip_conntrack_info ctinfo,
795 unsigned char **data, int dataoff,
796 Setup_UUIE * setup)
797{
798 int dir = CTINFO2DIR(ctinfo);
799 int ret;
800 int i;
801 __be32 ip;
802 u_int16_t port;
803 typeof(set_h225_addr_hook) set_h225_addr;
804
805 DEBUGP("ip_ct_q931: Setup\n");
806
807 if (setup->options & eSetup_UUIE_h245Address) {
808 ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
809 &setup->h245Address);
810 if (ret < 0)
811 return -1;
812 }
813
814 set_h225_addr = rcu_dereference(set_h225_addr_hook);
815
816 if ((setup->options & eSetup_UUIE_destCallSignalAddress) &&
817 (set_h225_addr) &&
818 get_h225_addr(*data, &setup->destCallSignalAddress, &ip, &port) &&
819 ip != ct->tuplehash[!dir].tuple.src.ip) {
820 DEBUGP("ip_ct_q931: set destCallSignalAddress "
821 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
822 NIPQUAD(ip), port,
823 NIPQUAD(ct->tuplehash[!dir].tuple.src.ip),
824 ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port));
825 ret = set_h225_addr(pskb, data, dataoff,
826 &setup->destCallSignalAddress,
827 ct->tuplehash[!dir].tuple.src.ip,
828 ntohs(ct->tuplehash[!dir].tuple.src.
829 u.tcp.port));
830 if (ret < 0)
831 return -1;
832 }
833
834 if ((setup->options & eSetup_UUIE_sourceCallSignalAddress) &&
835 (set_h225_addr) &&
836 get_h225_addr(*data, &setup->sourceCallSignalAddress, &ip, &port)
837 && ip != ct->tuplehash[!dir].tuple.dst.ip) {
838 DEBUGP("ip_ct_q931: set sourceCallSignalAddress "
839 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
840 NIPQUAD(ip), port,
841 NIPQUAD(ct->tuplehash[!dir].tuple.dst.ip),
842 ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port));
843 ret = set_h225_addr(pskb, data, dataoff,
844 &setup->sourceCallSignalAddress,
845 ct->tuplehash[!dir].tuple.dst.ip,
846 ntohs(ct->tuplehash[!dir].tuple.dst.
847 u.tcp.port));
848 if (ret < 0)
849 return -1;
850 }
851
852 if (setup->options & eSetup_UUIE_fastStart) {
853 for (i = 0; i < setup->fastStart.count; i++) {
854 ret = process_olc(pskb, ct, ctinfo, data, dataoff,
855 &setup->fastStart.item[i]);
856 if (ret < 0)
857 return -1;
858 }
859 }
860
861 return 0;
862}
863
864/****************************************************************************/
865static int process_callproceeding(struct sk_buff **pskb,
866 struct ip_conntrack *ct,
867 enum ip_conntrack_info ctinfo,
868 unsigned char **data, int dataoff,
869 CallProceeding_UUIE * callproc)
870{
871 int ret;
872 int i;
873
874 DEBUGP("ip_ct_q931: CallProceeding\n");
875
876 if (callproc->options & eCallProceeding_UUIE_h245Address) {
877 ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
878 &callproc->h245Address);
879 if (ret < 0)
880 return -1;
881 }
882
883 if (callproc->options & eCallProceeding_UUIE_fastStart) {
884 for (i = 0; i < callproc->fastStart.count; i++) {
885 ret = process_olc(pskb, ct, ctinfo, data, dataoff,
886 &callproc->fastStart.item[i]);
887 if (ret < 0)
888 return -1;
889 }
890 }
891
892 return 0;
893}
894
895/****************************************************************************/
896static int process_connect(struct sk_buff **pskb, struct ip_conntrack *ct,
897 enum ip_conntrack_info ctinfo,
898 unsigned char **data, int dataoff,
899 Connect_UUIE * connect)
900{
901 int ret;
902 int i;
903
904 DEBUGP("ip_ct_q931: Connect\n");
905
906 if (connect->options & eConnect_UUIE_h245Address) {
907 ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
908 &connect->h245Address);
909 if (ret < 0)
910 return -1;
911 }
912
913 if (connect->options & eConnect_UUIE_fastStart) {
914 for (i = 0; i < connect->fastStart.count; i++) {
915 ret = process_olc(pskb, ct, ctinfo, data, dataoff,
916 &connect->fastStart.item[i]);
917 if (ret < 0)
918 return -1;
919 }
920 }
921
922 return 0;
923}
924
925/****************************************************************************/
926static int process_alerting(struct sk_buff **pskb, struct ip_conntrack *ct,
927 enum ip_conntrack_info ctinfo,
928 unsigned char **data, int dataoff,
929 Alerting_UUIE * alert)
930{
931 int ret;
932 int i;
933
934 DEBUGP("ip_ct_q931: Alerting\n");
935
936 if (alert->options & eAlerting_UUIE_h245Address) {
937 ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
938 &alert->h245Address);
939 if (ret < 0)
940 return -1;
941 }
942
943 if (alert->options & eAlerting_UUIE_fastStart) {
944 for (i = 0; i < alert->fastStart.count; i++) {
945 ret = process_olc(pskb, ct, ctinfo, data, dataoff,
946 &alert->fastStart.item[i]);
947 if (ret < 0)
948 return -1;
949 }
950 }
951
952 return 0;
953}
954
955/****************************************************************************/
956static int process_information(struct sk_buff **pskb,
957 struct ip_conntrack *ct,
958 enum ip_conntrack_info ctinfo,
959 unsigned char **data, int dataoff,
960 Information_UUIE * info)
961{
962 int ret;
963 int i;
964
965 DEBUGP("ip_ct_q931: Information\n");
966
967 if (info->options & eInformation_UUIE_fastStart) {
968 for (i = 0; i < info->fastStart.count; i++) {
969 ret = process_olc(pskb, ct, ctinfo, data, dataoff,
970 &info->fastStart.item[i]);
971 if (ret < 0)
972 return -1;
973 }
974 }
975
976 return 0;
977}
978
979/****************************************************************************/
980static int process_facility(struct sk_buff **pskb, struct ip_conntrack *ct,
981 enum ip_conntrack_info ctinfo,
982 unsigned char **data, int dataoff,
983 Facility_UUIE * facility)
984{
985 int ret;
986 int i;
987
988 DEBUGP("ip_ct_q931: Facility\n");
989
990 if (facility->reason.choice == eFacilityReason_callForwarded) {
991 if (facility->options & eFacility_UUIE_alternativeAddress)
992 return expect_callforwarding(pskb, ct, ctinfo, data,
993 dataoff,
994 &facility->
995 alternativeAddress);
996 return 0;
997 }
998
999 if (facility->options & eFacility_UUIE_h245Address) {
1000 ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
1001 &facility->h245Address);
1002 if (ret < 0)
1003 return -1;
1004 }
1005
1006 if (facility->options & eFacility_UUIE_fastStart) {
1007 for (i = 0; i < facility->fastStart.count; i++) {
1008 ret = process_olc(pskb, ct, ctinfo, data, dataoff,
1009 &facility->fastStart.item[i]);
1010 if (ret < 0)
1011 return -1;
1012 }
1013 }
1014
1015 return 0;
1016}
1017
1018/****************************************************************************/
1019static int process_progress(struct sk_buff **pskb, struct ip_conntrack *ct,
1020 enum ip_conntrack_info ctinfo,
1021 unsigned char **data, int dataoff,
1022 Progress_UUIE * progress)
1023{
1024 int ret;
1025 int i;
1026
1027 DEBUGP("ip_ct_q931: Progress\n");
1028
1029 if (progress->options & eProgress_UUIE_h245Address) {
1030 ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
1031 &progress->h245Address);
1032 if (ret < 0)
1033 return -1;
1034 }
1035
1036 if (progress->options & eProgress_UUIE_fastStart) {
1037 for (i = 0; i < progress->fastStart.count; i++) {
1038 ret = process_olc(pskb, ct, ctinfo, data, dataoff,
1039 &progress->fastStart.item[i]);
1040 if (ret < 0)
1041 return -1;
1042 }
1043 }
1044
1045 return 0;
1046}
1047
1048/****************************************************************************/
1049static int process_q931(struct sk_buff **pskb, struct ip_conntrack *ct,
1050 enum ip_conntrack_info ctinfo,
1051 unsigned char **data, int dataoff, Q931 * q931)
1052{
1053 H323_UU_PDU *pdu = &q931->UUIE.h323_uu_pdu;
1054 int i;
1055 int ret = 0;
1056
1057 switch (pdu->h323_message_body.choice) {
1058 case eH323_UU_PDU_h323_message_body_setup:
1059 ret = process_setup(pskb, ct, ctinfo, data, dataoff,
1060 &pdu->h323_message_body.setup);
1061 break;
1062 case eH323_UU_PDU_h323_message_body_callProceeding:
1063 ret = process_callproceeding(pskb, ct, ctinfo, data, dataoff,
1064 &pdu->h323_message_body.
1065 callProceeding);
1066 break;
1067 case eH323_UU_PDU_h323_message_body_connect:
1068 ret = process_connect(pskb, ct, ctinfo, data, dataoff,
1069 &pdu->h323_message_body.connect);
1070 break;
1071 case eH323_UU_PDU_h323_message_body_alerting:
1072 ret = process_alerting(pskb, ct, ctinfo, data, dataoff,
1073 &pdu->h323_message_body.alerting);
1074 break;
1075 case eH323_UU_PDU_h323_message_body_information:
1076 ret = process_information(pskb, ct, ctinfo, data, dataoff,
1077 &pdu->h323_message_body.
1078 information);
1079 break;
1080 case eH323_UU_PDU_h323_message_body_facility:
1081 ret = process_facility(pskb, ct, ctinfo, data, dataoff,
1082 &pdu->h323_message_body.facility);
1083 break;
1084 case eH323_UU_PDU_h323_message_body_progress:
1085 ret = process_progress(pskb, ct, ctinfo, data, dataoff,
1086 &pdu->h323_message_body.progress);
1087 break;
1088 default:
1089 DEBUGP("ip_ct_q931: Q.931 signal %d\n",
1090 pdu->h323_message_body.choice);
1091 break;
1092 }
1093
1094 if (ret < 0)
1095 return -1;
1096
1097 if (pdu->options & eH323_UU_PDU_h245Control) {
1098 for (i = 0; i < pdu->h245Control.count; i++) {
1099 ret = process_h245(pskb, ct, ctinfo, data, dataoff,
1100 &pdu->h245Control.item[i]);
1101 if (ret < 0)
1102 return -1;
1103 }
1104 }
1105
1106 return 0;
1107}
1108
1109/****************************************************************************/
1110static int q931_help(struct sk_buff **pskb, struct ip_conntrack *ct,
1111 enum ip_conntrack_info ctinfo)
1112{
1113 static Q931 q931;
1114 unsigned char *data = NULL;
1115 int datalen;
1116 int dataoff;
1117 int ret;
1118
1119 /* Until there's been traffic both ways, don't look in packets. */
1120 if (ctinfo != IP_CT_ESTABLISHED
1121 && ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) {
1122 return NF_ACCEPT;
1123 }
1124 DEBUGP("ip_ct_q931: skblen = %u\n", (*pskb)->len);
1125
1126 spin_lock_bh(&ip_h323_lock);
1127
1128 /* Process each TPKT */
1129 while (get_tpkt_data(pskb, ct, ctinfo, &data, &datalen, &dataoff)) {
1130 DEBUGP("ip_ct_q931: TPKT %u.%u.%u.%u->%u.%u.%u.%u, len=%d\n",
1131 NIPQUAD((*pskb)->nh.iph->saddr),
1132 NIPQUAD((*pskb)->nh.iph->daddr), datalen);
1133
1134 /* Decode Q.931 signal */
1135 ret = DecodeQ931(data, datalen, &q931);
1136 if (ret < 0) {
1137 if (net_ratelimit())
1138 printk("ip_ct_q931: decoding error: %s\n",
1139 ret == H323_ERROR_BOUND ?
1140 "out of bound" : "out of range");
1141 /* We don't drop when decoding error */
1142 break;
1143 }
1144
1145 /* Process Q.931 signal */
1146 if (process_q931(pskb, ct, ctinfo, &data, dataoff, &q931) < 0)
1147 goto drop;
1148 }
1149
1150 spin_unlock_bh(&ip_h323_lock);
1151 return NF_ACCEPT;
1152
1153 drop:
1154 spin_unlock_bh(&ip_h323_lock);
1155 if (net_ratelimit())
1156 printk("ip_ct_q931: packet dropped\n");
1157 return NF_DROP;
1158}
1159
1160/****************************************************************************/
1161static struct ip_conntrack_helper ip_conntrack_helper_q931 = {
1162 .name = "Q.931",
1163 .me = THIS_MODULE,
1164 .max_expected = H323_RTP_CHANNEL_MAX * 4 + 4 /* T.120 and H.245 */ ,
1165 .timeout = 240,
1166 .tuple = {.src = {.u = {.tcp = {.port = __constant_htons(Q931_PORT)}}},
1167 .dst = {.protonum = IPPROTO_TCP}},
1168 .mask = {.src = {.u = {0xFFFF}},
1169 .dst = {.protonum = 0xFF}},
1170 .help = q931_help
1171};
1172
1173/****************************************************************************/
1174void ip_conntrack_q931_expect(struct ip_conntrack *new,
1175 struct ip_conntrack_expect *this)
1176{
1177 write_lock_bh(&ip_conntrack_lock);
1178 new->helper = &ip_conntrack_helper_q931;
1179 write_unlock_bh(&ip_conntrack_lock);
1180}
1181
1182/****************************************************************************/
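/* RAS runs over UDP: return a pointer to the linearized datagram
 * payload (copied into the shared h323_buffer if the skb is
 * non-linear) and its length. */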
1183static unsigned char *get_udp_data(struct sk_buff **pskb, int *datalen)
1184{
1185 struct udphdr _uh, *uh;
1186 int dataoff;
1187
1188 uh = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl * 4, sizeof(_uh),
1189 &_uh);
1190 if (uh == NULL)
1191 return NULL;
1192 dataoff = (*pskb)->nh.iph->ihl * 4 + sizeof(_uh);
1193 if (dataoff >= (*pskb)->len)
1194 return NULL;
1195 *datalen = (*pskb)->len - dataoff;
1196 return skb_header_pointer(*pskb, dataoff, *datalen, h323_buffer);
1197}
1198
1199/****************************************************************************/
1200static struct ip_conntrack_expect *find_expect(struct ip_conntrack *ct,
1201 __be32 ip, u_int16_t port)
1202{
1203 struct ip_conntrack_expect *exp;
1204 struct ip_conntrack_tuple tuple;
1205
1206 tuple.src.ip = 0;
1207 tuple.src.u.tcp.port = 0;
1208 tuple.dst.ip = ip;
1209 tuple.dst.u.tcp.port = htons(port);
1210 tuple.dst.protonum = IPPROTO_TCP;
1211
1212 exp = __ip_conntrack_expect_find(&tuple);
1213 if (exp && exp->master == ct)
1214 return exp;
1215 return NULL;
1216}
1217
1218/****************************************************************************/
1219static int set_expect_timeout(struct ip_conntrack_expect *exp,
1220 unsigned timeout)
1221{
1222 if (!exp || !del_timer(&exp->timeout))
1223 return 0;
1224
1225 exp->timeout.expires = jiffies + timeout * HZ;
1226 add_timer(&exp->timeout);
1227
1228 return 1;
1229}
1230
1231/****************************************************************************/
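/* Expect the Q.931 call signalling connection announced in a RAS RRQ.
 * With gkrouted_only (the default) the source is pinned to the
 * gatekeeper's address, so direct endpoint-to-endpoint setup is not
 * accepted; with gkrouted_only=0 the source is fully wildcarded. */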
1232static int expect_q931(struct sk_buff **pskb, struct ip_conntrack *ct,
1233 enum ip_conntrack_info ctinfo,
1234 unsigned char **data,
1235 TransportAddress * addr, int count)
1236{
1237 struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
1238 int dir = CTINFO2DIR(ctinfo);
1239 int ret = 0;
1240 int i;
1241 __be32 ip;
1242 u_int16_t port;
1243 struct ip_conntrack_expect *exp;
1244 typeof(nat_q931_hook) nat_q931;
1245
1246 /* Look for the first related address */
1247 for (i = 0; i < count; i++) {
1248 if (get_h225_addr(*data, &addr[i], &ip, &port) &&
1249 ip == ct->tuplehash[dir].tuple.src.ip && port != 0)
1250 break;
1251 }
1252
1253 if (i >= count) /* Not found */
1254 return 0;
1255
1256 /* Create expect for Q.931 */
1257 if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
1258 return -1;
1259 exp->tuple.src.ip = gkrouted_only ? /* only accept calls from GK? */
1260 ct->tuplehash[!dir].tuple.src.ip : 0;
1261 exp->tuple.src.u.tcp.port = 0;
1262 exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
1263 exp->tuple.dst.u.tcp.port = htons(port);
1264 exp->tuple.dst.protonum = IPPROTO_TCP;
1265 exp->mask.src.ip = gkrouted_only ? htonl(0xFFFFFFFF) : 0;
1266 exp->mask.src.u.tcp.port = 0;
1267 exp->mask.dst.ip = htonl(0xFFFFFFFF);
1268 exp->mask.dst.u.tcp.port = htons(0xFFFF);
1269 exp->mask.dst.protonum = 0xFF;
1270 exp->flags = IP_CT_EXPECT_PERMANENT; /* Accept multiple calls */
1271
1272 nat_q931 = rcu_dereference(nat_q931_hook);
1273 if (nat_q931) { /* Need NAT */
1274 ret = nat_q931(pskb, ct, ctinfo, data, addr, i, port, exp);
1275 } else { /* Conntrack only */
1276 exp->expectfn = ip_conntrack_q931_expect;
1277
1278 if (ip_conntrack_expect_related(exp) == 0) {
1279 DEBUGP("ip_ct_ras: expect Q.931 "
1280 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
1281 NIPQUAD(exp->tuple.src.ip),
1282 ntohs(exp->tuple.src.u.tcp.port),
1283 NIPQUAD(exp->tuple.dst.ip),
1284 ntohs(exp->tuple.dst.u.tcp.port));
1285
1286 /* Save port for looking up expect in processing RCF */
1287 info->sig_port[dir] = port;
1288 } else
1289 ret = -1;
1290 }
1291
1292 ip_conntrack_expect_put(exp);
1293
1294 return ret;
1295}
1296
1297/****************************************************************************/
1298static int process_grq(struct sk_buff **pskb, struct ip_conntrack *ct,
1299 enum ip_conntrack_info ctinfo,
1300 unsigned char **data, GatekeeperRequest * grq)
1301{
1302 typeof(set_ras_addr_hook) set_ras_addr;
1303
1304 DEBUGP("ip_ct_ras: GRQ\n");
1305
1306 set_ras_addr = rcu_dereference(set_ras_addr_hook);
1307 if (set_ras_addr) /* NATed */
1308 return set_ras_addr(pskb, ct, ctinfo, data,
1309 &grq->rasAddress, 1);
1310 return 0;
1311}
1312
1313/* Declare before using */
1314static void ip_conntrack_ras_expect(struct ip_conntrack *new,
1315 struct ip_conntrack_expect *this);
1316
1317/****************************************************************************/
1318static int process_gcf(struct sk_buff **pskb, struct ip_conntrack *ct,
1319 enum ip_conntrack_info ctinfo,
1320 unsigned char **data, GatekeeperConfirm * gcf)
1321{
1322 int dir = CTINFO2DIR(ctinfo);
1323 int ret = 0;
1324 __be32 ip;
1325 u_int16_t port;
1326 struct ip_conntrack_expect *exp;
1327
1328 DEBUGP("ip_ct_ras: GCF\n");
1329
1330 if (!get_h225_addr(*data, &gcf->rasAddress, &ip, &port))
1331 return 0;
1332
1333 /* Registration port is the same as discovery port */
1334 if (ip == ct->tuplehash[dir].tuple.src.ip &&
1335 port == ntohs(ct->tuplehash[dir].tuple.src.u.udp.port))
1336 return 0;
1337
1338 /* Avoid RAS expectation loops. A GCF is never expected. */
1339 if (test_bit(IPS_EXPECTED_BIT, &ct->status))
1340 return 0;
1341
1342 /* Need new expect */
1343 if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
1344 return -1;
1345 exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
1346 exp->tuple.src.u.tcp.port = 0;
1347 exp->tuple.dst.ip = ip;
1348 exp->tuple.dst.u.tcp.port = htons(port);
1349 exp->tuple.dst.protonum = IPPROTO_UDP;
1350 exp->mask.src.ip = htonl(0xFFFFFFFF);
1351 exp->mask.src.u.tcp.port = 0;
1352 exp->mask.dst.ip = htonl(0xFFFFFFFF);
1353 exp->mask.dst.u.tcp.port = htons(0xFFFF);
1354 exp->mask.dst.protonum = 0xFF;
1355 exp->flags = 0;
1356 exp->expectfn = ip_conntrack_ras_expect;
1357 if (ip_conntrack_expect_related(exp) == 0) {
1358 DEBUGP("ip_ct_ras: expect RAS "
1359 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
1360 NIPQUAD(exp->tuple.src.ip),
1361 ntohs(exp->tuple.src.u.tcp.port),
1362 NIPQUAD(exp->tuple.dst.ip),
1363 ntohs(exp->tuple.dst.u.tcp.port));
1364 } else
1365 ret = -1;
1366
1367 ip_conntrack_expect_put(exp);
1368
1369 return ret;
1370}
1371
1372/****************************************************************************/
1373static int process_rrq(struct sk_buff **pskb, struct ip_conntrack *ct,
1374 enum ip_conntrack_info ctinfo,
1375 unsigned char **data, RegistrationRequest * rrq)
1376{
1377 struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
1378 int ret;
1379 typeof(set_ras_addr_hook) set_ras_addr;
1380
1381 DEBUGP("ip_ct_ras: RRQ\n");
1382
1383 ret = expect_q931(pskb, ct, ctinfo, data,
1384 rrq->callSignalAddress.item,
1385 rrq->callSignalAddress.count);
1386 if (ret < 0)
1387 return -1;
1388
1389 set_ras_addr = rcu_dereference(set_ras_addr_hook);
1390 if (set_ras_addr) {
1391 ret = set_ras_addr(pskb, ct, ctinfo, data,
1392 rrq->rasAddress.item,
1393 rrq->rasAddress.count);
1394 if (ret < 0)
1395 return -1;
1396 }
1397
1398 if (rrq->options & eRegistrationRequest_timeToLive) {
1399 DEBUGP("ip_ct_ras: RRQ TTL = %u seconds\n", rrq->timeToLive);
1400 info->timeout = rrq->timeToLive;
1401 } else
1402 info->timeout = default_rrq_ttl;
1403
1404 return 0;
1405}
1406
1407/****************************************************************************/
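/* RCF: the gatekeeper accepted the registration. Refresh the RAS
 * conntrack entry with the negotiated time-to-live and extend the
 * matching Q.931 expectation (its port was saved in expect_q931())
 * accordingly. */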
1408static int process_rcf(struct sk_buff **pskb, struct ip_conntrack *ct,
1409 enum ip_conntrack_info ctinfo,
1410 unsigned char **data, RegistrationConfirm * rcf)
1411{
1412 struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
1413 int dir = CTINFO2DIR(ctinfo);
1414 int ret;
1415 struct ip_conntrack_expect *exp;
1416 typeof(set_sig_addr_hook) set_sig_addr;
1417
1418 DEBUGP("ip_ct_ras: RCF\n");
1419
1420 set_sig_addr = rcu_dereference(set_sig_addr_hook);
1421 if (set_sig_addr) {
1422 ret = set_sig_addr(pskb, ct, ctinfo, data,
1423 rcf->callSignalAddress.item,
1424 rcf->callSignalAddress.count);
1425 if (ret < 0)
1426 return -1;
1427 }
1428
1429 if (rcf->options & eRegistrationConfirm_timeToLive) {
1430 DEBUGP("ip_ct_ras: RCF TTL = %u seconds\n", rcf->timeToLive);
1431 info->timeout = rcf->timeToLive;
1432 }
1433
1434 if (info->timeout > 0) {
1435 DEBUGP
1436 ("ip_ct_ras: set RAS connection timeout to %u seconds\n",
1437 info->timeout);
1438 ip_ct_refresh(ct, *pskb, info->timeout * HZ);
1439
1440 /* Set expect timeout */
1441 read_lock_bh(&ip_conntrack_lock);
1442 exp = find_expect(ct, ct->tuplehash[dir].tuple.dst.ip,
1443 info->sig_port[!dir]);
1444 if (exp) {
1445 DEBUGP("ip_ct_ras: set Q.931 expect "
1446 "(%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu) "
1447 "timeout to %u seconds\n",
1448 NIPQUAD(exp->tuple.src.ip),
1449 ntohs(exp->tuple.src.u.tcp.port),
1450 NIPQUAD(exp->tuple.dst.ip),
1451 ntohs(exp->tuple.dst.u.tcp.port),
1452 info->timeout);
1453 set_expect_timeout(exp, info->timeout);
1454 }
1455 read_unlock_bh(&ip_conntrack_lock);
1456 }
1457
1458 return 0;
1459}
1460
1461/****************************************************************************/
1462static int process_urq(struct sk_buff **pskb, struct ip_conntrack *ct,
1463 enum ip_conntrack_info ctinfo,
1464 unsigned char **data, UnregistrationRequest * urq)
1465{
1466 struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
1467 int dir = CTINFO2DIR(ctinfo);
1468 int ret;
1469 typeof(set_sig_addr_hook) set_sig_addr;
1470
1471 DEBUGP("ip_ct_ras: URQ\n");
1472
1473 set_sig_addr = rcu_dereference(set_sig_addr_hook);
1474 if (set_sig_addr) {
1475 ret = set_sig_addr(pskb, ct, ctinfo, data,
1476 urq->callSignalAddress.item,
1477 urq->callSignalAddress.count);
1478 if (ret < 0)
1479 return -1;
1480 }
1481
1482 /* Clear old expect */
1483 ip_ct_remove_expectations(ct);
1484 info->sig_port[dir] = 0;
1485 info->sig_port[!dir] = 0;
1486
1487 /* Give it 30 seconds for UCF or URJ */
1488 ip_ct_refresh(ct, *pskb, 30 * HZ);
1489
1490 return 0;
1491}
1492
1493/****************************************************************************/
1494static int process_arq(struct sk_buff **pskb, struct ip_conntrack *ct,
1495 enum ip_conntrack_info ctinfo,
1496 unsigned char **data, AdmissionRequest * arq)
1497{
1498 struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
1499 int dir = CTINFO2DIR(ctinfo);
1500 __be32 ip;
1501 u_int16_t port;
1502 typeof(set_h225_addr_hook) set_h225_addr;
1503
1504 DEBUGP("ip_ct_ras: ARQ\n");
1505
1506 set_h225_addr = rcu_dereference(set_h225_addr_hook);
1507 if ((arq->options & eAdmissionRequest_destCallSignalAddress) &&
1508 get_h225_addr(*data, &arq->destCallSignalAddress, &ip, &port) &&
1509 ip == ct->tuplehash[dir].tuple.src.ip &&
1510 port == info->sig_port[dir] && set_h225_addr) {
1511 /* Answering ARQ */
1512 return set_h225_addr(pskb, data, 0,
1513 &arq->destCallSignalAddress,
1514 ct->tuplehash[!dir].tuple.dst.ip,
1515 info->sig_port[!dir]);
1516 }
1517
1518 if ((arq->options & eAdmissionRequest_srcCallSignalAddress) &&
1519 get_h225_addr(*data, &arq->srcCallSignalAddress, &ip, &port) &&
1520 ip == ct->tuplehash[dir].tuple.src.ip && set_h225_addr) {
1521 /* Calling ARQ */
1522 return set_h225_addr(pskb, data, 0,
1523 &arq->srcCallSignalAddress,
1524 ct->tuplehash[!dir].tuple.dst.ip,
1525 port);
1526 }
1527
1528 return 0;
1529}
1530
1531/****************************************************************************/
1532static int process_acf(struct sk_buff **pskb, struct ip_conntrack *ct,
1533 enum ip_conntrack_info ctinfo,
1534 unsigned char **data, AdmissionConfirm * acf)
1535{
1536 int dir = CTINFO2DIR(ctinfo);
1537 int ret = 0;
1538 __be32 ip;
1539 u_int16_t port;
1540 struct ip_conntrack_expect *exp;
1541 typeof(set_sig_addr_hook) set_sig_addr;
1542
1543 DEBUGP("ip_ct_ras: ACF\n");
1544
1545 if (!get_h225_addr(*data, &acf->destCallSignalAddress, &ip, &port))
1546 return 0;
1547
1548 if (ip == ct->tuplehash[dir].tuple.dst.ip) { /* Answering ACF */
1549 set_sig_addr = rcu_dereference(set_sig_addr_hook);
1550 if (set_sig_addr)
1551 return set_sig_addr(pskb, ct, ctinfo, data,
1552 &acf->destCallSignalAddress, 1);
1553 return 0;
1554 }
1555
1556 /* Need new expect */
1557 if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
1558 return -1;
1559 exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
1560 exp->tuple.src.u.tcp.port = 0;
1561 exp->tuple.dst.ip = ip;
1562 exp->tuple.dst.u.tcp.port = htons(port);
1563 exp->tuple.dst.protonum = IPPROTO_TCP;
1564 exp->mask.src.ip = htonl(0xFFFFFFFF);
1565 exp->mask.src.u.tcp.port = 0;
1566 exp->mask.dst.ip = htonl(0xFFFFFFFF);
1567 exp->mask.dst.u.tcp.port = htons(0xFFFF);
1568 exp->mask.dst.protonum = 0xFF;
1569 exp->flags = IP_CT_EXPECT_PERMANENT;
1570 exp->expectfn = ip_conntrack_q931_expect;
1571
1572 if (ip_conntrack_expect_related(exp) == 0) {
1573 DEBUGP("ip_ct_ras: expect Q.931 "
1574 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
1575 NIPQUAD(exp->tuple.src.ip),
1576 ntohs(exp->tuple.src.u.tcp.port),
1577 NIPQUAD(exp->tuple.dst.ip),
1578 ntohs(exp->tuple.dst.u.tcp.port));
1579 } else
1580 ret = -1;
1581
1582 ip_conntrack_expect_put(exp);
1583
1584 return ret;
1585}
1586
1587/****************************************************************************/
1588static int process_lrq(struct sk_buff **pskb, struct ip_conntrack *ct,
1589 enum ip_conntrack_info ctinfo,
1590 unsigned char **data, LocationRequest * lrq)
1591{
1592 typeof(set_ras_addr_hook) set_ras_addr;
1593
1594 DEBUGP("ip_ct_ras: LRQ\n");
1595
1596 set_ras_addr = rcu_dereference(set_ras_addr_hook);
1597 if (set_ras_addr)
1598 return set_ras_addr(pskb, ct, ctinfo, data,
1599 &lrq->replyAddress, 1);
1600 return 0;
1601}
1602
1603/****************************************************************************/
1604static int process_lcf(struct sk_buff **pskb, struct ip_conntrack *ct,
1605 enum ip_conntrack_info ctinfo,
1606 unsigned char **data, LocationConfirm * lcf)
1607{
1608 int dir = CTINFO2DIR(ctinfo);
1609 int ret = 0;
1610 __be32 ip;
1611 u_int16_t port;
1612 struct ip_conntrack_expect *exp = NULL;
1613
1614 DEBUGP("ip_ct_ras: LCF\n");
1615
1616 if (!get_h225_addr(*data, &lcf->callSignalAddress, &ip, &port))
1617 return 0;
1618
1619 /* Need new expect for call signal */
1620 if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
1621 return -1;
1622 exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
1623 exp->tuple.src.u.tcp.port = 0;
1624 exp->tuple.dst.ip = ip;
1625 exp->tuple.dst.u.tcp.port = htons(port);
1626 exp->tuple.dst.protonum = IPPROTO_TCP;
1627 exp->mask.src.ip = htonl(0xFFFFFFFF);
1628 exp->mask.src.u.tcp.port = 0;
1629 exp->mask.dst.ip = htonl(0xFFFFFFFF);
1630 exp->mask.dst.u.tcp.port = htons(0xFFFF);
1631 exp->mask.dst.protonum = 0xFF;
1632 exp->flags = IP_CT_EXPECT_PERMANENT;
1633 exp->expectfn = ip_conntrack_q931_expect;
1634
1635 if (ip_conntrack_expect_related(exp) == 0) {
1636 DEBUGP("ip_ct_ras: expect Q.931 "
1637 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
1638 NIPQUAD(exp->tuple.src.ip),
1639 ntohs(exp->tuple.src.u.tcp.port),
1640 NIPQUAD(exp->tuple.dst.ip),
1641 ntohs(exp->tuple.dst.u.tcp.port));
1642 } else
1643 ret = -1;
1644
1645 ip_conntrack_expect_put(exp);
1646
1647 /* Ignore rasAddress */
1648
1649 return ret;
1650}
1651
1652/****************************************************************************/
1653static int process_irr(struct sk_buff **pskb, struct ip_conntrack *ct,
1654 enum ip_conntrack_info ctinfo,
1655 unsigned char **data, InfoRequestResponse * irr)
1656{
1657 int ret;
1658 typeof(set_ras_addr_hook) set_ras_addr;
1659 typeof(set_sig_addr_hook) set_sig_addr;
1660
1661 DEBUGP("ip_ct_ras: IRR\n");
1662
1663 set_ras_addr = rcu_dereference(set_ras_addr_hook);
1664 if (set_ras_addr) {
1665 ret = set_ras_addr(pskb, ct, ctinfo, data,
1666 &irr->rasAddress, 1);
1667 if (ret < 0)
1668 return -1;
1669 }
1670
1671 set_sig_addr = rcu_dereference(set_sig_addr_hook);
1672 if (set_sig_addr) {
1673 ret = set_sig_addr(pskb, ct, ctinfo, data,
1674 irr->callSignalAddress.item,
1675 irr->callSignalAddress.count);
1676 if (ret < 0)
1677 return -1;
1678 }
1679
1680 return 0;
1681}
1682
1683/****************************************************************************/
1684static int process_ras(struct sk_buff **pskb, struct ip_conntrack *ct,
1685 enum ip_conntrack_info ctinfo,
1686 unsigned char **data, RasMessage * ras)
1687{
1688 switch (ras->choice) {
1689 case eRasMessage_gatekeeperRequest:
1690 return process_grq(pskb, ct, ctinfo, data,
1691 &ras->gatekeeperRequest);
1692 case eRasMessage_gatekeeperConfirm:
1693 return process_gcf(pskb, ct, ctinfo, data,
1694 &ras->gatekeeperConfirm);
1695 case eRasMessage_registrationRequest:
1696 return process_rrq(pskb, ct, ctinfo, data,
1697 &ras->registrationRequest);
1698 case eRasMessage_registrationConfirm:
1699 return process_rcf(pskb, ct, ctinfo, data,
1700 &ras->registrationConfirm);
1701 case eRasMessage_unregistrationRequest:
1702 return process_urq(pskb, ct, ctinfo, data,
1703 &ras->unregistrationRequest);
1704 case eRasMessage_admissionRequest:
1705 return process_arq(pskb, ct, ctinfo, data,
1706 &ras->admissionRequest);
1707 case eRasMessage_admissionConfirm:
1708 return process_acf(pskb, ct, ctinfo, data,
1709 &ras->admissionConfirm);
1710 case eRasMessage_locationRequest:
1711 return process_lrq(pskb, ct, ctinfo, data,
1712 &ras->locationRequest);
1713 case eRasMessage_locationConfirm:
1714 return process_lcf(pskb, ct, ctinfo, data,
1715 &ras->locationConfirm);
1716 case eRasMessage_infoRequestResponse:
1717 return process_irr(pskb, ct, ctinfo, data,
1718 &ras->infoRequestResponse);
1719 default:
1720 DEBUGP("ip_ct_ras: RAS message %d\n", ras->choice);
1721 break;
1722 }
1723
1724 return 0;
1725}
1726
1727/****************************************************************************/
1728static int ras_help(struct sk_buff **pskb, struct ip_conntrack *ct,
1729 enum ip_conntrack_info ctinfo)
1730{
1731 static RasMessage ras;
1732 unsigned char *data;
1733 int datalen = 0;
1734 int ret;
1735
1736 DEBUGP("ip_ct_ras: skblen = %u\n", (*pskb)->len);
1737
1738 spin_lock_bh(&ip_h323_lock);
1739
1740 /* Get UDP data */
1741 data = get_udp_data(pskb, &datalen);
1742 if (data == NULL)
1743 goto accept;
1744 DEBUGP("ip_ct_ras: RAS message %u.%u.%u.%u->%u.%u.%u.%u, len=%d\n",
1745 NIPQUAD((*pskb)->nh.iph->saddr),
1746 NIPQUAD((*pskb)->nh.iph->daddr), datalen);
1747
1748 /* Decode RAS message */
1749 ret = DecodeRasMessage(data, datalen, &ras);
1750 if (ret < 0) {
1751 if (net_ratelimit())
1752 printk("ip_ct_ras: decoding error: %s\n",
1753 ret == H323_ERROR_BOUND ?
1754 "out of bound" : "out of range");
1755 goto accept;
1756 }
1757
1758 /* Process RAS message */
1759 if (process_ras(pskb, ct, ctinfo, &data, &ras) < 0)
1760 goto drop;
1761
1762 accept:
1763 spin_unlock_bh(&ip_h323_lock);
1764 return NF_ACCEPT;
1765
1766 drop:
1767 spin_unlock_bh(&ip_h323_lock);
1768 if (net_ratelimit())
1769 printk("ip_ct_ras: packet dropped\n");
1770 return NF_DROP;
1771}
1772
1773/****************************************************************************/
1774static struct ip_conntrack_helper ip_conntrack_helper_ras = {
1775 .name = "RAS",
1776 .me = THIS_MODULE,
1777 .max_expected = 32,
1778 .timeout = 240,
1779 .tuple = {.src = {.u = {.tcp = {.port = __constant_htons(RAS_PORT)}}},
1780 .dst = {.protonum = IPPROTO_UDP}},
1781 .mask = {.src = {.u = {0xFFFE}},
1782 .dst = {.protonum = 0xFF}},
1783 .help = ras_help,
1784};
1785
1786/****************************************************************************/
1787static void ip_conntrack_ras_expect(struct ip_conntrack *new,
1788 struct ip_conntrack_expect *this)
1789{
1790 write_lock_bh(&ip_conntrack_lock);
1791 new->helper = &ip_conntrack_helper_ras;
1792 write_unlock_bh(&ip_conntrack_lock);
1793}
1794
1795/****************************************************************************/
1796/* Not __exit - called from init() */
1797static void fini(void)
1798{
1799 ip_conntrack_helper_unregister(&ip_conntrack_helper_ras);
1800 ip_conntrack_helper_unregister(&ip_conntrack_helper_q931);
1801 kfree(h323_buffer);
1802 DEBUGP("ip_ct_h323: fini\n");
1803}
1804
1805/****************************************************************************/
1806static int __init init(void)
1807{
1808 int ret;
1809
1810 h323_buffer = kmalloc(65536, GFP_KERNEL);
1811 if (!h323_buffer)
1812 return -ENOMEM;
1813 if ((ret = ip_conntrack_helper_register(&ip_conntrack_helper_q931)) ||
1814 (ret = ip_conntrack_helper_register(&ip_conntrack_helper_ras))) {
1815 fini();
1816 return ret;
1817 }
1818 DEBUGP("ip_ct_h323: init success\n");
1819 return 0;
1820}
1821
1822/****************************************************************************/
1823module_init(init);
1824module_exit(fini);
1825
1826EXPORT_SYMBOL_GPL(get_h225_addr);
1827EXPORT_SYMBOL_GPL(ip_conntrack_h245_expect);
1828EXPORT_SYMBOL_GPL(ip_conntrack_q931_expect);
1829EXPORT_SYMBOL_GPL(set_h245_addr_hook);
1830EXPORT_SYMBOL_GPL(set_h225_addr_hook);
1831EXPORT_SYMBOL_GPL(set_sig_addr_hook);
1832EXPORT_SYMBOL_GPL(set_ras_addr_hook);
1833EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook);
1834EXPORT_SYMBOL_GPL(nat_t120_hook);
1835EXPORT_SYMBOL_GPL(nat_h245_hook);
1836EXPORT_SYMBOL_GPL(nat_callforwarding_hook);
1837EXPORT_SYMBOL_GPL(nat_q931_hook);
1838
1839MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
1840MODULE_DESCRIPTION("H.323 connection tracking helper");
1841MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
deleted file mode 100644
index 2b760c5cf709..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
+++ /dev/null
@@ -1,684 +0,0 @@
1/*
2 * ip_conntrack_pptp.c - Version 3.0
3 *
4 * Connection tracking support for PPTP (Point to Point Tunneling Protocol).
5 * PPTP is a protocol for creating virtual private networks.
6 * It is a specification defined by Microsoft and some vendors
7 * working with Microsoft. PPTP is built on top of a modified
8 * version of the Internet Generic Routing Encapsulation Protocol.
9 * GRE is defined in RFC 1701 and RFC 1702. Documentation of
10 * PPTP can be found in RFC 2637.
11 *
12 * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
13 *
14 * Development of this code funded by Astaro AG (http://www.astaro.com/)
15 *
16 * Limitations:
17 * - We blindly assume that control connections are always
18 * established in PNS->PAC direction. This is a violation
19 * of RFC 2637
20 * - We can only support a single call within each session
21 *
22 * TODO:
23 * - testing of incoming PPTP calls
24 *
25 * Changes:
26 * 2002-02-05 - Version 1.3
27 * - Call ip_conntrack_unexpect_related() from
28 * pptp_destroy_siblings() to destroy expectations in case
29 * CALL_DISCONNECT_NOTIFY or tcp fin packet was seen
30 * (Philip Craig <philipc@snapgear.com>)
31 * - Add Version information at module loadtime
32 * 2002-02-10 - Version 1.6
33 * - move to C99 style initializers
34 * - remove second expectation if first arrives
35 * 2004-10-22 - Version 2.0
36 * - merge Mandrake's 2.6.x port with recent 2.6.x API changes
37 * - fix lots of linear skb assumptions from Mandrake's port
38 * 2005-06-10 - Version 2.1
39 * - use ip_conntrack_expect_free() instead of kfree() on the
40 * expect's (which are from the slab for quite some time)
41 * 2005-06-10 - Version 3.0
42 * - port helper to post-2.6.11 API changes,
43 * funded by Oxcoda NetBox Blue (http://www.netboxblue.com/)
44 * 2005-07-30 - Version 3.1
45 * - port helper to 2.6.13 API changes
46 *
47 */
48
49#include <linux/module.h>
50#include <linux/netfilter.h>
51#include <linux/ip.h>
52#include <net/checksum.h>
53#include <net/tcp.h>
54
55#include <linux/netfilter_ipv4/ip_conntrack.h>
56#include <linux/netfilter_ipv4/ip_conntrack_core.h>
57#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
58#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h>
59#include <linux/netfilter_ipv4/ip_conntrack_pptp.h>
60
61#define IP_CT_PPTP_VERSION "3.1"
62
63MODULE_LICENSE("GPL");
64MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
65MODULE_DESCRIPTION("Netfilter connection tracking helper module for PPTP");
66
67static DEFINE_SPINLOCK(ip_pptp_lock);
68
69int
70(*ip_nat_pptp_hook_outbound)(struct sk_buff **pskb,
71 struct ip_conntrack *ct,
72 enum ip_conntrack_info ctinfo,
73 struct PptpControlHeader *ctlh,
74 union pptp_ctrl_union *pptpReq);
75
76int
77(*ip_nat_pptp_hook_inbound)(struct sk_buff **pskb,
78 struct ip_conntrack *ct,
79 enum ip_conntrack_info ctinfo,
80 struct PptpControlHeader *ctlh,
81 union pptp_ctrl_union *pptpReq);
82
83void
84(*ip_nat_pptp_hook_exp_gre)(struct ip_conntrack_expect *expect_orig,
85 struct ip_conntrack_expect *expect_reply);
86
87void
88(*ip_nat_pptp_hook_expectfn)(struct ip_conntrack *ct,
89 struct ip_conntrack_expect *exp);
90
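/* these hook pointers are filled in by the ip_nat_pptp module when it
 * is loaded; readers fetch them with rcu_dereference() so the NAT
 * helper can be unloaded safely underneath a running conntrack helper */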
91#if 0
92/* PptpControlMessageType names */
93const char *pptp_msg_name[] = {
94 "UNKNOWN_MESSAGE",
95 "START_SESSION_REQUEST",
96 "START_SESSION_REPLY",
97 "STOP_SESSION_REQUEST",
98 "STOP_SESSION_REPLY",
99 "ECHO_REQUEST",
100 "ECHO_REPLY",
101 "OUT_CALL_REQUEST",
102 "OUT_CALL_REPLY",
103 "IN_CALL_REQUEST",
104 "IN_CALL_REPLY",
105 "IN_CALL_CONNECT",
106 "CALL_CLEAR_REQUEST",
107 "CALL_DISCONNECT_NOTIFY",
108 "WAN_ERROR_NOTIFY",
109 "SET_LINK_INFO"
110};
111EXPORT_SYMBOL(pptp_msg_name);
112#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, __FUNCTION__, ## args)
113#else
114#define DEBUGP(format, args...)
115#endif
116
117#define SECS *HZ
118#define MINS * 60 SECS
119#define HOURS * 60 MINS
120
121#define PPTP_GRE_TIMEOUT (10 MINS)
122#define PPTP_GRE_STREAM_TIMEOUT (5 HOURS)
123
124static void pptp_expectfn(struct ip_conntrack *ct,
125 struct ip_conntrack_expect *exp)
126{
127 typeof(ip_nat_pptp_hook_expectfn) ip_nat_pptp_expectfn;
128
129 DEBUGP("increasing timeouts\n");
130
131 /* increase timeout of GRE data channel conntrack entry */
132 ct->proto.gre.timeout = PPTP_GRE_TIMEOUT;
133 ct->proto.gre.stream_timeout = PPTP_GRE_STREAM_TIMEOUT;
134
135 /* Can you see how rusty this code is, compared with the pre-2.6.11
136 * one? That's what happened to my shiny newnat of 2002 ;( -HW */
137
138 rcu_read_lock();
139 ip_nat_pptp_expectfn = rcu_dereference(ip_nat_pptp_hook_expectfn);
140 if (!ip_nat_pptp_expectfn) {
141 struct ip_conntrack_tuple inv_t;
142 struct ip_conntrack_expect *exp_other;
143
144 /* obviously this tuple inversion only works until you do NAT */
145 invert_tuplepr(&inv_t, &exp->tuple);
146 DEBUGP("trying to unexpect other dir: ");
147 DUMP_TUPLE(&inv_t);
148
149 exp_other = ip_conntrack_expect_find_get(&inv_t);
150 if (exp_other) {
151 /* delete other expectation. */
152 DEBUGP("found\n");
153 ip_conntrack_unexpect_related(exp_other);
154 ip_conntrack_expect_put(exp_other);
155 } else {
156 DEBUGP("not found\n");
157 }
158 } else {
159 /* we need more than simple inversion */
160 ip_nat_pptp_expectfn(ct, exp);
161 }
162 rcu_read_unlock();
163}
164
165static int destroy_sibling_or_exp(const struct ip_conntrack_tuple *t)
166{
167 struct ip_conntrack_tuple_hash *h;
168 struct ip_conntrack_expect *exp;
169
170 DEBUGP("trying to timeout ct or exp for tuple ");
171 DUMP_TUPLE(t);
172
173 h = ip_conntrack_find_get(t, NULL);
174 if (h) {
175 struct ip_conntrack *sibling = tuplehash_to_ctrack(h);
176 DEBUGP("setting timeout of conntrack %p to 0\n", sibling);
177 sibling->proto.gre.timeout = 0;
178 sibling->proto.gre.stream_timeout = 0;
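		/* zeroing the timeouts above prevents further refreshes;
		 * if we win the race against the timer, invoke its expiry
		 * handler by hand so the entry is destroyed right away */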
179 if (del_timer(&sibling->timeout))
180 sibling->timeout.function((unsigned long)sibling);
181 ip_conntrack_put(sibling);
182 return 1;
183 } else {
184 exp = ip_conntrack_expect_find_get(t);
185 if (exp) {
186 DEBUGP("unexpect_related of expect %p\n", exp);
187 ip_conntrack_unexpect_related(exp);
188 ip_conntrack_expect_put(exp);
189 return 1;
190 }
191 }
192
193 return 0;
194}
195
196
197/* timeout GRE data connections */
198static void pptp_destroy_siblings(struct ip_conntrack *ct)
199{
200 struct ip_conntrack_tuple t;
201
202 ip_ct_gre_keymap_destroy(ct);
203 /* Since ct->sibling_list has literally rusted away in 2.6.11,
204 * we now need another way to find out about our sibling
205 * contrack and expects... -HW */
206
207 /* try original (pns->pac) tuple */
208 memcpy(&t, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, sizeof(t));
209 t.dst.protonum = IPPROTO_GRE;
210 t.src.u.gre.key = ct->help.ct_pptp_info.pns_call_id;
211 t.dst.u.gre.key = ct->help.ct_pptp_info.pac_call_id;
212
213 if (!destroy_sibling_or_exp(&t))
214 DEBUGP("failed to timeout original pns->pac ct/exp\n");
215
216 /* try reply (pac->pns) tuple */
217 memcpy(&t, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, sizeof(t));
218 t.dst.protonum = IPPROTO_GRE;
219 t.src.u.gre.key = ct->help.ct_pptp_info.pac_call_id;
220 t.dst.u.gre.key = ct->help.ct_pptp_info.pns_call_id;
221
222 if (!destroy_sibling_or_exp(&t))
223 DEBUGP("failed to timeout reply pac->pns ct/exp\n");
224}
225
226/* expect GRE connections (PNS->PAC and PAC->PNS direction) */
227static inline int
228exp_gre(struct ip_conntrack *ct,
229 __be16 callid,
230 __be16 peer_callid)
231{
232 struct ip_conntrack_expect *exp_orig, *exp_reply;
233 int ret = 1;
234 typeof(ip_nat_pptp_hook_exp_gre) ip_nat_pptp_exp_gre;
235
236 exp_orig = ip_conntrack_expect_alloc(ct);
237 if (exp_orig == NULL)
238 goto out;
239
240 exp_reply = ip_conntrack_expect_alloc(ct);
241 if (exp_reply == NULL)
242 goto out_put_orig;
243
244 /* original direction, PNS->PAC */
245 exp_orig->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
246 exp_orig->tuple.src.u.gre.key = peer_callid;
247 exp_orig->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip;
248 exp_orig->tuple.dst.u.gre.key = callid;
249 exp_orig->tuple.dst.protonum = IPPROTO_GRE;
250
251 exp_orig->mask.src.ip = htonl(0xffffffff);
252 exp_orig->mask.src.u.all = 0;
253 exp_orig->mask.dst.u.gre.key = htons(0xffff);
254 exp_orig->mask.dst.ip = htonl(0xffffffff);
255 exp_orig->mask.dst.protonum = 0xff;
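	/* the mask matches exact addresses, protocol and destination call
	 * ID, while the source GRE key (mask.src.u.all == 0) is left
	 * wildcarded */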
256
257 exp_orig->master = ct;
258 exp_orig->expectfn = pptp_expectfn;
259 exp_orig->flags = 0;
260
261 /* both expectations are identical apart from tuple */
262 memcpy(exp_reply, exp_orig, sizeof(*exp_reply));
263
264 /* reply direction, PAC->PNS */
265 exp_reply->tuple.src.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip;
266 exp_reply->tuple.src.u.gre.key = callid;
267 exp_reply->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip;
268 exp_reply->tuple.dst.u.gre.key = peer_callid;
269 exp_reply->tuple.dst.protonum = IPPROTO_GRE;
270
271 ip_nat_pptp_exp_gre = rcu_dereference(ip_nat_pptp_hook_exp_gre);
272 if (ip_nat_pptp_exp_gre)
273 ip_nat_pptp_exp_gre(exp_orig, exp_reply);
274 if (ip_conntrack_expect_related(exp_orig) != 0)
275 goto out_put_both;
276 if (ip_conntrack_expect_related(exp_reply) != 0)
277 goto out_unexpect_orig;
278
279 /* Add GRE keymap entries */
280 if (ip_ct_gre_keymap_add(ct, &exp_orig->tuple, 0) != 0)
281 goto out_unexpect_both;
282 if (ip_ct_gre_keymap_add(ct, &exp_reply->tuple, 1) != 0) {
283 ip_ct_gre_keymap_destroy(ct);
284 goto out_unexpect_both;
285 }
286 ret = 0;
287
288out_put_both:
289 ip_conntrack_expect_put(exp_reply);
290out_put_orig:
291 ip_conntrack_expect_put(exp_orig);
292out:
293 return ret;
294
295out_unexpect_both:
296 ip_conntrack_unexpect_related(exp_reply);
297out_unexpect_orig:
298 ip_conntrack_unexpect_related(exp_orig);
299 goto out_put_both;
300}
301
302static inline int
303pptp_inbound_pkt(struct sk_buff **pskb,
304 struct PptpControlHeader *ctlh,
305 union pptp_ctrl_union *pptpReq,
306 unsigned int reqlen,
307 struct ip_conntrack *ct,
308 enum ip_conntrack_info ctinfo)
309{
310 struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info;
311 u_int16_t msg;
312 __be16 cid = 0, pcid = 0;
313 typeof(ip_nat_pptp_hook_inbound) ip_nat_pptp_inbound;
314
315 msg = ntohs(ctlh->messageType);
316 DEBUGP("inbound control message %s\n", pptp_msg_name[msg]);
317
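	/* inbound (PAC->PNS) half of the PPTP state machine: session
	 * replies move info->sstate, call messages move info->cstate;
	 * a successful OUT_CALL_REPLY or IN_CALL_CONNECT registers GRE
	 * expectations via exp_gre(), and CALL_DISCONNECT_NOTIFY tears
	 * the data channels down again */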
318 switch (msg) {
319 case PPTP_START_SESSION_REPLY:
320 /* server confirms new control session */
321 if (info->sstate < PPTP_SESSION_REQUESTED)
322 goto invalid;
323 if (pptpReq->srep.resultCode == PPTP_START_OK)
324 info->sstate = PPTP_SESSION_CONFIRMED;
325 else
326 info->sstate = PPTP_SESSION_ERROR;
327 break;
328
329 case PPTP_STOP_SESSION_REPLY:
330 /* server confirms end of control session */
331 if (info->sstate > PPTP_SESSION_STOPREQ)
332 goto invalid;
333 if (pptpReq->strep.resultCode == PPTP_STOP_OK)
334 info->sstate = PPTP_SESSION_NONE;
335 else
336 info->sstate = PPTP_SESSION_ERROR;
337 break;
338
339 case PPTP_OUT_CALL_REPLY:
340 /* server accepted call, we now expect GRE frames */
341 if (info->sstate != PPTP_SESSION_CONFIRMED)
342 goto invalid;
343 if (info->cstate != PPTP_CALL_OUT_REQ &&
344 info->cstate != PPTP_CALL_OUT_CONF)
345 goto invalid;
346
347 cid = pptpReq->ocack.callID;
348 pcid = pptpReq->ocack.peersCallID;
349 if (info->pns_call_id != pcid)
350 goto invalid;
351 DEBUGP("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg],
352 ntohs(cid), ntohs(pcid));
353
354 if (pptpReq->ocack.resultCode == PPTP_OUTCALL_CONNECT) {
355 info->cstate = PPTP_CALL_OUT_CONF;
356 info->pac_call_id = cid;
357 exp_gre(ct, cid, pcid);
358 } else
359 info->cstate = PPTP_CALL_NONE;
360 break;
361
362 case PPTP_IN_CALL_REQUEST:
363 /* server tells us about incoming call request */
364 if (info->sstate != PPTP_SESSION_CONFIRMED)
365 goto invalid;
366
367 cid = pptpReq->icreq.callID;
368 DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
369 info->cstate = PPTP_CALL_IN_REQ;
370 info->pac_call_id = cid;
371 break;
372
373 case PPTP_IN_CALL_CONNECT:
374 /* server tells us about incoming call established */
375 if (info->sstate != PPTP_SESSION_CONFIRMED)
376 goto invalid;
377 if (info->cstate != PPTP_CALL_IN_REP &&
378 info->cstate != PPTP_CALL_IN_CONF)
379 goto invalid;
380
381 pcid = pptpReq->iccon.peersCallID;
382 cid = info->pac_call_id;
383
384 if (info->pns_call_id != pcid)
385 goto invalid;
386
387 DEBUGP("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(pcid));
388 info->cstate = PPTP_CALL_IN_CONF;
389
390 /* we expect a GRE connection from PAC to PNS */
391 exp_gre(ct, cid, pcid);
392 break;
393
394 case PPTP_CALL_DISCONNECT_NOTIFY:
395 /* server confirms disconnect */
396 cid = pptpReq->disc.callID;
397 DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
398 info->cstate = PPTP_CALL_NONE;
399
400 /* untrack this call id, unexpect GRE packets */
401 pptp_destroy_siblings(ct);
402 break;
403
404 case PPTP_WAN_ERROR_NOTIFY:
405 case PPTP_ECHO_REQUEST:
406 case PPTP_ECHO_REPLY:
407 /* I don't have to explain these ;) */
408 break;
409 default:
410 goto invalid;
411 }
412
413 ip_nat_pptp_inbound = rcu_dereference(ip_nat_pptp_hook_inbound);
414 if (ip_nat_pptp_inbound)
415 return ip_nat_pptp_inbound(pskb, ct, ctinfo, ctlh, pptpReq);
416 return NF_ACCEPT;
417
418invalid:
419 DEBUGP("invalid %s: type=%d cid=%u pcid=%u "
420 "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n",
421 msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0],
422 msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate,
423 ntohs(info->pns_call_id), ntohs(info->pac_call_id));
424 return NF_ACCEPT;
425}
426
427static inline int
428pptp_outbound_pkt(struct sk_buff **pskb,
429 struct PptpControlHeader *ctlh,
430 union pptp_ctrl_union *pptpReq,
431 unsigned int reqlen,
432 struct ip_conntrack *ct,
433 enum ip_conntrack_info ctinfo)
434{
435 struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info;
436 u_int16_t msg;
437 __be16 cid = 0, pcid = 0;
438 typeof(ip_nat_pptp_hook_outbound) ip_nat_pptp_outbound;
439
440 msg = ntohs(ctlh->messageType);
441 DEBUGP("outbound control message %s\n", pptp_msg_name[msg]);
442
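	/* outbound (PNS->PAC) half of the state machine: client requests
	 * advance info->sstate/info->cstate, and the PNS call ID taken
	 * from OUT_CALL_REQUEST is later checked against the peer call
	 * ID echoed back in the server's OUT_CALL_REPLY */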
443 switch (msg) {
444 case PPTP_START_SESSION_REQUEST:
445 /* client requests for new control session */
446 if (info->sstate != PPTP_SESSION_NONE)
447 goto invalid;
448 info->sstate = PPTP_SESSION_REQUESTED;
449 break;
450 case PPTP_STOP_SESSION_REQUEST:
451 /* client requests end of control session */
452 info->sstate = PPTP_SESSION_STOPREQ;
453 break;
454
455 case PPTP_OUT_CALL_REQUEST:
456 /* client initiating connection to server */
457 if (info->sstate != PPTP_SESSION_CONFIRMED)
458 goto invalid;
459 info->cstate = PPTP_CALL_OUT_REQ;
460 /* track PNS call id */
461 cid = pptpReq->ocreq.callID;
462 DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
463 info->pns_call_id = cid;
464 break;
465 case PPTP_IN_CALL_REPLY:
466 /* client answers incoming call */
467 if (info->cstate != PPTP_CALL_IN_REQ &&
468 info->cstate != PPTP_CALL_IN_REP)
469 goto invalid;
470
471 cid = pptpReq->icack.callID;
472 pcid = pptpReq->icack.peersCallID;
473 if (info->pac_call_id != pcid)
474 goto invalid;
475 DEBUGP("%s, CID=%X PCID=%X\n", pptp_msg_name[msg],
476 ntohs(cid), ntohs(pcid));
477
478 if (pptpReq->icack.resultCode == PPTP_INCALL_ACCEPT) {
479 /* part two of the three-way handshake */
480 info->cstate = PPTP_CALL_IN_REP;
481 info->pns_call_id = cid;
482 } else
483 info->cstate = PPTP_CALL_NONE;
484 break;
485
486 case PPTP_CALL_CLEAR_REQUEST:
487 /* client requests hangup of call */
488 if (info->sstate != PPTP_SESSION_CONFIRMED)
489 goto invalid;
490 /* FUTURE: iterate over all calls and check if
491 * call ID is valid. We don't do this without newnat,
492 * because we only know about last call */
493 info->cstate = PPTP_CALL_CLEAR_REQ;
494 break;
495 case PPTP_SET_LINK_INFO:
496 case PPTP_ECHO_REQUEST:
497 case PPTP_ECHO_REPLY:
498 /* I don't have to explain these ;) */
499 break;
500 default:
501 goto invalid;
502 }
503
504 ip_nat_pptp_outbound = rcu_dereference(ip_nat_pptp_hook_outbound);
505 if (ip_nat_pptp_outbound)
506 return ip_nat_pptp_outbound(pskb, ct, ctinfo, ctlh, pptpReq);
507 return NF_ACCEPT;
508
509invalid:
510 DEBUGP("invalid %s: type=%d cid=%u pcid=%u "
511 "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n",
512 msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0],
513 msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate,
514 ntohs(info->pns_call_id), ntohs(info->pac_call_id));
515 return NF_ACCEPT;
516}
517
518static const unsigned int pptp_msg_size[] = {
519 [PPTP_START_SESSION_REQUEST] = sizeof(struct PptpStartSessionRequest),
520 [PPTP_START_SESSION_REPLY] = sizeof(struct PptpStartSessionReply),
521 [PPTP_STOP_SESSION_REQUEST] = sizeof(struct PptpStopSessionRequest),
522 [PPTP_STOP_SESSION_REPLY] = sizeof(struct PptpStopSessionReply),
523 [PPTP_OUT_CALL_REQUEST] = sizeof(struct PptpOutCallRequest),
524 [PPTP_OUT_CALL_REPLY] = sizeof(struct PptpOutCallReply),
525 [PPTP_IN_CALL_REQUEST] = sizeof(struct PptpInCallRequest),
526 [PPTP_IN_CALL_REPLY] = sizeof(struct PptpInCallReply),
527 [PPTP_IN_CALL_CONNECT] = sizeof(struct PptpInCallConnected),
528 [PPTP_CALL_CLEAR_REQUEST] = sizeof(struct PptpClearCallRequest),
529 [PPTP_CALL_DISCONNECT_NOTIFY] = sizeof(struct PptpCallDisconnectNotify),
530 [PPTP_WAN_ERROR_NOTIFY] = sizeof(struct PptpWanErrorNotify),
531 [PPTP_SET_LINK_INFO] = sizeof(struct PptpSetLinkInfo),
532};
533
534/* track caller id inside control connection, call expect_related */
535static int
536conntrack_pptp_help(struct sk_buff **pskb,
537 struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
538
539{
540 int dir = CTINFO2DIR(ctinfo);
541 struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info;
542 struct tcphdr _tcph, *tcph;
543 struct pptp_pkt_hdr _pptph, *pptph;
544 struct PptpControlHeader _ctlh, *ctlh;
545 union pptp_ctrl_union _pptpReq, *pptpReq;
546 unsigned int tcplen = (*pskb)->len - (*pskb)->nh.iph->ihl * 4;
547 unsigned int datalen, reqlen, nexthdr_off;
548 int oldsstate, oldcstate;
549 int ret;
550 u_int16_t msg;
551
552 /* don't do any tracking before tcp handshake complete */
553 if (ctinfo != IP_CT_ESTABLISHED
554 && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) {
555 DEBUGP("ctinfo = %u, skipping\n", ctinfo);
556 return NF_ACCEPT;
557 }
558
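	/* walk the IP -> TCP -> PPTP headers; skb_header_pointer()
	 * returns a pointer into the skb when the data is linear and
	 * otherwise copies into the on-stack _tcph/_pptph/_ctlh buffers */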
559 nexthdr_off = (*pskb)->nh.iph->ihl*4;
560 tcph = skb_header_pointer(*pskb, nexthdr_off, sizeof(_tcph), &_tcph);
561 BUG_ON(!tcph);
562 nexthdr_off += tcph->doff * 4;
563 datalen = tcplen - tcph->doff * 4;
564
565 pptph = skb_header_pointer(*pskb, nexthdr_off, sizeof(_pptph), &_pptph);
566 if (!pptph) {
567 DEBUGP("no full PPTP header, can't track\n");
568 return NF_ACCEPT;
569 }
570 nexthdr_off += sizeof(_pptph);
571 datalen -= sizeof(_pptph);
572
573 /* if it's not a control message we can't do anything with it */
574 if (ntohs(pptph->packetType) != PPTP_PACKET_CONTROL ||
575 ntohl(pptph->magicCookie) != PPTP_MAGIC_COOKIE) {
576 DEBUGP("not a control packet\n");
577 return NF_ACCEPT;
578 }
579
580 ctlh = skb_header_pointer(*pskb, nexthdr_off, sizeof(_ctlh), &_ctlh);
581 if (!ctlh)
582 return NF_ACCEPT;
583 nexthdr_off += sizeof(_ctlh);
584 datalen -= sizeof(_ctlh);
585
586 reqlen = datalen;
587 msg = ntohs(ctlh->messageType);
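	/* ignore messages whose payload is shorter than the fixed-size
	 * structure for that type, and clamp the copy to the size of
	 * the pptp_ctrl_union we actually parse */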
588 if (msg > 0 && msg <= PPTP_MSG_MAX && reqlen < pptp_msg_size[msg])
589 return NF_ACCEPT;
590 if (reqlen > sizeof(*pptpReq))
591 reqlen = sizeof(*pptpReq);
592
593 pptpReq = skb_header_pointer(*pskb, nexthdr_off, reqlen, &_pptpReq);
594 if (!pptpReq)
595 return NF_ACCEPT;
596
597 oldsstate = info->sstate;
598 oldcstate = info->cstate;
599
600 spin_lock_bh(&ip_pptp_lock);
601
602 /* FIXME: We just blindly assume that the control connection is always
603 * established from PNS->PAC. However, RFC makes no guarantee */
604 if (dir == IP_CT_DIR_ORIGINAL)
605 /* client -> server (PNS -> PAC) */
606 ret = pptp_outbound_pkt(pskb, ctlh, pptpReq, reqlen, ct,
607 ctinfo);
608 else
609 /* server -> client (PAC -> PNS) */
610 ret = pptp_inbound_pkt(pskb, ctlh, pptpReq, reqlen, ct,
611 ctinfo);
612 DEBUGP("sstate: %d->%d, cstate: %d->%d\n",
613 oldsstate, info->sstate, oldcstate, info->cstate);
614 spin_unlock_bh(&ip_pptp_lock);
615
616 return ret;
617}
618
619/* control protocol helper */
620static struct ip_conntrack_helper pptp = {
621 .list = { NULL, NULL },
622 .name = "pptp",
623 .me = THIS_MODULE,
624 .max_expected = 2,
625 .timeout = 5 * 60,
626 .tuple = { .src = { .ip = 0,
627 .u = { .tcp = { .port =
628 __constant_htons(PPTP_CONTROL_PORT) } }
629 },
630 .dst = { .ip = 0,
631 .u = { .all = 0 },
632 .protonum = IPPROTO_TCP
633 }
634 },
635 .mask = { .src = { .ip = 0,
636 .u = { .tcp = { .port = __constant_htons(0xffff) } }
637 },
638 .dst = { .ip = 0,
639 .u = { .all = 0 },
640 .protonum = 0xff
641 }
642 },
643 .help = conntrack_pptp_help,
644 .destroy = pptp_destroy_siblings,
645};
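/* the tuple/mask pair above makes this helper match any TCP connection
 * to port 1723 (PPTP_CONTROL_PORT), with both addresses wildcarded */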
646
647extern void ip_ct_proto_gre_fini(void);
648extern int __init ip_ct_proto_gre_init(void);
649
650/* ip_conntrack_pptp initialization */
651static int __init ip_conntrack_helper_pptp_init(void)
652{
653 int retcode;
654
655 retcode = ip_ct_proto_gre_init();
656 if (retcode < 0)
657 return retcode;
658
659 DEBUGP(" registering helper\n");
660 if ((retcode = ip_conntrack_helper_register(&pptp))) {
661 printk(KERN_ERR "Unable to register conntrack application "
662 "helper for pptp: %d\n", retcode);
663 ip_ct_proto_gre_fini();
664 return retcode;
665 }
666
667 printk("ip_conntrack_pptp version %s loaded\n", IP_CT_PPTP_VERSION);
668 return 0;
669}
670
671static void __exit ip_conntrack_helper_pptp_fini(void)
672{
673 ip_conntrack_helper_unregister(&pptp);
674 ip_ct_proto_gre_fini();
675 printk("ip_conntrack_pptp version %s unloaded\n", IP_CT_PPTP_VERSION);
676}
677
678module_init(ip_conntrack_helper_pptp_init);
679module_exit(ip_conntrack_helper_pptp_fini);
680
681EXPORT_SYMBOL(ip_nat_pptp_hook_outbound);
682EXPORT_SYMBOL(ip_nat_pptp_hook_inbound);
683EXPORT_SYMBOL(ip_nat_pptp_hook_exp_gre);
684EXPORT_SYMBOL(ip_nat_pptp_hook_expectfn);
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
deleted file mode 100644
index 053e591f407a..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_irc.c
+++ /dev/null
@@ -1,314 +0,0 @@
1/* IRC extension for IP connection tracking, Version 1.21
2 * (C) 2000-2002 by Harald Welte <laforge@gnumonks.org>
3 * based on RR's ip_conntrack_ftp.c
4 *
5 * ip_conntrack_irc.c,v 1.21 2002/02/05 14:49:26 laforge Exp
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Module load syntax:
13 * insmod ip_conntrack_irc.o ports=port1,port2,...port<MAX_PORTS>
14 * max_dcc_channels=n dcc_timeout=secs
15 *
16 * Please give the ports of all IRC servers you wish to connect to.
17 * If you don't specify ports, the default will be port 6667.
18 * With max_dcc_channels you can define the maximum number of
19 * pending, not yet answered DCC channels per IRC session (default 8).
20 * With dcc_timeout you can specify how long the system waits for
21 * an expected DCC channel (default 300 seconds).
22 *
23 */
24
25#include <linux/module.h>
26#include <linux/netfilter.h>
27#include <linux/ip.h>
28#include <net/checksum.h>
29#include <net/tcp.h>
30
31#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
32#include <linux/netfilter_ipv4/ip_conntrack_irc.h>
33#include <linux/moduleparam.h>
34
35#define MAX_PORTS 8
36static unsigned short ports[MAX_PORTS];
37static int ports_c;
38static unsigned int max_dcc_channels = 8;
39static unsigned int dcc_timeout = 300;
40/* This is slow, but it's simple. --RR */
41static char *irc_buffer;
42static DEFINE_SPINLOCK(irc_buffer_lock);
43
44unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb,
45 enum ip_conntrack_info ctinfo,
46 unsigned int matchoff,
47 unsigned int matchlen,
48 struct ip_conntrack_expect *exp);
49EXPORT_SYMBOL_GPL(ip_nat_irc_hook);
50
51MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
52MODULE_DESCRIPTION("IRC (DCC) connection tracking helper");
53MODULE_LICENSE("GPL");
54module_param_array(ports, ushort, &ports_c, 0400);
55MODULE_PARM_DESC(ports, "port numbers of IRC servers");
56module_param(max_dcc_channels, uint, 0400);
57MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session");
58module_param(dcc_timeout, uint, 0400);
59MODULE_PARM_DESC(dcc_timeout, "timeout (in seconds) for unestablished DCC channels");
60
61static const char *dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " };
62#define MINMATCHLEN 5
63
64#if 0
65#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s:" format, \
66 __FILE__, __FUNCTION__ , ## args)
67#else
68#define DEBUGP(format, args...)
69#endif
70
71static int parse_dcc(char *data, char *data_end, u_int32_t *ip,
72 u_int16_t *port, char **ad_beg_p, char **ad_end_p)
73/* tries to get the ip_addr and port out of a dcc command
74 return value: -1 on failure, 0 on success
75 data pointer to first byte of DCC command data
76 data_end pointer to last byte of dcc command data
77 ip returns parsed ip of dcc command
78 port returns parsed port of dcc command
79 ad_beg_p returns pointer to first byte of addr data
80 ad_end_p returns pointer to last byte of addr data */
81{
82
83 /* at least 12: "AAAAAAAA P\1\n" */
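	/* skip the first argument (e.g. the DCC file name) up to the
	 * space preceding the address; bail out once fewer than 12
	 * bytes ("AAAAAAAA P\1\n") could possibly remain */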
84 while (*data++ != ' ')
85 if (data > data_end - 12)
86 return -1;
87
88 *ad_beg_p = data;
89 *ip = simple_strtoul(data, &data, 10);
90
91 /* skip blanks between ip and port */
92 while (*data == ' ') {
93 if (data >= data_end)
94 return -1;
95 data++;
96 }
97
98 *port = simple_strtoul(data, &data, 10);
99 *ad_end_p = data;
100
101 return 0;
102}
103
104static int help(struct sk_buff **pskb,
105 struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
106{
107 unsigned int dataoff;
108 struct tcphdr _tcph, *th;
109 char *data, *data_limit, *ib_ptr;
110 int dir = CTINFO2DIR(ctinfo);
111 struct ip_conntrack_expect *exp;
112 u32 seq;
113 u_int32_t dcc_ip;
114 u_int16_t dcc_port;
115 int i, ret = NF_ACCEPT;
116 char *addr_beg_p, *addr_end_p;
117 typeof(ip_nat_irc_hook) ip_nat_irc;
118
119 DEBUGP("entered\n");
120
121 /* If packet is coming from IRC server */
122 if (dir == IP_CT_DIR_REPLY)
123 return NF_ACCEPT;
124
125 /* Until there's been traffic both ways, don't look in packets. */
126 if (ctinfo != IP_CT_ESTABLISHED
127 && ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) {
128 DEBUGP("Conntrackinfo = %u\n", ctinfo);
129 return NF_ACCEPT;
130 }
131
132 /* Not a full tcp header? */
133 th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4,
134 sizeof(_tcph), &_tcph);
135 if (th == NULL)
136 return NF_ACCEPT;
137
138 /* No data? */
139 dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4;
140 if (dataoff >= (*pskb)->len)
141 return NF_ACCEPT;
142
143 spin_lock_bh(&irc_buffer_lock);
144 ib_ptr = skb_header_pointer(*pskb, dataoff,
145 (*pskb)->len - dataoff, irc_buffer);
146 BUG_ON(ib_ptr == NULL);
147
148 data = ib_ptr;
149 data_limit = ib_ptr + (*pskb)->len - dataoff;
150
151 /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24
152 * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n") = 5+5+14 = 24 */
153 while (data < (data_limit - (19 + MINMATCHLEN))) {
154 if (memcmp(data, "\1DCC ", 5)) {
155 data++;
156 continue;
157 }
158
159 data += 5;
160 /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */
161
162 DEBUGP("DCC found in master %u.%u.%u.%u:%u %u.%u.%u.%u:%u...\n",
163 NIPQUAD(iph->saddr), ntohs(th->source),
164 NIPQUAD(iph->daddr), ntohs(th->dest));
165
166 for (i = 0; i < ARRAY_SIZE(dccprotos); i++) {
167 if (memcmp(data, dccprotos[i], strlen(dccprotos[i]))) {
168 /* no match */
169 continue;
170 }
171
172 DEBUGP("DCC %s detected\n", dccprotos[i]);
173 data += strlen(dccprotos[i]);
174 /* we have at least
175 * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid
176 * data left (== 14/13 bytes) */
177 if (parse_dcc((char *)data, data_limit, &dcc_ip,
178 &dcc_port, &addr_beg_p, &addr_end_p)) {
179 /* unable to parse */
180 DEBUGP("unable to parse dcc command\n");
181 continue;
182 }
183 DEBUGP("DCC bound ip/port: %u.%u.%u.%u:%u\n",
184 HIPQUAD(dcc_ip), dcc_port);
185
186 /* dcc_ip can be the internal OR external (NAT'ed) IP
187 * Tiago Sousa <mirage@kaotik.org> */
188 if (ct->tuplehash[dir].tuple.src.ip != htonl(dcc_ip)
189 && ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip != htonl(dcc_ip)) {
190 if (net_ratelimit())
191 printk(KERN_WARNING
192 "Forged DCC command from "
193 "%u.%u.%u.%u: %u.%u.%u.%u:%u\n",
194 NIPQUAD(ct->tuplehash[dir].tuple.src.ip),
195 HIPQUAD(dcc_ip), dcc_port);
196
197 continue;
198 }
199
200 exp = ip_conntrack_expect_alloc(ct);
201 if (exp == NULL) {
202 ret = NF_DROP;
203 goto out;
204 }
205
206 /* save position of address in dcc string,
207 * necessary for NAT */
208 DEBUGP("tcph->seq = %u\n", th->seq);
209 seq = ntohl(th->seq) + (addr_beg_p - ib_ptr);
210
211 /* We refer to the reverse direction ("!dir")
212 * tuples here, because we're expecting
213			 * something in the other direction.
214 * Doesn't matter unless NAT is happening. */
215 exp->tuple = ((struct ip_conntrack_tuple)
216 { { 0, { 0 } },
217 { ct->tuplehash[!dir].tuple.dst.ip,
218 { .tcp = { htons(dcc_port) } },
219 IPPROTO_TCP }});
220 exp->mask = ((struct ip_conntrack_tuple)
221 { { 0, { 0 } },
222 { htonl(0xFFFFFFFF),
223 { .tcp = { htons(0xFFFF) } }, 0xFF }});
224 exp->expectfn = NULL;
225 exp->flags = 0;
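			/* with the IRC NAT helper loaded, let it mangle the
			 * address inside the DCC command and register the
			 * (possibly rewritten) expectation itself; otherwise
			 * register the expectation unchanged */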
226 ip_nat_irc = rcu_dereference(ip_nat_irc_hook);
227 if (ip_nat_irc)
228 ret = ip_nat_irc(pskb, ctinfo,
229 addr_beg_p - ib_ptr,
230 addr_end_p - addr_beg_p,
231 exp);
232 else if (ip_conntrack_expect_related(exp) != 0)
233 ret = NF_DROP;
234 ip_conntrack_expect_put(exp);
235 goto out;
236 } /* for .. NUM_DCCPROTO */
237 } /* while data < ... */
238
239 out:
240 spin_unlock_bh(&irc_buffer_lock);
241 return ret;
242}
243
244static struct ip_conntrack_helper irc_helpers[MAX_PORTS];
245static char irc_names[MAX_PORTS][sizeof("irc-65535")];
246
247static void ip_conntrack_irc_fini(void);
248
249static int __init ip_conntrack_irc_init(void)
250{
251 int i, ret;
252 struct ip_conntrack_helper *hlpr;
253 char *tmpname;
254
255 if (max_dcc_channels < 1) {
256 printk("ip_conntrack_irc: max_dcc_channels must be a positive integer\n");
257 return -EBUSY;
258 }
259
260 irc_buffer = kmalloc(65536, GFP_KERNEL);
261 if (!irc_buffer)
262 return -ENOMEM;
263
264 /* If no port given, default to standard irc port */
265 if (ports_c == 0)
266 ports[ports_c++] = IRC_PORT;
267
268 for (i = 0; i < ports_c; i++) {
269 hlpr = &irc_helpers[i];
270 hlpr->tuple.src.u.tcp.port = htons(ports[i]);
271 hlpr->tuple.dst.protonum = IPPROTO_TCP;
272 hlpr->mask.src.u.tcp.port = htons(0xFFFF);
273 hlpr->mask.dst.protonum = 0xFF;
274 hlpr->max_expected = max_dcc_channels;
275 hlpr->timeout = dcc_timeout;
276 hlpr->me = THIS_MODULE;
277 hlpr->help = help;
278
279 tmpname = &irc_names[i][0];
280 if (ports[i] == IRC_PORT)
281 sprintf(tmpname, "irc");
282 else
283 sprintf(tmpname, "irc-%d", i);
284 hlpr->name = tmpname;
285
286 DEBUGP("port #%d: %d\n", i, ports[i]);
287
288 ret = ip_conntrack_helper_register(hlpr);
289
290 if (ret) {
291 printk("ip_conntrack_irc: ERROR registering port %d\n",
292 ports[i]);
293 ip_conntrack_irc_fini();
294 return -EBUSY;
295 }
296 }
297 return 0;
298}
299
300/* This function is intentionally _NOT_ defined as __exit, because
301 * it is needed by the init function */
302static void ip_conntrack_irc_fini(void)
303{
304 int i;
305 for (i = 0; i < ports_c; i++) {
306 DEBUGP("unregistering port %d\n",
307 ports[i]);
308 ip_conntrack_helper_unregister(&irc_helpers[i]);
309 }
310 kfree(irc_buffer);
311}
312
313module_init(ip_conntrack_irc_init);
314module_exit(ip_conntrack_irc_fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_netbios_ns.c b/net/ipv4/netfilter/ip_conntrack_netbios_ns.c
deleted file mode 100644
index cc6dd49c9da0..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_netbios_ns.c
+++ /dev/null
@@ -1,143 +0,0 @@
1/*
2 * NetBIOS name service broadcast connection tracking helper
3 *
4 * (c) 2005 Patrick McHardy <kaber@trash.net>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11/*
12 * This helper tracks locally originating NetBIOS name service
13 * requests by issuing permanent expectations (valid until
14 * timing out) matching all reply connections from the
15 * destination network. The only NetBIOS specific thing is
16 * actually the port number.
17 */
18#include <linux/kernel.h>
19#include <linux/module.h>
20#include <linux/init.h>
21#include <linux/skbuff.h>
22#include <linux/netdevice.h>
23#include <linux/inetdevice.h>
24#include <linux/if_addr.h>
25#include <linux/in.h>
26#include <linux/ip.h>
27#include <net/route.h>
28
29#include <linux/netfilter.h>
30#include <linux/netfilter_ipv4.h>
31#include <linux/netfilter_ipv4/ip_conntrack.h>
32#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
33
34#define NMBD_PORT 137
35
36MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
37MODULE_DESCRIPTION("NetBIOS name service broadcast connection tracking helper");
38MODULE_LICENSE("GPL");
39
40static unsigned int timeout = 3;
41module_param(timeout, uint, 0400);
42MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
43
44static int help(struct sk_buff **pskb,
45 struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
46{
47 struct ip_conntrack_expect *exp;
48 struct iphdr *iph = (*pskb)->nh.iph;
49 struct rtable *rt = (struct rtable *)(*pskb)->dst;
50 struct in_device *in_dev;
51 __be32 mask = 0;
52
53 /* we're only interested in locally generated packets */
54 if ((*pskb)->sk == NULL)
55 goto out;
56 if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST))
57 goto out;
58 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
59 goto out;
60
61 rcu_read_lock();
62 in_dev = __in_dev_get_rcu(rt->u.dst.dev);
63 if (in_dev != NULL) {
64 for_primary_ifa(in_dev) {
65 if (ifa->ifa_broadcast == iph->daddr) {
66 mask = ifa->ifa_mask;
67 break;
68 }
69 } endfor_ifa(in_dev);
70 }
71 rcu_read_unlock();
72
73 if (mask == 0)
74 goto out;
75
76 exp = ip_conntrack_expect_alloc(ct);
77 if (exp == NULL)
78 goto out;
79
80 exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
81 exp->tuple.src.u.udp.port = htons(NMBD_PORT);
82
83 exp->mask.src.ip = mask;
84 exp->mask.src.u.udp.port = htons(0xFFFF);
85 exp->mask.dst.ip = htonl(0xFFFFFFFF);
86 exp->mask.dst.u.udp.port = htons(0xFFFF);
87 exp->mask.dst.protonum = 0xFF;
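	/* mask.src.ip is the interface netmask, so any host on the
	 * broadcast subnet may reply; IP_CT_EXPECT_PERMANENT keeps the
	 * expectation in place for multiple replies until it times out */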
88
89 exp->expectfn = NULL;
90 exp->flags = IP_CT_EXPECT_PERMANENT;
91
92 ip_conntrack_expect_related(exp);
93 ip_conntrack_expect_put(exp);
94
95 ip_ct_refresh(ct, *pskb, timeout * HZ);
96out:
97 return NF_ACCEPT;
98}
99
100static struct ip_conntrack_helper helper = {
101 .name = "netbios-ns",
102 .tuple = {
103 .src = {
104 .u = {
105 .udp = {
106 .port = __constant_htons(NMBD_PORT),
107 }
108 }
109 },
110 .dst = {
111 .protonum = IPPROTO_UDP,
112 },
113 },
114 .mask = {
115 .src = {
116 .u = {
117 .udp = {
118 .port = __constant_htons(0xFFFF),
119 }
120 }
121 },
122 .dst = {
123 .protonum = 0xFF,
124 },
125 },
126 .max_expected = 1,
127 .me = THIS_MODULE,
128 .help = help,
129};
130
131static int __init ip_conntrack_netbios_ns_init(void)
132{
133 helper.timeout = timeout;
134 return ip_conntrack_helper_register(&helper);
135}
136
137static void __exit ip_conntrack_netbios_ns_fini(void)
138{
139 ip_conntrack_helper_unregister(&helper);
140}
141
142module_init(ip_conntrack_netbios_ns_init);
143module_exit(ip_conntrack_netbios_ns_fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
deleted file mode 100644
index 9228b76ccd9a..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ /dev/null
@@ -1,1577 +0,0 @@
1/* Connection tracking via netlink socket. Allows for user space
2 * protocol helpers and general trouble making from userspace.
3 *
4 * (C) 2001 by Jay Schulist <jschlst@samba.org>
5 * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
6 * (C) 2003 by Patrick McHardy <kaber@trash.net>
7 * (C) 2005-2006 by Pablo Neira Ayuso <pablo@eurodev.net>
8 *
9 * I've reworked this stuff to use attributes instead of conntrack
10 * structures. 5.44 am. I need more tea. --pablo 05/07/11.
11 *
12 * Initial connection tracking via netlink development funded and
13 * generally made possible by Network Robots, Inc. (www.networkrobots.com)
14 *
15 * Further development of this code funded by Astaro AG (http://www.astaro.com)
16 *
17 * This software may be used and distributed according to the terms
18 * of the GNU General Public License, incorporated herein by reference.
19 */
20
21#include <linux/init.h>
22#include <linux/module.h>
23#include <linux/kernel.h>
24#include <linux/types.h>
25#include <linux/timer.h>
26#include <linux/skbuff.h>
27#include <linux/errno.h>
28#include <linux/netlink.h>
29#include <linux/spinlock.h>
30#include <linux/interrupt.h>
31#include <linux/notifier.h>
32
33#include <linux/netfilter.h>
34#include <linux/netfilter_ipv4/ip_conntrack.h>
35#include <linux/netfilter_ipv4/ip_conntrack_core.h>
36#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
37#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
38#include <linux/netfilter_ipv4/ip_nat_protocol.h>
39
40#include <linux/netfilter/nfnetlink.h>
41#include <linux/netfilter/nfnetlink_conntrack.h>
42
43MODULE_LICENSE("GPL");
44
45static char __initdata version[] = "0.90";
46
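/* NFA_PUT() and NLMSG_PUT() jump to the local nfattr_failure resp.
 * nlmsg_failure label when the skb runs out of tailroom, which is why
 * each dump helper below carries such a label */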
47static inline int
48ctnetlink_dump_tuples_proto(struct sk_buff *skb,
49 const struct ip_conntrack_tuple *tuple,
50 struct ip_conntrack_protocol *proto)
51{
52 int ret = 0;
53 struct nfattr *nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO);
54
55 NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum);
56
57 if (likely(proto->tuple_to_nfattr))
58 ret = proto->tuple_to_nfattr(skb, tuple);
59
60 NFA_NEST_END(skb, nest_parms);
61
62 return ret;
63
64nfattr_failure:
65 return -1;
66}
67
68static inline int
69ctnetlink_dump_tuples_ip(struct sk_buff *skb,
70 const struct ip_conntrack_tuple *tuple)
71{
72 struct nfattr *nest_parms = NFA_NEST(skb, CTA_TUPLE_IP);
73
74 NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(__be32), &tuple->src.ip);
75 NFA_PUT(skb, CTA_IP_V4_DST, sizeof(__be32), &tuple->dst.ip);
76
77 NFA_NEST_END(skb, nest_parms);
78
79 return 0;
80
81nfattr_failure:
82 return -1;
83}
84
85static inline int
86ctnetlink_dump_tuples(struct sk_buff *skb,
87 const struct ip_conntrack_tuple *tuple)
88{
89 int ret;
90 struct ip_conntrack_protocol *proto;
91
92 ret = ctnetlink_dump_tuples_ip(skb, tuple);
93 if (unlikely(ret < 0))
94 return ret;
95
96 proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
97 ret = ctnetlink_dump_tuples_proto(skb, tuple, proto);
98 ip_conntrack_proto_put(proto);
99
100 return ret;
101}
102
103static inline int
104ctnetlink_dump_status(struct sk_buff *skb, const struct ip_conntrack *ct)
105{
106 __be32 status = htonl((u_int32_t) ct->status);
107 NFA_PUT(skb, CTA_STATUS, sizeof(status), &status);
108 return 0;
109
110nfattr_failure:
111 return -1;
112}
113
114static inline int
115ctnetlink_dump_timeout(struct sk_buff *skb, const struct ip_conntrack *ct)
116{
117 long timeout_l = ct->timeout.expires - jiffies;
118 __be32 timeout;
119
120 if (timeout_l < 0)
121 timeout = 0;
122 else
123 timeout = htonl(timeout_l / HZ);
124
125 NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout);
126 return 0;
127
128nfattr_failure:
129 return -1;
130}
131
132static inline int
133ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct ip_conntrack *ct)
134{
135 struct ip_conntrack_protocol *proto = ip_conntrack_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
136
137 struct nfattr *nest_proto;
138 int ret;
139
140 if (!proto->to_nfattr) {
141 ip_conntrack_proto_put(proto);
142 return 0;
143 }
144
145 nest_proto = NFA_NEST(skb, CTA_PROTOINFO);
146
147 ret = proto->to_nfattr(skb, nest_proto, ct);
148
149 ip_conntrack_proto_put(proto);
150
151 NFA_NEST_END(skb, nest_proto);
152
153 return ret;
154
155nfattr_failure:
156 ip_conntrack_proto_put(proto);
157 return -1;
158}
159
160static inline int
161ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct ip_conntrack *ct)
162{
163 struct nfattr *nest_helper;
164
165 if (!ct->helper)
166 return 0;
167
168 nest_helper = NFA_NEST(skb, CTA_HELP);
169 NFA_PUT(skb, CTA_HELP_NAME, strlen(ct->helper->name), ct->helper->name);
170
171 if (ct->helper->to_nfattr)
172 ct->helper->to_nfattr(skb, ct);
173
174 NFA_NEST_END(skb, nest_helper);
175
176 return 0;
177
178nfattr_failure:
179 return -1;
180}
181
182#ifdef CONFIG_IP_NF_CT_ACCT
183static inline int
184ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct,
185 enum ip_conntrack_dir dir)
186{
187 enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
188 struct nfattr *nest_count = NFA_NEST(skb, type);
189 __be32 tmp;
190
191 tmp = htonl(ct->counters[dir].packets);
192 NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(__be32), &tmp);
193
194 tmp = htonl(ct->counters[dir].bytes);
195 NFA_PUT(skb, CTA_COUNTERS32_BYTES, sizeof(__be32), &tmp);
196
197 NFA_NEST_END(skb, nest_count);
198
199 return 0;
200
201nfattr_failure:
202 return -1;
203}
204#else
205#define ctnetlink_dump_counters(a, b, c) (0)
206#endif
207
208#ifdef CONFIG_IP_NF_CONNTRACK_MARK
209static inline int
210ctnetlink_dump_mark(struct sk_buff *skb, const struct ip_conntrack *ct)
211{
212 __be32 mark = htonl(ct->mark);
213
214 NFA_PUT(skb, CTA_MARK, sizeof(__be32), &mark);
215 return 0;
216
217nfattr_failure:
218 return -1;
219}
220#else
221#define ctnetlink_dump_mark(a, b) (0)
222#endif
223
224static inline int
225ctnetlink_dump_id(struct sk_buff *skb, const struct ip_conntrack *ct)
226{
227 __be32 id = htonl(ct->id);
228 NFA_PUT(skb, CTA_ID, sizeof(__be32), &id);
229 return 0;
230
231nfattr_failure:
232 return -1;
233}
234
235static inline int
236ctnetlink_dump_use(struct sk_buff *skb, const struct ip_conntrack *ct)
237{
238 __be32 use = htonl(atomic_read(&ct->ct_general.use));
239
240 NFA_PUT(skb, CTA_USE, sizeof(__be32), &use);
241 return 0;
242
243nfattr_failure:
244 return -1;
245}
246
247#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple)
248
249static int
250ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
251 int event, int nowait,
252 const struct ip_conntrack *ct)
253{
254 struct nlmsghdr *nlh;
255 struct nfgenmsg *nfmsg;
256 struct nfattr *nest_parms;
257 unsigned char *b;
258
259 b = skb->tail;
260
261 event |= NFNL_SUBSYS_CTNETLINK << 8;
262 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
263 nfmsg = NLMSG_DATA(nlh);
264
265 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
266 nfmsg->nfgen_family = AF_INET;
267 nfmsg->version = NFNETLINK_V0;
268 nfmsg->res_id = 0;
269
270 nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
271 if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
272 goto nfattr_failure;
273 NFA_NEST_END(skb, nest_parms);
274
275 nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
276 if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
277 goto nfattr_failure;
278 NFA_NEST_END(skb, nest_parms);
279
280 if (ctnetlink_dump_status(skb, ct) < 0 ||
281 ctnetlink_dump_timeout(skb, ct) < 0 ||
282 ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
283 ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
284 ctnetlink_dump_protoinfo(skb, ct) < 0 ||
285 ctnetlink_dump_helpinfo(skb, ct) < 0 ||
286 ctnetlink_dump_mark(skb, ct) < 0 ||
287 ctnetlink_dump_id(skb, ct) < 0 ||
288 ctnetlink_dump_use(skb, ct) < 0)
289 goto nfattr_failure;
290
291 nlh->nlmsg_len = skb->tail - b;
292 return skb->len;
293
294nlmsg_failure:
295nfattr_failure:
296 skb_trim(skb, b - skb->data);
297 return -1;
298}
299
300#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
301static int ctnetlink_conntrack_event(struct notifier_block *this,
302 unsigned long events, void *ptr)
303{
304 struct nlmsghdr *nlh;
305 struct nfgenmsg *nfmsg;
306 struct nfattr *nest_parms;
307 struct ip_conntrack *ct = (struct ip_conntrack *)ptr;
308 struct sk_buff *skb;
309 unsigned int type;
310 unsigned char *b;
311 unsigned int flags = 0, group;
312
313 /* ignore our fake conntrack entry */
314 if (ct == &ip_conntrack_untracked)
315 return NOTIFY_DONE;
316
317 if (events & IPCT_DESTROY) {
318 type = IPCTNL_MSG_CT_DELETE;
319 group = NFNLGRP_CONNTRACK_DESTROY;
320 } else if (events & (IPCT_NEW | IPCT_RELATED)) {
321 type = IPCTNL_MSG_CT_NEW;
322 flags = NLM_F_CREATE|NLM_F_EXCL;
323 group = NFNLGRP_CONNTRACK_NEW;
324 } else if (events & (IPCT_STATUS | IPCT_PROTOINFO)) {
325 type = IPCTNL_MSG_CT_NEW;
326 group = NFNLGRP_CONNTRACK_UPDATE;
327 } else
328 return NOTIFY_DONE;
329
330 if (!nfnetlink_has_listeners(group))
331 return NOTIFY_DONE;
332
333 skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
334 if (!skb)
335 return NOTIFY_DONE;
336
337 b = skb->tail;
338
339 type |= NFNL_SUBSYS_CTNETLINK << 8;
340 nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
341 nfmsg = NLMSG_DATA(nlh);
342
343 nlh->nlmsg_flags = flags;
344 nfmsg->nfgen_family = AF_INET;
345 nfmsg->version = NFNETLINK_V0;
346 nfmsg->res_id = 0;
347
348 nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
349 if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
350 goto nfattr_failure;
351 NFA_NEST_END(skb, nest_parms);
352
353 nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
354 if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
355 goto nfattr_failure;
356 NFA_NEST_END(skb, nest_parms);
357
358 if (events & IPCT_DESTROY) {
359 if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
360 ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)
361 goto nfattr_failure;
362 } else {
363 if (ctnetlink_dump_status(skb, ct) < 0)
364 goto nfattr_failure;
365
366 if (ctnetlink_dump_timeout(skb, ct) < 0)
367 goto nfattr_failure;
368
369 if (events & IPCT_PROTOINFO
370 && ctnetlink_dump_protoinfo(skb, ct) < 0)
371 goto nfattr_failure;
372
373 if ((events & IPCT_HELPER || ct->helper)
374 && ctnetlink_dump_helpinfo(skb, ct) < 0)
375 goto nfattr_failure;
376
377#ifdef CONFIG_IP_NF_CONNTRACK_MARK
378 if ((events & IPCT_MARK || ct->mark)
379 && ctnetlink_dump_mark(skb, ct) < 0)
380 goto nfattr_failure;
381#endif
382
383 if (events & IPCT_COUNTER_FILLING &&
384 (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
385 ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0))
386 goto nfattr_failure;
387 }
388
389 nlh->nlmsg_len = skb->tail - b;
390 nfnetlink_send(skb, 0, group, 0);
391 return NOTIFY_DONE;
392
393nlmsg_failure:
394nfattr_failure:
395 kfree_skb(skb);
396 return NOTIFY_DONE;
397}
398#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
399
400static int ctnetlink_done(struct netlink_callback *cb)
401{
402 if (cb->args[1])
403 ip_conntrack_put((struct ip_conntrack *)cb->args[1]);
404 return 0;
405}
406
407static int
408ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
409{
410 struct ip_conntrack *ct, *last;
411 struct ip_conntrack_tuple_hash *h;
412 struct list_head *i;
413
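	/* cb->args[0] is the hash bucket to resume from and cb->args[1]
	 * holds a reference to the last conntrack dumped; a partially
	 * filled skb stores both so the next read can restart mid-bucket */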
414 read_lock_bh(&ip_conntrack_lock);
415 last = (struct ip_conntrack *)cb->args[1];
416 for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++) {
417restart:
418 list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
419 h = (struct ip_conntrack_tuple_hash *) i;
420 if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
421 continue;
422 ct = tuplehash_to_ctrack(h);
423 if (cb->args[1]) {
424 if (ct != last)
425 continue;
426 cb->args[1] = 0;
427 }
428 if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
429 cb->nlh->nlmsg_seq,
430 IPCTNL_MSG_CT_NEW,
431 1, ct) < 0) {
432 nf_conntrack_get(&ct->ct_general);
433 cb->args[1] = (unsigned long)ct;
434 goto out;
435 }
436#ifdef CONFIG_NF_CT_ACCT
437 if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) ==
438 IPCTNL_MSG_CT_GET_CTRZERO)
439 memset(&ct->counters, 0, sizeof(ct->counters));
440#endif
441 }
442 if (cb->args[1]) {
443 cb->args[1] = 0;
444 goto restart;
445 }
446 }
447out:
448 read_unlock_bh(&ip_conntrack_lock);
449 if (last)
450 ip_conntrack_put(last);
451
452 return skb->len;
453}
454
455static const size_t cta_min_ip[CTA_IP_MAX] = {
456 [CTA_IP_V4_SRC-1] = sizeof(__be32),
457 [CTA_IP_V4_DST-1] = sizeof(__be32),
458};
459
460static inline int
461ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple)
462{
463 struct nfattr *tb[CTA_IP_MAX];
464
465 nfattr_parse_nested(tb, CTA_IP_MAX, attr);
466
467 if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip))
468 return -EINVAL;
469
470 if (!tb[CTA_IP_V4_SRC-1])
471 return -EINVAL;
472 tuple->src.ip = *(__be32 *)NFA_DATA(tb[CTA_IP_V4_SRC-1]);
473
474 if (!tb[CTA_IP_V4_DST-1])
475 return -EINVAL;
476 tuple->dst.ip = *(__be32 *)NFA_DATA(tb[CTA_IP_V4_DST-1]);
477
478 return 0;
479}
480
481static const size_t cta_min_proto[CTA_PROTO_MAX] = {
482 [CTA_PROTO_NUM-1] = sizeof(u_int8_t),
483 [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t),
484 [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t),
485 [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t),
486 [CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t),
487 [CTA_PROTO_ICMP_ID-1] = sizeof(u_int16_t),
488};
489
490static inline int
491ctnetlink_parse_tuple_proto(struct nfattr *attr,
492 struct ip_conntrack_tuple *tuple)
493{
494 struct nfattr *tb[CTA_PROTO_MAX];
495 struct ip_conntrack_protocol *proto;
496 int ret = 0;
497
498 nfattr_parse_nested(tb, CTA_PROTO_MAX, attr);
499
500 if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
501 return -EINVAL;
502
503 if (!tb[CTA_PROTO_NUM-1])
504 return -EINVAL;
505 tuple->dst.protonum = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]);
506
507 proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
508
509 if (likely(proto->nfattr_to_tuple))
510 ret = proto->nfattr_to_tuple(tb, tuple);
511
512 ip_conntrack_proto_put(proto);
513
514 return ret;
515}
516
517static inline int
518ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple,
519 enum ctattr_tuple type)
520{
521 struct nfattr *tb[CTA_TUPLE_MAX];
522 int err;
523
524 memset(tuple, 0, sizeof(*tuple));
525
526 nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]);
527
528 if (!tb[CTA_TUPLE_IP-1])
529 return -EINVAL;
530
531 err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple);
532 if (err < 0)
533 return err;
534
535 if (!tb[CTA_TUPLE_PROTO-1])
536 return -EINVAL;
537
538 err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple);
539 if (err < 0)
540 return err;
541
542 /* orig and expect tuples get DIR_ORIGINAL */
543 if (type == CTA_TUPLE_REPLY)
544 tuple->dst.dir = IP_CT_DIR_REPLY;
545 else
546 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
547
548 return 0;
549}
550
551#ifdef CONFIG_IP_NF_NAT_NEEDED
552static const size_t cta_min_protonat[CTA_PROTONAT_MAX] = {
553 [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t),
554 [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t),
555};
556
557static int ctnetlink_parse_nat_proto(struct nfattr *attr,
558 const struct ip_conntrack *ct,
559 struct ip_nat_range *range)
560{
561 struct nfattr *tb[CTA_PROTONAT_MAX];
562 struct ip_nat_protocol *npt;
563
564 nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr);
565
566 if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat))
567 return -EINVAL;
568
569 npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
570
571 if (!npt->nfattr_to_range) {
572 ip_nat_proto_put(npt);
573 return 0;
574 }
575
576 /* nfattr_to_range returns 1 if it parsed, 0 if not, neg. on error */
577 if (npt->nfattr_to_range(tb, range) > 0)
578 range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
579
580 ip_nat_proto_put(npt);
581
582 return 0;
583}
584
585static const size_t cta_min_nat[CTA_NAT_MAX] = {
586 [CTA_NAT_MINIP-1] = sizeof(__be32),
587 [CTA_NAT_MAXIP-1] = sizeof(__be32),
588};
589
590static inline int
591ctnetlink_parse_nat(struct nfattr *nat,
592 const struct ip_conntrack *ct, struct ip_nat_range *range)
593{
594 struct nfattr *tb[CTA_NAT_MAX];
595 int err;
596
597 memset(range, 0, sizeof(*range));
598
599 nfattr_parse_nested(tb, CTA_NAT_MAX, nat);
600
601 if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat))
602 return -EINVAL;
603
604 if (tb[CTA_NAT_MINIP-1])
605 range->min_ip = *(__be32 *)NFA_DATA(tb[CTA_NAT_MINIP-1]);
606
607 if (!tb[CTA_NAT_MAXIP-1])
608 range->max_ip = range->min_ip;
609 else
610 range->max_ip = *(__be32 *)NFA_DATA(tb[CTA_NAT_MAXIP-1]);
611
612 if (range->min_ip)
613 range->flags |= IP_NAT_RANGE_MAP_IPS;
614
615 if (!tb[CTA_NAT_PROTO-1])
616 return 0;
617
618 err = ctnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range);
619 if (err < 0)
620 return err;
621
622 return 0;
623}
624#endif
625
626static inline int
627ctnetlink_parse_help(struct nfattr *attr, char **helper_name)
628{
629 struct nfattr *tb[CTA_HELP_MAX];
630
631 nfattr_parse_nested(tb, CTA_HELP_MAX, attr);
632
633 if (!tb[CTA_HELP_NAME-1])
634 return -EINVAL;
635
636 *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]);
637
638 return 0;
639}
640
641static const size_t cta_min[CTA_MAX] = {
642 [CTA_STATUS-1] = sizeof(__be32),
643 [CTA_TIMEOUT-1] = sizeof(__be32),
644 [CTA_MARK-1] = sizeof(__be32),
645 [CTA_USE-1] = sizeof(__be32),
646 [CTA_ID-1] = sizeof(__be32)
647};
648
649static int
650ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
651 struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
652{
653 struct ip_conntrack_tuple_hash *h;
654 struct ip_conntrack_tuple tuple;
655 struct ip_conntrack *ct;
656 int err = 0;
657
658 if (nfattr_bad_size(cda, CTA_MAX, cta_min))
659 return -EINVAL;
660
661 if (cda[CTA_TUPLE_ORIG-1])
662 err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
663 else if (cda[CTA_TUPLE_REPLY-1])
664 err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
665 else {
666 /* Flush the whole table */
667 ip_conntrack_flush();
668 return 0;
669 }
670
671 if (err < 0)
672 return err;
673
674 h = ip_conntrack_find_get(&tuple, NULL);
675 if (!h)
676 return -ENOENT;
677
678 ct = tuplehash_to_ctrack(h);
679
680 if (cda[CTA_ID-1]) {
681 u_int32_t id = ntohl(*(__be32 *)NFA_DATA(cda[CTA_ID-1]));
682 if (ct->id != id) {
683 ip_conntrack_put(ct);
684 return -ENOENT;
685 }
686 }
687 if (del_timer(&ct->timeout))
688 ct->timeout.function((unsigned long)ct);
689
690 ip_conntrack_put(ct);
691
692 return 0;
693}
694
695static int
696ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
697 struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
698{
699 struct ip_conntrack_tuple_hash *h;
700 struct ip_conntrack_tuple tuple;
701 struct ip_conntrack *ct;
702 struct sk_buff *skb2 = NULL;
703 int err = 0;
704
705 if (nlh->nlmsg_flags & NLM_F_DUMP) {
706 struct nfgenmsg *msg = NLMSG_DATA(nlh);
707 u32 rlen;
708
709 if (msg->nfgen_family != AF_INET)
710 return -EAFNOSUPPORT;
711
712#ifndef CONFIG_IP_NF_CT_ACCT
713 if (NFNL_MSG_TYPE(nlh->nlmsg_type) == IPCTNL_MSG_CT_GET_CTRZERO)
714 return -ENOTSUPP;
715#endif
716 if ((*errp = netlink_dump_start(ctnl, skb, nlh,
717 ctnetlink_dump_table,
718 ctnetlink_done)) != 0)
719 return -EINVAL;
720
721 rlen = NLMSG_ALIGN(nlh->nlmsg_len);
722 if (rlen > skb->len)
723 rlen = skb->len;
724 skb_pull(skb, rlen);
725 return 0;
726 }
727
728 if (nfattr_bad_size(cda, CTA_MAX, cta_min))
729 return -EINVAL;
730
731 if (cda[CTA_TUPLE_ORIG-1])
732 err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
733 else if (cda[CTA_TUPLE_REPLY-1])
734 err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
735 else
736 return -EINVAL;
737
738 if (err < 0)
739 return err;
740
741 h = ip_conntrack_find_get(&tuple, NULL);
742 if (!h)
743 return -ENOENT;
744
745 ct = tuplehash_to_ctrack(h);
746
747 err = -ENOMEM;
748 skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
749 if (!skb2) {
750 ip_conntrack_put(ct);
751 return -ENOMEM;
752 }
753
754 err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq,
755 IPCTNL_MSG_CT_NEW, 1, ct);
756 ip_conntrack_put(ct);
757 if (err <= 0)
758 goto free;
759
760 err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
761 if (err < 0)
762 goto out;
763
764 return 0;
765
766free:
767 kfree_skb(skb2);
768out:
769 return err;
770}
771
772static inline int
773ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[])
774{
775 unsigned long d;
776 unsigned status = ntohl(*(__be32 *)NFA_DATA(cda[CTA_STATUS-1]));
777 d = ct->status ^ status;
778
779 if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
780 /* unchangeable */
781 return -EINVAL;
782
783 if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
784 /* SEEN_REPLY bit can only be set */
785 return -EINVAL;
786
787
788 if (d & IPS_ASSURED && !(status & IPS_ASSURED))
789 /* ASSURED bit can only be set */
790 return -EINVAL;
791
792 if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
793#ifndef CONFIG_IP_NF_NAT_NEEDED
794 return -EINVAL;
795#else
796 struct ip_nat_range range;
797
798 if (cda[CTA_NAT_DST-1]) {
799 if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct,
800 &range) < 0)
801 return -EINVAL;
802 if (ip_nat_initialized(ct,
803 HOOK2MANIP(NF_IP_PRE_ROUTING)))
804 return -EEXIST;
805 ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
806 }
807 if (cda[CTA_NAT_SRC-1]) {
808 if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct,
809 &range) < 0)
810 return -EINVAL;
811 if (ip_nat_initialized(ct,
812 HOOK2MANIP(NF_IP_POST_ROUTING)))
813 return -EEXIST;
814 ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
815 }
816#endif
817 }
818
819 /* Be careful here, modifying NAT bits can screw up things,
820 * so don't let users modify them directly if they don't pass
821 * ip_nat_range. */
822 ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK);
823 return 0;
824}
825
826
827static inline int
828ctnetlink_change_helper(struct ip_conntrack *ct, struct nfattr *cda[])
829{
830 struct ip_conntrack_helper *helper;
831 char *helpname;
832 int err;
833
834 /* don't change helper of sibling connections */
835 if (ct->master)
836 return -EINVAL;
837
838 err = ctnetlink_parse_help(cda[CTA_HELP-1], &helpname);
839 if (err < 0)
840 return err;
841
842 helper = __ip_conntrack_helper_find_byname(helpname);
843 if (!helper) {
844 if (!strcmp(helpname, ""))
845 helper = NULL;
846 else
847 return -EINVAL;
848 }
849
850 if (ct->helper) {
851 if (!helper) {
852 /* we had a helper before ... */
853 ip_ct_remove_expectations(ct);
854 ct->helper = NULL;
855 } else {
856 /* need to zero data of old helper */
857 memset(&ct->help, 0, sizeof(ct->help));
858 }
859 }
860
861 ct->helper = helper;
862
863 return 0;
864}
865
866static inline int
867ctnetlink_change_timeout(struct ip_conntrack *ct, struct nfattr *cda[])
868{
869 u_int32_t timeout = ntohl(*(__be32 *)NFA_DATA(cda[CTA_TIMEOUT-1]));
870
871 if (!del_timer(&ct->timeout))
872 return -ETIME;
873
874 ct->timeout.expires = jiffies + timeout * HZ;
875 add_timer(&ct->timeout);
876
877 return 0;
878}
879
880static inline int
881ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[])
882{
883 struct nfattr *tb[CTA_PROTOINFO_MAX], *attr = cda[CTA_PROTOINFO-1];
884 struct ip_conntrack_protocol *proto;
885 u_int16_t npt = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
886 int err = 0;
887
888 nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr);
889
890 proto = ip_conntrack_proto_find_get(npt);
891
892 if (proto->from_nfattr)
893 err = proto->from_nfattr(tb, ct);
894 ip_conntrack_proto_put(proto);
895
896 return err;
897}
898
899static int
900ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[])
901{
902 int err;
903
904 if (cda[CTA_HELP-1]) {
905 err = ctnetlink_change_helper(ct, cda);
906 if (err < 0)
907 return err;
908 }
909
910 if (cda[CTA_TIMEOUT-1]) {
911 err = ctnetlink_change_timeout(ct, cda);
912 if (err < 0)
913 return err;
914 }
915
916 if (cda[CTA_STATUS-1]) {
917 err = ctnetlink_change_status(ct, cda);
918 if (err < 0)
919 return err;
920 }
921
922 if (cda[CTA_PROTOINFO-1]) {
923 err = ctnetlink_change_protoinfo(ct, cda);
924 if (err < 0)
925 return err;
926 }
927
928#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
929 if (cda[CTA_MARK-1])
930 ct->mark = ntohl(*(__be32 *)NFA_DATA(cda[CTA_MARK-1]));
931#endif
932
933 return 0;
934}
935
936static int
937ctnetlink_create_conntrack(struct nfattr *cda[],
938 struct ip_conntrack_tuple *otuple,
939 struct ip_conntrack_tuple *rtuple)
940{
941 struct ip_conntrack *ct;
942 int err = -EINVAL;
943
944 ct = ip_conntrack_alloc(otuple, rtuple);
945 if (ct == NULL || IS_ERR(ct))
946 return -ENOMEM;
947
948 if (!cda[CTA_TIMEOUT-1])
949 goto err;
950 ct->timeout.expires = ntohl(*(__be32 *)NFA_DATA(cda[CTA_TIMEOUT-1]));
951
952 ct->timeout.expires = jiffies + ct->timeout.expires * HZ;
953 ct->status |= IPS_CONFIRMED;
954
955 if (cda[CTA_STATUS-1]) {
956 err = ctnetlink_change_status(ct, cda);
957 if (err < 0)
958 goto err;
959 }
960
961 if (cda[CTA_PROTOINFO-1]) {
962 err = ctnetlink_change_protoinfo(ct, cda);
963 if (err < 0)
964 goto err;
965 }
966
967#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
968 if (cda[CTA_MARK-1])
969 ct->mark = ntohl(*(__be32 *)NFA_DATA(cda[CTA_MARK-1]));
970#endif
971
972 ct->helper = ip_conntrack_helper_find_get(rtuple);
973
974 add_timer(&ct->timeout);
975 ip_conntrack_hash_insert(ct);
976
977 if (ct->helper)
978 ip_conntrack_helper_put(ct->helper);
979
980 return 0;
981
982err:
983 ip_conntrack_free(ct);
984 return err;
985}
986
987static int
988ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
989 struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
990{
991 struct ip_conntrack_tuple otuple, rtuple;
992 struct ip_conntrack_tuple_hash *h = NULL;
993 int err = 0;
994
995 if (nfattr_bad_size(cda, CTA_MAX, cta_min))
996 return -EINVAL;
997
998 if (cda[CTA_TUPLE_ORIG-1]) {
999 err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG);
1000 if (err < 0)
1001 return err;
1002 }
1003
1004 if (cda[CTA_TUPLE_REPLY-1]) {
1005 err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY);
1006 if (err < 0)
1007 return err;
1008 }
1009
1010 write_lock_bh(&ip_conntrack_lock);
1011 if (cda[CTA_TUPLE_ORIG-1])
1012 h = __ip_conntrack_find(&otuple, NULL);
1013 else if (cda[CTA_TUPLE_REPLY-1])
1014 h = __ip_conntrack_find(&rtuple, NULL);
1015
1016 if (h == NULL) {
1017 write_unlock_bh(&ip_conntrack_lock);
1018 err = -ENOENT;
1019 if (nlh->nlmsg_flags & NLM_F_CREATE)
1020 err = ctnetlink_create_conntrack(cda, &otuple, &rtuple);
1021 return err;
1022 }
1023 /* implicit 'else' */
1024
1025 /* we only allow nat config for new conntracks */
1026 if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
1027 err = -EINVAL;
1028 goto out_unlock;
1029 }
1030
1031 /* We manipulate the conntrack inside the global conntrack table lock,
1032 * so there's no need to increase the refcount */
1033 err = -EEXIST;
1034 if (!(nlh->nlmsg_flags & NLM_F_EXCL))
1035 err = ctnetlink_change_conntrack(tuplehash_to_ctrack(h), cda);
1036
1037out_unlock:
1038 write_unlock_bh(&ip_conntrack_lock);
1039 return err;
1040}
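
ctnetlink_new_conntrack implements the standard netlink create-or-update
dispatch: look the tuple up under the table lock, create only when the entry
is absent and NLM_F_CREATE was passed, and answer -EEXIST when it exists and
the caller asked for NLM_F_EXCL. The decision table in miniature (the flag
values are the real NLM_F_* bits; the handlers are stubs):

    #include <errno.h>
    #include <stdio.h>

    #define NLM_F_EXCL   0x200      /* fail if the object already exists */
    #define NLM_F_CREATE 0x400      /* create the object if it does not  */

    static int create_it(void) { return 0; }    /* stand-ins for the     */
    static int change_it(void) { return 0; }    /* create/change paths   */

    /* condensed form of the dispatch above */
    static int new_or_change(int found, unsigned int flags)
    {
            if (!found)
                    return (flags & NLM_F_CREATE) ? create_it() : -ENOENT;
            if (flags & NLM_F_EXCL)
                    return -EEXIST;
            return change_it();
    }

    int main(void)
    {
            printf("%d\n", new_or_change(1, NLM_F_EXCL));   /* -EEXIST (-17) */
            printf("%d\n", new_or_change(0, 0));            /* -ENOENT (-2)  */
            return 0;
    }
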
1041
1042/***********************************************************************
1043 * EXPECT
1044 ***********************************************************************/
1045
1046static inline int
1047ctnetlink_exp_dump_tuple(struct sk_buff *skb,
1048 const struct ip_conntrack_tuple *tuple,
1049 enum ctattr_expect type)
1050{
1051 struct nfattr *nest_parms = NFA_NEST(skb, type);
1052
1053 if (ctnetlink_dump_tuples(skb, tuple) < 0)
1054 goto nfattr_failure;
1055
1056 NFA_NEST_END(skb, nest_parms);
1057
1058 return 0;
1059
1060nfattr_failure:
1061 return -1;
1062}
1063
1064static inline int
1065ctnetlink_exp_dump_mask(struct sk_buff *skb,
1066 const struct ip_conntrack_tuple *tuple,
1067 const struct ip_conntrack_tuple *mask)
1068{
1069 int ret;
1070 struct ip_conntrack_protocol *proto;
1071 struct nfattr *nest_parms = NFA_NEST(skb, CTA_EXPECT_MASK);
1072
1073 ret = ctnetlink_dump_tuples_ip(skb, mask);
1074 if (unlikely(ret < 0))
1075 goto nfattr_failure;
1076
1077 proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
1078 ret = ctnetlink_dump_tuples_proto(skb, mask, proto);
1079 ip_conntrack_proto_put(proto);
1080 if (unlikely(ret < 0))
1081 goto nfattr_failure;
1082
1083 NFA_NEST_END(skb, nest_parms);
1084
1085 return 0;
1086
1087nfattr_failure:
1088 return -1;
1089}
1090
1091static inline int
1092ctnetlink_exp_dump_expect(struct sk_buff *skb,
1093 const struct ip_conntrack_expect *exp)
1094{
1095 struct ip_conntrack *master = exp->master;
1096 __be32 timeout = htonl((exp->timeout.expires - jiffies) / HZ);
1097 __be32 id = htonl(exp->id);
1098
1099 if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0)
1100 goto nfattr_failure;
1101 if (ctnetlink_exp_dump_mask(skb, &exp->tuple, &exp->mask) < 0)
1102 goto nfattr_failure;
1103 if (ctnetlink_exp_dump_tuple(skb,
1104 &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
1105 CTA_EXPECT_MASTER) < 0)
1106 goto nfattr_failure;
1107
1108 NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(__be32), &timeout);
1109 NFA_PUT(skb, CTA_EXPECT_ID, sizeof(__be32), &id);
1110
1111 return 0;
1112
1113nfattr_failure:
1114 return -1;
1115}
1116
1117static int
1118ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
1119 int event,
1120 int nowait,
1121 const struct ip_conntrack_expect *exp)
1122{
1123 struct nlmsghdr *nlh;
1124 struct nfgenmsg *nfmsg;
1125 unsigned char *b;
1126
1127 b = skb->tail;
1128
1129 event |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
1130 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
1131 nfmsg = NLMSG_DATA(nlh);
1132
1133 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
1134 nfmsg->nfgen_family = AF_INET;
1135 nfmsg->version = NFNETLINK_V0;
1136 nfmsg->res_id = 0;
1137
1138 if (ctnetlink_exp_dump_expect(skb, exp) < 0)
1139 goto nfattr_failure;
1140
1141 nlh->nlmsg_len = skb->tail - b;
1142 return skb->len;
1143
1144nlmsg_failure:
1145nfattr_failure:
1146 skb_trim(skb, b - skb->data);
1147 return -1;
1148}
1149
1150#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
1151static int ctnetlink_expect_event(struct notifier_block *this,
1152 unsigned long events, void *ptr)
1153{
1154 struct nlmsghdr *nlh;
1155 struct nfgenmsg *nfmsg;
1156 struct ip_conntrack_expect *exp = (struct ip_conntrack_expect *)ptr;
1157 struct sk_buff *skb;
1158 unsigned int type;
1159 unsigned char *b;
1160 int flags = 0;
1161
1162 if (events & IPEXP_NEW) {
1163 type = IPCTNL_MSG_EXP_NEW;
1164 flags = NLM_F_CREATE|NLM_F_EXCL;
1165 } else
1166 return NOTIFY_DONE;
1167
1168 if (!nfnetlink_has_listeners(NFNLGRP_CONNTRACK_EXP_NEW))
1169 return NOTIFY_DONE;
1170
1171 skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
1172 if (!skb)
1173 return NOTIFY_DONE;
1174
1175 b = skb->tail;
1176
1177 type |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
1178 nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
1179 nfmsg = NLMSG_DATA(nlh);
1180
1181 nlh->nlmsg_flags = flags;
1182 nfmsg->nfgen_family = AF_INET;
1183 nfmsg->version = NFNETLINK_V0;
1184 nfmsg->res_id = 0;
1185
1186 if (ctnetlink_exp_dump_expect(skb, exp) < 0)
1187 goto nfattr_failure;
1188
1189 nlh->nlmsg_len = skb->tail - b;
1190 nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0);
1191 return NOTIFY_DONE;
1192
1193nlmsg_failure:
1194nfattr_failure:
1195 kfree_skb(skb);
1196 return NOTIFY_DONE;
1197}
1198#endif
1199
1200static int
1201ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
1202{
1203 struct ip_conntrack_expect *exp = NULL;
1204 struct list_head *i;
1205 u_int32_t *id = (u_int32_t *) &cb->args[0];
1206
1207 read_lock_bh(&ip_conntrack_lock);
1208 list_for_each_prev(i, &ip_conntrack_expect_list) {
1209 exp = (struct ip_conntrack_expect *) i;
1210 if (exp->id <= *id)
1211 continue;
1212 if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid,
1213 cb->nlh->nlmsg_seq,
1214 IPCTNL_MSG_EXP_NEW,
1215 1, exp) < 0)
1216 goto out;
1217 *id = exp->id;
1218 }
1219out:
1220 read_unlock_bh(&ip_conntrack_lock);
1221
1222 return skb->len;
1223}
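
ctnetlink_exp_dump_table is restartable: a netlink dump can be invoked
repeatedly as the receiver drains its buffer, so the id of the last
expectation that fit is parked in cb->args[0] and everything with
id <= that cursor is skipped on the next pass. The same pattern in
miniature, assuming ids are visited in ascending order (which the reverse
walk over this head-inserted list is meant to achieve):

    #include <stdio.h>

    /* resumable dump; '*cursor' persists between calls, like cb->args[0] */
    static int dump_some(const unsigned int *ids, int n,
                         unsigned int *cursor, int budget)
    {
            int emitted = 0;

            for (int i = 0; i < n; i++) {
                    if (ids[i] <= *cursor)
                            continue;       /* sent in an earlier pass */
                    if (emitted == budget)
                            return 1;       /* "skb full": resume later */
                    printf("id %u\n", ids[i]);
                    *cursor = ids[i];       /* record progress */
                    emitted++;
            }
            return 0;                       /* done */
    }

    int main(void)
    {
            unsigned int ids[] = { 1, 2, 3, 4, 5 }, cursor = 0;

            while (dump_some(ids, 5, &cursor, 2))
                    ;                       /* each pass emits at most two */
            return 0;
    }
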
1224
1225static const size_t cta_min_exp[CTA_EXPECT_MAX] = {
1226 [CTA_EXPECT_TIMEOUT-1] = sizeof(__be32),
1227 [CTA_EXPECT_ID-1] = sizeof(__be32)
1228};
1229
1230static int
1231ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
1232 struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
1233{
1234 struct ip_conntrack_tuple tuple;
1235 struct ip_conntrack_expect *exp;
1236 struct sk_buff *skb2;
1237 int err = 0;
1238
1239 if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp))
1240 return -EINVAL;
1241
1242 if (nlh->nlmsg_flags & NLM_F_DUMP) {
1243 struct nfgenmsg *msg = NLMSG_DATA(nlh);
1244 u32 rlen;
1245
1246 if (msg->nfgen_family != AF_INET)
1247 return -EAFNOSUPPORT;
1248
1249 if ((*errp = netlink_dump_start(ctnl, skb, nlh,
1250 ctnetlink_exp_dump_table,
1251 ctnetlink_done)) != 0)
1252 return -EINVAL;
1253 rlen = NLMSG_ALIGN(nlh->nlmsg_len);
1254 if (rlen > skb->len)
1255 rlen = skb->len;
1256 skb_pull(skb, rlen);
1257 return 0;
1258 }
1259
1260 if (cda[CTA_EXPECT_MASTER-1])
1261 err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER);
1262 else
1263 return -EINVAL;
1264
1265 if (err < 0)
1266 return err;
1267
1268 exp = ip_conntrack_expect_find_get(&tuple);
1269 if (!exp)
1270 return -ENOENT;
1271
1272 if (cda[CTA_EXPECT_ID-1]) {
1273 __be32 id = *(__be32 *)NFA_DATA(cda[CTA_EXPECT_ID-1]);
1274 if (exp->id != ntohl(id)) {
1275 ip_conntrack_expect_put(exp);
1276 return -ENOENT;
1277 }
1278 }
1279
1280 err = -ENOMEM;
1281 skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1282 if (!skb2)
1283 goto out;
1284
1285 err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid,
1286 nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
1287 1, exp);
1288 if (err <= 0)
1289 goto free;
1290
1291 ip_conntrack_expect_put(exp);
1292
1293 return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
1294
1295free:
1296 kfree_skb(skb2);
1297out:
1298 ip_conntrack_expect_put(exp);
1299 return err;
1300}
1301
1302static int
1303ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
1304 struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
1305{
1306 struct ip_conntrack_expect *exp, *tmp;
1307 struct ip_conntrack_tuple tuple;
1308 struct ip_conntrack_helper *h;
1309 int err;
1310
1311 if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp))
1312 return -EINVAL;
1313
1314 if (cda[CTA_EXPECT_TUPLE-1]) {
1315 /* delete a single expect by tuple */
1316 err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
1317 if (err < 0)
1318 return err;
1319
1320 /* bump usage count to 2 */
1321 exp = ip_conntrack_expect_find_get(&tuple);
1322 if (!exp)
1323 return -ENOENT;
1324
1325 if (cda[CTA_EXPECT_ID-1]) {
1326 __be32 id =
1327 *(__be32 *)NFA_DATA(cda[CTA_EXPECT_ID-1]);
1328 if (exp->id != ntohl(id)) {
1329 ip_conntrack_expect_put(exp);
1330 return -ENOENT;
1331 }
1332 }
1333
1334 /* after list removal, usage count == 1 */
1335 ip_conntrack_unexpect_related(exp);
1336 /* have to put what we 'get' above.
1337 * after this line usage count == 0 */
1338 ip_conntrack_expect_put(exp);
1339 } else if (cda[CTA_EXPECT_HELP_NAME-1]) {
1340 char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]);
1341
1342 /* delete all expectations for this helper */
1343 write_lock_bh(&ip_conntrack_lock);
1344 h = __ip_conntrack_helper_find_byname(name);
1345 if (!h) {
1346 write_unlock_bh(&ip_conntrack_lock);
1347 return -EINVAL;
1348 }
1349 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
1350 list) {
1351 if (exp->master->helper == h
1352 && del_timer(&exp->timeout)) {
1353 ip_ct_unlink_expect(exp);
1354 ip_conntrack_expect_put(exp);
1355 }
1356 }
1357 write_unlock_bh(&ip_conntrack_lock);
1358 } else {
1359		/* This basically means we have to flush everything */
1360 write_lock_bh(&ip_conntrack_lock);
1361 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
1362 list) {
1363 if (del_timer(&exp->timeout)) {
1364 ip_ct_unlink_expect(exp);
1365 ip_conntrack_expect_put(exp);
1366 }
1367 }
1368 write_unlock_bh(&ip_conntrack_lock);
1369 }
1370
1371 return 0;
1372}
1373static int
1374ctnetlink_change_expect(struct ip_conntrack_expect *x, struct nfattr *cda[])
1375{
1376 return -EOPNOTSUPP;
1377}
1378
1379static int
1380ctnetlink_create_expect(struct nfattr *cda[])
1381{
1382 struct ip_conntrack_tuple tuple, mask, master_tuple;
1383 struct ip_conntrack_tuple_hash *h = NULL;
1384 struct ip_conntrack_expect *exp;
1385 struct ip_conntrack *ct;
1386 int err = 0;
1387
1388 /* caller guarantees that those three CTA_EXPECT_* exist */
1389 err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
1390 if (err < 0)
1391 return err;
1392 err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK);
1393 if (err < 0)
1394 return err;
1395 err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER);
1396 if (err < 0)
1397 return err;
1398
1399 /* Look for master conntrack of this expectation */
1400 h = ip_conntrack_find_get(&master_tuple, NULL);
1401 if (!h)
1402 return -ENOENT;
1403 ct = tuplehash_to_ctrack(h);
1404
1405 if (!ct->helper) {
1406 /* such conntrack hasn't got any helper, abort */
1407 err = -EINVAL;
1408 goto out;
1409 }
1410
1411 exp = ip_conntrack_expect_alloc(ct);
1412 if (!exp) {
1413 err = -ENOMEM;
1414 goto out;
1415 }
1416
1417 exp->expectfn = NULL;
1418 exp->flags = 0;
1419 exp->master = ct;
1420 memcpy(&exp->tuple, &tuple, sizeof(struct ip_conntrack_tuple));
1421 memcpy(&exp->mask, &mask, sizeof(struct ip_conntrack_tuple));
1422
1423 err = ip_conntrack_expect_related(exp);
1424 ip_conntrack_expect_put(exp);
1425
1426out:
1427 ip_conntrack_put(tuplehash_to_ctrack(h));
1428 return err;
1429}
1430
1431static int
1432ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
1433 struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
1434{
1435 struct ip_conntrack_tuple tuple;
1436 struct ip_conntrack_expect *exp;
1437 int err = 0;
1438
1439 if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp))
1440 return -EINVAL;
1441
1442 if (!cda[CTA_EXPECT_TUPLE-1]
1443 || !cda[CTA_EXPECT_MASK-1]
1444 || !cda[CTA_EXPECT_MASTER-1])
1445 return -EINVAL;
1446
1447 err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
1448 if (err < 0)
1449 return err;
1450
1451 write_lock_bh(&ip_conntrack_lock);
1452 exp = __ip_conntrack_expect_find(&tuple);
1453
1454 if (!exp) {
1455 write_unlock_bh(&ip_conntrack_lock);
1456 err = -ENOENT;
1457 if (nlh->nlmsg_flags & NLM_F_CREATE)
1458 err = ctnetlink_create_expect(cda);
1459 return err;
1460 }
1461
1462 err = -EEXIST;
1463 if (!(nlh->nlmsg_flags & NLM_F_EXCL))
1464 err = ctnetlink_change_expect(exp, cda);
1465 write_unlock_bh(&ip_conntrack_lock);
1466
1467 return err;
1468}
1469
1470#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
1471static struct notifier_block ctnl_notifier = {
1472 .notifier_call = ctnetlink_conntrack_event,
1473};
1474
1475static struct notifier_block ctnl_notifier_exp = {
1476 .notifier_call = ctnetlink_expect_event,
1477};
1478#endif
1479
1480static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
1481 [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack,
1482 .attr_count = CTA_MAX, },
1483 [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack,
1484 .attr_count = CTA_MAX, },
1485 [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack,
1486 .attr_count = CTA_MAX, },
1487 [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack,
1488 .attr_count = CTA_MAX, },
1489};
1490
1491static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
1492 [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect,
1493 .attr_count = CTA_EXPECT_MAX, },
1494 [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect,
1495 .attr_count = CTA_EXPECT_MAX, },
1496 [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect,
1497 .attr_count = CTA_EXPECT_MAX, },
1498};
1499
1500static struct nfnetlink_subsystem ctnl_subsys = {
1501 .name = "conntrack",
1502 .subsys_id = NFNL_SUBSYS_CTNETLINK,
1503 .cb_count = IPCTNL_MSG_MAX,
1504 .cb = ctnl_cb,
1505};
1506
1507static struct nfnetlink_subsystem ctnl_exp_subsys = {
1508 .name = "conntrack_expect",
1509 .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP,
1510 .cb_count = IPCTNL_MSG_EXP_MAX,
1511 .cb = ctnl_exp_cb,
1512};
1513
1514MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK);
1515MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP);
1516
1517static int __init ctnetlink_init(void)
1518{
1519 int ret;
1520
1521 printk("ctnetlink v%s: registering with nfnetlink.\n", version);
1522 ret = nfnetlink_subsys_register(&ctnl_subsys);
1523 if (ret < 0) {
1524 printk("ctnetlink_init: cannot register with nfnetlink.\n");
1525 goto err_out;
1526 }
1527
1528 ret = nfnetlink_subsys_register(&ctnl_exp_subsys);
1529 if (ret < 0) {
1530 printk("ctnetlink_init: cannot register exp with nfnetlink.\n");
1531 goto err_unreg_subsys;
1532 }
1533
1534#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
1535 ret = ip_conntrack_register_notifier(&ctnl_notifier);
1536 if (ret < 0) {
1537 printk("ctnetlink_init: cannot register notifier.\n");
1538 goto err_unreg_exp_subsys;
1539 }
1540
1541 ret = ip_conntrack_expect_register_notifier(&ctnl_notifier_exp);
1542 if (ret < 0) {
1543		printk("ctnetlink_init: cannot register expect notifier.\n");
1544 goto err_unreg_notifier;
1545 }
1546#endif
1547
1548 return 0;
1549
1550#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
1551err_unreg_notifier:
1552 ip_conntrack_unregister_notifier(&ctnl_notifier);
1553err_unreg_exp_subsys:
1554 nfnetlink_subsys_unregister(&ctnl_exp_subsys);
1555#endif
1556err_unreg_subsys:
1557 nfnetlink_subsys_unregister(&ctnl_subsys);
1558err_out:
1559 return ret;
1560}
1561
1562static void __exit ctnetlink_exit(void)
1563{
1564 printk("ctnetlink: unregistering from nfnetlink.\n");
1565
1566#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
1567 ip_conntrack_expect_unregister_notifier(&ctnl_notifier_exp);
1568 ip_conntrack_unregister_notifier(&ctnl_notifier);
1569#endif
1570
1571 nfnetlink_subsys_unregister(&ctnl_exp_subsys);
1572 nfnetlink_subsys_unregister(&ctnl_subsys);
1573 return;
1574}
1575
1576module_init(ctnetlink_init);
1577module_exit(ctnetlink_exit);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_generic.c b/net/ipv4/netfilter/ip_conntrack_proto_generic.c
deleted file mode 100644
index 88af82e98658..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_proto_generic.c
+++ /dev/null
@@ -1,74 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/timer.h>
11#include <linux/netfilter.h>
12#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
13
14unsigned int ip_ct_generic_timeout __read_mostly = 600*HZ;
15
16static int generic_pkt_to_tuple(const struct sk_buff *skb,
17 unsigned int dataoff,
18 struct ip_conntrack_tuple *tuple)
19{
20 tuple->src.u.all = 0;
21 tuple->dst.u.all = 0;
22
23 return 1;
24}
25
26static int generic_invert_tuple(struct ip_conntrack_tuple *tuple,
27 const struct ip_conntrack_tuple *orig)
28{
29 tuple->src.u.all = 0;
30 tuple->dst.u.all = 0;
31
32 return 1;
33}
34
35/* Print out the per-protocol part of the tuple. */
36static int generic_print_tuple(struct seq_file *s,
37 const struct ip_conntrack_tuple *tuple)
38{
39 return 0;
40}
41
42/* Print out the private part of the conntrack. */
43static int generic_print_conntrack(struct seq_file *s,
44 const struct ip_conntrack *state)
45{
46 return 0;
47}
48
49/* Returns verdict for packet, or -1 for invalid. */
50static int packet(struct ip_conntrack *conntrack,
51 const struct sk_buff *skb,
52 enum ip_conntrack_info ctinfo)
53{
54 ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout);
55 return NF_ACCEPT;
56}
57
58/* Called when a new connection for this protocol found. */
59static int new(struct ip_conntrack *conntrack, const struct sk_buff *skb)
60{
61 return 1;
62}
63
64struct ip_conntrack_protocol ip_conntrack_generic_protocol =
65{
66 .proto = 0,
67 .name = "unknown",
68 .pkt_to_tuple = generic_pkt_to_tuple,
69 .invert_tuple = generic_invert_tuple,
70 .print_tuple = generic_print_tuple,
71 .print_conntrack = generic_print_conntrack,
72 .packet = packet,
73 .new = new,
74};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
deleted file mode 100644
index ac1c49ef36a9..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c
+++ /dev/null
@@ -1,328 +0,0 @@
1/*
2 * ip_conntrack_proto_gre.c - Version 3.0
3 *
4 * Connection tracking protocol helper module for GRE.
5 *
 6 * GRE is a generic encapsulation protocol, which is generally not well
 7 * suited for NAT, as it has no protocol-specific parts such as port numbers.
8 *
 9 * It has an optional key field, which may help us distinguish two
 10 * connections between the same two hosts.
11 *
12 * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784
13 *
14 * PPTP is built on top of a modified version of GRE, and has a mandatory
 15 * field called "CallID", which serves the same purpose for us as the key
16 * field in plain GRE.
17 *
18 * Documentation about PPTP can be found in RFC 2637
19 *
20 * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
21 *
22 * Development of this code funded by Astaro AG (http://www.astaro.com/)
23 *
24 */
25
26#include <linux/module.h>
27#include <linux/types.h>
28#include <linux/timer.h>
29#include <linux/netfilter.h>
30#include <linux/ip.h>
31#include <linux/in.h>
32#include <linux/list.h>
33#include <linux/seq_file.h>
34#include <linux/interrupt.h>
35
36static DEFINE_RWLOCK(ip_ct_gre_lock);
37
38#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
39#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
40#include <linux/netfilter_ipv4/ip_conntrack_core.h>
41
42#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h>
43#include <linux/netfilter_ipv4/ip_conntrack_pptp.h>
44
45MODULE_LICENSE("GPL");
46MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
47MODULE_DESCRIPTION("netfilter connection tracking protocol helper for GRE");
48
49/* shamelessly stolen from ip_conntrack_proto_udp.c */
50#define GRE_TIMEOUT (30*HZ)
51#define GRE_STREAM_TIMEOUT (180*HZ)
52
53#if 0
54#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, __FUNCTION__, ## args)
55#define DUMP_TUPLE_GRE(x) printk("%u.%u.%u.%u:0x%x -> %u.%u.%u.%u:0x%x\n", \
56 NIPQUAD((x)->src.ip), ntohs((x)->src.u.gre.key), \
57 NIPQUAD((x)->dst.ip), ntohs((x)->dst.u.gre.key))
58#else
59#define DEBUGP(x, args...)
60#define DUMP_TUPLE_GRE(x)
61#endif
62
63/* GRE KEYMAP HANDLING FUNCTIONS */
64static LIST_HEAD(gre_keymap_list);
65
66static inline int gre_key_cmpfn(const struct ip_ct_gre_keymap *km,
67 const struct ip_conntrack_tuple *t)
68{
69 return ((km->tuple.src.ip == t->src.ip) &&
70 (km->tuple.dst.ip == t->dst.ip) &&
71 (km->tuple.dst.protonum == t->dst.protonum) &&
72 (km->tuple.dst.u.all == t->dst.u.all));
73}
74
75/* look up the source key for a given tuple */
76static __be16 gre_keymap_lookup(struct ip_conntrack_tuple *t)
77{
78 struct ip_ct_gre_keymap *km;
79 __be16 key = 0;
80
81 read_lock_bh(&ip_ct_gre_lock);
82 list_for_each_entry(km, &gre_keymap_list, list) {
83 if (gre_key_cmpfn(km, t)) {
84 key = km->tuple.src.u.gre.key;
85 break;
86 }
87 }
88 read_unlock_bh(&ip_ct_gre_lock);
89
 90	DEBUGP("lookup src key 0x%x for ", key);
91 DUMP_TUPLE_GRE(t);
92
93 return key;
94}
95
96/* add a single keymap entry, associate with specified master ct */
97int
98ip_ct_gre_keymap_add(struct ip_conntrack *ct,
99 struct ip_conntrack_tuple *t, int reply)
100{
101 struct ip_ct_gre_keymap **exist_km, *km;
102
103 if (!ct->helper || strcmp(ct->helper->name, "pptp")) {
104 DEBUGP("refusing to add GRE keymap to non-pptp session\n");
105 return -1;
106 }
107
108 if (!reply)
109 exist_km = &ct->help.ct_pptp_info.keymap_orig;
110 else
111 exist_km = &ct->help.ct_pptp_info.keymap_reply;
112
113 if (*exist_km) {
114 /* check whether it's a retransmission */
115 list_for_each_entry(km, &gre_keymap_list, list) {
116 if (gre_key_cmpfn(km, t) && km == *exist_km)
117 return 0;
118 }
119 DEBUGP("trying to override keymap_%s for ct %p\n",
120 reply? "reply":"orig", ct);
121 return -EEXIST;
122 }
123
124 km = kmalloc(sizeof(*km), GFP_ATOMIC);
125 if (!km)
126 return -ENOMEM;
127
128 memcpy(&km->tuple, t, sizeof(*t));
129 *exist_km = km;
130
131 DEBUGP("adding new entry %p: ", km);
132 DUMP_TUPLE_GRE(&km->tuple);
133
134 write_lock_bh(&ip_ct_gre_lock);
135 list_add_tail(&km->list, &gre_keymap_list);
136 write_unlock_bh(&ip_ct_gre_lock);
137
138 return 0;
139}
140
141/* destroy the keymap entries associated with specified master ct */
142void ip_ct_gre_keymap_destroy(struct ip_conntrack *ct)
143{
144 DEBUGP("entering for ct %p\n", ct);
145
146 if (!ct->helper || strcmp(ct->helper->name, "pptp")) {
 147		DEBUGP("refusing to destroy GRE keymap of non-pptp session\n");
148 return;
149 }
150
151 write_lock_bh(&ip_ct_gre_lock);
152 if (ct->help.ct_pptp_info.keymap_orig) {
153 DEBUGP("removing %p from list\n",
154 ct->help.ct_pptp_info.keymap_orig);
155 list_del(&ct->help.ct_pptp_info.keymap_orig->list);
156 kfree(ct->help.ct_pptp_info.keymap_orig);
157 ct->help.ct_pptp_info.keymap_orig = NULL;
158 }
159 if (ct->help.ct_pptp_info.keymap_reply) {
160 DEBUGP("removing %p from list\n",
161 ct->help.ct_pptp_info.keymap_reply);
162 list_del(&ct->help.ct_pptp_info.keymap_reply->list);
163 kfree(ct->help.ct_pptp_info.keymap_reply);
164 ct->help.ct_pptp_info.keymap_reply = NULL;
165 }
166 write_unlock_bh(&ip_ct_gre_lock);
167}
168
169
170/* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */
171
172/* invert gre part of tuple */
173static int gre_invert_tuple(struct ip_conntrack_tuple *tuple,
174 const struct ip_conntrack_tuple *orig)
175{
176 tuple->dst.u.gre.key = orig->src.u.gre.key;
177 tuple->src.u.gre.key = orig->dst.u.gre.key;
178
179 return 1;
180}
181
182/* gre hdr info to tuple */
183static int gre_pkt_to_tuple(const struct sk_buff *skb,
184 unsigned int dataoff,
185 struct ip_conntrack_tuple *tuple)
186{
187 struct gre_hdr_pptp _pgrehdr, *pgrehdr;
188 __be16 srckey;
189 struct gre_hdr _grehdr, *grehdr;
190
191 /* first only delinearize old RFC1701 GRE header */
192 grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr);
193 if (!grehdr || grehdr->version != GRE_VERSION_PPTP) {
194 /* try to behave like "ip_conntrack_proto_generic" */
195 tuple->src.u.all = 0;
196 tuple->dst.u.all = 0;
197 return 1;
198 }
199
200 /* PPTP header is variable length, only need up to the call_id field */
201 pgrehdr = skb_header_pointer(skb, dataoff, 8, &_pgrehdr);
202 if (!pgrehdr)
203 return 1;
204
205 if (ntohs(grehdr->protocol) != GRE_PROTOCOL_PPTP) {
206 DEBUGP("GRE_VERSION_PPTP but unknown proto\n");
207 return 0;
208 }
209
210 tuple->dst.u.gre.key = pgrehdr->call_id;
211 srckey = gre_keymap_lookup(tuple);
212 tuple->src.u.gre.key = srckey;
213
214 return 1;
215}
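
Plain GRE has nothing like ports, so the tracker keys PPTP-style GRE on call
IDs: the call ID in the packet supplies the destination key directly, while
the source key cannot be read from the packet at all and must come from state
the PPTP control-channel helper recorded earlier (the keymap consulted
above). A flattened illustration, with hypothetical key values:

    #include <stdint.h>
    #include <stdio.h>

    /* simplified view of the GRE part of a conntrack tuple */
    struct gre_tuple { uint16_t src_key, dst_key; };

    /* dst comes from the packet's call ID; src only from learned state */
    static struct gre_tuple make_tuple(uint16_t call_id, uint16_t learned_src)
    {
            struct gre_tuple t = { learned_src, call_id };
            return t;
    }

    int main(void)
    {
            struct gre_tuple t = make_tuple(0x1234, 0xabcd);  /* hypothetical */

            printf("srckey=0x%x dstkey=0x%x\n", t.src_key, t.dst_key);
            return 0;
    }
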
216
217/* print gre part of tuple */
218static int gre_print_tuple(struct seq_file *s,
219 const struct ip_conntrack_tuple *tuple)
220{
221 return seq_printf(s, "srckey=0x%x dstkey=0x%x ",
222 ntohs(tuple->src.u.gre.key),
223 ntohs(tuple->dst.u.gre.key));
224}
225
226/* print private data for conntrack */
227static int gre_print_conntrack(struct seq_file *s,
228 const struct ip_conntrack *ct)
229{
230 return seq_printf(s, "timeout=%u, stream_timeout=%u ",
231 (ct->proto.gre.timeout / HZ),
232 (ct->proto.gre.stream_timeout / HZ));
233}
234
235/* Returns verdict for packet, and may modify conntrack */
236static int gre_packet(struct ip_conntrack *ct,
237 const struct sk_buff *skb,
238 enum ip_conntrack_info conntrackinfo)
239{
240 /* If we've seen traffic both ways, this is a GRE connection.
241 * Extend timeout. */
242 if (ct->status & IPS_SEEN_REPLY) {
243 ip_ct_refresh_acct(ct, conntrackinfo, skb,
244 ct->proto.gre.stream_timeout);
245 /* Also, more likely to be important, and not a probe. */
246 set_bit(IPS_ASSURED_BIT, &ct->status);
247 ip_conntrack_event_cache(IPCT_STATUS, skb);
248 } else
249 ip_ct_refresh_acct(ct, conntrackinfo, skb,
250 ct->proto.gre.timeout);
251
252 return NF_ACCEPT;
253}
254
255/* Called when a new connection for this protocol found. */
256static int gre_new(struct ip_conntrack *ct,
257 const struct sk_buff *skb)
258{
259 DEBUGP(": ");
260 DUMP_TUPLE_GRE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
261
 262	/* initialize to sane values. Ideally a conntrack helper
 263	 * (e.g. in the case of pptp) will increase them */
264 ct->proto.gre.stream_timeout = GRE_STREAM_TIMEOUT;
265 ct->proto.gre.timeout = GRE_TIMEOUT;
266
267 return 1;
268}
269
270/* Called when a conntrack entry has already been removed from the hashes
271 * and is about to be deleted from memory */
272static void gre_destroy(struct ip_conntrack *ct)
273{
274 struct ip_conntrack *master = ct->master;
275 DEBUGP(" entering\n");
276
277 if (!master)
278 DEBUGP("no master !?!\n");
279 else
280 ip_ct_gre_keymap_destroy(master);
281}
282
283/* protocol helper struct */
284static struct ip_conntrack_protocol gre = {
285 .proto = IPPROTO_GRE,
286 .name = "gre",
287 .pkt_to_tuple = gre_pkt_to_tuple,
288 .invert_tuple = gre_invert_tuple,
289 .print_tuple = gre_print_tuple,
290 .print_conntrack = gre_print_conntrack,
291 .packet = gre_packet,
292 .new = gre_new,
293 .destroy = gre_destroy,
294 .me = THIS_MODULE,
295#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
296 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
297 .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
298 .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
299#endif
300};
301
302/* ip_conntrack_proto_gre initialization */
303int __init ip_ct_proto_gre_init(void)
304{
305 return ip_conntrack_protocol_register(&gre);
306}
307
308/* This cannot be __exit, as it is invoked from ip_conntrack_helper_pptp.c's
309 * init() code on errors.
310 */
311void ip_ct_proto_gre_fini(void)
312{
313 struct list_head *pos, *n;
314
315 /* delete all keymap entries */
316 write_lock_bh(&ip_ct_gre_lock);
317 list_for_each_safe(pos, n, &gre_keymap_list) {
318 DEBUGP("deleting keymap %p at module unload time\n", pos);
319 list_del(pos);
320 kfree(pos);
321 }
322 write_unlock_bh(&ip_ct_gre_lock);
323
324 ip_conntrack_protocol_unregister(&gre);
325}
326
327EXPORT_SYMBOL(ip_ct_gre_keymap_add);
328EXPORT_SYMBOL(ip_ct_gre_keymap_destroy);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
deleted file mode 100644
index ad70c81a21e0..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ /dev/null
@@ -1,315 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/timer.h>
11#include <linux/netfilter.h>
12#include <linux/in.h>
13#include <linux/icmp.h>
14#include <linux/seq_file.h>
15#include <linux/skbuff.h>
16#include <net/ip.h>
17#include <net/checksum.h>
18#include <linux/netfilter_ipv4.h>
19#include <linux/netfilter_ipv4/ip_conntrack.h>
20#include <linux/netfilter_ipv4/ip_conntrack_core.h>
21#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
22
23unsigned int ip_ct_icmp_timeout __read_mostly = 30*HZ;
24
25#if 0
26#define DEBUGP printk
27#else
28#define DEBUGP(format, args...)
29#endif
30
31static int icmp_pkt_to_tuple(const struct sk_buff *skb,
32 unsigned int dataoff,
33 struct ip_conntrack_tuple *tuple)
34{
35 struct icmphdr _hdr, *hp;
36
37 hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
38 if (hp == NULL)
39 return 0;
40
41 tuple->dst.u.icmp.type = hp->type;
42 tuple->src.u.icmp.id = hp->un.echo.id;
43 tuple->dst.u.icmp.code = hp->code;
44
45 return 1;
46}
47
48/* Add 1; spaces filled with 0. */
49static const u_int8_t invmap[] = {
50 [ICMP_ECHO] = ICMP_ECHOREPLY + 1,
51 [ICMP_ECHOREPLY] = ICMP_ECHO + 1,
52 [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1,
53 [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1,
54 [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1,
55 [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1,
56 [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1,
57 [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1
58};
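
The table stores each inverse type plus one so that a zero entry can double
as "no inverse known"; icmp_invert_tuple below tests for non-zero and
subtracts the one back off. The encoding in isolation (8 and 0 are the real
ICMP_ECHO and ICMP_ECHOREPLY values):

    #include <stdio.h>

    #define T_ECHO      8   /* ICMP_ECHO */
    #define T_ECHOREPLY 0   /* ICMP_ECHOREPLY */

    /* "+1" encoding: 0 means "no inverse", v means "inverse is v - 1" */
    static const unsigned char inv[19] = {
            [T_ECHO]      = T_ECHOREPLY + 1,
            [T_ECHOREPLY] = T_ECHO + 1,
    };

    int main(void)
    {
            unsigned int type = T_ECHO;

            if (type < sizeof(inv) && inv[type])
                    printf("inverse of %u is %u\n", type, inv[type] - 1);
            else
                    printf("type %u has no inverse\n", type);
            return 0;
    }
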
59
60static int icmp_invert_tuple(struct ip_conntrack_tuple *tuple,
61 const struct ip_conntrack_tuple *orig)
62{
63 if (orig->dst.u.icmp.type >= sizeof(invmap)
64 || !invmap[orig->dst.u.icmp.type])
65 return 0;
66
67 tuple->src.u.icmp.id = orig->src.u.icmp.id;
68 tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1;
69 tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
70 return 1;
71}
72
73/* Print out the per-protocol part of the tuple. */
74static int icmp_print_tuple(struct seq_file *s,
75 const struct ip_conntrack_tuple *tuple)
76{
77 return seq_printf(s, "type=%u code=%u id=%u ",
78 tuple->dst.u.icmp.type,
79 tuple->dst.u.icmp.code,
80 ntohs(tuple->src.u.icmp.id));
81}
82
83/* Print out the private part of the conntrack. */
84static int icmp_print_conntrack(struct seq_file *s,
85 const struct ip_conntrack *conntrack)
86{
87 return 0;
88}
89
90/* Returns verdict for packet, or -1 for invalid. */
91static int icmp_packet(struct ip_conntrack *ct,
92 const struct sk_buff *skb,
93 enum ip_conntrack_info ctinfo)
94{
95 /* Try to delete connection immediately after all replies:
96 won't actually vanish as we still have skb, and del_timer
97 means this will only run once even if count hits zero twice
98 (theoretically possible with SMP) */
99 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
100 if (atomic_dec_and_test(&ct->proto.icmp.count)
101 && del_timer(&ct->timeout))
102 ct->timeout.function((unsigned long)ct);
103 } else {
104 atomic_inc(&ct->proto.icmp.count);
105 ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
106 ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout);
107 }
108
109 return NF_ACCEPT;
110}
111
112/* Called when a new connection for this protocol found. */
113static int icmp_new(struct ip_conntrack *conntrack,
114 const struct sk_buff *skb)
115{
116 static const u_int8_t valid_new[] = {
117 [ICMP_ECHO] = 1,
118 [ICMP_TIMESTAMP] = 1,
119 [ICMP_INFO_REQUEST] = 1,
120 [ICMP_ADDRESS] = 1
121 };
122
123 if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new)
124 || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) {
125 /* Can't create a new ICMP `conn' with this. */
126 DEBUGP("icmp: can't create new conn with type %u\n",
127 conntrack->tuplehash[0].tuple.dst.u.icmp.type);
128 DUMP_TUPLE(&conntrack->tuplehash[0].tuple);
129 return 0;
130 }
131 atomic_set(&conntrack->proto.icmp.count, 0);
132 return 1;
133}
134
135static int
136icmp_error_message(struct sk_buff *skb,
137 enum ip_conntrack_info *ctinfo,
138 unsigned int hooknum)
139{
140 struct ip_conntrack_tuple innertuple, origtuple;
141 struct {
142 struct icmphdr icmp;
143 struct iphdr ip;
144 } _in, *inside;
145 struct ip_conntrack_protocol *innerproto;
146 struct ip_conntrack_tuple_hash *h;
147 int dataoff;
148
149 IP_NF_ASSERT(skb->nfct == NULL);
150
151 /* Not enough header? */
152 inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in);
153 if (inside == NULL)
154 return -NF_ACCEPT;
155
 156	/* Ignore ICMPs containing fragments (shouldn't happen) */
157 if (inside->ip.frag_off & htons(IP_OFFSET)) {
158 DEBUGP("icmp_error_track: fragment of proto %u\n",
159 inside->ip.protocol);
160 return -NF_ACCEPT;
161 }
162
163 innerproto = ip_conntrack_proto_find_get(inside->ip.protocol);
164 dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4;
165 /* Are they talking about one of our connections? */
166 if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) {
167 DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol);
168 ip_conntrack_proto_put(innerproto);
169 return -NF_ACCEPT;
170 }
171
172 /* Ordinarily, we'd expect the inverted tupleproto, but it's
173 been preserved inside the ICMP. */
174 if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) {
175 DEBUGP("icmp_error_track: Can't invert tuple\n");
176 ip_conntrack_proto_put(innerproto);
177 return -NF_ACCEPT;
178 }
179 ip_conntrack_proto_put(innerproto);
180
181 *ctinfo = IP_CT_RELATED;
182
183 h = ip_conntrack_find_get(&innertuple, NULL);
184 if (!h) {
185 /* Locally generated ICMPs will match inverted if they
186 haven't been SNAT'ed yet */
187 /* FIXME: NAT code has to handle half-done double NAT --RR */
188 if (hooknum == NF_IP_LOCAL_OUT)
189 h = ip_conntrack_find_get(&origtuple, NULL);
190
191 if (!h) {
192 DEBUGP("icmp_error_track: no match\n");
193 return -NF_ACCEPT;
194 }
195 /* Reverse direction from that found */
196 if (DIRECTION(h) != IP_CT_DIR_REPLY)
197 *ctinfo += IP_CT_IS_REPLY;
198 } else {
199 if (DIRECTION(h) == IP_CT_DIR_REPLY)
200 *ctinfo += IP_CT_IS_REPLY;
201 }
202
203 /* Update skb to refer to this connection */
204 skb->nfct = &tuplehash_to_ctrack(h)->ct_general;
205 skb->nfctinfo = *ctinfo;
206 return -NF_ACCEPT;
207}
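
The inversion step matters because the header embedded in an ICMP error
describes the packet that triggered the error, i.e. the opposite direction
from the conntrack entry being searched for. Concretely (addresses
hypothetical): if 10.0.0.1:5000 -> 10.0.0.2:53 draws a port-unreachable, the
embedded tuple is that original direction, and flipping it yields the
reply-direction key the lookup needs:

    #include <stdio.h>

    struct tuple { const char *src, *dst; unsigned sport, dport; };

    /* swap ends: the embedded header is the offending packet, so its
     * inverse is the reply-direction tuple used for the lookup */
    static struct tuple invert(struct tuple t)
    {
            struct tuple r = { t.dst, t.src, t.dport, t.sport };
            return r;
    }

    int main(void)
    {
            struct tuple orig = { "10.0.0.1", "10.0.0.2", 5000, 53 };
            struct tuple repl = invert(orig);

            printf("%s:%u -> %s:%u\n",
                   repl.src, repl.sport, repl.dst, repl.dport);
            return 0;
    }
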
208
209/* Small and modified version of icmp_rcv */
210static int
211icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
212 unsigned int hooknum)
213{
214 struct icmphdr _ih, *icmph;
215
216 /* Not enough header? */
217 icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih);
218 if (icmph == NULL) {
219 if (LOG_INVALID(IPPROTO_ICMP))
220 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
221 "ip_ct_icmp: short packet ");
222 return -NF_ACCEPT;
223 }
224
225 /* See ip_conntrack_proto_tcp.c */
226 if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
227 nf_ip_checksum(skb, hooknum, skb->nh.iph->ihl * 4, 0)) {
228 if (LOG_INVALID(IPPROTO_ICMP))
229 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
230 "ip_ct_icmp: bad ICMP checksum ");
231 return -NF_ACCEPT;
232 }
233
234 /*
235 * 18 is the highest 'known' ICMP type. Anything else is a mystery
236 *
 237 * RFC 1122: 3.2.2 Unknown ICMP message types MUST be silently
238 * discarded.
239 */
240 if (icmph->type > NR_ICMP_TYPES) {
241 if (LOG_INVALID(IPPROTO_ICMP))
242 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
243 "ip_ct_icmp: invalid ICMP type ");
244 return -NF_ACCEPT;
245 }
246
247 /* Need to track icmp error message? */
248 if (icmph->type != ICMP_DEST_UNREACH
249 && icmph->type != ICMP_SOURCE_QUENCH
250 && icmph->type != ICMP_TIME_EXCEEDED
251 && icmph->type != ICMP_PARAMETERPROB
252 && icmph->type != ICMP_REDIRECT)
253 return NF_ACCEPT;
254
255 return icmp_error_message(skb, ctinfo, hooknum);
256}
257
258#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
259 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
260static int icmp_tuple_to_nfattr(struct sk_buff *skb,
261 const struct ip_conntrack_tuple *t)
262{
263 NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(__be16),
264 &t->src.u.icmp.id);
265 NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t),
266 &t->dst.u.icmp.type);
267 NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t),
268 &t->dst.u.icmp.code);
269
270 return 0;
271
272nfattr_failure:
273 return -1;
274}
275
276static int icmp_nfattr_to_tuple(struct nfattr *tb[],
277 struct ip_conntrack_tuple *tuple)
278{
279 if (!tb[CTA_PROTO_ICMP_TYPE-1]
280 || !tb[CTA_PROTO_ICMP_CODE-1]
281 || !tb[CTA_PROTO_ICMP_ID-1])
282 return -EINVAL;
283
284 tuple->dst.u.icmp.type =
285 *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]);
286 tuple->dst.u.icmp.code =
287 *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]);
288 tuple->src.u.icmp.id =
289 *(__be16 *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]);
290
291 if (tuple->dst.u.icmp.type >= sizeof(invmap)
292 || !invmap[tuple->dst.u.icmp.type])
293 return -EINVAL;
294
295 return 0;
296}
297#endif
298
299struct ip_conntrack_protocol ip_conntrack_protocol_icmp =
300{
301 .proto = IPPROTO_ICMP,
302 .name = "icmp",
303 .pkt_to_tuple = icmp_pkt_to_tuple,
304 .invert_tuple = icmp_invert_tuple,
305 .print_tuple = icmp_print_tuple,
306 .print_conntrack = icmp_print_conntrack,
307 .packet = icmp_packet,
308 .new = icmp_new,
309 .error = icmp_error,
310#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
311 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
312 .tuple_to_nfattr = icmp_tuple_to_nfattr,
313 .nfattr_to_tuple = icmp_nfattr_to_tuple,
314#endif
315};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
deleted file mode 100644
index e6942992b2f6..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
+++ /dev/null
@@ -1,659 +0,0 @@
1/*
2 * Connection tracking protocol helper module for SCTP.
3 *
4 * SCTP is defined in RFC 2960. References to various sections in this code
5 * are to this RFC.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12/*
13 * Added support for proc manipulation of timeouts.
14 */
15
16#include <linux/types.h>
17#include <linux/timer.h>
18#include <linux/interrupt.h>
19#include <linux/netfilter.h>
20#include <linux/module.h>
21#include <linux/in.h>
22#include <linux/ip.h>
23#include <linux/sctp.h>
24#include <linux/string.h>
25#include <linux/seq_file.h>
26
27#include <linux/netfilter_ipv4/ip_conntrack.h>
28#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
29
30#if 0
31#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__)
32#else
33#define DEBUGP(format, args...)
34#endif
35
36/* Protects conntrack->proto.sctp */
37static DEFINE_RWLOCK(sctp_lock);
38
39/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
40 closely. They're more complex. --RR
41
42 And so for me for SCTP :D -Kiran */
43
44static const char *sctp_conntrack_names[] = {
45 "NONE",
46 "CLOSED",
47 "COOKIE_WAIT",
48 "COOKIE_ECHOED",
49 "ESTABLISHED",
50 "SHUTDOWN_SENT",
51 "SHUTDOWN_RECD",
52 "SHUTDOWN_ACK_SENT",
53};
54
55#define SECS * HZ
56#define MINS * 60 SECS
57#define HOURS * 60 MINS
58#define DAYS * 24 HOURS
59
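Each unit macro above begins with a `*`, so a literal followed by a unit
expands into a chain of multiplications ending in HZ (`5 DAYS` becomes
`5 * 24 * 60 * 60 * HZ`), and fractional timeouts stay in integer
arithmetic, e.g. `300 SECS / 1000` for 0.3 s below. The trick standalone
(HZ = 250 is only an assumed illustrative value):

    #include <stdio.h>

    #define HZ    250           /* assumed; HZ is a kernel config constant */
    #define SECS  * HZ
    #define MINS  * 60 SECS
    #define HOURS * 60 MINS
    #define DAYS  * 24 HOURS

    int main(void)
    {
            printf("%d\n", 5 DAYS);             /* 5*24*60*60*250 = 108000000 */
            printf("%d\n", 300 SECS / 1000);    /* 0.3 s worth of jiffies: 75 */
            return 0;
    }
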
60static unsigned int ip_ct_sctp_timeout_closed __read_mostly = 10 SECS;
61static unsigned int ip_ct_sctp_timeout_cookie_wait __read_mostly = 3 SECS;
62static unsigned int ip_ct_sctp_timeout_cookie_echoed __read_mostly = 3 SECS;
63static unsigned int ip_ct_sctp_timeout_established __read_mostly = 5 DAYS;
64static unsigned int ip_ct_sctp_timeout_shutdown_sent __read_mostly = 300 SECS / 1000;
65static unsigned int ip_ct_sctp_timeout_shutdown_recd __read_mostly = 300 SECS / 1000;
66static unsigned int ip_ct_sctp_timeout_shutdown_ack_sent __read_mostly = 3 SECS;
67
68static const unsigned int * sctp_timeouts[]
69= { NULL, /* SCTP_CONNTRACK_NONE */
70 &ip_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */
71 &ip_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */
72 &ip_ct_sctp_timeout_cookie_echoed, /* SCTP_CONNTRACK_COOKIE_ECHOED */
73 &ip_ct_sctp_timeout_established, /* SCTP_CONNTRACK_ESTABLISHED */
74 &ip_ct_sctp_timeout_shutdown_sent, /* SCTP_CONNTRACK_SHUTDOWN_SENT */
75 &ip_ct_sctp_timeout_shutdown_recd, /* SCTP_CONNTRACK_SHUTDOWN_RECD */
76 &ip_ct_sctp_timeout_shutdown_ack_sent /* SCTP_CONNTRACK_SHUTDOWN_ACK_SENT */
77 };
78
79#define sNO SCTP_CONNTRACK_NONE
80#define sCL SCTP_CONNTRACK_CLOSED
81#define sCW SCTP_CONNTRACK_COOKIE_WAIT
82#define sCE SCTP_CONNTRACK_COOKIE_ECHOED
83#define sES SCTP_CONNTRACK_ESTABLISHED
84#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT
85#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD
86#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT
87#define sIV SCTP_CONNTRACK_MAX
88
89/*
90 These are the descriptions of the states:
91
92NOTE: These state names are tantalizingly similar to the states of an
93SCTP endpoint. But the interpretation of the states is a little different,
94considering that these are the states of the connection and not of an end
95point. Please note the subtleties. -Kiran
96
97NONE - Nothing so far.
98COOKIE WAIT - We have seen an INIT chunk in the original direction, or
99 an INIT_ACK chunk in the reply direction.
100COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction.
101ESTABLISHED - We have seen a COOKIE_ACK in the reply direction.
102SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction.
103SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply direction.
104SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite
105 to that of the SHUTDOWN chunk.
106CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
107 the SHUTDOWN chunk. Connection is closed.
108*/
109
110/* TODO
111 - I have assumed that the first INIT is in the original direction.
112 This messes things up when an INIT comes in the reply direction in CLOSED
113 state.
114 - Check the error type in the reply dir before transitioning from
115cookie echoed to closed.
116 - Sec 5.2.4 of RFC 2960
117 - Multi Homing support.
118*/
119
120/* SCTP conntrack state transitions */
121static const enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {
122 {
123/* ORIGINAL */
124/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
125/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA},
126/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},
127/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
128/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA},
129/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA},
130/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't have Stale cookie */
131/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */
132/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in orig dir */
133/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL}
134 },
135 {
136/* REPLY */
137/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
138/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */
139/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},
140/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
141/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA},
142/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA},
143/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA},
144/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in reply dir */
145/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA},
146/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL}
147 }
148};
149
150static int sctp_pkt_to_tuple(const struct sk_buff *skb,
151 unsigned int dataoff,
152 struct ip_conntrack_tuple *tuple)
153{
154 sctp_sctphdr_t _hdr, *hp;
155
156 DEBUGP(__FUNCTION__);
157 DEBUGP("\n");
158
159 /* Actually only need first 8 bytes. */
160 hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
161 if (hp == NULL)
162 return 0;
163
164 tuple->src.u.sctp.port = hp->source;
165 tuple->dst.u.sctp.port = hp->dest;
166 return 1;
167}
168
169static int sctp_invert_tuple(struct ip_conntrack_tuple *tuple,
170 const struct ip_conntrack_tuple *orig)
171{
172 DEBUGP(__FUNCTION__);
173 DEBUGP("\n");
174
175 tuple->src.u.sctp.port = orig->dst.u.sctp.port;
176 tuple->dst.u.sctp.port = orig->src.u.sctp.port;
177 return 1;
178}
179
180/* Print out the per-protocol part of the tuple. */
181static int sctp_print_tuple(struct seq_file *s,
182 const struct ip_conntrack_tuple *tuple)
183{
184 DEBUGP(__FUNCTION__);
185 DEBUGP("\n");
186
187 return seq_printf(s, "sport=%hu dport=%hu ",
188 ntohs(tuple->src.u.sctp.port),
189 ntohs(tuple->dst.u.sctp.port));
190}
191
192/* Print out the private part of the conntrack. */
193static int sctp_print_conntrack(struct seq_file *s,
194 const struct ip_conntrack *conntrack)
195{
196 enum sctp_conntrack state;
197
198 DEBUGP(__FUNCTION__);
199 DEBUGP("\n");
200
201 read_lock_bh(&sctp_lock);
202 state = conntrack->proto.sctp.state;
203 read_unlock_bh(&sctp_lock);
204
205 return seq_printf(s, "%s ", sctp_conntrack_names[state]);
206}
207
208#define for_each_sctp_chunk(skb, sch, _sch, offset, count) \
209for (offset = skb->nh.iph->ihl * 4 + sizeof(sctp_sctphdr_t), count = 0; \
210 offset < skb->len && \
211 (sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch)); \
212 offset += (ntohs(sch->length) + 3) & ~3, count++)
213
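The `(ntohs(sch->length) + 3) & ~3` in the iteration step above is the usual
branch-free round-up to a 4-byte boundary: RFC 2960 pads every chunk to
32-bit alignment, so that padded size, not the raw length, is the distance
to the next chunk header. In isolation:

    #include <stdio.h>

    /* round a chunk length up to the next multiple of 4 (RFC 2960 padding) */
    static unsigned int pad4(unsigned int len)
    {
            return (len + 3) & ~3u;
    }

    int main(void)
    {
            /* a 7-byte chunk occupies 8 bytes on the wire; 8 stays 8 */
            printf("%u %u %u\n", pad4(7), pad4(8), pad4(1));   /* 8 8 4 */
            return 0;
    }
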
214/* Some validity checks to make sure the chunks are fine */
215static int do_basic_checks(struct ip_conntrack *conntrack,
216 const struct sk_buff *skb,
217 char *map)
218{
219 u_int32_t offset, count;
220 sctp_chunkhdr_t _sch, *sch;
221 int flag;
222
223 DEBUGP(__FUNCTION__);
224 DEBUGP("\n");
225
226 flag = 0;
227
228 for_each_sctp_chunk (skb, sch, _sch, offset, count) {
229 DEBUGP("Chunk Num: %d Type: %d\n", count, sch->type);
230
231 if (sch->type == SCTP_CID_INIT
232 || sch->type == SCTP_CID_INIT_ACK
233 || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
234 flag = 1;
235 }
236
237 /*
238 * Cookie Ack/Echo chunks not the first OR
239 * Init / Init Ack / Shutdown compl chunks not the only chunks
240 * OR zero-length.
241 */
242 if (((sch->type == SCTP_CID_COOKIE_ACK
243 || sch->type == SCTP_CID_COOKIE_ECHO
244 || flag)
245 && count !=0) || !sch->length) {
246 DEBUGP("Basic checks failed\n");
247 return 1;
248 }
249
250 if (map) {
251 set_bit(sch->type, (void *)map);
252 }
253 }
254
255 DEBUGP("Basic checks passed\n");
256 return count == 0;
257}
258
259static int new_state(enum ip_conntrack_dir dir,
260 enum sctp_conntrack cur_state,
261 int chunk_type)
262{
263 int i;
264
265 DEBUGP(__FUNCTION__);
266 DEBUGP("\n");
267
268 DEBUGP("Chunk type: %d\n", chunk_type);
269
270 switch (chunk_type) {
271 case SCTP_CID_INIT:
272 DEBUGP("SCTP_CID_INIT\n");
273 i = 0; break;
274 case SCTP_CID_INIT_ACK:
275 DEBUGP("SCTP_CID_INIT_ACK\n");
276 i = 1; break;
277 case SCTP_CID_ABORT:
278 DEBUGP("SCTP_CID_ABORT\n");
279 i = 2; break;
280 case SCTP_CID_SHUTDOWN:
281 DEBUGP("SCTP_CID_SHUTDOWN\n");
282 i = 3; break;
283 case SCTP_CID_SHUTDOWN_ACK:
284 DEBUGP("SCTP_CID_SHUTDOWN_ACK\n");
285 i = 4; break;
286 case SCTP_CID_ERROR:
287 DEBUGP("SCTP_CID_ERROR\n");
288 i = 5; break;
289 case SCTP_CID_COOKIE_ECHO:
290 DEBUGP("SCTP_CID_COOKIE_ECHO\n");
291 i = 6; break;
292 case SCTP_CID_COOKIE_ACK:
293 DEBUGP("SCTP_CID_COOKIE_ACK\n");
294 i = 7; break;
295 case SCTP_CID_SHUTDOWN_COMPLETE:
296 DEBUGP("SCTP_CID_SHUTDOWN_COMPLETE\n");
297 i = 8; break;
298 default:
299 /* Other chunks like DATA, SACK, HEARTBEAT and
300 its ACK do not cause a change in state */
301		DEBUGP("Unknown chunk type, will stay in %s\n",
302 sctp_conntrack_names[cur_state]);
303 return cur_state;
304 }
305
306 DEBUGP("dir: %d cur_state: %s chunk_type: %d new_state: %s\n",
307 dir, sctp_conntrack_names[cur_state], chunk_type,
308 sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]);
309
310 return sctp_conntracks[dir][i][cur_state];
311}
312
313/* Returns verdict for packet, or -1 for invalid. */
314static int sctp_packet(struct ip_conntrack *conntrack,
315 const struct sk_buff *skb,
316 enum ip_conntrack_info ctinfo)
317{
318 enum sctp_conntrack newconntrack, oldsctpstate;
319 struct iphdr *iph = skb->nh.iph;
320 sctp_sctphdr_t _sctph, *sh;
321 sctp_chunkhdr_t _sch, *sch;
322 u_int32_t offset, count;
323 char map[256 / sizeof (char)] = {0};
324
325 DEBUGP(__FUNCTION__);
326 DEBUGP("\n");
327
328 sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph);
329 if (sh == NULL)
330 return -1;
331
332 if (do_basic_checks(conntrack, skb, map) != 0)
333 return -1;
334
335 /* Check the verification tag (Sec 8.5) */
336 if (!test_bit(SCTP_CID_INIT, (void *)map)
337 && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)
338 && !test_bit(SCTP_CID_COOKIE_ECHO, (void *)map)
339 && !test_bit(SCTP_CID_ABORT, (void *)map)
340 && !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map)
341 && (sh->vtag != conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
342 DEBUGP("Verification tag check failed\n");
343 return -1;
344 }
345
346 oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX;
347 for_each_sctp_chunk (skb, sch, _sch, offset, count) {
348 write_lock_bh(&sctp_lock);
349
350 /* Special cases of Verification tag check (Sec 8.5.1) */
351 if (sch->type == SCTP_CID_INIT) {
352 /* Sec 8.5.1 (A) */
353 if (sh->vtag != 0) {
354 write_unlock_bh(&sctp_lock);
355 return -1;
356 }
357 } else if (sch->type == SCTP_CID_ABORT) {
358 /* Sec 8.5.1 (B) */
359 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
360 && !(sh->vtag == conntrack->proto.sctp.vtag
361 [1 - CTINFO2DIR(ctinfo)])) {
362 write_unlock_bh(&sctp_lock);
363 return -1;
364 }
365 } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
366 /* Sec 8.5.1 (C) */
367 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
368 && !(sh->vtag == conntrack->proto.sctp.vtag
369 [1 - CTINFO2DIR(ctinfo)]
370 && (sch->flags & 1))) {
371 write_unlock_bh(&sctp_lock);
372 return -1;
373 }
374 } else if (sch->type == SCTP_CID_COOKIE_ECHO) {
375 /* Sec 8.5.1 (D) */
376 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
377 write_unlock_bh(&sctp_lock);
378 return -1;
379 }
380 }
381
382 oldsctpstate = conntrack->proto.sctp.state;
383 newconntrack = new_state(CTINFO2DIR(ctinfo), oldsctpstate, sch->type);
384
385 /* Invalid */
386 if (newconntrack == SCTP_CONNTRACK_MAX) {
387 DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n",
388 CTINFO2DIR(ctinfo), sch->type, oldsctpstate);
389 write_unlock_bh(&sctp_lock);
390 return -1;
391 }
392
393 /* If it is an INIT or an INIT ACK note down the vtag */
394 if (sch->type == SCTP_CID_INIT
395 || sch->type == SCTP_CID_INIT_ACK) {
396 sctp_inithdr_t _inithdr, *ih;
397
398 ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
399 sizeof(_inithdr), &_inithdr);
400 if (ih == NULL) {
401 write_unlock_bh(&sctp_lock);
402 return -1;
403 }
404 DEBUGP("Setting vtag %x for dir %d\n",
405 ih->init_tag, !CTINFO2DIR(ctinfo));
406 conntrack->proto.sctp.vtag[!CTINFO2DIR(ctinfo)] = ih->init_tag;
407 }
408
409 conntrack->proto.sctp.state = newconntrack;
410 if (oldsctpstate != newconntrack)
411 ip_conntrack_event_cache(IPCT_PROTOINFO, skb);
412 write_unlock_bh(&sctp_lock);
413 }
414
415 ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]);
416
417 if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED
418 && CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY
419 && newconntrack == SCTP_CONNTRACK_ESTABLISHED) {
420 DEBUGP("Setting assured bit\n");
421 set_bit(IPS_ASSURED_BIT, &conntrack->status);
422 ip_conntrack_event_cache(IPCT_STATUS, skb);
423 }
424
425 return NF_ACCEPT;
426}
427
428/* Called when a new connection for this protocol found. */
429static int sctp_new(struct ip_conntrack *conntrack,
430 const struct sk_buff *skb)
431{
432 enum sctp_conntrack newconntrack;
433 struct iphdr *iph = skb->nh.iph;
434 sctp_sctphdr_t _sctph, *sh;
435 sctp_chunkhdr_t _sch, *sch;
436 u_int32_t offset, count;
437 char map[256 / sizeof (char)] = {0};
438
439 DEBUGP(__FUNCTION__);
440 DEBUGP("\n");
441
442 sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph);
443 if (sh == NULL)
444 return 0;
445
446 if (do_basic_checks(conntrack, skb, map) != 0)
447 return 0;
448
449	/* If an OOTB packet has any of these chunks, discard it (Sec 8.4) */
450 if ((test_bit (SCTP_CID_ABORT, (void *)map))
451 || (test_bit (SCTP_CID_SHUTDOWN_COMPLETE, (void *)map))
452 || (test_bit (SCTP_CID_COOKIE_ACK, (void *)map))) {
453 return 0;
454 }
455
456 newconntrack = SCTP_CONNTRACK_MAX;
457 for_each_sctp_chunk (skb, sch, _sch, offset, count) {
458 /* Don't need lock here: this conntrack not in circulation yet */
459 newconntrack = new_state (IP_CT_DIR_ORIGINAL,
460 SCTP_CONNTRACK_NONE, sch->type);
461
462 /* Invalid: delete conntrack */
463 if (newconntrack == SCTP_CONNTRACK_MAX) {
464			DEBUGP("ip_conntrack_sctp: invalid new conntrack, deleting.\n");
465 return 0;
466 }
467
468 /* Copy the vtag into the state info */
469 if (sch->type == SCTP_CID_INIT) {
470 if (sh->vtag == 0) {
471 sctp_inithdr_t _inithdr, *ih;
472
473 ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
474 sizeof(_inithdr), &_inithdr);
475 if (ih == NULL)
476 return 0;
477
478 DEBUGP("Setting vtag %x for new conn\n",
479 ih->init_tag);
480
481 conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] =
482 ih->init_tag;
483 } else {
484 /* Sec 8.5.1 (A) */
485 return 0;
486 }
487 }
488 /* If it is a shutdown ack OOTB packet, we expect a return
489		   shutdown complete, otherwise an ABORT; see Sec 8.4 (5) and (8) */
490 else {
491 DEBUGP("Setting vtag %x for new conn OOTB\n",
492 sh->vtag);
493 conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
494 }
495
496 conntrack->proto.sctp.state = newconntrack;
497 }
498
499 return 1;
500}
501
502static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = {
503 .proto = IPPROTO_SCTP,
504 .name = "sctp",
505 .pkt_to_tuple = sctp_pkt_to_tuple,
506 .invert_tuple = sctp_invert_tuple,
507 .print_tuple = sctp_print_tuple,
508 .print_conntrack = sctp_print_conntrack,
509 .packet = sctp_packet,
510 .new = sctp_new,
511 .destroy = NULL,
512 .me = THIS_MODULE,
513#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
514 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
515 .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
516 .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
517#endif
518};
519
520#ifdef CONFIG_SYSCTL
521static ctl_table ip_ct_sysctl_table[] = {
522 {
523 .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED,
524 .procname = "ip_conntrack_sctp_timeout_closed",
525 .data = &ip_ct_sctp_timeout_closed,
526 .maxlen = sizeof(unsigned int),
527 .mode = 0644,
528 .proc_handler = &proc_dointvec_jiffies,
529 },
530 {
531 .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT,
532 .procname = "ip_conntrack_sctp_timeout_cookie_wait",
533 .data = &ip_ct_sctp_timeout_cookie_wait,
534 .maxlen = sizeof(unsigned int),
535 .mode = 0644,
536 .proc_handler = &proc_dointvec_jiffies,
537 },
538 {
539 .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED,
540 .procname = "ip_conntrack_sctp_timeout_cookie_echoed",
541 .data = &ip_ct_sctp_timeout_cookie_echoed,
542 .maxlen = sizeof(unsigned int),
543 .mode = 0644,
544 .proc_handler = &proc_dointvec_jiffies,
545 },
546 {
547 .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED,
548 .procname = "ip_conntrack_sctp_timeout_established",
549 .data = &ip_ct_sctp_timeout_established,
550 .maxlen = sizeof(unsigned int),
551 .mode = 0644,
552 .proc_handler = &proc_dointvec_jiffies,
553 },
554 {
555 .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT,
556 .procname = "ip_conntrack_sctp_timeout_shutdown_sent",
557 .data = &ip_ct_sctp_timeout_shutdown_sent,
558 .maxlen = sizeof(unsigned int),
559 .mode = 0644,
560 .proc_handler = &proc_dointvec_jiffies,
561 },
562 {
563 .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD,
564 .procname = "ip_conntrack_sctp_timeout_shutdown_recd",
565 .data = &ip_ct_sctp_timeout_shutdown_recd,
566 .maxlen = sizeof(unsigned int),
567 .mode = 0644,
568 .proc_handler = &proc_dointvec_jiffies,
569 },
570 {
571 .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT,
572 .procname = "ip_conntrack_sctp_timeout_shutdown_ack_sent",
573 .data = &ip_ct_sctp_timeout_shutdown_ack_sent,
574 .maxlen = sizeof(unsigned int),
575 .mode = 0644,
576 .proc_handler = &proc_dointvec_jiffies,
577 },
578 { .ctl_name = 0 }
579};
580
581static ctl_table ip_ct_netfilter_table[] = {
582 {
583 .ctl_name = NET_IPV4_NETFILTER,
584 .procname = "netfilter",
585 .mode = 0555,
586 .child = ip_ct_sysctl_table,
587 },
588 { .ctl_name = 0 }
589};
590
591static ctl_table ip_ct_ipv4_table[] = {
592 {
593 .ctl_name = NET_IPV4,
594 .procname = "ipv4",
595 .mode = 0555,
596 .child = ip_ct_netfilter_table,
597 },
598 { .ctl_name = 0 }
599};
600
601static ctl_table ip_ct_net_table[] = {
602 {
603 .ctl_name = CTL_NET,
604 .procname = "net",
605 .mode = 0555,
606 .child = ip_ct_ipv4_table,
607 },
608 { .ctl_name = 0 }
609};
610
611static struct ctl_table_header *ip_ct_sysctl_header;
612#endif
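
/* The nested ctl_table chain above (net -> ipv4 -> netfilter) exposes
 * these timeouts under /proc/sys/net/ipv4/netfilter/. A shell usage
 * sketch (the value written is illustrative, not a recommendation;
 * proc_dointvec_jiffies converts between seconds and jiffies):
 *
 *   cat /proc/sys/net/ipv4/netfilter/ip_conntrack_sctp_timeout_established
 *   echo 432000 > /proc/sys/net/ipv4/netfilter/ip_conntrack_sctp_timeout_established
 */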
613
614static int __init ip_conntrack_proto_sctp_init(void)
615{
616 int ret;
617
618 ret = ip_conntrack_protocol_register(&ip_conntrack_protocol_sctp);
619 if (ret) {
620 printk("ip_conntrack_proto_sctp: protocol register failed\n");
621 goto out;
622 }
623
624#ifdef CONFIG_SYSCTL
625 ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table);
626 if (ip_ct_sysctl_header == NULL) {
627 ret = -ENOMEM;
628 printk("ip_conntrack_proto_sctp: can't register to sysctl.\n");
629 goto cleanup;
630 }
631#endif
632
633 return ret;
634
635#ifdef CONFIG_SYSCTL
636 cleanup:
637 ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp);
638#endif
639 out:
640 DEBUGP("SCTP conntrack module loading %s\n",
641 ret ? "failed": "succeeded");
642 return ret;
643}
644
645static void __exit ip_conntrack_proto_sctp_fini(void)
646{
647 ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp);
648#ifdef CONFIG_SYSCTL
649 unregister_sysctl_table(ip_ct_sysctl_header);
650#endif
651 DEBUGP("SCTP conntrack module unloaded\n");
652}
653
654module_init(ip_conntrack_proto_sctp_init);
655module_exit(ip_conntrack_proto_sctp_fini);
656
657MODULE_LICENSE("GPL");
658MODULE_AUTHOR("Kiran Kumar Immidi");
659MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP");
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
deleted file mode 100644
index 0a72eab14620..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ /dev/null
@@ -1,1164 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>:
9 * - Real stateful connection tracking
10 * - Modified state transitions table
11 * - Window scaling support added
12 * - SACK support added
13 *
14 * Willy Tarreau:
15 * - State table bugfixes
16 * - More robust state changes
17 * - Tuning timer parameters
18 *
19 * version 2.2
20 */
21
22#include <linux/types.h>
23#include <linux/timer.h>
24#include <linux/netfilter.h>
25#include <linux/module.h>
26#include <linux/in.h>
27#include <linux/ip.h>
28#include <linux/tcp.h>
29#include <linux/spinlock.h>
30
31#include <net/tcp.h>
32
33#include <linux/netfilter_ipv4.h>
34#include <linux/netfilter_ipv4/ip_conntrack.h>
35#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
36
37#if 0
38#define DEBUGP printk
39#define DEBUGP_VARS
40#else
41#define DEBUGP(format, args...)
42#endif
43
44/* Protects conntrack->proto.tcp */
45static DEFINE_RWLOCK(tcp_lock);
46
47/* "Be conservative in what you do,
48 be liberal in what you accept from others."
49 If it's non-zero, we mark only out of window RST segments as INVALID. */
50int ip_ct_tcp_be_liberal __read_mostly = 0;
51
52/* If it is set to zero, we disable picking up already established
53 connections. */
54int ip_ct_tcp_loose __read_mostly = 1;
55
56/* Max number of retransmitted packets without receiving an (acceptable)
57 ACK from the destination. If this number is reached, a shorter timer
58 will be started. */
59int ip_ct_tcp_max_retrans __read_mostly = 3;
60
61 /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
62 closely. They're more complex. --RR */
63
64static const char *tcp_conntrack_names[] = {
65 "NONE",
66 "SYN_SENT",
67 "SYN_RECV",
68 "ESTABLISHED",
69 "FIN_WAIT",
70 "CLOSE_WAIT",
71 "LAST_ACK",
72 "TIME_WAIT",
73 "CLOSE",
74 "LISTEN"
75};
76
77#define SECS * HZ
78#define MINS * 60 SECS
79#define HOURS * 60 MINS
80#define DAYS * 24 HOURS
81
82unsigned int ip_ct_tcp_timeout_syn_sent __read_mostly = 2 MINS;
83unsigned int ip_ct_tcp_timeout_syn_recv __read_mostly = 60 SECS;
84unsigned int ip_ct_tcp_timeout_established __read_mostly = 5 DAYS;
85unsigned int ip_ct_tcp_timeout_fin_wait __read_mostly = 2 MINS;
86unsigned int ip_ct_tcp_timeout_close_wait __read_mostly = 60 SECS;
87unsigned int ip_ct_tcp_timeout_last_ack __read_mostly = 30 SECS;
88unsigned int ip_ct_tcp_timeout_time_wait __read_mostly = 2 MINS;
89unsigned int ip_ct_tcp_timeout_close __read_mostly = 10 SECS;
90
91/* RFC1122 says the R2 limit should be at least 100 seconds.
92 Linux uses 15 packets as limit, which corresponds
93 to ~13-30min depending on RTO. */
94unsigned int ip_ct_tcp_timeout_max_retrans __read_mostly = 5 MINS;
95
96static const unsigned int * tcp_timeouts[]
97= { NULL, /* TCP_CONNTRACK_NONE */
98 &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */
99 &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */
100 &ip_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */
101 &ip_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */
102 &ip_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */
103 &ip_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */
104 &ip_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */
105 &ip_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */
106 NULL, /* TCP_CONNTRACK_LISTEN */
107 };
108
109#define sNO TCP_CONNTRACK_NONE
110#define sSS TCP_CONNTRACK_SYN_SENT
111#define sSR TCP_CONNTRACK_SYN_RECV
112#define sES TCP_CONNTRACK_ESTABLISHED
113#define sFW TCP_CONNTRACK_FIN_WAIT
114#define sCW TCP_CONNTRACK_CLOSE_WAIT
115#define sLA TCP_CONNTRACK_LAST_ACK
116#define sTW TCP_CONNTRACK_TIME_WAIT
117#define sCL TCP_CONNTRACK_CLOSE
118#define sLI TCP_CONNTRACK_LISTEN
119#define sIV TCP_CONNTRACK_MAX
120#define sIG TCP_CONNTRACK_IGNORE
121
122/* What TCP flags are set from RST/SYN/FIN/ACK. */
123enum tcp_bit_set {
124 TCP_SYN_SET,
125 TCP_SYNACK_SET,
126 TCP_FIN_SET,
127 TCP_ACK_SET,
128 TCP_RST_SET,
129 TCP_NONE_SET,
130};
131
132/*
133 * The TCP state transition table needs a few words...
134 *
135 * We are the man in the middle. All the packets go through us
136 * but might get lost in transit to the destination.
137 * It is assumed that the destinations can't receive segments
138 * we haven't seen.
139 *
140 * The checked segment is in window, but our windows are *not*
141 * equivalent with the ones of the sender/receiver. We always
142 * try to guess the state of the current sender.
143 *
144 * The meaning of the states are:
145 *
146 * NONE: initial state
147 * SYN_SENT: SYN-only packet seen
148 * SYN_RECV: SYN-ACK packet seen
149 * ESTABLISHED: ACK packet seen
150 * FIN_WAIT: FIN packet seen
151 * CLOSE_WAIT: ACK seen (after FIN)
152 * LAST_ACK: FIN seen (after FIN)
153 * TIME_WAIT: last ACK seen
154 * CLOSE: closed connection
155 *
156 * LISTEN state is not used.
157 *
158 * Packets marked as IGNORED (sIG):
159 * if they may be either invalid or valid
160 * and the receiver may send back a connection
161 * closing RST or a SYN/ACK.
162 *
163 * Packets marked as INVALID (sIV):
164 * if they are invalid
165 * or we do not support the request (simultaneous open)
166 */
167static const enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
168 {
169/* ORIGINAL */
170/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
171/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV },
172/*
173 * sNO -> sSS Initialize a new connection
174 * sSS -> sSS Retransmitted SYN
175 * sSR -> sIG Late retransmitted SYN?
176 * sES -> sIG Error: SYNs in window outside the SYN_SENT state
177 * are errors. Receiver will reply with RST
178 * and close the connection.
179 * Or we are not in sync and hold a dead connection.
180 * sFW -> sIG
181 * sCW -> sIG
182 * sLA -> sIG
183 * sTW -> sSS Reopened connection (RFC 1122).
184 * sCL -> sSS
185 */
186/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
187/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
188/*
189 * A SYN/ACK from the client is always invalid:
190 * - either it tries to set up a simultaneous open, which is
191 * not supported;
192 * - or the firewall has just been inserted between the two hosts
193 * during the session set-up. The SYN will be retransmitted
194 * by the true client (or it'll time out).
195 */
196/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
197/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
198/*
199 * sNO -> sIV Too late and no reason to do anything...
 200 *	sSS -> sIV	Client might not send FIN in this state:
201 * we enforce waiting for a SYN/ACK reply first.
202 * sSR -> sFW Close started.
203 * sES -> sFW
204 * sFW -> sLA FIN seen in both directions, waiting for
205 * the last ACK.
 206 *		Might be a retransmitted FIN as well...
207 * sCW -> sLA
208 * sLA -> sLA Retransmitted FIN. Remain in the same state.
209 * sTW -> sTW
210 * sCL -> sCL
211 */
212/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
213/*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
214/*
215 * sNO -> sES Assumed.
216 * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet.
217 * sSR -> sES Established state is reached.
218 * sES -> sES :-)
219 * sFW -> sCW Normal close request answered by ACK.
220 * sCW -> sCW
221 * sLA -> sTW Last ACK detected.
222 * sTW -> sTW Retransmitted last ACK. Remain in the same state.
223 * sCL -> sCL
224 */
225/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
226/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
227/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
228 },
229 {
230/* REPLY */
231/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
232/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
233/*
234 * sNO -> sIV Never reached.
235 * sSS -> sIV Simultaneous open, not supported
236 * sSR -> sIV Simultaneous open, not supported.
237 * sES -> sIV Server may not initiate a connection.
238 * sFW -> sIV
239 * sCW -> sIV
240 * sLA -> sIV
241 * sTW -> sIV Reopened connection, but server may not do it.
242 * sCL -> sIV
243 */
244/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
245/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV },
246/*
247 * sSS -> sSR Standard open.
248 * sSR -> sSR Retransmitted SYN/ACK.
249 * sES -> sIG Late retransmitted SYN/ACK?
250 * sFW -> sIG Might be SYN/ACK answering ignored SYN
251 * sCW -> sIG
252 * sLA -> sIG
253 * sTW -> sIG
254 * sCL -> sIG
255 */
256/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
257/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
258/*
259 * sSS -> sIV Server might not send FIN in this state.
260 * sSR -> sFW Close started.
261 * sES -> sFW
262 * sFW -> sLA FIN seen in both directions.
263 * sCW -> sLA
264 * sLA -> sLA Retransmitted FIN.
265 * sTW -> sTW
266 * sCL -> sCL
267 */
268/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
269/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV },
270/*
271 * sSS -> sIG Might be a half-open connection.
272 * sSR -> sSR Might answer late resent SYN.
273 * sES -> sES :-)
274 * sFW -> sCW Normal close request answered by ACK.
275 * sCW -> sCW
276 * sLA -> sTW Last ACK detected.
277 * sTW -> sTW Retransmitted last ACK.
278 * sCL -> sCL
279 */
280/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
281/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
282/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
283 }
284};
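
/* Reading the table: tcp_conntracks[dir][flag_index][current_state] is
 * the next state. Two lookups taken directly from the rows above:
 *
 *   tcp_conntracks[IP_CT_DIR_ORIGINAL][TCP_SYN_SET][sNO]  == sSS
 *   tcp_conntracks[IP_CT_DIR_REPLY][TCP_SYNACK_SET][sSS]  == sSR
 */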
285
286static int tcp_pkt_to_tuple(const struct sk_buff *skb,
287 unsigned int dataoff,
288 struct ip_conntrack_tuple *tuple)
289{
290 struct tcphdr _hdr, *hp;
291
292 /* Actually only need first 8 bytes. */
293 hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
294 if (hp == NULL)
295 return 0;
296
297 tuple->src.u.tcp.port = hp->source;
298 tuple->dst.u.tcp.port = hp->dest;
299
300 return 1;
301}
302
303static int tcp_invert_tuple(struct ip_conntrack_tuple *tuple,
304 const struct ip_conntrack_tuple *orig)
305{
306 tuple->src.u.tcp.port = orig->dst.u.tcp.port;
307 tuple->dst.u.tcp.port = orig->src.u.tcp.port;
308 return 1;
309}
310
311/* Print out the per-protocol part of the tuple. */
312static int tcp_print_tuple(struct seq_file *s,
313 const struct ip_conntrack_tuple *tuple)
314{
315 return seq_printf(s, "sport=%hu dport=%hu ",
316 ntohs(tuple->src.u.tcp.port),
317 ntohs(tuple->dst.u.tcp.port));
318}
319
320/* Print out the private part of the conntrack. */
321static int tcp_print_conntrack(struct seq_file *s,
322 const struct ip_conntrack *conntrack)
323{
324 enum tcp_conntrack state;
325
326 read_lock_bh(&tcp_lock);
327 state = conntrack->proto.tcp.state;
328 read_unlock_bh(&tcp_lock);
329
330 return seq_printf(s, "%s ", tcp_conntrack_names[state]);
331}
332
333#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
334 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
335static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa,
336 const struct ip_conntrack *ct)
337{
338 struct nfattr *nest_parms;
339
340 read_lock_bh(&tcp_lock);
341 nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP);
342 NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t),
343 &ct->proto.tcp.state);
344 read_unlock_bh(&tcp_lock);
345
346 NFA_NEST_END(skb, nest_parms);
347
348 return 0;
349
350nfattr_failure:
351 read_unlock_bh(&tcp_lock);
352 return -1;
353}
354
355static const size_t cta_min_tcp[CTA_PROTOINFO_TCP_MAX] = {
356 [CTA_PROTOINFO_TCP_STATE-1] = sizeof(u_int8_t),
357};
358
359static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct)
360{
361 struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1];
362 struct nfattr *tb[CTA_PROTOINFO_TCP_MAX];
363
364 /* updates could not contain anything about the private
365 * protocol info, in that case skip the parsing */
366 if (!attr)
367 return 0;
368
369 nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr);
370
371 if (nfattr_bad_size(tb, CTA_PROTOINFO_TCP_MAX, cta_min_tcp))
372 return -EINVAL;
373
374 if (!tb[CTA_PROTOINFO_TCP_STATE-1])
375 return -EINVAL;
376
377 write_lock_bh(&tcp_lock);
378 ct->proto.tcp.state =
379 *(u_int8_t *)NFA_DATA(tb[CTA_PROTOINFO_TCP_STATE-1]);
380 write_unlock_bh(&tcp_lock);
381
382 return 0;
383}
384#endif
385
386static unsigned int get_conntrack_index(const struct tcphdr *tcph)
387{
388 if (tcph->rst) return TCP_RST_SET;
389 else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
390 else if (tcph->fin) return TCP_FIN_SET;
391 else if (tcph->ack) return TCP_ACK_SET;
392 else return TCP_NONE_SET;
393}
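
/* Note the precedence above: RST wins over SYN, SYN over FIN, and so on.
 * A segment with both RST and ACK set therefore yields TCP_RST_SET,
 * while a SYN/ACK yields TCP_SYNACK_SET.
 */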
394
395/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
396 in IP Filter' by Guido van Rooij.
397
398 http://www.nluug.nl/events/sane2000/papers.html
399 http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz
400
401 The boundaries and the conditions are changed according to RFC793:
402 the packet must intersect the window (i.e. segments may be
403 after the right or before the left edge) and thus receivers may ACK
404 segments after the right edge of the window.
405
406 td_maxend = max(sack + max(win,1)) seen in reply packets
407 td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
408 td_maxwin += seq + len - sender.td_maxend
409 if seq + len > sender.td_maxend
410 td_end = max(seq + len) seen in sent packets
411
412 I. Upper bound for valid data: seq <= sender.td_maxend
413 II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin
414 III. Upper bound for valid ack: sack <= receiver.td_end
415 IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW
416
417 where sack is the highest right edge of sack block found in the packet.
418
419 The upper bound limit for a valid ack is not ignored -
 420   we don't have to deal with fragments.
421*/
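
/* A minimal standalone sketch of the four bounds above, assuming the
 * before()/after() sequence-number helpers from <net/tcp.h>; parameter
 * names are illustrative, mirroring the td_* fields used below:
 */
static inline int td_bounds_ok_sketch(__u32 seq, __u32 end, __u32 ack,
				      __u32 sack, __u32 sender_maxend,
				      __u32 sender_end, __u32 receiver_maxwin,
				      __u32 receiver_end, __u32 maxackwindow)
{
	return before(seq, sender_maxend + 1) &&		   /* I.   */
	       after(end, sender_end - receiver_maxwin - 1) &&	   /* II.  */
	       before(sack, receiver_end + 1) &&		   /* III. */
	       after(ack, receiver_end - maxackwindow);		   /* IV.  */
}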
422
423static inline __u32 segment_seq_plus_len(__u32 seq,
424 size_t len,
425 struct iphdr *iph,
426 struct tcphdr *tcph)
427{
428 return (seq + len - (iph->ihl + tcph->doff)*4
429 + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
430}
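
/* Worked example: a bare SYN with seq = 1000 in a 40-byte packet
 * (20-byte IP header, ihl = 5; 20-byte TCP header, doff = 5) gives
 * end = 1000 + 40 - (5 + 5) * 4 + 1 + 0 = 1001: the SYN consumes one
 * sequence number and carries no data.
 */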
431
432/* Fixme: what about big packets? */
433#define MAXACKWINCONST 66000
434#define MAXACKWINDOW(sender) \
435 ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \
436 : MAXACKWINCONST)
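
/* E.g. a sender with td_maxwin = 5840 gets the ACK window clamped up
 * to 66000, while td_maxwin = 70000 is used as-is.
 */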
437
438/*
439 * Simplified tcp_parse_options routine from tcp_input.c
440 */
441static void tcp_options(const struct sk_buff *skb,
442 struct iphdr *iph,
443 struct tcphdr *tcph,
444 struct ip_ct_tcp_state *state)
445{
446 unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
447 unsigned char *ptr;
448 int length = (tcph->doff*4) - sizeof(struct tcphdr);
449
450 if (!length)
451 return;
452
453 ptr = skb_header_pointer(skb,
454 (iph->ihl * 4) + sizeof(struct tcphdr),
455 length, buff);
456 BUG_ON(ptr == NULL);
457
458 state->td_scale =
459 state->flags = 0;
460
461 while (length > 0) {
462 int opcode=*ptr++;
463 int opsize;
464
465 switch (opcode) {
466 case TCPOPT_EOL:
467 return;
468 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
469 length--;
470 continue;
471 default:
472 opsize=*ptr++;
473 if (opsize < 2) /* "silly options" */
474 return;
475 if (opsize > length)
476 break; /* don't parse partial options */
477
478 if (opcode == TCPOPT_SACK_PERM
479 && opsize == TCPOLEN_SACK_PERM)
480 state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
481 else if (opcode == TCPOPT_WINDOW
482 && opsize == TCPOLEN_WINDOW) {
483 state->td_scale = *(u_int8_t *)ptr;
484
485 if (state->td_scale > 14) {
486 /* See RFC1323 */
487 state->td_scale = 14;
488 }
489 state->flags |=
490 IP_CT_TCP_FLAG_WINDOW_SCALE;
491 }
492 ptr += opsize - 2;
493 length -= opsize;
494 }
495 }
496}
497
498static void tcp_sack(const struct sk_buff *skb,
499 struct iphdr *iph,
500 struct tcphdr *tcph,
501 __u32 *sack)
502{
503 unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
504 unsigned char *ptr;
505 int length = (tcph->doff*4) - sizeof(struct tcphdr);
506 __u32 tmp;
507
508 if (!length)
509 return;
510
511 ptr = skb_header_pointer(skb,
512 (iph->ihl * 4) + sizeof(struct tcphdr),
513 length, buff);
514 BUG_ON(ptr == NULL);
515
516 /* Fast path for timestamp-only option */
517 if (length == TCPOLEN_TSTAMP_ALIGNED*4
518 && *(__be32 *)ptr ==
519 __constant_htonl((TCPOPT_NOP << 24)
520 | (TCPOPT_NOP << 16)
521 | (TCPOPT_TIMESTAMP << 8)
522 | TCPOLEN_TIMESTAMP))
523 return;
524
525 while (length > 0) {
526 int opcode=*ptr++;
527 int opsize, i;
528
529 switch (opcode) {
530 case TCPOPT_EOL:
531 return;
532 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
533 length--;
534 continue;
535 default:
536 opsize=*ptr++;
537 if (opsize < 2) /* "silly options" */
538 return;
539 if (opsize > length)
540 break; /* don't parse partial options */
541
542 if (opcode == TCPOPT_SACK
543 && opsize >= (TCPOLEN_SACK_BASE
544 + TCPOLEN_SACK_PERBLOCK)
545 && !((opsize - TCPOLEN_SACK_BASE)
546 % TCPOLEN_SACK_PERBLOCK)) {
547 for (i = 0;
548 i < (opsize - TCPOLEN_SACK_BASE);
549 i += TCPOLEN_SACK_PERBLOCK) {
550 tmp = ntohl(*((__be32 *)(ptr+i)+1));
551
552 if (after(tmp, *sack))
553 *sack = tmp;
554 }
555 return;
556 }
557 ptr += opsize - 2;
558 length -= opsize;
559 }
560 }
561}
562
563static int tcp_in_window(struct ip_ct_tcp *state,
564 enum ip_conntrack_dir dir,
565 unsigned int index,
566 const struct sk_buff *skb,
567 struct iphdr *iph,
568 struct tcphdr *tcph)
569{
570 struct ip_ct_tcp_state *sender = &state->seen[dir];
571 struct ip_ct_tcp_state *receiver = &state->seen[!dir];
572 __u32 seq, ack, sack, end, win, swin;
573 int res;
574
575 /*
576 * Get the required data from the packet.
577 */
578 seq = ntohl(tcph->seq);
579 ack = sack = ntohl(tcph->ack_seq);
580 win = ntohs(tcph->window);
581 end = segment_seq_plus_len(seq, skb->len, iph, tcph);
582
583 if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
584 tcp_sack(skb, iph, tcph, &sack);
585
586 DEBUGP("tcp_in_window: START\n");
587 DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
588 "seq=%u ack=%u sack=%u win=%u end=%u\n",
589 NIPQUAD(iph->saddr), ntohs(tcph->source),
590 NIPQUAD(iph->daddr), ntohs(tcph->dest),
591 seq, ack, sack, win, end);
592 DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
593 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
594 sender->td_end, sender->td_maxend, sender->td_maxwin,
595 sender->td_scale,
596 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
597 receiver->td_scale);
598
599 if (sender->td_end == 0) {
600 /*
601 * Initialize sender data.
602 */
603 if (tcph->syn && tcph->ack) {
604 /*
605 * Outgoing SYN-ACK in reply to a SYN.
606 */
607 sender->td_end =
608 sender->td_maxend = end;
609 sender->td_maxwin = (win == 0 ? 1 : win);
610
611 tcp_options(skb, iph, tcph, sender);
612 /*
613 * RFC 1323:
614 * Both sides must send the Window Scale option
615 * to enable window scaling in either direction.
616 */
617 if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
618 && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
619 sender->td_scale =
620 receiver->td_scale = 0;
621 } else {
622 /*
623 * We are in the middle of a connection,
624 * its history is lost for us.
625 * Let's try to use the data from the packet.
626 */
627 sender->td_end = end;
628 sender->td_maxwin = (win == 0 ? 1 : win);
629 sender->td_maxend = end + sender->td_maxwin;
630 }
631 } else if (((state->state == TCP_CONNTRACK_SYN_SENT
632 && dir == IP_CT_DIR_ORIGINAL)
633 || (state->state == TCP_CONNTRACK_SYN_RECV
634 && dir == IP_CT_DIR_REPLY))
635 && after(end, sender->td_end)) {
636 /*
637 * RFC 793: "if a TCP is reinitialized ... then it need
638 * not wait at all; it must only be sure to use sequence
639 * numbers larger than those recently used."
640 */
641 sender->td_end =
642 sender->td_maxend = end;
643 sender->td_maxwin = (win == 0 ? 1 : win);
644
645 tcp_options(skb, iph, tcph, sender);
646 }
647
648 if (!(tcph->ack)) {
649 /*
650 * If there is no ACK, just pretend it was set and OK.
651 */
652 ack = sack = receiver->td_end;
653 } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
654 (TCP_FLAG_ACK|TCP_FLAG_RST))
655 && (ack == 0)) {
656 /*
657 * Broken TCP stacks, that set ACK in RST packets as well
658 * with zero ack value.
659 */
660 ack = sack = receiver->td_end;
661 }
662
663 if (seq == end
664 && (!tcph->rst
665 || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)))
666 /*
 667	     * The packet contains no data: we assume it is valid
 668	     * and check the ack value only.
 669	     * However, RST segments are always validated by their
 670	     * SEQ number, except when seq == 0 (reset sent answering
 671	     * a SYN).
672 */
673 seq = end = sender->td_end;
674
675 DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
676 "seq=%u ack=%u sack =%u win=%u end=%u\n",
677 NIPQUAD(iph->saddr), ntohs(tcph->source),
678 NIPQUAD(iph->daddr), ntohs(tcph->dest),
679 seq, ack, sack, win, end);
680 DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
681 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
682 sender->td_end, sender->td_maxend, sender->td_maxwin,
683 sender->td_scale,
684 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
685 receiver->td_scale);
686
687 DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
688 before(seq, sender->td_maxend + 1),
689 after(end, sender->td_end - receiver->td_maxwin - 1),
690 before(sack, receiver->td_end + 1),
691 after(ack, receiver->td_end - MAXACKWINDOW(sender)));
692
693 if (before(seq, sender->td_maxend + 1) &&
694 after(end, sender->td_end - receiver->td_maxwin - 1) &&
695 before(sack, receiver->td_end + 1) &&
696 after(ack, receiver->td_end - MAXACKWINDOW(sender))) {
697 /*
698 * Take into account window scaling (RFC 1323).
699 */
700 if (!tcph->syn)
701 win <<= sender->td_scale;
702
703 /*
704 * Update sender data.
705 */
706 swin = win + (sack - ack);
707 if (sender->td_maxwin < swin)
708 sender->td_maxwin = swin;
709 if (after(end, sender->td_end))
710 sender->td_end = end;
711 /*
712 * Update receiver data.
713 */
714 if (after(end, sender->td_maxend))
715 receiver->td_maxwin += end - sender->td_maxend;
716 if (after(sack + win, receiver->td_maxend - 1)) {
717 receiver->td_maxend = sack + win;
718 if (win == 0)
719 receiver->td_maxend++;
720 }
721
722 /*
723 * Check retransmissions.
724 */
725 if (index == TCP_ACK_SET) {
726 if (state->last_dir == dir
727 && state->last_seq == seq
728 && state->last_ack == ack
729 && state->last_end == end
730 && state->last_win == win)
731 state->retrans++;
732 else {
733 state->last_dir = dir;
734 state->last_seq = seq;
735 state->last_ack = ack;
736 state->last_end = end;
737 state->last_win = win;
738 state->retrans = 0;
739 }
740 }
741 res = 1;
742 } else {
743 res = 0;
744 if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
745 ip_ct_tcp_be_liberal)
746 res = 1;
747 if (!res && LOG_INVALID(IPPROTO_TCP))
748 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
749 "ip_ct_tcp: %s ",
750 before(seq, sender->td_maxend + 1) ?
751 after(end, sender->td_end - receiver->td_maxwin - 1) ?
752 before(sack, receiver->td_end + 1) ?
753 after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG"
754 : "ACK is under the lower bound (possible overly delayed ACK)"
755 : "ACK is over the upper bound (ACKed data not seen yet)"
756 : "SEQ is under the lower bound (already ACKed data retransmitted)"
757 : "SEQ is over the upper bound (over the window of the receiver)");
758 }
759
760 DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u "
761 "receiver end=%u maxend=%u maxwin=%u\n",
762 res, sender->td_end, sender->td_maxend, sender->td_maxwin,
763 receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
764
765 return res;
766}
767
768#ifdef CONFIG_IP_NF_NAT_NEEDED
769/* Update sender->td_end after NAT successfully mangled the packet */
770void ip_conntrack_tcp_update(struct sk_buff *skb,
771 struct ip_conntrack *conntrack,
772 enum ip_conntrack_dir dir)
773{
774 struct iphdr *iph = skb->nh.iph;
775 struct tcphdr *tcph = (void *)skb->nh.iph + skb->nh.iph->ihl*4;
776 __u32 end;
777#ifdef DEBUGP_VARS
778 struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir];
779 struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir];
780#endif
781
782 end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph);
783
784 write_lock_bh(&tcp_lock);
785 /*
786 * We have to worry for the ack in the reply packet only...
787 */
788 if (after(end, conntrack->proto.tcp.seen[dir].td_end))
789 conntrack->proto.tcp.seen[dir].td_end = end;
790 conntrack->proto.tcp.last_end = end;
791 write_unlock_bh(&tcp_lock);
792 DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
793 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
794 sender->td_end, sender->td_maxend, sender->td_maxwin,
795 sender->td_scale,
796 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
797 receiver->td_scale);
798}
799
800#endif
801
802#define TH_FIN 0x01
803#define TH_SYN 0x02
804#define TH_RST 0x04
805#define TH_PUSH 0x08
806#define TH_ACK 0x10
807#define TH_URG 0x20
808#define TH_ECE 0x40
809#define TH_CWR 0x80
810
811/* table of valid flag combinations - ECE and CWR are always valid */
812static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] =
813{
814 [TH_SYN] = 1,
815 [TH_SYN|TH_PUSH] = 1,
816 [TH_SYN|TH_URG] = 1,
817 [TH_SYN|TH_PUSH|TH_URG] = 1,
818 [TH_SYN|TH_ACK] = 1,
819 [TH_SYN|TH_ACK|TH_PUSH] = 1,
820 [TH_RST] = 1,
821 [TH_RST|TH_ACK] = 1,
822 [TH_RST|TH_ACK|TH_PUSH] = 1,
823 [TH_FIN|TH_ACK] = 1,
824 [TH_ACK] = 1,
825 [TH_ACK|TH_PUSH] = 1,
826 [TH_ACK|TH_URG] = 1,
827 [TH_ACK|TH_URG|TH_PUSH] = 1,
828 [TH_FIN|TH_ACK|TH_PUSH] = 1,
829 [TH_FIN|TH_ACK|TH_URG] = 1,
830 [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1,
831};
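
/* Example lookups (after masking out TH_ECE and TH_CWR, as tcp_error()
 * does below): a plain ACK (0x10) is valid; SYN|FIN (0x03) has no entry
 * and is rejected as an invalid flag combination.
 */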
832
833/* Protect conntrack against broken packets. Code taken from ipt_unclean.c. */
834static int tcp_error(struct sk_buff *skb,
835 enum ip_conntrack_info *ctinfo,
836 unsigned int hooknum)
837{
838 struct iphdr *iph = skb->nh.iph;
839 struct tcphdr _tcph, *th;
840 unsigned int tcplen = skb->len - iph->ihl * 4;
841 u_int8_t tcpflags;
842
 843	/* Smaller than the minimal TCP header? */
844 th = skb_header_pointer(skb, iph->ihl * 4,
845 sizeof(_tcph), &_tcph);
846 if (th == NULL) {
847 if (LOG_INVALID(IPPROTO_TCP))
848 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
849 "ip_ct_tcp: short packet ");
850 return -NF_ACCEPT;
851 }
852
853 /* Not whole TCP header or malformed packet */
854 if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
855 if (LOG_INVALID(IPPROTO_TCP))
856 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
857 "ip_ct_tcp: truncated/malformed packet ");
858 return -NF_ACCEPT;
859 }
860
861 /* Checksum invalid? Ignore.
862 * We skip checking packets on the outgoing path
 863	 * because the checksum is assumed to be correct.
864 */
865 /* FIXME: Source route IP option packets --RR */
866 if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
867 nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) {
868 if (LOG_INVALID(IPPROTO_TCP))
869 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
870 "ip_ct_tcp: bad TCP checksum ");
871 return -NF_ACCEPT;
872 }
873
874 /* Check TCP flags. */
875 tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR));
876 if (!tcp_valid_flags[tcpflags]) {
877 if (LOG_INVALID(IPPROTO_TCP))
878 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
879 "ip_ct_tcp: invalid TCP flag combination ");
880 return -NF_ACCEPT;
881 }
882
883 return NF_ACCEPT;
884}
885
886/* Returns verdict for packet, or -1 for invalid. */
887static int tcp_packet(struct ip_conntrack *conntrack,
888 const struct sk_buff *skb,
889 enum ip_conntrack_info ctinfo)
890{
891 enum tcp_conntrack new_state, old_state;
892 enum ip_conntrack_dir dir;
893 struct iphdr *iph = skb->nh.iph;
894 struct tcphdr *th, _tcph;
895 unsigned long timeout;
896 unsigned int index;
897
898 th = skb_header_pointer(skb, iph->ihl * 4,
899 sizeof(_tcph), &_tcph);
900 BUG_ON(th == NULL);
901
902 write_lock_bh(&tcp_lock);
903 old_state = conntrack->proto.tcp.state;
904 dir = CTINFO2DIR(ctinfo);
905 index = get_conntrack_index(th);
906 new_state = tcp_conntracks[dir][index][old_state];
907
908 switch (new_state) {
909 case TCP_CONNTRACK_IGNORE:
910 /* Ignored packets:
911 *
912 * a) SYN in ORIGINAL
913 * b) SYN/ACK in REPLY
914 * c) ACK in reply direction after initial SYN in original.
915 */
916 if (index == TCP_SYNACK_SET
917 && conntrack->proto.tcp.last_index == TCP_SYN_SET
918 && conntrack->proto.tcp.last_dir != dir
919 && ntohl(th->ack_seq) ==
920 conntrack->proto.tcp.last_end) {
921 /* This SYN/ACK acknowledges a SYN that we earlier
922 * ignored as invalid. This means that the client and
923 * the server are both in sync, while the firewall is
924 * not. We kill this session and block the SYN/ACK so
925 * that the client cannot but retransmit its SYN and
926 * thus initiate a clean new session.
927 */
928 write_unlock_bh(&tcp_lock);
929 if (LOG_INVALID(IPPROTO_TCP))
930 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
931 NULL, "ip_ct_tcp: "
932 "killing out of sync session ");
933 if (del_timer(&conntrack->timeout))
934 conntrack->timeout.function((unsigned long)
935 conntrack);
936 return -NF_DROP;
937 }
938 conntrack->proto.tcp.last_index = index;
939 conntrack->proto.tcp.last_dir = dir;
940 conntrack->proto.tcp.last_seq = ntohl(th->seq);
941 conntrack->proto.tcp.last_end =
942 segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th);
943
944 write_unlock_bh(&tcp_lock);
945 if (LOG_INVALID(IPPROTO_TCP))
946 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
947 "ip_ct_tcp: invalid packet ignored ");
948 return NF_ACCEPT;
949 case TCP_CONNTRACK_MAX:
950 /* Invalid packet */
951 DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
952 dir, get_conntrack_index(th),
953 old_state);
954 write_unlock_bh(&tcp_lock);
955 if (LOG_INVALID(IPPROTO_TCP))
956 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
957 "ip_ct_tcp: invalid state ");
958 return -NF_ACCEPT;
959 case TCP_CONNTRACK_SYN_SENT:
960 if (old_state < TCP_CONNTRACK_TIME_WAIT)
961 break;
962 if ((conntrack->proto.tcp.seen[dir].flags &
963 IP_CT_TCP_FLAG_CLOSE_INIT)
964 || after(ntohl(th->seq),
965 conntrack->proto.tcp.seen[dir].td_end)) {
966 /* Attempt to reopen a closed connection.
967 * Delete this connection and look up again. */
968 write_unlock_bh(&tcp_lock);
969 if (del_timer(&conntrack->timeout))
970 conntrack->timeout.function((unsigned long)
971 conntrack);
972 return -NF_REPEAT;
973 } else {
974 write_unlock_bh(&tcp_lock);
975 if (LOG_INVALID(IPPROTO_TCP))
976 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
977 NULL, "ip_ct_tcp: invalid SYN");
978 return -NF_ACCEPT;
979 }
980 case TCP_CONNTRACK_CLOSE:
981 if (index == TCP_RST_SET
982 && ((test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)
983 && conntrack->proto.tcp.last_index == TCP_SYN_SET)
984 || (!test_bit(IPS_ASSURED_BIT, &conntrack->status)
985 && conntrack->proto.tcp.last_index == TCP_ACK_SET))
986 && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) {
987 /* RST sent to invalid SYN or ACK we had let through
988 * at a) and c) above:
989 *
990 * a) SYN was in window then
991 * c) we hold a half-open connection.
992 *
993 * Delete our connection entry.
994 * We skip window checking, because packet might ACK
995 * segments we ignored. */
996 goto in_window;
997 }
998 /* Just fall through */
999 default:
1000 /* Keep compilers happy. */
1001 break;
1002 }
1003
1004 if (!tcp_in_window(&conntrack->proto.tcp, dir, index,
1005 skb, iph, th)) {
1006 write_unlock_bh(&tcp_lock);
1007 return -NF_ACCEPT;
1008 }
1009 in_window:
 1010	/* From now on we have in-window packets */
1011 conntrack->proto.tcp.last_index = index;
1012
1013 DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
1014 "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
1015 NIPQUAD(iph->saddr), ntohs(th->source),
1016 NIPQUAD(iph->daddr), ntohs(th->dest),
1017 (th->syn ? 1 : 0), (th->ack ? 1 : 0),
1018 (th->fin ? 1 : 0), (th->rst ? 1 : 0),
1019 old_state, new_state);
1020
1021 conntrack->proto.tcp.state = new_state;
1022 if (old_state != new_state
1023 && (new_state == TCP_CONNTRACK_FIN_WAIT
1024 || new_state == TCP_CONNTRACK_CLOSE))
1025 conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
1026 timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans
1027 && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans
1028 ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
1029 write_unlock_bh(&tcp_lock);
1030
1031 ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
1032 if (new_state != old_state)
1033 ip_conntrack_event_cache(IPCT_PROTOINFO, skb);
1034
1035 if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
 1036		/* If the only reply is a RST, we can consider ourselves not to
1037 have an established connection: this is a fairly common
1038 problem case, so we can delete the conntrack
1039 immediately. --RR */
1040 if (th->rst) {
1041 if (del_timer(&conntrack->timeout))
1042 conntrack->timeout.function((unsigned long)
1043 conntrack);
1044 return NF_ACCEPT;
1045 }
1046 } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status)
1047 && (old_state == TCP_CONNTRACK_SYN_RECV
1048 || old_state == TCP_CONNTRACK_ESTABLISHED)
1049 && new_state == TCP_CONNTRACK_ESTABLISHED) {
 1050		/* Set ASSURED if we see a valid ack in ESTABLISHED
1051 after SYN_RECV or a valid answer for a picked up
1052 connection. */
1053 set_bit(IPS_ASSURED_BIT, &conntrack->status);
1054 ip_conntrack_event_cache(IPCT_STATUS, skb);
1055 }
1056 ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout);
1057
1058 return NF_ACCEPT;
1059}
1060
1061/* Called when a new connection for this protocol is found. */
1062static int tcp_new(struct ip_conntrack *conntrack,
1063 const struct sk_buff *skb)
1064{
1065 enum tcp_conntrack new_state;
1066 struct iphdr *iph = skb->nh.iph;
1067 struct tcphdr *th, _tcph;
1068#ifdef DEBUGP_VARS
1069 struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0];
1070 struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1];
1071#endif
1072
1073 th = skb_header_pointer(skb, iph->ihl * 4,
1074 sizeof(_tcph), &_tcph);
1075 BUG_ON(th == NULL);
1076
1077 /* Don't need lock here: this conntrack not in circulation yet */
1078 new_state
1079 = tcp_conntracks[0][get_conntrack_index(th)]
1080 [TCP_CONNTRACK_NONE];
1081
1082 /* Invalid: delete conntrack */
1083 if (new_state >= TCP_CONNTRACK_MAX) {
 1084		DEBUGP("ip_ct_tcp: invalid new, deleting.\n");
1085 return 0;
1086 }
1087
1088 if (new_state == TCP_CONNTRACK_SYN_SENT) {
1089 /* SYN packet */
1090 conntrack->proto.tcp.seen[0].td_end =
1091 segment_seq_plus_len(ntohl(th->seq), skb->len,
1092 iph, th);
1093 conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1094 if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
1095 conntrack->proto.tcp.seen[0].td_maxwin = 1;
1096 conntrack->proto.tcp.seen[0].td_maxend =
1097 conntrack->proto.tcp.seen[0].td_end;
1098
1099 tcp_options(skb, iph, th, &conntrack->proto.tcp.seen[0]);
1100 conntrack->proto.tcp.seen[1].flags = 0;
1101 } else if (ip_ct_tcp_loose == 0) {
1102 /* Don't try to pick up connections. */
1103 return 0;
1104 } else {
1105 /*
1106 * We are in the middle of a connection,
1107 * its history is lost for us.
1108 * Let's try to use the data from the packet.
1109 */
1110 conntrack->proto.tcp.seen[0].td_end =
1111 segment_seq_plus_len(ntohl(th->seq), skb->len,
1112 iph, th);
1113 conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1114 if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
1115 conntrack->proto.tcp.seen[0].td_maxwin = 1;
1116 conntrack->proto.tcp.seen[0].td_maxend =
1117 conntrack->proto.tcp.seen[0].td_end +
1118 conntrack->proto.tcp.seen[0].td_maxwin;
1119 conntrack->proto.tcp.seen[0].td_scale = 0;
1120
1121 /* We assume SACK and liberal window checking to handle
1122 * window scaling */
1123 conntrack->proto.tcp.seen[0].flags =
1124 conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
1125 IP_CT_TCP_FLAG_BE_LIBERAL;
1126 }
1127
1128 conntrack->proto.tcp.seen[1].td_end = 0;
1129 conntrack->proto.tcp.seen[1].td_maxend = 0;
1130 conntrack->proto.tcp.seen[1].td_maxwin = 1;
1131 conntrack->proto.tcp.seen[1].td_scale = 0;
1132
1133 /* tcp_packet will set them */
1134 conntrack->proto.tcp.state = TCP_CONNTRACK_NONE;
1135 conntrack->proto.tcp.last_index = TCP_NONE_SET;
1136
1137 DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
1138 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
1139 sender->td_end, sender->td_maxend, sender->td_maxwin,
1140 sender->td_scale,
1141 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
1142 receiver->td_scale);
1143 return 1;
1144}
1145
1146struct ip_conntrack_protocol ip_conntrack_protocol_tcp =
1147{
1148 .proto = IPPROTO_TCP,
1149 .name = "tcp",
1150 .pkt_to_tuple = tcp_pkt_to_tuple,
1151 .invert_tuple = tcp_invert_tuple,
1152 .print_tuple = tcp_print_tuple,
1153 .print_conntrack = tcp_print_conntrack,
1154 .packet = tcp_packet,
1155 .new = tcp_new,
1156 .error = tcp_error,
1157#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1158 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1159 .to_nfattr = tcp_to_nfattr,
1160 .from_nfattr = nfattr_to_tcp,
1161 .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
1162 .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
1163#endif
1164};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
deleted file mode 100644
index 14c30c646c7f..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ /dev/null
@@ -1,148 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/timer.h>
11#include <linux/netfilter.h>
12#include <linux/in.h>
13#include <linux/ip.h>
14#include <linux/udp.h>
15#include <linux/seq_file.h>
16#include <net/checksum.h>
17#include <linux/netfilter_ipv4.h>
18#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
19
20unsigned int ip_ct_udp_timeout __read_mostly = 30*HZ;
21unsigned int ip_ct_udp_timeout_stream __read_mostly = 180*HZ;
22
23static int udp_pkt_to_tuple(const struct sk_buff *skb,
24 unsigned int dataoff,
25 struct ip_conntrack_tuple *tuple)
26{
27 struct udphdr _hdr, *hp;
28
29 /* Actually only need first 8 bytes. */
30 hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
31 if (hp == NULL)
32 return 0;
33
34 tuple->src.u.udp.port = hp->source;
35 tuple->dst.u.udp.port = hp->dest;
36
37 return 1;
38}
39
40static int udp_invert_tuple(struct ip_conntrack_tuple *tuple,
41 const struct ip_conntrack_tuple *orig)
42{
43 tuple->src.u.udp.port = orig->dst.u.udp.port;
44 tuple->dst.u.udp.port = orig->src.u.udp.port;
45 return 1;
46}
47
48/* Print out the per-protocol part of the tuple. */
49static int udp_print_tuple(struct seq_file *s,
50 const struct ip_conntrack_tuple *tuple)
51{
52 return seq_printf(s, "sport=%hu dport=%hu ",
53 ntohs(tuple->src.u.udp.port),
54 ntohs(tuple->dst.u.udp.port));
55}
56
57/* Print out the private part of the conntrack. */
58static int udp_print_conntrack(struct seq_file *s,
59 const struct ip_conntrack *conntrack)
60{
61 return 0;
62}
63
64/* Returns verdict for packet, and may modify conntrack type */
65static int udp_packet(struct ip_conntrack *conntrack,
66 const struct sk_buff *skb,
67 enum ip_conntrack_info ctinfo)
68{
69 /* If we've seen traffic both ways, this is some kind of UDP
70 stream. Extend timeout. */
71 if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
72 ip_ct_refresh_acct(conntrack, ctinfo, skb,
73 ip_ct_udp_timeout_stream);
74 /* Also, more likely to be important, and not a probe */
75 if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status))
76 ip_conntrack_event_cache(IPCT_STATUS, skb);
77 } else
78 ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout);
79
80 return NF_ACCEPT;
81}
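
/* Net effect: unreplied flows expire after ip_ct_udp_timeout (30 s by
 * default), while bidirectional flows become ASSURED and are refreshed
 * with ip_ct_udp_timeout_stream (180 s by default) instead.
 */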
82
83/* Called when a new connection for this protocol is found. */
84static int udp_new(struct ip_conntrack *conntrack, const struct sk_buff *skb)
85{
86 return 1;
87}
88
89static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
90 unsigned int hooknum)
91{
92 struct iphdr *iph = skb->nh.iph;
93 unsigned int udplen = skb->len - iph->ihl * 4;
94 struct udphdr _hdr, *hdr;
95
96 /* Header is too small? */
97 hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr);
98 if (hdr == NULL) {
99 if (LOG_INVALID(IPPROTO_UDP))
100 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
101 "ip_ct_udp: short packet ");
102 return -NF_ACCEPT;
103 }
104
105 /* Truncated/malformed packets */
106 if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
107 if (LOG_INVALID(IPPROTO_UDP))
108 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
109 "ip_ct_udp: truncated/malformed packet ");
110 return -NF_ACCEPT;
111 }
112
113 /* Packet with no checksum */
114 if (!hdr->check)
115 return NF_ACCEPT;
116
117 /* Checksum invalid? Ignore.
118 * We skip checking packets on the outgoing path
119 * because the checksum is assumed to be correct.
120 * FIXME: Source route IP option packets --RR */
121 if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
122 nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) {
123 if (LOG_INVALID(IPPROTO_UDP))
124 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
125 "ip_ct_udp: bad UDP checksum ");
126 return -NF_ACCEPT;
127 }
128
129 return NF_ACCEPT;
130}
131
132struct ip_conntrack_protocol ip_conntrack_protocol_udp =
133{
134 .proto = IPPROTO_UDP,
135 .name = "udp",
136 .pkt_to_tuple = udp_pkt_to_tuple,
137 .invert_tuple = udp_invert_tuple,
138 .print_tuple = udp_print_tuple,
139 .print_conntrack = udp_print_conntrack,
140 .packet = udp_packet,
141 .new = udp_new,
142 .error = udp_error,
143#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
144 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
145 .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
146 .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
147#endif
148};
diff --git a/net/ipv4/netfilter/ip_conntrack_sip.c b/net/ipv4/netfilter/ip_conntrack_sip.c
deleted file mode 100644
index c59a962c1f61..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_sip.c
+++ /dev/null
@@ -1,520 +0,0 @@
1/* SIP extension for IP connection tracking.
2 *
3 * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
4 * based on RR's ip_conntrack_ftp.c and other modules.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/ctype.h>
13#include <linux/skbuff.h>
14#include <linux/in.h>
15#include <linux/ip.h>
16#include <linux/udp.h>
17
18#include <linux/netfilter.h>
19#include <linux/netfilter_ipv4.h>
20#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
21#include <linux/netfilter_ipv4/ip_conntrack_sip.h>
22
23#if 0
24#define DEBUGP printk
25#else
26#define DEBUGP(format, args...)
27#endif
28
29MODULE_LICENSE("GPL");
30MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
31MODULE_DESCRIPTION("SIP connection tracking helper");
32
33#define MAX_PORTS 8
34static unsigned short ports[MAX_PORTS];
35static int ports_c;
36module_param_array(ports, ushort, &ports_c, 0400);
37MODULE_PARM_DESC(ports, "port numbers of sip servers");
38
39static unsigned int sip_timeout = SIP_TIMEOUT;
40module_param(sip_timeout, uint, 0600);
41MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session");
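
/* Loading sketch (port values are illustrative): up to MAX_PORTS SIP
 * ports can be tracked at once, e.g.
 *
 *   modprobe ip_conntrack_sip ports=5060,5061 sip_timeout=3600
 */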
42
43unsigned int (*ip_nat_sip_hook)(struct sk_buff **pskb,
44 enum ip_conntrack_info ctinfo,
45 struct ip_conntrack *ct,
46 const char **dptr);
47EXPORT_SYMBOL_GPL(ip_nat_sip_hook);
48
49unsigned int (*ip_nat_sdp_hook)(struct sk_buff **pskb,
50 enum ip_conntrack_info ctinfo,
51 struct ip_conntrack_expect *exp,
52 const char *dptr);
53EXPORT_SYMBOL_GPL(ip_nat_sdp_hook);
54
55static int digits_len(const char *dptr, const char *limit, int *shift);
56static int epaddr_len(const char *dptr, const char *limit, int *shift);
57static int skp_digits_len(const char *dptr, const char *limit, int *shift);
58static int skp_epaddr_len(const char *dptr, const char *limit, int *shift);
59
60struct sip_header_nfo {
61 const char *lname;
62 const char *sname;
63 const char *ln_str;
64 size_t lnlen;
65 size_t snlen;
66 size_t ln_strlen;
67 int case_sensitive;
68 int (*match_len)(const char *, const char *, int *);
69};
70
71static struct sip_header_nfo ct_sip_hdrs[] = {
72 [POS_REG_REQ_URI] = { /* SIP REGISTER request URI */
73 .lname = "sip:",
74 .lnlen = sizeof("sip:") - 1,
75 .ln_str = ":",
76 .ln_strlen = sizeof(":") - 1,
77 .match_len = epaddr_len
78 },
79 [POS_REQ_URI] = { /* SIP request URI */
80 .lname = "sip:",
81 .lnlen = sizeof("sip:") - 1,
82 .ln_str = "@",
83 .ln_strlen = sizeof("@") - 1,
84 .match_len = epaddr_len
85 },
86 [POS_FROM] = { /* SIP From header */
87 .lname = "From:",
88 .lnlen = sizeof("From:") - 1,
89 .sname = "\r\nf:",
90 .snlen = sizeof("\r\nf:") - 1,
91 .ln_str = "sip:",
92 .ln_strlen = sizeof("sip:") - 1,
93 .match_len = skp_epaddr_len,
94 },
95 [POS_TO] = { /* SIP To header */
96 .lname = "To:",
97 .lnlen = sizeof("To:") - 1,
98 .sname = "\r\nt:",
99 .snlen = sizeof("\r\nt:") - 1,
100 .ln_str = "sip:",
101 .ln_strlen = sizeof("sip:") - 1,
102 .match_len = skp_epaddr_len,
103 },
104 [POS_VIA] = { /* SIP Via header */
105 .lname = "Via:",
106 .lnlen = sizeof("Via:") - 1,
107 .sname = "\r\nv:",
108 .snlen = sizeof("\r\nv:") - 1, /* rfc3261 "\r\n" */
109 .ln_str = "UDP ",
110 .ln_strlen = sizeof("UDP ") - 1,
111 .match_len = epaddr_len,
112 },
113 [POS_CONTACT] = { /* SIP Contact header */
114 .lname = "Contact:",
115 .lnlen = sizeof("Contact:") - 1,
116 .sname = "\r\nm:",
117 .snlen = sizeof("\r\nm:") - 1,
118 .ln_str = "sip:",
119 .ln_strlen = sizeof("sip:") - 1,
120 .match_len = skp_epaddr_len
121 },
122 [POS_CONTENT] = { /* SIP Content length header */
123 .lname = "Content-Length:",
124 .lnlen = sizeof("Content-Length:") - 1,
125 .sname = "\r\nl:",
126 .snlen = sizeof("\r\nl:") - 1,
127 .ln_str = ":",
128 .ln_strlen = sizeof(":") - 1,
129 .match_len = skp_digits_len
130 },
131 [POS_MEDIA] = { /* SDP media info */
132 .case_sensitive = 1,
133 .lname = "\nm=",
134 .lnlen = sizeof("\nm=") - 1,
135 .sname = "\rm=",
136 .snlen = sizeof("\rm=") - 1,
137 .ln_str = "audio ",
138 .ln_strlen = sizeof("audio ") - 1,
139 .match_len = digits_len
140 },
141 [POS_OWNER] = { /* SDP owner address*/
142 .case_sensitive = 1,
143 .lname = "\no=",
144 .lnlen = sizeof("\no=") - 1,
145 .sname = "\ro=",
146 .snlen = sizeof("\ro=") - 1,
147 .ln_str = "IN IP4 ",
148 .ln_strlen = sizeof("IN IP4 ") - 1,
149 .match_len = epaddr_len
150 },
151 [POS_CONNECTION] = { /* SDP connection info */
152 .case_sensitive = 1,
153 .lname = "\nc=",
154 .lnlen = sizeof("\nc=") - 1,
155 .sname = "\rc=",
156 .snlen = sizeof("\rc=") - 1,
157 .ln_str = "IN IP4 ",
158 .ln_strlen = sizeof("IN IP4 ") - 1,
159 .match_len = epaddr_len
160 },
161 [POS_SDP_HEADER] = { /* SDP version header */
162 .case_sensitive = 1,
163 .lname = "\nv=",
164 .lnlen = sizeof("\nv=") - 1,
165 .sname = "\rv=",
166 .snlen = sizeof("\rv=") - 1,
167 .ln_str = "=",
168 .ln_strlen = sizeof("=") - 1,
169 .match_len = digits_len
170 }
171};
172
173/* Get the line length until the first CR or LF is seen. */
174int ct_sip_lnlen(const char *line, const char *limit)
175{
176 const char *k = line;
177
178 while ((line <= limit) && (*line == '\r' || *line == '\n'))
179 line++;
180
181 while (line <= limit) {
182 if (*line == '\r' || *line == '\n')
183 break;
184 line++;
185 }
186 return line - k;
187}
188EXPORT_SYMBOL_GPL(ct_sip_lnlen);
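
/* Example: with line pointing at "Via: SIP/2.0/UDP host\r\n...", the
 * function returns 21, the length up to (but not including) the "\r";
 * any leading CR/LF is stepped over before the scan.
 */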
189
190/* Linear string search, case sensitive. */
191const char *ct_sip_search(const char *needle, const char *haystack,
192 size_t needle_len, size_t haystack_len,
193 int case_sensitive)
194{
195 const char *limit = haystack + (haystack_len - needle_len);
196
197 while (haystack <= limit) {
198 if (case_sensitive) {
199 if (strncmp(haystack, needle, needle_len) == 0)
200 return haystack;
201 } else {
202 if (strnicmp(haystack, needle, needle_len) == 0)
203 return haystack;
204 }
205 haystack++;
206 }
207 return NULL;
208}
209EXPORT_SYMBOL_GPL(ct_sip_search);
210
211static int digits_len(const char *dptr, const char *limit, int *shift)
212{
213 int len = 0;
214 while (dptr <= limit && isdigit(*dptr)) {
215 dptr++;
216 len++;
217 }
218 return len;
219}
220
221/* Get the digits length, skipping blank spaces. */
222static int skp_digits_len(const char *dptr, const char *limit, int *shift)
223{
224 for (; dptr <= limit && *dptr == ' '; dptr++)
225 (*shift)++;
226
227 return digits_len(dptr, limit, shift);
228}
229
230/* Simple IP address parser. */
231static int parse_ipaddr(const char *cp, const char **endp,
232 __be32 *ipaddr, const char *limit)
233{
234 unsigned long int val;
235 int i, digit = 0;
236
237 for (i = 0, *ipaddr = 0; cp <= limit && i < 4; i++) {
238 digit = 0;
239 if (!isdigit(*cp))
240 break;
241
242 val = simple_strtoul(cp, (char **)&cp, 10);
243 if (val > 0xFF)
244 return -1;
245
246 ((u_int8_t *)ipaddr)[i] = val;
247 digit = 1;
248
249 if (*cp != '.')
250 break;
251 cp++;
252 }
253 if (!digit)
254 return -1;
255
256 if (endp)
257 *endp = cp;
258
259 return 0;
260}
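
/* Usage sketch: with cp pointing at "192.168.0.1:5060", a call such as
 *
 *   parse_ipaddr(cp, &cp, &addr, limit);
 *
 * returns 0, leaves the four octets of 192.168.0.1 in *addr in network
 * byte order, and advances cp to the ':' that stopped the parse; any
 * octet above 255 makes it return -1.
 */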
261
262/* Skip the IP address and return its length. */
263static int epaddr_len(const char *dptr, const char *limit, int *shift)
264{
265 const char *aux = dptr;
266 __be32 ip;
267
268 if (parse_ipaddr(dptr, &dptr, &ip, limit) < 0) {
 269		DEBUGP("ip: %s parse failed!\n", dptr);
270 return 0;
271 }
272
273 /* Port number */
274 if (*dptr == ':') {
275 dptr++;
276 dptr += digits_len(dptr, limit, shift);
277 }
278 return dptr - aux;
279}
280
281/* Get the address length, skipping user info. */
282static int skp_epaddr_len(const char *dptr, const char *limit, int *shift)
283{
284 int s = *shift;
285
286 /* Search for @, but stop at the end of the line.
287 * We are inside a sip: URI, so we don't need to worry about
288 * continuation lines. */
289 while (dptr <= limit &&
290 *dptr != '@' && *dptr != '\r' && *dptr != '\n') {
291 (*shift)++;
292 dptr++;
293 }
294
295 if (dptr <= limit && *dptr == '@') {
296 dptr++;
297 (*shift)++;
298 } else
299 *shift = s;
300
301 return epaddr_len(dptr, limit, shift);
302}
303
304/* Returns 0 if not found, -1 error parsing. */
305int ct_sip_get_info(const char *dptr, size_t dlen,
306 unsigned int *matchoff,
307 unsigned int *matchlen,
308 enum sip_header_pos pos)
309{
310 struct sip_header_nfo *hnfo = &ct_sip_hdrs[pos];
311 const char *limit, *aux, *k = dptr;
312 int shift = 0;
313
314 limit = dptr + (dlen - hnfo->lnlen);
315
316 while (dptr <= limit) {
317 if ((strncmp(dptr, hnfo->lname, hnfo->lnlen) != 0) &&
318 (hnfo->sname == NULL ||
319 strncmp(dptr, hnfo->sname, hnfo->snlen) != 0)) {
320 dptr++;
321 continue;
322 }
323 aux = ct_sip_search(hnfo->ln_str, dptr, hnfo->ln_strlen,
324 ct_sip_lnlen(dptr, limit),
325 hnfo->case_sensitive);
326 if (!aux) {
327 DEBUGP("'%s' not found in '%s'.\n", hnfo->ln_str,
328 hnfo->lname);
329 return -1;
330 }
331 aux += hnfo->ln_strlen;
332
333 *matchlen = hnfo->match_len(aux, limit, &shift);
334 if (!*matchlen)
335 return -1;
336
337 *matchoff = (aux - k) + shift;
338
339 DEBUGP("%s match succeeded! - len: %u\n", hnfo->lname,
340 *matchlen);
341 return 1;
342 }
343 DEBUGP("%s header not found.\n", hnfo->lname);
344 return 0;
345}
346EXPORT_SYMBOL_GPL(ct_sip_get_info);
347
348static int set_expected_rtp(struct sk_buff **pskb,
349 struct ip_conntrack *ct,
350 enum ip_conntrack_info ctinfo,
351 __be32 ipaddr, u_int16_t port,
352 const char *dptr)
353{
354 struct ip_conntrack_expect *exp;
355 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
356 int ret;
357 typeof(ip_nat_sdp_hook) ip_nat_sdp;
358
359 exp = ip_conntrack_expect_alloc(ct);
360 if (exp == NULL)
361 return NF_DROP;
362
363 exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
364 exp->tuple.src.u.udp.port = 0;
365 exp->tuple.dst.ip = ipaddr;
366 exp->tuple.dst.u.udp.port = htons(port);
367 exp->tuple.dst.protonum = IPPROTO_UDP;
368
369 exp->mask.src.ip = htonl(0xFFFFFFFF);
370 exp->mask.src.u.udp.port = 0;
371 exp->mask.dst.ip = htonl(0xFFFFFFFF);
372 exp->mask.dst.u.udp.port = htons(0xFFFF);
373 exp->mask.dst.protonum = 0xFF;
374
375 exp->expectfn = NULL;
376 exp->flags = 0;
377
378 ip_nat_sdp = rcu_dereference(ip_nat_sdp_hook);
379 if (ip_nat_sdp)
380 ret = ip_nat_sdp(pskb, ctinfo, exp, dptr);
381 else {
382 if (ip_conntrack_expect_related(exp) != 0)
383 ret = NF_DROP;
384 else
385 ret = NF_ACCEPT;
386 }
387 ip_conntrack_expect_put(exp);
388
389 return ret;
390}
391
392static int sip_help(struct sk_buff **pskb,
393 struct ip_conntrack *ct,
394 enum ip_conntrack_info ctinfo)
395{
396 unsigned int dataoff, datalen;
397 const char *dptr;
398 int ret = NF_ACCEPT;
399 int matchoff, matchlen;
400 __be32 ipaddr;
401 u_int16_t port;
402 typeof(ip_nat_sip_hook) ip_nat_sip;
403
404 /* No data? */
405 dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
406 if (dataoff >= (*pskb)->len) {
407 DEBUGP("skb->len = %u\n", (*pskb)->len);
408 return NF_ACCEPT;
409 }
410
411 ip_ct_refresh(ct, *pskb, sip_timeout * HZ);
412
413 if (!skb_is_nonlinear(*pskb))
414 dptr = (*pskb)->data + dataoff;
415 else {
416 DEBUGP("Copy of skbuff not supported yet.\n");
417 goto out;
418 }
419
420 ip_nat_sip = rcu_dereference(ip_nat_sip_hook);
421 if (ip_nat_sip) {
422 if (!ip_nat_sip(pskb, ctinfo, ct, &dptr)) {
423 ret = NF_DROP;
424 goto out;
425 }
426 }
427
428 /* After this point NAT could have mangled the skb, so
429 we need to recalculate the payload length. */
430 datalen = (*pskb)->len - dataoff;
431
432 if (datalen < (sizeof("SIP/2.0 200") - 1))
433 goto out;
434
435 /* RTP info only in some SDP pkts */
436 if (memcmp(dptr, "INVITE", sizeof("INVITE") - 1) != 0 &&
437 memcmp(dptr, "SIP/2.0 200", sizeof("SIP/2.0 200") - 1) != 0) {
438 goto out;
439 }
440 /* Get ip and port address from SDP packet. */
441 if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen,
442 POS_CONNECTION) > 0) {
443
444 /* We'll drop only if there are parse problems. */
445 if (parse_ipaddr(dptr + matchoff, NULL, &ipaddr,
446 dptr + datalen) < 0) {
447 ret = NF_DROP;
448 goto out;
449 }
450 if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen,
451 POS_MEDIA) > 0) {
452
453 port = simple_strtoul(dptr + matchoff, NULL, 10);
454 if (port < 1024) {
455 ret = NF_DROP;
456 goto out;
457 }
458 ret = set_expected_rtp(pskb, ct, ctinfo,
459 ipaddr, port, dptr);
460 }
461 }
462out:
463 return ret;
464}
465
466static struct ip_conntrack_helper sip[MAX_PORTS];
467static char sip_names[MAX_PORTS][10];
468
469static void fini(void)
470{
471 int i;
472 for (i = 0; i < ports_c; i++) {
473 DEBUGP("unregistering helper for port %d\n", ports[i]);
474 ip_conntrack_helper_unregister(&sip[i]);
475 }
476}
477
478static int __init init(void)
479{
480 int i, ret;
481 char *tmpname;
482
483 if (ports_c == 0)
484 ports[ports_c++] = SIP_PORT;
485
486 for (i = 0; i < ports_c; i++) {
487 /* Create helper structure */
488 memset(&sip[i], 0, sizeof(struct ip_conntrack_helper));
489
490 sip[i].tuple.dst.protonum = IPPROTO_UDP;
491 sip[i].tuple.src.u.udp.port = htons(ports[i]);
492 sip[i].mask.src.u.udp.port = htons(0xFFFF);
493 sip[i].mask.dst.protonum = 0xFF;
494 sip[i].max_expected = 2;
495 sip[i].timeout = 3 * 60; /* 3 minutes */
496 sip[i].me = THIS_MODULE;
497 sip[i].help = sip_help;
498
499 tmpname = &sip_names[i][0];
500 if (ports[i] == SIP_PORT)
501 sprintf(tmpname, "sip");
502 else
503 sprintf(tmpname, "sip-%d", i);
504 sip[i].name = tmpname;
505
506 DEBUGP("port #%d: %d\n", i, ports[i]);
507
508 ret = ip_conntrack_helper_register(&sip[i]);
509 if (ret) {
510 printk("ERROR registering helper for port %d\n",
511 ports[i]);
512 fini();
513 return ret;
514 }
515 }
516 return 0;
517}
518
519module_init(init);
520module_exit(fini);
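/* Editor's note, not part of the original file: like the other
 * conntrack helpers in this series, the module takes a "ports" array
 * parameter, and init() above falls back to SIP_PORT when none is
 * given.  A hypothetical load line:
 *
 *	modprobe ip_conntrack_sip ports=5060,5061
 */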
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
deleted file mode 100644
index 56b2f7546d1e..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ /dev/null
@@ -1,962 +0,0 @@
1/* This file contains all the functions required for the standalone
2 ip_conntrack module.
3
4 These are not required by the compatibility layer.
5*/
6
7/* (C) 1999-2001 Paul `Rusty' Russell
8 * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
13 */
14
15#include <linux/types.h>
16#include <linux/ip.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter_ipv4.h>
19#include <linux/module.h>
20#include <linux/skbuff.h>
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/percpu.h>
24#ifdef CONFIG_SYSCTL
25#include <linux/sysctl.h>
26#endif
27#include <net/checksum.h>
28#include <net/ip.h>
29#include <net/route.h>
30
31#include <linux/netfilter_ipv4/ip_conntrack.h>
32#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
33#include <linux/netfilter_ipv4/ip_conntrack_core.h>
34#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
35
36#if 0
37#define DEBUGP printk
38#else
39#define DEBUGP(format, args...)
40#endif
41
42MODULE_LICENSE("GPL");
43
44extern atomic_t ip_conntrack_count;
45DECLARE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
46
47static int kill_proto(struct ip_conntrack *i, void *data)
48{
49 return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum ==
50 *((u_int8_t *) data));
51}
52
53#ifdef CONFIG_PROC_FS
54static int
55print_tuple(struct seq_file *s, const struct ip_conntrack_tuple *tuple,
56 struct ip_conntrack_protocol *proto)
57{
58 seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ",
59 NIPQUAD(tuple->src.ip), NIPQUAD(tuple->dst.ip));
60 return proto->print_tuple(s, tuple);
61}
62
63#ifdef CONFIG_IP_NF_CT_ACCT
64static unsigned int
65seq_print_counters(struct seq_file *s,
66 const struct ip_conntrack_counter *counter)
67{
68 return seq_printf(s, "packets=%llu bytes=%llu ",
69 (unsigned long long)counter->packets,
70 (unsigned long long)counter->bytes);
71}
72#else
73#define seq_print_counters(x, y) 0
74#endif
75
76struct ct_iter_state {
77 unsigned int bucket;
78};
79
80static struct list_head *ct_get_first(struct seq_file *seq)
81{
82 struct ct_iter_state *st = seq->private;
83
84 for (st->bucket = 0;
85 st->bucket < ip_conntrack_htable_size;
86 st->bucket++) {
87 if (!list_empty(&ip_conntrack_hash[st->bucket]))
88 return ip_conntrack_hash[st->bucket].next;
89 }
90 return NULL;
91}
92
93static struct list_head *ct_get_next(struct seq_file *seq, struct list_head *head)
94{
95 struct ct_iter_state *st = seq->private;
96
97 head = head->next;
98 while (head == &ip_conntrack_hash[st->bucket]) {
99 if (++st->bucket >= ip_conntrack_htable_size)
100 return NULL;
101 head = ip_conntrack_hash[st->bucket].next;
102 }
103 return head;
104}
105
106static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos)
107{
108 struct list_head *head = ct_get_first(seq);
109
110 if (head)
111 while (pos && (head = ct_get_next(seq, head)))
112 pos--;
113 return pos ? NULL : head;
114}
115
116static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
117{
118 read_lock_bh(&ip_conntrack_lock);
119 return ct_get_idx(seq, *pos);
120}
121
122static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
123{
124 (*pos)++;
125 return ct_get_next(s, v);
126}
127
128static void ct_seq_stop(struct seq_file *s, void *v)
129{
130 read_unlock_bh(&ip_conntrack_lock);
131}
132
133static int ct_seq_show(struct seq_file *s, void *v)
134{
135 const struct ip_conntrack_tuple_hash *hash = v;
136 const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash);
137 struct ip_conntrack_protocol *proto;
138
139 IP_NF_ASSERT(conntrack);
140
141 /* we only want to print DIR_ORIGINAL */
142 if (DIRECTION(hash))
143 return 0;
144
145 proto = __ip_conntrack_proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
146 IP_NF_ASSERT(proto);
147
148 if (seq_printf(s, "%-8s %u %ld ",
149 proto->name,
150 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum,
151 timer_pending(&conntrack->timeout)
152 ? (long)(conntrack->timeout.expires - jiffies)/HZ
153 : 0) != 0)
154 return -ENOSPC;
155
156 if (proto->print_conntrack(s, conntrack))
157 return -ENOSPC;
158
159 if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
160 proto))
161 return -ENOSPC;
162
163 if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_ORIGINAL]))
164 return -ENOSPC;
165
166 if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)))
167 if (seq_printf(s, "[UNREPLIED] "))
168 return -ENOSPC;
169
170 if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple,
171 proto))
172 return -ENOSPC;
173
174 if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_REPLY]))
175 return -ENOSPC;
176
177 if (test_bit(IPS_ASSURED_BIT, &conntrack->status))
178 if (seq_printf(s, "[ASSURED] "))
179 return -ENOSPC;
180
181#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
182 if (seq_printf(s, "mark=%u ", conntrack->mark))
183 return -ENOSPC;
184#endif
185
186#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
187 if (seq_printf(s, "secmark=%u ", conntrack->secmark))
188 return -ENOSPC;
189#endif
190
191 if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use)))
192 return -ENOSPC;
193
194 return 0;
195}
196
197static struct seq_operations ct_seq_ops = {
198 .start = ct_seq_start,
199 .next = ct_seq_next,
200 .stop = ct_seq_stop,
201 .show = ct_seq_show
202};
203
204static int ct_open(struct inode *inode, struct file *file)
205{
206 struct seq_file *seq;
207 struct ct_iter_state *st;
208 int ret;
209
210 st = kmalloc(sizeof(struct ct_iter_state), GFP_KERNEL);
211 if (st == NULL)
212 return -ENOMEM;
213 ret = seq_open(file, &ct_seq_ops);
214 if (ret)
215 goto out_free;
216 seq = file->private_data;
217 seq->private = st;
218 memset(st, 0, sizeof(struct ct_iter_state));
219 return ret;
220out_free:
221 kfree(st);
222 return ret;
223}
224
225static const struct file_operations ct_file_ops = {
226 .owner = THIS_MODULE,
227 .open = ct_open,
228 .read = seq_read,
229 .llseek = seq_lseek,
230 .release = seq_release_private,
231};
232
233/* expects */
234static void *exp_seq_start(struct seq_file *s, loff_t *pos)
235{
236 struct list_head *e = &ip_conntrack_expect_list;
237 loff_t i;
238
239 /* The seq_file API calls stop even if start fails,
240 * so we must take the lock here since stop unlocks it. */
241 read_lock_bh(&ip_conntrack_lock);
242
243 if (list_empty(e))
244 return NULL;
245
246 for (i = 0; i <= *pos; i++) {
247 e = e->next;
248 if (e == &ip_conntrack_expect_list)
249 return NULL;
250 }
251 return e;
252}
253
254static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
255{
256 struct list_head *e = v;
257
258 ++*pos;
259 e = e->next;
260
261 if (e == &ip_conntrack_expect_list)
262 return NULL;
263
264 return e;
265}
266
267static void exp_seq_stop(struct seq_file *s, void *v)
268{
269 read_unlock_bh(&ip_conntrack_lock);
270}
271
272static int exp_seq_show(struct seq_file *s, void *v)
273{
274 struct ip_conntrack_expect *expect = v;
275
276 if (expect->timeout.function)
277 seq_printf(s, "%ld ", timer_pending(&expect->timeout)
278 ? (long)(expect->timeout.expires - jiffies)/HZ : 0);
279 else
280 seq_printf(s, "- ");
281
282 seq_printf(s, "proto=%u ", expect->tuple.dst.protonum);
283
284 print_tuple(s, &expect->tuple,
285 __ip_conntrack_proto_find(expect->tuple.dst.protonum));
286 return seq_putc(s, '\n');
287}
288
289static struct seq_operations exp_seq_ops = {
290 .start = exp_seq_start,
291 .next = exp_seq_next,
292 .stop = exp_seq_stop,
293 .show = exp_seq_show
294};
295
296static int exp_open(struct inode *inode, struct file *file)
297{
298 return seq_open(file, &exp_seq_ops);
299}
300
301static const struct file_operations exp_file_ops = {
302 .owner = THIS_MODULE,
303 .open = exp_open,
304 .read = seq_read,
305 .llseek = seq_lseek,
306 .release = seq_release
307};
308
309static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
310{
311 int cpu;
312
313 if (*pos == 0)
314 return SEQ_START_TOKEN;
315
316 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
317 if (!cpu_possible(cpu))
318 continue;
319 *pos = cpu+1;
320 return &per_cpu(ip_conntrack_stat, cpu);
321 }
322
323 return NULL;
324}
325
326static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
327{
328 int cpu;
329
330 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
331 if (!cpu_possible(cpu))
332 continue;
333 *pos = cpu+1;
334 return &per_cpu(ip_conntrack_stat, cpu);
335 }
336
337 return NULL;
338}
339
340static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
341{
342}
343
344static int ct_cpu_seq_show(struct seq_file *seq, void *v)
345{
346 unsigned int nr_conntracks = atomic_read(&ip_conntrack_count);
347 struct ip_conntrack_stat *st = v;
348
349 if (v == SEQ_START_TOKEN) {
350 seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n");
351 return 0;
352 }
353
354 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
355 "%08x %08x %08x %08x %08x %08x %08x %08x \n",
356 nr_conntracks,
357 st->searched,
358 st->found,
359 st->new,
360 st->invalid,
361 st->ignore,
362 st->delete,
363 st->delete_list,
364 st->insert,
365 st->insert_failed,
366 st->drop,
367 st->early_drop,
368 st->error,
369
370 st->expect_new,
371 st->expect_create,
372 st->expect_delete
373 );
374 return 0;
375}
376
377static struct seq_operations ct_cpu_seq_ops = {
378 .start = ct_cpu_seq_start,
379 .next = ct_cpu_seq_next,
380 .stop = ct_cpu_seq_stop,
381 .show = ct_cpu_seq_show,
382};
383
384static int ct_cpu_seq_open(struct inode *inode, struct file *file)
385{
386 return seq_open(file, &ct_cpu_seq_ops);
387}
388
389static const struct file_operations ct_cpu_seq_fops = {
390 .owner = THIS_MODULE,
391 .open = ct_cpu_seq_open,
392 .read = seq_read,
393 .llseek = seq_lseek,
394 .release = seq_release, /* seq_open() allocates no private data */
395};
396#endif
397
398static unsigned int ip_confirm(unsigned int hooknum,
399 struct sk_buff **pskb,
400 const struct net_device *in,
401 const struct net_device *out,
402 int (*okfn)(struct sk_buff *))
403{
404 /* We've seen it coming out the other side: confirm it */
405 return ip_conntrack_confirm(pskb);
406}
407
408static unsigned int ip_conntrack_help(unsigned int hooknum,
409 struct sk_buff **pskb,
410 const struct net_device *in,
411 const struct net_device *out,
412 int (*okfn)(struct sk_buff *))
413{
414 struct ip_conntrack *ct;
415 enum ip_conntrack_info ctinfo;
416
417 /* This is where we call the helper: as the packet goes out. */
418 ct = ip_conntrack_get(*pskb, &ctinfo);
419 if (ct && ct->helper && ctinfo != IP_CT_RELATED + IP_CT_IS_REPLY) {
420 unsigned int ret;
421 ret = ct->helper->help(pskb, ct, ctinfo);
422 if (ret != NF_ACCEPT)
423 return ret;
424 }
425 return NF_ACCEPT;
426}
427
428static unsigned int ip_conntrack_defrag(unsigned int hooknum,
429 struct sk_buff **pskb,
430 const struct net_device *in,
431 const struct net_device *out,
432 int (*okfn)(struct sk_buff *))
433{
434#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE)
435 /* Previously seen (loopback)? Ignore. Do this before
436 fragment check. */
437 if ((*pskb)->nfct)
438 return NF_ACCEPT;
439#endif
440
441 /* Gather fragments. */
442 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
443 *pskb = ip_ct_gather_frags(*pskb,
444 hooknum == NF_IP_PRE_ROUTING ?
445 IP_DEFRAG_CONNTRACK_IN :
446 IP_DEFRAG_CONNTRACK_OUT);
447 if (!*pskb)
448 return NF_STOLEN;
449 }
450 return NF_ACCEPT;
451}
452
453static unsigned int ip_conntrack_local(unsigned int hooknum,
454 struct sk_buff **pskb,
455 const struct net_device *in,
456 const struct net_device *out,
457 int (*okfn)(struct sk_buff *))
458{
459 /* root is playing with raw sockets. */
460 if ((*pskb)->len < sizeof(struct iphdr)
461 || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
462 if (net_ratelimit())
463 printk("ipt_hook: happy cracking.\n");
464 return NF_ACCEPT;
465 }
466 return ip_conntrack_in(hooknum, pskb, in, out, okfn);
467}
468
469/* Connection tracking may drop packets, but never alters them, so
470 make it the first hook. */
471static struct nf_hook_ops ip_conntrack_ops[] = {
472 {
473 .hook = ip_conntrack_defrag,
474 .owner = THIS_MODULE,
475 .pf = PF_INET,
476 .hooknum = NF_IP_PRE_ROUTING,
477 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
478 },
479 {
480 .hook = ip_conntrack_in,
481 .owner = THIS_MODULE,
482 .pf = PF_INET,
483 .hooknum = NF_IP_PRE_ROUTING,
484 .priority = NF_IP_PRI_CONNTRACK,
485 },
486 {
487 .hook = ip_conntrack_defrag,
488 .owner = THIS_MODULE,
489 .pf = PF_INET,
490 .hooknum = NF_IP_LOCAL_OUT,
491 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
492 },
493 {
494 .hook = ip_conntrack_local,
495 .owner = THIS_MODULE,
496 .pf = PF_INET,
497 .hooknum = NF_IP_LOCAL_OUT,
498 .priority = NF_IP_PRI_CONNTRACK,
499 },
500 {
501 .hook = ip_conntrack_help,
502 .owner = THIS_MODULE,
503 .pf = PF_INET,
504 .hooknum = NF_IP_POST_ROUTING,
505 .priority = NF_IP_PRI_CONNTRACK_HELPER,
506 },
507 {
508 .hook = ip_conntrack_help,
509 .owner = THIS_MODULE,
510 .pf = PF_INET,
511 .hooknum = NF_IP_LOCAL_IN,
512 .priority = NF_IP_PRI_CONNTRACK_HELPER,
513 },
514 {
515 .hook = ip_confirm,
516 .owner = THIS_MODULE,
517 .pf = PF_INET,
518 .hooknum = NF_IP_POST_ROUTING,
519 .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
520 },
521 {
522 .hook = ip_confirm,
523 .owner = THIS_MODULE,
524 .pf = PF_INET,
525 .hooknum = NF_IP_LOCAL_IN,
526 .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
527 },
528};
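/* Editor's note, not part of the original file: the priorities above
 * fix the traversal order on each hook - NF_IP_PRI_CONNTRACK_DEFRAG
 * runs first, then NF_IP_PRI_CONNTRACK for the lookup, then
 * NF_IP_PRI_CONNTRACK_HELPER, and NF_IP_PRI_CONNTRACK_CONFIRM last -
 * so a new connection is only committed to the hash table after the
 * packet has cleared every earlier stage.
 */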
529
530/* Sysctl support */
531
532int ip_conntrack_checksum __read_mostly = 1;
533
534#ifdef CONFIG_SYSCTL
535
536/* From ip_conntrack_core.c */
537extern int ip_conntrack_max;
538extern unsigned int ip_conntrack_htable_size;
539
540/* From ip_conntrack_proto_tcp.c */
541extern unsigned int ip_ct_tcp_timeout_syn_sent;
542extern unsigned int ip_ct_tcp_timeout_syn_recv;
543extern unsigned int ip_ct_tcp_timeout_established;
544extern unsigned int ip_ct_tcp_timeout_fin_wait;
545extern unsigned int ip_ct_tcp_timeout_close_wait;
546extern unsigned int ip_ct_tcp_timeout_last_ack;
547extern unsigned int ip_ct_tcp_timeout_time_wait;
548extern unsigned int ip_ct_tcp_timeout_close;
549extern unsigned int ip_ct_tcp_timeout_max_retrans;
550extern int ip_ct_tcp_loose;
551extern int ip_ct_tcp_be_liberal;
552extern int ip_ct_tcp_max_retrans;
553
554/* From ip_conntrack_proto_udp.c */
555extern unsigned int ip_ct_udp_timeout;
556extern unsigned int ip_ct_udp_timeout_stream;
557
558/* From ip_conntrack_proto_icmp.c */
559extern unsigned int ip_ct_icmp_timeout;
560
561/* From ip_conntrack_proto_generic.c */
562extern unsigned int ip_ct_generic_timeout;
563
564/* Log invalid packets of a given protocol */
565static int log_invalid_proto_min = 0;
566static int log_invalid_proto_max = 255;
567
568static struct ctl_table_header *ip_ct_sysctl_header;
569
570static ctl_table ip_ct_sysctl_table[] = {
571 {
572 .ctl_name = NET_IPV4_NF_CONNTRACK_MAX,
573 .procname = "ip_conntrack_max",
574 .data = &ip_conntrack_max,
575 .maxlen = sizeof(int),
576 .mode = 0644,
577 .proc_handler = &proc_dointvec,
578 },
579 {
580 .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT,
581 .procname = "ip_conntrack_count",
582 .data = &ip_conntrack_count,
583 .maxlen = sizeof(int),
584 .mode = 0444,
585 .proc_handler = &proc_dointvec,
586 },
587 {
588 .ctl_name = NET_IPV4_NF_CONNTRACK_BUCKETS,
589 .procname = "ip_conntrack_buckets",
590 .data = &ip_conntrack_htable_size,
591 .maxlen = sizeof(unsigned int),
592 .mode = 0444,
593 .proc_handler = &proc_dointvec,
594 },
595 {
596 .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM,
597 .procname = "ip_conntrack_checksum",
598 .data = &ip_conntrack_checksum,
599 .maxlen = sizeof(int),
600 .mode = 0644,
601 .proc_handler = &proc_dointvec,
602 },
603 {
604 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,
605 .procname = "ip_conntrack_tcp_timeout_syn_sent",
606 .data = &ip_ct_tcp_timeout_syn_sent,
607 .maxlen = sizeof(unsigned int),
608 .mode = 0644,
609 .proc_handler = &proc_dointvec_jiffies,
610 },
611 {
612 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV,
613 .procname = "ip_conntrack_tcp_timeout_syn_recv",
614 .data = &ip_ct_tcp_timeout_syn_recv,
615 .maxlen = sizeof(unsigned int),
616 .mode = 0644,
617 .proc_handler = &proc_dointvec_jiffies,
618 },
619 {
620 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED,
621 .procname = "ip_conntrack_tcp_timeout_established",
622 .data = &ip_ct_tcp_timeout_established,
623 .maxlen = sizeof(unsigned int),
624 .mode = 0644,
625 .proc_handler = &proc_dointvec_jiffies,
626 },
627 {
628 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT,
629 .procname = "ip_conntrack_tcp_timeout_fin_wait",
630 .data = &ip_ct_tcp_timeout_fin_wait,
631 .maxlen = sizeof(unsigned int),
632 .mode = 0644,
633 .proc_handler = &proc_dointvec_jiffies,
634 },
635 {
636 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT,
637 .procname = "ip_conntrack_tcp_timeout_close_wait",
638 .data = &ip_ct_tcp_timeout_close_wait,
639 .maxlen = sizeof(unsigned int),
640 .mode = 0644,
641 .proc_handler = &proc_dointvec_jiffies,
642 },
643 {
644 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK,
645 .procname = "ip_conntrack_tcp_timeout_last_ack",
646 .data = &ip_ct_tcp_timeout_last_ack,
647 .maxlen = sizeof(unsigned int),
648 .mode = 0644,
649 .proc_handler = &proc_dointvec_jiffies,
650 },
651 {
652 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT,
653 .procname = "ip_conntrack_tcp_timeout_time_wait",
654 .data = &ip_ct_tcp_timeout_time_wait,
655 .maxlen = sizeof(unsigned int),
656 .mode = 0644,
657 .proc_handler = &proc_dointvec_jiffies,
658 },
659 {
660 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE,
661 .procname = "ip_conntrack_tcp_timeout_close",
662 .data = &ip_ct_tcp_timeout_close,
663 .maxlen = sizeof(unsigned int),
664 .mode = 0644,
665 .proc_handler = &proc_dointvec_jiffies,
666 },
667 {
668 .ctl_name = NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT,
669 .procname = "ip_conntrack_udp_timeout",
670 .data = &ip_ct_udp_timeout,
671 .maxlen = sizeof(unsigned int),
672 .mode = 0644,
673 .proc_handler = &proc_dointvec_jiffies,
674 },
675 {
676 .ctl_name = NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM,
677 .procname = "ip_conntrack_udp_timeout_stream",
678 .data = &ip_ct_udp_timeout_stream,
679 .maxlen = sizeof(unsigned int),
680 .mode = 0644,
681 .proc_handler = &proc_dointvec_jiffies,
682 },
683 {
684 .ctl_name = NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT,
685 .procname = "ip_conntrack_icmp_timeout",
686 .data = &ip_ct_icmp_timeout,
687 .maxlen = sizeof(unsigned int),
688 .mode = 0644,
689 .proc_handler = &proc_dointvec_jiffies,
690 },
691 {
692 .ctl_name = NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT,
693 .procname = "ip_conntrack_generic_timeout",
694 .data = &ip_ct_generic_timeout,
695 .maxlen = sizeof(unsigned int),
696 .mode = 0644,
697 .proc_handler = &proc_dointvec_jiffies,
698 },
699 {
700 .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID,
701 .procname = "ip_conntrack_log_invalid",
702 .data = &ip_ct_log_invalid,
703 .maxlen = sizeof(unsigned int),
704 .mode = 0644,
705 .proc_handler = &proc_dointvec_minmax,
706 .strategy = &sysctl_intvec,
707 .extra1 = &log_invalid_proto_min,
708 .extra2 = &log_invalid_proto_max,
709 },
710 {
711 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS,
712 .procname = "ip_conntrack_tcp_timeout_max_retrans",
713 .data = &ip_ct_tcp_timeout_max_retrans,
714 .maxlen = sizeof(unsigned int),
715 .mode = 0644,
716 .proc_handler = &proc_dointvec_jiffies,
717 },
718 {
719 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_LOOSE,
720 .procname = "ip_conntrack_tcp_loose",
721 .data = &ip_ct_tcp_loose,
722 .maxlen = sizeof(unsigned int),
723 .mode = 0644,
724 .proc_handler = &proc_dointvec,
725 },
726 {
727 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL,
728 .procname = "ip_conntrack_tcp_be_liberal",
729 .data = &ip_ct_tcp_be_liberal,
730 .maxlen = sizeof(unsigned int),
731 .mode = 0644,
732 .proc_handler = &proc_dointvec,
733 },
734 {
735 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS,
736 .procname = "ip_conntrack_tcp_max_retrans",
737 .data = &ip_ct_tcp_max_retrans,
738 .maxlen = sizeof(unsigned int),
739 .mode = 0644,
740 .proc_handler = &proc_dointvec,
741 },
742 { .ctl_name = 0 }
743};
744
745#define NET_IP_CONNTRACK_MAX 2089
746
747static ctl_table ip_ct_netfilter_table[] = {
748 {
749 .ctl_name = NET_IPV4_NETFILTER,
750 .procname = "netfilter",
751 .mode = 0555,
752 .child = ip_ct_sysctl_table,
753 },
754 {
755 .ctl_name = NET_IP_CONNTRACK_MAX,
756 .procname = "ip_conntrack_max",
757 .data = &ip_conntrack_max,
758 .maxlen = sizeof(int),
759 .mode = 0644,
760 .proc_handler = &proc_dointvec
761 },
762 { .ctl_name = 0 }
763};
764
765static ctl_table ip_ct_ipv4_table[] = {
766 {
767 .ctl_name = NET_IPV4,
768 .procname = "ipv4",
769 .mode = 0555,
770 .child = ip_ct_netfilter_table,
771 },
772 { .ctl_name = 0 }
773};
774
775static ctl_table ip_ct_net_table[] = {
776 {
777 .ctl_name = CTL_NET,
778 .procname = "net",
779 .mode = 0555,
780 .child = ip_ct_ipv4_table,
781 },
782 { .ctl_name = 0 }
783};
784
785EXPORT_SYMBOL(ip_ct_log_invalid);
786#endif /* CONFIG_SYSCTL */
787
788/* FIXME: Allow NULL functions and sub in pointers to generic for
789 them. --RR */
790int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto)
791{
792 int ret = 0;
793
794 write_lock_bh(&ip_conntrack_lock);
795 if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) {
796 ret = -EBUSY;
797 goto out;
798 }
799 rcu_assign_pointer(ip_ct_protos[proto->proto], proto);
800 out:
801 write_unlock_bh(&ip_conntrack_lock);
802 return ret;
803}
804
805void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto)
806{
807 write_lock_bh(&ip_conntrack_lock);
808 rcu_assign_pointer(ip_ct_protos[proto->proto],
809 &ip_conntrack_generic_protocol);
810 write_unlock_bh(&ip_conntrack_lock);
811 synchronize_rcu();
812
813 /* Remove all conntrack entries for this protocol */
814 ip_ct_iterate_cleanup(kill_proto, &proto->proto);
815}
816
817static int __init ip_conntrack_standalone_init(void)
818{
819#ifdef CONFIG_PROC_FS
820 struct proc_dir_entry *proc, *proc_exp, *proc_stat;
821#endif
822 int ret = 0;
823
824 ret = ip_conntrack_init();
825 if (ret < 0)
826 return ret;
827
828#ifdef CONFIG_PROC_FS
829 ret = -ENOMEM;
830 proc = proc_net_fops_create("ip_conntrack", 0440, &ct_file_ops);
831 if (!proc) goto cleanup_init;
832
833 proc_exp = proc_net_fops_create("ip_conntrack_expect", 0440,
834 &exp_file_ops);
835 if (!proc_exp) goto cleanup_proc;
836
837 proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat);
838 if (!proc_stat)
839 goto cleanup_proc_exp;
840
841 proc_stat->proc_fops = &ct_cpu_seq_fops;
842 proc_stat->owner = THIS_MODULE;
843#endif
844
845 ret = nf_register_hooks(ip_conntrack_ops, ARRAY_SIZE(ip_conntrack_ops));
846 if (ret < 0) {
847 printk("ip_conntrack: can't register hooks.\n");
848 goto cleanup_proc_stat;
849 }
850#ifdef CONFIG_SYSCTL
851 ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table);
852 if (ip_ct_sysctl_header == NULL) {
853 printk("ip_conntrack: can't register to sysctl.\n");
854 ret = -ENOMEM;
855 goto cleanup_hooks;
856 }
857#endif
858 return ret;
859
860#ifdef CONFIG_SYSCTL
861 cleanup_hooks:
862 nf_unregister_hooks(ip_conntrack_ops, ARRAY_SIZE(ip_conntrack_ops));
863#endif
864 cleanup_proc_stat:
865#ifdef CONFIG_PROC_FS
866 remove_proc_entry("ip_conntrack", proc_net_stat);
867 cleanup_proc_exp:
868 proc_net_remove("ip_conntrack_expect");
869 cleanup_proc:
870 proc_net_remove("ip_conntrack");
871 cleanup_init:
872#endif /* CONFIG_PROC_FS */
873 ip_conntrack_cleanup();
874 return ret;
875}
876
877static void __exit ip_conntrack_standalone_fini(void)
878{
879 synchronize_net();
880#ifdef CONFIG_SYSCTL
881 unregister_sysctl_table(ip_ct_sysctl_header);
882#endif
883 nf_unregister_hooks(ip_conntrack_ops, ARRAY_SIZE(ip_conntrack_ops));
884#ifdef CONFIG_PROC_FS
885 remove_proc_entry("ip_conntrack", proc_net_stat);
886 proc_net_remove("ip_conntrack_expect");
887 proc_net_remove("ip_conntrack");
888#endif /* CONFIG_PROC_FS */
889 ip_conntrack_cleanup();
890}
891
892module_init(ip_conntrack_standalone_init);
893module_exit(ip_conntrack_standalone_fini);
894
895/* Some modules need us, but don't depend directly on any symbol.
896 They should call this. */
897void need_conntrack(void)
898{
899}
900
901#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
902EXPORT_SYMBOL_GPL(ip_conntrack_chain);
903EXPORT_SYMBOL_GPL(ip_conntrack_expect_chain);
904EXPORT_SYMBOL_GPL(ip_conntrack_register_notifier);
905EXPORT_SYMBOL_GPL(ip_conntrack_unregister_notifier);
906EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init);
907EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache);
908#endif
909EXPORT_SYMBOL(ip_conntrack_protocol_register);
910EXPORT_SYMBOL(ip_conntrack_protocol_unregister);
911EXPORT_SYMBOL(ip_ct_get_tuple);
912EXPORT_SYMBOL(invert_tuplepr);
913EXPORT_SYMBOL(ip_conntrack_alter_reply);
914EXPORT_SYMBOL(ip_conntrack_destroyed);
915EXPORT_SYMBOL(need_conntrack);
916EXPORT_SYMBOL(ip_conntrack_helper_register);
917EXPORT_SYMBOL(ip_conntrack_helper_unregister);
918EXPORT_SYMBOL(ip_ct_iterate_cleanup);
919EXPORT_SYMBOL(__ip_ct_refresh_acct);
920
921EXPORT_SYMBOL(ip_conntrack_expect_alloc);
922EXPORT_SYMBOL(ip_conntrack_expect_put);
923EXPORT_SYMBOL_GPL(__ip_conntrack_expect_find);
924EXPORT_SYMBOL_GPL(ip_conntrack_expect_find_get);
925EXPORT_SYMBOL(ip_conntrack_expect_related);
926EXPORT_SYMBOL(ip_conntrack_unexpect_related);
927EXPORT_SYMBOL_GPL(ip_conntrack_expect_list);
928EXPORT_SYMBOL_GPL(ip_ct_unlink_expect);
929
930EXPORT_SYMBOL(ip_conntrack_tuple_taken);
931EXPORT_SYMBOL(ip_ct_gather_frags);
932EXPORT_SYMBOL(ip_conntrack_htable_size);
933EXPORT_SYMBOL(ip_conntrack_lock);
934EXPORT_SYMBOL(ip_conntrack_hash);
935EXPORT_SYMBOL(ip_conntrack_untracked);
936EXPORT_SYMBOL_GPL(ip_conntrack_find_get);
937#ifdef CONFIG_IP_NF_NAT_NEEDED
938EXPORT_SYMBOL(ip_conntrack_tcp_update);
939#endif
940
941EXPORT_SYMBOL_GPL(ip_conntrack_flush);
942EXPORT_SYMBOL_GPL(__ip_conntrack_find);
943
944EXPORT_SYMBOL_GPL(ip_conntrack_alloc);
945EXPORT_SYMBOL_GPL(ip_conntrack_free);
946EXPORT_SYMBOL_GPL(ip_conntrack_hash_insert);
947
948EXPORT_SYMBOL_GPL(ip_ct_remove_expectations);
949
950EXPORT_SYMBOL_GPL(ip_conntrack_helper_find_get);
951EXPORT_SYMBOL_GPL(ip_conntrack_helper_put);
952EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname);
953
954EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get);
955EXPORT_SYMBOL_GPL(ip_conntrack_proto_put);
956EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find);
957EXPORT_SYMBOL_GPL(ip_conntrack_checksum);
958#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
959 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
960EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr);
961EXPORT_SYMBOL_GPL(ip_ct_port_nfattr_to_tuple);
962#endif
diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c
deleted file mode 100644
index 76e175e7a972..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_tftp.c
+++ /dev/null
@@ -1,161 +0,0 @@
1/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
2 *
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License version 2 as
5 * published by the Free Software Foundation.
6 *
7 * Version: 0.0.7
8 *
9 * Thu 21 Mar 2002 Harald Welte <laforge@gnumonks.org>
10 * - port to newnat API
11 *
12 */
13
14#include <linux/module.h>
15#include <linux/ip.h>
16#include <linux/udp.h>
17
18#include <linux/netfilter.h>
19#include <linux/netfilter_ipv4/ip_tables.h>
20#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
21#include <linux/netfilter_ipv4/ip_conntrack_tftp.h>
22#include <linux/moduleparam.h>
23
24MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
25MODULE_DESCRIPTION("tftp connection tracking helper");
26MODULE_LICENSE("GPL");
27
28#define MAX_PORTS 8
29static unsigned short ports[MAX_PORTS];
30static int ports_c;
31module_param_array(ports, ushort, &ports_c, 0400);
32MODULE_PARM_DESC(ports, "port numbers of tftp servers");
33
34#if 0
35#define DEBUGP(format, args...) printk("%s:%s:" format, \
36 __FILE__, __FUNCTION__ , ## args)
37#else
38#define DEBUGP(format, args...)
39#endif
40
41unsigned int (*ip_nat_tftp_hook)(struct sk_buff **pskb,
42 enum ip_conntrack_info ctinfo,
43 struct ip_conntrack_expect *exp);
44EXPORT_SYMBOL_GPL(ip_nat_tftp_hook);
45
46static int tftp_help(struct sk_buff **pskb,
47 struct ip_conntrack *ct,
48 enum ip_conntrack_info ctinfo)
49{
50 struct tftphdr _tftph, *tfh;
51 struct ip_conntrack_expect *exp;
52 unsigned int ret = NF_ACCEPT;
53 typeof(ip_nat_tftp_hook) ip_nat_tftp;
54
55 tfh = skb_header_pointer(*pskb,
56 (*pskb)->nh.iph->ihl*4+sizeof(struct udphdr),
57 sizeof(_tftph), &_tftph);
58 if (tfh == NULL)
59 return NF_ACCEPT;
60
61 switch (ntohs(tfh->opcode)) {
62 /* RRQ and WRQ work the same way */
63 case TFTP_OPCODE_READ:
64 case TFTP_OPCODE_WRITE:
65 DEBUGP("");
66 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
67 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
68
69 exp = ip_conntrack_expect_alloc(ct);
70 if (exp == NULL)
71 return NF_DROP;
72
73 exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
74 exp->mask.src.ip = htonl(0xffffffff);
75 exp->mask.src.u.udp.port = 0;
76 exp->mask.dst.ip = htonl(0xffffffff);
77 exp->mask.dst.u.udp.port = htons(0xffff);
78 exp->mask.dst.protonum = 0xff;
79 exp->expectfn = NULL;
80 exp->flags = 0;
81
82 DEBUGP("expect: ");
83 DUMP_TUPLE(&exp->tuple);
84 DUMP_TUPLE(&exp->mask);
85 ip_nat_tftp = rcu_dereference(ip_nat_tftp_hook);
86 if (ip_nat_tftp)
87 ret = ip_nat_tftp(pskb, ctinfo, exp);
88 else if (ip_conntrack_expect_related(exp) != 0)
89 ret = NF_DROP;
90 ip_conntrack_expect_put(exp);
91 break;
92 case TFTP_OPCODE_DATA:
93 case TFTP_OPCODE_ACK:
94 DEBUGP("Data/ACK opcode\n");
95 break;
96 case TFTP_OPCODE_ERROR:
97 DEBUGP("Error opcode\n");
98 break;
99 default:
100 DEBUGP("Unknown opcode\n");
101 }
102 return NF_ACCEPT;
103}
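/* Editor's note, not part of the original file: the zero source-port
 * mask set on the expectation above is the point of this helper - a
 * TFTP server answers a read/write request from a freshly allocated
 * UDP port, so the reply must be matched with any source port.
 */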
104
105static struct ip_conntrack_helper tftp[MAX_PORTS];
106static char tftp_names[MAX_PORTS][sizeof("tftp-65535")];
107
108static void ip_conntrack_tftp_fini(void)
109{
110 int i;
111
112 for (i = 0 ; i < ports_c; i++) {
113 DEBUGP("unregistering helper for port %d\n",
114 ports[i]);
115 ip_conntrack_helper_unregister(&tftp[i]);
116 }
117}
118
119static int __init ip_conntrack_tftp_init(void)
120{
121 int i, ret;
122 char *tmpname;
123
124 if (ports_c == 0)
125 ports[ports_c++] = TFTP_PORT;
126
127 for (i = 0; i < ports_c; i++) {
128 /* Create helper structure */
129 memset(&tftp[i], 0, sizeof(struct ip_conntrack_helper));
130
131 tftp[i].tuple.dst.protonum = IPPROTO_UDP;
132 tftp[i].tuple.src.u.udp.port = htons(ports[i]);
133 tftp[i].mask.dst.protonum = 0xFF;
134 tftp[i].mask.src.u.udp.port = htons(0xFFFF);
135 tftp[i].max_expected = 1;
136 tftp[i].timeout = 5 * 60; /* 5 minutes */
137 tftp[i].me = THIS_MODULE;
138 tftp[i].help = tftp_help;
139
140 tmpname = &tftp_names[i][0];
141 if (ports[i] == TFTP_PORT)
142 sprintf(tmpname, "tftp");
143 else
144 sprintf(tmpname, "tftp-%d", i);
145 tftp[i].name = tmpname;
146
147 DEBUGP("port #%d: %d\n", i, ports[i]);
148
149 ret = ip_conntrack_helper_register(&tftp[i]);
150 if (ret) {
151 printk("ERROR registering helper for port %d\n",
152 ports[i]);
153 ip_conntrack_tftp_fini();
154 return ret;
155 }
156 }
157 return 0;
158}
159
160module_init(ip_conntrack_tftp_init);
161module_exit(ip_conntrack_tftp_fini);
diff --git a/net/ipv4/netfilter/ip_nat_amanda.c b/net/ipv4/netfilter/ip_nat_amanda.c
deleted file mode 100644
index 85df1a9aed33..000000000000
--- a/net/ipv4/netfilter/ip_nat_amanda.c
+++ /dev/null
@@ -1,85 +0,0 @@
1/* Amanda extension for TCP NAT alteration.
2 * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
3 * based on a copy of HW's ip_nat_irc.c as well as other modules
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version
8 * 2 of the License, or (at your option) any later version.
9 *
10 * Module load syntax:
11 * insmod ip_nat_amanda.o
12 */
13
14#include <linux/kernel.h>
15#include <linux/module.h>
16#include <linux/netfilter.h>
17#include <linux/skbuff.h>
18#include <linux/ip.h>
19#include <linux/udp.h>
20#include <net/tcp.h>
21#include <net/udp.h>
22
23#include <linux/netfilter_ipv4.h>
24#include <linux/netfilter_ipv4/ip_nat.h>
25#include <linux/netfilter_ipv4/ip_nat_helper.h>
26#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
27#include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
28
29
30MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
31MODULE_DESCRIPTION("Amanda NAT helper");
32MODULE_LICENSE("GPL");
33
34static unsigned int help(struct sk_buff **pskb,
35 enum ip_conntrack_info ctinfo,
36 unsigned int matchoff,
37 unsigned int matchlen,
38 struct ip_conntrack_expect *exp)
39{
40 char buffer[sizeof("65535")];
41 u_int16_t port;
42 unsigned int ret;
43
44 /* Connection comes from client. */
45 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
46 exp->dir = IP_CT_DIR_ORIGINAL;
47
48 /* When we see the expected packet, we need to NAT it the same
49 * as this one (ie. same IP: it will be TCP and master is UDP). */
50 exp->expectfn = ip_nat_follow_master;
51
52 /* Try to get same port: if not, try to change it. */
53 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
54 exp->tuple.dst.u.tcp.port = htons(port);
55 if (ip_conntrack_expect_related(exp) == 0)
56 break;
57 }
58
59 if (port == 0)
60 return NF_DROP;
61
62 sprintf(buffer, "%u", port);
63 ret = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo,
64 matchoff, matchlen,
65 buffer, strlen(buffer));
66 if (ret != NF_ACCEPT)
67 ip_conntrack_unexpect_related(exp);
68 return ret;
69}
70
71static void __exit ip_nat_amanda_fini(void)
72{
73 rcu_assign_pointer(ip_nat_amanda_hook, NULL);
74 synchronize_rcu();
75}
76
77static int __init ip_nat_amanda_init(void)
78{
79 BUG_ON(rcu_dereference(ip_nat_amanda_hook));
80 rcu_assign_pointer(ip_nat_amanda_hook, help);
81 return 0;
82}
83
84module_init(ip_nat_amanda_init);
85module_exit(ip_nat_amanda_fini);
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
deleted file mode 100644
index 40737fdbe9a7..000000000000
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ /dev/null
@@ -1,634 +0,0 @@
1/* NAT for netfilter; shared with compatibility layer. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/types.h>
13#include <linux/timer.h>
14#include <linux/skbuff.h>
15#include <linux/netfilter_ipv4.h>
16#include <linux/vmalloc.h>
17#include <net/checksum.h>
18#include <net/icmp.h>
19#include <net/ip.h>
20#include <net/tcp.h> /* For tcp_prot in getorigdst */
21#include <linux/icmp.h>
22#include <linux/udp.h>
23#include <linux/jhash.h>
24
25#include <linux/netfilter_ipv4/ip_conntrack.h>
26#include <linux/netfilter_ipv4/ip_conntrack_core.h>
27#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
28#include <linux/netfilter_ipv4/ip_nat.h>
29#include <linux/netfilter_ipv4/ip_nat_protocol.h>
30#include <linux/netfilter_ipv4/ip_nat_core.h>
31#include <linux/netfilter_ipv4/ip_nat_helper.h>
32#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
33
34#if 0
35#define DEBUGP printk
36#else
37#define DEBUGP(format, args...)
38#endif
39
40DEFINE_RWLOCK(ip_nat_lock);
41
42/* Calculated at init based on memory size */
43static unsigned int ip_nat_htable_size;
44
45static struct list_head *bysource;
46
47#define MAX_IP_NAT_PROTO 256
48static struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
49
50static inline struct ip_nat_protocol *
51__ip_nat_proto_find(u_int8_t protonum)
52{
53 return rcu_dereference(ip_nat_protos[protonum]);
54}
55
56struct ip_nat_protocol *
57ip_nat_proto_find_get(u_int8_t protonum)
58{
59 struct ip_nat_protocol *p;
60
61 rcu_read_lock();
62 p = __ip_nat_proto_find(protonum);
63 if (!try_module_get(p->me))
64 p = &ip_nat_unknown_protocol;
65 rcu_read_unlock();
66
67 return p;
68}
69EXPORT_SYMBOL_GPL(ip_nat_proto_find_get);
70
71void
72ip_nat_proto_put(struct ip_nat_protocol *p)
73{
74 module_put(p->me);
75}
76EXPORT_SYMBOL_GPL(ip_nat_proto_put);
77
78/* We keep an extra hash for each conntrack, for fast searching. */
79static inline unsigned int
80hash_by_src(const struct ip_conntrack_tuple *tuple)
81{
82 /* Original src, to ensure we map it consistently if poss. */
83 return jhash_3words((__force u32)tuple->src.ip, tuple->src.u.all,
84 tuple->dst.protonum, 0) % ip_nat_htable_size;
85}
86
87/* No one is using the conntrack by the time this is called. */
88static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
89{
90 if (!(conn->status & IPS_NAT_DONE_MASK))
91 return;
92
93 write_lock_bh(&ip_nat_lock);
94 list_del(&conn->nat.info.bysource);
95 write_unlock_bh(&ip_nat_lock);
96}
97
98/* Is this tuple already taken? (not by us) */
99int
100ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
101 const struct ip_conntrack *ignored_conntrack)
102{
103 /* Conntrack tracking doesn't keep track of outgoing tuples; only
104 incoming ones. NAT means they don't have a fixed mapping,
105 so we invert the tuple and look for the incoming reply.
106
107 We could keep a separate hash if this proves too slow. */
108 struct ip_conntrack_tuple reply;
109
110 invert_tuplepr(&reply, tuple);
111 return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
112}
113EXPORT_SYMBOL(ip_nat_used_tuple);
114
115/* If we source map this tuple so reply looks like reply_tuple, will
116 * that meet the constraints of range. */
117static int
118in_range(const struct ip_conntrack_tuple *tuple,
119 const struct ip_nat_range *range)
120{
121 struct ip_nat_protocol *proto;
122 int ret = 0;
123
124 /* If we are supposed to map IPs, then we must be in the
125 range specified, otherwise let this drag us onto a new src IP. */
126 if (range->flags & IP_NAT_RANGE_MAP_IPS) {
127 if (ntohl(tuple->src.ip) < ntohl(range->min_ip)
128 || ntohl(tuple->src.ip) > ntohl(range->max_ip))
129 return 0;
130 }
131
132 rcu_read_lock();
133 proto = __ip_nat_proto_find(tuple->dst.protonum);
134 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
135 || proto->in_range(tuple, IP_NAT_MANIP_SRC,
136 &range->min, &range->max))
137 ret = 1;
138 rcu_read_unlock();
139
140 return ret;
141}
142
143static inline int
144same_src(const struct ip_conntrack *ct,
145 const struct ip_conntrack_tuple *tuple)
146{
147 return (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
148 == tuple->dst.protonum
149 && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
150 == tuple->src.ip
151 && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
152 == tuple->src.u.all);
153}
154
155/* Only called for SRC manip */
156static int
157find_appropriate_src(const struct ip_conntrack_tuple *tuple,
158 struct ip_conntrack_tuple *result,
159 const struct ip_nat_range *range)
160{
161 unsigned int h = hash_by_src(tuple);
162 struct ip_conntrack *ct;
163
164 read_lock_bh(&ip_nat_lock);
165 list_for_each_entry(ct, &bysource[h], nat.info.bysource) {
166 if (same_src(ct, tuple)) {
167 /* Copy source part from reply tuple. */
168 invert_tuplepr(result,
169 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
170 result->dst = tuple->dst;
171
172 if (in_range(result, range)) {
173 read_unlock_bh(&ip_nat_lock);
174 return 1;
175 }
176 }
177 }
178 read_unlock_bh(&ip_nat_lock);
179 return 0;
180}
181
182/* For [FUTURE] fragmentation handling, we want the least-used
183 src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus
184 if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
185 1-65535, we don't do pro-rata allocation based on ports; we choose
186 the ip with the lowest src-ip/dst-ip/proto usage.
187*/
188static void
189find_best_ips_proto(struct ip_conntrack_tuple *tuple,
190 const struct ip_nat_range *range,
191 const struct ip_conntrack *conntrack,
192 enum ip_nat_manip_type maniptype)
193{
194 __be32 *var_ipp;
195 /* Host order */
196 u_int32_t minip, maxip, j;
197
198 /* No IP mapping? Do nothing. */
199 if (!(range->flags & IP_NAT_RANGE_MAP_IPS))
200 return;
201
202 if (maniptype == IP_NAT_MANIP_SRC)
203 var_ipp = &tuple->src.ip;
204 else
205 var_ipp = &tuple->dst.ip;
206
207 /* Fast path: only one choice. */
208 if (range->min_ip == range->max_ip) {
209 *var_ipp = range->min_ip;
210 return;
211 }
212
213 /* Hashing source and destination IPs gives a fairly even
214 * spread in practice (if there are a small number of IPs
215 * involved, there usually aren't that many connections
216 * anyway). The consistency means that servers see the same
217 * client coming from the same IP (some Internet Banking sites
218 * like this), even across reboots. */
219 minip = ntohl(range->min_ip);
220 maxip = ntohl(range->max_ip);
221 j = jhash_2words((__force u32)tuple->src.ip, (__force u32)tuple->dst.ip, 0);
222 *var_ipp = htonl(minip + j % (maxip - minip + 1));
223}
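/* Editor's note, not part of the original file: a worked example of
 * the selection above.  For a hypothetical range 10.0.0.1 - 10.0.0.4,
 * maxip - minip + 1 == 4, so the chosen address is minip + (j % 4);
 * since j hashes only the connection's source and destination IPs,
 * the same client always maps to the same address in the range.
 */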
224
225/* Manipulate the tuple into the range given. For NF_IP_POST_ROUTING,
226 * we change the source to map into the range. For NF_IP_PRE_ROUTING
227 * and NF_IP_LOCAL_OUT, we change the destination to map into the
228 * range. It might not be possible to get a unique tuple, but we try.
229 * At worst (or if we race), we will end up with a final duplicate in
230 * __ip_conntrack_confirm and drop the packet. */
231static void
232get_unique_tuple(struct ip_conntrack_tuple *tuple,
233 const struct ip_conntrack_tuple *orig_tuple,
234 const struct ip_nat_range *range,
235 struct ip_conntrack *conntrack,
236 enum ip_nat_manip_type maniptype)
237{
238 struct ip_nat_protocol *proto;
239
240 /* 1) If this srcip/proto/src-proto-part is currently mapped,
241 and that same mapping gives a unique tuple within the given
242 range, use that.
243
244 This is only required for source (ie. NAT/masq) mappings.
245 So far, we don't do local source mappings, so multiple
246 manips not an issue. */
247 if (maniptype == IP_NAT_MANIP_SRC) {
248 if (find_appropriate_src(orig_tuple, tuple, range)) {
249 DEBUGP("get_unique_tuple: Found current src map\n");
250 if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM))
251 if (!ip_nat_used_tuple(tuple, conntrack))
252 return;
253 }
254 }
255
256 /* 2) Select the least-used IP/proto combination in the given
257 range. */
258 *tuple = *orig_tuple;
259 find_best_ips_proto(tuple, range, conntrack, maniptype);
260
261 /* 3) The per-protocol part of the manip is made to map into
262 the range to make a unique tuple. */
263
264 rcu_read_lock();
265 proto = __ip_nat_proto_find(orig_tuple->dst.protonum);
266
267 /* Change protocol info to have some randomization */
268 if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) {
269 proto->unique_tuple(tuple, range, maniptype, conntrack);
270 goto out;
271 }
272
273 /* Only bother mapping if it's not already in range and unique */
274 if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
275 || proto->in_range(tuple, maniptype, &range->min, &range->max))
276 && !ip_nat_used_tuple(tuple, conntrack))
277 goto out;
278
279 /* Last chance: get the protocol to try to obtain a unique tuple. */
280 proto->unique_tuple(tuple, range, maniptype, conntrack);
281out:
282 rcu_read_unlock();
283}
284
285unsigned int
286ip_nat_setup_info(struct ip_conntrack *conntrack,
287 const struct ip_nat_range *range,
288 unsigned int hooknum)
289{
290 struct ip_conntrack_tuple curr_tuple, new_tuple;
291 struct ip_nat_info *info = &conntrack->nat.info;
292 int have_to_hash = !(conntrack->status & IPS_NAT_DONE_MASK);
293 enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);
294
295 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
296 || hooknum == NF_IP_POST_ROUTING
297 || hooknum == NF_IP_LOCAL_IN
298 || hooknum == NF_IP_LOCAL_OUT);
299 BUG_ON(ip_nat_initialized(conntrack, maniptype));
300
301 /* What we've got will look like inverse of reply. Normally
302 this is what is in the conntrack, except for prior
303 manipulations (future optimization: if num_manips == 0,
304 orig_tp =
305 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
306 invert_tuplepr(&curr_tuple,
307 &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);
308
309 get_unique_tuple(&new_tuple, &curr_tuple, range, conntrack, maniptype);
310
311 if (!ip_ct_tuple_equal(&new_tuple, &curr_tuple)) {
312 struct ip_conntrack_tuple reply;
313
314 /* Alter conntrack table so will recognize replies. */
315 invert_tuplepr(&reply, &new_tuple);
316 ip_conntrack_alter_reply(conntrack, &reply);
317
318 /* Non-atomic: we own this at the moment. */
319 if (maniptype == IP_NAT_MANIP_SRC)
320 conntrack->status |= IPS_SRC_NAT;
321 else
322 conntrack->status |= IPS_DST_NAT;
323 }
324
325 /* Place in source hash if this is the first time. */
326 if (have_to_hash) {
327 unsigned int srchash
328 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
329 .tuple);
330 write_lock_bh(&ip_nat_lock);
331 list_add(&info->bysource, &bysource[srchash]);
332 write_unlock_bh(&ip_nat_lock);
333 }
334
335 /* It's done. */
336 if (maniptype == IP_NAT_MANIP_DST)
337 set_bit(IPS_DST_NAT_DONE_BIT, &conntrack->status);
338 else
339 set_bit(IPS_SRC_NAT_DONE_BIT, &conntrack->status);
340
341 return NF_ACCEPT;
342}
343EXPORT_SYMBOL(ip_nat_setup_info);
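/* Editor's sketch, not part of the original file: roughly how a NAT
 * target (for instance the SNAT rule in ip_nat_rule.c) would invoke
 * the setup above; the single-address range is a made-up example.
 */
#if 0
	struct ip_nat_range range = {
		.flags	= IP_NAT_RANGE_MAP_IPS,
		.min_ip	= htonl(0x0a000001),	/* 10.0.0.1 */
		.max_ip	= htonl(0x0a000001),
	};

	return ip_nat_setup_info(conntrack, &range, NF_IP_POST_ROUTING);
#endif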
344
345/* Returns true if succeeded. */
346static int
347manip_pkt(u_int16_t proto,
348 struct sk_buff **pskb,
349 unsigned int iphdroff,
350 const struct ip_conntrack_tuple *target,
351 enum ip_nat_manip_type maniptype)
352{
353 struct iphdr *iph;
354 struct ip_nat_protocol *p;
355
356 if (!skb_make_writable(pskb, iphdroff + sizeof(*iph)))
357 return 0;
358
359 iph = (void *)(*pskb)->data + iphdroff;
360
361 /* Manipulate the protocol part. */
362
363 /* rcu_read_lock()ed by nf_hook_slow */
364 p = __ip_nat_proto_find(proto);
365 if (!p->manip_pkt(pskb, iphdroff, target, maniptype))
366 return 0;
367
368 iph = (void *)(*pskb)->data + iphdroff;
369
370 if (maniptype == IP_NAT_MANIP_SRC) {
371 nf_csum_replace4(&iph->check, iph->saddr, target->src.ip);
372 iph->saddr = target->src.ip;
373 } else {
374 nf_csum_replace4(&iph->check, iph->daddr, target->dst.ip);
375 iph->daddr = target->dst.ip;
376 }
377 return 1;
378}
379
380/* Do packet manipulations according to ip_nat_setup_info. */
381unsigned int ip_nat_packet(struct ip_conntrack *ct,
382 enum ip_conntrack_info ctinfo,
383 unsigned int hooknum,
384 struct sk_buff **pskb)
385{
386 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
387 unsigned long statusbit;
388 enum ip_nat_manip_type mtype = HOOK2MANIP(hooknum);
389
390 if (mtype == IP_NAT_MANIP_SRC)
391 statusbit = IPS_SRC_NAT;
392 else
393 statusbit = IPS_DST_NAT;
394
395 /* Invert if this is reply dir. */
396 if (dir == IP_CT_DIR_REPLY)
397 statusbit ^= IPS_NAT_MASK;
398
399 /* Non-atomic: these bits don't change. */
400 if (ct->status & statusbit) {
401 struct ip_conntrack_tuple target;
402
403 /* We are aiming to look like inverse of other direction. */
404 invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
405
406 if (!manip_pkt(target.dst.protonum, pskb, 0, &target, mtype))
407 return NF_DROP;
408 }
409 return NF_ACCEPT;
410}
411EXPORT_SYMBOL_GPL(ip_nat_packet);
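/* Editor's note, not part of the original file: a worked example of
 * the statusbit XOR above.  Take a connection SNATed at POST_ROUTING
 * (IPS_SRC_NAT set): an ORIGINAL-direction packet there has
 * mtype == IP_NAT_MANIP_SRC, tests IPS_SRC_NAT, and has its source
 * rewritten.  The reply hits PRE_ROUTING with
 * mtype == IP_NAT_MANIP_DST; "statusbit ^= IPS_NAT_MASK" turns the
 * IPS_DST_NAT test into IPS_SRC_NAT, which is set, so the reply's
 * destination is rewritten back to the original address.
 */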
412
413/* Dir is direction ICMP is coming from (opposite to packet it contains) */
414int ip_nat_icmp_reply_translation(struct ip_conntrack *ct,
415 enum ip_conntrack_info ctinfo,
416 unsigned int hooknum,
417 struct sk_buff **pskb)
418{
419 struct {
420 struct icmphdr icmp;
421 struct iphdr ip;
422 } *inside;
423 struct ip_conntrack_protocol *proto;
424 struct ip_conntrack_tuple inner, target;
425 int hdrlen = (*pskb)->nh.iph->ihl * 4;
426 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
427 unsigned long statusbit;
428 enum ip_nat_manip_type manip = HOOK2MANIP(hooknum);
429
430 if (!skb_make_writable(pskb, hdrlen + sizeof(*inside)))
431 return 0;
432
433 inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
434
435 /* We're actually going to mangle it beyond trivial checksum
436 adjustment, so make sure the current checksum is correct. */
437 if (nf_ip_checksum(*pskb, hooknum, hdrlen, 0))
438 return 0;
439
440 /* Must be RELATED */
441 IP_NF_ASSERT((*pskb)->nfctinfo == IP_CT_RELATED ||
442 (*pskb)->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY);
443
444 /* Redirects on non-null nats must be dropped, else they'll
445 start talking to each other without our translation, and be
446 confused... --RR */
447 if (inside->icmp.type == ICMP_REDIRECT) {
448 /* If NAT isn't finished, assume it and drop. */
449 if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
450 return 0;
451
452 if (ct->status & IPS_NAT_MASK)
453 return 0;
454 }
455
456 DEBUGP("icmp_reply_translation: translating error %p manp %u dir %s\n",
457 *pskb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
458
459 /* rcu_read_lock()ed by nf_hook_slow */
460 proto = __ip_conntrack_proto_find(inside->ip.protocol);
461 if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 +
462 sizeof(struct icmphdr) + inside->ip.ihl*4,
463 &inner, proto))
464 return 0;
465
466 /* Change inner back to look like incoming packet. We do the
467 opposite manip on this hook to normal, because it might not
468 pass all hooks (locally-generated ICMP). Consider incoming
469 packet: PREROUTING (DST manip), routing produces ICMP, goes
470 through POSTROUTING (which must correct the DST manip). */
471 if (!manip_pkt(inside->ip.protocol, pskb,
472 (*pskb)->nh.iph->ihl*4
473 + sizeof(inside->icmp),
474 &ct->tuplehash[!dir].tuple,
475 !manip))
476 return 0;
477
478 if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) {
479 /* Reload "inside": manip_pkt on the inner packet may have moved the data. */
480 inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
481 inside->icmp.checksum = 0;
482 inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
483 (*pskb)->len - hdrlen,
484 0));
485 }
486
487 /* Change the outer header to look like the reply to an incoming
488 * packet (proto 0 means don't invert the per-proto part). */
489 if (manip == IP_NAT_MANIP_SRC)
490 statusbit = IPS_SRC_NAT;
491 else
492 statusbit = IPS_DST_NAT;
493
494 /* Invert if this is reply dir. */
495 if (dir == IP_CT_DIR_REPLY)
496 statusbit ^= IPS_NAT_MASK;
497
498 if (ct->status & statusbit) {
499 invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
500 if (!manip_pkt(0, pskb, 0, &target, manip))
501 return 0;
502 }
503
504 return 1;
505}
506EXPORT_SYMBOL_GPL(ip_nat_icmp_reply_translation);
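/* Editor's note, not part of the original file: a concrete case for
 * the "opposite manip" comment above.  If a host is SNATed at
 * POST_ROUTING and a router returns an ICMP error quoting the
 * translated packet, the quoted (inner) header carries the mapped
 * source address; on the way back the inner tuple is therefore
 * manipulated with !manip to restore the original source, while the
 * outer header is rewritten like an ordinary reply so the error
 * reaches the real sender.
 */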
507
508/* Protocol registration. */
509int ip_nat_protocol_register(struct ip_nat_protocol *proto)
510{
511 int ret = 0;
512
513 write_lock_bh(&ip_nat_lock);
514 if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) {
515 ret = -EBUSY;
516 goto out;
517 }
518 rcu_assign_pointer(ip_nat_protos[proto->protonum], proto);
519 out:
520 write_unlock_bh(&ip_nat_lock);
521 return ret;
522}
523EXPORT_SYMBOL(ip_nat_protocol_register);
524
525/* No one stores the protocol anywhere; simply delete it. */
526void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
527{
528 write_lock_bh(&ip_nat_lock);
529 rcu_assign_pointer(ip_nat_protos[proto->protonum],
530 &ip_nat_unknown_protocol);
531 write_unlock_bh(&ip_nat_lock);
532 synchronize_rcu();
533}
534EXPORT_SYMBOL(ip_nat_protocol_unregister);
535
536#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
537 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
538int
539ip_nat_port_range_to_nfattr(struct sk_buff *skb,
540 const struct ip_nat_range *range)
541{
542 NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(__be16),
543 &range->min.tcp.port);
544 NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(__be16),
545 &range->max.tcp.port);
546
547 return 0;
548
549nfattr_failure:
550 return -1;
551}
552
553int
554ip_nat_port_nfattr_to_range(struct nfattr *tb[], struct ip_nat_range *range)
555{
556 int ret = 0;
557
558 /* we have to return whether we actually parsed something or not */
559
560 if (tb[CTA_PROTONAT_PORT_MIN-1]) {
561 ret = 1;
562 range->min.tcp.port =
563 *(__be16 *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]);
564 }
565
566 if (!tb[CTA_PROTONAT_PORT_MAX-1]) {
567 if (ret)
568 range->max.tcp.port = range->min.tcp.port;
569 } else {
570 ret = 1;
571 range->max.tcp.port =
572 *(__be16 *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]);
573 }
574
575 return ret;
576}
577EXPORT_SYMBOL_GPL(ip_nat_port_nfattr_to_range);
578EXPORT_SYMBOL_GPL(ip_nat_port_range_to_nfattr);
579#endif
580
581static int __init ip_nat_init(void)
582{
583 size_t i;
584
585 /* Leave them the same for the moment. */
586 ip_nat_htable_size = ip_conntrack_htable_size;
587
588 /* One vmalloc for the single bysource hash table */
589 bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size);
590 if (!bysource)
591 return -ENOMEM;
592
593 /* Sew in builtin protocols. */
594 write_lock_bh(&ip_nat_lock);
595 for (i = 0; i < MAX_IP_NAT_PROTO; i++)
596 rcu_assign_pointer(ip_nat_protos[i], &ip_nat_unknown_protocol);
597 rcu_assign_pointer(ip_nat_protos[IPPROTO_TCP], &ip_nat_protocol_tcp);
598 rcu_assign_pointer(ip_nat_protos[IPPROTO_UDP], &ip_nat_protocol_udp);
599 rcu_assign_pointer(ip_nat_protos[IPPROTO_ICMP], &ip_nat_protocol_icmp);
600 write_unlock_bh(&ip_nat_lock);
601
602 for (i = 0; i < ip_nat_htable_size; i++) {
603 INIT_LIST_HEAD(&bysource[i]);
604 }
605
606 /* FIXME: Man, this is a hack. <SIGH> */
607 IP_NF_ASSERT(rcu_dereference(ip_conntrack_destroyed) == NULL);
608 rcu_assign_pointer(ip_conntrack_destroyed, ip_nat_cleanup_conntrack);
609
610 /* Initialize fake conntrack so that NAT will skip it */
611 ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
612 return 0;
613}
614
615/* Clear NAT section of all conntracks, in case we're loaded again. */
616static int clean_nat(struct ip_conntrack *i, void *data)
617{
618 memset(&i->nat, 0, sizeof(i->nat));
619 i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
620 return 0;
621}
622
623static void __exit ip_nat_cleanup(void)
624{
625 ip_ct_iterate_cleanup(&clean_nat, NULL);
626 rcu_assign_pointer(ip_conntrack_destroyed, NULL);
627 synchronize_rcu();
628 vfree(bysource);
629}
630
631MODULE_LICENSE("GPL");
632
633module_init(ip_nat_init);
634module_exit(ip_nat_cleanup);
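
The registration pair above treats ip_nat_protos[] as a claim table: a slot still holding &ip_nat_unknown_protocol is free, anything else gets -EBUSY, and unregistering just restores the default before synchronize_rcu() lets readers drain. A minimal userspace sketch of that claim pattern, with purely illustrative names and none of the real locking/RCU discipline:

#include <errno.h>
#include <stdio.h>

#define MAX_PROTO 256

struct nat_protocol { const char *name; unsigned int protonum; };

static const struct nat_protocol unknown_protocol = { "unknown", 0 };
static const struct nat_protocol *protos[MAX_PROTO];

static int protocol_register(const struct nat_protocol *p)
{
	if (protos[p->protonum] != &unknown_protocol)
		return -EBUSY;	/* slot already claimed */
	protos[p->protonum] = p;
	return 0;
}

static void protocol_unregister(const struct nat_protocol *p)
{
	/* nothing caches the old pointer; just restore the default */
	protos[p->protonum] = &unknown_protocol;
}

int main(void)
{
	static const struct nat_protocol tcp = { "tcp", 6 };
	int i;

	for (i = 0; i < MAX_PROTO; i++)
		protos[i] = &unknown_protocol;

	printf("register tcp: %d\n", protocol_register(&tcp));       /* 0 */
	printf("register tcp again: %d\n", protocol_register(&tcp)); /* -EBUSY */
	protocol_unregister(&tcp);
	return 0;
}
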
diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c
deleted file mode 100644
index 32e01d8dffcb..000000000000
--- a/net/ipv4/netfilter/ip_nat_ftp.c
+++ /dev/null
@@ -1,180 +0,0 @@
1/* FTP extension for TCP NAT alteration. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/netfilter_ipv4.h>
13#include <linux/ip.h>
14#include <linux/tcp.h>
15#include <linux/moduleparam.h>
16#include <net/tcp.h>
17#include <linux/netfilter_ipv4/ip_nat.h>
18#include <linux/netfilter_ipv4/ip_nat_helper.h>
19#include <linux/netfilter_ipv4/ip_nat_rule.h>
20#include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
21#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
22
23MODULE_LICENSE("GPL");
24MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
25MODULE_DESCRIPTION("ftp NAT helper");
26
27#if 0
28#define DEBUGP printk
29#else
30#define DEBUGP(format, args...)
31#endif
32
33/* FIXME: Time out? --RR */
34
35static int
36mangle_rfc959_packet(struct sk_buff **pskb,
37 __be32 newip,
38 u_int16_t port,
39 unsigned int matchoff,
40 unsigned int matchlen,
41 struct ip_conntrack *ct,
42 enum ip_conntrack_info ctinfo,
43 u32 *seq)
44{
45 char buffer[sizeof("nnn,nnn,nnn,nnn,nnn,nnn")];
46
47 sprintf(buffer, "%u,%u,%u,%u,%u,%u",
48 NIPQUAD(newip), port>>8, port&0xFF);
49
50 DEBUGP("calling ip_nat_mangle_tcp_packet\n");
51
52 *seq += strlen(buffer) - matchlen;
53 return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff,
54 matchlen, buffer, strlen(buffer));
55}
56
57/* |1|132.235.1.2|6275| */
58static int
59mangle_eprt_packet(struct sk_buff **pskb,
60 __be32 newip,
61 u_int16_t port,
62 unsigned int matchoff,
63 unsigned int matchlen,
64 struct ip_conntrack *ct,
65 enum ip_conntrack_info ctinfo,
66 u32 *seq)
67{
68 char buffer[sizeof("|1|255.255.255.255|65535|")];
69
70 sprintf(buffer, "|1|%u.%u.%u.%u|%u|", NIPQUAD(newip), port);
71
72 DEBUGP("calling ip_nat_mangle_tcp_packet\n");
73
74 *seq += strlen(buffer) - matchlen;
75 return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff,
76 matchlen, buffer, strlen(buffer));
77}
78
79/* |||6275| */
80static int
81mangle_epsv_packet(struct sk_buff **pskb,
82 __be32 newip,
83 u_int16_t port,
84 unsigned int matchoff,
85 unsigned int matchlen,
86 struct ip_conntrack *ct,
87 enum ip_conntrack_info ctinfo,
88 u32 *seq)
89{
90 char buffer[sizeof("|||65535|")];
91
92 sprintf(buffer, "|||%u|", port);
93
94 DEBUGP("calling ip_nat_mangle_tcp_packet\n");
95
96 *seq += strlen(buffer) - matchlen;
97 return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff,
98 matchlen, buffer, strlen(buffer));
99}
100
101static int (*mangle[])(struct sk_buff **, __be32, u_int16_t,
102 unsigned int,
103 unsigned int,
104 struct ip_conntrack *,
105 enum ip_conntrack_info,
106 u32 *seq)
107= { [IP_CT_FTP_PORT] = mangle_rfc959_packet,
108 [IP_CT_FTP_PASV] = mangle_rfc959_packet,
109 [IP_CT_FTP_EPRT] = mangle_eprt_packet,
110 [IP_CT_FTP_EPSV] = mangle_epsv_packet
111};
112
113/* So, this packet has hit the connection tracking matching code.
114 Mangle it, and change the expectation to match the new version. */
115static unsigned int ip_nat_ftp(struct sk_buff **pskb,
116 enum ip_conntrack_info ctinfo,
117 enum ip_ct_ftp_type type,
118 unsigned int matchoff,
119 unsigned int matchlen,
120 struct ip_conntrack_expect *exp,
121 u32 *seq)
122{
123 __be32 newip;
124 u_int16_t port;
125 int dir = CTINFO2DIR(ctinfo);
126 struct ip_conntrack *ct = exp->master;
127
128 DEBUGP("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
129
130 /* Connection will come from wherever this packet goes, hence !dir */
131 newip = ct->tuplehash[!dir].tuple.dst.ip;
132 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
133 exp->dir = !dir;
134
135	/* When you see the packet, we need to NAT it the same way as
136	 * this one. */
137 exp->expectfn = ip_nat_follow_master;
138
139 /* Try to get same port: if not, try to change it. */
140 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
141 exp->tuple.dst.u.tcp.port = htons(port);
142 if (ip_conntrack_expect_related(exp) == 0)
143 break;
144 }
145
146 if (port == 0)
147 return NF_DROP;
148
149 if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo,
150 seq)) {
151 ip_conntrack_unexpect_related(exp);
152 return NF_DROP;
153 }
154 return NF_ACCEPT;
155}
156
157static void __exit ip_nat_ftp_fini(void)
158{
159 rcu_assign_pointer(ip_nat_ftp_hook, NULL);
160 synchronize_rcu();
161}
162
163static int __init ip_nat_ftp_init(void)
164{
165 BUG_ON(rcu_dereference(ip_nat_ftp_hook));
166 rcu_assign_pointer(ip_nat_ftp_hook, ip_nat_ftp);
167 return 0;
168}
169
170/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
171static int warn_set(const char *val, struct kernel_param *kp)
172{
173 printk(KERN_INFO KBUILD_MODNAME
174 ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
175 return 0;
176}
177module_param_call(ports, warn_set, NULL, NULL, 0);
178
179module_init(ip_nat_ftp_init);
180module_exit(ip_nat_ftp_fini);
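
As a side note on the buffer mangle_rfc959_packet() builds: "%u,%u,%u,%u,%u,%u" is the h1,h2,h3,h4,p1,p2 encoding from RFC 959 — four address octets followed by the port split into its high and low bytes. A standalone sketch of that encoding (rfc959_encode() is a hypothetical helper, not part of the module):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

static void rfc959_encode(char *buf, size_t len,
			  uint32_t addr_be, uint16_t port)
{
	const unsigned char *a = (const unsigned char *)&addr_be;

	/* addr_be is in network byte order, so the octets already
	 * appear in textual order; the port is written as two
	 * decimal bytes, high then low */
	snprintf(buf, len, "%u,%u,%u,%u,%u,%u",
		 a[0], a[1], a[2], a[3], port >> 8, port & 0xFF);
}

int main(void)
{
	char buf[sizeof("nnn,nnn,nnn,nnn,nnn,nnn")];

	rfc959_encode(buf, sizeof(buf), inet_addr("132.235.1.2"), 6275);
	printf("%s\n", buf);	/* 132,235,1,2,24,131 */
	return 0;
}
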
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
deleted file mode 100644
index dc778cfef58b..000000000000
--- a/net/ipv4/netfilter/ip_nat_helper.c
+++ /dev/null
@@ -1,436 +0,0 @@
1/* ip_nat_helper.c - generic support functions for NAT helpers
2 *
3 * (C) 2000-2002 Harald Welte <laforge@netfilter.org>
4 * (C) 2003-2004 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * 14 Jan 2002 Harald Welte <laforge@gnumonks.org>:
11 * - add support for SACK adjustment
12 * 14 Mar 2002 Harald Welte <laforge@gnumonks.org>:
13 * - merge SACK support into newnat API
14 * 16 Aug 2002 Brian J. Murrell <netfilter@interlinx.bc.ca>:
15 * - make ip_nat_resize_packet more generic (TCP and UDP)
16 * - add ip_nat_mangle_udp_packet
17 */
18#include <linux/module.h>
19#include <linux/kmod.h>
20#include <linux/types.h>
21#include <linux/timer.h>
22#include <linux/skbuff.h>
23#include <linux/netfilter_ipv4.h>
24#include <net/checksum.h>
25#include <net/icmp.h>
26#include <net/ip.h>
27#include <net/tcp.h>
28#include <net/udp.h>
29
30#include <linux/netfilter_ipv4/ip_conntrack.h>
31#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
32#include <linux/netfilter_ipv4/ip_nat.h>
33#include <linux/netfilter_ipv4/ip_nat_protocol.h>
34#include <linux/netfilter_ipv4/ip_nat_core.h>
35#include <linux/netfilter_ipv4/ip_nat_helper.h>
36
37#if 0
38#define DEBUGP printk
39#define DUMP_OFFSET(x) printk("offset_before=%d, offset_after=%d, correction_pos=%u\n", x->offset_before, x->offset_after, x->correction_pos);
40#else
41#define DEBUGP(format, args...)
42#define DUMP_OFFSET(x)
43#endif
44
45static DEFINE_SPINLOCK(ip_nat_seqofs_lock);
46
47/* Setup TCP sequence correction given this change at this sequence */
48static inline void
49adjust_tcp_sequence(u32 seq,
50 int sizediff,
51 struct ip_conntrack *ct,
52 enum ip_conntrack_info ctinfo)
53{
54 int dir;
55 struct ip_nat_seq *this_way, *other_way;
56
57	DEBUGP("adjust_tcp_sequence: seq = %u, sizediff = %d\n",
58	       seq, sizediff);
59
60 dir = CTINFO2DIR(ctinfo);
61
62 this_way = &ct->nat.info.seq[dir];
63 other_way = &ct->nat.info.seq[!dir];
64
65	DEBUGP("adjust_tcp_sequence: Seq_offset before: ");
66 DUMP_OFFSET(this_way);
67
68 spin_lock_bh(&ip_nat_seqofs_lock);
69
70 /* SYN adjust. If it's uninitialized, or this is after last
71 * correction, record it: we don't handle more than one
72 * adjustment in the window, but do deal with common case of a
73 * retransmit */
74 if (this_way->offset_before == this_way->offset_after
75 || before(this_way->correction_pos, seq)) {
76 this_way->correction_pos = seq;
77 this_way->offset_before = this_way->offset_after;
78 this_way->offset_after += sizediff;
79 }
80 spin_unlock_bh(&ip_nat_seqofs_lock);
81
82	DEBUGP("adjust_tcp_sequence: Seq_offset after: ");
83 DUMP_OFFSET(this_way);
84}
85
86/* Frobs data inside this packet, which is linear. */
87static void mangle_contents(struct sk_buff *skb,
88 unsigned int dataoff,
89 unsigned int match_offset,
90 unsigned int match_len,
91 const char *rep_buffer,
92 unsigned int rep_len)
93{
94 unsigned char *data;
95
96 BUG_ON(skb_is_nonlinear(skb));
97 data = (unsigned char *)skb->nh.iph + dataoff;
98
99 /* move post-replacement */
100 memmove(data + match_offset + rep_len,
101 data + match_offset + match_len,
102 skb->tail - (data + match_offset + match_len));
103
104 /* insert data from buffer */
105 memcpy(data + match_offset, rep_buffer, rep_len);
106
107 /* update skb info */
108 if (rep_len > match_len) {
109 DEBUGP("ip_nat_mangle_packet: Extending packet by "
110 "%u from %u bytes\n", rep_len - match_len,
111 skb->len);
112 skb_put(skb, rep_len - match_len);
113 } else {
114 DEBUGP("ip_nat_mangle_packet: Shrinking packet from "
115 "%u from %u bytes\n", match_len - rep_len,
116 skb->len);
117 __skb_trim(skb, skb->len + rep_len - match_len);
118 }
119
120 /* fix IP hdr checksum information */
121 skb->nh.iph->tot_len = htons(skb->len);
122 ip_send_check(skb->nh.iph);
123}
124
125/* Unusual, but possible case. */
126static int enlarge_skb(struct sk_buff **pskb, unsigned int extra)
127{
128 struct sk_buff *nskb;
129
130 if ((*pskb)->len + extra > 65535)
131 return 0;
132
133 nskb = skb_copy_expand(*pskb, skb_headroom(*pskb), extra, GFP_ATOMIC);
134 if (!nskb)
135 return 0;
136
137 /* Transfer socket to new skb. */
138 if ((*pskb)->sk)
139 skb_set_owner_w(nskb, (*pskb)->sk);
140 kfree_skb(*pskb);
141 *pskb = nskb;
142 return 1;
143}
144
145/* Generic function for mangling variable-length address changes inside
146 * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
147 * command in FTP).
148 *
149 * Takes care of all the nasty sequence number changes, checksumming,
150 * skb enlargement, ...
151 */
153int
154ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
155 struct ip_conntrack *ct,
156 enum ip_conntrack_info ctinfo,
157 unsigned int match_offset,
158 unsigned int match_len,
159 const char *rep_buffer,
160 unsigned int rep_len)
161{
162 struct iphdr *iph;
163 struct tcphdr *tcph;
164 int oldlen, datalen;
165
166 if (!skb_make_writable(pskb, (*pskb)->len))
167 return 0;
168
169 if (rep_len > match_len
170 && rep_len - match_len > skb_tailroom(*pskb)
171 && !enlarge_skb(pskb, rep_len - match_len))
172 return 0;
173
174 SKB_LINEAR_ASSERT(*pskb);
175
176 iph = (*pskb)->nh.iph;
177 tcph = (void *)iph + iph->ihl*4;
178
179 oldlen = (*pskb)->len - iph->ihl*4;
180 mangle_contents(*pskb, iph->ihl*4 + tcph->doff*4,
181 match_offset, match_len, rep_buffer, rep_len);
182
183 datalen = (*pskb)->len - iph->ihl*4;
184 if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) {
185 tcph->check = 0;
186 tcph->check = tcp_v4_check(datalen,
187 iph->saddr, iph->daddr,
188 csum_partial((char *)tcph,
189 datalen, 0));
190 } else
191 nf_proto_csum_replace2(&tcph->check, *pskb,
192 htons(oldlen), htons(datalen), 1);
193
194 if (rep_len != match_len) {
195 set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
196 adjust_tcp_sequence(ntohl(tcph->seq),
197 (int)rep_len - (int)match_len,
198 ct, ctinfo);
199 /* Tell TCP window tracking about seq change */
200 ip_conntrack_tcp_update(*pskb, ct, CTINFO2DIR(ctinfo));
201 }
202 return 1;
203}
204EXPORT_SYMBOL(ip_nat_mangle_tcp_packet);
205
206/* Generic function for mangling variable-length address changes inside
207 * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
208 * command in the Amanda protocol)
209 *
210 * Takes care of all the nasty checksumming, skb enlargement, ...
212 *
213 * XXX - This function could be merged with ip_nat_mangle_tcp_packet which
214 * should be fairly easy to do.
215 */
216int
217ip_nat_mangle_udp_packet(struct sk_buff **pskb,
218 struct ip_conntrack *ct,
219 enum ip_conntrack_info ctinfo,
220 unsigned int match_offset,
221 unsigned int match_len,
222 const char *rep_buffer,
223 unsigned int rep_len)
224{
225 struct iphdr *iph;
226 struct udphdr *udph;
227 int datalen, oldlen;
228
229 /* UDP helpers might accidentally mangle the wrong packet */
230 iph = (*pskb)->nh.iph;
231 if ((*pskb)->len < iph->ihl*4 + sizeof(*udph) +
232 match_offset + match_len)
233 return 0;
234
235 if (!skb_make_writable(pskb, (*pskb)->len))
236 return 0;
237
238 if (rep_len > match_len
239 && rep_len - match_len > skb_tailroom(*pskb)
240 && !enlarge_skb(pskb, rep_len - match_len))
241 return 0;
242
243 iph = (*pskb)->nh.iph;
244 udph = (void *)iph + iph->ihl*4;
245
246 oldlen = (*pskb)->len - iph->ihl*4;
247 mangle_contents(*pskb, iph->ihl*4 + sizeof(*udph),
248 match_offset, match_len, rep_buffer, rep_len);
249
250 /* update the length of the UDP packet */
251 datalen = (*pskb)->len - iph->ihl*4;
252 udph->len = htons(datalen);
253
254 /* fix udp checksum if udp checksum was previously calculated */
255 if (!udph->check && (*pskb)->ip_summed != CHECKSUM_PARTIAL)
256 return 1;
257
258 if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) {
259 udph->check = 0;
260 udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
261 datalen, IPPROTO_UDP,
262 csum_partial((char *)udph,
263 datalen, 0));
264 if (!udph->check)
265 udph->check = CSUM_MANGLED_0;
266 } else
267 nf_proto_csum_replace2(&udph->check, *pskb,
268 htons(oldlen), htons(datalen), 1);
269 return 1;
270}
271EXPORT_SYMBOL(ip_nat_mangle_udp_packet);
272
273/* Adjust one found SACK option including checksum correction */
274static void
275sack_adjust(struct sk_buff *skb,
276 struct tcphdr *tcph,
277 unsigned int sackoff,
278 unsigned int sackend,
279 struct ip_nat_seq *natseq)
280{
281 while (sackoff < sackend) {
282 struct tcp_sack_block_wire *sack;
283 __be32 new_start_seq, new_end_seq;
284
285 sack = (void *)skb->data + sackoff;
286 if (after(ntohl(sack->start_seq) - natseq->offset_before,
287 natseq->correction_pos))
288 new_start_seq = htonl(ntohl(sack->start_seq)
289 - natseq->offset_after);
290 else
291 new_start_seq = htonl(ntohl(sack->start_seq)
292 - natseq->offset_before);
293
294 if (after(ntohl(sack->end_seq) - natseq->offset_before,
295 natseq->correction_pos))
296 new_end_seq = htonl(ntohl(sack->end_seq)
297 - natseq->offset_after);
298 else
299 new_end_seq = htonl(ntohl(sack->end_seq)
300 - natseq->offset_before);
301
302 DEBUGP("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n",
303 ntohl(sack->start_seq), new_start_seq,
304 ntohl(sack->end_seq), new_end_seq);
305
306 nf_proto_csum_replace4(&tcph->check, skb,
307 sack->start_seq, new_start_seq, 0);
308 nf_proto_csum_replace4(&tcph->check, skb,
309 sack->end_seq, new_end_seq, 0);
310 sack->start_seq = new_start_seq;
311 sack->end_seq = new_end_seq;
312 sackoff += sizeof(*sack);
313 }
314}
315
316/* TCP SACK sequence number adjustment */
317static inline unsigned int
318ip_nat_sack_adjust(struct sk_buff **pskb,
319 struct tcphdr *tcph,
320 struct ip_conntrack *ct,
321 enum ip_conntrack_info ctinfo)
322{
323 unsigned int dir, optoff, optend;
324
325 optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr);
326 optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4;
327
328 if (!skb_make_writable(pskb, optend))
329 return 0;
330
331 dir = CTINFO2DIR(ctinfo);
332
333 while (optoff < optend) {
334 /* Usually: option, length. */
335 unsigned char *op = (*pskb)->data + optoff;
336
337 switch (op[0]) {
338 case TCPOPT_EOL:
339 return 1;
340 case TCPOPT_NOP:
341 optoff++;
342 continue;
343 default:
344 /* no partial options */
345 if (optoff + 1 == optend
346 || optoff + op[1] > optend
347 || op[1] < 2)
348 return 0;
349 if (op[0] == TCPOPT_SACK
350 && op[1] >= 2+TCPOLEN_SACK_PERBLOCK
351 && ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0)
352 sack_adjust(*pskb, tcph, optoff+2,
353 optoff+op[1],
354 &ct->nat.info.seq[!dir]);
355 optoff += op[1];
356 }
357 }
358 return 1;
359}
360
361/* TCP sequence number adjustment. Returns 1 on success, 0 on failure */
362int
363ip_nat_seq_adjust(struct sk_buff **pskb,
364 struct ip_conntrack *ct,
365 enum ip_conntrack_info ctinfo)
366{
367 struct tcphdr *tcph;
368 int dir;
369 __be32 newseq, newack;
370 struct ip_nat_seq *this_way, *other_way;
371
372 dir = CTINFO2DIR(ctinfo);
373
374 this_way = &ct->nat.info.seq[dir];
375 other_way = &ct->nat.info.seq[!dir];
376
377 if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
378 return 0;
379
380 tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
381 if (after(ntohl(tcph->seq), this_way->correction_pos))
382 newseq = htonl(ntohl(tcph->seq) + this_way->offset_after);
383 else
384 newseq = htonl(ntohl(tcph->seq) + this_way->offset_before);
385
386 if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
387 other_way->correction_pos))
388 newack = htonl(ntohl(tcph->ack_seq) - other_way->offset_after);
389 else
390 newack = htonl(ntohl(tcph->ack_seq) - other_way->offset_before);
391
392 nf_proto_csum_replace4(&tcph->check, *pskb, tcph->seq, newseq, 0);
393 nf_proto_csum_replace4(&tcph->check, *pskb, tcph->ack_seq, newack, 0);
394
395 DEBUGP("Adjusting sequence number from %u->%u, ack from %u->%u\n",
396 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
397 ntohl(newack));
398
399 tcph->seq = newseq;
400 tcph->ack_seq = newack;
401
402 if (!ip_nat_sack_adjust(pskb, tcph, ct, ctinfo))
403 return 0;
404
405 ip_conntrack_tcp_update(*pskb, ct, dir);
406
407 return 1;
408}
409EXPORT_SYMBOL(ip_nat_seq_adjust);
410
411/* Setup NAT on this expected conntrack so it follows master. */
412/* If we fail to get a free NAT slot, we'll get dropped on confirm */
413void ip_nat_follow_master(struct ip_conntrack *ct,
414 struct ip_conntrack_expect *exp)
415{
416 struct ip_nat_range range;
417
418 /* This must be a fresh one. */
419 BUG_ON(ct->status & IPS_NAT_DONE_MASK);
420
421 /* Change src to where master sends to */
422 range.flags = IP_NAT_RANGE_MAP_IPS;
423 range.min_ip = range.max_ip
424 = ct->master->tuplehash[!exp->dir].tuple.dst.ip;
425 /* hook doesn't matter, but it has to do source manip */
426 ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
427
428 /* For DST manip, map port here to where it's expected. */
429 range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
430 range.min = range.max = exp->saved_proto;
431 range.min_ip = range.max_ip
432 = ct->master->tuplehash[!exp->dir].tuple.src.ip;
433 /* hook doesn't matter, but it has to do destination manip */
434 ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
435}
436EXPORT_SYMBOL(ip_nat_follow_master);
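
The core idea in adjust_tcp_sequence()/ip_nat_seq_adjust() above is that each direction remembers at most one size change: correction_pos marks where the latest mangle happened, offset_before applies to retransmits of older data, offset_after to everything past that point. A compilable userspace model of the outgoing-seq translation step (field names mirror struct ip_nat_seq; the rest is illustrative):

#include <stdint.h>
#include <stdio.h>

struct nat_seq {
	uint32_t correction_pos; /* seq where the size change happened */
	int16_t offset_before;   /* cumulative offset before that point */
	int16_t offset_after;    /* cumulative offset from that point on */
};

/* after(a, b): is seq a later than b, modulo 2^32 wraparound? */
static int after(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) > 0;
}

static uint32_t translate_seq(const struct nat_seq *ns, uint32_t seq)
{
	if (after(seq, ns->correction_pos))
		return seq + ns->offset_after;
	return seq + ns->offset_before;
}

int main(void)
{
	/* a mangle at seq 1000 grew the payload by 3 bytes */
	struct nat_seq ns = { 1000, 0, 3 };

	printf("%u -> %u\n", 900u, (unsigned)translate_seq(&ns, 900));   /* 900 */
	printf("%u -> %u\n", 1500u, (unsigned)translate_seq(&ns, 1500)); /* 1503 */
	return 0;
}
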
diff --git a/net/ipv4/netfilter/ip_nat_helper_h323.c b/net/ipv4/netfilter/ip_nat_helper_h323.c
deleted file mode 100644
index bdc99ef6159e..000000000000
--- a/net/ipv4/netfilter/ip_nat_helper_h323.c
+++ /dev/null
@@ -1,611 +0,0 @@
1/*
2 * H.323 extension for NAT alteration.
3 *
4 * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
5 *
6 * This source code is licensed under the GNU General Public License version 2.
7 *
8 * Based on the 'brute force' H.323 NAT module by
9 * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
10 */
11
12#include <linux/module.h>
13#include <linux/netfilter_ipv4.h>
14#include <linux/netfilter.h>
15#include <linux/ip.h>
16#include <linux/tcp.h>
17#include <linux/moduleparam.h>
18#include <net/tcp.h>
19#include <linux/netfilter_ipv4/ip_nat.h>
20#include <linux/netfilter_ipv4/ip_nat_helper.h>
21#include <linux/netfilter_ipv4/ip_nat_rule.h>
22#include <linux/netfilter_ipv4/ip_conntrack_tuple.h>
23#include <linux/netfilter_ipv4/ip_conntrack_h323.h>
24#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
25
26#if 0
27#define DEBUGP printk
28#else
29#define DEBUGP(format, args...)
30#endif
31
32/****************************************************************************/
33static int set_addr(struct sk_buff **pskb,
34 unsigned char **data, int dataoff,
35 unsigned int addroff, __be32 ip, u_int16_t port)
36{
37 enum ip_conntrack_info ctinfo;
38 struct ip_conntrack *ct = ip_conntrack_get(*pskb, &ctinfo);
39 struct {
40 __be32 ip;
41 __be16 port;
42 } __attribute__ ((__packed__)) buf;
43 struct tcphdr _tcph, *th;
44
45 buf.ip = ip;
46 buf.port = htons(port);
47 addroff += dataoff;
48
49 if ((*pskb)->nh.iph->protocol == IPPROTO_TCP) {
50 if (!ip_nat_mangle_tcp_packet(pskb, ct, ctinfo,
51 addroff, sizeof(buf),
52 (char *) &buf, sizeof(buf))) {
53 if (net_ratelimit())
54 printk("ip_nat_h323: ip_nat_mangle_tcp_packet"
55 " error\n");
56 return -1;
57 }
58
59 /* Relocate data pointer */
60 th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl * 4,
61 sizeof(_tcph), &_tcph);
62 if (th == NULL)
63 return -1;
64 *data = (*pskb)->data + (*pskb)->nh.iph->ihl * 4 +
65 th->doff * 4 + dataoff;
66 } else {
67 if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo,
68 addroff, sizeof(buf),
69 (char *) &buf, sizeof(buf))) {
70 if (net_ratelimit())
71 printk("ip_nat_h323: ip_nat_mangle_udp_packet"
72 " error\n");
73 return -1;
74 }
75 /* ip_nat_mangle_udp_packet uses skb_make_writable() to copy
76 * or pull everything in a linear buffer, so we can safely
77 * use the skb pointers now */
78 *data = (*pskb)->data + (*pskb)->nh.iph->ihl * 4 +
79 sizeof(struct udphdr);
80 }
81
82 return 0;
83}
84
85/****************************************************************************/
86static int set_h225_addr(struct sk_buff **pskb,
87 unsigned char **data, int dataoff,
88 TransportAddress * addr,
89 __be32 ip, u_int16_t port)
90{
91 return set_addr(pskb, data, dataoff, addr->ipAddress.ip, ip, port);
92}
93
94/****************************************************************************/
95static int set_h245_addr(struct sk_buff **pskb,
96 unsigned char **data, int dataoff,
97 H245_TransportAddress * addr,
98 __be32 ip, u_int16_t port)
99{
100 return set_addr(pskb, data, dataoff,
101 addr->unicastAddress.iPAddress.network, ip, port);
102}
103
104/****************************************************************************/
105static int set_sig_addr(struct sk_buff **pskb, struct ip_conntrack *ct,
106 enum ip_conntrack_info ctinfo,
107 unsigned char **data,
108 TransportAddress * addr, int count)
109{
110 struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
111 int dir = CTINFO2DIR(ctinfo);
112 int i;
113 __be32 ip;
114 u_int16_t port;
115
116 for (i = 0; i < count; i++) {
117 if (get_h225_addr(*data, &addr[i], &ip, &port)) {
118 if (ip == ct->tuplehash[dir].tuple.src.ip &&
119 port == info->sig_port[dir]) {
120 /* GW->GK */
121
122 /* Fix for Gnomemeeting */
123 if (i > 0 &&
124 get_h225_addr(*data, &addr[0],
125 &ip, &port) &&
126 (ntohl(ip) & 0xff000000) == 0x7f000000)
127 i = 0;
128
129 DEBUGP
130 ("ip_nat_ras: set signal address "
131 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
132 NIPQUAD(ip), port,
133 NIPQUAD(ct->tuplehash[!dir].tuple.dst.
134 ip), info->sig_port[!dir]);
135 return set_h225_addr(pskb, data, 0, &addr[i],
136 ct->tuplehash[!dir].
137 tuple.dst.ip,
138 info->sig_port[!dir]);
139 } else if (ip == ct->tuplehash[dir].tuple.dst.ip &&
140 port == info->sig_port[dir]) {
141 /* GK->GW */
142 DEBUGP
143 ("ip_nat_ras: set signal address "
144 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
145 NIPQUAD(ip), port,
146 NIPQUAD(ct->tuplehash[!dir].tuple.src.
147 ip), info->sig_port[!dir]);
148 return set_h225_addr(pskb, data, 0, &addr[i],
149 ct->tuplehash[!dir].
150 tuple.src.ip,
151 info->sig_port[!dir]);
152 }
153 }
154 }
155
156 return 0;
157}
158
159/****************************************************************************/
160static int set_ras_addr(struct sk_buff **pskb, struct ip_conntrack *ct,
161 enum ip_conntrack_info ctinfo,
162 unsigned char **data,
163 TransportAddress * addr, int count)
164{
165 int dir = CTINFO2DIR(ctinfo);
166 int i;
167 __be32 ip;
168 u_int16_t port;
169
170 for (i = 0; i < count; i++) {
171 if (get_h225_addr(*data, &addr[i], &ip, &port) &&
172 ip == ct->tuplehash[dir].tuple.src.ip &&
173 port == ntohs(ct->tuplehash[dir].tuple.src.u.udp.port)) {
174 DEBUGP("ip_nat_ras: set rasAddress "
175 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
176 NIPQUAD(ip), port,
177 NIPQUAD(ct->tuplehash[!dir].tuple.dst.ip),
178 ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.
179 port));
180 return set_h225_addr(pskb, data, 0, &addr[i],
181 ct->tuplehash[!dir].tuple.dst.ip,
182 ntohs(ct->tuplehash[!dir].tuple.
183 dst.u.udp.port));
184 }
185 }
186
187 return 0;
188}
189
190/****************************************************************************/
191static int nat_rtp_rtcp(struct sk_buff **pskb, struct ip_conntrack *ct,
192 enum ip_conntrack_info ctinfo,
193 unsigned char **data, int dataoff,
194 H245_TransportAddress * addr,
195 u_int16_t port, u_int16_t rtp_port,
196 struct ip_conntrack_expect *rtp_exp,
197 struct ip_conntrack_expect *rtcp_exp)
198{
199 struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
200 int dir = CTINFO2DIR(ctinfo);
201 int i;
202 u_int16_t nated_port;
203
204 /* Set expectations for NAT */
205 rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port;
206 rtp_exp->expectfn = ip_nat_follow_master;
207 rtp_exp->dir = !dir;
208 rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port;
209 rtcp_exp->expectfn = ip_nat_follow_master;
210 rtcp_exp->dir = !dir;
211
212 /* Lookup existing expects */
213 for (i = 0; i < H323_RTP_CHANNEL_MAX; i++) {
214 if (info->rtp_port[i][dir] == rtp_port) {
215 /* Expected */
216
217 /* Use allocated ports first. This will refresh
218 * the expects */
219 rtp_exp->tuple.dst.u.udp.port =
220 htons(info->rtp_port[i][dir]);
221 rtcp_exp->tuple.dst.u.udp.port =
222 htons(info->rtp_port[i][dir] + 1);
223 break;
224 } else if (info->rtp_port[i][dir] == 0) {
225 /* Not expected */
226 break;
227 }
228 }
229
230	/* Ran out of expectations */
231 if (i >= H323_RTP_CHANNEL_MAX) {
232 if (net_ratelimit())
233 printk("ip_nat_h323: out of expectations\n");
234 return 0;
235 }
236
237 /* Try to get a pair of ports. */
238 for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port);
239 nated_port != 0; nated_port += 2) {
240 rtp_exp->tuple.dst.u.udp.port = htons(nated_port);
241 if (ip_conntrack_expect_related(rtp_exp) == 0) {
242 rtcp_exp->tuple.dst.u.udp.port =
243 htons(nated_port + 1);
244 if (ip_conntrack_expect_related(rtcp_exp) == 0)
245 break;
246 ip_conntrack_unexpect_related(rtp_exp);
247 }
248 }
249
250 if (nated_port == 0) { /* No port available */
251 if (net_ratelimit())
252 printk("ip_nat_h323: out of RTP ports\n");
253 return 0;
254 }
255
256 /* Modify signal */
257 if (set_h245_addr(pskb, data, dataoff, addr,
258 ct->tuplehash[!dir].tuple.dst.ip,
259 (port & 1) ? nated_port + 1 : nated_port) == 0) {
260 /* Save ports */
261 info->rtp_port[i][dir] = rtp_port;
262 info->rtp_port[i][!dir] = nated_port;
263 } else {
264 ip_conntrack_unexpect_related(rtp_exp);
265 ip_conntrack_unexpect_related(rtcp_exp);
266 return -1;
267 }
268
269 /* Success */
270 DEBUGP("ip_nat_h323: expect RTP %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
271 NIPQUAD(rtp_exp->tuple.src.ip),
272 ntohs(rtp_exp->tuple.src.u.udp.port),
273 NIPQUAD(rtp_exp->tuple.dst.ip),
274 ntohs(rtp_exp->tuple.dst.u.udp.port));
275 DEBUGP("ip_nat_h323: expect RTCP %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
276 NIPQUAD(rtcp_exp->tuple.src.ip),
277 ntohs(rtcp_exp->tuple.src.u.udp.port),
278 NIPQUAD(rtcp_exp->tuple.dst.ip),
279 ntohs(rtcp_exp->tuple.dst.u.udp.port));
280
281 return 0;
282}
283
284/****************************************************************************/
285static int nat_t120(struct sk_buff **pskb, struct ip_conntrack *ct,
286 enum ip_conntrack_info ctinfo,
287 unsigned char **data, int dataoff,
288 H245_TransportAddress * addr, u_int16_t port,
289 struct ip_conntrack_expect *exp)
290{
291 int dir = CTINFO2DIR(ctinfo);
292 u_int16_t nated_port = port;
293
294 /* Set expectations for NAT */
295 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
296 exp->expectfn = ip_nat_follow_master;
297 exp->dir = !dir;
298
299 /* Try to get same port: if not, try to change it. */
300 for (; nated_port != 0; nated_port++) {
301 exp->tuple.dst.u.tcp.port = htons(nated_port);
302 if (ip_conntrack_expect_related(exp) == 0)
303 break;
304 }
305
306 if (nated_port == 0) { /* No port available */
307 if (net_ratelimit())
308 printk("ip_nat_h323: out of TCP ports\n");
309 return 0;
310 }
311
312 /* Modify signal */
313 if (set_h245_addr(pskb, data, dataoff, addr,
314 ct->tuplehash[!dir].tuple.dst.ip, nated_port) < 0) {
315 ip_conntrack_unexpect_related(exp);
316 return -1;
317 }
318
319 DEBUGP("ip_nat_h323: expect T.120 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
320 NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port),
321 NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port));
322
323 return 0;
324}
325
326/****************************************************************************
327 * This conntrack expect function replaces ip_conntrack_h245_expect()
328 * which was set by ip_conntrack_helper_h323.c. It calls both
329 * ip_nat_follow_master() and ip_conntrack_h245_expect()
330 ****************************************************************************/
331static void ip_nat_h245_expect(struct ip_conntrack *new,
332 struct ip_conntrack_expect *this)
333{
334 ip_nat_follow_master(new, this);
335 ip_conntrack_h245_expect(new, this);
336}
337
338/****************************************************************************/
339static int nat_h245(struct sk_buff **pskb, struct ip_conntrack *ct,
340 enum ip_conntrack_info ctinfo,
341 unsigned char **data, int dataoff,
342 TransportAddress * addr, u_int16_t port,
343 struct ip_conntrack_expect *exp)
344{
345 struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
346 int dir = CTINFO2DIR(ctinfo);
347 u_int16_t nated_port = port;
348
349 /* Set expectations for NAT */
350 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
351 exp->expectfn = ip_nat_h245_expect;
352 exp->dir = !dir;
353
354 /* Check existing expects */
355 if (info->sig_port[dir] == port)
356 nated_port = info->sig_port[!dir];
357
358 /* Try to get same port: if not, try to change it. */
359 for (; nated_port != 0; nated_port++) {
360 exp->tuple.dst.u.tcp.port = htons(nated_port);
361 if (ip_conntrack_expect_related(exp) == 0)
362 break;
363 }
364
365 if (nated_port == 0) { /* No port available */
366 if (net_ratelimit())
367 printk("ip_nat_q931: out of TCP ports\n");
368 return 0;
369 }
370
371 /* Modify signal */
372 if (set_h225_addr(pskb, data, dataoff, addr,
373 ct->tuplehash[!dir].tuple.dst.ip,
374 nated_port) == 0) {
375 /* Save ports */
376 info->sig_port[dir] = port;
377 info->sig_port[!dir] = nated_port;
378 } else {
379 ip_conntrack_unexpect_related(exp);
380 return -1;
381 }
382
383 DEBUGP("ip_nat_q931: expect H.245 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
384 NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port),
385 NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port));
386
387 return 0;
388}
389
390/****************************************************************************
391 * This conntrack expect function replaces ip_conntrack_q931_expect()
392 * which was set by ip_conntrack_helper_h323.c.
393 ****************************************************************************/
394static void ip_nat_q931_expect(struct ip_conntrack *new,
395 struct ip_conntrack_expect *this)
396{
397 struct ip_nat_range range;
398
399 if (this->tuple.src.ip != 0) { /* Only accept calls from GK */
400 ip_nat_follow_master(new, this);
401 goto out;
402 }
403
404 /* This must be a fresh one. */
405 BUG_ON(new->status & IPS_NAT_DONE_MASK);
406
407 /* Change src to where master sends to */
408 range.flags = IP_NAT_RANGE_MAP_IPS;
409 range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.ip;
410
411 /* hook doesn't matter, but it has to do source manip */
412 ip_nat_setup_info(new, &range, NF_IP_POST_ROUTING);
413
414 /* For DST manip, map port here to where it's expected. */
415 range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
416 range.min = range.max = this->saved_proto;
417 range.min_ip = range.max_ip =
418 new->master->tuplehash[!this->dir].tuple.src.ip;
419
420 /* hook doesn't matter, but it has to do destination manip */
421 ip_nat_setup_info(new, &range, NF_IP_PRE_ROUTING);
422
423 out:
424 ip_conntrack_q931_expect(new, this);
425}
426
427/****************************************************************************/
428static int nat_q931(struct sk_buff **pskb, struct ip_conntrack *ct,
429 enum ip_conntrack_info ctinfo,
430 unsigned char **data, TransportAddress * addr, int idx,
431 u_int16_t port, struct ip_conntrack_expect *exp)
432{
433 struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
434 int dir = CTINFO2DIR(ctinfo);
435 u_int16_t nated_port = port;
436 __be32 ip;
437
438 /* Set expectations for NAT */
439 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
440 exp->expectfn = ip_nat_q931_expect;
441 exp->dir = !dir;
442
443 /* Check existing expects */
444 if (info->sig_port[dir] == port)
445 nated_port = info->sig_port[!dir];
446
447 /* Try to get same port: if not, try to change it. */
448 for (; nated_port != 0; nated_port++) {
449 exp->tuple.dst.u.tcp.port = htons(nated_port);
450 if (ip_conntrack_expect_related(exp) == 0)
451 break;
452 }
453
454 if (nated_port == 0) { /* No port available */
455 if (net_ratelimit())
456 printk("ip_nat_ras: out of TCP ports\n");
457 return 0;
458 }
459
460 /* Modify signal */
461 if (set_h225_addr(pskb, data, 0, &addr[idx],
462 ct->tuplehash[!dir].tuple.dst.ip,
463 nated_port) == 0) {
464 /* Save ports */
465 info->sig_port[dir] = port;
466 info->sig_port[!dir] = nated_port;
467
468 /* Fix for Gnomemeeting */
469 if (idx > 0 &&
470 get_h225_addr(*data, &addr[0], &ip, &port) &&
471 (ntohl(ip) & 0xff000000) == 0x7f000000) {
472 set_h225_addr_hook(pskb, data, 0, &addr[0],
473 ct->tuplehash[!dir].tuple.dst.ip,
474 info->sig_port[!dir]);
475 }
476 } else {
477 ip_conntrack_unexpect_related(exp);
478 return -1;
479 }
480
481 /* Success */
482 DEBUGP("ip_nat_ras: expect Q.931 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
483 NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port),
484 NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port));
485
486 return 0;
487}
488
489/****************************************************************************/
490static void ip_nat_callforwarding_expect(struct ip_conntrack *new,
491 struct ip_conntrack_expect *this)
492{
493 struct ip_nat_range range;
494
495 /* This must be a fresh one. */
496 BUG_ON(new->status & IPS_NAT_DONE_MASK);
497
498 /* Change src to where master sends to */
499 range.flags = IP_NAT_RANGE_MAP_IPS;
500 range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.ip;
501
502 /* hook doesn't matter, but it has to do source manip */
503 ip_nat_setup_info(new, &range, NF_IP_POST_ROUTING);
504
505 /* For DST manip, map port here to where it's expected. */
506 range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
507 range.min = range.max = this->saved_proto;
508 range.min_ip = range.max_ip = this->saved_ip;
509
510 /* hook doesn't matter, but it has to do destination manip */
511 ip_nat_setup_info(new, &range, NF_IP_PRE_ROUTING);
512
513 ip_conntrack_q931_expect(new, this);
514}
515
516/****************************************************************************/
517static int nat_callforwarding(struct sk_buff **pskb, struct ip_conntrack *ct,
518 enum ip_conntrack_info ctinfo,
519 unsigned char **data, int dataoff,
520 TransportAddress * addr, u_int16_t port,
521 struct ip_conntrack_expect *exp)
522{
523 int dir = CTINFO2DIR(ctinfo);
524 u_int16_t nated_port;
525
526 /* Set expectations for NAT */
527 exp->saved_ip = exp->tuple.dst.ip;
528 exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
529 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
530 exp->expectfn = ip_nat_callforwarding_expect;
531 exp->dir = !dir;
532
533 /* Try to get same port: if not, try to change it. */
534 for (nated_port = port; nated_port != 0; nated_port++) {
535 exp->tuple.dst.u.tcp.port = htons(nated_port);
536 if (ip_conntrack_expect_related(exp) == 0)
537 break;
538 }
539
540 if (nated_port == 0) { /* No port available */
541 if (net_ratelimit())
542 printk("ip_nat_q931: out of TCP ports\n");
543 return 0;
544 }
545
546 /* Modify signal */
547	if (set_h225_addr(pskb, data, dataoff, addr,
548			  ct->tuplehash[!dir].tuple.dst.ip,
549			  nated_port) != 0) {
550 ip_conntrack_unexpect_related(exp);
551 return -1;
552 }
553
554 /* Success */
555 DEBUGP("ip_nat_q931: expect Call Forwarding "
556 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
557 NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port),
558 NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port));
559
560 return 0;
561}
562
563/****************************************************************************/
564static int __init init(void)
565{
566 BUG_ON(rcu_dereference(set_h245_addr_hook) != NULL);
567 BUG_ON(rcu_dereference(set_h225_addr_hook) != NULL);
568 BUG_ON(rcu_dereference(set_sig_addr_hook) != NULL);
569 BUG_ON(rcu_dereference(set_ras_addr_hook) != NULL);
570 BUG_ON(rcu_dereference(nat_rtp_rtcp_hook) != NULL);
571 BUG_ON(rcu_dereference(nat_t120_hook) != NULL);
572 BUG_ON(rcu_dereference(nat_h245_hook) != NULL);
573 BUG_ON(rcu_dereference(nat_callforwarding_hook) != NULL);
574 BUG_ON(rcu_dereference(nat_q931_hook) != NULL);
575
576 rcu_assign_pointer(set_h245_addr_hook, set_h245_addr);
577 rcu_assign_pointer(set_h225_addr_hook, set_h225_addr);
578 rcu_assign_pointer(set_sig_addr_hook, set_sig_addr);
579 rcu_assign_pointer(set_ras_addr_hook, set_ras_addr);
580 rcu_assign_pointer(nat_rtp_rtcp_hook, nat_rtp_rtcp);
581 rcu_assign_pointer(nat_t120_hook, nat_t120);
582 rcu_assign_pointer(nat_h245_hook, nat_h245);
583 rcu_assign_pointer(nat_callforwarding_hook, nat_callforwarding);
584 rcu_assign_pointer(nat_q931_hook, nat_q931);
585
586 DEBUGP("ip_nat_h323: init success\n");
587 return 0;
588}
589
590/****************************************************************************/
591static void __exit fini(void)
592{
593 rcu_assign_pointer(set_h245_addr_hook, NULL);
594 rcu_assign_pointer(set_h225_addr_hook, NULL);
595 rcu_assign_pointer(set_sig_addr_hook, NULL);
596 rcu_assign_pointer(set_ras_addr_hook, NULL);
597 rcu_assign_pointer(nat_rtp_rtcp_hook, NULL);
598 rcu_assign_pointer(nat_t120_hook, NULL);
599 rcu_assign_pointer(nat_h245_hook, NULL);
600 rcu_assign_pointer(nat_callforwarding_hook, NULL);
601 rcu_assign_pointer(nat_q931_hook, NULL);
602 synchronize_rcu();
603}
604
605/****************************************************************************/
606module_init(init);
607module_exit(fini);
608
609MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
610MODULE_DESCRIPTION("H.323 NAT helper");
611MODULE_LICENSE("GPL");
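
One pattern repeated in every nat_* function above deserves a note: the public port is allocated by starting at the port the endpoint announced and walking upward until ip_conntrack_expect_related() accepts it, with unsigned wraparound to 0 signalling exhaustion. A userspace sketch of just that loop, where try_claim() is a made-up stand-in for the expectation insert:

#include <stdint.h>
#include <stdio.h>

static unsigned char taken[65536];

static int try_claim(uint16_t port)
{
	if (taken[port])
		return -1;	/* already expected / in use */
	taken[port] = 1;
	return 0;
}

static uint16_t alloc_port(uint16_t wanted)
{
	uint16_t port;

	/* try the wanted port first, then walk upward; the uint16_t
	 * counter wrapping to 0 ends the search */
	for (port = wanted; port != 0; port++)
		if (try_claim(port) == 0)
			break;
	return port;		/* 0 means every candidate was taken */
}

int main(void)
{
	taken[1720] = 1;	/* pretend 1720 is busy */
	printf("got %u\n", alloc_port(1720));	/* 1721 */
	return 0;
}
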
diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c
deleted file mode 100644
index 24ce4a5023d7..000000000000
--- a/net/ipv4/netfilter/ip_nat_helper_pptp.c
+++ /dev/null
@@ -1,350 +0,0 @@
1/*
2 * ip_nat_pptp.c - Version 3.0
3 *
4 * NAT support for PPTP (Point to Point Tunneling Protocol).
5 * PPTP is a protocol for creating virtual private networks.
6 * It is a specification defined by Microsoft and some vendors
7 * working with Microsoft. PPTP is built on top of a modified
8 * version of the Internet Generic Routing Encapsulation Protocol.
9 * GRE is defined in RFC 1701 and RFC 1702. Documentation of
10 * PPTP can be found in RFC 2637
11 *
12 * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
13 *
14 * Development of this code funded by Astaro AG (http://www.astaro.com/)
15 *
16 * TODO: - NAT to a unique tuple, not to TCP source port
17 * (needs netfilter tuple reservation)
18 *
19 * Changes:
20 * 2002-02-10 - Version 1.3
21 * - Use ip_nat_mangle_tcp_packet() because of cloned skb's
22 * in local connections (Philip Craig <philipc@snapgear.com>)
23 * - add checks for magicCookie and pptp version
24 * - make argument list of pptp_{out,in}bound_packet() shorter
25 * - move to C99 style initializers
26 * - print version number at module loadtime
27 * 2003-09-22 - Version 1.5
28 * - use SNATed tcp sourceport as callid, since we get called before
29 * TCP header is mangled (Philip Craig <philipc@snapgear.com>)
30 * 2004-10-22 - Version 2.0
31 * - kernel 2.6.x version
32 * 2005-06-10 - Version 3.0
33 * - kernel >= 2.6.11 version,
34 * funded by Oxcoda NetBox Blue (http://www.netboxblue.com/)
35 *
36 */
37
38#include <linux/module.h>
39#include <linux/ip.h>
40#include <linux/tcp.h>
41#include <net/tcp.h>
42
43#include <linux/netfilter_ipv4/ip_nat.h>
44#include <linux/netfilter_ipv4/ip_nat_rule.h>
45#include <linux/netfilter_ipv4/ip_nat_helper.h>
46#include <linux/netfilter_ipv4/ip_nat_pptp.h>
47#include <linux/netfilter_ipv4/ip_conntrack_core.h>
48#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
49#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h>
50#include <linux/netfilter_ipv4/ip_conntrack_pptp.h>
51
52#define IP_NAT_PPTP_VERSION "3.0"
53
54#define REQ_CID(req, off) (*(__be16 *)((char *)(req) + (off)))
55
56MODULE_LICENSE("GPL");
57MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
58MODULE_DESCRIPTION("Netfilter NAT helper module for PPTP");
59
60
61#if 0
62extern const char *pptp_msg_name[];
63#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, \
64 __FUNCTION__, ## args)
65#else
66#define DEBUGP(format, args...)
67#endif
68
69static void pptp_nat_expected(struct ip_conntrack *ct,
70 struct ip_conntrack_expect *exp)
71{
72 struct ip_conntrack *master = ct->master;
73 struct ip_conntrack_expect *other_exp;
74 struct ip_conntrack_tuple t;
75 struct ip_ct_pptp_master *ct_pptp_info;
76 struct ip_nat_pptp *nat_pptp_info;
77 struct ip_nat_range range;
78
79 ct_pptp_info = &master->help.ct_pptp_info;
80 nat_pptp_info = &master->nat.help.nat_pptp_info;
81
82 /* And here goes the grand finale of corrosion... */
83
84 if (exp->dir == IP_CT_DIR_ORIGINAL) {
85 DEBUGP("we are PNS->PAC\n");
86 /* therefore, build tuple for PAC->PNS */
87 t.src.ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip;
88 t.src.u.gre.key = master->help.ct_pptp_info.pac_call_id;
89 t.dst.ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip;
90 t.dst.u.gre.key = master->help.ct_pptp_info.pns_call_id;
91 t.dst.protonum = IPPROTO_GRE;
92 } else {
93 DEBUGP("we are PAC->PNS\n");
94 /* build tuple for PNS->PAC */
95 t.src.ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
96 t.src.u.gre.key = master->nat.help.nat_pptp_info.pns_call_id;
97 t.dst.ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip;
98 t.dst.u.gre.key = master->nat.help.nat_pptp_info.pac_call_id;
99 t.dst.protonum = IPPROTO_GRE;
100 }
101
102 DEBUGP("trying to unexpect other dir: ");
103 DUMP_TUPLE(&t);
104 other_exp = ip_conntrack_expect_find_get(&t);
105 if (other_exp) {
106 ip_conntrack_unexpect_related(other_exp);
107 ip_conntrack_expect_put(other_exp);
108 DEBUGP("success\n");
109 } else {
110 DEBUGP("not found!\n");
111 }
112
113 /* This must be a fresh one. */
114 BUG_ON(ct->status & IPS_NAT_DONE_MASK);
115
116 /* Change src to where master sends to */
117 range.flags = IP_NAT_RANGE_MAP_IPS;
118 range.min_ip = range.max_ip
119 = ct->master->tuplehash[!exp->dir].tuple.dst.ip;
120 if (exp->dir == IP_CT_DIR_ORIGINAL) {
121 range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
122 range.min = range.max = exp->saved_proto;
123 }
124 /* hook doesn't matter, but it has to do source manip */
125 ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
126
127 /* For DST manip, map port here to where it's expected. */
128 range.flags = IP_NAT_RANGE_MAP_IPS;
129 range.min_ip = range.max_ip
130 = ct->master->tuplehash[!exp->dir].tuple.src.ip;
131 if (exp->dir == IP_CT_DIR_REPLY) {
132 range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
133 range.min = range.max = exp->saved_proto;
134 }
135 /* hook doesn't matter, but it has to do destination manip */
136 ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
137}
138
139/* outbound packets == from PNS to PAC */
140static int
141pptp_outbound_pkt(struct sk_buff **pskb,
142 struct ip_conntrack *ct,
143 enum ip_conntrack_info ctinfo,
144 struct PptpControlHeader *ctlh,
145 union pptp_ctrl_union *pptpReq)
146
147{
148 struct ip_ct_pptp_master *ct_pptp_info = &ct->help.ct_pptp_info;
149 struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info;
150 u_int16_t msg;
151 __be16 new_callid;
152 unsigned int cid_off;
153
154 new_callid = ct_pptp_info->pns_call_id;
155
156 switch (msg = ntohs(ctlh->messageType)) {
157 case PPTP_OUT_CALL_REQUEST:
158 cid_off = offsetof(union pptp_ctrl_union, ocreq.callID);
159 /* FIXME: ideally we would want to reserve a call ID
160 * here. current netfilter NAT core is not able to do
161 * this :( For now we use TCP source port. This breaks
162 * multiple calls within one control session */
163
164 /* save original call ID in nat_info */
165 nat_pptp_info->pns_call_id = ct_pptp_info->pns_call_id;
166
167 /* don't use tcph->source since we are at a DSTmanip
168 * hook (e.g. PREROUTING) and pkt is not mangled yet */
169 new_callid = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
170
171 /* save new call ID in ct info */
172 ct_pptp_info->pns_call_id = new_callid;
173 break;
174 case PPTP_IN_CALL_REPLY:
175 cid_off = offsetof(union pptp_ctrl_union, icack.callID);
176 break;
177 case PPTP_CALL_CLEAR_REQUEST:
178 cid_off = offsetof(union pptp_ctrl_union, clrreq.callID);
179 break;
180 default:
181 DEBUGP("unknown outbound packet 0x%04x:%s\n", msg,
182 (msg <= PPTP_MSG_MAX)?
183 pptp_msg_name[msg]:pptp_msg_name[0]);
184 /* fall through */
185
186 case PPTP_SET_LINK_INFO:
187 /* only need to NAT in case PAC is behind NAT box */
188 case PPTP_START_SESSION_REQUEST:
189 case PPTP_START_SESSION_REPLY:
190 case PPTP_STOP_SESSION_REQUEST:
191 case PPTP_STOP_SESSION_REPLY:
192 case PPTP_ECHO_REQUEST:
193 case PPTP_ECHO_REPLY:
194 /* no need to alter packet */
195 return NF_ACCEPT;
196 }
197
198 /* only OUT_CALL_REQUEST, IN_CALL_REPLY, CALL_CLEAR_REQUEST pass
199 * down to here */
200 DEBUGP("altering call id from 0x%04x to 0x%04x\n",
201 ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid));
202
203 /* mangle packet */
204 if (ip_nat_mangle_tcp_packet(pskb, ct, ctinfo,
205 cid_off + sizeof(struct pptp_pkt_hdr) +
206 sizeof(struct PptpControlHeader),
207 sizeof(new_callid), (char *)&new_callid,
208 sizeof(new_callid)) == 0)
209 return NF_DROP;
210
211 return NF_ACCEPT;
212}
213
214static void
215pptp_exp_gre(struct ip_conntrack_expect *expect_orig,
216 struct ip_conntrack_expect *expect_reply)
217{
218 struct ip_conntrack *ct = expect_orig->master;
219 struct ip_ct_pptp_master *ct_pptp_info = &ct->help.ct_pptp_info;
220 struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info;
221
222 /* save original PAC call ID in nat_info */
223 nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id;
224
225 /* alter expectation for PNS->PAC direction */
226 expect_orig->saved_proto.gre.key = ct_pptp_info->pns_call_id;
227 expect_orig->tuple.src.u.gre.key = nat_pptp_info->pns_call_id;
228 expect_orig->tuple.dst.u.gre.key = ct_pptp_info->pac_call_id;
229 expect_orig->dir = IP_CT_DIR_ORIGINAL;
230
231 /* alter expectation for PAC->PNS direction */
232 expect_reply->saved_proto.gre.key = nat_pptp_info->pns_call_id;
233 expect_reply->tuple.src.u.gre.key = nat_pptp_info->pac_call_id;
234 expect_reply->tuple.dst.u.gre.key = ct_pptp_info->pns_call_id;
235 expect_reply->dir = IP_CT_DIR_REPLY;
236}
237
238/* inbound packets == from PAC to PNS */
239static int
240pptp_inbound_pkt(struct sk_buff **pskb,
241 struct ip_conntrack *ct,
242 enum ip_conntrack_info ctinfo,
243 struct PptpControlHeader *ctlh,
244 union pptp_ctrl_union *pptpReq)
245{
246 struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info;
247 u_int16_t msg;
248 __be16 new_pcid;
249 unsigned int pcid_off;
250
251 new_pcid = nat_pptp_info->pns_call_id;
252
253 switch (msg = ntohs(ctlh->messageType)) {
254 case PPTP_OUT_CALL_REPLY:
255 pcid_off = offsetof(union pptp_ctrl_union, ocack.peersCallID);
256 break;
257 case PPTP_IN_CALL_CONNECT:
258 pcid_off = offsetof(union pptp_ctrl_union, iccon.peersCallID);
259 break;
260 case PPTP_IN_CALL_REQUEST:
261 /* only need to nat in case PAC is behind NAT box */
262 return NF_ACCEPT;
263 case PPTP_WAN_ERROR_NOTIFY:
264 pcid_off = offsetof(union pptp_ctrl_union, wanerr.peersCallID);
265 break;
266 case PPTP_CALL_DISCONNECT_NOTIFY:
267 pcid_off = offsetof(union pptp_ctrl_union, disc.callID);
268 break;
269 case PPTP_SET_LINK_INFO:
270 pcid_off = offsetof(union pptp_ctrl_union, setlink.peersCallID);
271 break;
272
273 default:
274 DEBUGP("unknown inbound packet %s\n", (msg <= PPTP_MSG_MAX)?
275 pptp_msg_name[msg]:pptp_msg_name[0]);
276 /* fall through */
277
278 case PPTP_START_SESSION_REQUEST:
279 case PPTP_START_SESSION_REPLY:
280 case PPTP_STOP_SESSION_REQUEST:
281 case PPTP_STOP_SESSION_REPLY:
282 case PPTP_ECHO_REQUEST:
283 case PPTP_ECHO_REPLY:
284 /* no need to alter packet */
285 return NF_ACCEPT;
286 }
287
288 /* only OUT_CALL_REPLY, IN_CALL_CONNECT, IN_CALL_REQUEST,
289 * WAN_ERROR_NOTIFY, CALL_DISCONNECT_NOTIFY pass down here */
290
291 /* mangle packet */
292 DEBUGP("altering peer call id from 0x%04x to 0x%04x\n",
293 ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid));
294
295 if (ip_nat_mangle_tcp_packet(pskb, ct, ctinfo,
296 pcid_off + sizeof(struct pptp_pkt_hdr) +
297 sizeof(struct PptpControlHeader),
298 sizeof(new_pcid), (char *)&new_pcid,
299 sizeof(new_pcid)) == 0)
300 return NF_DROP;
301 return NF_ACCEPT;
302}
303
304
305extern int __init ip_nat_proto_gre_init(void);
306extern void __exit ip_nat_proto_gre_fini(void);
307
308static int __init ip_nat_helper_pptp_init(void)
309{
310 int ret;
311
312 DEBUGP("%s: registering NAT helper\n", __FILE__);
313
314 ret = ip_nat_proto_gre_init();
315 if (ret < 0)
316 return ret;
317
318 BUG_ON(rcu_dereference(ip_nat_pptp_hook_outbound));
319 rcu_assign_pointer(ip_nat_pptp_hook_outbound, pptp_outbound_pkt);
320
321 BUG_ON(rcu_dereference(ip_nat_pptp_hook_inbound));
322 rcu_assign_pointer(ip_nat_pptp_hook_inbound, pptp_inbound_pkt);
323
324 BUG_ON(rcu_dereference(ip_nat_pptp_hook_exp_gre));
325 rcu_assign_pointer(ip_nat_pptp_hook_exp_gre, pptp_exp_gre);
326
327 BUG_ON(rcu_dereference(ip_nat_pptp_hook_expectfn));
328 rcu_assign_pointer(ip_nat_pptp_hook_expectfn, pptp_nat_expected);
329
330 printk("ip_nat_pptp version %s loaded\n", IP_NAT_PPTP_VERSION);
331 return 0;
332}
333
334static void __exit ip_nat_helper_pptp_fini(void)
335{
336	DEBUGP("cleanup_module\n");
337
338 rcu_assign_pointer(ip_nat_pptp_hook_expectfn, NULL);
339 rcu_assign_pointer(ip_nat_pptp_hook_exp_gre, NULL);
340 rcu_assign_pointer(ip_nat_pptp_hook_inbound, NULL);
341 rcu_assign_pointer(ip_nat_pptp_hook_outbound, NULL);
342 synchronize_rcu();
343
344 ip_nat_proto_gre_fini();
345
346 printk("ip_nat_pptp version %s unloaded\n", IP_NAT_PPTP_VERSION);
347}
348
349module_init(ip_nat_helper_pptp_init);
350module_exit(ip_nat_helper_pptp_fini);
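
The REQ_CID() macro and both mangle calls above hinge on one offset computation: offsetof() into the control-message union gives the call-ID position within the message body, and the fixed PPTP packet header plus control header are added in front when the TCP payload is rewritten. A small sketch of that arithmetic with simplified stand-in structs (the real layouts live in the PPTP conntrack headers):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct pptp_pkt_hdr { uint16_t len, type; uint32_t cookie; }; /* 8 bytes */
struct ctrl_hdr     { uint16_t msg_type, reserved; };         /* 4 bytes */
struct out_call_req { uint16_t call_id, serial; /* ... */ };

union ctrl_union {
	struct out_call_req ocreq;
	/* other message bodies would live here */
};

int main(void)
{
	size_t cid_off = offsetof(union ctrl_union, ocreq.call_id);

	/* byte offset of the call ID from the start of the TCP
	 * payload, as handed to the packet mangler */
	printf("mangle offset = %zu\n",
	       cid_off + sizeof(struct pptp_pkt_hdr) +
	       sizeof(struct ctrl_hdr));	/* 12 */
	return 0;
}
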
diff --git a/net/ipv4/netfilter/ip_nat_irc.c b/net/ipv4/netfilter/ip_nat_irc.c
deleted file mode 100644
index cfaeea38314f..000000000000
--- a/net/ipv4/netfilter/ip_nat_irc.c
+++ /dev/null
@@ -1,122 +0,0 @@
1/* IRC extension for TCP NAT alteration.
2 * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
3 * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
4 * based on a copy of RR's ip_nat_ftp.c
5 *
6 * ip_nat_irc.c,v 1.16 2001/12/06 07:42:10 laforge Exp
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 */
13
14#include <linux/module.h>
15#include <linux/netfilter_ipv4.h>
16#include <linux/ip.h>
17#include <linux/tcp.h>
18#include <linux/kernel.h>
19#include <net/tcp.h>
20#include <linux/netfilter_ipv4/ip_nat.h>
21#include <linux/netfilter_ipv4/ip_nat_helper.h>
22#include <linux/netfilter_ipv4/ip_nat_rule.h>
23#include <linux/netfilter_ipv4/ip_conntrack_irc.h>
24#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
25#include <linux/moduleparam.h>
26
27#if 0
28#define DEBUGP printk
29#else
30#define DEBUGP(format, args...)
31#endif
32
33MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
34MODULE_DESCRIPTION("IRC (DCC) NAT helper");
35MODULE_LICENSE("GPL");
36
37static unsigned int help(struct sk_buff **pskb,
38 enum ip_conntrack_info ctinfo,
39 unsigned int matchoff,
40 unsigned int matchlen,
41 struct ip_conntrack_expect *exp)
42{
43 u_int16_t port;
44 unsigned int ret;
45
46	/* "4294967295 65535 " */
47 char buffer[18];
48
49	DEBUGP("IRC_NAT: info (matchoff %u, matchlen %u)\n",
50	       matchoff,
51	       matchlen);
52
53 /* Reply comes from server. */
54 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
55 exp->dir = IP_CT_DIR_REPLY;
56
57	/* When you see the packet, we need to NAT it the same way as
58	 * this one. */
59 exp->expectfn = ip_nat_follow_master;
60
61 /* Try to get same port: if not, try to change it. */
62 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
63 exp->tuple.dst.u.tcp.port = htons(port);
64 if (ip_conntrack_expect_related(exp) == 0)
65 break;
66 }
67
68 if (port == 0)
69 return NF_DROP;
70
71 /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27
72 * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28
73 * strlen("\1DCC SEND F AAAAAAAA P S\1\n")=26
74 * strlen("\1DCC MOVE F AAAAAAAA P S\1\n")=26
75 * strlen("\1DCC TSEND F AAAAAAAA P S\1\n")=27
76	 * AAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits,
77	 * 255.255.255.255==4294967295, 10 digits)
78	 * P: bound port (min 1 d, max 5d (65535))
79 * F: filename (min 1 d )
80 * S: size (min 1 d )
81 * 0x01, \n: terminators
82 */
83
84	/* AAA = "us", i.e. where the server normally talks to. */
85 sprintf(buffer, "%u %u",
86 ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip),
87 port);
88 DEBUGP("ip_nat_irc: Inserting '%s' == %u.%u.%u.%u, port %u\n",
89 buffer, NIPQUAD(exp->tuple.src.ip), port);
90
91 ret = ip_nat_mangle_tcp_packet(pskb, exp->master, ctinfo,
92 matchoff, matchlen, buffer,
93 strlen(buffer));
94 if (ret != NF_ACCEPT)
95 ip_conntrack_unexpect_related(exp);
96 return ret;
97}
98
99static void __exit ip_nat_irc_fini(void)
100{
101 rcu_assign_pointer(ip_nat_irc_hook, NULL);
102 synchronize_rcu();
103}
104
105static int __init ip_nat_irc_init(void)
106{
107 BUG_ON(rcu_dereference(ip_nat_irc_hook));
108 rcu_assign_pointer(ip_nat_irc_hook, help);
109 return 0;
110}
111
112/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
113static int warn_set(const char *val, struct kernel_param *kp)
114{
115 printk(KERN_INFO KBUILD_MODNAME
116 ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
117 return 0;
118}
119module_param_call(ports, warn_set, NULL, NULL, 0);
120
121module_init(ip_nat_irc_init);
122module_exit(ip_nat_irc_fini);
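
The char buffer[18] sizing above follows from the worst case the comment works out: ten digits of address, one space, five digits of port, and a terminating NUL. A quick standalone check of that arithmetic, purely illustrative:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	char buffer[18];
	uint32_t addr = 0xFFFFFFFFu;	/* 255.255.255.255 as a host u32 */
	uint16_t port = 65535;
	int n;

	/* "%u %u" is exactly what the helper sprintf()s */
	n = snprintf(buffer, sizeof(buffer), "%u %u", addr, port);
	printf("\"%s\" uses %d chars + NUL\n", buffer, n);	/* 16 */
	return 0;
}
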
diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c
deleted file mode 100644
index 95810202d849..000000000000
--- a/net/ipv4/netfilter/ip_nat_proto_gre.c
+++ /dev/null
@@ -1,174 +0,0 @@
1/*
2 * ip_nat_proto_gre.c - Version 2.0
3 *
4 * NAT protocol helper module for GRE.
5 *
6 * GRE is a generic encapsulation protocol, which is generally not very
7 * suited for NAT, as it has no protocol-specific part such as port numbers.
8 *
9 * It has an optional key field, which may help us distinguish two
10 * connections between the same two hosts.
11 *
12 * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784
13 *
14 * PPTP is built on top of a modified version of GRE, and has a mandatory
15 * field called "CallID", which serves the same purpose for us as the key
16 * field in plain GRE.
17 *
18 * Documentation about PPTP can be found in RFC 2637
19 *
20 * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
21 *
22 * Development of this code funded by Astaro AG (http://www.astaro.com/)
23 *
24 */
25
26#include <linux/module.h>
27#include <linux/ip.h>
28#include <linux/netfilter_ipv4/ip_nat.h>
29#include <linux/netfilter_ipv4/ip_nat_rule.h>
30#include <linux/netfilter_ipv4/ip_nat_protocol.h>
31#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h>
32
33MODULE_LICENSE("GPL");
34MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
35MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
36
37#if 0
38#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, \
39 __FUNCTION__, ## args)
40#else
41#define DEBUGP(x, args...)
42#endif
43
44/* Is the key in the given range between min and max? */
45static int
46gre_in_range(const struct ip_conntrack_tuple *tuple,
47 enum ip_nat_manip_type maniptype,
48 const union ip_conntrack_manip_proto *min,
49 const union ip_conntrack_manip_proto *max)
50{
51 __be16 key;
52
53 if (maniptype == IP_NAT_MANIP_SRC)
54 key = tuple->src.u.gre.key;
55 else
56 key = tuple->dst.u.gre.key;
57
58 return ntohs(key) >= ntohs(min->gre.key)
59 && ntohs(key) <= ntohs(max->gre.key);
60}
61
62/* generate unique tuple ... */
63static int
64gre_unique_tuple(struct ip_conntrack_tuple *tuple,
65 const struct ip_nat_range *range,
66 enum ip_nat_manip_type maniptype,
67 const struct ip_conntrack *conntrack)
68{
69 static u_int16_t key;
70 __be16 *keyptr;
71 unsigned int min, i, range_size;
72
73 if (maniptype == IP_NAT_MANIP_SRC)
74 keyptr = &tuple->src.u.gre.key;
75 else
76 keyptr = &tuple->dst.u.gre.key;
77
78 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
79 DEBUGP("%p: NATing GRE PPTP\n", conntrack);
80 min = 1;
81 range_size = 0xffff;
82 } else {
83 min = ntohs(range->min.gre.key);
84 range_size = ntohs(range->max.gre.key) - min + 1;
85 }
86
87 DEBUGP("min = %u, range_size = %u\n", min, range_size);
88
89 for (i = 0; i < range_size; i++, key++) {
90 *keyptr = htons(min + key % range_size);
91 if (!ip_nat_used_tuple(tuple, conntrack))
92 return 1;
93 }
94
95 DEBUGP("%p: no NAT mapping\n", conntrack);
96
97 return 0;
98}
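
gre_unique_tuple() above rotates a persistent static counter through [min, min + range_size - 1] so successive connections do not all probe from the same starting key. A userspace sketch of that rotation, with used_key() as a hypothetical stand-in for ip_nat_used_tuple():

#include <stdint.h>
#include <stdio.h>

static int used_key(uint16_t key)
{
	return key != 1005;	/* pretend only key 1005 is free */
}

static int pick_key(unsigned int min, unsigned int range_size,
		    uint16_t *out)
{
	static uint16_t counter;	/* persists across calls */
	unsigned int i;

	/* Try every key in the range exactly once, starting where the
	 * rotating counter left off, then report exhaustion. */
	for (i = 0; i < range_size; i++, counter++) {
		uint16_t key = min + counter % range_size;
		if (!used_key(key)) {
			*out = key;
			return 1;
		}
	}
	return 0;	/* range exhausted: no mapping possible */
}

int main(void)
{
	uint16_t key;

	if (pick_key(1000, 16, &key))
		printf("key %u\n", key);	/* prints 1005 */
	return 0;
}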
99
100/* manipulate a GRE packet according to maniptype */
101static int
102gre_manip_pkt(struct sk_buff **pskb,
103 unsigned int iphdroff,
104 const struct ip_conntrack_tuple *tuple,
105 enum ip_nat_manip_type maniptype)
106{
107 struct gre_hdr *greh;
108 struct gre_hdr_pptp *pgreh;
109 struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
110 unsigned int hdroff = iphdroff + iph->ihl*4;
111
112 /* pgreh includes two optional 32bit fields which are not required
113 * to be there. That's where the magic '8' comes from */
114 if (!skb_make_writable(pskb, hdroff + sizeof(*pgreh)-8))
115 return 0;
116
117 greh = (void *)(*pskb)->data + hdroff;
118 pgreh = (struct gre_hdr_pptp *) greh;
119
120 /* we only have destination manip of a packet, since 'source key'
121 * is not present in the packet itself */
122 if (maniptype == IP_NAT_MANIP_DST) {
123 /* key manipulation is always dest */
124 switch (greh->version) {
125 case 0:
126 if (!greh->key) {
127 DEBUGP("can't nat GRE w/o key\n");
128 break;
129 }
130 if (greh->csum) {
131 /* FIXME: Never tested this code... */
132 nf_proto_csum_replace4(gre_csum(greh), *pskb,
133 *(gre_key(greh)),
134 tuple->dst.u.gre.key, 0);
135 }
136 *(gre_key(greh)) = tuple->dst.u.gre.key;
137 break;
138 case GRE_VERSION_PPTP:
139 DEBUGP("call_id -> 0x%04x\n",
140 ntohs(tuple->dst.u.gre.key));
141 pgreh->call_id = tuple->dst.u.gre.key;
142 break;
143 default:
144 DEBUGP("can't nat unknown GRE version\n");
145			return 0;
147 }
148 }
149 return 1;
150}
151
152/* nat helper struct */
153static struct ip_nat_protocol gre = {
154 .name = "GRE",
155 .protonum = IPPROTO_GRE,
156 .manip_pkt = gre_manip_pkt,
157 .in_range = gre_in_range,
158 .unique_tuple = gre_unique_tuple,
159#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
160 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
161 .range_to_nfattr = ip_nat_port_range_to_nfattr,
162 .nfattr_to_range = ip_nat_port_nfattr_to_range,
163#endif
164};
165
166int __init ip_nat_proto_gre_init(void)
167{
168 return ip_nat_protocol_register(&gre);
169}
170
171void __exit ip_nat_proto_gre_fini(void)
172{
173 ip_nat_protocol_unregister(&gre);
174}
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c
deleted file mode 100644
index 22a528ae0380..000000000000
--- a/net/ipv4/netfilter/ip_nat_proto_icmp.c
+++ /dev/null
@@ -1,87 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/init.h>
11#include <linux/netfilter.h>
12#include <linux/ip.h>
13#include <linux/icmp.h>
14#include <linux/if.h>
15
16#include <linux/netfilter_ipv4/ip_nat.h>
17#include <linux/netfilter_ipv4/ip_nat_core.h>
18#include <linux/netfilter_ipv4/ip_nat_rule.h>
19#include <linux/netfilter_ipv4/ip_nat_protocol.h>
20
21static int
22icmp_in_range(const struct ip_conntrack_tuple *tuple,
23 enum ip_nat_manip_type maniptype,
24 const union ip_conntrack_manip_proto *min,
25 const union ip_conntrack_manip_proto *max)
26{
27 return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
28 ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
29}
30
31static int
32icmp_unique_tuple(struct ip_conntrack_tuple *tuple,
33 const struct ip_nat_range *range,
34 enum ip_nat_manip_type maniptype,
35 const struct ip_conntrack *conntrack)
36{
37 static u_int16_t id;
38 unsigned int range_size;
39 unsigned int i;
40
41 range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1;
42 /* If no range specified... */
43 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED))
44 range_size = 0xFFFF;
45
46 for (i = 0; i < range_size; i++, id++) {
47 tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) +
48 (id % range_size));
49 if (!ip_nat_used_tuple(tuple, conntrack))
50 return 1;
51 }
52 return 0;
53}
54
55static int
56icmp_manip_pkt(struct sk_buff **pskb,
57 unsigned int iphdroff,
58 const struct ip_conntrack_tuple *tuple,
59 enum ip_nat_manip_type maniptype)
60{
61 struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
62 struct icmphdr *hdr;
63 unsigned int hdroff = iphdroff + iph->ihl*4;
64
65 if (!skb_make_writable(pskb, hdroff + sizeof(*hdr)))
66 return 0;
67
68 hdr = (struct icmphdr *)((*pskb)->data + hdroff);
69 nf_proto_csum_replace2(&hdr->checksum, *pskb,
70 hdr->un.echo.id, tuple->src.u.icmp.id, 0);
71 hdr->un.echo.id = tuple->src.u.icmp.id;
72 return 1;
73}
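
nf_proto_csum_replace2() patches the one's-complement checksum incrementally instead of recomputing it over the whole packet. A userspace sketch of the same update, following RFC 1624 (HC' = ~(~HC + ~m + m')) and applied here to an echo-id rewrite with made-up values:

#include <stdint.h>
#include <stdio.h>

static uint16_t csum_replace2(uint16_t check, uint16_t old, uint16_t new)
{
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~old;
	sum += new;
	sum = (sum & 0xffff) + (sum >> 16);	/* fold carries */
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* Rewrite an ICMP echo id from 0x1234 to 0xbeef and patch the
	 * checksum instead of summing the whole packet again. */
	uint16_t check = 0xa1b2;	/* arbitrary starting checksum */

	check = csum_replace2(check, 0x1234, 0xbeef);
	printf("patched checksum: 0x%04x\n", check);
	return 0;
}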
74
75struct ip_nat_protocol ip_nat_protocol_icmp = {
76 .name = "ICMP",
77 .protonum = IPPROTO_ICMP,
78 .me = THIS_MODULE,
79 .manip_pkt = icmp_manip_pkt,
80 .in_range = icmp_in_range,
81 .unique_tuple = icmp_unique_tuple,
82#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
83 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
84 .range_to_nfattr = ip_nat_port_range_to_nfattr,
85 .nfattr_to_range = ip_nat_port_nfattr_to_range,
86#endif
87};
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c
deleted file mode 100644
index 14ff24f53a7a..000000000000
--- a/net/ipv4/netfilter/ip_nat_proto_tcp.c
+++ /dev/null
@@ -1,154 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/init.h>
11#include <linux/random.h>
12#include <linux/netfilter.h>
13#include <linux/ip.h>
14#include <linux/tcp.h>
15#include <linux/if.h>
16#include <linux/netfilter/nfnetlink_conntrack.h>
17#include <linux/netfilter_ipv4/ip_nat.h>
18#include <linux/netfilter_ipv4/ip_nat_rule.h>
19#include <linux/netfilter_ipv4/ip_nat_protocol.h>
20#include <linux/netfilter_ipv4/ip_nat_core.h>
21
22static int
23tcp_in_range(const struct ip_conntrack_tuple *tuple,
24 enum ip_nat_manip_type maniptype,
25 const union ip_conntrack_manip_proto *min,
26 const union ip_conntrack_manip_proto *max)
27{
28 __be16 port;
29
30 if (maniptype == IP_NAT_MANIP_SRC)
31 port = tuple->src.u.tcp.port;
32 else
33 port = tuple->dst.u.tcp.port;
34
35 return ntohs(port) >= ntohs(min->tcp.port)
36 && ntohs(port) <= ntohs(max->tcp.port);
37}
38
39static int
40tcp_unique_tuple(struct ip_conntrack_tuple *tuple,
41 const struct ip_nat_range *range,
42 enum ip_nat_manip_type maniptype,
43 const struct ip_conntrack *conntrack)
44{
45 static u_int16_t port;
46 __be16 *portptr;
47 unsigned int range_size, min, i;
48
49 if (maniptype == IP_NAT_MANIP_SRC)
50 portptr = &tuple->src.u.tcp.port;
51 else
52 portptr = &tuple->dst.u.tcp.port;
53
54 /* If no range specified... */
55 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
56 /* If it's dst rewrite, can't change port */
57 if (maniptype == IP_NAT_MANIP_DST)
58 return 0;
59
60 /* Map privileged onto privileged. */
61 if (ntohs(*portptr) < 1024) {
62			/* Loose convention: ports >= 512 are used for credential passing */
63			if (ntohs(*portptr) < 512) {
64 min = 1;
65 range_size = 511 - min + 1;
66 } else {
67 min = 600;
68 range_size = 1023 - min + 1;
69 }
70 } else {
71 min = 1024;
72 range_size = 65535 - 1024 + 1;
73 }
74 } else {
75 min = ntohs(range->min.tcp.port);
76 range_size = ntohs(range->max.tcp.port) - min + 1;
77 }
78
79 /* Start from random port to avoid prediction */
80 if (range->flags & IP_NAT_RANGE_PROTO_RANDOM)
81 port = net_random();
82
83 for (i = 0; i < range_size; i++, port++) {
84 *portptr = htons(min + port % range_size);
85 if (!ip_nat_used_tuple(tuple, conntrack)) {
86 return 1;
87 }
88 }
89 return 0;
90}
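
When no explicit range is given, the code above keeps a rewritten source port inside its original class, so privileged ports map onto privileged ports. A sketch of just the band selection:

#include <stdint.h>
#include <stdio.h>

static void pick_band(uint16_t orig, unsigned int *min, unsigned int *size)
{
	if (orig < 512) {		/* well-known services */
		*min = 1;
		*size = 511;		/* 1..511 */
	} else if (orig < 1024) {	/* credential-passing range */
		*min = 600;
		*size = 424;		/* 600..1023 */
	} else {			/* unprivileged */
		*min = 1024;
		*size = 64512;		/* 1024..65535 */
	}
}

int main(void)
{
	unsigned int min, size;

	pick_band(777, &min, &size);
	printf("map into [%u, %u]\n", min, min + size - 1); /* [600, 1023] */
	return 0;
}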
91
92static int
93tcp_manip_pkt(struct sk_buff **pskb,
94 unsigned int iphdroff,
95 const struct ip_conntrack_tuple *tuple,
96 enum ip_nat_manip_type maniptype)
97{
98 struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
99 struct tcphdr *hdr;
100 unsigned int hdroff = iphdroff + iph->ihl*4;
101 __be32 oldip, newip;
102 __be16 *portptr, newport, oldport;
103 int hdrsize = 8; /* TCP connection tracking guarantees this much */
104
105 /* this could be a inner header returned in icmp packet; in such
106 cases we cannot update the checksum field since it is outside of
107 the 8 bytes of transport layer headers we are guaranteed */
108 if ((*pskb)->len >= hdroff + sizeof(struct tcphdr))
109 hdrsize = sizeof(struct tcphdr);
110
111 if (!skb_make_writable(pskb, hdroff + hdrsize))
112 return 0;
113
114 iph = (struct iphdr *)((*pskb)->data + iphdroff);
115 hdr = (struct tcphdr *)((*pskb)->data + hdroff);
116
117 if (maniptype == IP_NAT_MANIP_SRC) {
118 /* Get rid of src ip and src pt */
119 oldip = iph->saddr;
120 newip = tuple->src.ip;
121 newport = tuple->src.u.tcp.port;
122 portptr = &hdr->source;
123 } else {
124 /* Get rid of dst ip and dst pt */
125 oldip = iph->daddr;
126 newip = tuple->dst.ip;
127 newport = tuple->dst.u.tcp.port;
128 portptr = &hdr->dest;
129 }
130
131 oldport = *portptr;
132 *portptr = newport;
133
134 if (hdrsize < sizeof(*hdr))
135 return 1;
136
137 nf_proto_csum_replace4(&hdr->check, *pskb, oldip, newip, 1);
138 nf_proto_csum_replace2(&hdr->check, *pskb, oldport, newport, 0);
139 return 1;
140}
141
142struct ip_nat_protocol ip_nat_protocol_tcp = {
143 .name = "TCP",
144 .protonum = IPPROTO_TCP,
145 .me = THIS_MODULE,
146 .manip_pkt = tcp_manip_pkt,
147 .in_range = tcp_in_range,
148 .unique_tuple = tcp_unique_tuple,
149#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
150 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
151 .range_to_nfattr = ip_nat_port_range_to_nfattr,
152 .nfattr_to_range = ip_nat_port_nfattr_to_range,
153#endif
154};
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c
deleted file mode 100644
index dfd521672891..000000000000
--- a/net/ipv4/netfilter/ip_nat_proto_udp.c
+++ /dev/null
@@ -1,144 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/init.h>
11#include <linux/random.h>
12#include <linux/netfilter.h>
13#include <linux/ip.h>
14#include <linux/udp.h>
15#include <linux/if.h>
16
17#include <linux/netfilter_ipv4/ip_nat.h>
18#include <linux/netfilter_ipv4/ip_nat_core.h>
19#include <linux/netfilter_ipv4/ip_nat_rule.h>
20#include <linux/netfilter_ipv4/ip_nat_protocol.h>
21
22static int
23udp_in_range(const struct ip_conntrack_tuple *tuple,
24 enum ip_nat_manip_type maniptype,
25 const union ip_conntrack_manip_proto *min,
26 const union ip_conntrack_manip_proto *max)
27{
28 __be16 port;
29
30 if (maniptype == IP_NAT_MANIP_SRC)
31 port = tuple->src.u.udp.port;
32 else
33 port = tuple->dst.u.udp.port;
34
35 return ntohs(port) >= ntohs(min->udp.port)
36 && ntohs(port) <= ntohs(max->udp.port);
37}
38
39static int
40udp_unique_tuple(struct ip_conntrack_tuple *tuple,
41 const struct ip_nat_range *range,
42 enum ip_nat_manip_type maniptype,
43 const struct ip_conntrack *conntrack)
44{
45 static u_int16_t port;
46 __be16 *portptr;
47 unsigned int range_size, min, i;
48
49 if (maniptype == IP_NAT_MANIP_SRC)
50 portptr = &tuple->src.u.udp.port;
51 else
52 portptr = &tuple->dst.u.udp.port;
53
54 /* If no range specified... */
55 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
56 /* If it's dst rewrite, can't change port */
57 if (maniptype == IP_NAT_MANIP_DST)
58 return 0;
59
60 if (ntohs(*portptr) < 1024) {
61			/* Loose convention: ports >= 512 are used for credential passing */
62			if (ntohs(*portptr) < 512) {
63 min = 1;
64 range_size = 511 - min + 1;
65 } else {
66 min = 600;
67 range_size = 1023 - min + 1;
68 }
69 } else {
70 min = 1024;
71 range_size = 65535 - 1024 + 1;
72 }
73 } else {
74 min = ntohs(range->min.udp.port);
75 range_size = ntohs(range->max.udp.port) - min + 1;
76 }
77
78 /* Start from random port to avoid prediction */
79 if (range->flags & IP_NAT_RANGE_PROTO_RANDOM)
80 port = net_random();
81
82 for (i = 0; i < range_size; i++, port++) {
83 *portptr = htons(min + port % range_size);
84 if (!ip_nat_used_tuple(tuple, conntrack))
85 return 1;
86 }
87 return 0;
88}
89
90static int
91udp_manip_pkt(struct sk_buff **pskb,
92 unsigned int iphdroff,
93 const struct ip_conntrack_tuple *tuple,
94 enum ip_nat_manip_type maniptype)
95{
96 struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
97 struct udphdr *hdr;
98 unsigned int hdroff = iphdroff + iph->ihl*4;
99 __be32 oldip, newip;
100 __be16 *portptr, newport;
101
102 if (!skb_make_writable(pskb, hdroff + sizeof(*hdr)))
103 return 0;
104
105 iph = (struct iphdr *)((*pskb)->data + iphdroff);
106 hdr = (struct udphdr *)((*pskb)->data + hdroff);
107
108 if (maniptype == IP_NAT_MANIP_SRC) {
109 /* Get rid of src ip and src pt */
110 oldip = iph->saddr;
111 newip = tuple->src.ip;
112 newport = tuple->src.u.udp.port;
113 portptr = &hdr->source;
114 } else {
115 /* Get rid of dst ip and dst pt */
116 oldip = iph->daddr;
117 newip = tuple->dst.ip;
118 newport = tuple->dst.u.udp.port;
119 portptr = &hdr->dest;
120 }
121
122 if (hdr->check || (*pskb)->ip_summed == CHECKSUM_PARTIAL) {
123 nf_proto_csum_replace4(&hdr->check, *pskb, oldip, newip, 1);
124 nf_proto_csum_replace2(&hdr->check, *pskb, *portptr, newport, 0);
125 if (!hdr->check)
126 hdr->check = CSUM_MANGLED_0;
127 }
128 *portptr = newport;
129 return 1;
130}
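
The final hdr->check test above is the UDP zero-checksum rule: on the wire a checksum of zero means "not computed", so a patched checksum that happens to fold to zero must be transmitted as 0xFFFF (CSUM_MANGLED_0); both values encode the same one's-complement sum. A minimal sketch:

#include <stdint.h>

static uint16_t udp_csum_fixup(uint16_t check)
{
	/* 0 is reserved on the wire for "checksum not computed";
	 * 0xFFFF encodes the same one's-complement sum as 0. */
	return check ? check : 0xFFFF;
}

int main(void)
{
	return udp_csum_fixup(0) == 0xFFFF ? 0 : 1;
}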
131
132struct ip_nat_protocol ip_nat_protocol_udp = {
133 .name = "UDP",
134 .protonum = IPPROTO_UDP,
135 .me = THIS_MODULE,
136 .manip_pkt = udp_manip_pkt,
137 .in_range = udp_in_range,
138 .unique_tuple = udp_unique_tuple,
139#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
140 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
141 .range_to_nfattr = ip_nat_port_range_to_nfattr,
142 .nfattr_to_range = ip_nat_port_nfattr_to_range,
143#endif
144};
diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c
deleted file mode 100644
index 3bf049517246..000000000000
--- a/net/ipv4/netfilter/ip_nat_proto_unknown.c
+++ /dev/null
@@ -1,55 +0,0 @@
1/* The "unknown" protocol. This is what is used for protocols we
2 * don't understand. It's returned by ip_ct_find_proto().
3 */
4
5/* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/types.h>
14#include <linux/init.h>
15#include <linux/netfilter.h>
16#include <linux/if.h>
17
18#include <linux/netfilter_ipv4/ip_nat.h>
19#include <linux/netfilter_ipv4/ip_nat_rule.h>
20#include <linux/netfilter_ipv4/ip_nat_protocol.h>
21
22static int unknown_in_range(const struct ip_conntrack_tuple *tuple,
23 enum ip_nat_manip_type manip_type,
24 const union ip_conntrack_manip_proto *min,
25 const union ip_conntrack_manip_proto *max)
26{
27 return 1;
28}
29
30static int unknown_unique_tuple(struct ip_conntrack_tuple *tuple,
31 const struct ip_nat_range *range,
32 enum ip_nat_manip_type maniptype,
33 const struct ip_conntrack *conntrack)
34{
35 /* Sorry: we can't help you; if it's not unique, we can't frob
36 anything. */
37 return 0;
38}
39
40static int
41unknown_manip_pkt(struct sk_buff **pskb,
42 unsigned int iphdroff,
43 const struct ip_conntrack_tuple *tuple,
44 enum ip_nat_manip_type maniptype)
45{
46 return 1;
47}
48
49struct ip_nat_protocol ip_nat_unknown_protocol = {
50 .name = "unknown",
51 /* .me isn't set: getting a ref to this cannot fail. */
52 .manip_pkt = unknown_manip_pkt,
53 .in_range = unknown_in_range,
54 .unique_tuple = unknown_unique_tuple,
55};
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
deleted file mode 100644
index 080eb1d92200..000000000000
--- a/net/ipv4/netfilter/ip_nat_rule.c
+++ /dev/null
@@ -1,314 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9/* Everything about the rules for NAT. */
10#include <linux/types.h>
11#include <linux/ip.h>
12#include <linux/netfilter.h>
13#include <linux/netfilter_ipv4.h>
14#include <linux/module.h>
15#include <linux/kmod.h>
16#include <linux/skbuff.h>
17#include <linux/proc_fs.h>
18#include <net/checksum.h>
19#include <net/route.h>
20#include <linux/bitops.h>
21
22#include <linux/netfilter_ipv4/ip_tables.h>
23#include <linux/netfilter_ipv4/ip_nat.h>
24#include <linux/netfilter_ipv4/ip_nat_core.h>
25#include <linux/netfilter_ipv4/ip_nat_rule.h>
26
27#if 0
28#define DEBUGP printk
29#else
30#define DEBUGP(format, args...)
31#endif
32
33#define NAT_VALID_HOOKS ((1<<NF_IP_PRE_ROUTING) | (1<<NF_IP_POST_ROUTING) | (1<<NF_IP_LOCAL_OUT))
34
35static struct
36{
37 struct ipt_replace repl;
38 struct ipt_standard entries[3];
39 struct ipt_error term;
40} nat_initial_table __initdata
41= { { "nat", NAT_VALID_HOOKS, 4,
42 sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
43 { [NF_IP_PRE_ROUTING] = 0,
44 [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard),
45 [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
46 { [NF_IP_PRE_ROUTING] = 0,
47 [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard),
48 [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
49 0, NULL, { } },
50 {
51 /* PRE_ROUTING */
52 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
53 0,
54 sizeof(struct ipt_entry),
55 sizeof(struct ipt_standard),
56 0, { 0, 0 }, { } },
57 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
58 -NF_ACCEPT - 1 } },
59 /* POST_ROUTING */
60 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
61 0,
62 sizeof(struct ipt_entry),
63 sizeof(struct ipt_standard),
64 0, { 0, 0 }, { } },
65 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
66 -NF_ACCEPT - 1 } },
67 /* LOCAL_OUT */
68 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
69 0,
70 sizeof(struct ipt_entry),
71 sizeof(struct ipt_standard),
72 0, { 0, 0 }, { } },
73 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
74 -NF_ACCEPT - 1 } }
75 },
76 /* ERROR */
77 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
78 0,
79 sizeof(struct ipt_entry),
80 sizeof(struct ipt_error),
81 0, { 0, 0 }, { } },
82 { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } },
83 { } },
84 "ERROR"
85 }
86 }
87};
88
89static struct xt_table nat_table = {
90 .name = "nat",
91 .valid_hooks = NAT_VALID_HOOKS,
92 .lock = RW_LOCK_UNLOCKED,
93 .me = THIS_MODULE,
94 .af = AF_INET,
95};
96
97/* Source NAT */
98static unsigned int ipt_snat_target(struct sk_buff **pskb,
99 const struct net_device *in,
100 const struct net_device *out,
101 unsigned int hooknum,
102 const struct xt_target *target,
103 const void *targinfo)
104{
105 struct ip_conntrack *ct;
106 enum ip_conntrack_info ctinfo;
107 const struct ip_nat_multi_range_compat *mr = targinfo;
108
109 IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING);
110
111 ct = ip_conntrack_get(*pskb, &ctinfo);
112
113 /* Connection must be valid and new. */
114 IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED
115 || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
116 IP_NF_ASSERT(out);
117
118 return ip_nat_setup_info(ct, &mr->range[0], hooknum);
119}
120
121/* Before 2.6.11 we did implicit source NAT if required. Warn about change. */
122static void warn_if_extra_mangle(__be32 dstip, __be32 srcip)
123{
124 static int warned = 0;
125 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } };
126 struct rtable *rt;
127
128 if (ip_route_output_key(&rt, &fl) != 0)
129 return;
130
131 if (rt->rt_src != srcip && !warned) {
132		printk("NAT: implicit source local NAT is no longer supported\n");
133 printk("NAT: packet src %u.%u.%u.%u -> dst %u.%u.%u.%u\n",
134 NIPQUAD(srcip), NIPQUAD(dstip));
135 warned = 1;
136 }
137 ip_rt_put(rt);
138}
139
140static unsigned int ipt_dnat_target(struct sk_buff **pskb,
141 const struct net_device *in,
142 const struct net_device *out,
143 unsigned int hooknum,
144 const struct xt_target *target,
145 const void *targinfo)
146{
147 struct ip_conntrack *ct;
148 enum ip_conntrack_info ctinfo;
149 const struct ip_nat_multi_range_compat *mr = targinfo;
150
151 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
152 || hooknum == NF_IP_LOCAL_OUT);
153
154 ct = ip_conntrack_get(*pskb, &ctinfo);
155
156 /* Connection must be valid and new. */
157 IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
158
159 if (hooknum == NF_IP_LOCAL_OUT
160 && mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
161 warn_if_extra_mangle((*pskb)->nh.iph->daddr,
162 mr->range[0].min_ip);
163
164 return ip_nat_setup_info(ct, &mr->range[0], hooknum);
165}
166
167static int ipt_snat_checkentry(const char *tablename,
168 const void *entry,
169 const struct xt_target *target,
170 void *targinfo,
171 unsigned int hook_mask)
172{
173 struct ip_nat_multi_range_compat *mr = targinfo;
174
175 /* Must be a valid range */
176 if (mr->rangesize != 1) {
177 printk("SNAT: multiple ranges no longer supported\n");
178 return 0;
179 }
180 return 1;
181}
182
183static int ipt_dnat_checkentry(const char *tablename,
184 const void *entry,
185 const struct xt_target *target,
186 void *targinfo,
187 unsigned int hook_mask)
188{
189 struct ip_nat_multi_range_compat *mr = targinfo;
190
191 /* Must be a valid range */
192 if (mr->rangesize != 1) {
193 printk("DNAT: multiple ranges no longer supported\n");
194 return 0;
195 }
196 if (mr->range[0].flags & IP_NAT_RANGE_PROTO_RANDOM) {
197 printk("DNAT: port randomization not supported\n");
198 return 0;
199 }
200 return 1;
201}
202
203inline unsigned int
204alloc_null_binding(struct ip_conntrack *conntrack,
205 struct ip_nat_info *info,
206 unsigned int hooknum)
207{
208 /* Force range to this IP; let proto decide mapping for
209 per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
210 Use reply in case it's already been mangled (eg local packet).
211 */
212 __be32 ip
213 = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
214 ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip
215 : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip);
216 struct ip_nat_range range
217 = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } };
218
219 DEBUGP("Allocating NULL binding for %p (%u.%u.%u.%u)\n", conntrack,
220 NIPQUAD(ip));
221 return ip_nat_setup_info(conntrack, &range, hooknum);
222}
223
224unsigned int
225alloc_null_binding_confirmed(struct ip_conntrack *conntrack,
226 struct ip_nat_info *info,
227 unsigned int hooknum)
228{
229 __be32 ip
230 = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
231 ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip
232 : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip);
233 u_int16_t all
234 = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
235 ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.all
236 : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.all);
237 struct ip_nat_range range
238 = { IP_NAT_RANGE_MAP_IPS, ip, ip, { all }, { all } };
239
240 DEBUGP("Allocating NULL binding for confirmed %p (%u.%u.%u.%u)\n",
241 conntrack, NIPQUAD(ip));
242 return ip_nat_setup_info(conntrack, &range, hooknum);
243}
244
245int ip_nat_rule_find(struct sk_buff **pskb,
246 unsigned int hooknum,
247 const struct net_device *in,
248 const struct net_device *out,
249 struct ip_conntrack *ct,
250 struct ip_nat_info *info)
251{
252 int ret;
253
254 ret = ipt_do_table(pskb, hooknum, in, out, &nat_table);
255
256 if (ret == NF_ACCEPT) {
257 if (!ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
258			/* NULL mapping */
259 ret = alloc_null_binding(ct, info, hooknum);
260 }
261 return ret;
262}
263
264static struct xt_target ipt_snat_reg = {
265 .name = "SNAT",
266 .family = AF_INET,
267 .target = ipt_snat_target,
268 .targetsize = sizeof(struct ip_nat_multi_range_compat),
269 .table = "nat",
270 .hooks = 1 << NF_IP_POST_ROUTING,
271 .checkentry = ipt_snat_checkentry,
272};
273
274static struct xt_target ipt_dnat_reg = {
275 .name = "DNAT",
276 .family = AF_INET,
277 .target = ipt_dnat_target,
278 .targetsize = sizeof(struct ip_nat_multi_range_compat),
279 .table = "nat",
280 .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT),
281 .checkentry = ipt_dnat_checkentry,
282};
283
284int __init ip_nat_rule_init(void)
285{
286 int ret;
287
288 ret = ipt_register_table(&nat_table, &nat_initial_table.repl);
289 if (ret != 0)
290 return ret;
291 ret = xt_register_target(&ipt_snat_reg);
292 if (ret != 0)
293 goto unregister_table;
294
295 ret = xt_register_target(&ipt_dnat_reg);
296 if (ret != 0)
297 goto unregister_snat;
298
299 return ret;
300
301 unregister_snat:
302 xt_unregister_target(&ipt_snat_reg);
303 unregister_table:
304 xt_unregister_table(&nat_table);
305
306 return ret;
307}
308
309void ip_nat_rule_cleanup(void)
310{
311 xt_unregister_target(&ipt_dnat_reg);
312 xt_unregister_target(&ipt_snat_reg);
313 ipt_unregister_table(&nat_table);
314}
diff --git a/net/ipv4/netfilter/ip_nat_sip.c b/net/ipv4/netfilter/ip_nat_sip.c
deleted file mode 100644
index 325c5a9dc2ef..000000000000
--- a/net/ipv4/netfilter/ip_nat_sip.c
+++ /dev/null
@@ -1,282 +0,0 @@
1/* SIP extension for UDP NAT alteration.
2 *
3 * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
4 * based on RR's ip_nat_ftp.c and other modules.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/ip.h>
14#include <linux/udp.h>
15
16#include <linux/netfilter_ipv4.h>
17#include <linux/netfilter_ipv4/ip_nat.h>
18#include <linux/netfilter_ipv4/ip_nat_helper.h>
19#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
20#include <linux/netfilter_ipv4/ip_conntrack_sip.h>
21
22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
24MODULE_DESCRIPTION("SIP NAT helper");
25
26#if 0
27#define DEBUGP printk
28#else
29#define DEBUGP(format, args...)
30#endif
31
32struct addr_map {
33 struct {
34 char src[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
35 char dst[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
36 unsigned int srclen, srciplen;
37 unsigned int dstlen, dstiplen;
38 } addr[IP_CT_DIR_MAX];
39};
40
41static void addr_map_init(struct ip_conntrack *ct, struct addr_map *map)
42{
43 struct ip_conntrack_tuple *t;
44 enum ip_conntrack_dir dir;
45 unsigned int n;
46
47 for (dir = 0; dir < IP_CT_DIR_MAX; dir++) {
48 t = &ct->tuplehash[dir].tuple;
49
50 n = sprintf(map->addr[dir].src, "%u.%u.%u.%u",
51 NIPQUAD(t->src.ip));
52 map->addr[dir].srciplen = n;
53 n += sprintf(map->addr[dir].src + n, ":%u",
54 ntohs(t->src.u.udp.port));
55 map->addr[dir].srclen = n;
56
57 n = sprintf(map->addr[dir].dst, "%u.%u.%u.%u",
58 NIPQUAD(t->dst.ip));
59 map->addr[dir].dstiplen = n;
60 n += sprintf(map->addr[dir].dst + n, ":%u",
61 ntohs(t->dst.u.udp.port));
62 map->addr[dir].dstlen = n;
63 }
64}
65
66static int map_sip_addr(struct sk_buff **pskb, enum ip_conntrack_info ctinfo,
67 struct ip_conntrack *ct, const char **dptr, size_t dlen,
68 enum sip_header_pos pos, struct addr_map *map)
69{
70 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
71 unsigned int matchlen, matchoff, addrlen;
72 char *addr;
73
74 if (ct_sip_get_info(*dptr, dlen, &matchoff, &matchlen, pos) <= 0)
75 return 1;
76
77 if ((matchlen == map->addr[dir].srciplen ||
78 matchlen == map->addr[dir].srclen) &&
79 memcmp(*dptr + matchoff, map->addr[dir].src, matchlen) == 0) {
80 addr = map->addr[!dir].dst;
81 addrlen = map->addr[!dir].dstlen;
82 } else if ((matchlen == map->addr[dir].dstiplen ||
83 matchlen == map->addr[dir].dstlen) &&
84 memcmp(*dptr + matchoff, map->addr[dir].dst, matchlen) == 0) {
85 addr = map->addr[!dir].src;
86 addrlen = map->addr[!dir].srclen;
87 } else
88 return 1;
89
90 if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo,
91 matchoff, matchlen, addr, addrlen))
92 return 0;
93 *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
94 return 1;
95
96}
97
98static unsigned int ip_nat_sip(struct sk_buff **pskb,
99 enum ip_conntrack_info ctinfo,
100 struct ip_conntrack *ct,
101 const char **dptr)
102{
103 enum sip_header_pos pos;
104 struct addr_map map;
105 int dataoff, datalen;
106
107 dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
108 datalen = (*pskb)->len - dataoff;
109 if (datalen < sizeof("SIP/2.0") - 1)
110 return NF_DROP;
111
112 addr_map_init(ct, &map);
113
114 /* Basic rules: requests and responses. */
115 if (strncmp(*dptr, "SIP/2.0", sizeof("SIP/2.0") - 1) != 0) {
116 /* 10.2: Constructing the REGISTER Request:
117 *
118 * The "userinfo" and "@" components of the SIP URI MUST NOT
119 * be present.
120 */
121 if (datalen >= sizeof("REGISTER") - 1 &&
122 strncmp(*dptr, "REGISTER", sizeof("REGISTER") - 1) == 0)
123 pos = POS_REG_REQ_URI;
124 else
125 pos = POS_REQ_URI;
126
127 if (!map_sip_addr(pskb, ctinfo, ct, dptr, datalen, pos, &map))
128 return NF_DROP;
129 }
130
131 if (!map_sip_addr(pskb, ctinfo, ct, dptr, datalen, POS_FROM, &map) ||
132 !map_sip_addr(pskb, ctinfo, ct, dptr, datalen, POS_TO, &map) ||
133 !map_sip_addr(pskb, ctinfo, ct, dptr, datalen, POS_VIA, &map) ||
134 !map_sip_addr(pskb, ctinfo, ct, dptr, datalen, POS_CONTACT, &map))
135 return NF_DROP;
136 return NF_ACCEPT;
137}
138
139static unsigned int mangle_sip_packet(struct sk_buff **pskb,
140 enum ip_conntrack_info ctinfo,
141 struct ip_conntrack *ct,
142 const char **dptr, size_t dlen,
143 char *buffer, int bufflen,
144 enum sip_header_pos pos)
145{
146 unsigned int matchlen, matchoff;
147
148 if (ct_sip_get_info(*dptr, dlen, &matchoff, &matchlen, pos) <= 0)
149 return 0;
150
151 if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo,
152 matchoff, matchlen, buffer, bufflen))
153 return 0;
154
155 /* We need to reload this. Thanks Patrick. */
156 *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
157 return 1;
158}
159
160static int mangle_content_len(struct sk_buff **pskb,
161 enum ip_conntrack_info ctinfo,
162 struct ip_conntrack *ct,
163 const char *dptr)
164{
165 unsigned int dataoff, matchoff, matchlen;
166 char buffer[sizeof("65536")];
167 int bufflen;
168
169 dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
170
171	/* Get the actual SDP length */
172 if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff,
173 &matchlen, POS_SDP_HEADER) > 0) {
174
175		/* since ct_sip_get_info() gives us a pointer past 'v=',
176		   we need to add 2 bytes to this count. */
177 int c_len = (*pskb)->len - dataoff - matchoff + 2;
178
179		/* Now, update the SDP length */
180 if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff,
181 &matchlen, POS_CONTENT) > 0) {
182
183 bufflen = sprintf(buffer, "%u", c_len);
184
185 return ip_nat_mangle_udp_packet(pskb, ct, ctinfo,
186 matchoff, matchlen,
187 buffer, bufflen);
188 }
189 }
190 return 0;
191}
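
A worked example of the "+ 2" adjustment above, assuming a 400-byte SIP payload whose SDP body starts with "v=" at offset 180: ct_sip_get_info() reports matchoff just past the "v=", i.e. 182, and 400 - 182 + 2 = 220, exactly the distance from "v=" to the end of the payload:

#include <stdio.h>

int main(void)
{
	unsigned int payload_len = 400;	/* assumed UDP payload size */
	unsigned int matchoff = 182;	/* returned just past "v=" at 180 */

	/* length of the SDP body, from "v=" to the end of the payload */
	printf("Content-Length: %u\n", payload_len - matchoff + 2); /* 220 */
	return 0;
}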
192
193static unsigned int mangle_sdp(struct sk_buff **pskb,
194 enum ip_conntrack_info ctinfo,
195 struct ip_conntrack *ct,
196 __be32 newip, u_int16_t port,
197 const char *dptr)
198{
199 char buffer[sizeof("nnn.nnn.nnn.nnn")];
200 unsigned int dataoff, bufflen;
201
202 dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
203
204 /* Mangle owner and contact info. */
205 bufflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(newip));
206 if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
207 buffer, bufflen, POS_OWNER))
208 return 0;
209
210 if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
211 buffer, bufflen, POS_CONNECTION))
212 return 0;
213
214 /* Mangle media port. */
215 bufflen = sprintf(buffer, "%u", port);
216 if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
217 buffer, bufflen, POS_MEDIA))
218 return 0;
219
220 return mangle_content_len(pskb, ctinfo, ct, dptr);
221}
222
223/* So, this packet has hit the connection tracking matching code.
224 Mangle it, and change the expectation to match the new version. */
225static unsigned int ip_nat_sdp(struct sk_buff **pskb,
226 enum ip_conntrack_info ctinfo,
227 struct ip_conntrack_expect *exp,
228 const char *dptr)
229{
230 struct ip_conntrack *ct = exp->master;
231 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
232 __be32 newip;
233 u_int16_t port;
234
235 DEBUGP("ip_nat_sdp():\n");
236
237 /* Connection will come from reply */
238 newip = ct->tuplehash[!dir].tuple.dst.ip;
239
240 exp->tuple.dst.ip = newip;
241 exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port;
242 exp->dir = !dir;
243
244	/* When you see the packet, we need to NAT it the same way
245	 * as this one. */
246 exp->expectfn = ip_nat_follow_master;
247
248 /* Try to get same port: if not, try to change it. */
249 for (port = ntohs(exp->saved_proto.udp.port); port != 0; port++) {
250 exp->tuple.dst.u.udp.port = htons(port);
251 if (ip_conntrack_expect_related(exp) == 0)
252 break;
253 }
254
255 if (port == 0)
256 return NF_DROP;
257
258 if (!mangle_sdp(pskb, ctinfo, ct, newip, port, dptr)) {
259 ip_conntrack_unexpect_related(exp);
260 return NF_DROP;
261 }
262 return NF_ACCEPT;
263}
264
265static void __exit fini(void)
266{
267 rcu_assign_pointer(ip_nat_sip_hook, NULL);
268 rcu_assign_pointer(ip_nat_sdp_hook, NULL);
269 synchronize_rcu();
270}
271
272static int __init init(void)
273{
274 BUG_ON(rcu_dereference(ip_nat_sip_hook));
275 BUG_ON(rcu_dereference(ip_nat_sdp_hook));
276 rcu_assign_pointer(ip_nat_sip_hook, ip_nat_sip);
277 rcu_assign_pointer(ip_nat_sdp_hook, ip_nat_sdp);
278 return 0;
279}
280
281module_init(init);
282module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
deleted file mode 100644
index e41d0efae515..000000000000
--- a/net/ipv4/netfilter/ip_nat_snmp_basic.c
+++ /dev/null
@@ -1,1333 +0,0 @@
1/*
2 * ip_nat_snmp_basic.c
3 *
4 * Basic SNMP Application Layer Gateway
5 *
6 * This IP NAT module is intended for use with SNMP network
7 * discovery and monitoring applications where target networks use
8 * conflicting private address realms.
9 *
10 * Static NAT is used to remap the networks from the view of the network
11 * management system at the IP layer, and this module remaps some application
12 * layer addresses to match.
13 *
14 * The simplest form of ALG is performed, where only tagged IP addresses
15 * are modified. The module does not need to be MIB aware and only scans
16 * messages at the ASN.1/BER level.
17 *
18 * Currently, only SNMPv1 and SNMPv2 are supported.
19 *
20 * More information on ALG and associated issues can be found in
21 * RFC 2962
22 *
23 * The ASN.1/BER parsing code is derived from the gxsnmp package by Gregory
24 * McLean & Jochen Friedrich, stripped down for use in the kernel.
25 *
26 * Copyright (c) 2000 RP Internet (www.rpi.net.au).
27 *
28 * This program is free software; you can redistribute it and/or modify
29 * it under the terms of the GNU General Public License as published by
30 * the Free Software Foundation; either version 2 of the License, or
31 * (at your option) any later version.
32 * This program is distributed in the hope that it will be useful,
33 * but WITHOUT ANY WARRANTY; without even the implied warranty of
34 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
35 * GNU General Public License for more details.
36 * You should have received a copy of the GNU General Public License
37 * along with this program; if not, write to the Free Software
38 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
39 *
40 * Author: James Morris <jmorris@intercode.com.au>
41 *
42 * Updates:
43 * 2000-08-06: Convert to new helper API (Harald Welte).
44 *
45 */
46#include <linux/in.h>
47#include <linux/module.h>
48#include <linux/types.h>
49#include <linux/kernel.h>
50#include <linux/moduleparam.h>
51#include <linux/netfilter_ipv4.h>
52#include <linux/netfilter_ipv4/ip_nat.h>
53#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
54#include <linux/netfilter_ipv4/ip_nat_helper.h>
55#include <linux/ip.h>
56#include <linux/udp.h>
57#include <net/checksum.h>
58#include <net/udp.h>
59#include <asm/uaccess.h>
60
61MODULE_LICENSE("GPL");
62MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
63MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway");
64
65#define SNMP_PORT 161
66#define SNMP_TRAP_PORT 162
67#define NOCT1(n) (*(u8 *)n)
68
69static int debug;
70static DEFINE_SPINLOCK(snmp_lock);
71
72/*
73 * Application layer address mapping mimics the NAT mapping, but
74 * only for the first octet in this case (a more flexible system
75 * can be implemented if needed).
76 */
77struct oct1_map
78{
79 u_int8_t from;
80 u_int8_t to;
81};
82
83
84/*****************************************************************************
85 *
86 * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse)
87 *
88 *****************************************************************************/
89
90/* Class */
91#define ASN1_UNI 0 /* Universal */
92#define ASN1_APL 1 /* Application */
93#define ASN1_CTX 2 /* Context */
94#define ASN1_PRV 3 /* Private */
95
96/* Tag */
97#define ASN1_EOC 0 /* End Of Contents */
98#define ASN1_BOL 1 /* Boolean */
99#define ASN1_INT 2 /* Integer */
100#define ASN1_BTS 3 /* Bit String */
101#define ASN1_OTS 4 /* Octet String */
102#define ASN1_NUL 5 /* Null */
103#define ASN1_OJI 6 /* Object Identifier */
104#define ASN1_OJD 7 /* Object Description */
105#define ASN1_EXT 8 /* External */
106#define ASN1_SEQ 16 /* Sequence */
107#define ASN1_SET 17 /* Set */
108#define ASN1_NUMSTR 18 /* Numerical String */
109#define ASN1_PRNSTR 19 /* Printable String */
110#define ASN1_TEXSTR 20 /* Teletext String */
111#define ASN1_VIDSTR 21 /* Video String */
112#define ASN1_IA5STR 22 /* IA5 String */
113#define ASN1_UNITIM 23 /* Universal Time */
114#define ASN1_GENTIM 24 /* General Time */
115#define ASN1_GRASTR 25 /* Graphical String */
116#define ASN1_VISSTR 26 /* Visible String */
117#define ASN1_GENSTR 27 /* General String */
118
119/* Primitive / Constructed methods*/
120#define ASN1_PRI 0 /* Primitive */
121#define ASN1_CON 1 /* Constructed */
122
123/*
124 * Error codes.
125 */
126#define ASN1_ERR_NOERROR 0
127#define ASN1_ERR_DEC_EMPTY 2
128#define ASN1_ERR_DEC_EOC_MISMATCH 3
129#define ASN1_ERR_DEC_LENGTH_MISMATCH 4
130#define ASN1_ERR_DEC_BADVALUE 5
131
132/*
133 * ASN.1 context.
134 */
135struct asn1_ctx
136{
137 int error; /* Error condition */
138 unsigned char *pointer; /* Octet just to be decoded */
139 unsigned char *begin; /* First octet */
140 unsigned char *end; /* Octet after last octet */
141};
142
143/*
144 * Octet string (not null terminated)
145 */
146struct asn1_octstr
147{
148 unsigned char *data;
149 unsigned int len;
150};
151
152static void asn1_open(struct asn1_ctx *ctx,
153 unsigned char *buf,
154 unsigned int len)
155{
156 ctx->begin = buf;
157 ctx->end = buf + len;
158 ctx->pointer = buf;
159 ctx->error = ASN1_ERR_NOERROR;
160}
161
162static unsigned char asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch)
163{
164 if (ctx->pointer >= ctx->end) {
165 ctx->error = ASN1_ERR_DEC_EMPTY;
166 return 0;
167 }
168 *ch = *(ctx->pointer)++;
169 return 1;
170}
171
172static unsigned char asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag)
173{
174 unsigned char ch;
175
176 *tag = 0;
177
178 do
179 {
180 if (!asn1_octet_decode(ctx, &ch))
181 return 0;
182 *tag <<= 7;
183 *tag |= ch & 0x7F;
184 } while ((ch & 0x80) == 0x80);
185 return 1;
186}
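
asn1_tag_decode() handles the BER high-tag-number form: each octet contributes seven bits and the 0x80 bit flags a continuation. A standalone sketch with a worked value:

#include <stdio.h>

static unsigned int tag_decode(const unsigned char *p)
{
	unsigned int tag = 0;
	unsigned char ch;

	do {
		ch = *p++;
		tag <<= 7;
		tag |= ch & 0x7F;	/* low seven bits of each octet */
	} while (ch & 0x80);		/* 0x80 means "more octets follow" */
	return tag;
}

int main(void)
{
	unsigned char enc[] = { 0x87, 0x68 };

	/* (7 << 7) | 0x68 = 1000 */
	printf("tag = %u\n", tag_decode(enc));
	return 0;
}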
187
188static unsigned char asn1_id_decode(struct asn1_ctx *ctx,
189 unsigned int *cls,
190 unsigned int *con,
191 unsigned int *tag)
192{
193 unsigned char ch;
194
195 if (!asn1_octet_decode(ctx, &ch))
196 return 0;
197
198 *cls = (ch & 0xC0) >> 6;
199 *con = (ch & 0x20) >> 5;
200 *tag = (ch & 0x1F);
201
202 if (*tag == 0x1F) {
203 if (!asn1_tag_decode(ctx, tag))
204 return 0;
205 }
206 return 1;
207}
208
209static unsigned char asn1_length_decode(struct asn1_ctx *ctx,
210 unsigned int *def,
211 unsigned int *len)
212{
213 unsigned char ch, cnt;
214
215 if (!asn1_octet_decode(ctx, &ch))
216 return 0;
217
218 if (ch == 0x80)
219 *def = 0;
220 else {
221 *def = 1;
222
223 if (ch < 0x80)
224 *len = ch;
225 else {
226 cnt = (unsigned char) (ch & 0x7F);
227 *len = 0;
228
229 while (cnt > 0) {
230 if (!asn1_octet_decode(ctx, &ch))
231 return 0;
232 *len <<= 8;
233 *len |= ch;
234 cnt--;
235 }
236 }
237 }
238 return 1;
239}
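
asn1_length_decode() covers the three BER length forms: short form (one octet below 0x80), long form (0x80 | n followed by n big-endian length octets), and the indefinite form (a bare 0x80, terminated later by an end-of-contents marker). A standalone sketch over raw length octets:

#include <stdio.h>

/* Returns 1 with *len set for definite lengths, 0 for the indefinite form. */
static int length_decode(const unsigned char *p, unsigned int *len)
{
	unsigned char ch = *p++;

	if (ch == 0x80)
		return 0;			/* indefinite: wait for EOC */
	if (ch < 0x80) {
		*len = ch;			/* short form */
	} else {
		unsigned int cnt = ch & 0x7F;	/* long form: cnt octets follow */

		*len = 0;
		while (cnt--) {
			*len <<= 8;
			*len |= *p++;
		}
	}
	return 1;
}

int main(void)
{
	unsigned char longform[] = { 0x82, 0x01, 0x90 };
	unsigned int len;

	if (length_decode(longform, &len))
		printf("length = %u\n", len);	/* 0x0190 = 400 */
	return 0;
}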
240
241static unsigned char asn1_header_decode(struct asn1_ctx *ctx,
242 unsigned char **eoc,
243 unsigned int *cls,
244 unsigned int *con,
245 unsigned int *tag)
246{
247 unsigned int def, len;
248
249 if (!asn1_id_decode(ctx, cls, con, tag))
250 return 0;
251
252 def = len = 0;
253 if (!asn1_length_decode(ctx, &def, &len))
254 return 0;
255
256 if (def)
257 *eoc = ctx->pointer + len;
258 else
259 *eoc = NULL;
260 return 1;
261}
262
263static unsigned char asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc)
264{
265 unsigned char ch;
266
267	if (eoc == NULL) {
268 if (!asn1_octet_decode(ctx, &ch))
269 return 0;
270
271 if (ch != 0x00) {
272 ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
273 return 0;
274 }
275
276 if (!asn1_octet_decode(ctx, &ch))
277 return 0;
278
279 if (ch != 0x00) {
280 ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
281 return 0;
282 }
283 return 1;
284 } else {
285 if (ctx->pointer != eoc) {
286 ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH;
287 return 0;
288 }
289 return 1;
290 }
291}
292
293static unsigned char asn1_null_decode(struct asn1_ctx *ctx, unsigned char *eoc)
294{
295 ctx->pointer = eoc;
296 return 1;
297}
298
299static unsigned char asn1_long_decode(struct asn1_ctx *ctx,
300 unsigned char *eoc,
301 long *integer)
302{
303 unsigned char ch;
304 unsigned int len;
305
306 if (!asn1_octet_decode(ctx, &ch))
307 return 0;
308
309 *integer = (signed char) ch;
310 len = 1;
311
312 while (ctx->pointer < eoc) {
313 if (++len > sizeof (long)) {
314 ctx->error = ASN1_ERR_DEC_BADVALUE;
315 return 0;
316 }
317
318 if (!asn1_octet_decode(ctx, &ch))
319 return 0;
320
321 *integer <<= 8;
322 *integer |= ch;
323 }
324 return 1;
325}
326
327static unsigned char asn1_uint_decode(struct asn1_ctx *ctx,
328 unsigned char *eoc,
329 unsigned int *integer)
330{
331 unsigned char ch;
332 unsigned int len;
333
334 if (!asn1_octet_decode(ctx, &ch))
335 return 0;
336
337 *integer = ch;
338 if (ch == 0) len = 0;
339 else len = 1;
340
341 while (ctx->pointer < eoc) {
342 if (++len > sizeof (unsigned int)) {
343 ctx->error = ASN1_ERR_DEC_BADVALUE;
344 return 0;
345 }
346
347 if (!asn1_octet_decode(ctx, &ch))
348 return 0;
349
350 *integer <<= 8;
351 *integer |= ch;
352 }
353 return 1;
354}
355
356static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx,
357 unsigned char *eoc,
358 unsigned long *integer)
359{
360 unsigned char ch;
361 unsigned int len;
362
363 if (!asn1_octet_decode(ctx, &ch))
364 return 0;
365
366 *integer = ch;
367 if (ch == 0) len = 0;
368 else len = 1;
369
370 while (ctx->pointer < eoc) {
371 if (++len > sizeof (unsigned long)) {
372 ctx->error = ASN1_ERR_DEC_BADVALUE;
373 return 0;
374 }
375
376 if (!asn1_octet_decode(ctx, &ch))
377 return 0;
378
379 *integer <<= 8;
380 *integer |= ch;
381 }
382 return 1;
383}
384
385static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,
386 unsigned char *eoc,
387 unsigned char **octets,
388 unsigned int *len)
389{
390 unsigned char *ptr;
391
392 *len = 0;
393
394 *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC);
395 if (*octets == NULL) {
396 if (net_ratelimit())
397 printk("OOM in bsalg (%d)\n", __LINE__);
398 return 0;
399 }
400
401 ptr = *octets;
402 while (ctx->pointer < eoc) {
403 if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) {
404 kfree(*octets);
405 *octets = NULL;
406 return 0;
407 }
408 (*len)++;
409 }
410 return 1;
411}
412
413static unsigned char asn1_subid_decode(struct asn1_ctx *ctx,
414 unsigned long *subid)
415{
416 unsigned char ch;
417
418 *subid = 0;
419
420 do {
421 if (!asn1_octet_decode(ctx, &ch))
422 return 0;
423
424 *subid <<= 7;
425 *subid |= ch & 0x7F;
426 } while ((ch & 0x80) == 0x80);
427 return 1;
428}
429
430static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
431 unsigned char *eoc,
432 unsigned long **oid,
433 unsigned int *len)
434{
435 unsigned long subid;
436 unsigned int size;
437 unsigned long *optr;
438
439 size = eoc - ctx->pointer + 1;
440 *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
441 if (*oid == NULL) {
442 if (net_ratelimit())
443 printk("OOM in bsalg (%d)\n", __LINE__);
444 return 0;
445 }
446
447 optr = *oid;
448
449 if (!asn1_subid_decode(ctx, &subid)) {
450 kfree(*oid);
451 *oid = NULL;
452 return 0;
453 }
454
455 if (subid < 40) {
456 optr [0] = 0;
457 optr [1] = subid;
458 } else if (subid < 80) {
459 optr [0] = 1;
460 optr [1] = subid - 40;
461 } else {
462 optr [0] = 2;
463 optr [1] = subid - 80;
464 }
465
466 *len = 2;
467 optr += 2;
468
469 while (ctx->pointer < eoc) {
470 if (++(*len) > size) {
471 ctx->error = ASN1_ERR_DEC_BADVALUE;
472 kfree(*oid);
473 *oid = NULL;
474 return 0;
475 }
476
477 if (!asn1_subid_decode(ctx, optr++)) {
478 kfree(*oid);
479 *oid = NULL;
480 return 0;
481 }
482 }
483 return 1;
484}
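
The subid < 40 / < 80 ladder above undoes the BER packing of the first two OID arcs into one subidentifier as 40 * X + Y. A worked example: the common leading octet 0x2B (43) decodes to 1.3, the iso.org prefix of nearly every SNMP OID:

#include <stdio.h>

int main(void)
{
	unsigned long subid = 0x2B;	/* first encoded subidentifier */
	unsigned long arc0, arc1;

	if (subid < 40) {
		arc0 = 0; arc1 = subid;
	} else if (subid < 80) {
		arc0 = 1; arc1 = subid - 40;
	} else {
		arc0 = 2; arc1 = subid - 80;
	}
	printf("%lu.%lu\n", arc0, arc1);	/* prints 1.3 */
	return 0;
}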
485
486/*****************************************************************************
487 *
488 * SNMP decoding routines (gxsnmp author Dirk Wisse)
489 *
490 *****************************************************************************/
491
492/* SNMP Versions */
493#define SNMP_V1 0
494#define SNMP_V2C 1
495#define SNMP_V2 2
496#define SNMP_V3 3
497
498/* Default Sizes */
499#define SNMP_SIZE_COMM 256
500#define SNMP_SIZE_OBJECTID 128
501#define SNMP_SIZE_BUFCHR 256
502#define SNMP_SIZE_BUFINT 128
503#define SNMP_SIZE_SMALLOBJECTID 16
504
505/* Requests */
506#define SNMP_PDU_GET 0
507#define SNMP_PDU_NEXT 1
508#define SNMP_PDU_RESPONSE 2
509#define SNMP_PDU_SET 3
510#define SNMP_PDU_TRAP1 4
511#define SNMP_PDU_BULK 5
512#define SNMP_PDU_INFORM 6
513#define SNMP_PDU_TRAP2 7
514
515/* Errors */
516#define SNMP_NOERROR 0
517#define SNMP_TOOBIG 1
518#define SNMP_NOSUCHNAME 2
519#define SNMP_BADVALUE 3
520#define SNMP_READONLY 4
521#define SNMP_GENERROR 5
522#define SNMP_NOACCESS 6
523#define SNMP_WRONGTYPE 7
524#define SNMP_WRONGLENGTH 8
525#define SNMP_WRONGENCODING 9
526#define SNMP_WRONGVALUE 10
527#define SNMP_NOCREATION 11
528#define SNMP_INCONSISTENTVALUE 12
529#define SNMP_RESOURCEUNAVAILABLE 13
530#define SNMP_COMMITFAILED 14
531#define SNMP_UNDOFAILED 15
532#define SNMP_AUTHORIZATIONERROR 16
533#define SNMP_NOTWRITABLE 17
534#define SNMP_INCONSISTENTNAME 18
535
536/* General SNMP V1 Traps */
537#define SNMP_TRAP_COLDSTART 0
538#define SNMP_TRAP_WARMSTART 1
539#define SNMP_TRAP_LINKDOWN 2
540#define SNMP_TRAP_LINKUP 3
541#define SNMP_TRAP_AUTFAILURE 4
542#define SNMP_TRAP_EQPNEIGHBORLOSS 5
543#define SNMP_TRAP_ENTSPECIFIC 6
544
545/* SNMPv1 Types */
546#define SNMP_NULL 0
547#define SNMP_INTEGER 1 /* l */
548#define SNMP_OCTETSTR 2 /* c */
549#define SNMP_DISPLAYSTR 2 /* c */
550#define SNMP_OBJECTID 3 /* ul */
551#define SNMP_IPADDR 4 /* uc */
552#define SNMP_COUNTER 5 /* ul */
553#define SNMP_GAUGE 6 /* ul */
554#define SNMP_TIMETICKS 7 /* ul */
555#define SNMP_OPAQUE 8 /* c */
556
557/* Additional SNMPv2 Types */
558#define SNMP_UINTEGER 5 /* ul */
559#define SNMP_BITSTR 9 /* uc */
560#define SNMP_NSAP 10 /* uc */
561#define SNMP_COUNTER64 11 /* ul */
562#define SNMP_NOSUCHOBJECT 12
563#define SNMP_NOSUCHINSTANCE 13
564#define SNMP_ENDOFMIBVIEW 14
565
566union snmp_syntax
567{
568 unsigned char uc[0]; /* 8 bit unsigned */
569 char c[0]; /* 8 bit signed */
570 unsigned long ul[0]; /* 32 bit unsigned */
571 long l[0]; /* 32 bit signed */
572};
573
574struct snmp_object
575{
576 unsigned long *id;
577 unsigned int id_len;
578 unsigned short type;
579 unsigned int syntax_len;
580 union snmp_syntax syntax;
581};
582
583struct snmp_request
584{
585 unsigned long id;
586 unsigned int error_status;
587 unsigned int error_index;
588};
589
590struct snmp_v1_trap
591{
592 unsigned long *id;
593 unsigned int id_len;
594 unsigned long ip_address; /* pointer */
595 unsigned int general;
596 unsigned int specific;
597 unsigned long time;
598};
599
600/* SNMP types */
601#define SNMP_IPA 0
602#define SNMP_CNT 1
603#define SNMP_GGE 2
604#define SNMP_TIT 3
605#define SNMP_OPQ 4
606#define SNMP_C64 6
607
608/* SNMP errors */
609#define SERR_NSO 0
610#define SERR_NSI 1
611#define SERR_EOM 2
612
613static inline void mangle_address(unsigned char *begin,
614 unsigned char *addr,
615 const struct oct1_map *map,
616 __sum16 *check);
617struct snmp_cnv
618{
619 unsigned int class;
620 unsigned int tag;
621 int syntax;
622};
623
624static struct snmp_cnv snmp_conv [] =
625{
626 {ASN1_UNI, ASN1_NUL, SNMP_NULL},
627 {ASN1_UNI, ASN1_INT, SNMP_INTEGER},
628 {ASN1_UNI, ASN1_OTS, SNMP_OCTETSTR},
629 {ASN1_UNI, ASN1_OTS, SNMP_DISPLAYSTR},
630 {ASN1_UNI, ASN1_OJI, SNMP_OBJECTID},
631 {ASN1_APL, SNMP_IPA, SNMP_IPADDR},
632 {ASN1_APL, SNMP_CNT, SNMP_COUNTER}, /* Counter32 */
633 {ASN1_APL, SNMP_GGE, SNMP_GAUGE}, /* Gauge32 == Unsigned32 */
634 {ASN1_APL, SNMP_TIT, SNMP_TIMETICKS},
635 {ASN1_APL, SNMP_OPQ, SNMP_OPAQUE},
636
637 /* SNMPv2 data types and errors */
638 {ASN1_UNI, ASN1_BTS, SNMP_BITSTR},
639 {ASN1_APL, SNMP_C64, SNMP_COUNTER64},
640 {ASN1_CTX, SERR_NSO, SNMP_NOSUCHOBJECT},
641 {ASN1_CTX, SERR_NSI, SNMP_NOSUCHINSTANCE},
642 {ASN1_CTX, SERR_EOM, SNMP_ENDOFMIBVIEW},
643 {0, 0, -1}
644};
645
646static unsigned char snmp_tag_cls2syntax(unsigned int tag,
647 unsigned int cls,
648 unsigned short *syntax)
649{
650 struct snmp_cnv *cnv;
651
652 cnv = snmp_conv;
653
654 while (cnv->syntax != -1) {
655 if (cnv->tag == tag && cnv->class == cls) {
656 *syntax = cnv->syntax;
657 return 1;
658 }
659 cnv++;
660 }
661 return 0;
662}
663
664static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
665 struct snmp_object **obj)
666{
667 unsigned int cls, con, tag, len, idlen;
668 unsigned short type;
669 unsigned char *eoc, *end, *p;
670 unsigned long *lp, *id;
671 unsigned long ul;
672 long l;
673
674 *obj = NULL;
675 id = NULL;
676
677 if (!asn1_header_decode(ctx, &eoc, &cls, &con, &tag))
678 return 0;
679
680 if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
681 return 0;
682
683 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
684 return 0;
685
686 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI)
687 return 0;
688
689 if (!asn1_oid_decode(ctx, end, &id, &idlen))
690 return 0;
691
692 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) {
693 kfree(id);
694 return 0;
695 }
696
697 if (con != ASN1_PRI) {
698 kfree(id);
699 return 0;
700 }
701
702 type = 0;
703 if (!snmp_tag_cls2syntax(tag, cls, &type)) {
704 kfree(id);
705 return 0;
706 }
707
708 l = 0;
709 switch (type) {
710 case SNMP_INTEGER:
711 len = sizeof(long);
712 if (!asn1_long_decode(ctx, end, &l)) {
713 kfree(id);
714 return 0;
715 }
716 *obj = kmalloc(sizeof(struct snmp_object) + len,
717 GFP_ATOMIC);
718 if (*obj == NULL) {
719 kfree(id);
720 if (net_ratelimit())
721 printk("OOM in bsalg (%d)\n", __LINE__);
722 return 0;
723 }
724 (*obj)->syntax.l[0] = l;
725 break;
726 case SNMP_OCTETSTR:
727 case SNMP_OPAQUE:
728 if (!asn1_octets_decode(ctx, end, &p, &len)) {
729 kfree(id);
730 return 0;
731 }
732 *obj = kmalloc(sizeof(struct snmp_object) + len,
733 GFP_ATOMIC);
734 if (*obj == NULL) {
735 kfree(id);
736 if (net_ratelimit())
737 printk("OOM in bsalg (%d)\n", __LINE__);
738 return 0;
739 }
740 memcpy((*obj)->syntax.c, p, len);
741 kfree(p);
742 break;
743 case SNMP_NULL:
744 case SNMP_NOSUCHOBJECT:
745 case SNMP_NOSUCHINSTANCE:
746 case SNMP_ENDOFMIBVIEW:
747 len = 0;
748 *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
749 if (*obj == NULL) {
750 kfree(id);
751 if (net_ratelimit())
752 printk("OOM in bsalg (%d)\n", __LINE__);
753 return 0;
754 }
755 if (!asn1_null_decode(ctx, end)) {
756 kfree(id);
757 kfree(*obj);
758 *obj = NULL;
759 return 0;
760 }
761 break;
762 case SNMP_OBJECTID:
763 if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) {
764 kfree(id);
765 return 0;
766 }
767 len *= sizeof(unsigned long);
768 *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
769 if (*obj == NULL) {
770 kfree(lp);
771 kfree(id);
772 if (net_ratelimit())
773 printk("OOM in bsalg (%d)\n", __LINE__);
774 return 0;
775 }
776 memcpy((*obj)->syntax.ul, lp, len);
777 kfree(lp);
778 break;
779 case SNMP_IPADDR:
780 if (!asn1_octets_decode(ctx, end, &p, &len)) {
781 kfree(id);
782 return 0;
783 }
784 if (len != 4) {
785 kfree(p);
786 kfree(id);
787 return 0;
788 }
789 *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
790 if (*obj == NULL) {
791 kfree(p);
792 kfree(id);
793 if (net_ratelimit())
794 printk("OOM in bsalg (%d)\n", __LINE__);
795 return 0;
796 }
797 memcpy((*obj)->syntax.uc, p, len);
798 kfree(p);
799 break;
800 case SNMP_COUNTER:
801 case SNMP_GAUGE:
802 case SNMP_TIMETICKS:
803 len = sizeof(unsigned long);
804 if (!asn1_ulong_decode(ctx, end, &ul)) {
805 kfree(id);
806 return 0;
807 }
808 *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
809 if (*obj == NULL) {
810 kfree(id);
811 if (net_ratelimit())
812 printk("OOM in bsalg (%d)\n", __LINE__);
813 return 0;
814 }
815 (*obj)->syntax.ul[0] = ul;
816 break;
817 default:
818 kfree(id);
819 return 0;
820 }
821
822 (*obj)->syntax_len = len;
823 (*obj)->type = type;
824 (*obj)->id = id;
825 (*obj)->id_len = idlen;
826
827 if (!asn1_eoc_decode(ctx, eoc)) {
828 kfree(id);
829 kfree(*obj);
830 *obj = NULL;
831 return 0;
832 }
833 return 1;
834}
835
836static unsigned char snmp_request_decode(struct asn1_ctx *ctx,
837 struct snmp_request *request)
838{
839 unsigned int cls, con, tag;
840 unsigned char *end;
841
842 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
843 return 0;
844
845 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
846 return 0;
847
848 if (!asn1_ulong_decode(ctx, end, &request->id))
849 return 0;
850
851 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
852 return 0;
853
854 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
855 return 0;
856
857 if (!asn1_uint_decode(ctx, end, &request->error_status))
858 return 0;
859
860 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
861 return 0;
862
863 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
864 return 0;
865
866 if (!asn1_uint_decode(ctx, end, &request->error_index))
867 return 0;
868
869 return 1;
870}
871
872/*
873 * Fast checksum update for possibly oddly-aligned UDP byte, from the
874 * code example in the draft.
875 */
876static void fast_csum(__sum16 *csum,
877 const unsigned char *optr,
878 const unsigned char *nptr,
879 int offset)
880{
881 unsigned char s[4];
882
883 if (offset & 1) {
884 s[0] = s[2] = 0;
885 s[1] = ~*optr;
886 s[3] = *nptr;
887 } else {
888 s[1] = s[3] = 0;
889 s[0] = ~*optr;
890 s[2] = *nptr;
891 }
892
893 *csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum)));
894}
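
fast_csum() folds the complemented old byte and the new byte into the existing UDP checksum instead of recomputing it over the whole datagram. Strictly, the RFC 1624 update complements the entire 16-bit checksum word, so the byte sitting next to ~*optr would read 0xff rather than 0; the standalone sketch below uses that full-word form. Everything here is illustrative user-space code, not the kernel interface:

#include <stdint.h>
#include <stdio.h>

static uint16_t csum_update_byte(uint16_t csum, uint8_t oldb,
                                 uint8_t newb, int offset)
{
    /* in network order, an even offset puts the byte in the high
     * half of its 16-bit checksum word, an odd offset in the low */
    uint16_t oldw = (offset & 1) ? oldb : (uint16_t)(oldb << 8);
    uint16_t neww = (offset & 1) ? newb : (uint16_t)(newb << 8);
    uint32_t sum  = (uint16_t)~csum;      /* ~HC */
    sum += (uint16_t)~oldw;               /* + ~m (full-word complement) */
    sum += neww;                          /* + m' */
    while (sum >> 16)                     /* end-around carry */
        sum = (sum & 0xffff) + (sum >> 16);
    return (uint16_t)~sum;                /* HC' = ~(~HC + ~m + m') */
}

int main(void)
{
    /* datagram whose only non-zero word is 0x1234: checksum ~0x1234 */
    uint16_t csum = (uint16_t)~0x1234;
    /* rewrite byte 0x12 (offset 0) to 0x56; expect ~0x5634 = 0xa9cb */
    printf("0x%04x\n", csum_update_byte(csum, 0x12, 0x56, 0));
    return 0;
}
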
895
896/*
897 * Mangle IP address.
898 * - begin points to the start of the snmp message
899 * - addr points to the start of the address
900 */
901static inline void mangle_address(unsigned char *begin,
902 unsigned char *addr,
903 const struct oct1_map *map,
904 __sum16 *check)
905{
906 if (map->from == NOCT1(addr)) {
907 u_int32_t old;
908
909 if (debug)
910 memcpy(&old, (unsigned char *)addr, sizeof(old));
911
912 *addr = map->to;
913
914 /* Update UDP checksum if being used */
915 if (*check) {
916 fast_csum(check,
917 &map->from, &map->to, addr - begin);
918 }
919
920 if (debug)
921 printk(KERN_DEBUG "bsalg: mapped %u.%u.%u.%u to "
922 "%u.%u.%u.%u\n", NIPQUAD(old), NIPQUAD(*addr));
923 }
924}
925
926static unsigned char snmp_trap_decode(struct asn1_ctx *ctx,
927 struct snmp_v1_trap *trap,
928 const struct oct1_map *map,
929 __sum16 *check)
930{
931 unsigned int cls, con, tag, len;
932 unsigned char *end;
933
934 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
935 return 0;
936
937 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI)
938 return 0;
939
940 if (!asn1_oid_decode(ctx, end, &trap->id, &trap->id_len))
941 return 0;
942
943 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
944 goto err_id_free;
945
946 if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_IPA) ||
947 (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_OTS)))
948 goto err_id_free;
949
950 if (!asn1_octets_decode(ctx, end, (unsigned char **)&trap->ip_address, &len))
951 goto err_id_free;
952
953 /* IPv4 only */
954 if (len != 4)
955 goto err_addr_free;
956
957 mangle_address(ctx->begin, ctx->pointer - 4, map, check);
958
959 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
960 goto err_addr_free;
961
962 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
963 goto err_addr_free;
964
965 if (!asn1_uint_decode(ctx, end, &trap->general))
966 goto err_addr_free;
967
968 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
969 goto err_addr_free;
970
971 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
972 goto err_addr_free;
973
974 if (!asn1_uint_decode(ctx, end, &trap->specific))
975 goto err_addr_free;
976
977 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
978 goto err_addr_free;
979
980 if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_TIT) ||
981 (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT)))
982 goto err_addr_free;
983
984 if (!asn1_ulong_decode(ctx, end, &trap->time))
985 goto err_addr_free;
986
987 return 1;
988
989err_addr_free:
990 kfree((unsigned long *)trap->ip_address);
991
992err_id_free:
993 kfree(trap->id);
994
995 return 0;
996}
997
998/*****************************************************************************
999 *
1000 * Misc. routines
1001 *
1002 *****************************************************************************/
1003
1004static void hex_dump(unsigned char *buf, size_t len)
1005{
1006 size_t i;
1007
1008 for (i = 0; i < len; i++) {
1009 if (i && !(i % 16))
1010 printk("\n");
1011 printk("%02x ", *(buf + i));
1012 }
1013 printk("\n");
1014}
1015
1016/*
1017 * Parse and mangle SNMP message according to mapping.
1018 * (And this is the fucking 'basic' method).
1019 */
1020static int snmp_parse_mangle(unsigned char *msg,
1021 u_int16_t len,
1022 const struct oct1_map *map,
1023 __sum16 *check)
1024{
1025 unsigned char *eoc, *end;
1026 unsigned int cls, con, tag, vers, pdutype;
1027 struct asn1_ctx ctx;
1028 struct asn1_octstr comm;
1029 struct snmp_object **obj;
1030
1031 if (debug > 1)
1032 hex_dump(msg, len);
1033
1034 asn1_open(&ctx, msg, len);
1035
1036 /*
1037 * Start of SNMP message.
1038 */
1039 if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag))
1040 return 0;
1041 if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
1042 return 0;
1043
1044 /*
1045 * Version 1 or 2 handled.
1046 */
1047 if (!asn1_header_decode(&ctx, &end, &cls, &con, &tag))
1048 return 0;
1049 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
1050 return 0;
1051 if (!asn1_uint_decode (&ctx, end, &vers))
1052 return 0;
1053 if (debug > 1)
1054 printk(KERN_DEBUG "bsalg: snmp version: %u\n", vers + 1);
1055 if (vers > 1)
1056 return 1;
1057
1058 /*
1059 * Community.
1060 */
1061 if (!asn1_header_decode (&ctx, &end, &cls, &con, &tag))
1062 return 0;
1063 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OTS)
1064 return 0;
1065 if (!asn1_octets_decode(&ctx, end, &comm.data, &comm.len))
1066 return 0;
1067 if (debug > 1) {
1068 unsigned int i;
1069
1070 printk(KERN_DEBUG "bsalg: community: ");
1071 for (i = 0; i < comm.len; i++)
1072 printk("%c", comm.data[i]);
1073 printk("\n");
1074 }
1075 kfree(comm.data);
1076
1077 /*
1078 * PDU type
1079 */
1080 if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &pdutype))
1081 return 0;
1082 if (cls != ASN1_CTX || con != ASN1_CON)
1083 return 0;
1084 if (debug > 1) {
1085 unsigned char *pdus[] = {
1086 [SNMP_PDU_GET] = "get",
1087 [SNMP_PDU_NEXT] = "get-next",
1088 [SNMP_PDU_RESPONSE] = "response",
1089 [SNMP_PDU_SET] = "set",
1090 [SNMP_PDU_TRAP1] = "trapv1",
1091 [SNMP_PDU_BULK] = "bulk",
1092 [SNMP_PDU_INFORM] = "inform",
1093 [SNMP_PDU_TRAP2] = "trapv2"
1094 };
1095
1096 if (pdutype > SNMP_PDU_TRAP2)
1097 printk(KERN_DEBUG "bsalg: bad pdu type %u\n", pdutype);
1098 else
1099 printk(KERN_DEBUG "bsalg: pdu: %s\n", pdus[pdutype]);
1100 }
1101 if (pdutype != SNMP_PDU_RESPONSE &&
1102 pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2)
1103 return 1;
1104
1105 /*
1106 * Request header or v1 trap
1107 */
1108 if (pdutype == SNMP_PDU_TRAP1) {
1109 struct snmp_v1_trap trap;
1110 unsigned char ret = snmp_trap_decode(&ctx, &trap, map, check);
1111
1112 if (ret) {
1113 kfree(trap.id);
1114 kfree((unsigned long *)trap.ip_address);
1115 } else
1116 return ret;
1117
1118 } else {
1119 struct snmp_request req;
1120
1121 if (!snmp_request_decode(&ctx, &req))
1122 return 0;
1123
1124 if (debug > 1)
1125 printk(KERN_DEBUG "bsalg: request: id=0x%lx error_status=%u "
1126 "error_index=%u\n", req.id, req.error_status,
1127 req.error_index);
1128 }
1129
1130 /*
1131 * Loop through objects, look for IP addresses to mangle.
1132 */
1133 if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag))
1134 return 0;
1135
1136 if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
1137 return 0;
1138
1139 obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
1140 if (obj == NULL) {
1141 if (net_ratelimit())
1142 printk(KERN_WARNING "OOM in bsalg(%d)\n", __LINE__);
1143 return 0;
1144 }
1145
1146 while (!asn1_eoc_decode(&ctx, eoc)) {
1147 unsigned int i;
1148
1149 if (!snmp_object_decode(&ctx, obj)) {
1150 if (*obj) {
1151 kfree((*obj)->id);
1152 kfree(*obj);
1153 }
1154 kfree(obj);
1155 return 0;
1156 }
1157
1158 if (debug > 1) {
1159 printk(KERN_DEBUG "bsalg: object: ");
1160 for (i = 0; i < (*obj)->id_len; i++) {
1161 if (i > 0)
1162 printk(".");
1163 printk("%lu", (*obj)->id[i]);
1164 }
1165 printk(": type=%u\n", (*obj)->type);
1166
1167 }
1168
1169 if ((*obj)->type == SNMP_IPADDR)
1170 mangle_address(ctx.begin, ctx.pointer - 4 , map, check);
1171
1172 kfree((*obj)->id);
1173 kfree(*obj);
1174 }
1175 kfree(obj);
1176
1177 if (!asn1_eoc_decode(&ctx, eoc))
1178 return 0;
1179
1180 return 1;
1181}
1182
1183/*****************************************************************************
1184 *
1185 * NAT routines.
1186 *
1187 *****************************************************************************/
1188
1189/*
1190 * SNMP translation routine.
1191 */
1192static int snmp_translate(struct ip_conntrack *ct,
1193 enum ip_conntrack_info ctinfo,
1194 struct sk_buff **pskb)
1195{
1196 struct iphdr *iph = (*pskb)->nh.iph;
1197 struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl);
1198 u_int16_t udplen = ntohs(udph->len);
1199 u_int16_t paylen = udplen - sizeof(struct udphdr);
1200 int dir = CTINFO2DIR(ctinfo);
1201 struct oct1_map map;
1202
1203 /*
1204 * Determine mapping for application layer addresses based
1205 * on NAT manipulations for the packet.
1206 */
1207 if (dir == IP_CT_DIR_ORIGINAL) {
1208 /* SNAT traps */
1209 map.from = NOCT1(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip);
1210 map.to = NOCT1(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip);
1211 } else {
1212 /* DNAT replies */
1213 map.from = NOCT1(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip);
1214 map.to = NOCT1(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip);
1215 }
1216
1217 if (map.from == map.to)
1218 return NF_ACCEPT;
1219
1220 if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr),
1221 paylen, &map, &udph->check)) {
1222 if (net_ratelimit())
1223 printk(KERN_WARNING "bsalg: parser failed\n");
1224 return NF_DROP;
1225 }
1226 return NF_ACCEPT;
1227}
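
The direction test above is the heart of the ALG: for packets in the original direction (an agent behind SNAT emitting traps) the map runs from the original source octet to the reply destination octet, and for replies (a manager behind DNAT) from the reply source to the original destination. A toy model of that selection, with an invented one-octet tuple layout standing in for the kernel's struct:

#include <stdint.h>
#include <stdio.h>

enum dir { DIR_ORIGINAL, DIR_REPLY };

struct mini_tuple { uint8_t src_oct1, dst_oct1; };
struct mini_map   { uint8_t from, to; };

static struct mini_map pick_map(enum dir d, const struct mini_tuple t[2])
{
    struct mini_map m;
    if (d == DIR_ORIGINAL) {            /* SNAT: trap leaves the box */
        m.from = t[DIR_ORIGINAL].src_oct1;
        m.to   = t[DIR_REPLY].dst_oct1;
    } else {                            /* DNAT: reply comes back in */
        m.from = t[DIR_REPLY].src_oct1;
        m.to   = t[DIR_ORIGINAL].dst_oct1;
    }
    return m;
}

int main(void)
{
    /* 10.x host SNATted to 192.x while talking to a 172.x manager */
    struct mini_tuple t[2] = { { 10, 172 }, { 172, 192 } };
    struct mini_map m = pick_map(DIR_ORIGINAL, t);
    printf("map %u -> %u\n", m.from, m.to);   /* map 10 -> 192 */
    return 0;
}
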
1228
1229/* We don't actually set up expectations, just adjust internal IP
1230 * addresses if this is being NATted */
1231static int help(struct sk_buff **pskb,
1232 struct ip_conntrack *ct,
1233 enum ip_conntrack_info ctinfo)
1234{
1235 int dir = CTINFO2DIR(ctinfo);
1236 unsigned int ret;
1237 struct iphdr *iph = (*pskb)->nh.iph;
1238 struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl);
1239
1240 /* SNMP replies and originating SNMP traps get mangled */
1241 if (udph->source == htons(SNMP_PORT) && dir != IP_CT_DIR_REPLY)
1242 return NF_ACCEPT;
1243 if (udph->dest == htons(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL)
1244 return NF_ACCEPT;
1245
1246 /* No NAT? */
1247 if (!(ct->status & IPS_NAT_MASK))
1248 return NF_ACCEPT;
1249
1250 /*
1251 * Make sure the packet length is ok. So far, we were only guaranteed
1252 * to have a valid length IP header plus 8 bytes, which means we have
1253 * enough room for a UDP header. Just verify the UDP length field so we
1254 * can mess around with the payload.
1255 */
1256 if (ntohs(udph->len) != (*pskb)->len - (iph->ihl << 2)) {
1257 if (net_ratelimit())
1258 printk(KERN_WARNING "SNMP: dropping malformed packet "
1259 "src=%u.%u.%u.%u dst=%u.%u.%u.%u\n",
1260 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
1261 return NF_DROP;
1262 }
1263
1264 if (!skb_make_writable(pskb, (*pskb)->len))
1265 return NF_DROP;
1266
1267 spin_lock_bh(&snmp_lock);
1268 ret = snmp_translate(ct, ctinfo, pskb);
1269 spin_unlock_bh(&snmp_lock);
1270 return ret;
1271}
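
The length test in help() is the only thing standing between the parser and a payload shorter than udph->len claims, since conntrack has only guaranteed the IP header plus 8 bytes. A minimal user-space sketch of the same sanity check:

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

/* trust the UDP length field only after confirming it matches what
 * the IP layer actually delivered */
static int udp_len_ok(uint16_t udp_len_net, size_t pkt_len,
                      size_t ip_hdr_len)
{
    return ntohs(udp_len_net) == pkt_len - ip_hdr_len;
}

int main(void)
{
    /* 20-byte IP header + 8-byte UDP header + 100 bytes payload */
    printf("%d\n", udp_len_ok(htons(108), 128, 20));
    return 0;
}
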
1272
1273static struct ip_conntrack_helper snmp_helper = {
1274 .max_expected = 0,
1275 .timeout = 180,
1276 .me = THIS_MODULE,
1277 .help = help,
1278 .name = "snmp",
1279
1280 .tuple = {.src = {.u = {.udp = {.port = __constant_htons(SNMP_PORT)}}},
1281 .dst = {.protonum = IPPROTO_UDP},
1282 },
1283 .mask = {.src = {.u = {0xFFFF}},
1284 .dst = {.protonum = 0xFF},
1285 },
1286};
1287
1288static struct ip_conntrack_helper snmp_trap_helper = {
1289 .max_expected = 0,
1290 .timeout = 180,
1291 .me = THIS_MODULE,
1292 .help = help,
1293 .name = "snmp_trap",
1294
1295 .tuple = {.src = {.u = {.udp = {.port = __constant_htons(SNMP_TRAP_PORT)}}},
1296 .dst = {.protonum = IPPROTO_UDP},
1297 },
1298 .mask = {.src = {.u = {0xFFFF}},
1299 .dst = {.protonum = 0xFF},
1300 },
1301};
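
Conntrack steers flows into these helpers by masked tuple comparison: with a source-port mask of 0xFFFF and a protocol mask of 0xFF, the two structures above select exactly UDP flows whose matched tuple carries source port SNMP_PORT (161) or SNMP_TRAP_PORT (162). A simplified sketch of that matching, with an invented two-field tuple rather than the kernel's:

#include <stdint.h>
#include <stdio.h>

struct mini_tuple { uint16_t src_port; uint8_t protonum; };

static int helper_matches(const struct mini_tuple *flow,
                          const struct mini_tuple *sel,
                          const struct mini_tuple *mask)
{
    /* a flow matches when it equals the helper's tuple under the mask */
    return (flow->src_port & mask->src_port) ==
               (sel->src_port & mask->src_port) &&
           (flow->protonum & mask->protonum) ==
               (sel->protonum & mask->protonum);
}

int main(void)
{
    struct mini_tuple sel  = { 161, 17 };        /* SNMP over UDP */
    struct mini_tuple mask = { 0xFFFF, 0xFF };
    struct mini_tuple flow = { 161, 17 };
    printf("%s\n", helper_matches(&flow, &sel, &mask) ? "match"
                                                      : "no match");
    return 0;
}
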
1302
1303/*****************************************************************************
1304 *
1305 * Module stuff.
1306 *
1307 *****************************************************************************/
1308
1309static int __init ip_nat_snmp_basic_init(void)
1310{
1311 int ret = 0;
1312
1313 ret = ip_conntrack_helper_register(&snmp_helper);
1314 if (ret < 0)
1315 return ret;
1316 ret = ip_conntrack_helper_register(&snmp_trap_helper);
1317 if (ret < 0) {
1318 ip_conntrack_helper_unregister(&snmp_helper);
1319 return ret;
1320 }
1321 return ret;
1322}
1323
1324static void __exit ip_nat_snmp_basic_fini(void)
1325{
1326 ip_conntrack_helper_unregister(&snmp_helper);
1327 ip_conntrack_helper_unregister(&snmp_trap_helper);
1328}
1329
1330module_init(ip_nat_snmp_basic_init);
1331module_exit(ip_nat_snmp_basic_fini);
1332
1333module_param(debug, int, 0600);
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
deleted file mode 100644
index 6bcfdf6dfcc9..000000000000
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ /dev/null
@@ -1,388 +0,0 @@
1/* This file contains all the functions required for the standalone
2 ip_nat module.
3
4 These are not required by the compatibility layer.
5*/
6
7/* (C) 1999-2001 Paul `Rusty' Russell
8 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
13 */
14
15/*
16 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
17 * - new API and handling of conntrack/nat helpers
18 * - now capable of multiple expectations for one master
19 * */
20
21#include <linux/types.h>
22#include <linux/icmp.h>
23#include <linux/ip.h>
24#include <linux/netfilter.h>
25#include <linux/netfilter_ipv4.h>
26#include <linux/module.h>
27#include <linux/skbuff.h>
28#include <linux/proc_fs.h>
29#include <net/ip.h>
30#include <net/checksum.h>
31#include <linux/spinlock.h>
32
33#include <linux/netfilter_ipv4/ip_nat.h>
34#include <linux/netfilter_ipv4/ip_nat_rule.h>
35#include <linux/netfilter_ipv4/ip_nat_protocol.h>
36#include <linux/netfilter_ipv4/ip_nat_core.h>
37#include <linux/netfilter_ipv4/ip_nat_helper.h>
38#include <linux/netfilter_ipv4/ip_tables.h>
39#include <linux/netfilter_ipv4/ip_conntrack_core.h>
40
41#if 0
42#define DEBUGP printk
43#else
44#define DEBUGP(format, args...)
45#endif
46
47#ifdef CONFIG_XFRM
48static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
49{
50 struct ip_conntrack *ct;
51 struct ip_conntrack_tuple *t;
52 enum ip_conntrack_info ctinfo;
53 enum ip_conntrack_dir dir;
54 unsigned long statusbit;
55
56 ct = ip_conntrack_get(skb, &ctinfo);
57 if (ct == NULL)
58 return;
59 dir = CTINFO2DIR(ctinfo);
60 t = &ct->tuplehash[dir].tuple;
61
62 if (dir == IP_CT_DIR_ORIGINAL)
63 statusbit = IPS_DST_NAT;
64 else
65 statusbit = IPS_SRC_NAT;
66
67 if (ct->status & statusbit) {
68 fl->fl4_dst = t->dst.ip;
69 if (t->dst.protonum == IPPROTO_TCP ||
70 t->dst.protonum == IPPROTO_UDP)
71 fl->fl_ip_dport = t->dst.u.tcp.port;
72 }
73
74 statusbit ^= IPS_NAT_MASK;
75
76 if (ct->status & statusbit) {
77 fl->fl4_src = t->src.ip;
78 if (t->dst.protonum == IPPROTO_TCP ||
79 t->dst.protonum == IPPROTO_UDP)
80 fl->fl_ip_sport = t->src.u.tcp.port;
81 }
82}
83#endif
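
nat_decode_session() relies on IPS_SRC_NAT and IPS_DST_NAT being the two bits of IPS_NAT_MASK, so XOR-ing statusbit with the mask flips the destination-rewrite test into the source-rewrite test between the two passes. A tiny demonstration; the bit values below are illustrative, not the kernel's:

#include <stdio.h>

#define IPS_SRC_NAT  0x1                       /* illustrative values */
#define IPS_DST_NAT  0x2
#define IPS_NAT_MASK (IPS_SRC_NAT | IPS_DST_NAT)

int main(void)
{
    unsigned long statusbit = IPS_DST_NAT;
    printf("first check:  0x%lx\n", statusbit);
    statusbit ^= IPS_NAT_MASK;                 /* now IPS_SRC_NAT */
    printf("second check: 0x%lx\n", statusbit);
    return 0;
}
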
84
85static unsigned int
86ip_nat_fn(unsigned int hooknum,
87 struct sk_buff **pskb,
88 const struct net_device *in,
89 const struct net_device *out,
90 int (*okfn)(struct sk_buff *))
91{
92 struct ip_conntrack *ct;
93 enum ip_conntrack_info ctinfo;
94 struct ip_nat_info *info;
95 /* maniptype == SRC for postrouting. */
96 enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);
97
98 /* We never see fragments: conntrack defrags on pre-routing
99 and local-out, and ip_nat_out protects post-routing. */
100 IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
101 & htons(IP_MF|IP_OFFSET)));
102
103 ct = ip_conntrack_get(*pskb, &ctinfo);
104 /* Can't track? It's not due to stress, or conntrack would
105 have dropped it. Hence it's the user's responsibility to
106 packet filter it out, or implement conntrack/NAT for that
107 protocol. 8) --RR */
108 if (!ct) {
109 /* Exception: ICMP redirect to new connection (not in
110 hash table yet). We must not let this through, in
111 case we're doing NAT to the same network. */
112 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) {
113 struct icmphdr _hdr, *hp;
114
115 hp = skb_header_pointer(*pskb,
116 (*pskb)->nh.iph->ihl*4,
117 sizeof(_hdr), &_hdr);
118 if (hp != NULL &&
119 hp->type == ICMP_REDIRECT)
120 return NF_DROP;
121 }
122 return NF_ACCEPT;
123 }
124
125 /* Don't try to NAT if this packet is not conntracked */
126 if (ct == &ip_conntrack_untracked)
127 return NF_ACCEPT;
128
129 switch (ctinfo) {
130 case IP_CT_RELATED:
131 case IP_CT_RELATED+IP_CT_IS_REPLY:
132 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) {
133 if (!ip_nat_icmp_reply_translation(ct, ctinfo,
134 hooknum, pskb))
135 return NF_DROP;
136 else
137 return NF_ACCEPT;
138 }
139 /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
140 case IP_CT_NEW:
141 info = &ct->nat.info;
142
143 /* Seen it before? This can happen for loopback, retrans,
144 or local packets. */
145 if (!ip_nat_initialized(ct, maniptype)) {
146 unsigned int ret;
147
148 if (unlikely(is_confirmed(ct)))
149 /* NAT module was loaded late */
150 ret = alloc_null_binding_confirmed(ct, info,
151 hooknum);
152 else if (hooknum == NF_IP_LOCAL_IN)
153 /* LOCAL_IN hook doesn't have a chain! */
154 ret = alloc_null_binding(ct, info, hooknum);
155 else
156 ret = ip_nat_rule_find(pskb, hooknum,
157 in, out, ct,
158 info);
159
160 if (ret != NF_ACCEPT) {
161 return ret;
162 }
163 } else
164 DEBUGP("Already setup manip %s for ct %p\n",
165 maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST",
166 ct);
167 break;
168
169 default:
170 /* ESTABLISHED */
171 IP_NF_ASSERT(ctinfo == IP_CT_ESTABLISHED
172 || ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY));
173 info = &ct->nat.info;
174 }
175
176 IP_NF_ASSERT(info);
177 return ip_nat_packet(ct, ctinfo, hooknum, pskb);
178}
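
The registration table further down gives HOOK2MANIP its meaning: source manipulation is applied where packets leave the stack's view (POST_ROUTING, and LOCAL_IN for locally delivered traffic), destination manipulation where they enter it (PRE_ROUTING, LOCAL_OUT). A sketch of that mapping with stand-in enums, not the kernel's macro:

#include <stdio.h>

enum nat_manip { MANIP_SRC, MANIP_DST };
enum nf_hook { HOOK_PRE_ROUTING, HOOK_LOCAL_IN, HOOK_FORWARD,
               HOOK_LOCAL_OUT, HOOK_POST_ROUTING };

/* source NAT on the way out or up into local delivery,
 * destination NAT on the way in or down from local output */
static enum nat_manip hook2manip(enum nf_hook h)
{
    return (h == HOOK_POST_ROUTING || h == HOOK_LOCAL_IN)
               ? MANIP_SRC : MANIP_DST;
}

int main(void)
{
    printf("POST_ROUTING: %s\n",
           hook2manip(HOOK_POST_ROUTING) == MANIP_SRC ? "SRC" : "DST");
    printf("PRE_ROUTING:  %s\n",
           hook2manip(HOOK_PRE_ROUTING) == MANIP_SRC ? "SRC" : "DST");
    return 0;
}
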
179
180static unsigned int
181ip_nat_in(unsigned int hooknum,
182 struct sk_buff **pskb,
183 const struct net_device *in,
184 const struct net_device *out,
185 int (*okfn)(struct sk_buff *))
186{
187 unsigned int ret;
188 __be32 daddr = (*pskb)->nh.iph->daddr;
189
190 ret = ip_nat_fn(hooknum, pskb, in, out, okfn);
191 if (ret != NF_DROP && ret != NF_STOLEN
192 && daddr != (*pskb)->nh.iph->daddr) {
193 dst_release((*pskb)->dst);
194 (*pskb)->dst = NULL;
195 }
196 return ret;
197}
198
199static unsigned int
200ip_nat_out(unsigned int hooknum,
201 struct sk_buff **pskb,
202 const struct net_device *in,
203 const struct net_device *out,
204 int (*okfn)(struct sk_buff *))
205{
206#ifdef CONFIG_XFRM
207 struct ip_conntrack *ct;
208 enum ip_conntrack_info ctinfo;
209#endif
210 unsigned int ret;
211
212 /* root is playing with raw sockets. */
213 if ((*pskb)->len < sizeof(struct iphdr)
214 || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr))
215 return NF_ACCEPT;
216
217 ret = ip_nat_fn(hooknum, pskb, in, out, okfn);
218#ifdef CONFIG_XFRM
219 if (ret != NF_DROP && ret != NF_STOLEN
220 && (ct = ip_conntrack_get(*pskb, &ctinfo)) != NULL) {
221 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
222
223 if (ct->tuplehash[dir].tuple.src.ip !=
224 ct->tuplehash[!dir].tuple.dst.ip
225 || ct->tuplehash[dir].tuple.src.u.all !=
226 ct->tuplehash[!dir].tuple.dst.u.all
227 )
228 return ip_xfrm_me_harder(pskb) == 0 ? ret : NF_DROP;
229 }
230#endif
231 return ret;
232}
233
234static unsigned int
235ip_nat_local_fn(unsigned int hooknum,
236 struct sk_buff **pskb,
237 const struct net_device *in,
238 const struct net_device *out,
239 int (*okfn)(struct sk_buff *))
240{
241 struct ip_conntrack *ct;
242 enum ip_conntrack_info ctinfo;
243 unsigned int ret;
244
245 /* root is playing with raw sockets. */
246 if ((*pskb)->len < sizeof(struct iphdr)
247 || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr))
248 return NF_ACCEPT;
249
250 ret = ip_nat_fn(hooknum, pskb, in, out, okfn);
251 if (ret != NF_DROP && ret != NF_STOLEN
252 && (ct = ip_conntrack_get(*pskb, &ctinfo)) != NULL) {
253 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
254
255 if (ct->tuplehash[dir].tuple.dst.ip !=
256 ct->tuplehash[!dir].tuple.src.ip) {
257 if (ip_route_me_harder(pskb, RTN_UNSPEC))
258 ret = NF_DROP;
259 }
260#ifdef CONFIG_XFRM
261 else if (ct->tuplehash[dir].tuple.dst.u.all !=
262 ct->tuplehash[!dir].tuple.src.u.all)
263 if (ip_xfrm_me_harder(pskb))
264 ret = NF_DROP;
265#endif
266
267 }
268 return ret;
269}
270
271static unsigned int
272ip_nat_adjust(unsigned int hooknum,
273 struct sk_buff **pskb,
274 const struct net_device *in,
275 const struct net_device *out,
276 int (*okfn)(struct sk_buff *))
277{
278 struct ip_conntrack *ct;
279 enum ip_conntrack_info ctinfo;
280
281 ct = ip_conntrack_get(*pskb, &ctinfo);
282 if (ct && test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)) {
283 DEBUGP("ip_nat_standalone: adjusting sequence number\n");
284 if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
285 return NF_DROP;
286 }
287 return NF_ACCEPT;
288}
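
ip_nat_seq_adjust() exists because helpers such as FTP can grow or shrink the payload, which desynchronises TCP sequence space for everything after the edit. A deliberately simplified model of the idea, one adjustment record instead of the kernel's per-direction before/after corrections:

#include <stdint.h>
#include <stdio.h>

/* after the payload grew by `delta` bytes at stream offset `pos`,
 * later sequence numbers shift forward on the mangled side and the
 * peer's acknowledgements shift back */
struct seq_adj { uint32_t pos; int32_t delta; };

static uint32_t adjust_seq(uint32_t seq, const struct seq_adj *a)
{
    return seq > a->pos ? seq + a->delta : seq;
}

static uint32_t adjust_ack(uint32_t ack, const struct seq_adj *a)
{
    return ack > a->pos ? ack - a->delta : ack;
}

int main(void)
{
    struct seq_adj a = { .pos = 1000, .delta = 4 };
    printf("seq 1200 -> %u\n", adjust_seq(1200, &a));   /* 1204 */
    printf("ack 1200 -> %u\n", adjust_ack(1200, &a));   /* 1196 */
    return 0;
}
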
289
290/* We must be after connection tracking and before packet filtering. */
291
292static struct nf_hook_ops ip_nat_ops[] = {
293 /* Before packet filtering, change destination */
294 {
295 .hook = ip_nat_in,
296 .owner = THIS_MODULE,
297 .pf = PF_INET,
298 .hooknum = NF_IP_PRE_ROUTING,
299 .priority = NF_IP_PRI_NAT_DST,
300 },
301 /* After packet filtering, change source */
302 {
303 .hook = ip_nat_out,
304 .owner = THIS_MODULE,
305 .pf = PF_INET,
306 .hooknum = NF_IP_POST_ROUTING,
307 .priority = NF_IP_PRI_NAT_SRC,
308 },
309 /* After conntrack, adjust sequence number */
310 {
311 .hook = ip_nat_adjust,
312 .owner = THIS_MODULE,
313 .pf = PF_INET,
314 .hooknum = NF_IP_POST_ROUTING,
315 .priority = NF_IP_PRI_NAT_SEQ_ADJUST,
316 },
317 /* Before packet filtering, change destination */
318 {
319 .hook = ip_nat_local_fn,
320 .owner = THIS_MODULE,
321 .pf = PF_INET,
322 .hooknum = NF_IP_LOCAL_OUT,
323 .priority = NF_IP_PRI_NAT_DST,
324 },
325 /* After packet filtering, change source */
326 {
327 .hook = ip_nat_fn,
328 .owner = THIS_MODULE,
329 .pf = PF_INET,
330 .hooknum = NF_IP_LOCAL_IN,
331 .priority = NF_IP_PRI_NAT_SRC,
332 },
333 /* After conntrack, adjust sequence number */
334 {
335 .hook = ip_nat_adjust,
336 .owner = THIS_MODULE,
337 .pf = PF_INET,
338 .hooknum = NF_IP_LOCAL_IN,
339 .priority = NF_IP_PRI_NAT_SEQ_ADJUST,
340 },
341};
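
The comment above ("after connection tracking and before packet filtering") is enforced purely by the .priority numbers: netfilter runs every hook registered on one hook point in ascending priority order. A sketch with illustrative values; the real constants are the NF_IP_PRI_* definitions in linux/netfilter_ipv4.h:

#include <stdio.h>
#include <stdlib.h>

struct mini_hook { const char *name; int priority; };

static int by_prio(const void *a, const void *b)
{
    return ((const struct mini_hook *)a)->priority -
           ((const struct mini_hook *)b)->priority;
}

int main(void)
{
    struct mini_hook pre[] = {
        { "filter",     0 },      /* NF_IP_PRI_FILTER    (illustrative) */
        { "conntrack", -200 },    /* NF_IP_PRI_CONNTRACK (illustrative) */
        { "nat-dst",   -100 },    /* NF_IP_PRI_NAT_DST   (illustrative) */
    };
    qsort(pre, 3, sizeof(pre[0]), by_prio);
    for (int i = 0; i < 3; i++)   /* conntrack, then NAT, then filter */
        printf("%5d: %s\n", pre[i].priority, pre[i].name);
    return 0;
}
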
342
343static int __init ip_nat_standalone_init(void)
344{
345 int ret = 0;
346
347 need_conntrack();
348
349#ifdef CONFIG_XFRM
350 BUG_ON(ip_nat_decode_session != NULL);
351 ip_nat_decode_session = nat_decode_session;
352#endif
353 ret = ip_nat_rule_init();
354 if (ret < 0) {
355 printk("ip_nat_init: can't setup rules.\n");
356 goto cleanup_decode_session;
357 }
358 ret = nf_register_hooks(ip_nat_ops, ARRAY_SIZE(ip_nat_ops));
359 if (ret < 0) {
360 printk("ip_nat_init: can't register hooks.\n");
361 goto cleanup_rule_init;
362 }
363 return ret;
364
365 cleanup_rule_init:
366 ip_nat_rule_cleanup();
367 cleanup_decode_session:
368#ifdef CONFIG_XFRM
369 ip_nat_decode_session = NULL;
370 synchronize_net();
371#endif
372 return ret;
373}
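
ip_nat_standalone_init() uses the usual kernel goto-unwind idiom: each failure jumps to the cleanup label of the last step that did succeed, so teardown happens in reverse order of setup. The same shape in a self-contained user-space sketch; the stand-in functions are invented:

#include <stdio.h>

static int  setup_rules(void)    { return 0;  }  /* stand-in: succeeds */
static void teardown_rules(void) { }
static int  register_hooks(void) { return -1; }  /* stand-in: fails */

static int init(void)
{
    int ret = setup_rules();
    if (ret < 0)
        goto out;                 /* nothing acquired yet */
    ret = register_hooks();
    if (ret < 0)
        goto unwind_rules;        /* undo only what succeeded */
    return 0;

unwind_rules:
    teardown_rules();
out:
    return ret;
}

int main(void)
{
    printf("init: %d\n", init());
    return 0;
}
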
374
375static void __exit ip_nat_standalone_fini(void)
376{
377 nf_unregister_hooks(ip_nat_ops, ARRAY_SIZE(ip_nat_ops));
378 ip_nat_rule_cleanup();
379#ifdef CONFIG_XFRM
380 ip_nat_decode_session = NULL;
381 synchronize_net();
382#endif
383}
384
385module_init(ip_nat_standalone_init);
386module_exit(ip_nat_standalone_fini);
387
388MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c
deleted file mode 100644
index 604793536fc1..000000000000
--- a/net/ipv4/netfilter/ip_nat_tftp.c
+++ /dev/null
@@ -1,70 +0,0 @@
1/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
2 *
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License version 2 as
5 * published by the Free Software Foundation.
6 *
7 * Version: 0.0.7
8 *
9 * Thu 21 Mar 2002 Harald Welte <laforge@gnumonks.org>
10 * - Port to newnat API
11 *
12 * This module currently supports DNAT:
13 * iptables -t nat -A PREROUTING -d x.x.x.x -j DNAT --to-dest x.x.x.y
14 *
15 * and SNAT:
16 * iptables -t nat -A POSTROUTING { -j MASQUERADE , -j SNAT --to-source x.x.x.x }
17 *
18 * It has not been tested with
19 * -j SNAT --to-source x.x.x.x-x.x.x.y, since I only have one external IP.
20 * If you do test this, please let me know if it works or not.
21 *
22 */
23
24#include <linux/module.h>
25#include <linux/netfilter_ipv4.h>
26#include <linux/ip.h>
27#include <linux/udp.h>
28
29#include <linux/netfilter.h>
30#include <linux/netfilter_ipv4/ip_tables.h>
31#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
32#include <linux/netfilter_ipv4/ip_conntrack_tftp.h>
33#include <linux/netfilter_ipv4/ip_nat_helper.h>
34#include <linux/netfilter_ipv4/ip_nat_rule.h>
35#include <linux/moduleparam.h>
36
37MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
38MODULE_DESCRIPTION("tftp NAT helper");
39MODULE_LICENSE("GPL");
40
41static unsigned int help(struct sk_buff **pskb,
42 enum ip_conntrack_info ctinfo,
43 struct ip_conntrack_expect *exp)
44{
45 struct ip_conntrack *ct = exp->master;
46
47 exp->saved_proto.udp.port
48 = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
49 exp->dir = IP_CT_DIR_REPLY;
50 exp->expectfn = ip_nat_follow_master;
51 if (ip_conntrack_expect_related(exp) != 0)
52 return NF_DROP;
53 return NF_ACCEPT;
54}
55
56static void __exit ip_nat_tftp_fini(void)
57{
58 rcu_assign_pointer(ip_nat_tftp_hook, NULL);
59 synchronize_rcu();
60}
61
62static int __init ip_nat_tftp_init(void)
63{
64 BUG_ON(rcu_dereference(ip_nat_tftp_hook));
65 rcu_assign_pointer(ip_nat_tftp_hook, help);
66 return 0;
67}
68
69module_init(ip_nat_tftp_init);
70module_exit(ip_nat_tftp_fini);
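
The tftp helper publishes itself through a single global function pointer: rcu_assign_pointer() makes the hook visible to readers, and synchronize_rcu() on exit waits until no reader can still be executing it before the module goes away. A rough user-space analogue using C11 acquire/release atomics; this models only the publication ordering, not RCU's grace-period wait:

#include <stdatomic.h>
#include <stdio.h>

typedef int (*tftp_hook_t)(int pkt);

static _Atomic(tftp_hook_t) tftp_hook;

static int real_hook(int pkt) { return pkt + 1; }

static void hook_register(tftp_hook_t fn)
{   /* rcu_assign_pointer() analogue: release store */
    atomic_store_explicit(&tftp_hook, fn, memory_order_release);
}

static void hook_unregister(void)
{
    atomic_store_explicit(&tftp_hook, NULL, memory_order_release);
    /* the kernel additionally does synchronize_rcu() here so no
     * reader is still inside the hook when the module is freed */
}

static int dispatch(int pkt)
{   /* rcu_dereference() analogue: acquire load, NULL check */
    tftp_hook_t fn = atomic_load_explicit(&tftp_hook,
                                          memory_order_acquire);
    return fn ? fn(pkt) : -1;
}

int main(void)
{
    hook_register(real_hook);
    printf("%d\n", dispatch(41));    /* 42 */
    hook_unregister();
    return 0;
}
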
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index a14798a850d7..702d94db19b9 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -8,18 +8,6 @@
8 * This program is free software; you can redistribute it and/or modify 8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as 9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation. 10 * published by the Free Software Foundation.
11 *
12 * 2000-03-27: Simplified code (thanks to Andi Kleen for clues).
13 * 2000-05-20: Fixed notifier problems (following Miguel Freitas' report).
14 * 2000-06-19: Fixed so nfmark is copied to metadata (reported by Sebastian
15 * Zander).
16 * 2000-08-01: Added Nick Williams' MAC support.
17 * 2002-06-25: Code cleanup.
18 * 2005-01-10: Added /proc counter for dropped packets; fixed so
19 * packets aren't delivered to user space if they're going
20 * to be dropped.
21 * 2005-05-26: local_bh_{disable,enable} around nf_reinject (Harald Welte)
22 *
23 */ 11 */
24#include <linux/module.h> 12#include <linux/module.h>
25#include <linux/skbuff.h> 13#include <linux/skbuff.h>
@@ -191,12 +179,13 @@ ipq_flush(int verdict)
191static struct sk_buff * 179static struct sk_buff *
192ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) 180ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
193{ 181{
194 unsigned char *old_tail; 182 sk_buff_data_t old_tail;
195 size_t size = 0; 183 size_t size = 0;
196 size_t data_len = 0; 184 size_t data_len = 0;
197 struct sk_buff *skb; 185 struct sk_buff *skb;
198 struct ipq_packet_msg *pmsg; 186 struct ipq_packet_msg *pmsg;
199 struct nlmsghdr *nlh; 187 struct nlmsghdr *nlh;
188 struct timeval tv;
200 189
201 read_lock_bh(&queue_lock); 190 read_lock_bh(&queue_lock);
202 191
@@ -234,15 +223,16 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
234 if (!skb) 223 if (!skb)
235 goto nlmsg_failure; 224 goto nlmsg_failure;
236 225
237 old_tail= skb->tail; 226 old_tail = skb->tail;
238 nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh)); 227 nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
239 pmsg = NLMSG_DATA(nlh); 228 pmsg = NLMSG_DATA(nlh);
240 memset(pmsg, 0, sizeof(*pmsg)); 229 memset(pmsg, 0, sizeof(*pmsg));
241 230
242 pmsg->packet_id = (unsigned long )entry; 231 pmsg->packet_id = (unsigned long )entry;
243 pmsg->data_len = data_len; 232 pmsg->data_len = data_len;
244 pmsg->timestamp_sec = entry->skb->tstamp.off_sec; 233 tv = ktime_to_timeval(entry->skb->tstamp);
245 pmsg->timestamp_usec = entry->skb->tstamp.off_usec; 234 pmsg->timestamp_sec = tv.tv_sec;
235 pmsg->timestamp_usec = tv.tv_usec;
246 pmsg->mark = entry->skb->mark; 236 pmsg->mark = entry->skb->mark;
247 pmsg->hook = entry->info->hook; 237 pmsg->hook = entry->info->hook;
248 pmsg->hw_protocol = entry->skb->protocol; 238 pmsg->hw_protocol = entry->skb->protocol;
@@ -378,7 +368,7 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
378 } 368 }
379 if (!skb_make_writable(&e->skb, v->data_len)) 369 if (!skb_make_writable(&e->skb, v->data_len))
380 return -ENOMEM; 370 return -ENOMEM;
381 memcpy(e->skb->data, v->payload, v->data_len); 371 skb_copy_to_linear_data(e->skb, v->payload, v->data_len);
382 e->skb->ip_summed = CHECKSUM_NONE; 372 e->skb->ip_summed = CHECKSUM_NONE;
383 373
384 return 0; 374 return 0;
@@ -495,7 +485,7 @@ ipq_rcv_skb(struct sk_buff *skb)
495 if (skblen < sizeof(*nlh)) 485 if (skblen < sizeof(*nlh))
496 return; 486 return;
497 487
498 nlh = (struct nlmsghdr *)skb->data; 488 nlh = nlmsg_hdr(skb);
499 nlmsglen = nlh->nlmsg_len; 489 nlmsglen = nlh->nlmsg_len;
500 if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen) 490 if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen)
501 return; 491 return;
@@ -678,7 +668,7 @@ static int __init ip_queue_init(void)
678 668
679 netlink_register_notifier(&ipq_nl_notifier); 669 netlink_register_notifier(&ipq_nl_notifier);
680 ipqnl = netlink_kernel_create(NETLINK_FIREWALL, 0, ipq_rcv_sk, 670 ipqnl = netlink_kernel_create(NETLINK_FIREWALL, 0, ipq_rcv_sk,
681 THIS_MODULE); 671 NULL, THIS_MODULE);
682 if (ipqnl == NULL) { 672 if (ipqnl == NULL) {
683 printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); 673 printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
684 goto cleanup_netlink_notifier; 674 goto cleanup_netlink_notifier;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 50cc4b92e284..e3f83bf160d9 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -7,12 +7,6 @@
7 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as 8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation. 9 * published by the Free Software Foundation.
10 *
11 * 19 Jan 2002 Harald Welte <laforge@gnumonks.org>
12 * - increase module usage count as soon as we have rules inside
13 * a table
14 * 08 Oct 2005 Harald Welte <lafore@netfilter.org>
15 * - Generalize into "x_tables" layer and "{ip,ip6,arp}_tables"
16 */ 10 */
17#include <linux/cache.h> 11#include <linux/cache.h>
18#include <linux/capability.h> 12#include <linux/capability.h>
@@ -198,7 +192,7 @@ int do_match(struct ipt_entry_match *m,
198{ 192{
199 /* Stop iteration if it doesn't match */ 193 /* Stop iteration if it doesn't match */
200 if (!m->u.kernel.match->match(skb, in, out, m->u.kernel.match, m->data, 194 if (!m->u.kernel.match->match(skb, in, out, m->u.kernel.match, m->data,
201 offset, skb->nh.iph->ihl*4, hotdrop)) 195 offset, ip_hdrlen(skb), hotdrop))
202 return 1; 196 return 1;
203 else 197 else
204 return 0; 198 return 0;
@@ -231,7 +225,7 @@ ipt_do_table(struct sk_buff **pskb,
231 struct xt_table_info *private; 225 struct xt_table_info *private;
232 226
233 /* Initialization */ 227 /* Initialization */
234 ip = (*pskb)->nh.iph; 228 ip = ip_hdr(*pskb);
235 datalen = (*pskb)->len - ip->ihl * 4; 229 datalen = (*pskb)->len - ip->ihl * 4;
236 indev = in ? in->name : nulldevname; 230 indev = in ? in->name : nulldevname;
237 outdev = out ? out->name : nulldevname; 231 outdev = out ? out->name : nulldevname;
@@ -320,7 +314,7 @@ ipt_do_table(struct sk_buff **pskb,
320 = 0x57acc001; 314 = 0x57acc001;
321#endif 315#endif
322 /* Target might have changed stuff. */ 316 /* Target might have changed stuff. */
323 ip = (*pskb)->nh.iph; 317 ip = ip_hdr(*pskb);
324 datalen = (*pskb)->len - ip->ihl * 4; 318 datalen = (*pskb)->len - ip->ihl * 4;
325 319
326 if (verdict == IPT_CONTINUE) 320 if (verdict == IPT_CONTINUE)
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 42b08029e867..40e273421398 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -21,15 +21,12 @@
21#include <linux/if_arp.h> 21#include <linux/if_arp.h>
22#include <linux/proc_fs.h> 22#include <linux/proc_fs.h>
23#include <linux/seq_file.h> 23#include <linux/seq_file.h>
24
25#include <net/checksum.h>
26
27#include <linux/netfilter_arp.h> 24#include <linux/netfilter_arp.h>
28
29#include <linux/netfilter/x_tables.h> 25#include <linux/netfilter/x_tables.h>
30#include <linux/netfilter_ipv4/ip_tables.h> 26#include <linux/netfilter_ipv4/ip_tables.h>
31#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> 27#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
32#include <net/netfilter/nf_conntrack_compat.h> 28#include <net/netfilter/nf_conntrack.h>
29#include <net/checksum.h>
33 30
34#define CLUSTERIP_VERSION "0.8" 31#define CLUSTERIP_VERSION "0.8"
35 32
@@ -240,7 +237,7 @@ clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
240static inline u_int32_t 237static inline u_int32_t
241clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config) 238clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config)
242{ 239{
243 struct iphdr *iph = skb->nh.iph; 240 struct iphdr *iph = ip_hdr(skb);
244 unsigned long hashval; 241 unsigned long hashval;
245 u_int16_t sport, dport; 242 u_int16_t sport, dport;
246 u_int16_t *ports; 243 u_int16_t *ports;
@@ -310,15 +307,16 @@ target(struct sk_buff **pskb,
310 const void *targinfo) 307 const void *targinfo)
311{ 308{
312 const struct ipt_clusterip_tgt_info *cipinfo = targinfo; 309 const struct ipt_clusterip_tgt_info *cipinfo = targinfo;
310 struct nf_conn *ct;
313 enum ip_conntrack_info ctinfo; 311 enum ip_conntrack_info ctinfo;
314 u_int32_t *mark, hash; 312 u_int32_t hash;
315 313
316 /* don't need to clusterip_config_get() here, since refcount 314 /* don't need to clusterip_config_get() here, since refcount
317 * is only decremented by destroy() - and ip_tables guarantees 315 * is only decremented by destroy() - and ip_tables guarantees
318 * that the ->target() function isn't called after ->destroy() */ 316 * that the ->target() function isn't called after ->destroy() */
319 317
320 mark = nf_ct_get_mark((*pskb), &ctinfo); 318 ct = nf_ct_get(*pskb, &ctinfo);
321 if (mark == NULL) { 319 if (ct == NULL) {
322 printk(KERN_ERR "CLUSTERIP: no conntrack!\n"); 320 printk(KERN_ERR "CLUSTERIP: no conntrack!\n");
323 /* FIXME: need to drop invalid ones, since replies 321 /* FIXME: need to drop invalid ones, since replies
324 * to outgoing connections of other nodes will be 322 * to outgoing connections of other nodes will be
@@ -328,7 +326,7 @@ target(struct sk_buff **pskb,
328 326
329 /* special case: ICMP error handling. conntrack distinguishes between 327 /* special case: ICMP error handling. conntrack distinguishes between
330 * error messages (RELATED) and information requests (see below) */ 328 * error messages (RELATED) and information requests (see below) */
331 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP 329 if (ip_hdr(*pskb)->protocol == IPPROTO_ICMP
332 && (ctinfo == IP_CT_RELATED 330 && (ctinfo == IP_CT_RELATED
333 || ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY)) 331 || ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY))
334 return XT_CONTINUE; 332 return XT_CONTINUE;
@@ -341,7 +339,7 @@ target(struct sk_buff **pskb,
341 339
342 switch (ctinfo) { 340 switch (ctinfo) {
343 case IP_CT_NEW: 341 case IP_CT_NEW:
344 *mark = hash; 342 ct->mark = hash;
345 break; 343 break;
346 case IP_CT_RELATED: 344 case IP_CT_RELATED:
347 case IP_CT_RELATED+IP_CT_IS_REPLY: 345 case IP_CT_RELATED+IP_CT_IS_REPLY:
@@ -358,7 +356,7 @@ target(struct sk_buff **pskb,
358#ifdef DEBUG_CLUSTERP 356#ifdef DEBUG_CLUSTERP
359 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 357 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
360#endif 358#endif
361 DEBUGP("hash=%u ct_hash=%u ", hash, *mark); 359 DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark);
362 if (!clusterip_responsible(cipinfo->config, hash)) { 360 if (!clusterip_responsible(cipinfo->config, hash)) {
363 DEBUGP("not responsible\n"); 361 DEBUGP("not responsible\n");
364 return NF_DROP; 362 return NF_DROP;
@@ -521,7 +519,7 @@ arp_mangle(unsigned int hook,
521 const struct net_device *out, 519 const struct net_device *out,
522 int (*okfn)(struct sk_buff *)) 520 int (*okfn)(struct sk_buff *))
523{ 521{
524 struct arphdr *arp = (*pskb)->nh.arph; 522 struct arphdr *arp = arp_hdr(*pskb);
525 struct arp_payload *payload; 523 struct arp_payload *payload;
526 struct clusterip_config *c; 524 struct clusterip_config *c;
527 525
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index 4f565633631d..918ca92e534a 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -5,14 +5,13 @@
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as 6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 *
9 * ipt_ECN.c,v 1.5 2002/08/18 19:36:51 laforge Exp
10*/ 8*/
11 9
12#include <linux/in.h> 10#include <linux/in.h>
13#include <linux/module.h> 11#include <linux/module.h>
14#include <linux/skbuff.h> 12#include <linux/skbuff.h>
15#include <linux/ip.h> 13#include <linux/ip.h>
14#include <net/ip.h>
16#include <linux/tcp.h> 15#include <linux/tcp.h>
17#include <net/checksum.h> 16#include <net/checksum.h>
18 17
@@ -29,13 +28,13 @@ MODULE_DESCRIPTION("iptables ECN modification module");
29static inline int 28static inline int
30set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo) 29set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
31{ 30{
32 struct iphdr *iph = (*pskb)->nh.iph; 31 struct iphdr *iph = ip_hdr(*pskb);
33 32
34 if ((iph->tos & IPT_ECN_IP_MASK) != (einfo->ip_ect & IPT_ECN_IP_MASK)) { 33 if ((iph->tos & IPT_ECN_IP_MASK) != (einfo->ip_ect & IPT_ECN_IP_MASK)) {
35 __u8 oldtos; 34 __u8 oldtos;
36 if (!skb_make_writable(pskb, sizeof(struct iphdr))) 35 if (!skb_make_writable(pskb, sizeof(struct iphdr)))
37 return 0; 36 return 0;
38 iph = (*pskb)->nh.iph; 37 iph = ip_hdr(*pskb);
39 oldtos = iph->tos; 38 oldtos = iph->tos;
40 iph->tos &= ~IPT_ECN_IP_MASK; 39 iph->tos &= ~IPT_ECN_IP_MASK;
41 iph->tos |= (einfo->ip_ect & IPT_ECN_IP_MASK); 40 iph->tos |= (einfo->ip_ect & IPT_ECN_IP_MASK);
@@ -52,7 +51,7 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
52 __be16 oldval; 51 __be16 oldval;
53 52
54 /* Not enough header? */ 53
55 tcph = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4, 54 tcph = skb_header_pointer(*pskb, ip_hdrlen(*pskb),
56 sizeof(_tcph), &_tcph); 55 sizeof(_tcph), &_tcph);
57 if (!tcph) 56 if (!tcph)
58 return 0; 57 return 0;
@@ -63,9 +62,9 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
63 tcph->cwr == einfo->proto.tcp.cwr))) 62 tcph->cwr == einfo->proto.tcp.cwr)))
64 return 1; 63 return 1;
65 64
66 if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) 65 if (!skb_make_writable(pskb, ip_hdrlen(*pskb) + sizeof(*tcph)))
67 return 0; 66 return 0;
68 tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4; 67 tcph = (void *)ip_hdr(*pskb) + ip_hdrlen(*pskb);
69 68
70 oldval = ((__be16 *)tcph)[6]; 69 oldval = ((__be16 *)tcph)[6];
71 if (einfo->operation & IPT_ECN_OP_SET_ECE) 70 if (einfo->operation & IPT_ECN_OP_SET_ECE)
@@ -93,7 +92,7 @@ target(struct sk_buff **pskb,
93 return NF_DROP; 92 return NF_DROP;
94 93
95 if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR) 94 if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR)
96 && (*pskb)->nh.iph->protocol == IPPROTO_TCP) 95 && ip_hdr(*pskb)->protocol == IPPROTO_TCP)
97 if (!set_ect_tcp(pskb, einfo)) 96 if (!set_ect_tcp(pskb, einfo))
98 return NF_DROP; 97 return NF_DROP;
99 98
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index d9c37fd94228..a42c5cd968b1 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -399,9 +399,9 @@ ipt_log_packet(unsigned int pf,
399 /* MAC logging for input chain only. */ 399 /* MAC logging for input chain only. */
400 printk("MAC="); 400 printk("MAC=");
401 if (skb->dev && skb->dev->hard_header_len 401 if (skb->dev && skb->dev->hard_header_len
402 && skb->mac.raw != (void*)skb->nh.iph) { 402 && skb->mac_header != skb->network_header) {
403 int i; 403 int i;
404 unsigned char *p = skb->mac.raw; 404 const unsigned char *p = skb_mac_header(skb);
405 for (i = 0; i < skb->dev->hard_header_len; i++,p++) 405 for (i = 0; i < skb->dev->hard_header_len; i++,p++)
406 printk("%02x%c", *p, 406 printk("%02x%c", *p,
407 i==skb->dev->hard_header_len - 1 407 i==skb->dev->hard_header_len - 1
@@ -477,14 +477,10 @@ static int __init ipt_log_init(void)
477 ret = xt_register_target(&ipt_log_reg); 477 ret = xt_register_target(&ipt_log_reg);
478 if (ret < 0) 478 if (ret < 0)
479 return ret; 479 return ret;
480 if (nf_log_register(PF_INET, &ipt_log_logger) < 0) { 480 ret = nf_log_register(PF_INET, &ipt_log_logger);
481 printk(KERN_WARNING "ipt_LOG: not logging via system console " 481 if (ret < 0 && ret != -EEXIST)
482 "since somebody else already registered for PF_INET\n"); 482 xt_unregister_target(&ipt_log_reg);
483 /* we cannot make module load fail here, since otherwise 483 return ret;
484 * iptables userspace would abort */
485 }
486
487 return 0;
488} 484}
489 485
490static void __exit ipt_log_fini(void) 486static void __exit ipt_log_fini(void)
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index b5955f3a3f8f..d4f2d7775330 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -19,12 +19,8 @@
19#include <net/ip.h> 19#include <net/ip.h>
20#include <net/checksum.h> 20#include <net/checksum.h>
21#include <net/route.h> 21#include <net/route.h>
22#include <linux/netfilter_ipv4.h>
23#ifdef CONFIG_NF_NAT_NEEDED
24#include <net/netfilter/nf_nat_rule.h> 22#include <net/netfilter/nf_nat_rule.h>
25#else 23#include <linux/netfilter_ipv4.h>
26#include <linux/netfilter_ipv4/ip_nat_rule.h>
27#endif
28#include <linux/netfilter/x_tables.h> 24#include <linux/netfilter/x_tables.h>
29 25
30MODULE_LICENSE("GPL"); 26MODULE_LICENSE("GPL");
@@ -48,7 +44,7 @@ masquerade_check(const char *tablename,
48 void *targinfo, 44 void *targinfo,
49 unsigned int hook_mask) 45 unsigned int hook_mask)
50{ 46{
51 const struct ip_nat_multi_range_compat *mr = targinfo; 47 const struct nf_nat_multi_range_compat *mr = targinfo;
52 48
53 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { 49 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
54 DEBUGP("masquerade_check: bad MAP_IPS.\n"); 50 DEBUGP("masquerade_check: bad MAP_IPS.\n");
@@ -69,33 +65,26 @@ masquerade_target(struct sk_buff **pskb,
69 const struct xt_target *target, 65 const struct xt_target *target,
70 const void *targinfo) 66 const void *targinfo)
71{ 67{
72#ifdef CONFIG_NF_NAT_NEEDED 68 struct nf_conn *ct;
73 struct nf_conn_nat *nat; 69 struct nf_conn_nat *nat;
74#endif
75 struct ip_conntrack *ct;
76 enum ip_conntrack_info ctinfo; 70 enum ip_conntrack_info ctinfo;
77 struct ip_nat_range newrange; 71 struct nf_nat_range newrange;
78 const struct ip_nat_multi_range_compat *mr; 72 const struct nf_nat_multi_range_compat *mr;
79 struct rtable *rt; 73 struct rtable *rt;
80 __be32 newsrc; 74 __be32 newsrc;
81 75
82 IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); 76 NF_CT_ASSERT(hooknum == NF_IP_POST_ROUTING);
83 77
84 ct = ip_conntrack_get(*pskb, &ctinfo); 78 ct = nf_ct_get(*pskb, &ctinfo);
85#ifdef CONFIG_NF_NAT_NEEDED
86 nat = nfct_nat(ct); 79 nat = nfct_nat(ct);
87#endif 80
88 IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED 81 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED
89 || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); 82 || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
90 83
91 /* Source address is 0.0.0.0 - locally generated packet that is 84 /* Source address is 0.0.0.0 - locally generated packet that is
92 * probably not supposed to be masqueraded. 85 * probably not supposed to be masqueraded.
93 */ 86 */
94#ifdef CONFIG_NF_NAT_NEEDED
95 if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) 87 if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
96#else
97 if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip == 0)
98#endif
99 return NF_ACCEPT; 88 return NF_ACCEPT;
100 89
101 mr = targinfo; 90 mr = targinfo;
@@ -107,40 +96,30 @@ masquerade_target(struct sk_buff **pskb,
107 } 96 }
108 97
109 write_lock_bh(&masq_lock); 98 write_lock_bh(&masq_lock);
110#ifdef CONFIG_NF_NAT_NEEDED
111 nat->masq_index = out->ifindex; 99 nat->masq_index = out->ifindex;
112#else
113 ct->nat.masq_index = out->ifindex;
114#endif
115 write_unlock_bh(&masq_lock); 100 write_unlock_bh(&masq_lock);
116 101
117 /* Transfer from original range. */ 102 /* Transfer from original range. */
118 newrange = ((struct ip_nat_range) 103 newrange = ((struct nf_nat_range)
119 { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, 104 { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
120 newsrc, newsrc, 105 newsrc, newsrc,
121 mr->range[0].min, mr->range[0].max }); 106 mr->range[0].min, mr->range[0].max });
122 107
123 /* Hand modified range to generic setup. */ 108 /* Hand modified range to generic setup. */
124 return ip_nat_setup_info(ct, &newrange, hooknum); 109 return nf_nat_setup_info(ct, &newrange, hooknum);
125} 110}
126 111
127static inline int 112static inline int
128device_cmp(struct ip_conntrack *i, void *ifindex) 113device_cmp(struct nf_conn *i, void *ifindex)
129{ 114{
130 int ret;
131#ifdef CONFIG_NF_NAT_NEEDED
132 struct nf_conn_nat *nat = nfct_nat(i); 115 struct nf_conn_nat *nat = nfct_nat(i);
116 int ret;
133 117
134 if (!nat) 118 if (!nat)
135 return 0; 119 return 0;
136#endif
137 120
138 read_lock_bh(&masq_lock); 121 read_lock_bh(&masq_lock);
139#ifdef CONFIG_NF_NAT_NEEDED
140 ret = (nat->masq_index == (int)(long)ifindex); 122 ret = (nat->masq_index == (int)(long)ifindex);
141#else
142 ret = (i->nat.masq_index == (int)(long)ifindex);
143#endif
144 read_unlock_bh(&masq_lock); 123 read_unlock_bh(&masq_lock);
145 124
146 return ret; 125 return ret;
@@ -156,9 +135,9 @@ static int masq_device_event(struct notifier_block *this,
156 /* Device was downed. Search entire table for 135 /* Device was downed. Search entire table for
157 conntracks which were associated with that device, 136 conntracks which were associated with that device,
158 and forget them. */ 137 and forget them. */
159 IP_NF_ASSERT(dev->ifindex != 0); 138 NF_CT_ASSERT(dev->ifindex != 0);
160 139
161 ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex); 140 nf_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex);
162 } 141 }
163 142
164 return NOTIFY_DONE; 143 return NOTIFY_DONE;
@@ -174,9 +153,9 @@ static int masq_inet_event(struct notifier_block *this,
174 /* IP address was deleted. Search entire table for 153 /* IP address was deleted. Search entire table for
175 conntracks which were associated with that device, 154 conntracks which were associated with that device,
176 and forget them. */ 155 and forget them. */
177 IP_NF_ASSERT(dev->ifindex != 0); 156 NF_CT_ASSERT(dev->ifindex != 0);
178 157
179 ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex); 158 nf_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex);
180 } 159 }
181 160
182 return NOTIFY_DONE; 161 return NOTIFY_DONE;
@@ -194,7 +173,7 @@ static struct xt_target masquerade = {
194 .name = "MASQUERADE", 173 .name = "MASQUERADE",
195 .family = AF_INET, 174 .family = AF_INET,
196 .target = masquerade_target, 175 .target = masquerade_target,
197 .targetsize = sizeof(struct ip_nat_multi_range_compat), 176 .targetsize = sizeof(struct nf_nat_multi_range_compat),
198 .table = "nat", 177 .table = "nat",
199 .hooks = 1 << NF_IP_POST_ROUTING, 178 .hooks = 1 << NF_IP_POST_ROUTING,
200 .checkentry = masquerade_check, 179 .checkentry = masquerade_check,
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index fd7aaa347cd8..068c69bce30e 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -16,11 +16,7 @@
16#include <linux/netfilter.h> 16#include <linux/netfilter.h>
17#include <linux/netfilter_ipv4.h> 17#include <linux/netfilter_ipv4.h>
18#include <linux/netfilter/x_tables.h> 18#include <linux/netfilter/x_tables.h>
19#ifdef CONFIG_NF_NAT_NEEDED
20#include <net/netfilter/nf_nat_rule.h> 19#include <net/netfilter/nf_nat_rule.h>
21#else
22#include <linux/netfilter_ipv4/ip_nat_rule.h>
23#endif
24 20
25#define MODULENAME "NETMAP" 21#define MODULENAME "NETMAP"
26MODULE_LICENSE("GPL"); 22MODULE_LICENSE("GPL");
@@ -40,7 +36,7 @@ check(const char *tablename,
40 void *targinfo, 36 void *targinfo,
41 unsigned int hook_mask) 37 unsigned int hook_mask)
42{ 38{
43 const struct ip_nat_multi_range_compat *mr = targinfo; 39 const struct nf_nat_multi_range_compat *mr = targinfo;
44 40
45 if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) { 41 if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) {
46 DEBUGP(MODULENAME":check: bad MAP_IPS.\n"); 42 DEBUGP(MODULENAME":check: bad MAP_IPS.\n");
@@ -61,39 +57,39 @@ target(struct sk_buff **pskb,
61 const struct xt_target *target, 57 const struct xt_target *target,
62 const void *targinfo) 58 const void *targinfo)
63{ 59{
64 struct ip_conntrack *ct; 60 struct nf_conn *ct;
65 enum ip_conntrack_info ctinfo; 61 enum ip_conntrack_info ctinfo;
66 __be32 new_ip, netmask; 62 __be32 new_ip, netmask;
67 const struct ip_nat_multi_range_compat *mr = targinfo; 63 const struct nf_nat_multi_range_compat *mr = targinfo;
68 struct ip_nat_range newrange; 64 struct nf_nat_range newrange;
69 65
70 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING 66 NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING
71 || hooknum == NF_IP_POST_ROUTING 67 || hooknum == NF_IP_POST_ROUTING
72 || hooknum == NF_IP_LOCAL_OUT); 68 || hooknum == NF_IP_LOCAL_OUT);
73 ct = ip_conntrack_get(*pskb, &ctinfo); 69 ct = nf_ct_get(*pskb, &ctinfo);
74 70
75 netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); 71 netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
76 72
77 if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT) 73 if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT)
78 new_ip = (*pskb)->nh.iph->daddr & ~netmask; 74 new_ip = ip_hdr(*pskb)->daddr & ~netmask;
79 else 75 else
80 new_ip = (*pskb)->nh.iph->saddr & ~netmask; 76 new_ip = ip_hdr(*pskb)->saddr & ~netmask;
81 new_ip |= mr->range[0].min_ip & netmask; 77 new_ip |= mr->range[0].min_ip & netmask;
82 78
83 newrange = ((struct ip_nat_range) 79 newrange = ((struct nf_nat_range)
84 { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, 80 { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
85 new_ip, new_ip, 81 new_ip, new_ip,
86 mr->range[0].min, mr->range[0].max }); 82 mr->range[0].min, mr->range[0].max });
87 83
88 /* Hand modified range to generic setup. */ 84 /* Hand modified range to generic setup. */
89 return ip_nat_setup_info(ct, &newrange, hooknum); 85 return nf_nat_setup_info(ct, &newrange, hooknum);
90} 86}
91 87
92static struct xt_target target_module = { 88static struct xt_target target_module = {
93 .name = MODULENAME, 89 .name = MODULENAME,
94 .family = AF_INET, 90 .family = AF_INET,
95 .target = target, 91 .target = target,
96 .targetsize = sizeof(struct ip_nat_multi_range_compat), 92 .targetsize = sizeof(struct nf_nat_multi_range_compat),
97 .table = "nat", 93 .table = "nat",
98 .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING) | 94 .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING) |
99 (1 << NF_IP_LOCAL_OUT), 95 (1 << NF_IP_LOCAL_OUT),
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
index c2b6b80670f8..68cc76a198eb 100644
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -19,11 +19,7 @@
19#include <net/checksum.h> 19#include <net/checksum.h>
20#include <linux/netfilter_ipv4.h> 20#include <linux/netfilter_ipv4.h>
21#include <linux/netfilter/x_tables.h> 21#include <linux/netfilter/x_tables.h>
22#ifdef CONFIG_NF_NAT_NEEDED
23#include <net/netfilter/nf_nat_rule.h> 22#include <net/netfilter/nf_nat_rule.h>
24#else
25#include <linux/netfilter_ipv4/ip_nat_rule.h>
26#endif
27 23
28MODULE_LICENSE("GPL"); 24MODULE_LICENSE("GPL");
29MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); 25MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
@@ -43,7 +39,7 @@ redirect_check(const char *tablename,
43 void *targinfo, 39 void *targinfo,
44 unsigned int hook_mask) 40 unsigned int hook_mask)
45{ 41{
46 const struct ip_nat_multi_range_compat *mr = targinfo; 42 const struct nf_nat_multi_range_compat *mr = targinfo;
47 43
48 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { 44 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
49 DEBUGP("redirect_check: bad MAP_IPS.\n"); 45 DEBUGP("redirect_check: bad MAP_IPS.\n");
@@ -64,17 +60,17 @@ redirect_target(struct sk_buff **pskb,
64 const struct xt_target *target, 60 const struct xt_target *target,
65 const void *targinfo) 61 const void *targinfo)
66{ 62{
67 struct ip_conntrack *ct; 63 struct nf_conn *ct;
68 enum ip_conntrack_info ctinfo; 64 enum ip_conntrack_info ctinfo;
69 __be32 newdst; 65 __be32 newdst;
70 const struct ip_nat_multi_range_compat *mr = targinfo; 66 const struct nf_nat_multi_range_compat *mr = targinfo;
71 struct ip_nat_range newrange; 67 struct nf_nat_range newrange;
72 68
73 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING 69 NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING
74 || hooknum == NF_IP_LOCAL_OUT); 70 || hooknum == NF_IP_LOCAL_OUT);
75 71
76 ct = ip_conntrack_get(*pskb, &ctinfo); 72 ct = nf_ct_get(*pskb, &ctinfo);
77 IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); 73 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
78 74
79 /* Local packets: make them go to loopback */ 75 /* Local packets: make them go to loopback */
80 if (hooknum == NF_IP_LOCAL_OUT) 76 if (hooknum == NF_IP_LOCAL_OUT)
@@ -96,20 +92,20 @@ redirect_target(struct sk_buff **pskb,
96 } 92 }
97 93
98 /* Transfer from original range. */ 94 /* Transfer from original range. */
99 newrange = ((struct ip_nat_range) 95 newrange = ((struct nf_nat_range)
100 { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, 96 { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
101 newdst, newdst, 97 newdst, newdst,
102 mr->range[0].min, mr->range[0].max }); 98 mr->range[0].min, mr->range[0].max });
103 99
104 /* Hand modified range to generic setup. */ 100 /* Hand modified range to generic setup. */
105 return ip_nat_setup_info(ct, &newrange, hooknum); 101 return nf_nat_setup_info(ct, &newrange, hooknum);
106} 102}
107 103
108static struct xt_target redirect_reg = { 104static struct xt_target redirect_reg = {
109 .name = "REDIRECT", 105 .name = "REDIRECT",
110 .family = AF_INET, 106 .family = AF_INET,
111 .target = redirect_target, 107 .target = redirect_target,
112 .targetsize = sizeof(struct ip_nat_multi_range_compat), 108 .targetsize = sizeof(struct nf_nat_multi_range_compat),
113 .table = "nat", 109 .table = "nat",
114 .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT), 110 .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT),
115 .checkentry = redirect_check, 111 .checkentry = redirect_check,
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 80f739e21824..9041e0741f6f 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -1,7 +1,5 @@
1/* 1/*
2 * This is a module which is used for rejecting packets. 2 * This is a module which is used for rejecting packets.
3 * Added support for customized reject packets (Jozsef Kadlecsik).
4 * Added support for ICMP type-3-code-13 (Maciej Soltysiak). [RFC 1812]
5 */ 3 */
6 4
7/* (C) 1999-2001 Paul `Rusty' Russell 5/* (C) 1999-2001 Paul `Rusty' Russell
@@ -43,7 +41,7 @@ MODULE_DESCRIPTION("iptables REJECT target module");
43static void send_reset(struct sk_buff *oldskb, int hook) 41static void send_reset(struct sk_buff *oldskb, int hook)
44{ 42{
45 struct sk_buff *nskb; 43 struct sk_buff *nskb;
46 struct iphdr *iph = oldskb->nh.iph; 44 struct iphdr *niph;
47 struct tcphdr _otcph, *oth, *tcph; 45 struct tcphdr _otcph, *oth, *tcph;
48 __be16 tmp_port; 46 __be16 tmp_port;
49 __be32 tmp_addr; 47 __be32 tmp_addr;
@@ -51,10 +49,10 @@ static void send_reset(struct sk_buff *oldskb, int hook)
51 unsigned int addr_type; 49 unsigned int addr_type;
52 50
53 /* IP header checks: fragment. */ 51 /* IP header checks: fragment. */
54 if (oldskb->nh.iph->frag_off & htons(IP_OFFSET)) 52 if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
55 return; 53 return;
56 54
57 oth = skb_header_pointer(oldskb, oldskb->nh.iph->ihl * 4, 55 oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb),
58 sizeof(_otcph), &_otcph); 56 sizeof(_otcph), &_otcph);
59 if (oth == NULL) 57 if (oth == NULL)
60 return; 58 return;
@@ -64,7 +62,7 @@ static void send_reset(struct sk_buff *oldskb, int hook)
64 return; 62 return;
65 63
66 /* Check checksum */ 64 /* Check checksum */
67 if (nf_ip_checksum(oldskb, hook, iph->ihl * 4, IPPROTO_TCP)) 65 if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP))
68 return; 66 return;
69 67
70 /* We need a linear, writeable skb. We also need to expand 68 /* We need a linear, writeable skb. We also need to expand
@@ -84,20 +82,21 @@ static void send_reset(struct sk_buff *oldskb, int hook)
84 skb_shinfo(nskb)->gso_segs = 0; 82 skb_shinfo(nskb)->gso_segs = 0;
85 skb_shinfo(nskb)->gso_type = 0; 83 skb_shinfo(nskb)->gso_type = 0;
86 84
87 tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl); 85 tcph = (struct tcphdr *)(skb_network_header(nskb) + ip_hdrlen(nskb));
88 86
89 /* Swap source and dest */ 87 /* Swap source and dest */
90 tmp_addr = nskb->nh.iph->saddr; 88 niph = ip_hdr(nskb);
91 nskb->nh.iph->saddr = nskb->nh.iph->daddr; 89 tmp_addr = niph->saddr;
92 nskb->nh.iph->daddr = tmp_addr; 90 niph->saddr = niph->daddr;
91 niph->daddr = tmp_addr;
93 tmp_port = tcph->source; 92 tmp_port = tcph->source;
94 tcph->source = tcph->dest; 93 tcph->source = tcph->dest;
95 tcph->dest = tmp_port; 94 tcph->dest = tmp_port;
96 95
97 /* Truncate to length (no data) */ 96 /* Truncate to length (no data) */
98 tcph->doff = sizeof(struct tcphdr)/4; 97 tcph->doff = sizeof(struct tcphdr)/4;
99 skb_trim(nskb, nskb->nh.iph->ihl*4 + sizeof(struct tcphdr)); 98 skb_trim(nskb, ip_hdrlen(nskb) + sizeof(struct tcphdr));
100 nskb->nh.iph->tot_len = htons(nskb->len); 99 niph->tot_len = htons(nskb->len);
101 100
102 if (tcph->ack) { 101 if (tcph->ack) {
103 needs_ack = 0; 102 needs_ack = 0;
@@ -105,9 +104,9 @@ static void send_reset(struct sk_buff *oldskb, int hook)
105 tcph->ack_seq = 0; 104 tcph->ack_seq = 0;
106 } else { 105 } else {
107 needs_ack = 1; 106 needs_ack = 1;
108 tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin 107 tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin +
109 + oldskb->len - oldskb->nh.iph->ihl*4 108 oldskb->len - ip_hdrlen(oldskb) -
110 - (oth->doff<<2)); 109 (oth->doff << 2));
111 tcph->seq = 0; 110 tcph->seq = 0;
112 } 111 }
113 112
@@ -122,14 +121,13 @@ static void send_reset(struct sk_buff *oldskb, int hook)
122 /* Adjust TCP checksum */ 121 /* Adjust TCP checksum */
123 tcph->check = 0; 122 tcph->check = 0;
124 tcph->check = tcp_v4_check(sizeof(struct tcphdr), 123 tcph->check = tcp_v4_check(sizeof(struct tcphdr),
125 nskb->nh.iph->saddr, 124 niph->saddr, niph->daddr,
126 nskb->nh.iph->daddr,
127 csum_partial((char *)tcph, 125 csum_partial((char *)tcph,
128 sizeof(struct tcphdr), 0)); 126 sizeof(struct tcphdr), 0));
129 127
130 /* Set DF, id = 0 */ 128 /* Set DF, id = 0 */
131 nskb->nh.iph->frag_off = htons(IP_DF); 129 niph->frag_off = htons(IP_DF);
132 nskb->nh.iph->id = 0; 130 niph->id = 0;
133 131
134 addr_type = RTN_UNSPEC; 132 addr_type = RTN_UNSPEC;
135 if (hook != NF_IP_FORWARD 133 if (hook != NF_IP_FORWARD
@@ -145,12 +143,11 @@ static void send_reset(struct sk_buff *oldskb, int hook)
145 nskb->ip_summed = CHECKSUM_NONE; 143 nskb->ip_summed = CHECKSUM_NONE;
146 144
147 /* Adjust IP TTL */ 145 /* Adjust IP TTL */
148 nskb->nh.iph->ttl = dst_metric(nskb->dst, RTAX_HOPLIMIT); 146 niph->ttl = dst_metric(nskb->dst, RTAX_HOPLIMIT);
149 147
150 /* Adjust IP checksum */ 148 /* Adjust IP checksum */
151 nskb->nh.iph->check = 0; 149 niph->check = 0;
152 nskb->nh.iph->check = ip_fast_csum((unsigned char *)nskb->nh.iph, 150 niph->check = ip_fast_csum(skb_network_header(nskb), niph->ihl);
153 nskb->nh.iph->ihl);
154 151
155 /* "Never happens" */ 152 /* "Never happens" */
156 if (nskb->len > dst_mtu(nskb->dst)) 153 if (nskb->len > dst_mtu(nskb->dst))
@@ -182,7 +179,7 @@ static unsigned int reject(struct sk_buff **pskb,
182 179
183 /* Our naive response construction doesn't deal with IP 180 /* Our naive response construction doesn't deal with IP
184 options, and probably shouldn't try. */ 181 options, and probably shouldn't try. */
185 if ((*pskb)->nh.iph->ihl<<2 != sizeof(struct iphdr)) 182 if (ip_hdrlen(*pskb) != sizeof(struct iphdr))
186 return NF_DROP; 183 return NF_DROP;
187 184
188 /* WARNING: This code causes reentry within iptables. 185 /* WARNING: This code causes reentry within iptables.
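
Most of the churn in ipt_REJECT is the conversion from direct skb->nh.iph dereferences to the header accessors introduced alongside this series. Roughly, and assuming the 2.6.22-era definitions in include/linux/ip.h and include/linux/skbuff.h, the two helpers reduce to:

    #include <linux/ip.h>
    #include <linux/skbuff.h>

    /* Approximate expansions of the accessors used above: */
    static inline struct iphdr *sketch_ip_hdr(const struct sk_buff *skb)
    {
            /* The network header pointer replaces the old nh union. */
            return (struct iphdr *)skb_network_header(skb);
    }

    static inline unsigned int sketch_ip_hdrlen(const struct sk_buff *skb)
    {
            return sketch_ip_hdr(skb)->ihl * 4; /* ihl is in 32-bit words */
    }
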
diff --git a/net/ipv4/netfilter/ipt_SAME.c b/net/ipv4/netfilter/ipt_SAME.c
index bd4404e5c688..511e5ff84938 100644
--- a/net/ipv4/netfilter/ipt_SAME.c
+++ b/net/ipv4/netfilter/ipt_SAME.c
@@ -7,21 +7,6 @@
7 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as 8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation. 9 * published by the Free Software Foundation.
10 *
11 * 010320 Martin Josefsson <gandalf@wlug.westbo.se>
12 * * copied ipt_BALANCE.c to ipt_SAME.c and changed a few things.
13 * 010728 Martin Josefsson <gandalf@wlug.westbo.se>
14 * * added --nodst to not include destination-ip in new source
15 * calculations.
16 * * added some more sanity-checks.
17 * 010729 Martin Josefsson <gandalf@wlug.westbo.se>
18 * * fixed a buggy if-statement in same_check(), should have
19 * used ntohl() but didn't.
20 * * added support for multiple ranges. IPT_SAME_MAX_RANGE is
21 * defined in linux/include/linux/netfilter_ipv4/ipt_SAME.h
22 * and is currently set to 10.
23 * * added support for 1-address range, nice to have now that
24 * we have multiple ranges.
25 */ 10 */
26#include <linux/types.h> 11#include <linux/types.h>
27#include <linux/ip.h> 12#include <linux/ip.h>
@@ -35,11 +20,7 @@
35#include <net/checksum.h> 20#include <net/checksum.h>
36#include <linux/netfilter_ipv4.h> 21#include <linux/netfilter_ipv4.h>
37#include <linux/netfilter/x_tables.h> 22#include <linux/netfilter/x_tables.h>
38#ifdef CONFIG_NF_NAT_NEEDED
39#include <net/netfilter/nf_nat_rule.h> 23#include <net/netfilter/nf_nat_rule.h>
40#else
41#include <linux/netfilter_ipv4/ip_nat_rule.h>
42#endif
43#include <linux/netfilter_ipv4/ipt_SAME.h> 24#include <linux/netfilter_ipv4/ipt_SAME.h>
44 25
45MODULE_LICENSE("GPL"); 26MODULE_LICENSE("GPL");
@@ -138,17 +119,17 @@ same_target(struct sk_buff **pskb,
138 const struct xt_target *target, 119 const struct xt_target *target,
139 const void *targinfo) 120 const void *targinfo)
140{ 121{
141 struct ip_conntrack *ct; 122 struct nf_conn *ct;
142 enum ip_conntrack_info ctinfo; 123 enum ip_conntrack_info ctinfo;
143 u_int32_t tmpip, aindex; 124 u_int32_t tmpip, aindex;
144 __be32 new_ip; 125 __be32 new_ip;
145 const struct ipt_same_info *same = targinfo; 126 const struct ipt_same_info *same = targinfo;
146 struct ip_nat_range newrange; 127 struct nf_nat_range newrange;
147 const struct ip_conntrack_tuple *t; 128 const struct nf_conntrack_tuple *t;
148 129
149 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING || 130 NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING ||
150 hooknum == NF_IP_POST_ROUTING); 131 hooknum == NF_IP_POST_ROUTING);
151 ct = ip_conntrack_get(*pskb, &ctinfo); 132 ct = nf_ct_get(*pskb, &ctinfo);
152 133
153 t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 134 t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
154 135
@@ -157,17 +138,10 @@ same_target(struct sk_buff **pskb,
157 Here we calculate the index in same->iparray which 138 Here we calculate the index in same->iparray which
158 holds the ipaddress we should use */ 139 holds the ipaddress we should use */
159 140
160#ifdef CONFIG_NF_NAT_NEEDED
161 tmpip = ntohl(t->src.u3.ip); 141 tmpip = ntohl(t->src.u3.ip);
162 142
163 if (!(same->info & IPT_SAME_NODST)) 143 if (!(same->info & IPT_SAME_NODST))
164 tmpip += ntohl(t->dst.u3.ip); 144 tmpip += ntohl(t->dst.u3.ip);
165#else
166 tmpip = ntohl(t->src.ip);
167
168 if (!(same->info & IPT_SAME_NODST))
169 tmpip += ntohl(t->dst.ip);
170#endif
171 aindex = tmpip % same->ipnum; 145 aindex = tmpip % same->ipnum;
172 146
173 new_ip = htonl(same->iparray[aindex]); 147 new_ip = htonl(same->iparray[aindex]);
@@ -178,13 +152,13 @@ same_target(struct sk_buff **pskb,
178 NIPQUAD(new_ip)); 152 NIPQUAD(new_ip));
179 153
180 /* Transfer from original range. */ 154 /* Transfer from original range. */
181 newrange = ((struct ip_nat_range) 155 newrange = ((struct nf_nat_range)
182 { same->range[0].flags, new_ip, new_ip, 156 { same->range[0].flags, new_ip, new_ip,
183 /* FIXME: Use ports from correct range! */ 157 /* FIXME: Use ports from correct range! */
184 same->range[0].min, same->range[0].max }); 158 same->range[0].min, same->range[0].max });
185 159
186 /* Hand modified range to generic setup. */ 160 /* Hand modified range to generic setup. */
187 return ip_nat_setup_info(ct, &newrange, hooknum); 161 return nf_nat_setup_info(ct, &newrange, hooknum);
188} 162}
189 163
190static struct xt_target same_reg = { 164static struct xt_target same_reg = {
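
The dropped CONFIG_NF_NAT_NEEDED branch existed because the old ip_conntrack tuple carried a bare IPv4 address (t->src.ip), while nf_conntrack stores addresses in a union shared across address families; with only the nf_nat backend left, the unconditional t->src.u3.ip read suffices. A rough sketch of that layout, with field names abridged from include/net/netfilter/nf_conntrack_tuple.h:

    #include <linux/in6.h>
    #include <linux/types.h>

    /* Rough shape of the per-family tuple address ('u3' above): */
    union sketch_nf_conntrack_address {
            __be32 all[4];          /* raw words, family-agnostic  */
            __be32 ip;              /* IPv4 view: t->src.u3.ip     */
            struct in6_addr in6;    /* IPv6 view of the same bytes */
    };
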
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c
index cedf9f7d9d6e..0ad02f249837 100644
--- a/net/ipv4/netfilter/ipt_TOS.c
+++ b/net/ipv4/netfilter/ipt_TOS.c
@@ -29,13 +29,13 @@ target(struct sk_buff **pskb,
29 const void *targinfo) 29 const void *targinfo)
30{ 30{
31 const struct ipt_tos_target_info *tosinfo = targinfo; 31 const struct ipt_tos_target_info *tosinfo = targinfo;
32 struct iphdr *iph = (*pskb)->nh.iph; 32 struct iphdr *iph = ip_hdr(*pskb);
33 33
34 if ((iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) { 34 if ((iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) {
35 __u8 oldtos; 35 __u8 oldtos;
36 if (!skb_make_writable(pskb, sizeof(struct iphdr))) 36 if (!skb_make_writable(pskb, sizeof(struct iphdr)))
37 return NF_DROP; 37 return NF_DROP;
38 iph = (*pskb)->nh.iph; 38 iph = ip_hdr(*pskb);
39 oldtos = iph->tos; 39 oldtos = iph->tos;
40 iph->tos = (iph->tos & IPTOS_PREC_MASK) | tosinfo->tos; 40 iph->tos = (iph->tos & IPTOS_PREC_MASK) | tosinfo->tos;
41 nf_csum_replace2(&iph->check, htons(oldtos), htons(iph->tos)); 41 nf_csum_replace2(&iph->check, htons(oldtos), htons(iph->tos));
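
Note the reload pattern this hunk preserves: skb_make_writable() may copy the skb, so the header pointer taken before the call is reacquired afterwards, now via ip_hdr(); the IP checksum is then patched incrementally over the one changed 16-bit word rather than recomputed. A condensed sketch under those assumptions (sketch_set_tos is illustrative):

    #include <linux/ip.h>
    #include <linux/netfilter.h>
    #include <linux/netfilter/x_tables.h>

    static unsigned int sketch_set_tos(struct sk_buff **pskb, u_int8_t tos)
    {
            struct iphdr *iph = ip_hdr(*pskb);
            __u8 oldtos;

            if ((iph->tos & IPTOS_TOS_MASK) == tos)
                    return XT_CONTINUE;     /* nothing to change */

            /* May reallocate the data; old header pointers go stale. */
            if (!skb_make_writable(pskb, sizeof(struct iphdr)))
                    return NF_DROP;
            iph = ip_hdr(*pskb);

            oldtos = iph->tos;
            iph->tos = (iph->tos & IPTOS_PREC_MASK) | tos;
            /* Incremental checksum update for one 16-bit word. */
            nf_csum_replace2(&iph->check, htons(oldtos), htons(iph->tos));
            return XT_CONTINUE;
    }
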
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c
index 64be31c22ba9..a991ec7bd4e7 100644
--- a/net/ipv4/netfilter/ipt_TTL.c
+++ b/net/ipv4/netfilter/ipt_TTL.c
@@ -32,7 +32,7 @@ ipt_ttl_target(struct sk_buff **pskb,
32 if (!skb_make_writable(pskb, (*pskb)->len)) 32 if (!skb_make_writable(pskb, (*pskb)->len))
33 return NF_DROP; 33 return NF_DROP;
34 34
35 iph = (*pskb)->nh.iph; 35 iph = ip_hdr(*pskb);
36 36
37 switch (info->mode) { 37 switch (info->mode) {
38 case IPT_TTL_SET: 38 case IPT_TTL_SET:
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 9acc018766f2..23b607b33b32 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -2,20 +2,6 @@
2 * netfilter module for userspace packet logging daemons 2 * netfilter module for userspace packet logging daemons
3 * 3 *
4 * (C) 2000-2004 by Harald Welte <laforge@netfilter.org> 4 * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
5 *
6 * 2000/09/22 ulog-cprange feature added
7 * 2001/01/04 in-kernel queue as proposed by Sebastian Zander
8 * <zander@fokus.gmd.de>
9 * 2001/01/30 per-rule nlgroup conflicts with global queue.
10 * nlgroup now global (sysctl)
11 * 2001/04/19 ulog-queue reworked, now fixed buffer size specified at
12 * module loadtime -HW
13 * 2002/07/07 remove broken nflog_rcv() function -HW
14 * 2002/08/29 fix shifted/unshifted nlgroup bug -HW
15 * 2002/10/30 fix uninitialized mac_len field - <Anders K. Pedersen>
16 * 2004/10/25 fix erroneous calculation of 'len' parameter to NLMSG_PUT
17 * resulting in bogus 'error during NLMSG_PUT' messages.
18 *
19 * (C) 1999-2001 Paul `Rusty' Russell 5 * (C) 1999-2001 Paul `Rusty' Russell
20 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> 6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
21 * 7 *
@@ -42,8 +28,6 @@
42 * flushtimeout: 28 * flushtimeout:
43 * Specify, after how many hundredths of a second the queue should be 29 * Specify, after how many hundredths of a second the queue should be
44 * flushed even if it is not full yet. 30 * flushed even if it is not full yet.
45 *
46 * ipt_ULOG.c,v 1.22 2002/10/30 09:07:31 laforge Exp
47 */ 31 */
48 32
49#include <linux/module.h> 33#include <linux/module.h>
@@ -187,6 +171,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
187 ulog_packet_msg_t *pm; 171 ulog_packet_msg_t *pm;
188 size_t size, copy_len; 172 size_t size, copy_len;
189 struct nlmsghdr *nlh; 173 struct nlmsghdr *nlh;
174 struct timeval tv;
190 175
191 /* ffs == find first bit set, necessary because userspace 176 /* ffs == find first bit set, necessary because userspace
192 * is already shifting groupnumber, but we need unshifted. 177 * is already shifting groupnumber, but we need unshifted.
@@ -232,13 +217,14 @@ static void ipt_ulog_packet(unsigned int hooknum,
232 pm = NLMSG_DATA(nlh); 217 pm = NLMSG_DATA(nlh);
233 218
234 /* We might not have a timestamp, get one */ 219 /* We might not have a timestamp, get one */
235 if (skb->tstamp.off_sec == 0) 220 if (skb->tstamp.tv64 == 0)
236 __net_timestamp((struct sk_buff *)skb); 221 __net_timestamp((struct sk_buff *)skb);
237 222
238 /* copy hook, prefix, timestamp, payload, etc. */ 223 /* copy hook, prefix, timestamp, payload, etc. */
239 pm->data_len = copy_len; 224 pm->data_len = copy_len;
240 put_unaligned(skb->tstamp.off_sec, &pm->timestamp_sec); 225 tv = ktime_to_timeval(skb->tstamp);
241 put_unaligned(skb->tstamp.off_usec, &pm->timestamp_usec); 226 put_unaligned(tv.tv_sec, &pm->timestamp_sec);
227 put_unaligned(tv.tv_usec, &pm->timestamp_usec);
242 put_unaligned(skb->mark, &pm->mark); 228 put_unaligned(skb->mark, &pm->mark);
243 pm->hook = hooknum; 229 pm->hook = hooknum;
244 if (prefix != NULL) 230 if (prefix != NULL)
@@ -249,9 +235,9 @@ static void ipt_ulog_packet(unsigned int hooknum,
249 *(pm->prefix) = '\0'; 235 *(pm->prefix) = '\0';
250 236
251 if (in && in->hard_header_len > 0 237 if (in && in->hard_header_len > 0
252 && skb->mac.raw != (void *) skb->nh.iph 238 && skb->mac_header != skb->network_header
253 && in->hard_header_len <= ULOG_MAC_LEN) { 239 && in->hard_header_len <= ULOG_MAC_LEN) {
254 memcpy(pm->mac, skb->mac.raw, in->hard_header_len); 240 memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len);
255 pm->mac_len = in->hard_header_len; 241 pm->mac_len = in->hard_header_len;
256 } else 242 } else
257 pm->mac_len = 0; 243 pm->mac_len = 0;
@@ -363,12 +349,52 @@ static int ipt_ulog_checkentry(const char *tablename,
363 return 1; 349 return 1;
364} 350}
365 351
352#ifdef CONFIG_COMPAT
353struct compat_ipt_ulog_info {
354 compat_uint_t nl_group;
355 compat_size_t copy_range;
356 compat_size_t qthreshold;
357 char prefix[ULOG_PREFIX_LEN];
358};
359
360static void compat_from_user(void *dst, void *src)
361{
362 struct compat_ipt_ulog_info *cl = src;
363 struct ipt_ulog_info l = {
364 .nl_group = cl->nl_group,
365 .copy_range = cl->copy_range,
366 .qthreshold = cl->qthreshold,
367 };
368
369 memcpy(l.prefix, cl->prefix, sizeof(l.prefix));
370 memcpy(dst, &l, sizeof(l));
371}
372
373static int compat_to_user(void __user *dst, void *src)
374{
375 struct ipt_ulog_info *l = src;
376 struct compat_ipt_ulog_info cl = {
377 .nl_group = l->nl_group,
378 .copy_range = l->copy_range,
379 .qthreshold = l->qthreshold,
380 };
381
382 memcpy(cl.prefix, l->prefix, sizeof(cl.prefix));
383 return copy_to_user(dst, &cl, sizeof(cl)) ? -EFAULT : 0;
384}
385#endif /* CONFIG_COMPAT */
386
366static struct xt_target ipt_ulog_reg = { 387static struct xt_target ipt_ulog_reg = {
367 .name = "ULOG", 388 .name = "ULOG",
368 .family = AF_INET, 389 .family = AF_INET,
369 .target = ipt_ulog_target, 390 .target = ipt_ulog_target,
370 .targetsize = sizeof(struct ipt_ulog_info), 391 .targetsize = sizeof(struct ipt_ulog_info),
371 .checkentry = ipt_ulog_checkentry, 392 .checkentry = ipt_ulog_checkentry,
393#ifdef CONFIG_COMPAT
394 .compatsize = sizeof(struct compat_ipt_ulog_info),
395 .compat_from_user = compat_from_user,
396 .compat_to_user = compat_to_user,
397#endif
372 .me = THIS_MODULE, 398 .me = THIS_MODULE,
373}; 399};
374 400
@@ -390,14 +416,11 @@ static int __init ipt_ulog_init(void)
390 } 416 }
391 417
392 /* initialize ulog_buffers */ 418 /* initialize ulog_buffers */
393 for (i = 0; i < ULOG_MAXNLGROUPS; i++) { 419 for (i = 0; i < ULOG_MAXNLGROUPS; i++)
394 init_timer(&ulog_buffers[i].timer); 420 setup_timer(&ulog_buffers[i].timer, ulog_timer, i);
395 ulog_buffers[i].timer.function = ulog_timer;
396 ulog_buffers[i].timer.data = i;
397 }
398 421
399 nflognl = netlink_kernel_create(NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, 422 nflognl = netlink_kernel_create(NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL,
400 THIS_MODULE); 423 NULL, THIS_MODULE);
401 if (!nflognl) 424 if (!nflognl)
402 return -ENOMEM; 425 return -ENOMEM;
403 426
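
Several API moves meet in the ULOG changes: skb->tstamp is now a ktime_t, so the netlink copy-out first converts it with ktime_to_timeval(); the extra NULL passed to netlink_kernel_create() appears to be the cb_mutex parameter added to that function in the same window; and the hand-rolled timer setup collapses into setup_timer(), which stands in for the three removed statements. A sketch of the timer half (array and callback names illustrative):

    #include <linux/timer.h>

    static void sketch_flush_timer(unsigned long data); /* flushes queue 'data' */

    static void sketch_init_timers(struct timer_list *timers, int n)
    {
            int i;

            /* One line per timer instead of init_timer() plus two
             * field assignments, as in the removed loop. */
            for (i = 0; i < n; i++)
                    setup_timer(&timers[i], sketch_flush_timer, i);
    }
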
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
index cfa0472617f6..a652a1451552 100644
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ b/net/ipv4/netfilter/ipt_addrtype.c
@@ -33,7 +33,7 @@ static int match(const struct sk_buff *skb,
33 int offset, unsigned int protoff, int *hotdrop) 33 int offset, unsigned int protoff, int *hotdrop)
34{ 34{
35 const struct ipt_addrtype_info *info = matchinfo; 35 const struct ipt_addrtype_info *info = matchinfo;
36 const struct iphdr *iph = skb->nh.iph; 36 const struct iphdr *iph = ip_hdr(skb);
37 int ret = 1; 37 int ret = 1;
38 38
39 if (info->source) 39 if (info->source)
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
index 37508b2cfea6..26218122f865 100644
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -1,7 +1,5 @@
1/* IP tables module for matching the value of the IPv4 and TCP ECN bits 1/* IP tables module for matching the value of the IPv4 and TCP ECN bits
2 * 2 *
3 * ipt_ecn.c,v 1.3 2002/05/29 15:09:00 laforge Exp
4 *
5 * (C) 2002 by Harald Welte <laforge@gnumonks.org> 3 * (C) 2002 by Harald Welte <laforge@gnumonks.org>
6 * 4 *
7 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
@@ -11,6 +9,7 @@
11 9
12#include <linux/in.h> 10#include <linux/in.h>
13#include <linux/ip.h> 11#include <linux/ip.h>
12#include <net/ip.h>
14#include <linux/module.h> 13#include <linux/module.h>
15#include <linux/skbuff.h> 14#include <linux/skbuff.h>
16#include <linux/tcp.h> 15#include <linux/tcp.h>
@@ -26,7 +25,7 @@ MODULE_LICENSE("GPL");
26static inline int match_ip(const struct sk_buff *skb, 25static inline int match_ip(const struct sk_buff *skb,
27 const struct ipt_ecn_info *einfo) 26 const struct ipt_ecn_info *einfo)
28{ 27{
29 return ((skb->nh.iph->tos&IPT_ECN_IP_MASK) == einfo->ip_ect); 28 return (ip_hdr(skb)->tos & IPT_ECN_IP_MASK) == einfo->ip_ect;
30} 29}
31 30
32static inline int match_tcp(const struct sk_buff *skb, 31static inline int match_tcp(const struct sk_buff *skb,
@@ -38,8 +37,7 @@ static inline int match_tcp(const struct sk_buff *skb,
38 /* In practice, TCP match does this, so can't fail. But let's 37 /* In practice, TCP match does this, so can't fail. But let's
39 * be good citizens. 38 * be good citizens.
40 */ 39 */
41 th = skb_header_pointer(skb, skb->nh.iph->ihl * 4, 40 th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
42 sizeof(_tcph), &_tcph);
43 if (th == NULL) { 41 if (th == NULL) {
44 *hotdrop = 0; 42 *hotdrop = 0;
45 return 0; 43 return 0;
@@ -80,7 +78,7 @@ static int match(const struct sk_buff *skb,
80 return 0; 78 return 0;
81 79
82 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) { 80 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) {
83 if (skb->nh.iph->protocol != IPPROTO_TCP) 81 if (ip_hdr(skb)->protocol != IPPROTO_TCP)
84 return 0; 82 return 0;
85 if (!match_tcp(skb, info, hotdrop)) 83 if (!match_tcp(skb, info, hotdrop))
86 return 0; 84 return 0;
diff --git a/net/ipv4/netfilter/ipt_iprange.c b/net/ipv4/netfilter/ipt_iprange.c
index bc5d5e6091e4..33af9e940887 100644
--- a/net/ipv4/netfilter/ipt_iprange.c
+++ b/net/ipv4/netfilter/ipt_iprange.c
@@ -32,7 +32,7 @@ match(const struct sk_buff *skb,
32 int offset, unsigned int protoff, int *hotdrop) 32 int offset, unsigned int protoff, int *hotdrop)
33{ 33{
34 const struct ipt_iprange_info *info = matchinfo; 34 const struct ipt_iprange_info *info = matchinfo;
35 const struct iphdr *iph = skb->nh.iph; 35 const struct iphdr *iph = ip_hdr(skb);
36 36
37 if (info->flags & IPRANGE_SRC) { 37 if (info->flags & IPRANGE_SRC) {
38 if (((ntohl(iph->saddr) < ntohl(info->src.min_ip)) 38 if (((ntohl(iph->saddr) < ntohl(info->src.min_ip))
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
index aecb9c48e152..15a9e8bbb7cc 100644
--- a/net/ipv4/netfilter/ipt_recent.c
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -183,11 +183,11 @@ ipt_recent_match(const struct sk_buff *skb,
183 int ret = info->invert; 183 int ret = info->invert;
184 184
185 if (info->side == IPT_RECENT_DEST) 185 if (info->side == IPT_RECENT_DEST)
186 addr = skb->nh.iph->daddr; 186 addr = ip_hdr(skb)->daddr;
187 else 187 else
188 addr = skb->nh.iph->saddr; 188 addr = ip_hdr(skb)->saddr;
189 189
190 ttl = skb->nh.iph->ttl; 190 ttl = ip_hdr(skb)->ttl;
191 /* use TTL as seen before forwarding */ 191 /* use TTL as seen before forwarding */
192 if (out && !skb->sk) 192 if (out && !skb->sk)
193 ttl++; 193 ttl++;
diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c
index 5d33b51d49d8..d314844af12b 100644
--- a/net/ipv4/netfilter/ipt_tos.c
+++ b/net/ipv4/netfilter/ipt_tos.c
@@ -30,7 +30,7 @@ match(const struct sk_buff *skb,
30{ 30{
31 const struct ipt_tos_info *info = matchinfo; 31 const struct ipt_tos_info *info = matchinfo;
32 32
33 return (skb->nh.iph->tos == info->tos) ^ info->invert; 33 return (ip_hdr(skb)->tos == info->tos) ^ info->invert;
34} 34}
35 35
36static struct xt_match tos_match = { 36static struct xt_match tos_match = {
diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c
index 1eca9f400374..ab02d9e3139c 100644
--- a/net/ipv4/netfilter/ipt_ttl.c
+++ b/net/ipv4/netfilter/ipt_ttl.c
@@ -1,7 +1,5 @@
1/* IP tables module for matching the value of the TTL 1/* IP tables module for matching the value of the TTL
2 * 2 *
3 * ipt_ttl.c,v 1.5 2000/11/13 11:16:08 laforge Exp
4 *
5 * (C) 2000,2001 by Harald Welte <laforge@netfilter.org> 3 * (C) 2000,2001 by Harald Welte <laforge@netfilter.org>
6 * 4 *
7 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
@@ -26,19 +24,20 @@ static int match(const struct sk_buff *skb,
26 int offset, unsigned int protoff, int *hotdrop) 24 int offset, unsigned int protoff, int *hotdrop)
27{ 25{
28 const struct ipt_ttl_info *info = matchinfo; 26 const struct ipt_ttl_info *info = matchinfo;
27 const u8 ttl = ip_hdr(skb)->ttl;
29 28
30 switch (info->mode) { 29 switch (info->mode) {
31 case IPT_TTL_EQ: 30 case IPT_TTL_EQ:
32 return (skb->nh.iph->ttl == info->ttl); 31 return (ttl == info->ttl);
33 break; 32 break;
34 case IPT_TTL_NE: 33 case IPT_TTL_NE:
35 return (!(skb->nh.iph->ttl == info->ttl)); 34 return (!(ttl == info->ttl));
36 break; 35 break;
37 case IPT_TTL_LT: 36 case IPT_TTL_LT:
38 return (skb->nh.iph->ttl < info->ttl); 37 return (ttl < info->ttl);
39 break; 38 break;
40 case IPT_TTL_GT: 39 case IPT_TTL_GT:
41 return (skb->nh.iph->ttl > info->ttl); 40 return (ttl > info->ttl);
42 break; 41 break;
43 default: 42 default:
44 printk(KERN_WARNING "ipt_ttl: unknown mode %d\n", 43 printk(KERN_WARNING "ipt_ttl: unknown mode %d\n",
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index d1d61e97b976..42728909eba0 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/moduleparam.h> 14#include <linux/moduleparam.h>
15#include <linux/netfilter_ipv4/ip_tables.h> 15#include <linux/netfilter_ipv4/ip_tables.h>
16#include <net/ip.h>
16 17
17MODULE_LICENSE("GPL"); 18MODULE_LICENSE("GPL");
18MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); 19MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
@@ -102,7 +103,7 @@ ipt_local_out_hook(unsigned int hook,
102{ 103{
103 /* root is playing with raw sockets. */ 104 /* root is playing with raw sockets. */
104 if ((*pskb)->len < sizeof(struct iphdr) 105 if ((*pskb)->len < sizeof(struct iphdr)
105 || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { 106 || ip_hdrlen(*pskb) < sizeof(struct iphdr)) {
106 if (net_ratelimit()) 107 if (net_ratelimit())
107 printk("ipt_hook: happy cracking.\n"); 108 printk("ipt_hook: happy cracking.\n");
108 return NF_ACCEPT; 109 return NF_ACCEPT;
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 98b66ef0c714..9278802f2742 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -7,8 +7,6 @@
7 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as 8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation. 9 * published by the Free Software Foundation.
10 *
11 * Extended to all five netfilter hooks by Brad Chapman & Harald Welte
12 */ 10 */
13#include <linux/module.h> 11#include <linux/module.h>
14#include <linux/netfilter_ipv4/ip_tables.h> 12#include <linux/netfilter_ipv4/ip_tables.h>
@@ -17,6 +15,7 @@
17#include <net/sock.h> 15#include <net/sock.h>
18#include <net/route.h> 16#include <net/route.h>
19#include <linux/ip.h> 17#include <linux/ip.h>
18#include <net/ip.h>
20 19
21MODULE_LICENSE("GPL"); 20MODULE_LICENSE("GPL");
22MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); 21MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
@@ -130,13 +129,14 @@ ipt_local_hook(unsigned int hook,
130 int (*okfn)(struct sk_buff *)) 129 int (*okfn)(struct sk_buff *))
131{ 130{
132 unsigned int ret; 131 unsigned int ret;
132 const struct iphdr *iph;
133 u_int8_t tos; 133 u_int8_t tos;
134 __be32 saddr, daddr; 134 __be32 saddr, daddr;
135 u_int32_t mark; 135 u_int32_t mark;
136 136
137 /* root is playing with raw sockets. */ 137 /* root is playing with raw sockets. */
138 if ((*pskb)->len < sizeof(struct iphdr) 138 if ((*pskb)->len < sizeof(struct iphdr)
139 || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { 139 || ip_hdrlen(*pskb) < sizeof(struct iphdr)) {
140 if (net_ratelimit()) 140 if (net_ratelimit())
141 printk("ipt_hook: happy cracking.\n"); 141 printk("ipt_hook: happy cracking.\n");
142 return NF_ACCEPT; 142 return NF_ACCEPT;
@@ -144,19 +144,23 @@ ipt_local_hook(unsigned int hook,
144 144
145 /* Save things which could affect route */ 145 /* Save things which could affect route */
146 mark = (*pskb)->mark; 146 mark = (*pskb)->mark;
147 saddr = (*pskb)->nh.iph->saddr; 147 iph = ip_hdr(*pskb);
148 daddr = (*pskb)->nh.iph->daddr; 148 saddr = iph->saddr;
149 tos = (*pskb)->nh.iph->tos; 149 daddr = iph->daddr;
150 tos = iph->tos;
150 151
151 ret = ipt_do_table(pskb, hook, in, out, &packet_mangler); 152 ret = ipt_do_table(pskb, hook, in, out, &packet_mangler);
152 /* Reroute for ANY change. */ 153 /* Reroute for ANY change. */
153 if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE 154 if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) {
154 && ((*pskb)->nh.iph->saddr != saddr 155 iph = ip_hdr(*pskb);
155 || (*pskb)->nh.iph->daddr != daddr 156
156 || (*pskb)->mark != mark 157 if (iph->saddr != saddr ||
157 || (*pskb)->nh.iph->tos != tos)) 158 iph->daddr != daddr ||
158 if (ip_route_me_harder(pskb, RTN_UNSPEC)) 159 (*pskb)->mark != mark ||
159 ret = NF_DROP; 160 iph->tos != tos)
161 if (ip_route_me_harder(pskb, RTN_UNSPEC))
162 ret = NF_DROP;
163 }
160 164
161 return ret; 165 return ret;
162} 166}
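
The restructured ipt_local_hook() keeps its original logic: snapshot every field that can influence routing, run the table, re-read the header (ipt_do_table() may substitute the skb), and force a fresh route lookup if anything changed. Schematically, under the same assumptions (function name illustrative):

    #include <linux/ip.h>
    #include <linux/netfilter_ipv4.h>

    static unsigned int sketch_reroute_check(struct sk_buff **pskb,
                                             unsigned int verdict,
                                             __be32 saddr, __be32 daddr,
                                             u_int8_t tos, u_int32_t mark)
    {
            const struct iphdr *iph;

            if (verdict == NF_DROP || verdict == NF_STOLEN ||
                verdict == NF_QUEUE)
                    return verdict;

            /* The table run may have replaced *pskb: reload the header. */
            iph = ip_hdr(*pskb);
            if (iph->saddr != saddr || iph->daddr != daddr ||
                (*pskb)->mark != mark || iph->tos != tos)
                    if (ip_route_me_harder(pskb, RTN_UNSPEC))
                            verdict = NF_DROP;

            return verdict;
    }
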
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 8f3e92d20df8..0654eaae70c9 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -4,14 +4,6 @@
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation. 6 * published by the Free Software Foundation.
7 *
8 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
9 * - move L3 protocol dependent part to this file.
10 * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
11 * - add get_features() to support various size of conntrack
12 * structures.
13 *
14 * Derived from net/ipv4/netfilter/ip_conntrack_standalone.c
15 */ 7 */
16 8
17#include <linux/types.h> 9#include <linux/types.h>
@@ -87,7 +79,7 @@ nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
87 local_bh_enable(); 79 local_bh_enable();
88 80
89 if (skb) 81 if (skb)
90 ip_send_check(skb->nh.iph); 82 ip_send_check(ip_hdr(skb));
91 83
92 return skb; 84 return skb;
93} 85}
@@ -97,16 +89,16 @@ ipv4_prepare(struct sk_buff **pskb, unsigned int hooknum, unsigned int *dataoff,
97 u_int8_t *protonum) 89 u_int8_t *protonum)
98{ 90{
99 /* Never happen */ 91 /* Never happen */
100 if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) { 92 if (ip_hdr(*pskb)->frag_off & htons(IP_OFFSET)) {
101 if (net_ratelimit()) { 93 if (net_ratelimit()) {
102 printk(KERN_ERR "ipv4_prepare: Frag of proto %u (hook=%u)\n", 94 printk(KERN_ERR "ipv4_prepare: Frag of proto %u (hook=%u)\n",
103 (*pskb)->nh.iph->protocol, hooknum); 95 ip_hdr(*pskb)->protocol, hooknum);
104 } 96 }
105 return -NF_DROP; 97 return -NF_DROP;
106 } 98 }
107 99
108 *dataoff = (*pskb)->nh.raw - (*pskb)->data + (*pskb)->nh.iph->ihl*4; 100 *dataoff = skb_network_offset(*pskb) + ip_hdrlen(*pskb);
109 *protonum = (*pskb)->nh.iph->protocol; 101 *protonum = ip_hdr(*pskb)->protocol;
110 102
111 return NF_ACCEPT; 103 return NF_ACCEPT;
112} 104}
@@ -152,9 +144,8 @@ static unsigned int ipv4_conntrack_help(unsigned int hooknum,
152 return NF_ACCEPT; 144 return NF_ACCEPT;
153 145
154 return help->helper->help(pskb, 146 return help->helper->help(pskb,
155 (*pskb)->nh.raw - (*pskb)->data 147 skb_network_offset(*pskb) + ip_hdrlen(*pskb),
156 + (*pskb)->nh.iph->ihl*4, 148 ct, ctinfo);
157 ct, ctinfo);
158} 149}
159 150
160static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, 151static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
@@ -171,7 +162,7 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
171#endif 162#endif
172 163
173 /* Gather fragments. */ 164 /* Gather fragments. */
174 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { 165 if (ip_hdr(*pskb)->frag_off & htons(IP_MF | IP_OFFSET)) {
175 *pskb = nf_ct_ipv4_gather_frags(*pskb, 166 *pskb = nf_ct_ipv4_gather_frags(*pskb,
176 hooknum == NF_IP_PRE_ROUTING ? 167 hooknum == NF_IP_PRE_ROUTING ?
177 IP_DEFRAG_CONNTRACK_IN : 168 IP_DEFRAG_CONNTRACK_IN :
@@ -199,7 +190,7 @@ static unsigned int ipv4_conntrack_local(unsigned int hooknum,
199{ 190{
200 /* root is playing with raw sockets. */ 191 /* root is playing with raw sockets. */
201 if ((*pskb)->len < sizeof(struct iphdr) 192 if ((*pskb)->len < sizeof(struct iphdr)
202 || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { 193 || ip_hdrlen(*pskb) < sizeof(struct iphdr)) {
203 if (net_ratelimit()) 194 if (net_ratelimit())
204 printk("ipt_hook: happy cracking.\n"); 195 printk("ipt_hook: happy cracking.\n");
205 return NF_ACCEPT; 196 return NF_ACCEPT;
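
The dataoff rewrite in ipv4_prepare() is the same arithmetic under new names: skb_network_offset() is what nh.raw - data used to spell out, i.e. the distance from skb->data to the IP header, so adding ip_hdrlen() lands on the transport header. As a sketch:

    #include <linux/ip.h>
    #include <linux/skbuff.h>

    /* Offset of the L4 header from skb->data:
     *   old: skb->nh.raw - skb->data + skb->nh.iph->ihl * 4
     *   new: skb_network_offset(skb) + ip_hdrlen(skb)        */
    static inline unsigned int sketch_l4_offset(const struct sk_buff *skb)
    {
            return skb_network_offset(skb) + ip_hdrlen(skb);
    }
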
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 5fd1e5363c1a..f4fc657c1983 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -4,11 +4,6 @@
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation. 6 * published by the Free Software Foundation.
7 *
8 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
9 * - enable working with Layer 3 protocol independent connection tracking.
10 *
11 * Derived from net/ipv4/netfilter/ip_conntrack_proto_icmp.c
12 */ 7 */
13 8
14#include <linux/types.h> 9#include <linux/types.h>
@@ -158,7 +153,7 @@ icmp_error_message(struct sk_buff *skb,
158 NF_CT_ASSERT(skb->nfct == NULL); 153 NF_CT_ASSERT(skb->nfct == NULL);
159 154
160 /* Not enough header? */ 155 /* Not enough header? */
161 inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in); 156 inside = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_in), &_in);
162 if (inside == NULL) 157 if (inside == NULL)
163 return -NF_ACCEPT; 158 return -NF_ACCEPT;
164 159
@@ -172,7 +167,7 @@ icmp_error_message(struct sk_buff *skb,
172 /* rcu_read_lock()ed by nf_hook_slow */ 167 /* rcu_read_lock()ed by nf_hook_slow */
173 innerproto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol); 168 innerproto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol);
174 169
175 dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp); 170 dataoff = ip_hdrlen(skb) + sizeof(inside->icmp);
176 /* Are they talking about one of our connections? */ 171 /* Are they talking about one of our connections? */
177 if (!nf_ct_get_tuple(skb, dataoff, dataoff + inside->ip.ihl*4, PF_INET, 172 if (!nf_ct_get_tuple(skb, dataoff, dataoff + inside->ip.ihl*4, PF_INET,
178 inside->ip.protocol, &origtuple, 173 inside->ip.protocol, &origtuple,
@@ -227,7 +222,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff,
227 struct icmphdr _ih, *icmph; 222 struct icmphdr _ih, *icmph;
228 223
229 /* Not enough header? */ 224 /* Not enough header? */
230 icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); 225 icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih);
231 if (icmph == NULL) { 226 if (icmph == NULL) {
232 if (LOG_INVALID(IPPROTO_ICMP)) 227 if (LOG_INVALID(IPPROTO_ICMP))
233 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, 228 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 452e9d326684..ea02f00d2dac 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -431,7 +431,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
431 } *inside; 431 } *inside;
432 struct nf_conntrack_l4proto *l4proto; 432 struct nf_conntrack_l4proto *l4proto;
433 struct nf_conntrack_tuple inner, target; 433 struct nf_conntrack_tuple inner, target;
434 int hdrlen = (*pskb)->nh.iph->ihl * 4; 434 int hdrlen = ip_hdrlen(*pskb);
435 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 435 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
436 unsigned long statusbit; 436 unsigned long statusbit;
437 enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); 437 enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
@@ -439,7 +439,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
439 if (!skb_make_writable(pskb, hdrlen + sizeof(*inside))) 439 if (!skb_make_writable(pskb, hdrlen + sizeof(*inside)))
440 return 0; 440 return 0;
441 441
442 inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; 442 inside = (void *)(*pskb)->data + ip_hdrlen(*pskb);
443 443
444 /* We're actually going to mangle it beyond trivial checksum 444 /* We're actually going to mangle it beyond trivial checksum
445 adjustment, so make sure the current checksum is correct. */ 445 adjustment, so make sure the current checksum is correct. */
@@ -469,9 +469,9 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
469 l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol); 469 l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol);
470 470
471 if (!nf_ct_get_tuple(*pskb, 471 if (!nf_ct_get_tuple(*pskb,
472 (*pskb)->nh.iph->ihl*4 + sizeof(struct icmphdr), 472 ip_hdrlen(*pskb) + sizeof(struct icmphdr),
473 (*pskb)->nh.iph->ihl*4 + 473 (ip_hdrlen(*pskb) +
474 sizeof(struct icmphdr) + inside->ip.ihl*4, 474 sizeof(struct icmphdr) + inside->ip.ihl * 4),
475 (u_int16_t)AF_INET, 475 (u_int16_t)AF_INET,
476 inside->ip.protocol, 476 inside->ip.protocol,
477 &inner, l3proto, l4proto)) 477 &inner, l3proto, l4proto))
@@ -483,14 +483,14 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
483 packet: PREROUTING (DST manip), routing produces ICMP, goes 483 packet: PREROUTING (DST manip), routing produces ICMP, goes
484 through POSTROUTING (which must correct the DST manip). */ 484 through POSTROUTING (which must correct the DST manip). */
485 if (!manip_pkt(inside->ip.protocol, pskb, 485 if (!manip_pkt(inside->ip.protocol, pskb,
486 (*pskb)->nh.iph->ihl*4 + sizeof(inside->icmp), 486 ip_hdrlen(*pskb) + sizeof(inside->icmp),
487 &ct->tuplehash[!dir].tuple, 487 &ct->tuplehash[!dir].tuple,
488 !manip)) 488 !manip))
489 return 0; 489 return 0;
490 490
491 if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) { 491 if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) {
492 /* Reloading "inside" here since manip_pkt inner. */ 492 /* Reloading "inside" here since manip_pkt inner. */
493 inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; 493 inside = (void *)(*pskb)->data + ip_hdrlen(*pskb);
494 inside->icmp.checksum = 0; 494 inside->icmp.checksum = 0;
495 inside->icmp.checksum = 495 inside->icmp.checksum =
496 csum_fold(skb_checksum(*pskb, hdrlen, 496 csum_fold(skb_checksum(*pskb, hdrlen,
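
Every offset in nf_nat_icmp_reply_translation() walks the packet quoted inside an ICMP error: outer IP header, the 8-byte ICMP header, then the embedded IP header whose own ihl locates the embedded transport header. In accessor form (helper name illustrative):

    #include <linux/icmp.h>
    #include <linux/ip.h>

    static void sketch_icmp_inner_offsets(const struct sk_buff *skb,
                                          const struct iphdr *inner_iph,
                                          unsigned int *inner_ip_off,
                                          unsigned int *inner_l4_off)
    {
            /* Outer IP header plus the fixed-size ICMP header... */
            *inner_ip_off = ip_hdrlen(skb) + sizeof(struct icmphdr);
            /* ...then the quoted IP header (ihl in 32-bit words). */
            *inner_l4_off = *inner_ip_off + inner_iph->ihl * 4;
    }
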
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 9cbf3f9be13b..fcebc968d37f 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -33,7 +33,7 @@ static int set_addr(struct sk_buff **pskb,
33 unsigned int addroff, __be32 ip, __be16 port) 33 unsigned int addroff, __be32 ip, __be16 port)
34{ 34{
35 enum ip_conntrack_info ctinfo; 35 enum ip_conntrack_info ctinfo;
36 struct nf_conn *ct = ip_conntrack_get(*pskb, &ctinfo); 36 struct nf_conn *ct = nf_ct_get(*pskb, &ctinfo);
37 struct { 37 struct {
38 __be32 ip; 38 __be32 ip;
39 __be16 port; 39 __be16 port;
@@ -44,7 +44,7 @@ static int set_addr(struct sk_buff **pskb,
44 buf.port = port; 44 buf.port = port;
45 addroff += dataoff; 45 addroff += dataoff;
46 46
47 if ((*pskb)->nh.iph->protocol == IPPROTO_TCP) { 47 if (ip_hdr(*pskb)->protocol == IPPROTO_TCP) {
48 if (!nf_nat_mangle_tcp_packet(pskb, ct, ctinfo, 48 if (!nf_nat_mangle_tcp_packet(pskb, ct, ctinfo,
49 addroff, sizeof(buf), 49 addroff, sizeof(buf),
50 (char *) &buf, sizeof(buf))) { 50 (char *) &buf, sizeof(buf))) {
@@ -55,11 +55,11 @@ static int set_addr(struct sk_buff **pskb,
55 } 55 }
56 56
57 /* Relocate data pointer */ 57 /* Relocate data pointer */
58 th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl * 4, 58 th = skb_header_pointer(*pskb, ip_hdrlen(*pskb),
59 sizeof(_tcph), &_tcph); 59 sizeof(_tcph), &_tcph);
60 if (th == NULL) 60 if (th == NULL)
61 return -1; 61 return -1;
62 *data = (*pskb)->data + (*pskb)->nh.iph->ihl * 4 + 62 *data = (*pskb)->data + ip_hdrlen(*pskb) +
63 th->doff * 4 + dataoff; 63 th->doff * 4 + dataoff;
64 } else { 64 } else {
65 if (!nf_nat_mangle_udp_packet(pskb, ct, ctinfo, 65 if (!nf_nat_mangle_udp_packet(pskb, ct, ctinfo,
@@ -73,8 +73,8 @@ static int set_addr(struct sk_buff **pskb,
73 /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy 73 /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy
74 * or pull everything in a linear buffer, so we can safely 74 * or pull everything in a linear buffer, so we can safely
75 * use the skb pointers now */ 75 * use the skb pointers now */
76 *data = (*pskb)->data + (*pskb)->nh.iph->ihl * 4 + 76 *data = ((*pskb)->data + ip_hdrlen(*pskb) +
77 sizeof(struct udphdr); 77 sizeof(struct udphdr));
78 } 78 }
79 79
80 return 0; 80 return 0;
@@ -383,7 +383,7 @@ static int nat_h245(struct sk_buff **pskb, struct nf_conn *ct,
383static void ip_nat_q931_expect(struct nf_conn *new, 383static void ip_nat_q931_expect(struct nf_conn *new,
384 struct nf_conntrack_expect *this) 384 struct nf_conntrack_expect *this)
385{ 385{
386 struct ip_nat_range range; 386 struct nf_nat_range range;
387 387
388 if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */ 388 if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */
389 nf_nat_follow_master(new, this); 389 nf_nat_follow_master(new, this);
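
set_addr() recomputes its payload pointer after each mangle call because nf_nat_mangle_tcp_packet()/nf_nat_mangle_udp_packet() may enlarge or relinearise the skb, invalidating anything cached earlier. With the accessors, the relocation amounts to (sketch; th is the TCP header reloaded after mangling):

    #include <linux/ip.h>
    #include <linux/tcp.h>
    #include <linux/udp.h>

    /* TCP payload: IP header + TCP header (doff in words) + dataoff. */
    static unsigned char *sketch_tcp_payload(struct sk_buff *skb,
                                             const struct tcphdr *th,
                                             unsigned int dataoff)
    {
            return skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff;
    }

    /* UDP payload: the UDP header has a fixed size. */
    static unsigned char *sketch_udp_payload(struct sk_buff *skb)
    {
            return skb->data + ip_hdrlen(skb) + sizeof(struct udphdr);
    }
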
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index 49a90c39ffce..15b6e5ce3a04 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -87,12 +87,13 @@ static void mangle_contents(struct sk_buff *skb,
87 unsigned char *data; 87 unsigned char *data;
88 88
89 BUG_ON(skb_is_nonlinear(skb)); 89 BUG_ON(skb_is_nonlinear(skb));
90 data = (unsigned char *)skb->nh.iph + dataoff; 90 data = skb_network_header(skb) + dataoff;
91 91
92 /* move post-replacement */ 92 /* move post-replacement */
93 memmove(data + match_offset + rep_len, 93 memmove(data + match_offset + rep_len,
94 data + match_offset + match_len, 94 data + match_offset + match_len,
95 skb->tail - (data + match_offset + match_len)); 95 skb->tail - (skb->network_header + dataoff +
96 match_offset + match_len));
96 97
97 /* insert data from buffer */ 98 /* insert data from buffer */
98 memcpy(data + match_offset, rep_buffer, rep_len); 99 memcpy(data + match_offset, rep_buffer, rep_len);
@@ -111,8 +112,8 @@ static void mangle_contents(struct sk_buff *skb,
111 } 112 }
112 113
113 /* fix IP hdr checksum information */ 114 /* fix IP hdr checksum information */
114 skb->nh.iph->tot_len = htons(skb->len); 115 ip_hdr(skb)->tot_len = htons(skb->len);
115 ip_send_check(skb->nh.iph); 116 ip_send_check(ip_hdr(skb));
116} 117}
117 118
118/* Unusual, but possible case. */ 119/* Unusual, but possible case. */
@@ -152,6 +153,7 @@ nf_nat_mangle_tcp_packet(struct sk_buff **pskb,
152 const char *rep_buffer, 153 const char *rep_buffer,
153 unsigned int rep_len) 154 unsigned int rep_len)
154{ 155{
156 struct rtable *rt = (struct rtable *)(*pskb)->dst;
155 struct iphdr *iph; 157 struct iphdr *iph;
156 struct tcphdr *tcph; 158 struct tcphdr *tcph;
157 int oldlen, datalen; 159 int oldlen, datalen;
@@ -166,7 +168,7 @@ nf_nat_mangle_tcp_packet(struct sk_buff **pskb,
166 168
167 SKB_LINEAR_ASSERT(*pskb); 169 SKB_LINEAR_ASSERT(*pskb);
168 170
169 iph = (*pskb)->nh.iph; 171 iph = ip_hdr(*pskb);
170 tcph = (void *)iph + iph->ihl*4; 172 tcph = (void *)iph + iph->ihl*4;
171 173
172 oldlen = (*pskb)->len - iph->ihl*4; 174 oldlen = (*pskb)->len - iph->ihl*4;
@@ -175,11 +177,22 @@ nf_nat_mangle_tcp_packet(struct sk_buff **pskb,
175 177
176 datalen = (*pskb)->len - iph->ihl*4; 178 datalen = (*pskb)->len - iph->ihl*4;
177 if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) { 179 if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) {
178 tcph->check = 0; 180 if (!(rt->rt_flags & RTCF_LOCAL) &&
179 tcph->check = tcp_v4_check(datalen, 181 (*pskb)->dev->features & NETIF_F_ALL_CSUM) {
180 iph->saddr, iph->daddr, 182 (*pskb)->ip_summed = CHECKSUM_PARTIAL;
181 csum_partial((char *)tcph, 183 (*pskb)->csum_start = skb_headroom(*pskb) +
182 datalen, 0)); 184 skb_network_offset(*pskb) +
185 iph->ihl * 4;
186 (*pskb)->csum_offset = offsetof(struct tcphdr, check);
187 tcph->check = ~tcp_v4_check(datalen,
188 iph->saddr, iph->daddr, 0);
189 } else {
190 tcph->check = 0;
191 tcph->check = tcp_v4_check(datalen,
192 iph->saddr, iph->daddr,
193 csum_partial((char *)tcph,
194 datalen, 0));
195 }
183 } else 196 } else
184 nf_proto_csum_replace2(&tcph->check, *pskb, 197 nf_proto_csum_replace2(&tcph->check, *pskb,
185 htons(oldlen), htons(datalen), 1); 198 htons(oldlen), htons(datalen), 1);
@@ -190,7 +203,7 @@ nf_nat_mangle_tcp_packet(struct sk_buff **pskb,
190 (int)rep_len - (int)match_len, 203 (int)rep_len - (int)match_len,
191 ct, ctinfo); 204 ct, ctinfo);
192 /* Tell TCP window tracking about seq change */ 205 /* Tell TCP window tracking about seq change */
193 nf_conntrack_tcp_update(*pskb, (*pskb)->nh.iph->ihl*4, 206 nf_conntrack_tcp_update(*pskb, ip_hdrlen(*pskb),
194 ct, CTINFO2DIR(ctinfo)); 207 ct, CTINFO2DIR(ctinfo));
195 } 208 }
196 return 1; 209 return 1;
@@ -216,12 +229,13 @@ nf_nat_mangle_udp_packet(struct sk_buff **pskb,
216 const char *rep_buffer, 229 const char *rep_buffer,
217 unsigned int rep_len) 230 unsigned int rep_len)
218{ 231{
232 struct rtable *rt = (struct rtable *)(*pskb)->dst;
219 struct iphdr *iph; 233 struct iphdr *iph;
220 struct udphdr *udph; 234 struct udphdr *udph;
221 int datalen, oldlen; 235 int datalen, oldlen;
222 236
223 /* UDP helpers might accidentally mangle the wrong packet */ 237 /* UDP helpers might accidentally mangle the wrong packet */
224 iph = (*pskb)->nh.iph; 238 iph = ip_hdr(*pskb);
225 if ((*pskb)->len < iph->ihl*4 + sizeof(*udph) + 239 if ((*pskb)->len < iph->ihl*4 + sizeof(*udph) +
226 match_offset + match_len) 240 match_offset + match_len)
227 return 0; 241 return 0;
@@ -234,7 +248,7 @@ nf_nat_mangle_udp_packet(struct sk_buff **pskb,
234 !enlarge_skb(pskb, rep_len - match_len)) 248 !enlarge_skb(pskb, rep_len - match_len))
235 return 0; 249 return 0;
236 250
237 iph = (*pskb)->nh.iph; 251 iph = ip_hdr(*pskb);
238 udph = (void *)iph + iph->ihl*4; 252 udph = (void *)iph + iph->ihl*4;
239 253
240 oldlen = (*pskb)->len - iph->ihl*4; 254 oldlen = (*pskb)->len - iph->ihl*4;
@@ -250,13 +264,25 @@ nf_nat_mangle_udp_packet(struct sk_buff **pskb,
250 return 1; 264 return 1;
251 265
252 if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) { 266 if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) {
253 udph->check = 0; 267 if (!(rt->rt_flags & RTCF_LOCAL) &&
254 udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, 268 (*pskb)->dev->features & NETIF_F_ALL_CSUM) {
255 datalen, IPPROTO_UDP, 269 (*pskb)->ip_summed = CHECKSUM_PARTIAL;
256 csum_partial((char *)udph, 270 (*pskb)->csum_start = skb_headroom(*pskb) +
257 datalen, 0)); 271 skb_network_offset(*pskb) +
258 if (!udph->check) 272 iph->ihl * 4;
259 udph->check = CSUM_MANGLED_0; 273 (*pskb)->csum_offset = offsetof(struct udphdr, check);
274 udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
275 datalen, IPPROTO_UDP,
276 0);
277 } else {
278 udph->check = 0;
279 udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
280 datalen, IPPROTO_UDP,
281 csum_partial((char *)udph,
282 datalen, 0));
283 if (!udph->check)
284 udph->check = CSUM_MANGLED_0;
285 }
260 } else 286 } else
261 nf_proto_csum_replace2(&udph->check, *pskb, 287 nf_proto_csum_replace2(&udph->check, *pskb,
262 htons(oldlen), htons(datalen), 1); 288 htons(oldlen), htons(datalen), 1);
@@ -318,8 +344,8 @@ nf_nat_sack_adjust(struct sk_buff **pskb,
318 unsigned int dir, optoff, optend; 344 unsigned int dir, optoff, optend;
319 struct nf_conn_nat *nat = nfct_nat(ct); 345 struct nf_conn_nat *nat = nfct_nat(ct);
320 346
321 optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr); 347 optoff = ip_hdrlen(*pskb) + sizeof(struct tcphdr);
322 optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4; 348 optend = ip_hdrlen(*pskb) + tcph->doff * 4;
323 349
324 if (!skb_make_writable(pskb, optend)) 350 if (!skb_make_writable(pskb, optend))
325 return 0; 351 return 0;
@@ -371,10 +397,10 @@ nf_nat_seq_adjust(struct sk_buff **pskb,
371 this_way = &nat->info.seq[dir]; 397 this_way = &nat->info.seq[dir];
372 other_way = &nat->info.seq[!dir]; 398 other_way = &nat->info.seq[!dir];
373 399
374 if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) 400 if (!skb_make_writable(pskb, ip_hdrlen(*pskb) + sizeof(*tcph)))
375 return 0; 401 return 0;
376 402
377 tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; 403 tcph = (void *)(*pskb)->data + ip_hdrlen(*pskb);
378 if (after(ntohl(tcph->seq), this_way->correction_pos)) 404 if (after(ntohl(tcph->seq), this_way->correction_pos))
379 newseq = htonl(ntohl(tcph->seq) + this_way->offset_after); 405 newseq = htonl(ntohl(tcph->seq) + this_way->offset_after);
380 else 406 else
@@ -399,7 +425,7 @@ nf_nat_seq_adjust(struct sk_buff **pskb,
399 if (!nf_nat_sack_adjust(pskb, tcph, ct, ctinfo)) 425 if (!nf_nat_sack_adjust(pskb, tcph, ct, ctinfo))
400 return 0; 426 return 0;
401 427
402 nf_conntrack_tcp_update(*pskb, (*pskb)->nh.iph->ihl*4, ct, dir); 428 nf_conntrack_tcp_update(*pskb, ip_hdrlen(*pskb), ct, dir);
403 429
404 return 1; 430 return 1;
405} 431}
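
The new checksum branches in nf_nat_mangle_tcp_packet()/nf_nat_mangle_udp_packet() are the standard offload handshake: when the route is not local (RTCF_LOCAL clear) and the egress device can checksum (NETIF_F_ALL_CSUM), the helper stores only the inverted pseudo-header sum, records where the final sum must be written (csum_start/csum_offset), and marks the skb CHECKSUM_PARTIAL so the driver or hardware finishes the job; otherwise it computes the full sum in software as before. A condensed sketch of the TCP case, assuming iph and tcph point into a linear skb:

    #include <linux/stddef.h>
    #include <linux/tcp.h>
    #include <net/tcp.h>

    static void sketch_tcp_csum_offload(struct sk_buff *skb,
                                        struct iphdr *iph,
                                        struct tcphdr *tcph, int datalen)
    {
            skb->ip_summed = CHECKSUM_PARTIAL;
            /* Checksumming starts at the TCP header... */
            skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
                              iph->ihl * 4;
            /* ...and the 16-bit result lands in tcph->check. */
            skb->csum_offset = offsetof(struct tcphdr, check);
            /* Seed with the inverted pseudo-header sum; the device
             * adds the header+payload sum and folds everything in. */
            tcph->check = ~tcp_v4_check(datalen, iph->saddr, iph->daddr, 0);
    }
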
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 7ba341c22eaa..a66888749ceb 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -53,7 +53,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
53 struct nf_conntrack_tuple t; 53 struct nf_conntrack_tuple t;
54 struct nf_ct_pptp_master *ct_pptp_info; 54 struct nf_ct_pptp_master *ct_pptp_info;
55 struct nf_nat_pptp *nat_pptp_info; 55 struct nf_nat_pptp *nat_pptp_info;
56 struct ip_nat_range range; 56 struct nf_nat_range range;
57 57
58 ct_pptp_info = &nfct_help(master)->help.ct_pptp_info; 58 ct_pptp_info = &nfct_help(master)->help.ct_pptp_info;
59 nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; 59 nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info;
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index 147a4370cf03..2a283397a8b6 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -191,7 +191,7 @@ static unsigned int ipt_dnat_target(struct sk_buff **pskb,
191 191
192 if (hooknum == NF_IP_LOCAL_OUT && 192 if (hooknum == NF_IP_LOCAL_OUT &&
193 mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) 193 mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
194 warn_if_extra_mangle((*pskb)->nh.iph->daddr, 194 warn_if_extra_mangle(ip_hdr(*pskb)->daddr,
195 mr->range[0].min_ip); 195 mr->range[0].min_ip);
196 196
197 return nf_nat_setup_info(ct, &mr->range[0], hooknum); 197 return nf_nat_setup_info(ct, &mr->range[0], hooknum);
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index b12cd7c314ca..bfd88e4e0685 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -11,6 +11,7 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/skbuff.h> 12#include <linux/skbuff.h>
13#include <linux/ip.h> 13#include <linux/ip.h>
14#include <net/ip.h>
14#include <linux/udp.h> 15#include <linux/udp.h>
15 16
16#include <net/netfilter/nf_nat.h> 17#include <net/netfilter/nf_nat.h>
@@ -92,7 +93,7 @@ static int map_sip_addr(struct sk_buff **pskb, enum ip_conntrack_info ctinfo,
92 if (!nf_nat_mangle_udp_packet(pskb, ct, ctinfo, 93 if (!nf_nat_mangle_udp_packet(pskb, ct, ctinfo,
93 matchoff, matchlen, addr, addrlen)) 94 matchoff, matchlen, addr, addrlen))
94 return 0; 95 return 0;
95 *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); 96 *dptr = (*pskb)->data + ip_hdrlen(*pskb) + sizeof(struct udphdr);
96 return 1; 97 return 1;
97 98
98} 99}
@@ -106,7 +107,7 @@ static unsigned int ip_nat_sip(struct sk_buff **pskb,
106 struct addr_map map; 107 struct addr_map map;
107 int dataoff, datalen; 108 int dataoff, datalen;
108 109
109 dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); 110 dataoff = ip_hdrlen(*pskb) + sizeof(struct udphdr);
110 datalen = (*pskb)->len - dataoff; 111 datalen = (*pskb)->len - dataoff;
111 if (datalen < sizeof("SIP/2.0") - 1) 112 if (datalen < sizeof("SIP/2.0") - 1)
112 return NF_DROP; 113 return NF_DROP;
@@ -155,7 +156,7 @@ static unsigned int mangle_sip_packet(struct sk_buff **pskb,
155 return 0; 156 return 0;
156 157
157 /* We need to reload this. Thanks Patrick. */ 158 /* We need to reload this. Thanks Patrick. */
158 *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); 159 *dptr = (*pskb)->data + ip_hdrlen(*pskb) + sizeof(struct udphdr);
159 return 1; 160 return 1;
160} 161}
161 162
@@ -168,7 +169,7 @@ static int mangle_content_len(struct sk_buff **pskb,
168 char buffer[sizeof("65536")]; 169 char buffer[sizeof("65536")];
169 int bufflen; 170 int bufflen;
170 171
171 dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); 172 dataoff = ip_hdrlen(*pskb) + sizeof(struct udphdr);
172 173
173 /* Get actual SDP lenght */ 174 /* Get actual SDP lenght */
174 if (ct_sip_get_info(ct, dptr, (*pskb)->len - dataoff, &matchoff, 175 if (ct_sip_get_info(ct, dptr, (*pskb)->len - dataoff, &matchoff,
@@ -200,7 +201,7 @@ static unsigned int mangle_sdp(struct sk_buff **pskb,
200 char buffer[sizeof("nnn.nnn.nnn.nnn")]; 201 char buffer[sizeof("nnn.nnn.nnn.nnn")];
201 unsigned int dataoff, bufflen; 202 unsigned int dataoff, bufflen;
202 203
203 dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); 204 dataoff = ip_hdrlen(*pskb) + sizeof(struct udphdr);
204 205
205 /* Mangle owner and contact info. */ 206 /* Mangle owner and contact info. */
206 bufflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(newip)); 207 bufflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(newip));
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index ce5c4939a6ee..6e88505d6162 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -38,10 +38,6 @@
38 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 38 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
39 * 39 *
40 * Author: James Morris <jmorris@intercode.com.au> 40 * Author: James Morris <jmorris@intercode.com.au>
41 *
42 * Updates:
43 * 2000-08-06: Convert to new helper API (Harald Welte).
44 *
45 */ 41 */
46#include <linux/module.h> 42#include <linux/module.h>
47#include <linux/moduleparam.h> 43#include <linux/moduleparam.h>
@@ -1194,7 +1190,7 @@ static int snmp_translate(struct nf_conn *ct,
1194 enum ip_conntrack_info ctinfo, 1190 enum ip_conntrack_info ctinfo,
1195 struct sk_buff **pskb) 1191 struct sk_buff **pskb)
1196{ 1192{
1197 struct iphdr *iph = (*pskb)->nh.iph; 1193 struct iphdr *iph = ip_hdr(*pskb);
1198 struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl); 1194 struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl);
1199 u_int16_t udplen = ntohs(udph->len); 1195 u_int16_t udplen = ntohs(udph->len);
1200 u_int16_t paylen = udplen - sizeof(struct udphdr); 1196 u_int16_t paylen = udplen - sizeof(struct udphdr);
@@ -1235,7 +1231,7 @@ static int help(struct sk_buff **pskb, unsigned int protoff,
1235{ 1231{
1236 int dir = CTINFO2DIR(ctinfo); 1232 int dir = CTINFO2DIR(ctinfo);
1237 unsigned int ret; 1233 unsigned int ret;
1238 struct iphdr *iph = (*pskb)->nh.iph; 1234 struct iphdr *iph = ip_hdr(*pskb);
1239 struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl); 1235 struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl);
1240 1236
1241 /* SNMP replies and originating SNMP traps get mangled */ 1237 /* SNMP replies and originating SNMP traps get mangled */
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index 15aa3db8cb33..64bbed2ba780 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -86,8 +86,7 @@ nf_nat_fn(unsigned int hooknum,
86 86
87 /* We never see fragments: conntrack defrags on pre-routing 87 /* We never see fragments: conntrack defrags on pre-routing
88 and local-out, and nf_nat_out protects post-routing. */ 88 and local-out, and nf_nat_out protects post-routing. */
89 NF_CT_ASSERT(!((*pskb)->nh.iph->frag_off 89 NF_CT_ASSERT(!(ip_hdr(*pskb)->frag_off & htons(IP_MF | IP_OFFSET)));
90 & htons(IP_MF|IP_OFFSET)));
91 90
92 ct = nf_ct_get(*pskb, &ctinfo); 91 ct = nf_ct_get(*pskb, &ctinfo);
93 /* Can't track? It's not due to stress, or conntrack would 92 /* Can't track? It's not due to stress, or conntrack would
@@ -98,11 +97,10 @@ nf_nat_fn(unsigned int hooknum,
98 /* Exception: ICMP redirect to new connection (not in 97 /* Exception: ICMP redirect to new connection (not in
99 hash table yet). We must not let this through, in 98 hash table yet). We must not let this through, in
100 case we're doing NAT to the same network. */ 99 case we're doing NAT to the same network. */
101 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { 100 if (ip_hdr(*pskb)->protocol == IPPROTO_ICMP) {
102 struct icmphdr _hdr, *hp; 101 struct icmphdr _hdr, *hp;
103 102
104 hp = skb_header_pointer(*pskb, 103 hp = skb_header_pointer(*pskb, ip_hdrlen(*pskb),
105 (*pskb)->nh.iph->ihl*4,
106 sizeof(_hdr), &_hdr); 104 sizeof(_hdr), &_hdr);
107 if (hp != NULL && 105 if (hp != NULL &&
108 hp->type == ICMP_REDIRECT) 106 hp->type == ICMP_REDIRECT)
@@ -122,7 +120,7 @@ nf_nat_fn(unsigned int hooknum,
122 switch (ctinfo) { 120 switch (ctinfo) {
123 case IP_CT_RELATED: 121 case IP_CT_RELATED:
124 case IP_CT_RELATED+IP_CT_IS_REPLY: 122 case IP_CT_RELATED+IP_CT_IS_REPLY:
125 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { 123 if (ip_hdr(*pskb)->protocol == IPPROTO_ICMP) {
126 if (!nf_nat_icmp_reply_translation(ct, ctinfo, 124 if (!nf_nat_icmp_reply_translation(ct, ctinfo,
127 hooknum, pskb)) 125 hooknum, pskb))
128 return NF_DROP; 126 return NF_DROP;
@@ -177,11 +175,11 @@ nf_nat_in(unsigned int hooknum,
177 int (*okfn)(struct sk_buff *)) 175 int (*okfn)(struct sk_buff *))
178{ 176{
179 unsigned int ret; 177 unsigned int ret;
180 __be32 daddr = (*pskb)->nh.iph->daddr; 178 __be32 daddr = ip_hdr(*pskb)->daddr;
181 179
182 ret = nf_nat_fn(hooknum, pskb, in, out, okfn); 180 ret = nf_nat_fn(hooknum, pskb, in, out, okfn);
183 if (ret != NF_DROP && ret != NF_STOLEN && 181 if (ret != NF_DROP && ret != NF_STOLEN &&
184 daddr != (*pskb)->nh.iph->daddr) { 182 daddr != ip_hdr(*pskb)->daddr) {
185 dst_release((*pskb)->dst); 183 dst_release((*pskb)->dst);
186 (*pskb)->dst = NULL; 184 (*pskb)->dst = NULL;
187 } 185 }
@@ -203,7 +201,7 @@ nf_nat_out(unsigned int hooknum,
203 201
204 /* root is playing with raw sockets. */ 202 /* root is playing with raw sockets. */
205 if ((*pskb)->len < sizeof(struct iphdr) || 203 if ((*pskb)->len < sizeof(struct iphdr) ||
206 (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) 204 ip_hdrlen(*pskb) < sizeof(struct iphdr))
207 return NF_ACCEPT; 205 return NF_ACCEPT;
208 206
209 ret = nf_nat_fn(hooknum, pskb, in, out, okfn); 207 ret = nf_nat_fn(hooknum, pskb, in, out, okfn);
@@ -236,7 +234,7 @@ nf_nat_local_fn(unsigned int hooknum,
236 234
237 /* root is playing with raw sockets. */ 235 /* root is playing with raw sockets. */
238 if ((*pskb)->len < sizeof(struct iphdr) || 236 if ((*pskb)->len < sizeof(struct iphdr) ||
239 (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) 237 ip_hdrlen(*pskb) < sizeof(struct iphdr))
240 return NF_ACCEPT; 238 return NF_ACCEPT;
241 239
242 ret = nf_nat_fn(hooknum, pskb, in, out, okfn); 240 ret = nf_nat_fn(hooknum, pskb, in, out, okfn);
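The ICMP-redirect check in nf_nat_fn() also lands on the skb_header_pointer() idiom, which copes with non-linear skbs by copying the requested range into a caller-supplied buffer when it is not in the linear data area. A minimal usage sketch:

    struct icmphdr _hdr, *hp;

    /* returns a pointer into the skb when the header is linear;
     * otherwise sizeof(_hdr) bytes are copied into _hdr and &_hdr
     * is returned; NULL if the packet is too short */
    hp = skb_header_pointer(*pskb, ip_hdrlen(*pskb), sizeof(_hdr), &_hdr);
    if (hp != NULL && hp->type == ICMP_REDIRECT)
            return NF_DROP;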
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index ae68a691e8cd..37ab5802ca08 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -87,19 +87,6 @@ static const struct file_operations sockstat_seq_fops = {
87 .release = single_release, 87 .release = single_release,
88}; 88};
89 89
90static unsigned long
91fold_field(void *mib[], int offt)
92{
93 unsigned long res = 0;
94 int i;
95
96 for_each_possible_cpu(i) {
97 res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt);
98 res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt);
99 }
100 return res;
101}
102
103/* snmp items */ 90/* snmp items */
104static const struct snmp_mib snmp4_ipstats_list[] = { 91static const struct snmp_mib snmp4_ipstats_list[] = {
105 SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INRECEIVES), 92 SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INRECEIVES),
@@ -266,8 +253,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
266 253
267 for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) 254 for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
268 seq_printf(seq, " %lu", 255 seq_printf(seq, " %lu",
269 fold_field((void **) ip_statistics, 256 snmp_fold_field((void **)ip_statistics,
270 snmp4_ipstats_list[i].entry)); 257 snmp4_ipstats_list[i].entry));
271 258
272 seq_puts(seq, "\nIcmp:"); 259 seq_puts(seq, "\nIcmp:");
273 for (i = 0; snmp4_icmp_list[i].name != NULL; i++) 260 for (i = 0; snmp4_icmp_list[i].name != NULL; i++)
@@ -276,8 +263,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
276 seq_puts(seq, "\nIcmp:"); 263 seq_puts(seq, "\nIcmp:");
277 for (i = 0; snmp4_icmp_list[i].name != NULL; i++) 264 for (i = 0; snmp4_icmp_list[i].name != NULL; i++)
278 seq_printf(seq, " %lu", 265 seq_printf(seq, " %lu",
279 fold_field((void **) icmp_statistics, 266 snmp_fold_field((void **)icmp_statistics,
280 snmp4_icmp_list[i].entry)); 267 snmp4_icmp_list[i].entry));
281 268
282 seq_puts(seq, "\nTcp:"); 269 seq_puts(seq, "\nTcp:");
283 for (i = 0; snmp4_tcp_list[i].name != NULL; i++) 270 for (i = 0; snmp4_tcp_list[i].name != NULL; i++)
@@ -288,12 +275,12 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
288 /* MaxConn field is signed, RFC 2012 */ 275 /* MaxConn field is signed, RFC 2012 */
289 if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) 276 if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
290 seq_printf(seq, " %ld", 277 seq_printf(seq, " %ld",
291 fold_field((void **) tcp_statistics, 278 snmp_fold_field((void **)tcp_statistics,
292 snmp4_tcp_list[i].entry)); 279 snmp4_tcp_list[i].entry));
293 else 280 else
294 seq_printf(seq, " %lu", 281 seq_printf(seq, " %lu",
295 fold_field((void **) tcp_statistics, 282 snmp_fold_field((void **)tcp_statistics,
296 snmp4_tcp_list[i].entry)); 283 snmp4_tcp_list[i].entry));
297 } 284 }
298 285
299 seq_puts(seq, "\nUdp:"); 286 seq_puts(seq, "\nUdp:");
@@ -303,8 +290,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
303 seq_puts(seq, "\nUdp:"); 290 seq_puts(seq, "\nUdp:");
304 for (i = 0; snmp4_udp_list[i].name != NULL; i++) 291 for (i = 0; snmp4_udp_list[i].name != NULL; i++)
305 seq_printf(seq, " %lu", 292 seq_printf(seq, " %lu",
306 fold_field((void **) udp_statistics, 293 snmp_fold_field((void **)udp_statistics,
307 snmp4_udp_list[i].entry)); 294 snmp4_udp_list[i].entry));
308 295
309 /* the UDP and UDP-Lite MIBs are the same */ 296 /* the UDP and UDP-Lite MIBs are the same */
310 seq_puts(seq, "\nUdpLite:"); 297 seq_puts(seq, "\nUdpLite:");
@@ -314,8 +301,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
314 seq_puts(seq, "\nUdpLite:"); 301 seq_puts(seq, "\nUdpLite:");
315 for (i = 0; snmp4_udp_list[i].name != NULL; i++) 302 for (i = 0; snmp4_udp_list[i].name != NULL; i++)
316 seq_printf(seq, " %lu", 303 seq_printf(seq, " %lu",
317 fold_field((void **) udplite_statistics, 304 snmp_fold_field((void **)udplite_statistics,
318 snmp4_udp_list[i].entry) ); 305 snmp4_udp_list[i].entry));
319 306
320 seq_putc(seq, '\n'); 307 seq_putc(seq, '\n');
321 return 0; 308 return 0;
@@ -348,8 +335,8 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
348 seq_puts(seq, "\nTcpExt:"); 335 seq_puts(seq, "\nTcpExt:");
349 for (i = 0; snmp4_net_list[i].name != NULL; i++) 336 for (i = 0; snmp4_net_list[i].name != NULL; i++)
350 seq_printf(seq, " %lu", 337 seq_printf(seq, " %lu",
351 fold_field((void **) net_statistics, 338 snmp_fold_field((void **)net_statistics,
352 snmp4_net_list[i].entry)); 339 snmp4_net_list[i].entry));
353 340
354 seq_putc(seq, '\n'); 341 seq_putc(seq, '\n');
355 return 0; 342 return 0;
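The deleted fold_field() duplicated snmp_fold_field(), which af_inet.c already exports, so the proc code now uses the shared copy. For reference, the shared helper sums one counter offset across both per-cpu halves of a MIB (the softirq and user-context arrays), essentially the same body as the removed local:

    unsigned long snmp_fold_field(void *mib[], int offt)
    {
            unsigned long res = 0;
            int i;

            for_each_possible_cpu(i) {
                    res += *(((unsigned long *)per_cpu_ptr(mib[0], i)) + offt);
                    res += *(((unsigned long *)per_cpu_ptr(mib[1], i)) + offt);
            }
            return res;
    }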
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index da70fef82c93..971ab9356e51 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -45,7 +45,7 @@
45#include <net/ipip.h> 45#include <net/ipip.h>
46#include <linux/igmp.h> 46#include <linux/igmp.h>
47 47
48struct net_protocol *inet_protos[MAX_INET_PROTOS]; 48struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp;
49static DEFINE_SPINLOCK(inet_proto_lock); 49static DEFINE_SPINLOCK(inet_proto_lock);
50 50
51/* 51/*
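The annotation is a false-sharing fix: inet_protos[] is read on every inbound packet, while the spinlock next to it is written only on protocol (de)registration. Keeping them on separate cache lines stops the occasional lock write from bouncing the line the demux fast path reads, roughly:

    /* read-mostly: starts on its own cache line on SMP builds */
    struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp;

    /* write-side state no longer shares that line */
    static DEFINE_SPINLOCK(inet_proto_lock);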
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 87e9c1618100..24d7c9f31918 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -132,7 +132,7 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
132 if (!pskb_may_pull(skb, sizeof(struct icmphdr))) 132 if (!pskb_may_pull(skb, sizeof(struct icmphdr)))
133 return 1; 133 return 1;
134 134
135 type = skb->h.icmph->type; 135 type = icmp_hdr(skb)->type;
136 if (type < 32) { 136 if (type < 32) {
137 __u32 data = raw_sk(sk)->filter.data; 137 __u32 data = raw_sk(sk)->filter.data;
138 138
@@ -184,8 +184,8 @@ out:
184void raw_err (struct sock *sk, struct sk_buff *skb, u32 info) 184void raw_err (struct sock *sk, struct sk_buff *skb, u32 info)
185{ 185{
186 struct inet_sock *inet = inet_sk(sk); 186 struct inet_sock *inet = inet_sk(sk);
187 int type = skb->h.icmph->type; 187 const int type = icmp_hdr(skb)->type;
188 int code = skb->h.icmph->code; 188 const int code = icmp_hdr(skb)->code;
189 int err = 0; 189 int err = 0;
190 int harderr = 0; 190 int harderr = 0;
191 191
@@ -256,7 +256,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
256 } 256 }
257 nf_reset(skb); 257 nf_reset(skb);
258 258
259 skb_push(skb, skb->data - skb->nh.raw); 259 skb_push(skb, skb->data - skb_network_header(skb));
260 260
261 raw_rcv_skb(sk, skb); 261 raw_rcv_skb(sk, skb);
262 return 0; 262 return 0;
@@ -291,11 +291,13 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
291 skb->priority = sk->sk_priority; 291 skb->priority = sk->sk_priority;
292 skb->dst = dst_clone(&rt->u.dst); 292 skb->dst = dst_clone(&rt->u.dst);
293 293
294 skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length); 294 skb_reset_network_header(skb);
295 iph = ip_hdr(skb);
296 skb_put(skb, length);
295 297
296 skb->ip_summed = CHECKSUM_NONE; 298 skb->ip_summed = CHECKSUM_NONE;
297 299
298 skb->h.raw = skb->nh.raw; 300 skb->transport_header = skb->network_header;
299 err = memcpy_fromiovecend((void *)iph, from, 0, length); 301 err = memcpy_fromiovecend((void *)iph, from, 0, length);
300 if (err) 302 if (err)
301 goto error_fault; 303 goto error_fault;
@@ -613,7 +615,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
613 /* Copy the address. */ 615 /* Copy the address. */
614 if (sin) { 616 if (sin) {
615 sin->sin_family = AF_INET; 617 sin->sin_family = AF_INET;
616 sin->sin_addr.s_addr = skb->nh.iph->saddr; 618 sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
617 sin->sin_port = 0; 619 sin->sin_port = 0;
618 memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); 620 memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
619 } 621 }
@@ -887,7 +889,7 @@ static int raw_seq_show(struct seq_file *seq, void *v)
887 return 0; 889 return 0;
888} 890}
889 891
890static struct seq_operations raw_seq_ops = { 892static const struct seq_operations raw_seq_ops = {
891 .start = raw_seq_start, 893 .start = raw_seq_start,
892 .next = raw_seq_next, 894 .next = raw_seq_next,
893 .stop = raw_seq_stop, 895 .stop = raw_seq_stop,
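In raw_send_hdrinc() the ordering of the converted calls matters: the network-header offset has to be recorded while skb->data still points at the spot the user-supplied IP header will occupy. A commented sketch of the new sequence (the skb is freshly allocated and reserved at this point, so data == tail):

    skb_reset_network_header(skb);  /* network header := current skb->data */
    iph = ip_hdr(skb);              /* same address skb_put() will expose */
    skb_put(skb, length);           /* grow the tail over header + payload */

    /* header and payload are then copied straight from userspace */
    err = memcpy_fromiovecend((void *)iph, from, 0, length);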
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 37e0d4d5cf94..cb76e3c725a0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -82,7 +82,6 @@
82#include <linux/proc_fs.h> 82#include <linux/proc_fs.h>
83#include <linux/init.h> 83#include <linux/init.h>
84#include <linux/skbuff.h> 84#include <linux/skbuff.h>
85#include <linux/rtnetlink.h>
86#include <linux/inetdevice.h> 85#include <linux/inetdevice.h>
87#include <linux/igmp.h> 86#include <linux/igmp.h>
88#include <linux/pkt_sched.h> 87#include <linux/pkt_sched.h>
@@ -104,6 +103,7 @@
104#include <net/xfrm.h> 103#include <net/xfrm.h>
105#include <net/ip_mp_alg.h> 104#include <net/ip_mp_alg.h>
106#include <net/netevent.h> 105#include <net/netevent.h>
106#include <net/rtnetlink.h>
107#ifdef CONFIG_SYSCTL 107#ifdef CONFIG_SYSCTL
108#include <linux/sysctl.h> 108#include <linux/sysctl.h>
109#endif 109#endif
@@ -364,7 +364,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
364 return 0; 364 return 0;
365} 365}
366 366
367static struct seq_operations rt_cache_seq_ops = { 367static const struct seq_operations rt_cache_seq_ops = {
368 .start = rt_cache_seq_start, 368 .start = rt_cache_seq_start,
369 .next = rt_cache_seq_next, 369 .next = rt_cache_seq_next,
370 .stop = rt_cache_seq_stop, 370 .stop = rt_cache_seq_stop,
@@ -470,7 +470,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
470 return 0; 470 return 0;
471} 471}
472 472
473static struct seq_operations rt_cpu_seq_ops = { 473static const struct seq_operations rt_cpu_seq_ops = {
474 .start = rt_cpu_seq_start, 474 .start = rt_cpu_seq_start,
475 .next = rt_cpu_seq_next, 475 .next = rt_cpu_seq_next,
476 .stop = rt_cpu_seq_stop, 476 .stop = rt_cpu_seq_stop,
@@ -1519,7 +1519,7 @@ static void ipv4_link_failure(struct sk_buff *skb)
1519static int ip_rt_bug(struct sk_buff *skb) 1519static int ip_rt_bug(struct sk_buff *skb)
1520{ 1520{
1521 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n", 1521 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1522 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr), 1522 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1523 skb->dev ? skb->dev->name : "?"); 1523 skb->dev ? skb->dev->name : "?");
1524 kfree_skb(skb); 1524 kfree_skb(skb);
1525 return 0; 1525 return 0;
@@ -1698,9 +1698,9 @@ static void ip_handle_martian_source(struct net_device *dev,
1698 printk(KERN_WARNING "martian source %u.%u.%u.%u from " 1698 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1699 "%u.%u.%u.%u, on dev %s\n", 1699 "%u.%u.%u.%u, on dev %s\n",
1700 NIPQUAD(daddr), NIPQUAD(saddr), dev->name); 1700 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1701 if (dev->hard_header_len && skb->mac.raw) { 1701 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1702 int i; 1702 int i;
1703 unsigned char *p = skb->mac.raw; 1703 const unsigned char *p = skb_mac_header(skb);
1704 printk(KERN_WARNING "ll header: "); 1704 printk(KERN_WARNING "ll header: ");
1705 for (i = 0; i < dev->hard_header_len; i++, p++) { 1705 for (i = 0; i < dev->hard_header_len; i++, p++) {
1706 printk("%02x", *p); 1706 printk("%02x", *p);
@@ -2134,7 +2134,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2134 rcu_read_lock(); 2134 rcu_read_lock();
2135 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) { 2135 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2136 int our = ip_check_mc(in_dev, daddr, saddr, 2136 int our = ip_check_mc(in_dev, daddr, saddr,
2137 skb->nh.iph->protocol); 2137 ip_hdr(skb)->protocol);
2138 if (our 2138 if (our
2139#ifdef CONFIG_IP_MROUTE 2139#ifdef CONFIG_IP_MROUTE
2140 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) 2140 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
@@ -2396,7 +2396,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2396 2396
2397 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2397 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2398 dev_out = ip_dev_find(oldflp->fl4_src); 2398 dev_out = ip_dev_find(oldflp->fl4_src);
2399 if (dev_out == NULL) 2399 if ((dev_out == NULL) && !(sysctl_ip_nonlocal_bind))
2400 goto out; 2400 goto out;
2401 2401
2402 /* I removed check for oif == dev_out->oif here. 2402 /* I removed check for oif == dev_out->oif here.
@@ -2407,7 +2407,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2407 of another iface. --ANK 2407 of another iface. --ANK
2408 */ 2408 */
2409 2409
2410 if (oldflp->oif == 0 2410 if (dev_out && oldflp->oif == 0
2411 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) { 2411 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2412 /* Special hack: user can direct multicasts 2412 /* Special hack: user can direct multicasts
2413 and limited broadcast via necessary interface 2413 and limited broadcast via necessary interface
@@ -2683,7 +2683,7 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2683 id = rt->peer->ip_id_count; 2683 id = rt->peer->ip_id_count;
2684 if (rt->peer->tcp_ts_stamp) { 2684 if (rt->peer->tcp_ts_stamp) {
2685 ts = rt->peer->tcp_ts; 2685 ts = rt->peer->tcp_ts;
2686 tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp; 2686 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2687 } 2687 }
2688 } 2688 }
2689 2689
@@ -2721,7 +2721,7 @@ nla_put_failure:
2721 return -EMSGSIZE; 2721 return -EMSGSIZE;
2722} 2722}
2723 2723
2724int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) 2724static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2725{ 2725{
2726 struct rtmsg *rtm; 2726 struct rtmsg *rtm;
2727 struct nlattr *tb[RTA_MAX+1]; 2727 struct nlattr *tb[RTA_MAX+1];
@@ -2747,10 +2747,11 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2747 /* Reserve room for dummy headers, this skb can pass 2747 /* Reserve room for dummy headers, this skb can pass
2748 through good chunk of routing engine. 2748 through good chunk of routing engine.
2749 */ 2749 */
2750 skb->mac.raw = skb->nh.raw = skb->data; 2750 skb_reset_mac_header(skb);
2751 skb_reset_network_header(skb);
2751 2752
2752 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */ 2753 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2753 skb->nh.iph->protocol = IPPROTO_ICMP; 2754 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2754 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); 2755 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2755 2756
2756 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0; 2757 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
@@ -3193,6 +3194,8 @@ int __init ip_rt_init(void)
3193 xfrm_init(); 3194 xfrm_init();
3194 xfrm4_init(); 3195 xfrm4_init();
3195#endif 3196#endif
3197 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3198
3196 return rc; 3199 return rc;
3197} 3200}
3198 3201
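inet_rtm_getroute() can become static because the handler is now handed to the rtnetlink core at init time instead of living in an externally visible handler table. The registration, as added to ip_rt_init() above (the final NULL because this message type has a doit handler but no dump operation here):

    rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);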
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 33016cc90f0b..2da1be0589a9 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -125,10 +125,11 @@ static __u16 const msstab[] = {
125__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) 125__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
126{ 126{
127 struct tcp_sock *tp = tcp_sk(sk); 127 struct tcp_sock *tp = tcp_sk(sk);
128 const struct iphdr *iph = ip_hdr(skb);
129 const struct tcphdr *th = tcp_hdr(skb);
128 int mssind; 130 int mssind;
129 const __u16 mss = *mssp; 131 const __u16 mss = *mssp;
130 132
131
132 tp->last_synq_overflow = jiffies; 133 tp->last_synq_overflow = jiffies;
133 134
134 /* XXX sort msstab[] by probability? Binary search? */ 135 /* XXX sort msstab[] by probability? Binary search? */
@@ -138,9 +139,8 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
138 139
139 NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESSENT); 140 NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESSENT);
140 141
141 return secure_tcp_syn_cookie(skb->nh.iph->saddr, skb->nh.iph->daddr, 142 return secure_tcp_syn_cookie(iph->saddr, iph->daddr,
142 skb->h.th->source, skb->h.th->dest, 143 th->source, th->dest, ntohl(th->seq),
143 ntohl(skb->h.th->seq),
144 jiffies / (HZ * 60), mssind); 144 jiffies / (HZ * 60), mssind);
145} 145}
146 146
@@ -157,14 +157,13 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
157 */ 157 */
158static inline int cookie_check(struct sk_buff *skb, __u32 cookie) 158static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
159{ 159{
160 __u32 seq; 160 const struct iphdr *iph = ip_hdr(skb);
161 __u32 mssind; 161 const struct tcphdr *th = tcp_hdr(skb);
162 162 __u32 seq = ntohl(th->seq) - 1;
163 seq = ntohl(skb->h.th->seq)-1; 163 __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
164 mssind = check_tcp_syn_cookie(cookie, 164 th->source, th->dest, seq,
165 skb->nh.iph->saddr, skb->nh.iph->daddr, 165 jiffies / (HZ * 60),
166 skb->h.th->source, skb->h.th->dest, 166 COUNTER_TRIES);
167 seq, jiffies / (HZ * 60), COUNTER_TRIES);
168 167
169 return mssind < NUM_MSS ? msstab[mssind] + 1 : 0; 168 return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
170} 169}
@@ -191,14 +190,15 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
191 struct inet_request_sock *ireq; 190 struct inet_request_sock *ireq;
192 struct tcp_request_sock *treq; 191 struct tcp_request_sock *treq;
193 struct tcp_sock *tp = tcp_sk(sk); 192 struct tcp_sock *tp = tcp_sk(sk);
194 __u32 cookie = ntohl(skb->h.th->ack_seq) - 1; 193 const struct tcphdr *th = tcp_hdr(skb);
194 __u32 cookie = ntohl(th->ack_seq) - 1;
195 struct sock *ret = sk; 195 struct sock *ret = sk;
196 struct request_sock *req; 196 struct request_sock *req;
197 int mss; 197 int mss;
198 struct rtable *rt; 198 struct rtable *rt;
199 __u8 rcv_wscale; 199 __u8 rcv_wscale;
200 200
201 if (!sysctl_tcp_syncookies || !skb->h.th->ack) 201 if (!sysctl_tcp_syncookies || !th->ack)
202 goto out; 202 goto out;
203 203
204 if (time_after(jiffies, tp->last_synq_overflow + TCP_TIMEOUT_INIT) || 204 if (time_after(jiffies, tp->last_synq_overflow + TCP_TIMEOUT_INIT) ||
@@ -220,12 +220,12 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
220 } 220 }
221 ireq = inet_rsk(req); 221 ireq = inet_rsk(req);
222 treq = tcp_rsk(req); 222 treq = tcp_rsk(req);
223 treq->rcv_isn = ntohl(skb->h.th->seq) - 1; 223 treq->rcv_isn = ntohl(th->seq) - 1;
224 treq->snt_isn = cookie; 224 treq->snt_isn = cookie;
225 req->mss = mss; 225 req->mss = mss;
226 ireq->rmt_port = skb->h.th->source; 226 ireq->rmt_port = th->source;
227 ireq->loc_addr = skb->nh.iph->daddr; 227 ireq->loc_addr = ip_hdr(skb)->daddr;
228 ireq->rmt_addr = skb->nh.iph->saddr; 228 ireq->rmt_addr = ip_hdr(skb)->saddr;
229 ireq->opt = NULL; 229 ireq->opt = NULL;
230 230
231 /* We threw the options of the initial SYN away, so we hope 231
@@ -261,8 +261,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
261 .tos = RT_CONN_FLAGS(sk) } }, 261 .tos = RT_CONN_FLAGS(sk) } },
262 .proto = IPPROTO_TCP, 262 .proto = IPPROTO_TCP,
263 .uli_u = { .ports = 263 .uli_u = { .ports =
264 { .sport = skb->h.th->dest, 264 { .sport = th->dest,
265 .dport = skb->h.th->source } } }; 265 .dport = th->source } } };
266 security_req_classify_flow(req, &fl); 266 security_req_classify_flow(req, &fl);
267 if (ip_route_output_key(&rt, &fl)) { 267 if (ip_route_output_key(&rt, &fl)) {
268 reqsk_free(req); 268 reqsk_free(req);
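The syncookies cleanup makes the encode/decode symmetry easier to see: the same address/port tuple and coarse per-minute timestamp feed the cookie on the SYN side and validate it on the ACK side. A condensed sketch using the cached header pointers the patch introduces (isn, cookie and mssind as in the functions above):

    const struct iphdr *iph = ip_hdr(skb);
    const struct tcphdr *th = tcp_hdr(skb);

    /* SYN side: fold the chosen MSS table index into the ISN */
    isn = secure_tcp_syn_cookie(iph->saddr, iph->daddr,
                                th->source, th->dest, ntohl(th->seq),
                                jiffies / (HZ * 60), mssind);

    /* ACK side: cookie = ack_seq - 1; recover the index, accepting
     * timestamps up to COUNTER_TRIES minutes old */
    mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
                                  th->source, th->dest,
                                  ntohl(th->seq) - 1,
                                  jiffies / (HZ * 60), COUNTER_TRIES);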
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 0aa304711a96..6817d6485df5 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -647,6 +647,14 @@ ctl_table ipv4_table[] = {
647 .proc_handler = &proc_dointvec 647 .proc_handler = &proc_dointvec
648 }, 648 },
649 { 649 {
650 .ctl_name = NET_TCP_FRTO_RESPONSE,
651 .procname = "tcp_frto_response",
652 .data = &sysctl_tcp_frto_response,
653 .maxlen = sizeof(int),
654 .mode = 0644,
655 .proc_handler = &proc_dointvec
656 },
657 {
650 .ctl_name = NET_TCP_LOW_LATENCY, 658 .ctl_name = NET_TCP_LOW_LATENCY,
651 .procname = "tcp_low_latency", 659 .procname = "tcp_low_latency",
652 .data = &sysctl_tcp_low_latency, 660 .data = &sysctl_tcp_low_latency,
@@ -803,6 +811,14 @@ ctl_table ipv4_table[] = {
803 .proc_handler = &proc_allowed_congestion_control, 811 .proc_handler = &proc_allowed_congestion_control,
804 .strategy = &strategy_allowed_congestion_control, 812 .strategy = &strategy_allowed_congestion_control,
805 }, 813 },
814 {
815 .ctl_name = NET_TCP_MAX_SSTHRESH,
816 .procname = "tcp_max_ssthresh",
817 .data = &sysctl_tcp_max_ssthresh,
818 .maxlen = sizeof(int),
819 .mode = 0644,
820 .proc_handler = &proc_dointvec,
821 },
806 { .ctl_name = 0 } 822 { .ctl_name = 0 }
807}; 823};
808 824
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3834b10b5115..2cf9a898ce50 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -297,7 +297,7 @@ EXPORT_SYMBOL(tcp_sockets_allocated);
297 * All the sk_stream_mem_schedule() is of this nature: accounting 297 * All the sk_stream_mem_schedule() is of this nature: accounting
298 * is strict, actions are advisory and have some latency. 298 * is strict, actions are advisory and have some latency.
299 */ 299 */
300int tcp_memory_pressure; 300int tcp_memory_pressure __read_mostly;
301 301
302EXPORT_SYMBOL(tcp_memory_pressure); 302EXPORT_SYMBOL(tcp_memory_pressure);
303 303
@@ -425,7 +425,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
425 /* Subtract 1, if FIN is in queue. */ 425 /* Subtract 1, if FIN is in queue. */
426 if (answ && !skb_queue_empty(&sk->sk_receive_queue)) 426 if (answ && !skb_queue_empty(&sk->sk_receive_queue))
427 answ -= 427 answ -=
428 ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin; 428 tcp_hdr((struct sk_buff *)sk->sk_receive_queue.prev)->fin;
429 } else 429 } else
430 answ = tp->urg_seq - tp->copied_seq; 430 answ = tp->urg_seq - tp->copied_seq;
431 release_sock(sk); 431 release_sock(sk);
@@ -444,7 +444,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
444 break; 444 break;
445 default: 445 default:
446 return -ENOIOCTLCMD; 446 return -ENOIOCTLCMD;
447 }; 447 }
448 448
449 return put_user(answ, (int __user *)arg); 449 return put_user(answ, (int __user *)arg);
450} 450}
@@ -460,9 +460,9 @@ static inline int forced_push(struct tcp_sock *tp)
460 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); 460 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
461} 461}
462 462
463static inline void skb_entail(struct sock *sk, struct tcp_sock *tp, 463static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
464 struct sk_buff *skb)
465{ 464{
465 struct tcp_sock *tp = tcp_sk(sk);
466 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); 466 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
467 467
468 skb->csum = 0; 468 skb->csum = 0;
@@ -470,10 +470,8 @@ static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
470 tcb->flags = TCPCB_FLAG_ACK; 470 tcb->flags = TCPCB_FLAG_ACK;
471 tcb->sacked = 0; 471 tcb->sacked = 0;
472 skb_header_release(skb); 472 skb_header_release(skb);
473 __skb_queue_tail(&sk->sk_write_queue, skb); 473 tcp_add_write_queue_tail(sk, skb);
474 sk_charge_skb(sk, skb); 474 sk_charge_skb(sk, skb);
475 if (!sk->sk_send_head)
476 sk->sk_send_head = skb;
477 if (tp->nonagle & TCP_NAGLE_PUSH) 475 if (tp->nonagle & TCP_NAGLE_PUSH)
478 tp->nonagle &= ~TCP_NAGLE_PUSH; 476 tp->nonagle &= ~TCP_NAGLE_PUSH;
479} 477}
@@ -488,15 +486,17 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
488 } 486 }
489} 487}
490 488
491static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags, 489static inline void tcp_push(struct sock *sk, int flags, int mss_now,
492 int mss_now, int nonagle) 490 int nonagle)
493{ 491{
494 if (sk->sk_send_head) { 492 struct tcp_sock *tp = tcp_sk(sk);
495 struct sk_buff *skb = sk->sk_write_queue.prev; 493
494 if (tcp_send_head(sk)) {
495 struct sk_buff *skb = tcp_write_queue_tail(sk);
496 if (!(flags & MSG_MORE) || forced_push(tp)) 496 if (!(flags & MSG_MORE) || forced_push(tp))
497 tcp_mark_push(tp, skb); 497 tcp_mark_push(tp, skb);
498 tcp_mark_urg(tp, flags, skb); 498 tcp_mark_urg(tp, flags, skb);
499 __tcp_push_pending_frames(sk, tp, mss_now, 499 __tcp_push_pending_frames(sk, mss_now,
500 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); 500 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
501 } 501 }
502} 502}
@@ -526,13 +526,13 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
526 goto do_error; 526 goto do_error;
527 527
528 while (psize > 0) { 528 while (psize > 0) {
529 struct sk_buff *skb = sk->sk_write_queue.prev; 529 struct sk_buff *skb = tcp_write_queue_tail(sk);
530 struct page *page = pages[poffset / PAGE_SIZE]; 530 struct page *page = pages[poffset / PAGE_SIZE];
531 int copy, i, can_coalesce; 531 int copy, i, can_coalesce;
532 int offset = poffset % PAGE_SIZE; 532 int offset = poffset % PAGE_SIZE;
533 int size = min_t(size_t, psize, PAGE_SIZE - offset); 533 int size = min_t(size_t, psize, PAGE_SIZE - offset);
534 534
535 if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) { 535 if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
536new_segment: 536new_segment:
537 if (!sk_stream_memory_free(sk)) 537 if (!sk_stream_memory_free(sk))
538 goto wait_for_sndbuf; 538 goto wait_for_sndbuf;
@@ -542,7 +542,7 @@ new_segment:
542 if (!skb) 542 if (!skb)
543 goto wait_for_memory; 543 goto wait_for_memory;
544 544
545 skb_entail(sk, tp, skb); 545 skb_entail(sk, skb);
546 copy = size_goal; 546 copy = size_goal;
547 } 547 }
548 548
@@ -588,8 +588,8 @@ new_segment:
588 588
589 if (forced_push(tp)) { 589 if (forced_push(tp)) {
590 tcp_mark_push(tp, skb); 590 tcp_mark_push(tp, skb);
591 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH); 591 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
592 } else if (skb == sk->sk_send_head) 592 } else if (skb == tcp_send_head(sk))
593 tcp_push_one(sk, mss_now); 593 tcp_push_one(sk, mss_now);
594 continue; 594 continue;
595 595
@@ -597,7 +597,7 @@ wait_for_sndbuf:
597 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 597 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
598wait_for_memory: 598wait_for_memory:
599 if (copied) 599 if (copied)
600 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); 600 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
601 601
602 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) 602 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
603 goto do_error; 603 goto do_error;
@@ -608,7 +608,7 @@ wait_for_memory:
608 608
609out: 609out:
610 if (copied) 610 if (copied)
611 tcp_push(sk, tp, flags, mss_now, tp->nonagle); 611 tcp_push(sk, flags, mss_now, tp->nonagle);
612 return copied; 612 return copied;
613 613
614do_error: 614do_error:
@@ -639,8 +639,9 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
639#define TCP_PAGE(sk) (sk->sk_sndmsg_page) 639#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
640#define TCP_OFF(sk) (sk->sk_sndmsg_off) 640#define TCP_OFF(sk) (sk->sk_sndmsg_off)
641 641
642static inline int select_size(struct sock *sk, struct tcp_sock *tp) 642static inline int select_size(struct sock *sk)
643{ 643{
644 struct tcp_sock *tp = tcp_sk(sk);
644 int tmp = tp->mss_cache; 645 int tmp = tp->mss_cache;
645 646
646 if (sk->sk_route_caps & NETIF_F_SG) { 647 if (sk->sk_route_caps & NETIF_F_SG) {
@@ -704,9 +705,9 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
704 while (seglen > 0) { 705 while (seglen > 0) {
705 int copy; 706 int copy;
706 707
707 skb = sk->sk_write_queue.prev; 708 skb = tcp_write_queue_tail(sk);
708 709
709 if (!sk->sk_send_head || 710 if (!tcp_send_head(sk) ||
710 (copy = size_goal - skb->len) <= 0) { 711 (copy = size_goal - skb->len) <= 0) {
711 712
712new_segment: 713new_segment:
@@ -716,7 +717,7 @@ new_segment:
716 if (!sk_stream_memory_free(sk)) 717 if (!sk_stream_memory_free(sk))
717 goto wait_for_sndbuf; 718 goto wait_for_sndbuf;
718 719
719 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp), 720 skb = sk_stream_alloc_pskb(sk, select_size(sk),
720 0, sk->sk_allocation); 721 0, sk->sk_allocation);
721 if (!skb) 722 if (!skb)
722 goto wait_for_memory; 723 goto wait_for_memory;
@@ -727,7 +728,7 @@ new_segment:
727 if (sk->sk_route_caps & NETIF_F_ALL_CSUM) 728 if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
728 skb->ip_summed = CHECKSUM_PARTIAL; 729 skb->ip_summed = CHECKSUM_PARTIAL;
729 730
730 skb_entail(sk, tp, skb); 731 skb_entail(sk, skb);
731 copy = size_goal; 732 copy = size_goal;
732 } 733 }
733 734
@@ -832,8 +833,8 @@ new_segment:
832 833
833 if (forced_push(tp)) { 834 if (forced_push(tp)) {
834 tcp_mark_push(tp, skb); 835 tcp_mark_push(tp, skb);
835 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH); 836 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
836 } else if (skb == sk->sk_send_head) 837 } else if (skb == tcp_send_head(sk))
837 tcp_push_one(sk, mss_now); 838 tcp_push_one(sk, mss_now);
838 continue; 839 continue;
839 840
@@ -841,7 +842,7 @@ wait_for_sndbuf:
841 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 842 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
842wait_for_memory: 843wait_for_memory:
843 if (copied) 844 if (copied)
844 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); 845 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
845 846
846 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) 847 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
847 goto do_error; 848 goto do_error;
@@ -853,16 +854,18 @@ wait_for_memory:
853 854
854out: 855out:
855 if (copied) 856 if (copied)
856 tcp_push(sk, tp, flags, mss_now, tp->nonagle); 857 tcp_push(sk, flags, mss_now, tp->nonagle);
857 TCP_CHECK_TIMER(sk); 858 TCP_CHECK_TIMER(sk);
858 release_sock(sk); 859 release_sock(sk);
859 return copied; 860 return copied;
860 861
861do_fault: 862do_fault:
862 if (!skb->len) { 863 if (!skb->len) {
863 if (sk->sk_send_head == skb) 864 tcp_unlink_write_queue(skb, sk);
864 sk->sk_send_head = NULL; 865 /* It is the one place in all of TCP, except connection
865 __skb_unlink(skb, &sk->sk_write_queue); 866 * reset, where we can be unlinking the send_head.
867 */
868 tcp_check_send_head(sk, skb);
866 sk_stream_free_skb(sk, skb); 869 sk_stream_free_skb(sk, skb);
867 } 870 }
868 871
@@ -1016,9 +1019,9 @@ static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1016 1019
1017 skb_queue_walk(&sk->sk_receive_queue, skb) { 1020 skb_queue_walk(&sk->sk_receive_queue, skb) {
1018 offset = seq - TCP_SKB_CB(skb)->seq; 1021 offset = seq - TCP_SKB_CB(skb)->seq;
1019 if (skb->h.th->syn) 1022 if (tcp_hdr(skb)->syn)
1020 offset--; 1023 offset--;
1021 if (offset < skb->len || skb->h.th->fin) { 1024 if (offset < skb->len || tcp_hdr(skb)->fin) {
1022 *off = offset; 1025 *off = offset;
1023 return skb; 1026 return skb;
1024 } 1027 }
@@ -1070,7 +1073,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1070 if (offset != skb->len) 1073 if (offset != skb->len)
1071 break; 1074 break;
1072 } 1075 }
1073 if (skb->h.th->fin) { 1076 if (tcp_hdr(skb)->fin) {
1074 sk_eat_skb(sk, skb, 0); 1077 sk_eat_skb(sk, skb, 0);
1075 ++seq; 1078 ++seq;
1076 break; 1079 break;
@@ -1174,11 +1177,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1174 break; 1177 break;
1175 } 1178 }
1176 offset = *seq - TCP_SKB_CB(skb)->seq; 1179 offset = *seq - TCP_SKB_CB(skb)->seq;
1177 if (skb->h.th->syn) 1180 if (tcp_hdr(skb)->syn)
1178 offset--; 1181 offset--;
1179 if (offset < skb->len) 1182 if (offset < skb->len)
1180 goto found_ok_skb; 1183 goto found_ok_skb;
1181 if (skb->h.th->fin) 1184 if (tcp_hdr(skb)->fin)
1182 goto found_fin_ok; 1185 goto found_fin_ok;
1183 BUG_TRAP(flags & MSG_PEEK); 1186 BUG_TRAP(flags & MSG_PEEK);
1184 skb = skb->next; 1187 skb = skb->next;
@@ -1389,12 +1392,12 @@ do_prequeue:
1389skip_copy: 1392skip_copy:
1390 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { 1393 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1391 tp->urg_data = 0; 1394 tp->urg_data = 0;
1392 tcp_fast_path_check(sk, tp); 1395 tcp_fast_path_check(sk);
1393 } 1396 }
1394 if (used + offset < skb->len) 1397 if (used + offset < skb->len)
1395 continue; 1398 continue;
1396 1399
1397 if (skb->h.th->fin) 1400 if (tcp_hdr(skb)->fin)
1398 goto found_fin_ok; 1401 goto found_fin_ok;
1399 if (!(flags & MSG_PEEK)) { 1402 if (!(flags & MSG_PEEK)) {
1400 sk_eat_skb(sk, skb, copied_early); 1403 sk_eat_skb(sk, skb, copied_early);
@@ -1563,7 +1566,7 @@ void tcp_close(struct sock *sk, long timeout)
1563 */ 1566 */
1564 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { 1567 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1565 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - 1568 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1566 skb->h.th->fin; 1569 tcp_hdr(skb)->fin;
1567 data_was_unread += len; 1570 data_was_unread += len;
1568 __kfree_skb(skb); 1571 __kfree_skb(skb);
1569 } 1572 }
@@ -1732,7 +1735,7 @@ int tcp_disconnect(struct sock *sk, int flags)
1732 1735
1733 tcp_clear_xmit_timers(sk); 1736 tcp_clear_xmit_timers(sk);
1734 __skb_queue_purge(&sk->sk_receive_queue); 1737 __skb_queue_purge(&sk->sk_receive_queue);
1735 sk_stream_writequeue_purge(sk); 1738 tcp_write_queue_purge(sk);
1736 __skb_queue_purge(&tp->out_of_order_queue); 1739 __skb_queue_purge(&tp->out_of_order_queue);
1737#ifdef CONFIG_NET_DMA 1740#ifdef CONFIG_NET_DMA
1738 __skb_queue_purge(&sk->sk_async_wait_queue); 1741 __skb_queue_purge(&sk->sk_async_wait_queue);
@@ -1758,7 +1761,7 @@ int tcp_disconnect(struct sock *sk, int flags)
1758 tcp_set_ca_state(sk, TCP_CA_Open); 1761 tcp_set_ca_state(sk, TCP_CA_Open);
1759 tcp_clear_retrans(tp); 1762 tcp_clear_retrans(tp);
1760 inet_csk_delack_init(sk); 1763 inet_csk_delack_init(sk);
1761 sk->sk_send_head = NULL; 1764 tcp_init_send_head(sk);
1762 tp->rx_opt.saw_tstamp = 0; 1765 tp->rx_opt.saw_tstamp = 0;
1763 tcp_sack_reset(&tp->rx_opt); 1766 tcp_sack_reset(&tp->rx_opt);
1764 __sk_dst_reset(sk); 1767 __sk_dst_reset(sk);
@@ -1830,7 +1833,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
1830 * for currently queued segments. 1833 * for currently queued segments.
1831 */ 1834 */
1832 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; 1835 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1833 tcp_push_pending_frames(sk, tp); 1836 tcp_push_pending_frames(sk);
1834 } else { 1837 } else {
1835 tp->nonagle &= ~TCP_NAGLE_OFF; 1838 tp->nonagle &= ~TCP_NAGLE_OFF;
1836 } 1839 }
@@ -1854,7 +1857,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
1854 tp->nonagle &= ~TCP_NAGLE_CORK; 1857 tp->nonagle &= ~TCP_NAGLE_CORK;
1855 if (tp->nonagle&TCP_NAGLE_OFF) 1858 if (tp->nonagle&TCP_NAGLE_OFF)
1856 tp->nonagle |= TCP_NAGLE_PUSH; 1859 tp->nonagle |= TCP_NAGLE_PUSH;
1857 tcp_push_pending_frames(sk, tp); 1860 tcp_push_pending_frames(sk);
1858 } 1861 }
1859 break; 1862 break;
1860 1863
@@ -1954,7 +1957,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
1954 default: 1957 default:
1955 err = -ENOPROTOOPT; 1958 err = -ENOPROTOOPT;
1956 break; 1959 break;
1957 }; 1960 }
1961
1958 release_sock(sk); 1962 release_sock(sk);
1959 return err; 1963 return err;
1960} 1964}
@@ -2124,7 +2128,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2124 return 0; 2128 return 0;
2125 default: 2129 default:
2126 return -ENOPROTOOPT; 2130 return -ENOPROTOOPT;
2127 }; 2131 }
2128 2132
2129 if (put_user(len, optlen)) 2133 if (put_user(len, optlen))
2130 return -EFAULT; 2134 return -EFAULT;
@@ -2170,7 +2174,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
2170 if (!pskb_may_pull(skb, sizeof(*th))) 2174 if (!pskb_may_pull(skb, sizeof(*th)))
2171 goto out; 2175 goto out;
2172 2176
2173 th = skb->h.th; 2177 th = tcp_hdr(skb);
2174 thlen = th->doff * 4; 2178 thlen = th->doff * 4;
2175 if (thlen < sizeof(*th)) 2179 if (thlen < sizeof(*th))
2176 goto out; 2180 goto out;
@@ -2210,7 +2214,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
2210 delta = htonl(oldlen + (thlen + len)); 2214 delta = htonl(oldlen + (thlen + len));
2211 2215
2212 skb = segs; 2216 skb = segs;
2213 th = skb->h.th; 2217 th = tcp_hdr(skb);
2214 seq = ntohl(th->seq); 2218 seq = ntohl(th->seq);
2215 2219
2216 do { 2220 do {
@@ -2219,23 +2223,25 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
2219 th->check = ~csum_fold((__force __wsum)((__force u32)th->check + 2223 th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
2220 (__force u32)delta)); 2224 (__force u32)delta));
2221 if (skb->ip_summed != CHECKSUM_PARTIAL) 2225 if (skb->ip_summed != CHECKSUM_PARTIAL)
2222 th->check = csum_fold(csum_partial(skb->h.raw, thlen, 2226 th->check =
2223 skb->csum)); 2227 csum_fold(csum_partial(skb_transport_header(skb),
2228 thlen, skb->csum));
2224 2229
2225 seq += len; 2230 seq += len;
2226 skb = skb->next; 2231 skb = skb->next;
2227 th = skb->h.th; 2232 th = tcp_hdr(skb);
2228 2233
2229 th->seq = htonl(seq); 2234 th->seq = htonl(seq);
2230 th->cwr = 0; 2235 th->cwr = 0;
2231 } while (skb->next); 2236 } while (skb->next);
2232 2237
2233 delta = htonl(oldlen + (skb->tail - skb->h.raw) + skb->data_len); 2238 delta = htonl(oldlen + (skb->tail - skb->transport_header) +
2239 skb->data_len);
2234 th->check = ~csum_fold((__force __wsum)((__force u32)th->check + 2240 th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
2235 (__force u32)delta)); 2241 (__force u32)delta));
2236 if (skb->ip_summed != CHECKSUM_PARTIAL) 2242 if (skb->ip_summed != CHECKSUM_PARTIAL)
2237 th->check = csum_fold(csum_partial(skb->h.raw, thlen, 2243 th->check = csum_fold(csum_partial(skb_transport_header(skb),
2238 skb->csum)); 2244 thlen, skb->csum));
2239 2245
2240out: 2246out:
2241 return segs; 2247 return segs;
@@ -2372,6 +2378,23 @@ void __tcp_put_md5sig_pool(void)
2372EXPORT_SYMBOL(__tcp_put_md5sig_pool); 2378EXPORT_SYMBOL(__tcp_put_md5sig_pool);
2373#endif 2379#endif
2374 2380
2381void tcp_done(struct sock *sk)
2382{
2383 if(sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
2384 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
2385
2386 tcp_set_state(sk, TCP_CLOSE);
2387 tcp_clear_xmit_timers(sk);
2388
2389 sk->sk_shutdown = SHUTDOWN_MASK;
2390
2391 if (!sock_flag(sk, SOCK_DEAD))
2392 sk->sk_state_change(sk);
2393 else
2394 inet_csk_destroy_sock(sk);
2395}
2396EXPORT_SYMBOL_GPL(tcp_done);
2397
2375extern void __skb_cb_too_small_for_tcp(int, int); 2398extern void __skb_cb_too_small_for_tcp(int, int);
2376extern struct tcp_congestion_ops tcp_reno; 2399extern struct tcp_congestion_ops tcp_reno;
2377 2400
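Most of the tcp.c churn above is one mechanical refactor: callers stop poking sk->sk_write_queue and sk->sk_send_head directly and go through write-queue helpers, so the queue representation can later change in one place. Roughly what the new helpers wrap, as a sketch of the include/net/tcp.h side of this series:

    static inline struct sk_buff *tcp_send_head(struct sock *sk)
    {
            /* next skb to transmit; NULL when everything queued
             * has already been sent at least once */
            return sk->sk_send_head;
    }

    static inline struct sk_buff *tcp_write_queue_tail(struct sock *sk)
    {
            return (struct sk_buff *)sk->sk_write_queue.prev;
    }

    static inline void tcp_add_write_queue_tail(struct sock *sk,
                                                struct sk_buff *skb)
    {
            __skb_queue_tail(&sk->sk_write_queue, skb);
            /* remember where we must start sending */
            if (sk->sk_send_head == NULL)
                    sk->sk_send_head = skb;
    }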
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 5730333cd0ac..281c9f913257 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -206,7 +206,7 @@ static void bictcp_state(struct sock *sk, u8 new_state)
206/* Track delayed acknowledgment ratio using sliding window 206/* Track delayed acknowledgment ratio using sliding window
207 * ratio = (15*ratio + sample) / 16 207 * ratio = (15*ratio + sample) / 16
208 */ 208 */
209static void bictcp_acked(struct sock *sk, u32 cnt) 209static void bictcp_acked(struct sock *sk, u32 cnt, ktime_t last)
210{ 210{
211 const struct inet_connection_sock *icsk = inet_csk(sk); 211 const struct inet_connection_sock *icsk = inet_csk(sk);
212 212
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 34ae3f13483a..86b26539e54b 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -12,6 +12,8 @@
12#include <linux/list.h> 12#include <linux/list.h>
13#include <net/tcp.h> 13#include <net/tcp.h>
14 14
15int sysctl_tcp_max_ssthresh = 0;
16
15static DEFINE_SPINLOCK(tcp_cong_list_lock); 17static DEFINE_SPINLOCK(tcp_cong_list_lock);
16static LIST_HEAD(tcp_cong_list); 18static LIST_HEAD(tcp_cong_list);
17 19
@@ -124,7 +126,7 @@ int tcp_set_default_congestion_control(const char *name)
124#endif 126#endif
125 127
126 if (ca) { 128 if (ca) {
127 ca->non_restricted = 1; /* default is always allowed */ 129 ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */
128 list_move(&ca->list, &tcp_cong_list); 130 list_move(&ca->list, &tcp_cong_list);
129 ret = 0; 131 ret = 0;
130 } 132 }
@@ -179,7 +181,7 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
179 *buf = '\0'; 181 *buf = '\0';
180 rcu_read_lock(); 182 rcu_read_lock();
181 list_for_each_entry_rcu(ca, &tcp_cong_list, list) { 183 list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
182 if (!ca->non_restricted) 184 if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
183 continue; 185 continue;
184 offs += snprintf(buf + offs, maxlen - offs, 186 offs += snprintf(buf + offs, maxlen - offs,
185 "%s%s", 187 "%s%s",
@@ -210,16 +212,16 @@ int tcp_set_allowed_congestion_control(char *val)
210 } 212 }
211 } 213 }
212 214
213 /* pass 2 clear */ 215 /* pass 2 clear old values */
214 list_for_each_entry_rcu(ca, &tcp_cong_list, list) 216 list_for_each_entry_rcu(ca, &tcp_cong_list, list)
215 ca->non_restricted = 0; 217 ca->flags &= ~TCP_CONG_NON_RESTRICTED;
216 218
217 /* pass 3 mark as allowed */ 219 /* pass 3 mark as allowed */
218 while ((name = strsep(&val, " ")) && *name) { 220 while ((name = strsep(&val, " ")) && *name) {
219 ca = tcp_ca_find(name); 221 ca = tcp_ca_find(name);
220 WARN_ON(!ca); 222 WARN_ON(!ca);
221 if (ca) 223 if (ca)
222 ca->non_restricted = 1; 224 ca->flags |= TCP_CONG_NON_RESTRICTED;
223 } 225 }
224out: 226out:
225 spin_unlock(&tcp_cong_list_lock); 227 spin_unlock(&tcp_cong_list_lock);
@@ -254,7 +256,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
254 if (!ca) 256 if (!ca)
255 err = -ENOENT; 257 err = -ENOENT;
256 258
257 else if (!(ca->non_restricted || capable(CAP_NET_ADMIN))) 259 else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || capable(CAP_NET_ADMIN)))
258 err = -EPERM; 260 err = -EPERM;
259 261
260 else if (!try_module_get(ca->owner)) 262 else if (!try_module_get(ca->owner))
@@ -274,10 +276,13 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
274 276
275 277
276/* 278/*
277 * Linear increase during slow start 279 * Slow start (exponential increase) with
280 * RFC3742 Limited Slow Start (fast linear increase) support.
278 */ 281 */
279void tcp_slow_start(struct tcp_sock *tp) 282void tcp_slow_start(struct tcp_sock *tp)
280{ 283{
284 int cnt = 0;
285
281 if (sysctl_tcp_abc) { 286 if (sysctl_tcp_abc) {
282 /* RFC3465: Slow Start 287 /* RFC3465: Slow Start
283 * TCP sender SHOULD increase cwnd by the number of 288 * TCP sender SHOULD increase cwnd by the number of
@@ -286,17 +291,25 @@ void tcp_slow_start(struct tcp_sock *tp)
286 */ 291 */
287 if (tp->bytes_acked < tp->mss_cache) 292 if (tp->bytes_acked < tp->mss_cache)
288 return; 293 return;
289
290 /* We MAY increase by 2 if discovered delayed ack */
291 if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache) {
292 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
293 tp->snd_cwnd++;
294 }
295 } 294 }
295
296 if (sysctl_tcp_max_ssthresh > 0 &&
297 tp->snd_cwnd > sysctl_tcp_max_ssthresh)
298 cnt += sysctl_tcp_max_ssthresh>>1;
299 else
300 cnt += tp->snd_cwnd;
301
302 /* RFC3465: We MAY increase by 2 if discovered delayed ack */
303 if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache)
304 cnt <<= 1;
296 tp->bytes_acked = 0; 305 tp->bytes_acked = 0;
297 306
298 if (tp->snd_cwnd < tp->snd_cwnd_clamp) 307 tp->snd_cwnd_cnt += cnt;
299 tp->snd_cwnd++; 308 while (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
309 tp->snd_cwnd_cnt -= tp->snd_cwnd;
310 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
311 tp->snd_cwnd++;
312 }
300} 313}
301EXPORT_SYMBOL_GPL(tcp_slow_start); 314EXPORT_SYMBOL_GPL(tcp_slow_start);
302 315
@@ -358,8 +371,8 @@ u32 tcp_reno_min_cwnd(const struct sock *sk)
358EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); 371EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
359 372
360struct tcp_congestion_ops tcp_reno = { 373struct tcp_congestion_ops tcp_reno = {
374 .flags = TCP_CONG_NON_RESTRICTED,
361 .name = "reno", 375 .name = "reno",
362 .non_restricted = 1,
363 .owner = THIS_MODULE, 376 .owner = THIS_MODULE,
364 .ssthresh = tcp_reno_ssthresh, 377 .ssthresh = tcp_reno_ssthresh,
365 .cong_avoid = tcp_reno_cong_avoid, 378 .cong_avoid = tcp_reno_cong_avoid,
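The rewritten tcp_slow_start() adds RFC 3742 Limited Slow Start: below tcp_max_ssthresh it keeps the classic exponential behaviour, above it the growth per RTT is capped. A worked example as a comment, assuming sysctl_tcp_max_ssthresh = 100 and ABC disabled:

    /* cnt is the credit each ACK adds to snd_cwnd_cnt, and cwnd
     * is bumped once per full snd_cwnd of accumulated credit:
     *
     *   cwnd = 80 (<= 100): cnt = cwnd, so the while loop fires
     *   once per ACK -> cwnd++ per ACK, i.e. doubling per RTT;
     *
     *   cwnd = 1000 (> 100): cnt = 100/2 = 50, so each ACK adds
     *   50/1000 of an increment -> at most max_ssthresh/2 = 50
     *   new segments per RTT instead of 1000.
     */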
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 9a582fb4ef9f..14224487b16b 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * TCP CUBIC: Binary Increase Congestion control for TCP v2.0 2 * TCP CUBIC: Binary Increase Congestion control for TCP v2.1
3 * 3 *
4 * This is from the implementation of CUBIC TCP in 4 * This is from the implementation of CUBIC TCP in
5 * Injong Rhee, Lisong Xu. 5 * Injong Rhee, Lisong Xu.
@@ -51,8 +51,6 @@ MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_
51module_param(tcp_friendliness, int, 0644); 51module_param(tcp_friendliness, int, 0644);
52MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness"); 52MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
53 53
54#include <asm/div64.h>
55
56/* BIC TCP Parameters */ 54/* BIC TCP Parameters */
57struct bictcp { 55struct bictcp {
58 u32 cnt; /* increase cwnd by 1 after ACKs */ 56 u32 cnt; /* increase cwnd by 1 after ACKs */
@@ -93,50 +91,51 @@ static void bictcp_init(struct sock *sk)
93 tcp_sk(sk)->snd_ssthresh = initial_ssthresh; 91 tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
94} 92}
95 93
96/* 64bit divisor, dividend and result. dynamic precision */ 94/* calculate the cubic root of x using a table lookup followed by one
97static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor) 95 * Newton-Raphson iteration.
98{ 96 * Avg err ~= 0.195%
99 u_int32_t d = divisor;
100
101 if (divisor > 0xffffffffULL) {
102 unsigned int shift = fls(divisor >> 32);
103
104 d = divisor >> shift;
105 dividend >>= shift;
106 }
107
108 /* avoid 64 bit division if possible */
109 if (dividend >> 32)
110 do_div(dividend, d);
111 else
112 dividend = (uint32_t) dividend / d;
113
114 return dividend;
115}
116
117/*
118 * calculate the cubic root of x using Newton-Raphson
119 */ 97 */
120static u32 cubic_root(u64 a) 98static u32 cubic_root(u64 a)
121{ 99{
122 u32 x, x1; 100 u32 x, b, shift;
123 101 /*
124 /* Initial estimate is based on: 102 * cbrt(x) MSB values for x MSB values in [0..63].
125 * cbrt(x) = exp(log(x) / 3) 103 * Precomputed then refined by hand - Willy Tarreau
104 *
105 * For x in [0..63],
106 * v = cbrt(x << 18) - 1
107 * cbrt(x) = (v[x] + 10) >> 6
126 */ 108 */
127 x = 1u << (fls64(a)/3); 109 static const u8 v[] = {
110 /* 0x00 */ 0, 54, 54, 54, 118, 118, 118, 118,
111 /* 0x08 */ 123, 129, 134, 138, 143, 147, 151, 156,
112 /* 0x10 */ 157, 161, 164, 168, 170, 173, 176, 179,
113 /* 0x18 */ 181, 185, 187, 190, 192, 194, 197, 199,
114 /* 0x20 */ 200, 202, 204, 206, 209, 211, 213, 215,
115 /* 0x28 */ 217, 219, 221, 222, 224, 225, 227, 229,
116 /* 0x30 */ 231, 232, 234, 236, 237, 239, 240, 242,
117 /* 0x38 */ 244, 245, 246, 248, 250, 251, 252, 254,
118 };
119
120 b = fls64(a);
121 if (b < 7) {
122 /* a in [0..63] */
123 return ((u32)v[(u32)a] + 35) >> 6;
124 }
125
126 b = ((b * 84) >> 8) - 1;
127 shift = (a >> (b * 3));
128
129 x = ((u32)(((u32)v[shift] + 10) << b)) >> 6;
128 130
129 /* 131 /*
130 * Iteration based on: 132 * Newton-Raphson iteration
131 * 2 133 * 2
132 * x = ( 2 * x + a / x ) / 3 134 * x = ( 2 * x + a / x ) / 3
133 * k+1 k k 135 * k+1 k k
134 */ 136 */
135 do { 137 x = (2 * x + (u32)div64_64(a, (u64)x * (u64)(x - 1)));
136 x1 = x; 138 x = ((x * 341) >> 10);
137 x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
138 } while (abs(x1 - x) > 1);
139
140 return x; 139 return x;
141} 140}
142 141
@@ -215,7 +214,9 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
215 if (ca->delay_min > 0) { 214 if (ca->delay_min > 0) {
216 /* max increment = Smax * rtt / 0.1 */ 215 /* max increment = Smax * rtt / 0.1 */
217 min_cnt = (cwnd * HZ * 8)/(10 * max_increment * ca->delay_min); 216 min_cnt = (cwnd * HZ * 8)/(10 * max_increment * ca->delay_min);
218 if (ca->cnt < min_cnt) 217
218 /* use concave growth when the target is above the origin */
219 if (ca->cnt < min_cnt && t >= ca->bic_K)
219 ca->cnt = min_cnt; 220 ca->cnt = min_cnt;
220 } 221 }
221 222
@@ -333,7 +334,7 @@ static void bictcp_state(struct sock *sk, u8 new_state)
333/* Track delayed acknowledgment ratio using sliding window 334/* Track delayed acknowledgment ratio using sliding window
334 * ratio = (15*ratio + sample) / 16 335 * ratio = (15*ratio + sample) / 16
335 */ 336 */
336static void bictcp_acked(struct sock *sk, u32 cnt) 337static void bictcp_acked(struct sock *sk, u32 cnt, ktime_t last)
337{ 338{
338 const struct inet_connection_sock *icsk = inet_csk(sk); 339 const struct inet_connection_sock *icsk = inet_csk(sk);
339 340
@@ -401,4 +402,4 @@ module_exit(cubictcp_unregister);
401MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger"); 402MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
402MODULE_LICENSE("GPL"); 403MODULE_LICENSE("GPL");
403MODULE_DESCRIPTION("CUBIC TCP"); 404MODULE_DESCRIPTION("CUBIC TCP");
404MODULE_VERSION("2.0"); 405MODULE_VERSION("2.1");
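The cubic_root() rewrite trades the iterate-until-stable loop for a table-driven estimate plus exactly one Newton-Raphson step. A sketch of the two paths (small-input values checked against the v[] table above):

    /* a in [0..63]: pure table lookup, e.g.
     *   cubic_root(8):  v[8]  = 123, (123 + 35) >> 6 = 2
     *   cubic_root(27): v[27] = 190, (190 + 35) >> 6 = 3
     *
     * larger a: the top bits index the same table for a first
     * estimate x0, then a single refinement
     *   x1 ~= (2*x0 + a / (x0*(x0 - 1))) / 3
     * where the divide is the shared div64_64() (hence the local
     * copy and the asm/div64.h include go away) and the final /3
     * is approximated by (x * 341) >> 10.
     */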
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 1020eb48d8d1..4ba4a7ae0a85 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -98,7 +98,7 @@ static inline void measure_rtt(struct sock *sk)
98 } 98 }
99} 99}
100 100
101static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked) 101static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, ktime_t last)
102{ 102{
103 const struct inet_connection_sock *icsk = inet_csk(sk); 103 const struct inet_connection_sock *icsk = inet_csk(sk);
104 const struct tcp_sock *tp = tcp_sk(sk); 104 const struct tcp_sock *tp = tcp_sk(sk);
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 59e691d26f64..e5be35117223 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -144,7 +144,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
144 ca->snd_cwnd_cents += odd; 144 ca->snd_cwnd_cents += odd;
145 145
146 /* check when fractions goes >=128 and increase cwnd by 1. */ 146 /* check when fractions goes >=128 and increase cwnd by 1. */
147 while(ca->snd_cwnd_cents >= 128) { 147 while (ca->snd_cwnd_cents >= 128) {
148 tp->snd_cwnd++; 148 tp->snd_cwnd++;
149 ca->snd_cwnd_cents -= 128; 149 ca->snd_cwnd_cents -= 128;
150 tp->snd_cwnd_cnt = 0; 150 tp->snd_cwnd_cnt = 0;
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
new file mode 100644
index 000000000000..4adc47c55351
--- /dev/null
+++ b/net/ipv4/tcp_illinois.c
@@ -0,0 +1,356 @@
1/*
2 * TCP Illinois congestion control.
3 * Home page:
4 * http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
5 *
6 * The algorithm is described in:
7 * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm
8 * for High-Speed Networks"
9 * http://www.ews.uiuc.edu/~shaoliu/papersandslides/liubassri06perf.pdf
10 *
11 * Implemented from description in paper and ns-2 simulation.
12 * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org>
13 */
14
15#include <linux/module.h>
16#include <linux/skbuff.h>
17#include <linux/inet_diag.h>
18#include <asm/div64.h>
19#include <net/tcp.h>
20
21#define ALPHA_SHIFT 7
22#define ALPHA_SCALE (1u<<ALPHA_SHIFT)
23#define ALPHA_MIN ((3*ALPHA_SCALE)/10) /* ~0.3 */
24#define ALPHA_MAX (10*ALPHA_SCALE) /* 10.0 */
25#define ALPHA_BASE ALPHA_SCALE /* 1.0 */
26#define U32_MAX ((u32)~0U)
27#define RTT_MAX (U32_MAX / ALPHA_MAX) /* 3.3 secs */
28
29#define BETA_SHIFT 6
30#define BETA_SCALE (1u<<BETA_SHIFT)
31#define BETA_MIN (BETA_SCALE/8) /* 0.125 */
32#define BETA_MAX (BETA_SCALE/2) /* 0.5 */
33#define BETA_BASE BETA_MAX
34
35static int win_thresh __read_mostly = 15;
36module_param(win_thresh, int, 0);
37MODULE_PARM_DESC(win_thresh, "Window threshold for starting adaptive sizing");
38
39static int theta __read_mostly = 5;
40module_param(theta, int, 0);
41MODULE_PARM_DESC(theta, "# of fast RTT's before full growth");
42
43/* TCP Illinois Parameters */
44struct illinois {
45 u64 sum_rtt; /* sum of rtt's measured within last rtt */
46 u16 cnt_rtt; /* # of rtts measured within last rtt */
47 u32 base_rtt; /* min of all rtt in usec */
48 u32 max_rtt; /* max of all rtt in usec */
49 u32 end_seq; /* right edge of current RTT */
50 u32 alpha; /* Additive increase */
51 u32 beta; /* Multiplicative decrease */
52 u16 acked; /* # packets acked by current ACK */
53 u8 rtt_above; /* average rtt has gone above threshold */
54 u8 rtt_low; /* # of rtt measurements below threshold */
55};
56
57static void rtt_reset(struct sock *sk)
58{
59 struct tcp_sock *tp = tcp_sk(sk);
60 struct illinois *ca = inet_csk_ca(sk);
61
62 ca->end_seq = tp->snd_nxt;
63 ca->cnt_rtt = 0;
64 ca->sum_rtt = 0;
65
66 /* TODO: age max_rtt? */
67}
68
69static void tcp_illinois_init(struct sock *sk)
70{
71 struct illinois *ca = inet_csk_ca(sk);
72
73 ca->alpha = ALPHA_MAX;
74 ca->beta = BETA_BASE;
75 ca->base_rtt = 0x7fffffff;
76 ca->max_rtt = 0;
77
78 ca->acked = 0;
79 ca->rtt_low = 0;
80 ca->rtt_above = 0;
81
82 rtt_reset(sk);
83}
84
85/* Measure RTT for each ack. */
86static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, ktime_t last)
87{
88 struct illinois *ca = inet_csk_ca(sk);
89 u32 rtt;
90
91 ca->acked = pkts_acked;
92
93 rtt = ktime_to_us(net_timedelta(last));
94
95 /* ignore bogus values, this prevents wraparound in alpha math */
96 if (rtt > RTT_MAX)
97 rtt = RTT_MAX;
98
99 /* keep track of minimum RTT seen so far */
100 if (ca->base_rtt > rtt)
101 ca->base_rtt = rtt;
102
103 /* and max */
104 if (ca->max_rtt < rtt)
105 ca->max_rtt = rtt;
106
107 ++ca->cnt_rtt;
108 ca->sum_rtt += rtt;
109}
110
111/* Maximum queuing delay */
112static inline u32 max_delay(const struct illinois *ca)
113{
114 return ca->max_rtt - ca->base_rtt;
115}
116
117/* Average queuing delay */
118static inline u32 avg_delay(const struct illinois *ca)
119{
120 u64 t = ca->sum_rtt;
121
122 do_div(t, ca->cnt_rtt);
123 return t - ca->base_rtt;
124}

/*
 * Compute value of alpha used for additive increase.
 * If small window then use 1.0, equivalent to Reno.
 *
 * For larger windows, adjust based on average delay.
 * A. If average delay is at minimum (we are uncongested),
 *    then use large alpha (10.0) to increase faster.
 * B. If average delay is at maximum (getting congested),
 *    then use small alpha (0.3).
 *
 * The result is a convex window growth curve.
 */
static u32 alpha(struct illinois *ca, u32 da, u32 dm)
{
	u32 d1 = dm / 100;	/* low threshold */

	if (da <= d1) {
		/* we never left the low-delay zone, so use the maximum */
		if (!ca->rtt_above)
			return ALPHA_MAX;

		/* Wait for theta good RTTs before allowing alpha to go back
		 * to alpha max.  This prevents one good RTT from causing a
		 * sudden window increase.
		 */
		if (++ca->rtt_low < theta)
			return ca->alpha;

		ca->rtt_low = 0;
		ca->rtt_above = 0;
		return ALPHA_MAX;
	}

	ca->rtt_above = 1;

	/*
	 * Based on:
	 *
	 *      (dm - d1) amin amax
	 * k1 = -------------------
	 *         amax - amin
	 *
	 *       (dm - d1) amin
	 * k2 = ---------------- - d1
	 *        amax - amin
	 *
	 *             k1
	 * alpha = ----------
	 *          k2 + da
	 */

	dm -= d1;
	da -= d1;
	return (dm * ALPHA_MAX) /
		(dm + (da * (ALPHA_MAX - ALPHA_MIN)) / ALPHA_MIN);
}
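
/* Editor's worked example (illustrative, not part of the original
 * source): with dm = 10000 us and da = 500 us, d1 = 100, so after
 * subtracting d1 we have dm = 9900 and da = 400, and
 *   alpha = (9900 * 1280) / (9900 + (400 * (1280 - 38)) / 38)
 *         = 12672000 / 22973 = 551,
 * i.e. ~4.3 once divided by ALPHA_SCALE (128): mild congestion gives
 * an increase rate between the 10.0 and 0.3 extremes.
 */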

/*
 * Beta used for multiplicative decrease.
 * For small window sizes the Reno value (0.5) is used instead
 * (see update_params()).
 *
 * If delay is small (10% of max) then beta = 1/8;
 * if delay is up to 80% of max then beta = 1/2;
 * in between beta is a linear function of the average delay.
 */
static u32 beta(u32 da, u32 dm)
{
	u32 d2, d3;

	d2 = dm / 10;
	if (da <= d2)
		return BETA_MIN;

	d3 = (8 * dm) / 10;
	if (da >= d3 || d3 <= d2)
		return BETA_MAX;

	/*
	 * Based on:
	 *
	 *       bmin d3 - bmax d2
	 * k3 = -------------------
	 *           d3 - d2
	 *
	 *       bmax - bmin
	 * k4 = -------------
	 *         d3 - d2
	 *
	 * b = k3 + k4 da
	 */
	return (BETA_MIN * d3 - BETA_MAX * d2 + (BETA_MAX - BETA_MIN) * da)
		/ (d3 - d2);
}
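
/* Editor's worked example (illustrative, not part of the original
 * source): with dm = 10000 us and da = 5000 us, d2 = 1000 and
 * d3 = 8000, so
 *   beta = (8*8000 - 32*1000 + (32 - 8)*5000) / (8000 - 1000)
 *        = 152000 / 7000 = 21,
 * i.e. ~0.33 once divided by BETA_SCALE (64): halfway into the delay
 * range the backoff is roughly a third of the window.
 */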

/* Update alpha and beta values once per RTT */
static void update_params(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct illinois *ca = inet_csk_ca(sk);

	if (tp->snd_cwnd < win_thresh) {
		ca->alpha = ALPHA_BASE;
		ca->beta = BETA_BASE;
	} else if (ca->cnt_rtt > 0) {
		u32 dm = max_delay(ca);
		u32 da = avg_delay(ca);

		ca->alpha = alpha(ca, da, dm);
		ca->beta = beta(da, dm);
	}

	rtt_reset(sk);
}

/*
 * In case of loss, reset to default values
 */
static void tcp_illinois_state(struct sock *sk, u8 new_state)
{
	struct illinois *ca = inet_csk_ca(sk);

	if (new_state == TCP_CA_Loss) {
		ca->alpha = ALPHA_BASE;
		ca->beta = BETA_BASE;
		ca->rtt_low = 0;
		ca->rtt_above = 0;
		rtt_reset(sk);
	}
}

/*
 * Increase window in response to a successful acknowledgment.
 */
static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
				    u32 in_flight, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct illinois *ca = inet_csk_ca(sk);

	if (after(ack, ca->end_seq))
		update_params(sk);

	/* RFC 2861: only increase cwnd if it is fully utilized */
	if (!tcp_is_cwnd_limited(sk, in_flight))
		return;

	/* In slow start */
	if (tp->snd_cwnd <= tp->snd_ssthresh)
		tcp_slow_start(tp);

	else {
		u32 delta;

		/* snd_cwnd_cnt is the # of packets since the last cwnd increment */
		tp->snd_cwnd_cnt += ca->acked;
		ca->acked = 1;

		/* This is a close approximation of:
		 *     tp->snd_cwnd += alpha/tp->snd_cwnd
		 */
		delta = (tp->snd_cwnd_cnt * ca->alpha) >> ALPHA_SHIFT;
		if (delta >= tp->snd_cwnd) {
			tp->snd_cwnd = min(tp->snd_cwnd + delta / tp->snd_cwnd,
					   (u32)tp->snd_cwnd_clamp);
			tp->snd_cwnd_cnt = 0;
		}
	}
}
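
/* Editor's worked example (illustrative, not part of the original
 * source): with snd_cwnd = 100 and alpha = 10.0 (1280), delta reaches
 * snd_cwnd after 10 acked packets (10 * 1280 >> 7 = 100), so the
 * window grows by delta / snd_cwnd = 1 segment per 10 ACKed packets,
 * i.e. by ~alpha = 10 segments per RTT, as the approximation intends.
 */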

static u32 tcp_illinois_ssthresh(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct illinois *ca = inet_csk_ca(sk);

	/* Multiplicative decrease: reduce the window by the fraction beta
	 * (as shown, the listing had dropped the "snd_cwnd - " term, which
	 * would have cut the window *to* beta*cwnd instead of *by* it,
	 * contradicting the beta() comments above)
	 */
	return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U);
}
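
/* Editor's note (illustrative, not part of the original source):
 * with snd_cwnd = 100 and beta = 0.5 (32) this yields
 * 100 - (100 * 32 >> 6) = 50, the Reno-style halving; with
 * beta = 0.125 (8) the window only drops to 100 - 12 = 88.
 */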

/* Extract info for TCP socket info provided via netlink. */
static void tcp_illinois_info(struct sock *sk, u32 ext,
			      struct sk_buff *skb)
{
	const struct illinois *ca = inet_csk_ca(sk);

	if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
		struct tcpvegas_info info = {
			.tcpv_enabled = 1,
			.tcpv_rttcnt = ca->cnt_rtt,
			.tcpv_minrtt = ca->base_rtt,
		};
		u64 t = ca->sum_rtt;

		/* avoid dividing by zero before any RTT sample is taken */
		if (ca->cnt_rtt > 0) {
			do_div(t, ca->cnt_rtt);
			info.tcpv_rtt = t;
		}

		nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
	}
}

static struct tcp_congestion_ops tcp_illinois = {
	.flags		= TCP_CONG_RTT_STAMP,
	.init		= tcp_illinois_init,
	.ssthresh	= tcp_illinois_ssthresh,
	.min_cwnd	= tcp_reno_min_cwnd,
	.cong_avoid	= tcp_illinois_cong_avoid,
	.set_state	= tcp_illinois_state,
	.get_info	= tcp_illinois_info,
	.pkts_acked	= tcp_illinois_acked,

	.owner		= THIS_MODULE,
	.name		= "illinois",
};

static int __init tcp_illinois_register(void)
{
	BUILD_BUG_ON(sizeof(struct illinois) > ICSK_CA_PRIV_SIZE);
	return tcp_register_congestion_control(&tcp_illinois);
}

static void __exit tcp_illinois_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_illinois);
}

module_init(tcp_illinois_register);
module_exit(tcp_illinois_unregister);

MODULE_AUTHOR("Stephen Hemminger, Shao Liu");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("TCP Illinois");
MODULE_VERSION("1.0");
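
/* Editor's usage note (illustrative, not part of the original source):
 * once this module is loaded (e.g. "modprobe tcp_illinois") it can be
 * selected system-wide with
 *   sysctl -w net.ipv4.tcp_congestion_control=illinois
 * or per-socket via setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
 * "illinois", 8); the per-connection state is then visible through
 * the inet_diag vegas-info block filled in above.
 */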
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1a14191687ac..051f0f815f17 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -86,6 +86,7 @@ int sysctl_tcp_stdurg __read_mostly;
 int sysctl_tcp_rfc1337 __read_mostly;
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 int sysctl_tcp_frto __read_mostly;
+int sysctl_tcp_frto_response __read_mostly;
 int sysctl_tcp_nometrics_save __read_mostly;
 
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
@@ -100,6 +101,7 @@ int sysctl_tcp_abc __read_mostly;
 #define FLAG_ECE		0x40 /* ECE in this ACK */
 #define FLAG_DATA_LOST		0x80 /* SACK detected data lossage. */
 #define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
+#define FLAG_ONLY_ORIG_SACKED	0x200 /* SACKs only non-rexmit sent before RTO */
 
 #define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
 #define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
@@ -110,6 +112,8 @@ int sysctl_tcp_abc __read_mostly;
 #define IsFack(tp) ((tp)->rx_opt.sack_ok & 2)
 #define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4)
 
+#define IsSackFrto() (sysctl_tcp_frto == 0x2)
+
 #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
 
 /* Adapt the MSS value used to make delayed ack decision to the
@@ -136,7 +140,7 @@ static void tcp_measure_rcv_mss(struct sock *sk,
 	 *
 	 * "len" is invariant segment length, including TCP header.
 	 */
-	len += skb->data - skb->h.raw;
+	len += skb->data - skb_transport_header(skb);
 	if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) ||
 	    /* If PSH is not set, packet should be
 	     * full sized, provided peer TCP is not badly broken.
@@ -144,7 +148,7 @@ static void tcp_measure_rcv_mss(struct sock *sk,
 	     * to handle super-low mtu links fairly.
 	     */
 	    (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
-	     !(tcp_flag_word(skb->h.th)&TCP_REMNANT))) {
+	     !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
 		/* Subtract also invariant (if peer is RFC compliant),
 		 * tcp header plus fixed timestamp option length.
 		 * Resulting "len" is MSS free of SACK jitter.
@@ -231,9 +235,9 @@ static void tcp_fixup_sndbuf(struct sock *sk)
  */
 
 /* Slow part of check#2. */
-static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
-			     const struct sk_buff *skb)
+static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	/* Optimize this! */
 	int truesize = tcp_win_from_space(skb->truesize)/2;
 	int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2;
@@ -248,9 +252,11 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
 	return 0;
 }
 
-static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
+static void tcp_grow_window(struct sock *sk,
 			    struct sk_buff *skb)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+
 	/* Check #1 */
 	if (tp->rcv_ssthresh < tp->window_clamp &&
 	    (int)tp->rcv_ssthresh < tcp_space(sk) &&
@@ -263,7 +269,7 @@ static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
 		if (tcp_win_from_space(skb->truesize) <= skb->len)
 			incr = 2*tp->advmss;
 		else
-			incr = __tcp_grow_window(sk, tp, skb);
+			incr = __tcp_grow_window(sk, skb);
 
 		if (incr) {
 			tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp);
@@ -326,8 +332,9 @@ static void tcp_init_buffer_space(struct sock *sk)
 }
 
 /* 5. Recalculate window clamp after socket hit its memory bounds. */
-static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
+static void tcp_clamp_window(struct sock *sk)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	icsk->icsk_ack.quick = 0;
@@ -499,8 +506,9 @@ new_measure:
  * each ACK we send, he increments snd_cwnd and transmits more of his
  * queue. -DaveM
  */
-static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
+static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 now;
 
@@ -541,7 +549,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
 	TCP_ECN_check_ce(tp, skb);
 
 	if (skb->len >= 128)
-		tcp_grow_window(sk, tp, skb);
+		tcp_grow_window(sk, skb);
 }
 
 /* Called to compute a smoothed rtt estimate. The data fed to this
@@ -574,7 +582,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
 	 * does not matter how to _calculate_ it. Seems, it was trap
 	 * that VJ failed to avoid. 8)
 	 */
-	if(m == 0)
+	if (m == 0)
 		m = 1;
 	if (tp->srtt != 0) {
 		m -= (tp->srtt >> 3);	/* m is now error in rtt est */
@@ -759,15 +767,17 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
 }
 
 /* Set slow start threshold and cwnd not falling to slow start */
-void tcp_enter_cwr(struct sock *sk)
+void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 
 	tp->prior_ssthresh = 0;
 	tp->bytes_acked = 0;
-	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+	if (icsk->icsk_ca_state < TCP_CA_CWR) {
 		tp->undo_marker = 0;
-		tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+		if (set_ssthresh)
+			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
 		tp->snd_cwnd = min(tp->snd_cwnd,
 				   tcp_packets_in_flight(tp) + 1U);
 		tp->snd_cwnd_cnt = 0;
@@ -934,7 +944,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
-	unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked;
+	unsigned char *ptr = (skb_transport_header(ack_skb) +
+			      TCP_SKB_CB(ack_skb)->sacked);
 	struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2);
 	struct sk_buff *cached_skb;
 	int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3;
@@ -1038,7 +1049,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 	cached_skb = tp->fastpath_skb_hint;
 	cached_fack_count = tp->fastpath_cnt_hint;
 	if (!cached_skb) {
-		cached_skb = sk->sk_write_queue.next;
+		cached_skb = tcp_write_queue_head(sk);
 		cached_fack_count = 0;
 	}
 
@@ -1055,10 +1066,13 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 		if (after(end_seq, tp->high_seq))
 			flag |= FLAG_DATA_LOST;
 
-		sk_stream_for_retrans_queue_from(skb, sk) {
+		tcp_for_write_queue_from(skb, sk) {
 			int in_sack, pcount;
 			u8 sacked;
 
+			if (skb == tcp_send_head(sk))
+				break;
+
 			cached_skb = skb;
 			cached_fack_count = fack_count;
 			if (i == first_sack_index) {
@@ -1159,6 +1173,18 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 					/* clear lost hint */
 					tp->retransmit_skb_hint = NULL;
 				}
+				/* SACK enhanced F-RTO detection.
+				 * Set flag if and only if non-rexmitted
+				 * segments below frto_highmark are
+				 * SACKed (RFC4138; Appendix B).
+				 * Clearing correct due to in-order walk
+				 */
+				if (after(end_seq, tp->frto_highmark)) {
+					flag &= ~FLAG_ONLY_ORIG_SACKED;
+				} else {
+					if (!(sacked & TCPCB_RETRANS))
+						flag |= FLAG_ONLY_ORIG_SACKED;
+				}
 			}
 
 			TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
@@ -1195,7 +1221,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 	if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) {
 		struct sk_buff *skb;
 
-		sk_stream_for_retrans_queue(skb, sk) {
+		tcp_for_write_queue(skb, sk) {
+			if (skb == tcp_send_head(sk))
+				break;
 			if (after(TCP_SKB_CB(skb)->seq, lost_retrans))
 				break;
 			if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
@@ -1224,7 +1252,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 
 	tp->left_out = tp->sacked_out + tp->lost_out;
 
-	if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss)
+	if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss &&
+	    (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
 		tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0);
 
 #if FASTRETRANS_DEBUG > 0
@@ -1236,9 +1265,54 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 	return flag;
 }
 
-/* RTO occurred, but do not yet enter loss state. Instead, transmit two new
- * segments to see from the next ACKs whether any data was really missing.
- * If the RTO was spurious, new ACKs should arrive.
+/* F-RTO can only be used if these conditions are satisfied:
+ *  - there must be some unsent new data
+ *  - the advertised window should allow sending it
+ *  - TCP has never retransmitted anything other than head (SACK enhanced
+ *    variant from Appendix B of RFC4138 is more robust here)
+ */
+int tcp_use_frto(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+
+	if (!sysctl_tcp_frto || !tcp_send_head(sk) ||
+	    after(TCP_SKB_CB(tcp_send_head(sk))->end_seq,
+		  tp->snd_una + tp->snd_wnd))
+		return 0;
+
+	if (IsSackFrto())
+		return 1;
+
+	/* Avoid expensive walking of rexmit queue if possible */
+	if (tp->retrans_out > 1)
+		return 0;
+
+	skb = tcp_write_queue_head(sk);
+	skb = tcp_write_queue_next(sk, skb);	/* Skips head */
+	tcp_for_write_queue_from(skb, sk) {
+		if (skb == tcp_send_head(sk))
+			break;
+		if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
+			return 0;
+		/* Short-circuit when first non-SACKed skb has been checked */
+		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED))
+			break;
+	}
+	return 1;
+}
+
+/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
+ * recovery a bit and use heuristics in tcp_process_frto() to detect if
+ * the RTO was spurious. Only clear SACKED_RETRANS of the head here to
+ * keep retrans_out counting accurate (with SACK F-RTO, other than head
+ * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS
+ * bits are handled if the Loss state is really to be entered (in
+ * tcp_enter_frto_loss).
+ *
+ * Do like tcp_enter_loss() would; when RTO expires the second time it
+ * does:
+ *  "Reduce ssthresh if it has not yet been made inside this window."
  */
 void tcp_enter_frto(struct sock *sk)
 {
@@ -1246,39 +1320,69 @@ void tcp_enter_frto(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 
-	tp->frto_counter = 1;
-
-	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
+	if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) ||
 	    tp->snd_una == tp->high_seq ||
-	    (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
+	    ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) &&
+	     !icsk->icsk_retransmits)) {
 		tp->prior_ssthresh = tcp_current_ssthresh(sk);
-		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+		/* Our state is too optimistic in ssthresh() call because cwnd
+		 * is not reduced until tcp_enter_frto_loss() when previous FRTO
+		 * recovery has not yet completed. Pattern would be this: RTO,
+		 * Cumulative ACK, RTO (2xRTO for the same segment does not end
+		 * up here twice).
+		 * RFC4138 should be more specific on what to do, even though
+		 * RTO is quite unlikely to occur after the first Cumulative ACK
+		 * due to back-off and complexity of triggering events ...
+		 */
+		if (tp->frto_counter) {
+			u32 stored_cwnd;
+			stored_cwnd = tp->snd_cwnd;
+			tp->snd_cwnd = 2;
+			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+			tp->snd_cwnd = stored_cwnd;
+		} else {
+			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+		}
+		/* ... in theory, cong.control module could do "any tricks" in
+		 * ssthresh(), which means that ca_state, lost bits and lost_out
+		 * counter would have to be faked before the call occurs. We
+		 * consider that too expensive, unlikely and hacky, so modules
+		 * using these in ssthresh() must deal these incompatibility
+		 * issues if they receives CA_EVENT_FRTO and frto_counter != 0
+		 */
 		tcp_ca_event(sk, CA_EVENT_FRTO);
 	}
 
-	/* Have to clear retransmission markers here to keep the bookkeeping
-	 * in shape, even though we are not yet in Loss state.
-	 * If something was really lost, it is eventually caught up
-	 * in tcp_enter_frto_loss.
-	 */
-	tp->retrans_out = 0;
 	tp->undo_marker = tp->snd_una;
 	tp->undo_retrans = 0;
 
-	sk_stream_for_retrans_queue(skb, sk) {
-		TCP_SKB_CB(skb)->sacked &= ~TCPCB_RETRANS;
+	skb = tcp_write_queue_head(sk);
+	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
+		TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+		tp->retrans_out -= tcp_skb_pcount(skb);
 	}
 	tcp_sync_left_out(tp);
 
-	tcp_set_ca_state(sk, TCP_CA_Open);
-	tp->frto_highmark = tp->snd_nxt;
+	/* Earlier loss recovery underway (see RFC4138; Appendix B).
+	 * The last condition is necessary at least in tp->frto_counter case.
+	 */
+	if (IsSackFrto() && (tp->frto_counter ||
+	    ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) &&
+	    after(tp->high_seq, tp->snd_una)) {
+		tp->frto_highmark = tp->high_seq;
+	} else {
+		tp->frto_highmark = tp->snd_nxt;
+	}
+	tcp_set_ca_state(sk, TCP_CA_Disorder);
+	tp->high_seq = tp->snd_nxt;
+	tp->frto_counter = 1;
 }
 
 /* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
  * which indicates that we should follow the traditional RTO recovery,
  * i.e. mark everything lost and do go-back-N retransmission.
  */
-static void tcp_enter_frto_loss(struct sock *sk)
+static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
@@ -1287,10 +1391,23 @@ static void tcp_enter_frto_loss(struct sock *sk)
 	tp->sacked_out = 0;
 	tp->lost_out = 0;
 	tp->fackets_out = 0;
+	tp->retrans_out = 0;
 
-	sk_stream_for_retrans_queue(skb, sk) {
+	tcp_for_write_queue(skb, sk) {
+		if (skb == tcp_send_head(sk))
+			break;
 		cnt += tcp_skb_pcount(skb);
-		TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
+		/*
+		 * Count the retransmission made on RTO correctly (only when
+		 * waiting for the first ACK and did not get it)...
+		 */
+		if ((tp->frto_counter == 1) && !(flag&FLAG_DATA_ACKED)) {
+			tp->retrans_out += tcp_skb_pcount(skb);
+			/* ...enter this if branch just for the first segment */
+			flag |= FLAG_DATA_ACKED;
+		} else {
+			TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
+		}
 		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
 
 			/* Do not mark those segments lost that were
@@ -1308,7 +1425,7 @@ static void tcp_enter_frto_loss(struct sock *sk)
 	}
 	tcp_sync_left_out(tp);
 
-	tp->snd_cwnd = tp->frto_counter + tcp_packets_in_flight(tp)+1;
+	tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments;
 	tp->snd_cwnd_cnt = 0;
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 	tp->undo_marker = 0;
@@ -1366,7 +1483,9 @@ void tcp_enter_loss(struct sock *sk, int how)
 	if (!how)
 		tp->undo_marker = tp->snd_una;
 
-	sk_stream_for_retrans_queue(skb, sk) {
+	tcp_for_write_queue(skb, sk) {
+		if (skb == tcp_send_head(sk))
+			break;
 		cnt += tcp_skb_pcount(skb);
 		if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
 			tp->undo_marker = 0;
@@ -1401,14 +1520,14 @@ static int tcp_check_sack_reneging(struct sock *sk)
 	 * receiver _host_ is heavily congested (or buggy).
 	 * Do processing similar to RTO timeout.
 	 */
-	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL &&
+	if ((skb = tcp_write_queue_head(sk)) != NULL &&
 	    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
 		struct inet_connection_sock *icsk = inet_csk(sk);
 		NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING);
 
 		tcp_enter_loss(sk, 1);
 		icsk->icsk_retransmits++;
-		tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
+		tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 					  icsk->icsk_rto, TCP_RTO_MAX);
 		return 1;
@@ -1426,10 +1545,12 @@ static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
 	return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto);
 }
 
-static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp)
+static inline int tcp_head_timedout(struct sock *sk)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+
 	return tp->packets_out &&
-	       tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue));
+	       tcp_skb_timedout(sk, tcp_write_queue_head(sk));
 }
 
 /* Linux NewReno/SACK/FACK/ECN state machine.
@@ -1525,10 +1646,15 @@ static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp)
  * Main question: may we further continue forward transmission
  * with the same cwnd?
  */
-static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp)
+static int tcp_time_to_recover(struct sock *sk)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	__u32 packets_out;
 
+	/* Do not perform any recovery during FRTO algorithm */
+	if (tp->frto_counter)
+		return 0;
+
 	/* Trick#1: The loss is proven. */
 	if (tp->lost_out)
 		return 1;
@@ -1540,7 +1666,7 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp)
 	/* Trick#3 : when we use RFC2988 timer restart, fast
 	 * retransmit can be triggered by timeout of queue head.
 	 */
-	if (tcp_head_timedout(sk, tp))
+	if (tcp_head_timedout(sk))
 		return 1;
 
 	/* Trick#4: It is still not OK... But will it be useful to delay
@@ -1549,7 +1675,7 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp)
 	packets_out = tp->packets_out;
 	if (packets_out <= tp->reordering &&
 	    tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
-	    !tcp_may_send_now(sk, tp)) {
+	    !tcp_may_send_now(sk)) {
 		/* We have nothing to send. This connection is limited
 		 * either by receiver window or by application.
 		 */
@@ -1589,8 +1715,10 @@ static void tcp_add_reno_sack(struct sock *sk)
 
 /* Account for ACK, ACKing some data in Reno Recovery phase. */
 
-static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acked)
+static void tcp_remove_reno_sacks(struct sock *sk, int acked)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+
 	if (acked > 0) {
 		/* One ACK acked hole. The rest eat duplicate ACKs. */
 		if (acked-1 >= tp->sacked_out)
@@ -1609,9 +1737,10 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
 }
 
 /* Mark head of queue up as lost. */
-static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
+static void tcp_mark_head_lost(struct sock *sk,
 			       int packets, u32 high_seq)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	int cnt;
 
@@ -1620,11 +1749,13 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
 		skb = tp->lost_skb_hint;
 		cnt = tp->lost_cnt_hint;
 	} else {
-		skb = sk->sk_write_queue.next;
+		skb = tcp_write_queue_head(sk);
 		cnt = 0;
 	}
 
-	sk_stream_for_retrans_queue_from(skb, sk) {
+	tcp_for_write_queue_from(skb, sk) {
+		if (skb == tcp_send_head(sk))
+			break;
 		/* TODO: do this better */
 		/* this is not the most efficient way to do this... */
 		tp->lost_skb_hint = skb;
@@ -1638,12 +1769,11 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
 
 			/* clear xmit_retransmit_queue hints
 			 *  if this is beyond hint */
-			if(tp->retransmit_skb_hint != NULL &&
+			if (tp->retransmit_skb_hint != NULL &&
 			   before(TCP_SKB_CB(skb)->seq,
-			   TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) {
-
+				  TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
 				tp->retransmit_skb_hint = NULL;
-			}
+
 		}
 	}
 	tcp_sync_left_out(tp);
@@ -1651,15 +1781,17 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
 
 /* Account newly detected lost packet(s) */
 
-static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
+static void tcp_update_scoreboard(struct sock *sk)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+
 	if (IsFack(tp)) {
 		int lost = tp->fackets_out - tp->reordering;
 		if (lost <= 0)
 			lost = 1;
-		tcp_mark_head_lost(sk, tp, lost, tp->high_seq);
+		tcp_mark_head_lost(sk, lost, tp->high_seq);
 	} else {
-		tcp_mark_head_lost(sk, tp, 1, tp->high_seq);
+		tcp_mark_head_lost(sk, 1, tp->high_seq);
 	}
 
 	/* New heuristics: it is possible only after we switched
@@ -1667,13 +1799,15 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
 	 * Hence, we can detect timed out packets during fast
 	 * retransmit without falling to slow start.
 	 */
-	if (!IsReno(tp) && tcp_head_timedout(sk, tp)) {
+	if (!IsReno(tp) && tcp_head_timedout(sk)) {
 		struct sk_buff *skb;
 
 		skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
-			: sk->sk_write_queue.next;
+			: tcp_write_queue_head(sk);
 
-		sk_stream_for_retrans_queue_from(skb, sk) {
+		tcp_for_write_queue_from(skb, sk) {
+			if (skb == tcp_send_head(sk))
+				break;
 			if (!tcp_skb_timedout(sk, skb))
 				break;
 
@@ -1745,9 +1879,11 @@ static inline int tcp_packet_delayed(struct tcp_sock *tp)
 /* Undo procedures. */
 
 #if FASTRETRANS_DEBUG > 1
-static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
+static void DBGUNDO(struct sock *sk, const char *msg)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_sock *inet = inet_sk(sk);
+
 	printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n",
 	       msg,
 	       NIPQUAD(inet->daddr), ntohs(inet->dport),
@@ -1793,13 +1929,15 @@ static inline int tcp_may_undo(struct tcp_sock *tp)
 }
 
 /* People celebrate: "We love our President!" */
-static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
+static int tcp_try_undo_recovery(struct sock *sk)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+
 	if (tcp_may_undo(tp)) {
 		/* Happy end! We did not retransmit anything
 		 * or our original transmission succeeded.
 		 */
-		DBGUNDO(sk, tp, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
+		DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
 		tcp_undo_cwr(sk, 1);
 		if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
 			NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
@@ -1819,10 +1957,12 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
 }
 
 /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
-static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp)
+static void tcp_try_undo_dsack(struct sock *sk)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+
 	if (tp->undo_marker && !tp->undo_retrans) {
-		DBGUNDO(sk, tp, "D-SACK");
+		DBGUNDO(sk, "D-SACK");
 		tcp_undo_cwr(sk, 1);
 		tp->undo_marker = 0;
 		NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO);
@@ -1831,9 +1971,9 @@ static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp)
 
 /* Undo during fast recovery after partial ACK. */
 
-static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp,
-				int acked)
+static int tcp_try_undo_partial(struct sock *sk, int acked)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	/* Partial ACK arrived. Force Hoe's retransmit. */
 	int failed = IsReno(tp) || tp->fackets_out>tp->reordering;
 
@@ -1846,7 +1986,7 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp,
 
 		tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
 
-		DBGUNDO(sk, tp, "Hoe");
+		DBGUNDO(sk, "Hoe");
 		tcp_undo_cwr(sk, 0);
 		NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO);
 
@@ -1860,17 +2000,21 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp,
 }
 
 /* Undo during loss recovery after partial ACK. */
-static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
+static int tcp_try_undo_loss(struct sock *sk)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+
 	if (tcp_may_undo(tp)) {
 		struct sk_buff *skb;
-		sk_stream_for_retrans_queue(skb, sk) {
+		tcp_for_write_queue(skb, sk) {
+			if (skb == tcp_send_head(sk))
+				break;
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
 		}
 
 		clear_all_retrans_hints(tp);
 
-		DBGUNDO(sk, tp, "partial loss");
+		DBGUNDO(sk, "partial loss");
 		tp->lost_out = 0;
 		tp->left_out = tp->sacked_out;
 		tcp_undo_cwr(sk, 1);
@@ -1892,15 +2036,17 @@ static inline void tcp_complete_cwr(struct sock *sk)
 	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
 }
 
-static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
+static void tcp_try_to_open(struct sock *sk, int flag)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+
 	tp->left_out = tp->sacked_out;
 
 	if (tp->retrans_out == 0)
 		tp->retrans_stamp = 0;
 
 	if (flag&FLAG_ECE)
-		tcp_enter_cwr(sk);
+		tcp_enter_cwr(sk, 1);
 
 	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
 		int state = TCP_CA_Open;
@@ -1987,7 +2133,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 	    before(tp->snd_una, tp->high_seq) &&
 	    icsk->icsk_ca_state != TCP_CA_Open &&
 	    tp->fackets_out > tp->reordering) {
-		tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq);
+		tcp_mark_head_lost(sk, tp->fackets_out-tp->reordering, tp->high_seq);
 		NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
 	}
 
@@ -1997,14 +2143,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 	/* E. Check state exit conditions. State can be terminated
 	 *    when high_seq is ACKed. */
 	if (icsk->icsk_ca_state == TCP_CA_Open) {
-		if (!sysctl_tcp_frto)
-			BUG_TRAP(tp->retrans_out == 0);
+		BUG_TRAP(tp->retrans_out == 0);
 		tp->retrans_stamp = 0;
 	} else if (!before(tp->snd_una, tp->high_seq)) {
 		switch (icsk->icsk_ca_state) {
 		case TCP_CA_Loss:
 			icsk->icsk_retransmits = 0;
-			if (tcp_try_undo_recovery(sk, tp))
+			if (tcp_try_undo_recovery(sk))
 				return;
 			break;
 
@@ -2018,7 +2163,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 			break;
 
 		case TCP_CA_Disorder:
-			tcp_try_undo_dsack(sk, tp);
+			tcp_try_undo_dsack(sk);
 			if (!tp->undo_marker ||
 			    /* For SACK case do not Open to allow to undo
 			     * catching for all duplicate ACKs. */
@@ -2031,7 +2176,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 		case TCP_CA_Recovery:
 			if (IsReno(tp))
 				tcp_reset_reno_sack(tp);
-			if (tcp_try_undo_recovery(sk, tp))
+			if (tcp_try_undo_recovery(sk))
 				return;
 			tcp_complete_cwr(sk);
 			break;
@@ -2047,14 +2192,14 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 		} else {
 			int acked = prior_packets - tp->packets_out;
 			if (IsReno(tp))
-				tcp_remove_reno_sacks(sk, tp, acked);
-			is_dupack = tcp_try_undo_partial(sk, tp, acked);
+				tcp_remove_reno_sacks(sk, acked);
+			is_dupack = tcp_try_undo_partial(sk, acked);
 		}
 		break;
 	case TCP_CA_Loss:
 		if (flag&FLAG_DATA_ACKED)
 			icsk->icsk_retransmits = 0;
-		if (!tcp_try_undo_loss(sk, tp)) {
+		if (!tcp_try_undo_loss(sk)) {
 			tcp_moderate_cwnd(tp);
 			tcp_xmit_retransmit_queue(sk);
 			return;
@@ -2071,10 +2216,10 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 	}
 
 	if (icsk->icsk_ca_state == TCP_CA_Disorder)
-		tcp_try_undo_dsack(sk, tp);
+		tcp_try_undo_dsack(sk);
 
-	if (!tcp_time_to_recover(sk, tp)) {
-		tcp_try_to_open(sk, tp, flag);
+	if (!tcp_time_to_recover(sk)) {
+		tcp_try_to_open(sk, flag);
 		return;
 	}
 
@@ -2113,8 +2258,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 		tcp_set_ca_state(sk, TCP_CA_Recovery);
 	}
 
-	if (is_dupack || tcp_head_timedout(sk, tp))
-		tcp_update_scoreboard(sk, tp);
+	if (is_dupack || tcp_head_timedout(sk))
+		tcp_update_scoreboard(sk);
 	tcp_cwnd_down(sk);
 	tcp_xmit_retransmit_queue(sk);
 }
@@ -2190,8 +2335,10 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
  * RFC2988 recommends to restart timer to now+rto.
  */
 
-static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
+static void tcp_ack_packets_out(struct sock *sk)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+
 	if (!tp->packets_out) {
 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
 	} else {
@@ -2255,14 +2402,6 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
 	return acked;
 }
 
-static u32 tcp_usrtt(struct timeval *tv)
-{
-	struct timeval now;
-
-	do_gettimeofday(&now);
-	return (now.tv_sec - tv->tv_sec) * 1000000 + (now.tv_usec - tv->tv_usec);
-}
-
 /* Remove acknowledged frames from the retransmission queue. */
 static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 {
@@ -2273,12 +2412,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 	int acked = 0;
 	__s32 seq_rtt = -1;
 	u32 pkts_acked = 0;
-	void (*rtt_sample)(struct sock *sk, u32 usrtt)
-		= icsk->icsk_ca_ops->rtt_sample;
-	struct timeval tv = { .tv_sec = 0, .tv_usec = 0 };
+	ktime_t last_ackt = ktime_set(0,0);
 
-	while ((skb = skb_peek(&sk->sk_write_queue)) &&
-	       skb != sk->sk_send_head) {
+	while ((skb = tcp_write_queue_head(sk)) &&
+	       skb != tcp_send_head(sk)) {
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
 		__u8 sacked = scb->sacked;
 
@@ -2318,13 +2455,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 
 		if (sacked) {
 			if (sacked & TCPCB_RETRANS) {
-				if(sacked & TCPCB_SACKED_RETRANS)
+				if (sacked & TCPCB_SACKED_RETRANS)
 					tp->retrans_out -= tcp_skb_pcount(skb);
 				acked |= FLAG_RETRANS_DATA_ACKED;
 				seq_rtt = -1;
 			} else if (seq_rtt < 0) {
 				seq_rtt = now - scb->when;
-				skb_get_timestamp(skb, &tv);
+				last_ackt = skb->tstamp;
 			}
 			if (sacked & TCPCB_SACKED_ACKED)
 				tp->sacked_out -= tcp_skb_pcount(skb);
@@ -2337,23 +2474,24 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 			}
 		} else if (seq_rtt < 0) {
 			seq_rtt = now - scb->when;
-			skb_get_timestamp(skb, &tv);
+			last_ackt = skb->tstamp;
 		}
 		tcp_dec_pcount_approx(&tp->fackets_out, skb);
 		tcp_packets_out_dec(tp, skb);
-		__skb_unlink(skb, &sk->sk_write_queue);
+		tcp_unlink_write_queue(skb, sk);
 		sk_stream_free_skb(sk, skb);
 		clear_all_retrans_hints(tp);
 	}
 
 	if (acked&FLAG_ACKED) {
+		const struct tcp_congestion_ops *ca_ops
+			= inet_csk(sk)->icsk_ca_ops;
+
 		tcp_ack_update_rtt(sk, acked, seq_rtt);
-		tcp_ack_packets_out(sk, tp);
-		if (rtt_sample && !(acked & FLAG_RETRANS_DATA_ACKED))
-			(*rtt_sample)(sk, tcp_usrtt(&tv));
+		tcp_ack_packets_out(sk);
 
-		if (icsk->icsk_ca_ops->pkts_acked)
-			icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked);
+		if (ca_ops->pkts_acked)
+			ca_ops->pkts_acked(sk, pkts_acked, last_ackt);
 	}
 
 #if FASTRETRANS_DEBUG > 0
@@ -2390,7 +2528,7 @@ static void tcp_ack_probe(struct sock *sk)
 
 	/* Was it a usable window open? */
 
-	if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq,
+	if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq,
 		   tp->snd_una + tp->snd_wnd)) {
 		icsk->icsk_backoff = 0;
 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
@@ -2433,13 +2571,14 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack
 * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
 * and in FreeBSD. NetBSD's one is even worse.) is wrong.
 */
-static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
-				 struct sk_buff *skb, u32 ack, u32 ack_seq)
+static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack,
+				 u32 ack_seq)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	int flag = 0;
-	u32 nwin = ntohs(skb->h.th->window);
+	u32 nwin = ntohs(tcp_hdr(skb)->window);
 
-	if (likely(!skb->h.th->syn))
+	if (likely(!tcp_hdr(skb)->syn))
 		nwin <<= tp->rx_opt.snd_wscale;
 
 	if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
@@ -2453,7 +2592,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
 		 * fast path is recovered for sending TCP.
 		 */
 		tp->pred_flags = 0;
-		tcp_fast_path_check(sk, tp);
+		tcp_fast_path_check(sk);
 
 		if (nwin > tp->max_window) {
 			tp->max_window = nwin;
@@ -2467,39 +2606,128 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
 	return flag;
 }
 
-static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
+/* A very conservative spurious RTO response algorithm: reduce cwnd and
+ * continue in congestion avoidance.
+ */
+static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
+{
+	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+	tp->snd_cwnd_cnt = 0;
+	tcp_moderate_cwnd(tp);
+}
+
+/* A conservative spurious RTO response algorithm: reduce cwnd using
+ * rate halving and continue in congestion avoidance.
+ */
+static void tcp_ratehalving_spur_to_response(struct sock *sk)
+{
+	tcp_enter_cwr(sk, 0);
+}
+
+static void tcp_undo_spur_to_response(struct sock *sk, int flag)
+{
+	if (flag&FLAG_ECE)
+		tcp_ratehalving_spur_to_response(sk);
+	else
+		tcp_undo_cwr(sk, 1);
+}
+
+/* F-RTO spurious RTO detection algorithm (RFC4138)
+ *
+ * F-RTO affects during two new ACKs following RTO (well, almost, see inline
+ * comments). State (ACK number) is kept in frto_counter. When ACK advances
+ * window (but not to or beyond highest sequence sent before RTO):
+ *   On First ACK,  send two new segments out.
+ *   On Second ACK, RTO was likely spurious. Do spurious response (response
+ *                  algorithm is not part of the F-RTO detection algorithm
+ *                  given in RFC4138 but can be selected separately).
+ * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss
+ * and TCP falls back to conventional RTO recovery.
+ *
+ * Rationale: if the RTO was spurious, new ACKs should arrive from the
+ * original window even after we transmit two new data segments.
+ *
+ * SACK version:
+ *   on first step, wait until first cumulative ACK arrives, then move to
+ *   the second step. In second step, the next ACK decides.
+ *
+ * F-RTO is implemented (mainly) in four functions:
+ *   - tcp_use_frto() is used to determine if TCP is can use F-RTO
+ *   - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is
+ *     called when tcp_use_frto() showed green light
+ *   - tcp_process_frto() handles incoming ACKs during F-RTO algorithm
+ *   - tcp_enter_frto_loss() is called if there is not enough evidence
+ *     to prove that the RTO is indeed spurious. It transfers the control
+ *     from F-RTO to the conventional RTO recovery
+ */
+static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	tcp_sync_left_out(tp);
 
-	if (tp->snd_una == prior_snd_una ||
-	    !before(tp->snd_una, tp->frto_highmark)) {
-		/* RTO was caused by loss, start retransmitting in
-		 * go-back-N slow start
-		 */
-		tcp_enter_frto_loss(sk);
-		return;
+	/* Duplicate the behavior from Loss state (fastretrans_alert) */
+	if (flag&FLAG_DATA_ACKED)
+		inet_csk(sk)->icsk_retransmits = 0;
+
+	if (!before(tp->snd_una, tp->frto_highmark)) {
+		tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag);
+		return 1;
 	}
 
-	if (tp->frto_counter == 1) {
-		/* First ACK after RTO advances the window: allow two new
-		 * segments out.
+	if (!IsSackFrto() || IsReno(tp)) {
+		/* RFC4138 shortcoming in step 2; should also have case c):
+		 * ACK isn't duplicate nor advances window, e.g., opposite dir
+		 * data, winupdate
 		 */
-		tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
+		if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) &&
+		    !(flag&FLAG_FORWARD_PROGRESS))
+			return 1;
+
+		if (!(flag&FLAG_DATA_ACKED)) {
+			tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
+					    flag);
+			return 1;
+		}
 	} else {
-		/* Also the second ACK after RTO advances the window.
-		 * The RTO was likely spurious. Reduce cwnd and continue
-		 * in congestion avoidance
-		 */
-		tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
-		tcp_moderate_cwnd(tp);
+		if (!(flag&FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
+			/* Prevent sending of new data. */
+			tp->snd_cwnd = min(tp->snd_cwnd,
+					   tcp_packets_in_flight(tp));
+			return 1;
+		}
+
+		if ((tp->frto_counter == 2) &&
+		    (!(flag&FLAG_FORWARD_PROGRESS) ||
+		     ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) {
+			/* RFC4138 shortcoming (see comment above) */
+			if (!(flag&FLAG_FORWARD_PROGRESS) && (flag&FLAG_NOT_DUP))
+				return 1;
+
+			tcp_enter_frto_loss(sk, 3, flag);
+			return 1;
+		}
 	}
 
-	/* F-RTO affects on two new ACKs following RTO.
-	 * At latest on third ACK the TCP behavior is back to normal.
-	 */
-	tp->frto_counter = (tp->frto_counter + 1) % 3;
+	if (tp->frto_counter == 1) {
+		tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
+		tp->frto_counter = 2;
+		return 1;
+	} else /* frto_counter == 2 */ {
+		switch (sysctl_tcp_frto_response) {
+		case 2:
+			tcp_undo_spur_to_response(sk, flag);
+			break;
+		case 1:
+			tcp_conservative_spur_to_response(tp);
+			break;
+		default:
+			tcp_ratehalving_spur_to_response(sk);
+			break;
+		}
+		tp->frto_counter = 0;
+	}
+	return 0;
 }
 
 /* This routine deals with incoming acks, but not outgoing ones. */
@@ -2513,6 +2741,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2513 u32 prior_in_flight; 2741 u32 prior_in_flight;
2514 s32 seq_rtt; 2742 s32 seq_rtt;
2515 int prior_packets; 2743 int prior_packets;
2744 int frto_cwnd = 0;
2516 2745
2517 /* If the ack is newer than sent or older than previous acks 2746 /* If the ack is newer than sent or older than previous acks
2518 * then we can probably ignore it. 2747 * then we can probably ignore it.
@@ -2549,12 +2778,12 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2549 else 2778 else
2550 NET_INC_STATS_BH(LINUX_MIB_TCPPUREACKS); 2779 NET_INC_STATS_BH(LINUX_MIB_TCPPUREACKS);
2551 2780
2552 flag |= tcp_ack_update_window(sk, tp, skb, ack, ack_seq); 2781 flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
2553 2782
2554 if (TCP_SKB_CB(skb)->sacked) 2783 if (TCP_SKB_CB(skb)->sacked)
2555 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); 2784 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
2556 2785
2557 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) 2786 if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
2558 flag |= FLAG_ECE; 2787 flag |= FLAG_ECE;
2559 2788
2560 tcp_ca_event(sk, CA_EVENT_SLOW_ACK); 2789 tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
@@ -2575,15 +2804,16 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2575 flag |= tcp_clean_rtx_queue(sk, &seq_rtt); 2804 flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
2576 2805
2577 if (tp->frto_counter) 2806 if (tp->frto_counter)
2578 tcp_process_frto(sk, prior_snd_una); 2807 frto_cwnd = tcp_process_frto(sk, prior_snd_una, flag);
2579 2808
2580 if (tcp_ack_is_dubious(sk, flag)) { 2809 if (tcp_ack_is_dubious(sk, flag)) {
2581 /* Advance CWND, if state allows this. */ 2810 /* Advance CWND, if state allows this. */
2582 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) 2811 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd &&
2812 tcp_may_raise_cwnd(sk, flag))
2583 tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); 2813 tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0);
2584 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); 2814 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
2585 } else { 2815 } else {
2586 if ((flag & FLAG_DATA_ACKED)) 2816 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
2587 tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1); 2817 tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1);
2588 } 2818 }
2589 2819
@@ -2599,7 +2829,7 @@ no_queue:
2599 * being used to time the probes, and is probably far higher than 2829 * being used to time the probes, and is probably far higher than
2600 * it needs to be for normal retransmission. 2830 * it needs to be for normal retransmission.
2601 */ 2831 */
2602 if (sk->sk_send_head) 2832 if (tcp_send_head(sk))
2603 tcp_ack_probe(sk); 2833 tcp_ack_probe(sk);
2604 return 1; 2834 return 1;
2605 2835
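The F-RTO tests in tcp_process_frto() above lean on composite ACK flags. For orientation, the composites combine the primitive bits roughly as follows (paraphrased from the tcp_input.c definitions of this era; the individual bit values are omitted):

#define FLAG_ACKED		(FLAG_DATA_ACKED | FLAG_SYN_ACKED)
#define FLAG_NOT_DUP		(FLAG_DATA | FLAG_WIN_UPDATE | FLAG_ACKED)
#define FLAG_FORWARD_PROGRESS	(FLAG_DATA_ACKED | FLAG_DATA_SACKED)

FLAG_NOT_DUP therefore covers any ACK that carried data, moved the window, or acknowledged new data, which is the "not a duplicate" sense used in the RFC4138 step 2 workaround.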
@@ -2620,13 +2850,13 @@ uninteresting_ack:
2620void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab) 2850void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab)
2621{ 2851{
2622 unsigned char *ptr; 2852 unsigned char *ptr;
2623 struct tcphdr *th = skb->h.th; 2853 struct tcphdr *th = tcp_hdr(skb);
2624 int length=(th->doff*4)-sizeof(struct tcphdr); 2854 int length=(th->doff*4)-sizeof(struct tcphdr);
2625 2855
2626 ptr = (unsigned char *)(th + 1); 2856 ptr = (unsigned char *)(th + 1);
2627 opt_rx->saw_tstamp = 0; 2857 opt_rx->saw_tstamp = 0;
2628 2858
2629 while(length>0) { 2859 while (length > 0) {
2630 int opcode=*ptr++; 2860 int opcode=*ptr++;
2631 int opsize; 2861 int opsize;
2632 2862
@@ -2642,9 +2872,9 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
2642 return; 2872 return;
2643 if (opsize > length) 2873 if (opsize > length)
2644 return; /* don't parse partial options */ 2874 return; /* don't parse partial options */
2645 switch(opcode) { 2875 switch (opcode) {
2646 case TCPOPT_MSS: 2876 case TCPOPT_MSS:
2647 if(opsize==TCPOLEN_MSS && th->syn && !estab) { 2877 if (opsize==TCPOLEN_MSS && th->syn && !estab) {
2648 u16 in_mss = ntohs(get_unaligned((__be16 *)ptr)); 2878 u16 in_mss = ntohs(get_unaligned((__be16 *)ptr));
2649 if (in_mss) { 2879 if (in_mss) {
2650 if (opt_rx->user_mss && opt_rx->user_mss < in_mss) 2880 if (opt_rx->user_mss && opt_rx->user_mss < in_mss)
@@ -2654,12 +2884,12 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
2654 } 2884 }
2655 break; 2885 break;
2656 case TCPOPT_WINDOW: 2886 case TCPOPT_WINDOW:
2657 if(opsize==TCPOLEN_WINDOW && th->syn && !estab) 2887 if (opsize==TCPOLEN_WINDOW && th->syn && !estab)
2658 if (sysctl_tcp_window_scaling) { 2888 if (sysctl_tcp_window_scaling) {
2659 __u8 snd_wscale = *(__u8 *) ptr; 2889 __u8 snd_wscale = *(__u8 *) ptr;
2660 opt_rx->wscale_ok = 1; 2890 opt_rx->wscale_ok = 1;
2661 if (snd_wscale > 14) { 2891 if (snd_wscale > 14) {
2662 if(net_ratelimit()) 2892 if (net_ratelimit())
2663 printk(KERN_INFO "tcp_parse_options: Illegal window " 2893 printk(KERN_INFO "tcp_parse_options: Illegal window "
2664 "scaling value %d >14 received.\n", 2894 "scaling value %d >14 received.\n",
2665 snd_wscale); 2895 snd_wscale);
@@ -2669,7 +2899,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
2669 } 2899 }
2670 break; 2900 break;
2671 case TCPOPT_TIMESTAMP: 2901 case TCPOPT_TIMESTAMP:
2672 if(opsize==TCPOLEN_TIMESTAMP) { 2902 if (opsize==TCPOLEN_TIMESTAMP) {
2673 if ((estab && opt_rx->tstamp_ok) || 2903 if ((estab && opt_rx->tstamp_ok) ||
2674 (!estab && sysctl_tcp_timestamps)) { 2904 (!estab && sysctl_tcp_timestamps)) {
2675 opt_rx->saw_tstamp = 1; 2905 opt_rx->saw_tstamp = 1;
@@ -2679,7 +2909,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
2679 } 2909 }
2680 break; 2910 break;
2681 case TCPOPT_SACK_PERM: 2911 case TCPOPT_SACK_PERM:
2682 if(opsize==TCPOLEN_SACK_PERM && th->syn && !estab) { 2912 if (opsize==TCPOLEN_SACK_PERM && th->syn && !estab) {
2683 if (sysctl_tcp_sack) { 2913 if (sysctl_tcp_sack) {
2684 opt_rx->sack_ok = 1; 2914 opt_rx->sack_ok = 1;
2685 tcp_sack_reset(opt_rx); 2915 tcp_sack_reset(opt_rx);
@@ -2688,7 +2918,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
2688 break; 2918 break;
2689 2919
2690 case TCPOPT_SACK: 2920 case TCPOPT_SACK:
2691 if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && 2921 if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
2692 !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) && 2922 !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
2693 opt_rx->sack_ok) { 2923 opt_rx->sack_ok) {
2694 TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; 2924 TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
@@ -2701,10 +2931,11 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
2701 */ 2931 */
2702 break; 2932 break;
2703#endif 2933#endif
2704 }; 2934 }
2935
2705 ptr+=opsize-2; 2936 ptr+=opsize-2;
2706 length-=opsize; 2937 length-=opsize;
2707 }; 2938 }
2708 } 2939 }
2709} 2940}
2710 2941
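tcp_parse_options() fills in a caller-supplied tcp_options_received. A hypothetical caller handling a SYN would look roughly like this (sketch only, not part of this patch):

	struct tcp_options_received tmp_opt;

	memset(&tmp_opt, 0, sizeof(tmp_opt));
	tcp_parse_options(skb, &tmp_opt, 0);	/* estab == 0: SYN-only options accepted */
	if (tmp_opt.saw_tstamp) {
		/* tmp_opt.rcv_tsval / tmp_opt.rcv_tsecr now hold the peer's timestamps */
	}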
@@ -2737,7 +2968,7 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
2737static inline void tcp_store_ts_recent(struct tcp_sock *tp) 2968static inline void tcp_store_ts_recent(struct tcp_sock *tp)
2738{ 2969{
2739 tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; 2970 tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
2740 tp->rx_opt.ts_recent_stamp = xtime.tv_sec; 2971 tp->rx_opt.ts_recent_stamp = get_seconds();
2741} 2972}
2742 2973
2743static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) 2974static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
@@ -2750,8 +2981,8 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
2750 * Not only that; it also occurs for expired timestamps. 2981
2751 */ 2982 */
2752 2983
2753 if((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 || 2984 if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 ||
2754 xtime.tv_sec >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS) 2985 get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS)
2755 tcp_store_ts_recent(tp); 2986 tcp_store_ts_recent(tp);
2756 } 2987 }
2757} 2988}
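The xtime.tv_sec to get_seconds() substitutions in this series are mechanical: get_seconds() returns the same wall-clock seconds through the sanctioned accessor instead of reading the xtime variable directly, so the PAWS arithmetic is unchanged. The 24-day staleness window, for example, still reads in effect:

	if (get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS)
		tcp_store_ts_recent(tp);	/* cached peer timestamp too old to trust */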
@@ -2782,7 +3013,7 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
2782static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) 3013static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
2783{ 3014{
2784 struct tcp_sock *tp = tcp_sk(sk); 3015 struct tcp_sock *tp = tcp_sk(sk);
2785 struct tcphdr *th = skb->h.th; 3016 struct tcphdr *th = tcp_hdr(skb);
2786 u32 seq = TCP_SKB_CB(skb)->seq; 3017 u32 seq = TCP_SKB_CB(skb)->seq;
2787 u32 ack = TCP_SKB_CB(skb)->ack_seq; 3018 u32 ack = TCP_SKB_CB(skb)->ack_seq;
2788 3019
@@ -2803,7 +3034,7 @@ static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *
2803{ 3034{
2804 const struct tcp_sock *tp = tcp_sk(sk); 3035 const struct tcp_sock *tp = tcp_sk(sk);
2805 return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && 3036 return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
2806 xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && 3037 get_seconds() < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
2807 !tcp_disordered_ack(sk, skb)); 3038 !tcp_disordered_ack(sk, skb));
2808} 3039}
2809 3040
@@ -2910,7 +3141,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
2910 printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n", 3141 printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n",
2911 __FUNCTION__, sk->sk_state); 3142 __FUNCTION__, sk->sk_state);
2912 break; 3143 break;
2913 }; 3144 }
2914 3145
2915 /* It _is_ possible, that we have something out-of-order _after_ FIN. 3146 /* It _is_ possible, that we have something out-of-order _after_ FIN.
2916 * Probably, we should reset in this case. For now drop them. 3147 * Probably, we should reset in this case. For now drop them.
@@ -3009,7 +3240,7 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
3009 */ 3240 */
3010 tp->rx_opt.num_sacks--; 3241 tp->rx_opt.num_sacks--;
3011 tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok); 3242 tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
3012 for(i=this_sack; i < tp->rx_opt.num_sacks; i++) 3243 for (i=this_sack; i < tp->rx_opt.num_sacks; i++)
3013 sp[i] = sp[i+1]; 3244 sp[i] = sp[i+1];
3014 continue; 3245 continue;
3015 } 3246 }
@@ -3062,7 +3293,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
3062 tp->rx_opt.num_sacks--; 3293 tp->rx_opt.num_sacks--;
3063 sp--; 3294 sp--;
3064 } 3295 }
3065 for(; this_sack > 0; this_sack--, sp--) 3296 for (; this_sack > 0; this_sack--, sp--)
3066 *sp = *(sp-1); 3297 *sp = *(sp-1);
3067 3298
3068new_sack: 3299new_sack:
@@ -3088,7 +3319,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
3088 return; 3319 return;
3089 } 3320 }
3090 3321
3091 for(this_sack = 0; this_sack < num_sacks; ) { 3322 for (this_sack = 0; this_sack < num_sacks; ) {
3092 /* Check if the start of the sack is covered by RCV.NXT. */ 3323 /* Check if the start of the sack is covered by RCV.NXT. */
3093 if (!before(tp->rcv_nxt, sp->start_seq)) { 3324 if (!before(tp->rcv_nxt, sp->start_seq)) {
3094 int i; 3325 int i;
@@ -3144,8 +3375,8 @@ static void tcp_ofo_queue(struct sock *sk)
3144 __skb_unlink(skb, &tp->out_of_order_queue); 3375 __skb_unlink(skb, &tp->out_of_order_queue);
3145 __skb_queue_tail(&sk->sk_receive_queue, skb); 3376 __skb_queue_tail(&sk->sk_receive_queue, skb);
3146 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 3377 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
3147 if(skb->h.th->fin) 3378 if (tcp_hdr(skb)->fin)
3148 tcp_fin(skb, sk, skb->h.th); 3379 tcp_fin(skb, sk, tcp_hdr(skb));
3149 } 3380 }
3150} 3381}
3151 3382
@@ -3153,7 +3384,7 @@ static int tcp_prune_queue(struct sock *sk);
3153 3384
3154static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) 3385static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
3155{ 3386{
3156 struct tcphdr *th = skb->h.th; 3387 struct tcphdr *th = tcp_hdr(skb);
3157 struct tcp_sock *tp = tcp_sk(sk); 3388 struct tcp_sock *tp = tcp_sk(sk);
3158 int eaten = -1; 3389 int eaten = -1;
3159 3390
@@ -3210,9 +3441,9 @@ queue_and_out:
3210 __skb_queue_tail(&sk->sk_receive_queue, skb); 3441 __skb_queue_tail(&sk->sk_receive_queue, skb);
3211 } 3442 }
3212 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 3443 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
3213 if(skb->len) 3444 if (skb->len)
3214 tcp_event_data_recv(sk, tp, skb); 3445 tcp_event_data_recv(sk, skb);
3215 if(th->fin) 3446 if (th->fin)
3216 tcp_fin(skb, sk, th); 3447 tcp_fin(skb, sk, th);
3217 3448
3218 if (!skb_queue_empty(&tp->out_of_order_queue)) { 3449 if (!skb_queue_empty(&tp->out_of_order_queue)) {
@@ -3228,7 +3459,7 @@ queue_and_out:
3228 if (tp->rx_opt.num_sacks) 3459 if (tp->rx_opt.num_sacks)
3229 tcp_sack_remove(tp); 3460 tcp_sack_remove(tp);
3230 3461
3231 tcp_fast_path_check(sk, tp); 3462 tcp_fast_path_check(sk);
3232 3463
3233 if (eaten > 0) 3464 if (eaten > 0)
3234 __kfree_skb(skb); 3465 __kfree_skb(skb);
@@ -3392,7 +3623,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
3392 * - bloated or contains data before "start" or 3623 * - bloated or contains data before "start" or
3393 * overlaps to the next one. 3624 * overlaps to the next one.
3394 */ 3625 */
3395 if (!skb->h.th->syn && !skb->h.th->fin && 3626 if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin &&
3396 (tcp_win_from_space(skb->truesize) > skb->len || 3627 (tcp_win_from_space(skb->truesize) > skb->len ||
3397 before(TCP_SKB_CB(skb)->seq, start) || 3628 before(TCP_SKB_CB(skb)->seq, start) ||
3398 (skb->next != tail && 3629 (skb->next != tail &&
@@ -3403,7 +3634,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
3403 start = TCP_SKB_CB(skb)->end_seq; 3634 start = TCP_SKB_CB(skb)->end_seq;
3404 skb = skb->next; 3635 skb = skb->next;
3405 } 3636 }
3406 if (skb == tail || skb->h.th->syn || skb->h.th->fin) 3637 if (skb == tail || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin)
3407 return; 3638 return;
3408 3639
3409 while (before(start, end)) { 3640 while (before(start, end)) {
@@ -3419,11 +3650,14 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
3419 nskb = alloc_skb(copy+header, GFP_ATOMIC); 3650 nskb = alloc_skb(copy+header, GFP_ATOMIC);
3420 if (!nskb) 3651 if (!nskb)
3421 return; 3652 return;
3653
3654 skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head);
3655 skb_set_network_header(nskb, (skb_network_header(skb) -
3656 skb->head));
3657 skb_set_transport_header(nskb, (skb_transport_header(skb) -
3658 skb->head));
3422 skb_reserve(nskb, header); 3659 skb_reserve(nskb, header);
3423 memcpy(nskb->head, skb->head, header); 3660 memcpy(nskb->head, skb->head, header);
3424 nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head);
3425 nskb->h.raw = nskb->head + (skb->h.raw-skb->head);
3426 nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head);
3427 memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); 3661 memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
3428 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; 3662 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
3429 __skb_insert(nskb, skb->prev, skb, list); 3663 __skb_insert(nskb, skb->prev, skb, list);
@@ -3449,7 +3683,9 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
3449 __kfree_skb(skb); 3683 __kfree_skb(skb);
3450 NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); 3684 NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
3451 skb = next; 3685 skb = next;
3452 if (skb == tail || skb->h.th->syn || skb->h.th->fin) 3686 if (skb == tail ||
3687 tcp_hdr(skb)->syn ||
3688 tcp_hdr(skb)->fin)
3453 return; 3689 return;
3454 } 3690 }
3455 } 3691 }
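The header bookkeeping added to tcp_collapse() above is part of the tree-wide switch from raw header pointers (skb->nh.raw, skb->h.raw, skb->mac.raw) to offset-based setters. Both spellings record the same distance from skb->head; side by side:

	/* old: store an absolute pointer into the new buffer */
	nskb->nh.raw = nskb->head + (skb->nh.raw - skb->head);
	/* new: store the offset; the pointer is reconstructed on demand */
	skb_set_network_header(nskb, skb_network_header(skb) - skb->head);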
@@ -3514,7 +3750,7 @@ static int tcp_prune_queue(struct sock *sk)
3514 NET_INC_STATS_BH(LINUX_MIB_PRUNECALLED); 3750 NET_INC_STATS_BH(LINUX_MIB_PRUNECALLED);
3515 3751
3516 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) 3752 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
3517 tcp_clamp_window(sk, tp); 3753 tcp_clamp_window(sk);
3518 else if (tcp_memory_pressure) 3754 else if (tcp_memory_pressure)
3519 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); 3755 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
3520 3756
@@ -3583,8 +3819,10 @@ void tcp_cwnd_application_limited(struct sock *sk)
3583 tp->snd_cwnd_stamp = tcp_time_stamp; 3819 tp->snd_cwnd_stamp = tcp_time_stamp;
3584} 3820}
3585 3821
3586static int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) 3822static int tcp_should_expand_sndbuf(struct sock *sk)
3587{ 3823{
3824 struct tcp_sock *tp = tcp_sk(sk);
3825
3588 /* If the user specified a specific send buffer setting, do 3826 /* If the user specified a specific send buffer setting, do
3589 * not modify it. 3827 * not modify it.
3590 */ 3828 */
@@ -3616,7 +3854,7 @@ static void tcp_new_space(struct sock *sk)
3616{ 3854{
3617 struct tcp_sock *tp = tcp_sk(sk); 3855 struct tcp_sock *tp = tcp_sk(sk);
3618 3856
3619 if (tcp_should_expand_sndbuf(sk, tp)) { 3857 if (tcp_should_expand_sndbuf(sk)) {
3620 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + 3858 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
3621 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), 3859 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
3622 demanded = max_t(unsigned int, tp->snd_cwnd, 3860 demanded = max_t(unsigned int, tp->snd_cwnd,
@@ -3640,9 +3878,9 @@ static void tcp_check_space(struct sock *sk)
3640 } 3878 }
3641} 3879}
3642 3880
3643static inline void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) 3881static inline void tcp_data_snd_check(struct sock *sk)
3644{ 3882{
3645 tcp_push_pending_frames(sk, tp); 3883 tcp_push_pending_frames(sk);
3646 tcp_check_space(sk); 3884 tcp_check_space(sk);
3647} 3885}
3648 3886
@@ -3790,7 +4028,7 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
3790 int err; 4028 int err;
3791 4029
3792 local_bh_enable(); 4030 local_bh_enable();
3793 if (skb->ip_summed==CHECKSUM_UNNECESSARY) 4031 if (skb_csum_unnecessary(skb))
3794 err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk); 4032 err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk);
3795 else 4033 else
3796 err = skb_copy_and_csum_datagram_iovec(skb, hlen, 4034 err = skb_copy_and_csum_datagram_iovec(skb, hlen,
@@ -3822,7 +4060,7 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb
3822 4060
3823static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) 4061static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
3824{ 4062{
3825 return skb->ip_summed != CHECKSUM_UNNECESSARY && 4063 return !skb_csum_unnecessary(skb) &&
3826 __tcp_checksum_complete_user(sk, skb); 4064 __tcp_checksum_complete_user(sk, skb);
3827} 4065}
3828 4066
@@ -3840,7 +4078,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen
3840 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) 4078 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
3841 tp->ucopy.dma_chan = get_softnet_dma(); 4079 tp->ucopy.dma_chan = get_softnet_dma();
3842 4080
3843 if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) { 4081 if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) {
3844 4082
3845 dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, 4083 dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
3846 skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list); 4084 skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list);
@@ -3856,7 +4094,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen
3856 tcp_rcv_space_adjust(sk); 4094 tcp_rcv_space_adjust(sk);
3857 4095
3858 if ((tp->ucopy.len == 0) || 4096 if ((tp->ucopy.len == 0) ||
3859 (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) || 4097 (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) ||
3860 (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { 4098 (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
3861 tp->ucopy.wakeup = 1; 4099 tp->ucopy.wakeup = 1;
3862 sk->sk_data_ready(sk, 0); 4100 sk->sk_data_ready(sk, 0);
@@ -3976,7 +4214,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3976 */ 4214 */
3977 tcp_ack(sk, skb, 0); 4215 tcp_ack(sk, skb, 0);
3978 __kfree_skb(skb); 4216 __kfree_skb(skb);
3979 tcp_data_snd_check(sk, tp); 4217 tcp_data_snd_check(sk);
3980 return 0; 4218 return 0;
3981 } else { /* Header too small */ 4219 } else { /* Header too small */
3982 TCP_INC_STATS_BH(TCP_MIB_INERRS); 4220 TCP_INC_STATS_BH(TCP_MIB_INERRS);
@@ -4047,12 +4285,12 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
4047 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 4285 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
4048 } 4286 }
4049 4287
4050 tcp_event_data_recv(sk, tp, skb); 4288 tcp_event_data_recv(sk, skb);
4051 4289
4052 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { 4290 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
4053 /* Well, only one small jumplet in fast path... */ 4291 /* Well, only one small jumplet in fast path... */
4054 tcp_ack(sk, skb, FLAG_DATA); 4292 tcp_ack(sk, skb, FLAG_DATA);
4055 tcp_data_snd_check(sk, tp); 4293 tcp_data_snd_check(sk);
4056 if (!inet_csk_ack_scheduled(sk)) 4294 if (!inet_csk_ack_scheduled(sk))
4057 goto no_ack; 4295 goto no_ack;
4058 } 4296 }
@@ -4109,7 +4347,7 @@ slow_path:
4109 goto discard; 4347 goto discard;
4110 } 4348 }
4111 4349
4112 if(th->rst) { 4350 if (th->rst) {
4113 tcp_reset(sk); 4351 tcp_reset(sk);
4114 goto discard; 4352 goto discard;
4115 } 4353 }
@@ -4124,7 +4362,7 @@ slow_path:
4124 } 4362 }
4125 4363
4126step5: 4364step5:
4127 if(th->ack) 4365 if (th->ack)
4128 tcp_ack(sk, skb, FLAG_SLOWPATH); 4366 tcp_ack(sk, skb, FLAG_SLOWPATH);
4129 4367
4130 tcp_rcv_rtt_measure_ts(sk, skb); 4368 tcp_rcv_rtt_measure_ts(sk, skb);
@@ -4135,7 +4373,7 @@ step5:
4135 /* step 7: process the segment text */ 4373 /* step 7: process the segment text */
4136 tcp_data_queue(sk, skb); 4374 tcp_data_queue(sk, skb);
4137 4375
4138 tcp_data_snd_check(sk, tp); 4376 tcp_data_snd_check(sk);
4139 tcp_ack_snd_check(sk); 4377 tcp_ack_snd_check(sk);
4140 return 0; 4378 return 0;
4141 4379
@@ -4412,13 +4650,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4412 goto discard; 4650 goto discard;
4413 4651
4414 case TCP_LISTEN: 4652 case TCP_LISTEN:
4415 if(th->ack) 4653 if (th->ack)
4416 return 1; 4654 return 1;
4417 4655
4418 if(th->rst) 4656 if (th->rst)
4419 goto discard; 4657 goto discard;
4420 4658
4421 if(th->syn) { 4659 if (th->syn) {
4422 if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) 4660 if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
4423 return 1; 4661 return 1;
4424 4662
@@ -4452,7 +4690,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4452 /* Do step6 onward by hand. */ 4690 /* Do step6 onward by hand. */
4453 tcp_urg(sk, skb, th); 4691 tcp_urg(sk, skb, th);
4454 __kfree_skb(skb); 4692 __kfree_skb(skb);
4455 tcp_data_snd_check(sk, tp); 4693 tcp_data_snd_check(sk);
4456 return 0; 4694 return 0;
4457 } 4695 }
4458 4696
@@ -4474,7 +4712,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4474 } 4712 }
4475 4713
4476 /* step 2: check RST bit */ 4714 /* step 2: check RST bit */
4477 if(th->rst) { 4715 if (th->rst) {
4478 tcp_reset(sk); 4716 tcp_reset(sk);
4479 goto discard; 4717 goto discard;
4480 } 4718 }
@@ -4497,7 +4735,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4497 if (th->ack) { 4735 if (th->ack) {
4498 int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH); 4736 int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH);
4499 4737
4500 switch(sk->sk_state) { 4738 switch (sk->sk_state) {
4501 case TCP_SYN_RECV: 4739 case TCP_SYN_RECV:
4502 if (acceptable) { 4740 if (acceptable) {
4503 tp->copied_seq = tp->rcv_nxt; 4741 tp->copied_seq = tp->rcv_nxt;
@@ -4644,7 +4882,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4644 4882
4645 /* tcp_data could move socket to TIME-WAIT */ 4883 /* tcp_data could move socket to TIME-WAIT */
4646 if (sk->sk_state != TCP_CLOSE) { 4884 if (sk->sk_state != TCP_CLOSE) {
4647 tcp_data_snd_check(sk, tp); 4885 tcp_data_snd_check(sk);
4648 tcp_ack_snd_check(sk); 4886 tcp_ack_snd_check(sk);
4649 } 4887 }
4650 4888
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0ba74bbe7d30..5a3e7f839fc5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -88,7 +88,7 @@ int sysctl_tcp_low_latency __read_mostly;
88#define ICMP_MIN_LENGTH 8 88#define ICMP_MIN_LENGTH 8
89 89
90/* Socket used for sending RSTs */ 90/* Socket used for sending RSTs */
91static struct socket *tcp_socket; 91static struct socket *tcp_socket __read_mostly;
92 92
93void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb); 93void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
94 94
@@ -125,10 +125,10 @@ void tcp_unhash(struct sock *sk)
125 125
126static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb) 126static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
127{ 127{
128 return secure_tcp_sequence_number(skb->nh.iph->daddr, 128 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
129 skb->nh.iph->saddr, 129 ip_hdr(skb)->saddr,
130 skb->h.th->dest, 130 tcp_hdr(skb)->dest,
131 skb->h.th->source); 131 tcp_hdr(skb)->source);
132} 132}
133 133
134int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 134int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
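The ip_hdr() and tcp_hdr() accessors used throughout this file are thin typed wrappers over the per-skb header offsets; approximately, as defined in the headers of this era:

static inline struct iphdr *ip_hdr(const struct sk_buff *skb)
{
	return (struct iphdr *)skb_network_header(skb);
}

static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb)
{
	return (struct tcphdr *)skb_transport_header(skb);
}

icmp_hdr() follows the same pattern for struct icmphdr.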
@@ -149,7 +149,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
149 */ 149 */
150 if (tcptw->tw_ts_recent_stamp && 150 if (tcptw->tw_ts_recent_stamp &&
151 (twp == NULL || (sysctl_tcp_tw_reuse && 151 (twp == NULL || (sysctl_tcp_tw_reuse &&
152 xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) { 152 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
153 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; 153 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
154 if (tp->write_seq == 0) 154 if (tp->write_seq == 0)
155 tp->write_seq = 1; 155 tp->write_seq = 1;
@@ -224,7 +224,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
224 * when trying new connection. 224 * when trying new connection.
225 */ 225 */
226 if (peer != NULL && 226 if (peer != NULL &&
227 peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) { 227 peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
228 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; 228 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
229 tp->rx_opt.ts_recent = peer->tcp_ts; 229 tp->rx_opt.ts_recent = peer->tcp_ts;
230 } 230 }
@@ -354,8 +354,8 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
354 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 354 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
355 struct tcp_sock *tp; 355 struct tcp_sock *tp;
356 struct inet_sock *inet; 356 struct inet_sock *inet;
357 int type = skb->h.icmph->type; 357 const int type = icmp_hdr(skb)->type;
358 int code = skb->h.icmph->code; 358 const int code = icmp_hdr(skb)->code;
359 struct sock *sk; 359 struct sock *sk;
360 __u32 seq; 360 __u32 seq;
361 int err; 361 int err;
@@ -499,11 +499,12 @@ out:
499void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) 499void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
500{ 500{
501 struct inet_sock *inet = inet_sk(sk); 501 struct inet_sock *inet = inet_sk(sk);
502 struct tcphdr *th = skb->h.th; 502 struct tcphdr *th = tcp_hdr(skb);
503 503
504 if (skb->ip_summed == CHECKSUM_PARTIAL) { 504 if (skb->ip_summed == CHECKSUM_PARTIAL) {
505 th->check = ~tcp_v4_check(len, inet->saddr, 505 th->check = ~tcp_v4_check(len, inet->saddr,
506 inet->daddr, 0); 506 inet->daddr, 0);
507 skb->csum_start = skb_transport_header(skb) - skb->head;
507 skb->csum_offset = offsetof(struct tcphdr, check); 508 skb->csum_offset = offsetof(struct tcphdr, check);
508 } else { 509 } else {
509 th->check = tcp_v4_check(len, inet->saddr, inet->daddr, 510 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
@@ -515,17 +516,18 @@ void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
515 516
516int tcp_v4_gso_send_check(struct sk_buff *skb) 517int tcp_v4_gso_send_check(struct sk_buff *skb)
517{ 518{
518 struct iphdr *iph; 519 const struct iphdr *iph;
519 struct tcphdr *th; 520 struct tcphdr *th;
520 521
521 if (!pskb_may_pull(skb, sizeof(*th))) 522 if (!pskb_may_pull(skb, sizeof(*th)))
522 return -EINVAL; 523 return -EINVAL;
523 524
524 iph = skb->nh.iph; 525 iph = ip_hdr(skb);
525 th = skb->h.th; 526 th = tcp_hdr(skb);
526 527
527 th->check = 0; 528 th->check = 0;
528 th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0); 529 th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
530 skb->csum_start = skb_transport_header(skb) - skb->head;
529 skb->csum_offset = offsetof(struct tcphdr, check); 531 skb->csum_offset = offsetof(struct tcphdr, check);
530 skb->ip_summed = CHECKSUM_PARTIAL; 532 skb->ip_summed = CHECKSUM_PARTIAL;
531 return 0; 533 return 0;
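The skb->csum_start assignment is needed because CHECKSUM_PARTIAL now locates the checksum with two offsets rather than a transport-header pointer. A device, or the software fallback, completing the checksum does in effect the following (illustrative arithmetic only; len stands for the byte count from start to the end of the packet):

	u8 *start = skb->head + skb->csum_start;	/* first byte to sum */
	*(__sum16 *)(start + skb->csum_offset) =
		csum_fold(csum_partial(start, len, 0));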
@@ -546,7 +548,7 @@ int tcp_v4_gso_send_check(struct sk_buff *skb)
546 548
547static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) 549static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
548{ 550{
549 struct tcphdr *th = skb->h.th; 551 struct tcphdr *th = tcp_hdr(skb);
550 struct { 552 struct {
551 struct tcphdr th; 553 struct tcphdr th;
552#ifdef CONFIG_TCP_MD5SIG 554#ifdef CONFIG_TCP_MD5SIG
@@ -585,7 +587,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
585 arg.iov[0].iov_len = sizeof(rep.th); 587 arg.iov[0].iov_len = sizeof(rep.th);
586 588
587#ifdef CONFIG_TCP_MD5SIG 589#ifdef CONFIG_TCP_MD5SIG
588 key = sk ? tcp_v4_md5_do_lookup(sk, skb->nh.iph->daddr) : NULL; 590 key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
589 if (key) { 591 if (key) {
590 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 592 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
591 (TCPOPT_NOP << 16) | 593 (TCPOPT_NOP << 16) |
@@ -597,14 +599,14 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
597 599
598 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1], 600 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
599 key, 601 key,
600 skb->nh.iph->daddr, 602 ip_hdr(skb)->daddr,
601 skb->nh.iph->saddr, 603 ip_hdr(skb)->saddr,
602 &rep.th, IPPROTO_TCP, 604 &rep.th, IPPROTO_TCP,
603 arg.iov[0].iov_len); 605 arg.iov[0].iov_len);
604 } 606 }
605#endif 607#endif
606 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, 608 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
607 skb->nh.iph->saddr, /* XXX */ 609 ip_hdr(skb)->saddr, /* XXX */
608 sizeof(struct tcphdr), IPPROTO_TCP, 0); 610 sizeof(struct tcphdr), IPPROTO_TCP, 0);
609 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 611 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
610 612
@@ -622,7 +624,7 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
622 struct sk_buff *skb, u32 seq, u32 ack, 624 struct sk_buff *skb, u32 seq, u32 ack,
623 u32 win, u32 ts) 625 u32 win, u32 ts)
624{ 626{
625 struct tcphdr *th = skb->h.th; 627 struct tcphdr *th = tcp_hdr(skb);
626 struct { 628 struct {
627 struct tcphdr th; 629 struct tcphdr th;
628 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 630 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
@@ -670,7 +672,7 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
670 * skb->sk) holds true, but we program defensively. 672 * skb->sk) holds true, but we program defensively.
671 */ 673 */
672 if (!twsk && skb->sk) { 674 if (!twsk && skb->sk) {
673 key = tcp_v4_md5_do_lookup(skb->sk, skb->nh.iph->daddr); 675 key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
674 } else if (twsk && twsk->tw_md5_keylen) { 676 } else if (twsk && twsk->tw_md5_keylen) {
675 tw_key.key = twsk->tw_md5_key; 677 tw_key.key = twsk->tw_md5_key;
676 tw_key.keylen = twsk->tw_md5_keylen; 678 tw_key.keylen = twsk->tw_md5_keylen;
@@ -690,14 +692,14 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
690 692
691 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset], 693 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
692 key, 694 key,
693 skb->nh.iph->daddr, 695 ip_hdr(skb)->daddr,
694 skb->nh.iph->saddr, 696 ip_hdr(skb)->saddr,
695 &rep.th, IPPROTO_TCP, 697 &rep.th, IPPROTO_TCP,
696 arg.iov[0].iov_len); 698 arg.iov[0].iov_len);
697 } 699 }
698#endif 700#endif
699 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, 701 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
700 skb->nh.iph->saddr, /* XXX */ 702 ip_hdr(skb)->saddr, /* XXX */
701 arg.iov[0].iov_len, IPPROTO_TCP, 0); 703 arg.iov[0].iov_len, IPPROTO_TCP, 0);
702 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 704 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
703 705
@@ -745,7 +747,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
745 skb = tcp_make_synack(sk, dst, req); 747 skb = tcp_make_synack(sk, dst, req);
746 748
747 if (skb) { 749 if (skb) {
748 struct tcphdr *th = skb->h.th; 750 struct tcphdr *th = tcp_hdr(skb);
749 751
750 th->check = tcp_v4_check(skb->len, 752 th->check = tcp_v4_check(skb->len,
751 ireq->loc_addr, 753 ireq->loc_addr,
@@ -781,7 +783,7 @@ static void syn_flood_warning(struct sk_buff *skb)
781 warntime = jiffies; 783 warntime = jiffies;
782 printk(KERN_INFO 784 printk(KERN_INFO
783 "possible SYN flooding on port %d. Sending cookies.\n", 785 "possible SYN flooding on port %d. Sending cookies.\n",
784 ntohs(skb->h.th->dest)); 786 ntohs(tcp_hdr(skb)->dest));
785 } 787 }
786} 788}
787#endif 789#endif
@@ -1133,8 +1135,8 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1133 */ 1135 */
1134 __u8 *hash_location = NULL; 1136 __u8 *hash_location = NULL;
1135 struct tcp_md5sig_key *hash_expected; 1137 struct tcp_md5sig_key *hash_expected;
1136 struct iphdr *iph = skb->nh.iph; 1138 const struct iphdr *iph = ip_hdr(skb);
1137 struct tcphdr *th = skb->h.th; 1139 struct tcphdr *th = tcp_hdr(skb);
1138 int length = (th->doff << 2) - sizeof(struct tcphdr); 1140 int length = (th->doff << 2) - sizeof(struct tcphdr);
1139 int genhash; 1141 int genhash;
1140 unsigned char *ptr; 1142 unsigned char *ptr;
@@ -1251,8 +1253,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1251 struct inet_request_sock *ireq; 1253 struct inet_request_sock *ireq;
1252 struct tcp_options_received tmp_opt; 1254 struct tcp_options_received tmp_opt;
1253 struct request_sock *req; 1255 struct request_sock *req;
1254 __be32 saddr = skb->nh.iph->saddr; 1256 __be32 saddr = ip_hdr(skb)->saddr;
1255 __be32 daddr = skb->nh.iph->daddr; 1257 __be32 daddr = ip_hdr(skb)->daddr;
1256 __u32 isn = TCP_SKB_CB(skb)->when; 1258 __u32 isn = TCP_SKB_CB(skb)->when;
1257 struct dst_entry *dst = NULL; 1259 struct dst_entry *dst = NULL;
1258#ifdef CONFIG_SYN_COOKIES 1260#ifdef CONFIG_SYN_COOKIES
@@ -1327,7 +1329,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1327 ireq->rmt_addr = saddr; 1329 ireq->rmt_addr = saddr;
1328 ireq->opt = tcp_v4_save_options(sk, skb); 1330 ireq->opt = tcp_v4_save_options(sk, skb);
1329 if (!want_cookie) 1331 if (!want_cookie)
1330 TCP_ECN_create_request(req, skb->h.th); 1332 TCP_ECN_create_request(req, tcp_hdr(skb));
1331 1333
1332 if (want_cookie) { 1334 if (want_cookie) {
1333#ifdef CONFIG_SYN_COOKIES 1335#ifdef CONFIG_SYN_COOKIES
@@ -1351,7 +1353,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1351 (dst = inet_csk_route_req(sk, req)) != NULL && 1353 (dst = inet_csk_route_req(sk, req)) != NULL &&
1352 (peer = rt_get_peer((struct rtable *)dst)) != NULL && 1354 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1353 peer->v4daddr == saddr) { 1355 peer->v4daddr == saddr) {
1354 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL && 1356 if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1355 (s32)(peer->tcp_ts - req->ts_recent) > 1357 (s32)(peer->tcp_ts - req->ts_recent) >
1356 TCP_PAWS_WINDOW) { 1358 TCP_PAWS_WINDOW) {
1357 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED); 1359 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
@@ -1375,7 +1377,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1375 LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open " 1377 LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1376 "request from %u.%u.%u.%u/%u\n", 1378 "request from %u.%u.%u.%u/%u\n",
1377 NIPQUAD(saddr), 1379 NIPQUAD(saddr),
1378 ntohs(skb->h.th->source)); 1380 ntohs(tcp_hdr(skb)->source));
1379 dst_release(dst); 1381 dst_release(dst);
1380 goto drop_and_free; 1382 goto drop_and_free;
1381 } 1383 }
@@ -1439,7 +1441,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1439 newinet->opt = ireq->opt; 1441 newinet->opt = ireq->opt;
1440 ireq->opt = NULL; 1442 ireq->opt = NULL;
1441 newinet->mc_index = inet_iif(skb); 1443 newinet->mc_index = inet_iif(skb);
1442 newinet->mc_ttl = skb->nh.iph->ttl; 1444 newinet->mc_ttl = ip_hdr(skb)->ttl;
1443 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1445 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1444 if (newinet->opt) 1446 if (newinet->opt)
1445 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; 1447 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
@@ -1481,8 +1483,8 @@ exit:
1481 1483
1482static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) 1484static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1483{ 1485{
1484 struct tcphdr *th = skb->h.th; 1486 struct tcphdr *th = tcp_hdr(skb);
1485 struct iphdr *iph = skb->nh.iph; 1487 const struct iphdr *iph = ip_hdr(skb);
1486 struct sock *nsk; 1488 struct sock *nsk;
1487 struct request_sock **prev; 1489 struct request_sock **prev;
1488 /* Find possible connection requests. */ 1490 /* Find possible connection requests. */
@@ -1491,9 +1493,8 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1491 if (req) 1493 if (req)
1492 return tcp_check_req(sk, skb, req, prev); 1494 return tcp_check_req(sk, skb, req, prev);
1493 1495
1494 nsk = inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr, 1496 nsk = inet_lookup_established(&tcp_hashinfo, iph->saddr, th->source,
1495 th->source, skb->nh.iph->daddr, 1497 iph->daddr, th->dest, inet_iif(skb));
1496 th->dest, inet_iif(skb));
1497 1498
1498 if (nsk) { 1499 if (nsk) {
1499 if (nsk->sk_state != TCP_TIME_WAIT) { 1500 if (nsk->sk_state != TCP_TIME_WAIT) {
@@ -1513,15 +1514,17 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1513 1514
1514static __sum16 tcp_v4_checksum_init(struct sk_buff *skb) 1515static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1515{ 1516{
1517 const struct iphdr *iph = ip_hdr(skb);
1518
1516 if (skb->ip_summed == CHECKSUM_COMPLETE) { 1519 if (skb->ip_summed == CHECKSUM_COMPLETE) {
1517 if (!tcp_v4_check(skb->len, skb->nh.iph->saddr, 1520 if (!tcp_v4_check(skb->len, iph->saddr,
1518 skb->nh.iph->daddr, skb->csum)) { 1521 iph->daddr, skb->csum)) {
1519 skb->ip_summed = CHECKSUM_UNNECESSARY; 1522 skb->ip_summed = CHECKSUM_UNNECESSARY;
1520 return 0; 1523 return 0;
1521 } 1524 }
1522 } 1525 }
1523 1526
1524 skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr, 1527 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1525 skb->len, IPPROTO_TCP, 0); 1528 skb->len, IPPROTO_TCP, 0);
1526 1529
1527 if (skb->len <= 76) { 1530 if (skb->len <= 76) {
@@ -1555,7 +1558,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1555 1558
1556 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1559 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1557 TCP_CHECK_TIMER(sk); 1560 TCP_CHECK_TIMER(sk);
1558 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) { 1561 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1559 rsk = sk; 1562 rsk = sk;
1560 goto reset; 1563 goto reset;
1561 } 1564 }
@@ -1563,7 +1566,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1563 return 0; 1566 return 0;
1564 } 1567 }
1565 1568
1566 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb)) 1569 if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1567 goto csum_err; 1570 goto csum_err;
1568 1571
1569 if (sk->sk_state == TCP_LISTEN) { 1572 if (sk->sk_state == TCP_LISTEN) {
@@ -1581,7 +1584,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1581 } 1584 }
1582 1585
1583 TCP_CHECK_TIMER(sk); 1586 TCP_CHECK_TIMER(sk);
1584 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) { 1587 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1585 rsk = sk; 1588 rsk = sk;
1586 goto reset; 1589 goto reset;
1587 } 1590 }
@@ -1610,6 +1613,7 @@ csum_err:
1610 1613
1611int tcp_v4_rcv(struct sk_buff *skb) 1614int tcp_v4_rcv(struct sk_buff *skb)
1612{ 1615{
1616 const struct iphdr *iph;
1613 struct tcphdr *th; 1617 struct tcphdr *th;
1614 struct sock *sk; 1618 struct sock *sk;
1615 int ret; 1619 int ret;
@@ -1623,7 +1627,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
1623 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 1627 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1624 goto discard_it; 1628 goto discard_it;
1625 1629
1626 th = skb->h.th; 1630 th = tcp_hdr(skb);
1627 1631
1628 if (th->doff < sizeof(struct tcphdr) / 4) 1632 if (th->doff < sizeof(struct tcphdr) / 4)
1629 goto bad_packet; 1633 goto bad_packet;
@@ -1634,23 +1638,21 @@ int tcp_v4_rcv(struct sk_buff *skb)
1634 * Packet length and doff are validated by header prediction, 1638 * Packet length and doff are validated by header prediction,
1635 * provided case of th->doff==0 is eliminated. 1639 * provided case of th->doff==0 is eliminated.
1636 * So, we defer the checks. */ 1640 * So, we defer the checks. */
1637 if ((skb->ip_summed != CHECKSUM_UNNECESSARY && 1641 if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1638 tcp_v4_checksum_init(skb)))
1639 goto bad_packet; 1642 goto bad_packet;
1640 1643
1641 th = skb->h.th; 1644 th = tcp_hdr(skb);
1645 iph = ip_hdr(skb);
1642 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1646 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1643 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1647 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1644 skb->len - th->doff * 4); 1648 skb->len - th->doff * 4);
1645 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1649 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1646 TCP_SKB_CB(skb)->when = 0; 1650 TCP_SKB_CB(skb)->when = 0;
1647 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos; 1651 TCP_SKB_CB(skb)->flags = iph->tos;
1648 TCP_SKB_CB(skb)->sacked = 0; 1652 TCP_SKB_CB(skb)->sacked = 0;
1649 1653
1650 sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source, 1654 sk = __inet_lookup(&tcp_hashinfo, iph->saddr, th->source,
1651 skb->nh.iph->daddr, th->dest, 1655 iph->daddr, th->dest, inet_iif(skb));
1652 inet_iif(skb));
1653
1654 if (!sk) 1656 if (!sk)
1655 goto no_tcp_socket; 1657 goto no_tcp_socket;
1656 1658
@@ -1724,8 +1726,7 @@ do_time_wait:
1724 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 1726 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1725 case TCP_TW_SYN: { 1727 case TCP_TW_SYN: {
1726 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo, 1728 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1727 skb->nh.iph->daddr, 1729 iph->daddr, th->dest,
1728 th->dest,
1729 inet_iif(skb)); 1730 inet_iif(skb));
1730 if (sk2) { 1731 if (sk2) {
1731 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); 1732 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
@@ -1770,7 +1771,7 @@ int tcp_v4_remember_stamp(struct sock *sk)
1770 1771
1771 if (peer) { 1772 if (peer) {
1772 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || 1773 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1773 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && 1774 (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1774 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) { 1775 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1775 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp; 1776 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1776 peer->tcp_ts = tp->rx_opt.ts_recent; 1777 peer->tcp_ts = tp->rx_opt.ts_recent;
@@ -1791,7 +1792,7 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1791 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 1792 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1792 1793
1793 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || 1794 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1794 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && 1795 (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1795 peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) { 1796 peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1796 peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp; 1797 peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1797 peer->tcp_ts = tcptw->tw_ts_recent; 1798 peer->tcp_ts = tcptw->tw_ts_recent;
@@ -1890,7 +1891,7 @@ int tcp_v4_destroy_sock(struct sock *sk)
1890 tcp_cleanup_congestion_control(sk); 1891 tcp_cleanup_congestion_control(sk);
1891 1892
1892 /* Clean up the write buffer. */ 1893
1893 sk_stream_writequeue_purge(sk); 1894 tcp_write_queue_purge(sk);
1894 1895
1895 /* Cleans up our, hopefully empty, out_of_order_queue. */ 1896 /* Cleans up our, hopefully empty, out_of_order_queue. */
1896 __skb_queue_purge(&tp->out_of_order_queue); 1897 __skb_queue_purge(&tp->out_of_order_queue);
@@ -2293,13 +2294,13 @@ static void get_openreq4(struct sock *sk, struct request_sock *req,
2293 req); 2294 req);
2294} 2295}
2295 2296
2296static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) 2297static void get_tcp4_sock(struct sock *sk, char *tmpbuf, int i)
2297{ 2298{
2298 int timer_active; 2299 int timer_active;
2299 unsigned long timer_expires; 2300 unsigned long timer_expires;
2300 struct tcp_sock *tp = tcp_sk(sp); 2301 struct tcp_sock *tp = tcp_sk(sk);
2301 const struct inet_connection_sock *icsk = inet_csk(sp); 2302 const struct inet_connection_sock *icsk = inet_csk(sk);
2302 struct inet_sock *inet = inet_sk(sp); 2303 struct inet_sock *inet = inet_sk(sk);
2303 __be32 dest = inet->daddr; 2304 __be32 dest = inet->daddr;
2304 __be32 src = inet->rcv_saddr; 2305 __be32 src = inet->rcv_saddr;
2305 __u16 destp = ntohs(inet->dport); 2306 __u16 destp = ntohs(inet->dport);
@@ -2311,9 +2312,9 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2311 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2312 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2312 timer_active = 4; 2313 timer_active = 4;
2313 timer_expires = icsk->icsk_timeout; 2314 timer_expires = icsk->icsk_timeout;
2314 } else if (timer_pending(&sp->sk_timer)) { 2315 } else if (timer_pending(&sk->sk_timer)) {
2315 timer_active = 2; 2316 timer_active = 2;
2316 timer_expires = sp->sk_timer.expires; 2317 timer_expires = sk->sk_timer.expires;
2317 } else { 2318 } else {
2318 timer_active = 0; 2319 timer_active = 0;
2319 timer_expires = jiffies; 2320 timer_expires = jiffies;
@@ -2321,17 +2322,17 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2321 2322
2322 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2323 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2323 "%08X %5d %8d %lu %d %p %u %u %u %u %d", 2324 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2324 i, src, srcp, dest, destp, sp->sk_state, 2325 i, src, srcp, dest, destp, sk->sk_state,
2325 tp->write_seq - tp->snd_una, 2326 tp->write_seq - tp->snd_una,
2326 sp->sk_state == TCP_LISTEN ? sp->sk_ack_backlog : 2327 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2327 (tp->rcv_nxt - tp->copied_seq), 2328 (tp->rcv_nxt - tp->copied_seq),
2328 timer_active, 2329 timer_active,
2329 jiffies_to_clock_t(timer_expires - jiffies), 2330 jiffies_to_clock_t(timer_expires - jiffies),
2330 icsk->icsk_retransmits, 2331 icsk->icsk_retransmits,
2331 sock_i_uid(sp), 2332 sock_i_uid(sk),
2332 icsk->icsk_probes_out, 2333 icsk->icsk_probes_out,
2333 sock_i_ino(sp), 2334 sock_i_ino(sk),
2334 atomic_read(&sp->sk_refcnt), sp, 2335 atomic_read(&sk->sk_refcnt), sk,
2335 icsk->icsk_rto, 2336 icsk->icsk_rto,
2336 icsk->icsk_ack.ato, 2337 icsk->icsk_ack.ato,
2337 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, 2338 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index f0ebaf0e21cb..43294ad9f63e 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -218,7 +218,7 @@ static u32 tcp_lp_owd_calculator(struct sock *sk)
218 * 3. calc smoothed OWD (SOWD). 218 * 3. calc smoothed OWD (SOWD).
219 * Most ideas come from the original TCP-LP implementation. 219 * Most ideas come from the original TCP-LP implementation.
220 */ 220 */
221static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt) 221static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt)
222{ 222{
223 struct lp *lp = inet_csk_ca(sk); 223 struct lp *lp = inet_csk_ca(sk);
224 s64 mowd = tcp_lp_owd_calculator(sk); 224 s64 mowd = tcp_lp_owd_calculator(sk);
@@ -261,11 +261,13 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt)
261 * newReno in increase case. 261 * newReno in increase case.
262 * We work it out by following the idea from TCP-LP's paper directly 262 * We work it out by following the idea from TCP-LP's paper directly
263 */ 263 */
264static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked) 264static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, ktime_t last)
265{ 265{
266 struct tcp_sock *tp = tcp_sk(sk); 266 struct tcp_sock *tp = tcp_sk(sk);
267 struct lp *lp = inet_csk_ca(sk); 267 struct lp *lp = inet_csk_ca(sk);
268 268
269 tcp_lp_rtt_sample(sk, ktime_to_us(net_timedelta(last)));
270
269 /* calc inference */ 271 /* calc inference */
270 if (tcp_time_stamp > tp->rx_opt.rcv_tsecr) 272 if (tcp_time_stamp > tp->rx_opt.rcv_tsecr)
271 lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr); 273 lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr);
@@ -312,11 +314,11 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked)
312} 314}
313 315
314static struct tcp_congestion_ops tcp_lp = { 316static struct tcp_congestion_ops tcp_lp = {
317 .flags = TCP_CONG_RTT_STAMP,
315 .init = tcp_lp_init, 318 .init = tcp_lp_init,
316 .ssthresh = tcp_reno_ssthresh, 319 .ssthresh = tcp_reno_ssthresh,
317 .cong_avoid = tcp_lp_cong_avoid, 320 .cong_avoid = tcp_lp_cong_avoid,
318 .min_cwnd = tcp_reno_min_cwnd, 321 .min_cwnd = tcp_reno_min_cwnd,
319 .rtt_sample = tcp_lp_rtt_sample,
320 .pkts_acked = tcp_lp_pkts_acked, 322 .pkts_acked = tcp_lp_pkts_acked,
321 323
322 .owner = THIS_MODULE, 324 .owner = THIS_MODULE,
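With the rtt_sample hook removed, a congestion module that wants microsecond RTTs sets TCP_CONG_RTT_STAMP and derives the sample itself from the ktime passed to pkts_acked(), as tcp_lp_pkts_acked() now does:

	u32 rtt_us = ktime_to_us(net_timedelta(last));	/* now minus the stored stamp */

net_timedelta() subtracts the recorded ktime from the current real time, so the value the old usrtt parameter used to carry is simply recomputed locally.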
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 6b5c64f3c925..a12b08fca5ad 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -149,7 +149,7 @@ kill_with_rst:
149 tw->tw_substate = TCP_TIME_WAIT; 149 tw->tw_substate = TCP_TIME_WAIT;
150 tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; 150 tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
151 if (tmp_opt.saw_tstamp) { 151 if (tmp_opt.saw_tstamp) {
152 tcptw->tw_ts_recent_stamp = xtime.tv_sec; 152 tcptw->tw_ts_recent_stamp = get_seconds();
153 tcptw->tw_ts_recent = tmp_opt.rcv_tsval; 153 tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
154 } 154 }
155 155
@@ -208,7 +208,7 @@ kill:
208 208
209 if (tmp_opt.saw_tstamp) { 209 if (tmp_opt.saw_tstamp) {
210 tcptw->tw_ts_recent = tmp_opt.rcv_tsval; 210 tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
211 tcptw->tw_ts_recent_stamp = xtime.tv_sec; 211 tcptw->tw_ts_recent_stamp = get_seconds();
212 } 212 }
213 213
214 inet_twsk_put(tw); 214 inet_twsk_put(tw);
@@ -246,7 +246,7 @@ kill:
246 if (paws_reject) 246 if (paws_reject)
247 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); 247 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
248 248
249 if(!th->rst) { 249 if (!th->rst) {
250 /* In this case we must reset the TIMEWAIT timer. 250 /* In this case we must reset the TIMEWAIT timer.
251 * 251 *
252 * If it is ACKless SYN it may be both old duplicate 252 * If it is ACKless SYN it may be both old duplicate
@@ -324,7 +324,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
324 if (tcp_alloc_md5sig_pool() == NULL) 324 if (tcp_alloc_md5sig_pool() == NULL)
325 BUG(); 325 BUG();
326 } 326 }
327 } while(0); 327 } while (0);
328#endif 328#endif
329 329
330 /* Linkage updates. */ 330 /* Linkage updates. */
@@ -387,8 +387,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
387 /* Now setup tcp_sock */ 387 /* Now setup tcp_sock */
388 newtp = tcp_sk(newsk); 388 newtp = tcp_sk(newsk);
389 newtp->pred_flags = 0; 389 newtp->pred_flags = 0;
390 newtp->rcv_nxt = treq->rcv_isn + 1; 390 newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1;
391 newtp->snd_nxt = newtp->snd_una = newtp->snd_sml = treq->snt_isn + 1; 391 newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1;
392 392
393 tcp_prequeue_init(newtp); 393 tcp_prequeue_init(newtp);
394 394
@@ -422,10 +422,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
422 tcp_set_ca_state(newsk, TCP_CA_Open); 422 tcp_set_ca_state(newsk, TCP_CA_Open);
423 tcp_init_xmit_timers(newsk); 423 tcp_init_xmit_timers(newsk);
424 skb_queue_head_init(&newtp->out_of_order_queue); 424 skb_queue_head_init(&newtp->out_of_order_queue);
425 newtp->rcv_wup = treq->rcv_isn + 1;
426 newtp->write_seq = treq->snt_isn + 1; 425 newtp->write_seq = treq->snt_isn + 1;
427 newtp->pushed_seq = newtp->write_seq; 426 newtp->pushed_seq = newtp->write_seq;
428 newtp->copied_seq = treq->rcv_isn + 1;
429 427
430 newtp->rx_opt.saw_tstamp = 0; 428 newtp->rx_opt.saw_tstamp = 0;
431 429
@@ -440,7 +438,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
440 keepalive_time_when(newtp)); 438 keepalive_time_when(newtp));
441 439
442 newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; 440 newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
443 if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { 441 if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
444 if (sysctl_tcp_fack) 442 if (sysctl_tcp_fack)
445 newtp->rx_opt.sack_ok |= 2; 443 newtp->rx_opt.sack_ok |= 2;
446 } 444 }
@@ -455,12 +453,13 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
455 newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; 453 newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
456 newtp->window_clamp = min(newtp->window_clamp, 65535U); 454 newtp->window_clamp = min(newtp->window_clamp, 65535U);
457 } 455 }
458 newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->rx_opt.snd_wscale; 456 newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) <<
457 newtp->rx_opt.snd_wscale);
459 newtp->max_window = newtp->snd_wnd; 458 newtp->max_window = newtp->snd_wnd;
460 459
461 if (newtp->rx_opt.tstamp_ok) { 460 if (newtp->rx_opt.tstamp_ok) {
462 newtp->rx_opt.ts_recent = req->ts_recent; 461 newtp->rx_opt.ts_recent = req->ts_recent;
463 newtp->rx_opt.ts_recent_stamp = xtime.tv_sec; 462 newtp->rx_opt.ts_recent_stamp = get_seconds();
464 newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; 463 newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
465 } else { 464 } else {
466 newtp->rx_opt.ts_recent_stamp = 0; 465 newtp->rx_opt.ts_recent_stamp = 0;
@@ -490,7 +489,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
490 struct request_sock *req, 489 struct request_sock *req,
491 struct request_sock **prev) 490 struct request_sock **prev)
492{ 491{
493 struct tcphdr *th = skb->h.th; 492 const struct tcphdr *th = tcp_hdr(skb);
494 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); 493 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
495 int paws_reject = 0; 494 int paws_reject = 0;
496 struct tcp_options_received tmp_opt; 495 struct tcp_options_received tmp_opt;
@@ -506,7 +505,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
506 * it can be estimated (approximately) 505 * it can be estimated (approximately)
507 * from another data. 506 * from another data.
508 */ 507 */
509 tmp_opt.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); 508 tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
510 paws_reject = tcp_paws_check(&tmp_opt, th->rst); 509 paws_reject = tcp_paws_check(&tmp_opt, th->rst);
511 } 510 }
512 } 511 }
@@ -712,8 +711,8 @@ int tcp_child_process(struct sock *parent, struct sock *child,
712 int state = child->sk_state; 711 int state = child->sk_state;
713 712
714 if (!sock_owned_by_user(child)) { 713 if (!sock_owned_by_user(child)) {
715 ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len); 714 ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
716 715 skb->len);
717 /* Wakeup parent, send SIGIO */ 716 /* Wakeup parent, send SIGIO */
718 if (state == TCP_SYN_RECV && child->sk_state != state) 717 if (state == TCP_SYN_RECV && child->sk_state != state)
719 parent->sk_data_ready(parent, 0); 718 parent->sk_data_ready(parent, 0);
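Two conversions recur through the tcp_minisocks.c hunks above: direct skb->h.th loads become tcp_hdr(skb), and wall-clock reads via xtime.tv_sec become get_seconds(). A minimal sketch of what the new accessor amounts to, assuming the transport header offset was recorded with skb_reset_transport_header() when the header was parsed; storing an offset rather than a cached pointer keeps the value valid across head reallocation:

/* Sketch: tcp_hdr() is a typed view of the stored transport
 * header offset, replacing the cached skb->h.th pointer. */
#include <linux/skbuff.h>
#include <linux/tcp.h>

static inline struct tcphdr *tcp_hdr_sketch(const struct sk_buff *skb)
{
	return (struct tcphdr *)skb_transport_header(skb);
}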
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 3c24881f2a65..e70a6840cb64 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -62,14 +62,13 @@ int sysctl_tcp_base_mss __read_mostly = 512;
62/* By default, RFC2861 behavior. */ 62/* By default, RFC2861 behavior. */
63int sysctl_tcp_slow_start_after_idle __read_mostly = 1; 63int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
64 64
65static void update_send_head(struct sock *sk, struct tcp_sock *tp, 65static void update_send_head(struct sock *sk, struct sk_buff *skb)
66 struct sk_buff *skb)
67{ 66{
68 sk->sk_send_head = skb->next; 67 struct tcp_sock *tp = tcp_sk(sk);
69 if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) 68
70 sk->sk_send_head = NULL; 69 tcp_advance_send_head(sk, skb);
71 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; 70 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
72 tcp_packets_out_inc(sk, tp, skb); 71 tcp_packets_out_inc(sk, skb);
73} 72}
74 73
75/* SND.NXT, if window was not shrunk. 74/* SND.NXT, if window was not shrunk.
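update_send_head() no longer open-codes the walk to the next skb and the sentinel test; that logic moves behind the write-queue helpers this series adds to net/tcp.h. A sketch of the two helpers used here, mirroring the code being deleted:

static inline struct sk_buff *tcp_send_head(struct sock *sk)
{
	return sk->sk_send_head;
}

static inline void tcp_advance_send_head(struct sock *sk, struct sk_buff *skb)
{
	/* The list head embedded in the socket doubles as a sentinel;
	 * reaching it means nothing is left to send. */
	sk->sk_send_head = skb->next;
	if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
		sk->sk_send_head = NULL;
}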
@@ -78,8 +77,10 @@ static void update_send_head(struct sock *sk, struct tcp_sock *tp,
78 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already 77 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
79 * invalid. OK, let's make this for now: 78 * invalid. OK, let's make this for now:
80 */ 79 */
81static inline __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_sock *tp) 80static inline __u32 tcp_acceptable_seq(struct sock *sk)
82{ 81{
82 struct tcp_sock *tp = tcp_sk(sk);
83
83 if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt)) 84 if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
84 return tp->snd_nxt; 85 return tp->snd_nxt;
85 else 86 else
@@ -238,7 +239,7 @@ static u16 tcp_select_window(struct sock *sk)
238 u32 new_win = __tcp_select_window(sk); 239 u32 new_win = __tcp_select_window(sk);
239 240
240 /* Never shrink the offered window */ 241 /* Never shrink the offered window */
241 if(new_win < cur_win) { 242 if (new_win < cur_win) {
242 /* Danger Will Robinson! 243 /* Danger Will Robinson!
243 * Don't update rcv_wup/rcv_wnd here or else 244 * Don't update rcv_wup/rcv_wnd here or else
244 * we will not be able to advertise a zero 245 * we will not be able to advertise a zero
@@ -289,10 +290,12 @@ static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp,
289 (TCPOPT_SACK << 8) | 290 (TCPOPT_SACK << 8) |
290 (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks * 291 (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks *
291 TCPOLEN_SACK_PERBLOCK))); 292 TCPOLEN_SACK_PERBLOCK)));
292 for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { 293
294 for (this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
293 *ptr++ = htonl(sp[this_sack].start_seq); 295 *ptr++ = htonl(sp[this_sack].start_seq);
294 *ptr++ = htonl(sp[this_sack].end_seq); 296 *ptr++ = htonl(sp[this_sack].end_seq);
295 } 297 }
298
296 if (tp->rx_opt.dsack) { 299 if (tp->rx_opt.dsack) {
297 tp->rx_opt.dsack = 0; 300 tp->rx_opt.dsack = 0;
298 tp->rx_opt.eff_sacks--; 301 tp->rx_opt.eff_sacks--;
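The loop being reflowed writes the SACK option one 32-bit word at a time. For concreteness, the wire layout it produces with two SACK blocks (a worked example, not part of the patch):

/* First word: two NOPs for alignment, then kind and length.
 *   TCPOPT_NOP = 1, TCPOPT_SACK = 5,
 *   TCPOLEN_SACK_BASE = 2, TCPOLEN_SACK_PERBLOCK = 8
 *
 *   (1 << 24) | (1 << 16) | (5 << 8) | (2 + 2 * 8)
 *     = bytes 01 01 05 12     (length 18 = kind/len + two blocks)
 *
 * Followed by htonl(start_seq) / htonl(end_seq) for each block:
 *   4 + 2 * 8 = 20 option bytes in total for eff_sacks == 2.
 */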
@@ -337,7 +340,7 @@ static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack,
337 */ 340 */
338 *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); 341 *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
339 if (ts) { 342 if (ts) {
340 if(sack) 343 if (sack)
341 *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | 344 *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
342 (TCPOLEN_SACK_PERM << 16) | 345 (TCPOLEN_SACK_PERM << 16) |
343 (TCPOPT_TIMESTAMP << 8) | 346 (TCPOPT_TIMESTAMP << 8) |
@@ -349,7 +352,7 @@ static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack,
349 TCPOLEN_TIMESTAMP); 352 TCPOLEN_TIMESTAMP);
350 *ptr++ = htonl(tstamp); /* TSVAL */ 353 *ptr++ = htonl(tstamp); /* TSVAL */
351 *ptr++ = htonl(ts_recent); /* TSECR */ 354 *ptr++ = htonl(ts_recent); /* TSECR */
352 } else if(sack) 355 } else if (sack)
353 *ptr++ = htonl((TCPOPT_NOP << 24) | 356 *ptr++ = htonl((TCPOPT_NOP << 24) |
354 (TCPOPT_NOP << 16) | 357 (TCPOPT_NOP << 16) |
355 (TCPOPT_SACK_PERM << 8) | 358 (TCPOPT_SACK_PERM << 8) |
@@ -406,7 +409,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
406 /* If congestion control is doing timestamping, we must 409 /* If congestion control is doing timestamping, we must
407 * take such a timestamp before we potentially clone/copy. 410 * take such a timestamp before we potentially clone/copy.
408 */ 411 */
409 if (icsk->icsk_ca_ops->rtt_sample) 412 if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
410 __net_timestamp(skb); 413 __net_timestamp(skb);
411 414
412 if (likely(clone_it)) { 415 if (likely(clone_it)) {
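The pre-clone timestamp is now gated on a capability flag rather than on the presence of the (removed) rtt_sample hook: a congestion module that wants per-skb send times sets TCP_CONG_RTT_STAMP and consumes them in pkts_acked(), as the Vegas and Veno hunks later in this series do. A sketch of the module side, with hypothetical example_* names:

#include <net/tcp.h>

static void example_pkts_acked(struct sock *sk, u32 cnt, ktime_t last)
{
	/* 'last' is the __net_timestamp() value taken above, before
	 * clone/copy; net_timedelta() turns it into an RTT sample. */
	u32 rtt_us = ktime_to_us(net_timedelta(last));

	printk(KERN_DEBUG "rtt sample: %u us over %u pkts\n", rtt_us, cnt);
}

static struct tcp_congestion_ops tcp_example = {
	.flags		= TCP_CONG_RTT_STAMP,	/* request send timestamps */
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.min_cwnd	= tcp_reno_min_cwnd,
	.pkts_acked	= example_pkts_acked,
	.name		= "example",
};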
@@ -430,7 +433,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
430 sysctl_flags = 0; 433 sysctl_flags = 0;
431 if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { 434 if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
432 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; 435 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
433 if(sysctl_tcp_timestamps) { 436 if (sysctl_tcp_timestamps) {
434 tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; 437 tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
435 sysctl_flags |= SYSCTL_FLAG_TSTAMPS; 438 sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
436 } 439 }
@@ -465,11 +468,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
465 tcp_header_size += TCPOLEN_MD5SIG_ALIGNED; 468 tcp_header_size += TCPOLEN_MD5SIG_ALIGNED;
466#endif 469#endif
467 470
468 th = (struct tcphdr *) skb_push(skb, tcp_header_size); 471 skb_push(skb, tcp_header_size);
469 skb->h.th = th; 472 skb_reset_transport_header(skb);
470 skb_set_owner_w(skb, sk); 473 skb_set_owner_w(skb, sk);
471 474
472 /* Build TCP header and checksum it. */ 475 /* Build TCP header and checksum it. */
476 th = tcp_hdr(skb);
473 th->source = inet->sport; 477 th->source = inet->sport;
474 th->dest = inet->dport; 478 th->dest = inet->dport;
475 th->seq = htonl(tcb->seq); 479 th->seq = htonl(tcb->seq);
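Header construction changes shape here: instead of casting the skb_push() result and caching it in skb->h.th, the code pushes room, records the offset with skb_reset_transport_header(), and reads the typed pointer back through tcp_hdr(). The sequence, annotated (a restatement of the hunk, not new logic):

	skb_push(skb, tcp_header_size);		/* make room for the header */
	skb_reset_transport_header(skb);	/* transport offset = skb->data */
	skb_set_owner_w(skb, sk);

	th = tcp_hdr(skb);			/* typed view of that offset */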
@@ -515,7 +519,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
515 md5 ? &md5_hash_location : 519 md5 ? &md5_hash_location :
516#endif 520#endif
517 NULL); 521 NULL);
518 TCP_ECN_send(sk, tp, skb, tcp_header_size); 522 TCP_ECN_send(sk, skb, tcp_header_size);
519 } 523 }
520 524
521#ifdef CONFIG_TCP_MD5SIG 525#ifdef CONFIG_TCP_MD5SIG
@@ -524,7 +528,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
524 tp->af_specific->calc_md5_hash(md5_hash_location, 528 tp->af_specific->calc_md5_hash(md5_hash_location,
525 md5, 529 md5,
526 sk, NULL, NULL, 530 sk, NULL, NULL,
527 skb->h.th, 531 tcp_hdr(skb),
528 sk->sk_protocol, 532 sk->sk_protocol,
529 skb->len); 533 skb->len);
530 } 534 }
@@ -545,7 +549,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
545 if (likely(err <= 0)) 549 if (likely(err <= 0))
546 return err; 550 return err;
547 551
548 tcp_enter_cwr(sk); 552 tcp_enter_cwr(sk, 1);
549 553
550 return net_xmit_eval(err); 554 return net_xmit_eval(err);
551 555
@@ -567,12 +571,8 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
567 /* Advance write_seq and place onto the write_queue. */ 571 /* Advance write_seq and place onto the write_queue. */
568 tp->write_seq = TCP_SKB_CB(skb)->end_seq; 572 tp->write_seq = TCP_SKB_CB(skb)->end_seq;
569 skb_header_release(skb); 573 skb_header_release(skb);
570 __skb_queue_tail(&sk->sk_write_queue, skb); 574 tcp_add_write_queue_tail(sk, skb);
571 sk_charge_skb(sk, skb); 575 sk_charge_skb(sk, skb);
572
573 /* Queue it, remembering where we must start sending. */
574 if (sk->sk_send_head == NULL)
575 sk->sk_send_head = skb;
576} 576}
577 577
578static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) 578static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
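tcp_queue_skb() shows the other half of the write-queue abstraction: queueing at the tail and the "remembering where we must start sending" bookkeeping collapse into one helper. A sketch mirroring the deleted lines:

static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
{
	__skb_queue_tail(&sk->sk_write_queue, skb);

	/* If nothing was pending, the new skb is the next to send. */
	if (sk->sk_send_head == NULL)
		sk->sk_send_head = skb;
}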
@@ -705,7 +705,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
705 705
706 /* Link BUFF into the send queue. */ 706 /* Link BUFF into the send queue. */
707 skb_header_release(buff); 707 skb_header_release(buff);
708 __skb_append(skb, buff, &sk->sk_write_queue); 708 tcp_insert_write_queue_after(skb, buff, sk);
709 709
710 return 0; 710 return 0;
711} 711}
@@ -736,7 +736,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
736 } 736 }
737 skb_shinfo(skb)->nr_frags = k; 737 skb_shinfo(skb)->nr_frags = k;
738 738
739 skb->tail = skb->data; 739 skb_reset_tail_pointer(skb);
740 skb->data_len -= len; 740 skb->data_len -= len;
741 skb->len = skb->data_len; 741 skb->len = skb->data_len;
742} 742}
@@ -930,8 +930,9 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
930 930
931/* Congestion window validation. (RFC2861) */ 931/* Congestion window validation. (RFC2861) */
932 932
933static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) 933static void tcp_cwnd_validate(struct sock *sk)
934{ 934{
935 struct tcp_sock *tp = tcp_sk(sk);
935 __u32 packets_out = tp->packets_out; 936 __u32 packets_out = tp->packets_out;
936 937
937 if (packets_out >= tp->snd_cwnd) { 938 if (packets_out >= tp->snd_cwnd) {
@@ -1056,7 +1057,7 @@ static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, uns
1056 return !after(end_seq, tp->snd_una + tp->snd_wnd); 1057 return !after(end_seq, tp->snd_una + tp->snd_wnd);
1057} 1058}
1058 1059
1059/* This checks if the data bearing packet SKB (usually sk->sk_send_head) 1060/* This checks if the data bearing packet SKB (usually tcp_send_head(sk))
1060 * should be put on the wire right now. If so, it returns the number of 1061 * should be put on the wire right now. If so, it returns the number of
1061 * packets allowed by the congestion window. 1062 * packets allowed by the congestion window.
1062 */ 1063 */
@@ -1079,15 +1080,10 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
1079 return cwnd_quota; 1080 return cwnd_quota;
1080} 1081}
1081 1082
1082static inline int tcp_skb_is_last(const struct sock *sk, 1083int tcp_may_send_now(struct sock *sk)
1083 const struct sk_buff *skb)
1084{
1085 return skb->next == (struct sk_buff *)&sk->sk_write_queue;
1086}
1087
1088int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
1089{ 1084{
1090 struct sk_buff *skb = sk->sk_send_head; 1085 struct tcp_sock *tp = tcp_sk(sk);
1086 struct sk_buff *skb = tcp_send_head(sk);
1091 1087
1092 return (skb && 1088 return (skb &&
1093 tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), 1089 tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
@@ -1143,7 +1139,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1143 1139
1144 /* Link BUFF into the send queue. */ 1140 /* Link BUFF into the send queue. */
1145 skb_header_release(buff); 1141 skb_header_release(buff);
1146 __skb_append(skb, buff, &sk->sk_write_queue); 1142 tcp_insert_write_queue_after(skb, buff, sk);
1147 1143
1148 return 0; 1144 return 0;
1149} 1145}
@@ -1153,8 +1149,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1153 * 1149 *
1154 * This algorithm is from John Heffner. 1150 * This algorithm is from John Heffner.
1155 */ 1151 */
1156static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) 1152static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1157{ 1153{
1154 struct tcp_sock *tp = tcp_sk(sk);
1158 const struct inet_connection_sock *icsk = inet_csk(sk); 1155 const struct inet_connection_sock *icsk = inet_csk(sk);
1159 u32 send_win, cong_win, limit, in_flight; 1156 u32 send_win, cong_win, limit, in_flight;
1160 1157
@@ -1249,10 +1246,10 @@ static int tcp_mtu_probe(struct sock *sk)
1249 1246
1250 /* Have enough data in the send queue to probe? */ 1247 /* Have enough data in the send queue to probe? */
1251 len = 0; 1248 len = 0;
1252 if ((skb = sk->sk_send_head) == NULL) 1249 if ((skb = tcp_send_head(sk)) == NULL)
1253 return -1; 1250 return -1;
1254 while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb)) 1251 while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb))
1255 skb = skb->next; 1252 skb = tcp_write_queue_next(sk, skb);
1256 if (len < probe_size) 1253 if (len < probe_size)
1257 return -1; 1254 return -1;
1258 1255
@@ -1279,9 +1276,9 @@ static int tcp_mtu_probe(struct sock *sk)
1279 return -1; 1276 return -1;
1280 sk_charge_skb(sk, nskb); 1277 sk_charge_skb(sk, nskb);
1281 1278
1282 skb = sk->sk_send_head; 1279 skb = tcp_send_head(sk);
1283 __skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue); 1280 tcp_insert_write_queue_before(nskb, skb, sk);
1284 sk->sk_send_head = nskb; 1281 tcp_advance_send_head(sk, skb);
1285 1282
1286 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; 1283 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
1287 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; 1284 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
@@ -1292,7 +1289,7 @@ static int tcp_mtu_probe(struct sock *sk)
1292 1289
1293 len = 0; 1290 len = 0;
1294 while (len < probe_size) { 1291 while (len < probe_size) {
1295 next = skb->next; 1292 next = tcp_write_queue_next(sk, skb);
1296 1293
1297 copy = min_t(int, skb->len, probe_size - len); 1294 copy = min_t(int, skb->len, probe_size - len);
1298 if (nskb->ip_summed) 1295 if (nskb->ip_summed)
@@ -1305,7 +1302,7 @@ static int tcp_mtu_probe(struct sock *sk)
1305 /* We've eaten all the data from this skb. 1302 /* We've eaten all the data from this skb.
1306 * Throw it away. */ 1303 * Throw it away. */
1307 TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags; 1304 TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
1308 __skb_unlink(skb, &sk->sk_write_queue); 1305 tcp_unlink_write_queue(skb, sk);
1309 sk_stream_free_skb(sk, skb); 1306 sk_stream_free_skb(sk, skb);
1310 } else { 1307 } else {
1311 TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & 1308 TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
@@ -1333,7 +1330,7 @@ static int tcp_mtu_probe(struct sock *sk)
1333 /* Decrement cwnd here because we are sending 1330 /* Decrement cwnd here because we are sending
1334 * effectively two packets. */ 1331 * effectively two packets. */
1335 tp->snd_cwnd--; 1332 tp->snd_cwnd--;
1336 update_send_head(sk, tp, nskb); 1333 update_send_head(sk, nskb);
1337 1334
1338 icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); 1335 icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
1339 tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq; 1336 tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
@@ -1377,7 +1374,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
1377 sent_pkts = 1; 1374 sent_pkts = 1;
1378 } 1375 }
1379 1376
1380 while ((skb = sk->sk_send_head)) { 1377 while ((skb = tcp_send_head(sk))) {
1381 unsigned int limit; 1378 unsigned int limit;
1382 1379
1383 tso_segs = tcp_init_tso_segs(sk, skb, mss_now); 1380 tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
@@ -1396,7 +1393,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
1396 nonagle : TCP_NAGLE_PUSH)))) 1393 nonagle : TCP_NAGLE_PUSH))))
1397 break; 1394 break;
1398 } else { 1395 } else {
1399 if (tcp_tso_should_defer(sk, tp, skb)) 1396 if (tcp_tso_should_defer(sk, skb))
1400 break; 1397 break;
1401 } 1398 }
1402 1399
@@ -1425,31 +1422,31 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
1425 /* Advance the send_head. This one is sent out. 1422 /* Advance the send_head. This one is sent out.
1426 * This call will increment packets_out. 1423 * This call will increment packets_out.
1427 */ 1424 */
1428 update_send_head(sk, tp, skb); 1425 update_send_head(sk, skb);
1429 1426
1430 tcp_minshall_update(tp, mss_now, skb); 1427 tcp_minshall_update(tp, mss_now, skb);
1431 sent_pkts++; 1428 sent_pkts++;
1432 } 1429 }
1433 1430
1434 if (likely(sent_pkts)) { 1431 if (likely(sent_pkts)) {
1435 tcp_cwnd_validate(sk, tp); 1432 tcp_cwnd_validate(sk);
1436 return 0; 1433 return 0;
1437 } 1434 }
1438 return !tp->packets_out && sk->sk_send_head; 1435 return !tp->packets_out && tcp_send_head(sk);
1439} 1436}
1440 1437
1441/* Push out any pending frames which were held back due to 1438/* Push out any pending frames which were held back due to
1442 * TCP_CORK or attempt at coalescing tiny packets. 1439 * TCP_CORK or attempt at coalescing tiny packets.
1443 * The socket must be locked by the caller. 1440 * The socket must be locked by the caller.
1444 */ 1441 */
1445void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, 1442void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
1446 unsigned int cur_mss, int nonagle) 1443 int nonagle)
1447{ 1444{
1448 struct sk_buff *skb = sk->sk_send_head; 1445 struct sk_buff *skb = tcp_send_head(sk);
1449 1446
1450 if (skb) { 1447 if (skb) {
1451 if (tcp_write_xmit(sk, cur_mss, nonagle)) 1448 if (tcp_write_xmit(sk, cur_mss, nonagle))
1452 tcp_check_probe_timer(sk, tp); 1449 tcp_check_probe_timer(sk);
1453 } 1450 }
1454} 1451}
1455 1452
@@ -1459,7 +1456,7 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
1459void tcp_push_one(struct sock *sk, unsigned int mss_now) 1456void tcp_push_one(struct sock *sk, unsigned int mss_now)
1460{ 1457{
1461 struct tcp_sock *tp = tcp_sk(sk); 1458 struct tcp_sock *tp = tcp_sk(sk);
1462 struct sk_buff *skb = sk->sk_send_head; 1459 struct sk_buff *skb = tcp_send_head(sk);
1463 unsigned int tso_segs, cwnd_quota; 1460 unsigned int tso_segs, cwnd_quota;
1464 1461
1465 BUG_ON(!skb || skb->len < mss_now); 1462 BUG_ON(!skb || skb->len < mss_now);
@@ -1493,8 +1490,8 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
1493 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1490 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1494 1491
1495 if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) { 1492 if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) {
1496 update_send_head(sk, tp, skb); 1493 update_send_head(sk, skb);
1497 tcp_cwnd_validate(sk, tp); 1494 tcp_cwnd_validate(sk);
1498 return; 1495 return;
1499 } 1496 }
1500 } 1497 }
@@ -1620,7 +1617,7 @@ u32 __tcp_select_window(struct sock *sk)
1620static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now) 1617static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
1621{ 1618{
1622 struct tcp_sock *tp = tcp_sk(sk); 1619 struct tcp_sock *tp = tcp_sk(sk);
1623 struct sk_buff *next_skb = skb->next; 1620 struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
1624 1621
1625 /* The first test we must make is that neither of these two 1622 /* The first test we must make is that neither of these two
1626 * SKB's are still referenced by someone else. 1623 * SKB's are still referenced by someone else.
@@ -1630,7 +1627,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
1630 u16 flags = TCP_SKB_CB(skb)->flags; 1627 u16 flags = TCP_SKB_CB(skb)->flags;
1631 1628
1632 /* Also punt if next skb has been SACK'd. */ 1629 /* Also punt if next skb has been SACK'd. */
1633 if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED) 1630 if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
1634 return; 1631 return;
1635 1632
1636 /* Next skb is out of window. */ 1633 /* Next skb is out of window. */
@@ -1652,9 +1649,11 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
1652 clear_all_retrans_hints(tp); 1649 clear_all_retrans_hints(tp);
1653 1650
1654 /* Ok. We will be able to collapse the packet. */ 1651 /* Ok. We will be able to collapse the packet. */
1655 __skb_unlink(next_skb, &sk->sk_write_queue); 1652 tcp_unlink_write_queue(next_skb, sk);
1656 1653
1657 memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); 1654 skb_copy_from_linear_data(next_skb,
1655 skb_put(skb, next_skb_size),
1656 next_skb_size);
1658 1657
1659 if (next_skb->ip_summed == CHECKSUM_PARTIAL) 1658 if (next_skb->ip_summed == CHECKSUM_PARTIAL)
1660 skb->ip_summed = CHECKSUM_PARTIAL; 1659 skb->ip_summed = CHECKSUM_PARTIAL;
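The collapse path now copies through skb_copy_from_linear_data() rather than a raw memcpy from next_skb->data. The helper is equivalent but makes the precondition explicit: the bytes must live in the linear area, which the nr_frags == 0 tests in the caller guarantee. A sketch of what it expands to:

/* Equivalent to memcpy(to, skb->data, len); the name documents that
 * only linear (non-paged) data may be copied this way. */
#include <linux/skbuff.h>
#include <linux/string.h>

static inline void skb_copy_from_linear_data_sketch(const struct sk_buff *skb,
						    void *to, unsigned int len)
{
	memcpy(to, skb->data, len);
}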
@@ -1706,7 +1705,9 @@ void tcp_simple_retransmit(struct sock *sk)
1706 unsigned int mss = tcp_current_mss(sk, 0); 1705 unsigned int mss = tcp_current_mss(sk, 0);
1707 int lost = 0; 1706 int lost = 0;
1708 1707
1709 sk_stream_for_retrans_queue(skb, sk) { 1708 tcp_for_write_queue(skb, sk) {
1709 if (skb == tcp_send_head(sk))
1710 break;
1710 if (skb->len > mss && 1711 if (skb->len > mss &&
1711 !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { 1712 !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
1712 if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { 1713 if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
@@ -1788,13 +1789,13 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1788 } 1789 }
1789 1790
1790 /* Collapse two adjacent packets if worthwhile and we can. */ 1791 /* Collapse two adjacent packets if worthwhile and we can. */
1791 if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && 1792 if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
1792 (skb->len < (cur_mss >> 1)) && 1793 (skb->len < (cur_mss >> 1)) &&
1793 (skb->next != sk->sk_send_head) && 1794 (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
1794 (skb->next != (struct sk_buff *)&sk->sk_write_queue) && 1795 (!tcp_skb_is_last(sk, skb)) &&
1795 (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) && 1796 (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) &&
1796 (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(skb->next) == 1) && 1797 (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) &&
1797 (sysctl_tcp_retrans_collapse != 0)) 1798 (sysctl_tcp_retrans_collapse != 0))
1798 tcp_retrans_try_collapse(sk, skb, cur_mss); 1799 tcp_retrans_try_collapse(sk, skb, cur_mss);
1799 1800
1800 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) 1801 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
@@ -1804,9 +1805,9 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1804 * retransmit when old data is attached. So strip it off 1805 * retransmit when old data is attached. So strip it off
1805 * since it is cheap to do so and saves bytes on the network. 1806 * since it is cheap to do so and saves bytes on the network.
1806 */ 1807 */
1807 if(skb->len > 0 && 1808 if (skb->len > 0 &&
1808 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && 1809 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
1809 tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { 1810 tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
1810 if (!pskb_trim(skb, 0)) { 1811 if (!pskb_trim(skb, 0)) {
1811 TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1; 1812 TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
1812 skb_shinfo(skb)->gso_segs = 1; 1813 skb_shinfo(skb)->gso_segs = 1;
@@ -1872,15 +1873,17 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
1872 skb = tp->retransmit_skb_hint; 1873 skb = tp->retransmit_skb_hint;
1873 packet_cnt = tp->retransmit_cnt_hint; 1874 packet_cnt = tp->retransmit_cnt_hint;
1874 }else{ 1875 }else{
1875 skb = sk->sk_write_queue.next; 1876 skb = tcp_write_queue_head(sk);
1876 packet_cnt = 0; 1877 packet_cnt = 0;
1877 } 1878 }
1878 1879
1879 /* First pass: retransmit lost packets. */ 1880 /* First pass: retransmit lost packets. */
1880 if (tp->lost_out) { 1881 if (tp->lost_out) {
1881 sk_stream_for_retrans_queue_from(skb, sk) { 1882 tcp_for_write_queue_from(skb, sk) {
1882 __u8 sacked = TCP_SKB_CB(skb)->sacked; 1883 __u8 sacked = TCP_SKB_CB(skb)->sacked;
1883 1884
1885 if (skb == tcp_send_head(sk))
1886 break;
1884 /* we could do better than to assign each time */ 1887 /* we could do better than to assign each time */
1885 tp->retransmit_skb_hint = skb; 1888 tp->retransmit_skb_hint = skb;
1886 tp->retransmit_cnt_hint = packet_cnt; 1889 tp->retransmit_cnt_hint = packet_cnt;
@@ -1906,8 +1909,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
1906 else 1909 else
1907 NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); 1910 NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
1908 1911
1909 if (skb == 1912 if (skb == tcp_write_queue_head(sk))
1910 skb_peek(&sk->sk_write_queue))
1911 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 1913 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
1912 inet_csk(sk)->icsk_rto, 1914 inet_csk(sk)->icsk_rto,
1913 TCP_RTO_MAX); 1915 TCP_RTO_MAX);
@@ -1937,18 +1939,20 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
1937 * segments to send. 1939 * segments to send.
1938 */ 1940 */
1939 1941
1940 if (tcp_may_send_now(sk, tp)) 1942 if (tcp_may_send_now(sk))
1941 return; 1943 return;
1942 1944
1943 if (tp->forward_skb_hint) { 1945 if (tp->forward_skb_hint) {
1944 skb = tp->forward_skb_hint; 1946 skb = tp->forward_skb_hint;
1945 packet_cnt = tp->forward_cnt_hint; 1947 packet_cnt = tp->forward_cnt_hint;
1946 } else{ 1948 } else{
1947 skb = sk->sk_write_queue.next; 1949 skb = tcp_write_queue_head(sk);
1948 packet_cnt = 0; 1950 packet_cnt = 0;
1949 } 1951 }
1950 1952
1951 sk_stream_for_retrans_queue_from(skb, sk) { 1953 tcp_for_write_queue_from(skb, sk) {
1954 if (skb == tcp_send_head(sk))
1955 break;
1952 tp->forward_cnt_hint = packet_cnt; 1956 tp->forward_cnt_hint = packet_cnt;
1953 tp->forward_skb_hint = skb; 1957 tp->forward_skb_hint = skb;
1954 1958
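With sent and unsent data sharing one queue, every retransmit walk must now stop at tcp_send_head() explicitly; the old sk_stream_for_retrans_queue_from() stopped at the sentinel, which sufficed when the queue held only sent data. The loop shape used in both passes above, with the iterator sketched from the open-coded test it replaces:

/* Sketch of the iterator (the real macro lives in net/tcp.h): */
#define tcp_for_write_queue_from(skb, sk)				\
	for (; skb != (struct sk_buff *)&(sk)->sk_write_queue;		\
	     skb = skb->next)

	tcp_for_write_queue_from(skb, sk) {
		if (skb == tcp_send_head(sk))
			break;	/* everything from here on was never sent */
		/* ... per-skb retransmit bookkeeping ... */
	}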
@@ -1973,7 +1977,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
1973 break; 1977 break;
1974 } 1978 }
1975 1979
1976 if (skb == skb_peek(&sk->sk_write_queue)) 1980 if (skb == tcp_write_queue_head(sk))
1977 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 1981 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
1978 inet_csk(sk)->icsk_rto, 1982 inet_csk(sk)->icsk_rto,
1979 TCP_RTO_MAX); 1983 TCP_RTO_MAX);
@@ -1989,7 +1993,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
1989void tcp_send_fin(struct sock *sk) 1993void tcp_send_fin(struct sock *sk)
1990{ 1994{
1991 struct tcp_sock *tp = tcp_sk(sk); 1995 struct tcp_sock *tp = tcp_sk(sk);
1992 struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue); 1996 struct sk_buff *skb = tcp_write_queue_tail(sk);
1993 int mss_now; 1997 int mss_now;
1994 1998
1995 /* Optimization, tack on the FIN if we have a queue of 1999 /* Optimization, tack on the FIN if we have a queue of
@@ -1998,7 +2002,7 @@ void tcp_send_fin(struct sock *sk)
1998 */ 2002 */
1999 mss_now = tcp_current_mss(sk, 1); 2003 mss_now = tcp_current_mss(sk, 1);
2000 2004
2001 if (sk->sk_send_head != NULL) { 2005 if (tcp_send_head(sk) != NULL) {
2002 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; 2006 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
2003 TCP_SKB_CB(skb)->end_seq++; 2007 TCP_SKB_CB(skb)->end_seq++;
2004 tp->write_seq++; 2008 tp->write_seq++;
@@ -2025,7 +2029,7 @@ void tcp_send_fin(struct sock *sk)
2025 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; 2029 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
2026 tcp_queue_skb(sk, skb); 2030 tcp_queue_skb(sk, skb);
2027 } 2031 }
2028 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_OFF); 2032 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
2029} 2033}
2030 2034
2031/* We get here when a process closes a file descriptor (either due to 2035/* We get here when a process closes a file descriptor (either due to
@@ -2035,7 +2039,6 @@ void tcp_send_fin(struct sock *sk)
2035 */ 2039 */
2036void tcp_send_active_reset(struct sock *sk, gfp_t priority) 2040void tcp_send_active_reset(struct sock *sk, gfp_t priority)
2037{ 2041{
2038 struct tcp_sock *tp = tcp_sk(sk);
2039 struct sk_buff *skb; 2042 struct sk_buff *skb;
2040 2043
2041 /* NOTE: No TCP options attached and we never retransmit this. */ 2044 /* NOTE: No TCP options attached and we never retransmit this. */
@@ -2055,7 +2058,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
2055 skb_shinfo(skb)->gso_type = 0; 2058 skb_shinfo(skb)->gso_type = 0;
2056 2059
2057 /* Send it off. */ 2060 /* Send it off. */
2058 TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); 2061 TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk);
2059 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; 2062 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
2060 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2063 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2061 if (tcp_transmit_skb(sk, skb, 0, priority)) 2064 if (tcp_transmit_skb(sk, skb, 0, priority))
@@ -2071,7 +2074,7 @@ int tcp_send_synack(struct sock *sk)
2071{ 2074{
2072 struct sk_buff* skb; 2075 struct sk_buff* skb;
2073 2076
2074 skb = skb_peek(&sk->sk_write_queue); 2077 skb = tcp_write_queue_head(sk);
2075 if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) { 2078 if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
2076 printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); 2079 printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
2077 return -EFAULT; 2080 return -EFAULT;
@@ -2081,9 +2084,9 @@ int tcp_send_synack(struct sock *sk)
2081 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); 2084 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
2082 if (nskb == NULL) 2085 if (nskb == NULL)
2083 return -ENOMEM; 2086 return -ENOMEM;
2084 __skb_unlink(skb, &sk->sk_write_queue); 2087 tcp_unlink_write_queue(skb, sk);
2085 skb_header_release(nskb); 2088 skb_header_release(nskb);
2086 __skb_queue_head(&sk->sk_write_queue, nskb); 2089 __tcp_add_write_queue_head(sk, nskb);
2087 sk_stream_free_skb(sk, skb); 2090 sk_stream_free_skb(sk, skb);
2088 sk_charge_skb(sk, nskb); 2091 sk_charge_skb(sk, nskb);
2089 skb = nskb; 2092 skb = nskb;
@@ -2133,8 +2136,10 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2133 if (md5) 2136 if (md5)
2134 tcp_header_size += TCPOLEN_MD5SIG_ALIGNED; 2137 tcp_header_size += TCPOLEN_MD5SIG_ALIGNED;
2135#endif 2138#endif
2136 skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size); 2139 skb_push(skb, tcp_header_size);
2140 skb_reset_transport_header(skb);
2137 2141
2142 th = tcp_hdr(skb);
2138 memset(th, 0, sizeof(struct tcphdr)); 2143 memset(th, 0, sizeof(struct tcphdr));
2139 th->syn = 1; 2144 th->syn = 1;
2140 th->ack = 1; 2145 th->ack = 1;
@@ -2188,7 +2193,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2188 tp->af_specific->calc_md5_hash(md5_hash_location, 2193 tp->af_specific->calc_md5_hash(md5_hash_location,
2189 md5, 2194 md5,
2190 NULL, dst, req, 2195 NULL, dst, req,
2191 skb->h.th, sk->sk_protocol, 2196 tcp_hdr(skb), sk->sk_protocol,
2192 skb->len); 2197 skb->len);
2193 } 2198 }
2194#endif 2199#endif
@@ -2271,7 +2276,7 @@ int tcp_connect(struct sock *sk)
2271 skb_reserve(buff, MAX_TCP_HEADER); 2276 skb_reserve(buff, MAX_TCP_HEADER);
2272 2277
2273 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; 2278 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
2274 TCP_ECN_send_syn(sk, tp, buff); 2279 TCP_ECN_send_syn(sk, buff);
2275 TCP_SKB_CB(buff)->sacked = 0; 2280 TCP_SKB_CB(buff)->sacked = 0;
2276 skb_shinfo(buff)->gso_segs = 1; 2281 skb_shinfo(buff)->gso_segs = 1;
2277 skb_shinfo(buff)->gso_size = 0; 2282 skb_shinfo(buff)->gso_size = 0;
@@ -2285,7 +2290,7 @@ int tcp_connect(struct sock *sk)
2285 TCP_SKB_CB(buff)->when = tcp_time_stamp; 2290 TCP_SKB_CB(buff)->when = tcp_time_stamp;
2286 tp->retrans_stamp = TCP_SKB_CB(buff)->when; 2291 tp->retrans_stamp = TCP_SKB_CB(buff)->when;
2287 skb_header_release(buff); 2292 skb_header_release(buff);
2288 __skb_queue_tail(&sk->sk_write_queue, buff); 2293 __tcp_add_write_queue_tail(sk, buff);
2289 sk_charge_skb(sk, buff); 2294 sk_charge_skb(sk, buff);
2290 tp->packets_out += tcp_skb_pcount(buff); 2295 tp->packets_out += tcp_skb_pcount(buff);
2291 tcp_transmit_skb(sk, buff, 1, GFP_KERNEL); 2296 tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
@@ -2363,7 +2368,6 @@ void tcp_send_ack(struct sock *sk)
2363{ 2368{
2364 /* If we have been reset, we may not send again. */ 2369 /* If we have been reset, we may not send again. */
2365 if (sk->sk_state != TCP_CLOSE) { 2370 if (sk->sk_state != TCP_CLOSE) {
2366 struct tcp_sock *tp = tcp_sk(sk);
2367 struct sk_buff *buff; 2371 struct sk_buff *buff;
2368 2372
2369 /* We are not putting this on the write queue, so 2373 /* We are not putting this on the write queue, so
@@ -2389,7 +2393,7 @@ void tcp_send_ack(struct sock *sk)
2389 skb_shinfo(buff)->gso_type = 0; 2393 skb_shinfo(buff)->gso_type = 0;
2390 2394
2391 /* Send it off, this clears delayed acks for us. */ 2395 /* Send it off, this clears delayed acks for us. */
2392 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); 2396 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk);
2393 TCP_SKB_CB(buff)->when = tcp_time_stamp; 2397 TCP_SKB_CB(buff)->when = tcp_time_stamp;
2394 tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); 2398 tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
2395 } 2399 }
@@ -2441,7 +2445,7 @@ int tcp_write_wakeup(struct sock *sk)
2441 struct tcp_sock *tp = tcp_sk(sk); 2445 struct tcp_sock *tp = tcp_sk(sk);
2442 struct sk_buff *skb; 2446 struct sk_buff *skb;
2443 2447
2444 if ((skb = sk->sk_send_head) != NULL && 2448 if ((skb = tcp_send_head(sk)) != NULL &&
2445 before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) { 2449 before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
2446 int err; 2450 int err;
2447 unsigned int mss = tcp_current_mss(sk, 0); 2451 unsigned int mss = tcp_current_mss(sk, 0);
@@ -2467,7 +2471,7 @@ int tcp_write_wakeup(struct sock *sk)
2467 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2471 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2468 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 2472 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2469 if (!err) { 2473 if (!err) {
2470 update_send_head(sk, tp, skb); 2474 update_send_head(sk, skb);
2471 } 2475 }
2472 return err; 2476 return err;
2473 } else { 2477 } else {
@@ -2491,7 +2495,7 @@ void tcp_send_probe0(struct sock *sk)
2491 2495
2492 err = tcp_write_wakeup(sk); 2496 err = tcp_write_wakeup(sk);
2493 2497
2494 if (tp->packets_out || !sk->sk_send_head) { 2498 if (tp->packets_out || !tcp_send_head(sk)) {
2495 /* Cancel probe timer, if it is not required. */ 2499 /* Cancel probe timer, if it is not required. */
2496 icsk->icsk_probes_out = 0; 2500 icsk->icsk_probes_out = 0;
2497 icsk->icsk_backoff = 0; 2501 icsk->icsk_backoff = 0;
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 61f406f27294..3938d5dbdf20 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -26,6 +26,8 @@
26#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
27#include <linux/module.h> 27#include <linux/module.h>
28#include <linux/kfifo.h> 28#include <linux/kfifo.h>
29#include <linux/ktime.h>
30#include <linux/time.h>
29#include <linux/vmalloc.h> 31#include <linux/vmalloc.h>
30 32
31#include <net/tcp.h> 33#include <net/tcp.h>
@@ -34,43 +36,45 @@ MODULE_AUTHOR("Stephen Hemminger <shemminger@linux-foundation.org>");
34MODULE_DESCRIPTION("TCP cwnd snooper"); 36MODULE_DESCRIPTION("TCP cwnd snooper");
35MODULE_LICENSE("GPL"); 37MODULE_LICENSE("GPL");
36 38
37static int port = 0; 39static int port __read_mostly = 0;
38MODULE_PARM_DESC(port, "Port to match (0=all)"); 40MODULE_PARM_DESC(port, "Port to match (0=all)");
39module_param(port, int, 0); 41module_param(port, int, 0);
40 42
41static int bufsize = 64*1024; 43static int bufsize __read_mostly = 64*1024;
42MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)"); 44MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
43module_param(bufsize, int, 0); 45module_param(bufsize, int, 0);
44 46
47static int full __read_mostly;
48MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)");
49module_param(full, int, 0);
50
45static const char procname[] = "tcpprobe"; 51static const char procname[] = "tcpprobe";
46 52
47struct { 53struct {
48 struct kfifo *fifo; 54 struct kfifo *fifo;
49 spinlock_t lock; 55 spinlock_t lock;
50 wait_queue_head_t wait; 56 wait_queue_head_t wait;
51 struct timeval tstart; 57 ktime_t start;
58 u32 lastcwnd;
52} tcpw; 59} tcpw;
53 60
61/*
62 * Print to log with timestamps.
63 * FIXME: causes an extra copy
64 */
54static void printl(const char *fmt, ...) 65static void printl(const char *fmt, ...)
55{ 66{
56 va_list args; 67 va_list args;
57 int len; 68 int len;
58 struct timeval now; 69 struct timespec tv;
59 char tbuf[256]; 70 char tbuf[256];
60 71
61 va_start(args, fmt); 72 va_start(args, fmt);
62 do_gettimeofday(&now); 73 /* want monotonic time since start of tcp_probe */
74 tv = ktime_to_timespec(ktime_sub(ktime_get(), tcpw.start));
63 75
64 now.tv_sec -= tcpw.tstart.tv_sec; 76 len = sprintf(tbuf, "%lu.%09lu ",
65 now.tv_usec -= tcpw.tstart.tv_usec; 77 (unsigned long) tv.tv_sec, (unsigned long) tv.tv_nsec);
66 if (now.tv_usec < 0) {
67 --now.tv_sec;
68 now.tv_usec += 1000000;
69 }
70
71 len = sprintf(tbuf, "%lu.%06lu ",
72 (unsigned long) now.tv_sec,
73 (unsigned long) now.tv_usec);
74 len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); 78 len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args);
75 va_end(args); 79 va_end(args);
76 80
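printl() drops the hand-rolled timeval subtraction (with its manual microsecond borrow) for a ktime delta against a monotonic start point; ktime_get() cannot step backwards under NTP adjustment, and the log gains nanosecond resolution. A condensed sketch, assuming tcpw.start was taken with ktime_get() in tcpprobe_open():

#include <linux/ktime.h>
#include <linux/time.h>
#include <linux/kernel.h>

/* monotonic offset since 'start', formatted as sec.nsec */
static int stamp_sketch(ktime_t start, char *buf)
{
	struct timespec tv = ktime_to_timespec(ktime_sub(ktime_get(), start));

	return sprintf(buf, "%lu.%09lu ",
		       (unsigned long)tv.tv_sec, (unsigned long)tv.tv_nsec);
}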
@@ -78,38 +82,44 @@ static void printl(const char *fmt, ...)
78 wake_up(&tcpw.wait); 82 wake_up(&tcpw.wait);
79} 83}
80 84
81static int jtcp_sendmsg(struct kiocb *iocb, struct sock *sk, 85/*
82 struct msghdr *msg, size_t size) 86 * Hook inserted to be called before each receive packet.
87 * Note: arguments must match tcp_rcv_established()!
88 */
89static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
90 struct tcphdr *th, unsigned len)
83{ 91{
84 const struct tcp_sock *tp = tcp_sk(sk); 92 const struct tcp_sock *tp = tcp_sk(sk);
85 const struct inet_sock *inet = inet_sk(sk); 93 const struct inet_sock *inet = inet_sk(sk);
86 94
87 if (port == 0 || ntohs(inet->dport) == port || 95 /* Only update if port matches */
88 ntohs(inet->sport) == port) { 96 if ((port == 0 || ntohs(inet->dport) == port || ntohs(inet->sport) == port)
97 && (full || tp->snd_cwnd != tcpw.lastcwnd)) {
 89 printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %#x %#x %u %u %u\n", 98 printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %#x %#x %u %u %u %u\n",
90 NIPQUAD(inet->saddr), ntohs(inet->sport), 99 NIPQUAD(inet->saddr), ntohs(inet->sport),
91 NIPQUAD(inet->daddr), ntohs(inet->dport), 100 NIPQUAD(inet->daddr), ntohs(inet->dport),
92 size, tp->snd_nxt, tp->snd_una, 101 skb->len, tp->snd_nxt, tp->snd_una,
93 tp->snd_cwnd, tcp_current_ssthresh(sk), 102 tp->snd_cwnd, tcp_current_ssthresh(sk),
94 tp->snd_wnd); 103 tp->snd_wnd, tp->srtt >> 3);
104 tcpw.lastcwnd = tp->snd_cwnd;
95 } 105 }
96 106
97 jprobe_return(); 107 jprobe_return();
98 return 0; 108 return 0;
99} 109}
100 110
101static struct jprobe tcp_send_probe = { 111static struct jprobe tcp_probe = {
102 .kp = { 112 .kp = {
103 .symbol_name = "tcp_sendmsg", 113 .symbol_name = "tcp_rcv_established",
104 }, 114 },
105 .entry = JPROBE_ENTRY(jtcp_sendmsg), 115 .entry = JPROBE_ENTRY(jtcp_rcv_established),
106}; 116};
107 117
108 118
109static int tcpprobe_open(struct inode * inode, struct file * file) 119static int tcpprobe_open(struct inode * inode, struct file * file)
110{ 120{
111 kfifo_reset(tcpw.fifo); 121 kfifo_reset(tcpw.fifo);
112 do_gettimeofday(&tcpw.tstart); 122 tcpw.start = ktime_get();
113 return 0; 123 return 0;
114} 124}
115 125
@@ -162,7 +172,7 @@ static __init int tcpprobe_init(void)
162 if (!proc_net_fops_create(procname, S_IRUSR, &tcpprobe_fops)) 172 if (!proc_net_fops_create(procname, S_IRUSR, &tcpprobe_fops))
163 goto err0; 173 goto err0;
164 174
165 ret = register_jprobe(&tcp_send_probe); 175 ret = register_jprobe(&tcp_probe);
166 if (ret) 176 if (ret)
167 goto err1; 177 goto err1;
168 178
@@ -180,7 +190,7 @@ static __exit void tcpprobe_exit(void)
180{ 190{
181 kfifo_free(tcpw.fifo); 191 kfifo_free(tcpw.fifo);
182 proc_net_remove(procname); 192 proc_net_remove(procname);
183 unregister_jprobe(&tcp_send_probe); 193 unregister_jprobe(&tcp_probe);
184 194
185} 195}
186module_exit(tcpprobe_exit); 196module_exit(tcpprobe_exit);
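The probe now hooks the receive path instead of tcp_sendmsg(), so it can log cwnd as ACKs arrive. One constraint worth spelling out: a jprobe handler is entered with the probed function's own argument registers, so its prototype must match tcp_rcv_established() exactly (the "arguments must match" comment above), and it must leave via jprobe_return(). Skeleton of a conforming handler:

static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
				struct tcphdr *th, unsigned len)
{
	/* read-only snooping of sk/skb state goes here */

	jprobe_return();	/* mandatory: resumes the real function */
	return 0;		/* never reached */
}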
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index a9243cfc1bea..2ca97b20929d 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -233,7 +233,7 @@ static void tcp_probe_timer(struct sock *sk)
233 struct tcp_sock *tp = tcp_sk(sk); 233 struct tcp_sock *tp = tcp_sk(sk);
234 int max_probes; 234 int max_probes;
235 235
236 if (tp->packets_out || !sk->sk_send_head) { 236 if (tp->packets_out || !tcp_send_head(sk)) {
237 icsk->icsk_probes_out = 0; 237 icsk->icsk_probes_out = 0;
238 return; 238 return;
239 } 239 }
@@ -284,7 +284,7 @@ static void tcp_retransmit_timer(struct sock *sk)
284 if (!tp->packets_out) 284 if (!tp->packets_out)
285 goto out; 285 goto out;
286 286
287 BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue)); 287 BUG_TRAP(!tcp_write_queue_empty(sk));
288 288
289 if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && 289 if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
290 !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { 290 !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
@@ -306,7 +306,7 @@ static void tcp_retransmit_timer(struct sock *sk)
306 goto out; 306 goto out;
307 } 307 }
308 tcp_enter_loss(sk, 0); 308 tcp_enter_loss(sk, 0);
309 tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); 309 tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
310 __sk_dst_reset(sk); 310 __sk_dst_reset(sk);
311 goto out_reset_timer; 311 goto out_reset_timer;
312 } 312 }
@@ -341,7 +341,7 @@ static void tcp_retransmit_timer(struct sock *sk)
341 tcp_enter_loss(sk, 0); 341 tcp_enter_loss(sk, 0);
342 } 342 }
343 343
344 if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) { 344 if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
345 /* Retransmission failed because of local congestion, 345 /* Retransmission failed because of local congestion,
346 * do not backoff. 346 * do not backoff.
347 */ 347 */
@@ -482,7 +482,7 @@ static void tcp_keepalive_timer (unsigned long data)
482 elapsed = keepalive_time_when(tp); 482 elapsed = keepalive_time_when(tp);
483 483
484 /* It is alive without keepalive 8) */ 484 /* It is alive without keepalive 8) */
485 if (tp->packets_out || sk->sk_send_head) 485 if (tp->packets_out || tcp_send_head(sk))
486 goto resched; 486 goto resched;
487 487
488 elapsed = tcp_time_stamp - tp->rcv_tstamp; 488 elapsed = tcp_time_stamp - tp->rcv_tstamp;
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 5c484dceb967..73e19cf7df21 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -38,6 +38,8 @@
38 38
39#include <net/tcp.h> 39#include <net/tcp.h>
40 40
41#include "tcp_vegas.h"
42
41/* Default values of the Vegas variables, in fixed-point representation 43/* Default values of the Vegas variables, in fixed-point representation
42 * with V_PARAM_SHIFT bits to the right of the binary point. 44 * with V_PARAM_SHIFT bits to the right of the binary point.
43 */ 45 */
@@ -54,17 +56,6 @@ module_param(gamma, int, 0644);
54MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)"); 56MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
55 57
56 58
57/* Vegas variables */
58struct vegas {
59 u32 beg_snd_nxt; /* right edge during last RTT */
60 u32 beg_snd_una; /* left edge during last RTT */
61 u32 beg_snd_cwnd; /* saves the size of the cwnd */
62 u8 doing_vegas_now;/* if true, do vegas for this RTT */
63 u16 cntRTT; /* # of RTTs measured within last RTT */
64 u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
65 u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
66};
67
68/* There are several situations when we must "re-start" Vegas: 59/* There are several situations when we must "re-start" Vegas:
69 * 60 *
70 * o when a connection is established 61 * o when a connection is established
@@ -81,7 +72,7 @@ struct vegas {
81 * Instead we must wait until the completion of an RTT during 72 * Instead we must wait until the completion of an RTT during
82 * which we actually receive ACKs. 73 * which we actually receive ACKs.
83 */ 74 */
84static inline void vegas_enable(struct sock *sk) 75static void vegas_enable(struct sock *sk)
85{ 76{
86 const struct tcp_sock *tp = tcp_sk(sk); 77 const struct tcp_sock *tp = tcp_sk(sk);
87 struct vegas *vegas = inet_csk_ca(sk); 78 struct vegas *vegas = inet_csk_ca(sk);
@@ -104,13 +95,14 @@ static inline void vegas_disable(struct sock *sk)
104 vegas->doing_vegas_now = 0; 95 vegas->doing_vegas_now = 0;
105} 96}
106 97
107static void tcp_vegas_init(struct sock *sk) 98void tcp_vegas_init(struct sock *sk)
108{ 99{
109 struct vegas *vegas = inet_csk_ca(sk); 100 struct vegas *vegas = inet_csk_ca(sk);
110 101
111 vegas->baseRTT = 0x7fffffff; 102 vegas->baseRTT = 0x7fffffff;
112 vegas_enable(sk); 103 vegas_enable(sk);
113} 104}
105EXPORT_SYMBOL_GPL(tcp_vegas_init);
114 106
115/* Do RTT sampling needed for Vegas. 107/* Do RTT sampling needed for Vegas.
116 * Basically we: 108 * Basically we:
@@ -120,10 +112,13 @@ static void tcp_vegas_init(struct sock *sk)
120 * o min-filter RTT samples from a much longer window (forever for now) 112 * o min-filter RTT samples from a much longer window (forever for now)
121 * to find the propagation delay (baseRTT) 113 * to find the propagation delay (baseRTT)
122 */ 114 */
123static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt) 115void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, ktime_t last)
124{ 116{
125 struct vegas *vegas = inet_csk_ca(sk); 117 struct vegas *vegas = inet_csk_ca(sk);
126 u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ 118 u32 vrtt;
119
120 /* Never allow zero rtt or baseRTT */
121 vrtt = ktime_to_us(net_timedelta(last)) + 1;
127 122
128 /* Filter to find propagation delay: */ 123 /* Filter to find propagation delay: */
129 if (vrtt < vegas->baseRTT) 124 if (vrtt < vegas->baseRTT)
@@ -135,8 +130,9 @@ static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt)
135 vegas->minRTT = min(vegas->minRTT, vrtt); 130 vegas->minRTT = min(vegas->minRTT, vrtt);
136 vegas->cntRTT++; 131 vegas->cntRTT++;
137} 132}
133EXPORT_SYMBOL_GPL(tcp_vegas_pkts_acked);
138 134
139static void tcp_vegas_state(struct sock *sk, u8 ca_state) 135void tcp_vegas_state(struct sock *sk, u8 ca_state)
140{ 136{
141 137
142 if (ca_state == TCP_CA_Open) 138 if (ca_state == TCP_CA_Open)
@@ -144,6 +140,7 @@ static void tcp_vegas_state(struct sock *sk, u8 ca_state)
144 else 140 else
145 vegas_disable(sk); 141 vegas_disable(sk);
146} 142}
143EXPORT_SYMBOL_GPL(tcp_vegas_state);
147 144
148/* 145/*
149 * If the connection is idle and we are restarting, 146 * If the connection is idle and we are restarting,
@@ -154,12 +151,13 @@ static void tcp_vegas_state(struct sock *sk, u8 ca_state)
154 * packets, _then_ we can make Vegas calculations 151 * packets, _then_ we can make Vegas calculations
155 * again. 152 * again.
156 */ 153 */
157static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) 154void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
158{ 155{
159 if (event == CA_EVENT_CWND_RESTART || 156 if (event == CA_EVENT_CWND_RESTART ||
160 event == CA_EVENT_TX_START) 157 event == CA_EVENT_TX_START)
161 tcp_vegas_init(sk); 158 tcp_vegas_init(sk);
162} 159}
160EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event);
163 161
164static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, 162static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
165 u32 seq_rtt, u32 in_flight, int flag) 163 u32 seq_rtt, u32 in_flight, int flag)
@@ -336,30 +334,29 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
336} 334}
337 335
338/* Extract info for Tcp socket info provided via netlink. */ 336/* Extract info for Tcp socket info provided via netlink. */
339static void tcp_vegas_get_info(struct sock *sk, u32 ext, 337void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
340 struct sk_buff *skb)
341{ 338{
342 const struct vegas *ca = inet_csk_ca(sk); 339 const struct vegas *ca = inet_csk_ca(sk);
343 if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { 340 if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
344 struct tcpvegas_info *info; 341 struct tcpvegas_info info = {
345 342 .tcpv_enabled = ca->doing_vegas_now,
346 info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO, 343 .tcpv_rttcnt = ca->cntRTT,
347 sizeof(*info))); 344 .tcpv_rtt = ca->baseRTT,
348 345 .tcpv_minrtt = ca->minRTT,
349 info->tcpv_enabled = ca->doing_vegas_now; 346 };
350 info->tcpv_rttcnt = ca->cntRTT; 347
351 info->tcpv_rtt = ca->baseRTT; 348 nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
352 info->tcpv_minrtt = ca->minRTT;
353 rtattr_failure: ;
354 } 349 }
355} 350}
351EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
356 352
357static struct tcp_congestion_ops tcp_vegas = { 353static struct tcp_congestion_ops tcp_vegas = {
354 .flags = TCP_CONG_RTT_STAMP,
358 .init = tcp_vegas_init, 355 .init = tcp_vegas_init,
359 .ssthresh = tcp_reno_ssthresh, 356 .ssthresh = tcp_reno_ssthresh,
360 .cong_avoid = tcp_vegas_cong_avoid, 357 .cong_avoid = tcp_vegas_cong_avoid,
361 .min_cwnd = tcp_reno_min_cwnd, 358 .min_cwnd = tcp_reno_min_cwnd,
362 .rtt_sample = tcp_vegas_rtt_calc, 359 .pkts_acked = tcp_vegas_pkts_acked,
363 .set_state = tcp_vegas_state, 360 .set_state = tcp_vegas_state,
364 .cwnd_event = tcp_vegas_cwnd_event, 361 .cwnd_event = tcp_vegas_cwnd_event,
365 .get_info = tcp_vegas_get_info, 362 .get_info = tcp_vegas_get_info,
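get_info() switches from the __RTA_PUT pattern (reserve space, write through the returned pointer, catch overflow with an rtattr_failure label) to filling a struct on the stack and emitting it with one nla_put() call, which reports failure through its return value instead of a goto. The same shape in isolation, with a hypothetical function name:

#include <linux/inet_diag.h>
#include <net/netlink.h>

#include "tcp_vegas.h"	/* struct vegas, added below */

static void vegas_info_sketch(struct sk_buff *skb, const struct vegas *ca)
{
	struct tcpvegas_info info = {
		.tcpv_enabled	= ca->doing_vegas_now,
		.tcpv_rttcnt	= ca->cntRTT,
		.tcpv_rtt	= ca->baseRTT,
		.tcpv_minrtt	= ca->minRTT,
	};

	/* returns -EMSGSIZE on overflow; no error label needed */
	nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
}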
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
new file mode 100644
index 000000000000..502fa8183634
--- /dev/null
+++ b/net/ipv4/tcp_vegas.h
@@ -0,0 +1,24 @@
1/*
2 * TCP Vegas congestion control interface
3 */
4#ifndef __TCP_VEGAS_H
5#define __TCP_VEGAS_H 1
6
7/* Vegas variables */
8struct vegas {
9 u32 beg_snd_nxt; /* right edge during last RTT */
10 u32 beg_snd_una; /* left edge during last RTT */
11 u32 beg_snd_cwnd; /* saves the size of the cwnd */
12 u8 doing_vegas_now;/* if true, do vegas for this RTT */
13 u16 cntRTT; /* # of RTTs measured within last RTT */
14 u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
15 u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
16};
17
18extern void tcp_vegas_init(struct sock *sk);
19extern void tcp_vegas_state(struct sock *sk, u8 ca_state);
20extern void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, ktime_t last);
21extern void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
22extern void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb);
23
24#endif /* __TCP_VEGAS_H */
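Hoisting struct vegas into a header and exporting the hooks is what lets another congestion module reuse Vegas as a building block: the consumer embeds struct vegas as the first member of its own private state, so the single inet_csk_ca(sk) scratch area is valid under either view (the combined struct must still fit that fixed-size area). tcp_yeah.c below does exactly this; a stripped-down sketch of the pattern, with hypothetical yeah_like names:

#include <net/tcp.h>

#include "tcp_vegas.h"

struct yeah_like {
	struct vegas vegas;	/* must be first: inet_csk_ca(sk) returns
				 * one buffer, read as either type */
	u32 private_state;
};

static void yeah_like_init(struct sock *sk)
{
	struct yeah_like *ca = inet_csk_ca(sk);

	tcp_vegas_init(sk);	/* set up the embedded vegas member */
	ca->private_state = 0;
}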
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index ce57bf302f6c..9edb340f2f95 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -69,10 +69,13 @@ static void tcp_veno_init(struct sock *sk)
69} 69}
70 70
71/* Do rtt sampling needed for Veno. */ 71/* Do rtt sampling needed for Veno. */
72static void tcp_veno_rtt_calc(struct sock *sk, u32 usrtt) 72static void tcp_veno_pkts_acked(struct sock *sk, u32 cnt, ktime_t last)
73{ 73{
74 struct veno *veno = inet_csk_ca(sk); 74 struct veno *veno = inet_csk_ca(sk);
75 u32 vrtt = usrtt + 1; /* Never allow zero rtt or basertt */ 75 u32 vrtt;
76
77 /* Never allow zero rtt or baseRTT */
78 vrtt = ktime_to_us(net_timedelta(last)) + 1;
76 79
77 /* Filter to find propagation delay: */ 80 /* Filter to find propagation delay: */
78 if (vrtt < veno->basertt) 81 if (vrtt < veno->basertt)
@@ -199,10 +202,11 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
199} 202}
200 203
201static struct tcp_congestion_ops tcp_veno = { 204static struct tcp_congestion_ops tcp_veno = {
205 .flags = TCP_CONG_RTT_STAMP,
202 .init = tcp_veno_init, 206 .init = tcp_veno_init,
203 .ssthresh = tcp_veno_ssthresh, 207 .ssthresh = tcp_veno_ssthresh,
204 .cong_avoid = tcp_veno_cong_avoid, 208 .cong_avoid = tcp_veno_cong_avoid,
205 .rtt_sample = tcp_veno_rtt_calc, 209 .pkts_acked = tcp_veno_pkts_acked,
206 .set_state = tcp_veno_state, 210 .set_state = tcp_veno_state,
207 .cwnd_event = tcp_veno_cwnd_event, 211 .cwnd_event = tcp_veno_cwnd_event,
208 212
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 4e1b61032a9c..e61e09dd513e 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -100,7 +100,7 @@ static void westwood_filter(struct westwood *w, u32 delta)
100 * Called after processing group of packets. 100 * Called after processing group of packets.
101 * but all westwood needs is the last sample of srtt. 101 * but all westwood needs is the last sample of srtt.
102 */ 102 */
103static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt) 103static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt, ktime_t last)
104{ 104{
105 struct westwood *w = inet_csk_ca(sk); 105 struct westwood *w = inet_csk_ca(sk);
106 if (cnt > 0) 106 if (cnt > 0)
@@ -226,7 +226,7 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
226 struct tcp_sock *tp = tcp_sk(sk); 226 struct tcp_sock *tp = tcp_sk(sk);
227 struct westwood *w = inet_csk_ca(sk); 227 struct westwood *w = inet_csk_ca(sk);
228 228
229 switch(event) { 229 switch (event) {
230 case CA_EVENT_FAST_ACK: 230 case CA_EVENT_FAST_ACK:
231 westwood_fast_bw(sk); 231 westwood_fast_bw(sk);
232 break; 232 break;
@@ -260,16 +260,13 @@ static void tcp_westwood_info(struct sock *sk, u32 ext,
260{ 260{
261 const struct westwood *ca = inet_csk_ca(sk); 261 const struct westwood *ca = inet_csk_ca(sk);
262 if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { 262 if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
263 struct rtattr *rta; 263 struct tcpvegas_info info = {
264 struct tcpvegas_info *info; 264 .tcpv_enabled = 1,
265 265 .tcpv_rtt = jiffies_to_usecs(ca->rtt),
266 rta = __RTA_PUT(skb, INET_DIAG_VEGASINFO, sizeof(*info)); 266 .tcpv_minrtt = jiffies_to_usecs(ca->rtt_min),
267 info = RTA_DATA(rta); 267 };
268 info->tcpv_enabled = 1; 268
269 info->tcpv_rttcnt = 0; 269 nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
270 info->tcpv_rtt = jiffies_to_usecs(ca->rtt);
271 info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min);
272 rtattr_failure: ;
273 } 270 }
274} 271}
275 272
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
new file mode 100644
index 000000000000..545ed237ab53
--- /dev/null
+++ b/net/ipv4/tcp_yeah.c
@@ -0,0 +1,268 @@
1/*
2 *
3 * YeAH TCP
4 *
5 * For further details look at:
6 * http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
7 *
8 */
9#include <linux/mm.h>
10#include <linux/module.h>
11#include <linux/skbuff.h>
12#include <linux/inet_diag.h>
13
14#include <net/tcp.h>
15
16#include "tcp_vegas.h"
17
18#define TCP_YEAH_ALPHA 80 //lin number of packets queued at the bottleneck
19#define TCP_YEAH_GAMMA 1 //lin fraction of queue to be removed per rtt
20#define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss
21#define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion
22#define TCP_YEAH_PHY 8 //lin maximum delta from base
 23#define TCP_YEAH_RHO 16 //lin minimum number of consecutive rtt to consider competition on loss
 24#define TCP_YEAH_ZETA 50 //lin minimum number of state switches to reset reno_count
25
26#define TCP_SCALABLE_AI_CNT 100U
27
28/* YeAH variables */
29struct yeah {
30 struct vegas vegas; /* must be first */
31
32 /* YeAH */
33 u32 lastQ;
34 u32 doing_reno_now;
35
36 u32 reno_count;
37 u32 fast_count;
38
39 u32 pkts_acked;
40};
41
42static void tcp_yeah_init(struct sock *sk)
43{
44 struct tcp_sock *tp = tcp_sk(sk);
45 struct yeah *yeah = inet_csk_ca(sk);
46
47 tcp_vegas_init(sk);
48
49 yeah->doing_reno_now = 0;
50 yeah->lastQ = 0;
51
52 yeah->reno_count = 2;
53
54 /* Ensure the MD arithmetic works. This is somewhat pedantic,
55 * since I don't think we will see a cwnd this large. :) */
56 tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
57
58}
59
60
61static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, ktime_t last)
62{
63 const struct inet_connection_sock *icsk = inet_csk(sk);
64 struct yeah *yeah = inet_csk_ca(sk);
65
66 if (icsk->icsk_ca_state == TCP_CA_Open)
67 yeah->pkts_acked = pkts_acked;
68
69 tcp_vegas_pkts_acked(sk, pkts_acked, last);
70}
71
72static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack,
73 u32 seq_rtt, u32 in_flight, int flag)
74{
75 struct tcp_sock *tp = tcp_sk(sk);
76 struct yeah *yeah = inet_csk_ca(sk);
77
78 if (!tcp_is_cwnd_limited(sk, in_flight))
79 return;
80
81 if (tp->snd_cwnd <= tp->snd_ssthresh)
82 tcp_slow_start(tp);
83
84 else if (!yeah->doing_reno_now) {
85 /* Scalable */
86
87 tp->snd_cwnd_cnt+=yeah->pkts_acked;
88 if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
89 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
90 tp->snd_cwnd++;
91 tp->snd_cwnd_cnt = 0;
92 }
93
94 yeah->pkts_acked = 1;
95
96 } else {
97 /* Reno */
98
99 if (tp->snd_cwnd_cnt < tp->snd_cwnd)
100 tp->snd_cwnd_cnt++;
101
102 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
103 tp->snd_cwnd++;
104 tp->snd_cwnd_cnt = 0;
105 }
106 }
107
108 /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt.
109 *
110 * These are so named because they represent the approximate values
111 * of snd_una and snd_nxt at the beginning of the current RTT. More
112 * precisely, they represent the amount of data sent during the RTT.
113 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
114 * we will calculate that (v_beg_snd_nxt - v_vegas.beg_snd_una) outstanding
115 * bytes of data have been ACKed during the course of the RTT, giving
116 * an "actual" rate of:
117 *
118 * (v_beg_snd_nxt - v_vegas.beg_snd_una) / (rtt duration)
119 *
120 * Unfortunately, v_vegas.beg_snd_una is not exactly equal to snd_una,
121 * because delayed ACKs can cover more than one segment, so they
 122 * don't line up nicely with the boundaries of RTTs.
123 *
124 * Another unfortunate fact of life is that delayed ACKs delay the
125 * advance of the left edge of our send window, so that the number
126 * of bytes we send in an RTT is often less than our cwnd will allow.
127 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
128 */
129
130 if (after(ack, yeah->vegas.beg_snd_nxt)) {
131
132 /* We do the Vegas calculations only if we got enough RTT
133 * samples that we can be reasonably sure that we got
134 * at least one RTT sample that wasn't from a delayed ACK.
135 * If we only had 2 samples total,
136 * then that means we're getting only 1 ACK per RTT, which
137 * means they're almost certainly delayed ACKs.
138 * If we have 3 samples, we should be OK.
139 */
140
141 if (yeah->vegas.cntRTT > 2) {
142 u32 rtt, queue;
143 u64 bw;
144
145 /* We have enough RTT samples, so, using the Vegas
146 * algorithm, we determine if we should increase or
147 * decrease cwnd, and by how much.
148 */
149
150 /* Pluck out the RTT we are using for the Vegas
151 * calculations. This is the min RTT seen during the
152 * last RTT. Taking the min filters out the effects
153 * of delayed ACKs, at the cost of noticing congestion
154 * a bit later.
155 */
156 rtt = yeah->vegas.minRTT;
157
158 /* Compute excess number of packets above bandwidth
159 * Avoid doing full 64 bit divide.
160 */
161 bw = tp->snd_cwnd;
162 bw *= rtt - yeah->vegas.baseRTT;
163 do_div(bw, rtt);
164 queue = bw;
165
166 if (queue > TCP_YEAH_ALPHA ||
167 rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) {
168 if (queue > TCP_YEAH_ALPHA
169 && tp->snd_cwnd > yeah->reno_count) {
170 u32 reduction = min(queue / TCP_YEAH_GAMMA ,
171 tp->snd_cwnd >> TCP_YEAH_EPSILON);
172
173 tp->snd_cwnd -= reduction;
174
175 tp->snd_cwnd = max(tp->snd_cwnd,
176 yeah->reno_count);
177
178 tp->snd_ssthresh = tp->snd_cwnd;
179 }
180
181 if (yeah->reno_count <= 2)
182 yeah->reno_count = max(tp->snd_cwnd>>1, 2U);
183 else
184 yeah->reno_count++;
185
186 yeah->doing_reno_now = min(yeah->doing_reno_now + 1,
187 0xffffffU);
188 } else {
189 yeah->fast_count++;
190
191 if (yeah->fast_count > TCP_YEAH_ZETA) {
192 yeah->reno_count = 2;
193 yeah->fast_count = 0;
194 }
195
196 yeah->doing_reno_now = 0;
197 }
198
199 yeah->lastQ = queue;
200
201 }
202
203 /* Save the extent of the current window so we can use this
204 * at the end of the next RTT.
205 */
206 yeah->vegas.beg_snd_una = yeah->vegas.beg_snd_nxt;
207 yeah->vegas.beg_snd_nxt = tp->snd_nxt;
208 yeah->vegas.beg_snd_cwnd = tp->snd_cwnd;
209
210 /* Wipe the slate clean for the next RTT. */
211 yeah->vegas.cntRTT = 0;
212 yeah->vegas.minRTT = 0x7fffffff;
213 }
214}
215
216static u32 tcp_yeah_ssthresh(struct sock *sk) {
217 const struct tcp_sock *tp = tcp_sk(sk);
218 struct yeah *yeah = inet_csk_ca(sk);
219 u32 reduction;
220
221 if (yeah->doing_reno_now < TCP_YEAH_RHO) {
222 reduction = yeah->lastQ;
223
224 reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) );
225
226 reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA);
227 } else
228 reduction = max(tp->snd_cwnd>>1,2U);
229
230 yeah->fast_count = 0;
231 yeah->reno_count = max(yeah->reno_count>>1, 2U);
232
233 return tp->snd_cwnd - reduction;
234}
235
236static struct tcp_congestion_ops tcp_yeah = {
237 .flags = TCP_CONG_RTT_STAMP,
238 .init = tcp_yeah_init,
239 .ssthresh = tcp_yeah_ssthresh,
240 .cong_avoid = tcp_yeah_cong_avoid,
241 .min_cwnd = tcp_reno_min_cwnd,
242 .set_state = tcp_vegas_state,
243 .cwnd_event = tcp_vegas_cwnd_event,
244 .get_info = tcp_vegas_get_info,
245 .pkts_acked = tcp_yeah_pkts_acked,
246
247 .owner = THIS_MODULE,
248 .name = "yeah",
249};
250
251static int __init tcp_yeah_register(void)
252{
253 BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE);
254 tcp_register_congestion_control(&tcp_yeah);
255 return 0;
256}
257
258static void __exit tcp_yeah_unregister(void)
259{
260 tcp_unregister_congestion_control(&tcp_yeah);
261}
262
263module_init(tcp_yeah_register);
264module_exit(tcp_yeah_unregister);
265
266MODULE_AUTHOR("Angelo P. Castellani");
267MODULE_LICENSE("GPL");
268MODULE_DESCRIPTION("YeAH TCP");
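For intuition about the decongestion test in tcp_yeah_cong_avoid() above: the backlog estimate is plain Vegas arithmetic, cwnd scaled by the fraction of the RTT that is queueing delay. A standalone sketch with made-up numbers (the helper name is mine, not the patch's):

	#include <linux/types.h>
	#include <asm/div64.h>

	/* With cwnd = 100 packets, baseRTT = 100 ms and this round's
	 * minimum RTT = 110 ms:
	 *     queue = 100 * (110 - 100) / 110 ~= 9 packets,
	 * which is under TCP_YEAH_ALPHA (80), and the 10 ms of extra
	 * delay is under baseRTT / TCP_YEAH_PHY = 12.5 ms, so the
	 * sender stays in the fast (Scalable-style) mode; crossing
	 * either bound takes the precautionary branch that drains
	 * queue / TCP_YEAH_GAMMA packets from cwnd.
	 */
	static u32 yeah_queue_estimate(u32 cwnd, u32 rtt, u32 base_rtt)
	{
		u64 bw = cwnd;

		bw *= rtt - base_rtt;	/* packets parked in the queue */
		do_div(bw, rtt);	/* avoid a full 64-bit divide  */
		return (u32)bw;
	}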
diff --git a/net/ipv4/tcp_yeah.h b/net/ipv4/tcp_yeah.h
new file mode 100644
index 000000000000..ed3b7198f23c
--- /dev/null
+++ b/net/ipv4/tcp_yeah.h
@@ -0,0 +1,7 @@
1#include <linux/mm.h>
2#include <linux/module.h>
3#include <linux/skbuff.h>
4#include <linux/inet_diag.h>
5#include <asm/div64.h>
6
7#include <net/tcp.h>
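tcp_yeah.h mostly just pulls in what the algorithm needs; asm/div64.h is there for do_div(), whose semantics are easy to misremember: the macro divides a u64 lvalue in place and evaluates to the remainder. A quick reminder with hypothetical values:

	u64 n = 1000;
	u32 rem;

	rem = do_div(n, 7);	/* n is now 142 (the quotient), */
				/* rem is 6 (the remainder)     */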
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index fc620a7c1db4..cec0f2cc49b7 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -175,7 +175,8 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum,
175 ; 175 ;
176 } 176 }
177 result = best; 177 result = best;
178 for(i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++, result += UDP_HTABLE_SIZE) { 178 for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE;
179 i++, result += UDP_HTABLE_SIZE) {
179 if (result > sysctl_local_port_range[1]) 180 if (result > sysctl_local_port_range[1])
180 result = sysctl_local_port_range[0] 181 result = sysctl_local_port_range[0]
181 + ((result - sysctl_local_port_range[0]) & 182 + ((result - sysctl_local_port_range[0]) &
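Context for the loop reformatted above: candidate ports advance in steps of UDP_HTABLE_SIZE so that every probe hashes into the same chain, letting one walk of that chain vet all of them; when the rover overruns the top of the local port range it is folded back to the bottom (the continuation of that expression is cut off by the hunk). A sketch of why the stride works (the helper name and the 128-bucket table size are assumptions, not patch code):

	/* With UDP_HTABLE_SIZE = 128, ports 40000, 40128, 40256, ...
	 * all land in the same bucket, so scanning one hash chain
	 * is enough to judge the whole probe sequence. */
	static inline unsigned int udp_bucket(unsigned short port)
	{
		return port & (UDP_HTABLE_SIZE - 1);
	}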
@@ -212,13 +213,13 @@ fail:
212 return error; 213 return error;
213} 214}
214 215
215__inline__ int udp_get_port(struct sock *sk, unsigned short snum, 216int udp_get_port(struct sock *sk, unsigned short snum,
216 int (*scmp)(const struct sock *, const struct sock *)) 217 int (*scmp)(const struct sock *, const struct sock *))
217{ 218{
218 return __udp_lib_get_port(sk, snum, udp_hash, &udp_port_rover, scmp); 219 return __udp_lib_get_port(sk, snum, udp_hash, &udp_port_rover, scmp);
219} 220}
220 221
221inline int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) 222int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
222{ 223{
223 struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); 224 struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
224 225
@@ -270,10 +271,10 @@ static struct sock *__udp4_lib_lookup(__be32 saddr, __be16 sport,
270 continue; 271 continue;
271 score+=2; 272 score+=2;
272 } 273 }
273 if(score == 9) { 274 if (score == 9) {
274 result = sk; 275 result = sk;
275 break; 276 break;
276 } else if(score > badness) { 277 } else if (score > badness) {
277 result = sk; 278 result = sk;
278 badness = score; 279 badness = score;
279 } 280 }
@@ -329,8 +330,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct hlist_head udptable[])
329 struct inet_sock *inet; 330 struct inet_sock *inet;
330 struct iphdr *iph = (struct iphdr*)skb->data; 331 struct iphdr *iph = (struct iphdr*)skb->data;
331 struct udphdr *uh = (struct udphdr*)(skb->data+(iph->ihl<<2)); 332 struct udphdr *uh = (struct udphdr*)(skb->data+(iph->ihl<<2));
332 int type = skb->h.icmph->type; 333 const int type = icmp_hdr(skb)->type;
333 int code = skb->h.icmph->code; 334 const int code = icmp_hdr(skb)->code;
334 struct sock *sk; 335 struct sock *sk;
335 int harderr; 336 int harderr;
336 int err; 337 int err;
@@ -390,7 +391,7 @@ out:
390 sock_put(sk); 391 sock_put(sk);
391} 392}
392 393
393__inline__ void udp_err(struct sk_buff *skb, u32 info) 394void udp_err(struct sk_buff *skb, u32 info)
394{ 395{
395 return __udp4_lib_err(skb, info, udp_hash); 396 return __udp4_lib_err(skb, info, udp_hash);
396} 397}
@@ -419,13 +420,14 @@ static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
419 __be32 src, __be32 dst, int len ) 420 __be32 src, __be32 dst, int len )
420{ 421{
421 unsigned int offset; 422 unsigned int offset;
422 struct udphdr *uh = skb->h.uh; 423 struct udphdr *uh = udp_hdr(skb);
423 __wsum csum = 0; 424 __wsum csum = 0;
424 425
425 if (skb_queue_len(&sk->sk_write_queue) == 1) { 426 if (skb_queue_len(&sk->sk_write_queue) == 1) {
426 /* 427 /*
427 * Only one fragment on the socket. 428 * Only one fragment on the socket.
428 */ 429 */
430 skb->csum_start = skb_transport_header(skb) - skb->head;
429 skb->csum_offset = offsetof(struct udphdr, check); 431 skb->csum_offset = offsetof(struct udphdr, check);
430 uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0); 432 uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0);
431 } else { 433 } else {
@@ -434,7 +436,7 @@ static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
434 * fragments on the socket so that all csums of sk_buffs 436 * fragments on the socket so that all csums of sk_buffs
435 * should be together 437 * should be together
436 */ 438 */
437 offset = skb->h.raw - skb->data; 439 offset = skb_transport_offset(skb);
438 skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); 440 skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
439 441
440 skb->ip_summed = CHECKSUM_NONE; 442 skb->ip_summed = CHECKSUM_NONE;
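The hunk above belongs to the csum_start/csum_offset conversion: for a single-fragment datagram the checksum is left to the NIC, with uh->check pre-seeded with the inverted pseudo-header sum so the device only has to add the payload. The offload contract, condensed (assuming the skb has been marked CHECKSUM_PARTIAL by the caller):

	/* Device contract (sketch): sum from csum_start to the end
	 * of the packet, fold, and store the result at
	 * csum_start + csum_offset. */
	skb->csum_start  = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct udphdr, check);
	uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0);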
@@ -469,7 +471,7 @@ static int udp_push_pending_frames(struct sock *sk)
469 /* 471 /*
470 * Create a UDP header 472 * Create a UDP header
471 */ 473 */
472 uh = skb->h.uh; 474 uh = udp_hdr(skb);
473 uh->source = fl->fl_ip_sport; 475 uh->source = fl->fl_ip_sport;
474 uh->dest = fl->fl_ip_dport; 476 uh->dest = fl->fl_ip_dport;
475 uh->len = htons(up->len); 477 uh->len = htons(up->len);
@@ -765,38 +767,38 @@ out:
765 767
766int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) 768int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
767{ 769{
768 switch(cmd) 770 switch (cmd) {
771 case SIOCOUTQ:
769 { 772 {
770 case SIOCOUTQ: 773 int amount = atomic_read(&sk->sk_wmem_alloc);
771 { 774 return put_user(amount, (int __user *)arg);
772 int amount = atomic_read(&sk->sk_wmem_alloc); 775 }
773 return put_user(amount, (int __user *)arg);
774 }
775 776
776 case SIOCINQ: 777 case SIOCINQ:
777 { 778 {
778 struct sk_buff *skb; 779 struct sk_buff *skb;
779 unsigned long amount; 780 unsigned long amount;
780 781
781 amount = 0; 782 amount = 0;
782 spin_lock_bh(&sk->sk_receive_queue.lock); 783 spin_lock_bh(&sk->sk_receive_queue.lock);
783 skb = skb_peek(&sk->sk_receive_queue); 784 skb = skb_peek(&sk->sk_receive_queue);
784 if (skb != NULL) { 785 if (skb != NULL) {
785 /* 786 /*
786 * We will only return the amount 787 * We will only return the amount
787 * of this packet since that is all 788 * of this packet since that is all
788 * that will be read. 789 * that will be read.
789 */ 790 */
790 amount = skb->len - sizeof(struct udphdr); 791 amount = skb->len - sizeof(struct udphdr);
791 }
792 spin_unlock_bh(&sk->sk_receive_queue.lock);
793 return put_user(amount, (int __user *)arg);
794 } 792 }
793 spin_unlock_bh(&sk->sk_receive_queue.lock);
794 return put_user(amount, (int __user *)arg);
795 }
795 796
796 default: 797 default:
797 return -ENOIOCTLCMD; 798 return -ENOIOCTLCMD;
798 } 799 }
799 return(0); 800
801 return 0;
800} 802}
801 803
802/* 804/*
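Userspace view of the two ioctls reindented above (illustrative; udp_fd is assumed to be an open UDP socket): SIOCINQ reports the payload size of only the next queued datagram, as the in-code comment notes, and SIOCOUTQ the bytes still held in the send queue.

	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/sockios.h>

	int pending, unsent;

	if (ioctl(udp_fd, SIOCINQ, &pending) == 0)
		printf("next datagram payload: %d bytes\n", pending);
	if (ioctl(udp_fd, SIOCOUTQ, &unsent) == 0)
		printf("unsent: %d bytes\n", unsent);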
@@ -810,7 +812,9 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
810 struct inet_sock *inet = inet_sk(sk); 812 struct inet_sock *inet = inet_sk(sk);
811 struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; 813 struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
812 struct sk_buff *skb; 814 struct sk_buff *skb;
813 int copied, err, copy_only, is_udplite = IS_UDPLITE(sk); 815 unsigned int ulen, copied;
816 int err;
817 int is_udplite = IS_UDPLITE(sk);
814 818
815 /* 819 /*
816 * Check any passed addresses 820 * Check any passed addresses
@@ -826,28 +830,25 @@ try_again:
826 if (!skb) 830 if (!skb)
827 goto out; 831 goto out;
828 832
829 copied = skb->len - sizeof(struct udphdr); 833 ulen = skb->len - sizeof(struct udphdr);
830 if (copied > len) { 834 copied = len;
831 copied = len; 835 if (copied > ulen)
836 copied = ulen;
837 else if (copied < ulen)
832 msg->msg_flags |= MSG_TRUNC; 838 msg->msg_flags |= MSG_TRUNC;
833 }
834 839
835 /* 840 /*
836 * Decide whether to checksum and/or copy data. 841 * If checksum is needed at all, try to do it while copying the
837 * 842 * data. If the data is truncated, or if we only want a partial
838 * UDP: checksum may have been computed in HW, 843 * coverage checksum (UDP-Lite), do it before the copy.
839 * (re-)compute it if message is truncated.
840 * UDP-Lite: always needs to checksum, no HW support.
841 */ 844 */
842 copy_only = (skb->ip_summed==CHECKSUM_UNNECESSARY);
843 845
844 if (is_udplite || (!copy_only && msg->msg_flags&MSG_TRUNC)) { 846 if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
845 if (__udp_lib_checksum_complete(skb)) 847 if (udp_lib_checksum_complete(skb))
846 goto csum_copy_err; 848 goto csum_copy_err;
847 copy_only = 1;
848 } 849 }
849 850
850 if (copy_only) 851 if (skb_csum_unnecessary(skb))
851 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), 852 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
852 msg->msg_iov, copied ); 853 msg->msg_iov, copied );
853 else { 854 else {
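The rewrite above separates the datagram's payload length (ulen) from the amount copied, setting MSG_TRUNC whenever the user buffer is short; a later hunk makes a recv with MSG_TRUNC in its flags report the full length. The userspace consequence, illustrative (fd is assumed to be a bound UDP socket):

	#include <sys/socket.h>

	char buf[16];
	ssize_t n = recv(fd, buf, sizeof(buf), MSG_TRUNC);

	/* On Linux, MSG_TRUNC makes recv() return the datagram's
	 * real payload length, so n > sizeof(buf) means only the
	 * first 16 bytes were delivered. */
	if (n > (ssize_t)sizeof(buf))
		/* the datagram was n bytes long; handle truncation */;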
@@ -866,8 +867,8 @@ try_again:
866 if (sin) 867 if (sin)
867 { 868 {
868 sin->sin_family = AF_INET; 869 sin->sin_family = AF_INET;
869 sin->sin_port = skb->h.uh->source; 870 sin->sin_port = udp_hdr(skb)->source;
870 sin->sin_addr.s_addr = skb->nh.iph->saddr; 871 sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
871 memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 872 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
872 } 873 }
873 if (inet->cmsg_flags) 874 if (inet->cmsg_flags)
@@ -875,7 +876,7 @@ try_again:
875 876
876 err = copied; 877 err = copied;
877 if (flags & MSG_TRUNC) 878 if (flags & MSG_TRUNC)
878 err = skb->len - sizeof(struct udphdr); 879 err = ulen;
879 880
880out_free: 881out_free:
881 skb_free_datagram(sk, skb); 882 skb_free_datagram(sk, skb);
@@ -949,7 +950,7 @@ static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb)
949 return 1; 950 return 1;
950 951
951 /* Now we can get the pointers */ 952 /* Now we can get the pointers */
952 uh = skb->h.uh; 953 uh = udp_hdr(skb);
953 udpdata = (__u8 *)uh + sizeof(struct udphdr); 954 udpdata = (__u8 *)uh + sizeof(struct udphdr);
954 udpdata32 = (__be32 *)udpdata; 955 udpdata32 = (__be32 *)udpdata;
955 956
@@ -959,7 +960,7 @@ static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb)
959 /* Check if this is a keepalive packet. If so, eat it. */ 960 /* Check if this is a keepalive packet. If so, eat it. */
960 if (len == 1 && udpdata[0] == 0xff) { 961 if (len == 1 && udpdata[0] == 0xff) {
961 return 0; 962 return 0;
962 } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0 ) { 963 } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) {
963 /* ESP Packet without Non-ESP header */ 964 /* ESP Packet without Non-ESP header */
964 len = sizeof(struct udphdr); 965 len = sizeof(struct udphdr);
965 } else 966 } else
@@ -990,7 +991,7 @@ static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb)
990 return 0; 991 return 0;
991 992
992 /* Now we can update and verify the packet length... */ 993 /* Now we can update and verify the packet length... */
993 iph = skb->nh.iph; 994 iph = ip_hdr(skb);
994 iphlen = iph->ihl << 2; 995 iphlen = iph->ihl << 2;
995 iph->tot_len = htons(ntohs(iph->tot_len) - len); 996 iph->tot_len = htons(ntohs(iph->tot_len) - len);
996 if (skb->len < iphlen + len) { 997 if (skb->len < iphlen + len) {
@@ -1002,7 +1003,8 @@ static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb)
1002 * transport header to point to ESP. Keep UDP on the stack 1003 * transport header to point to ESP. Keep UDP on the stack
1003 * for later. 1004 * for later.
1004 */ 1005 */
1005 skb->h.raw = skb_pull(skb, len); 1006 __skb_pull(skb, len);
1007 skb_reset_transport_header(skb);
1006 1008
1007 /* modify the protocol (it's ESP!) */ 1009 /* modify the protocol (it's ESP!) */
1008 iph->protocol = IPPROTO_ESP; 1010 iph->protocol = IPPROTO_ESP;
@@ -1095,10 +1097,9 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
1095 } 1097 }
1096 } 1098 }
1097 1099
1098 if (sk->sk_filter && skb->ip_summed != CHECKSUM_UNNECESSARY) { 1100 if (sk->sk_filter) {
1099 if (__udp_lib_checksum_complete(skb)) 1101 if (udp_lib_checksum_complete(skb))
1100 goto drop; 1102 goto drop;
1101 skb->ip_summed = CHECKSUM_UNNECESSARY;
1102 } 1103 }
1103 1104
1104 if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) { 1105 if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) {
@@ -1143,10 +1144,10 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb,
1143 1144
1144 sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr, 1145 sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr,
1145 uh->source, saddr, dif); 1146 uh->source, saddr, dif);
1146 if(sknext) 1147 if (sknext)
1147 skb1 = skb_clone(skb, GFP_ATOMIC); 1148 skb1 = skb_clone(skb, GFP_ATOMIC);
1148 1149
1149 if(skb1) { 1150 if (skb1) {
1150 int ret = udp_queue_rcv_skb(sk, skb1); 1151 int ret = udp_queue_rcv_skb(sk, skb1);
1151 if (ret > 0) 1152 if (ret > 0)
1152 /* we should probably re-process instead 1153 /* we should probably re-process instead
@@ -1154,7 +1155,7 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb,
1154 kfree_skb(skb1); 1155 kfree_skb(skb1);
1155 } 1156 }
1156 sk = sknext; 1157 sk = sknext;
1157 } while(sknext); 1158 } while (sknext);
1158 } else 1159 } else
1159 kfree_skb(skb); 1160 kfree_skb(skb);
1160 read_unlock(&udp_hash_lock); 1161 read_unlock(&udp_hash_lock);
@@ -1166,25 +1167,37 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb,
1166 * Otherwise, csum completion requires checksumming packet body, 1167
1167 * including udp header and folding it to skb->csum. 1168 * including udp header and folding it to skb->csum.
1168 */ 1169 */
1169static inline void udp4_csum_init(struct sk_buff *skb, struct udphdr *uh) 1170static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
1171 int proto)
1170{ 1172{
1173 const struct iphdr *iph;
1174 int err;
1175
1176 UDP_SKB_CB(skb)->partial_cov = 0;
1177 UDP_SKB_CB(skb)->cscov = skb->len;
1178
1179 if (proto == IPPROTO_UDPLITE) {
1180 err = udplite_checksum_init(skb, uh);
1181 if (err)
1182 return err;
1183 }
1184
1185 iph = ip_hdr(skb);
1171 if (uh->check == 0) { 1186 if (uh->check == 0) {
1172 skb->ip_summed = CHECKSUM_UNNECESSARY; 1187 skb->ip_summed = CHECKSUM_UNNECESSARY;
1173 } else if (skb->ip_summed == CHECKSUM_COMPLETE) { 1188 } else if (skb->ip_summed == CHECKSUM_COMPLETE) {
1174 if (!csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr, 1189 if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
1175 skb->len, IPPROTO_UDP, skb->csum )) 1190 proto, skb->csum))
1176 skb->ip_summed = CHECKSUM_UNNECESSARY; 1191 skb->ip_summed = CHECKSUM_UNNECESSARY;
1177 } 1192 }
1178 if (skb->ip_summed != CHECKSUM_UNNECESSARY) 1193 if (!skb_csum_unnecessary(skb))
1179 skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, 1194 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1180 skb->nh.iph->daddr, 1195 skb->len, proto, 0);
1181 skb->len, IPPROTO_UDP, 0);
1182 /* Probably, we should checksum udp header (it should be in cache 1196 /* Probably, we should checksum udp header (it should be in cache
1183 * in any case) and data in tiny packets (< rx copybreak). 1197 * in any case) and data in tiny packets (< rx copybreak).
1184 */ 1198 */
1185 1199
1186 /* UDP = UDP-Lite with a non-partial checksum coverage */ 1200 return 0;
1187 UDP_SKB_CB(skb)->partial_cov = 0;
1188} 1201}
1189 1202
1190/* 1203/*
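The merged udp4_csum_init() above handles both flavours: UDP-Lite first validates its coverage via udplite_checksum_init(); a zero uh->check then means "no checksum" (legal for plain UDP only); CHECKSUM_COMPLETE packets are verified on the spot by folding the pseudo-header into the hardware sum; everything else is merely seeded for deferred verification. The deferred path, compressed:

	/* Sketch: seed with the pseudo-header sum now; when
	 * udp_lib_checksum_complete() later adds the packet body,
	 * the total must fold to zero for a valid datagram. */
	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
				       skb->len, proto, 0);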
@@ -1192,14 +1205,14 @@ static inline void udp4_csum_init(struct sk_buff *skb, struct udphdr *uh)
1192 */ 1205 */
1193 1206
1194int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], 1207int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
1195 int is_udplite) 1208 int proto)
1196{ 1209{
1197 struct sock *sk; 1210 struct sock *sk;
1198 struct udphdr *uh = skb->h.uh; 1211 struct udphdr *uh = udp_hdr(skb);
1199 unsigned short ulen; 1212 unsigned short ulen;
1200 struct rtable *rt = (struct rtable*)skb->dst; 1213 struct rtable *rt = (struct rtable*)skb->dst;
1201 __be32 saddr = skb->nh.iph->saddr; 1214 __be32 saddr = ip_hdr(skb)->saddr;
1202 __be32 daddr = skb->nh.iph->daddr; 1215 __be32 daddr = ip_hdr(skb)->daddr;
1203 1216
1204 /* 1217 /*
1205 * Validate the packet. 1218 * Validate the packet.
@@ -1211,20 +1224,17 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
1211 if (ulen > skb->len) 1224 if (ulen > skb->len)
1212 goto short_packet; 1225 goto short_packet;
1213 1226
1214 if(! is_udplite ) { /* UDP validates ulen. */ 1227 if (proto == IPPROTO_UDP) {
1215 1228 /* UDP validates ulen. */
1216 if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen)) 1229 if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
1217 goto short_packet; 1230 goto short_packet;
1218 uh = skb->h.uh; 1231 uh = udp_hdr(skb);
1219
1220 udp4_csum_init(skb, uh);
1221
1222 } else { /* UDP-Lite validates cscov. */
1223 if (udplite4_csum_init(skb, uh))
1224 goto csum_error;
1225 } 1232 }
1226 1233
1227 if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) 1234 if (udp4_csum_init(skb, uh, proto))
1235 goto csum_error;
1236
1237 if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
1228 return __udp4_lib_mcast_deliver(skb, uh, saddr, daddr, udptable); 1238 return __udp4_lib_mcast_deliver(skb, uh, saddr, daddr, udptable);
1229 1239
1230 sk = __udp4_lib_lookup(saddr, uh->source, daddr, uh->dest, 1240 sk = __udp4_lib_lookup(saddr, uh->source, daddr, uh->dest,
@@ -1250,7 +1260,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
1250 if (udp_lib_checksum_complete(skb)) 1260 if (udp_lib_checksum_complete(skb))
1251 goto csum_error; 1261 goto csum_error;
1252 1262
1253 UDP_INC_STATS_BH(UDP_MIB_NOPORTS, is_udplite); 1263 UDP_INC_STATS_BH(UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
1254 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 1264 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
1255 1265
1256 /* 1266 /*
@@ -1258,11 +1268,11 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
1258 * don't wanna listen. Ignore it. 1268 * don't wanna listen. Ignore it.
1259 */ 1269 */
1260 kfree_skb(skb); 1270 kfree_skb(skb);
1261 return(0); 1271 return 0;
1262 1272
1263short_packet: 1273short_packet:
1264 LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", 1274 LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
1265 is_udplite? "-Lite" : "", 1275 proto == IPPROTO_UDPLITE ? "-Lite" : "",
1266 NIPQUAD(saddr), 1276 NIPQUAD(saddr),
1267 ntohs(uh->source), 1277 ntohs(uh->source),
1268 ulen, 1278 ulen,
@@ -1277,21 +1287,21 @@ csum_error:
1277 * the network is concerned, anyway) as per 4.1.3.4 (MUST). 1287 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
1278 */ 1288 */
1279 LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", 1289 LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
1280 is_udplite? "-Lite" : "", 1290 proto == IPPROTO_UDPLITE ? "-Lite" : "",
1281 NIPQUAD(saddr), 1291 NIPQUAD(saddr),
1282 ntohs(uh->source), 1292 ntohs(uh->source),
1283 NIPQUAD(daddr), 1293 NIPQUAD(daddr),
1284 ntohs(uh->dest), 1294 ntohs(uh->dest),
1285 ulen); 1295 ulen);
1286drop: 1296drop:
1287 UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite); 1297 UDP_INC_STATS_BH(UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
1288 kfree_skb(skb); 1298 kfree_skb(skb);
1289 return(0); 1299 return 0;
1290} 1300}
1291 1301
1292__inline__ int udp_rcv(struct sk_buff *skb) 1302int udp_rcv(struct sk_buff *skb)
1293{ 1303{
1294 return __udp4_lib_rcv(skb, udp_hash, 0); 1304 return __udp4_lib_rcv(skb, udp_hash, IPPROTO_UDP);
1295} 1305}
1296 1306
1297int udp_destroy_sock(struct sock *sk) 1307int udp_destroy_sock(struct sock *sk)
@@ -1313,13 +1323,13 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
1313 int val; 1323 int val;
1314 int err = 0; 1324 int err = 0;
1315 1325
1316 if(optlen<sizeof(int)) 1326 if (optlen<sizeof(int))
1317 return -EINVAL; 1327 return -EINVAL;
1318 1328
1319 if (get_user(val, (int __user *)optval)) 1329 if (get_user(val, (int __user *)optval))
1320 return -EFAULT; 1330 return -EFAULT;
1321 1331
1322 switch(optname) { 1332 switch (optname) {
1323 case UDP_CORK: 1333 case UDP_CORK:
1324 if (val != 0) { 1334 if (val != 0) {
1325 up->corkflag = 1; 1335 up->corkflag = 1;
@@ -1373,7 +1383,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
1373 default: 1383 default:
1374 err = -ENOPROTOOPT; 1384 err = -ENOPROTOOPT;
1375 break; 1385 break;
1376 }; 1386 }
1377 1387
1378 return err; 1388 return err;
1379} 1389}
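Illustrative userspace use of the UDP_CORK option handled in the switch above (fd, part1/part2 and their lengths are assumed): while corked, consecutive sends accumulate into a single datagram that is pushed out when the cork is cleared.

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <linux/udp.h>

	int on = 1, off = 0;

	setsockopt(fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on));
	send(fd, part1, len1, 0);
	send(fd, part2, len2, 0);
	setsockopt(fd, IPPROTO_UDP, UDP_CORK, &off, sizeof(off)); /* flush */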
@@ -1404,15 +1414,15 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
1404 struct udp_sock *up = udp_sk(sk); 1414 struct udp_sock *up = udp_sk(sk);
1405 int val, len; 1415 int val, len;
1406 1416
1407 if(get_user(len,optlen)) 1417 if (get_user(len,optlen))
1408 return -EFAULT; 1418 return -EFAULT;
1409 1419
1410 len = min_t(unsigned int, len, sizeof(int)); 1420 len = min_t(unsigned int, len, sizeof(int));
1411 1421
1412 if(len < 0) 1422 if (len < 0)
1413 return -EINVAL; 1423 return -EINVAL;
1414 1424
1415 switch(optname) { 1425 switch (optname) {
1416 case UDP_CORK: 1426 case UDP_CORK:
1417 val = up->corkflag; 1427 val = up->corkflag;
1418 break; 1428 break;
@@ -1433,11 +1443,11 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
1433 1443
1434 default: 1444 default:
1435 return -ENOPROTOOPT; 1445 return -ENOPROTOOPT;
1436 }; 1446 }
1437 1447
1438 if(put_user(len, optlen)) 1448 if (put_user(len, optlen))
1439 return -EFAULT; 1449 return -EFAULT;
1440 if(copy_to_user(optval, &val,len)) 1450 if (copy_to_user(optval, &val,len))
1441 return -EFAULT; 1451 return -EFAULT;
1442 return 0; 1452 return 0;
1443} 1453}
@@ -1486,15 +1496,11 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
1486 struct sk_buff *skb; 1496 struct sk_buff *skb;
1487 1497
1488 spin_lock_bh(&rcvq->lock); 1498 spin_lock_bh(&rcvq->lock);
1489 while ((skb = skb_peek(rcvq)) != NULL) { 1499 while ((skb = skb_peek(rcvq)) != NULL &&
1490 if (udp_lib_checksum_complete(skb)) { 1500 udp_lib_checksum_complete(skb)) {
1491 UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_lite); 1501 UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_lite);
1492 __skb_unlink(skb, rcvq); 1502 __skb_unlink(skb, rcvq);
1493 kfree_skb(skb); 1503 kfree_skb(skb);
1494 } else {
1495 skb->ip_summed = CHECKSUM_UNNECESSARY;
1496 break;
1497 }
1498 } 1504 }
1499 spin_unlock_bh(&rcvq->lock); 1505 spin_unlock_bh(&rcvq->lock);
1500 1506
@@ -1573,7 +1579,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
1573 struct sock *sk = udp_get_first(seq); 1579 struct sock *sk = udp_get_first(seq);
1574 1580
1575 if (sk) 1581 if (sk)
1576 while(pos && (sk = udp_get_next(seq, sk)) != NULL) 1582 while (pos && (sk = udp_get_next(seq, sk)) != NULL)
1577 --pos; 1583 --pos;
1578 return pos ? NULL : sk; 1584 return pos ? NULL : sk;
1579} 1585}
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index b28fe1edf98b..f34fd686a8f1 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -31,7 +31,7 @@ static int udplite_v4_get_port(struct sock *sk, unsigned short snum)
31 31
32static int udplite_rcv(struct sk_buff *skb) 32static int udplite_rcv(struct sk_buff *skb)
33{ 33{
34 return __udp4_lib_rcv(skb, udplite_hash, 1); 34 return __udp4_lib_rcv(skb, udplite_hash, IPPROTO_UDPLITE);
35} 35}
36 36
37static void udplite_err(struct sk_buff *skb, u32 info) 37static void udplite_err(struct sk_buff *skb, u32 info)
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 78e80deb7e89..5ceca951d73f 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -28,7 +28,7 @@ static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32
28 switch (nexthdr) { 28 switch (nexthdr) {
29 case IPPROTO_IPIP: 29 case IPPROTO_IPIP:
30 case IPPROTO_IPV6: 30 case IPPROTO_IPV6:
31 *spi = skb->nh.iph->saddr; 31 *spi = ip_hdr(skb)->saddr;
32 *seq = 0; 32 *seq = 0;
33 return 0; 33 return 0;
34 } 34 }
@@ -39,9 +39,9 @@ static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32
39#ifdef CONFIG_NETFILTER 39#ifdef CONFIG_NETFILTER
40static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) 40static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
41{ 41{
42 struct iphdr *iph = skb->nh.iph;
43
44 if (skb->dst == NULL) { 42 if (skb->dst == NULL) {
43 const struct iphdr *iph = ip_hdr(skb);
44
45 if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, 45 if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
46 skb->dev)) 46 skb->dev))
47 goto drop; 47 goto drop;
@@ -55,18 +55,18 @@ drop:
55 55
56int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) 56int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type)
57{ 57{
58 int err;
59 __be32 spi, seq; 58 __be32 spi, seq;
60 struct xfrm_state *xfrm_vec[XFRM_MAX_DEPTH]; 59 struct xfrm_state *xfrm_vec[XFRM_MAX_DEPTH];
61 struct xfrm_state *x; 60 struct xfrm_state *x;
62 int xfrm_nr = 0; 61 int xfrm_nr = 0;
63 int decaps = 0; 62 int decaps = 0;
63 int err = xfrm4_parse_spi(skb, ip_hdr(skb)->protocol, &spi, &seq);
64 64
65 if ((err = xfrm4_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) != 0) 65 if (err != 0)
66 goto drop; 66 goto drop;
67 67
68 do { 68 do {
69 struct iphdr *iph = skb->nh.iph; 69 const struct iphdr *iph = ip_hdr(skb);
70 70
71 if (xfrm_nr == XFRM_MAX_DEPTH) 71 if (xfrm_nr == XFRM_MAX_DEPTH)
72 goto drop; 72 goto drop;
@@ -113,7 +113,8 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type)
113 break; 113 break;
114 } 114 }
115 115
116 if ((err = xfrm_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) < 0) 116 err = xfrm_parse_spi(skb, ip_hdr(skb)->protocol, &spi, &seq);
117 if (err < 0)
117 goto drop; 118 goto drop;
118 } while (!err); 119 } while (!err);
119 120
@@ -146,15 +147,15 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type)
146 return 0; 147 return 0;
147 } else { 148 } else {
148#ifdef CONFIG_NETFILTER 149#ifdef CONFIG_NETFILTER
149 __skb_push(skb, skb->data - skb->nh.raw); 150 __skb_push(skb, skb->data - skb_network_header(skb));
150 skb->nh.iph->tot_len = htons(skb->len); 151 ip_hdr(skb)->tot_len = htons(skb->len);
151 ip_send_check(skb->nh.iph); 152 ip_send_check(ip_hdr(skb));
152 153
153 NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, skb->dev, NULL, 154 NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, skb->dev, NULL,
154 xfrm4_rcv_encap_finish); 155 xfrm4_rcv_encap_finish);
155 return 0; 156 return 0;
156#else 157#else
157 return -skb->nh.iph->protocol; 158 return -ip_hdr(skb)->protocol;
158#endif 159#endif
159 } 160 }
160 161
diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
index d419e15d9803..a73e710740c2 100644
--- a/net/ipv4/xfrm4_mode_beet.c
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -29,20 +29,21 @@
29 */ 29 */
30static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) 30static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb)
31{ 31{
32 struct iphdr *iph, *top_iph = NULL; 32 struct iphdr *iph, *top_iph;
33 int hdrlen, optlen; 33 int hdrlen, optlen;
34 34
35 iph = skb->nh.iph; 35 iph = ip_hdr(skb);
36 skb->h.ipiph = iph; 36 skb->transport_header = skb->network_header;
37 37
38 hdrlen = 0; 38 hdrlen = 0;
39 optlen = iph->ihl * 4 - sizeof(*iph); 39 optlen = iph->ihl * 4 - sizeof(*iph);
40 if (unlikely(optlen)) 40 if (unlikely(optlen))
41 hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4); 41 hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4);
42 42
43 skb->nh.raw = skb_push(skb, x->props.header_len + hdrlen); 43 skb_push(skb, x->props.header_len - IPV4_BEET_PHMAXLEN + hdrlen);
44 top_iph = skb->nh.iph; 44 skb_reset_network_header(skb);
45 skb->h.raw += sizeof(*iph) - hdrlen; 45 top_iph = ip_hdr(skb);
46 skb->transport_header += sizeof(*iph) - hdrlen;
46 47
47 memmove(top_iph, iph, sizeof(*iph)); 48 memmove(top_iph, iph, sizeof(*iph));
48 if (unlikely(optlen)) { 49 if (unlikely(optlen)) {
@@ -50,7 +51,7 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb)
50 51
51 BUG_ON(optlen < 0); 52 BUG_ON(optlen < 0);
52 53
53 ph = (struct ip_beet_phdr *)skb->h.raw; 54 ph = (struct ip_beet_phdr *)skb_transport_header(skb);
54 ph->padlen = 4 - (optlen & 4); 55 ph->padlen = 4 - (optlen & 4);
55 ph->hdrlen = optlen / 8; 56 ph->hdrlen = optlen / 8;
56 ph->nexthdr = top_iph->protocol; 57 ph->nexthdr = top_iph->protocol;
@@ -69,20 +70,18 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb)
69 70
70static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb) 71static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb)
71{ 72{
72 struct iphdr *iph = skb->nh.iph; 73 struct iphdr *iph = ip_hdr(skb);
73 int phlen = 0; 74 int phlen = 0;
74 int optlen = 0; 75 int optlen = 0;
75 __u8 ph_nexthdr = 0, protocol = 0; 76 u8 ph_nexthdr = 0;
76 int err = -EINVAL; 77 int err = -EINVAL;
77 78
78 protocol = iph->protocol;
79
80 if (unlikely(iph->protocol == IPPROTO_BEETPH)) { 79 if (unlikely(iph->protocol == IPPROTO_BEETPH)) {
81 struct ip_beet_phdr *ph; 80 struct ip_beet_phdr *ph;
82 81
83 if (!pskb_may_pull(skb, sizeof(*ph))) 82 if (!pskb_may_pull(skb, sizeof(*ph)))
84 goto out; 83 goto out;
85 ph = (struct ip_beet_phdr *)(skb->h.ipiph + 1); 84 ph = (struct ip_beet_phdr *)(ipip_hdr(skb) + 1);
86 85
87 phlen = sizeof(*ph) + ph->padlen; 86 phlen = sizeof(*ph) + ph->padlen;
88 optlen = ph->hdrlen * 8 + (IPV4_BEET_PHMAXLEN - phlen); 87 optlen = ph->hdrlen * 8 + (IPV4_BEET_PHMAXLEN - phlen);
@@ -96,22 +95,20 @@ static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb)
96 ph_nexthdr = ph->nexthdr; 95 ph_nexthdr = ph->nexthdr;
97 } 96 }
98 97
99 skb->nh.raw = skb->data + (phlen - sizeof(*iph)); 98 skb_set_network_header(skb, phlen - sizeof(*iph));
100 memmove(skb->nh.raw, iph, sizeof(*iph)); 99 memmove(skb_network_header(skb), iph, sizeof(*iph));
101 skb->h.raw = skb->data + (phlen + optlen); 100 skb_set_transport_header(skb, phlen + optlen);
102 skb->data = skb->h.raw; 101 skb->data = skb_transport_header(skb);
103 102
104 iph = skb->nh.iph; 103 iph = ip_hdr(skb);
105 iph->ihl = (sizeof(*iph) + optlen) / 4; 104 iph->ihl = (sizeof(*iph) + optlen) / 4;
106 iph->tot_len = htons(skb->len + iph->ihl * 4); 105 iph->tot_len = htons(skb->len + iph->ihl * 4);
107 iph->daddr = x->sel.daddr.a4; 106 iph->daddr = x->sel.daddr.a4;
108 iph->saddr = x->sel.saddr.a4; 107 iph->saddr = x->sel.saddr.a4;
109 if (ph_nexthdr) 108 if (ph_nexthdr)
110 iph->protocol = ph_nexthdr; 109 iph->protocol = ph_nexthdr;
111 else
112 iph->protocol = protocol;
113 iph->check = 0; 110 iph->check = 0;
114 iph->check = ip_fast_csum(skb->nh.raw, iph->ihl); 111 iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
115 err = 0; 112 err = 0;
116out: 113out:
117 return err; 114 return err;
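Orientation for the two BEET hunks above: when the inner packet carries IP options, BEET prefixes them with a small pseudo-header (protocol IPPROTO_BEETPH); hdrlen counts the option bytes in 8-byte units and padlen records the alignment padding, which the input path strips again. The layout as this era's headers define it (quoted from memory, so treat as a sketch):

	struct ip_beet_phdr {
		__u8 nexthdr;	/* protocol of the inner payload       */
		__u8 hdrlen;	/* option length that follows, in 8s   */
		__u8 padlen;	/* padding inserted before the options */
		__u8 reserved;
	};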
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
index 92676b7e4034..601047161ea6 100644
--- a/net/ipv4/xfrm4_mode_transport.c
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -23,16 +23,13 @@
23 */ 23 */
24static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) 24static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
25{ 25{
26 struct iphdr *iph; 26 struct iphdr *iph = ip_hdr(skb);
27 int ihl; 27 int ihl = iph->ihl * 4;
28 28
29 iph = skb->nh.iph; 29 skb->transport_header = skb->network_header + ihl;
30 skb->h.ipiph = iph; 30 skb_push(skb, x->props.header_len);
31 31 skb_reset_network_header(skb);
32 ihl = iph->ihl * 4; 32 memmove(skb_network_header(skb), iph, ihl);
33 skb->h.raw += ihl;
34
35 skb->nh.raw = memmove(skb_push(skb, x->props.header_len), iph, ihl);
36 return 0; 33 return 0;
37} 34}
38 35
@@ -46,12 +43,15 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
46 */ 43 */
47static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) 44static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
48{ 45{
49 int ihl = skb->data - skb->h.raw; 46 int ihl = skb->data - skb_transport_header(skb);
50 47
51 if (skb->h.raw != skb->nh.raw) 48 if (skb->transport_header != skb->network_header) {
52 skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl); 49 memmove(skb_transport_header(skb),
53 skb->nh.iph->tot_len = htons(skb->len + ihl); 50 skb_network_header(skb), ihl);
54 skb->h.raw = skb->data; 51 skb->network_header = skb->transport_header;
52 }
53 ip_hdr(skb)->tot_len = htons(skb->len + ihl);
54 skb_reset_transport_header(skb);
55 return 0; 55 return 0;
56} 56}
57 57
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index ceb4376f572a..a2f2e6a5ec5d 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -16,8 +16,8 @@
16 16
17static inline void ipip_ecn_decapsulate(struct sk_buff *skb) 17static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
18{ 18{
19 struct iphdr *outer_iph = skb->nh.iph; 19 struct iphdr *outer_iph = ip_hdr(skb);
20 struct iphdr *inner_iph = skb->h.ipiph; 20 struct iphdr *inner_iph = ipip_hdr(skb);
21 21
22 if (INET_ECN_is_ce(outer_iph->tos)) 22 if (INET_ECN_is_ce(outer_iph->tos))
23 IP_ECN_set_ce(inner_iph); 23 IP_ECN_set_ce(inner_iph);
@@ -26,7 +26,7 @@ static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
26static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) 26static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
27{ 27{
28 if (INET_ECN_is_ce(iph->tos)) 28 if (INET_ECN_is_ce(iph->tos))
29 IP6_ECN_set_ce(skb->nh.ipv6h); 29 IP6_ECN_set_ce(ipv6_hdr(skb));
30} 30}
31 31
32/* Add encapsulation header. 32/* Add encapsulation header.
@@ -46,11 +46,12 @@ static int xfrm4_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
46 struct iphdr *iph, *top_iph; 46 struct iphdr *iph, *top_iph;
47 int flags; 47 int flags;
48 48
49 iph = skb->nh.iph; 49 iph = ip_hdr(skb);
50 skb->h.ipiph = iph; 50 skb->transport_header = skb->network_header;
51 51
52 skb->nh.raw = skb_push(skb, x->props.header_len); 52 skb_push(skb, x->props.header_len);
53 top_iph = skb->nh.iph; 53 skb_reset_network_header(skb);
54 top_iph = ip_hdr(skb);
54 55
55 top_iph->ihl = 5; 56 top_iph->ihl = 5;
56 top_iph->version = 4; 57 top_iph->version = 4;
@@ -90,10 +91,11 @@ static int xfrm4_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
90 91
91static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) 92static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
92{ 93{
93 struct iphdr *iph = skb->nh.iph; 94 struct iphdr *iph = ip_hdr(skb);
95 const unsigned char *old_mac;
94 int err = -EINVAL; 96 int err = -EINVAL;
95 97
96 switch(iph->protocol){ 98 switch (iph->protocol){
97 case IPPROTO_IPIP: 99 case IPPROTO_IPIP:
98 break; 100 break;
99#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) 101#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
@@ -111,10 +113,10 @@ static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
111 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) 113 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
112 goto out; 114 goto out;
113 115
114 iph = skb->nh.iph; 116 iph = ip_hdr(skb);
115 if (iph->protocol == IPPROTO_IPIP) { 117 if (iph->protocol == IPPROTO_IPIP) {
116 if (x->props.flags & XFRM_STATE_DECAP_DSCP) 118 if (x->props.flags & XFRM_STATE_DECAP_DSCP)
117 ipv4_copy_dscp(iph, skb->h.ipiph); 119 ipv4_copy_dscp(iph, ipip_hdr(skb));
118 if (!(x->props.flags & XFRM_STATE_NOECN)) 120 if (!(x->props.flags & XFRM_STATE_NOECN))
119 ipip_ecn_decapsulate(skb); 121 ipip_ecn_decapsulate(skb);
120 } 122 }
@@ -125,9 +127,10 @@ static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
125 skb->protocol = htons(ETH_P_IPV6); 127 skb->protocol = htons(ETH_P_IPV6);
126 } 128 }
127#endif 129#endif
128 skb->mac.raw = memmove(skb->data - skb->mac_len, 130 old_mac = skb_mac_header(skb);
129 skb->mac.raw, skb->mac_len); 131 skb_set_mac_header(skb, -skb->mac_len);
130 skb->nh.raw = skb->data; 132 memmove(skb_mac_header(skb), old_mac, skb->mac_len);
133 skb_reset_network_header(skb);
131 err = 0; 134 err = 0;
132 135
133out: 136out:
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 038ca160fe2c..44ef208a75cb 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -22,14 +22,13 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
22{ 22{
23 int mtu, ret = 0; 23 int mtu, ret = 0;
24 struct dst_entry *dst; 24 struct dst_entry *dst;
25 struct iphdr *iph = skb->nh.iph;
26 25
27 if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) 26 if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)
28 goto out; 27 goto out;
29 28
30 IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; 29 IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
31 30
32 if (!(iph->frag_off & htons(IP_DF)) || skb->local_df) 31 if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df)
33 goto out; 32 goto out;
34 33
35 dst = skb->dst; 34 dst = skb->dst;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 5d51a2af34c1..4ff8ed30024f 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -119,7 +119,7 @@ __xfrm4_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int
119 119
120 if (xfrm[i]->props.mode == XFRM_MODE_TUNNEL) { 120 if (xfrm[i]->props.mode == XFRM_MODE_TUNNEL) {
121 unsigned short encap_family = xfrm[i]->props.family; 121 unsigned short encap_family = xfrm[i]->props.family;
122 switch(encap_family) { 122 switch (encap_family) {
123 case AF_INET: 123 case AF_INET:
124 fl_tunnel.fl4_dst = xfrm[i]->id.daddr.a4; 124 fl_tunnel.fl4_dst = xfrm[i]->id.daddr.a4;
125 fl_tunnel.fl4_src = xfrm[i]->props.saddr.a4; 125 fl_tunnel.fl4_src = xfrm[i]->props.saddr.a4;
@@ -209,8 +209,8 @@ error:
209static void 209static void
210_decode_session4(struct sk_buff *skb, struct flowi *fl) 210_decode_session4(struct sk_buff *skb, struct flowi *fl)
211{ 211{
212 struct iphdr *iph = skb->nh.iph; 212 struct iphdr *iph = ip_hdr(skb);
213 u8 *xprth = skb->nh.raw + iph->ihl*4; 213 u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
214 214
215 memset(fl, 0, sizeof(struct flowi)); 215 memset(fl, 0, sizeof(struct flowi));
216 if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { 216 if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) {
@@ -263,7 +263,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl)
263 default: 263 default:
264 fl->fl_ipsec_spi = 0; 264 fl->fl_ipsec_spi = 0;
265 break; 265 break;
266 }; 266 }
267 } 267 }
268 fl->proto = iph->protocol; 268 fl->proto = iph->protocol;
269 fl->fl4_dst = iph->daddr; 269 fl->fl4_dst = iph->daddr;
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 3eef06454da9..568510304553 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -12,9 +12,8 @@
12 12
13static int ipip_output(struct xfrm_state *x, struct sk_buff *skb) 13static int ipip_output(struct xfrm_state *x, struct sk_buff *skb)
14{ 14{
15 struct iphdr *iph; 15 struct iphdr *iph = ip_hdr(skb);
16 16
17 iph = skb->nh.iph;
18 iph->tot_len = htons(skb->len); 17 iph->tot_len = htons(skb->len);
19 ip_send_check(iph); 18 ip_send_check(iph);
20 19