author     Jeff Garzik <jgarzik@pobox.com>  2005-11-11 23:39:35 -0500
committer  Jeff Garzik <jgarzik@pobox.com>  2005-11-11 23:39:35 -0500
commit     f4256e301d9800b1e0276404cb01b3ac85b51067 (patch)
tree       975f56627b78f757608b31684311a24ca1478481 /net
parent     fb2a26b9f8f5eda6b96ba9753edf105e5999d6d9 (diff)
parent     cd52d1ee9a92587b242d946a2300a3245d3b885a (diff)
Merge branch 'master'
Diffstat (limited to 'net')
-rw-r--r--  net/core/datagram.c | 21
-rw-r--r--  net/core/dev.c | 12
-rw-r--r--  net/core/netpoll.c | 18
-rw-r--r--  net/core/rtnetlink.c | 83
-rw-r--r--  net/core/skbuff.c | 15
-rw-r--r--  net/decnet/af_decnet.c | 14
-rw-r--r--  net/ieee80211/ieee80211_crypt.c | 152
-rw-r--r--  net/ieee80211/ieee80211_rx.c | 2
-rw-r--r--  net/ieee80211/ieee80211_wx.c | 14
-rw-r--r--  net/ipv4/icmp.c | 6
-rw-r--r--  net/ipv4/igmp.c | 19
-rw-r--r--  net/ipv4/inet_diag.c | 9
-rw-r--r--  net/ipv4/ip_gre.c | 15
-rw-r--r--  net/ipv4/netfilter/Kconfig | 41
-rw-r--r--  net/ipv4/netfilter/Makefile | 6
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_netlink.c | 85
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 26
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 11
-rw-r--r--  net/ipv4/netfilter/ip_nat_helper_pptp.c | 28
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 12
-rw-r--r--  net/ipv4/netfilter/ipt_CONNMARK.c | 22
-rw-r--r--  net/ipv4/netfilter/ipt_NOTRACK.c | 4
-rw-r--r--  net/ipv4/netfilter/ipt_connbytes.c | 39
-rw-r--r--  net/ipv4/netfilter/ipt_connmark.c | 10
-rw-r--r--  net/ipv4/netfilter/ipt_conntrack.c | 96
-rw-r--r--  net/ipv4/netfilter/ipt_helper.c | 54
-rw-r--r--  net/ipv4/netfilter/ipt_state.c | 6
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 571
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 301
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 8
-rw-r--r--  net/ipv4/tcp.c | 3
-rw-r--r--  net/ipv4/tcp_bic.c | 12
-rw-r--r--  net/ipv4/tcp_cong.c | 40
-rw-r--r--  net/ipv4/tcp_highspeed.c | 11
-rw-r--r--  net/ipv4/tcp_htcp.c | 13
-rw-r--r--  net/ipv4/tcp_hybla.c | 6
-rw-r--r--  net/ipv4/tcp_input.c | 288
-rw-r--r--  net/ipv4/tcp_ipv4.c | 28
-rw-r--r--  net/ipv4/tcp_minisocks.c | 7
-rw-r--r--  net/ipv4/tcp_output.c | 61
-rw-r--r--  net/ipv4/tcp_scalable.c | 14
-rw-r--r--  net/ipv4/tcp_timer.c | 4
-rw-r--r--  net/ipv4/tcp_vegas.c | 42
-rw-r--r--  net/ipv4/udp.c | 7
-rw-r--r--  net/ipv6/addrconf.c | 3
-rw-r--r--  net/ipv6/af_inet6.c | 8
-rw-r--r--  net/ipv6/icmp.c | 21
-rw-r--r--  net/ipv6/ip6_input.c | 5
-rw-r--r--  net/ipv6/ip6_output.c | 6
-rw-r--r--  net/ipv6/ip6_tunnel.c | 1
-rw-r--r--  net/ipv6/netfilter/Kconfig | 14
-rw-r--r--  net/ipv6/netfilter/Makefile | 6
-rw-r--r--  net/ipv6/netfilter/ip6t_MARK.c | 6
-rw-r--r--  net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 556
-rw-r--r--  net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 272
-rw-r--r--  net/ipv6/netfilter/nf_conntrack_reasm.c | 885
-rw-r--r--  net/ipv6/raw.c | 46
-rw-r--r--  net/ipv6/route.c | 2
-rw-r--r--  net/ipv6/tcp_ipv6.c | 20
-rw-r--r--  net/ipv6/udp.c | 25
-rw-r--r--  net/netfilter/Kconfig | 74
-rw-r--r--  net/netfilter/Makefile | 8
-rw-r--r--  net/netfilter/nf_conntrack_core.c | 1538
-rw-r--r--  net/netfilter/nf_conntrack_ftp.c | 698
-rw-r--r--  net/netfilter/nf_conntrack_l3proto_generic.c | 98
-rw-r--r--  net/netfilter/nf_conntrack_proto_generic.c | 85
-rw-r--r--  net/netfilter/nf_conntrack_proto_sctp.c | 670
-rw-r--r--  net/netfilter/nf_conntrack_proto_tcp.c | 1162
-rw-r--r--  net/netfilter/nf_conntrack_proto_udp.c | 216
-rw-r--r--  net/netfilter/nf_conntrack_standalone.c | 869
-rw-r--r--  net/netfilter/nfnetlink.c | 21
-rw-r--r--  net/netlink/Makefile | 2
-rw-r--r--  net/netlink/af_netlink.c | 97
-rw-r--r--  net/netlink/attr.c | 328
-rw-r--r--  net/netlink/genetlink.c | 579
-rw-r--r--  net/rxrpc/transport.c | 15
-rw-r--r--  net/sctp/associola.c | 33
-rw-r--r--  net/sctp/endpointola.c | 26
-rw-r--r--  net/sctp/input.c | 20
-rw-r--r--  net/sctp/protocol.c | 6
-rw-r--r--  net/sctp/sm_sideeffect.c | 6
-rw-r--r--  net/sctp/sm_statefuns.c | 22
-rw-r--r--  net/sctp/socket.c | 5
-rw-r--r--  net/sctp/sysctl.c | 8
-rw-r--r--  net/sctp/ulpevent.c | 24
-rw-r--r--  net/sunrpc/clnt.c | 32
-rw-r--r--  net/sunrpc/rpc_pipe.c | 6
-rw-r--r--  net/sunrpc/socklib.c | 5
-rw-r--r--  net/sunrpc/svcsock.c | 9
-rw-r--r--  net/unix/af_unix.c | 2
-rw-r--r--  net/xfrm/xfrm_user.c | 69
91 files changed, 10036 insertions(+), 809 deletions(-)
diff --git a/net/core/datagram.c b/net/core/datagram.c
index d219435d086c..1bcfef51ac58 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -350,6 +350,20 @@ fault:
 	return -EFAULT;
 }
 
+unsigned int __skb_checksum_complete(struct sk_buff *skb)
+{
+	unsigned int sum;
+
+	sum = (u16)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
+	if (likely(!sum)) {
+		if (unlikely(skb->ip_summed == CHECKSUM_HW))
+			netdev_rx_csum_fault(skb->dev);
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+	return sum;
+}
+EXPORT_SYMBOL(__skb_checksum_complete);
+
 /**
  *	skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec.
  *	@skb: skbuff
@@ -363,7 +377,7 @@ fault:
  *	  -EFAULT - fault during copy. Beware, in this case iovec
  *		    can be modified!
  */
-int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb,
+int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
 				     int hlen, struct iovec *iov)
 {
 	unsigned int csum;
@@ -376,8 +390,7 @@ int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb,
 		iov++;
 
 	if (iov->iov_len < chunk) {
-		if ((unsigned short)csum_fold(skb_checksum(skb, 0, chunk + hlen,
-							   skb->csum)))
+		if (__skb_checksum_complete(skb))
 			goto csum_error;
 		if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
 			goto fault;
@@ -388,6 +401,8 @@ int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb,
 			goto fault;
 		if ((unsigned short)csum_fold(csum))
 			goto csum_error;
+		if (unlikely(skb->ip_summed == CHECKSUM_HW))
+			netdev_rx_csum_fault(skb->dev);
 		iov->iov_len -= chunk;
 		iov->iov_base += chunk;
 	}
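
[Annotation, not part of the patch] The __skb_checksum_complete() helper added above centralizes the "fold the full-packet checksum and report hardware-offload failures" logic; netdev_rx_csum_fault() itself is introduced in the net/core/dev.c hunk that follows. As a rough illustrative sketch only, the receive-path pattern that the icmp.c, igmp.c and ip_gre.c hunks further down converge on looks like this:

	/* Sketch of the caller pattern (illustrative, not from the patch).
	 * CHECKSUM_HW: the device supplied a checksum; trust it if it folds
	 * to zero, otherwise fall back to a software recomputation.
	 */
	static int rx_csum_ok(struct sk_buff *skb)
	{
		switch (skb->ip_summed) {
		case CHECKSUM_HW:
			if (!(u16)csum_fold(skb->csum))
				break;		/* hardware checksum verified */
			/* fall through: recheck in software */
		case CHECKSUM_NONE:
			skb->csum = 0;
			if (__skb_checksum_complete(skb))
				return 0;	/* bad checksum, caller drops */
		}
		return 1;
	}
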
diff --git a/net/core/dev.c b/net/core/dev.c
index 8d1541595277..0b48e294aafe 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1108,6 +1108,18 @@ out:
 	return ret;
 }
 
+/* Take action when hardware reception checksum errors are detected. */
+#ifdef CONFIG_BUG
+void netdev_rx_csum_fault(struct net_device *dev)
+{
+	if (net_ratelimit()) {
+		printk(KERN_ERR "%s: hw csum failure.\n", dev->name);
+		dump_stack();
+	}
+}
+EXPORT_SYMBOL(netdev_rx_csum_fault);
+#endif
+
 #ifdef CONFIG_HIGHMEM
 /* Actually, we should eliminate this check as soon as we know, that:
  * 1. IOMMU is present and allows to map all the memory.
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 802fe11efad0..49424a42a2c0 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -101,16 +101,20 @@ void netpoll_queue(struct sk_buff *skb)
 static int checksum_udp(struct sk_buff *skb, struct udphdr *uh,
 			unsigned short ulen, u32 saddr, u32 daddr)
 {
-	if (uh->check == 0)
+	unsigned int psum;
+
+	if (uh->check == 0 || skb->ip_summed == CHECKSUM_UNNECESSARY)
 		return 0;
 
-	if (skb->ip_summed == CHECKSUM_HW)
-		return csum_tcpudp_magic(
-			saddr, daddr, ulen, IPPROTO_UDP, skb->csum);
+	psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+
+	if (skb->ip_summed == CHECKSUM_HW &&
+	    !(u16)csum_fold(csum_add(psum, skb->csum)))
+		return 0;
 
-	skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+	skb->csum = psum;
 
-	return csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
+	return __skb_checksum_complete(skb);
 }
 
 /*
@@ -489,7 +493,7 @@ int __netpoll_rx(struct sk_buff *skb)
 
 	if (ulen != len)
 		goto out;
-	if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr) < 0)
+	if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr))
 		goto out;
 	if (np->local_ip && np->local_ip != ntohl(iph->daddr))
 		goto out;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 9bed7569ce3f..8700379685e0 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -49,6 +49,7 @@
 #include <net/udp.h>
 #include <net/sock.h>
 #include <net/pkt_sched.h>
+#include <net/netlink.h>
 
 DECLARE_MUTEX(rtnl_sem);
 
@@ -462,11 +463,6 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
 	netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_KERNEL);
 }
 
-static int rtnetlink_done(struct netlink_callback *cb)
-{
-	return 0;
-}
-
 /* Protected by RTNL sempahore. */
 static struct rtattr **rta_buf;
 static int rtattr_max;
@@ -524,8 +520,6 @@ rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp)
 	}
 
 	if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
-		u32 rlen;
-
 		if (link->dumpit == NULL)
 			link = &(rtnetlink_links[PF_UNSPEC][type]);
 
@@ -533,14 +527,11 @@ rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp)
 			goto err_inval;
 
 		if ((*errp = netlink_dump_start(rtnl, skb, nlh,
-						link->dumpit,
-						rtnetlink_done)) != 0) {
+						link->dumpit, NULL)) != 0) {
 			return -1;
 		}
-		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
-		if (rlen > skb->len)
-			rlen = skb->len;
-		skb_pull(skb, rlen);
+
+		netlink_queue_skip(nlh, skb);
 		return -1;
 	}
 
@@ -579,75 +570,13 @@ err_inval:
 	return -1;
 }
 
-/*
- * Process one packet of messages.
- * Malformed skbs with wrong lengths of messages are discarded silently.
- */
-
-static inline int rtnetlink_rcv_skb(struct sk_buff *skb)
-{
-	int err;
-	struct nlmsghdr * nlh;
-
-	while (skb->len >= NLMSG_SPACE(0)) {
-		u32 rlen;
-
-		nlh = (struct nlmsghdr *)skb->data;
-		if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
-			return 0;
-		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
-		if (rlen > skb->len)
-			rlen = skb->len;
-		if (rtnetlink_rcv_msg(skb, nlh, &err)) {
-			/* Not error, but we must interrupt processing here:
-			 * Note, that in this case we do not pull message
-			 * from skb, it will be processed later.
-			 */
-			if (err == 0)
-				return -1;
-			netlink_ack(skb, nlh, err);
-		} else if (nlh->nlmsg_flags&NLM_F_ACK)
-			netlink_ack(skb, nlh, 0);
-		skb_pull(skb, rlen);
-	}
-
-	return 0;
-}
-
-/*
- * rtnetlink input queue processing routine:
- * - process as much as there was in the queue upon entry.
- * - feed skbs to rtnetlink_rcv_skb, until it refuse a message,
- *   that will occur, when a dump started.
- */
-
 static void rtnetlink_rcv(struct sock *sk, int len)
 {
-	unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
+	unsigned int qlen = 0;
 
 	do {
-		struct sk_buff *skb;
-
 		rtnl_lock();
-
-		if (qlen > skb_queue_len(&sk->sk_receive_queue))
-			qlen = skb_queue_len(&sk->sk_receive_queue);
-
-		for (; qlen; qlen--) {
-			skb = skb_dequeue(&sk->sk_receive_queue);
-			if (rtnetlink_rcv_skb(skb)) {
-				if (skb->len)
-					skb_queue_head(&sk->sk_receive_queue,
-						       skb);
-				else {
-					kfree_skb(skb);
-					qlen--;
-				}
-				break;
-			}
-			kfree_skb(skb);
-		}
-
+		netlink_run_queue(sk, &qlen, &rtnetlink_rcv_msg);
 		up(&rtnl_sem);
 
 		netdev_run_todo();
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 95501e40100e..b7d13a4fff48 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -336,6 +336,9 @@ void __kfree_skb(struct sk_buff *skb)
 	}
 #ifdef CONFIG_NETFILTER
 	nf_conntrack_put(skb->nfct);
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	nf_conntrack_put_reasm(skb->nfct_reasm);
+#endif
 #ifdef CONFIG_BRIDGE_NETFILTER
 	nf_bridge_put(skb->nf_bridge);
 #endif
@@ -414,9 +417,17 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 	C(nfct);
 	nf_conntrack_get(skb->nfct);
 	C(nfctinfo);
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	C(nfct_reasm);
+	nf_conntrack_get_reasm(skb->nfct_reasm);
+#endif
 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 	C(ipvs_property);
 #endif
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	C(nfct_reasm);
+	nf_conntrack_get_reasm(skb->nfct_reasm);
+#endif
 #ifdef CONFIG_BRIDGE_NETFILTER
 	C(nf_bridge);
 	nf_bridge_get(skb->nf_bridge);
@@ -474,6 +485,10 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->nfct = old->nfct;
 	nf_conntrack_get(old->nfct);
 	new->nfctinfo = old->nfctinfo;
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	new->nfct_reasm = old->nfct_reasm;
+	nf_conntrack_get_reasm(old->nfct_reasm);
+#endif
 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 	new->ipvs_property = old->ipvs_property;
 #endif
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 3f25cadccddd..f89e55f814d9 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1664,17 +1664,15 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock,
 		goto out;
 	}
 
-	rv = dn_check_state(sk, NULL, 0, &timeo, flags);
-	if (rv)
-		goto out;
-
 	if (sk->sk_shutdown & RCV_SHUTDOWN) {
-		if (!(flags & MSG_NOSIGNAL))
-			send_sig(SIGPIPE, current, 0);
-		rv = -EPIPE;
+		rv = 0;
 		goto out;
 	}
 
+	rv = dn_check_state(sk, NULL, 0, &timeo, flags);
+	if (rv)
+		goto out;
+
 	if (flags & ~(MSG_PEEK|MSG_OOB|MSG_WAITALL|MSG_DONTWAIT|MSG_NOSIGNAL)) {
 		rv = -EOPNOTSUPP;
 		goto out;
@@ -1928,6 +1926,8 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
 
 	if (sk->sk_shutdown & SEND_SHUTDOWN) {
 		err = -EPIPE;
+		if (!(flags & MSG_NOSIGNAL))
+			send_sig(SIGPIPE, current, 0);
 		goto out_err;
 	}
 
diff --git a/net/ieee80211/ieee80211_crypt.c b/net/ieee80211/ieee80211_crypt.c
index 20cc580a07e0..ecc9bb196abc 100644
--- a/net/ieee80211/ieee80211_crypt.c
+++ b/net/ieee80211/ieee80211_crypt.c
@@ -11,15 +11,14 @@
  *
  */
 
-#include <linux/config.h>
+#include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/slab.h>
-#include <asm/string.h>
-#include <asm/errno.h>
-
+#include <linux/string.h>
 #include <net/ieee80211.h>
 
+
 MODULE_AUTHOR("Jouni Malinen");
 MODULE_DESCRIPTION("HostAP crypto");
 MODULE_LICENSE("GPL");
@@ -29,32 +28,20 @@ struct ieee80211_crypto_alg {
 	struct ieee80211_crypto_ops *ops;
 };
 
-struct ieee80211_crypto {
-	struct list_head algs;
-	spinlock_t lock;
-};
-
-static struct ieee80211_crypto *hcrypt;
+static LIST_HEAD(ieee80211_crypto_algs);
+static DEFINE_SPINLOCK(ieee80211_crypto_lock);
 
 void ieee80211_crypt_deinit_entries(struct ieee80211_device *ieee, int force)
 {
-	struct list_head *ptr, *n;
-	struct ieee80211_crypt_data *entry;
+	struct ieee80211_crypt_data *entry, *next;
 	unsigned long flags;
 
 	spin_lock_irqsave(&ieee->lock, flags);
-
-	if (list_empty(&ieee->crypt_deinit_list))
-		goto unlock;
-
-	for (ptr = ieee->crypt_deinit_list.next, n = ptr->next;
-	     ptr != &ieee->crypt_deinit_list; ptr = n, n = ptr->next) {
-		entry = list_entry(ptr, struct ieee80211_crypt_data, list);
-
+	list_for_each_entry_safe(entry, next, &ieee->crypt_deinit_list, list) {
 		if (atomic_read(&entry->refcnt) != 0 && !force)
 			continue;
 
-		list_del(ptr);
+		list_del(&entry->list);
 
 		if (entry->ops) {
 			entry->ops->deinit(entry->priv);
@@ -62,7 +49,6 @@ void ieee80211_crypt_deinit_entries(struct ieee80211_device *ieee, int force)
 		}
 		kfree(entry);
 	}
- unlock:
 	spin_unlock_irqrestore(&ieee->lock, flags);
 }
 
@@ -125,9 +111,6 @@ int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops)
 	unsigned long flags;
 	struct ieee80211_crypto_alg *alg;
 
-	if (hcrypt == NULL)
-		return -1;
-
 	alg = kmalloc(sizeof(*alg), GFP_KERNEL);
 	if (alg == NULL)
 		return -ENOMEM;
@@ -135,9 +118,9 @@ int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops)
 	memset(alg, 0, sizeof(*alg));
 	alg->ops = ops;
 
-	spin_lock_irqsave(&hcrypt->lock, flags);
-	list_add(&alg->list, &hcrypt->algs);
-	spin_unlock_irqrestore(&hcrypt->lock, flags);
+	spin_lock_irqsave(&ieee80211_crypto_lock, flags);
+	list_add(&alg->list, &ieee80211_crypto_algs);
+	spin_unlock_irqrestore(&ieee80211_crypto_lock, flags);
 
 	printk(KERN_DEBUG "ieee80211_crypt: registered algorithm '%s'\n",
 	       ops->name);
@@ -147,64 +130,49 @@ int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops)
 
 int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops)
 {
+	struct ieee80211_crypto_alg *alg;
 	unsigned long flags;
-	struct list_head *ptr;
-	struct ieee80211_crypto_alg *del_alg = NULL;
-
-	if (hcrypt == NULL)
-		return -1;
-
-	spin_lock_irqsave(&hcrypt->lock, flags);
-	for (ptr = hcrypt->algs.next; ptr != &hcrypt->algs; ptr = ptr->next) {
-		struct ieee80211_crypto_alg *alg =
-		    (struct ieee80211_crypto_alg *)ptr;
-		if (alg->ops == ops) {
-			list_del(&alg->list);
-			del_alg = alg;
-			break;
-		}
-	}
-	spin_unlock_irqrestore(&hcrypt->lock, flags);
 
-	if (del_alg) {
-		printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm "
-		       "'%s'\n", ops->name);
-		kfree(del_alg);
+	spin_lock_irqsave(&ieee80211_crypto_lock, flags);
+	list_for_each_entry(alg, &ieee80211_crypto_algs, list) {
+		if (alg->ops == ops)
+			goto found;
 	}
-
-	return del_alg ? 0 : -1;
+	spin_unlock_irqrestore(&ieee80211_crypto_lock, flags);
+	return -EINVAL;
+
+      found:
+	printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm "
+	       "'%s'\n", ops->name);
+	list_del(&alg->list);
+	spin_unlock_irqrestore(&ieee80211_crypto_lock, flags);
+	kfree(alg);
+	return 0;
 }
 
 struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name)
 {
+	struct ieee80211_crypto_alg *alg;
 	unsigned long flags;
-	struct list_head *ptr;
-	struct ieee80211_crypto_alg *found_alg = NULL;
-
-	if (hcrypt == NULL)
-		return NULL;
-
-	spin_lock_irqsave(&hcrypt->lock, flags);
-	for (ptr = hcrypt->algs.next; ptr != &hcrypt->algs; ptr = ptr->next) {
-		struct ieee80211_crypto_alg *alg =
-		    (struct ieee80211_crypto_alg *)ptr;
-		if (strcmp(alg->ops->name, name) == 0) {
-			found_alg = alg;
-			break;
-		}
+
+	spin_lock_irqsave(&ieee80211_crypto_lock, flags);
+	list_for_each_entry(alg, &ieee80211_crypto_algs, list) {
+		if (strcmp(alg->ops->name, name) == 0)
+			goto found;
 	}
-	spin_unlock_irqrestore(&hcrypt->lock, flags);
+	spin_unlock_irqrestore(&ieee80211_crypto_lock, flags);
+	return NULL;
 
-	if (found_alg)
-		return found_alg->ops;
-	else
-		return NULL;
+      found:
+	spin_unlock_irqrestore(&ieee80211_crypto_lock, flags);
+	return alg->ops;
 }
 
 static void *ieee80211_crypt_null_init(int keyidx)
 {
 	return (void *)1;
 }
+
 static void ieee80211_crypt_null_deinit(void *priv)
 {
 }
@@ -213,56 +181,18 @@ static struct ieee80211_crypto_ops ieee80211_crypt_null = {
 	.name = "NULL",
 	.init = ieee80211_crypt_null_init,
 	.deinit = ieee80211_crypt_null_deinit,
-	.encrypt_mpdu = NULL,
-	.decrypt_mpdu = NULL,
-	.encrypt_msdu = NULL,
-	.decrypt_msdu = NULL,
-	.set_key = NULL,
-	.get_key = NULL,
-	.extra_mpdu_prefix_len = 0,
-	.extra_mpdu_postfix_len = 0,
 	.owner = THIS_MODULE,
 };
 
 static int __init ieee80211_crypto_init(void)
 {
-	int ret = -ENOMEM;
-
-	hcrypt = kmalloc(sizeof(*hcrypt), GFP_KERNEL);
-	if (!hcrypt)
-		goto out;
-
-	memset(hcrypt, 0, sizeof(*hcrypt));
-	INIT_LIST_HEAD(&hcrypt->algs);
-	spin_lock_init(&hcrypt->lock);
-
-	ret = ieee80211_register_crypto_ops(&ieee80211_crypt_null);
-	if (ret < 0) {
-		kfree(hcrypt);
-		hcrypt = NULL;
-	}
-      out:
-	return ret;
+	return ieee80211_register_crypto_ops(&ieee80211_crypt_null);
 }
 
 static void __exit ieee80211_crypto_deinit(void)
 {
-	struct list_head *ptr, *n;
-
-	if (hcrypt == NULL)
-		return;
-
-	for (ptr = hcrypt->algs.next, n = ptr->next; ptr != &hcrypt->algs;
-	     ptr = n, n = ptr->next) {
-		struct ieee80211_crypto_alg *alg =
-		    (struct ieee80211_crypto_alg *)ptr;
-		list_del(ptr);
-		printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm "
-		       "'%s' (deinit)\n", alg->ops->name);
-		kfree(alg);
-	}
-
-	kfree(hcrypt);
+	ieee80211_unregister_crypto_ops(&ieee80211_crypt_null);
+	BUG_ON(!list_empty(&ieee80211_crypto_algs));
 }
 
 EXPORT_SYMBOL(ieee80211_crypt_deinit_entries);
diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c
index 6ad88218f573..03efaacbdb73 100644
--- a/net/ieee80211/ieee80211_rx.c
+++ b/net/ieee80211/ieee80211_rx.c
@@ -369,6 +369,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
 	/* Put this code here so that we avoid duplicating it in all
 	 * Rx paths. - Jean II */
 #ifdef IW_WIRELESS_SPY		/* defined in iw_handler.h */
+#ifdef CONFIG_NET_RADIO
 	/* If spy monitoring on */
 	if (ieee->spy_data.spy_number > 0) {
 		struct iw_quality wstats;
@@ -395,6 +396,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
 		/* Update spy records */
 		wireless_spy_update(ieee->dev, hdr->addr2, &wstats);
 	}
+#endif				/* CONFIG_NET_RADIO */
 #endif				/* IW_WIRELESS_SPY */
 
 #ifdef NOT_YET
diff --git a/net/ieee80211/ieee80211_wx.c b/net/ieee80211/ieee80211_wx.c
index 1ce7af9bec35..181755f2aa8b 100644
--- a/net/ieee80211/ieee80211_wx.c
+++ b/net/ieee80211/ieee80211_wx.c
@@ -161,9 +161,11 @@ static inline char *ipw2100_translate_scan(struct ieee80211_device *ieee,
 		    (ieee->perfect_rssi - ieee->worst_rssi) -
 		    (ieee->perfect_rssi - network->stats.rssi) *
 		    (15 * (ieee->perfect_rssi - ieee->worst_rssi) +
-		     62 * (ieee->perfect_rssi - network->stats.rssi))) /
-		    ((ieee->perfect_rssi - ieee->worst_rssi) *
-		    (ieee->perfect_rssi - ieee->worst_rssi));
+		     62 * (ieee->perfect_rssi -
+			   network->stats.rssi))) /
+		    ((ieee->perfect_rssi -
+		      ieee->worst_rssi) * (ieee->perfect_rssi -
+					   ieee->worst_rssi));
 		if (iwe.u.qual.qual > 100)
 			iwe.u.qual.qual = 100;
 		else if (iwe.u.qual.qual < 1)
@@ -520,7 +522,8 @@ int ieee80211_wx_set_encodeext(struct ieee80211_device *ieee,
 		crypt = &ieee->crypt[idx];
 		group_key = 1;
 	} else {
-		if (idx != 0)
+		/* some Cisco APs use idx>0 for unicast in dynamic WEP */
+		if (idx != 0 && ext->alg != IW_ENCODE_ALG_WEP)
 			return -EINVAL;
 		if (ieee->iw_mode == IW_MODE_INFRA)
 			crypt = &ieee->crypt[idx];
@@ -688,7 +691,8 @@ int ieee80211_wx_get_encodeext(struct ieee80211_device *ieee,
 	} else
 		idx = ieee->tx_keyidx;
 
-	if (!ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY)
+	if (!ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY &&
+	    ext->alg != IW_ENCODE_ALG_WEP)
 		if (idx != 0 || ieee->iw_mode != IW_MODE_INFRA)
 			return -EINVAL;
 
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 175e093ec564..e3eceecd0496 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -934,11 +934,11 @@ int icmp_rcv(struct sk_buff *skb)
 	case CHECKSUM_HW:
 		if (!(u16)csum_fold(skb->csum))
 			break;
-		LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n");
+		/* fall through */
 	case CHECKSUM_NONE:
-		if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))
+		skb->csum = 0;
+		if (__skb_checksum_complete(skb))
 			goto error;
-	default:;
 	}
 
 	if (!pskb_pull(skb, sizeof(struct icmphdr)))
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index c6247fc84060..c04607b49212 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -872,11 +872,18 @@ int igmp_rcv(struct sk_buff *skb)
 		return 0;
 	}
 
-	if (!pskb_may_pull(skb, sizeof(struct igmphdr)) ||
-	    (u16)csum_fold(skb_checksum(skb, 0, len, 0))) {
-		in_dev_put(in_dev);
-		kfree_skb(skb);
-		return 0;
+	if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
+		goto drop;
+
+	switch (skb->ip_summed) {
+	case CHECKSUM_HW:
+		if (!(u16)csum_fold(skb->csum))
+			break;
+		/* fall through */
+	case CHECKSUM_NONE:
+		skb->csum = 0;
+		if (__skb_checksum_complete(skb))
+			goto drop;
 	}
 
 	ih = skb->h.igmph;
@@ -906,6 +913,8 @@ int igmp_rcv(struct sk_buff *skb)
 	default:
 		NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type);
 	}
+
+drop:
 	in_dev_put(in_dev);
 	kfree_skb(skb);
 	return 0;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 71f3c7350c6e..39061ed53cfd 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -724,12 +724,6 @@ done:
 	return skb->len;
 }
 
-static int inet_diag_dump_done(struct netlink_callback *cb)
-{
-	return 0;
-}
-
-
 static __inline__ int
 inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
@@ -760,8 +754,7 @@ inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			goto err_inval;
 		}
 		return netlink_dump_start(idiagnl, skb, nlh,
-					  inet_diag_dump,
-					  inet_diag_dump_done);
+					  inet_diag_dump, NULL);
 	} else {
 		return inet_diag_get_exact(skb, nlh);
 	}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 896ce3f8f53a..4e9c74b54b15 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -577,15 +577,16 @@ static int ipgre_rcv(struct sk_buff *skb)
 		goto drop_nolock;
 
 	if (flags&GRE_CSUM) {
-		if (skb->ip_summed == CHECKSUM_HW) {
+		switch (skb->ip_summed) {
+		case CHECKSUM_HW:
 			csum = (u16)csum_fold(skb->csum);
-			if (csum)
-				skb->ip_summed = CHECKSUM_NONE;
-		}
-		if (skb->ip_summed == CHECKSUM_NONE) {
-			skb->csum = skb_checksum(skb, 0, skb->len, 0);
+			if (!csum)
+				break;
+			/* fall through */
+		case CHECKSUM_NONE:
+			skb->csum = 0;
+			csum = __skb_checksum_complete(skb);
 			skb->ip_summed = CHECKSUM_HW;
-			csum = (u16)csum_fold(skb->csum);
 		}
 		offset += 4;
 	}
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 7d917e4ce1d9..9d3c8b5f327e 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -5,6 +5,20 @@
5menu "IP: Netfilter Configuration" 5menu "IP: Netfilter Configuration"
6 depends on INET && NETFILTER 6 depends on INET && NETFILTER
7 7
8config NF_CONNTRACK_IPV4
9 tristate "IPv4 support for new connection tracking (EXPERIMENTAL)"
10 depends on EXPERIMENTAL && NF_CONNTRACK
11 ---help---
12 Connection tracking keeps a record of what packets have passed
13 through your machine, in order to figure out how they are related
14 into connections.
15
16 This is IPv4 support on Layer 3 independent connection tracking.
17 Layer 3 independent connection tracking is experimental scheme
18 which generalize ip_conntrack to support other layer 3 protocols.
19
20 To compile it as a module, choose M here. If unsure, say N.
21
8# connection tracking, helpers and protocols 22# connection tracking, helpers and protocols
9config IP_NF_CONNTRACK 23config IP_NF_CONNTRACK
10 tristate "Connection tracking (required for masq/NAT)" 24 tristate "Connection tracking (required for masq/NAT)"
@@ -209,8 +223,8 @@ config IP_NF_MATCH_PKTTYPE
209 tristate "Packet type match support" 223 tristate "Packet type match support"
210 depends on IP_NF_IPTABLES 224 depends on IP_NF_IPTABLES
211 help 225 help
212 Packet type matching allows you to match a packet by 226 Packet type matching allows you to match a packet by
213 its "class", eg. BROADCAST, MULTICAST, ... 227 its "class", eg. BROADCAST, MULTICAST, ...
214 228
215 Typical usage: 229 Typical usage:
216 iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG 230 iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG
@@ -317,7 +331,8 @@ config IP_NF_MATCH_TCPMSS
317 331
318config IP_NF_MATCH_HELPER 332config IP_NF_MATCH_HELPER
319 tristate "Helper match support" 333 tristate "Helper match support"
320 depends on IP_NF_CONNTRACK && IP_NF_IPTABLES 334 depends on IP_NF_IPTABLES
335 depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4
321 help 336 help
322 Helper matching allows you to match packets in dynamic connections 337 Helper matching allows you to match packets in dynamic connections
323 tracked by a conntrack-helper, ie. ip_conntrack_ftp 338 tracked by a conntrack-helper, ie. ip_conntrack_ftp
@@ -326,7 +341,8 @@ config IP_NF_MATCH_HELPER
326 341
327config IP_NF_MATCH_STATE 342config IP_NF_MATCH_STATE
328 tristate "Connection state match support" 343 tristate "Connection state match support"
329 depends on IP_NF_CONNTRACK && IP_NF_IPTABLES 344 depends on IP_NF_IPTABLES
345 depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4
330 help 346 help
331 Connection state matching allows you to match packets based on their 347 Connection state matching allows you to match packets based on their
332 relationship to a tracked connection (ie. previous packets). This 348 relationship to a tracked connection (ie. previous packets). This
@@ -336,7 +352,8 @@ config IP_NF_MATCH_STATE
336 352
337config IP_NF_MATCH_CONNTRACK 353config IP_NF_MATCH_CONNTRACK
338 tristate "Connection tracking match support" 354 tristate "Connection tracking match support"
339 depends on IP_NF_CONNTRACK && IP_NF_IPTABLES 355 depends on IP_NF_IPTABLES
356 depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4
340 help 357 help
341 This is a general conntrack match module, a superset of the state match. 358 This is a general conntrack match module, a superset of the state match.
342 359
@@ -422,7 +439,8 @@ config IP_NF_MATCH_COMMENT
422 439
423config IP_NF_MATCH_CONNMARK 440config IP_NF_MATCH_CONNMARK
424 tristate 'Connection mark match support' 441 tristate 'Connection mark match support'
425 depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES 442 depends on IP_NF_IPTABLES
443 depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4)
426 help 444 help
427 This option adds a `connmark' match, which allows you to match the 445 This option adds a `connmark' match, which allows you to match the
428 connection mark value previously set for the session by `CONNMARK'. 446 connection mark value previously set for the session by `CONNMARK'.
@@ -433,7 +451,8 @@ config IP_NF_MATCH_CONNMARK
433 451
434config IP_NF_MATCH_CONNBYTES 452config IP_NF_MATCH_CONNBYTES
435 tristate 'Connection byte/packet counter match support' 453 tristate 'Connection byte/packet counter match support'
436 depends on IP_NF_CT_ACCT && IP_NF_IPTABLES 454 depends on IP_NF_IPTABLES
455 depends on IP_NF_CT_ACCT || (NF_CT_ACCT && NF_CONNTRACK_IPV4)
437 help 456 help
438 This option adds a `connbytes' match, which allows you to match the 457 This option adds a `connbytes' match, which allows you to match the
439 number of bytes and/or packets for each direction within a connection. 458 number of bytes and/or packets for each direction within a connection.
@@ -747,7 +766,8 @@ config IP_NF_TARGET_TTL
747 766
748config IP_NF_TARGET_CONNMARK 767config IP_NF_TARGET_CONNMARK
749 tristate 'CONNMARK target support' 768 tristate 'CONNMARK target support'
750 depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE 769 depends on IP_NF_MANGLE
770 depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4)
751 help 771 help
752 This option adds a `CONNMARK' target, which allows one to manipulate 772 This option adds a `CONNMARK' target, which allows one to manipulate
753 the connection mark value. Similar to the MARK target, but 773 the connection mark value. Similar to the MARK target, but
@@ -759,7 +779,8 @@ config IP_NF_TARGET_CONNMARK
759 779
760config IP_NF_TARGET_CLUSTERIP 780config IP_NF_TARGET_CLUSTERIP
761 tristate "CLUSTERIP target support (EXPERIMENTAL)" 781 tristate "CLUSTERIP target support (EXPERIMENTAL)"
762 depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES && EXPERIMENTAL 782 depends on IP_NF_IPTABLES && EXPERIMENTAL
783 depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4)
763 help 784 help
764 The CLUSTERIP target allows you to build load-balancing clusters of 785 The CLUSTERIP target allows you to build load-balancing clusters of
765 network servers without having a dedicated load-balancing 786 network servers without having a dedicated load-balancing
@@ -782,7 +803,7 @@ config IP_NF_RAW
782config IP_NF_TARGET_NOTRACK 803config IP_NF_TARGET_NOTRACK
783 tristate 'NOTRACK target support' 804 tristate 'NOTRACK target support'
784 depends on IP_NF_RAW 805 depends on IP_NF_RAW
785 depends on IP_NF_CONNTRACK 806 depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4
786 help 807 help
787 The NOTRACK target allows a select rule to specify 808 The NOTRACK target allows a select rule to specify
788 which packets *not* to enter the conntrack/NAT 809 which packets *not* to enter the conntrack/NAT
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index dab4b58dd31e..058c48e258fc 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -103,3 +103,9 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
 obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
 
 obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
+
+# objects for l3 independent conntrack
+nf_conntrack_ipv4-objs := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
+
+# l3 independent conntrack
+obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
index 82a65043a8ef..d2a4fec22862 100644
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -28,11 +28,8 @@
 #include <linux/netlink.h>
 #include <linux/spinlock.h>
 #include <linux/notifier.h>
-#include <linux/rtnetlink.h>
 
 #include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/netfilter_ipv4/ip_conntrack.h>
 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
@@ -58,14 +55,17 @@ ctnetlink_dump_tuples_proto(struct sk_buff *skb,
 			    const struct ip_conntrack_tuple *tuple)
 {
 	struct ip_conntrack_protocol *proto;
+	int ret = 0;
 
 	NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum);
 
 	proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
-	if (proto && proto->tuple_to_nfattr)
-		return proto->tuple_to_nfattr(skb, tuple);
+	if (likely(proto && proto->tuple_to_nfattr)) {
+		ret = proto->tuple_to_nfattr(skb, tuple);
+		ip_conntrack_proto_put(proto);
+	}
 
-	return 0;
+	return ret;
 
 nfattr_failure:
 	return -1;
@@ -175,7 +175,7 @@ ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct,
 {
 	enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
 	struct nfattr *nest_count = NFA_NEST(skb, type);
-	u_int64_t tmp;
+	u_int32_t tmp;
 
 	tmp = htonl(ct->counters[dir].packets);
 	NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(u_int32_t), &tmp);
@@ -479,9 +479,7 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple)
 
 	DEBUGP("entered %s\n", __FUNCTION__);
 
-
-	if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0)
-		goto nfattr_failure;
+	nfattr_parse_nested(tb, CTA_IP_MAX, attr);
 
 	if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip))
 		return -EINVAL;
@@ -497,9 +495,6 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple)
 	DEBUGP("leaving\n");
 
 	return 0;
-
-nfattr_failure:
-	return -1;
 }
 
 static const int cta_min_proto[CTA_PROTO_MAX] = {
@@ -521,8 +516,7 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr,
 
 	DEBUGP("entered %s\n", __FUNCTION__);
 
-	if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0)
-		goto nfattr_failure;
+	nfattr_parse_nested(tb, CTA_PROTO_MAX, attr);
 
 	if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
 		return -EINVAL;
@@ -539,9 +533,6 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr,
 	}
 
 	return ret;
-
-nfattr_failure:
-	return -1;
 }
 
 static inline int
@@ -555,8 +546,7 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple,
 
 	memset(tuple, 0, sizeof(*tuple));
 
-	if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0)
-		goto nfattr_failure;
+	nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]);
 
 	if (!tb[CTA_TUPLE_IP-1])
 		return -EINVAL;
@@ -583,9 +573,6 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple,
 	DEBUGP("leaving\n");
 
 	return 0;
-
-nfattr_failure:
-	return -1;
 }
 
 #ifdef CONFIG_IP_NF_NAT_NEEDED
@@ -603,11 +590,10 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr,
 
 	DEBUGP("entered %s\n", __FUNCTION__);
 
-	if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0)
-		goto nfattr_failure;
+	nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr);
 
 	if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat))
-		goto nfattr_failure;
+		return -EINVAL;
 
 	npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
 	if (!npt)
@@ -626,9 +612,6 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr,
 
 	DEBUGP("leaving\n");
 	return 0;
-
-nfattr_failure:
-	return -1;
 }
 
 static inline int
@@ -642,8 +625,7 @@ ctnetlink_parse_nat(struct nfattr *cda[],
 
 	memset(range, 0, sizeof(*range));
 
-	if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0)
-		goto nfattr_failure;
+	nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]);
 
 	if (tb[CTA_NAT_MINIP-1])
 		range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]);
@@ -665,9 +647,6 @@ ctnetlink_parse_nat(struct nfattr *cda[],
 
 	DEBUGP("leaving\n");
 	return 0;
-
-nfattr_failure:
-	return -1;
 }
 #endif
 
@@ -678,8 +657,7 @@ ctnetlink_parse_help(struct nfattr *attr, char **helper_name)
 
 	DEBUGP("entered %s\n", __FUNCTION__);
 
-	if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0)
-		goto nfattr_failure;
+	nfattr_parse_nested(tb, CTA_HELP_MAX, attr);
 
 	if (!tb[CTA_HELP_NAME-1])
 		return -EINVAL;
@@ -687,9 +665,6 @@ ctnetlink_parse_help(struct nfattr *attr, char **helper_name)
 	*helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]);
 
 	return 0;
-
-nfattr_failure:
-	return -1;
 }
 
 static int
@@ -804,7 +779,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
 	ct = tuplehash_to_ctrack(h);
 
 	err = -ENOMEM;
-	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (!skb2) {
 		ip_conntrack_put(ct);
 		return -ENOMEM;
@@ -827,7 +802,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
 free:
 	kfree_skb(skb2);
 out:
-	return -1;
+	return err;
 }
 
 static inline int
@@ -957,8 +932,7 @@ ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[])
 	u_int16_t npt = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
 	int err = 0;
 
-	if (nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr) < 0)
-		goto nfattr_failure;
+	nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr);
 
 	proto = ip_conntrack_proto_find_get(npt);
 	if (!proto)
@@ -969,9 +943,6 @@ ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[])
 	ip_conntrack_proto_put(proto);
 
 	return err;
-
-nfattr_failure:
-	return -ENOMEM;
 }
 
 static int
@@ -1005,6 +976,11 @@ ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[])
 		return err;
 	}
 
+#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
+	if (cda[CTA_MARK-1])
+		ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1]));
+#endif
+
 	DEBUGP("all done\n");
 	return 0;
 }
@@ -1048,6 +1024,11 @@ ctnetlink_create_conntrack(struct nfattr *cda[],
 	if (ct->helper)
 		ip_conntrack_helper_put(ct->helper);
 
+#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
+	if (cda[CTA_MARK-1])
+		ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1]));
+#endif
+
 	DEBUGP("conntrack with id %u inserted\n", ct->id);
 	return 0;
 
@@ -1312,6 +1293,14 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
 	if (!exp)
 		return -ENOENT;
 
+	if (cda[CTA_EXPECT_ID-1]) {
+		u_int32_t id = *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]);
+		if (exp->id != ntohl(id)) {
+			ip_conntrack_expect_put(exp);
+			return -ENOENT;
+		}
+	}
+
 	err = -ENOMEM;
 	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (!skb2)
@@ -1387,7 +1376,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 			ip_conntrack_expect_put(exp);
 		}
 	}
-	write_unlock(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 	} else {
 		/* This basically means we have to flush everything*/
 		write_lock_bh(&ip_conntrack_lock);
@@ -1554,6 +1543,8 @@ static struct nfnetlink_subsystem ctnl_exp_subsys = {
 	.cb = ctnl_exp_cb,
 };
 
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK);
+
 static int __init ctnetlink_init(void)
 {
 	int ret;
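
[Annotation, not part of the patch] A pattern repeated throughout the ip_conntrack_netlink.c hunks above: nfattr_parse_nested() no longer returns a failure code, so the per-caller nfattr_failure labels disappear and size checking moves to a separate nfattr_bad_size() step. A minimal sketch of the resulting parse-then-validate shape, reusing the CTA_PROTO names from this file (illustrative only):

	/* Sketch (illustrative): fill tb[] from a nested attribute, then
	 * validate per-attribute minimum sizes in a distinct step.
	 */
	static int parse_proto_attr(struct nfattr *attr)
	{
		struct nfattr *tb[CTA_PROTO_MAX];

		nfattr_parse_nested(tb, CTA_PROTO_MAX, attr);
		if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
			return -EINVAL;	/* malformed attribute */
		return 0;
	}
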
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index 98f0015dd255..e4d6b268e8c4 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -13,6 +13,7 @@
 #include <linux/in.h>
 #include <linux/icmp.h>
 #include <linux/seq_file.h>
+#include <linux/skbuff.h>
 #include <net/ip.h>
 #include <net/checksum.h>
 #include <linux/netfilter.h>
@@ -151,13 +152,13 @@ icmp_error_message(struct sk_buff *skb,
 	/* Not enough header? */
 	inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in);
 	if (inside == NULL)
-		return NF_ACCEPT;
+		return -NF_ACCEPT;
 
 	/* Ignore ICMP's containing fragments (shouldn't happen) */
 	if (inside->ip.frag_off & htons(IP_OFFSET)) {
 		DEBUGP("icmp_error_track: fragment of proto %u\n",
 		       inside->ip.protocol);
-		return NF_ACCEPT;
+		return -NF_ACCEPT;
 	}
 
 	innerproto = ip_conntrack_proto_find_get(inside->ip.protocol);
@@ -166,7 +167,7 @@ icmp_error_message(struct sk_buff *skb,
 	if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) {
 		DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol);
 		ip_conntrack_proto_put(innerproto);
-		return NF_ACCEPT;
+		return -NF_ACCEPT;
 	}
 
 	/* Ordinarily, we'd expect the inverted tupleproto, but it's
@@ -174,7 +175,7 @@ icmp_error_message(struct sk_buff *skb,
 	if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) {
 		DEBUGP("icmp_error_track: Can't invert tuple\n");
 		ip_conntrack_proto_put(innerproto);
-		return NF_ACCEPT;
+		return -NF_ACCEPT;
 	}
 	ip_conntrack_proto_put(innerproto);
 
@@ -190,7 +191,7 @@ icmp_error_message(struct sk_buff *skb,
 
 	if (!h) {
 		DEBUGP("icmp_error_track: no match\n");
-		return NF_ACCEPT;
+		return -NF_ACCEPT;
 	}
 	/* Reverse direction from that found */
 	if (DIRECTION(h) != IP_CT_DIR_REPLY)
@@ -230,19 +231,15 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 	case CHECKSUM_HW:
 		if (!(u16)csum_fold(skb->csum))
 			break;
-		if (LOG_INVALID(IPPROTO_ICMP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
-				      "ip_ct_icmp: bad HW ICMP checksum ");
-		return -NF_ACCEPT;
+		/* fall through */
 	case CHECKSUM_NONE:
-		if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) {
+		skb->csum = 0;
+		if (__skb_checksum_complete(skb)) {
 			if (LOG_INVALID(IPPROTO_ICMP))
 				nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 					      "ip_ct_icmp: bad ICMP checksum ");
 			return -NF_ACCEPT;
 		}
-	default:
-		break;
 	}
 
 checksum_skipped:
@@ -296,7 +293,8 @@ static int icmp_nfattr_to_tuple(struct nfattr *tb[],
 				struct ip_conntrack_tuple *tuple)
 {
 	if (!tb[CTA_PROTO_ICMP_TYPE-1]
-	    || !tb[CTA_PROTO_ICMP_CODE-1])
+	    || !tb[CTA_PROTO_ICMP_CODE-1]
+	    || !tb[CTA_PROTO_ICMP_ID-1])
 		return -1;
 
 	tuple->dst.u.icmp.type =
@@ -304,7 +302,7 @@ static int icmp_nfattr_to_tuple(struct nfattr *tb[],
 	tuple->dst.u.icmp.code =
 		*(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]);
 	tuple->src.u.icmp.id =
-		*(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]);
+		*(u_int16_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]);
 
 	return 0;
 }
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index d6701cafbcc2..468c6003b4c7 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -362,8 +362,12 @@ static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct)
 	struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1];
 	struct nfattr *tb[CTA_PROTOINFO_TCP_MAX];
 
-	if (nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr) < 0)
-		goto nfattr_failure;
+	/* updates could not contain anything about the private
+	 * protocol info, in that case skip the parsing */
+	if (!attr)
+		return 0;
+
+	nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr);
 
 	if (!tb[CTA_PROTOINFO_TCP_STATE-1])
 		return -EINVAL;
@@ -374,9 +378,6 @@ static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct)
 	write_unlock_bh(&tcp_lock);
 
 	return 0;
-
-nfattr_failure:
-	return -1;
 }
 #endif
 
diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c
index ee6ab74ad3a9..e546203f5662 100644
--- a/net/ipv4/netfilter/ip_nat_helper_pptp.c
+++ b/net/ipv4/netfilter/ip_nat_helper_pptp.c
@@ -73,6 +73,7 @@ static void pptp_nat_expected(struct ip_conntrack *ct,
 	struct ip_conntrack_tuple t;
 	struct ip_ct_pptp_master *ct_pptp_info;
 	struct ip_nat_pptp *nat_pptp_info;
+	struct ip_nat_range range;
 
 	ct_pptp_info = &master->help.ct_pptp_info;
 	nat_pptp_info = &master->nat.help.nat_pptp_info;
@@ -110,7 +111,30 @@ static void pptp_nat_expected(struct ip_conntrack *ct,
 		DEBUGP("not found!\n");
 	}
 
-	ip_nat_follow_master(ct, exp);
+	/* This must be a fresh one. */
+	BUG_ON(ct->status & IPS_NAT_DONE_MASK);
+
+	/* Change src to where master sends to */
+	range.flags = IP_NAT_RANGE_MAP_IPS;
+	range.min_ip = range.max_ip
+		= ct->master->tuplehash[!exp->dir].tuple.dst.ip;
+	if (exp->dir == IP_CT_DIR_ORIGINAL) {
+		range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
+		range.min = range.max = exp->saved_proto;
+	}
+	/* hook doesn't matter, but it has to do source manip */
+	ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
+
+	/* For DST manip, map port here to where it's expected. */
+	range.flags = IP_NAT_RANGE_MAP_IPS;
+	range.min_ip = range.max_ip
+		= ct->master->tuplehash[!exp->dir].tuple.src.ip;
+	if (exp->dir == IP_CT_DIR_REPLY) {
+		range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
+		range.min = range.max = exp->saved_proto;
+	}
+	/* hook doesn't matter, but it has to do destination manip */
+	ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
 }
 
 /* outbound packets == from PNS to PAC */
@@ -213,7 +237,7 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig,
 
 	/* alter expectation for PNS->PAC direction */
 	invert_tuplepr(&inv_t, &expect_orig->tuple);
-	expect_orig->saved_proto.gre.key = htons(nat_pptp_info->pac_call_id);
+	expect_orig->saved_proto.gre.key = htons(ct_pptp_info->pns_call_id);
 	expect_orig->tuple.src.u.gre.key = htons(nat_pptp_info->pns_call_id);
 	expect_orig->tuple.dst.u.gre.key = htons(ct_pptp_info->pac_call_id);
 	expect_orig->dir = IP_CT_DIR_ORIGINAL;
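The expanded pptp_nat_expected() sets up the source and destination manips by hand rather than calling ip_nat_follow_master(), because only one side of the expected GRE connection carries a translated call ID, selected by exp->dir. The selection logic in isolation (hypothetical helper; field and constant names as in the hunk above):

/* Sketch: decide whether a NAT range needs the saved GRE key.
 * The source manip carries it for original-direction (PNS->PAC)
 * expectations, the destination manip for the reply direction. */
static void pptp_fill_range(const struct ip_conntrack_expect *exp,
			    int src_manip, struct ip_nat_range *range)
{
	int want_key = src_manip ? (exp->dir == IP_CT_DIR_ORIGINAL)
				 : (exp->dir == IP_CT_DIR_REPLY);

	range->flags = IP_NAT_RANGE_MAP_IPS;
	if (want_key) {
		range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
		range->min = range->max = exp->saved_proto;
	}
}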
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 9bcb398fbc1f..45c52d8f4d99 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -29,7 +29,7 @@
 
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
-#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <net/netfilter/nf_conntrack_compat.h>
 
 #define CLUSTERIP_VERSION "0.8"
 
@@ -316,14 +316,14 @@ target(struct sk_buff **pskb,
 {
 	const struct ipt_clusterip_tgt_info *cipinfo = targinfo;
 	enum ip_conntrack_info ctinfo;
-	struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
-	u_int32_t hash;
+	u_int32_t *mark, hash;
 
 	/* don't need to clusterip_config_get() here, since refcount
 	 * is only decremented by destroy() - and ip_tables guarantees
 	 * that the ->target() function isn't called after ->destroy() */
 
-	if (!ct) {
+	mark = nf_ct_get_mark((*pskb), &ctinfo);
+	if (mark == NULL) {
 		printk(KERN_ERR "CLUSTERIP: no conntrack!\n");
 		/* FIXME: need to drop invalid ones, since replies
 		 * to outgoing connections of other nodes will be
@@ -346,7 +346,7 @@ target(struct sk_buff **pskb,
 
 	switch (ctinfo) {
 	case IP_CT_NEW:
-		ct->mark = hash;
+		*mark = hash;
 		break;
 	case IP_CT_RELATED:
 	case IP_CT_RELATED+IP_CT_IS_REPLY:
@@ -363,7 +363,7 @@ target(struct sk_buff **pskb,
 #ifdef DEBUG_CLUSTERP
 	DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 #endif
-	DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark);
+	DEBUGP("hash=%u ct_hash=%u ", hash, *mark);
 	if (!clusterip_responsible(cipinfo->config, hash)) {
 		DEBUGP("not responsible\n");
 		return NF_DROP;
diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c
index 05d66ab59424..8acac5a40a92 100644
--- a/net/ipv4/netfilter/ipt_CONNMARK.c
+++ b/net/ipv4/netfilter/ipt_CONNMARK.c
@@ -29,7 +29,7 @@ MODULE_LICENSE("GPL");
 
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/netfilter_ipv4/ipt_CONNMARK.h>
-#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <net/netfilter/nf_conntrack_compat.h>
 
 static unsigned int
 target(struct sk_buff **pskb,
@@ -43,24 +43,24 @@ target(struct sk_buff **pskb,
 	u_int32_t diff;
 	u_int32_t nfmark;
 	u_int32_t newmark;
+	u_int32_t ctinfo;
+	u_int32_t *ctmark = nf_ct_get_mark(*pskb, &ctinfo);
 
-	enum ip_conntrack_info ctinfo;
-	struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
-	if (ct) {
+	if (ctmark) {
 		switch(markinfo->mode) {
 		case IPT_CONNMARK_SET:
-			newmark = (ct->mark & ~markinfo->mask) | markinfo->mark;
-			if (newmark != ct->mark)
-				ct->mark = newmark;
+			newmark = (*ctmark & ~markinfo->mask) | markinfo->mark;
+			if (newmark != *ctmark)
+				*ctmark = newmark;
 			break;
 		case IPT_CONNMARK_SAVE:
-			newmark = (ct->mark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask);
-			if (ct->mark != newmark)
-				ct->mark = newmark;
+			newmark = (*ctmark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask);
+			if (*ctmark != newmark)
+				*ctmark = newmark;
 			break;
 		case IPT_CONNMARK_RESTORE:
 			nfmark = (*pskb)->nfmark;
-			diff = (ct->mark ^ nfmark) & markinfo->mask;
+			diff = (*ctmark ^ nfmark) & markinfo->mask;
 			if (diff != 0)
 				(*pskb)->nfmark = nfmark ^ diff;
 			break;
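Both mark targets now reach the connection mark through nf_conntrack_compat.h, which resolves to whichever conntrack core is compiled in and hands back a pointer (or NULL) instead of a struct ip_conntrack. The resulting idiom, as a minimal sketch (hypothetical wrapper around the calls used above):

/* Sketch: the compat-layer pattern for updating a conntrack mark.
 * nf_ct_get_mark() returns NULL for untracked/invalid packets, so
 * callers must test before dereferencing. */
static void set_conn_mark(struct sk_buff *skb, u_int32_t value, u_int32_t mask)
{
	u_int32_t ctinfo;
	u_int32_t *ctmark = nf_ct_get_mark(skb, &ctinfo);

	if (ctmark == NULL)
		return;
	*ctmark = (*ctmark & ~mask) | (value & mask);
}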
diff --git a/net/ipv4/netfilter/ipt_NOTRACK.c b/net/ipv4/netfilter/ipt_NOTRACK.c
index a4bb9b3bc292..e3c69d072c6e 100644
--- a/net/ipv4/netfilter/ipt_NOTRACK.c
+++ b/net/ipv4/netfilter/ipt_NOTRACK.c
@@ -5,7 +5,7 @@
 #include <linux/skbuff.h>
 
 #include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <net/netfilter/nf_conntrack_compat.h>
 
 static unsigned int
 target(struct sk_buff **pskb,
@@ -23,7 +23,7 @@ target(struct sk_buff **pskb,
 	   If there is a real ct entry corresponding to this packet,
 	   it'll hang around till timing out. We don't deal with it
 	   for performance reasons. JK */
-	(*pskb)->nfct = &ip_conntrack_untracked.ct_general;
+	nf_ct_untrack(*pskb);
 	(*pskb)->nfctinfo = IP_CT_NEW;
 	nf_conntrack_get((*pskb)->nfct);
 
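nf_ct_untrack() is the compat spelling of attaching the shared "untracked" conntrack entry; on an nf_conntrack build it expands to roughly the assignment the old code did by hand. A sketch of the effect (hypothetical helper, not the exact macro body):

/* Sketch: exempt a packet from tracking by pointing its nfct at
 * the global dummy entry; the nf_conntrack_get() that follows in
 * the target pins a reference on that entry. */
static void untrack_skb(struct sk_buff *skb)
{
	skb->nfct = &nf_conntrack_untracked.ct_general;
	skb->nfctinfo = IP_CT_NEW;
	nf_conntrack_get(skb->nfct);
}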
diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c
index df4a42c6da22..d68a048b7176 100644
--- a/net/ipv4/netfilter/ipt_connbytes.c
+++ b/net/ipv4/netfilter/ipt_connbytes.c
@@ -10,7 +10,7 @@
  */
 #include <linux/module.h>
 #include <linux/skbuff.h>
-#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <net/netfilter/nf_conntrack_compat.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/netfilter_ipv4/ipt_connbytes.h>
 
@@ -46,60 +46,59 @@ match(const struct sk_buff *skb,
       int *hotdrop)
 {
 	const struct ipt_connbytes_info *sinfo = matchinfo;
-	enum ip_conntrack_info ctinfo;
-	struct ip_conntrack *ct;
 	u_int64_t what = 0;	/* initialize to make gcc happy */
+	const struct ip_conntrack_counter *counters;
 
-	if (!(ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo)))
+	if (!(counters = nf_ct_get_counters(skb)))
 		return 0; /* no match */
 
 	switch (sinfo->what) {
 	case IPT_CONNBYTES_PKTS:
 		switch (sinfo->direction) {
 		case IPT_CONNBYTES_DIR_ORIGINAL:
-			what = ct->counters[IP_CT_DIR_ORIGINAL].packets;
+			what = counters[IP_CT_DIR_ORIGINAL].packets;
 			break;
 		case IPT_CONNBYTES_DIR_REPLY:
-			what = ct->counters[IP_CT_DIR_REPLY].packets;
+			what = counters[IP_CT_DIR_REPLY].packets;
 			break;
 		case IPT_CONNBYTES_DIR_BOTH:
-			what = ct->counters[IP_CT_DIR_ORIGINAL].packets;
-			what += ct->counters[IP_CT_DIR_REPLY].packets;
+			what = counters[IP_CT_DIR_ORIGINAL].packets;
+			what += counters[IP_CT_DIR_REPLY].packets;
 			break;
 		}
 		break;
 	case IPT_CONNBYTES_BYTES:
 		switch (sinfo->direction) {
 		case IPT_CONNBYTES_DIR_ORIGINAL:
-			what = ct->counters[IP_CT_DIR_ORIGINAL].bytes;
+			what = counters[IP_CT_DIR_ORIGINAL].bytes;
 			break;
 		case IPT_CONNBYTES_DIR_REPLY:
-			what = ct->counters[IP_CT_DIR_REPLY].bytes;
+			what = counters[IP_CT_DIR_REPLY].bytes;
 			break;
 		case IPT_CONNBYTES_DIR_BOTH:
-			what = ct->counters[IP_CT_DIR_ORIGINAL].bytes;
-			what += ct->counters[IP_CT_DIR_REPLY].bytes;
+			what = counters[IP_CT_DIR_ORIGINAL].bytes;
+			what += counters[IP_CT_DIR_REPLY].bytes;
 			break;
 		}
 		break;
 	case IPT_CONNBYTES_AVGPKT:
 		switch (sinfo->direction) {
 		case IPT_CONNBYTES_DIR_ORIGINAL:
-			what = div64_64(ct->counters[IP_CT_DIR_ORIGINAL].bytes,
-					ct->counters[IP_CT_DIR_ORIGINAL].packets);
+			what = div64_64(counters[IP_CT_DIR_ORIGINAL].bytes,
+					counters[IP_CT_DIR_ORIGINAL].packets);
 			break;
 		case IPT_CONNBYTES_DIR_REPLY:
-			what = div64_64(ct->counters[IP_CT_DIR_REPLY].bytes,
-					ct->counters[IP_CT_DIR_REPLY].packets);
+			what = div64_64(counters[IP_CT_DIR_REPLY].bytes,
+					counters[IP_CT_DIR_REPLY].packets);
 			break;
 		case IPT_CONNBYTES_DIR_BOTH:
 			{
 				u_int64_t bytes;
 				u_int64_t pkts;
-				bytes = ct->counters[IP_CT_DIR_ORIGINAL].bytes +
-					ct->counters[IP_CT_DIR_REPLY].bytes;
-				pkts = ct->counters[IP_CT_DIR_ORIGINAL].packets +
-					ct->counters[IP_CT_DIR_REPLY].packets;
+				bytes = counters[IP_CT_DIR_ORIGINAL].bytes +
+					counters[IP_CT_DIR_REPLY].bytes;
+				pkts = counters[IP_CT_DIR_ORIGINAL].packets +
+					counters[IP_CT_DIR_REPLY].packets;
 
 				/* FIXME_THEORETICAL: what to do if sum
 				 * overflows ? */
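The AVGPKT branch needs div64_64() because both counters are 64-bit and a plain '/' on two u64 operands is not available on every 32-bit target. The both-directions case in isolation (hypothetical helper; counter layout as in the match above, with a zero-packet guard added for the sketch):

/* Sketch: average bytes per packet across both flow directions. */
static u_int64_t avg_pkt_both(const struct ip_conntrack_counter *c)
{
	u_int64_t bytes = c[IP_CT_DIR_ORIGINAL].bytes +
			  c[IP_CT_DIR_REPLY].bytes;
	u_int64_t pkts = c[IP_CT_DIR_ORIGINAL].packets +
			 c[IP_CT_DIR_REPLY].packets;

	return pkts ? div64_64(bytes, pkts) : 0;
}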
diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c
index bf8de47ce004..5306ef293b92 100644
--- a/net/ipv4/netfilter/ipt_connmark.c
+++ b/net/ipv4/netfilter/ipt_connmark.c
@@ -28,7 +28,7 @@ MODULE_LICENSE("GPL");
 
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/netfilter_ipv4/ipt_connmark.h>
-#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <net/netfilter/nf_conntrack_compat.h>
 
 static int
 match(const struct sk_buff *skb,
@@ -39,12 +39,12 @@ match(const struct sk_buff *skb,
       int *hotdrop)
 {
 	const struct ipt_connmark_info *info = matchinfo;
-	enum ip_conntrack_info ctinfo;
-	struct ip_conntrack *ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo);
-	if (!ct)
+	u_int32_t ctinfo;
+	const u_int32_t *ctmark = nf_ct_get_mark(skb, &ctinfo);
+	if (!ctmark)
 		return 0;
 
-	return ((ct->mark & info->mask) == info->mark) ^ info->invert;
+	return (((*ctmark) & info->mask) == info->mark) ^ info->invert;
 }
 
 static int
diff --git a/net/ipv4/netfilter/ipt_conntrack.c b/net/ipv4/netfilter/ipt_conntrack.c
index c1d22801b7cf..c8d18705469b 100644
--- a/net/ipv4/netfilter/ipt_conntrack.c
+++ b/net/ipv4/netfilter/ipt_conntrack.c
@@ -10,7 +10,14 @@
 
 #include <linux/module.h>
 #include <linux/skbuff.h>
+
+#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)
 #include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_tuple.h>
+#else
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/netfilter_ipv4/ipt_conntrack.h>
 
@@ -18,6 +25,8 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
 MODULE_DESCRIPTION("iptables connection tracking match module");
 
+#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)
+
 static int
 match(const struct sk_buff *skb,
       const struct net_device *in,
@@ -102,6 +111,93 @@ match(const struct sk_buff *skb,
 	return 1;
 }
 
+#else /* CONFIG_IP_NF_CONNTRACK */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_conntrack_info *sinfo = matchinfo;
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	unsigned int statebit;
+
+	ct = nf_ct_get((struct sk_buff *)skb, &ctinfo);
+
+#define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg))
+
+	if (ct == &nf_conntrack_untracked)
+		statebit = IPT_CONNTRACK_STATE_UNTRACKED;
+	else if (ct)
+		statebit = IPT_CONNTRACK_STATE_BIT(ctinfo);
+	else
+		statebit = IPT_CONNTRACK_STATE_INVALID;
+
+	if (sinfo->flags & IPT_CONNTRACK_STATE) {
+		if (ct) {
+			if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip !=
+			    ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip)
+				statebit |= IPT_CONNTRACK_STATE_SNAT;
+
+			if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip !=
+			    ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip)
+				statebit |= IPT_CONNTRACK_STATE_DNAT;
+		}
+
+		if (FWINV((statebit & sinfo->statemask) == 0, IPT_CONNTRACK_STATE))
+			return 0;
+	}
+
+	if (sinfo->flags & IPT_CONNTRACK_PROTO) {
+		if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, IPT_CONNTRACK_PROTO))
+			return 0;
+	}
+
+	if (sinfo->flags & IPT_CONNTRACK_ORIGSRC) {
+		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip & sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, IPT_CONNTRACK_ORIGSRC))
+			return 0;
+	}
+
+	if (sinfo->flags & IPT_CONNTRACK_ORIGDST) {
+		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip & sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, IPT_CONNTRACK_ORIGDST))
+			return 0;
+	}
+
+	if (sinfo->flags & IPT_CONNTRACK_REPLSRC) {
+		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip & sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, IPT_CONNTRACK_REPLSRC))
+			return 0;
+	}
+
+	if (sinfo->flags & IPT_CONNTRACK_REPLDST) {
+		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip & sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, IPT_CONNTRACK_REPLDST))
+			return 0;
+	}
+
+	if (sinfo->flags & IPT_CONNTRACK_STATUS) {
+		if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, IPT_CONNTRACK_STATUS))
+			return 0;
+	}
+
+	if (sinfo->flags & IPT_CONNTRACK_EXPIRES) {
+		unsigned long expires;
+
+		if (!ct)
+			return 0;
+
+		expires = timer_pending(&ct->timeout) ? (ct->timeout.expires - jiffies)/HZ : 0;
+
+		if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), IPT_CONNTRACK_EXPIRES))
+			return 0;
+	}
+
+	return 1;
+}
+
+#endif /* CONFIG_IP_NF_CONNTRACK */
+
 static int check(const char *tablename,
 		 const struct ipt_ip *ip,
 		 void *matchinfo,
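The FWINV() macro defined in the nf_conntrack variant of match() is the standard iptables invert idiom: each test result is XORed with the corresponding invert flag, so one expression serves both "--origsrc x" and "! --origsrc x". A self-contained illustration (userspace C; the flag value is hypothetical):

#include <stdio.h>

/* Sketch of the FWINV idiom: XOR a condition with its invert bit. */
#define FWINV(cond, invflags, flag) ((cond) ^ !!((invflags) & (flag)))

int main(void)
{
	unsigned int invflags = 0x1;	/* user negated the first test */

	printf("%d\n", FWINV(1, invflags, 0x1));	/* 1 ^ 1 = 0: inverted away */
	printf("%d\n", FWINV(0, invflags, 0x1));	/* 0 ^ 1 = 1: mismatch matches */
	return 0;
}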
diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c
index 3e7dd014de43..bf14e1c7798a 100644
--- a/net/ipv4/netfilter/ipt_helper.c
+++ b/net/ipv4/netfilter/ipt_helper.c
@@ -13,9 +13,15 @@
 #include <linux/module.h>
 #include <linux/skbuff.h>
 #include <linux/netfilter.h>
+#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)
 #include <linux/netfilter_ipv4/ip_conntrack.h>
 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#else
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#endif
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/netfilter_ipv4/ipt_helper.h>
 
@@ -29,6 +35,7 @@ MODULE_DESCRIPTION("iptables helper match module");
 #define DEBUGP(format, args...)
 #endif
 
+#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)
 static int
 match(const struct sk_buff *skb,
       const struct net_device *in,
@@ -73,6 +80,53 @@ out_unlock:
 	return ret;
 }
 
+#else /* CONFIG_IP_NF_CONNTRACK */
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_helper_info *info = matchinfo;
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	int ret = info->invert;
+
+	ct = nf_ct_get((struct sk_buff *)skb, &ctinfo);
+	if (!ct) {
+		DEBUGP("ipt_helper: Eek! invalid conntrack?\n");
+		return ret;
+	}
+
+	if (!ct->master) {
+		DEBUGP("ipt_helper: conntrack %p has no master\n", ct);
+		return ret;
+	}
+
+	read_lock_bh(&nf_conntrack_lock);
+	if (!ct->master->helper) {
+		DEBUGP("ipt_helper: master ct %p has no helper\n",
+		       ct->master);
+		goto out_unlock;
+	}
+
+	DEBUGP("master's name = %s , info->name = %s\n",
+	       ct->master->helper->name, info->name);
+
+	if (info->name[0] == '\0')
+		ret ^= 1;
+	else
+		ret ^= !strncmp(ct->master->helper->name, info->name,
+				strlen(ct->master->helper->name));
+out_unlock:
+	read_unlock_bh(&nf_conntrack_lock);
+	return ret;
+}
+#endif
+
 static int check(const char *tablename,
 		 const struct ipt_ip *ip,
 		 void *matchinfo,
diff --git a/net/ipv4/netfilter/ipt_state.c b/net/ipv4/netfilter/ipt_state.c
index b1511b97ea5f..4d7f16b70cec 100644
--- a/net/ipv4/netfilter/ipt_state.c
+++ b/net/ipv4/netfilter/ipt_state.c
@@ -10,7 +10,7 @@
 
 #include <linux/module.h>
 #include <linux/skbuff.h>
-#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <net/netfilter/nf_conntrack_compat.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/netfilter_ipv4/ipt_state.h>
 
@@ -30,9 +30,9 @@ match(const struct sk_buff *skb,
 	enum ip_conntrack_info ctinfo;
 	unsigned int statebit;
 
-	if (skb->nfct == &ip_conntrack_untracked.ct_general)
+	if (nf_ct_is_untracked(skb))
 		statebit = IPT_STATE_UNTRACKED;
-	else if (!ip_conntrack_get(skb, &ctinfo))
+	else if (!nf_ct_get_ctinfo(skb, &ctinfo))
 		statebit = IPT_STATE_INVALID;
 	else
 		statebit = IPT_STATE_BIT(ctinfo);
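After the conversion, the state classification in ipt_state is one three-way decision built entirely from compat helpers. Extracted as a sketch (hypothetical function; same helpers the hunk uses):

/* Sketch: classify a packet for the state match. */
static unsigned int classify_state(const struct sk_buff *skb)
{
	enum ip_conntrack_info ctinfo;

	if (nf_ct_is_untracked(skb))
		return IPT_STATE_UNTRACKED;
	if (!nf_ct_get_ctinfo(skb, &ctinfo))
		return IPT_STATE_INVALID;
	return IPT_STATE_BIT(ctinfo);
}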
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
new file mode 100644
index 000000000000..8202c1c0afad
--- /dev/null
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -0,0 +1,571 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
9 * - move L3 protocol dependent part to this file.
10 * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
11 * - add get_features() to support various size of conntrack
12 * structures.
13 *
14 * Derived from net/ipv4/netfilter/ip_conntrack_standalone.c
15 */
16
17#include <linux/config.h>
18#include <linux/types.h>
19#include <linux/ip.h>
20#include <linux/netfilter.h>
21#include <linux/module.h>
22#include <linux/skbuff.h>
23#include <linux/icmp.h>
24#include <linux/sysctl.h>
25#include <net/ip.h>
26
27#include <linux/netfilter_ipv4.h>
28#include <net/netfilter/nf_conntrack.h>
29#include <net/netfilter/nf_conntrack_helper.h>
30#include <net/netfilter/nf_conntrack_protocol.h>
31#include <net/netfilter/nf_conntrack_l3proto.h>
32#include <net/netfilter/nf_conntrack_core.h>
33#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
34
35#if 0
36#define DEBUGP printk
37#else
38#define DEBUGP(format, args...)
39#endif
40
41DECLARE_PER_CPU(struct nf_conntrack_stat, nf_conntrack_stat);
42
43static int ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
44 struct nf_conntrack_tuple *tuple)
45{
46 u_int32_t _addrs[2], *ap;
47 ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr),
48 sizeof(u_int32_t) * 2, _addrs);
49 if (ap == NULL)
50 return 0;
51
52 tuple->src.u3.ip = ap[0];
53 tuple->dst.u3.ip = ap[1];
54
55 return 1;
56}
57
58static int ipv4_invert_tuple(struct nf_conntrack_tuple *tuple,
59 const struct nf_conntrack_tuple *orig)
60{
61 tuple->src.u3.ip = orig->dst.u3.ip;
62 tuple->dst.u3.ip = orig->src.u3.ip;
63
64 return 1;
65}
66
67static int ipv4_print_tuple(struct seq_file *s,
68 const struct nf_conntrack_tuple *tuple)
69{
70 return seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ",
71 NIPQUAD(tuple->src.u3.ip),
72 NIPQUAD(tuple->dst.u3.ip));
73}
74
75static int ipv4_print_conntrack(struct seq_file *s,
76 const struct nf_conn *conntrack)
77{
78 return 0;
79}
80
81/* Returns new sk_buff, or NULL */
82static struct sk_buff *
83nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
84{
85 skb_orphan(skb);
86
87 local_bh_disable();
88 skb = ip_defrag(skb, user);
89 local_bh_enable();
90
91 if (skb)
92 ip_send_check(skb->nh.iph);
93
94 return skb;
95}
96
97static int
98ipv4_prepare(struct sk_buff **pskb, unsigned int hooknum, unsigned int *dataoff,
99 u_int8_t *protonum)
100{
101	/* Should never happen */
102 if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
103 if (net_ratelimit()) {
104 printk(KERN_ERR "ipv4_prepare: Frag of proto %u (hook=%u)\n",
105 (*pskb)->nh.iph->protocol, hooknum);
106 }
107 return -NF_DROP;
108 }
109
110 *dataoff = (*pskb)->nh.raw - (*pskb)->data + (*pskb)->nh.iph->ihl*4;
111 *protonum = (*pskb)->nh.iph->protocol;
112
113 return NF_ACCEPT;
114}
115
116int nat_module_is_loaded = 0;
117static u_int32_t ipv4_get_features(const struct nf_conntrack_tuple *tuple)
118{
119 if (nat_module_is_loaded)
120 return NF_CT_F_NAT;
121
122 return NF_CT_F_BASIC;
123}
124
125static unsigned int ipv4_confirm(unsigned int hooknum,
126 struct sk_buff **pskb,
127 const struct net_device *in,
128 const struct net_device *out,
129 int (*okfn)(struct sk_buff *))
130{
131 /* We've seen it coming out the other side: confirm it */
132 return nf_conntrack_confirm(pskb);
133}
134
135static unsigned int ipv4_conntrack_help(unsigned int hooknum,
136 struct sk_buff **pskb,
137 const struct net_device *in,
138 const struct net_device *out,
139 int (*okfn)(struct sk_buff *))
140{
141 struct nf_conn *ct;
142 enum ip_conntrack_info ctinfo;
143
144 /* This is where we call the helper: as the packet goes out. */
145 ct = nf_ct_get(*pskb, &ctinfo);
146 if (ct && ct->helper) {
147 unsigned int ret;
148 ret = ct->helper->help(pskb,
149 (*pskb)->nh.raw - (*pskb)->data
150 + (*pskb)->nh.iph->ihl*4,
151 ct, ctinfo);
152 if (ret != NF_ACCEPT)
153 return ret;
154 }
155 return NF_ACCEPT;
156}
157
158static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
159 struct sk_buff **pskb,
160 const struct net_device *in,
161 const struct net_device *out,
162 int (*okfn)(struct sk_buff *))
163{
164#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE)
165 /* Previously seen (loopback)? Ignore. Do this before
166 fragment check. */
167 if ((*pskb)->nfct)
168 return NF_ACCEPT;
169#endif
170
171 /* Gather fragments. */
172 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
173 *pskb = nf_ct_ipv4_gather_frags(*pskb,
174 hooknum == NF_IP_PRE_ROUTING ?
175 IP_DEFRAG_CONNTRACK_IN :
176 IP_DEFRAG_CONNTRACK_OUT);
177 if (!*pskb)
178 return NF_STOLEN;
179 }
180 return NF_ACCEPT;
181}
182
183static unsigned int ipv4_refrag(unsigned int hooknum,
184 struct sk_buff **pskb,
185 const struct net_device *in,
186 const struct net_device *out,
187 int (*okfn)(struct sk_buff *))
188{
189 struct rtable *rt = (struct rtable *)(*pskb)->dst;
190
191 /* We've seen it coming out the other side: confirm */
192 if (ipv4_confirm(hooknum, pskb, in, out, okfn) != NF_ACCEPT)
193 return NF_DROP;
194
195 /* Local packets are never produced too large for their
196	   interface. We defragment them at LOCAL_OUT, however,
197 so we have to refragment them here. */
198 if ((*pskb)->len > dst_mtu(&rt->u.dst) &&
199 !skb_shinfo(*pskb)->tso_size) {
200 /* No hook can be after us, so this should be OK. */
201 ip_fragment(*pskb, okfn);
202 return NF_STOLEN;
203 }
204 return NF_ACCEPT;
205}
206
207static unsigned int ipv4_conntrack_in(unsigned int hooknum,
208 struct sk_buff **pskb,
209 const struct net_device *in,
210 const struct net_device *out,
211 int (*okfn)(struct sk_buff *))
212{
213 return nf_conntrack_in(PF_INET, hooknum, pskb);
214}
215
216static unsigned int ipv4_conntrack_local(unsigned int hooknum,
217 struct sk_buff **pskb,
218 const struct net_device *in,
219 const struct net_device *out,
220 int (*okfn)(struct sk_buff *))
221{
222 /* root is playing with raw sockets. */
223 if ((*pskb)->len < sizeof(struct iphdr)
224 || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
225 if (net_ratelimit())
226 printk("ipt_hook: happy cracking.\n");
227 return NF_ACCEPT;
228 }
229 return nf_conntrack_in(PF_INET, hooknum, pskb);
230}
231
232/* Connection tracking may drop packets, but never alters them, so
233 make it the first hook. */
234static struct nf_hook_ops ipv4_conntrack_defrag_ops = {
235 .hook = ipv4_conntrack_defrag,
236 .owner = THIS_MODULE,
237 .pf = PF_INET,
238 .hooknum = NF_IP_PRE_ROUTING,
239 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
240};
241
242static struct nf_hook_ops ipv4_conntrack_in_ops = {
243 .hook = ipv4_conntrack_in,
244 .owner = THIS_MODULE,
245 .pf = PF_INET,
246 .hooknum = NF_IP_PRE_ROUTING,
247 .priority = NF_IP_PRI_CONNTRACK,
248};
249
250static struct nf_hook_ops ipv4_conntrack_defrag_local_out_ops = {
251 .hook = ipv4_conntrack_defrag,
252 .owner = THIS_MODULE,
253 .pf = PF_INET,
254 .hooknum = NF_IP_LOCAL_OUT,
255 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
256};
257
258static struct nf_hook_ops ipv4_conntrack_local_out_ops = {
259 .hook = ipv4_conntrack_local,
260 .owner = THIS_MODULE,
261 .pf = PF_INET,
262 .hooknum = NF_IP_LOCAL_OUT,
263 .priority = NF_IP_PRI_CONNTRACK,
264};
265
266/* helpers */
267static struct nf_hook_ops ipv4_conntrack_helper_out_ops = {
268 .hook = ipv4_conntrack_help,
269 .owner = THIS_MODULE,
270 .pf = PF_INET,
271 .hooknum = NF_IP_POST_ROUTING,
272 .priority = NF_IP_PRI_CONNTRACK_HELPER,
273};
274
275static struct nf_hook_ops ipv4_conntrack_helper_in_ops = {
276 .hook = ipv4_conntrack_help,
277 .owner = THIS_MODULE,
278 .pf = PF_INET,
279 .hooknum = NF_IP_LOCAL_IN,
280 .priority = NF_IP_PRI_CONNTRACK_HELPER,
281};
282
283
284/* Refragmenter; last chance. */
285static struct nf_hook_ops ipv4_conntrack_out_ops = {
286 .hook = ipv4_refrag,
287 .owner = THIS_MODULE,
288 .pf = PF_INET,
289 .hooknum = NF_IP_POST_ROUTING,
290 .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
291};
292
293static struct nf_hook_ops ipv4_conntrack_local_in_ops = {
294 .hook = ipv4_confirm,
295 .owner = THIS_MODULE,
296 .pf = PF_INET,
297 .hooknum = NF_IP_LOCAL_IN,
298 .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
299};
300
301#ifdef CONFIG_SYSCTL
302/* From nf_conntrack_proto_icmp.c */
303extern unsigned long nf_ct_icmp_timeout;
304static struct ctl_table_header *nf_ct_ipv4_sysctl_header;
305
306static ctl_table nf_ct_sysctl_table[] = {
307 {
308 .ctl_name = NET_NF_CONNTRACK_ICMP_TIMEOUT,
309 .procname = "nf_conntrack_icmp_timeout",
310 .data = &nf_ct_icmp_timeout,
311 .maxlen = sizeof(unsigned int),
312 .mode = 0644,
313 .proc_handler = &proc_dointvec_jiffies,
314 },
315 { .ctl_name = 0 }
316};
317
318static ctl_table nf_ct_netfilter_table[] = {
319 {
320 .ctl_name = NET_NETFILTER,
321 .procname = "netfilter",
322 .mode = 0555,
323 .child = nf_ct_sysctl_table,
324 },
325 { .ctl_name = 0 }
326};
327
328static ctl_table nf_ct_net_table[] = {
329 {
330 .ctl_name = CTL_NET,
331 .procname = "net",
332 .mode = 0555,
333 .child = nf_ct_netfilter_table,
334 },
335 { .ctl_name = 0 }
336};
337#endif
338
339/* Fast function for those who don't want to parse /proc (and I don't
340 blame them). */
341/* Reversing the socket's dst/src point of view gives us the reply
342 mapping. */
343static int
344getorigdst(struct sock *sk, int optval, void __user *user, int *len)
345{
346 struct inet_sock *inet = inet_sk(sk);
347 struct nf_conntrack_tuple_hash *h;
348 struct nf_conntrack_tuple tuple;
349
350 NF_CT_TUPLE_U_BLANK(&tuple);
351 tuple.src.u3.ip = inet->rcv_saddr;
352 tuple.src.u.tcp.port = inet->sport;
353 tuple.dst.u3.ip = inet->daddr;
354 tuple.dst.u.tcp.port = inet->dport;
355 tuple.src.l3num = PF_INET;
356 tuple.dst.protonum = IPPROTO_TCP;
357
358 /* We only do TCP at the moment: is there a better way? */
359 if (strcmp(sk->sk_prot->name, "TCP")) {
360 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
361 return -ENOPROTOOPT;
362 }
363
364 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
365 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
366 *len, sizeof(struct sockaddr_in));
367 return -EINVAL;
368 }
369
370 h = nf_conntrack_find_get(&tuple, NULL);
371 if (h) {
372 struct sockaddr_in sin;
373 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
374
375 sin.sin_family = AF_INET;
376 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
377 .tuple.dst.u.tcp.port;
378 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
379 .tuple.dst.u3.ip;
380
381 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
382 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
383 nf_ct_put(ct);
384 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
385 return -EFAULT;
386 else
387 return 0;
388 }
389 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
390 NIPQUAD(tuple.src.u3.ip), ntohs(tuple.src.u.tcp.port),
391 NIPQUAD(tuple.dst.u3.ip), ntohs(tuple.dst.u.tcp.port));
392 return -ENOENT;
393}
394
395static struct nf_sockopt_ops so_getorigdst = {
396 .pf = PF_INET,
397 .get_optmin = SO_ORIGINAL_DST,
398 .get_optmax = SO_ORIGINAL_DST+1,
399 .get = &getorigdst,
400};
401
402struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = {
403 .l3proto = PF_INET,
404 .name = "ipv4",
405 .pkt_to_tuple = ipv4_pkt_to_tuple,
406 .invert_tuple = ipv4_invert_tuple,
407 .print_tuple = ipv4_print_tuple,
408 .print_conntrack = ipv4_print_conntrack,
409 .prepare = ipv4_prepare,
410 .get_features = ipv4_get_features,
411 .me = THIS_MODULE,
412};
413
414extern struct nf_conntrack_protocol nf_conntrack_protocol_tcp4;
415extern struct nf_conntrack_protocol nf_conntrack_protocol_udp4;
416extern struct nf_conntrack_protocol nf_conntrack_protocol_icmp;
417static int init_or_cleanup(int init)
418{
419 int ret = 0;
420
421 if (!init) goto cleanup;
422
423 ret = nf_register_sockopt(&so_getorigdst);
424 if (ret < 0) {
425 printk(KERN_ERR "Unable to register netfilter socket option\n");
426 goto cleanup_nothing;
427 }
428
429 ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_tcp4);
430 if (ret < 0) {
431 printk("nf_conntrack_ipv4: can't register tcp.\n");
432 goto cleanup_sockopt;
433 }
434
435 ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_udp4);
436 if (ret < 0) {
437 printk("nf_conntrack_ipv4: can't register udp.\n");
438 goto cleanup_tcp;
439 }
440
441 ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_icmp);
442 if (ret < 0) {
443 printk("nf_conntrack_ipv4: can't register icmp.\n");
444 goto cleanup_udp;
445 }
446
447 ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4);
448 if (ret < 0) {
449 printk("nf_conntrack_ipv4: can't register ipv4\n");
450 goto cleanup_icmp;
451 }
452
453 ret = nf_register_hook(&ipv4_conntrack_defrag_ops);
454 if (ret < 0) {
455 printk("nf_conntrack_ipv4: can't register pre-routing defrag hook.\n");
456 goto cleanup_ipv4;
457 }
458 ret = nf_register_hook(&ipv4_conntrack_defrag_local_out_ops);
459 if (ret < 0) {
460 printk("nf_conntrack_ipv4: can't register local_out defrag hook.\n");
461 goto cleanup_defragops;
462 }
463
464 ret = nf_register_hook(&ipv4_conntrack_in_ops);
465 if (ret < 0) {
466 printk("nf_conntrack_ipv4: can't register pre-routing hook.\n");
467 goto cleanup_defraglocalops;
468 }
469
470 ret = nf_register_hook(&ipv4_conntrack_local_out_ops);
471 if (ret < 0) {
472 printk("nf_conntrack_ipv4: can't register local out hook.\n");
473 goto cleanup_inops;
474 }
475
476 ret = nf_register_hook(&ipv4_conntrack_helper_in_ops);
477 if (ret < 0) {
478 printk("nf_conntrack_ipv4: can't register local helper hook.\n");
479 goto cleanup_inandlocalops;
480 }
481
482 ret = nf_register_hook(&ipv4_conntrack_helper_out_ops);
483 if (ret < 0) {
484 printk("nf_conntrack_ipv4: can't register postrouting helper hook.\n");
485 goto cleanup_helperinops;
486 }
487
488 ret = nf_register_hook(&ipv4_conntrack_out_ops);
489 if (ret < 0) {
490 printk("nf_conntrack_ipv4: can't register post-routing hook.\n");
491 goto cleanup_helperoutops;
492 }
493
494 ret = nf_register_hook(&ipv4_conntrack_local_in_ops);
495 if (ret < 0) {
496 printk("nf_conntrack_ipv4: can't register local in hook.\n");
497 goto cleanup_inoutandlocalops;
498 }
499
500#ifdef CONFIG_SYSCTL
501 nf_ct_ipv4_sysctl_header = register_sysctl_table(nf_ct_net_table, 0);
502 if (nf_ct_ipv4_sysctl_header == NULL) {
503 printk("nf_conntrack: can't register to sysctl.\n");
504 ret = -ENOMEM;
505 goto cleanup_localinops;
506 }
507#endif
508
509 /* For use by REJECT target */
510 ip_ct_attach = __nf_conntrack_attach;
511
512 return ret;
513
514 cleanup:
515 synchronize_net();
516 ip_ct_attach = NULL;
517#ifdef CONFIG_SYSCTL
518 unregister_sysctl_table(nf_ct_ipv4_sysctl_header);
519 cleanup_localinops:
520#endif
521 nf_unregister_hook(&ipv4_conntrack_local_in_ops);
522 cleanup_inoutandlocalops:
523 nf_unregister_hook(&ipv4_conntrack_out_ops);
524 cleanup_helperoutops:
525 nf_unregister_hook(&ipv4_conntrack_helper_out_ops);
526 cleanup_helperinops:
527 nf_unregister_hook(&ipv4_conntrack_helper_in_ops);
528 cleanup_inandlocalops:
529 nf_unregister_hook(&ipv4_conntrack_local_out_ops);
530 cleanup_inops:
531 nf_unregister_hook(&ipv4_conntrack_in_ops);
532 cleanup_defraglocalops:
533 nf_unregister_hook(&ipv4_conntrack_defrag_local_out_ops);
534 cleanup_defragops:
535 nf_unregister_hook(&ipv4_conntrack_defrag_ops);
536 cleanup_ipv4:
537 nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
538 cleanup_icmp:
539 nf_conntrack_protocol_unregister(&nf_conntrack_protocol_icmp);
540 cleanup_udp:
541 nf_conntrack_protocol_unregister(&nf_conntrack_protocol_udp4);
542 cleanup_tcp:
543 nf_conntrack_protocol_unregister(&nf_conntrack_protocol_tcp4);
544 cleanup_sockopt:
545 nf_unregister_sockopt(&so_getorigdst);
546 cleanup_nothing:
547 return ret;
548}
549
550MODULE_LICENSE("GPL");
551
552static int __init init(void)
553{
554 need_nf_conntrack();
555 return init_or_cleanup(1);
556}
557
558static void __exit fini(void)
559{
560 init_or_cleanup(0);
561}
562
563module_init(init);
564module_exit(fini);
565
566void need_ip_conntrack(void)
567{
568}
569
570EXPORT_SYMBOL(need_ip_conntrack);
571EXPORT_SYMBOL(nf_ct_ipv4_gather_frags);
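getorigdst() above is what serves the SO_ORIGINAL_DST socket option used by transparent proxies: reversing the socket's src/dst gives the reply tuple, whose original-direction destination is the pre-NAT target. From userspace the lookup is a single getsockopt() on an accepted TCP connection (e.g. one diverted by the REDIRECT target); a minimal caller, with error handling trimmed:

#include <stdio.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netfilter_ipv4.h>	/* SO_ORIGINAL_DST */

/* Print the pre-NAT destination of an accepted, redirected
 * TCP connection. */
static int print_original_dst(int connfd)
{
	struct sockaddr_in orig;
	socklen_t len = sizeof(orig);

	if (getsockopt(connfd, SOL_IP, SO_ORIGINAL_DST, &orig, &len) < 0)
		return -1;
	printf("original destination: %s:%u\n",
	       inet_ntoa(orig.sin_addr), ntohs(orig.sin_port));
	return 0;
}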
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
new file mode 100644
index 000000000000..7ddb5c08f7b8
--- /dev/null
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -0,0 +1,301 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
9 * - enable working with Layer 3 protocol independent connection tracking.
10 *
11 * Derived from net/ipv4/netfilter/ip_conntrack_proto_icmp.c
12 */
13
14#include <linux/types.h>
15#include <linux/sched.h>
16#include <linux/timer.h>
17#include <linux/netfilter.h>
18#include <linux/in.h>
19#include <linux/icmp.h>
20#include <linux/seq_file.h>
21#include <net/ip.h>
22#include <net/checksum.h>
23#include <linux/netfilter_ipv4.h>
24#include <net/netfilter/nf_conntrack_tuple.h>
25#include <net/netfilter/nf_conntrack_protocol.h>
26#include <net/netfilter/nf_conntrack_core.h>
27
28unsigned long nf_ct_icmp_timeout = 30*HZ;
29
30#if 0
31#define DEBUGP printk
32#else
33#define DEBUGP(format, args...)
34#endif
35
36static int icmp_pkt_to_tuple(const struct sk_buff *skb,
37 unsigned int dataoff,
38 struct nf_conntrack_tuple *tuple)
39{
40 struct icmphdr _hdr, *hp;
41
42 hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
43 if (hp == NULL)
44 return 0;
45
46 tuple->dst.u.icmp.type = hp->type;
47 tuple->src.u.icmp.id = hp->un.echo.id;
48 tuple->dst.u.icmp.code = hp->code;
49
50 return 1;
51}
52
53static int icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
54 const struct nf_conntrack_tuple *orig)
55{
56 /* Add 1; spaces filled with 0. */
57 static u_int8_t invmap[]
58 = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1,
59 [ICMP_ECHOREPLY] = ICMP_ECHO + 1,
60 [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1,
61 [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1,
62 [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1,
63 [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1,
64 [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1,
65 [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1};
66
67 if (orig->dst.u.icmp.type >= sizeof(invmap)
68 || !invmap[orig->dst.u.icmp.type])
69 return 0;
70
71 tuple->src.u.icmp.id = orig->src.u.icmp.id;
72 tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1;
73 tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
74 return 1;
75}
76
77/* Print out the per-protocol part of the tuple. */
78static int icmp_print_tuple(struct seq_file *s,
79 const struct nf_conntrack_tuple *tuple)
80{
81 return seq_printf(s, "type=%u code=%u id=%u ",
82 tuple->dst.u.icmp.type,
83 tuple->dst.u.icmp.code,
84 ntohs(tuple->src.u.icmp.id));
85}
86
87/* Print out the private part of the conntrack. */
88static int icmp_print_conntrack(struct seq_file *s,
89 const struct nf_conn *conntrack)
90{
91 return 0;
92}
93
94/* Returns verdict for packet, or -1 for invalid. */
95static int icmp_packet(struct nf_conn *ct,
96 const struct sk_buff *skb,
97 unsigned int dataoff,
98 enum ip_conntrack_info ctinfo,
99 int pf,
100 unsigned int hooknum)
101{
102 /* Try to delete connection immediately after all replies:
103 won't actually vanish as we still have skb, and del_timer
104 means this will only run once even if count hits zero twice
105 (theoretically possible with SMP) */
106 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
107 if (atomic_dec_and_test(&ct->proto.icmp.count)
108 && del_timer(&ct->timeout))
109 ct->timeout.function((unsigned long)ct);
110 } else {
111 atomic_inc(&ct->proto.icmp.count);
112 nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
113 nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout);
114 }
115
116 return NF_ACCEPT;
117}
118
119/* Called when a new connection for this protocol found. */
120static int icmp_new(struct nf_conn *conntrack,
121 const struct sk_buff *skb, unsigned int dataoff)
122{
123 static u_int8_t valid_new[]
124 = { [ICMP_ECHO] = 1,
125 [ICMP_TIMESTAMP] = 1,
126 [ICMP_INFO_REQUEST] = 1,
127 [ICMP_ADDRESS] = 1 };
128
129 if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new)
130 || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) {
131 /* Can't create a new ICMP `conn' with this. */
132 DEBUGP("icmp: can't create new conn with type %u\n",
133 conntrack->tuplehash[0].tuple.dst.u.icmp.type);
134 NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple);
135 return 0;
136 }
137 atomic_set(&conntrack->proto.icmp.count, 0);
138 return 1;
139}
140
141extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4;
142/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
143static int
144icmp_error_message(struct sk_buff *skb,
145 enum ip_conntrack_info *ctinfo,
146 unsigned int hooknum)
147{
148 struct nf_conntrack_tuple innertuple, origtuple;
149 struct {
150 struct icmphdr icmp;
151 struct iphdr ip;
152 } _in, *inside;
153 struct nf_conntrack_protocol *innerproto;
154 struct nf_conntrack_tuple_hash *h;
155 int dataoff;
156
157 NF_CT_ASSERT(skb->nfct == NULL);
158
159 /* Not enough header? */
160 inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in);
161 if (inside == NULL)
162 return -NF_ACCEPT;
163
164 /* Ignore ICMP's containing fragments (shouldn't happen) */
165 if (inside->ip.frag_off & htons(IP_OFFSET)) {
166 DEBUGP("icmp_error_message: fragment of proto %u\n",
167 inside->ip.protocol);
168 return -NF_ACCEPT;
169 }
170
171 innerproto = nf_ct_find_proto(PF_INET, inside->ip.protocol);
172 dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp);
173 /* Are they talking about one of our connections? */
174 if (!nf_ct_get_tuple(skb, dataoff, dataoff + inside->ip.ihl*4, PF_INET,
175 inside->ip.protocol, &origtuple,
176 &nf_conntrack_l3proto_ipv4, innerproto)) {
177 DEBUGP("icmp_error_message: ! get_tuple p=%u",
178 inside->ip.protocol);
179 return -NF_ACCEPT;
180 }
181
182 /* Ordinarily, we'd expect the inverted tupleproto, but it's
183 been preserved inside the ICMP. */
184 if (!nf_ct_invert_tuple(&innertuple, &origtuple,
185 &nf_conntrack_l3proto_ipv4, innerproto)) {
186 DEBUGP("icmp_error_message: no match\n");
187 return -NF_ACCEPT;
188 }
189
190 *ctinfo = IP_CT_RELATED;
191
192 h = nf_conntrack_find_get(&innertuple, NULL);
193 if (!h) {
194 /* Locally generated ICMPs will match inverted if they
195 haven't been SNAT'ed yet */
196 /* FIXME: NAT code has to handle half-done double NAT --RR */
197 if (hooknum == NF_IP_LOCAL_OUT)
198 h = nf_conntrack_find_get(&origtuple, NULL);
199
200 if (!h) {
201 DEBUGP("icmp_error_message: no match\n");
202 return -NF_ACCEPT;
203 }
204
205 /* Reverse direction from that found */
206 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
207 *ctinfo += IP_CT_IS_REPLY;
208 } else {
209 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
210 *ctinfo += IP_CT_IS_REPLY;
211 }
212
213 /* Update skb to refer to this connection */
214 skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
215 skb->nfctinfo = *ctinfo;
216 return -NF_ACCEPT;
217}
218
219/* Small and modified version of icmp_rcv */
220static int
221icmp_error(struct sk_buff *skb, unsigned int dataoff,
222 enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum)
223{
224 struct icmphdr _ih, *icmph;
225
226 /* Not enough header? */
227 icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih);
228 if (icmph == NULL) {
229 if (LOG_INVALID(IPPROTO_ICMP))
230 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
231 "nf_ct_icmp: short packet ");
232 return -NF_ACCEPT;
233 }
234
235 /* See ip_conntrack_proto_tcp.c */
236 if (hooknum != NF_IP_PRE_ROUTING)
237 goto checksum_skipped;
238
239 switch (skb->ip_summed) {
240 case CHECKSUM_HW:
241 if (!(u16)csum_fold(skb->csum))
242 break;
243 if (LOG_INVALID(IPPROTO_ICMP))
244 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
245 "nf_ct_icmp: bad HW ICMP checksum ");
246 return -NF_ACCEPT;
247 case CHECKSUM_NONE:
248 if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) {
249 if (LOG_INVALID(IPPROTO_ICMP))
250 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
251 NULL,
252 "nf_ct_icmp: bad ICMP checksum ");
253 return -NF_ACCEPT;
254 }
255 default:
256 break;
257 }
258
259checksum_skipped:
260 /*
261 * 18 is the highest 'known' ICMP type. Anything else is a mystery
262 *
263	 * RFC 1122: 3.2.2 Unknown ICMP message types MUST be silently
264 * discarded.
265 */
266 if (icmph->type > NR_ICMP_TYPES) {
267 if (LOG_INVALID(IPPROTO_ICMP))
268 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
269 "nf_ct_icmp: invalid ICMP type ");
270 return -NF_ACCEPT;
271 }
272
273 /* Need to track icmp error message? */
274 if (icmph->type != ICMP_DEST_UNREACH
275 && icmph->type != ICMP_SOURCE_QUENCH
276 && icmph->type != ICMP_TIME_EXCEEDED
277 && icmph->type != ICMP_PARAMETERPROB
278 && icmph->type != ICMP_REDIRECT)
279 return NF_ACCEPT;
280
281 return icmp_error_message(skb, ctinfo, hooknum);
282}
283
284struct nf_conntrack_protocol nf_conntrack_protocol_icmp =
285{
286 .list = { NULL, NULL },
287 .l3proto = PF_INET,
288 .proto = IPPROTO_ICMP,
289 .name = "icmp",
290 .pkt_to_tuple = icmp_pkt_to_tuple,
291 .invert_tuple = icmp_invert_tuple,
292 .print_tuple = icmp_print_tuple,
293 .print_conntrack = icmp_print_conntrack,
294 .packet = icmp_packet,
295 .new = icmp_new,
296 .error = icmp_error,
297 .destroy = NULL,
298 .me = NULL
299};
300
301EXPORT_SYMBOL(nf_conntrack_protocol_icmp);
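icmp_invert_tuple() stores every entry of invmap[] offset by one, so a zero slot can mean "this type has no reply type" while slot contents stay unambiguous; lookups subtract the one back off. The encoding in isolation (userspace sketch with only the echo pair filled in):

#include <stdio.h>

/* Sketch of the +1 sentinel encoding used by invmap[]: slot 0
 * means "type has no inverse", anything else is inverse + 1. */
static const unsigned char invmap[] = {
	[8] = 0 + 1,	/* ICMP_ECHO      -> ICMP_ECHOREPLY */
	[0] = 8 + 1,	/* ICMP_ECHOREPLY -> ICMP_ECHO      */
};

int main(void)
{
	unsigned int type = 8;	/* echo request */

	if (type >= sizeof(invmap) || !invmap[type])
		puts("no inverse");
	else
		printf("inverse of %u is %u\n", type, invmap[type] - 1);
	return 0;
}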
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 652685623519..01444a02b48b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -645,6 +645,14 @@ ctl_table ipv4_table[] = {
 		.proc_handler	= &proc_tcp_congestion_control,
 		.strategy	= &sysctl_tcp_congestion_control,
 	},
+	{
+		.ctl_name	= NET_TCP_ABC,
+		.procname	= "tcp_abc",
+		.data		= &sysctl_tcp_abc,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 
 	{ .ctl_name = 0 }
 };
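The new entry surfaces as /proc/sys/net/ipv4/tcp_abc; proc_dointvec means it reads and writes as a plain integer. A small sketch of toggling it programmatically (userspace C, equivalent to "sysctl -w net.ipv4.tcp_abc=1"):

#include <stdio.h>

/* Sketch: flip the Appropriate Byte Counting sysctl. */
static int set_tcp_abc(int on)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_abc", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", on);
	return fclose(f);
}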
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 72b7c22e1ea5..9ac7a4f46bd8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1640,7 +1640,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	} else if (tcp_need_reset(old_state) ||
 		   (tp->snd_nxt != tp->write_seq &&
 		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
-		/* The last check adjusts for discrepance of Linux wrt. RFC
+		/* The last check adjusts for discrepancy of Linux wrt. RFC
 		 * states
 		 */
 		tcp_send_active_reset(sk, gfp_any());
@@ -1669,6 +1669,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tp->packets_out = 0;
 	tp->snd_ssthresh = 0x7fffffff;
 	tp->snd_cwnd_cnt = 0;
+	tp->bytes_acked = 0;
 	tcp_set_ca_state(sk, TCP_CA_Open);
 	tcp_clear_retrans(tp);
 	inet_csk_delack_init(sk);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index ae35e0609047..1d0cd86621b1 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -217,17 +217,15 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack,
 
 	bictcp_low_utilization(sk, data_acked);
 
-	if (in_flight < tp->snd_cwnd)
+	if (!tcp_is_cwnd_limited(sk, in_flight))
 		return;
 
-	if (tp->snd_cwnd <= tp->snd_ssthresh) {
-		/* In "safe" area, increase. */
-		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-			tp->snd_cwnd++;
-	} else {
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tcp_slow_start(tp);
+	else {
 		bictcp_update(ca, tp->snd_cwnd);
 
 		/* In dangerous area, increase slowly.
 		 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
 		 */
 		if (tp->snd_cwnd_cnt >= ca->cnt) {
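Every congestion-avoidance module in this series drops the same two open-coded fragments in favor of shared helpers: tcp_is_cwnd_limited() for the "are we actually using the window?" guard and tcp_slow_start() for the exponential phase. Their behavior, as visible from the call sites above, sketched with sketch_-prefixed names to mark them as illustrations (the real kernel helpers also account for ABC and TSO-burst slack):

/* Sketch of the shared helpers this series introduces. */
static inline int sketch_is_cwnd_limited(const struct tcp_sock *tp,
					 u32 in_flight)
{
	return in_flight >= tp->snd_cwnd;
}

static inline void sketch_slow_start(struct tcp_sock *tp)
{
	if (tp->snd_cwnd < tp->snd_cwnd_clamp)
		tp->snd_cwnd++;
}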
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index bbf2d6624e89..c7cc62c8dc12 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -186,24 +186,32 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (in_flight < tp->snd_cwnd)
+	if (!tcp_is_cwnd_limited(sk, in_flight))
 		return;
 
-	if (tp->snd_cwnd <= tp->snd_ssthresh) {
-		/* In "safe" area, increase. */
-		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-			tp->snd_cwnd++;
-	} else {
-		/* In dangerous area, increase slowly.
-		 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
-		 */
-		if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
-			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-				tp->snd_cwnd++;
-			tp->snd_cwnd_cnt = 0;
-		} else
-			tp->snd_cwnd_cnt++;
-	}
+	/* In "safe" area, increase. */
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tcp_slow_start(tp);
+
+	/* In dangerous area, increase slowly. */
+	else if (sysctl_tcp_abc) {
+		/* RFC3465: Appropriate Byte Count
+		 * increase once for each full cwnd acked
+		 */
+		if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) {
+			tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache;
+			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+				tp->snd_cwnd++;
+		}
+	} else {
+		/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */
+		if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+				tp->snd_cwnd++;
+			tp->snd_cwnd_cnt = 0;
+		} else
+			tp->snd_cwnd_cnt++;
+	}
 }
 EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
 
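Appropriate Byte Counting (RFC 3465) grows cwnd by counting acknowledged bytes instead of ACK arrivals, which keeps delayed ACKs and stretch ACKs from slowing growth and keeps ACK division from inflating it. The congestion-avoidance rule from the hunk above, restated as a hypothetical standalone step:

/* Sketch of the RFC 3465 congestion-avoidance step: add one
 * segment to cwnd only once a full window's worth of bytes has
 * been newly acknowledged. */
static void abc_cong_avoid_step(struct tcp_sock *tp)
{
	u32 window_bytes = tp->snd_cwnd * tp->mss_cache;

	if (tp->bytes_acked >= window_bytes) {
		tp->bytes_acked -= window_bytes;
		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
			tp->snd_cwnd++;
	}
}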
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 6acc04bde080..82b3c189bd7d 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -111,18 +111,17 @@ static void hstcp_init(struct sock *sk)
 }
 
 static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
-			     u32 in_flight, int good)
+			     u32 in_flight, u32 pkts_acked)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct hstcp *ca = inet_csk_ca(sk);
 
-	if (in_flight < tp->snd_cwnd)
+	if (!tcp_is_cwnd_limited(sk, in_flight))
 		return;
 
-	if (tp->snd_cwnd <= tp->snd_ssthresh) {
-		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-			tp->snd_cwnd++;
-	} else {
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tcp_slow_start(tp);
+	else {
 		/* Update AIMD parameters */
 		if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
 			while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index e47b37984e95..3284cfb993e6 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -207,14 +207,13 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct htcp *ca = inet_csk_ca(sk);
 
-	if (in_flight < tp->snd_cwnd)
+	if (!tcp_is_cwnd_limited(sk, in_flight))
 		return;
 
-	if (tp->snd_cwnd <= tp->snd_ssthresh) {
-		/* In "safe" area, increase. */
-		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-			tp->snd_cwnd++;
-	} else {
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tcp_slow_start(tp);
+	else {
+
 		measure_rtt(sk);
 
 	/* keep track of number of round-trip times since last backoff event */
@@ -224,7 +223,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
 			htcp_alpha_update(ca);
 		}
 
 		/* In dangerous area, increase slowly.
 		 * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
 		 */
 		if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) {
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 77add63623df..40dbb3877510 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -100,12 +100,12 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
 		ca->minrtt = tp->srtt;
 	}
 
+	if (!tcp_is_cwnd_limited(sk, in_flight))
+		return;
+
 	if (!ca->hybla_en)
 		return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
 
-	if (in_flight < tp->snd_cwnd)
-		return;
-
 	if (ca->rho == 0)
 		hybla_recalc_param(sk);
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3e98b57578dc..40a26b7157b4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -42,7 +42,7 @@
42 * Andi Kleen : Moved open_request checking here 42 * Andi Kleen : Moved open_request checking here
43 * and process RSTs for open_requests. 43 * and process RSTs for open_requests.
44 * Andi Kleen : Better prune_queue, and other fixes. 44 * Andi Kleen : Better prune_queue, and other fixes.
45 * Andrey Savochkin: Fix RTT measurements in the presnce of 45 * Andrey Savochkin: Fix RTT measurements in the presence of
46 * timestamps. 46 * timestamps.
47 * Andrey Savochkin: Check sequence numbers correctly when 47 * Andrey Savochkin: Check sequence numbers correctly when
48 * removing SACKs due to in sequence incoming 48 * removing SACKs due to in sequence incoming
@@ -89,6 +89,7 @@ int sysctl_tcp_frto;
89int sysctl_tcp_nometrics_save; 89int sysctl_tcp_nometrics_save;
90 90
91int sysctl_tcp_moderate_rcvbuf = 1; 91int sysctl_tcp_moderate_rcvbuf = 1;
92int sysctl_tcp_abc = 1;
92 93
93#define FLAG_DATA 0x01 /* Incoming frame contained data. */ 94#define FLAG_DATA 0x01 /* Incoming frame contained data. */
94#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ 95#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -223,7 +224,7 @@ static void tcp_fixup_sndbuf(struct sock *sk)
223 * of receiver window. Check #2. 224 * of receiver window. Check #2.
224 * 225 *
225 * The scheme does not work when sender sends good segments opening 226 * The scheme does not work when sender sends good segments opening
226 * window and then starts to feed us spagetti. But it should work 227 * window and then starts to feed us spaghetti. But it should work
227 * in common situations. Otherwise, we have to rely on queue collapsing. 228 * in common situations. Otherwise, we have to rely on queue collapsing.
228 */ 229 */
229 230
@@ -233,7 +234,7 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
233{ 234{
234 /* Optimize this! */ 235 /* Optimize this! */
235 int truesize = tcp_win_from_space(skb->truesize)/2; 236 int truesize = tcp_win_from_space(skb->truesize)/2;
236 int window = tcp_full_space(sk)/2; 237 int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2;
237 238
238 while (tp->rcv_ssthresh <= window) { 239 while (tp->rcv_ssthresh <= window) {
239 if (truesize <= skb->len) 240 if (truesize <= skb->len)
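
The changed bound above stops tying window growth to the socket's current buffer: rcv_ssthresh may now climb toward the window share of the global tcp_rmem[2] ceiling, even before autotuning has enlarged sk_rcvbuf itself. A sketch of the new bound, assuming the default tcp_adv_win_scale of 2 (three quarters of buffer space counts as window):

    /* Bound model; the kernel compares rcv_ssthresh against this. */
    static int tcp_win_from_space(int space)
    {
            return space - (space >> 2);  /* tcp_adv_win_scale == 2 */
    }

    static int grow_window_bound(int sysctl_tcp_rmem_max)
    {
            return tcp_win_from_space(sysctl_tcp_rmem_max) / 2;
    }
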
@@ -277,7 +278,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
277 int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); 278 int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
278 279
279 /* Try to select rcvbuf so that 4 mss-sized segments 280 /* Try to select rcvbuf so that 4 mss-sized segments
280 * will fit to window and correspoding skbs will fit to our rcvbuf. 281 * will fit to window and corresponding skbs will fit to our rcvbuf.
281 * (was 3; 4 is minimum to allow fast retransmit to work.) 282 * (was 3; 4 is minimum to allow fast retransmit to work.)
282 */ 283 */
283 while (tcp_win_from_space(rcvmem) < tp->advmss) 284 while (tcp_win_from_space(rcvmem) < tp->advmss)
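
The sizing loop above picks one "receive unit" large enough that its window share covers a full advertised MSS, then reserves room for four of them so fast retransmit can function. A worked, runnable version; the overhead and rmem_max constants here are illustrative stand-ins for MAX_TCP_HEADER, sizeof(struct sk_buff) and the tcp_rmem[2] default:

    #include <stdio.h>

    static int tcp_win_from_space(int space)
    {
            return space - (space >> 2);  /* default tcp_adv_win_scale = 2 */
    }

    int main(void)
    {
            int advmss = 1460;
            int overhead = 128 + 16 + 256;  /* hypothetical header+skb cost */
            int rcvmem = advmss + overhead;
            int rmem_max = 174760;          /* a typical tcp_rmem[2] value */

            while (tcp_win_from_space(rcvmem) < advmss)
                    rcvmem += 128;
            printf("rcvbuf = %d\n",
                   4 * rcvmem < rmem_max ? 4 * rcvmem : rmem_max);
            return 0;
    }
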
@@ -286,7 +287,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
286 sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]); 287 sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
287} 288}
288 289
289/* 4. Try to fixup all. It is made iimediately after connection enters 290/* 4. Try to fixup all. It is made immediately after connection enters
290 * established state. 291 * established state.
291 */ 292 */
292static void tcp_init_buffer_space(struct sock *sk) 293static void tcp_init_buffer_space(struct sock *sk)
@@ -326,37 +327,18 @@ static void tcp_init_buffer_space(struct sock *sk)
326static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) 327static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
327{ 328{
328 struct inet_connection_sock *icsk = inet_csk(sk); 329 struct inet_connection_sock *icsk = inet_csk(sk);
329 struct sk_buff *skb;
330 unsigned int app_win = tp->rcv_nxt - tp->copied_seq;
331 int ofo_win = 0;
332 330
333 icsk->icsk_ack.quick = 0; 331 icsk->icsk_ack.quick = 0;
334 332
335 skb_queue_walk(&tp->out_of_order_queue, skb) { 333 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
336 ofo_win += skb->len; 334 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
337 } 335 !tcp_memory_pressure &&
338 336 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
339 /* If overcommit is due to out of order segments, 337 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
340 * do not clamp window. Try to expand rcvbuf instead. 338 sysctl_tcp_rmem[2]);
341 */
342 if (ofo_win) {
343 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
344 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
345 !tcp_memory_pressure &&
346 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
347 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
348 sysctl_tcp_rmem[2]);
349 } 339 }
350 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { 340 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
351 app_win += ofo_win;
352 if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf)
353 app_win >>= 1;
354 if (app_win > icsk->icsk_ack.rcv_mss)
355 app_win -= icsk->icsk_ack.rcv_mss;
356 app_win = max(app_win, 2U*tp->advmss);
357
358 tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); 341 tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
359 }
360} 342}
361 343
362/* Receiver "autotuning" code. 344/* Receiver "autotuning" code.
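
Rewritten this way, tcp_clamp_window() no longer walks the out-of-order queue at all: if global TCP memory permits, it simply raises sk_rcvbuf toward tcp_rmem[2]; only when memory is genuinely short does it collapse rcv_ssthresh to two segments. A condensed model of the new logic, with stand-in fields for the socket and global state:

    struct clamp_state {
            int rcvbuf, rmem_alloc, rcvbuf_locked;
            int rmem_max, mem_pressure, mem_allocated, mem_low;
            unsigned int window_clamp, advmss, rcv_ssthresh;
    };

    static void clamp_window(struct clamp_state *s)
    {
            if (s->rcvbuf < s->rmem_max && !s->rcvbuf_locked &&
                !s->mem_pressure && s->mem_allocated < s->mem_low)
                    s->rcvbuf = s->rmem_alloc < s->rmem_max ?
                                s->rmem_alloc : s->rmem_max;
            if (s->rmem_alloc > s->rcvbuf)
                    s->rcv_ssthresh = s->window_clamp < 2 * s->advmss ?
                                      s->window_clamp : 2 * s->advmss;
    }
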
@@ -385,8 +367,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
385 * are stalled on filesystem I/O. 367 * are stalled on filesystem I/O.
386 * 368 *
387 * Also, since we are only going for a minimum in the 369 * Also, since we are only going for a minimum in the
388 * non-timestamp case, we do not smoothe things out 370 * non-timestamp case, we do not smoother things out
389 * else with timestamps disabled convergance takes too 371 * else with timestamps disabled convergence takes too
390 * long. 372 * long.
391 */ 373 */
392 if (!win_dep) { 374 if (!win_dep) {
@@ -395,7 +377,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
395 } else if (m < new_sample) 377 } else if (m < new_sample)
396 new_sample = m << 3; 378 new_sample = m << 3;
397 } else { 379 } else {
398 /* No previous mesaure. */ 380 /* No previous measure. */
399 new_sample = m << 3; 381 new_sample = m << 3;
400 } 382 }
401 383
@@ -524,7 +506,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
524 if (icsk->icsk_ack.ato > icsk->icsk_rto) 506 if (icsk->icsk_ack.ato > icsk->icsk_rto)
525 icsk->icsk_ack.ato = icsk->icsk_rto; 507 icsk->icsk_ack.ato = icsk->icsk_rto;
526 } else if (m > icsk->icsk_rto) { 508 } else if (m > icsk->icsk_rto) {
527 /* Too long gap. Apparently sender falled to 509 /* Too long gap. Apparently sender failed to
528 * restart window, so that we send ACKs quickly. 510 * restart window, so that we send ACKs quickly.
529 */ 511 */
530 tcp_incr_quickack(sk); 512 tcp_incr_quickack(sk);
@@ -548,10 +530,9 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
548 * To save cycles in the RFC 1323 implementation it was better to break 530 * To save cycles in the RFC 1323 implementation it was better to break
549 * it up into three procedures. -- erics 531 * it up into three procedures. -- erics
550 */ 532 */
551static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) 533static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
552{ 534{
553 struct tcp_sock *tp = tcp_sk(sk); 535 struct tcp_sock *tp = tcp_sk(sk);
554 const struct inet_connection_sock *icsk = inet_csk(sk);
555 long m = mrtt; /* RTT */ 536 long m = mrtt; /* RTT */
556 537
557 /* The following amusing code comes from Jacobson's 538 /* The following amusing code comes from Jacobson's
@@ -565,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
565 * 546 *
566 * Funny. This algorithm seems to be very broken. 547 * Funny. This algorithm seems to be very broken.
567 * These formulae increase RTO, when it should be decreased, increase 548 * These formulae increase RTO, when it should be decreased, increase
568 * too slowly, when it should be incresed fastly, decrease too fastly 549 * too slowly, when it should be increased fastly, decrease too fastly
569 * etc. I guess in BSD RTO takes ONE value, so that it is absolutely 550 * etc. I guess in BSD RTO takes ONE value, so that it is absolutely
570 * does not matter how to _calculate_ it. Seems, it was trap 551 * does not matter how to _calculate_ it. Seems, it was trap
571 * that VJ failed to avoid. 8) 552 * that VJ failed to avoid. 8)
@@ -610,9 +591,6 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
610 tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); 591 tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
611 tp->rtt_seq = tp->snd_nxt; 592 tp->rtt_seq = tp->snd_nxt;
612 } 593 }
613
614 if (icsk->icsk_ca_ops->rtt_sample)
615 icsk->icsk_ca_ops->rtt_sample(sk, *usrtt);
616} 594}
617 595
618/* Calculate rto without backoff. This is the second half of Van Jacobson's 596/* Calculate rto without backoff. This is the second half of Van Jacobson's
@@ -629,14 +607,14 @@ static inline void tcp_set_rto(struct sock *sk)
629 * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ 607 * at least by solaris and freebsd. "Erratic ACKs" has _nothing_
630 * to do with delayed acks, because at cwnd>2 true delack timeout 608 * to do with delayed acks, because at cwnd>2 true delack timeout
631 * is invisible. Actually, Linux-2.4 also generates erratic 609 * is invisible. Actually, Linux-2.4 also generates erratic
632 * ACKs in some curcumstances. 610 * ACKs in some circumstances.
633 */ 611 */
634 inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; 612 inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;
635 613
636 /* 2. Fixups made earlier cannot be right. 614 /* 2. Fixups made earlier cannot be right.
637 * If we do not estimate RTO correctly without them, 615 * If we do not estimate RTO correctly without them,
638 * all the algo is pure shit and should be replaced 616 * all the algo is pure shit and should be replaced
639 * with correct one. It is exaclty, which we pretend to do. 617 * with correct one. It is exactly, which we pretend to do.
640 */ 618 */
641} 619}
642 620
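
The two functions above implement the Jacobson/Karels estimator in the kernel's fixed-point convention: srtt is kept times 8 and mdev times 4, so "(tp->srtt >> 3) + tp->rttvar" works out to SRTT + 4*MDEV. A simplified standalone version; the kernel additionally damps large negative errors and windows mdev_max per RTT, omitted here:

    #include <stdint.h>

    struct rtt_state {
            uint32_t srtt;     /* smoothed RTT << 3 */
            uint32_t mdev;     /* mean deviation << 2 */
            uint32_t rttvar;   /* clamped deviation used for the RTO */
    };

    static void rtt_estimator(struct rtt_state *s, uint32_t mrtt,
                              uint32_t rto_min)
    {
            long m = mrtt ? mrtt : 1;

            if (s->srtt) {
                    m -= (s->srtt >> 3);  /* error in the estimate */
                    s->srtt += m;         /* srtt = 7/8 srtt + 1/8 new */
                    if (m < 0)
                            m = -m;
                    m -= (s->mdev >> 2);
                    s->mdev += m;         /* mdev = 3/4 mdev + 1/4 |err| */
            } else {
                    s->srtt = m << 3;     /* first sample seeds both */
                    s->mdev = m << 1;
            }
            if (s->mdev > s->rttvar)
                    s->rttvar = s->mdev;
            if (s->rttvar < rto_min)
                    s->rttvar = rto_min;
    }

    static uint32_t set_rto(const struct rtt_state *s)
    {
            return (s->srtt >> 3) + s->rttvar;  /* SRTT + 4*MDEV */
    }
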
@@ -794,7 +772,7 @@ static void tcp_init_metrics(struct sock *sk)
794 * to make it more realistic. 772 * to make it more realistic.
795 * 773 *
796 * A bit of theory. RTT is time passed after "normal" sized packet 774 * A bit of theory. RTT is time passed after "normal" sized packet
797 * is sent until it is ACKed. In normal curcumstances sending small 775 * is sent until it is ACKed. In normal circumstances sending small
798 * packets force peer to delay ACKs and calculation is correct too. 776 * packets force peer to delay ACKs and calculation is correct too.
799 * The algorithm is adaptive and, provided we follow specs, it 777 * The algorithm is adaptive and, provided we follow specs, it
800 * NEVER underestimate RTT. BUT! If peer tries to make some clever 778 * NEVER underestimate RTT. BUT! If peer tries to make some clever
@@ -919,18 +897,32 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
919 int prior_fackets; 897 int prior_fackets;
920 u32 lost_retrans = 0; 898 u32 lost_retrans = 0;
921 int flag = 0; 899 int flag = 0;
900 int dup_sack = 0;
922 int i; 901 int i;
923 902
924 if (!tp->sacked_out) 903 if (!tp->sacked_out)
925 tp->fackets_out = 0; 904 tp->fackets_out = 0;
926 prior_fackets = tp->fackets_out; 905 prior_fackets = tp->fackets_out;
927 906
928 for (i=0; i<num_sacks; i++, sp++) { 907 /* SACK fastpath:
929 struct sk_buff *skb; 908 * if the only SACK change is the increase of the end_seq of
930 __u32 start_seq = ntohl(sp->start_seq); 909 * the first block then only apply that SACK block
931 __u32 end_seq = ntohl(sp->end_seq); 910 * and use retrans queue hinting otherwise slowpath */
932 int fack_count = 0; 911 flag = 1;
933 int dup_sack = 0; 912 for (i = 0; i< num_sacks; i++) {
913 __u32 start_seq = ntohl(sp[i].start_seq);
914 __u32 end_seq = ntohl(sp[i].end_seq);
915
916 if (i == 0){
917 if (tp->recv_sack_cache[i].start_seq != start_seq)
918 flag = 0;
919 } else {
920 if ((tp->recv_sack_cache[i].start_seq != start_seq) ||
921 (tp->recv_sack_cache[i].end_seq != end_seq))
922 flag = 0;
923 }
924 tp->recv_sack_cache[i].start_seq = start_seq;
925 tp->recv_sack_cache[i].end_seq = end_seq;
934 926
935 /* Check for D-SACK. */ 927 /* Check for D-SACK. */
936 if (i == 0) { 928 if (i == 0) {
@@ -962,15 +954,58 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
962 if (before(ack, prior_snd_una - tp->max_window)) 954 if (before(ack, prior_snd_una - tp->max_window))
963 return 0; 955 return 0;
964 } 956 }
957 }
958
959 if (flag)
960 num_sacks = 1;
961 else {
962 int j;
963 tp->fastpath_skb_hint = NULL;
964
965 /* order SACK blocks to allow in order walk of the retrans queue */
966 for (i = num_sacks-1; i > 0; i--) {
967 for (j = 0; j < i; j++){
968 if (after(ntohl(sp[j].start_seq),
969 ntohl(sp[j+1].start_seq))){
970 sp[j].start_seq = htonl(tp->recv_sack_cache[j+1].start_seq);
971 sp[j].end_seq = htonl(tp->recv_sack_cache[j+1].end_seq);
972 sp[j+1].start_seq = htonl(tp->recv_sack_cache[j].start_seq);
973 sp[j+1].end_seq = htonl(tp->recv_sack_cache[j].end_seq);
974 }
975
976 }
977 }
978 }
979
980 /* clear flag as used for different purpose in following code */
981 flag = 0;
982
983 for (i=0; i<num_sacks; i++, sp++) {
984 struct sk_buff *skb;
985 __u32 start_seq = ntohl(sp->start_seq);
986 __u32 end_seq = ntohl(sp->end_seq);
987 int fack_count;
988
989 /* Use SACK fastpath hint if valid */
990 if (tp->fastpath_skb_hint) {
991 skb = tp->fastpath_skb_hint;
992 fack_count = tp->fastpath_cnt_hint;
993 } else {
994 skb = sk->sk_write_queue.next;
995 fack_count = 0;
996 }
965 997
966 /* Event "B" in the comment above. */ 998 /* Event "B" in the comment above. */
967 if (after(end_seq, tp->high_seq)) 999 if (after(end_seq, tp->high_seq))
968 flag |= FLAG_DATA_LOST; 1000 flag |= FLAG_DATA_LOST;
969 1001
970 sk_stream_for_retrans_queue(skb, sk) { 1002 sk_stream_for_retrans_queue_from(skb, sk) {
971 int in_sack, pcount; 1003 int in_sack, pcount;
972 u8 sacked; 1004 u8 sacked;
973 1005
1006 tp->fastpath_skb_hint = skb;
1007 tp->fastpath_cnt_hint = fack_count;
1008
974 /* The retransmission queue is always in order, so 1009 /* The retransmission queue is always in order, so
975 * we can short-circuit the walk early. 1010 * we can short-circuit the walk early.
976 */ 1011 */
@@ -1045,6 +1080,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1045 TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); 1080 TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
1046 tp->lost_out -= tcp_skb_pcount(skb); 1081 tp->lost_out -= tcp_skb_pcount(skb);
1047 tp->retrans_out -= tcp_skb_pcount(skb); 1082 tp->retrans_out -= tcp_skb_pcount(skb);
1083
1084 /* clear lost hint */
1085 tp->retransmit_skb_hint = NULL;
1048 } 1086 }
1049 } else { 1087 } else {
1050 /* New sack for not retransmitted frame, 1088 /* New sack for not retransmitted frame,
@@ -1057,6 +1095,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1057 if (sacked & TCPCB_LOST) { 1095 if (sacked & TCPCB_LOST) {
1058 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; 1096 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
1059 tp->lost_out -= tcp_skb_pcount(skb); 1097 tp->lost_out -= tcp_skb_pcount(skb);
1098
1099 /* clear lost hint */
1100 tp->retransmit_skb_hint = NULL;
1060 } 1101 }
1061 } 1102 }
1062 1103
@@ -1080,6 +1121,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1080 (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { 1121 (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
1081 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1122 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1082 tp->retrans_out -= tcp_skb_pcount(skb); 1123 tp->retrans_out -= tcp_skb_pcount(skb);
1124 tp->retransmit_skb_hint = NULL;
1083 } 1125 }
1084 } 1126 }
1085 } 1127 }
@@ -1107,6 +1149,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1107 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1149 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1108 tp->retrans_out -= tcp_skb_pcount(skb); 1150 tp->retrans_out -= tcp_skb_pcount(skb);
1109 1151
1152 /* clear lost hint */
1153 tp->retransmit_skb_hint = NULL;
1154
1110 if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) { 1155 if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
1111 tp->lost_out += tcp_skb_pcount(skb); 1156 tp->lost_out += tcp_skb_pcount(skb);
1112 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 1157 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
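
Each of these hunks invalidates the cached queue positions whenever retransmit state is rewritten underneath them. The clear_all_retrans_hints() helper itself is introduced elsewhere in this series rather than in this diff; presumably it just nulls every hint at once, along these lines:

    #include <stddef.h>

    struct retrans_hints {        /* stand-in for the tcp_sock fields */
            void *lost_skb_hint, *scoreboard_skb_hint,
                 *retransmit_skb_hint, *forward_skb_hint,
                 *fastpath_skb_hint;
    };

    static inline void clear_all_retrans_hints(struct retrans_hints *h)
    {
            h->lost_skb_hint = NULL;
            h->scoreboard_skb_hint = NULL;
            h->retransmit_skb_hint = NULL;
            h->forward_skb_hint = NULL;
            h->fastpath_skb_hint = NULL;
    }
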
@@ -1214,6 +1259,8 @@ static void tcp_enter_frto_loss(struct sock *sk)
1214 tcp_set_ca_state(sk, TCP_CA_Loss); 1259 tcp_set_ca_state(sk, TCP_CA_Loss);
1215 tp->high_seq = tp->frto_highmark; 1260 tp->high_seq = tp->frto_highmark;
1216 TCP_ECN_queue_cwr(tp); 1261 TCP_ECN_queue_cwr(tp);
1262
1263 clear_all_retrans_hints(tp);
1217} 1264}
1218 1265
1219void tcp_clear_retrans(struct tcp_sock *tp) 1266void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1251,6 +1298,7 @@ void tcp_enter_loss(struct sock *sk, int how)
1251 tp->snd_cwnd_cnt = 0; 1298 tp->snd_cwnd_cnt = 0;
1252 tp->snd_cwnd_stamp = tcp_time_stamp; 1299 tp->snd_cwnd_stamp = tcp_time_stamp;
1253 1300
1301 tp->bytes_acked = 0;
1254 tcp_clear_retrans(tp); 1302 tcp_clear_retrans(tp);
1255 1303
1256 /* Push undo marker, if it was plain RTO and nothing 1304 /* Push undo marker, if it was plain RTO and nothing
@@ -1279,6 +1327,8 @@ void tcp_enter_loss(struct sock *sk, int how)
1279 tcp_set_ca_state(sk, TCP_CA_Loss); 1327 tcp_set_ca_state(sk, TCP_CA_Loss);
1280 tp->high_seq = tp->snd_nxt; 1328 tp->high_seq = tp->snd_nxt;
1281 TCP_ECN_queue_cwr(tp); 1329 TCP_ECN_queue_cwr(tp);
1330
1331 clear_all_retrans_hints(tp);
1282} 1332}
1283 1333
1284static int tcp_check_sack_reneging(struct sock *sk) 1334static int tcp_check_sack_reneging(struct sock *sk)
@@ -1503,17 +1553,37 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
1503 int packets, u32 high_seq) 1553 int packets, u32 high_seq)
1504{ 1554{
1505 struct sk_buff *skb; 1555 struct sk_buff *skb;
1506 int cnt = packets; 1556 int cnt;
1507 1557
1508 BUG_TRAP(cnt <= tp->packets_out); 1558 BUG_TRAP(packets <= tp->packets_out);
1559 if (tp->lost_skb_hint) {
1560 skb = tp->lost_skb_hint;
1561 cnt = tp->lost_cnt_hint;
1562 } else {
1563 skb = sk->sk_write_queue.next;
1564 cnt = 0;
1565 }
1509 1566
1510 sk_stream_for_retrans_queue(skb, sk) { 1567 sk_stream_for_retrans_queue_from(skb, sk) {
1511 cnt -= tcp_skb_pcount(skb); 1568 /* TODO: do this better */
1512 if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq)) 1569 /* this is not the most efficient way to do this... */
1570 tp->lost_skb_hint = skb;
1571 tp->lost_cnt_hint = cnt;
1572 cnt += tcp_skb_pcount(skb);
1573 if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq))
1513 break; 1574 break;
1514 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { 1575 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
1515 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 1576 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1516 tp->lost_out += tcp_skb_pcount(skb); 1577 tp->lost_out += tcp_skb_pcount(skb);
1578
1579 /* clear xmit_retransmit_queue hints
1580 * if this is beyond hint */
1581 if(tp->retransmit_skb_hint != NULL &&
1582 before(TCP_SKB_CB(skb)->seq,
1583 TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) {
1584
1585 tp->retransmit_skb_hint = NULL;
1586 }
1517 } 1587 }
1518 } 1588 }
1519 tcp_sync_left_out(tp); 1589 tcp_sync_left_out(tp);
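
Note the counting flip above: the old loop counted down from "packets" and so could never resume mid-queue; the new loop counts up from whatever lost_skb_hint recorded, letting a resumed walk and a fresh walk share one termination test. A compact model, with an array of pcounts standing in for tcp_skb_pcount() over the write queue:

    /* Returns how many queue entries get tagged lost, starting the
     * walk at 'start' with the running count 'start_cnt' taken from
     * the hint (0/0 when no hint is cached). */
    static int mark_head_lost(const int *pcounts, int nskb,
                              int start, int start_cnt, int packets)
    {
            int cnt = start_cnt, marked = 0;

            for (int i = start; i < nskb; i++) {
                    cnt += pcounts[i];
                    if (cnt > packets)
                            break;
                    marked++;  /* kernel: set TCPCB_LOST, bump lost_out */
            }
            return marked;
    }
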
@@ -1540,13 +1610,28 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
1540 if (tcp_head_timedout(sk, tp)) { 1610 if (tcp_head_timedout(sk, tp)) {
1541 struct sk_buff *skb; 1611 struct sk_buff *skb;
1542 1612
1543 sk_stream_for_retrans_queue(skb, sk) { 1613 skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
1544 if (tcp_skb_timedout(sk, skb) && 1614 : sk->sk_write_queue.next;
1545 !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { 1615
1616 sk_stream_for_retrans_queue_from(skb, sk) {
1617 if (!tcp_skb_timedout(sk, skb))
1618 break;
1619
1620 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
1546 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 1621 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1547 tp->lost_out += tcp_skb_pcount(skb); 1622 tp->lost_out += tcp_skb_pcount(skb);
1623
1624 /* clear xmit_retrans hint */
1625 if (tp->retransmit_skb_hint &&
1626 before(TCP_SKB_CB(skb)->seq,
1627 TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
1628
1629 tp->retransmit_skb_hint = NULL;
1548 } 1630 }
1549 } 1631 }
1632
1633 tp->scoreboard_skb_hint = skb;
1634
1550 tcp_sync_left_out(tp); 1635 tcp_sync_left_out(tp);
1551 } 1636 }
1552} 1637}
@@ -1626,6 +1711,10 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)
1626 } 1711 }
1627 tcp_moderate_cwnd(tp); 1712 tcp_moderate_cwnd(tp);
1628 tp->snd_cwnd_stamp = tcp_time_stamp; 1713 tp->snd_cwnd_stamp = tcp_time_stamp;
1714
1715 /* There is something screwy going on with the retrans hints after
1716 an undo */
1717 clear_all_retrans_hints(tp);
1629} 1718}
1630 1719
1631static inline int tcp_may_undo(struct tcp_sock *tp) 1720static inline int tcp_may_undo(struct tcp_sock *tp)
@@ -1709,6 +1798,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
1709 sk_stream_for_retrans_queue(skb, sk) { 1798 sk_stream_for_retrans_queue(skb, sk) {
1710 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; 1799 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
1711 } 1800 }
1801
1802 clear_all_retrans_hints(tp);
1803
1712 DBGUNDO(sk, tp, "partial loss"); 1804 DBGUNDO(sk, tp, "partial loss");
1713 tp->lost_out = 0; 1805 tp->lost_out = 0;
1714 tp->left_out = tp->sacked_out; 1806 tp->left_out = tp->sacked_out;
@@ -1908,6 +2000,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1908 TCP_ECN_queue_cwr(tp); 2000 TCP_ECN_queue_cwr(tp);
1909 } 2001 }
1910 2002
2003 tp->bytes_acked = 0;
1911 tp->snd_cwnd_cnt = 0; 2004 tp->snd_cwnd_cnt = 0;
1912 tcp_set_ca_state(sk, TCP_CA_Recovery); 2005 tcp_set_ca_state(sk, TCP_CA_Recovery);
1913 } 2006 }
@@ -1919,9 +2012,9 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1919} 2012}
1920 2013
1921/* Read draft-ietf-tcplw-high-performance before mucking 2014/* Read draft-ietf-tcplw-high-performance before mucking
1922 * with this code. (Superceeds RFC1323) 2015 * with this code. (Supersedes RFC1323)
1923 */ 2016 */
1924static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) 2017static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
1925{ 2018{
1926 /* RTTM Rule: A TSecr value received in a segment is used to 2019 /* RTTM Rule: A TSecr value received in a segment is used to
1927 * update the averaged RTT measurement only if the segment 2020 * update the averaged RTT measurement only if the segment
@@ -1932,7 +2025,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
1932 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> 2025 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
1933 * 2026 *
1934 * Changed: reset backoff as soon as we see the first valid sample. 2027 * Changed: reset backoff as soon as we see the first valid sample.
1935 * If we do not, we get strongly overstimated rto. With timestamps 2028 * If we do not, we get strongly overestimated rto. With timestamps
1936 * samples are accepted even from very old segments: f.e., when rtt=1 2029 * samples are accepted even from very old segments: f.e., when rtt=1
1937 * increases to 8, we retransmit 5 times and after 8 seconds delayed 2030 * increases to 8, we retransmit 5 times and after 8 seconds delayed
1938 * answer arrives rto becomes 120 seconds! If at least one of segments 2031 * answer arrives rto becomes 120 seconds! If at least one of segments
@@ -1940,13 +2033,13 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
1940 */ 2033 */
1941 struct tcp_sock *tp = tcp_sk(sk); 2034 struct tcp_sock *tp = tcp_sk(sk);
1942 const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; 2035 const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
1943 tcp_rtt_estimator(sk, seq_rtt, usrtt); 2036 tcp_rtt_estimator(sk, seq_rtt);
1944 tcp_set_rto(sk); 2037 tcp_set_rto(sk);
1945 inet_csk(sk)->icsk_backoff = 0; 2038 inet_csk(sk)->icsk_backoff = 0;
1946 tcp_bound_rto(sk); 2039 tcp_bound_rto(sk);
1947} 2040}
1948 2041
1949static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag) 2042static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
1950{ 2043{
1951 /* We don't have a timestamp. Can only use 2044 /* We don't have a timestamp. Can only use
1952 * packets that are not retransmitted to determine 2045 * packets that are not retransmitted to determine
@@ -1960,21 +2053,21 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag
1960 if (flag & FLAG_RETRANS_DATA_ACKED) 2053 if (flag & FLAG_RETRANS_DATA_ACKED)
1961 return; 2054 return;
1962 2055
1963 tcp_rtt_estimator(sk, seq_rtt, usrtt); 2056 tcp_rtt_estimator(sk, seq_rtt);
1964 tcp_set_rto(sk); 2057 tcp_set_rto(sk);
1965 inet_csk(sk)->icsk_backoff = 0; 2058 inet_csk(sk)->icsk_backoff = 0;
1966 tcp_bound_rto(sk); 2059 tcp_bound_rto(sk);
1967} 2060}
1968 2061
1969static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, 2062static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
1970 const s32 seq_rtt, u32 *usrtt) 2063 const s32 seq_rtt)
1971{ 2064{
1972 const struct tcp_sock *tp = tcp_sk(sk); 2065 const struct tcp_sock *tp = tcp_sk(sk);
1973 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ 2066 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
1974 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) 2067 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
1975 tcp_ack_saw_tstamp(sk, usrtt, flag); 2068 tcp_ack_saw_tstamp(sk, flag);
1976 else if (seq_rtt >= 0) 2069 else if (seq_rtt >= 0)
1977 tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag); 2070 tcp_ack_no_tstamp(sk, seq_rtt, flag);
1978} 2071}
1979 2072
1980static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, 2073static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
@@ -2054,20 +2147,27 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
2054 return acked; 2147 return acked;
2055} 2148}
2056 2149
2150static inline u32 tcp_usrtt(const struct sk_buff *skb)
2151{
2152 struct timeval tv, now;
2153
2154 do_gettimeofday(&now);
2155 skb_get_timestamp(skb, &tv);
2156 return (now.tv_sec - tv.tv_sec) * 1000000 + (now.tv_usec - tv.tv_usec);
2157}
2057 2158
2058/* Remove acknowledged frames from the retransmission queue. */ 2159/* Remove acknowledged frames from the retransmission queue. */
2059static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) 2160static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2060{ 2161{
2061 struct tcp_sock *tp = tcp_sk(sk); 2162 struct tcp_sock *tp = tcp_sk(sk);
2163 const struct inet_connection_sock *icsk = inet_csk(sk);
2062 struct sk_buff *skb; 2164 struct sk_buff *skb;
2063 __u32 now = tcp_time_stamp; 2165 __u32 now = tcp_time_stamp;
2064 int acked = 0; 2166 int acked = 0;
2065 __s32 seq_rtt = -1; 2167 __s32 seq_rtt = -1;
2066 struct timeval usnow;
2067 u32 pkts_acked = 0; 2168 u32 pkts_acked = 0;
2068 2169 void (*rtt_sample)(struct sock *sk, u32 usrtt)
2069 if (seq_usrtt) 2170 = icsk->icsk_ca_ops->rtt_sample;
2070 do_gettimeofday(&usnow);
2071 2171
2072 while ((skb = skb_peek(&sk->sk_write_queue)) && 2172 while ((skb = skb_peek(&sk->sk_write_queue)) &&
2073 skb != sk->sk_send_head) { 2173 skb != sk->sk_send_head) {
@@ -2107,16 +2207,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
2107 tp->retrans_out -= tcp_skb_pcount(skb); 2207 tp->retrans_out -= tcp_skb_pcount(skb);
2108 acked |= FLAG_RETRANS_DATA_ACKED; 2208 acked |= FLAG_RETRANS_DATA_ACKED;
2109 seq_rtt = -1; 2209 seq_rtt = -1;
2110 } else if (seq_rtt < 0) 2210 } else if (seq_rtt < 0) {
2111 seq_rtt = now - scb->when; 2211 seq_rtt = now - scb->when;
2112 if (seq_usrtt) { 2212 if (rtt_sample)
2113 struct timeval tv; 2213 (*rtt_sample)(sk, tcp_usrtt(skb));
2114
2115 skb_get_timestamp(skb, &tv);
2116 *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000
2117 + (usnow.tv_usec - tv.tv_usec);
2118 } 2214 }
2119
2120 if (sacked & TCPCB_SACKED_ACKED) 2215 if (sacked & TCPCB_SACKED_ACKED)
2121 tp->sacked_out -= tcp_skb_pcount(skb); 2216 tp->sacked_out -= tcp_skb_pcount(skb);
2122 if (sacked & TCPCB_LOST) 2217 if (sacked & TCPCB_LOST)
@@ -2126,17 +2221,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
2126 !before(scb->end_seq, tp->snd_up)) 2221 !before(scb->end_seq, tp->snd_up))
2127 tp->urg_mode = 0; 2222 tp->urg_mode = 0;
2128 } 2223 }
2129 } else if (seq_rtt < 0) 2224 } else if (seq_rtt < 0) {
2130 seq_rtt = now - scb->when; 2225 seq_rtt = now - scb->when;
2226 if (rtt_sample)
2227 (*rtt_sample)(sk, tcp_usrtt(skb));
2228 }
2131 tcp_dec_pcount_approx(&tp->fackets_out, skb); 2229 tcp_dec_pcount_approx(&tp->fackets_out, skb);
2132 tcp_packets_out_dec(tp, skb); 2230 tcp_packets_out_dec(tp, skb);
2133 __skb_unlink(skb, &sk->sk_write_queue); 2231 __skb_unlink(skb, &sk->sk_write_queue);
2134 sk_stream_free_skb(sk, skb); 2232 sk_stream_free_skb(sk, skb);
2233 clear_all_retrans_hints(tp);
2135 } 2234 }
2136 2235
2137 if (acked&FLAG_ACKED) { 2236 if (acked&FLAG_ACKED) {
2138 const struct inet_connection_sock *icsk = inet_csk(sk); 2237 tcp_ack_update_rtt(sk, acked, seq_rtt);
2139 tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt);
2140 tcp_ack_packets_out(sk, tp); 2238 tcp_ack_packets_out(sk, tp);
2141 2239
2142 if (icsk->icsk_ca_ops->pkts_acked) 2240 if (icsk->icsk_ca_ops->pkts_acked)
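
With tcp_usrtt() in place, the microsecond RTT is derived per acked skb from its stored transmit timestamp and pushed through the module's rtt_sample hook, replacing the seq_usrtt out-parameter that tcp_ack() previously had to thread through. A userspace analogue of the plumbing, with gettimeofday() standing in for the kernel's skb timestamp read:

    #include <sys/time.h>

    static unsigned int usrtt(const struct timeval *sent)
    {
            struct timeval now;

            gettimeofday(&now, NULL);
            return (now.tv_sec - sent->tv_sec) * 1000000 +
                   (now.tv_usec - sent->tv_usec);
    }

    static void sample_if_wanted(void (*rtt_sample)(unsigned int us),
                                 const struct timeval *sent)
    {
            if (rtt_sample)             /* only modules that registered */
                    rtt_sample(usrtt(sent)); /* a sampler pay the cost  */
    }
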
@@ -2284,7 +2382,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
2284 } 2382 }
2285 2383
2286 /* F-RTO affects on two new ACKs following RTO. 2384 /* F-RTO affects on two new ACKs following RTO.
2287 * At latest on third ACK the TCP behavor is back to normal. 2385 * At latest on third ACK the TCP behavior is back to normal.
2288 */ 2386 */
2289 tp->frto_counter = (tp->frto_counter + 1) % 3; 2387 tp->frto_counter = (tp->frto_counter + 1) % 3;
2290} 2388}
@@ -2299,7 +2397,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2299 u32 ack = TCP_SKB_CB(skb)->ack_seq; 2397 u32 ack = TCP_SKB_CB(skb)->ack_seq;
2300 u32 prior_in_flight; 2398 u32 prior_in_flight;
2301 s32 seq_rtt; 2399 s32 seq_rtt;
2302 s32 seq_usrtt = 0;
2303 int prior_packets; 2400 int prior_packets;
2304 2401
2305 /* If the ack is newer than sent or older than previous acks 2402 /* If the ack is newer than sent or older than previous acks
@@ -2311,6 +2408,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2311 if (before(ack, prior_snd_una)) 2408 if (before(ack, prior_snd_una))
2312 goto old_ack; 2409 goto old_ack;
2313 2410
2411 if (sysctl_tcp_abc && icsk->icsk_ca_state < TCP_CA_CWR)
2412 tp->bytes_acked += ack - prior_snd_una;
2413
2314 if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { 2414 if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
2315 /* Window is constant, pure forward advance. 2415 /* Window is constant, pure forward advance.
2316 * No more checks are required. 2416 * No more checks are required.
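
This is the heart of Appropriate Byte Counting (RFC 3465): tp->bytes_acked accumulates newly acknowledged bytes outside CWR/recovery, and the other hunks reset it on loss. Window growth then tracks bytes acknowledged rather than ACK arrivals, so delayed and stretch ACKs no longer slow slow start. A model of the idea; the kernel folds this into tcp_slow_start() in tcp_cong.c, and RFC 3465 additionally caps growth at L=2*SMSS per ACK, both omitted here:

    struct abc_state {
            unsigned int cwnd;         /* in segments */
            unsigned int bytes_acked;
            unsigned int mss;
    };

    static void abc_slow_start(struct abc_state *s, unsigned int acked_bytes)
    {
            s->bytes_acked += acked_bytes;
            while (s->bytes_acked >= s->mss) {  /* one segment per MSS */
                    s->bytes_acked -= s->mss;   /* of acknowledged data */
                    s->cwnd++;
            }
    }
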
@@ -2352,14 +2452,13 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2352 prior_in_flight = tcp_packets_in_flight(tp); 2452 prior_in_flight = tcp_packets_in_flight(tp);
2353 2453
2354 /* See if we can take anything off of the retransmit queue. */ 2454 /* See if we can take anything off of the retransmit queue. */
2355 flag |= tcp_clean_rtx_queue(sk, &seq_rtt, 2455 flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
2356 icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL);
2357 2456
2358 if (tp->frto_counter) 2457 if (tp->frto_counter)
2359 tcp_process_frto(sk, prior_snd_una); 2458 tcp_process_frto(sk, prior_snd_una);
2360 2459
2361 if (tcp_ack_is_dubious(sk, flag)) { 2460 if (tcp_ack_is_dubious(sk, flag)) {
2362 /* Advanve CWND, if state allows this. */ 2461 /* Advance CWND, if state allows this. */
2363 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) 2462 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
2364 tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); 2463 tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0);
2365 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); 2464 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
@@ -3148,7 +3247,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
3148{ 3247{
3149 struct sk_buff *skb; 3248 struct sk_buff *skb;
3150 3249
3151 /* First, check that queue is collapsable and find 3250 /* First, check that queue is collapsible and find
3152 * the point where collapsing can be useful. */ 3251 * the point where collapsing can be useful. */
3153 for (skb = head; skb != tail; ) { 3252 for (skb = head; skb != tail; ) {
3154 /* No new bits? It is possible on ofo queue. */ 3253 /* No new bits? It is possible on ofo queue. */
@@ -3456,7 +3555,7 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk)
3456 3555
3457/* 3556/*
3458 * This routine is only called when we have urgent data 3557 * This routine is only called when we have urgent data
3459 * signalled. Its the 'slow' part of tcp_urg. It could be 3558 * signaled. Its the 'slow' part of tcp_urg. It could be
3460 * moved inline now as tcp_urg is only called from one 3559 * moved inline now as tcp_urg is only called from one
3461 * place. We handle URGent data wrong. We have to - as 3560 * place. We handle URGent data wrong. We have to - as
3462 * BSD still doesn't use the correction from RFC961. 3561 * BSD still doesn't use the correction from RFC961.
@@ -3501,7 +3600,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
3501 * urgent. To do this requires some care. We cannot just ignore 3600 * urgent. To do this requires some care. We cannot just ignore
3502 * tp->copied_seq since we would read the last urgent byte again 3601 * tp->copied_seq since we would read the last urgent byte again
3503 * as data, nor can we alter copied_seq until this data arrives 3602 * as data, nor can we alter copied_seq until this data arrives
3504 * or we break the sematics of SIOCATMARK (and thus sockatmark()) 3603 * or we break the semantics of SIOCATMARK (and thus sockatmark())
3505 * 3604 *
3506 * NOTE. Double Dutch. Rendering to plain English: author of comment 3605 * NOTE. Double Dutch. Rendering to plain English: author of comment
3507 * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); 3606 * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB);
@@ -3646,7 +3745,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3646 tp->rx_opt.saw_tstamp = 0; 3745 tp->rx_opt.saw_tstamp = 0;
3647 3746
3648 /* pred_flags is 0xS?10 << 16 + snd_wnd 3747 /* pred_flags is 0xS?10 << 16 + snd_wnd
3649 * if header_predition is to be made 3748 * if header_prediction is to be made
3650 * 'S' will always be tp->tcp_header_len >> 2 3749 * 'S' will always be tp->tcp_header_len >> 2
3651 * '?' will be 0 for the fast path, otherwise pred_flags is 0 to 3750 * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
3652 * turn it off (when there are holes in the receive 3751 * turn it off (when there are holes in the receive
@@ -4242,7 +4341,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4242 */ 4341 */
4243 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 4342 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
4244 !tp->srtt) 4343 !tp->srtt)
4245 tcp_ack_saw_tstamp(sk, NULL, 0); 4344 tcp_ack_saw_tstamp(sk, 0);
4246 4345
4247 if (tp->rx_opt.tstamp_ok) 4346 if (tp->rx_opt.tstamp_ok)
4248 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 4347 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4372,6 +4471,7 @@ discard:
4372 4471
4373EXPORT_SYMBOL(sysctl_tcp_ecn); 4472EXPORT_SYMBOL(sysctl_tcp_ecn);
4374EXPORT_SYMBOL(sysctl_tcp_reordering); 4473EXPORT_SYMBOL(sysctl_tcp_reordering);
4474EXPORT_SYMBOL(sysctl_tcp_abc);
4375EXPORT_SYMBOL(tcp_parse_options); 4475EXPORT_SYMBOL(tcp_parse_options);
4376EXPORT_SYMBOL(tcp_rcv_established); 4476EXPORT_SYMBOL(tcp_rcv_established);
4377EXPORT_SYMBOL(tcp_rcv_state_process); 4477EXPORT_SYMBOL(tcp_rcv_state_process);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 634dabb558fd..4d5021e1929b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -39,7 +39,7 @@
39 * request_sock handling and moved 39 * request_sock handling and moved
40 * most of it into the af independent code. 40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes. 41 * Added tail drop and some other bugfixes.
42 * Added new listen sematics. 42 * Added new listen semantics.
43 * Mike McLagan : Routing by source 43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits 44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes. 45 * Andi Kleen: various fixes.
@@ -1110,24 +1110,18 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1110static int tcp_v4_checksum_init(struct sk_buff *skb) 1110static int tcp_v4_checksum_init(struct sk_buff *skb)
1111{ 1111{
1112 if (skb->ip_summed == CHECKSUM_HW) { 1112 if (skb->ip_summed == CHECKSUM_HW) {
1113 skb->ip_summed = CHECKSUM_UNNECESSARY;
1114 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, 1113 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1115 skb->nh.iph->daddr, skb->csum)) 1114 skb->nh.iph->daddr, skb->csum)) {
1115 skb->ip_summed = CHECKSUM_UNNECESSARY;
1116 return 0; 1116 return 0;
1117 1117 }
1118 LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n");
1119 skb->ip_summed = CHECKSUM_NONE;
1120 } 1118 }
1119
1120 skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
1121 skb->len, IPPROTO_TCP, 0);
1122
1121 if (skb->len <= 76) { 1123 if (skb->len <= 76) {
1122 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, 1124 return __skb_checksum_complete(skb);
1123 skb->nh.iph->daddr,
1124 skb_checksum(skb, 0, skb->len, 0)))
1125 return -1;
1126 skb->ip_summed = CHECKSUM_UNNECESSARY;
1127 } else {
1128 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1129 skb->nh.iph->saddr,
1130 skb->nh.iph->daddr, 0);
1131 } 1125 }
1132 return 0; 1126 return 0;
1133} 1127}
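
The rewrite above, like the matching udp.c and ICMPv6 hunks below, converges on one scheme: seed skb->csum with the pseudo-header sum, verify short packets immediately via __skb_checksum_complete(), and defer long packets so the checksum is validated while the data is copied to userspace. A standalone sketch of the fold-and-verify arithmetic:

    #include <stdint.h>

    static uint16_t csum_fold(uint32_t sum)
    {
            sum = (sum & 0xffff) + (sum >> 16);  /* end-around carry */
            sum = (sum & 0xffff) + (sum >> 16);
            return (uint16_t)~sum;
    }

    /* Sum the segment (which includes its checksum field) on top of
     * the pseudo-header sum; a valid packet folds to zero. */
    static int checksum_complete(uint32_t pseudo_sum,
                                 const uint16_t *words, int nwords)
    {
            uint32_t sum = pseudo_sum;

            for (int i = 0; i < nwords; i++)
                    sum += words[i];
            return csum_fold(sum) != 0;  /* nonzero => corrupt packet */
    }
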
@@ -1216,10 +1210,10 @@ int tcp_v4_rcv(struct sk_buff *skb)
1216 1210
1217 /* An explanation is required here, I think. 1211 /* An explanation is required here, I think.
1218 * Packet length and doff are validated by header prediction, 1212 * Packet length and doff are validated by header prediction,
1219 * provided case of th->doff==0 is elimineted. 1213 * provided case of th->doff==0 is eliminated.
1220 * So, we defer the checks. */ 1214 * So, we defer the checks. */
1221 if ((skb->ip_summed != CHECKSUM_UNNECESSARY && 1215 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1222 tcp_v4_checksum_init(skb) < 0)) 1216 tcp_v4_checksum_init(skb)))
1223 goto bad_packet; 1217 goto bad_packet;
1224 1218
1225 th = skb->h.th; 1219 th = skb->h.th;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b1a63b2c6b4a..1b66a2ac4321 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -158,7 +158,7 @@ kill_with_rst:
158 /* I am shamed, but failed to make it more elegant. 158 /* I am shamed, but failed to make it more elegant.
159 * Yes, it is direct reference to IP, which is impossible 159 * Yes, it is direct reference to IP, which is impossible
160 * to generalize to IPv6. Taking into account that IPv6 160 * to generalize to IPv6. Taking into account that IPv6
161 * do not undertsnad recycling in any case, it not 161 * do not understand recycling in any case, it not
162 * a big problem in practice. --ANK */ 162 * a big problem in practice. --ANK */
163 if (tw->tw_family == AF_INET && 163 if (tw->tw_family == AF_INET &&
164 tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && 164 tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
@@ -194,7 +194,7 @@ kill_with_rst:
194 /* In window segment, it may be only reset or bare ack. */ 194 /* In window segment, it may be only reset or bare ack. */
195 195
196 if (th->rst) { 196 if (th->rst) {
197 /* This is TIME_WAIT assasination, in two flavors. 197 /* This is TIME_WAIT assassination, in two flavors.
198 * Oh well... nobody has a sufficient solution to this 198 * Oh well... nobody has a sufficient solution to this
199 * protocol bug yet. 199 * protocol bug yet.
200 */ 200 */
@@ -380,6 +380,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
380 */ 380 */
381 newtp->snd_cwnd = 2; 381 newtp->snd_cwnd = 2;
382 newtp->snd_cwnd_cnt = 0; 382 newtp->snd_cwnd_cnt = 0;
383 newtp->bytes_acked = 0;
383 384
384 newtp->frto_counter = 0; 385 newtp->frto_counter = 0;
385 newtp->frto_highmark = 0; 386 newtp->frto_highmark = 0;
@@ -550,7 +551,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
550 551
551 /* RFC793 page 36: "If the connection is in any non-synchronized state ... 552 /* RFC793 page 36: "If the connection is in any non-synchronized state ...
552 * and the incoming segment acknowledges something not yet 553 * and the incoming segment acknowledges something not yet
553 * sent (the segment carries an unaccaptable ACK) ... 554 * sent (the segment carries an unacceptable ACK) ...
554 * a reset is sent." 555 * a reset is sent."
555 * 556 *
556 * Invalid ACK: reset will be sent by listening socket 557 * Invalid ACK: reset will be sent by listening socket
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b907456a79f4..029c70dfb585 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -436,6 +436,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
436 u16 flags; 436 u16 flags;
437 437
438 BUG_ON(len > skb->len); 438 BUG_ON(len > skb->len);
439
440 clear_all_retrans_hints(tp);
439 nsize = skb_headlen(skb) - len; 441 nsize = skb_headlen(skb) - len;
440 if (nsize < 0) 442 if (nsize < 0)
441 nsize = 0; 443 nsize = 0;
@@ -599,7 +601,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
599 for TCP options, but includes only bare TCP header. 601 for TCP options, but includes only bare TCP header.
600 602
601 tp->rx_opt.mss_clamp is mss negotiated at connection setup. 603 tp->rx_opt.mss_clamp is mss negotiated at connection setup.
602 It is minumum of user_mss and mss received with SYN. 604 It is minimum of user_mss and mss received with SYN.
603 It also does not include TCP options. 605 It also does not include TCP options.
604 606
605 tp->pmtu_cookie is last pmtu, seen by this function. 607 tp->pmtu_cookie is last pmtu, seen by this function.
@@ -1171,7 +1173,7 @@ u32 __tcp_select_window(struct sock *sk)
1171{ 1173{
1172 struct inet_connection_sock *icsk = inet_csk(sk); 1174 struct inet_connection_sock *icsk = inet_csk(sk);
1173 struct tcp_sock *tp = tcp_sk(sk); 1175 struct tcp_sock *tp = tcp_sk(sk);
1174 /* MSS for the peer's data. Previous verions used mss_clamp 1176 /* MSS for the peer's data. Previous versions used mss_clamp
1175 * here. I don't know if the value based on our guesses 1177 * here. I don't know if the value based on our guesses
1176 * of peer's MSS is better for the performance. It's more correct 1178 * of peer's MSS is better for the performance. It's more correct
1177 * but may be worse for the performance because of rcv_mss 1179 * but may be worse for the performance because of rcv_mss
@@ -1260,7 +1262,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
1260 BUG_ON(tcp_skb_pcount(skb) != 1 || 1262 BUG_ON(tcp_skb_pcount(skb) != 1 ||
1261 tcp_skb_pcount(next_skb) != 1); 1263 tcp_skb_pcount(next_skb) != 1);
1262 1264
1263 /* Ok. We will be able to collapse the packet. */ 1265 /* changing transmit queue under us so clear hints */
1266 clear_all_retrans_hints(tp);
1267
1268 /* Ok. We will be able to collapse the packet. */
1264 __skb_unlink(next_skb, &sk->sk_write_queue); 1269 __skb_unlink(next_skb, &sk->sk_write_queue);
1265 1270
1266 memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); 1271 memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
@@ -1330,6 +1335,8 @@ void tcp_simple_retransmit(struct sock *sk)
1330 } 1335 }
1331 } 1336 }
1332 1337
1338 clear_all_retrans_hints(tp);
1339
1333 if (!lost) 1340 if (!lost)
1334 return; 1341 return;
1335 1342
@@ -1361,7 +1368,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1361 int err; 1368 int err;
1362 1369
1363 /* Do not sent more than we queued. 1/4 is reserved for possible 1370 /* Do not sent more than we queued. 1/4 is reserved for possible
1364 * copying overhead: frgagmentation, tunneling, mangling etc. 1371 * copying overhead: fragmentation, tunneling, mangling etc.
1365 */ 1372 */
1366 if (atomic_read(&sk->sk_wmem_alloc) > 1373 if (atomic_read(&sk->sk_wmem_alloc) >
1367 min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) 1374 min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
@@ -1468,13 +1475,25 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
1468 const struct inet_connection_sock *icsk = inet_csk(sk); 1475 const struct inet_connection_sock *icsk = inet_csk(sk);
1469 struct tcp_sock *tp = tcp_sk(sk); 1476 struct tcp_sock *tp = tcp_sk(sk);
1470 struct sk_buff *skb; 1477 struct sk_buff *skb;
1471 int packet_cnt = tp->lost_out; 1478 int packet_cnt;
1479
1480 if (tp->retransmit_skb_hint) {
1481 skb = tp->retransmit_skb_hint;
1482 packet_cnt = tp->retransmit_cnt_hint;
1483 }else{
1484 skb = sk->sk_write_queue.next;
1485 packet_cnt = 0;
1486 }
1472 1487
1473 /* First pass: retransmit lost packets. */ 1488 /* First pass: retransmit lost packets. */
1474 if (packet_cnt) { 1489 if (tp->lost_out) {
1475 sk_stream_for_retrans_queue(skb, sk) { 1490 sk_stream_for_retrans_queue_from(skb, sk) {
1476 __u8 sacked = TCP_SKB_CB(skb)->sacked; 1491 __u8 sacked = TCP_SKB_CB(skb)->sacked;
1477 1492
1493 /* we could do better than to assign each time */
1494 tp->retransmit_skb_hint = skb;
1495 tp->retransmit_cnt_hint = packet_cnt;
1496
1478 /* Assume this retransmit will generate 1497 /* Assume this retransmit will generate
1479 * only one packet for congestion window 1498 * only one packet for congestion window
1480 * calculation purposes. This works because 1499 * calculation purposes. This works because
@@ -1485,10 +1504,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
1485 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) 1504 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
1486 return; 1505 return;
1487 1506
1488 if (sacked&TCPCB_LOST) { 1507 if (sacked & TCPCB_LOST) {
1489 if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { 1508 if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
1490 if (tcp_retransmit_skb(sk, skb)) 1509 if (tcp_retransmit_skb(sk, skb)) {
1510 tp->retransmit_skb_hint = NULL;
1491 return; 1511 return;
1512 }
1492 if (icsk->icsk_ca_state != TCP_CA_Loss) 1513 if (icsk->icsk_ca_state != TCP_CA_Loss)
1493 NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); 1514 NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
1494 else 1515 else
@@ -1501,8 +1522,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
1501 TCP_RTO_MAX); 1522 TCP_RTO_MAX);
1502 } 1523 }
1503 1524
1504 packet_cnt -= tcp_skb_pcount(skb); 1525 packet_cnt += tcp_skb_pcount(skb);
1505 if (packet_cnt <= 0) 1526 if (packet_cnt >= tp->lost_out)
1506 break; 1527 break;
1507 } 1528 }
1508 } 1529 }
@@ -1528,9 +1549,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
1528 if (tcp_may_send_now(sk, tp)) 1549 if (tcp_may_send_now(sk, tp))
1529 return; 1550 return;
1530 1551
1531 packet_cnt = 0; 1552 if (tp->forward_skb_hint) {
1553 skb = tp->forward_skb_hint;
1554 packet_cnt = tp->forward_cnt_hint;
1555 } else{
1556 skb = sk->sk_write_queue.next;
1557 packet_cnt = 0;
1558 }
1559
1560 sk_stream_for_retrans_queue_from(skb, sk) {
1561 tp->forward_cnt_hint = packet_cnt;
1562 tp->forward_skb_hint = skb;
1532 1563
1533 sk_stream_for_retrans_queue(skb, sk) {
1534 /* Similar to the retransmit loop above we 1564 /* Similar to the retransmit loop above we
1535 * can pretend that the retransmitted SKB 1565 * can pretend that the retransmitted SKB
1536 * we send out here will be composed of one 1566 * we send out here will be composed of one
@@ -1547,8 +1577,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
1547 continue; 1577 continue;
1548 1578
1549 /* Ok, retransmit it. */ 1579 /* Ok, retransmit it. */
1550 if (tcp_retransmit_skb(sk, skb)) 1580 if (tcp_retransmit_skb(sk, skb)) {
1581 tp->forward_skb_hint = NULL;
1551 break; 1582 break;
1583 }
1552 1584
1553 if (skb == skb_peek(&sk->sk_write_queue)) 1585 if (skb == skb_peek(&sk->sk_write_queue))
1554 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 1586 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
@@ -2058,3 +2090,4 @@ EXPORT_SYMBOL(tcp_connect);
2058EXPORT_SYMBOL(tcp_make_synack); 2090EXPORT_SYMBOL(tcp_make_synack);
2059EXPORT_SYMBOL(tcp_simple_retransmit); 2091EXPORT_SYMBOL(tcp_simple_retransmit);
2060EXPORT_SYMBOL(tcp_sync_mss); 2092EXPORT_SYMBOL(tcp_sync_mss);
2093EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 327770bf5522..26d7486ee501 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -20,20 +20,20 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
20 u32 in_flight, int flag) 20 u32 in_flight, int flag)
21{ 21{
22 struct tcp_sock *tp = tcp_sk(sk); 22 struct tcp_sock *tp = tcp_sk(sk);
23 if (in_flight < tp->snd_cwnd) 23
24 if (!tcp_is_cwnd_limited(sk, in_flight))
24 return; 25 return;
25 26
26 if (tp->snd_cwnd <= tp->snd_ssthresh) { 27 if (tp->snd_cwnd <= tp->snd_ssthresh)
27 tp->snd_cwnd++; 28 tcp_slow_start(tp);
28 } else { 29 else {
29 tp->snd_cwnd_cnt++; 30 tp->snd_cwnd_cnt++;
30 if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ 31 if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
31 tp->snd_cwnd++; 32 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
33 tp->snd_cwnd++;
32 tp->snd_cwnd_cnt = 0; 34 tp->snd_cwnd_cnt = 0;
33 } 35 }
34 } 36 }
35 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
36 tp->snd_cwnd_stamp = tcp_time_stamp;
37} 37}
38 38
39static u32 tcp_scalable_ssthresh(struct sock *sk) 39static u32 tcp_scalable_ssthresh(struct sock *sk)
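
Scalable TCP's rule, restructured above, grants one segment of growth per min(cwnd, TCP_SCALABLE_AI_CNT) ACKs; with the constant's kernel value of 50, a large window therefore grows a fixed ~2% per RTT instead of one segment per RTT. A standalone model with the clamp applied inline, as the new code does:

    #define SCALABLE_AI_CNT 50U

    struct scalable_state {
            unsigned int cwnd, cwnd_cnt, clamp;
    };

    static void scalable_cong_avoid(struct scalable_state *s)
    {
            unsigned int limit = s->cwnd < SCALABLE_AI_CNT ?
                                 s->cwnd : SCALABLE_AI_CNT;

            if (++s->cwnd_cnt > limit) {
                    if (s->cwnd < s->clamp)  /* clamp checked inline */
                            s->cwnd++;
                    s->cwnd_cnt = 0;
            }
    }
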
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 415ee47ac1c5..e1880959614a 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -58,7 +58,7 @@ static void tcp_write_err(struct sock *sk)
58 * to prevent DoS attacks. It is called when a retransmission timeout 58 * to prevent DoS attacks. It is called when a retransmission timeout
59 * or zero probe timeout occurs on orphaned socket. 59 * or zero probe timeout occurs on orphaned socket.
60 * 60 *
61 * Criterium is still not confirmed experimentally and may change. 61 * Criteria is still not confirmed experimentally and may change.
62 * We kill the socket, if: 62 * We kill the socket, if:
63 * 1. If number of orphaned sockets exceeds an administratively configured 63 * 1. If number of orphaned sockets exceeds an administratively configured
64 * limit. 64 * limit.
@@ -132,7 +132,7 @@ static int tcp_write_timeout(struct sock *sk)
132 hole detection. :-( 132 hole detection. :-(
133 133
134 It is place to make it. It is not made. I do not want 134 It is place to make it. It is not made. I do not want
135 to make it. It is disguisting. It does not work in any 135 to make it. It is disgusting. It does not work in any
136 case. Let me to cite the same draft, which requires for 136 case. Let me to cite the same draft, which requires for
137 us to implement this: 137 us to implement this:
138 138
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 93c5f92070f9..b7d296a8ac6d 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -236,8 +236,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
236 /* We don't have enough RTT samples to do the Vegas 236 /* We don't have enough RTT samples to do the Vegas
237 * calculation, so we'll behave like Reno. 237 * calculation, so we'll behave like Reno.
238 */ 238 */
239 if (tp->snd_cwnd > tp->snd_ssthresh) 239 tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
240 tp->snd_cwnd++;
241 } else { 240 } else {
242 u32 rtt, target_cwnd, diff; 241 u32 rtt, target_cwnd, diff;
243 242
@@ -275,7 +274,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
275 */ 274 */
276 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; 275 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
277 276
278 if (tp->snd_cwnd < tp->snd_ssthresh) { 277 if (tp->snd_cwnd <= tp->snd_ssthresh) {
279 /* Slow start. */ 278 /* Slow start. */
280 if (diff > gamma) { 279 if (diff > gamma) {
281 /* Going too fast. Time to slow down 280 /* Going too fast. Time to slow down
@@ -295,6 +294,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
295 V_PARAM_SHIFT)+1); 294 V_PARAM_SHIFT)+1);
296 295
297 } 296 }
297 tcp_slow_start(tp);
298 } else { 298 } else {
299 /* Congestion avoidance. */ 299 /* Congestion avoidance. */
300 u32 next_snd_cwnd; 300 u32 next_snd_cwnd;
@@ -327,37 +327,17 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
327 else if (next_snd_cwnd < tp->snd_cwnd) 327 else if (next_snd_cwnd < tp->snd_cwnd)
328 tp->snd_cwnd--; 328 tp->snd_cwnd--;
329 } 329 }
330 }
331 330
332 /* Wipe the slate clean for the next RTT. */ 331 if (tp->snd_cwnd < 2)
333 vegas->cntRTT = 0; 332 tp->snd_cwnd = 2;
334 vegas->minRTT = 0x7fffffff; 333 else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
334 tp->snd_cwnd = tp->snd_cwnd_clamp;
335 }
335 } 336 }
336 337
337 /* The following code is executed for every ack we receive, 338 /* Wipe the slate clean for the next RTT. */
338 * except for conditions checked in should_advance_cwnd() 339 vegas->cntRTT = 0;
339 * before the call to tcp_cong_avoid(). Mainly this means that 340 vegas->minRTT = 0x7fffffff;
340 * we only execute this code if the ack actually acked some
341 * data.
342 */
343
344 /* If we are in slow start, increase our cwnd in response to this ACK.
345 * (If we are not in slow start then we are in congestion avoidance,
346 * and adjust our congestion window only once per RTT. See the code
347 * above.)
348 */
349 if (tp->snd_cwnd <= tp->snd_ssthresh)
350 tp->snd_cwnd++;
351
352 /* to keep cwnd from growing without bound */
353 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
354
355 /* Make sure that we are never so timid as to reduce our cwnd below
356 * 2 MSS.
357 *
358 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
359 */
360 tp->snd_cwnd = max(tp->snd_cwnd, 2U);
361} 341}
362 342
363/* Extract info for Tcp socket info provided via netlink. */ 343/* Extract info for Tcp socket info provided via netlink. */
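
Behind the restructuring above, Vegas' decision is unchanged: estimate the backlog the connection keeps in router queues from the gap between the base (propagation) RTT and the measured RTT, then hold that backlog between alpha and beta segments. A sketch without the kernel's V_PARAM_SHIFT fixed-point scaling; as in the kernel, this runs once per RTT and the caller clamps the result to [2, snd_cwnd_clamp]:

    static unsigned int vegas_next_cwnd(unsigned int cwnd,
                                        unsigned int base_rtt,
                                        unsigned int rtt,
                                        unsigned int alpha,
                                        unsigned int beta)
    {
            unsigned int diff;

            if (rtt <= base_rtt)        /* no queueing delay observed */
                    return cwnd + 1;

            /* segments queued in the path = cwnd * (rtt-base)/rtt */
            diff = cwnd * (rtt - base_rtt) / rtt;

            if (diff > beta && cwnd > 2)
                    cwnd--;             /* queues building: back off */
            else if (diff < alpha)
                    cwnd++;             /* path underused: speed up */
            return cwnd;
    }
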
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e0bd1013cb0d..2422a5f7195d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -761,7 +761,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
761 761
762static __inline__ int __udp_checksum_complete(struct sk_buff *skb) 762static __inline__ int __udp_checksum_complete(struct sk_buff *skb)
763{ 763{
764 return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); 764 return __skb_checksum_complete(skb);
765} 765}
766 766
767static __inline__ int udp_checksum_complete(struct sk_buff *skb) 767static __inline__ int udp_checksum_complete(struct sk_buff *skb)
@@ -1100,11 +1100,8 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
1100 if (uh->check == 0) { 1100 if (uh->check == 0) {
1101 skb->ip_summed = CHECKSUM_UNNECESSARY; 1101 skb->ip_summed = CHECKSUM_UNNECESSARY;
1102 } else if (skb->ip_summed == CHECKSUM_HW) { 1102 } else if (skb->ip_summed == CHECKSUM_HW) {
1103 skb->ip_summed = CHECKSUM_UNNECESSARY;
1104 if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) 1103 if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
1105 return 0; 1104 skb->ip_summed = CHECKSUM_UNNECESSARY;
1106 LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n");
1107 skb->ip_summed = CHECKSUM_NONE;
1108 } 1105 }
1109 if (skb->ip_summed != CHECKSUM_UNNECESSARY) 1106 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
1110 skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); 1107 skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index b7a5f51238b3..ddcf7754eec2 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1022,6 +1022,7 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev,
1022 continue; 1022 continue;
1023 } 1023 }
1024 1024
1025#ifdef CONFIG_IPV6_PRIVACY
1025 /* Rule 7: Prefer public address 1026 /* Rule 7: Prefer public address
1026 * Note: prefer temprary address if use_tempaddr >= 2 1027 * Note: prefer temprary address if use_tempaddr >= 2
1027 */ 1028 */
@@ -1042,7 +1043,7 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev,
1042 if (hiscore.attrs & IPV6_SADDR_SCORE_PRIVACY) 1043 if (hiscore.attrs & IPV6_SADDR_SCORE_PRIVACY)
1043 continue; 1044 continue;
1044 } 1045 }
1045 1046#endif
1046 /* Rule 8: Use longest matching prefix */ 1047 /* Rule 8: Use longest matching prefix */
1047 if (hiscore.rule < 8) 1048 if (hiscore.rule < 8)
1048 hiscore.matchlen = ipv6_addr_diff(&ifa_result->addr, daddr); 1049 hiscore.matchlen = ipv6_addr_diff(&ifa_result->addr, daddr);
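Rule 8 above breaks any remaining tie by the length of the prefix a candidate source address shares with the destination, as computed by ipv6_addr_diff(). A hedged sketch of that metric in simplified byte-array form (common_prefix_len is illustrative, not the kernel helper):

#include <stdint.h>

static int common_prefix_len(const uint8_t a[16], const uint8_t b[16])
{
	int i, bits = 0;

	for (i = 0; i < 16; i++) {
		uint8_t x = a[i] ^ b[i];

		if (!x) {		/* whole byte matches */
			bits += 8;
			continue;
		}
		while (!(x & 0x80)) {	/* count matching leading bits */
			bits++;
			x <<= 1;
		}
		break;
	}
	return bits;	/* the higher score wins once Rules 1-7 tie */
}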
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 4f8795af2edb..c63b8ce0e1b5 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -699,12 +699,14 @@ static int __init inet6_init(void)
699 /* Register the family here so that the init calls below will 699 /* Register the family here so that the init calls below will
700 * be able to create sockets. (?? is this dangerous ??) 700 * be able to create sockets. (?? is this dangerous ??)
701 */ 701 */
702 (void) sock_register(&inet6_family_ops); 702 err = sock_register(&inet6_family_ops);
703 if (err)
704 goto out_unregister_raw_proto;
703 705
704 /* Initialise ipv6 mibs */ 706 /* Initialise ipv6 mibs */
705 err = init_ipv6_mibs(); 707 err = init_ipv6_mibs();
706 if (err) 708 if (err)
707 goto out_unregister_raw_proto; 709 goto out_unregister_sock;
708 710
709 /* 711 /*
710 * ipngwg API draft makes clear that the correct semantics 712 * ipngwg API draft makes clear that the correct semantics
@@ -796,6 +798,8 @@ icmp_fail:
796 ipv6_sysctl_unregister(); 798 ipv6_sysctl_unregister();
797#endif 799#endif
798 cleanup_ipv6_mibs(); 800 cleanup_ipv6_mibs();
801out_unregister_sock:
802 sock_unregister(PF_INET6);
799out_unregister_raw_proto: 803out_unregister_raw_proto:
800 proto_unregister(&rawv6_prot); 804 proto_unregister(&rawv6_prot);
801out_unregister_udp_proto: 805out_unregister_udp_proto:
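The af_inet6 hunk turns the previously ignored sock_register() result into a checked step and splices a matching out_unregister_sock label into the unwind chain. The idiom, sketched generically (register_a/b/c and their undo functions are stand-ins, not kernel symbols): failure at step N jumps to the label that undoes steps N-1..1 in reverse order, so each label unwinds exactly the steps that succeeded before it.

#include <stdio.h>

static int  register_a(void) { return 0; }	/* stand-ins for real init steps */
static int  register_b(void) { return 0; }
static int  register_c(void) { return -1; }	/* pretend the third step fails */
static void unregister_a(void) { puts("undo a"); }
static void unregister_b(void) { puts("undo b"); }

static int subsys_init(void)
{
	int err;

	err = register_a();
	if (err)
		goto out;
	err = register_b();
	if (err)
		goto out_unreg_a;
	err = register_c();
	if (err)
		goto out_unreg_b;
	return 0;

out_unreg_b:		/* unwind in reverse order of setup */
	unregister_b();
out_unreg_a:
	unregister_a();
out:
	return err;
}

int main(void)
{
	return subsys_init() ? 1 : 0;	/* prints "undo b" then "undo a" */
}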
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 23e540365a14..1bdf0fb8bf8a 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -585,17 +585,16 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
585 daddr = &skb->nh.ipv6h->daddr; 585 daddr = &skb->nh.ipv6h->daddr;
586 586
587 /* Perform checksum. */ 587 /* Perform checksum. */
588 if (skb->ip_summed == CHECKSUM_HW) { 588 switch (skb->ip_summed) {
589 skb->ip_summed = CHECKSUM_UNNECESSARY; 589 case CHECKSUM_HW:
590 if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, 590 if (!csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6,
591 skb->csum)) { 591 skb->csum))
592 LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 hw checksum failed\n"); 592 break;
593 skb->ip_summed = CHECKSUM_NONE; 593 /* fall through */
594 } 594 case CHECKSUM_NONE:
595 } 595 skb->csum = ~csum_ipv6_magic(saddr, daddr, skb->len,
596 if (skb->ip_summed == CHECKSUM_NONE) { 596 IPPROTO_ICMPV6, 0);
597 if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, 597 if (__skb_checksum_complete(skb)) {
598 skb_checksum(skb, 0, skb->len, 0))) {
599 LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", 598 LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n",
600 NIP6(*saddr), NIP6(*daddr)); 599 NIP6(*saddr), NIP6(*daddr));
601 goto discard_it; 600 goto discard_it;
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 6e3480426939..a6026d2787d2 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -176,6 +176,11 @@ resubmit:
176 if (ipprot->flags & INET6_PROTO_FINAL) { 176 if (ipprot->flags & INET6_PROTO_FINAL) {
177 struct ipv6hdr *hdr; 177 struct ipv6hdr *hdr;
178 178
179 /* Free reference early: we don't need it any more,
180 and it may hold ip_conntrack module loaded
181 indefinitely. */
182 nf_reset(skb);
183
179 skb_postpull_rcsum(skb, skb->nh.raw, 184 skb_postpull_rcsum(skb, skb->nh.raw,
180 skb->h.raw - skb->nh.raw); 185 skb->h.raw - skb->nh.raw);
181 hdr = skb->nh.ipv6h; 186 hdr = skb->nh.ipv6h;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index dbd9767b32e4..c1fa693511a1 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -441,9 +441,15 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
441#ifdef CONFIG_NETFILTER 441#ifdef CONFIG_NETFILTER
442 to->nfmark = from->nfmark; 442 to->nfmark = from->nfmark;
443 /* Connection association is same as pre-frag packet */ 443 /* Connection association is same as pre-frag packet */
444 nf_conntrack_put(to->nfct);
444 to->nfct = from->nfct; 445 to->nfct = from->nfct;
445 nf_conntrack_get(to->nfct); 446 nf_conntrack_get(to->nfct);
446 to->nfctinfo = from->nfctinfo; 447 to->nfctinfo = from->nfctinfo;
448#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
449 nf_conntrack_put_reasm(to->nfct_reasm);
450 to->nfct_reasm = from->nfct_reasm;
451 nf_conntrack_get_reasm(to->nfct_reasm);
452#endif
447#ifdef CONFIG_BRIDGE_NETFILTER 453#ifdef CONFIG_BRIDGE_NETFILTER
448 nf_bridge_put(to->nf_bridge); 454 nf_bridge_put(to->nf_bridge);
449 to->nf_bridge = from->nf_bridge; 455 to->nf_bridge = from->nf_bridge;
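The nf_conntrack_put() added to ip6_copy_metadata() guards against overwriting to->nfct while it still holds a reference, and the new nfct_reasm copy follows the same put-then-get discipline. The reassignment pattern, sketched with illustrative non-atomic types (the kernel's counters are atomic):

#include <stdlib.h>

struct obj { int refcnt; };

static void obj_get(struct obj *o) { if (o) o->refcnt++; }
static void obj_put(struct obj *o) { if (o && --o->refcnt == 0) free(o); }

static void replace_ref(struct obj **slot, struct obj *new_obj)
{
	obj_put(*slot);		/* drop the reference held on the old object */
	*slot = new_obj;
	obj_get(*slot);		/* pin the object the slot now points at */
}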
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index e6b0e3954c02..e315d0f80af1 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -525,6 +525,7 @@ ip6ip6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
525 525
526 if ((t = ip6ip6_tnl_lookup(&ipv6h->saddr, &ipv6h->daddr)) != NULL) { 526 if ((t = ip6ip6_tnl_lookup(&ipv6h->saddr, &ipv6h->daddr)) != NULL) {
527 if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { 527 if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
528 read_unlock(&ip6ip6_lock);
528 kfree_skb(skb); 529 kfree_skb(skb);
529 return 0; 530 return 0;
530 } 531 }
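The added read_unlock() in ip6ip6_rcv() plugs a lock leak: the failed xfrm policy check returned with ip6ip6_lock still read-held. The invariant is simply that every early-return path must release whatever the entry path acquired; a sketch with a pthreads rwlock standing in for the kernel rwlock:

#include <pthread.h>

static pthread_rwlock_t tbl_lock = PTHREAD_RWLOCK_INITIALIZER;

static int lookup_and_check(int policy_ok)
{
	pthread_rwlock_rdlock(&tbl_lock);
	if (!policy_ok) {
		pthread_rwlock_unlock(&tbl_lock);	/* the unlock the fix adds */
		return 0;
	}
	/* ... use the protected table ... */
	pthread_rwlock_unlock(&tbl_lock);
	return 1;
}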
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index bb7ccfe33f23..971ba60bf6e9 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -278,5 +278,19 @@ config IP6_NF_RAW
278 If you want to compile it as a module, say M here and read 278 If you want to compile it as a module, say M here and read
279 <file:Documentation/modules.txt>. If unsure, say `N'. 279 <file:Documentation/modules.txt>. If unsure, say `N'.
280 280
281config NF_CONNTRACK_IPV6
282 tristate "IPv6 support for new connection tracking (EXPERIMENTAL)"
283 depends on EXPERIMENTAL && NF_CONNTRACK
284 ---help---
285 Connection tracking keeps a record of what packets have passed
286 through your machine, in order to figure out how they are related
287 to each other in connections.
288
289 This is the IPv6 support for Layer 3 independent connection
290 tracking, an experimental scheme which generalizes ip_conntrack
291 to support other layer 3 protocols.
292
293 To compile it as a module, choose M here. If unsure, say N.
294
281endmenu 295endmenu
282 296
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 2b2c370e8b1c..9ab5b2ca1f59 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -27,3 +27,9 @@ obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o
27obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o 27obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
28obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o 28obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o
29obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o 29obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o
30
31# objects for l3 independent conntrack
32nf_conntrack_ipv6-objs := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o nf_conntrack_reasm.o
33
34# l3 independent conntrack
35obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o
diff --git a/net/ipv6/netfilter/ip6t_MARK.c b/net/ipv6/netfilter/ip6t_MARK.c
index 0c7584f92172..eab8fb864ee0 100644
--- a/net/ipv6/netfilter/ip6t_MARK.c
+++ b/net/ipv6/netfilter/ip6t_MARK.c
@@ -56,9 +56,9 @@ checkentry(const char *tablename,
56 return 1; 56 return 1;
57} 57}
58 58
59static struct ip6t_target ip6t_mark_reg = { 59static struct ip6t_target ip6t_mark_reg = {
60 .name = "MARK", 60 .name = "MARK",
61 .target = target, 61 .target = target,
62 .checkentry = checkentry, 62 .checkentry = checkentry,
63 .me = THIS_MODULE 63 .me = THIS_MODULE
64}; 64};
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
new file mode 100644
index 000000000000..e2c90b3a8074
--- /dev/null
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -0,0 +1,556 @@
1/*
2 * Copyright (C)2004 USAGI/WIDE Project
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * Author:
9 * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
10 *
11 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
12 * - support Layer 3 protocol independent connection tracking.
13 * Based on the original ip_conntrack code which had the following
14 * copyright information:
15 * (C) 1999-2001 Paul `Rusty' Russell
16 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
17 *
18 * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
19 * - add get_features() to support various size of conntrack
20 * structures.
21 */
22
23#include <linux/config.h>
24#include <linux/types.h>
25#include <linux/ipv6.h>
26#include <linux/in6.h>
27#include <linux/netfilter.h>
28#include <linux/module.h>
29#include <linux/skbuff.h>
30#include <linux/icmp.h>
31#include <linux/sysctl.h>
32#include <net/ipv6.h>
33
34#include <linux/netfilter_ipv6.h>
35#include <net/netfilter/nf_conntrack.h>
36#include <net/netfilter/nf_conntrack_helper.h>
37#include <net/netfilter/nf_conntrack_protocol.h>
38#include <net/netfilter/nf_conntrack_l3proto.h>
39#include <net/netfilter/nf_conntrack_core.h>
40
41#if 0
42#define DEBUGP printk
43#else
44#define DEBUGP(format, args...)
45#endif
46
47DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
48
49static int ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
50 struct nf_conntrack_tuple *tuple)
51{
52 u_int32_t _addrs[8], *ap;
53
54 ap = skb_header_pointer(skb, nhoff + offsetof(struct ipv6hdr, saddr),
55 sizeof(_addrs), _addrs);
56 if (ap == NULL)
57 return 0;
58
59 memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
60 memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
61
62 return 1;
63}
64
65static int ipv6_invert_tuple(struct nf_conntrack_tuple *tuple,
66 const struct nf_conntrack_tuple *orig)
67{
68 memcpy(tuple->src.u3.ip6, orig->dst.u3.ip6, sizeof(tuple->src.u3.ip6));
69 memcpy(tuple->dst.u3.ip6, orig->src.u3.ip6, sizeof(tuple->dst.u3.ip6));
70
71 return 1;
72}
73
74static int ipv6_print_tuple(struct seq_file *s,
75 const struct nf_conntrack_tuple *tuple)
76{
77 return seq_printf(s, "src=%x:%x:%x:%x:%x:%x:%x:%x dst=%x:%x:%x:%x:%x:%x:%x:%x ",
78 NIP6(*((struct in6_addr *)tuple->src.u3.ip6)),
79 NIP6(*((struct in6_addr *)tuple->dst.u3.ip6)));
80}
81
82static int ipv6_print_conntrack(struct seq_file *s,
83 const struct nf_conn *conntrack)
84{
85 return 0;
86}
87
88/*
89 * Based on ipv6_skip_exthdr() in net/ipv6/exthdrs.c
90 *
91 * This function parses a (possibly truncated) set of exthdrs of
92 * length "len" in "skb". "nexthdrp" initially points to the place
93 * where the type of the first header can be found.
94 *
95 * It skips all well-known exthdrs, and returns the offset of the start
96 * of the unparsable area, i.e. the first header with an unknown type.
97 * On success, *nexthdrp is updated with the type/protocol of this header.
98 *
99 * NOTES: - it may return an offset beyond the end of the packet,
100 * if the last recognized header is truncated in the middle.
101 * - if the packet is truncated, so that all parsed headers are skipped,
102 * it returns -1.
103 * - if the packet is fragmented, it returns the offset of the fragment header.
104 * - ESP is unparsable for now and is treated like a
105 * normal payload protocol.
106 * - Note also the special handling of the AUTH header. Thanks to IPsec wizards.
107 */
108
109int nf_ct_ipv6_skip_exthdr(struct sk_buff *skb, int start, u8 *nexthdrp,
110 int len)
111{
112 u8 nexthdr = *nexthdrp;
113
114 while (ipv6_ext_hdr(nexthdr)) {
115 struct ipv6_opt_hdr hdr;
116 int hdrlen;
117
118 if (len < (int)sizeof(struct ipv6_opt_hdr))
119 return -1;
120 if (nexthdr == NEXTHDR_NONE)
121 break;
122 if (nexthdr == NEXTHDR_FRAGMENT)
123 break;
124 if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
125 BUG();
126 if (nexthdr == NEXTHDR_AUTH)
127 hdrlen = (hdr.hdrlen+2)<<2;
128 else
129 hdrlen = ipv6_optlen(&hdr);
130
131 nexthdr = hdr.nexthdr;
132 len -= hdrlen;
133 start += hdrlen;
134 }
135
136 *nexthdrp = nexthdr;
137 return start;
138}
139
140static int
141ipv6_prepare(struct sk_buff **pskb, unsigned int hooknum, unsigned int *dataoff,
142 u_int8_t *protonum)
143{
144 unsigned int extoff;
145 unsigned char pnum;
146 int protoff;
147
148 extoff = (u8*)((*pskb)->nh.ipv6h + 1) - (*pskb)->data;
149 pnum = (*pskb)->nh.ipv6h->nexthdr;
150
151 protoff = nf_ct_ipv6_skip_exthdr(*pskb, extoff, &pnum,
152 (*pskb)->len - extoff);
153
154 /*
155 * (protoff == (*pskb)->len) means that the packet has no data
156 * except IPv6 & ext headers, but it's tracked anyway. - YK
157 */
158 if ((protoff < 0) || (protoff > (*pskb)->len)) {
159 DEBUGP("ip6_conntrack_core: can't find proto in pkt\n");
160 NF_CT_STAT_INC(error);
161 NF_CT_STAT_INC(invalid);
162 return -NF_ACCEPT;
163 }
164
165 *dataoff = protoff;
166 *protonum = pnum;
167 return NF_ACCEPT;
168}
169
170static u_int32_t ipv6_get_features(const struct nf_conntrack_tuple *tuple)
171{
172 return NF_CT_F_BASIC;
173}
174
175static unsigned int ipv6_confirm(unsigned int hooknum,
176 struct sk_buff **pskb,
177 const struct net_device *in,
178 const struct net_device *out,
179 int (*okfn)(struct sk_buff *))
180{
181 struct nf_conn *ct;
182 enum ip_conntrack_info ctinfo;
183
184 /* This is where we call the helper: as the packet goes out. */
185 ct = nf_ct_get(*pskb, &ctinfo);
186 if (ct && ct->helper) {
187 unsigned int ret; int protoff;
188 unsigned int extoff = (u8*)((*pskb)->nh.ipv6h + 1)
189 - (*pskb)->data;
190 unsigned char pnum = (*pskb)->nh.ipv6h->nexthdr;
191
192 protoff = nf_ct_ipv6_skip_exthdr(*pskb, extoff, &pnum,
193 (*pskb)->len - extoff);
194 if (protoff < 0 || protoff > (*pskb)->len ||
195 pnum == NEXTHDR_FRAGMENT) {
196 DEBUGP("proto header not found\n");
197 return NF_ACCEPT;
198 }
199
200 ret = ct->helper->help(pskb, protoff, ct, ctinfo);
201 if (ret != NF_ACCEPT)
202 return ret;
203 }
204
205 /* We've seen it coming out the other side: confirm it */
206
207 return nf_conntrack_confirm(pskb);
208}
209
210extern struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb);
211extern void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb,
212 struct net_device *in,
213 struct net_device *out,
214 int (*okfn)(struct sk_buff *));
215static unsigned int ipv6_defrag(unsigned int hooknum,
216 struct sk_buff **pskb,
217 const struct net_device *in,
218 const struct net_device *out,
219 int (*okfn)(struct sk_buff *))
220{
221 struct sk_buff *reasm;
222
223 /* Previously seen (loopback)? */
224 if ((*pskb)->nfct)
225 return NF_ACCEPT;
226
227 reasm = nf_ct_frag6_gather(*pskb);
228
229 /* queued */
230 if (reasm == NULL)
231 return NF_STOLEN;
232
233 /* error occurred or not fragmented */
234 if (reasm == *pskb)
235 return NF_ACCEPT;
236
237 nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in,
238 (struct net_device *)out, okfn);
239
240 return NF_STOLEN;
241}
242
243static unsigned int ipv6_conntrack_in(unsigned int hooknum,
244 struct sk_buff **pskb,
245 const struct net_device *in,
246 const struct net_device *out,
247 int (*okfn)(struct sk_buff *))
248{
249 struct sk_buff *reasm = (*pskb)->nfct_reasm;
250
251 /* This packet is fragmented and has reassembled packet. */
252 if (reasm) {
253 /* Reassembled packet isn't parsed yet ? */
254 if (!reasm->nfct) {
255 unsigned int ret;
256
257 ret = nf_conntrack_in(PF_INET6, hooknum, &reasm);
258 if (ret != NF_ACCEPT)
259 return ret;
260 }
261 nf_conntrack_get(reasm->nfct);
262 (*pskb)->nfct = reasm->nfct;
263 return NF_ACCEPT;
264 }
265
266 return nf_conntrack_in(PF_INET6, hooknum, pskb);
267}
268
269static unsigned int ipv6_conntrack_local(unsigned int hooknum,
270 struct sk_buff **pskb,
271 const struct net_device *in,
272 const struct net_device *out,
273 int (*okfn)(struct sk_buff *))
274{
275 /* root is playing with raw sockets. */
276 if ((*pskb)->len < sizeof(struct ipv6hdr)) {
277 if (net_ratelimit())
278 printk("ipv6_conntrack_local: packet too short\n");
279 return NF_ACCEPT;
280 }
281 return ipv6_conntrack_in(hooknum, pskb, in, out, okfn);
282}
283
284/* Connection tracking may drop packets, but never alters them, so
285 make it the first hook. */
286static struct nf_hook_ops ipv6_conntrack_defrag_ops = {
287 .hook = ipv6_defrag,
288 .owner = THIS_MODULE,
289 .pf = PF_INET6,
290 .hooknum = NF_IP6_PRE_ROUTING,
291 .priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
292};
293
294static struct nf_hook_ops ipv6_conntrack_in_ops = {
295 .hook = ipv6_conntrack_in,
296 .owner = THIS_MODULE,
297 .pf = PF_INET6,
298 .hooknum = NF_IP6_PRE_ROUTING,
299 .priority = NF_IP6_PRI_CONNTRACK,
300};
301
302static struct nf_hook_ops ipv6_conntrack_local_out_ops = {
303 .hook = ipv6_conntrack_local,
304 .owner = THIS_MODULE,
305 .pf = PF_INET6,
306 .hooknum = NF_IP6_LOCAL_OUT,
307 .priority = NF_IP6_PRI_CONNTRACK,
308};
309
310static struct nf_hook_ops ipv6_conntrack_defrag_local_out_ops = {
311 .hook = ipv6_defrag,
312 .owner = THIS_MODULE,
313 .pf = PF_INET6,
314 .hooknum = NF_IP6_LOCAL_OUT,
315 .priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
316};
317
318/* Refragmenter; last chance. */
319static struct nf_hook_ops ipv6_conntrack_out_ops = {
320 .hook = ipv6_confirm,
321 .owner = THIS_MODULE,
322 .pf = PF_INET6,
323 .hooknum = NF_IP6_POST_ROUTING,
324 .priority = NF_IP6_PRI_LAST,
325};
326
327static struct nf_hook_ops ipv6_conntrack_local_in_ops = {
328 .hook = ipv6_confirm,
329 .owner = THIS_MODULE,
330 .pf = PF_INET6,
331 .hooknum = NF_IP6_LOCAL_IN,
332 .priority = NF_IP6_PRI_LAST-1,
333};
334
335#ifdef CONFIG_SYSCTL
336
337/* From nf_conntrack_proto_icmpv6.c */
338extern unsigned long nf_ct_icmpv6_timeout;
339
340/* From nf_conntrack_frag6.c */
341extern unsigned long nf_ct_frag6_timeout;
342extern unsigned long nf_ct_frag6_low_thresh;
343extern unsigned long nf_ct_frag6_high_thresh;
344
345static struct ctl_table_header *nf_ct_ipv6_sysctl_header;
346
347static ctl_table nf_ct_sysctl_table[] = {
348 {
349 .ctl_name = NET_NF_CONNTRACK_ICMPV6_TIMEOUT,
350 .procname = "nf_conntrack_icmpv6_timeout",
351 .data = &nf_ct_icmpv6_timeout,
352 .maxlen = sizeof(unsigned int),
353 .mode = 0644,
354 .proc_handler = &proc_dointvec_jiffies,
355 },
356 {
357 .ctl_name = NET_NF_CONNTRACK_FRAG6_TIMEOUT,
358 .procname = "nf_conntrack_frag6_timeout",
359 .data = &nf_ct_frag6_timeout,
360 .maxlen = sizeof(unsigned int),
361 .mode = 0644,
362 .proc_handler = &proc_dointvec_jiffies,
363 },
364 {
365 .ctl_name = NET_NF_CONNTRACK_FRAG6_LOW_THRESH,
366 .procname = "nf_conntrack_frag6_low_thresh",
367 .data = &nf_ct_frag6_low_thresh,
368 .maxlen = sizeof(unsigned int),
369 .mode = 0644,
370 .proc_handler = &proc_dointvec_jiffies,
371 },
372 {
373 .ctl_name = NET_NF_CONNTRACK_FRAG6_HIGH_THRESH,
374 .procname = "nf_conntrack_frag6_high_thresh",
375 .data = &nf_ct_frag6_high_thresh,
376 .maxlen = sizeof(unsigned int),
377 .mode = 0644,
378 .proc_handler = &proc_dointvec_jiffies,
379 },
380 { .ctl_name = 0 }
381};
382
383static ctl_table nf_ct_netfilter_table[] = {
384 {
385 .ctl_name = NET_NETFILTER,
386 .procname = "netfilter",
387 .mode = 0555,
388 .child = nf_ct_sysctl_table,
389 },
390 { .ctl_name = 0 }
391};
392
393static ctl_table nf_ct_net_table[] = {
394 {
395 .ctl_name = CTL_NET,
396 .procname = "net",
397 .mode = 0555,
398 .child = nf_ct_netfilter_table,
399 },
400 { .ctl_name = 0 }
401};
402#endif
403
404struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = {
405 .l3proto = PF_INET6,
406 .name = "ipv6",
407 .pkt_to_tuple = ipv6_pkt_to_tuple,
408 .invert_tuple = ipv6_invert_tuple,
409 .print_tuple = ipv6_print_tuple,
410 .print_conntrack = ipv6_print_conntrack,
411 .prepare = ipv6_prepare,
412 .get_features = ipv6_get_features,
413 .me = THIS_MODULE,
414};
415
416extern struct nf_conntrack_protocol nf_conntrack_protocol_tcp6;
417extern struct nf_conntrack_protocol nf_conntrack_protocol_udp6;
418extern struct nf_conntrack_protocol nf_conntrack_protocol_icmpv6;
419extern int nf_ct_frag6_init(void);
420extern void nf_ct_frag6_cleanup(void);
421static int init_or_cleanup(int init)
422{
423 int ret = 0;
424
425 if (!init) goto cleanup;
426
427 ret = nf_ct_frag6_init();
428 if (ret < 0) {
429 printk("nf_conntrack_ipv6: can't initialize frag6.\n");
430 goto cleanup_nothing;
431 }
432 ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_tcp6);
433 if (ret < 0) {
434 printk("nf_conntrack_ipv6: can't register tcp.\n");
435 goto cleanup_frag6;
436 }
437
438 ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_udp6);
439 if (ret < 0) {
440 printk("nf_conntrack_ipv6: can't register udp.\n");
441 goto cleanup_tcp;
442 }
443
444 ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_icmpv6);
445 if (ret < 0) {
446 printk("nf_conntrack_ipv6: can't register icmpv6.\n");
447 goto cleanup_udp;
448 }
449
450 ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv6);
451 if (ret < 0) {
452 printk("nf_conntrack_ipv6: can't register ipv6\n");
453 goto cleanup_icmpv6;
454 }
455
456 ret = nf_register_hook(&ipv6_conntrack_defrag_ops);
457 if (ret < 0) {
458 printk("nf_conntrack_ipv6: can't register pre-routing defrag "
459 "hook.\n");
460 goto cleanup_ipv6;
461 }
462
463 ret = nf_register_hook(&ipv6_conntrack_defrag_local_out_ops);
464 if (ret < 0) {
465 printk("nf_conntrack_ipv6: can't register local_out defrag "
466 "hook.\n");
467 goto cleanup_defragops;
468 }
469
470 ret = nf_register_hook(&ipv6_conntrack_in_ops);
471 if (ret < 0) {
472 printk("nf_conntrack_ipv6: can't register pre-routing hook.\n");
473 goto cleanup_defraglocalops;
474 }
475
476 ret = nf_register_hook(&ipv6_conntrack_local_out_ops);
477 if (ret < 0) {
478 printk("nf_conntrack_ipv6: can't register local out hook.\n");
479 goto cleanup_inops;
480 }
481
482 ret = nf_register_hook(&ipv6_conntrack_out_ops);
483 if (ret < 0) {
484 printk("nf_conntrack_ipv6: can't register post-routing hook.\n");
485 goto cleanup_inandlocalops;
486 }
487
488 ret = nf_register_hook(&ipv6_conntrack_local_in_ops);
489 if (ret < 0) {
490 printk("nf_conntrack_ipv6: can't register local in hook.\n");
491 goto cleanup_inoutandlocalops;
492 }
493
494#ifdef CONFIG_SYSCTL
495 nf_ct_ipv6_sysctl_header = register_sysctl_table(nf_ct_net_table, 0);
496 if (nf_ct_ipv6_sysctl_header == NULL) {
497 printk("nf_conntrack: can't register to sysctl.\n");
498 ret = -ENOMEM;
499 goto cleanup_localinops;
500 }
501#endif
502 return ret;
503
504 cleanup:
505 synchronize_net();
506#ifdef CONFIG_SYSCTL
507 unregister_sysctl_table(nf_ct_ipv6_sysctl_header);
508 cleanup_localinops:
509#endif
510 nf_unregister_hook(&ipv6_conntrack_local_in_ops);
511 cleanup_inoutandlocalops:
512 nf_unregister_hook(&ipv6_conntrack_out_ops);
513 cleanup_inandlocalops:
514 nf_unregister_hook(&ipv6_conntrack_local_out_ops);
515 cleanup_inops:
516 nf_unregister_hook(&ipv6_conntrack_in_ops);
517 cleanup_defraglocalops:
518 nf_unregister_hook(&ipv6_conntrack_defrag_local_out_ops);
519 cleanup_defragops:
520 nf_unregister_hook(&ipv6_conntrack_defrag_ops);
521 cleanup_ipv6:
522 nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6);
523 cleanup_icmpv6:
524 nf_conntrack_protocol_unregister(&nf_conntrack_protocol_icmpv6);
525 cleanup_udp:
526 nf_conntrack_protocol_unregister(&nf_conntrack_protocol_udp6);
527 cleanup_tcp:
528 nf_conntrack_protocol_unregister(&nf_conntrack_protocol_tcp6);
529 cleanup_frag6:
530 nf_ct_frag6_cleanup();
531 cleanup_nothing:
532 return ret;
533}
534
535MODULE_LICENSE("GPL");
536MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI <yasuyuki.kozakai@toshiba.co.jp>");
537
538static int __init init(void)
539{
540 need_nf_conntrack();
541 return init_or_cleanup(1);
542}
543
544static void __exit fini(void)
545{
546 init_or_cleanup(0);
547}
548
549module_init(init);
550module_exit(fini);
551
552void need_ip6_conntrack(void)
553{
554}
555
556EXPORT_SYMBOL(need_ip6_conntrack);
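As the comment above nf_ct_ipv6_skip_exthdr() explains, each IPv6 extension header starts with a (nexthdr, hdrlen) pair; AH measures its length in 4-octet units plus two, the others in 8-octet units plus one. A hedged userspace sketch of that walk (it omits the NEXTHDR_NONE and NEXTHDR_FRAGMENT early exits the kernel version needs):

#include <stdint.h>
#include <stddef.h>

#define NEXTHDR_HOP	0
#define NEXTHDR_ROUTING	43
#define NEXTHDR_AUTH	51
#define NEXTHDR_DEST	60

static int is_ext_hdr(uint8_t nh)
{
	return nh == NEXTHDR_HOP || nh == NEXTHDR_ROUTING ||
	       nh == NEXTHDR_DEST || nh == NEXTHDR_AUTH;
}

/* Returns the offset of the upper-layer header, updating *nexthdr,
 * or -1 if the packet is truncated mid-walk. */
static int skip_exthdrs(const uint8_t *pkt, size_t len, size_t off,
			uint8_t *nexthdr)
{
	while (is_ext_hdr(*nexthdr)) {
		size_t hdrlen;

		if (off + 2 > len)
			return -1;
		hdrlen = (*nexthdr == NEXTHDR_AUTH)
			? ((size_t)pkt[off + 1] + 2) * 4
			: ((size_t)pkt[off + 1] + 1) * 8;
		*nexthdr = pkt[off];
		off += hdrlen;
	}
	return (int)off;
}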
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
new file mode 100644
index 000000000000..c0f1da5497a9
--- /dev/null
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -0,0 +1,272 @@
1/*
2 * Copyright (C)2003,2004 USAGI/WIDE Project
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * Author:
9 * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
10 *
11 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
12 * - ICMPv6 tracking support. Derived from the original ip_conntrack code
13 * net/ipv4/netfilter/ip_conntrack_proto_icmp.c which had the following
14 * copyright information:
15 * (C) 1999-2001 Paul `Rusty' Russell
16 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
17 */
18
19#include <linux/types.h>
20#include <linux/sched.h>
21#include <linux/timer.h>
22#include <linux/module.h>
23#include <linux/netfilter.h>
24#include <linux/in6.h>
25#include <linux/icmpv6.h>
26#include <linux/ipv6.h>
27#include <net/ipv6.h>
28#include <net/ip6_checksum.h>
29#include <linux/seq_file.h>
30#include <linux/netfilter_ipv6.h>
31#include <net/netfilter/nf_conntrack_tuple.h>
32#include <net/netfilter/nf_conntrack_protocol.h>
33#include <net/netfilter/nf_conntrack_core.h>
34#include <net/netfilter/ipv6/nf_conntrack_icmpv6.h>
35
36unsigned long nf_ct_icmpv6_timeout = 30*HZ;
37
38#if 0
39#define DEBUGP printk
40#else
41#define DEBUGP(format, args...)
42#endif
43
44static int icmpv6_pkt_to_tuple(const struct sk_buff *skb,
45 unsigned int dataoff,
46 struct nf_conntrack_tuple *tuple)
47{
48 struct icmp6hdr _hdr, *hp;
49
50 hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
51 if (hp == NULL)
52 return 0;
53 tuple->dst.u.icmp.type = hp->icmp6_type;
54 tuple->src.u.icmp.id = hp->icmp6_identifier;
55 tuple->dst.u.icmp.code = hp->icmp6_code;
56
57 return 1;
58}
59
60static int icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple,
61 const struct nf_conntrack_tuple *orig)
62{
63 /* Add 1; spaces filled with 0. */
64 static u_int8_t invmap[] = {
65 [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1,
66 [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1,
67 [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_QUERY + 1,
68 [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_REPLY + 1
69 };
70
71 __u8 type = orig->dst.u.icmp.type - 128;
72 if (type >= sizeof(invmap) || !invmap[type])
73 return 0;
74
75 tuple->src.u.icmp.id = orig->src.u.icmp.id;
76 tuple->dst.u.icmp.type = invmap[type] - 1;
77 tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
78 return 1;
79}
80
81/* Print out the per-protocol part of the tuple. */
82static int icmpv6_print_tuple(struct seq_file *s,
83 const struct nf_conntrack_tuple *tuple)
84{
85 return seq_printf(s, "type=%u code=%u id=%u ",
86 tuple->dst.u.icmp.type,
87 tuple->dst.u.icmp.code,
88 ntohs(tuple->src.u.icmp.id));
89}
90
91/* Print out the private part of the conntrack. */
92static int icmpv6_print_conntrack(struct seq_file *s,
93 const struct nf_conn *conntrack)
94{
95 return 0;
96}
97
98/* Returns verdict for packet, or -1 for invalid. */
99static int icmpv6_packet(struct nf_conn *ct,
100 const struct sk_buff *skb,
101 unsigned int dataoff,
102 enum ip_conntrack_info ctinfo,
103 int pf,
104 unsigned int hooknum)
105{
106 /* Try to delete connection immediately after all replies:
107 won't actually vanish as we still have skb, and del_timer
108 means this will only run once even if count hits zero twice
109 (theoretically possible with SMP) */
110 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
111 if (atomic_dec_and_test(&ct->proto.icmp.count)
112 && del_timer(&ct->timeout))
113 ct->timeout.function((unsigned long)ct);
114 } else {
115 atomic_inc(&ct->proto.icmp.count);
116 nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
117 nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmpv6_timeout);
118 }
119
120 return NF_ACCEPT;
121}
122
123/* Called when a new connection for this protocol found. */
124static int icmpv6_new(struct nf_conn *conntrack,
125 const struct sk_buff *skb,
126 unsigned int dataoff)
127{
128 static u_int8_t valid_new[] = {
129 [ICMPV6_ECHO_REQUEST - 128] = 1,
130 [ICMPV6_NI_QUERY - 128] = 1
131 };
132
133 if (conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128 >= sizeof(valid_new)
134 || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128]) {
135 /* Can't create a new ICMPv6 `conn' with this. */
136 DEBUGP("icmp: can't create new conn with type %u\n",
137 conntrack->tuplehash[0].tuple.dst.u.icmp.type);
138 NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple);
139 return 0;
140 }
141 atomic_set(&conntrack->proto.icmp.count, 0);
142 return 1;
143}
144
145extern int
146nf_ct_ipv6_skip_exthdr(struct sk_buff *skb, int start, u8 *nexthdrp, int len);
147extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6;
148static int
149icmpv6_error_message(struct sk_buff *skb,
150 unsigned int icmp6off,
151 enum ip_conntrack_info *ctinfo,
152 unsigned int hooknum)
153{
154 struct nf_conntrack_tuple intuple, origtuple;
155 struct nf_conntrack_tuple_hash *h;
156 struct icmp6hdr _hdr, *hp;
157 unsigned int inip6off;
158 struct nf_conntrack_protocol *inproto;
159 u_int8_t inprotonum;
160 unsigned int inprotoff;
161
162 NF_CT_ASSERT(skb->nfct == NULL);
163
164 hp = skb_header_pointer(skb, icmp6off, sizeof(_hdr), &_hdr);
165 if (hp == NULL) {
166 DEBUGP("icmpv6_error: Can't get ICMPv6 hdr.\n");
167 return -NF_ACCEPT;
168 }
169
170 inip6off = icmp6off + sizeof(_hdr);
171 if (skb_copy_bits(skb, inip6off+offsetof(struct ipv6hdr, nexthdr),
172 &inprotonum, sizeof(inprotonum)) != 0) {
173 DEBUGP("icmpv6_error: Can't get nexthdr in inner IPv6 header.\n");
174 return -NF_ACCEPT;
175 }
176 inprotoff = nf_ct_ipv6_skip_exthdr(skb,
177 inip6off + sizeof(struct ipv6hdr),
178 &inprotonum,
179 skb->len - inip6off
180 - sizeof(struct ipv6hdr));
181
182 if ((inprotoff < 0) || (inprotoff > skb->len) ||
183 (inprotonum == NEXTHDR_FRAGMENT)) {
184 DEBUGP("icmpv6_error: Can't get protocol header in ICMPv6 payload.\n");
185 return -NF_ACCEPT;
186 }
187
188 inproto = nf_ct_find_proto(PF_INET6, inprotonum);
189
190 /* Are they talking about one of our connections? */
191 if (!nf_ct_get_tuple(skb, inip6off, inprotoff, PF_INET6, inprotonum,
192 &origtuple, &nf_conntrack_l3proto_ipv6, inproto)) {
193 DEBUGP("icmpv6_error: Can't get tuple\n");
194 return -NF_ACCEPT;
195 }
196
197 /* Ordinarily, we'd expect the inverted tupleproto, but it's
198 been preserved inside the ICMP. */
199 if (!nf_ct_invert_tuple(&intuple, &origtuple,
200 &nf_conntrack_l3proto_ipv6, inproto)) {
201 DEBUGP("icmpv6_error: Can't invert tuple\n");
202 return -NF_ACCEPT;
203 }
204
205 *ctinfo = IP_CT_RELATED;
206
207 h = nf_conntrack_find_get(&intuple, NULL);
208 if (!h) {
209 DEBUGP("icmpv6_error: no match\n");
210 return -NF_ACCEPT;
211 } else {
212 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
213 *ctinfo += IP_CT_IS_REPLY;
214 }
215
216 /* Update skb to refer to this connection */
217 skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
218 skb->nfctinfo = *ctinfo;
219 return -NF_ACCEPT;
220}
221
222static int
223icmpv6_error(struct sk_buff *skb, unsigned int dataoff,
224 enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum)
225{
226 struct icmp6hdr _ih, *icmp6h;
227
228 icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih);
229 if (icmp6h == NULL) {
230 if (LOG_INVALID(IPPROTO_ICMPV6))
231 nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
232 "nf_ct_icmpv6: short packet ");
233 return -NF_ACCEPT;
234 }
235
236 if (hooknum != NF_IP6_PRE_ROUTING)
237 goto skipped;
238
239 /* Ignore it if the checksum's bogus. */
240 if (csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr,
241 skb->len - dataoff, IPPROTO_ICMPV6,
242 skb_checksum(skb, dataoff,
243 skb->len - dataoff, 0))) {
244 nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
245 "nf_ct_icmpv6: ICMPv6 checksum failed\n");
246 return -NF_ACCEPT;
247 }
248
249skipped:
250
251 /* not an error message? */
252 if (icmp6h->icmp6_type >= 128)
253 return NF_ACCEPT;
254
255 return icmpv6_error_message(skb, dataoff, ctinfo, hooknum);
256}
257
258struct nf_conntrack_protocol nf_conntrack_protocol_icmpv6 =
259{
260 .l3proto = PF_INET6,
261 .proto = IPPROTO_ICMPV6,
262 .name = "icmpv6",
263 .pkt_to_tuple = icmpv6_pkt_to_tuple,
264 .invert_tuple = icmpv6_invert_tuple,
265 .print_tuple = icmpv6_print_tuple,
266 .print_conntrack = icmpv6_print_conntrack,
267 .packet = icmpv6_packet,
268 .new = icmpv6_new,
269 .error = icmpv6_error,
270};
271
272EXPORT_SYMBOL(nf_conntrack_protocol_icmpv6);
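The invmap table in icmpv6_invert_tuple() packs two tricks worth a worked example: ICMPv6 informational types start at 128, so the array is indexed by type - 128, and each entry stores the inverse type plus one so that a zero entry can mean "no tracked inverse". A compilable sketch (type values hard-coded for illustration):

#include <stdio.h>
#include <stdint.h>

#define ECHO_REQUEST 128
#define ECHO_REPLY   129

static const uint8_t invmap[] = {
	[ECHO_REQUEST - 128] = ECHO_REPLY + 1,	/* +1 so 0 means "untracked" */
	[ECHO_REPLY - 128]   = ECHO_REQUEST + 1,
};

static int invert_type(uint8_t type)
{
	uint8_t idx = type - 128;	/* wraps for type < 128, caught below */

	if (idx >= sizeof(invmap) || !invmap[idx])
		return -1;		/* no tracked inverse */
	return invmap[idx] - 1;		/* undo the +1 sentinel */
}

int main(void)
{
	printf("%d\n", invert_type(ECHO_REQUEST));	/* prints 129 */
	return 0;
}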
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
new file mode 100644
index 000000000000..7640b9bb7694
--- /dev/null
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -0,0 +1,885 @@
1/*
2 * IPv6 fragment reassembly for connection tracking
3 *
4 * Copyright (C)2004 USAGI/WIDE Project
5 *
6 * Author:
7 * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
8 *
9 * Based on: net/ipv6/reassembly.c
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation; either version
14 * 2 of the License, or (at your option) any later version.
15 */
16
17#include <linux/config.h>
18#include <linux/errno.h>
19#include <linux/types.h>
20#include <linux/string.h>
21#include <linux/socket.h>
22#include <linux/sockios.h>
23#include <linux/jiffies.h>
24#include <linux/net.h>
25#include <linux/list.h>
26#include <linux/netdevice.h>
27#include <linux/in6.h>
28#include <linux/ipv6.h>
29#include <linux/icmpv6.h>
30#include <linux/random.h>
31#include <linux/jhash.h>
32
33#include <net/sock.h>
34#include <net/snmp.h>
35
36#include <net/ipv6.h>
37#include <net/protocol.h>
38#include <net/transp_v6.h>
39#include <net/rawv6.h>
40#include <net/ndisc.h>
41#include <net/addrconf.h>
42#include <linux/sysctl.h>
43#include <linux/netfilter.h>
44#include <linux/netfilter_ipv6.h>
45#include <linux/kernel.h>
46#include <linux/module.h>
47
48#if 0
49#define DEBUGP printk
50#else
51#define DEBUGP(format, args...)
52#endif
53
54#define NF_CT_FRAG6_HIGH_THRESH 262144 /* == 256*1024 */
55#define NF_CT_FRAG6_LOW_THRESH 196608 /* == 192*1024 */
56#define NF_CT_FRAG6_TIMEOUT IPV6_FRAG_TIMEOUT
57
58int nf_ct_frag6_high_thresh = 256*1024;
59int nf_ct_frag6_low_thresh = 192*1024;
60int nf_ct_frag6_timeout = IPV6_FRAG_TIMEOUT;
61
62struct nf_ct_frag6_skb_cb
63{
64 struct inet6_skb_parm h;
65 int offset;
66 struct sk_buff *orig;
67};
68
69#define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb*)((skb)->cb))
70
71struct nf_ct_frag6_queue
72{
73 struct nf_ct_frag6_queue *next;
74 struct list_head lru_list; /* lru list member */
75
76 __u32 id; /* fragment id */
77 struct in6_addr saddr;
78 struct in6_addr daddr;
79
80 spinlock_t lock;
81 atomic_t refcnt;
82 struct timer_list timer; /* expire timer */
83 struct sk_buff *fragments;
84 int len;
85 int meat;
86 struct timeval stamp;
87 unsigned int csum;
88 __u8 last_in; /* has first/last segment arrived? */
89#define COMPLETE 4
90#define FIRST_IN 2
91#define LAST_IN 1
92 __u16 nhoffset;
93 struct nf_ct_frag6_queue **pprev;
94};
95
96/* Hash table. */
97
98#define FRAG6Q_HASHSZ 64
99
100static struct nf_ct_frag6_queue *nf_ct_frag6_hash[FRAG6Q_HASHSZ];
101static rwlock_t nf_ct_frag6_lock = RW_LOCK_UNLOCKED;
102static u32 nf_ct_frag6_hash_rnd;
103static LIST_HEAD(nf_ct_frag6_lru_list);
104int nf_ct_frag6_nqueues = 0;
105
106static __inline__ void __fq_unlink(struct nf_ct_frag6_queue *fq)
107{
108 if (fq->next)
109 fq->next->pprev = fq->pprev;
110 *fq->pprev = fq->next;
111 list_del(&fq->lru_list);
112 nf_ct_frag6_nqueues--;
113}
114
115static __inline__ void fq_unlink(struct nf_ct_frag6_queue *fq)
116{
117 write_lock(&nf_ct_frag6_lock);
118 __fq_unlink(fq);
119 write_unlock(&nf_ct_frag6_lock);
120}
121
122static unsigned int ip6qhashfn(u32 id, struct in6_addr *saddr,
123 struct in6_addr *daddr)
124{
125 u32 a, b, c;
126
127 a = saddr->s6_addr32[0];
128 b = saddr->s6_addr32[1];
129 c = saddr->s6_addr32[2];
130
131 a += JHASH_GOLDEN_RATIO;
132 b += JHASH_GOLDEN_RATIO;
133 c += nf_ct_frag6_hash_rnd;
134 __jhash_mix(a, b, c);
135
136 a += saddr->s6_addr32[3];
137 b += daddr->s6_addr32[0];
138 c += daddr->s6_addr32[1];
139 __jhash_mix(a, b, c);
140
141 a += daddr->s6_addr32[2];
142 b += daddr->s6_addr32[3];
143 c += id;
144 __jhash_mix(a, b, c);
145
146 return c & (FRAG6Q_HASHSZ - 1);
147}
148
149static struct timer_list nf_ct_frag6_secret_timer;
150int nf_ct_frag6_secret_interval = 10 * 60 * HZ;
151
152static void nf_ct_frag6_secret_rebuild(unsigned long dummy)
153{
154 unsigned long now = jiffies;
155 int i;
156
157 write_lock(&nf_ct_frag6_lock);
158 get_random_bytes(&nf_ct_frag6_hash_rnd, sizeof(u32));
159 for (i = 0; i < FRAG6Q_HASHSZ; i++) {
160 struct nf_ct_frag6_queue *q;
161
162 q = nf_ct_frag6_hash[i];
163 while (q) {
164 struct nf_ct_frag6_queue *next = q->next;
165 unsigned int hval = ip6qhashfn(q->id,
166 &q->saddr,
167 &q->daddr);
168
169 if (hval != i) {
170 /* Unlink. */
171 if (q->next)
172 q->next->pprev = q->pprev;
173 *q->pprev = q->next;
174
175 /* Relink to new hash chain. */
176 if ((q->next = nf_ct_frag6_hash[hval]) != NULL)
177 q->next->pprev = &q->next;
178 nf_ct_frag6_hash[hval] = q;
179 q->pprev = &nf_ct_frag6_hash[hval];
180 }
181
182 q = next;
183 }
184 }
185 write_unlock(&nf_ct_frag6_lock);
186
187 mod_timer(&nf_ct_frag6_secret_timer, now + nf_ct_frag6_secret_interval);
188}
189
190atomic_t nf_ct_frag6_mem = ATOMIC_INIT(0);
191
192/* Memory Tracking Functions. */
193static inline void frag_kfree_skb(struct sk_buff *skb)
194{
195 atomic_sub(skb->truesize, &nf_ct_frag6_mem);
196 if (NFCT_FRAG6_CB(skb)->orig)
197 kfree_skb(NFCT_FRAG6_CB(skb)->orig);
198
199 kfree_skb(skb);
200}
201
202static inline void frag_free_queue(struct nf_ct_frag6_queue *fq)
203{
204 atomic_sub(sizeof(struct nf_ct_frag6_queue), &nf_ct_frag6_mem);
205 kfree(fq);
206}
207
208static inline struct nf_ct_frag6_queue *frag_alloc_queue(void)
209{
210 struct nf_ct_frag6_queue *fq = kmalloc(sizeof(struct nf_ct_frag6_queue), GFP_ATOMIC);
211
212 if (!fq)
213 return NULL;
214 atomic_add(sizeof(struct nf_ct_frag6_queue), &nf_ct_frag6_mem);
215 return fq;
216}
217
218/* Destruction primitives. */
219
220/* Complete destruction of fq. */
221static void nf_ct_frag6_destroy(struct nf_ct_frag6_queue *fq)
222{
223 struct sk_buff *fp;
224
225 BUG_TRAP(fq->last_in&COMPLETE);
226 BUG_TRAP(del_timer(&fq->timer) == 0);
227
228 /* Release all fragment data. */
229 fp = fq->fragments;
230 while (fp) {
231 struct sk_buff *xp = fp->next;
232
233 frag_kfree_skb(fp);
234 fp = xp;
235 }
236
237 frag_free_queue(fq);
238}
239
240static __inline__ void fq_put(struct nf_ct_frag6_queue *fq)
241{
242 if (atomic_dec_and_test(&fq->refcnt))
243 nf_ct_frag6_destroy(fq);
244}
245
246/* Kill fq entry. It is not destroyed immediately,
247 * because caller (and someone more) holds reference count.
248 */
249static __inline__ void fq_kill(struct nf_ct_frag6_queue *fq)
250{
251 if (del_timer(&fq->timer))
252 atomic_dec(&fq->refcnt);
253
254 if (!(fq->last_in & COMPLETE)) {
255 fq_unlink(fq);
256 atomic_dec(&fq->refcnt);
257 fq->last_in |= COMPLETE;
258 }
259}
260
261static void nf_ct_frag6_evictor(void)
262{
263 struct nf_ct_frag6_queue *fq;
264 struct list_head *tmp;
265
266 for (;;) {
267 if (atomic_read(&nf_ct_frag6_mem) <= nf_ct_frag6_low_thresh)
268 return;
269 read_lock(&nf_ct_frag6_lock);
270 if (list_empty(&nf_ct_frag6_lru_list)) {
271 read_unlock(&nf_ct_frag6_lock);
272 return;
273 }
274 tmp = nf_ct_frag6_lru_list.next;
275 fq = list_entry(tmp, struct nf_ct_frag6_queue, lru_list);
276 atomic_inc(&fq->refcnt);
277 read_unlock(&nf_ct_frag6_lock);
278
279 spin_lock(&fq->lock);
280 if (!(fq->last_in&COMPLETE))
281 fq_kill(fq);
282 spin_unlock(&fq->lock);
283
284 fq_put(fq);
285 }
286}
287
288static void nf_ct_frag6_expire(unsigned long data)
289{
290 struct nf_ct_frag6_queue *fq = (struct nf_ct_frag6_queue *) data;
291
292 spin_lock(&fq->lock);
293
294 if (fq->last_in & COMPLETE)
295 goto out;
296
297 fq_kill(fq);
298
299out:
300 spin_unlock(&fq->lock);
301 fq_put(fq);
302}
303
304/* Creation primitives. */
305
306
307static struct nf_ct_frag6_queue *nf_ct_frag6_intern(unsigned int hash,
308 struct nf_ct_frag6_queue *fq_in)
309{
310 struct nf_ct_frag6_queue *fq;
311
312 write_lock(&nf_ct_frag6_lock);
313#ifdef CONFIG_SMP
314 for (fq = nf_ct_frag6_hash[hash]; fq; fq = fq->next) {
315 if (fq->id == fq_in->id &&
316 !ipv6_addr_cmp(&fq_in->saddr, &fq->saddr) &&
317 !ipv6_addr_cmp(&fq_in->daddr, &fq->daddr)) {
318 atomic_inc(&fq->refcnt);
319 write_unlock(&nf_ct_frag6_lock);
320 fq_in->last_in |= COMPLETE;
321 fq_put(fq_in);
322 return fq;
323 }
324 }
325#endif
326 fq = fq_in;
327
328 if (!mod_timer(&fq->timer, jiffies + nf_ct_frag6_timeout))
329 atomic_inc(&fq->refcnt);
330
331 atomic_inc(&fq->refcnt);
332 if ((fq->next = nf_ct_frag6_hash[hash]) != NULL)
333 fq->next->pprev = &fq->next;
334 nf_ct_frag6_hash[hash] = fq;
335 fq->pprev = &nf_ct_frag6_hash[hash];
336 INIT_LIST_HEAD(&fq->lru_list);
337 list_add_tail(&fq->lru_list, &nf_ct_frag6_lru_list);
338 nf_ct_frag6_nqueues++;
339 write_unlock(&nf_ct_frag6_lock);
340 return fq;
341}
342
343
344static struct nf_ct_frag6_queue *
345nf_ct_frag6_create(unsigned int hash, u32 id, struct in6_addr *src, struct in6_addr *dst)
346{
347 struct nf_ct_frag6_queue *fq;
348
349 if ((fq = frag_alloc_queue()) == NULL) {
350 DEBUGP("Can't alloc new queue\n");
351 goto oom;
352 }
353
354 memset(fq, 0, sizeof(struct nf_ct_frag6_queue));
355
356 fq->id = id;
357 ipv6_addr_copy(&fq->saddr, src);
358 ipv6_addr_copy(&fq->daddr, dst);
359
360 init_timer(&fq->timer);
361 fq->timer.function = nf_ct_frag6_expire;
362 fq->timer.data = (long) fq;
363 fq->lock = SPIN_LOCK_UNLOCKED;
364 atomic_set(&fq->refcnt, 1);
365
366 return nf_ct_frag6_intern(hash, fq);
367
368oom:
369 return NULL;
370}
371
372static __inline__ struct nf_ct_frag6_queue *
373fq_find(u32 id, struct in6_addr *src, struct in6_addr *dst)
374{
375 struct nf_ct_frag6_queue *fq;
376 unsigned int hash = ip6qhashfn(id, src, dst);
377
378 read_lock(&nf_ct_frag6_lock);
379 for (fq = nf_ct_frag6_hash[hash]; fq; fq = fq->next) {
380 if (fq->id == id &&
381 !ipv6_addr_cmp(src, &fq->saddr) &&
382 !ipv6_addr_cmp(dst, &fq->daddr)) {
383 atomic_inc(&fq->refcnt);
384 read_unlock(&nf_ct_frag6_lock);
385 return fq;
386 }
387 }
388 read_unlock(&nf_ct_frag6_lock);
389
390 return nf_ct_frag6_create(hash, id, src, dst);
391}
392
393
394static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
395 struct frag_hdr *fhdr, int nhoff)
396{
397 struct sk_buff *prev, *next;
398 int offset, end;
399
400 if (fq->last_in & COMPLETE) {
401 DEBUGP("Already completed\n");
402 goto err;
403 }
404
405 offset = ntohs(fhdr->frag_off) & ~0x7;
406 end = offset + (ntohs(skb->nh.ipv6h->payload_len) -
407 ((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1)));
408
409 if ((unsigned int)end > IPV6_MAXPLEN) {
410 DEBUGP("offset is too large.\n");
411 return -1;
412 }
413
414 if (skb->ip_summed == CHECKSUM_HW)
415 skb->csum = csum_sub(skb->csum,
416 csum_partial(skb->nh.raw,
417 (u8*)(fhdr + 1) - skb->nh.raw,
418 0));
419
420 /* Is this the final fragment? */
421 if (!(fhdr->frag_off & htons(IP6_MF))) {
422 /* If we already have some bits beyond end
423 * or have different end, the segment is corrupted.
424 */
425 if (end < fq->len ||
426 ((fq->last_in & LAST_IN) && end != fq->len)) {
427 DEBUGP("already received last fragment\n");
428 goto err;
429 }
430 fq->last_in |= LAST_IN;
431 fq->len = end;
432 } else {
433 /* Check if the fragment is rounded to 8 bytes.
434 * Required by the RFC.
435 */
436 if (end & 0x7) {
437 /* RFC2460 says always send parameter problem in
438 * this case. -DaveM
439 */
440 DEBUGP("the end of this fragment is not rounded to 8 bytes.\n");
441 return -1;
442 }
443 if (end > fq->len) {
444 /* Some bits beyond end -> corruption. */
445 if (fq->last_in & LAST_IN) {
446 DEBUGP("last packet already reached.\n");
447 goto err;
448 }
449 fq->len = end;
450 }
451 }
452
453 if (end == offset)
454 goto err;
455
456 /* Point into the IP datagram 'data' part. */
457 if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) {
458 DEBUGP("queue: message is too short.\n");
459 goto err;
460 }
461 if (end-offset < skb->len) {
462 if (pskb_trim(skb, end - offset)) {
463 DEBUGP("Can't trim\n");
464 goto err;
465 }
466 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
467 skb->ip_summed = CHECKSUM_NONE;
468 }
469
470 /* Find out which fragments are in front and at the back of us
471 * in the chain of fragments so far. We must know where to put
472 * this fragment, right?
473 */
474 prev = NULL;
475 for (next = fq->fragments; next != NULL; next = next->next) {
476 if (NFCT_FRAG6_CB(next)->offset >= offset)
477 break; /* bingo! */
478 prev = next;
479 }
480
481 /* We found where to put this one. Check for overlap with
482 * preceding fragment, and, if needed, align things so that
483 * any overlaps are eliminated.
484 */
485 if (prev) {
486 int i = (NFCT_FRAG6_CB(prev)->offset + prev->len) - offset;
487
488 if (i > 0) {
489 offset += i;
490 if (end <= offset) {
491 DEBUGP("overlap\n");
492 goto err;
493 }
494 if (!pskb_pull(skb, i)) {
495 DEBUGP("Can't pull\n");
496 goto err;
497 }
498 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
499 skb->ip_summed = CHECKSUM_NONE;
500 }
501 }
502
503 /* Look for overlap with succeeding segments.
504 * If we can merge fragments, do it.
505 */
506 while (next && NFCT_FRAG6_CB(next)->offset < end) {
507 /* overlap is 'i' bytes */
508 int i = end - NFCT_FRAG6_CB(next)->offset;
509
510 if (i < next->len) {
511 /* Eat head of the next overlapped fragment
512 * and leave the loop. The next ones cannot overlap.
513 */
514 DEBUGP("Eat head of the overlapped parts: %d\n", i);
515 if (!pskb_pull(next, i))
516 goto err;
517
518 /* next fragment */
519 NFCT_FRAG6_CB(next)->offset += i;
520 fq->meat -= i;
521 if (next->ip_summed != CHECKSUM_UNNECESSARY)
522 next->ip_summed = CHECKSUM_NONE;
523 break;
524 } else {
525 struct sk_buff *free_it = next;
526
527 /* Old fragment is completely covered by the
528 * new one; drop it.
529 */
530 next = next->next;
531
532 if (prev)
533 prev->next = next;
534 else
535 fq->fragments = next;
536
537 fq->meat -= free_it->len;
538 frag_kfree_skb(free_it);
539 }
540 }
541
542 NFCT_FRAG6_CB(skb)->offset = offset;
543
544 /* Insert this fragment in the chain of fragments. */
545 skb->next = next;
546 if (prev)
547 prev->next = skb;
548 else
549 fq->fragments = skb;
550
551 skb->dev = NULL;
552 skb_get_timestamp(skb, &fq->stamp);
553 fq->meat += skb->len;
554 atomic_add(skb->truesize, &nf_ct_frag6_mem);
555
556 /* The first fragment.
557 * nhoffset is obtained from the first fragment, of course.
558 */
559 if (offset == 0) {
560 fq->nhoffset = nhoff;
561 fq->last_in |= FIRST_IN;
562 }
563 write_lock(&nf_ct_frag6_lock);
564 list_move_tail(&fq->lru_list, &nf_ct_frag6_lru_list);
565 write_unlock(&nf_ct_frag6_lock);
566 return 0;
567
568err:
569 return -1;
570}
571
572/*
573 * Check if this packet is complete.
574 * Returns NULL on failure for any reason, and the reassembled
575 * skb on success.
576 *
577 * It is called with the fq locked, and the caller must check that
578 * the queue is eligible for reassembly, i.e. it is not COMPLETE and
579 * the first and last fragments have arrived with all the bits in between.
580 */
581static struct sk_buff *
582nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
583{
584 struct sk_buff *fp, *op, *head = fq->fragments;
585 int payload_len;
586
587 fq_kill(fq);
588
589 BUG_TRAP(head != NULL);
590 BUG_TRAP(NFCT_FRAG6_CB(head)->offset == 0);
591
592 /* Unfragmented part is taken from the first segment. */
593 payload_len = (head->data - head->nh.raw) - sizeof(struct ipv6hdr) + fq->len - sizeof(struct frag_hdr);
594 if (payload_len > IPV6_MAXPLEN) {
595 DEBUGP("payload len is too large.\n");
596 goto out_oversize;
597 }
598
599 /* Head of list must not be cloned. */
600 if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) {
601 DEBUGP("skb is cloned but can't expand head");
602 goto out_oom;
603 }
604
605 /* If the first fragment is fragmented itself, we split
606 * it into two chunks: the first with the data and paged part
607 * and the second holding only the fragments. */
608 if (skb_shinfo(head)->frag_list) {
609 struct sk_buff *clone;
610 int i, plen = 0;
611
612 if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) {
613 DEBUGP("Can't alloc skb\n");
614 goto out_oom;
615 }
616 clone->next = head->next;
617 head->next = clone;
618 skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
619 skb_shinfo(head)->frag_list = NULL;
620 for (i=0; i<skb_shinfo(head)->nr_frags; i++)
621 plen += skb_shinfo(head)->frags[i].size;
622 clone->len = clone->data_len = head->data_len - plen;
623 head->data_len -= clone->len;
624 head->len -= clone->len;
625 clone->csum = 0;
626 clone->ip_summed = head->ip_summed;
627
628 NFCT_FRAG6_CB(clone)->orig = NULL;
629 atomic_add(clone->truesize, &nf_ct_frag6_mem);
630 }
631
632 /* We have to remove the fragment header from the datagram and
633 * relocate the header in order to calculate the ICV correctly. */
634 head->nh.raw[fq->nhoffset] = head->h.raw[0];
635 memmove(head->head + sizeof(struct frag_hdr), head->head,
636 (head->data - head->head) - sizeof(struct frag_hdr));
637 head->mac.raw += sizeof(struct frag_hdr);
638 head->nh.raw += sizeof(struct frag_hdr);
639
640 skb_shinfo(head)->frag_list = head->next;
641 head->h.raw = head->data;
642 skb_push(head, head->data - head->nh.raw);
643 atomic_sub(head->truesize, &nf_ct_frag6_mem);
644
645 for (fp=head->next; fp; fp = fp->next) {
646 head->data_len += fp->len;
647 head->len += fp->len;
648 if (head->ip_summed != fp->ip_summed)
649 head->ip_summed = CHECKSUM_NONE;
650 else if (head->ip_summed == CHECKSUM_HW)
651 head->csum = csum_add(head->csum, fp->csum);
652 head->truesize += fp->truesize;
653 atomic_sub(fp->truesize, &nf_ct_frag6_mem);
654 }
655
656 head->next = NULL;
657 head->dev = dev;
658 skb_set_timestamp(head, &fq->stamp);
659 head->nh.ipv6h->payload_len = htons(payload_len);
660
661 /* Yes, and fold redundant checksum back. 8) */
662 if (head->ip_summed == CHECKSUM_HW)
663 head->csum = csum_partial(head->nh.raw, head->h.raw-head->nh.raw, head->csum);
664
665 fq->fragments = NULL;
666
667 /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */
668 fp = skb_shinfo(head)->frag_list;
669 if (NFCT_FRAG6_CB(fp)->orig == NULL)
670 /* in the code above, the head skb was divided into two skbs. */
671 fp = fp->next;
672
673 op = NFCT_FRAG6_CB(head)->orig;
674 for (; fp; fp = fp->next) {
675 struct sk_buff *orig = NFCT_FRAG6_CB(fp)->orig;
676
677 op->next = orig;
678 op = orig;
679 NFCT_FRAG6_CB(fp)->orig = NULL;
680 }
681
682 return head;
683
684out_oversize:
685 if (net_ratelimit())
686 printk(KERN_DEBUG "nf_ct_frag6_reasm: payload len = %d\n", payload_len);
687 goto out_fail;
688out_oom:
689 if (net_ratelimit())
690 printk(KERN_DEBUG "nf_ct_frag6_reasm: no memory for reassembly\n");
691out_fail:
692 return NULL;
693}
694
695/*
696 * find the header just before Fragment Header.
697 *
698 * on success, return 0 and set ...
699 * (*prevhdrp): the value of "Next Header Field" in the header
700 * just before Fragment Header.
701 * (*prevhoff): the offset of "Next Header Field" in the header
702 * just before Fragment Header.
703 * (*fhoff) : the offset of Fragment Header.
704 *
705 * Based on ipv6_skip_exthdr() in net/ipv6/exthdrs.c
706 *
707 */
708static int
709find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
710{
711 u8 nexthdr = skb->nh.ipv6h->nexthdr;
712 u8 prev_nhoff = (u8 *)&skb->nh.ipv6h->nexthdr - skb->data;
713 int start = (u8 *)(skb->nh.ipv6h+1) - skb->data;
714 int len = skb->len - start;
715 u8 prevhdr = NEXTHDR_IPV6;
716
717 while (nexthdr != NEXTHDR_FRAGMENT) {
718 struct ipv6_opt_hdr hdr;
719 int hdrlen;
720
721 if (!ipv6_ext_hdr(nexthdr)) {
722 return -1;
723 }
724 if (len < (int)sizeof(struct ipv6_opt_hdr)) {
725 DEBUGP("too short\n");
726 return -1;
727 }
728 if (nexthdr == NEXTHDR_NONE) {
729 DEBUGP("next header is none\n");
730 return -1;
731 }
732 if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
733 BUG();
734 if (nexthdr == NEXTHDR_AUTH)
735 hdrlen = (hdr.hdrlen+2)<<2;
736 else
737 hdrlen = ipv6_optlen(&hdr);
738
739 prevhdr = nexthdr;
740 prev_nhoff = start;
741
742 nexthdr = hdr.nexthdr;
743 len -= hdrlen;
744 start += hdrlen;
745 }
746
747 if (len < 0)
748 return -1;
749
750 *prevhdrp = prevhdr;
751 *prevhoff = prev_nhoff;
752 *fhoff = start;
753
754 return 0;
755}
756
757struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb)
758{
759 struct sk_buff *clone;
760 struct net_device *dev = skb->dev;
761 struct frag_hdr *fhdr;
762 struct nf_ct_frag6_queue *fq;
763 struct ipv6hdr *hdr;
764 int fhoff, nhoff;
765 u8 prevhdr;
766 struct sk_buff *ret_skb = NULL;
767
768 /* Jumbo payload inhibits frag. header */
769 if (skb->nh.ipv6h->payload_len == 0) {
770 DEBUGP("payload len = 0\n");
771 return skb;
772 }
773
774 if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0)
775 return skb;
776
777 clone = skb_clone(skb, GFP_ATOMIC);
778 if (clone == NULL) {
779 DEBUGP("Can't clone skb\n");
780 return skb;
781 }
782
783 NFCT_FRAG6_CB(clone)->orig = skb;
784
785 if (!pskb_may_pull(clone, fhoff + sizeof(*fhdr))) {
786 DEBUGP("message is too short.\n");
787 goto ret_orig;
788 }
789
790 clone->h.raw = clone->data + fhoff;
791 hdr = clone->nh.ipv6h;
792 fhdr = (struct frag_hdr *)clone->h.raw;
793
794 if (!(fhdr->frag_off & htons(0xFFF9))) {
795 DEBUGP("Invalid fragment offset\n");
796 /* It is not a fragmented frame */
797 goto ret_orig;
798 }
799
800 if (atomic_read(&nf_ct_frag6_mem) > nf_ct_frag6_high_thresh)
801 nf_ct_frag6_evictor();
802
803 fq = fq_find(fhdr->identification, &hdr->saddr, &hdr->daddr);
804 if (fq == NULL) {
805 DEBUGP("Can't find and can't create new queue\n");
806 goto ret_orig;
807 }
808
809 spin_lock(&fq->lock);
810
811 if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) {
812 spin_unlock(&fq->lock);
813 DEBUGP("Can't insert skb to queue\n");
814 fq_put(fq);
815 goto ret_orig;
816 }
817
818 if (fq->last_in == (FIRST_IN|LAST_IN) && fq->meat == fq->len) {
819 ret_skb = nf_ct_frag6_reasm(fq, dev);
820 if (ret_skb == NULL)
821 DEBUGP("Can't reassemble fragmented packets\n");
822 }
823 spin_unlock(&fq->lock);
824
825 fq_put(fq);
826 return ret_skb;
827
828ret_orig:
829 kfree_skb(clone);
830 return skb;
831}
832
833void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb,
834 struct net_device *in, struct net_device *out,
835 int (*okfn)(struct sk_buff *))
836{
837 struct sk_buff *s, *s2;
838
839 for (s = NFCT_FRAG6_CB(skb)->orig; s;) {
840 nf_conntrack_put_reasm(s->nfct_reasm);
841 nf_conntrack_get_reasm(skb);
842 s->nfct_reasm = skb;
843
844 s2 = s->next;
845 NF_HOOK_THRESH(PF_INET6, hooknum, s, in, out, okfn,
846 NF_IP6_PRI_CONNTRACK_DEFRAG + 1);
847 s = s2;
848 }
849 nf_conntrack_put_reasm(skb);
850}
851
852int nf_ct_frag6_kfree_frags(struct sk_buff *skb)
853{
854 struct sk_buff *s, *s2;
855
856 for (s = NFCT_FRAG6_CB(skb)->orig; s; s = s2) {
857
858 s2 = s->next;
859 kfree_skb(s);
860 }
861
862 kfree_skb(skb);
863
864 return 0;
865}
866
867int nf_ct_frag6_init(void)
868{
869 nf_ct_frag6_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
870 (jiffies ^ (jiffies >> 6)));
871
872 init_timer(&nf_ct_frag6_secret_timer);
873 nf_ct_frag6_secret_timer.function = nf_ct_frag6_secret_rebuild;
874 nf_ct_frag6_secret_timer.expires = jiffies
875 + nf_ct_frag6_secret_interval;
876 add_timer(&nf_ct_frag6_secret_timer);
877
878 return 0;
879}
880
881void nf_ct_frag6_cleanup(void)
882{
883 del_timer(&nf_ct_frag6_secret_timer);
884 nf_ct_frag6_evictor();
885}
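nf_ct_frag6_queue() above decodes the fragment header with ntohs(fhdr->frag_off) & ~0x7. That works because the 13-bit fragment offset, expressed in 8-octet units, occupies bits 15..3 of the field: masking the low three bits of the host-order value yields the byte offset directly, and bit 0 is the IP6_MF "more fragments" flag tested a few lines later. A small sketch of the decoding (the field is assumed already converted to host order):

#include <stdint.h>

#define IP6_MF 0x0001	/* "more fragments" flag, bit 0 */

struct frag_info {
	unsigned int offset;	/* byte offset within the original datagram */
	int more;		/* nonzero if more fragments follow */
};

static struct frag_info decode_frag_off(uint16_t frag_off)
{
	struct frag_info fi;

	/* unit count sits in bits 15..3, so (raw >> 3) * 8 == raw & ~7 */
	fi.offset = frag_off & ~0x7u;
	fi.more = frag_off & IP6_MF;
	return fi;
}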
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index a1265a320b11..8e9628f1c4c5 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -174,8 +174,10 @@ int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
174 struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); 174 struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
175 175
176 /* Not releasing hash table! */ 176 /* Not releasing hash table! */
177 if (clone) 177 if (clone) {
178 nf_reset(clone);
178 rawv6_rcv(sk, clone); 179 rawv6_rcv(sk, clone);
180 }
179 } 181 }
180 sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr, 182 sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr,
181 IP6CB(skb)->iif); 183 IP6CB(skb)->iif);
@@ -296,13 +298,10 @@ void rawv6_err(struct sock *sk, struct sk_buff *skb,
296static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) 298static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb)
297{ 299{
298 if ((raw6_sk(sk)->checksum || sk->sk_filter) && 300 if ((raw6_sk(sk)->checksum || sk->sk_filter) &&
299 skb->ip_summed != CHECKSUM_UNNECESSARY) { 301 skb_checksum_complete(skb)) {
300 if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { 302 /* FIXME: increment a raw6 drops counter here */
301 /* FIXME: increment a raw6 drops counter here */ 303 kfree_skb(skb);
302 kfree_skb(skb); 304 return 0;
303 return 0;
304 }
305 skb->ip_summed = CHECKSUM_UNNECESSARY;
306 } 305 }
307 306
308 /* Charge it to the socket. */ 307 /* Charge it to the socket. */
@@ -335,32 +334,25 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
335 if (!rp->checksum) 334 if (!rp->checksum)
336 skb->ip_summed = CHECKSUM_UNNECESSARY; 335 skb->ip_summed = CHECKSUM_UNNECESSARY;
337 336
338 if (skb->ip_summed != CHECKSUM_UNNECESSARY) { 337 if (skb->ip_summed == CHECKSUM_HW) {
339 if (skb->ip_summed == CHECKSUM_HW) { 338 skb_postpull_rcsum(skb, skb->nh.raw,
340 skb_postpull_rcsum(skb, skb->nh.raw, 339 skb->h.raw - skb->nh.raw);
341 skb->h.raw - skb->nh.raw); 340 if (!csum_ipv6_magic(&skb->nh.ipv6h->saddr,
341 &skb->nh.ipv6h->daddr,
342 skb->len, inet->num, skb->csum))
342 skb->ip_summed = CHECKSUM_UNNECESSARY; 343 skb->ip_summed = CHECKSUM_UNNECESSARY;
343 if (csum_ipv6_magic(&skb->nh.ipv6h->saddr,
344 &skb->nh.ipv6h->daddr,
345 skb->len, inet->num, skb->csum)) {
346 LIMIT_NETDEBUG(KERN_DEBUG "raw v6 hw csum failure.\n");
347 skb->ip_summed = CHECKSUM_NONE;
348 }
349 }
350 if (skb->ip_summed == CHECKSUM_NONE)
351 skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr,
352 &skb->nh.ipv6h->daddr,
353 skb->len, inet->num, 0);
354 } 344 }
345 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
346 skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr,
347 &skb->nh.ipv6h->daddr,
348 skb->len, inet->num, 0);
355 349
356 if (inet->hdrincl) { 350 if (inet->hdrincl) {
357 if (skb->ip_summed != CHECKSUM_UNNECESSARY && 351 if (skb_checksum_complete(skb)) {
358 (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) {
359 /* FIXME: increment a raw6 drops counter here */ 352 /* FIXME: increment a raw6 drops counter here */
360 kfree_skb(skb); 353 kfree_skb(skb);
361 return 0; 354 return 0;
362 } 355 }
363 skb->ip_summed = CHECKSUM_UNNECESSARY;
364 } 356 }
365 357
366 rawv6_rcv_skb(sk, skb); 358 rawv6_rcv_skb(sk, skb);
@@ -405,7 +397,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk,
405 if (skb->ip_summed==CHECKSUM_UNNECESSARY) { 397 if (skb->ip_summed==CHECKSUM_UNNECESSARY) {
406 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); 398 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
407 } else if (msg->msg_flags&MSG_TRUNC) { 399 } else if (msg->msg_flags&MSG_TRUNC) {
408 if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) 400 if (__skb_checksum_complete(skb))
409 goto csum_copy_err; 401 goto csum_copy_err;
410 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); 402 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
411 } else { 403 } else {
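The hunks above replace the open-coded csum_fold(skb_checksum(...))
test with skb_checksum_complete()/__skb_checksum_complete(). As a rough
sketch (simplified; not the kernel's exact implementation), the helper
amounts to:

	/* Sketch: verify the full-packet checksum unless it is already
	 * known to be good; non-zero means the packet is corrupt. */
	static inline unsigned int sketch_checksum_complete(struct sk_buff *skb)
	{
		if (skb->ip_summed == CHECKSUM_UNNECESSARY)
			return 0;
		return (unsigned short)csum_fold(
				skb_checksum(skb, 0, skb->len, skb->csum));
	}

Centralizing the fold-and-compare idiom lets callers such as
rawv6_rcv_skb() drop the manual skb->ip_summed bookkeeping, as seen in
the deleted lines above.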
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 227e99ed510c..f7f42c3e96cb 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1710,7 +1710,7 @@ static void fib6_dump_end(struct netlink_callback *cb)
1710static int fib6_dump_done(struct netlink_callback *cb) 1710static int fib6_dump_done(struct netlink_callback *cb)
1711{ 1711{
1712 fib6_dump_end(cb); 1712 fib6_dump_end(cb);
1713 return cb->done(cb); 1713 return cb->done ? cb->done(cb) : 0;
1714} 1714}
1715 1715
1716int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) 1716int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d746d3b27efb..62c0e5bd931c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1401,20 +1401,18 @@ out:
1401static int tcp_v6_checksum_init(struct sk_buff *skb) 1401static int tcp_v6_checksum_init(struct sk_buff *skb)
1402{ 1402{
1403 if (skb->ip_summed == CHECKSUM_HW) { 1403 if (skb->ip_summed == CHECKSUM_HW) {
1404 skb->ip_summed = CHECKSUM_UNNECESSARY;
1405 if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, 1404 if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
1406 &skb->nh.ipv6h->daddr,skb->csum)) 1405 &skb->nh.ipv6h->daddr,skb->csum)) {
1406 skb->ip_summed = CHECKSUM_UNNECESSARY;
1407 return 0; 1407 return 0;
1408 LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v6 csum failed\n"); 1408 }
1409 } 1409 }
1410
1411 skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
1412 &skb->nh.ipv6h->daddr, 0);
1413
1410 if (skb->len <= 76) { 1414 if (skb->len <= 76) {
1411 if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, 1415 return __skb_checksum_complete(skb);
1412 &skb->nh.ipv6h->daddr,skb_checksum(skb, 0, skb->len, 0)))
1413 return -1;
1414 skb->ip_summed = CHECKSUM_UNNECESSARY;
1415 } else {
1416 skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
1417 &skb->nh.ipv6h->daddr,0);
1418 } 1416 }
1419 return 0; 1417 return 0;
1420} 1418}
@@ -1575,7 +1573,7 @@ static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
1575 goto discard_it; 1573 goto discard_it;
1576 1574
1577 if ((skb->ip_summed != CHECKSUM_UNNECESSARY && 1575 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1578 tcp_v6_checksum_init(skb) < 0)) 1576 tcp_v6_checksum_init(skb)))
1579 goto bad_packet; 1577 goto bad_packet;
1580 1578
1581 th = skb->h.th; 1579 th = skb->h.th;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index bf9519341fd3..e671153b47b2 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -248,7 +248,7 @@ try_again:
248 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, 248 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
249 copied); 249 copied);
250 } else if (msg->msg_flags&MSG_TRUNC) { 250 } else if (msg->msg_flags&MSG_TRUNC) {
251 if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) 251 if (__skb_checksum_complete(skb))
252 goto csum_copy_err; 252 goto csum_copy_err;
253 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, 253 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
254 copied); 254 copied);
@@ -363,13 +363,10 @@ static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
363 return -1; 363 return -1;
364 } 364 }
365 365
366 if (skb->ip_summed != CHECKSUM_UNNECESSARY) { 366 if (skb_checksum_complete(skb)) {
367 if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { 367 UDP6_INC_STATS_BH(UDP_MIB_INERRORS);
368 UDP6_INC_STATS_BH(UDP_MIB_INERRORS); 368 kfree_skb(skb);
369 kfree_skb(skb); 369 return 0;
370 return 0;
371 }
372 skb->ip_summed = CHECKSUM_UNNECESSARY;
373 } 370 }
374 371
375 if (sock_queue_rcv_skb(sk,skb)<0) { 372 if (sock_queue_rcv_skb(sk,skb)<0) {
@@ -491,13 +488,10 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
491 uh = skb->h.uh; 488 uh = skb->h.uh;
492 } 489 }
493 490
494 if (skb->ip_summed==CHECKSUM_HW) { 491 if (skb->ip_summed == CHECKSUM_HW &&
492 !csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum))
495 skb->ip_summed = CHECKSUM_UNNECESSARY; 493 skb->ip_summed = CHECKSUM_UNNECESSARY;
496 if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) { 494
497 LIMIT_NETDEBUG(KERN_DEBUG "udp v6 hw csum failure.\n");
498 skb->ip_summed = CHECKSUM_NONE;
499 }
500 }
501 if (skb->ip_summed != CHECKSUM_UNNECESSARY) 495 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
502 skb->csum = ~csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, 0); 496 skb->csum = ~csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, 0);
503 497
@@ -521,8 +515,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
521 if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) 515 if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
522 goto discard; 516 goto discard;
523 517
524 if (skb->ip_summed != CHECKSUM_UNNECESSARY && 518 if (skb_checksum_complete(skb))
525 (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)))
526 goto discard; 519 goto discard;
527 UDP6_INC_STATS_BH(UDP_MIB_NOPORTS); 520 UDP6_INC_STATS_BH(UDP_MIB_NOPORTS);
528 521
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 8296b38bf270..a84f9221e5f0 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -1,3 +1,6 @@
1menu "Core Netfilter Configuration"
2 depends on NET && NETFILTER
3
1config NETFILTER_NETLINK 4config NETFILTER_NETLINK
2 tristate "Netfilter netlink interface" 5 tristate "Netfilter netlink interface"
3 help 6 help
@@ -22,3 +25,74 @@ config NETFILTER_NETLINK_LOG
22 and is also scheduled to replace the old syslog-based ipt_LOG 25 and is also scheduled to replace the old syslog-based ipt_LOG
23 and ip6t_LOG modules. 26 and ip6t_LOG modules.
24 27
28config NF_CONNTRACK
29 tristate "Layer 3 Independent Connection tracking (EXPERIMENTAL)"
30 depends on EXPERIMENTAL && IP_NF_CONNTRACK=n
31 default n
32 ---help---
33	  Connection tracking keeps a record of what packets have passed
34	  through your machine, in order to figure out how they are
35	  grouped into connections.
36
37	  Layer 3 independent connection tracking is an experimental scheme
38	  which generalizes ip_conntrack to support other layer 3 protocols.
39
40 To compile it as a module, choose M here. If unsure, say N.
41
42config NF_CT_ACCT
43 bool "Connection tracking flow accounting"
44 depends on NF_CONNTRACK
45 help
46 If this option is enabled, the connection tracking code will
47 keep per-flow packet and byte counters.
48
49 Those counters can be used for flow-based accounting or the
50 `connbytes' match.
51
52 If unsure, say `N'.
53
54config NF_CONNTRACK_MARK
55 bool 'Connection mark tracking support'
56 depends on NF_CONNTRACK
57 help
58 This option enables support for connection marks, used by the
59 `CONNMARK' target and `connmark' match. Similar to the mark value
60 of packets, but this mark value is kept in the conntrack session
61 instead of the individual packets.
62
63config NF_CONNTRACK_EVENTS
64 bool "Connection tracking events"
65 depends on NF_CONNTRACK
66 help
67 If this option is enabled, the connection tracking code will
68 provide a notifier chain that can be used by other kernel code
69	  to get notified about changes in the connection tracking state.
70
71 If unsure, say `N'.
72
73config NF_CT_PROTO_SCTP
74	tristate 'SCTP protocol support on the new connection tracking (EXPERIMENTAL)'
75 depends on EXPERIMENTAL && NF_CONNTRACK
76 default n
77 help
78 With this option enabled, the layer 3 independent connection
79 tracking code will be able to do state tracking on SCTP connections.
80
81 If you want to compile it as a module, say M here and read
82 Documentation/modules.txt. If unsure, say `N'.
83
84config NF_CONNTRACK_FTP
85 tristate "FTP support on new connection tracking (EXPERIMENTAL)"
86 depends on EXPERIMENTAL && NF_CONNTRACK
87 help
88 Tracking FTP connections is problematic: special helpers are
89 required for tracking them, and doing masquerading and other forms
90 of Network Address Translation on them.
91
92 This is FTP support on Layer 3 independent connection tracking.
93	  Layer 3 independent connection tracking is an experimental scheme
94	  which generalizes ip_conntrack to support other layer 3 protocols.
95
96 To compile it as a module, choose M here. If unsure, say N.
97
98endmenu
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index b3b44f8b415a..55f019ad2c08 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -5,3 +5,11 @@ obj-$(CONFIG_NETFILTER) = netfilter.o
5obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o 5obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
6obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o 6obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
7obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o 7obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
8
9nf_conntrack-objs := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o
10
11obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o
12obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o
13
14# SCTP protocol connection tracking
15obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
new file mode 100644
index 000000000000..9a67c796b385
--- /dev/null
+++ b/net/netfilter/nf_conntrack_core.c
@@ -0,0 +1,1538 @@
1/* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
4
5/* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
7 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
14 * - new API and handling of conntrack/nat helpers
15 * - now capable of multiple expectations for one master
16 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
17 * - add usage/reference counts to ip_conntrack_expect
18 * - export ip_conntrack[_expect]_{find_get,put} functions
19 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
20 *	- generalize the L3 protocol dependent part.
21 * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
22 *	- add support for various sizes of conntrack structures.
23 *
24 * Derived from net/ipv4/netfilter/ip_conntrack_core.c
25 */
26
27#include <linux/config.h>
28#include <linux/types.h>
29#include <linux/netfilter.h>
30#include <linux/module.h>
31#include <linux/skbuff.h>
32#include <linux/proc_fs.h>
33#include <linux/vmalloc.h>
34#include <linux/stddef.h>
35#include <linux/slab.h>
36#include <linux/random.h>
37#include <linux/jhash.h>
38#include <linux/err.h>
39#include <linux/percpu.h>
40#include <linux/moduleparam.h>
41#include <linux/notifier.h>
42#include <linux/kernel.h>
43#include <linux/netdevice.h>
44#include <linux/socket.h>
45
46/* This rwlock protects the main hash table, protocol/helper/expected
47   registrations, conntrack timers */
48#define ASSERT_READ_LOCK(x)
49#define ASSERT_WRITE_LOCK(x)
50
51#include <net/netfilter/nf_conntrack.h>
52#include <net/netfilter/nf_conntrack_l3proto.h>
53#include <net/netfilter/nf_conntrack_protocol.h>
54#include <net/netfilter/nf_conntrack_helper.h>
55#include <net/netfilter/nf_conntrack_core.h>
56#include <linux/netfilter_ipv4/listhelp.h>
57
58#define NF_CONNTRACK_VERSION "0.4.1"
59
60#if 0
61#define DEBUGP printk
62#else
63#define DEBUGP(format, args...)
64#endif
65
66DEFINE_RWLOCK(nf_conntrack_lock);
67
68/* nf_conntrack_standalone needs this */
69atomic_t nf_conntrack_count = ATOMIC_INIT(0);
70
71void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL;
72LIST_HEAD(nf_conntrack_expect_list);
73struct nf_conntrack_protocol **nf_ct_protos[PF_MAX];
74struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX];
75static LIST_HEAD(helpers);
76unsigned int nf_conntrack_htable_size = 0;
77int nf_conntrack_max;
78struct list_head *nf_conntrack_hash;
79static kmem_cache_t *nf_conntrack_expect_cachep;
80struct nf_conn nf_conntrack_untracked;
81unsigned int nf_ct_log_invalid;
82static LIST_HEAD(unconfirmed);
83static int nf_conntrack_vmalloc;
84
85#ifdef CONFIG_NF_CONNTRACK_EVENTS
86struct notifier_block *nf_conntrack_chain;
87struct notifier_block *nf_conntrack_expect_chain;
88
89DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
90
91/* deliver cached events and clear cache entry - must be called with locally
92 * disabled softirqs */
93static inline void
94__nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
95{
96 DEBUGP("ecache: delivering events for %p\n", ecache->ct);
97 if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
98 && ecache->events)
99 notifier_call_chain(&nf_conntrack_chain, ecache->events,
100 ecache->ct);
101
102 ecache->events = 0;
103 nf_ct_put(ecache->ct);
104 ecache->ct = NULL;
105}
106
107/* Deliver all cached events for a particular conntrack. This is called
108 * by code prior to async packet handling for freeing the skb */
109void nf_ct_deliver_cached_events(const struct nf_conn *ct)
110{
111 struct nf_conntrack_ecache *ecache;
112
113 local_bh_disable();
114 ecache = &__get_cpu_var(nf_conntrack_ecache);
115 if (ecache->ct == ct)
116 __nf_ct_deliver_cached_events(ecache);
117 local_bh_enable();
118}
119
120/* Deliver cached events for old pending events, if current conntrack != old */
121void __nf_ct_event_cache_init(struct nf_conn *ct)
122{
123 struct nf_conntrack_ecache *ecache;
124
125 /* take care of delivering potentially old events */
126 ecache = &__get_cpu_var(nf_conntrack_ecache);
127 BUG_ON(ecache->ct == ct);
128 if (ecache->ct)
129 __nf_ct_deliver_cached_events(ecache);
130 /* initialize for this conntrack/packet */
131 ecache->ct = ct;
132 nf_conntrack_get(&ct->ct_general);
133}
134
135/* flush the event cache - touches other CPU's data and must not be called
136 * while packets are still passing through the code */
137static void nf_ct_event_cache_flush(void)
138{
139 struct nf_conntrack_ecache *ecache;
140 int cpu;
141
142 for_each_cpu(cpu) {
143 ecache = &per_cpu(nf_conntrack_ecache, cpu);
144 if (ecache->ct)
145 nf_ct_put(ecache->ct);
146 }
147}
148#else
149static inline void nf_ct_event_cache_flush(void) {}
150#endif /* CONFIG_NF_CONNTRACK_EVENTS */
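/*
 * Usage sketch (illustrative, not part of this file): callers record an
 * event against the current skb's conntrack and let the per-CPU cache
 * above deliver everything in one batch later.  nf_conntrack_event_cache()
 * is assumed to be the header helper that fills the per-CPU slot,
 * calling __nf_ct_event_cache_init() when the conntrack changes.
 */
static inline void example_cache_status_event(struct nf_conn *ct,
					      const struct sk_buff *skb)
{
	if (!test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
		nf_conntrack_event_cache(IPCT_STATUS, skb);
}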
151
152DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
153EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);
154
155/*
156 * This scheme offers various sizes of "struct nf_conn" depending on
157 * the features (helper, nat, ...)
158 */
159
160#define NF_CT_FEATURES_NAMELEN 256
161static struct {
162 /* name of slab cache. printed in /proc/slabinfo */
163 char *name;
164
165 /* size of slab cache */
166 size_t size;
167
168 /* slab cache pointer */
169 kmem_cache_t *cachep;
170
171	/* allocated slab cache + modules which use this slab cache */
172 int use;
173
174 /* Initialization */
175 int (*init_conntrack)(struct nf_conn *, u_int32_t);
176
177} nf_ct_cache[NF_CT_F_NUM];
178
179/* protect members of nf_ct_cache except of "use" */
180DEFINE_RWLOCK(nf_ct_cache_lock);
181
182/* This avoids calling kmem_cache_create() with the same name simultaneously */
183DECLARE_MUTEX(nf_ct_cache_mutex);
184
185extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
186struct nf_conntrack_protocol *
187nf_ct_find_proto(u_int16_t l3proto, u_int8_t protocol)
188{
189 if (unlikely(nf_ct_protos[l3proto] == NULL))
190 return &nf_conntrack_generic_protocol;
191
192 return nf_ct_protos[l3proto][protocol];
193}
194
195static int nf_conntrack_hash_rnd_initted;
196static unsigned int nf_conntrack_hash_rnd;
197
198static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
199 unsigned int size, unsigned int rnd)
200{
201 unsigned int a, b;
202 a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all),
203 ((tuple->src.l3num) << 16) | tuple->dst.protonum);
204 b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all),
205 (tuple->src.u.all << 16) | tuple->dst.u.all);
206
207 return jhash_2words(a, b, rnd) % size;
208}
209
210static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
211{
212 return __hash_conntrack(tuple, nf_conntrack_htable_size,
213 nf_conntrack_hash_rnd);
214}
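/*
 * Note on the hash: the two jhash passes fold the (up to 128-bit)
 * addresses together with l3num/protonum and the port pair into two
 * words, which jhash_2words() then mixes with the random seed.  Keeping
 * the seed unknown to remote hosts makes it harder to flood a single
 * hash chain with crafted tuples.
 */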
215
216/* Initialize "struct nf_conn" which has spaces for helper */
217static int
218init_conntrack_for_helper(struct nf_conn *conntrack, u_int32_t features)
219{
220
221 conntrack->help = (union nf_conntrack_help *)
222 (((unsigned long)conntrack->data
223 + (__alignof__(union nf_conntrack_help) - 1))
224 & (~((unsigned long)(__alignof__(union nf_conntrack_help) -1))));
225 return 0;
226}
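/*
 * The pointer arithmetic above is an open-coded ALIGN(): it rounds the
 * start of conntrack->data up to the alignment of union
 * nf_conntrack_help.  Assuming the ALIGN() macro from <linux/kernel.h>,
 * it is equivalent to:
 *
 *	conntrack->help = (union nf_conntrack_help *)
 *		ALIGN((unsigned long)conntrack->data,
 *		      __alignof__(union nf_conntrack_help));
 */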
227
228int nf_conntrack_register_cache(u_int32_t features, const char *name,
229 size_t size,
230 int (*init)(struct nf_conn *, u_int32_t))
231{
232 int ret = 0;
233 char *cache_name;
234 kmem_cache_t *cachep;
235
236 DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n",
237 features, name, size);
238
239 if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
240 DEBUGP("nf_conntrack_register_cache: invalid features.: 0x%x\n",
241 features);
242 return -EINVAL;
243 }
244
245 down(&nf_ct_cache_mutex);
246
247 write_lock_bh(&nf_ct_cache_lock);
248	/* e.g.: multiple helpers are loaded */
249	if (nf_ct_cache[features].use > 0) {
250		DEBUGP("nf_conntrack_register_cache: already registered.\n");
251 if ((!strncmp(nf_ct_cache[features].name, name,
252 NF_CT_FEATURES_NAMELEN))
253 && nf_ct_cache[features].size == size
254 && nf_ct_cache[features].init_conntrack == init) {
255 DEBUGP("nf_conntrack_register_cache: reusing.\n");
256 nf_ct_cache[features].use++;
257 ret = 0;
258 } else
259 ret = -EBUSY;
260
261 write_unlock_bh(&nf_ct_cache_lock);
262 up(&nf_ct_cache_mutex);
263 return ret;
264 }
265 write_unlock_bh(&nf_ct_cache_lock);
266
267 /*
268	 * The memory space for the name of the slab cache must remain
269	 * alive until the cache is destroyed.
270 */
271 cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC);
272 if (cache_name == NULL) {
273 DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n");
274 ret = -ENOMEM;
275 goto out_up_mutex;
276 }
277
278 if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN)
279 >= NF_CT_FEATURES_NAMELEN) {
280 printk("nf_conntrack_register_cache: name too long\n");
281 ret = -EINVAL;
282 goto out_free_name;
283 }
284
285 cachep = kmem_cache_create(cache_name, size, 0, 0,
286 NULL, NULL);
287 if (!cachep) {
288 printk("nf_conntrack_register_cache: Can't create slab cache "
289 "for the features = 0x%x\n", features);
290 ret = -ENOMEM;
291 goto out_free_name;
292 }
293
294 write_lock_bh(&nf_ct_cache_lock);
295 nf_ct_cache[features].use = 1;
296 nf_ct_cache[features].size = size;
297 nf_ct_cache[features].init_conntrack = init;
298 nf_ct_cache[features].cachep = cachep;
299 nf_ct_cache[features].name = cache_name;
300 write_unlock_bh(&nf_ct_cache_lock);
301
302 goto out_up_mutex;
303
304out_free_name:
305 kfree(cache_name);
306out_up_mutex:
307 up(&nf_ct_cache_mutex);
308 return ret;
309}
310
311/* FIXME: at present, only nf_conntrack_cleanup() can call this function. */
312void nf_conntrack_unregister_cache(u_int32_t features)
313{
314 kmem_cache_t *cachep;
315 char *name;
316
317 /*
318	 * This ensures that kmem_cache_create() isn't called before the
319	 * old slab cache has been destroyed.
320 */
321 DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
322 down(&nf_ct_cache_mutex);
323
324 write_lock_bh(&nf_ct_cache_lock);
325 if (--nf_ct_cache[features].use > 0) {
326 write_unlock_bh(&nf_ct_cache_lock);
327 up(&nf_ct_cache_mutex);
328 return;
329 }
330 cachep = nf_ct_cache[features].cachep;
331 name = nf_ct_cache[features].name;
332 nf_ct_cache[features].cachep = NULL;
333 nf_ct_cache[features].name = NULL;
334 nf_ct_cache[features].init_conntrack = NULL;
335 nf_ct_cache[features].size = 0;
336 write_unlock_bh(&nf_ct_cache_lock);
337
338 synchronize_net();
339
340 kmem_cache_destroy(cachep);
341 kfree(name);
342
343 up(&nf_ct_cache_mutex);
344}
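/*
 * Usage sketch: a feature module pairs the two calls above around its
 * lifetime.  The feature bit and sizes mirror what
 * nf_conntrack_helper_register() below actually requests; the module
 * wrapper itself is illustrative only.
 */
static int __init example_feature_init(void)
{
	return nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
					   sizeof(struct nf_conn)
					   + sizeof(union nf_conntrack_help)
					   + __alignof__(union nf_conntrack_help),
					   init_conntrack_for_helper);
}

static void __exit example_feature_exit(void)
{
	nf_conntrack_unregister_cache(NF_CT_F_HELP);
}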
345
346int
347nf_ct_get_tuple(const struct sk_buff *skb,
348 unsigned int nhoff,
349 unsigned int dataoff,
350 u_int16_t l3num,
351 u_int8_t protonum,
352 struct nf_conntrack_tuple *tuple,
353 const struct nf_conntrack_l3proto *l3proto,
354 const struct nf_conntrack_protocol *protocol)
355{
356 NF_CT_TUPLE_U_BLANK(tuple);
357
358 tuple->src.l3num = l3num;
359 if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
360 return 0;
361
362 tuple->dst.protonum = protonum;
363 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
364
365 return protocol->pkt_to_tuple(skb, dataoff, tuple);
366}
367
368int
369nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
370 const struct nf_conntrack_tuple *orig,
371 const struct nf_conntrack_l3proto *l3proto,
372 const struct nf_conntrack_protocol *protocol)
373{
374 NF_CT_TUPLE_U_BLANK(inverse);
375
376 inverse->src.l3num = orig->src.l3num;
377 if (l3proto->invert_tuple(inverse, orig) == 0)
378 return 0;
379
380 inverse->dst.dir = !orig->dst.dir;
381
382 inverse->dst.protonum = orig->dst.protonum;
383 return protocol->invert_tuple(inverse, orig);
384}
385
386/* nf_conntrack_expect helper functions */
387static void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
388{
389 ASSERT_WRITE_LOCK(&nf_conntrack_lock);
390	NF_CT_ASSERT(!timer_pending(&exp->timeout));
391 list_del(&exp->list);
392 NF_CT_STAT_INC(expect_delete);
393 exp->master->expecting--;
394 nf_conntrack_expect_put(exp);
395}
396
397static void expectation_timed_out(unsigned long ul_expect)
398{
399 struct nf_conntrack_expect *exp = (void *)ul_expect;
400
401 write_lock_bh(&nf_conntrack_lock);
402 nf_ct_unlink_expect(exp);
403 write_unlock_bh(&nf_conntrack_lock);
404 nf_conntrack_expect_put(exp);
405}
406
407/* If an expectation for this connection is found, it is deleted from
408 * the global list and returned. */
409static struct nf_conntrack_expect *
410find_expectation(const struct nf_conntrack_tuple *tuple)
411{
412 struct nf_conntrack_expect *i;
413
414 list_for_each_entry(i, &nf_conntrack_expect_list, list) {
415 /* If master is not in hash table yet (ie. packet hasn't left
416 this machine yet), how can other end know about expected?
417 Hence these are not the droids you are looking for (if
418 master ct never got confirmed, we'd hold a reference to it
419 and weird things would happen to future packets). */
420 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
421 && nf_ct_is_confirmed(i->master)) {
422 if (i->flags & NF_CT_EXPECT_PERMANENT) {
423 atomic_inc(&i->use);
424 return i;
425 } else if (del_timer(&i->timeout)) {
426 nf_ct_unlink_expect(i);
427 return i;
428 }
429 }
430 }
431 return NULL;
432}
433
434/* delete all expectations for this conntrack */
435static void remove_expectations(struct nf_conn *ct)
436{
437 struct nf_conntrack_expect *i, *tmp;
438
439	/* Optimization: most connections never expect any others. */
440 if (ct->expecting == 0)
441 return;
442
443 list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) {
444 if (i->master == ct && del_timer(&i->timeout)) {
445 nf_ct_unlink_expect(i);
446 nf_conntrack_expect_put(i);
447 }
448 }
449}
450
451static void
452clean_from_lists(struct nf_conn *ct)
453{
454 unsigned int ho, hr;
455
456 DEBUGP("clean_from_lists(%p)\n", ct);
457 ASSERT_WRITE_LOCK(&nf_conntrack_lock);
458
459 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
460 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
461 LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
462 LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
463
464 /* Destroy all pending expectations */
465 remove_expectations(ct);
466}
467
468static void
469destroy_conntrack(struct nf_conntrack *nfct)
470{
471 struct nf_conn *ct = (struct nf_conn *)nfct;
472 struct nf_conntrack_l3proto *l3proto;
473 struct nf_conntrack_protocol *proto;
474
475 DEBUGP("destroy_conntrack(%p)\n", ct);
476 NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
477 NF_CT_ASSERT(!timer_pending(&ct->timeout));
478
479 nf_conntrack_event(IPCT_DESTROY, ct);
480 set_bit(IPS_DYING_BIT, &ct->status);
481
482 /* To make sure we don't get any weird locking issues here:
483 * destroy_conntrack() MUST NOT be called with a write lock
484 * to nf_conntrack_lock!!! -HW */
485 l3proto = nf_ct_find_l3proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
486 if (l3proto && l3proto->destroy)
487 l3proto->destroy(ct);
488
489 proto = nf_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num,
490 ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
491 if (proto && proto->destroy)
492 proto->destroy(ct);
493
494 if (nf_conntrack_destroyed)
495 nf_conntrack_destroyed(ct);
496
497 write_lock_bh(&nf_conntrack_lock);
498 /* Expectations will have been removed in clean_from_lists,
499 * except TFTP can create an expectation on the first packet,
500 * before connection is in the list, so we need to clean here,
501 * too. */
502 remove_expectations(ct);
503
504 /* We overload first tuple to link into unconfirmed list. */
505 if (!nf_ct_is_confirmed(ct)) {
506 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
507 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
508 }
509
510 NF_CT_STAT_INC(delete);
511 write_unlock_bh(&nf_conntrack_lock);
512
513 if (ct->master)
514 nf_ct_put(ct->master);
515
516 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
517 nf_conntrack_free(ct);
518}
519
520static void death_by_timeout(unsigned long ul_conntrack)
521{
522 struct nf_conn *ct = (void *)ul_conntrack;
523
524 write_lock_bh(&nf_conntrack_lock);
525 /* Inside lock so preempt is disabled on module removal path.
526 * Otherwise we can get spurious warnings. */
527 NF_CT_STAT_INC(delete_list);
528 clean_from_lists(ct);
529 write_unlock_bh(&nf_conntrack_lock);
530 nf_ct_put(ct);
531}
532
533static inline int
534conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i,
535 const struct nf_conntrack_tuple *tuple,
536 const struct nf_conn *ignored_conntrack)
537{
538 ASSERT_READ_LOCK(&nf_conntrack_lock);
539 return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack
540 && nf_ct_tuple_equal(tuple, &i->tuple);
541}
542
543static struct nf_conntrack_tuple_hash *
544__nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
545 const struct nf_conn *ignored_conntrack)
546{
547 struct nf_conntrack_tuple_hash *h;
548 unsigned int hash = hash_conntrack(tuple);
549
550 ASSERT_READ_LOCK(&nf_conntrack_lock);
551 list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
552 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
553 NF_CT_STAT_INC(found);
554 return h;
555 }
556 NF_CT_STAT_INC(searched);
557 }
558
559 return NULL;
560}
561
562/* Find a connection corresponding to a tuple. */
563struct nf_conntrack_tuple_hash *
564nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
565 const struct nf_conn *ignored_conntrack)
566{
567 struct nf_conntrack_tuple_hash *h;
568
569 read_lock_bh(&nf_conntrack_lock);
570 h = __nf_conntrack_find(tuple, ignored_conntrack);
571 if (h)
572 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
573 read_unlock_bh(&nf_conntrack_lock);
574
575 return h;
576}
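/*
 * Lookup usage sketch (illustrative): nf_conntrack_find_get() returns
 * the tuplehash with an extra reference held on its conntrack, so a
 * caller that keeps the pointer must drop it with nf_ct_put() when done.
 */
static struct nf_conn *example_lookup(const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_tuple_hash *h;

	h = nf_conntrack_find_get(tuple, NULL);
	return h ? nf_ct_tuplehash_to_ctrack(h) : NULL;
}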
577
578/* Confirm a connection given skb; places it in hash table */
579int
580__nf_conntrack_confirm(struct sk_buff **pskb)
581{
582 unsigned int hash, repl_hash;
583 struct nf_conn *ct;
584 enum ip_conntrack_info ctinfo;
585
586 ct = nf_ct_get(*pskb, &ctinfo);
587
588 /* ipt_REJECT uses nf_conntrack_attach to attach related
589 ICMP/TCP RST packets in other direction. Actual packet
590 which created connection will be IP_CT_NEW or for an
591 expected connection, IP_CT_RELATED. */
592 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
593 return NF_ACCEPT;
594
595 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
596 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
597
598 /* We're not in hash table, and we refuse to set up related
599 connections for unconfirmed conns. But packet copies and
600 REJECT will give spurious warnings here. */
601 /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
602
603	/* No external references means no one else could have
604	   confirmed us. */
605 NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
606 DEBUGP("Confirming conntrack %p\n", ct);
607
608 write_lock_bh(&nf_conntrack_lock);
609
610 /* See if there's one in the list already, including reverse:
611 NAT could have grabbed it without realizing, since we're
612	   not in the hash. If there is, we lost the race. */
613 if (!LIST_FIND(&nf_conntrack_hash[hash],
614 conntrack_tuple_cmp,
615 struct nf_conntrack_tuple_hash *,
616 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
617 && !LIST_FIND(&nf_conntrack_hash[repl_hash],
618 conntrack_tuple_cmp,
619 struct nf_conntrack_tuple_hash *,
620 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
621 /* Remove from unconfirmed list */
622 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
623
624 list_prepend(&nf_conntrack_hash[hash],
625 &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
626 list_prepend(&nf_conntrack_hash[repl_hash],
627 &ct->tuplehash[IP_CT_DIR_REPLY]);
628 /* Timer relative to confirmation time, not original
629 setting time, otherwise we'd get timer wrap in
630 weird delay cases. */
631 ct->timeout.expires += jiffies;
632 add_timer(&ct->timeout);
633 atomic_inc(&ct->ct_general.use);
634 set_bit(IPS_CONFIRMED_BIT, &ct->status);
635 NF_CT_STAT_INC(insert);
636 write_unlock_bh(&nf_conntrack_lock);
637 if (ct->helper)
638 nf_conntrack_event_cache(IPCT_HELPER, *pskb);
639#ifdef CONFIG_NF_NAT_NEEDED
640 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
641 test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
642 nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
643#endif
644 nf_conntrack_event_cache(master_ct(ct) ?
645 IPCT_RELATED : IPCT_NEW, *pskb);
646 return NF_ACCEPT;
647 }
648
649 NF_CT_STAT_INC(insert_failed);
650 write_unlock_bh(&nf_conntrack_lock);
651 return NF_DROP;
652}
653
654/* Returns true if a connection corresponds to the tuple (required
655 for NAT). */
656int
657nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
658 const struct nf_conn *ignored_conntrack)
659{
660 struct nf_conntrack_tuple_hash *h;
661
662 read_lock_bh(&nf_conntrack_lock);
663 h = __nf_conntrack_find(tuple, ignored_conntrack);
664 read_unlock_bh(&nf_conntrack_lock);
665
666 return h != NULL;
667}
668
669/* There's a small race here where we may free a just-assured
670 connection. Too bad: we're in trouble anyway. */
671static inline int unreplied(const struct nf_conntrack_tuple_hash *i)
672{
673 return !(test_bit(IPS_ASSURED_BIT,
674 &nf_ct_tuplehash_to_ctrack(i)->status));
675}
676
677static int early_drop(struct list_head *chain)
678{
679 /* Traverse backwards: gives us oldest, which is roughly LRU */
680 struct nf_conntrack_tuple_hash *h;
681 struct nf_conn *ct = NULL;
682 int dropped = 0;
683
684 read_lock_bh(&nf_conntrack_lock);
685 h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *);
686 if (h) {
687 ct = nf_ct_tuplehash_to_ctrack(h);
688 atomic_inc(&ct->ct_general.use);
689 }
690 read_unlock_bh(&nf_conntrack_lock);
691
692 if (!ct)
693 return dropped;
694
695 if (del_timer(&ct->timeout)) {
696 death_by_timeout((unsigned long)ct);
697 dropped = 1;
698 NF_CT_STAT_INC(early_drop);
699 }
700 nf_ct_put(ct);
701 return dropped;
702}
703
704static inline int helper_cmp(const struct nf_conntrack_helper *i,
705 const struct nf_conntrack_tuple *rtuple)
706{
707 return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
708}
709
710static struct nf_conntrack_helper *
711nf_ct_find_helper(const struct nf_conntrack_tuple *tuple)
712{
713 return LIST_FIND(&helpers, helper_cmp,
714 struct nf_conntrack_helper *,
715 tuple);
716}
717
718static struct nf_conn *
719__nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
720 const struct nf_conntrack_tuple *repl,
721 const struct nf_conntrack_l3proto *l3proto)
722{
723 struct nf_conn *conntrack = NULL;
724 u_int32_t features = 0;
725
726 if (!nf_conntrack_hash_rnd_initted) {
727 get_random_bytes(&nf_conntrack_hash_rnd, 4);
728 nf_conntrack_hash_rnd_initted = 1;
729 }
730
731 if (nf_conntrack_max
732 && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) {
733 unsigned int hash = hash_conntrack(orig);
734 /* Try dropping from this hash chain. */
735 if (!early_drop(&nf_conntrack_hash[hash])) {
736 if (net_ratelimit())
737 printk(KERN_WARNING
738 "nf_conntrack: table full, dropping"
739 " packet.\n");
740 return ERR_PTR(-ENOMEM);
741 }
742 }
743
744 /* find features needed by this conntrack. */
745 features = l3proto->get_features(orig);
746 read_lock_bh(&nf_conntrack_lock);
747 if (nf_ct_find_helper(repl) != NULL)
748 features |= NF_CT_F_HELP;
749 read_unlock_bh(&nf_conntrack_lock);
750
751 DEBUGP("nf_conntrack_alloc: features=0x%x\n", features);
752
753 read_lock_bh(&nf_ct_cache_lock);
754
755 if (!nf_ct_cache[features].use) {
756 DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n",
757 features);
758 goto out;
759 }
760
761 conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC);
762 if (conntrack == NULL) {
763 DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n");
764 goto out;
765 }
766
767 memset(conntrack, 0, nf_ct_cache[features].size);
768 conntrack->features = features;
769 if (nf_ct_cache[features].init_conntrack &&
770 nf_ct_cache[features].init_conntrack(conntrack, features) < 0) {
771 DEBUGP("nf_conntrack_alloc: failed to init\n");
772 kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
773 conntrack = NULL;
774 goto out;
775 }
776
777 atomic_set(&conntrack->ct_general.use, 1);
778 conntrack->ct_general.destroy = destroy_conntrack;
779 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
780 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
781 /* Don't set timer yet: wait for confirmation */
782 init_timer(&conntrack->timeout);
783 conntrack->timeout.data = (unsigned long)conntrack;
784 conntrack->timeout.function = death_by_timeout;
785
786 atomic_inc(&nf_conntrack_count);
787out:
788 read_unlock_bh(&nf_ct_cache_lock);
789 return conntrack;
790}
791
792struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
793 const struct nf_conntrack_tuple *repl)
794{
795 struct nf_conntrack_l3proto *l3proto;
796
797 l3proto = nf_ct_find_l3proto(orig->src.l3num);
798 return __nf_conntrack_alloc(orig, repl, l3proto);
799}
800
801void nf_conntrack_free(struct nf_conn *conntrack)
802{
803 u_int32_t features = conntrack->features;
804 NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM);
805 DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features,
806 conntrack);
807 kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
808 atomic_dec(&nf_conntrack_count);
809}
810
811/* Allocate a new conntrack: we return -ENOMEM if classification
812 failed due to stress. Otherwise it really is unclassifiable. */
813static struct nf_conntrack_tuple_hash *
814init_conntrack(const struct nf_conntrack_tuple *tuple,
815 struct nf_conntrack_l3proto *l3proto,
816 struct nf_conntrack_protocol *protocol,
817 struct sk_buff *skb,
818 unsigned int dataoff)
819{
820 struct nf_conn *conntrack;
821 struct nf_conntrack_tuple repl_tuple;
822 struct nf_conntrack_expect *exp;
823
824 if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) {
825 DEBUGP("Can't invert tuple.\n");
826 return NULL;
827 }
828
829 conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto);
830 if (conntrack == NULL || IS_ERR(conntrack)) {
831 DEBUGP("Can't allocate conntrack.\n");
832 return (struct nf_conntrack_tuple_hash *)conntrack;
833 }
834
835 if (!protocol->new(conntrack, skb, dataoff)) {
836 nf_conntrack_free(conntrack);
837 DEBUGP("init conntrack: can't track with proto module\n");
838 return NULL;
839 }
840
841 write_lock_bh(&nf_conntrack_lock);
842 exp = find_expectation(tuple);
843
844 if (exp) {
845 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
846 conntrack, exp);
847 /* Welcome, Mr. Bond. We've been expecting you... */
848 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
849 conntrack->master = exp->master;
850#ifdef CONFIG_NF_CONNTRACK_MARK
851 conntrack->mark = exp->master->mark;
852#endif
853 nf_conntrack_get(&conntrack->master->ct_general);
854 NF_CT_STAT_INC(expect_new);
855 } else {
856 conntrack->helper = nf_ct_find_helper(&repl_tuple);
857
858 NF_CT_STAT_INC(new);
859 }
860
861 /* Overload tuple linked list to put us in unconfirmed list. */
862 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
863
864 write_unlock_bh(&nf_conntrack_lock);
865
866 if (exp) {
867 if (exp->expectfn)
868 exp->expectfn(conntrack, exp);
869 nf_conntrack_expect_put(exp);
870 }
871
872 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
873}
874
875/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
876static inline struct nf_conn *
877resolve_normal_ct(struct sk_buff *skb,
878 unsigned int dataoff,
879 u_int16_t l3num,
880 u_int8_t protonum,
881 struct nf_conntrack_l3proto *l3proto,
882 struct nf_conntrack_protocol *proto,
883 int *set_reply,
884 enum ip_conntrack_info *ctinfo)
885{
886 struct nf_conntrack_tuple tuple;
887 struct nf_conntrack_tuple_hash *h;
888 struct nf_conn *ct;
889
890 if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data),
891 dataoff, l3num, protonum, &tuple, l3proto,
892 proto)) {
893 DEBUGP("resolve_normal_ct: Can't get tuple\n");
894 return NULL;
895 }
896
897 /* look for tuple match */
898 h = nf_conntrack_find_get(&tuple, NULL);
899 if (!h) {
900 h = init_conntrack(&tuple, l3proto, proto, skb, dataoff);
901 if (!h)
902 return NULL;
903 if (IS_ERR(h))
904 return (void *)h;
905 }
906 ct = nf_ct_tuplehash_to_ctrack(h);
907
908 /* It exists; we have (non-exclusive) reference. */
909 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
910 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
911 /* Please set reply bit if this packet OK */
912 *set_reply = 1;
913 } else {
914 /* Once we've had two way comms, always ESTABLISHED. */
915 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
916 DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
917 *ctinfo = IP_CT_ESTABLISHED;
918 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
919 DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
920 *ctinfo = IP_CT_RELATED;
921 } else {
922 DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
923 *ctinfo = IP_CT_NEW;
924 }
925 *set_reply = 0;
926 }
927 skb->nfct = &ct->ct_general;
928 skb->nfctinfo = *ctinfo;
929 return ct;
930}
931
932unsigned int
933nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
934{
935 struct nf_conn *ct;
936 enum ip_conntrack_info ctinfo;
937 struct nf_conntrack_l3proto *l3proto;
938 struct nf_conntrack_protocol *proto;
939 unsigned int dataoff;
940 u_int8_t protonum;
941 int set_reply = 0;
942 int ret;
943
944 /* Previously seen (loopback or untracked)? Ignore. */
945 if ((*pskb)->nfct) {
946 NF_CT_STAT_INC(ignore);
947 return NF_ACCEPT;
948 }
949
950 l3proto = nf_ct_find_l3proto((u_int16_t)pf);
951 if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
952		DEBUGP("not prepared to track yet or error occurred\n");
953 return -ret;
954 }
955
956 proto = nf_ct_find_proto((u_int16_t)pf, protonum);
957
958	/* It may be a special packet, error, unclean...
959	 * the inverse of the return code tells the netfilter
960	 * core what to do with the packet. */
961 if (proto->error != NULL &&
962 (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
963 NF_CT_STAT_INC(error);
964 NF_CT_STAT_INC(invalid);
965 return -ret;
966 }
967
968 ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto,
969 &set_reply, &ctinfo);
970 if (!ct) {
971 /* Not valid part of a connection */
972 NF_CT_STAT_INC(invalid);
973 return NF_ACCEPT;
974 }
975
976 if (IS_ERR(ct)) {
977 /* Too stressed to deal. */
978 NF_CT_STAT_INC(drop);
979 return NF_DROP;
980 }
981
982 NF_CT_ASSERT((*pskb)->nfct);
983
984 ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum);
985 if (ret < 0) {
986 /* Invalid: inverse of the return code tells
987 * the netfilter core what to do */
988 DEBUGP("nf_conntrack_in: Can't track with proto module\n");
989 nf_conntrack_put((*pskb)->nfct);
990 (*pskb)->nfct = NULL;
991 NF_CT_STAT_INC(invalid);
992 return -ret;
993 }
994
995 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
996 nf_conntrack_event_cache(IPCT_STATUS, *pskb);
997
998 return ret;
999}
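/*
 * Illustrative wrapper (an assumption, not part of this file): the
 * per-family netfilter hooks added elsewhere in this patch funnel
 * packets into nf_conntrack_in() roughly like this.
 */
static unsigned int example_conntrack_in_hook(unsigned int hooknum,
					      struct sk_buff **pskb,
					      const struct net_device *in,
					      const struct net_device *out,
					      int (*okfn)(struct sk_buff *))
{
	return nf_conntrack_in(PF_INET6, hooknum, pskb);
}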
1000
1001int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1002 const struct nf_conntrack_tuple *orig)
1003{
1004 return nf_ct_invert_tuple(inverse, orig,
1005 nf_ct_find_l3proto(orig->src.l3num),
1006 nf_ct_find_proto(orig->src.l3num,
1007 orig->dst.protonum));
1008}
1009
1010/* Would two expected things clash? */
1011static inline int expect_clash(const struct nf_conntrack_expect *a,
1012 const struct nf_conntrack_expect *b)
1013{
1014 /* Part covered by intersection of masks must be unequal,
1015 otherwise they clash */
1016 struct nf_conntrack_tuple intersect_mask;
1017 int count;
1018
1019 intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num;
1020 intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
1021 intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all;
1022 intersect_mask.dst.protonum = a->mask.dst.protonum
1023 & b->mask.dst.protonum;
1024
1025 for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1026 intersect_mask.src.u3.all[count] =
1027 a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
1028 }
1029
1030 for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1031 intersect_mask.dst.u3.all[count] =
1032 a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count];
1033 }
1034
1035 return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
1036}
1037
1038static inline int expect_matches(const struct nf_conntrack_expect *a,
1039 const struct nf_conntrack_expect *b)
1040{
1041 return a->master == b->master
1042 && nf_ct_tuple_equal(&a->tuple, &b->tuple)
1043 && nf_ct_tuple_equal(&a->mask, &b->mask);
1044}
1045
1046/* Generally a bad idea to call this: could have matched already. */
1047void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp)
1048{
1049 struct nf_conntrack_expect *i;
1050
1051 write_lock_bh(&nf_conntrack_lock);
1052	/* choose the oldest expectation to evict */
1053 list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1054 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
1055 nf_ct_unlink_expect(i);
1056 write_unlock_bh(&nf_conntrack_lock);
1057 nf_conntrack_expect_put(i);
1058 return;
1059 }
1060 }
1061 write_unlock_bh(&nf_conntrack_lock);
1062}
1063
1064/* We don't increase the master conntrack refcount for non-fulfilled
1065 * expectations. During conntrack destruction, the expectations are
1066 * always killed before the conntrack itself. */
1067struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me)
1068{
1069 struct nf_conntrack_expect *new;
1070
1071 new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC);
1072 if (!new) {
1073 DEBUGP("expect_related: OOM allocating expect\n");
1074 return NULL;
1075 }
1076 new->master = me;
1077 atomic_set(&new->use, 1);
1078 return new;
1079}
1080
1081void nf_conntrack_expect_put(struct nf_conntrack_expect *exp)
1082{
1083 if (atomic_dec_and_test(&exp->use))
1084 kmem_cache_free(nf_conntrack_expect_cachep, exp);
1085}
1086
1087static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp)
1088{
1089 atomic_inc(&exp->use);
1090 exp->master->expecting++;
1091 list_add(&exp->list, &nf_conntrack_expect_list);
1092
1093 init_timer(&exp->timeout);
1094 exp->timeout.data = (unsigned long)exp;
1095 exp->timeout.function = expectation_timed_out;
1096 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
1097 add_timer(&exp->timeout);
1098
1099 atomic_inc(&exp->use);
1100 NF_CT_STAT_INC(expect_create);
1101}
1102
1103/* Race with expectations being used means we could have none to find; OK. */
1104static void evict_oldest_expect(struct nf_conn *master)
1105{
1106 struct nf_conntrack_expect *i;
1107
1108 list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1109 if (i->master == master) {
1110 if (del_timer(&i->timeout)) {
1111 nf_ct_unlink_expect(i);
1112 nf_conntrack_expect_put(i);
1113 }
1114 break;
1115 }
1116 }
1117}
1118
1119static inline int refresh_timer(struct nf_conntrack_expect *i)
1120{
1121 if (!del_timer(&i->timeout))
1122 return 0;
1123
1124 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
1125 add_timer(&i->timeout);
1126 return 1;
1127}
1128
1129int nf_conntrack_expect_related(struct nf_conntrack_expect *expect)
1130{
1131 struct nf_conntrack_expect *i;
1132 int ret;
1133
1134	DEBUGP("nf_conntrack_expect_related %p\n", expect);
1135 DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple);
1136 DEBUGP("mask: "); NF_CT_DUMP_TUPLE(&expect->mask);
1137
1138 write_lock_bh(&nf_conntrack_lock);
1139 list_for_each_entry(i, &nf_conntrack_expect_list, list) {
1140 if (expect_matches(i, expect)) {
1141 /* Refresh timer: if it's dying, ignore.. */
1142 if (refresh_timer(i)) {
1143 ret = 0;
1144 goto out;
1145 }
1146 } else if (expect_clash(i, expect)) {
1147 ret = -EBUSY;
1148 goto out;
1149 }
1150 }
1151 /* Will be over limit? */
1152 if (expect->master->helper->max_expected &&
1153 expect->master->expecting >= expect->master->helper->max_expected)
1154 evict_oldest_expect(expect->master);
1155
1156 nf_conntrack_expect_insert(expect);
1157 nf_conntrack_expect_event(IPEXP_NEW, expect);
1158 ret = 0;
1159out:
1160 write_unlock_bh(&nf_conntrack_lock);
1161 return ret;
1162}
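/*
 * Usage sketch (simplified from the FTP helper in this patch): a helper
 * allocates an expectation, fills in tuple and mask, registers it, and
 * drops its own reference.  example_expect() is illustrative only.
 */
static int example_expect(struct nf_conn *ct,
			  const struct nf_conntrack_tuple *tuple,
			  const struct nf_conntrack_tuple *mask)
{
	struct nf_conntrack_expect *exp;
	int ret;

	exp = nf_conntrack_expect_alloc(ct);
	if (exp == NULL)
		return -ENOMEM;
	exp->tuple = *tuple;
	exp->mask = *mask;
	exp->expectfn = NULL;
	exp->flags = 0;
	ret = nf_conntrack_expect_related(exp);
	nf_conntrack_expect_put(exp);	/* drop the allocation reference */
	return ret;
}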
1163
1164/* Alter reply tuple (maybe alter helper). This is for NAT, and is
1165 implicitly racy: see __nf_conntrack_confirm */
1166void nf_conntrack_alter_reply(struct nf_conn *conntrack,
1167 const struct nf_conntrack_tuple *newreply)
1168{
1169 write_lock_bh(&nf_conntrack_lock);
1170 /* Should be unconfirmed, so not in hash table yet */
1171 NF_CT_ASSERT(!nf_ct_is_confirmed(conntrack));
1172
1173 DEBUGP("Altering reply tuple of %p to ", conntrack);
1174 NF_CT_DUMP_TUPLE(newreply);
1175
1176 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1177 if (!conntrack->master && conntrack->expecting == 0)
1178 conntrack->helper = nf_ct_find_helper(newreply);
1179 write_unlock_bh(&nf_conntrack_lock);
1180}
1181
1182int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
1183{
1184 int ret;
1185 BUG_ON(me->timeout == 0);
1186
1187 ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
1188 sizeof(struct nf_conn)
1189 + sizeof(union nf_conntrack_help)
1190 + __alignof__(union nf_conntrack_help),
1191 init_conntrack_for_helper);
1192 if (ret < 0) {
1193		printk(KERN_ERR "nf_conntrack_helper_register: Unable to create slab cache for conntracks\n");
1194 return ret;
1195 }
1196 write_lock_bh(&nf_conntrack_lock);
1197 list_prepend(&helpers, me);
1198 write_unlock_bh(&nf_conntrack_lock);
1199
1200 return 0;
1201}
1202
1203static inline int unhelp(struct nf_conntrack_tuple_hash *i,
1204 const struct nf_conntrack_helper *me)
1205{
1206 if (nf_ct_tuplehash_to_ctrack(i)->helper == me) {
1207 nf_conntrack_event(IPCT_HELPER, nf_ct_tuplehash_to_ctrack(i));
1208 nf_ct_tuplehash_to_ctrack(i)->helper = NULL;
1209 }
1210 return 0;
1211}
1212
1213void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
1214{
1215 unsigned int i;
1216 struct nf_conntrack_expect *exp, *tmp;
1217
1218 /* Need write lock here, to delete helper. */
1219 write_lock_bh(&nf_conntrack_lock);
1220 LIST_DELETE(&helpers, me);
1221
1222 /* Get rid of expectations */
1223 list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) {
1224 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1225 nf_ct_unlink_expect(exp);
1226 nf_conntrack_expect_put(exp);
1227 }
1228 }
1229
1230 /* Get rid of expecteds, set helpers to NULL. */
1231 LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me);
1232 for (i = 0; i < nf_conntrack_htable_size; i++)
1233 LIST_FIND_W(&nf_conntrack_hash[i], unhelp,
1234 struct nf_conntrack_tuple_hash *, me);
1235 write_unlock_bh(&nf_conntrack_lock);
1236
1237	/* Someone could still be looking at the helper in a bh. */
1238 synchronize_net();
1239}
1240
1241/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1242void __nf_ct_refresh_acct(struct nf_conn *ct,
1243 enum ip_conntrack_info ctinfo,
1244 const struct sk_buff *skb,
1245 unsigned long extra_jiffies,
1246 int do_acct)
1247{
1248 int event = 0;
1249
1250 NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
1251 NF_CT_ASSERT(skb);
1252
1253 write_lock_bh(&nf_conntrack_lock);
1254
1255 /* If not in hash table, timer will not be active yet */
1256 if (!nf_ct_is_confirmed(ct)) {
1257 ct->timeout.expires = extra_jiffies;
1258 event = IPCT_REFRESH;
1259 } else {
1260 /* Need del_timer for race avoidance (may already be dying). */
1261 if (del_timer(&ct->timeout)) {
1262 ct->timeout.expires = jiffies + extra_jiffies;
1263 add_timer(&ct->timeout);
1264 event = IPCT_REFRESH;
1265 }
1266 }
1267
1268#ifdef CONFIG_NF_CT_ACCT
1269 if (do_acct) {
1270 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1271 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1272 skb->len - (unsigned int)(skb->nh.raw - skb->data);
1273 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1274 || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1275 event |= IPCT_COUNTER_FILLING;
1276 }
1277#endif
1278
1279 write_unlock_bh(&nf_conntrack_lock);
1280
1281 /* must be unlocked when calling event cache */
1282 if (event)
1283 nf_conntrack_event_cache(event, skb);
1284}
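/*
 * Usage sketch: protocol trackers refresh the timeout on every valid
 * packet through a thin wrapper; nf_ct_refresh_acct() is assumed to
 * expand to __nf_ct_refresh_acct(..., do_acct = 1).  The 30*HZ timeout
 * is an arbitrary example value.
 */
static inline void example_refresh(struct nf_conn *ct,
				   enum ip_conntrack_info ctinfo,
				   const struct sk_buff *skb)
{
	nf_ct_refresh_acct(ct, ctinfo, skb, 30*HZ);
}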
1285
1286/* Used by ipt_REJECT and ip6t_REJECT. */
1287void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1288{
1289 struct nf_conn *ct;
1290 enum ip_conntrack_info ctinfo;
1291
1292 /* This ICMP is in reverse direction to the packet which caused it */
1293 ct = nf_ct_get(skb, &ctinfo);
1294 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1295 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1296 else
1297 ctinfo = IP_CT_RELATED;
1298
1299 /* Attach to new skbuff, and increment count */
1300 nskb->nfct = &ct->ct_general;
1301 nskb->nfctinfo = ctinfo;
1302 nf_conntrack_get(nskb->nfct);
1303}
1304
1305static inline int
1306do_iter(const struct nf_conntrack_tuple_hash *i,
1307 int (*iter)(struct nf_conn *i, void *data),
1308 void *data)
1309{
1310 return iter(nf_ct_tuplehash_to_ctrack(i), data);
1311}
1312
1313/* Bring out ya dead! */
1314static struct nf_conntrack_tuple_hash *
1315get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
1316 void *data, unsigned int *bucket)
1317{
1318 struct nf_conntrack_tuple_hash *h = NULL;
1319
1320 write_lock_bh(&nf_conntrack_lock);
1321 for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
1322 h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter,
1323 struct nf_conntrack_tuple_hash *, iter, data);
1324 if (h)
1325 break;
1326 }
1327 if (!h)
1328 h = LIST_FIND_W(&unconfirmed, do_iter,
1329 struct nf_conntrack_tuple_hash *, iter, data);
1330 if (h)
1331 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
1332 write_unlock_bh(&nf_conntrack_lock);
1333
1334 return h;
1335}
1336
1337void
1338nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
1339{
1340 struct nf_conntrack_tuple_hash *h;
1341 unsigned int bucket = 0;
1342
1343 while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1344 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
1345		/* Time to push up daisies... */
1346 if (del_timer(&ct->timeout))
1347 death_by_timeout((unsigned long)ct);
1348 /* ... else the timer will get him soon. */
1349
1350 nf_ct_put(ct);
1351 }
1352}
1353
1354static int kill_all(struct nf_conn *i, void *data)
1355{
1356 return 1;
1357}
1358
1359static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
1360{
1361 if (vmalloced)
1362 vfree(hash);
1363 else
1364 free_pages((unsigned long)hash,
1365 get_order(sizeof(struct list_head) * size));
1366}
1367
1368/* Mishearing the voices in his head, our hero wonders how he's
1369 supposed to kill the mall. */
1370void nf_conntrack_cleanup(void)
1371{
1372 int i;
1373
1374 /* This makes sure all current packets have passed through
1375 netfilter framework. Roll on, two-stage module
1376 delete... */
1377 synchronize_net();
1378
1379 nf_ct_event_cache_flush();
1380 i_see_dead_people:
1381 nf_ct_iterate_cleanup(kill_all, NULL);
1382 if (atomic_read(&nf_conntrack_count) != 0) {
1383 schedule();
1384 goto i_see_dead_people;
1385 }
1386
1387 for (i = 0; i < NF_CT_F_NUM; i++) {
1388 if (nf_ct_cache[i].use == 0)
1389 continue;
1390
1391 NF_CT_ASSERT(nf_ct_cache[i].use == 1);
1392 nf_ct_cache[i].use = 1;
1393 nf_conntrack_unregister_cache(i);
1394 }
1395 kmem_cache_destroy(nf_conntrack_expect_cachep);
1396 free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1397 nf_conntrack_htable_size);
1398}
1399
1400static struct list_head *alloc_hashtable(int size, int *vmalloced)
1401{
1402 struct list_head *hash;
1403 unsigned int i;
1404
1405 *vmalloced = 0;
1406 hash = (void*)__get_free_pages(GFP_KERNEL,
1407 get_order(sizeof(struct list_head)
1408 * size));
1409 if (!hash) {
1410 *vmalloced = 1;
1411 printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
1412 hash = vmalloc(sizeof(struct list_head) * size);
1413 }
1414
1415 if (hash)
1416 for (i = 0; i < size; i++)
1417 INIT_LIST_HEAD(&hash[i]);
1418
1419 return hash;
1420}
1421
1422int set_hashsize(const char *val, struct kernel_param *kp)
1423{
1424 int i, bucket, hashsize, vmalloced;
1425 int old_vmalloced, old_size;
1426 int rnd;
1427 struct list_head *hash, *old_hash;
1428 struct nf_conntrack_tuple_hash *h;
1429
1430 /* On boot, we can set this without any fancy locking. */
1431 if (!nf_conntrack_htable_size)
1432 return param_set_uint(val, kp);
1433
1434 hashsize = simple_strtol(val, NULL, 0);
1435 if (!hashsize)
1436 return -EINVAL;
1437
1438 hash = alloc_hashtable(hashsize, &vmalloced);
1439 if (!hash)
1440 return -ENOMEM;
1441
1442 /* We have to rehash for the new table anyway, so we can also
1443 * use a new random seed */
1444 get_random_bytes(&rnd, 4);
1445
1446 write_lock_bh(&nf_conntrack_lock);
1447 for (i = 0; i < nf_conntrack_htable_size; i++) {
1448 while (!list_empty(&nf_conntrack_hash[i])) {
1449 h = list_entry(nf_conntrack_hash[i].next,
1450 struct nf_conntrack_tuple_hash, list);
1451 list_del(&h->list);
1452 bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1453 list_add_tail(&h->list, &hash[bucket]);
1454 }
1455 }
1456 old_size = nf_conntrack_htable_size;
1457 old_vmalloced = nf_conntrack_vmalloc;
1458 old_hash = nf_conntrack_hash;
1459
1460 nf_conntrack_htable_size = hashsize;
1461 nf_conntrack_vmalloc = vmalloced;
1462 nf_conntrack_hash = hash;
1463 nf_conntrack_hash_rnd = rnd;
1464 write_unlock_bh(&nf_conntrack_lock);
1465
1466 free_conntrack_hash(old_hash, old_vmalloced, old_size);
1467 return 0;
1468}
1469
1470module_param_call(hashsize, set_hashsize, param_get_uint,
1471 &nf_conntrack_htable_size, 0600);
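With this hook in place the table can be resized on a live system; assuming
the code is built as the nf_conntrack module, writing to the parameter file,
e.g.

	echo 16384 > /sys/module/nf_conntrack/parameters/hashsize

invokes set_hashsize(), which allocates the new table and rehashes every
entry into it under nf_conntrack_lock.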
1472
1473int __init nf_conntrack_init(void)
1474{
1475 unsigned int i;
1476 int ret;
1477
1478 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1479 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1480 if (!nf_conntrack_htable_size) {
1481 nf_conntrack_htable_size
1482 = (((num_physpages << PAGE_SHIFT) / 16384)
1483 / sizeof(struct list_head));
1484 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1485 nf_conntrack_htable_size = 8192;
1486 if (nf_conntrack_htable_size < 16)
1487 nf_conntrack_htable_size = 16;
1488 }
1489 nf_conntrack_max = 8 * nf_conntrack_htable_size;
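Worked example of the sizing rule above: with 8-byte list heads on i386, a
32 MB machine yields (32 << 20) / 16384 / 8 = 256 buckets, so
nf_conntrack_max defaults to 8 * 256 = 2048 tracked connections.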
1490
1491 printk("nf_conntrack version %s (%u buckets, %d max)\n",
1492 NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1493 nf_conntrack_max);
1494
1495 nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size,
1496 &nf_conntrack_vmalloc);
1497 if (!nf_conntrack_hash) {
1498 printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
1499 goto err_out;
1500 }
1501
1502 ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic",
1503 sizeof(struct nf_conn), NULL);
1504 if (ret < 0) {
1505 printk(KERN_ERR "Unable to create nf_conn slab cache\n");
1506 goto err_free_hash;
1507 }
1508
1509 nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect",
1510 sizeof(struct nf_conntrack_expect),
1511 0, 0, NULL, NULL);
1512 if (!nf_conntrack_expect_cachep) {
1513 printk(KERN_ERR "Unable to create nf_expect slab cache\n");
1514 goto err_free_conntrack_slab;
1515 }
1516
1517 /* Don't NEED lock here, but good form anyway. */
1518 write_lock_bh(&nf_conntrack_lock);
1519 for (i = 0; i < PF_MAX; i++)
1520 nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto;
1521 write_unlock_bh(&nf_conntrack_lock);
1522
1523 /* Set up fake conntrack:
1524 - to never be deleted, not in any hashes */
1525 atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
1526 /* - and make it look like a confirmed connection */
1527 set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
1528
1529 return ret;
1530
1531err_free_conntrack_slab:
1532 nf_conntrack_unregister_cache(NF_CT_F_BASIC);
1533err_free_hash:
1534 free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1535 nf_conntrack_htable_size);
1536err_out:
1537 return -ENOMEM;
1538}
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
new file mode 100644
index 000000000000..65080e269f27
--- /dev/null
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -0,0 +1,698 @@
1/* FTP extension for connection tracking. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
5 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
12 * - enable working with Layer 3 protocol independent connection tracking.
13 * - track EPRT and EPSV commands with IPv6 address.
14 *
15 * Derived from net/ipv4/netfilter/ip_conntrack_ftp.c
16 */
17
18#include <linux/config.h>
19#include <linux/module.h>
20#include <linux/moduleparam.h>
21#include <linux/netfilter.h>
22#include <linux/ip.h>
23#include <linux/ipv6.h>
24#include <linux/ctype.h>
25#include <net/checksum.h>
26#include <net/tcp.h>
27
28#include <net/netfilter/nf_conntrack.h>
29#include <net/netfilter/nf_conntrack_helper.h>
30#include <linux/netfilter/nf_conntrack_ftp.h>
31
32MODULE_LICENSE("GPL");
33MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
34MODULE_DESCRIPTION("ftp connection tracking helper");
35
36/* This is slow, but it's simple. --RR */
37static char *ftp_buffer;
38
39static DEFINE_SPINLOCK(nf_ftp_lock);
40
41#define MAX_PORTS 8
42static u_int16_t ports[MAX_PORTS];
43static unsigned int ports_c;
44module_param_array(ports, ushort, &ports_c, 0400);
45
46static int loose;
47module_param(loose, int, 0600);
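Both parameters are supplied at load time; assuming the object is built as
nf_conntrack_ftp, something like

	modprobe nf_conntrack_ftp ports=21,2121 loose=1

would track FTP on ports 21 and 2121 and, with loose set, still record
expectations when a passive-mode server announces a third-party address.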
48
49unsigned int (*nf_nat_ftp_hook)(struct sk_buff **pskb,
50 enum ip_conntrack_info ctinfo,
51 enum ip_ct_ftp_type type,
52 unsigned int matchoff,
53 unsigned int matchlen,
54 struct nf_conntrack_expect *exp,
55 u32 *seq);
56EXPORT_SYMBOL_GPL(nf_nat_ftp_hook);
57
58#if 0
59#define DEBUGP printk
60#else
61#define DEBUGP(format, args...)
62#endif
63
64static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, char);
65static int try_eprt(const char *, size_t, struct nf_conntrack_man *, char);
66static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *,
67 char);
68
69static struct ftp_search {
70 enum ip_conntrack_dir dir;
71 const char *pattern;
72 size_t plen;
73 char skip;
74 char term;
75 enum ip_ct_ftp_type ftptype;
76 int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char);
77} search[] = {
78 {
79 IP_CT_DIR_ORIGINAL,
80 "PORT", sizeof("PORT") - 1, ' ', '\r',
81 IP_CT_FTP_PORT,
82 try_rfc959,
83 },
84 {
85 IP_CT_DIR_REPLY,
86 "227 ", sizeof("227 ") - 1, '(', ')',
87 IP_CT_FTP_PASV,
88 try_rfc959,
89 },
90 {
91 IP_CT_DIR_ORIGINAL,
92 "EPRT", sizeof("EPRT") - 1, ' ', '\r',
93 IP_CT_FTP_EPRT,
94 try_eprt,
95 },
96 {
97 IP_CT_DIR_REPLY,
98 "229 ", sizeof("229 ") - 1, '(', ')',
99 IP_CT_FTP_EPSV,
100 try_epsv_response,
101 },
102};
103
104/* This code is based on inet_pton() in glibc-2.2.4 */
105static int
106get_ipv6_addr(const char *src, size_t dlen, struct in6_addr *dst, u_int8_t term)
107{
108 static const char xdigits[] = "0123456789abcdef";
109 u_int8_t tmp[16], *tp, *endp, *colonp;
110 int ch, saw_xdigit;
111 u_int32_t val;
112 size_t clen = 0;
113
114 tp = memset(tmp, '\0', sizeof(tmp));
115 endp = tp + sizeof(tmp);
116 colonp = NULL;
117
118 /* Leading :: requires some special handling. */
119 if (*src == ':'){
120 if (*++src != ':') {
121 DEBUGP("invalid \":\" at the head of addr\n");
122 return 0;
123 }
124 clen++;
125 }
126
127 saw_xdigit = 0;
128 val = 0;
129 while ((clen < dlen) && (*src != term)) {
130 const char *pch;
131
132 ch = tolower(*src++);
133 clen++;
134
135 pch = strchr(xdigits, ch);
136 if (pch != NULL) {
137 val <<= 4;
138 val |= (pch - xdigits);
139 if (val > 0xffff)
140 return 0;
141
142 saw_xdigit = 1;
143 continue;
144 }
145 if (ch != ':') {
146 DEBUGP("get_ipv6_addr: invalid char. \'%c\'\n", ch);
147 return 0;
148 }
149
150 if (!saw_xdigit) {
151 if (colonp) {
152 DEBUGP("invalid location of \"::\".\n");
153 return 0;
154 }
155 colonp = tp;
156 continue;
157 } else if (*src == term) {
158 DEBUGP("trancated IPv6 addr\n");
159 return 0;
160 }
161
162 if (tp + 2 > endp)
163 return 0;
164 *tp++ = (u_int8_t) (val >> 8) & 0xff;
165 *tp++ = (u_int8_t) val & 0xff;
166
167 saw_xdigit = 0;
168 val = 0;
169 continue;
170 }
171 if (saw_xdigit) {
172 if (tp + 2 > endp)
173 return 0;
174 *tp++ = (u_int8_t) (val >> 8) & 0xff;
175 *tp++ = (u_int8_t) val & 0xff;
176 }
177 if (colonp != NULL) {
178 /*
179 * Since some memmove()'s erroneously fail to handle
180 * overlapping regions, we'll do the shift by hand.
181 */
182 const int n = tp - colonp;
183 int i;
184
185 if (tp == endp)
186 return 0;
187
188 for (i = 1; i <= n; i++) {
189 endp[- i] = colonp[n - i];
190 colonp[n - i] = 0;
191 }
192 tp = endp;
193 }
194 if (tp != endp || (*src != term))
195 return 0;
196
197 memcpy(dst->s6_addr, tmp, sizeof(dst->s6_addr));
198 return clen;
199}
200
201static int try_number(const char *data, size_t dlen, u_int32_t array[],
202 int array_size, char sep, char term)
203{
204 u_int32_t i, len;
205
206 memset(array, 0, sizeof(array[0])*array_size);
207
208 /* Keep data pointing at next char. */
209 for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) {
210 if (*data >= '0' && *data <= '9') {
211 array[i] = array[i]*10 + *data - '0';
212 }
213 else if (*data == sep)
214 i++;
215 else {
216 /* Unexpected character; true if it's the
217 terminator and we're finished. */
218 if (*data == term && i == array_size - 1)
219 return len;
220
221 DEBUGP("Char %u (got %u nums) `%u' unexpected\n",
222 len, i, *data);
223 return 0;
224 }
225 }
226 DEBUGP("Failed to fill %u numbers separated by %c\n", array_size, sep);
227
228 return 0;
229}
230
231/* Returns 0, or length of numbers: 192,168,1,1,5,6 */
232static int try_rfc959(const char *data, size_t dlen,
233 struct nf_conntrack_man *cmd, char term)
234{
235 int length;
236 u_int32_t array[6];
237
238 length = try_number(data, dlen, array, 6, ',', term);
239 if (length == 0)
240 return 0;
241
242 cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) |
243 (array[2] << 8) | array[3]);
244 cmd->u.tcp.port = htons((array[4] << 8) | array[5]);
245 return length;
246}
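Worked example: for a client command "PORT 192,168,1,1,5,6", try_number()
fills array[] = {192, 168, 1, 1, 5, 6}, so cmd->u3.ip becomes 192.168.1.1
and cmd->u.tcp.port becomes htons(5 * 256 + 6), i.e. port 1286.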
247
248/* Grab port: number up to delimiter */
249static int get_port(const char *data, int start, size_t dlen, char delim,
250 u_int16_t *port)
251{
252 u_int16_t tmp_port = 0;
253 int i;
254
255 for (i = start; i < dlen; i++) {
256 /* Finished? */
257 if (data[i] == delim) {
258 if (tmp_port == 0)
259 break;
260 *port = htons(tmp_port);
261 DEBUGP("get_port: return %d\n", tmp_port);
262 return i + 1;
263 }
264 else if (data[i] >= '0' && data[i] <= '9')
265 tmp_port = tmp_port*10 + data[i] - '0';
266 else { /* Some other crap */
267 DEBUGP("get_port: invalid char.\n");
268 break;
269 }
270 }
271 return 0;
272}
273
274/* Returns 0, or length of numbers: |1|132.235.1.2|6275| or |2|3ffe::1|6275| */
275static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd,
276 char term)
277{
278 char delim;
279 int length;
280
281 /* First character is delimiter, then "1" for IPv4 or "2" for IPv6,
282 then delimiter again. */
283 if (dlen <= 3) {
284 DEBUGP("EPRT: too short\n");
285 return 0;
286 }
287 delim = data[0];
288 if (isdigit(delim) || delim < 33 || delim > 126 || data[2] != delim) {
289 DEBUGP("try_eprt: invalid delimitter.\n");
290 return 0;
291 }
292
293 if ((cmd->l3num == PF_INET && data[1] != '1') ||
294 (cmd->l3num == PF_INET6 && data[1] != '2')) {
295 DEBUGP("EPRT: invalid protocol number.\n");
296 return 0;
297 }
298
299 DEBUGP("EPRT: Got %c%c%c\n", delim, data[1], delim);
300
301 if (data[1] == '1') {
302 u_int32_t array[4];
303
304 /* Now we have IP address. */
305 length = try_number(data + 3, dlen - 3, array, 4, '.', delim);
306 if (length != 0)
307 cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16)
308 | (array[2] << 8) | array[3]);
309 } else {
310 /* Now we have IPv6 address. */
311 length = get_ipv6_addr(data + 3, dlen - 3,
312 (struct in6_addr *)cmd->u3.ip6, delim);
313 }
314
315 if (length == 0)
316 return 0;
317 DEBUGP("EPRT: Got IP address!\n");
318 /* Start offset includes initial "|1|", and trailing delimiter */
319 return get_port(data, 3 + length + 1, dlen, delim, &cmd->u.tcp.port);
320}
321
322/* Returns 0, or length of numbers: |||6446| */
323static int try_epsv_response(const char *data, size_t dlen,
324 struct nf_conntrack_man *cmd, char term)
325{
326 char delim;
327
328 /* Three delimiters. */
329 if (dlen <= 3) return 0;
330 delim = data[0];
331 if (isdigit(delim) || delim < 33 || delim > 126
332 || data[1] != delim || data[2] != delim)
333 return 0;
334
335 return get_port(data, 3, dlen, delim, &cmd->u.tcp.port);
336}
337
338/* Return 1 for match, 0 for no match (accept), -1 for partial match. */
339static int find_pattern(const char *data, size_t dlen,
340 const char *pattern, size_t plen,
341 char skip, char term,
342 unsigned int *numoff,
343 unsigned int *numlen,
344 struct nf_conntrack_man *cmd,
345 int (*getnum)(const char *, size_t,
346 struct nf_conntrack_man *, char))
347{
348 size_t i;
349
350 DEBUGP("find_pattern `%s': dlen = %u\n", pattern, dlen);
351 if (dlen == 0)
352 return 0;
353
354 if (dlen <= plen) {
355 /* Short packet: try for partial? */
356 if (strnicmp(data, pattern, dlen) == 0)
357 return -1;
358 else return 0;
359 }
360
361 if (strnicmp(data, pattern, plen) != 0) {
362#if 0
363 size_t i;
364
365 DEBUGP("ftp: string mismatch\n");
366 for (i = 0; i < plen; i++) {
367 DEBUGP("ftp:char %u `%c'(%u) vs `%c'(%u)\n",
368 i, data[i], data[i],
369 pattern[i], pattern[i]);
370 }
371#endif
372 return 0;
373 }
374
375 DEBUGP("Pattern matches!\n");
376 /* Now we've found the constant string, try to skip
377 to the 'skip' character */
378 for (i = plen; data[i] != skip; i++)
379 if (i == dlen - 1) return -1;
380
381 /* Skip over the last character */
382 i++;
383
384 DEBUGP("Skipped up to `%c'!\n", skip);
385
386 *numoff = i;
387 *numlen = getnum(data + i, dlen - i, cmd, term);
388 if (!*numlen)
389 return -1;
390
391 DEBUGP("Match succeeded!\n");
392 return 1;
393}
394
395/* Look up to see if we're just after a \n. */
396static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir)
397{
398 unsigned int i;
399
400 for (i = 0; i < info->seq_aft_nl_num[dir]; i++)
401 if (info->seq_aft_nl[dir][i] == seq)
402 return 1;
403 return 0;
404}
405
406/* We don't update if it's older than what we have. */
407static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir,
408 struct sk_buff *skb)
409{
410 unsigned int i, oldest = NUM_SEQ_TO_REMEMBER;
411
412 /* Look for oldest: if we find exact match, we're done. */
413 for (i = 0; i < info->seq_aft_nl_num[dir]; i++) {
414 if (info->seq_aft_nl[dir][i] == nl_seq)
415 return;
416
417 if (oldest == NUM_SEQ_TO_REMEMBER
418 || before(info->seq_aft_nl[dir][i], info->seq_aft_nl[dir][oldest]))
419 oldest = i;
420 }
421
422 if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) {
423 info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq;
424 nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
425 } else if (oldest != NUM_SEQ_TO_REMEMBER) {
426 info->seq_aft_nl[dir][oldest] = nl_seq;
427 nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
428 }
429}
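Together, find_nl_seq() and update_nl_seq() restrict parsing to payloads
that start immediately after a recorded newline. If, for example, a PORT
command is split across two TCP segments, the second segment's sequence
number follows no remembered \n, the find_nl_seq() check in help() fails,
and that fragment is accepted without being parsed.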
430
431static int help(struct sk_buff **pskb,
432 unsigned int protoff,
433 struct nf_conn *ct,
434 enum ip_conntrack_info ctinfo)
435{
436 unsigned int dataoff, datalen;
437 struct tcphdr _tcph, *th;
438 char *fb_ptr;
439 int ret;
440 u32 seq;
441 int dir = CTINFO2DIR(ctinfo);
442 unsigned int matchlen, matchoff;
443 struct ip_ct_ftp_master *ct_ftp_info = &ct->help->ct_ftp_info;
444 struct nf_conntrack_expect *exp;
445 struct nf_conntrack_man cmd = {};
446
447 unsigned int i;
448 int found = 0, ends_in_nl;
449
450 /* Until there's been traffic both ways, don't look in packets. */
451 if (ctinfo != IP_CT_ESTABLISHED
452 && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) {
453 DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo);
454 return NF_ACCEPT;
455 }
456
457 th = skb_header_pointer(*pskb, protoff, sizeof(_tcph), &_tcph);
458 if (th == NULL)
459 return NF_ACCEPT;
460
461 dataoff = protoff + th->doff * 4;
462 /* No data? */
463 if (dataoff >= (*pskb)->len) {
464 DEBUGP("ftp: dataoff(%u) >= skblen(%u)\n", dataoff,
465 (*pskb)->len);
466 return NF_ACCEPT;
467 }
468 datalen = (*pskb)->len - dataoff;
469
470 spin_lock_bh(&nf_ftp_lock);
471 fb_ptr = skb_header_pointer(*pskb, dataoff, datalen, ftp_buffer);
472 BUG_ON(fb_ptr == NULL);
473
474 ends_in_nl = (fb_ptr[datalen - 1] == '\n');
475 seq = ntohl(th->seq) + datalen;
476
477 /* Look up to see if we're just after a \n. */
478 if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) {
479 /* Now if this ends in \n, update ftp info. */
480 DEBUGP("nf_conntrack_ftp_help: wrong seq pos %s(%u) or %s(%u)\n",
481 ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET)",
482 ct_ftp_info->seq_aft_nl[dir][0],
483 ct_ftp_info->seq_aft_nl_num[dir] > 1 ? "" : "(UNSET)",
484 ct_ftp_info->seq_aft_nl[dir][1]);
485 ret = NF_ACCEPT;
486 goto out_update_nl;
487 }
488
489 /* Initialize IP/IPv6 addr to expected address (it's not mentioned
490 in EPSV responses) */
491 cmd.l3num = ct->tuplehash[dir].tuple.src.l3num;
492 memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all,
493 sizeof(cmd.u3.all));
494
495 for (i = 0; i < ARRAY_SIZE(search); i++) {
496 if (search[i].dir != dir) continue;
497
498 found = find_pattern(fb_ptr, datalen,
499 search[i].pattern,
500 search[i].plen,
501 search[i].skip,
502 search[i].term,
503 &matchoff, &matchlen,
504 &cmd,
505 search[i].getnum);
506 if (found) break;
507 }
508 if (found == -1) {
509 /* We don't usually drop packets. After all, this is
510 connection tracking, not packet filtering.
511 However, it is necessary for accurate tracking in
512 this case. */
513 if (net_ratelimit())
514 printk("conntrack_ftp: partial %s %u+%u\n",
515 search[i].pattern,
516 ntohl(th->seq), datalen);
517 ret = NF_DROP;
518 goto out;
519 } else if (found == 0) { /* No match */
520 ret = NF_ACCEPT;
521 goto out_update_nl;
522 }
523
524 DEBUGP("conntrack_ftp: match `%.*s' (%u bytes at %u)\n",
525 (int)matchlen, fb_ptr + matchoff,
526 matchlen, ntohl(th->seq) + matchoff);
527
528 exp = nf_conntrack_expect_alloc(ct);
529 if (exp == NULL) {
530 ret = NF_DROP;
531 goto out;
532 }
533
534 /* We refer to the reverse direction ("!dir") tuples here,
535 * because we're expecting something in the other direction.
536 * Doesn't matter unless NAT is happening. */
537 exp->tuple.dst.u3 = ct->tuplehash[!dir].tuple.dst.u3;
538
539 /* Update the ftp info */
540 if ((cmd.l3num == ct->tuplehash[dir].tuple.src.l3num) &&
541 memcmp(&cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all,
542 sizeof(cmd.u3.all))) {
543 /* Enrico Scholz's passive FTP to partially RNAT'd ftp
544 server: it really wants us to connect to a
545 different IP address. Simply don't record it for
546 NAT. */
547 if (cmd.l3num == PF_INET) {
548 DEBUGP("conntrack_ftp: NOT RECORDING: %u,%u,%u,%u != %u.%u.%u.%u\n",
549 NIPQUAD(cmd.u3.ip),
550 NIPQUAD(ct->tuplehash[dir].tuple.src.u3.ip));
551 } else {
552 DEBUGP("conntrack_ftp: NOT RECORDING: %x:%x:%x:%x:%x:%x:%x:%x != %x:%x:%x:%x:%x:%x:%x:%x\n",
553 NIP6(*((struct in6_addr *)cmd.u3.ip6)),
554 NIP6(*((struct in6_addr *)ct->tuplehash[dir]
555 .tuple.src.u3.ip6)));
556 }
557
558 /* Thanks to Cristiano Lincoln Mattos
559 <lincoln@cesar.org.br> for reporting this potential
560 problem (DMZ machines opening holes to internal
561 networks, or the packet filter itself). */
562 if (!loose) {
563 ret = NF_ACCEPT;
564 goto out_put_expect;
565 }
566 memcpy(&exp->tuple.dst.u3, &cmd.u3.all,
567 sizeof(exp->tuple.dst.u3));
568 }
569
570 exp->tuple.src.u3 = ct->tuplehash[!dir].tuple.src.u3;
571 exp->tuple.src.l3num = cmd.l3num;
572 exp->tuple.src.u.tcp.port = 0;
573 exp->tuple.dst.u.tcp.port = cmd.u.tcp.port;
574 exp->tuple.dst.protonum = IPPROTO_TCP;
575
576 exp->mask = (struct nf_conntrack_tuple)
577 { .src = { .l3num = 0xFFFF,
578 .u = { .tcp = { 0 }},
579 },
580 .dst = { .protonum = 0xFF,
581 .u = { .tcp = { 0xFFFF }},
582 },
583 };
584 if (cmd.l3num == PF_INET) {
585 exp->mask.src.u3.ip = 0xFFFFFFFF;
586 exp->mask.dst.u3.ip = 0xFFFFFFFF;
587 } else {
588 memset(exp->mask.src.u3.ip6, 0xFF,
589 sizeof(exp->mask.src.u3.ip6));
590 memset(exp->mask.dst.u3.ip6, 0xFF,
591 sizeof(exp->mask.dst.u3.ip6));
592 }
593
594 exp->expectfn = NULL;
595 exp->flags = 0;
596
597 /* Now, NAT might want to mangle the packet, and register the
598 * (possibly changed) expectation itself. */
599 if (nf_nat_ftp_hook)
600 ret = nf_nat_ftp_hook(pskb, ctinfo, search[i].ftptype,
601 matchoff, matchlen, exp, &seq);
602 else {
603 /* Can't expect this? Best to drop packet now. */
604 if (nf_conntrack_expect_related(exp) != 0)
605 ret = NF_DROP;
606 else
607 ret = NF_ACCEPT;
608 }
609
610out_put_expect:
611 nf_conntrack_expect_put(exp);
612
613out_update_nl:
614 /* Now if this ends in \n, update ftp info. Seq may have been
615 * adjusted by NAT code. */
616 if (ends_in_nl)
617 update_nl_seq(seq, ct_ftp_info, dir, *pskb);
618 out:
619 spin_unlock_bh(&nf_ftp_lock);
620 return ret;
621}
622
623static struct nf_conntrack_helper ftp[MAX_PORTS][2];
624static char ftp_names[MAX_PORTS][2][sizeof("ftp-65535")];
625
626/* don't make this __exit, since it's called from __init! */
627static void fini(void)
628{
629 int i, j;
630 for (i = 0; i < ports_c; i++) {
631 for (j = 0; j < 2; j++) {
632 if (ftp[i][j].me == NULL)
633 continue;
634
635 DEBUGP("nf_ct_ftp: unregistering helper for pf: %d "
636 "port: %d\n",
637 ftp[i][j].tuple.src.l3num, ports[i]);
638 nf_conntrack_helper_unregister(&ftp[i][j]);
639 }
640 }
641
642 kfree(ftp_buffer);
643}
644
645static int __init init(void)
646{
647 int i, j = -1, ret = 0;
648 char *tmpname;
649
650 ftp_buffer = kmalloc(65536, GFP_KERNEL);
651 if (!ftp_buffer)
652 return -ENOMEM;
653
654 if (ports_c == 0)
655 ports[ports_c++] = FTP_PORT;
656
657 /* FIXME should be configurable whether IPv4 and IPv6 FTP connections
658 are tracked or not - YK */
659 for (i = 0; i < ports_c; i++) {
660 memset(&ftp[i], 0, sizeof(ftp[i]));	/* zero both helper slots */
661
662 ftp[i][0].tuple.src.l3num = PF_INET;
663 ftp[i][1].tuple.src.l3num = PF_INET6;
664 for (j = 0; j < 2; j++) {
665 ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]);
666 ftp[i][j].tuple.dst.protonum = IPPROTO_TCP;
667 ftp[i][j].mask.src.u.tcp.port = 0xFFFF;
668 ftp[i][j].mask.dst.protonum = 0xFF;
669 ftp[i][j].max_expected = 1;
670 ftp[i][j].timeout = 5 * 60; /* 5 Minutes */
671 ftp[i][j].me = THIS_MODULE;
672 ftp[i][j].help = help;
673 tmpname = &ftp_names[i][j][0];
674 if (ports[i] == FTP_PORT)
675 sprintf(tmpname, "ftp");
676 else
677 sprintf(tmpname, "ftp-%d", ports[i]);
678 ftp[i][j].name = tmpname;
679
680 DEBUGP("nf_ct_ftp: registering helper for pf: %d "
681 "port: %d\n",
682 ftp[i][j].tuple.src.l3num, ports[i]);
683 ret = nf_conntrack_helper_register(&ftp[i][j]);
684 if (ret) {
685 printk("nf_ct_ftp: failed to register helper "
686 " for pf: %d port: %d\n",
687 ftp[i][j].tuple.src.l3num, ports[i]);
688 fini();
689 return ret;
690 }
691 }
692 }
693
694 return 0;
695}
696
697module_init(init);
698module_exit(fini);
diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c
new file mode 100644
index 000000000000..7de4f06c63c5
--- /dev/null
+++ b/net/netfilter/nf_conntrack_l3proto_generic.c
@@ -0,0 +1,98 @@
1/*
2 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
3 *
4 * Based largely upon the original ip_conntrack code which
5 * had the following copyright information:
6 *
7 * (C) 1999-2001 Paul `Rusty' Russell
8 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
13 *
14 * Author:
15 * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
16 */
17
18#include <linux/config.h>
19#include <linux/types.h>
20#include <linux/ip.h>
21#include <linux/netfilter.h>
22#include <linux/module.h>
23#include <linux/skbuff.h>
24#include <linux/icmp.h>
25#include <linux/sysctl.h>
26#include <net/ip.h>
27
28#include <linux/netfilter_ipv4.h>
29#include <net/netfilter/nf_conntrack.h>
30#include <net/netfilter/nf_conntrack_protocol.h>
31#include <net/netfilter/nf_conntrack_l3proto.h>
32#include <net/netfilter/nf_conntrack_core.h>
33#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
34
35#if 0
36#define DEBUGP printk
37#else
38#define DEBUGP(format, args...)
39#endif
40
41DECLARE_PER_CPU(struct nf_conntrack_stat, nf_conntrack_stat);
42
43static int generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
44 struct nf_conntrack_tuple *tuple)
45{
46 memset(&tuple->src.u3, 0, sizeof(tuple->src.u3));
47 memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3));
48
49 return 1;
50}
51
52static int generic_invert_tuple(struct nf_conntrack_tuple *tuple,
53 const struct nf_conntrack_tuple *orig)
54{
55 memset(&tuple->src.u3, 0, sizeof(tuple->src.u3));
56 memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3));
57
58 return 1;
59}
60
61static int generic_print_tuple(struct seq_file *s,
62 const struct nf_conntrack_tuple *tuple)
63{
64 return 0;
65}
66
67static int generic_print_conntrack(struct seq_file *s,
68 const struct nf_conn *conntrack)
69{
70 return 0;
71}
72
73static int
74generic_prepare(struct sk_buff **pskb, unsigned int hooknum,
75 unsigned int *dataoff, u_int8_t *protonum)
76{
77 /* Never track !!! */
78 return -NF_ACCEPT;
79}
80
81
82static u_int32_t generic_get_features(const struct nf_conntrack_tuple *tuple)
83
84{
85 return NF_CT_F_BASIC;
86}
87
88struct nf_conntrack_l3proto nf_conntrack_generic_l3proto = {
89 .l3proto = PF_UNSPEC,
90 .name = "unknown",
91 .pkt_to_tuple = generic_pkt_to_tuple,
92 .invert_tuple = generic_invert_tuple,
93 .print_tuple = generic_print_tuple,
94 .print_conntrack = generic_print_conntrack,
95 .prepare = generic_prepare,
96 .get_features = generic_get_features,
97 .me = THIS_MODULE,
98};
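nf_conntrack_init() installs this PF_UNSPEC instance in every
nf_ct_l3protos[] slot, so address families without a registered layer 3
tracker fall through to these no-op handlers; because generic_prepare()
returns -NF_ACCEPT, such packets are passed without ever being tracked.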
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
new file mode 100644
index 000000000000..36425f6c833f
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -0,0 +1,85 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
9 * - enable working with L3 protocol independent connection tracking.
10 *
11 * Derived from net/ipv4/netfilter/ip_conntrack_proto_generic.c
12 */
13
14#include <linux/types.h>
15#include <linux/sched.h>
16#include <linux/timer.h>
17#include <linux/netfilter.h>
18#include <net/netfilter/nf_conntrack_protocol.h>
19
20unsigned long nf_ct_generic_timeout = 600*HZ;
21
22static int generic_pkt_to_tuple(const struct sk_buff *skb,
23 unsigned int dataoff,
24 struct nf_conntrack_tuple *tuple)
25{
26 tuple->src.u.all = 0;
27 tuple->dst.u.all = 0;
28
29 return 1;
30}
31
32static int generic_invert_tuple(struct nf_conntrack_tuple *tuple,
33 const struct nf_conntrack_tuple *orig)
34{
35 tuple->src.u.all = 0;
36 tuple->dst.u.all = 0;
37
38 return 1;
39}
40
41/* Print out the per-protocol part of the tuple. */
42static int generic_print_tuple(struct seq_file *s,
43 const struct nf_conntrack_tuple *tuple)
44{
45 return 0;
46}
47
48/* Print out the private part of the conntrack. */
49static int generic_print_conntrack(struct seq_file *s,
50 const struct nf_conn *state)
51{
52 return 0;
53}
54
55/* Returns verdict for packet, or -1 for invalid. */
56static int packet(struct nf_conn *conntrack,
57 const struct sk_buff *skb,
58 unsigned int dataoff,
59 enum ip_conntrack_info ctinfo,
60 int pf,
61 unsigned int hooknum)
62{
63 nf_ct_refresh_acct(conntrack, ctinfo, skb, nf_ct_generic_timeout);
64 return NF_ACCEPT;
65}
66
67/* Called when a new connection for this protocol found. */
68static int new(struct nf_conn *conntrack, const struct sk_buff *skb,
69 unsigned int dataoff)
70{
71 return 1;
72}
73
74struct nf_conntrack_protocol nf_conntrack_generic_protocol =
75{
76 .l3proto = PF_UNSPEC,
77 .proto = 0,
78 .name = "unknown",
79 .pkt_to_tuple = generic_pkt_to_tuple,
80 .invert_tuple = generic_invert_tuple,
81 .print_tuple = generic_print_tuple,
82 .print_conntrack = generic_print_conntrack,
83 .packet = packet,
84 .new = new,
85};
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
new file mode 100644
index 000000000000..3a600f77b4e0
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -0,0 +1,670 @@
1/*
2 * Connection tracking protocol helper module for SCTP.
3 *
4 * SCTP is defined in RFC 2960. References to various sections in this code
5 * are to this RFC.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * 17 Oct 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
12 * - enable working with L3 protocol independent connection tracking.
13 *
14 * Derived from net/ipv4/ip_conntrack_sctp.c
15 */
16
17/*
18 * Added support for proc manipulation of timeouts.
19 */
20
21#include <linux/types.h>
22#include <linux/sched.h>
23#include <linux/timer.h>
24#include <linux/netfilter.h>
25#include <linux/module.h>
26#include <linux/in.h>
27#include <linux/ip.h>
28#include <linux/sctp.h>
29#include <linux/string.h>
30#include <linux/seq_file.h>
31
32#include <net/netfilter/nf_conntrack.h>
33#include <net/netfilter/nf_conntrack_protocol.h>
34
35#if 0
36#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__)
37#else
38#define DEBUGP(format, args...)
39#endif
40
41/* Protects conntrack->proto.sctp */
42static DEFINE_RWLOCK(sctp_lock);
43
44/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
45 closely. They're more complex. --RR
46
47 And so for me for SCTP :D -Kiran */
48
49static const char *sctp_conntrack_names[] = {
50 "NONE",
51 "CLOSED",
52 "COOKIE_WAIT",
53 "COOKIE_ECHOED",
54 "ESTABLISHED",
55 "SHUTDOWN_SENT",
56 "SHUTDOWN_RECD",
57 "SHUTDOWN_ACK_SENT",
58};
59
60#define SECS * HZ
61#define MINS * 60 SECS
62#define HOURS * 60 MINS
63#define DAYS * 24 HOURS
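The postfix macros let the timeouts below read naturally: 3 SECS expands to
3 * HZ jiffies, and 5 DAYS to 5 * 24 * 60 * 60 * HZ.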
64
65static unsigned long nf_ct_sctp_timeout_closed = 10 SECS;
66static unsigned long nf_ct_sctp_timeout_cookie_wait = 3 SECS;
67static unsigned long nf_ct_sctp_timeout_cookie_echoed = 3 SECS;
68static unsigned long nf_ct_sctp_timeout_established = 5 DAYS;
69static unsigned long nf_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000;
70static unsigned long nf_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000;
71static unsigned long nf_ct_sctp_timeout_shutdown_ack_sent = 3 SECS;
72
73static unsigned long * sctp_timeouts[]
74= { NULL, /* SCTP_CONNTRACK_NONE */
75 &nf_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */
76 &nf_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */
77 &nf_ct_sctp_timeout_cookie_echoed, /* SCTP_CONNTRACK_COOKIE_ECHOED */
78 &nf_ct_sctp_timeout_established, /* SCTP_CONNTRACK_ESTABLISHED */
79 &nf_ct_sctp_timeout_shutdown_sent, /* SCTP_CONNTRACK_SHUTDOWN_SENT */
80 &nf_ct_sctp_timeout_shutdown_recd, /* SCTP_CONNTRACK_SHUTDOWN_RECD */
81 &nf_ct_sctp_timeout_shutdown_ack_sent /* SCTP_CONNTRACK_SHUTDOWN_ACK_SENT */
82 };
83
84#define sNO SCTP_CONNTRACK_NONE
85#define sCL SCTP_CONNTRACK_CLOSED
86#define sCW SCTP_CONNTRACK_COOKIE_WAIT
87#define sCE SCTP_CONNTRACK_COOKIE_ECHOED
88#define sES SCTP_CONNTRACK_ESTABLISHED
89#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT
90#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD
91#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT
92#define sIV SCTP_CONNTRACK_MAX
93
94/*
95 These are the descriptions of the states:
96
97NOTE: These state names are tantalizingly similar to the states of an
98SCTP endpoint. But the interpretation of the states is a little different,
99considering that these are the states of the connection and not of an end
100point. Please note the subtleties. -Kiran
101
102NONE - Nothing so far.
103COOKIE WAIT - We have seen an INIT chunk in the original direction, or an
104 INIT_ACK chunk in the reply direction.
105COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction.
106ESTABLISHED - We have seen a COOKIE_ACK in the reply direction.
107SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction.
108SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply direction.
109SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite
110 to that of the SHUTDOWN chunk.
111CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
112 the SHUTDOWN chunk. Connection is closed.
113*/
114
115/* TODO
116 - I have assumed that the first INIT is in the original direction.
117 This messes things up when an INIT comes in the reply direction in the
118 CLOSED state.
119 - Check the error type in the reply dir before transitioning from
120cookie echoed to closed.
121 - Sec 5.2.4 of RFC 2960
122 - Multi Homing support.
123*/
124
125/* SCTP conntrack state transitions */
126static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {
127 {
128/* ORIGINAL */
129/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
130/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA},
131/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},
132/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
133/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA},
134/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA},
135/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't have stale cookie */
136/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */
137/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in orig dir */
138/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL}
139 },
140 {
141/* REPLY */
142/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
143/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */
144/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},
145/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
146/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA},
147/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA},
148/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA},
149/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in reply dir */
150/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA},
151/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL}
152 }
153};
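Reading the table: sctp_conntracks[IP_CT_DIR_ORIGINAL][0][sNO] is sCW, i.e.
an INIT chunk (row 0) seen in the original direction of a not-yet-tracked
association moves it from NONE to COOKIE_WAIT; the peer's chunks are then
looked up in the REPLY half of the table.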
154
155static int sctp_pkt_to_tuple(const struct sk_buff *skb,
156 unsigned int dataoff,
157 struct nf_conntrack_tuple *tuple)
158{
159 sctp_sctphdr_t _hdr, *hp;
160
161 DEBUGP(__FUNCTION__);
162 DEBUGP("\n");
163
164 /* Actually only need first 8 bytes. */
165 hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
166 if (hp == NULL)
167 return 0;
168
169 tuple->src.u.sctp.port = hp->source;
170 tuple->dst.u.sctp.port = hp->dest;
171 return 1;
172}
173
174static int sctp_invert_tuple(struct nf_conntrack_tuple *tuple,
175 const struct nf_conntrack_tuple *orig)
176{
177 DEBUGP(__FUNCTION__);
178 DEBUGP("\n");
179
180 tuple->src.u.sctp.port = orig->dst.u.sctp.port;
181 tuple->dst.u.sctp.port = orig->src.u.sctp.port;
182 return 1;
183}
184
185/* Print out the per-protocol part of the tuple. */
186static int sctp_print_tuple(struct seq_file *s,
187 const struct nf_conntrack_tuple *tuple)
188{
189 DEBUGP(__FUNCTION__);
190 DEBUGP("\n");
191
192 return seq_printf(s, "sport=%hu dport=%hu ",
193 ntohs(tuple->src.u.sctp.port),
194 ntohs(tuple->dst.u.sctp.port));
195}
196
197/* Print out the private part of the conntrack. */
198static int sctp_print_conntrack(struct seq_file *s,
199 const struct nf_conn *conntrack)
200{
201 enum sctp_conntrack state;
202
203 DEBUGP(__FUNCTION__);
204 DEBUGP("\n");
205
206 read_lock_bh(&sctp_lock);
207 state = conntrack->proto.sctp.state;
208 read_unlock_bh(&sctp_lock);
209
210 return seq_printf(s, "%s ", sctp_conntrack_names[state]);
211}
212
213#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \
214for (offset = dataoff + sizeof(sctp_sctphdr_t), count = 0; \
215 offset < skb->len && \
216 (sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch)); \
217 offset += (htons(sch->length) + 3) & ~3, count++)
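The macro advances through the chunk list, rounding each chunk length up to
the 4-byte boundary required by RFC 2960 Sec 3.2: a 7-byte chunk moves
offset forward by (7 + 3) & ~3 = 8 bytes. (htons() on the wire-format
length behaves identically to ntohs() here, since both swap on
little-endian hosts.)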
218
219/* Some validity checks to make sure the chunks are fine */
220static int do_basic_checks(struct nf_conn *conntrack,
221 const struct sk_buff *skb,
222 unsigned int dataoff,
223 char *map)
224{
225 u_int32_t offset, count;
226 sctp_chunkhdr_t _sch, *sch;
227 int flag;
228
229 DEBUGP(__FUNCTION__);
230 DEBUGP("\n");
231
232 flag = 0;
233
234 for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
235 DEBUGP("Chunk Num: %d Type: %d\n", count, sch->type);
236
237 if (sch->type == SCTP_CID_INIT
238 || sch->type == SCTP_CID_INIT_ACK
239 || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
240 flag = 1;
241 }
242
243 /* Cookie Ack/Echo chunks not the first OR
244 Init / Init Ack / Shutdown compl chunks not the only chunks */
245 if ((sch->type == SCTP_CID_COOKIE_ACK
246 || sch->type == SCTP_CID_COOKIE_ECHO
247 || flag)
248 && count != 0) {
249 DEBUGP("Basic checks failed\n");
250 return 1;
251 }
252
253 if (map) {
254 set_bit(sch->type, (void *)map);
255 }
256 }
257
258 DEBUGP("Basic checks passed\n");
259 return 0;
260}
261
262static int new_state(enum ip_conntrack_dir dir,
263 enum sctp_conntrack cur_state,
264 int chunk_type)
265{
266 int i;
267
268 DEBUGP(__FUNCTION__);
269 DEBUGP("\n");
270
271 DEBUGP("Chunk type: %d\n", chunk_type);
272
273 switch (chunk_type) {
274 case SCTP_CID_INIT:
275 DEBUGP("SCTP_CID_INIT\n");
276 i = 0; break;
277 case SCTP_CID_INIT_ACK:
278 DEBUGP("SCTP_CID_INIT_ACK\n");
279 i = 1; break;
280 case SCTP_CID_ABORT:
281 DEBUGP("SCTP_CID_ABORT\n");
282 i = 2; break;
283 case SCTP_CID_SHUTDOWN:
284 DEBUGP("SCTP_CID_SHUTDOWN\n");
285 i = 3; break;
286 case SCTP_CID_SHUTDOWN_ACK:
287 DEBUGP("SCTP_CID_SHUTDOWN_ACK\n");
288 i = 4; break;
289 case SCTP_CID_ERROR:
290 DEBUGP("SCTP_CID_ERROR\n");
291 i = 5; break;
292 case SCTP_CID_COOKIE_ECHO:
293 DEBUGP("SCTP_CID_COOKIE_ECHO\n");
294 i = 6; break;
295 case SCTP_CID_COOKIE_ACK:
296 DEBUGP("SCTP_CID_COOKIE_ACK\n");
297 i = 7; break;
298 case SCTP_CID_SHUTDOWN_COMPLETE:
299 DEBUGP("SCTP_CID_SHUTDOWN_COMPLETE\n");
300 i = 8; break;
301 default:
302 /* Other chunks like DATA, SACK, HEARTBEAT and
303 its ACK do not cause a change in state */
304 DEBUGP("Unknown chunk type, Will stay in %s\n",
305 sctp_conntrack_names[cur_state]);
306 return cur_state;
307 }
308
309 DEBUGP("dir: %d cur_state: %s chunk_type: %d new_state: %s\n",
310 dir, sctp_conntrack_names[cur_state], chunk_type,
311 sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]);
312
313 return sctp_conntracks[dir][i][cur_state];
314}
315
316/* Returns verdict for packet, or -1 for invalid. */
317static int sctp_packet(struct nf_conn *conntrack,
318 const struct sk_buff *skb,
319 unsigned int dataoff,
320 enum ip_conntrack_info ctinfo,
321 int pf,
322 unsigned int hooknum)
323{
324 enum sctp_conntrack newconntrack, oldsctpstate;
325 sctp_sctphdr_t _sctph, *sh;
326 sctp_chunkhdr_t _sch, *sch;
327 u_int32_t offset, count;
328 char map[256 / sizeof (char)] = {0};
329
330 DEBUGP(__FUNCTION__);
331 DEBUGP("\n");
332
333 sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
334 if (sh == NULL)
335 return -1;
336
337 if (do_basic_checks(conntrack, skb, dataoff, map) != 0)
338 return -1;
339
340 /* Check the verification tag (Sec 8.5) */
341 if (!test_bit(SCTP_CID_INIT, (void *)map)
342 && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)
343 && !test_bit(SCTP_CID_COOKIE_ECHO, (void *)map)
344 && !test_bit(SCTP_CID_ABORT, (void *)map)
345 && !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map)
346 && (sh->vtag != conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
347 DEBUGP("Verification tag check failed\n");
348 return -1;
349 }
350
351 oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX;
352 for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
353 write_lock_bh(&sctp_lock);
354
355 /* Special cases of Verification tag check (Sec 8.5.1) */
356 if (sch->type == SCTP_CID_INIT) {
357 /* Sec 8.5.1 (A) */
358 if (sh->vtag != 0) {
359 write_unlock_bh(&sctp_lock);
360 return -1;
361 }
362 } else if (sch->type == SCTP_CID_ABORT) {
363 /* Sec 8.5.1 (B) */
364 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
365 && !(sh->vtag == conntrack->proto.sctp.vtag
366 [1 - CTINFO2DIR(ctinfo)])) {
367 write_unlock_bh(&sctp_lock);
368 return -1;
369 }
370 } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
371 /* Sec 8.5.1 (C) */
372 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
373 && !(sh->vtag == conntrack->proto.sctp.vtag
374 [1 - CTINFO2DIR(ctinfo)]
375 && (sch->flags & 1))) {
376 write_unlock_bh(&sctp_lock);
377 return -1;
378 }
379 } else if (sch->type == SCTP_CID_COOKIE_ECHO) {
380 /* Sec 8.5.1 (D) */
381 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
382 write_unlock_bh(&sctp_lock);
383 return -1;
384 }
385 }
386
387 oldsctpstate = conntrack->proto.sctp.state;
388 newconntrack = new_state(CTINFO2DIR(ctinfo), oldsctpstate, sch->type);
389
390 /* Invalid */
391 if (newconntrack == SCTP_CONNTRACK_MAX) {
392 DEBUGP("nf_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n",
393 CTINFO2DIR(ctinfo), sch->type, oldsctpstate);
394 write_unlock_bh(&sctp_lock);
395 return -1;
396 }
397
398 /* If it is an INIT or an INIT ACK note down the vtag */
399 if (sch->type == SCTP_CID_INIT
400 || sch->type == SCTP_CID_INIT_ACK) {
401 sctp_inithdr_t _inithdr, *ih;
402
403 ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
404 sizeof(_inithdr), &_inithdr);
405 if (ih == NULL) {
406 write_unlock_bh(&sctp_lock);
407 return -1;
408 }
409 DEBUGP("Setting vtag %x for dir %d\n",
410 ih->init_tag, !CTINFO2DIR(ctinfo));
411 conntrack->proto.sctp.vtag[!CTINFO2DIR(ctinfo)] = ih->init_tag;
412 }
413
414 conntrack->proto.sctp.state = newconntrack;
415 if (oldsctpstate != newconntrack)
416 nf_conntrack_event_cache(IPCT_PROTOINFO, skb);
417 write_unlock_bh(&sctp_lock);
418 }
419
420 nf_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]);
421
422 if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED
423 && CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY
424 && newconntrack == SCTP_CONNTRACK_ESTABLISHED) {
425 DEBUGP("Setting assured bit\n");
426 set_bit(IPS_ASSURED_BIT, &conntrack->status);
427 nf_conntrack_event_cache(IPCT_STATUS, skb);
428 }
429
430 return NF_ACCEPT;
431}
432
433/* Called when a new connection for this protocol found. */
434static int sctp_new(struct nf_conn *conntrack, const struct sk_buff *skb,
435 unsigned int dataoff)
436{
437 enum sctp_conntrack newconntrack;
438 sctp_sctphdr_t _sctph, *sh;
439 sctp_chunkhdr_t _sch, *sch;
440 u_int32_t offset, count;
441 char map[256 / sizeof (char)] = {0};
442
443 DEBUGP(__FUNCTION__);
444 DEBUGP("\n");
445
446 sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
447 if (sh == NULL)
448 return 0;
449
450 if (do_basic_checks(conntrack, skb, dataoff, map) != 0)
451 return 0;
452
453 /* If an OOTB packet has any of these chunks, discard it (Sec 8.4) */
454 if ((test_bit (SCTP_CID_ABORT, (void *)map))
455 || (test_bit (SCTP_CID_SHUTDOWN_COMPLETE, (void *)map))
456 || (test_bit (SCTP_CID_COOKIE_ACK, (void *)map))) {
457 return 0;
458 }
459
460 newconntrack = SCTP_CONNTRACK_MAX;
461 for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
462 /* Don't need lock here: this conntrack not in circulation yet */
463 newconntrack = new_state(IP_CT_DIR_ORIGINAL,
464 SCTP_CONNTRACK_NONE, sch->type);
465
466 /* Invalid: delete conntrack */
467 if (newconntrack == SCTP_CONNTRACK_MAX) {
468 DEBUGP("nf_conntrack_sctp: invalid new deleting.\n");
469 return 0;
470 }
471
472 /* Copy the vtag into the state info */
473 if (sch->type == SCTP_CID_INIT) {
474 if (sh->vtag == 0) {
475 sctp_inithdr_t _inithdr, *ih;
476
477 ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
478 sizeof(_inithdr), &_inithdr);
479 if (ih == NULL)
480 return 0;
481
482 DEBUGP("Setting vtag %x for new conn\n",
483 ih->init_tag);
484
485 conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] =
486 ih->init_tag;
487 } else {
488 /* Sec 8.5.1 (A) */
489 return 0;
490 }
491 }
492 /* If it is a shutdown ack OOTB packet, we expect a return
493 shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
494 else {
495 DEBUGP("Setting vtag %x for new conn OOTB\n",
496 sh->vtag);
497 conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
498 }
499
500 conntrack->proto.sctp.state = newconntrack;
501 }
502
503 return 1;
504}
505
506struct nf_conntrack_protocol nf_conntrack_protocol_sctp4 = {
507 .l3proto = PF_INET,
508 .proto = IPPROTO_SCTP,
509 .name = "sctp",
510 .pkt_to_tuple = sctp_pkt_to_tuple,
511 .invert_tuple = sctp_invert_tuple,
512 .print_tuple = sctp_print_tuple,
513 .print_conntrack = sctp_print_conntrack,
514 .packet = sctp_packet,
515 .new = sctp_new,
516 .destroy = NULL,
517 .me = THIS_MODULE
518};
519
520struct nf_conntrack_protocol nf_conntrack_protocol_sctp6 = {
521 .l3proto = PF_INET6,
522 .proto = IPPROTO_SCTP,
523 .name = "sctp",
524 .pkt_to_tuple = sctp_pkt_to_tuple,
525 .invert_tuple = sctp_invert_tuple,
526 .print_tuple = sctp_print_tuple,
527 .print_conntrack = sctp_print_conntrack,
528 .packet = sctp_packet,
529 .new = sctp_new,
530 .destroy = NULL,
531 .me = THIS_MODULE
532};
533
534#ifdef CONFIG_SYSCTL
535static ctl_table nf_ct_sysctl_table[] = {
536 {
537 .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED,
538 .procname = "nf_conntrack_sctp_timeout_closed",
539 .data = &nf_ct_sctp_timeout_closed,
540 .maxlen = sizeof(unsigned int),
541 .mode = 0644,
542 .proc_handler = &proc_dointvec_jiffies,
543 },
544 {
545 .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT,
546 .procname = "nf_conntrack_sctp_timeout_cookie_wait",
547 .data = &nf_ct_sctp_timeout_cookie_wait,
548 .maxlen = sizeof(unsigned int),
549 .mode = 0644,
550 .proc_handler = &proc_dointvec_jiffies,
551 },
552 {
553 .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED,
554 .procname = "nf_conntrack_sctp_timeout_cookie_echoed",
555 .data = &nf_ct_sctp_timeout_cookie_echoed,
556 .maxlen = sizeof(unsigned int),
557 .mode = 0644,
558 .proc_handler = &proc_dointvec_jiffies,
559 },
560 {
561 .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED,
562 .procname = "nf_conntrack_sctp_timeout_established",
563 .data = &nf_ct_sctp_timeout_established,
564 .maxlen = sizeof(unsigned int),
565 .mode = 0644,
566 .proc_handler = &proc_dointvec_jiffies,
567 },
568 {
569 .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT,
570 .procname = "nf_conntrack_sctp_timeout_shutdown_sent",
571 .data = &nf_ct_sctp_timeout_shutdown_sent,
572 .maxlen = sizeof(unsigned int),
573 .mode = 0644,
574 .proc_handler = &proc_dointvec_jiffies,
575 },
576 {
577 .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD,
578 .procname = "nf_conntrack_sctp_timeout_shutdown_recd",
579 .data = &nf_ct_sctp_timeout_shutdown_recd,
580 .maxlen = sizeof(unsigned int),
581 .mode = 0644,
582 .proc_handler = &proc_dointvec_jiffies,
583 },
584 {
585 .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT,
586 .procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent",
587 .data = &nf_ct_sctp_timeout_shutdown_ack_sent,
588 .maxlen = sizeof(unsigned int),
589 .mode = 0644,
590 .proc_handler = &proc_dointvec_jiffies,
591 },
592 { .ctl_name = 0 }
593};
594
595static ctl_table nf_ct_netfilter_table[] = {
596 {
597 .ctl_name = NET_NETFILTER,
598 .procname = "netfilter",
599 .mode = 0555,
600 .child = nf_ct_sysctl_table,
601 },
602 { .ctl_name = 0 }
603};
604
605static ctl_table nf_ct_net_table[] = {
606 {
607 .ctl_name = CTL_NET,
608 .procname = "net",
609 .mode = 0555,
610 .child = nf_ct_netfilter_table,
611 },
612 { .ctl_name = 0 }
613};
614
615static struct ctl_table_header *nf_ct_sysctl_header;
616#endif
617
618int __init init(void)
619{
620 int ret;
621
622 ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_sctp4);
623 if (ret) {
624 printk("nf_conntrack_proto_sctp4: protocol register failed\n");
625 goto out;
626 }
627 ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_sctp6);
628 if (ret) {
629 printk("nf_conntrack_proto_sctp6: protocol register failed\n");
630 goto cleanup_sctp4;
631 }
632
633#ifdef CONFIG_SYSCTL
634 nf_ct_sysctl_header = register_sysctl_table(nf_ct_net_table, 0);
635 if (nf_ct_sysctl_header == NULL) {
636 printk("nf_conntrack_proto_sctp: can't register to sysctl.\n");
637 goto cleanup;
638 }
639#endif
640
641 return ret;
642
643#ifdef CONFIG_SYSCTL
644 cleanup:
645 nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp6);
646#endif
647 cleanup_sctp4:
648 nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp4);
649 out:
650 DEBUGP("SCTP conntrack module loading %s\n",
651 ret ? "failed": "succeeded");
652 return ret;
653}
654
655void __exit fini(void)
656{
657 nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp6);
658 nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp4);
659#ifdef CONFIG_SYSCTL
660 unregister_sysctl_table(nf_ct_sysctl_header);
661#endif
662 DEBUGP("SCTP conntrack module unloaded\n");
663}
664
665module_init(init);
666module_exit(fini);
667
668MODULE_LICENSE("GPL");
669MODULE_AUTHOR("Kiran Kumar Immidi");
670MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP");
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
new file mode 100644
index 000000000000..83d90dd624f0
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -0,0 +1,1162 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>:
9 * - Real stateful connection tracking
10 * - Modified state transitions table
11 * - Window scaling support added
12 * - SACK support added
13 *
14 * Willy Tarreau:
15 * - State table bugfixes
16 * - More robust state changes
17 * - Tuning timer parameters
18 *
19 * 27 Oct 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
20 * - generalized Layer 3 protocol part.
21 *
22 * Derived from net/ipv4/netfilter/ip_conntrack_proto_tcp.c
23 *
24 * version 2.2
25 */
26
27#include <linux/config.h>
28#include <linux/types.h>
29#include <linux/sched.h>
30#include <linux/timer.h>
31#include <linux/netfilter.h>
32#include <linux/module.h>
33#include <linux/in.h>
34#include <linux/tcp.h>
35#include <linux/spinlock.h>
36#include <linux/skbuff.h>
37#include <linux/ipv6.h>
38#include <net/ip6_checksum.h>
39
40#include <net/tcp.h>
41
42#include <linux/netfilter.h>
43#include <linux/netfilter_ipv4.h>
44#include <linux/netfilter_ipv6.h>
45#include <net/netfilter/nf_conntrack.h>
46#include <net/netfilter/nf_conntrack_protocol.h>
47
48#if 0
49#define DEBUGP printk
50#define DEBUGP_VARS
51#else
52#define DEBUGP(format, args...)
53#endif
54
55/* Protects conntrack->proto.tcp */
56static DEFINE_RWLOCK(tcp_lock);
57
58/* "Be conservative in what you do,
59 be liberal in what you accept from others."
60 If it's non-zero, we mark only out of window RST segments as INVALID. */
61int nf_ct_tcp_be_liberal = 0;
62
63/* When a connection is picked up from the middle, how many packets are
64 required to pass in each direction before we assume we are in sync - if
65 either side uses window scaling, we have lost the game.
66 If it is set to zero, picking up already established connections is
67 disabled. */
68int nf_ct_tcp_loose = 3;
69
70/* Max number of the retransmitted packets without receiving an (acceptable)
71 ACK from the destination. If this number is reached, a shorter timer
72 will be started. */
73int nf_ct_tcp_max_retrans = 3;
74
75 /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
76 closely. They're more complex. --RR */
77
78static const char *tcp_conntrack_names[] = {
79 "NONE",
80 "SYN_SENT",
81 "SYN_RECV",
82 "ESTABLISHED",
83 "FIN_WAIT",
84 "CLOSE_WAIT",
85 "LAST_ACK",
86 "TIME_WAIT",
87 "CLOSE",
88 "LISTEN"
89};
90
91#define SECS * HZ
92#define MINS * 60 SECS
93#define HOURS * 60 MINS
94#define DAYS * 24 HOURS
95
96unsigned long nf_ct_tcp_timeout_syn_sent = 2 MINS;
97unsigned long nf_ct_tcp_timeout_syn_recv = 60 SECS;
98unsigned long nf_ct_tcp_timeout_established = 5 DAYS;
99unsigned long nf_ct_tcp_timeout_fin_wait = 2 MINS;
100unsigned long nf_ct_tcp_timeout_close_wait = 60 SECS;
101unsigned long nf_ct_tcp_timeout_last_ack = 30 SECS;
102unsigned long nf_ct_tcp_timeout_time_wait = 2 MINS;
103unsigned long nf_ct_tcp_timeout_close = 10 SECS;
104
105/* RFC1122 says the R2 limit should be at least 100 seconds.
106 Linux uses 15 packets as limit, which corresponds
107 to ~13-30min depending on RTO. */
108unsigned long nf_ct_tcp_timeout_max_retrans = 5 MINS;
109
110static unsigned long * tcp_timeouts[]
111= { NULL, /* TCP_CONNTRACK_NONE */
112 &nf_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */
113 &nf_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */
114 &nf_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */
115 &nf_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */
116 &nf_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */
117 &nf_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */
118 &nf_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */
119 &nf_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */
120 NULL, /* TCP_CONNTRACK_LISTEN */
121 };
122
123#define sNO TCP_CONNTRACK_NONE
124#define sSS TCP_CONNTRACK_SYN_SENT
125#define sSR TCP_CONNTRACK_SYN_RECV
126#define sES TCP_CONNTRACK_ESTABLISHED
127#define sFW TCP_CONNTRACK_FIN_WAIT
128#define sCW TCP_CONNTRACK_CLOSE_WAIT
129#define sLA TCP_CONNTRACK_LAST_ACK
130#define sTW TCP_CONNTRACK_TIME_WAIT
131#define sCL TCP_CONNTRACK_CLOSE
132#define sLI TCP_CONNTRACK_LISTEN
133#define sIV TCP_CONNTRACK_MAX
134#define sIG TCP_CONNTRACK_IGNORE
135
136/* What TCP flags are set from RST/SYN/FIN/ACK. */
137enum tcp_bit_set {
138 TCP_SYN_SET,
139 TCP_SYNACK_SET,
140 TCP_FIN_SET,
141 TCP_ACK_SET,
142 TCP_RST_SET,
143 TCP_NONE_SET,
144};
145
146/*
147 * The TCP state transition table needs a few words...
148 *
149 * We are the man in the middle. All the packets go through us
150 * but might get lost in transit to the destination.
151 * It is assumed that the destinations can't receive segments
152 * we haven't seen.
153 *
154 * The checked segment is in window, but our windows are *not*
155 * equivalent to those of the sender/receiver. We always
156 * try to guess the state of the current sender.
157 *
158 * The meanings of the states are:
159 *
160 * NONE: initial state
161 * SYN_SENT: SYN-only packet seen
162 * SYN_RECV: SYN-ACK packet seen
163 * ESTABLISHED: ACK packet seen
164 * FIN_WAIT: FIN packet seen
165 * CLOSE_WAIT: ACK seen (after FIN)
166 * LAST_ACK: FIN seen (after FIN)
167 * TIME_WAIT: last ACK seen
168 * CLOSE: closed connection
169 *
170 * LISTEN state is not used.
171 *
172 * Packets marked as IGNORED (sIG):
173 * if they may be either invalid or valid
174 * and the receiver may send back a connection
175 * closing RST or a SYN/ACK.
176 *
177 * Packets marked as INVALID (sIV):
178 * if they are invalid
179 * or we do not support the request (simultaneous open)
180 */
181static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
182 {
183/* ORIGINAL */
184/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
185/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV },
186/*
187 * sNO -> sSS Initialize a new connection
188 * sSS -> sSS Retransmitted SYN
189 * sSR -> sIG Late retransmitted SYN?
190 * sES -> sIG Error: SYNs in window outside the SYN_SENT state
191 * are errors. Receiver will reply with RST
192 * and close the connection.
193 * Or we are not in sync and hold a dead connection.
194 * sFW -> sIG
195 * sCW -> sIG
196 * sLA -> sIG
197 * sTW -> sSS Reopened connection (RFC 1122).
198 * sCL -> sSS
199 */
200/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
201/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
202/*
203 * A SYN/ACK from the client is always invalid:
204 * - either it tries to set up a simultaneous open, which is
205 * not supported;
206 * - or the firewall has just been inserted between the two hosts
207 * during the session set-up. The SYN will be retransmitted
208 * by the true client (or it'll time out).
209 */
210/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
211/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
212/*
213 * sNO -> sIV Too late and no reason to do anything...
214 * sSS -> sIV Client might not send FIN in this state:
215 * we enforce waiting for a SYN/ACK reply first.
216 * sSR -> sFW Close started.
217 * sES -> sFW
218 * sFW -> sLA FIN seen in both directions, waiting for
219 * the last ACK.
220 * Might be a retransmitted FIN as well...
221 * sCW -> sLA
222 * sLA -> sLA Retransmitted FIN. Remain in the same state.
223 * sTW -> sTW
224 * sCL -> sCL
225 */
226/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
227/*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
228/*
229 * sNO -> sES Assumed.
230 * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet.
231 * sSR -> sES Established state is reached.
232 * sES -> sES :-)
233 * sFW -> sCW Normal close request answered by ACK.
234 * sCW -> sCW
235 * sLA -> sTW Last ACK detected.
236 * sTW -> sTW Retransmitted last ACK. Remain in the same state.
237 * sCL -> sCL
238 */
239/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
240/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
241/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
242 },
243 {
244/* REPLY */
245/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
246/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
247/*
248 * sNO -> sIV Never reached.
249 * sSS -> sIV Simultaneous open, not supported
250 * sSR -> sIV Simultaneous open, not supported.
251 * sES -> sIV Server may not initiate a connection.
252 * sFW -> sIV
253 * sCW -> sIV
254 * sLA -> sIV
255 * sTW -> sIV Reopened connection, but server may not do it.
256 * sCL -> sIV
257 */
258/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
259/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV },
260/*
261 * sSS -> sSR Standard open.
262 * sSR -> sSR Retransmitted SYN/ACK.
263 * sES -> sIG Late retransmitted SYN/ACK?
264 * sFW -> sIG Might be SYN/ACK answering ignored SYN
265 * sCW -> sIG
266 * sLA -> sIG
267 * sTW -> sIG
268 * sCL -> sIG
269 */
270/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
271/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
272/*
273 * sSS -> sIV Server might not send FIN in this state.
274 * sSR -> sFW Close started.
275 * sES -> sFW
276 * sFW -> sLA FIN seen in both directions.
277 * sCW -> sLA
278 * sLA -> sLA Retransmitted FIN.
279 * sTW -> sTW
280 * sCL -> sCL
281 */
282/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
283/*ack*/ { sIV, sIV, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV },
284/*
285 * sSS -> sIV Might be a half-open connection.
286 * sSR -> sSR Might answer late resent SYN.
287 * sES -> sES :-)
288 * sFW -> sCW Normal close request answered by ACK.
289 * sCW -> sCW
290 * sLA -> sTW Last ACK detected.
291 * sTW -> sTW Retransmitted last ACK.
292 * sCL -> sCL
293 */
294/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
295/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
296/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
297 }
298};
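/*
 * Editor's illustration (not part of the kernel source): a minimal
 * userspace walk of the three-way handshake through a cut-down table
 * with the same [dir][event][state] shape as tcp_conntracks above.
 * All names below (demo, d_orig, ...) are hypothetical; transitions
 * not listed simply fall back to 0 in this sketch.
 */
#include <stdio.h>

enum st { sno, sss, ssr, ses, st_max };
enum ev { ev_syn, ev_synack, ev_ack };
enum dirn { d_orig, d_reply };

static const enum st demo[2][3][st_max] = {
	[d_orig][ev_syn][sno]     = sss,	/* client SYN opens flow */
	[d_reply][ev_synack][sss] = ssr,	/* server answers SYN/ACK */
	[d_orig][ev_ack][ssr]     = ses,	/* final ACK: ESTABLISHED */
};

int main(void)
{
	enum st s = sno;

	s = demo[d_orig][ev_syn][s];		/* sNO -> sSS */
	s = demo[d_reply][ev_synack][s];	/* sSS -> sSR */
	s = demo[d_orig][ev_ack][s];		/* sSR -> sES */
	printf("final state %d == sES %d\n", (int)s, (int)ses);
	return 0;
}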
299
300static int tcp_pkt_to_tuple(const struct sk_buff *skb,
301 unsigned int dataoff,
302 struct nf_conntrack_tuple *tuple)
303{
304 struct tcphdr _hdr, *hp;
305
306 /* Actually only need first 8 bytes. */
307 hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
308 if (hp == NULL)
309 return 0;
310
311 tuple->src.u.tcp.port = hp->source;
312 tuple->dst.u.tcp.port = hp->dest;
313
314 return 1;
315}
316
317static int tcp_invert_tuple(struct nf_conntrack_tuple *tuple,
318 const struct nf_conntrack_tuple *orig)
319{
320 tuple->src.u.tcp.port = orig->dst.u.tcp.port;
321 tuple->dst.u.tcp.port = orig->src.u.tcp.port;
322 return 1;
323}
324
325/* Print out the per-protocol part of the tuple. */
326static int tcp_print_tuple(struct seq_file *s,
327 const struct nf_conntrack_tuple *tuple)
328{
329 return seq_printf(s, "sport=%hu dport=%hu ",
330 ntohs(tuple->src.u.tcp.port),
331 ntohs(tuple->dst.u.tcp.port));
332}
333
334/* Print out the private part of the conntrack. */
335static int tcp_print_conntrack(struct seq_file *s,
336 const struct nf_conn *conntrack)
337{
338 enum tcp_conntrack state;
339
340 read_lock_bh(&tcp_lock);
341 state = conntrack->proto.tcp.state;
342 read_unlock_bh(&tcp_lock);
343
344 return seq_printf(s, "%s ", tcp_conntrack_names[state]);
345}
346
347static unsigned int get_conntrack_index(const struct tcphdr *tcph)
348{
349 if (tcph->rst) return TCP_RST_SET;
350 else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
351 else if (tcph->fin) return TCP_FIN_SET;
352 else if (tcph->ack) return TCP_ACK_SET;
353 else return TCP_NONE_SET;
354}
355
356/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
357 in IP Filter' by Guido van Rooij.
358
359 http://www.nluug.nl/events/sane2000/papers.html
360 http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz
361
362 The boundaries and the conditions are changed according to RFC793:
363 the packet must intersect the window (i.e. segments may be
364 after the right or before the left edge) and thus receivers may ACK
365 segments after the right edge of the window.
366
367 td_maxend = max(sack + max(win,1)) seen in reply packets
368 td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
369 td_maxwin += seq + len - sender.td_maxend
370 if seq + len > sender.td_maxend
371 td_end = max(seq + len) seen in sent packets
372
373 I. Upper bound for valid data: seq <= sender.td_maxend
374 II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin
375 III. Upper bound for valid ack: sack <= receiver.td_end
376 IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW
377
378 where sack is the highest right edge of sack block found in the packet.
379
380 The upper bound limit for a valid ack is not ignored -
381 we don't have to deal with fragments.
382*/
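/*
 * Editor's illustration: conditions I-IV above expressed as a
 * standalone predicate.  before()/after() mirror the kernel's modular
 * 32-bit sequence-number comparison; struct td, in_window() and the
 * sample values are hypothetical demo scaffolding, not part of this
 * file.
 */
#include <stdint.h>

#define before(a, b)	((int32_t)((a) - (b)) < 0)
#define after(a, b)	before(b, a)

struct td { uint32_t end, maxend, maxwin; };

static int in_window(const struct td *snd, const struct td *rcv,
		     uint32_t seq, uint32_t end, uint32_t ack,
		     uint32_t sack, uint32_t maxackwin)
{
	return before(seq, snd->maxend + 1) &&			/* I   */
	       after(end, snd->end - rcv->maxwin - 1) &&	/* II  */
	       before(sack, rcv->end + 1) &&			/* III */
	       after(ack, rcv->end - maxackwin);		/* IV  */
}

int main(void)
{
	struct td snd = { .end = 1000, .maxend = 2000, .maxwin = 500 };
	struct td rcv = { .end = 5000, .maxend = 6000, .maxwin = 500 };

	/* A 100-byte segment at seq 1500 acking 4900: in window. */
	return !in_window(&snd, &rcv, 1500, 1600, 4900, 4900, 66000);
}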
383
384static inline __u32 segment_seq_plus_len(__u32 seq,
385 size_t len,
386 unsigned int dataoff,
387 struct tcphdr *tcph)
388{
389 /* XXX Should I use payload length field in IP/IPv6 header ?
390 * - YK */
391 return (seq + len - dataoff - tcph->doff*4
392 + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
393}
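/*
 * Worked example (editor's addition): for a bare SYN with a 20-byte
 * header (doff = 5) and no payload, skb->len - dataoff == 20, so the
 * expression above reduces to seq + 20 - 20 + 1 + 0 = seq + 1; the
 * SYN itself consumes one sequence number, and a FIN does likewise.
 */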
394
395/* Fixme: what about big packets? */
396#define MAXACKWINCONST 66000
397#define MAXACKWINDOW(sender) \
398 ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \
399 : MAXACKWINCONST)
400
401/*
402 * Simplified tcp_parse_options routine from tcp_input.c
403 */
404static void tcp_options(const struct sk_buff *skb,
405 unsigned int dataoff,
406 struct tcphdr *tcph,
407 struct ip_ct_tcp_state *state)
408{
409 unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
410 unsigned char *ptr;
411 int length = (tcph->doff*4) - sizeof(struct tcphdr);
412
413 if (!length)
414 return;
415
416 ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
417 length, buff);
418 BUG_ON(ptr == NULL);
419
420 state->td_scale =
421 state->flags = 0;
422
423 while (length > 0) {
424 int opcode = *ptr++;
425 int opsize;
426
427 switch (opcode) {
428 case TCPOPT_EOL:
429 return;
430 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
431 length--;
432 continue;
433 default:
434 opsize = *ptr++;
435 if (opsize < 2) /* "silly options" */
436 return;
437 if (opsize > length)
438 break; /* don't parse partial options */
439
440 if (opcode == TCPOPT_SACK_PERM
441 && opsize == TCPOLEN_SACK_PERM)
442 state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
443 else if (opcode == TCPOPT_WINDOW
444 && opsize == TCPOLEN_WINDOW) {
445 state->td_scale = *(u_int8_t *)ptr;
446
447 if (state->td_scale > 14) {
448 /* See RFC1323 */
449 state->td_scale = 14;
450 }
451 state->flags |=
452 IP_CT_TCP_FLAG_WINDOW_SCALE;
453 }
454 ptr += opsize - 2;
455 length -= opsize;
456 }
457 }
458}
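/*
 * Editor's illustration: the option wire layout that tcp_options()
 * walks.  Kind 1 is NOP, kind 0 is EOL, and kind 3 with length 3 is
 * window scale, so the bytes {3, 3, 7} advertise a shift of 7.  The
 * demo below is hypothetical userspace code, not kernel source.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint8_t opts[] = { 1, 1, 3, 3, 7 };	/* NOP, NOP, WS=7 */
	unsigned int i = 0, scale = 0;

	while (i < sizeof(opts)) {
		if (opts[i] == 0)		/* EOL: stop parsing */
			break;
		if (opts[i] == 1) {		/* NOP: one byte     */
			i++;
			continue;
		}
		if (opts[i] == 3 && opts[i + 1] == 3)
			scale = opts[i + 2] > 14 ? 14 : opts[i + 2];
		i += opts[i + 1];		/* skip whole option */
	}
	printf("td_scale = %u\n", scale);
	return 0;
}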
459
460static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
461 struct tcphdr *tcph, __u32 *sack)
462{
463 unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
464 unsigned char *ptr;
465 int length = (tcph->doff*4) - sizeof(struct tcphdr);
466 __u32 tmp;
467
468 if (!length)
469 return;
470
471 ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
472 length, buff);
473 BUG_ON(ptr == NULL);
474
475 /* Fast path for timestamp-only option */
476 if (length == TCPOLEN_TSTAMP_ALIGNED*4
477 && *(__u32 *)ptr ==
478 __constant_ntohl((TCPOPT_NOP << 24)
479 | (TCPOPT_NOP << 16)
480 | (TCPOPT_TIMESTAMP << 8)
481 | TCPOLEN_TIMESTAMP))
482 return;
483
484 while (length > 0) {
485 int opcode = *ptr++;
486 int opsize, i;
487
488 switch (opcode) {
489 case TCPOPT_EOL:
490 return;
491 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
492 length--;
493 continue;
494 default:
495 opsize = *ptr++;
496 if (opsize < 2) /* "silly options" */
497 return;
498 if (opsize > length)
499 break; /* don't parse partial options */
500
501 if (opcode == TCPOPT_SACK
502 && opsize >= (TCPOLEN_SACK_BASE
503 + TCPOLEN_SACK_PERBLOCK)
504 && !((opsize - TCPOLEN_SACK_BASE)
505 % TCPOLEN_SACK_PERBLOCK)) {
506 for (i = 0;
507 i < (opsize - TCPOLEN_SACK_BASE);
508 i += TCPOLEN_SACK_PERBLOCK) {
509 memcpy(&tmp, (__u32 *)(ptr + i) + 1,
510 sizeof(__u32));
511 tmp = ntohl(tmp);
512
513 if (after(tmp, *sack))
514 *sack = tmp;
515 }
516 return;
517 }
518 ptr += opsize - 2;
519 length -= opsize;
520 }
521 }
522}
523
524static int tcp_in_window(struct ip_ct_tcp *state,
525 enum ip_conntrack_dir dir,
526 unsigned int index,
527 const struct sk_buff *skb,
528 unsigned int dataoff,
529 struct tcphdr *tcph,
530 int pf)
531{
532 struct ip_ct_tcp_state *sender = &state->seen[dir];
533 struct ip_ct_tcp_state *receiver = &state->seen[!dir];
534 __u32 seq, ack, sack, end, win, swin;
535 int res;
536
537 /*
538 * Get the required data from the packet.
539 */
540 seq = ntohl(tcph->seq);
541 ack = sack = ntohl(tcph->ack_seq);
542 win = ntohs(tcph->window);
543 end = segment_seq_plus_len(seq, skb->len, dataoff, tcph);
544
545 if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
546 tcp_sack(skb, dataoff, tcph, &sack);
547
548 DEBUGP("tcp_in_window: START\n");
549 DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
550 "seq=%u ack=%u sack=%u win=%u end=%u\n",
551 NIPQUAD(iph->saddr), ntohs(tcph->source),
552 NIPQUAD(iph->daddr), ntohs(tcph->dest),
553 seq, ack, sack, win, end);
554 DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
555 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
556 sender->td_end, sender->td_maxend, sender->td_maxwin,
557 sender->td_scale,
558 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
559 receiver->td_scale);
560
561 if (sender->td_end == 0) {
562 /*
563 * Initialize sender data.
564 */
565 if (tcph->syn && tcph->ack) {
566 /*
567 * Outgoing SYN-ACK in reply to a SYN.
568 */
569 sender->td_end =
570 sender->td_maxend = end;
571 sender->td_maxwin = (win == 0 ? 1 : win);
572
573 tcp_options(skb, dataoff, tcph, sender);
574 /*
575 * RFC 1323:
576 * Both sides must send the Window Scale option
577 * to enable window scaling in either direction.
578 */
579 if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
580 && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
581 sender->td_scale =
582 receiver->td_scale = 0;
583 } else {
584 /*
585 * We are in the middle of a connection,
586 * its history is lost for us.
587 * Let's try to use the data from the packet.
588 */
589 sender->td_end = end;
590 sender->td_maxwin = (win == 0 ? 1 : win);
591 sender->td_maxend = end + sender->td_maxwin;
592 }
593 } else if (((state->state == TCP_CONNTRACK_SYN_SENT
594 && dir == IP_CT_DIR_ORIGINAL)
595 || (state->state == TCP_CONNTRACK_SYN_RECV
596 && dir == IP_CT_DIR_REPLY))
597 && after(end, sender->td_end)) {
598 /*
599 * RFC 793: "if a TCP is reinitialized ... then it need
600 * not wait at all; it must only be sure to use sequence
601 * numbers larger than those recently used."
602 */
603 sender->td_end =
604 sender->td_maxend = end;
605 sender->td_maxwin = (win == 0 ? 1 : win);
606
607 tcp_options(skb, dataoff, tcph, sender);
608 }
609
610 if (!(tcph->ack)) {
611 /*
612 * If there is no ACK, just pretend it was set and OK.
613 */
614 ack = sack = receiver->td_end;
615 } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
616 (TCP_FLAG_ACK|TCP_FLAG_RST))
617 && (ack == 0)) {
618 /*
619 * Broken TCP stacks that set ACK in RST packets as well,
620 * with a zero ack value.
621 */
622 ack = sack = receiver->td_end;
623 }
624
625 if (seq == end
626 && (!tcph->rst
627 || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)))
628 /*
629 * Packet contains no data: we assume it is valid
630 * and check the ack value only.
631 * However, RST segments are always validated by their
632 * SEQ number, except when seq == 0 (reset sent answering
633 * a SYN).
634 */
635 seq = end = sender->td_end;
636
637 DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
638 "seq=%u ack=%u sack =%u win=%u end=%u\n",
639 NIPQUAD(iph->saddr), ntohs(tcph->source),
640 NIPQUAD(iph->daddr), ntohs(tcph->dest),
641 seq, ack, sack, win, end);
642 DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
643 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
644 sender->td_end, sender->td_maxend, sender->td_maxwin,
645 sender->td_scale,
646 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
647 receiver->td_scale);
648
649 DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
650 before(seq, sender->td_maxend + 1),
651 after(end, sender->td_end - receiver->td_maxwin - 1),
652 before(sack, receiver->td_end + 1),
653 after(ack, receiver->td_end - MAXACKWINDOW(sender)));
654
655 if (sender->loose || receiver->loose ||
656 (before(seq, sender->td_maxend + 1) &&
657 after(end, sender->td_end - receiver->td_maxwin - 1) &&
658 before(sack, receiver->td_end + 1) &&
659 after(ack, receiver->td_end - MAXACKWINDOW(sender)))) {
660 /*
661 * Take into account window scaling (RFC 1323).
662 */
663 if (!tcph->syn)
664 win <<= sender->td_scale;
665
666 /*
667 * Update sender data.
668 */
669 swin = win + (sack - ack);
670 if (sender->td_maxwin < swin)
671 sender->td_maxwin = swin;
672 if (after(end, sender->td_end))
673 sender->td_end = end;
674 /*
675 * Update receiver data.
676 */
677 if (after(end, sender->td_maxend))
678 receiver->td_maxwin += end - sender->td_maxend;
679 if (after(sack + win, receiver->td_maxend - 1)) {
680 receiver->td_maxend = sack + win;
681 if (win == 0)
682 receiver->td_maxend++;
683 }
684
685 /*
686 * Check retransmissions.
687 */
688 if (index == TCP_ACK_SET) {
689 if (state->last_dir == dir
690 && state->last_seq == seq
691 && state->last_ack == ack
692 && state->last_end == end)
693 state->retrans++;
694 else {
695 state->last_dir = dir;
696 state->last_seq = seq;
697 state->last_ack = ack;
698 state->last_end = end;
699 state->retrans = 0;
700 }
701 }
702 /*
703 * Close the window of disabled window tracking :-)
704 */
705 if (sender->loose)
706 sender->loose--;
707
708 res = 1;
709 } else {
710 if (LOG_INVALID(IPPROTO_TCP))
711 nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
712 "nf_ct_tcp: %s ",
713 before(seq, sender->td_maxend + 1) ?
714 after(end, sender->td_end - receiver->td_maxwin - 1) ?
715 before(sack, receiver->td_end + 1) ?
716 after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG"
717 : "ACK is under the lower bound (possible overly delayed ACK)"
718 : "ACK is over the upper bound (ACKed data not seen yet)"
719 : "SEQ is under the lower bound (already ACKed data retransmitted)"
720 : "SEQ is over the upper bound (over the window of the receiver)");
721
722 res = nf_ct_tcp_be_liberal;
723 }
724
725 DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u "
726 "receiver end=%u maxend=%u maxwin=%u\n",
727 res, sender->td_end, sender->td_maxend, sender->td_maxwin,
728 receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
729
730 return res;
731}
732
733#ifdef CONFIG_IP_NF_NAT_NEEDED
734/* Update sender->td_end after NAT successfully mangled the packet */
735/* Caller must linearize skb at tcp header. */
736void nf_conntrack_tcp_update(struct sk_buff *skb,
737 unsigned int dataoff,
738 struct nf_conn *conntrack,
739 int dir)
740{
741 struct tcphdr *tcph = (void *)skb->data + dataoff;
742 __u32 end;
743#ifdef DEBUGP_VARS
744 struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir];
745 struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir];
746#endif
747
748 end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph);
749
750 write_lock_bh(&tcp_lock);
751 /*
752 * We have to worry for the ack in the reply packet only...
753 */
754 if (after(end, conntrack->proto.tcp.seen[dir].td_end))
755 conntrack->proto.tcp.seen[dir].td_end = end;
756 conntrack->proto.tcp.last_end = end;
757 write_unlock_bh(&tcp_lock);
758 DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
759 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
760 sender->td_end, sender->td_maxend, sender->td_maxwin,
761 sender->td_scale,
762 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
763 receiver->td_scale);
764}
765
766#endif
767
768#define TH_FIN 0x01
769#define TH_SYN 0x02
770#define TH_RST 0x04
771#define TH_PUSH 0x08
772#define TH_ACK 0x10
773#define TH_URG 0x20
774#define TH_ECE 0x40
775#define TH_CWR 0x80
776
777/* table of valid flag combinations - ECE and CWR are always valid */
778static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] =
779{
780 [TH_SYN] = 1,
781 [TH_SYN|TH_ACK] = 1,
782 [TH_SYN|TH_ACK|TH_PUSH] = 1,
783 [TH_RST] = 1,
784 [TH_RST|TH_ACK] = 1,
785 [TH_RST|TH_ACK|TH_PUSH] = 1,
786 [TH_FIN|TH_ACK] = 1,
787 [TH_ACK] = 1,
788 [TH_ACK|TH_PUSH] = 1,
789 [TH_ACK|TH_URG] = 1,
790 [TH_ACK|TH_URG|TH_PUSH] = 1,
791 [TH_FIN|TH_ACK|TH_PUSH] = 1,
792 [TH_FIN|TH_ACK|TH_URG] = 1,
793 [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1,
794};
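/*
 * Editor's note: tcp_error() below indexes this table with the
 * header's flag byte after masking off ECE and CWR.  For example a
 * SYN/ACK segment carries 0x12 in byte 13 of the TCP header, the
 * mask leaves 0x12 untouched, and tcp_valid_flags[0x12] == 1.
 */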
795
796/* Protect conntrack against broken packets. Code taken from ipt_unclean.c. */
797static int tcp_error(struct sk_buff *skb,
798 unsigned int dataoff,
799 enum ip_conntrack_info *ctinfo,
800 int pf,
801 unsigned int hooknum,
802 int (*csum)(const struct sk_buff *, unsigned int))
803{
804 struct tcphdr _tcph, *th;
805 unsigned int tcplen = skb->len - dataoff;
806 u_int8_t tcpflags;
807
808 /* Smaller than the minimal TCP header? */
809 th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
810 if (th == NULL) {
811 if (LOG_INVALID(IPPROTO_TCP))
812 nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
813 "nf_ct_tcp: short packet ");
814 return -NF_ACCEPT;
815 }
816
817 /* Not whole TCP header or malformed packet */
818 if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
819 if (LOG_INVALID(IPPROTO_TCP))
820 nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
821 "nf_ct_tcp: truncated/malformed packet ");
822 return -NF_ACCEPT;
823 }
824
825 /* Checksum invalid? Ignore.
826 * We skip checking packets on the outgoing path
827 * because the semantics of CHECKSUM_HW are different there
828 * and moreover root might send raw packets.
829 */
830 /* FIXME: Source route IP option packets --RR */
831 if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
832 (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING))
833 && skb->ip_summed != CHECKSUM_UNNECESSARY
834 && csum(skb, dataoff)) {
835 if (LOG_INVALID(IPPROTO_TCP))
836 nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
837 "nf_ct_tcp: bad TCP checksum ");
838 return -NF_ACCEPT;
839 }
840
841 /* Check TCP flags. */
842 tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR));
843 if (!tcp_valid_flags[tcpflags]) {
844 if (LOG_INVALID(IPPROTO_TCP))
845 nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
846 "nf_ct_tcp: invalid TCP flag combination ");
847 return -NF_ACCEPT;
848 }
849
850 return NF_ACCEPT;
851}
852
853static int csum4(const struct sk_buff *skb, unsigned int dataoff)
854{
855 return csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr,
856 skb->len - dataoff, IPPROTO_TCP,
857 skb->ip_summed == CHECKSUM_HW ? skb->csum
858 : skb_checksum(skb, dataoff,
859 skb->len - dataoff, 0));
860}
861
862static int csum6(const struct sk_buff *skb, unsigned int dataoff)
863{
864 return csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr,
865 skb->len - dataoff, IPPROTO_TCP,
866 skb->ip_summed == CHECKSUM_HW ? skb->csum
867 : skb_checksum(skb, dataoff, skb->len - dataoff,
868 0));
869}
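/*
 * Editor's note: csum_tcpudp_magic()/csum_ipv6_magic() fold the
 * pseudo-header into the checksum accumulated over the segment
 * (including the checksum field itself) and return 0 when the
 * segment verifies, so a non-zero result from csum4()/csum6() above
 * signals a corrupt packet to tcp_error().
 */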
870
871static int tcp_error4(struct sk_buff *skb,
872 unsigned int dataoff,
873 enum ip_conntrack_info *ctinfo,
874 int pf,
875 unsigned int hooknum)
876{
877 return tcp_error(skb, dataoff, ctinfo, pf, hooknum, csum4);
878}
879
880static int tcp_error6(struct sk_buff *skb,
881 unsigned int dataoff,
882 enum ip_conntrack_info *ctinfo,
883 int pf,
884 unsigned int hooknum)
885{
886 return tcp_error(skb, dataoff, ctinfo, pf, hooknum, csum6);
887}
888
889/* Returns verdict for packet, or -1 for invalid. */
890static int tcp_packet(struct nf_conn *conntrack,
891 const struct sk_buff *skb,
892 unsigned int dataoff,
893 enum ip_conntrack_info ctinfo,
894 int pf,
895 unsigned int hooknum)
896{
897 enum tcp_conntrack new_state, old_state;
898 enum ip_conntrack_dir dir;
899 struct tcphdr *th, _tcph;
900 unsigned long timeout;
901 unsigned int index;
902
903 th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
904 BUG_ON(th == NULL);
905
906 write_lock_bh(&tcp_lock);
907 old_state = conntrack->proto.tcp.state;
908 dir = CTINFO2DIR(ctinfo);
909 index = get_conntrack_index(th);
910 new_state = tcp_conntracks[dir][index][old_state];
911
912 switch (new_state) {
913 case TCP_CONNTRACK_IGNORE:
914 /* Either SYN in ORIGINAL
915 * or SYN/ACK in REPLY. */
916 if (index == TCP_SYNACK_SET
917 && conntrack->proto.tcp.last_index == TCP_SYN_SET
918 && conntrack->proto.tcp.last_dir != dir
919 && ntohl(th->ack_seq) ==
920 conntrack->proto.tcp.last_end) {
921 /* This SYN/ACK acknowledges a SYN that we earlier
922 * ignored as invalid. This means that the client and
923 * the server are both in sync, while the firewall is
924 * not. We kill this session and block the SYN/ACK so
925 * that the client cannot but retransmit its SYN and
926 * thus initiate a clean new session.
927 */
928 write_unlock_bh(&tcp_lock);
929 if (LOG_INVALID(IPPROTO_TCP))
930 nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
931 "nf_ct_tcp: killing out of sync session ");
932 if (del_timer(&conntrack->timeout))
933 conntrack->timeout.function((unsigned long)
934 conntrack);
935 return -NF_DROP;
936 }
937 conntrack->proto.tcp.last_index = index;
938 conntrack->proto.tcp.last_dir = dir;
939 conntrack->proto.tcp.last_seq = ntohl(th->seq);
940 conntrack->proto.tcp.last_end =
941 segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
942
943 write_unlock_bh(&tcp_lock);
944 if (LOG_INVALID(IPPROTO_TCP))
945 nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
946 "nf_ct_tcp: invalid packet ignored ");
947 return NF_ACCEPT;
948 case TCP_CONNTRACK_MAX:
949 /* Invalid packet */
950 DEBUGP("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
951 dir, get_conntrack_index(th),
952 old_state);
953 write_unlock_bh(&tcp_lock);
954 if (LOG_INVALID(IPPROTO_TCP))
955 nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
956 "nf_ct_tcp: invalid state ");
957 return -NF_ACCEPT;
958 case TCP_CONNTRACK_SYN_SENT:
959 if (old_state < TCP_CONNTRACK_TIME_WAIT)
960 break;
961 if ((conntrack->proto.tcp.seen[dir].flags &
962 IP_CT_TCP_FLAG_CLOSE_INIT)
963 || after(ntohl(th->seq),
964 conntrack->proto.tcp.seen[dir].td_end)) {
965 /* Attempt to reopen a closed connection.
966 * Delete this connection and look up again. */
967 write_unlock_bh(&tcp_lock);
968 if (del_timer(&conntrack->timeout))
969 conntrack->timeout.function((unsigned long)
970 conntrack);
971 return -NF_REPEAT;
972 }
973 case TCP_CONNTRACK_CLOSE:
974 if (index == TCP_RST_SET
975 && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)
976 && conntrack->proto.tcp.last_index == TCP_SYN_SET
977 && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) {
978 /* RST sent to an invalid SYN we had let through:
979 * the SYN was in window then, so tear down the connection.
980 * We skip window checking, because packet might ACK
981 * segments we ignored in the SYN. */
982 goto in_window;
983 }
984 /* Just fall through */
985 default:
986 /* Keep compilers happy. */
987 break;
988 }
989
990 if (!tcp_in_window(&conntrack->proto.tcp, dir, index,
991 skb, dataoff, th, pf)) {
992 write_unlock_bh(&tcp_lock);
993 return -NF_ACCEPT;
994 }
995 in_window:
996 /* From now on we have got in-window packets */
997 conntrack->proto.tcp.last_index = index;
998
999 DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
1000 "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
1001 NIPQUAD(iph->saddr), ntohs(th->source),
1002 NIPQUAD(iph->daddr), ntohs(th->dest),
1003 (th->syn ? 1 : 0), (th->ack ? 1 : 0),
1004 (th->fin ? 1 : 0), (th->rst ? 1 : 0),
1005 old_state, new_state);
1006
1007 conntrack->proto.tcp.state = new_state;
1008 if (old_state != new_state
1009 && (new_state == TCP_CONNTRACK_FIN_WAIT
1010 || new_state == TCP_CONNTRACK_CLOSE))
1011 conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
1012 timeout = conntrack->proto.tcp.retrans >= nf_ct_tcp_max_retrans
1013 && *tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans
1014 ? nf_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
1015 write_unlock_bh(&tcp_lock);
1016
1017 nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
1018 if (new_state != old_state)
1019 nf_conntrack_event_cache(IPCT_PROTOINFO, skb);
1020
1021 if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
1022 /* If the only reply is a RST, we can consider ourselves not to
1023 have an established connection: this is a fairly common
1024 problem case, so we can delete the conntrack
1025 immediately. --RR */
1026 if (th->rst) {
1027 if (del_timer(&conntrack->timeout))
1028 conntrack->timeout.function((unsigned long)
1029 conntrack);
1030 return NF_ACCEPT;
1031 }
1032 } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status)
1033 && (old_state == TCP_CONNTRACK_SYN_RECV
1034 || old_state == TCP_CONNTRACK_ESTABLISHED)
1035 && new_state == TCP_CONNTRACK_ESTABLISHED) {
1036 /* Set ASSURED if we see a valid ack in ESTABLISHED
1037 after SYN_RECV or a valid answer for a picked up
1038 connection. */
1039 set_bit(IPS_ASSURED_BIT, &conntrack->status);
1040 nf_conntrack_event_cache(IPCT_STATUS, skb);
1041 }
1042 nf_ct_refresh_acct(conntrack, ctinfo, skb, timeout);
1043
1044 return NF_ACCEPT;
1045}
1046
1047/* Called when a new connection for this protocol found. */
1048static int tcp_new(struct nf_conn *conntrack,
1049 const struct sk_buff *skb,
1050 unsigned int dataoff)
1051{
1052 enum tcp_conntrack new_state;
1053 struct tcphdr *th, _tcph;
1054#ifdef DEBUGP_VARS
1055 struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0];
1056 struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1];
1057#endif
1058
1059 th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
1060 BUG_ON(th == NULL);
1061
1062 /* Don't need a lock here: this conntrack is not in circulation yet */
1063 new_state
1064 = tcp_conntracks[0][get_conntrack_index(th)]
1065 [TCP_CONNTRACK_NONE];
1066
1067 /* Invalid: delete conntrack */
1068 if (new_state >= TCP_CONNTRACK_MAX) {
1069 DEBUGP("nf_ct_tcp: invalid new deleting.\n");
1070 return 0;
1071 }
1072
1073 if (new_state == TCP_CONNTRACK_SYN_SENT) {
1074 /* SYN packet */
1075 conntrack->proto.tcp.seen[0].td_end =
1076 segment_seq_plus_len(ntohl(th->seq), skb->len,
1077 dataoff, th);
1078 conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1079 if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
1080 conntrack->proto.tcp.seen[0].td_maxwin = 1;
1081 conntrack->proto.tcp.seen[0].td_maxend =
1082 conntrack->proto.tcp.seen[0].td_end;
1083
1084 tcp_options(skb, dataoff, th, &conntrack->proto.tcp.seen[0]);
1085 conntrack->proto.tcp.seen[1].flags = 0;
1086 conntrack->proto.tcp.seen[0].loose =
1087 conntrack->proto.tcp.seen[1].loose = 0;
1088 } else if (nf_ct_tcp_loose == 0) {
1089 /* Don't try to pick up connections. */
1090 return 0;
1091 } else {
1092 /*
1093 * We are in the middle of a connection,
1094 * its history is lost for us.
1095 * Let's try to use the data from the packet.
1096 */
1097 conntrack->proto.tcp.seen[0].td_end =
1098 segment_seq_plus_len(ntohl(th->seq), skb->len,
1099 dataoff, th);
1100 conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1101 if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
1102 conntrack->proto.tcp.seen[0].td_maxwin = 1;
1103 conntrack->proto.tcp.seen[0].td_maxend =
1104 conntrack->proto.tcp.seen[0].td_end +
1105 conntrack->proto.tcp.seen[0].td_maxwin;
1106 conntrack->proto.tcp.seen[0].td_scale = 0;
1107
1108 /* We assume SACK. Should we assume window scaling too? */
1109 conntrack->proto.tcp.seen[0].flags =
1110 conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM;
1111 conntrack->proto.tcp.seen[0].loose =
1112 conntrack->proto.tcp.seen[1].loose = nf_ct_tcp_loose;
1113 }
1114
1115 conntrack->proto.tcp.seen[1].td_end = 0;
1116 conntrack->proto.tcp.seen[1].td_maxend = 0;
1117 conntrack->proto.tcp.seen[1].td_maxwin = 1;
1118 conntrack->proto.tcp.seen[1].td_scale = 0;
1119
1120 /* tcp_packet will set them */
1121 conntrack->proto.tcp.state = TCP_CONNTRACK_NONE;
1122 conntrack->proto.tcp.last_index = TCP_NONE_SET;
1123
1124 DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
1125 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
1126 sender->td_end, sender->td_maxend, sender->td_maxwin,
1127 sender->td_scale,
1128 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
1129 receiver->td_scale);
1130 return 1;
1131}
1132
1133struct nf_conntrack_protocol nf_conntrack_protocol_tcp4 =
1134{
1135 .l3proto = PF_INET,
1136 .proto = IPPROTO_TCP,
1137 .name = "tcp",
1138 .pkt_to_tuple = tcp_pkt_to_tuple,
1139 .invert_tuple = tcp_invert_tuple,
1140 .print_tuple = tcp_print_tuple,
1141 .print_conntrack = tcp_print_conntrack,
1142 .packet = tcp_packet,
1143 .new = tcp_new,
1144 .error = tcp_error4,
1145};
1146
1147struct nf_conntrack_protocol nf_conntrack_protocol_tcp6 =
1148{
1149 .l3proto = PF_INET6,
1150 .proto = IPPROTO_TCP,
1151 .name = "tcp",
1152 .pkt_to_tuple = tcp_pkt_to_tuple,
1153 .invert_tuple = tcp_invert_tuple,
1154 .print_tuple = tcp_print_tuple,
1155 .print_conntrack = tcp_print_conntrack,
1156 .packet = tcp_packet,
1157 .new = tcp_new,
1158 .error = tcp_error6,
1159};
1160
1161EXPORT_SYMBOL(nf_conntrack_protocol_tcp4);
1162EXPORT_SYMBOL(nf_conntrack_protocol_tcp6);
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
new file mode 100644
index 000000000000..3cae7ce420dd
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -0,0 +1,216 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
9 * - enable working with Layer 3 protocol independent connection tracking.
10 *
11 * Derived from net/ipv4/netfilter/ip_conntrack_proto_udp.c
12 */
13
14#include <linux/types.h>
15#include <linux/sched.h>
16#include <linux/timer.h>
17#include <linux/module.h>
18#include <linux/netfilter.h>
19#include <linux/udp.h>
20#include <linux/seq_file.h>
21#include <linux/skbuff.h>
22#include <linux/ipv6.h>
23#include <net/ip6_checksum.h>
24#include <net/checksum.h>
26#include <linux/netfilter_ipv4.h>
27#include <linux/netfilter_ipv6.h>
28#include <net/netfilter/nf_conntrack_protocol.h>
29
30unsigned long nf_ct_udp_timeout = 30*HZ;
31unsigned long nf_ct_udp_timeout_stream = 180*HZ;
32
33static int udp_pkt_to_tuple(const struct sk_buff *skb,
34 unsigned int dataoff,
35 struct nf_conntrack_tuple *tuple)
36{
37 struct udphdr _hdr, *hp;
38
39 /* Actually only need first 8 bytes. */
40 hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
41 if (hp == NULL)
42 return 0;
43
44 tuple->src.u.udp.port = hp->source;
45 tuple->dst.u.udp.port = hp->dest;
46
47 return 1;
48}
49
50static int udp_invert_tuple(struct nf_conntrack_tuple *tuple,
51 const struct nf_conntrack_tuple *orig)
52{
53 tuple->src.u.udp.port = orig->dst.u.udp.port;
54 tuple->dst.u.udp.port = orig->src.u.udp.port;
55 return 1;
56}
57
58/* Print out the per-protocol part of the tuple. */
59static int udp_print_tuple(struct seq_file *s,
60 const struct nf_conntrack_tuple *tuple)
61{
62 return seq_printf(s, "sport=%hu dport=%hu ",
63 ntohs(tuple->src.u.udp.port),
64 ntohs(tuple->dst.u.udp.port));
65}
66
67/* Print out the private part of the conntrack. */
68static int udp_print_conntrack(struct seq_file *s,
69 const struct nf_conn *conntrack)
70{
71 return 0;
72}
73
74/* Returns verdict for packet, and may modify conntrack type */
75static int udp_packet(struct nf_conn *conntrack,
76 const struct sk_buff *skb,
77 unsigned int dataoff,
78 enum ip_conntrack_info ctinfo,
79 int pf,
80 unsigned int hooknum)
81{
82 /* If we've seen traffic both ways, this is some kind of UDP
83 stream. Extend timeout. */
84 if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
85 nf_ct_refresh_acct(conntrack, ctinfo, skb,
86 nf_ct_udp_timeout_stream);
87 /* Also, more likely to be important, and not a probe */
88 if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status))
89 nf_conntrack_event_cache(IPCT_STATUS, skb);
90 } else
91 nf_ct_refresh_acct(conntrack, ctinfo, skb, nf_ct_udp_timeout);
92
93 return NF_ACCEPT;
94}
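/*
 * Editor's note: with the defaults declared above, the effect of
 * udp_packet() is that a one-way UDP flow expires after 30 seconds,
 * while a flow that has seen replies is treated as a stream, given
 * 180 seconds per packet, and promoted to ASSURED once.
 */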
95
96/* Called when a new connection for this protocol found. */
97static int udp_new(struct nf_conn *conntrack, const struct sk_buff *skb,
98 unsigned int dataoff)
99{
100 return 1;
101}
102
103static int udp_error(struct sk_buff *skb, unsigned int dataoff,
104 enum ip_conntrack_info *ctinfo,
105 int pf,
106 unsigned int hooknum,
107 int (*csum)(const struct sk_buff *, unsigned int))
108{
109 unsigned int udplen = skb->len - dataoff;
110 struct udphdr _hdr, *hdr;
111
112 /* Header is too small? */
113 hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
114 if (hdr == NULL) {
115 if (LOG_INVALID(IPPROTO_UDP))
116 nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
117 "nf_ct_udp: short packet ");
118 return -NF_ACCEPT;
119 }
120
121 /* Truncated/malformed packets */
122 if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
123 if (LOG_INVALID(IPPROTO_UDP))
124 nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
125 "nf_ct_udp: truncated/malformed packet ");
126 return -NF_ACCEPT;
127 }
128
129 /* Packet with no checksum */
130 if (!hdr->check)
131 return NF_ACCEPT;
132
133 /* Checksum invalid? Ignore.
134 * We skip checking packets on the outgoing path
135 * because the semantics of CHECKSUM_HW are different there
136 * and moreover root might send raw packets.
137 * FIXME: Source route IP option packets --RR */
138 if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
139 (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING))
140 && skb->ip_summed != CHECKSUM_UNNECESSARY
141 && csum(skb, dataoff)) {
142 if (LOG_INVALID(IPPROTO_UDP))
143 nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
144 "nf_ct_udp: bad UDP checksum ");
145 return -NF_ACCEPT;
146 }
147
148 return NF_ACCEPT;
149}
150
151static int csum4(const struct sk_buff *skb, unsigned int dataoff)
152{
153 return csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr,
154 skb->len - dataoff, IPPROTO_UDP,
155 skb->ip_summed == CHECKSUM_HW ? skb->csum
156 : skb_checksum(skb, dataoff,
157 skb->len - dataoff, 0));
158}
159
160static int csum6(const struct sk_buff *skb, unsigned int dataoff)
161{
162 return csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr,
163 skb->len - dataoff, IPPROTO_UDP,
164 skb->ip_summed == CHECKSUM_HW ? skb->csum
165 : skb_checksum(skb, dataoff, skb->len - dataoff,
166 0));
167}
168
169static int udp_error4(struct sk_buff *skb,
170 unsigned int dataoff,
171 enum ip_conntrack_info *ctinfo,
172 int pf,
173 unsigned int hooknum)
174{
175 return udp_error(skb, dataoff, ctinfo, pf, hooknum, csum4);
176}
177
178static int udp_error6(struct sk_buff *skb,
179 unsigned int dataoff,
180 enum ip_conntrack_info *ctinfo,
181 int pf,
182 unsigned int hooknum)
183{
184 return udp_error(skb, dataoff, ctinfo, pf, hooknum, csum6);
185}
186
187struct nf_conntrack_protocol nf_conntrack_protocol_udp4 =
188{
189 .l3proto = PF_INET,
190 .proto = IPPROTO_UDP,
191 .name = "udp",
192 .pkt_to_tuple = udp_pkt_to_tuple,
193 .invert_tuple = udp_invert_tuple,
194 .print_tuple = udp_print_tuple,
195 .print_conntrack = udp_print_conntrack,
196 .packet = udp_packet,
197 .new = udp_new,
198 .error = udp_error4,
199};
200
201struct nf_conntrack_protocol nf_conntrack_protocol_udp6 =
202{
203 .l3proto = PF_INET6,
204 .proto = IPPROTO_UDP,
205 .name = "udp",
206 .pkt_to_tuple = udp_pkt_to_tuple,
207 .invert_tuple = udp_invert_tuple,
208 .print_tuple = udp_print_tuple,
209 .print_conntrack = udp_print_conntrack,
210 .packet = udp_packet,
211 .new = udp_new,
212 .error = udp_error6,
213};
214
215EXPORT_SYMBOL(nf_conntrack_protocol_udp4);
216EXPORT_SYMBOL(nf_conntrack_protocol_udp6);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
new file mode 100644
index 000000000000..45224db4fe2f
--- /dev/null
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -0,0 +1,869 @@
1/* This file contains all the functions required for the standalone
2 nf_conntrack module.
3
4 These are not required by the compatibility layer.
5*/
6
7/* (C) 1999-2001 Paul `Rusty' Russell
8 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
13 *
14 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
15 * - generalize L3 protocol dependent part.
16 *
17 * Derived from net/ipv4/netfilter/ip_conntrack_standalone.c
18 */
19
20#include <linux/config.h>
21#include <linux/types.h>
22#include <linux/netfilter.h>
23#include <linux/module.h>
24#include <linux/skbuff.h>
25#include <linux/proc_fs.h>
26#include <linux/seq_file.h>
27#include <linux/percpu.h>
28#include <linux/netdevice.h>
29#ifdef CONFIG_SYSCTL
30#include <linux/sysctl.h>
31#endif
32
33#define ASSERT_READ_LOCK(x)
34#define ASSERT_WRITE_LOCK(x)
35
36#include <net/netfilter/nf_conntrack.h>
37#include <net/netfilter/nf_conntrack_l3proto.h>
38#include <net/netfilter/nf_conntrack_protocol.h>
39#include <net/netfilter/nf_conntrack_core.h>
40#include <net/netfilter/nf_conntrack_helper.h>
41#include <linux/netfilter_ipv4/listhelp.h>
42
43#if 0
44#define DEBUGP printk
45#else
46#define DEBUGP(format, args...)
47#endif
48
49MODULE_LICENSE("GPL");
50
51extern atomic_t nf_conntrack_count;
52DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
53
54static int kill_l3proto(struct nf_conn *i, void *data)
55{
56 return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num ==
57 ((struct nf_conntrack_l3proto *)data)->l3proto);
58}
59
60static int kill_proto(struct nf_conn *i, void *data)
61{
62 struct nf_conntrack_protocol *proto;
63 proto = (struct nf_conntrack_protocol *)data;
64 return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum ==
65 proto->proto) &&
66 (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num ==
67 proto->l3proto);
68}
69
70#ifdef CONFIG_PROC_FS
71static int
72print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
73 struct nf_conntrack_l3proto *l3proto,
74 struct nf_conntrack_protocol *proto)
75{
76 return l3proto->print_tuple(s, tuple) || proto->print_tuple(s, tuple);
77}
78
79#ifdef CONFIG_NF_CT_ACCT
80static unsigned int
81seq_print_counters(struct seq_file *s,
82 const struct ip_conntrack_counter *counter)
83{
84 return seq_printf(s, "packets=%llu bytes=%llu ",
85 (unsigned long long)counter->packets,
86 (unsigned long long)counter->bytes);
87}
88#else
89#define seq_print_counters(x, y) 0
90#endif
91
92struct ct_iter_state {
93 unsigned int bucket;
94};
95
96static struct list_head *ct_get_first(struct seq_file *seq)
97{
98 struct ct_iter_state *st = seq->private;
99
100 for (st->bucket = 0;
101 st->bucket < nf_conntrack_htable_size;
102 st->bucket++) {
103 if (!list_empty(&nf_conntrack_hash[st->bucket]))
104 return nf_conntrack_hash[st->bucket].next;
105 }
106 return NULL;
107}
108
109static struct list_head *ct_get_next(struct seq_file *seq, struct list_head *head)
110{
111 struct ct_iter_state *st = seq->private;
112
113 head = head->next;
114 while (head == &nf_conntrack_hash[st->bucket]) {
115 if (++st->bucket >= nf_conntrack_htable_size)
116 return NULL;
117 head = nf_conntrack_hash[st->bucket].next;
118 }
119 return head;
120}
121
122static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos)
123{
124 struct list_head *head = ct_get_first(seq);
125
126 if (head)
127 while (pos && (head = ct_get_next(seq, head)))
128 pos--;
129 return pos ? NULL : head;
130}
131
132static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
133{
134 read_lock_bh(&nf_conntrack_lock);
135 return ct_get_idx(seq, *pos);
136}
137
138static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
139{
140 (*pos)++;
141 return ct_get_next(s, v);
142}
143
144static void ct_seq_stop(struct seq_file *s, void *v)
145{
146 read_unlock_bh(&nf_conntrack_lock);
147}
148
149/* return 0 on success, -ENOSPC in case of error */
150static int ct_seq_show(struct seq_file *s, void *v)
151{
152 const struct nf_conntrack_tuple_hash *hash = v;
153 const struct nf_conn *conntrack = nf_ct_tuplehash_to_ctrack(hash);
154 struct nf_conntrack_l3proto *l3proto;
155 struct nf_conntrack_protocol *proto;
156
157 ASSERT_READ_LOCK(&nf_conntrack_lock);
158 NF_CT_ASSERT(conntrack);
159
160 /* we only want to print DIR_ORIGINAL */
161 if (NF_CT_DIRECTION(hash))
162 return 0;
163
164 l3proto = nf_ct_find_l3proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
165 .tuple.src.l3num);
166
167 NF_CT_ASSERT(l3proto);
168 proto = nf_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
169 .tuple.src.l3num,
170 conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
171 .tuple.dst.protonum);
172 NF_CT_ASSERT(proto);
173
174 if (seq_printf(s, "%-8s %u %-8s %u %ld ",
175 l3proto->name,
176 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num,
177 proto->name,
178 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum,
179 timer_pending(&conntrack->timeout)
180 ? (long)(conntrack->timeout.expires - jiffies)/HZ : 0) != 0)
181 return -ENOSPC;
182
183 if (l3proto->print_conntrack(s, conntrack))
184 return -ENOSPC;
185
186 if (proto->print_conntrack(s, conntrack))
187 return -ENOSPC;
188
189 if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
190 l3proto, proto))
191 return -ENOSPC;
192
193 if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_ORIGINAL]))
194 return -ENOSPC;
195
196 if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)))
197 if (seq_printf(s, "[UNREPLIED] "))
198 return -ENOSPC;
199
200 if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple,
201 l3proto, proto))
202 return -ENOSPC;
203
204 if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_REPLY]))
205 return -ENOSPC;
206
207 if (test_bit(IPS_ASSURED_BIT, &conntrack->status))
208 if (seq_printf(s, "[ASSURED] "))
209 return -ENOSPC;
210
211#if defined(CONFIG_NF_CONNTRACK_MARK)
212 if (seq_printf(s, "mark=%u ", conntrack->mark))
213 return -ENOSPC;
214#endif
215
216 if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use)))
217 return -ENOSPC;
218
219 return 0;
220}
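/*
 * Editor's note: one line of /proc/net/nf_conntrack as assembled by
 * ct_seq_show() looks roughly like this, with accounting compiled
 * out and illustrative values:
 *
 *   ipv4 2 tcp 6 431999 ESTABLISHED src=10.0.0.1 dst=10.0.0.2
 *   sport=34567 dport=80 src=10.0.0.2 dst=10.0.0.1 sport=80
 *   dport=34567 [ASSURED] use=1
 */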
221
222static struct seq_operations ct_seq_ops = {
223 .start = ct_seq_start,
224 .next = ct_seq_next,
225 .stop = ct_seq_stop,
226 .show = ct_seq_show
227};
228
229static int ct_open(struct inode *inode, struct file *file)
230{
231 struct seq_file *seq;
232 struct ct_iter_state *st;
233 int ret;
234
235 st = kmalloc(sizeof(struct ct_iter_state), GFP_KERNEL);
236 if (st == NULL)
237 return -ENOMEM;
238 ret = seq_open(file, &ct_seq_ops);
239 if (ret)
240 goto out_free;
241 seq = file->private_data;
242 seq->private = st;
243 memset(st, 0, sizeof(struct ct_iter_state));
244 return ret;
245out_free:
246 kfree(st);
247 return ret;
248}
249
250static struct file_operations ct_file_ops = {
251 .owner = THIS_MODULE,
252 .open = ct_open,
253 .read = seq_read,
254 .llseek = seq_lseek,
255 .release = seq_release_private,
256};
257
258/* expects */
259static void *exp_seq_start(struct seq_file *s, loff_t *pos)
260{
261 struct list_head *e = &nf_conntrack_expect_list;
262 loff_t i;
263
264 /* strangely, the seq_file API calls stop even if we fail,
265 * thus we need to grab the lock since stop unlocks */
266 read_lock_bh(&nf_conntrack_lock);
267
268 if (list_empty(e))
269 return NULL;
270
271 for (i = 0; i <= *pos; i++) {
272 e = e->next;
273 if (e == &nf_conntrack_expect_list)
274 return NULL;
275 }
276 return e;
277}
278
279static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
280{
281 struct list_head *e = v;
282
283 ++*pos;
284 e = e->next;
285
286 if (e == &nf_conntrack_expect_list)
287 return NULL;
288
289 return e;
290}
291
292static void exp_seq_stop(struct seq_file *s, void *v)
293{
294 read_unlock_bh(&nf_conntrack_lock);
295}
296
297static int exp_seq_show(struct seq_file *s, void *v)
298{
299 struct nf_conntrack_expect *expect = v;
300
301 if (expect->timeout.function)
302 seq_printf(s, "%ld ", timer_pending(&expect->timeout)
303 ? (long)(expect->timeout.expires - jiffies)/HZ : 0);
304 else
305 seq_printf(s, "- ");
306 seq_printf(s, "l3proto = %u proto=%u ",
307 expect->tuple.src.l3num,
308 expect->tuple.dst.protonum);
309 print_tuple(s, &expect->tuple,
310 nf_ct_find_l3proto(expect->tuple.src.l3num),
311 nf_ct_find_proto(expect->tuple.src.l3num,
312 expect->tuple.dst.protonum));
313 return seq_putc(s, '\n');
314}
315
316static struct seq_operations exp_seq_ops = {
317 .start = exp_seq_start,
318 .next = exp_seq_next,
319 .stop = exp_seq_stop,
320 .show = exp_seq_show
321};
322
323static int exp_open(struct inode *inode, struct file *file)
324{
325 return seq_open(file, &exp_seq_ops);
326}
327
328static struct file_operations exp_file_ops = {
329 .owner = THIS_MODULE,
330 .open = exp_open,
331 .read = seq_read,
332 .llseek = seq_lseek,
333 .release = seq_release
334};
335
336static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
337{
338 int cpu;
339
340 if (*pos == 0)
341 return SEQ_START_TOKEN;
342
343 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
344 if (!cpu_possible(cpu))
345 continue;
346 *pos = cpu + 1;
347 return &per_cpu(nf_conntrack_stat, cpu);
348 }
349
350 return NULL;
351}
352
353static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
354{
355 int cpu;
356
357 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
358 if (!cpu_possible(cpu))
359 continue;
360 *pos = cpu + 1;
361 return &per_cpu(nf_conntrack_stat, cpu);
362 }
363
364 return NULL;
365}
366
367static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
368{
369}
370
371static int ct_cpu_seq_show(struct seq_file *seq, void *v)
372{
373 unsigned int nr_conntracks = atomic_read(&nf_conntrack_count);
374 struct ip_conntrack_stat *st = v;
375
376 if (v == SEQ_START_TOKEN) {
377 seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n");
378 return 0;
379 }
380
381 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
382 "%08x %08x %08x %08x %08x %08x %08x %08x \n",
383 nr_conntracks,
384 st->searched,
385 st->found,
386 st->new,
387 st->invalid,
388 st->ignore,
389 st->delete,
390 st->delete_list,
391 st->insert,
392 st->insert_failed,
393 st->drop,
394 st->early_drop,
395 st->error,
396
397 st->expect_new,
398 st->expect_create,
399 st->expect_delete
400 );
401 return 0;
402}
403
404static struct seq_operations ct_cpu_seq_ops = {
405 .start = ct_cpu_seq_start,
406 .next = ct_cpu_seq_next,
407 .stop = ct_cpu_seq_stop,
408 .show = ct_cpu_seq_show,
409};
410
411static int ct_cpu_seq_open(struct inode *inode, struct file *file)
412{
413 return seq_open(file, &ct_cpu_seq_ops);
414}
415
416static struct file_operations ct_cpu_seq_fops = {
417 .owner = THIS_MODULE,
418 .open = ct_cpu_seq_open,
419 .read = seq_read,
420 .llseek = seq_lseek,
421 .release = seq_release_private,
422};
423#endif /* CONFIG_PROC_FS */
424
425/* Sysctl support */
426
427#ifdef CONFIG_SYSCTL
428
429/* From nf_conntrack_core.c */
430extern int nf_conntrack_max;
431extern unsigned int nf_conntrack_htable_size;
432
433/* From nf_conntrack_proto_tcp.c */
434extern unsigned long nf_ct_tcp_timeout_syn_sent;
435extern unsigned long nf_ct_tcp_timeout_syn_recv;
436extern unsigned long nf_ct_tcp_timeout_established;
437extern unsigned long nf_ct_tcp_timeout_fin_wait;
438extern unsigned long nf_ct_tcp_timeout_close_wait;
439extern unsigned long nf_ct_tcp_timeout_last_ack;
440extern unsigned long nf_ct_tcp_timeout_time_wait;
441extern unsigned long nf_ct_tcp_timeout_close;
442extern unsigned long nf_ct_tcp_timeout_max_retrans;
443extern int nf_ct_tcp_loose;
444extern int nf_ct_tcp_be_liberal;
445extern int nf_ct_tcp_max_retrans;
446
447/* From nf_conntrack_proto_udp.c */
448extern unsigned long nf_ct_udp_timeout;
449extern unsigned long nf_ct_udp_timeout_stream;
450
451/* From nf_conntrack_proto_generic.c */
452extern unsigned long nf_ct_generic_timeout;
453
454/* Log invalid packets of a given protocol */
455static int log_invalid_proto_min = 0;
456static int log_invalid_proto_max = 255;
457
458static struct ctl_table_header *nf_ct_sysctl_header;
459
460static ctl_table nf_ct_sysctl_table[] = {
461 {
462 .ctl_name = NET_NF_CONNTRACK_MAX,
463 .procname = "nf_conntrack_max",
464 .data = &nf_conntrack_max,
465 .maxlen = sizeof(int),
466 .mode = 0644,
467 .proc_handler = &proc_dointvec,
468 },
469 {
470 .ctl_name = NET_NF_CONNTRACK_COUNT,
471 .procname = "nf_conntrack_count",
472 .data = &nf_conntrack_count,
473 .maxlen = sizeof(int),
474 .mode = 0444,
475 .proc_handler = &proc_dointvec,
476 },
477 {
478 .ctl_name = NET_NF_CONNTRACK_BUCKETS,
479 .procname = "nf_conntrack_buckets",
480 .data = &nf_conntrack_htable_size,
481 .maxlen = sizeof(unsigned int),
482 .mode = 0444,
483 .proc_handler = &proc_dointvec,
484 },
485 {
486 .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,
487 .procname = "nf_conntrack_tcp_timeout_syn_sent",
488 .data = &nf_ct_tcp_timeout_syn_sent,
489 .maxlen = sizeof(unsigned int),
490 .mode = 0644,
491 .proc_handler = &proc_dointvec_jiffies,
492 },
493 {
494 .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV,
495 .procname = "nf_conntrack_tcp_timeout_syn_recv",
496 .data = &nf_ct_tcp_timeout_syn_recv,
497 .maxlen = sizeof(unsigned int),
498 .mode = 0644,
499 .proc_handler = &proc_dointvec_jiffies,
500 },
501 {
502 .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED,
503 .procname = "nf_conntrack_tcp_timeout_established",
504 .data = &nf_ct_tcp_timeout_established,
505 .maxlen = sizeof(unsigned int),
506 .mode = 0644,
507 .proc_handler = &proc_dointvec_jiffies,
508 },
509 {
510 .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT,
511 .procname = "nf_conntrack_tcp_timeout_fin_wait",
512 .data = &nf_ct_tcp_timeout_fin_wait,
513 .maxlen = sizeof(unsigned int),
514 .mode = 0644,
515 .proc_handler = &proc_dointvec_jiffies,
516 },
517 {
518 .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT,
519 .procname = "nf_conntrack_tcp_timeout_close_wait",
520 .data = &nf_ct_tcp_timeout_close_wait,
521 .maxlen = sizeof(unsigned int),
522 .mode = 0644,
523 .proc_handler = &proc_dointvec_jiffies,
524 },
525 {
526 .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK,
527 .procname = "nf_conntrack_tcp_timeout_last_ack",
528 .data = &nf_ct_tcp_timeout_last_ack,
529 .maxlen = sizeof(unsigned int),
530 .mode = 0644,
531 .proc_handler = &proc_dointvec_jiffies,
532 },
533 {
534 .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT,
535 .procname = "nf_conntrack_tcp_timeout_time_wait",
536 .data = &nf_ct_tcp_timeout_time_wait,
537 .maxlen = sizeof(unsigned int),
538 .mode = 0644,
539 .proc_handler = &proc_dointvec_jiffies,
540 },
541 {
542 .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE,
543 .procname = "nf_conntrack_tcp_timeout_close",
544 .data = &nf_ct_tcp_timeout_close,
545 .maxlen = sizeof(unsigned int),
546 .mode = 0644,
547 .proc_handler = &proc_dointvec_jiffies,
548 },
549 {
550 .ctl_name = NET_NF_CONNTRACK_UDP_TIMEOUT,
551 .procname = "nf_conntrack_udp_timeout",
552 .data = &nf_ct_udp_timeout,
553 .maxlen = sizeof(unsigned int),
554 .mode = 0644,
555 .proc_handler = &proc_dointvec_jiffies,
556 },
557 {
558 .ctl_name = NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM,
559 .procname = "nf_conntrack_udp_timeout_stream",
560 .data = &nf_ct_udp_timeout_stream,
561 .maxlen = sizeof(unsigned int),
562 .mode = 0644,
563 .proc_handler = &proc_dointvec_jiffies,
564 },
565 {
566 .ctl_name = NET_NF_CONNTRACK_GENERIC_TIMEOUT,
567 .procname = "nf_conntrack_generic_timeout",
568 .data = &nf_ct_generic_timeout,
569 .maxlen = sizeof(unsigned int),
570 .mode = 0644,
571 .proc_handler = &proc_dointvec_jiffies,
572 },
573 {
574 .ctl_name = NET_NF_CONNTRACK_LOG_INVALID,
575 .procname = "nf_conntrack_log_invalid",
576 .data = &nf_ct_log_invalid,
577 .maxlen = sizeof(unsigned int),
578 .mode = 0644,
579 .proc_handler = &proc_dointvec_minmax,
580 .strategy = &sysctl_intvec,
581 .extra1 = &log_invalid_proto_min,
582 .extra2 = &log_invalid_proto_max,
583 },
584 {
585 .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS,
586 .procname = "nf_conntrack_tcp_timeout_max_retrans",
587 .data = &nf_ct_tcp_timeout_max_retrans,
588 .maxlen = sizeof(unsigned int),
589 .mode = 0644,
590 .proc_handler = &proc_dointvec_jiffies,
591 },
592 {
593 .ctl_name = NET_NF_CONNTRACK_TCP_LOOSE,
594 .procname = "nf_conntrack_tcp_loose",
595 .data = &nf_ct_tcp_loose,
596 .maxlen = sizeof(unsigned int),
597 .mode = 0644,
598 .proc_handler = &proc_dointvec,
599 },
600 {
601 .ctl_name = NET_NF_CONNTRACK_TCP_BE_LIBERAL,
602 .procname = "nf_conntrack_tcp_be_liberal",
603 .data = &nf_ct_tcp_be_liberal,
604 .maxlen = sizeof(unsigned int),
605 .mode = 0644,
606 .proc_handler = &proc_dointvec,
607 },
608 {
609 .ctl_name = NET_NF_CONNTRACK_TCP_MAX_RETRANS,
610 .procname = "nf_conntrack_tcp_max_retrans",
611 .data = &nf_ct_tcp_max_retrans,
612 .maxlen = sizeof(unsigned int),
613 .mode = 0644,
614 .proc_handler = &proc_dointvec,
615 },
616
617 { .ctl_name = 0 }
618};
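/*
 * Editor's illustration: tuning one of the knobs above from
 * userspace.  The procfs path follows the net -> netfilter nesting
 * set up in nf_ct_net_table below; the value written is an example.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/netfilter/"
			"nf_conntrack_tcp_timeout_established", "w");

	if (!f)
		return 1;
	fprintf(f, "%d\n", 3600);	/* seconds */
	return fclose(f) ? 1 : 0;
}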
619
620#define NET_NF_CONNTRACK_MAX 2089
621
622static ctl_table nf_ct_netfilter_table[] = {
623 {
624 .ctl_name = NET_NETFILTER,
625 .procname = "netfilter",
626 .mode = 0555,
627 .child = nf_ct_sysctl_table,
628 },
629 {
630 .ctl_name = NET_NF_CONNTRACK_MAX,
631 .procname = "nf_conntrack_max",
632 .data = &nf_conntrack_max,
633 .maxlen = sizeof(int),
634 .mode = 0644,
635 .proc_handler = &proc_dointvec,
636 },
637 { .ctl_name = 0 }
638};
639
640static ctl_table nf_ct_net_table[] = {
641 {
642 .ctl_name = CTL_NET,
643 .procname = "net",
644 .mode = 0555,
645 .child = nf_ct_netfilter_table,
646 },
647 { .ctl_name = 0 }
648};
649EXPORT_SYMBOL(nf_ct_log_invalid);
650#endif /* CONFIG_SYSCTL */
651
652static int init_or_cleanup(int init)
653{
654#ifdef CONFIG_PROC_FS
655 struct proc_dir_entry *proc, *proc_exp, *proc_stat;
656#endif
657 int ret = 0;
658
659 if (!init) goto cleanup;
660
661 ret = nf_conntrack_init();
662 if (ret < 0)
663 goto cleanup_nothing;
664
665#ifdef CONFIG_PROC_FS
666 proc = proc_net_fops_create("nf_conntrack", 0440, &ct_file_ops);
667 if (!proc) goto cleanup_init;
668
669 proc_exp = proc_net_fops_create("nf_conntrack_expect", 0440,
670 &exp_file_ops);
671 if (!proc_exp) goto cleanup_proc;
672
673 proc_stat = create_proc_entry("nf_conntrack", S_IRUGO, proc_net_stat);
674 if (!proc_stat)
675 goto cleanup_proc_exp;
676
677 proc_stat->proc_fops = &ct_cpu_seq_fops;
678 proc_stat->owner = THIS_MODULE;
679#endif
680#ifdef CONFIG_SYSCTL
681 nf_ct_sysctl_header = register_sysctl_table(nf_ct_net_table, 0);
682 if (nf_ct_sysctl_header == NULL) {
683 printk("nf_conntrack: can't register to sysctl.\n");
684 ret = -ENOMEM;
685 goto cleanup_proc_stat;
686 }
687#endif
688
689 return ret;
690
691 cleanup:
692#ifdef CONFIG_SYSCTL
693 unregister_sysctl_table(nf_ct_sysctl_header);
694 cleanup_proc_stat:
695#endif
696#ifdef CONFIG_PROC_FS
697 proc_net_remove("nf_conntrack_stat");
698 cleanup_proc_exp:
699 proc_net_remove("nf_conntrack_expect");
700 cleanup_proc:
701 proc_net_remove("nf_conntrack");
702 cleanup_init:
703#endif /* CONFIG_PROC_FS */
704 nf_conntrack_cleanup();
705 cleanup_nothing:
706 return ret;
707}
708
709int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)
710{
711 int ret = 0;
712
713 write_lock_bh(&nf_conntrack_lock);
714 if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_generic_l3proto) {
715 ret = -EBUSY;
716 goto out;
717 }
718 nf_ct_l3protos[proto->l3proto] = proto;
719out:
720 write_unlock_bh(&nf_conntrack_lock);
721
722 return ret;
723}
724
725void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto)
726{
727 write_lock_bh(&nf_conntrack_lock);
728 nf_ct_l3protos[proto->l3proto] = &nf_conntrack_generic_l3proto;
729 write_unlock_bh(&nf_conntrack_lock);
730
 731	/* Somebody could still be looking at the proto in bh. */
732 synchronize_net();
733
 734	/* Remove all conntrack entries for this protocol */
735 nf_ct_iterate_cleanup(kill_l3proto, proto);
736}
737
738/* FIXME: Allow NULL functions and sub in pointers to generic for
739 them. --RR */
740int nf_conntrack_protocol_register(struct nf_conntrack_protocol *proto)
741{
742 int ret = 0;
743
744retry:
745 write_lock_bh(&nf_conntrack_lock);
746 if (nf_ct_protos[proto->l3proto]) {
747 if (nf_ct_protos[proto->l3proto][proto->proto]
748 != &nf_conntrack_generic_protocol) {
749 ret = -EBUSY;
750 goto out_unlock;
751 }
752 } else {
 753		/* l3proto may be loaded later. */
754 struct nf_conntrack_protocol **proto_array;
755 int i;
756
757 write_unlock_bh(&nf_conntrack_lock);
758
759 proto_array = (struct nf_conntrack_protocol **)
760 kmalloc(MAX_NF_CT_PROTO *
761 sizeof(struct nf_conntrack_protocol *),
762 GFP_KERNEL);
763 if (proto_array == NULL) {
764 ret = -ENOMEM;
765 goto out;
766 }
767 for (i = 0; i < MAX_NF_CT_PROTO; i++)
768 proto_array[i] = &nf_conntrack_generic_protocol;
769
770 write_lock_bh(&nf_conntrack_lock);
771 if (nf_ct_protos[proto->l3proto]) {
772 /* bad timing, but no problem */
773 write_unlock_bh(&nf_conntrack_lock);
774 kfree(proto_array);
775 } else {
776 nf_ct_protos[proto->l3proto] = proto_array;
777 write_unlock_bh(&nf_conntrack_lock);
778 }
779
780 /*
 781		 * Just once: the array is never freed until
 782		 * nf_conntrack.ko itself is unloaded.
783 */
784 goto retry;
785 }
786
787 nf_ct_protos[proto->l3proto][proto->proto] = proto;
788
789out_unlock:
790 write_unlock_bh(&nf_conntrack_lock);
791out:
792 return ret;
793}
794
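The allocation dance inside nf_conntrack_protocol_register() is worth calling out: a GFP_KERNEL allocation may sleep, so it cannot happen under the write lock. The function therefore drops the lock, allocates, retakes the lock, and resolves the race with a concurrent registrant by simply freeing the losing copy before retrying from the top. Condensed to its essentials (struct item, MAX_SLOT and SUB_SIZE are invented; the rwlock initializer style of this kernel era is assumed):

static struct item **slots[MAX_SLOT];		/* hypothetical table */
static rwlock_t slots_lock = RW_LOCK_UNLOCKED;

/* Called with slots_lock write-held; returns with it write-held. */
static int ensure_subtable(int idx)
{
	struct item **tbl;
	int i;

	write_unlock_bh(&slots_lock);		/* may not sleep under the lock */
	tbl = kmalloc(SUB_SIZE * sizeof(*tbl), GFP_KERNEL);
	if (tbl)
		for (i = 0; i < SUB_SIZE; i++)
			tbl[i] = NULL;
	write_lock_bh(&slots_lock);

	if (tbl == NULL)
		return -ENOMEM;

	if (slots[idx])
		kfree(tbl);	/* raced with another registrant; theirs wins */
	else
		slots[idx] = tbl;

	return 0;
}
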
795void nf_conntrack_protocol_unregister(struct nf_conntrack_protocol *proto)
796{
797 write_lock_bh(&nf_conntrack_lock);
798 nf_ct_protos[proto->l3proto][proto->proto]
799 = &nf_conntrack_generic_protocol;
800 write_unlock_bh(&nf_conntrack_lock);
801
 802	/* Somebody could still be looking at the proto in bh. */
803 synchronize_net();
804
 805	/* Remove all conntrack entries for this protocol */
806 nf_ct_iterate_cleanup(kill_proto, proto);
807}
808
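A tracker module built on the four entry points above pairs them in the usual way, unwinding the l3proto registration if the per-protocol step fails. A hypothetical skeleton (my_l3proto and my_proto stand for fully populated descriptors, whose fields this sketch does not spell out):

static struct nf_conntrack_l3proto my_l3proto;	/* descriptor fields omitted */
static struct nf_conntrack_protocol my_proto;	/* descriptor fields omitted */

static int __init my_tracker_init(void)
{
	int ret;

	ret = nf_conntrack_l3proto_register(&my_l3proto);
	if (ret < 0)
		return ret;

	ret = nf_conntrack_protocol_register(&my_proto);
	if (ret < 0)
		nf_conntrack_l3proto_unregister(&my_l3proto);

	return ret;
}

static void __exit my_tracker_exit(void)
{
	nf_conntrack_protocol_unregister(&my_proto);
	nf_conntrack_l3proto_unregister(&my_l3proto);
}
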
809static int __init init(void)
810{
811 return init_or_cleanup(1);
812}
813
814static void __exit fini(void)
815{
816 init_or_cleanup(0);
817}
818
819module_init(init);
820module_exit(fini);
821
822/* Some modules need us, but don't depend directly on any symbol.
823 They should call this. */
824void need_nf_conntrack(void)
825{
826}
827
828#ifdef CONFIG_NF_CONNTRACK_EVENTS
829EXPORT_SYMBOL_GPL(nf_conntrack_chain);
830EXPORT_SYMBOL_GPL(nf_conntrack_expect_chain);
831EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);
832EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
833EXPORT_SYMBOL_GPL(__nf_ct_event_cache_init);
834EXPORT_PER_CPU_SYMBOL_GPL(nf_conntrack_ecache);
835EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
836#endif
837EXPORT_SYMBOL(nf_conntrack_l3proto_register);
838EXPORT_SYMBOL(nf_conntrack_l3proto_unregister);
839EXPORT_SYMBOL(nf_conntrack_protocol_register);
840EXPORT_SYMBOL(nf_conntrack_protocol_unregister);
841EXPORT_SYMBOL(nf_ct_invert_tuplepr);
842EXPORT_SYMBOL(nf_conntrack_alter_reply);
843EXPORT_SYMBOL(nf_conntrack_destroyed);
844EXPORT_SYMBOL(need_nf_conntrack);
845EXPORT_SYMBOL(nf_conntrack_helper_register);
846EXPORT_SYMBOL(nf_conntrack_helper_unregister);
847EXPORT_SYMBOL(nf_ct_iterate_cleanup);
848EXPORT_SYMBOL(__nf_ct_refresh_acct);
849EXPORT_SYMBOL(nf_ct_protos);
850EXPORT_SYMBOL(nf_ct_find_proto);
851EXPORT_SYMBOL(nf_ct_l3protos);
852EXPORT_SYMBOL(nf_conntrack_expect_alloc);
853EXPORT_SYMBOL(nf_conntrack_expect_put);
854EXPORT_SYMBOL(nf_conntrack_expect_related);
855EXPORT_SYMBOL(nf_conntrack_unexpect_related);
856EXPORT_SYMBOL(nf_conntrack_tuple_taken);
857EXPORT_SYMBOL(nf_conntrack_htable_size);
858EXPORT_SYMBOL(nf_conntrack_lock);
859EXPORT_SYMBOL(nf_conntrack_hash);
860EXPORT_SYMBOL(nf_conntrack_untracked);
861EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
862#ifdef CONFIG_IP_NF_NAT_NEEDED
863EXPORT_SYMBOL(nf_conntrack_tcp_update);
864#endif
865EXPORT_SYMBOL(__nf_conntrack_confirm);
866EXPORT_SYMBOL(nf_ct_get_tuple);
867EXPORT_SYMBOL(nf_ct_invert_tuple);
868EXPORT_SYMBOL(nf_conntrack_in);
869EXPORT_SYMBOL(__nf_conntrack_attach);
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 4bc27a6334c1..83f4c53030fc 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -128,7 +128,7 @@ void __nfa_fill(struct sk_buff *skb, int attrtype, int attrlen,
128 memset(NFA_DATA(nfa) + attrlen, 0, NFA_ALIGN(size) - size); 128 memset(NFA_DATA(nfa) + attrlen, 0, NFA_ALIGN(size) - size);
129} 129}
130 130
131int nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len) 131void nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len)
132{ 132{
133 memset(tb, 0, sizeof(struct nfattr *) * maxattr); 133 memset(tb, 0, sizeof(struct nfattr *) * maxattr);
134 134
@@ -138,8 +138,6 @@ int nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len)
138 tb[flavor-1] = nfa; 138 tb[flavor-1] = nfa;
139 nfa = NFA_NEXT(nfa, len); 139 nfa = NFA_NEXT(nfa, len);
140 } 140 }
141
142 return 0;
143} 141}
144 142
145/** 143/**
@@ -242,15 +240,18 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb,
242 ss = nfnetlink_get_subsys(type); 240 ss = nfnetlink_get_subsys(type);
243 if (!ss) { 241 if (!ss) {
244#ifdef CONFIG_KMOD 242#ifdef CONFIG_KMOD
245 /* don't call nfnl_shunlock, since it would reenter 243 if (cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) {
246 * with further packet processing */ 244 /* don't call nfnl_shunlock, since it would reenter
247 up(&nfnl_sem); 245 * with further packet processing */
248 request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type)); 246 up(&nfnl_sem);
249 nfnl_shlock(); 247 request_module("nfnetlink-subsys-%d",
250 ss = nfnetlink_get_subsys(type); 248 NFNL_SUBSYS_ID(type));
249 nfnl_shlock();
250 ss = nfnetlink_get_subsys(type);
251 }
251 if (!ss) 252 if (!ss)
252#endif 253#endif
253 goto err_inval; 254 goto err_inval;
254 } 255 }
255 256
256 nc = nfnetlink_find_client(type, ss); 257 nc = nfnetlink_find_client(type, ss);
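
With the constant-zero return gone, nfattr_parse() is now a plain fill-in of the tb[] array; callers test individual slots afterwards. A sketch of the calling convention (the MYA_* attribute names are invented, and NFA_PAYLOAD() is assumed to be the usual payload-length macro from nfnetlink.h):

enum { MYA_ADDR = 1, MYA_PORT, __MYA_MAX };
#define MYA_MAX (__MYA_MAX - 1)

static int parse_my_attrs(struct nfattr *attr)
{
	struct nfattr *tb[MYA_MAX];

	nfattr_parse(tb, MYA_MAX, NFA_DATA(attr), NFA_PAYLOAD(attr));

	/* slots are indexed by attribute type minus one */
	if (!tb[MYA_ADDR - 1])
		return -EINVAL;

	return 0;
}
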
diff --git a/net/netlink/Makefile b/net/netlink/Makefile
index 39d9c2dcd03c..e3589c2de49e 100644
--- a/net/netlink/Makefile
+++ b/net/netlink/Makefile
@@ -2,4 +2,4 @@
2# Makefile for the netlink driver. 2# Makefile for the netlink driver.
3# 3#
4 4
5obj-y := af_netlink.o 5obj-y := af_netlink.o attr.o genetlink.o
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 5ca283537bc6..8c38ee6d255e 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -58,6 +58,7 @@
58 58
59#include <net/sock.h> 59#include <net/sock.h>
60#include <net/scm.h> 60#include <net/scm.h>
61#include <net/netlink.h>
61 62
62#define Nprintk(a...) 63#define Nprintk(a...)
63#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) 64#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8)
@@ -427,7 +428,8 @@ static int netlink_release(struct socket *sock)
427 428
428 spin_lock(&nlk->cb_lock); 429 spin_lock(&nlk->cb_lock);
429 if (nlk->cb) { 430 if (nlk->cb) {
430 nlk->cb->done(nlk->cb); 431 if (nlk->cb->done)
432 nlk->cb->done(nlk->cb);
431 netlink_destroy_callback(nlk->cb); 433 netlink_destroy_callback(nlk->cb);
432 nlk->cb = NULL; 434 nlk->cb = NULL;
433 } 435 }
@@ -1322,7 +1324,8 @@ static int netlink_dump(struct sock *sk)
1322 skb_queue_tail(&sk->sk_receive_queue, skb); 1324 skb_queue_tail(&sk->sk_receive_queue, skb);
1323 sk->sk_data_ready(sk, skb->len); 1325 sk->sk_data_ready(sk, skb->len);
1324 1326
1325 cb->done(cb); 1327 if (cb->done)
1328 cb->done(cb);
1326 nlk->cb = NULL; 1329 nlk->cb = NULL;
1327 spin_unlock(&nlk->cb_lock); 1330 spin_unlock(&nlk->cb_lock);
1328 1331
@@ -1409,6 +1412,94 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
1409 netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); 1412 netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1410} 1413}
1411 1414
1415static int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *,
1416 struct nlmsghdr *, int *))
1417{
1418 unsigned int total_len;
1419 struct nlmsghdr *nlh;
1420 int err;
1421
1422 while (skb->len >= nlmsg_total_size(0)) {
1423 nlh = (struct nlmsghdr *) skb->data;
1424
1425 if (skb->len < nlh->nlmsg_len)
1426 return 0;
1427
1428 total_len = min(NLMSG_ALIGN(nlh->nlmsg_len), skb->len);
1429
1430 if (cb(skb, nlh, &err) < 0) {
1431 /* Not an error, but we have to interrupt processing
 1432			 * here.  Note that in this case we do not pull the
 1433			 * message from the skb; it will be processed later.
1434 */
1435 if (err == 0)
1436 return -1;
1437 netlink_ack(skb, nlh, err);
1438 } else if (nlh->nlmsg_flags & NLM_F_ACK)
1439 netlink_ack(skb, nlh, 0);
1440
1441 skb_pull(skb, total_len);
1442 }
1443
1444 return 0;
1445}
1446
1447/**
 1448 * netlink_run_queue - Process netlink receive queue.
1449 * @sk: Netlink socket containing the queue
1450 * @qlen: Place to store queue length upon entry
1451 * @cb: Callback function invoked for each netlink message found
1452 *
1453 * Processes as much as there was in the queue upon entry and invokes
1454 * a callback function for each netlink message found. The callback
1455 * function may refuse a message by returning a negative error code
 1456 * but setting the error pointer to 0, in which case this function
1457 * returns with a qlen != 0.
1458 *
 1459 * qlen must be initialized to 0 before the initial entry; afterwards
1460 * the function may be called repeatedly until qlen reaches 0.
1461 */
1462void netlink_run_queue(struct sock *sk, unsigned int *qlen,
1463 int (*cb)(struct sk_buff *, struct nlmsghdr *, int *))
1464{
1465 struct sk_buff *skb;
1466
1467 if (!*qlen || *qlen > skb_queue_len(&sk->sk_receive_queue))
1468 *qlen = skb_queue_len(&sk->sk_receive_queue);
1469
1470 for (; *qlen; (*qlen)--) {
1471 skb = skb_dequeue(&sk->sk_receive_queue);
1472 if (netlink_rcv_skb(skb, cb)) {
1473 if (skb->len)
1474 skb_queue_head(&sk->sk_receive_queue, skb);
1475 else {
1476 kfree_skb(skb);
1477 (*qlen)--;
1478 }
1479 break;
1480 }
1481
1482 kfree_skb(skb);
1483 }
1484}
1485
1486/**
1487 * netlink_queue_skip - Skip netlink message while processing queue.
1488 * @nlh: Netlink message to be skipped
1489 * @skb: Socket buffer containing the netlink messages.
1490 *
1491 * Pulls the given netlink message off the socket buffer so the next
 1492 * call to netlink_run_queue() will not reconsider the message.
1493 */
1494void netlink_queue_skip(struct nlmsghdr *nlh, struct sk_buff *skb)
1495{
1496 int msglen = NLMSG_ALIGN(nlh->nlmsg_len);
1497
1498 if (msglen > skb->len)
1499 msglen = skb->len;
1500
1501 skb_pull(skb, msglen);
1502}
1412 1503
1413#ifdef CONFIG_PROC_FS 1504#ifdef CONFIG_PROC_FS
1414struct nl_seq_iter { 1505struct nl_seq_iter {
@@ -1657,6 +1748,8 @@ out:
1657core_initcall(netlink_proto_init); 1748core_initcall(netlink_proto_init);
1658 1749
1659EXPORT_SYMBOL(netlink_ack); 1750EXPORT_SYMBOL(netlink_ack);
1751EXPORT_SYMBOL(netlink_run_queue);
1752EXPORT_SYMBOL(netlink_queue_skip);
1660EXPORT_SYMBOL(netlink_broadcast); 1753EXPORT_SYMBOL(netlink_broadcast);
1661EXPORT_SYMBOL(netlink_dump_start); 1754EXPORT_SYMBOL(netlink_dump_start);
1662EXPORT_SYMBOL(netlink_kernel_create); 1755EXPORT_SYMBOL(netlink_kernel_create);
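
genl_rcv() further down in this series is the first user of netlink_run_queue() and shows the intended shape: the socket's data_ready handler drains the queue under its subsystem lock, with qlen carrying the resume state across calls. Reduced to the bare pattern (my_trylock()/my_unlock() and the message handler are illustrative stand-ins):

static int my_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp)
{
	/* Dispatch on nlh->nlmsg_type here.  Return >= 0 to keep going,
	 * < 0 with *errp set to NACK the message, or < 0 with *errp == 0
	 * to stop now and have the message redelivered later. */
	return 0;
}

static void my_rcv(struct sock *sk, int len)
{
	unsigned int qlen = 0;

	do {
		if (my_trylock())	/* hypothetical subsystem lock */
			return;
		netlink_run_queue(sk, &qlen, &my_rcv_msg);
		my_unlock();
	} while (qlen && skb_queue_len(&sk->sk_receive_queue));
}
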
diff --git a/net/netlink/attr.c b/net/netlink/attr.c
new file mode 100644
index 000000000000..fffef4ab276f
--- /dev/null
+++ b/net/netlink/attr.c
@@ -0,0 +1,328 @@
1/*
2 * NETLINK Netlink attributes
3 *
4 * Authors: Thomas Graf <tgraf@suug.ch>
5 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
6 */
7
8#include <linux/config.h>
9#include <linux/module.h>
10#include <linux/kernel.h>
11#include <linux/errno.h>
12#include <linux/jiffies.h>
13#include <linux/netdevice.h>
14#include <linux/skbuff.h>
15#include <linux/string.h>
16#include <linux/types.h>
17#include <net/netlink.h>
18
19static u16 nla_attr_minlen[NLA_TYPE_MAX+1] __read_mostly = {
20 [NLA_U8] = sizeof(u8),
21 [NLA_U16] = sizeof(u16),
22 [NLA_U32] = sizeof(u32),
23 [NLA_U64] = sizeof(u64),
24 [NLA_STRING] = 1,
25 [NLA_NESTED] = NLA_HDRLEN,
26};
27
28static int validate_nla(struct nlattr *nla, int maxtype,
29 struct nla_policy *policy)
30{
31 struct nla_policy *pt;
32 int minlen = 0;
33
34 if (nla->nla_type <= 0 || nla->nla_type > maxtype)
35 return 0;
36
37 pt = &policy[nla->nla_type];
38
39 BUG_ON(pt->type > NLA_TYPE_MAX);
40
41 if (pt->minlen)
42 minlen = pt->minlen;
43 else if (pt->type != NLA_UNSPEC)
44 minlen = nla_attr_minlen[pt->type];
45
46 if (pt->type == NLA_FLAG && nla_len(nla) > 0)
47 return -ERANGE;
48
49 if (nla_len(nla) < minlen)
50 return -ERANGE;
51
52 return 0;
53}
54
55/**
56 * nla_validate - Validate a stream of attributes
57 * @head: head of attribute stream
58 * @len: length of attribute stream
59 * @maxtype: maximum attribute type to be expected
60 * @policy: validation policy
61 *
62 * Validates all attributes in the specified attribute stream against the
63 * specified policy. Attributes with a type exceeding maxtype will be
 64 * ignored. See documentation of struct nla_policy for more details.
65 *
66 * Returns 0 on success or a negative error code.
67 */
68int nla_validate(struct nlattr *head, int len, int maxtype,
69 struct nla_policy *policy)
70{
71 struct nlattr *nla;
72 int rem, err;
73
74 nla_for_each_attr(nla, head, len, rem) {
75 err = validate_nla(nla, maxtype, policy);
76 if (err < 0)
77 goto errout;
78 }
79
80 err = 0;
81errout:
82 return err;
83}
84
85/**
86 * nla_parse - Parse a stream of attributes into a tb buffer
87 * @tb: destination array with maxtype+1 elements
88 * @maxtype: maximum attribute type to be expected
89 * @head: head of attribute stream
90 * @len: length of attribute stream
91 *
92 * Parses a stream of attributes and stores a pointer to each attribute in
 93 * the tb array accessible via the attribute type. Attributes with a type
94 * exceeding maxtype will be silently ignored for backwards compatibility
95 * reasons. policy may be set to NULL if no validation is required.
96 *
97 * Returns 0 on success or a negative error code.
98 */
99int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len,
100 struct nla_policy *policy)
101{
102 struct nlattr *nla;
103 int rem, err;
104
105 memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
106
107 nla_for_each_attr(nla, head, len, rem) {
108 u16 type = nla->nla_type;
109
110 if (type > 0 && type <= maxtype) {
111 if (policy) {
112 err = validate_nla(nla, maxtype, policy);
113 if (err < 0)
114 goto errout;
115 }
116
117 tb[type] = nla;
118 }
119 }
120
121 if (unlikely(rem > 0))
122 printk(KERN_WARNING "netlink: %d bytes leftover after parsing "
123 "attributes.\n", rem);
124
125 err = 0;
126errout:
127 return err;
128}
129
130/**
131 * nla_find - Find a specific attribute in a stream of attributes
132 * @head: head of attribute stream
133 * @len: length of attribute stream
134 * @attrtype: type of attribute to look for
135 *
136 * Returns the first attribute in the stream matching the specified type.
137 */
138struct nlattr *nla_find(struct nlattr *head, int len, int attrtype)
139{
140 struct nlattr *nla;
141 int rem;
142
143 nla_for_each_attr(nla, head, len, rem)
144 if (nla->nla_type == attrtype)
145 return nla;
146
147 return NULL;
148}
149
150/**
151 * nla_strlcpy - Copy string attribute payload into a sized buffer
152 * @dst: where to copy the string to
 153 * @nla: attribute to copy the string from
154 * @dstsize: size of destination buffer
155 *
156 * Copies at most dstsize - 1 bytes into the destination buffer.
157 * The result is always a valid NUL-terminated string. Unlike
 158 * strlcpy, the destination buffer is always padded out.
159 *
160 * Returns the length of the source buffer.
161 */
162size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize)
163{
164 size_t srclen = nla_len(nla);
165 char *src = nla_data(nla);
166
167 if (srclen > 0 && src[srclen - 1] == '\0')
168 srclen--;
169
170 if (dstsize > 0) {
171 size_t len = (srclen >= dstsize) ? dstsize - 1 : srclen;
172
173 memset(dst, 0, dstsize);
174 memcpy(dst, src, len);
175 }
176
177 return srclen;
178}
179
180/**
181 * nla_memcpy - Copy a netlink attribute into another memory area
 182 * @dest: where to copy to
183 * @src: netlink attribute to copy from
184 * @count: size of the destination area
185 *
 186 * Note: The number of bytes copied is limited by the length of the
 187 *       attribute's payload.
188 *
189 * Returns the number of bytes copied.
190 */
191int nla_memcpy(void *dest, struct nlattr *src, int count)
192{
193 int minlen = min_t(int, count, nla_len(src));
194
195 memcpy(dest, nla_data(src), minlen);
196
197 return minlen;
198}
199
200/**
201 * nla_memcmp - Compare an attribute with sized memory area
202 * @nla: netlink attribute
203 * @data: memory area
204 * @size: size of memory area
205 */
206int nla_memcmp(const struct nlattr *nla, const void *data,
207 size_t size)
208{
209 int d = nla_len(nla) - size;
210
211 if (d == 0)
212 d = memcmp(nla_data(nla), data, size);
213
214 return d;
215}
216
217/**
218 * nla_strcmp - Compare a string attribute against a string
219 * @nla: netlink string attribute
220 * @str: another string
221 */
222int nla_strcmp(const struct nlattr *nla, const char *str)
223{
224 int len = strlen(str) + 1;
225 int d = nla_len(nla) - len;
226
227 if (d == 0)
228 d = memcmp(nla_data(nla), str, len);
229
230 return d;
231}
232
233/**
234 * __nla_reserve - reserve room for attribute on the skb
235 * @skb: socket buffer to reserve room on
236 * @attrtype: attribute type
237 * @attrlen: length of attribute payload
238 *
239 * Adds a netlink attribute header to a socket buffer and reserves
240 * room for the payload but does not copy it.
241 *
 242 * The caller is responsible for ensuring that the skb provides enough
243 * tailroom for the attribute header and payload.
244 */
245struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
246{
247 struct nlattr *nla;
248
249 nla = (struct nlattr *) skb_put(skb, nla_total_size(attrlen));
250 nla->nla_type = attrtype;
251 nla->nla_len = nla_attr_size(attrlen);
252
253 memset((unsigned char *) nla + nla->nla_len, 0, nla_padlen(attrlen));
254
255 return nla;
256}
257
258/**
259 * nla_reserve - reserve room for attribute on the skb
260 * @skb: socket buffer to reserve room on
261 * @attrtype: attribute type
262 * @attrlen: length of attribute payload
263 *
264 * Adds a netlink attribute header to a socket buffer and reserves
265 * room for the payload but does not copy it.
266 *
267 * Returns NULL if the tailroom of the skb is insufficient to store
268 * the attribute header and payload.
269 */
270struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
271{
272 if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen)))
273 return NULL;
274
275 return __nla_reserve(skb, attrtype, attrlen);
276}
277
278/**
279 * __nla_put - Add a netlink attribute to a socket buffer
280 * @skb: socket buffer to add attribute to
281 * @attrtype: attribute type
282 * @attrlen: length of attribute payload
283 * @data: head of attribute payload
284 *
 285 * The caller is responsible for ensuring that the skb provides enough
286 * tailroom for the attribute header and payload.
287 */
288void __nla_put(struct sk_buff *skb, int attrtype, int attrlen,
289 const void *data)
290{
291 struct nlattr *nla;
292
293 nla = __nla_reserve(skb, attrtype, attrlen);
294 memcpy(nla_data(nla), data, attrlen);
295}
296
297
298/**
299 * nla_put - Add a netlink attribute to a socket buffer
300 * @skb: socket buffer to add attribute to
301 * @attrtype: attribute type
302 * @attrlen: length of attribute payload
303 * @data: head of attribute payload
304 *
305 * Returns -1 if the tailroom of the skb is insufficient to store
306 * the attribute header and payload.
307 */
308int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
309{
310 if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen)))
311 return -1;
312
313 __nla_put(skb, attrtype, attrlen, data);
314 return 0;
315}
316
317
318EXPORT_SYMBOL(nla_validate);
319EXPORT_SYMBOL(nla_parse);
320EXPORT_SYMBOL(nla_find);
321EXPORT_SYMBOL(nla_strlcpy);
322EXPORT_SYMBOL(__nla_reserve);
323EXPORT_SYMBOL(nla_reserve);
324EXPORT_SYMBOL(__nla_put);
325EXPORT_SYMBOL(nla_put);
326EXPORT_SYMBOL(nla_memcpy);
327EXPORT_SYMBOL(nla_memcmp);
328EXPORT_SYMBOL(nla_strcmp);
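
A short consumer of the new attribute API ties the pieces together: a policy array drives validation, nla_parse() indexes the attributes by type, and the typed accessors read them out. Everything MY_-prefixed below is invented for illustration; nla_get_u16() and nla_strlcpy() are the helpers from <net/netlink.h>:

#include <net/netlink.h>

enum {
	MY_ATTR_UNSPEC,
	MY_ATTR_PORT,
	MY_ATTR_NAME,
	__MY_ATTR_MAX,
};
#define MY_ATTR_MAX (__MY_ATTR_MAX - 1)

static struct nla_policy my_policy[MY_ATTR_MAX + 1] = {
	[MY_ATTR_PORT]	= { .type = NLA_U16 },
	[MY_ATTR_NAME]	= { .type = NLA_STRING, .minlen = 2 },
};

static int my_parse(struct nlattr *head, int len)
{
	struct nlattr *tb[MY_ATTR_MAX + 1];
	char name[16];
	int err;

	err = nla_parse(tb, MY_ATTR_MAX, head, len, my_policy);
	if (err < 0)
		return err;

	if (tb[MY_ATTR_PORT])
		printk(KERN_DEBUG "port %u\n", nla_get_u16(tb[MY_ATTR_PORT]));

	if (tb[MY_ATTR_NAME])
		nla_strlcpy(name, tb[MY_ATTR_NAME], sizeof(name));

	return 0;
}
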
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
new file mode 100644
index 000000000000..287cfcc56951
--- /dev/null
+++ b/net/netlink/genetlink.c
@@ -0,0 +1,579 @@
1/*
2 * NETLINK Generic Netlink Family
3 *
4 * Authors: Jamal Hadi Salim
5 * Thomas Graf <tgraf@suug.ch>
6 */
7
8#include <linux/config.h>
9#include <linux/module.h>
10#include <linux/kernel.h>
11#include <linux/errno.h>
12#include <linux/types.h>
13#include <linux/socket.h>
14#include <linux/string.h>
15#include <linux/skbuff.h>
16#include <net/sock.h>
17#include <net/genetlink.h>
18
19struct sock *genl_sock = NULL;
20
21static DECLARE_MUTEX(genl_sem); /* serialization of message processing */
22
23static void genl_lock(void)
24{
25 down(&genl_sem);
26}
27
28static int genl_trylock(void)
29{
30 return down_trylock(&genl_sem);
31}
32
33static void genl_unlock(void)
34{
35 up(&genl_sem);
36
37 if (genl_sock && genl_sock->sk_receive_queue.qlen)
38 genl_sock->sk_data_ready(genl_sock, 0);
39}
40
41#define GENL_FAM_TAB_SIZE 16
42#define GENL_FAM_TAB_MASK (GENL_FAM_TAB_SIZE - 1)
43
44static struct list_head family_ht[GENL_FAM_TAB_SIZE];
45
46static int genl_ctrl_event(int event, void *data);
47
48static inline unsigned int genl_family_hash(unsigned int id)
49{
50 return id & GENL_FAM_TAB_MASK;
51}
52
53static inline struct list_head *genl_family_chain(unsigned int id)
54{
55 return &family_ht[genl_family_hash(id)];
56}
57
58static struct genl_family *genl_family_find_byid(unsigned int id)
59{
60 struct genl_family *f;
61
62 list_for_each_entry(f, genl_family_chain(id), family_list)
63 if (f->id == id)
64 return f;
65
66 return NULL;
67}
68
69static struct genl_family *genl_family_find_byname(char *name)
70{
71 struct genl_family *f;
72 int i;
73
74 for (i = 0; i < GENL_FAM_TAB_SIZE; i++)
75 list_for_each_entry(f, genl_family_chain(i), family_list)
76 if (strcmp(f->name, name) == 0)
77 return f;
78
79 return NULL;
80}
81
82static struct genl_ops *genl_get_cmd(u8 cmd, struct genl_family *family)
83{
84 struct genl_ops *ops;
85
86 list_for_each_entry(ops, &family->ops_list, ops_list)
87 if (ops->cmd == cmd)
88 return ops;
89
90 return NULL;
91}
92
93/* Of course we are going to have problems once we hit
94 * 2^16 alive types, but that can only happen by year 2K
95*/
96static inline u16 genl_generate_id(void)
97{
98 static u16 id_gen_idx;
99 int overflowed = 0;
100
101 do {
102 if (id_gen_idx == 0)
103 id_gen_idx = GENL_MIN_ID;
104
105 if (++id_gen_idx > GENL_MAX_ID) {
106 if (!overflowed) {
107 overflowed = 1;
108 id_gen_idx = 0;
109 continue;
110 } else
111 return 0;
112 }
113
114 } while (genl_family_find_byid(id_gen_idx));
115
116 return id_gen_idx;
117}
118
119/**
120 * genl_register_ops - register generic netlink operations
121 * @family: generic netlink family
122 * @ops: operations to be registered
123 *
124 * Registers the specified operations and assigns them to the specified
125 * family. Either a doit or dumpit callback must be specified or the
126 * operation will fail. Only one operation structure per command
127 * identifier may be registered.
128 *
 129 * See include/net/genetlink.h for more documentation on the operations
130 * structure.
131 *
132 * Returns 0 on success or a negative error code.
133 */
134int genl_register_ops(struct genl_family *family, struct genl_ops *ops)
135{
136 int err = -EINVAL;
137
138 if (ops->dumpit == NULL && ops->doit == NULL)
139 goto errout;
140
141 if (genl_get_cmd(ops->cmd, family)) {
142 err = -EEXIST;
143 goto errout;
144 }
145
146 genl_lock();
147 list_add_tail(&ops->ops_list, &family->ops_list);
148 genl_unlock();
149
150 genl_ctrl_event(CTRL_CMD_NEWOPS, ops);
151 err = 0;
152errout:
153 return err;
154}
155
156/**
157 * genl_unregister_ops - unregister generic netlink operations
158 * @family: generic netlink family
159 * @ops: operations to be unregistered
160 *
161 * Unregisters the specified operations and unassigns them from the
 162 * specified family. The call blocks until any message processing
 163 * currently in progress has finished, and processing does not
 164 * resume until the unregistration is complete.
165 *
166 * Note: It is not necessary to unregister all operations before
 167 * unregistering the family; unregistering the family will cause
168 * all assigned operations to be unregistered automatically.
169 *
170 * Returns 0 on success or a negative error code.
171 */
172int genl_unregister_ops(struct genl_family *family, struct genl_ops *ops)
173{
174 struct genl_ops *rc;
175
176 genl_lock();
177 list_for_each_entry(rc, &family->ops_list, ops_list) {
178 if (rc == ops) {
179 list_del(&ops->ops_list);
180 genl_unlock();
181 genl_ctrl_event(CTRL_CMD_DELOPS, ops);
182 return 0;
183 }
184 }
185 genl_unlock();
186
187 return -ENOENT;
188}
189
190/**
191 * genl_register_family - register a generic netlink family
192 * @family: generic netlink family
193 *
 194 * Registers the specified family after validating it. Only one
 195 * family may be registered with the same family name or identifier.
 196 * The family id may equal GENL_ID_GENERATE, causing a unique id to
 197 * be automatically generated and assigned.
198 *
 199 * Returns 0 on success or a negative error code.
200 */
201int genl_register_family(struct genl_family *family)
202{
203 int err = -EINVAL;
204
205 if (family->id && family->id < GENL_MIN_ID)
206 goto errout;
207
208 if (family->id > GENL_MAX_ID)
209 goto errout;
210
211 INIT_LIST_HEAD(&family->ops_list);
212
213 genl_lock();
214
215 if (genl_family_find_byname(family->name)) {
216 err = -EEXIST;
217 goto errout_locked;
218 }
219
220 if (genl_family_find_byid(family->id)) {
221 err = -EEXIST;
222 goto errout_locked;
223 }
224
225 if (!try_module_get(family->owner)) {
226 err = -EBUSY;
227 goto errout_locked;
228 }
229
230 if (family->id == GENL_ID_GENERATE) {
231 u16 newid = genl_generate_id();
232
233 if (!newid) {
234 err = -ENOMEM;
235 goto errout_locked;
236 }
237
238 family->id = newid;
239 }
240
241 if (family->maxattr) {
242 family->attrbuf = kmalloc((family->maxattr+1) *
243 sizeof(struct nlattr *), GFP_KERNEL);
244 if (family->attrbuf == NULL) {
245 err = -ENOMEM;
246 goto errout;
247 }
248 } else
249 family->attrbuf = NULL;
250
251 list_add_tail(&family->family_list, genl_family_chain(family->id));
252 genl_unlock();
253
254 genl_ctrl_event(CTRL_CMD_NEWFAMILY, family);
255
256 return 0;
257
258errout_locked:
259 genl_unlock();
260errout:
261 return err;
262}
263
264/**
265 * genl_unregister_family - unregister generic netlink family
266 * @family: generic netlink family
267 *
268 * Unregisters the specified family.
269 *
270 * Returns 0 on success or a negative error code.
271 */
272int genl_unregister_family(struct genl_family *family)
273{
274 struct genl_family *rc;
275
276 genl_lock();
277
278 list_for_each_entry(rc, genl_family_chain(family->id), family_list) {
279 if (family->id != rc->id || strcmp(rc->name, family->name))
280 continue;
281
282 list_del(&rc->family_list);
283 INIT_LIST_HEAD(&family->ops_list);
284 genl_unlock();
285
286 module_put(family->owner);
287 kfree(family->attrbuf);
288 genl_ctrl_event(CTRL_CMD_DELFAMILY, family);
289 return 0;
290 }
291
292 genl_unlock();
293
294 return -ENOENT;
295}
296
297static inline int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
298 int *errp)
299{
300 struct genl_ops *ops;
301 struct genl_family *family;
302 struct genl_info info;
303 struct genlmsghdr *hdr = nlmsg_data(nlh);
304 int hdrlen, err = -EINVAL;
305
306 if (!(nlh->nlmsg_flags & NLM_F_REQUEST))
307 goto ignore;
308
309 if (nlh->nlmsg_type < NLMSG_MIN_TYPE)
310 goto ignore;
311
312 family = genl_family_find_byid(nlh->nlmsg_type);
313 if (family == NULL) {
314 err = -ENOENT;
315 goto errout;
316 }
317
318 hdrlen = GENL_HDRLEN + family->hdrsize;
319 if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
320 goto errout;
321
322 ops = genl_get_cmd(hdr->cmd, family);
323 if (ops == NULL) {
324 err = -EOPNOTSUPP;
325 goto errout;
326 }
327
328 if ((ops->flags & GENL_ADMIN_PERM) && security_netlink_recv(skb)) {
329 err = -EPERM;
330 goto errout;
331 }
332
333 if (nlh->nlmsg_flags & NLM_F_DUMP) {
334 if (ops->dumpit == NULL) {
335 err = -EOPNOTSUPP;
336 goto errout;
337 }
338
339 *errp = err = netlink_dump_start(genl_sock, skb, nlh,
340 ops->dumpit, NULL);
341 if (err == 0)
342 skb_pull(skb, min(NLMSG_ALIGN(nlh->nlmsg_len),
343 skb->len));
344 return -1;
345 }
346
347 if (ops->doit == NULL) {
348 err = -EOPNOTSUPP;
349 goto errout;
350 }
351
352 if (family->attrbuf) {
353 err = nlmsg_parse(nlh, hdrlen, family->attrbuf, family->maxattr,
354 ops->policy);
355 if (err < 0)
356 goto errout;
357 }
358
359 info.snd_seq = nlh->nlmsg_seq;
360 info.snd_pid = NETLINK_CB(skb).pid;
361 info.nlhdr = nlh;
362 info.genlhdr = nlmsg_data(nlh);
363 info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN;
364 info.attrs = family->attrbuf;
365
366 *errp = err = ops->doit(skb, &info);
367 return err;
368
369ignore:
370 return 0;
371
372errout:
373 *errp = err;
374 return -1;
375}
376
377static void genl_rcv(struct sock *sk, int len)
378{
379 unsigned int qlen = 0;
380
381 do {
382 if (genl_trylock())
383 return;
384 netlink_run_queue(sk, &qlen, &genl_rcv_msg);
385 genl_unlock();
386 } while (qlen && genl_sock && genl_sock->sk_receive_queue.qlen);
387}
388
389/**************************************************************************
390 * Controller
391 **************************************************************************/
392
393static int ctrl_fill_info(struct genl_family *family, u32 pid, u32 seq,
394 u32 flags, struct sk_buff *skb, u8 cmd)
395{
396 void *hdr;
397
398 hdr = genlmsg_put(skb, pid, seq, GENL_ID_CTRL, 0, flags, cmd,
399 family->version);
400 if (hdr == NULL)
401 return -1;
402
403 NLA_PUT_STRING(skb, CTRL_ATTR_FAMILY_NAME, family->name);
404 NLA_PUT_U16(skb, CTRL_ATTR_FAMILY_ID, family->id);
405
406 return genlmsg_end(skb, hdr);
407
408nla_put_failure:
409 return genlmsg_cancel(skb, hdr);
410}
411
412static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb)
413{
414
415 int i, n = 0;
416 struct genl_family *rt;
417 int chains_to_skip = cb->args[0];
418 int fams_to_skip = cb->args[1];
419
420 for (i = 0; i < GENL_FAM_TAB_SIZE; i++) {
421 if (i < chains_to_skip)
422 continue;
423 n = 0;
424 list_for_each_entry(rt, genl_family_chain(i), family_list) {
425 if (++n < fams_to_skip)
426 continue;
427 if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).pid,
428 cb->nlh->nlmsg_seq, NLM_F_MULTI,
429 skb, CTRL_CMD_NEWFAMILY) < 0)
430 goto errout;
431 }
432
433 fams_to_skip = 0;
434 }
435
436errout:
437 cb->args[0] = i;
438 cb->args[1] = n;
439
440 return skb->len;
441}
442
443static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid,
444 int seq, int cmd)
445{
446 struct sk_buff *skb;
447 int err;
448
449 skb = nlmsg_new(NLMSG_GOODSIZE);
450 if (skb == NULL)
451 return ERR_PTR(-ENOBUFS);
452
453 err = ctrl_fill_info(family, pid, seq, 0, skb, cmd);
454 if (err < 0) {
455 nlmsg_free(skb);
456 return ERR_PTR(err);
457 }
458
459 return skb;
460}
461
462static struct nla_policy ctrl_policy[CTRL_ATTR_MAX+1] __read_mostly = {
463 [CTRL_ATTR_FAMILY_ID] = { .type = NLA_U16 },
464 [CTRL_ATTR_FAMILY_NAME] = { .type = NLA_STRING },
465};
466
467static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info)
468{
469 struct sk_buff *msg;
470 struct genl_family *res = NULL;
471 int err = -EINVAL;
472
473 if (info->attrs[CTRL_ATTR_FAMILY_ID]) {
474 u16 id = nla_get_u16(info->attrs[CTRL_ATTR_FAMILY_ID]);
475 res = genl_family_find_byid(id);
476 }
477
478 if (info->attrs[CTRL_ATTR_FAMILY_NAME]) {
479 char name[GENL_NAMSIZ];
480
481 if (nla_strlcpy(name, info->attrs[CTRL_ATTR_FAMILY_NAME],
482 GENL_NAMSIZ) >= GENL_NAMSIZ)
483 goto errout;
484
485 res = genl_family_find_byname(name);
486 }
487
488 if (res == NULL) {
489 err = -ENOENT;
490 goto errout;
491 }
492
493 msg = ctrl_build_msg(res, info->snd_pid, info->snd_seq,
494 CTRL_CMD_NEWFAMILY);
495 if (IS_ERR(msg)) {
496 err = PTR_ERR(msg);
497 goto errout;
498 }
499
500 err = genlmsg_unicast(msg, info->snd_pid);
501errout:
502 return err;
503}
504
505static int genl_ctrl_event(int event, void *data)
506{
507 struct sk_buff *msg;
508
509 if (genl_sock == NULL)
510 return 0;
511
512 switch (event) {
513 case CTRL_CMD_NEWFAMILY:
514 case CTRL_CMD_DELFAMILY:
515 msg = ctrl_build_msg(data, 0, 0, event);
516 if (IS_ERR(msg))
517 return PTR_ERR(msg);
518
519 genlmsg_multicast(msg, 0, GENL_ID_CTRL);
520 break;
521 }
522
523 return 0;
524}
525
526static struct genl_ops genl_ctrl_ops = {
527 .cmd = CTRL_CMD_GETFAMILY,
528 .doit = ctrl_getfamily,
529 .dumpit = ctrl_dumpfamily,
530 .policy = ctrl_policy,
531};
532
533static struct genl_family genl_ctrl = {
534 .id = GENL_ID_CTRL,
535 .name = "nlctrl",
536 .version = 0x1,
537 .maxattr = CTRL_ATTR_MAX,
538 .owner = THIS_MODULE,
539};
540
541static int __init genl_init(void)
542{
543 int i, err;
544
545 for (i = 0; i < GENL_FAM_TAB_SIZE; i++)
546 INIT_LIST_HEAD(&family_ht[i]);
547
548 err = genl_register_family(&genl_ctrl);
549 if (err < 0)
550 goto errout;
551
552 err = genl_register_ops(&genl_ctrl, &genl_ctrl_ops);
553 if (err < 0)
554 goto errout_register;
555
556 netlink_set_nonroot(NETLINK_GENERIC, NL_NONROOT_RECV);
557 genl_sock = netlink_kernel_create(NETLINK_GENERIC, GENL_MAX_ID,
558 genl_rcv, THIS_MODULE);
559 if (genl_sock == NULL) {
560 panic("GENL: Cannot initialize generic netlink\n");
561 return -ENOMEM;
562 }
563
564 return 0;
565
566errout_register:
567 genl_unregister_family(&genl_ctrl);
568errout:
569 panic("GENL: Cannot register controller: %d\n", err);
570 return err;
571}
572
573subsys_initcall(genl_init);
574
575EXPORT_SYMBOL(genl_sock);
576EXPORT_SYMBOL(genl_register_ops);
577EXPORT_SYMBOL(genl_unregister_ops);
578EXPORT_SYMBOL(genl_register_family);
579EXPORT_SYMBOL(genl_unregister_family);
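
Registering a family against this controller follows directly from the API above. A minimal sketch (the family name and command number are invented, my_policy and MY_ATTR_MAX are the hypothetical definitions from the attribute sketch, and error handling is kept to the essentials):

#include <net/genetlink.h>

static struct genl_family my_family = {
	.id		= GENL_ID_GENERATE,	/* let the core pick an id */
	.name		= "my_family",
	.version	= 1,
	.maxattr	= MY_ATTR_MAX,
	.owner		= THIS_MODULE,
};

static int my_doit(struct sk_buff *skb, struct genl_info *info)
{
	/* info->attrs[] was already parsed against my_ops.policy */
	return 0;
}

static struct genl_ops my_ops = {
	.cmd	= 1,
	.doit	= my_doit,
	.policy	= my_policy,
};

static int __init my_genl_init(void)
{
	int err;

	err = genl_register_family(&my_family);
	if (err)
		return err;

	err = genl_register_ops(&my_family, &my_ops);
	if (err)
		genl_unregister_family(&my_family);

	return err;
}
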
diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c
index 122c086ee2db..dbe6105e83a5 100644
--- a/net/rxrpc/transport.c
+++ b/net/rxrpc/transport.c
@@ -23,6 +23,7 @@
23#include <linux/in.h> 23#include <linux/in.h>
24#include <linux/in6.h> 24#include <linux/in6.h>
25#include <linux/icmp.h> 25#include <linux/icmp.h>
26#include <linux/skbuff.h>
26#include <net/sock.h> 27#include <net/sock.h>
27#include <net/ip.h> 28#include <net/ip.h>
28#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) 29#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
@@ -475,15 +476,11 @@ void rxrpc_trans_receive_packet(struct rxrpc_transport *trans)
475 476
476 /* we'll probably need to checksum it (didn't call 477 /* we'll probably need to checksum it (didn't call
477 * sock_recvmsg) */ 478 * sock_recvmsg) */
478 if (pkt->ip_summed != CHECKSUM_UNNECESSARY) { 479 if (skb_checksum_complete(pkt)) {
479 if ((unsigned short) 480 kfree_skb(pkt);
480 csum_fold(skb_checksum(pkt, 0, pkt->len, 481 rxrpc_krxiod_queue_transport(trans);
481 pkt->csum))) { 482 _leave(" CSUM failed");
482 kfree_skb(pkt); 483 return;
483 rxrpc_krxiod_queue_transport(trans);
484 _leave(" CSUM failed");
485 return;
486 }
487 } 484 }
488 485
489 addr = pkt->nh.iph->saddr; 486 addr = pkt->nh.iph->saddr;
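
skb_checksum_complete() bundles the old two-step test: it is a no-op when the device already verified the checksum (CHECKSUM_UNNECESSARY) and otherwise folds the full packet checksum, returning nonzero on corruption. The receive-side idiom this series converts rxrpc and sunrpc to, in isolation (process_packet() is a hypothetical consumer):

static int my_receive(struct sk_buff *skb)
{
	/* nonzero return means the checksum did not verify */
	if (skb_checksum_complete(skb)) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return process_packet(skb);
}
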
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 8c8ddf7f9b61..dec68a604773 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -128,9 +128,29 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
128 */ 128 */
129 asoc->max_burst = sctp_max_burst; 129 asoc->max_burst = sctp_max_burst;
130 130
131 /* Copy things from the endpoint. */ 131 /* initialize association timers */
132 asoc->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0;
133 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial;
134 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial;
135 asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = asoc->rto_initial;
136 asoc->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0;
137 asoc->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = 0;
138
139 /* sctpimpguide Section 2.12.2
140 * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the
141 * recommended value of 5 times 'RTO.Max'.
142 */
143 asoc->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]
144 = 5 * asoc->rto_max;
145
146 asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
147 asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] =
148 SCTP_DEFAULT_TIMEOUT_SACK;
149 asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
150 sp->autoclose * HZ;
151
 152	/* Initializes the timers */
132 for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) { 153 for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) {
133 asoc->timeouts[i] = ep->timeouts[i];
134 init_timer(&asoc->timers[i]); 154 init_timer(&asoc->timers[i]);
135 asoc->timers[i].function = sctp_timer_events[i]; 155 asoc->timers[i].function = sctp_timer_events[i];
136 asoc->timers[i].data = (unsigned long) asoc; 156 asoc->timers[i].data = (unsigned long) asoc;
@@ -157,10 +177,10 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
157 * RFC 6 - A SCTP receiver MUST be able to receive a minimum of 177 * RFC 6 - A SCTP receiver MUST be able to receive a minimum of
158 * 1500 bytes in one SCTP packet. 178 * 1500 bytes in one SCTP packet.
159 */ 179 */
160 if (sk->sk_rcvbuf < SCTP_DEFAULT_MINWINDOW) 180 if ((sk->sk_rcvbuf/2) < SCTP_DEFAULT_MINWINDOW)
161 asoc->rwnd = SCTP_DEFAULT_MINWINDOW; 181 asoc->rwnd = SCTP_DEFAULT_MINWINDOW;
162 else 182 else
163 asoc->rwnd = sk->sk_rcvbuf; 183 asoc->rwnd = sk->sk_rcvbuf/2;
164 184
165 asoc->a_rwnd = asoc->rwnd; 185 asoc->a_rwnd = asoc->rwnd;
166 186
@@ -172,6 +192,9 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
172 /* Set the sndbuf size for transmit. */ 192 /* Set the sndbuf size for transmit. */
173 asoc->sndbuf_used = 0; 193 asoc->sndbuf_used = 0;
174 194
195 /* Initialize the receive memory counter */
196 atomic_set(&asoc->rmem_alloc, 0);
197
175 init_waitqueue_head(&asoc->wait); 198 init_waitqueue_head(&asoc->wait);
176 199
177 asoc->c.my_vtag = sctp_generate_tag(ep); 200 asoc->c.my_vtag = sctp_generate_tag(ep);
@@ -380,6 +403,8 @@ static void sctp_association_destroy(struct sctp_association *asoc)
380 spin_unlock_bh(&sctp_assocs_id_lock); 403 spin_unlock_bh(&sctp_assocs_id_lock);
381 } 404 }
382 405
406 BUG_TRAP(!atomic_read(&asoc->rmem_alloc));
407
383 if (asoc->base.malloced) { 408 if (asoc->base.malloced) {
384 kfree(asoc); 409 kfree(asoc);
385 SCTP_DBG_OBJCNT_DEC(assoc); 410 SCTP_DBG_OBJCNT_DEC(assoc);
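
Halving sk_rcvbuf here pairs with the new rmem accounting: received data is now charged at skb->truesize, which includes struct and allocator overhead, so only half the socket buffer is advertised as window while the other half absorbs that overhead. The clamp in isolation (a sketch, not part of the patch):

static unsigned int initial_rwnd(struct sock *sk)
{
	unsigned int rwnd = sk->sk_rcvbuf / 2;

	/* never advertise less than the required 1500-byte minimum window */
	if (rwnd < SCTP_DEFAULT_MINWINDOW)
		rwnd = SCTP_DEFAULT_MINWINDOW;

	return rwnd;
}
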
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 96984f7a2d69..67bd53070ee0 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -70,7 +70,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
70 struct sock *sk, 70 struct sock *sk,
71 gfp_t gfp) 71 gfp_t gfp)
72{ 72{
73 struct sctp_sock *sp = sctp_sk(sk);
74 memset(ep, 0, sizeof(struct sctp_endpoint)); 73 memset(ep, 0, sizeof(struct sctp_endpoint));
75 74
76 /* Initialize the base structure. */ 75 /* Initialize the base structure. */
@@ -100,33 +99,14 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
100 /* Create the lists of associations. */ 99 /* Create the lists of associations. */
101 INIT_LIST_HEAD(&ep->asocs); 100 INIT_LIST_HEAD(&ep->asocs);
102 101
103 /* Set up the base timeout information. */
104 ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0;
105 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
106 msecs_to_jiffies(sp->rtoinfo.srto_initial);
107 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
108 msecs_to_jiffies(sp->rtoinfo.srto_initial);
109 ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] =
110 msecs_to_jiffies(sp->rtoinfo.srto_initial);
111 ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0;
112 ep->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = 0;
113
114 /* sctpimpguide-05 Section 2.12.2
115 * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the
116 * recommended value of 5 times 'RTO.Max'.
117 */
118 ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]
119 = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max);
120
121 ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
122 ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = sctp_sack_timeout;
123 ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ;
124
125 /* Use SCTP specific send buffer space queues. */ 102 /* Use SCTP specific send buffer space queues. */
126 ep->sndbuf_policy = sctp_sndbuf_policy; 103 ep->sndbuf_policy = sctp_sndbuf_policy;
127 sk->sk_write_space = sctp_write_space; 104 sk->sk_write_space = sctp_write_space;
128 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 105 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
129 106
107 /* Get the receive buffer policy for this endpoint */
108 ep->rcvbuf_policy = sctp_rcvbuf_policy;
109
130 /* Initialize the secret key used with cookie. */ 110 /* Initialize the secret key used with cookie. */
131 get_random_bytes(&ep->secret_key[0], SCTP_SECRET_SIZE); 111 get_random_bytes(&ep->secret_key[0], SCTP_SECRET_SIZE);
132 ep->last_key = ep->current_key = 0; 112 ep->last_key = ep->current_key = 0;
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 28f32243397f..b24ff2c1aef5 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -100,21 +100,6 @@ static inline int sctp_rcv_checksum(struct sk_buff *skb)
100 return 0; 100 return 0;
101} 101}
102 102
103/* The free routine for skbuffs that sctp receives */
104static void sctp_rfree(struct sk_buff *skb)
105{
106 atomic_sub(sizeof(struct sctp_chunk),&skb->sk->sk_rmem_alloc);
107 sock_rfree(skb);
108}
109
110/* The ownership wrapper routine to do receive buffer accounting */
111static void sctp_rcv_set_owner_r(struct sk_buff *skb, struct sock *sk)
112{
113 skb_set_owner_r(skb,sk);
114 skb->destructor = sctp_rfree;
115 atomic_add(sizeof(struct sctp_chunk),&sk->sk_rmem_alloc);
116}
117
118struct sctp_input_cb { 103struct sctp_input_cb {
119 union { 104 union {
120 struct inet_skb_parm h4; 105 struct inet_skb_parm h4;
@@ -217,9 +202,6 @@ int sctp_rcv(struct sk_buff *skb)
217 rcvr = &ep->base; 202 rcvr = &ep->base;
218 } 203 }
219 204
220 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
221 goto discard_release;
222
223 /* 205 /*
224 * RFC 2960, 8.4 - Handle "Out of the blue" Packets. 206 * RFC 2960, 8.4 - Handle "Out of the blue" Packets.
225 * An SCTP packet is called an "out of the blue" (OOTB) 207 * An SCTP packet is called an "out of the blue" (OOTB)
@@ -256,8 +238,6 @@ int sctp_rcv(struct sk_buff *skb)
256 } 238 }
257 SCTP_INPUT_CB(skb)->chunk = chunk; 239 SCTP_INPUT_CB(skb)->chunk = chunk;
258 240
259 sctp_rcv_set_owner_r(skb,sk);
260
261 /* Remember what endpoint is to handle this packet. */ 241 /* Remember what endpoint is to handle this packet. */
262 chunk->rcvr = rcvr; 242 chunk->rcvr = rcvr;
263 243
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 26de4d3e1bd9..f775d78aa59d 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -530,6 +530,9 @@ static void sctp_v4_get_saddr(struct sctp_association *asoc,
530{ 530{
531 struct rtable *rt = (struct rtable *)dst; 531 struct rtable *rt = (struct rtable *)dst;
532 532
533 if (!asoc)
534 return;
535
533 if (rt) { 536 if (rt) {
534 saddr->v4.sin_family = AF_INET; 537 saddr->v4.sin_family = AF_INET;
535 saddr->v4.sin_port = asoc->base.bind_addr.port; 538 saddr->v4.sin_port = asoc->base.bind_addr.port;
@@ -1047,6 +1050,9 @@ SCTP_STATIC __init int sctp_init(void)
1047 /* Sendbuffer growth - do per-socket accounting */ 1050 /* Sendbuffer growth - do per-socket accounting */
1048 sctp_sndbuf_policy = 0; 1051 sctp_sndbuf_policy = 0;
1049 1052
1053 /* Rcvbuffer growth - do per-socket accounting */
1054 sctp_rcvbuf_policy = 0;
1055
1050 /* HB.interval - 30 seconds */ 1056 /* HB.interval - 30 seconds */
1051 sctp_hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT; 1057 sctp_hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
1052 1058
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index f84173ea8ec1..823947170a33 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -385,7 +385,7 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
385 NULL, 385 NULL,
386 sctp_generate_t4_rto_event, 386 sctp_generate_t4_rto_event,
387 sctp_generate_t5_shutdown_guard_event, 387 sctp_generate_t5_shutdown_guard_event,
388 sctp_generate_heartbeat_event, 388 NULL,
389 sctp_generate_sack_event, 389 sctp_generate_sack_event,
390 sctp_generate_autoclose_event, 390 sctp_generate_autoclose_event,
391}; 391};
@@ -689,9 +689,9 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
689 * increased due to timer expirations. 689 * increased due to timer expirations.
690 */ 690 */
691 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = 691 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
692 asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT]; 692 asoc->rto_initial;
693 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = 693 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
694 asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE]; 694 asoc->rto_initial;
695 } 695 }
696 696
697 if (sctp_state(asoc, ESTABLISHED) || 697 if (sctp_state(asoc, ESTABLISHED) ||
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 505c7de10c50..475bfb4972d9 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -5160,6 +5160,8 @@ static int sctp_eat_data(const struct sctp_association *asoc,
5160 sctp_verb_t deliver; 5160 sctp_verb_t deliver;
5161 int tmp; 5161 int tmp;
5162 __u32 tsn; 5162 __u32 tsn;
5163 int account_value;
5164 struct sock *sk = asoc->base.sk;
5163 5165
5164 data_hdr = chunk->subh.data_hdr = (sctp_datahdr_t *)chunk->skb->data; 5166 data_hdr = chunk->subh.data_hdr = (sctp_datahdr_t *)chunk->skb->data;
5165 skb_pull(chunk->skb, sizeof(sctp_datahdr_t)); 5167 skb_pull(chunk->skb, sizeof(sctp_datahdr_t));
@@ -5169,6 +5171,26 @@ static int sctp_eat_data(const struct sctp_association *asoc,
5169 5171
5170 /* ASSERT: Now skb->data is really the user data. */ 5172 /* ASSERT: Now skb->data is really the user data. */
5171 5173
5174 /*
5175 * if we are established, and we have used up our receive
5176 * buffer memory, drop the frame
5177 */
5178 if (asoc->state == SCTP_STATE_ESTABLISHED) {
5179 /*
5180 * If the receive buffer policy is 1, then each
 5181		 * association can allocate up to sk_rcvbuf bytes;
5182 * otherwise, all the associations in aggregate
5183 * may allocate up to sk_rcvbuf bytes
5184 */
5185 if (asoc->ep->rcvbuf_policy)
5186 account_value = atomic_read(&asoc->rmem_alloc);
5187 else
5188 account_value = atomic_read(&sk->sk_rmem_alloc);
5189
5190 if (account_value > sk->sk_rcvbuf)
5191 return SCTP_IERROR_IGNORE_TSN;
5192 }
5193
5172 /* Process ECN based congestion. 5194 /* Process ECN based congestion.
5173 * 5195 *
5174 * Since the chunk structure is reused for all chunks within 5196 * Since the chunk structure is reused for all chunks within
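
The acceptance test added to sctp_eat_data() reduces to choosing the accounting domain by policy and comparing it against the socket limit. A condensed helper-style rendering of the same logic (the helper itself is illustrative, not part of the patch):

static int my_rmem_over_limit(struct sctp_association *asoc, struct sock *sk)
{
	int used;

	if (asoc->ep->rcvbuf_policy)		/* per-association limit */
		used = atomic_read(&asoc->rmem_alloc);
	else					/* aggregate, per-socket */
		used = atomic_read(&sk->sk_rmem_alloc);

	return used > sk->sk_rcvbuf;
}
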
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b529af5e6f2a..abab81f3818f 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1932,7 +1932,6 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
1932 if (copy_from_user(&sp->autoclose, optval, optlen)) 1932 if (copy_from_user(&sp->autoclose, optval, optlen))
1933 return -EFAULT; 1933 return -EFAULT;
1934 1934
1935 sp->ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ;
1936 return 0; 1935 return 0;
1937} 1936}
1938 1937
@@ -5115,8 +5114,10 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
5115 sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) { 5114 sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) {
5116 event = sctp_skb2event(skb); 5115 event = sctp_skb2event(skb);
5117 if (event->asoc == assoc) { 5116 if (event->asoc == assoc) {
5117 sock_rfree(skb);
5118 __skb_unlink(skb, &oldsk->sk_receive_queue); 5118 __skb_unlink(skb, &oldsk->sk_receive_queue);
5119 __skb_queue_tail(&newsk->sk_receive_queue, skb); 5119 __skb_queue_tail(&newsk->sk_receive_queue, skb);
5120 skb_set_owner_r(skb, newsk);
5120 } 5121 }
5121 } 5122 }
5122 5123
@@ -5144,8 +5145,10 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
5144 sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) { 5145 sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) {
5145 event = sctp_skb2event(skb); 5146 event = sctp_skb2event(skb);
5146 if (event->asoc == assoc) { 5147 if (event->asoc == assoc) {
5148 sock_rfree(skb);
5147 __skb_unlink(skb, &oldsp->pd_lobby); 5149 __skb_unlink(skb, &oldsp->pd_lobby);
5148 __skb_queue_tail(queue, skb); 5150 __skb_queue_tail(queue, skb);
5151 skb_set_owner_r(skb, newsk);
5149 } 5152 }
5150 } 5153 }
5151 5154
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 75b28dd634fe..fcd7096c953d 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -121,6 +121,14 @@ static ctl_table sctp_table[] = {
121 .proc_handler = &proc_dointvec 121 .proc_handler = &proc_dointvec
122 }, 122 },
123 { 123 {
124 .ctl_name = NET_SCTP_RCVBUF_POLICY,
125 .procname = "rcvbuf_policy",
126 .data = &sctp_rcvbuf_policy,
127 .maxlen = sizeof(int),
128 .mode = 0644,
129 .proc_handler = &proc_dointvec
130 },
131 {
124 .ctl_name = NET_SCTP_PATH_MAX_RETRANS, 132 .ctl_name = NET_SCTP_PATH_MAX_RETRANS,
125 .procname = "path_max_retrans", 133 .procname = "path_max_retrans",
126 .data = &sctp_max_retrans_path, 134 .data = &sctp_max_retrans_path,
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index e049f41faa47..ba97f974f57c 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -52,19 +52,6 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event,
52 struct sctp_association *asoc); 52 struct sctp_association *asoc);
53static void sctp_ulpevent_release_data(struct sctp_ulpevent *event); 53static void sctp_ulpevent_release_data(struct sctp_ulpevent *event);
54 54
55/* Stub skb destructor. */
56static void sctp_stub_rfree(struct sk_buff *skb)
57{
58/* WARNING: This function is just a warning not to use the
59 * skb destructor. If the skb is shared, we may get the destructor
60 * callback on some processor that does not own the sock_lock. This
 61 * was occurring with PACKET socket applications that were monitoring
62 * our skbs. We can't take the sock_lock, because we can't risk
63 * recursing if we do really own the sock lock. Instead, do all
64 * of our rwnd manipulation while we own the sock_lock outright.
65 */
66}
67
 68/* Initialize an ULP event from a given skb. */ 55/* Initialize an ULP event from a given skb. */
69SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags) 56SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags)
70{ 57{
@@ -111,15 +98,19 @@ static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event,
111 */ 98 */
112 sctp_association_hold((struct sctp_association *)asoc); 99 sctp_association_hold((struct sctp_association *)asoc);
113 skb = sctp_event2skb(event); 100 skb = sctp_event2skb(event);
114 skb->sk = asoc->base.sk;
115 event->asoc = (struct sctp_association *)asoc; 101 event->asoc = (struct sctp_association *)asoc;
116 skb->destructor = sctp_stub_rfree; 102 atomic_add(skb->truesize, &event->asoc->rmem_alloc);
103 skb_set_owner_r(skb, asoc->base.sk);
117} 104}
118 105
119/* A simple destructor to give up the reference to the association. */ 106/* A simple destructor to give up the reference to the association. */
120static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event) 107static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event)
121{ 108{
122 sctp_association_put(event->asoc); 109 struct sctp_association *asoc = event->asoc;
110 struct sk_buff *skb = sctp_event2skb(event);
111
112 atomic_sub(skb->truesize, &asoc->rmem_alloc);
113 sctp_association_put(asoc);
123} 114}
124 115
125/* Create and initialize an SCTP_ASSOC_CHANGE event. 116/* Create and initialize an SCTP_ASSOC_CHANGE event.
@@ -922,7 +913,6 @@ done:
922/* Free a ulpevent that has an owner. It includes releasing the reference 913/* Free a ulpevent that has an owner. It includes releasing the reference
923 * to the owner, updating the rwnd in case of a DATA event and freeing the 914 * to the owner, updating the rwnd in case of a DATA event and freeing the
924 * skb. 915 * skb.
925 * See comments in sctp_stub_rfree().
926 */ 916 */
927void sctp_ulpevent_free(struct sctp_ulpevent *event) 917void sctp_ulpevent_free(struct sctp_ulpevent *event)
928{ 918{
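
The ulpevent hooks above show the pattern that replaces the old stub destructor: charge skb->truesize to the association when the event takes ownership, and subtract the same quantity on release, so the counter checked by BUG_TRAP() in sctp_association_destroy() always returns to zero. The pair in schematic form (helper names invented):

static void my_charge_rmem(struct sctp_ulpevent *event,
			   struct sctp_association *asoc)
{
	struct sk_buff *skb = sctp_event2skb(event);

	atomic_add(skb->truesize, &asoc->rmem_alloc);
	skb_set_owner_r(skb, asoc->base.sk);	/* normal socket accounting */
}

static void my_uncharge_rmem(struct sctp_ulpevent *event)
{
	struct sk_buff *skb = sctp_event2skb(event);

	/* must mirror the charge exactly, truesize for truesize */
	atomic_sub(skb->truesize, &event->asoc->rmem_alloc);
}
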
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 702ede309b06..61c3abeaccae 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -55,6 +55,7 @@ static void call_bind(struct rpc_task *task);
55static void call_bind_status(struct rpc_task *task); 55static void call_bind_status(struct rpc_task *task);
56static void call_transmit(struct rpc_task *task); 56static void call_transmit(struct rpc_task *task);
57static void call_status(struct rpc_task *task); 57static void call_status(struct rpc_task *task);
58static void call_transmit_status(struct rpc_task *task);
58static void call_refresh(struct rpc_task *task); 59static void call_refresh(struct rpc_task *task);
59static void call_refreshresult(struct rpc_task *task); 60static void call_refreshresult(struct rpc_task *task);
60static void call_timeout(struct rpc_task *task); 61static void call_timeout(struct rpc_task *task);
@@ -672,6 +673,18 @@ call_allocate(struct rpc_task *task)
672 rpc_exit(task, -ERESTARTSYS); 673 rpc_exit(task, -ERESTARTSYS);
673} 674}
674 675
676static inline int
677rpc_task_need_encode(struct rpc_task *task)
678{
679 return task->tk_rqstp->rq_snd_buf.len == 0;
680}
681
682static inline void
683rpc_task_force_reencode(struct rpc_task *task)
684{
685 task->tk_rqstp->rq_snd_buf.len = 0;
686}
687
675/* 688/*
676 * 3. Encode arguments of an RPC call 689 * 3. Encode arguments of an RPC call
677 */ 690 */
@@ -867,12 +880,14 @@ call_transmit(struct rpc_task *task)
867 if (task->tk_status != 0) 880 if (task->tk_status != 0)
868 return; 881 return;
869 /* Encode here so that rpcsec_gss can use correct sequence number. */ 882 /* Encode here so that rpcsec_gss can use correct sequence number. */
870 if (task->tk_rqstp->rq_bytes_sent == 0) { 883 if (rpc_task_need_encode(task)) {
884 task->tk_rqstp->rq_bytes_sent = 0;
871 call_encode(task); 885 call_encode(task);
872 /* Did the encode result in an error condition? */ 886 /* Did the encode result in an error condition? */
873 if (task->tk_status != 0) 887 if (task->tk_status != 0)
874 goto out_nosend; 888 goto out_nosend;
875 } 889 }
890 task->tk_action = call_transmit_status;
876 xprt_transmit(task); 891 xprt_transmit(task);
877 if (task->tk_status < 0) 892 if (task->tk_status < 0)
878 return; 893 return;
@@ -884,6 +899,7 @@ call_transmit(struct rpc_task *task)
884out_nosend: 899out_nosend:
885 /* release socket write lock before attempting to handle error */ 900 /* release socket write lock before attempting to handle error */
886 xprt_abort_transmit(task); 901 xprt_abort_transmit(task);
902 rpc_task_force_reencode(task);
887} 903}
888 904
889/* 905/*
@@ -915,7 +931,6 @@ call_status(struct rpc_task *task)
915 break; 931 break;
916 case -ECONNREFUSED: 932 case -ECONNREFUSED:
917 case -ENOTCONN: 933 case -ENOTCONN:
918 req->rq_bytes_sent = 0;
919 if (clnt->cl_autobind) 934 if (clnt->cl_autobind)
920 clnt->cl_port = 0; 935 clnt->cl_port = 0;
921 task->tk_action = call_bind; 936 task->tk_action = call_bind;
@@ -937,7 +952,18 @@ call_status(struct rpc_task *task)
937} 952}
938 953
939/* 954/*
940 * 6a. Handle RPC timeout 955 * 6a. Handle transmission errors.
956 */
957static void
958call_transmit_status(struct rpc_task *task)
959{
960 if (task->tk_status != -EAGAIN)
961 rpc_task_force_reencode(task);
962 call_status(task);
963}
964
965/*
966 * 6b. Handle RPC timeout
941 * We do not release the request slot, so we keep using the 967 * We do not release the request slot, so we keep using the
942 * same XID for all retransmits. 968 * same XID for all retransmits.
943 */ 969 */
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 4f188d0a5d11..81e00a6c19de 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -603,7 +603,7 @@ rpc_lookup_negative(char *path, struct nameidata *nd)
                 return ERR_PTR(error);
         dir = nd->dentry->d_inode;
         down(&dir->i_sem);
-        dentry = lookup_hash(&nd->last, nd->dentry);
+        dentry = lookup_hash(nd);
         if (IS_ERR(dentry))
                 goto out_err;
         if (dentry->d_inode) {
@@ -665,7 +665,7 @@ rpc_rmdir(char *path)
                 return error;
         dir = nd.dentry->d_inode;
         down(&dir->i_sem);
-        dentry = lookup_hash(&nd.last, nd.dentry);
+        dentry = lookup_hash(&nd);
         if (IS_ERR(dentry)) {
                 error = PTR_ERR(dentry);
                 goto out_release;
@@ -726,7 +726,7 @@ rpc_unlink(char *path)
                 return error;
         dir = nd.dentry->d_inode;
         down(&dir->i_sem);
-        dentry = lookup_hash(&nd.last, nd.dentry);
+        dentry = lookup_hash(&nd);
         if (IS_ERR(dentry)) {
                 error = PTR_ERR(dentry);
                 goto out_release;
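
The lookup_hash() conversion above follows a VFS interface change of this period: instead of passing the final path component (&nd.last) and the parent dentry as separate arguments, callers now hand over the whole struct nameidata and the helper unpacks both internally. A stand-alone sketch of that "pass the aggregate, not its fields" refactor (the types here are toy stand-ins, not the real kernel definitions):

#include <stdio.h>

struct qstr { const char *name; unsigned int len; };
struct nameidata { struct qstr last; const char *parent; };

/* old style: every call site unpacked the nameidata itself */
static void lookup_old(struct qstr *last, const char *parent)
{
        printf("old: %.*s under %s\n", (int)last->len, last->name, parent);
}

/* new style: the helper unpacks it once, internally */
static void lookup_new(struct nameidata *nd)
{
        printf("new: %.*s under %s\n", (int)nd->last.len, nd->last.name,
               nd->parent);
}

int main(void)
{
        struct nameidata nd = { { "pipe", 4 }, "/var/lib/nfs/rpc_pipefs" };
        lookup_old(&nd.last, nd.parent);        /* three call sites did this */
        lookup_new(&nd);                        /* now a single argument */
        return 0;
}
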
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index 8f97e90f36c8..eb330d4f66d6 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -6,6 +6,9 @@
  * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
  */
 
+#include <linux/compiler.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
 #include <linux/types.h>
 #include <linux/pagemap.h>
 #include <linux/udp.h>
@@ -165,6 +168,8 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
                 return -1;
         if ((unsigned short)csum_fold(desc.csum))
                 return -1;
+        if (unlikely(skb->ip_summed == CHECKSUM_HW))
+                netdev_rx_csum_fault(skb->dev);
         return 0;
 no_checksum:
         if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0)
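
The added check belongs to the hardware-checksum audit running through this merge: by the time control reaches it, software verification of the datagram has already succeeded, so if ip_summed is still CHECKSUM_HW (the device's checksum had failed to validate), the NIC's checksum engine, not the packet, is at fault, and netdev_rx_csum_fault() reports it. A user-space model of the trust-but-verify idea (toy checksum and names, purely illustrative):

#include <stdio.h>

enum hw_verdict { HW_SAYS_OK, HW_SAYS_BAD };

static int sw_checksum_ok(const unsigned char *data, int len)
{
        unsigned int sum = 0;
        int i;

        for (i = 0; i < len; i++)
                sum += data[i];
        return (sum & 0xff) == 0;       /* toy rule, not the Internet csum */
}

static int verify_rx(const unsigned char *data, int len, enum hw_verdict hw)
{
        if (hw == HW_SAYS_OK)
                return 0;               /* fast path: trust the NIC */
        if (!sw_checksum_ok(data, len))
                return -1;              /* genuinely corrupt: drop */
        fprintf(stderr, "rx csum fault: hw said bad, sw says good\n");
        return 0;                       /* deliver, but flag the NIC */
}

int main(void)
{
        unsigned char pkt[] = { 0x10, 0xf0 };   /* sums to 0 mod 256 */
        return verify_rx(pkt, 2, HW_SAYS_BAD);  /* triggers the fault log */
}
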
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index f16e7cdd6150..e50e7cf43737 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -623,12 +623,9 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
                 /* we can use it in-place */
                 rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr);
                 rqstp->rq_arg.head[0].iov_len = len;
-                if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
-                        if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) {
-                                skb_free_datagram(svsk->sk_sk, skb);
-                                return 0;
-                        }
-                        skb->ip_summed = CHECKSUM_UNNECESSARY;
+                if (skb_checksum_complete(skb)) {
+                        skb_free_datagram(svsk->sk_sk, skb);
+                        return 0;
                 }
                 rqstp->rq_skbuff = skb;
         }
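
svc_udp_recvfrom() now delegates the open-coded fold-and-flag sequence it used to carry to skb_checksum_complete(), which also picks up the hardware-fault reporting added to socklib.c above. Roughly what the helper bundles, assuming it mirrors the deleted code plus the fault hook (a paraphrase, not verbatim kernel source):

/* assumes <linux/skbuff.h> and <linux/netdevice.h> */
static inline unsigned int checksum_complete_sketch(struct sk_buff *skb)
{
        unsigned int sum;

        if (skb->ip_summed == CHECKSUM_UNNECESSARY)
                return 0;                       /* device already vouched */
        sum = (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len,
                                                     skb->csum));
        if (!sum) {
                if (skb->ip_summed == CHECKSUM_HW)
                        netdev_rx_csum_fault(skb->dev); /* hw verdict wrong */
                skb->ip_summed = CHECKSUM_UNNECESSARY;  /* cache the result */
        }
        return sum;     /* non-zero: caller frees the datagram */
}
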
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 41feca3bef86..acc73ba8bade 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -676,7 +676,7 @@ static struct sock *unix_find_other(struct sockaddr_un *sunname, int len,
                 err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
                 if (err)
                         goto fail;
-                err = permission(nd.dentry->d_inode,MAY_WRITE, &nd);
+                err = vfs_permission(&nd, MAY_WRITE);
                 if (err)
                         goto put_fail;
 
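
vfs_permission() is a thin convenience wrapper that spares call sites the nameidata dereference; judging from this conversion it should reduce to something close to the sketch below (hypothetical rendering; the real kernel definition may differ in detail):

/* assumes <linux/fs.h> for permission() and struct nameidata */
static inline int vfs_permission_sketch(struct nameidata *nd, int mask)
{
        /* the same check as before, minus the open-coded inode lookup */
        return permission(nd->dentry->d_inode, mask, nd);
}
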
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index c35336a0f71b..0cdd9a07e043 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -18,7 +18,6 @@
 #include <linux/string.h>
 #include <linux/net.h>
 #include <linux/skbuff.h>
-#include <linux/netlink.h>
 #include <linux/rtnetlink.h>
 #include <linux/pfkeyv2.h>
 #include <linux/ipsec.h>
@@ -26,6 +25,7 @@
 #include <linux/security.h>
 #include <net/sock.h>
 #include <net/xfrm.h>
+#include <net/netlink.h>
 #include <asm/uaccess.h>
 
 static struct sock *xfrm_nl;
@@ -948,11 +948,6 @@ static struct xfrm_link {
         [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_flush_policy },
 };
 
-static int xfrm_done(struct netlink_callback *cb)
-{
-        return 0;
-}
-
 static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp)
 {
         struct rtattr *xfrma[XFRMA_MAX];
@@ -984,20 +979,15 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp)
         if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) ||
              type == (XFRM_MSG_GETPOLICY - XFRM_MSG_BASE)) &&
             (nlh->nlmsg_flags & NLM_F_DUMP)) {
-                u32 rlen;
-
                 if (link->dump == NULL)
                         goto err_einval;
 
                 if ((*errp = netlink_dump_start(xfrm_nl, skb, nlh,
-                                                link->dump,
-                                                xfrm_done)) != 0) {
+                                                link->dump, NULL)) != 0) {
                         return -1;
                 }
-                rlen = NLMSG_ALIGN(nlh->nlmsg_len);
-                if (rlen > skb->len)
-                        rlen = skb->len;
-                skb_pull(skb, rlen);
+
+                netlink_queue_skip(nlh, skb);
                 return -1;
         }
 
@@ -1032,60 +1022,13 @@ err_einval:
         return -1;
 }
 
-static int xfrm_user_rcv_skb(struct sk_buff *skb)
-{
-        int err;
-        struct nlmsghdr *nlh;
-
-        while (skb->len >= NLMSG_SPACE(0)) {
-                u32 rlen;
-
-                nlh = (struct nlmsghdr *) skb->data;
-                if (nlh->nlmsg_len < sizeof(*nlh) ||
-                    skb->len < nlh->nlmsg_len)
-                        return 0;
-                rlen = NLMSG_ALIGN(nlh->nlmsg_len);
-                if (rlen > skb->len)
-                        rlen = skb->len;
-                if (xfrm_user_rcv_msg(skb, nlh, &err) < 0) {
-                        if (err == 0)
-                                return -1;
-                        netlink_ack(skb, nlh, err);
-                } else if (nlh->nlmsg_flags & NLM_F_ACK)
-                        netlink_ack(skb, nlh, 0);
-                skb_pull(skb, rlen);
-        }
-
-        return 0;
-}
-
 static void xfrm_netlink_rcv(struct sock *sk, int len)
 {
-        unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
+        unsigned int qlen = 0;
 
         do {
-                struct sk_buff *skb;
-
                 down(&xfrm_cfg_sem);
-
-                if (qlen > skb_queue_len(&sk->sk_receive_queue))
-                        qlen = skb_queue_len(&sk->sk_receive_queue);
-
-                for (; qlen; qlen--) {
-                        skb = skb_dequeue(&sk->sk_receive_queue);
-                        if (xfrm_user_rcv_skb(skb)) {
-                                if (skb->len)
-                                        skb_queue_head(&sk->sk_receive_queue,
-                                                       skb);
-                                else {
-                                        kfree_skb(skb);
-                                        qlen--;
-                                }
-                                break;
-                        }
-                        kfree_skb(skb);
-                }
-
+                netlink_run_queue(sk, &qlen, &xfrm_user_rcv_msg);
                 up(&xfrm_cfg_sem);
 
         } while (qlen);
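
This is the xfrm side of the netlink receive-path consolidation in this merge: the hand-rolled dequeue/requeue loop and the per-subsystem skb walker disappear in favour of the new core helpers, with netlink_run_queue() draining the socket's receive queue and feeding each message to xfrm_user_rcv_msg(), while netlink_queue_skip() (used in the dump branch above) consumes the rest of a message a handler has taken over. A sketch of the resulting pattern for an arbitrary subsystem (simplified, and assuming only the helper signatures visible in the hunks above):

/* per-message handler: return <0 with *errp set to NACK the message,
 * 0 to let the core continue with the next one */
static int example_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
                           int *errp)
{
        /* ... validate nlh->nlmsg_type and dispatch ... */
        return 0;
}

static void example_netlink_rcv(struct sock *sk, int len)
{
        unsigned int qlen = 0;

        do {
                /* take the subsystem lock here if handlers need it */
                netlink_run_queue(sk, &qlen, &example_rcv_msg);
        } while (qlen); /* non-zero qlen: another batch is already queued */
}
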