diff options
author | Jeff Garzik <jgarzik@pobox.com> | 2005-11-11 23:39:35 -0500 |
---|---|---|
committer | Jeff Garzik <jgarzik@pobox.com> | 2005-11-11 23:39:35 -0500 |
commit | f4256e301d9800b1e0276404cb01b3ac85b51067 (patch) | |
tree | 975f56627b78f757608b31684311a24ca1478481 /net | |
parent | fb2a26b9f8f5eda6b96ba9753edf105e5999d6d9 (diff) | |
parent | cd52d1ee9a92587b242d946a2300a3245d3b885a (diff) |
Merge branch 'master'
Diffstat (limited to 'net')
91 files changed, 10036 insertions, 809 deletions
diff --git a/net/core/datagram.c b/net/core/datagram.c index d219435d086c..1bcfef51ac58 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c | |||
@@ -350,6 +350,20 @@ fault: | |||
350 | return -EFAULT; | 350 | return -EFAULT; |
351 | } | 351 | } |
352 | 352 | ||
353 | unsigned int __skb_checksum_complete(struct sk_buff *skb) | ||
354 | { | ||
355 | unsigned int sum; | ||
356 | |||
357 | sum = (u16)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); | ||
358 | if (likely(!sum)) { | ||
359 | if (unlikely(skb->ip_summed == CHECKSUM_HW)) | ||
360 | netdev_rx_csum_fault(skb->dev); | ||
361 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
362 | } | ||
363 | return sum; | ||
364 | } | ||
365 | EXPORT_SYMBOL(__skb_checksum_complete); | ||
366 | |||
353 | /** | 367 | /** |
354 | * skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec. | 368 | * skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec. |
355 | * @skb: skbuff | 369 | * @skb: skbuff |
@@ -363,7 +377,7 @@ fault: | |||
363 | * -EFAULT - fault during copy. Beware, in this case iovec | 377 | * -EFAULT - fault during copy. Beware, in this case iovec |
364 | * can be modified! | 378 | * can be modified! |
365 | */ | 379 | */ |
366 | int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, | 380 | int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, |
367 | int hlen, struct iovec *iov) | 381 | int hlen, struct iovec *iov) |
368 | { | 382 | { |
369 | unsigned int csum; | 383 | unsigned int csum; |
@@ -376,8 +390,7 @@ int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, | |||
376 | iov++; | 390 | iov++; |
377 | 391 | ||
378 | if (iov->iov_len < chunk) { | 392 | if (iov->iov_len < chunk) { |
379 | if ((unsigned short)csum_fold(skb_checksum(skb, 0, chunk + hlen, | 393 | if (__skb_checksum_complete(skb)) |
380 | skb->csum))) | ||
381 | goto csum_error; | 394 | goto csum_error; |
382 | if (skb_copy_datagram_iovec(skb, hlen, iov, chunk)) | 395 | if (skb_copy_datagram_iovec(skb, hlen, iov, chunk)) |
383 | goto fault; | 396 | goto fault; |
@@ -388,6 +401,8 @@ int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, | |||
388 | goto fault; | 401 | goto fault; |
389 | if ((unsigned short)csum_fold(csum)) | 402 | if ((unsigned short)csum_fold(csum)) |
390 | goto csum_error; | 403 | goto csum_error; |
404 | if (unlikely(skb->ip_summed == CHECKSUM_HW)) | ||
405 | netdev_rx_csum_fault(skb->dev); | ||
391 | iov->iov_len -= chunk; | 406 | iov->iov_len -= chunk; |
392 | iov->iov_base += chunk; | 407 | iov->iov_base += chunk; |
393 | } | 408 | } |
diff --git a/net/core/dev.c b/net/core/dev.c index 8d1541595277..0b48e294aafe 100644 --- a/net/core/dev.c +++ b/net/core/dev.c | |||
@@ -1108,6 +1108,18 @@ out: | |||
1108 | return ret; | 1108 | return ret; |
1109 | } | 1109 | } |
1110 | 1110 | ||
1111 | /* Take action when hardware reception checksum errors are detected. */ | ||
1112 | #ifdef CONFIG_BUG | ||
1113 | void netdev_rx_csum_fault(struct net_device *dev) | ||
1114 | { | ||
1115 | if (net_ratelimit()) { | ||
1116 | printk(KERN_ERR "%s: hw csum failure.\n", dev->name); | ||
1117 | dump_stack(); | ||
1118 | } | ||
1119 | } | ||
1120 | EXPORT_SYMBOL(netdev_rx_csum_fault); | ||
1121 | #endif | ||
1122 | |||
1111 | #ifdef CONFIG_HIGHMEM | 1123 | #ifdef CONFIG_HIGHMEM |
1112 | /* Actually, we should eliminate this check as soon as we know, that: | 1124 | /* Actually, we should eliminate this check as soon as we know, that: |
1113 | * 1. IOMMU is present and allows to map all the memory. | 1125 | * 1. IOMMU is present and allows to map all the memory. |
diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 802fe11efad0..49424a42a2c0 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c | |||
@@ -101,16 +101,20 @@ void netpoll_queue(struct sk_buff *skb) | |||
101 | static int checksum_udp(struct sk_buff *skb, struct udphdr *uh, | 101 | static int checksum_udp(struct sk_buff *skb, struct udphdr *uh, |
102 | unsigned short ulen, u32 saddr, u32 daddr) | 102 | unsigned short ulen, u32 saddr, u32 daddr) |
103 | { | 103 | { |
104 | if (uh->check == 0) | 104 | unsigned int psum; |
105 | |||
106 | if (uh->check == 0 || skb->ip_summed == CHECKSUM_UNNECESSARY) | ||
105 | return 0; | 107 | return 0; |
106 | 108 | ||
107 | if (skb->ip_summed == CHECKSUM_HW) | 109 | psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); |
108 | return csum_tcpudp_magic( | 110 | |
109 | saddr, daddr, ulen, IPPROTO_UDP, skb->csum); | 111 | if (skb->ip_summed == CHECKSUM_HW && |
112 | !(u16)csum_fold(csum_add(psum, skb->csum))) | ||
113 | return 0; | ||
110 | 114 | ||
111 | skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); | 115 | skb->csum = psum; |
112 | 116 | ||
113 | return csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); | 117 | return __skb_checksum_complete(skb); |
114 | } | 118 | } |
115 | 119 | ||
116 | /* | 120 | /* |
@@ -489,7 +493,7 @@ int __netpoll_rx(struct sk_buff *skb) | |||
489 | 493 | ||
490 | if (ulen != len) | 494 | if (ulen != len) |
491 | goto out; | 495 | goto out; |
492 | if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr) < 0) | 496 | if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr)) |
493 | goto out; | 497 | goto out; |
494 | if (np->local_ip && np->local_ip != ntohl(iph->daddr)) | 498 | if (np->local_ip && np->local_ip != ntohl(iph->daddr)) |
495 | goto out; | 499 | goto out; |
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9bed7569ce3f..8700379685e0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c | |||
@@ -49,6 +49,7 @@ | |||
49 | #include <net/udp.h> | 49 | #include <net/udp.h> |
50 | #include <net/sock.h> | 50 | #include <net/sock.h> |
51 | #include <net/pkt_sched.h> | 51 | #include <net/pkt_sched.h> |
52 | #include <net/netlink.h> | ||
52 | 53 | ||
53 | DECLARE_MUTEX(rtnl_sem); | 54 | DECLARE_MUTEX(rtnl_sem); |
54 | 55 | ||
@@ -462,11 +463,6 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) | |||
462 | netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_KERNEL); | 463 | netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_KERNEL); |
463 | } | 464 | } |
464 | 465 | ||
465 | static int rtnetlink_done(struct netlink_callback *cb) | ||
466 | { | ||
467 | return 0; | ||
468 | } | ||
469 | |||
470 | /* Protected by RTNL sempahore. */ | 466 | /* Protected by RTNL sempahore. */ |
471 | static struct rtattr **rta_buf; | 467 | static struct rtattr **rta_buf; |
472 | static int rtattr_max; | 468 | static int rtattr_max; |
@@ -524,8 +520,6 @@ rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) | |||
524 | } | 520 | } |
525 | 521 | ||
526 | if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { | 522 | if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { |
527 | u32 rlen; | ||
528 | |||
529 | if (link->dumpit == NULL) | 523 | if (link->dumpit == NULL) |
530 | link = &(rtnetlink_links[PF_UNSPEC][type]); | 524 | link = &(rtnetlink_links[PF_UNSPEC][type]); |
531 | 525 | ||
@@ -533,14 +527,11 @@ rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) | |||
533 | goto err_inval; | 527 | goto err_inval; |
534 | 528 | ||
535 | if ((*errp = netlink_dump_start(rtnl, skb, nlh, | 529 | if ((*errp = netlink_dump_start(rtnl, skb, nlh, |
536 | link->dumpit, | 530 | link->dumpit, NULL)) != 0) { |
537 | rtnetlink_done)) != 0) { | ||
538 | return -1; | 531 | return -1; |
539 | } | 532 | } |
540 | rlen = NLMSG_ALIGN(nlh->nlmsg_len); | 533 | |
541 | if (rlen > skb->len) | 534 | netlink_queue_skip(nlh, skb); |
542 | rlen = skb->len; | ||
543 | skb_pull(skb, rlen); | ||
544 | return -1; | 535 | return -1; |
545 | } | 536 | } |
546 | 537 | ||
@@ -579,75 +570,13 @@ err_inval: | |||
579 | return -1; | 570 | return -1; |
580 | } | 571 | } |
581 | 572 | ||
582 | /* | ||
583 | * Process one packet of messages. | ||
584 | * Malformed skbs with wrong lengths of messages are discarded silently. | ||
585 | */ | ||
586 | |||
587 | static inline int rtnetlink_rcv_skb(struct sk_buff *skb) | ||
588 | { | ||
589 | int err; | ||
590 | struct nlmsghdr * nlh; | ||
591 | |||
592 | while (skb->len >= NLMSG_SPACE(0)) { | ||
593 | u32 rlen; | ||
594 | |||
595 | nlh = (struct nlmsghdr *)skb->data; | ||
596 | if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) | ||
597 | return 0; | ||
598 | rlen = NLMSG_ALIGN(nlh->nlmsg_len); | ||
599 | if (rlen > skb->len) | ||
600 | rlen = skb->len; | ||
601 | if (rtnetlink_rcv_msg(skb, nlh, &err)) { | ||
602 | /* Not error, but we must interrupt processing here: | ||
603 | * Note, that in this case we do not pull message | ||
604 | * from skb, it will be processed later. | ||
605 | */ | ||
606 | if (err == 0) | ||
607 | return -1; | ||
608 | netlink_ack(skb, nlh, err); | ||
609 | } else if (nlh->nlmsg_flags&NLM_F_ACK) | ||
610 | netlink_ack(skb, nlh, 0); | ||
611 | skb_pull(skb, rlen); | ||
612 | } | ||
613 | |||
614 | return 0; | ||
615 | } | ||
616 | |||
617 | /* | ||
618 | * rtnetlink input queue processing routine: | ||
619 | * - process as much as there was in the queue upon entry. | ||
620 | * - feed skbs to rtnetlink_rcv_skb, until it refuse a message, | ||
621 | * that will occur, when a dump started. | ||
622 | */ | ||
623 | |||
624 | static void rtnetlink_rcv(struct sock *sk, int len) | 573 | static void rtnetlink_rcv(struct sock *sk, int len) |
625 | { | 574 | { |
626 | unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); | 575 | unsigned int qlen = 0; |
627 | 576 | ||
628 | do { | 577 | do { |
629 | struct sk_buff *skb; | ||
630 | |||
631 | rtnl_lock(); | 578 | rtnl_lock(); |
632 | 579 | netlink_run_queue(sk, &qlen, &rtnetlink_rcv_msg); | |
633 | if (qlen > skb_queue_len(&sk->sk_receive_queue)) | ||
634 | qlen = skb_queue_len(&sk->sk_receive_queue); | ||
635 | |||
636 | for (; qlen; qlen--) { | ||
637 | skb = skb_dequeue(&sk->sk_receive_queue); | ||
638 | if (rtnetlink_rcv_skb(skb)) { | ||
639 | if (skb->len) | ||
640 | skb_queue_head(&sk->sk_receive_queue, | ||
641 | skb); | ||
642 | else { | ||
643 | kfree_skb(skb); | ||
644 | qlen--; | ||
645 | } | ||
646 | break; | ||
647 | } | ||
648 | kfree_skb(skb); | ||
649 | } | ||
650 | |||
651 | up(&rtnl_sem); | 580 | up(&rtnl_sem); |
652 | 581 | ||
653 | netdev_run_todo(); | 582 | netdev_run_todo(); |
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 95501e40100e..b7d13a4fff48 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c | |||
@@ -336,6 +336,9 @@ void __kfree_skb(struct sk_buff *skb) | |||
336 | } | 336 | } |
337 | #ifdef CONFIG_NETFILTER | 337 | #ifdef CONFIG_NETFILTER |
338 | nf_conntrack_put(skb->nfct); | 338 | nf_conntrack_put(skb->nfct); |
339 | #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) | ||
340 | nf_conntrack_put_reasm(skb->nfct_reasm); | ||
341 | #endif | ||
339 | #ifdef CONFIG_BRIDGE_NETFILTER | 342 | #ifdef CONFIG_BRIDGE_NETFILTER |
340 | nf_bridge_put(skb->nf_bridge); | 343 | nf_bridge_put(skb->nf_bridge); |
341 | #endif | 344 | #endif |
@@ -414,9 +417,17 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) | |||
414 | C(nfct); | 417 | C(nfct); |
415 | nf_conntrack_get(skb->nfct); | 418 | nf_conntrack_get(skb->nfct); |
416 | C(nfctinfo); | 419 | C(nfctinfo); |
420 | #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) | ||
421 | C(nfct_reasm); | ||
422 | nf_conntrack_get_reasm(skb->nfct_reasm); | ||
423 | #endif | ||
417 | #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) | 424 | #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) |
418 | C(ipvs_property); | 425 | C(ipvs_property); |
419 | #endif | 426 | #endif |
427 | #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) | ||
428 | C(nfct_reasm); | ||
429 | nf_conntrack_get_reasm(skb->nfct_reasm); | ||
430 | #endif | ||
420 | #ifdef CONFIG_BRIDGE_NETFILTER | 431 | #ifdef CONFIG_BRIDGE_NETFILTER |
421 | C(nf_bridge); | 432 | C(nf_bridge); |
422 | nf_bridge_get(skb->nf_bridge); | 433 | nf_bridge_get(skb->nf_bridge); |
@@ -474,6 +485,10 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) | |||
474 | new->nfct = old->nfct; | 485 | new->nfct = old->nfct; |
475 | nf_conntrack_get(old->nfct); | 486 | nf_conntrack_get(old->nfct); |
476 | new->nfctinfo = old->nfctinfo; | 487 | new->nfctinfo = old->nfctinfo; |
488 | #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) | ||
489 | new->nfct_reasm = old->nfct_reasm; | ||
490 | nf_conntrack_get_reasm(old->nfct_reasm); | ||
491 | #endif | ||
477 | #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) | 492 | #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) |
478 | new->ipvs_property = old->ipvs_property; | 493 | new->ipvs_property = old->ipvs_property; |
479 | #endif | 494 | #endif |
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 3f25cadccddd..f89e55f814d9 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c | |||
@@ -1664,17 +1664,15 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock, | |||
1664 | goto out; | 1664 | goto out; |
1665 | } | 1665 | } |
1666 | 1666 | ||
1667 | rv = dn_check_state(sk, NULL, 0, &timeo, flags); | ||
1668 | if (rv) | ||
1669 | goto out; | ||
1670 | |||
1671 | if (sk->sk_shutdown & RCV_SHUTDOWN) { | 1667 | if (sk->sk_shutdown & RCV_SHUTDOWN) { |
1672 | if (!(flags & MSG_NOSIGNAL)) | 1668 | rv = 0; |
1673 | send_sig(SIGPIPE, current, 0); | ||
1674 | rv = -EPIPE; | ||
1675 | goto out; | 1669 | goto out; |
1676 | } | 1670 | } |
1677 | 1671 | ||
1672 | rv = dn_check_state(sk, NULL, 0, &timeo, flags); | ||
1673 | if (rv) | ||
1674 | goto out; | ||
1675 | |||
1678 | if (flags & ~(MSG_PEEK|MSG_OOB|MSG_WAITALL|MSG_DONTWAIT|MSG_NOSIGNAL)) { | 1676 | if (flags & ~(MSG_PEEK|MSG_OOB|MSG_WAITALL|MSG_DONTWAIT|MSG_NOSIGNAL)) { |
1679 | rv = -EOPNOTSUPP; | 1677 | rv = -EOPNOTSUPP; |
1680 | goto out; | 1678 | goto out; |
@@ -1928,6 +1926,8 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock, | |||
1928 | 1926 | ||
1929 | if (sk->sk_shutdown & SEND_SHUTDOWN) { | 1927 | if (sk->sk_shutdown & SEND_SHUTDOWN) { |
1930 | err = -EPIPE; | 1928 | err = -EPIPE; |
1929 | if (!(flags & MSG_NOSIGNAL)) | ||
1930 | send_sig(SIGPIPE, current, 0); | ||
1931 | goto out_err; | 1931 | goto out_err; |
1932 | } | 1932 | } |
1933 | 1933 | ||
diff --git a/net/ieee80211/ieee80211_crypt.c b/net/ieee80211/ieee80211_crypt.c index 20cc580a07e0..ecc9bb196abc 100644 --- a/net/ieee80211/ieee80211_crypt.c +++ b/net/ieee80211/ieee80211_crypt.c | |||
@@ -11,15 +11,14 @@ | |||
11 | * | 11 | * |
12 | */ | 12 | */ |
13 | 13 | ||
14 | #include <linux/config.h> | 14 | #include <linux/errno.h> |
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/init.h> | 16 | #include <linux/init.h> |
17 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
18 | #include <asm/string.h> | 18 | #include <linux/string.h> |
19 | #include <asm/errno.h> | ||
20 | |||
21 | #include <net/ieee80211.h> | 19 | #include <net/ieee80211.h> |
22 | 20 | ||
21 | |||
23 | MODULE_AUTHOR("Jouni Malinen"); | 22 | MODULE_AUTHOR("Jouni Malinen"); |
24 | MODULE_DESCRIPTION("HostAP crypto"); | 23 | MODULE_DESCRIPTION("HostAP crypto"); |
25 | MODULE_LICENSE("GPL"); | 24 | MODULE_LICENSE("GPL"); |
@@ -29,32 +28,20 @@ struct ieee80211_crypto_alg { | |||
29 | struct ieee80211_crypto_ops *ops; | 28 | struct ieee80211_crypto_ops *ops; |
30 | }; | 29 | }; |
31 | 30 | ||
32 | struct ieee80211_crypto { | 31 | static LIST_HEAD(ieee80211_crypto_algs); |
33 | struct list_head algs; | 32 | static DEFINE_SPINLOCK(ieee80211_crypto_lock); |
34 | spinlock_t lock; | ||
35 | }; | ||
36 | |||
37 | static struct ieee80211_crypto *hcrypt; | ||
38 | 33 | ||
39 | void ieee80211_crypt_deinit_entries(struct ieee80211_device *ieee, int force) | 34 | void ieee80211_crypt_deinit_entries(struct ieee80211_device *ieee, int force) |
40 | { | 35 | { |
41 | struct list_head *ptr, *n; | 36 | struct ieee80211_crypt_data *entry, *next; |
42 | struct ieee80211_crypt_data *entry; | ||
43 | unsigned long flags; | 37 | unsigned long flags; |
44 | 38 | ||
45 | spin_lock_irqsave(&ieee->lock, flags); | 39 | spin_lock_irqsave(&ieee->lock, flags); |
46 | 40 | list_for_each_entry_safe(entry, next, &ieee->crypt_deinit_list, list) { | |
47 | if (list_empty(&ieee->crypt_deinit_list)) | ||
48 | goto unlock; | ||
49 | |||
50 | for (ptr = ieee->crypt_deinit_list.next, n = ptr->next; | ||
51 | ptr != &ieee->crypt_deinit_list; ptr = n, n = ptr->next) { | ||
52 | entry = list_entry(ptr, struct ieee80211_crypt_data, list); | ||
53 | |||
54 | if (atomic_read(&entry->refcnt) != 0 && !force) | 41 | if (atomic_read(&entry->refcnt) != 0 && !force) |
55 | continue; | 42 | continue; |
56 | 43 | ||
57 | list_del(ptr); | 44 | list_del(&entry->list); |
58 | 45 | ||
59 | if (entry->ops) { | 46 | if (entry->ops) { |
60 | entry->ops->deinit(entry->priv); | 47 | entry->ops->deinit(entry->priv); |
@@ -62,7 +49,6 @@ void ieee80211_crypt_deinit_entries(struct ieee80211_device *ieee, int force) | |||
62 | } | 49 | } |
63 | kfree(entry); | 50 | kfree(entry); |
64 | } | 51 | } |
65 | unlock: | ||
66 | spin_unlock_irqrestore(&ieee->lock, flags); | 52 | spin_unlock_irqrestore(&ieee->lock, flags); |
67 | } | 53 | } |
68 | 54 | ||
@@ -125,9 +111,6 @@ int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops) | |||
125 | unsigned long flags; | 111 | unsigned long flags; |
126 | struct ieee80211_crypto_alg *alg; | 112 | struct ieee80211_crypto_alg *alg; |
127 | 113 | ||
128 | if (hcrypt == NULL) | ||
129 | return -1; | ||
130 | |||
131 | alg = kmalloc(sizeof(*alg), GFP_KERNEL); | 114 | alg = kmalloc(sizeof(*alg), GFP_KERNEL); |
132 | if (alg == NULL) | 115 | if (alg == NULL) |
133 | return -ENOMEM; | 116 | return -ENOMEM; |
@@ -135,9 +118,9 @@ int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops) | |||
135 | memset(alg, 0, sizeof(*alg)); | 118 | memset(alg, 0, sizeof(*alg)); |
136 | alg->ops = ops; | 119 | alg->ops = ops; |
137 | 120 | ||
138 | spin_lock_irqsave(&hcrypt->lock, flags); | 121 | spin_lock_irqsave(&ieee80211_crypto_lock, flags); |
139 | list_add(&alg->list, &hcrypt->algs); | 122 | list_add(&alg->list, &ieee80211_crypto_algs); |
140 | spin_unlock_irqrestore(&hcrypt->lock, flags); | 123 | spin_unlock_irqrestore(&ieee80211_crypto_lock, flags); |
141 | 124 | ||
142 | printk(KERN_DEBUG "ieee80211_crypt: registered algorithm '%s'\n", | 125 | printk(KERN_DEBUG "ieee80211_crypt: registered algorithm '%s'\n", |
143 | ops->name); | 126 | ops->name); |
@@ -147,64 +130,49 @@ int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops) | |||
147 | 130 | ||
148 | int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops) | 131 | int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops) |
149 | { | 132 | { |
133 | struct ieee80211_crypto_alg *alg; | ||
150 | unsigned long flags; | 134 | unsigned long flags; |
151 | struct list_head *ptr; | ||
152 | struct ieee80211_crypto_alg *del_alg = NULL; | ||
153 | |||
154 | if (hcrypt == NULL) | ||
155 | return -1; | ||
156 | |||
157 | spin_lock_irqsave(&hcrypt->lock, flags); | ||
158 | for (ptr = hcrypt->algs.next; ptr != &hcrypt->algs; ptr = ptr->next) { | ||
159 | struct ieee80211_crypto_alg *alg = | ||
160 | (struct ieee80211_crypto_alg *)ptr; | ||
161 | if (alg->ops == ops) { | ||
162 | list_del(&alg->list); | ||
163 | del_alg = alg; | ||
164 | break; | ||
165 | } | ||
166 | } | ||
167 | spin_unlock_irqrestore(&hcrypt->lock, flags); | ||
168 | 135 | ||
169 | if (del_alg) { | 136 | spin_lock_irqsave(&ieee80211_crypto_lock, flags); |
170 | printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm " | 137 | list_for_each_entry(alg, &ieee80211_crypto_algs, list) { |
171 | "'%s'\n", ops->name); | 138 | if (alg->ops == ops) |
172 | kfree(del_alg); | 139 | goto found; |
173 | } | 140 | } |
174 | 141 | spin_unlock_irqrestore(&ieee80211_crypto_lock, flags); | |
175 | return del_alg ? 0 : -1; | 142 | return -EINVAL; |
143 | |||
144 | found: | ||
145 | printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm " | ||
146 | "'%s'\n", ops->name); | ||
147 | list_del(&alg->list); | ||
148 | spin_unlock_irqrestore(&ieee80211_crypto_lock, flags); | ||
149 | kfree(alg); | ||
150 | return 0; | ||
176 | } | 151 | } |
177 | 152 | ||
178 | struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name) | 153 | struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name) |
179 | { | 154 | { |
155 | struct ieee80211_crypto_alg *alg; | ||
180 | unsigned long flags; | 156 | unsigned long flags; |
181 | struct list_head *ptr; | 157 | |
182 | struct ieee80211_crypto_alg *found_alg = NULL; | 158 | spin_lock_irqsave(&ieee80211_crypto_lock, flags); |
183 | 159 | list_for_each_entry(alg, &ieee80211_crypto_algs, list) { | |
184 | if (hcrypt == NULL) | 160 | if (strcmp(alg->ops->name, name) == 0) |
185 | return NULL; | 161 | goto found; |
186 | |||
187 | spin_lock_irqsave(&hcrypt->lock, flags); | ||
188 | for (ptr = hcrypt->algs.next; ptr != &hcrypt->algs; ptr = ptr->next) { | ||
189 | struct ieee80211_crypto_alg *alg = | ||
190 | (struct ieee80211_crypto_alg *)ptr; | ||
191 | if (strcmp(alg->ops->name, name) == 0) { | ||
192 | found_alg = alg; | ||
193 | break; | ||
194 | } | ||
195 | } | 162 | } |
196 | spin_unlock_irqrestore(&hcrypt->lock, flags); | 163 | spin_unlock_irqrestore(&ieee80211_crypto_lock, flags); |
164 | return NULL; | ||
197 | 165 | ||
198 | if (found_alg) | 166 | found: |
199 | return found_alg->ops; | 167 | spin_unlock_irqrestore(&ieee80211_crypto_lock, flags); |
200 | else | 168 | return alg->ops; |
201 | return NULL; | ||
202 | } | 169 | } |
203 | 170 | ||
204 | static void *ieee80211_crypt_null_init(int keyidx) | 171 | static void *ieee80211_crypt_null_init(int keyidx) |
205 | { | 172 | { |
206 | return (void *)1; | 173 | return (void *)1; |
207 | } | 174 | } |
175 | |||
208 | static void ieee80211_crypt_null_deinit(void *priv) | 176 | static void ieee80211_crypt_null_deinit(void *priv) |
209 | { | 177 | { |
210 | } | 178 | } |
@@ -213,56 +181,18 @@ static struct ieee80211_crypto_ops ieee80211_crypt_null = { | |||
213 | .name = "NULL", | 181 | .name = "NULL", |
214 | .init = ieee80211_crypt_null_init, | 182 | .init = ieee80211_crypt_null_init, |
215 | .deinit = ieee80211_crypt_null_deinit, | 183 | .deinit = ieee80211_crypt_null_deinit, |
216 | .encrypt_mpdu = NULL, | ||
217 | .decrypt_mpdu = NULL, | ||
218 | .encrypt_msdu = NULL, | ||
219 | .decrypt_msdu = NULL, | ||
220 | .set_key = NULL, | ||
221 | .get_key = NULL, | ||
222 | .extra_mpdu_prefix_len = 0, | ||
223 | .extra_mpdu_postfix_len = 0, | ||
224 | .owner = THIS_MODULE, | 184 | .owner = THIS_MODULE, |
225 | }; | 185 | }; |
226 | 186 | ||
227 | static int __init ieee80211_crypto_init(void) | 187 | static int __init ieee80211_crypto_init(void) |
228 | { | 188 | { |
229 | int ret = -ENOMEM; | 189 | return ieee80211_register_crypto_ops(&ieee80211_crypt_null); |
230 | |||
231 | hcrypt = kmalloc(sizeof(*hcrypt), GFP_KERNEL); | ||
232 | if (!hcrypt) | ||
233 | goto out; | ||
234 | |||
235 | memset(hcrypt, 0, sizeof(*hcrypt)); | ||
236 | INIT_LIST_HEAD(&hcrypt->algs); | ||
237 | spin_lock_init(&hcrypt->lock); | ||
238 | |||
239 | ret = ieee80211_register_crypto_ops(&ieee80211_crypt_null); | ||
240 | if (ret < 0) { | ||
241 | kfree(hcrypt); | ||
242 | hcrypt = NULL; | ||
243 | } | ||
244 | out: | ||
245 | return ret; | ||
246 | } | 190 | } |
247 | 191 | ||
248 | static void __exit ieee80211_crypto_deinit(void) | 192 | static void __exit ieee80211_crypto_deinit(void) |
249 | { | 193 | { |
250 | struct list_head *ptr, *n; | 194 | ieee80211_unregister_crypto_ops(&ieee80211_crypt_null); |
251 | 195 | BUG_ON(!list_empty(&ieee80211_crypto_algs)); | |
252 | if (hcrypt == NULL) | ||
253 | return; | ||
254 | |||
255 | for (ptr = hcrypt->algs.next, n = ptr->next; ptr != &hcrypt->algs; | ||
256 | ptr = n, n = ptr->next) { | ||
257 | struct ieee80211_crypto_alg *alg = | ||
258 | (struct ieee80211_crypto_alg *)ptr; | ||
259 | list_del(ptr); | ||
260 | printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm " | ||
261 | "'%s' (deinit)\n", alg->ops->name); | ||
262 | kfree(alg); | ||
263 | } | ||
264 | |||
265 | kfree(hcrypt); | ||
266 | } | 196 | } |
267 | 197 | ||
268 | EXPORT_SYMBOL(ieee80211_crypt_deinit_entries); | 198 | EXPORT_SYMBOL(ieee80211_crypt_deinit_entries); |
diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c index 6ad88218f573..03efaacbdb73 100644 --- a/net/ieee80211/ieee80211_rx.c +++ b/net/ieee80211/ieee80211_rx.c | |||
@@ -369,6 +369,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, | |||
369 | /* Put this code here so that we avoid duplicating it in all | 369 | /* Put this code here so that we avoid duplicating it in all |
370 | * Rx paths. - Jean II */ | 370 | * Rx paths. - Jean II */ |
371 | #ifdef IW_WIRELESS_SPY /* defined in iw_handler.h */ | 371 | #ifdef IW_WIRELESS_SPY /* defined in iw_handler.h */ |
372 | #ifdef CONFIG_NET_RADIO | ||
372 | /* If spy monitoring on */ | 373 | /* If spy monitoring on */ |
373 | if (ieee->spy_data.spy_number > 0) { | 374 | if (ieee->spy_data.spy_number > 0) { |
374 | struct iw_quality wstats; | 375 | struct iw_quality wstats; |
@@ -395,6 +396,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, | |||
395 | /* Update spy records */ | 396 | /* Update spy records */ |
396 | wireless_spy_update(ieee->dev, hdr->addr2, &wstats); | 397 | wireless_spy_update(ieee->dev, hdr->addr2, &wstats); |
397 | } | 398 | } |
399 | #endif /* CONFIG_NET_RADIO */ | ||
398 | #endif /* IW_WIRELESS_SPY */ | 400 | #endif /* IW_WIRELESS_SPY */ |
399 | 401 | ||
400 | #ifdef NOT_YET | 402 | #ifdef NOT_YET |
diff --git a/net/ieee80211/ieee80211_wx.c b/net/ieee80211/ieee80211_wx.c index 1ce7af9bec35..181755f2aa8b 100644 --- a/net/ieee80211/ieee80211_wx.c +++ b/net/ieee80211/ieee80211_wx.c | |||
@@ -161,9 +161,11 @@ static inline char *ipw2100_translate_scan(struct ieee80211_device *ieee, | |||
161 | (ieee->perfect_rssi - ieee->worst_rssi) - | 161 | (ieee->perfect_rssi - ieee->worst_rssi) - |
162 | (ieee->perfect_rssi - network->stats.rssi) * | 162 | (ieee->perfect_rssi - network->stats.rssi) * |
163 | (15 * (ieee->perfect_rssi - ieee->worst_rssi) + | 163 | (15 * (ieee->perfect_rssi - ieee->worst_rssi) + |
164 | 62 * (ieee->perfect_rssi - network->stats.rssi))) / | 164 | 62 * (ieee->perfect_rssi - |
165 | ((ieee->perfect_rssi - ieee->worst_rssi) * | 165 | network->stats.rssi))) / |
166 | (ieee->perfect_rssi - ieee->worst_rssi)); | 166 | ((ieee->perfect_rssi - |
167 | ieee->worst_rssi) * (ieee->perfect_rssi - | ||
168 | ieee->worst_rssi)); | ||
167 | if (iwe.u.qual.qual > 100) | 169 | if (iwe.u.qual.qual > 100) |
168 | iwe.u.qual.qual = 100; | 170 | iwe.u.qual.qual = 100; |
169 | else if (iwe.u.qual.qual < 1) | 171 | else if (iwe.u.qual.qual < 1) |
@@ -520,7 +522,8 @@ int ieee80211_wx_set_encodeext(struct ieee80211_device *ieee, | |||
520 | crypt = &ieee->crypt[idx]; | 522 | crypt = &ieee->crypt[idx]; |
521 | group_key = 1; | 523 | group_key = 1; |
522 | } else { | 524 | } else { |
523 | if (idx != 0) | 525 | /* some Cisco APs use idx>0 for unicast in dynamic WEP */ |
526 | if (idx != 0 && ext->alg != IW_ENCODE_ALG_WEP) | ||
524 | return -EINVAL; | 527 | return -EINVAL; |
525 | if (ieee->iw_mode == IW_MODE_INFRA) | 528 | if (ieee->iw_mode == IW_MODE_INFRA) |
526 | crypt = &ieee->crypt[idx]; | 529 | crypt = &ieee->crypt[idx]; |
@@ -688,7 +691,8 @@ int ieee80211_wx_get_encodeext(struct ieee80211_device *ieee, | |||
688 | } else | 691 | } else |
689 | idx = ieee->tx_keyidx; | 692 | idx = ieee->tx_keyidx; |
690 | 693 | ||
691 | if (!ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY) | 694 | if (!ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY && |
695 | ext->alg != IW_ENCODE_ALG_WEP) | ||
692 | if (idx != 0 || ieee->iw_mode != IW_MODE_INFRA) | 696 | if (idx != 0 || ieee->iw_mode != IW_MODE_INFRA) |
693 | return -EINVAL; | 697 | return -EINVAL; |
694 | 698 | ||
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 175e093ec564..e3eceecd0496 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c | |||
@@ -934,11 +934,11 @@ int icmp_rcv(struct sk_buff *skb) | |||
934 | case CHECKSUM_HW: | 934 | case CHECKSUM_HW: |
935 | if (!(u16)csum_fold(skb->csum)) | 935 | if (!(u16)csum_fold(skb->csum)) |
936 | break; | 936 | break; |
937 | LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n"); | 937 | /* fall through */ |
938 | case CHECKSUM_NONE: | 938 | case CHECKSUM_NONE: |
939 | if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) | 939 | skb->csum = 0; |
940 | if (__skb_checksum_complete(skb)) | ||
940 | goto error; | 941 | goto error; |
941 | default:; | ||
942 | } | 942 | } |
943 | 943 | ||
944 | if (!pskb_pull(skb, sizeof(struct icmphdr))) | 944 | if (!pskb_pull(skb, sizeof(struct icmphdr))) |
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index c6247fc84060..c04607b49212 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c | |||
@@ -872,11 +872,18 @@ int igmp_rcv(struct sk_buff *skb) | |||
872 | return 0; | 872 | return 0; |
873 | } | 873 | } |
874 | 874 | ||
875 | if (!pskb_may_pull(skb, sizeof(struct igmphdr)) || | 875 | if (!pskb_may_pull(skb, sizeof(struct igmphdr))) |
876 | (u16)csum_fold(skb_checksum(skb, 0, len, 0))) { | 876 | goto drop; |
877 | in_dev_put(in_dev); | 877 | |
878 | kfree_skb(skb); | 878 | switch (skb->ip_summed) { |
879 | return 0; | 879 | case CHECKSUM_HW: |
880 | if (!(u16)csum_fold(skb->csum)) | ||
881 | break; | ||
882 | /* fall through */ | ||
883 | case CHECKSUM_NONE: | ||
884 | skb->csum = 0; | ||
885 | if (__skb_checksum_complete(skb)) | ||
886 | goto drop; | ||
880 | } | 887 | } |
881 | 888 | ||
882 | ih = skb->h.igmph; | 889 | ih = skb->h.igmph; |
@@ -906,6 +913,8 @@ int igmp_rcv(struct sk_buff *skb) | |||
906 | default: | 913 | default: |
907 | NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type); | 914 | NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type); |
908 | } | 915 | } |
916 | |||
917 | drop: | ||
909 | in_dev_put(in_dev); | 918 | in_dev_put(in_dev); |
910 | kfree_skb(skb); | 919 | kfree_skb(skb); |
911 | return 0; | 920 | return 0; |
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 71f3c7350c6e..39061ed53cfd 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c | |||
@@ -724,12 +724,6 @@ done: | |||
724 | return skb->len; | 724 | return skb->len; |
725 | } | 725 | } |
726 | 726 | ||
727 | static int inet_diag_dump_done(struct netlink_callback *cb) | ||
728 | { | ||
729 | return 0; | ||
730 | } | ||
731 | |||
732 | |||
733 | static __inline__ int | 727 | static __inline__ int |
734 | inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | 728 | inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) |
735 | { | 729 | { |
@@ -760,8 +754,7 @@ inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
760 | goto err_inval; | 754 | goto err_inval; |
761 | } | 755 | } |
762 | return netlink_dump_start(idiagnl, skb, nlh, | 756 | return netlink_dump_start(idiagnl, skb, nlh, |
763 | inet_diag_dump, | 757 | inet_diag_dump, NULL); |
764 | inet_diag_dump_done); | ||
765 | } else { | 758 | } else { |
766 | return inet_diag_get_exact(skb, nlh); | 759 | return inet_diag_get_exact(skb, nlh); |
767 | } | 760 | } |
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 896ce3f8f53a..4e9c74b54b15 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c | |||
@@ -577,15 +577,16 @@ static int ipgre_rcv(struct sk_buff *skb) | |||
577 | goto drop_nolock; | 577 | goto drop_nolock; |
578 | 578 | ||
579 | if (flags&GRE_CSUM) { | 579 | if (flags&GRE_CSUM) { |
580 | if (skb->ip_summed == CHECKSUM_HW) { | 580 | switch (skb->ip_summed) { |
581 | case CHECKSUM_HW: | ||
581 | csum = (u16)csum_fold(skb->csum); | 582 | csum = (u16)csum_fold(skb->csum); |
582 | if (csum) | 583 | if (!csum) |
583 | skb->ip_summed = CHECKSUM_NONE; | 584 | break; |
584 | } | 585 | /* fall through */ |
585 | if (skb->ip_summed == CHECKSUM_NONE) { | 586 | case CHECKSUM_NONE: |
586 | skb->csum = skb_checksum(skb, 0, skb->len, 0); | 587 | skb->csum = 0; |
588 | csum = __skb_checksum_complete(skb); | ||
587 | skb->ip_summed = CHECKSUM_HW; | 589 | skb->ip_summed = CHECKSUM_HW; |
588 | csum = (u16)csum_fold(skb->csum); | ||
589 | } | 590 | } |
590 | offset += 4; | 591 | offset += 4; |
591 | } | 592 | } |
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 7d917e4ce1d9..9d3c8b5f327e 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig | |||
@@ -5,6 +5,20 @@ | |||
5 | menu "IP: Netfilter Configuration" | 5 | menu "IP: Netfilter Configuration" |
6 | depends on INET && NETFILTER | 6 | depends on INET && NETFILTER |
7 | 7 | ||
8 | config NF_CONNTRACK_IPV4 | ||
9 | tristate "IPv4 support for new connection tracking (EXPERIMENTAL)" | ||
10 | depends on EXPERIMENTAL && NF_CONNTRACK | ||
11 | ---help--- | ||
12 | Connection tracking keeps a record of what packets have passed | ||
13 | through your machine, in order to figure out how they are related | ||
14 | into connections. | ||
15 | |||
16 | This is IPv4 support on Layer 3 independent connection tracking. | ||
17 | Layer 3 independent connection tracking is experimental scheme | ||
18 | which generalize ip_conntrack to support other layer 3 protocols. | ||
19 | |||
20 | To compile it as a module, choose M here. If unsure, say N. | ||
21 | |||
8 | # connection tracking, helpers and protocols | 22 | # connection tracking, helpers and protocols |
9 | config IP_NF_CONNTRACK | 23 | config IP_NF_CONNTRACK |
10 | tristate "Connection tracking (required for masq/NAT)" | 24 | tristate "Connection tracking (required for masq/NAT)" |
@@ -209,8 +223,8 @@ config IP_NF_MATCH_PKTTYPE | |||
209 | tristate "Packet type match support" | 223 | tristate "Packet type match support" |
210 | depends on IP_NF_IPTABLES | 224 | depends on IP_NF_IPTABLES |
211 | help | 225 | help |
212 | Packet type matching allows you to match a packet by | 226 | Packet type matching allows you to match a packet by |
213 | its "class", eg. BROADCAST, MULTICAST, ... | 227 | its "class", eg. BROADCAST, MULTICAST, ... |
214 | 228 | ||
215 | Typical usage: | 229 | Typical usage: |
216 | iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG | 230 | iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG |
@@ -317,7 +331,8 @@ config IP_NF_MATCH_TCPMSS | |||
317 | 331 | ||
318 | config IP_NF_MATCH_HELPER | 332 | config IP_NF_MATCH_HELPER |
319 | tristate "Helper match support" | 333 | tristate "Helper match support" |
320 | depends on IP_NF_CONNTRACK && IP_NF_IPTABLES | 334 | depends on IP_NF_IPTABLES |
335 | depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 | ||
321 | help | 336 | help |
322 | Helper matching allows you to match packets in dynamic connections | 337 | Helper matching allows you to match packets in dynamic connections |
323 | tracked by a conntrack-helper, ie. ip_conntrack_ftp | 338 | tracked by a conntrack-helper, ie. ip_conntrack_ftp |
@@ -326,7 +341,8 @@ config IP_NF_MATCH_HELPER | |||
326 | 341 | ||
327 | config IP_NF_MATCH_STATE | 342 | config IP_NF_MATCH_STATE |
328 | tristate "Connection state match support" | 343 | tristate "Connection state match support" |
329 | depends on IP_NF_CONNTRACK && IP_NF_IPTABLES | 344 | depends on IP_NF_IPTABLES |
345 | depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 | ||
330 | help | 346 | help |
331 | Connection state matching allows you to match packets based on their | 347 | Connection state matching allows you to match packets based on their |
332 | relationship to a tracked connection (ie. previous packets). This | 348 | relationship to a tracked connection (ie. previous packets). This |
@@ -336,7 +352,8 @@ config IP_NF_MATCH_STATE | |||
336 | 352 | ||
337 | config IP_NF_MATCH_CONNTRACK | 353 | config IP_NF_MATCH_CONNTRACK |
338 | tristate "Connection tracking match support" | 354 | tristate "Connection tracking match support" |
339 | depends on IP_NF_CONNTRACK && IP_NF_IPTABLES | 355 | depends on IP_NF_IPTABLES |
356 | depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 | ||
340 | help | 357 | help |
341 | This is a general conntrack match module, a superset of the state match. | 358 | This is a general conntrack match module, a superset of the state match. |
342 | 359 | ||
@@ -422,7 +439,8 @@ config IP_NF_MATCH_COMMENT | |||
422 | 439 | ||
423 | config IP_NF_MATCH_CONNMARK | 440 | config IP_NF_MATCH_CONNMARK |
424 | tristate 'Connection mark match support' | 441 | tristate 'Connection mark match support' |
425 | depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES | 442 | depends on IP_NF_IPTABLES |
443 | depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) | ||
426 | help | 444 | help |
427 | This option adds a `connmark' match, which allows you to match the | 445 | This option adds a `connmark' match, which allows you to match the |
428 | connection mark value previously set for the session by `CONNMARK'. | 446 | connection mark value previously set for the session by `CONNMARK'. |
@@ -433,7 +451,8 @@ config IP_NF_MATCH_CONNMARK | |||
433 | 451 | ||
434 | config IP_NF_MATCH_CONNBYTES | 452 | config IP_NF_MATCH_CONNBYTES |
435 | tristate 'Connection byte/packet counter match support' | 453 | tristate 'Connection byte/packet counter match support' |
436 | depends on IP_NF_CT_ACCT && IP_NF_IPTABLES | 454 | depends on IP_NF_IPTABLES |
455 | depends on IP_NF_CT_ACCT || (NF_CT_ACCT && NF_CONNTRACK_IPV4) | ||
437 | help | 456 | help |
438 | This option adds a `connbytes' match, which allows you to match the | 457 | This option adds a `connbytes' match, which allows you to match the |
439 | number of bytes and/or packets for each direction within a connection. | 458 | number of bytes and/or packets for each direction within a connection. |
@@ -747,7 +766,8 @@ config IP_NF_TARGET_TTL | |||
747 | 766 | ||
748 | config IP_NF_TARGET_CONNMARK | 767 | config IP_NF_TARGET_CONNMARK |
749 | tristate 'CONNMARK target support' | 768 | tristate 'CONNMARK target support' |
750 | depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE | 769 | depends on IP_NF_MANGLE |
770 | depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) | ||
751 | help | 771 | help |
752 | This option adds a `CONNMARK' target, which allows one to manipulate | 772 | This option adds a `CONNMARK' target, which allows one to manipulate |
753 | the connection mark value. Similar to the MARK target, but | 773 | the connection mark value. Similar to the MARK target, but |
@@ -759,7 +779,8 @@ config IP_NF_TARGET_CONNMARK | |||
759 | 779 | ||
760 | config IP_NF_TARGET_CLUSTERIP | 780 | config IP_NF_TARGET_CLUSTERIP |
761 | tristate "CLUSTERIP target support (EXPERIMENTAL)" | 781 | tristate "CLUSTERIP target support (EXPERIMENTAL)" |
762 | depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES && EXPERIMENTAL | 782 | depends on IP_NF_IPTABLES && EXPERIMENTAL |
783 | depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) | ||
763 | help | 784 | help |
764 | The CLUSTERIP target allows you to build load-balancing clusters of | 785 | The CLUSTERIP target allows you to build load-balancing clusters of |
765 | network servers without having a dedicated load-balancing | 786 | network servers without having a dedicated load-balancing |
@@ -782,7 +803,7 @@ config IP_NF_RAW | |||
782 | config IP_NF_TARGET_NOTRACK | 803 | config IP_NF_TARGET_NOTRACK |
783 | tristate 'NOTRACK target support' | 804 | tristate 'NOTRACK target support' |
784 | depends on IP_NF_RAW | 805 | depends on IP_NF_RAW |
785 | depends on IP_NF_CONNTRACK | 806 | depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 |
786 | help | 807 | help |
787 | The NOTRACK target allows a select rule to specify | 808 | The NOTRACK target allows a select rule to specify |
788 | which packets *not* to enter the conntrack/NAT | 809 | which packets *not* to enter the conntrack/NAT |
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index dab4b58dd31e..058c48e258fc 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile | |||
@@ -103,3 +103,9 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o | |||
103 | obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o | 103 | obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o |
104 | 104 | ||
105 | obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o | 105 | obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o |
106 | |||
107 | # objects for l3 independent conntrack | ||
108 | nf_conntrack_ipv4-objs := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o | ||
109 | |||
110 | # l3 independent conntrack | ||
111 | obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o | ||
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 82a65043a8ef..d2a4fec22862 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c | |||
@@ -28,11 +28,8 @@ | |||
28 | #include <linux/netlink.h> | 28 | #include <linux/netlink.h> |
29 | #include <linux/spinlock.h> | 29 | #include <linux/spinlock.h> |
30 | #include <linux/notifier.h> | 30 | #include <linux/notifier.h> |
31 | #include <linux/rtnetlink.h> | ||
32 | 31 | ||
33 | #include <linux/netfilter.h> | 32 | #include <linux/netfilter.h> |
34 | #include <linux/netfilter_ipv4.h> | ||
35 | #include <linux/netfilter_ipv4/ip_tables.h> | ||
36 | #include <linux/netfilter_ipv4/ip_conntrack.h> | 33 | #include <linux/netfilter_ipv4/ip_conntrack.h> |
37 | #include <linux/netfilter_ipv4/ip_conntrack_core.h> | 34 | #include <linux/netfilter_ipv4/ip_conntrack_core.h> |
38 | #include <linux/netfilter_ipv4/ip_conntrack_helper.h> | 35 | #include <linux/netfilter_ipv4/ip_conntrack_helper.h> |
@@ -58,14 +55,17 @@ ctnetlink_dump_tuples_proto(struct sk_buff *skb, | |||
58 | const struct ip_conntrack_tuple *tuple) | 55 | const struct ip_conntrack_tuple *tuple) |
59 | { | 56 | { |
60 | struct ip_conntrack_protocol *proto; | 57 | struct ip_conntrack_protocol *proto; |
58 | int ret = 0; | ||
61 | 59 | ||
62 | NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum); | 60 | NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum); |
63 | 61 | ||
64 | proto = ip_conntrack_proto_find_get(tuple->dst.protonum); | 62 | proto = ip_conntrack_proto_find_get(tuple->dst.protonum); |
65 | if (proto && proto->tuple_to_nfattr) | 63 | if (likely(proto && proto->tuple_to_nfattr)) { |
66 | return proto->tuple_to_nfattr(skb, tuple); | 64 | ret = proto->tuple_to_nfattr(skb, tuple); |
65 | ip_conntrack_proto_put(proto); | ||
66 | } | ||
67 | 67 | ||
68 | return 0; | 68 | return ret; |
69 | 69 | ||
70 | nfattr_failure: | 70 | nfattr_failure: |
71 | return -1; | 71 | return -1; |
@@ -175,7 +175,7 @@ ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct, | |||
175 | { | 175 | { |
176 | enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; | 176 | enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; |
177 | struct nfattr *nest_count = NFA_NEST(skb, type); | 177 | struct nfattr *nest_count = NFA_NEST(skb, type); |
178 | u_int64_t tmp; | 178 | u_int32_t tmp; |
179 | 179 | ||
180 | tmp = htonl(ct->counters[dir].packets); | 180 | tmp = htonl(ct->counters[dir].packets); |
181 | NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(u_int32_t), &tmp); | 181 | NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(u_int32_t), &tmp); |
@@ -479,9 +479,7 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) | |||
479 | 479 | ||
480 | DEBUGP("entered %s\n", __FUNCTION__); | 480 | DEBUGP("entered %s\n", __FUNCTION__); |
481 | 481 | ||
482 | 482 | nfattr_parse_nested(tb, CTA_IP_MAX, attr); | |
483 | if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0) | ||
484 | goto nfattr_failure; | ||
485 | 483 | ||
486 | if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) | 484 | if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) |
487 | return -EINVAL; | 485 | return -EINVAL; |
@@ -497,9 +495,6 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) | |||
497 | DEBUGP("leaving\n"); | 495 | DEBUGP("leaving\n"); |
498 | 496 | ||
499 | return 0; | 497 | return 0; |
500 | |||
501 | nfattr_failure: | ||
502 | return -1; | ||
503 | } | 498 | } |
504 | 499 | ||
505 | static const int cta_min_proto[CTA_PROTO_MAX] = { | 500 | static const int cta_min_proto[CTA_PROTO_MAX] = { |
@@ -521,8 +516,7 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr, | |||
521 | 516 | ||
522 | DEBUGP("entered %s\n", __FUNCTION__); | 517 | DEBUGP("entered %s\n", __FUNCTION__); |
523 | 518 | ||
524 | if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0) | 519 | nfattr_parse_nested(tb, CTA_PROTO_MAX, attr); |
525 | goto nfattr_failure; | ||
526 | 520 | ||
527 | if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) | 521 | if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) |
528 | return -EINVAL; | 522 | return -EINVAL; |
@@ -539,9 +533,6 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr, | |||
539 | } | 533 | } |
540 | 534 | ||
541 | return ret; | 535 | return ret; |
542 | |||
543 | nfattr_failure: | ||
544 | return -1; | ||
545 | } | 536 | } |
546 | 537 | ||
547 | static inline int | 538 | static inline int |
@@ -555,8 +546,7 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, | |||
555 | 546 | ||
556 | memset(tuple, 0, sizeof(*tuple)); | 547 | memset(tuple, 0, sizeof(*tuple)); |
557 | 548 | ||
558 | if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0) | 549 | nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]); |
559 | goto nfattr_failure; | ||
560 | 550 | ||
561 | if (!tb[CTA_TUPLE_IP-1]) | 551 | if (!tb[CTA_TUPLE_IP-1]) |
562 | return -EINVAL; | 552 | return -EINVAL; |
@@ -583,9 +573,6 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, | |||
583 | DEBUGP("leaving\n"); | 573 | DEBUGP("leaving\n"); |
584 | 574 | ||
585 | return 0; | 575 | return 0; |
586 | |||
587 | nfattr_failure: | ||
588 | return -1; | ||
589 | } | 576 | } |
590 | 577 | ||
591 | #ifdef CONFIG_IP_NF_NAT_NEEDED | 578 | #ifdef CONFIG_IP_NF_NAT_NEEDED |
@@ -603,11 +590,10 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr, | |||
603 | 590 | ||
604 | DEBUGP("entered %s\n", __FUNCTION__); | 591 | DEBUGP("entered %s\n", __FUNCTION__); |
605 | 592 | ||
606 | if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0) | 593 | nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr); |
607 | goto nfattr_failure; | ||
608 | 594 | ||
609 | if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat)) | 595 | if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat)) |
610 | goto nfattr_failure; | 596 | return -EINVAL; |
611 | 597 | ||
612 | npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); | 598 | npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); |
613 | if (!npt) | 599 | if (!npt) |
@@ -626,9 +612,6 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr, | |||
626 | 612 | ||
627 | DEBUGP("leaving\n"); | 613 | DEBUGP("leaving\n"); |
628 | return 0; | 614 | return 0; |
629 | |||
630 | nfattr_failure: | ||
631 | return -1; | ||
632 | } | 615 | } |
633 | 616 | ||
634 | static inline int | 617 | static inline int |
@@ -642,8 +625,7 @@ ctnetlink_parse_nat(struct nfattr *cda[], | |||
642 | 625 | ||
643 | memset(range, 0, sizeof(*range)); | 626 | memset(range, 0, sizeof(*range)); |
644 | 627 | ||
645 | if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0) | 628 | nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); |
646 | goto nfattr_failure; | ||
647 | 629 | ||
648 | if (tb[CTA_NAT_MINIP-1]) | 630 | if (tb[CTA_NAT_MINIP-1]) |
649 | range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]); | 631 | range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]); |
@@ -665,9 +647,6 @@ ctnetlink_parse_nat(struct nfattr *cda[], | |||
665 | 647 | ||
666 | DEBUGP("leaving\n"); | 648 | DEBUGP("leaving\n"); |
667 | return 0; | 649 | return 0; |
668 | |||
669 | nfattr_failure: | ||
670 | return -1; | ||
671 | } | 650 | } |
672 | #endif | 651 | #endif |
673 | 652 | ||
@@ -678,8 +657,7 @@ ctnetlink_parse_help(struct nfattr *attr, char **helper_name) | |||
678 | 657 | ||
679 | DEBUGP("entered %s\n", __FUNCTION__); | 658 | DEBUGP("entered %s\n", __FUNCTION__); |
680 | 659 | ||
681 | if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0) | 660 | nfattr_parse_nested(tb, CTA_HELP_MAX, attr); |
682 | goto nfattr_failure; | ||
683 | 661 | ||
684 | if (!tb[CTA_HELP_NAME-1]) | 662 | if (!tb[CTA_HELP_NAME-1]) |
685 | return -EINVAL; | 663 | return -EINVAL; |
@@ -687,9 +665,6 @@ ctnetlink_parse_help(struct nfattr *attr, char **helper_name) | |||
687 | *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]); | 665 | *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]); |
688 | 666 | ||
689 | return 0; | 667 | return 0; |
690 | |||
691 | nfattr_failure: | ||
692 | return -1; | ||
693 | } | 668 | } |
694 | 669 | ||
695 | static int | 670 | static int |
@@ -804,7 +779,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, | |||
804 | ct = tuplehash_to_ctrack(h); | 779 | ct = tuplehash_to_ctrack(h); |
805 | 780 | ||
806 | err = -ENOMEM; | 781 | err = -ENOMEM; |
807 | skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); | 782 | skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); |
808 | if (!skb2) { | 783 | if (!skb2) { |
809 | ip_conntrack_put(ct); | 784 | ip_conntrack_put(ct); |
810 | return -ENOMEM; | 785 | return -ENOMEM; |
@@ -827,7 +802,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, | |||
827 | free: | 802 | free: |
828 | kfree_skb(skb2); | 803 | kfree_skb(skb2); |
829 | out: | 804 | out: |
830 | return -1; | 805 | return err; |
831 | } | 806 | } |
832 | 807 | ||
833 | static inline int | 808 | static inline int |
@@ -957,8 +932,7 @@ ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[]) | |||
957 | u_int16_t npt = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; | 932 | u_int16_t npt = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; |
958 | int err = 0; | 933 | int err = 0; |
959 | 934 | ||
960 | if (nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr) < 0) | 935 | nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr); |
961 | goto nfattr_failure; | ||
962 | 936 | ||
963 | proto = ip_conntrack_proto_find_get(npt); | 937 | proto = ip_conntrack_proto_find_get(npt); |
964 | if (!proto) | 938 | if (!proto) |
@@ -969,9 +943,6 @@ ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[]) | |||
969 | ip_conntrack_proto_put(proto); | 943 | ip_conntrack_proto_put(proto); |
970 | 944 | ||
971 | return err; | 945 | return err; |
972 | |||
973 | nfattr_failure: | ||
974 | return -ENOMEM; | ||
975 | } | 946 | } |
976 | 947 | ||
977 | static int | 948 | static int |
@@ -1005,6 +976,11 @@ ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[]) | |||
1005 | return err; | 976 | return err; |
1006 | } | 977 | } |
1007 | 978 | ||
979 | #if defined(CONFIG_IP_NF_CONNTRACK_MARK) | ||
980 | if (cda[CTA_MARK-1]) | ||
981 | ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1])); | ||
982 | #endif | ||
983 | |||
1008 | DEBUGP("all done\n"); | 984 | DEBUGP("all done\n"); |
1009 | return 0; | 985 | return 0; |
1010 | } | 986 | } |
@@ -1048,6 +1024,11 @@ ctnetlink_create_conntrack(struct nfattr *cda[], | |||
1048 | if (ct->helper) | 1024 | if (ct->helper) |
1049 | ip_conntrack_helper_put(ct->helper); | 1025 | ip_conntrack_helper_put(ct->helper); |
1050 | 1026 | ||
1027 | #if defined(CONFIG_IP_NF_CONNTRACK_MARK) | ||
1028 | if (cda[CTA_MARK-1]) | ||
1029 | ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1])); | ||
1030 | #endif | ||
1031 | |||
1051 | DEBUGP("conntrack with id %u inserted\n", ct->id); | 1032 | DEBUGP("conntrack with id %u inserted\n", ct->id); |
1052 | return 0; | 1033 | return 0; |
1053 | 1034 | ||
@@ -1312,6 +1293,14 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, | |||
1312 | if (!exp) | 1293 | if (!exp) |
1313 | return -ENOENT; | 1294 | return -ENOENT; |
1314 | 1295 | ||
1296 | if (cda[CTA_EXPECT_ID-1]) { | ||
1297 | u_int32_t id = *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]); | ||
1298 | if (exp->id != ntohl(id)) { | ||
1299 | ip_conntrack_expect_put(exp); | ||
1300 | return -ENOENT; | ||
1301 | } | ||
1302 | } | ||
1303 | |||
1315 | err = -ENOMEM; | 1304 | err = -ENOMEM; |
1316 | skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); | 1305 | skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); |
1317 | if (!skb2) | 1306 | if (!skb2) |
@@ -1387,7 +1376,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, | |||
1387 | ip_conntrack_expect_put(exp); | 1376 | ip_conntrack_expect_put(exp); |
1388 | } | 1377 | } |
1389 | } | 1378 | } |
1390 | write_unlock(&ip_conntrack_lock); | 1379 | write_unlock_bh(&ip_conntrack_lock); |
1391 | } else { | 1380 | } else { |
1392 | /* This basically means we have to flush everything*/ | 1381 | /* This basically means we have to flush everything*/ |
1393 | write_lock_bh(&ip_conntrack_lock); | 1382 | write_lock_bh(&ip_conntrack_lock); |
@@ -1554,6 +1543,8 @@ static struct nfnetlink_subsystem ctnl_exp_subsys = { | |||
1554 | .cb = ctnl_exp_cb, | 1543 | .cb = ctnl_exp_cb, |
1555 | }; | 1544 | }; |
1556 | 1545 | ||
1546 | MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK); | ||
1547 | |||
1557 | static int __init ctnetlink_init(void) | 1548 | static int __init ctnetlink_init(void) |
1558 | { | 1549 | { |
1559 | int ret; | 1550 | int ret; |
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 98f0015dd255..e4d6b268e8c4 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/in.h> | 13 | #include <linux/in.h> |
14 | #include <linux/icmp.h> | 14 | #include <linux/icmp.h> |
15 | #include <linux/seq_file.h> | 15 | #include <linux/seq_file.h> |
16 | #include <linux/skbuff.h> | ||
16 | #include <net/ip.h> | 17 | #include <net/ip.h> |
17 | #include <net/checksum.h> | 18 | #include <net/checksum.h> |
18 | #include <linux/netfilter.h> | 19 | #include <linux/netfilter.h> |
@@ -151,13 +152,13 @@ icmp_error_message(struct sk_buff *skb, | |||
151 | /* Not enough header? */ | 152 | /* Not enough header? */ |
152 | inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in); | 153 | inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in); |
153 | if (inside == NULL) | 154 | if (inside == NULL) |
154 | return NF_ACCEPT; | 155 | return -NF_ACCEPT; |
155 | 156 | ||
156 | /* Ignore ICMP's containing fragments (shouldn't happen) */ | 157 | /* Ignore ICMP's containing fragments (shouldn't happen) */ |
157 | if (inside->ip.frag_off & htons(IP_OFFSET)) { | 158 | if (inside->ip.frag_off & htons(IP_OFFSET)) { |
158 | DEBUGP("icmp_error_track: fragment of proto %u\n", | 159 | DEBUGP("icmp_error_track: fragment of proto %u\n", |
159 | inside->ip.protocol); | 160 | inside->ip.protocol); |
160 | return NF_ACCEPT; | 161 | return -NF_ACCEPT; |
161 | } | 162 | } |
162 | 163 | ||
163 | innerproto = ip_conntrack_proto_find_get(inside->ip.protocol); | 164 | innerproto = ip_conntrack_proto_find_get(inside->ip.protocol); |
@@ -166,7 +167,7 @@ icmp_error_message(struct sk_buff *skb, | |||
166 | if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) { | 167 | if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) { |
167 | DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol); | 168 | DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol); |
168 | ip_conntrack_proto_put(innerproto); | 169 | ip_conntrack_proto_put(innerproto); |
169 | return NF_ACCEPT; | 170 | return -NF_ACCEPT; |
170 | } | 171 | } |
171 | 172 | ||
172 | /* Ordinarily, we'd expect the inverted tupleproto, but it's | 173 | /* Ordinarily, we'd expect the inverted tupleproto, but it's |
@@ -174,7 +175,7 @@ icmp_error_message(struct sk_buff *skb, | |||
174 | if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { | 175 | if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { |
175 | DEBUGP("icmp_error_track: Can't invert tuple\n"); | 176 | DEBUGP("icmp_error_track: Can't invert tuple\n"); |
176 | ip_conntrack_proto_put(innerproto); | 177 | ip_conntrack_proto_put(innerproto); |
177 | return NF_ACCEPT; | 178 | return -NF_ACCEPT; |
178 | } | 179 | } |
179 | ip_conntrack_proto_put(innerproto); | 180 | ip_conntrack_proto_put(innerproto); |
180 | 181 | ||
@@ -190,7 +191,7 @@ icmp_error_message(struct sk_buff *skb, | |||
190 | 191 | ||
191 | if (!h) { | 192 | if (!h) { |
192 | DEBUGP("icmp_error_track: no match\n"); | 193 | DEBUGP("icmp_error_track: no match\n"); |
193 | return NF_ACCEPT; | 194 | return -NF_ACCEPT; |
194 | } | 195 | } |
195 | /* Reverse direction from that found */ | 196 | /* Reverse direction from that found */ |
196 | if (DIRECTION(h) != IP_CT_DIR_REPLY) | 197 | if (DIRECTION(h) != IP_CT_DIR_REPLY) |
@@ -230,19 +231,15 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, | |||
230 | case CHECKSUM_HW: | 231 | case CHECKSUM_HW: |
231 | if (!(u16)csum_fold(skb->csum)) | 232 | if (!(u16)csum_fold(skb->csum)) |
232 | break; | 233 | break; |
233 | if (LOG_INVALID(IPPROTO_ICMP)) | 234 | /* fall through */ |
234 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, | ||
235 | "ip_ct_icmp: bad HW ICMP checksum "); | ||
236 | return -NF_ACCEPT; | ||
237 | case CHECKSUM_NONE: | 235 | case CHECKSUM_NONE: |
238 | if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { | 236 | skb->csum = 0; |
237 | if (__skb_checksum_complete(skb)) { | ||
239 | if (LOG_INVALID(IPPROTO_ICMP)) | 238 | if (LOG_INVALID(IPPROTO_ICMP)) |
240 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, | 239 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
241 | "ip_ct_icmp: bad ICMP checksum "); | 240 | "ip_ct_icmp: bad ICMP checksum "); |
242 | return -NF_ACCEPT; | 241 | return -NF_ACCEPT; |
243 | } | 242 | } |
244 | default: | ||
245 | break; | ||
246 | } | 243 | } |
247 | 244 | ||
248 | checksum_skipped: | 245 | checksum_skipped: |
@@ -296,7 +293,8 @@ static int icmp_nfattr_to_tuple(struct nfattr *tb[], | |||
296 | struct ip_conntrack_tuple *tuple) | 293 | struct ip_conntrack_tuple *tuple) |
297 | { | 294 | { |
298 | if (!tb[CTA_PROTO_ICMP_TYPE-1] | 295 | if (!tb[CTA_PROTO_ICMP_TYPE-1] |
299 | || !tb[CTA_PROTO_ICMP_CODE-1]) | 296 | || !tb[CTA_PROTO_ICMP_CODE-1] |
297 | || !tb[CTA_PROTO_ICMP_ID-1]) | ||
300 | return -1; | 298 | return -1; |
301 | 299 | ||
302 | tuple->dst.u.icmp.type = | 300 | tuple->dst.u.icmp.type = |
@@ -304,7 +302,7 @@ static int icmp_nfattr_to_tuple(struct nfattr *tb[], | |||
304 | tuple->dst.u.icmp.code = | 302 | tuple->dst.u.icmp.code = |
305 | *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]); | 303 | *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]); |
306 | tuple->src.u.icmp.id = | 304 | tuple->src.u.icmp.id = |
307 | *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); | 305 | *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); |
308 | 306 | ||
309 | return 0; | 307 | return 0; |
310 | } | 308 | } |
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index d6701cafbcc2..468c6003b4c7 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c | |||
@@ -362,8 +362,12 @@ static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) | |||
362 | struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1]; | 362 | struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1]; |
363 | struct nfattr *tb[CTA_PROTOINFO_TCP_MAX]; | 363 | struct nfattr *tb[CTA_PROTOINFO_TCP_MAX]; |
364 | 364 | ||
365 | if (nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr) < 0) | 365 | /* updates could not contain anything about the private |
366 | goto nfattr_failure; | 366 | * protocol info, in that case skip the parsing */ |
367 | if (!attr) | ||
368 | return 0; | ||
369 | |||
370 | nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr); | ||
367 | 371 | ||
368 | if (!tb[CTA_PROTOINFO_TCP_STATE-1]) | 372 | if (!tb[CTA_PROTOINFO_TCP_STATE-1]) |
369 | return -EINVAL; | 373 | return -EINVAL; |
@@ -374,9 +378,6 @@ static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) | |||
374 | write_unlock_bh(&tcp_lock); | 378 | write_unlock_bh(&tcp_lock); |
375 | 379 | ||
376 | return 0; | 380 | return 0; |
377 | |||
378 | nfattr_failure: | ||
379 | return -1; | ||
380 | } | 381 | } |
381 | #endif | 382 | #endif |
382 | 383 | ||
diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c index ee6ab74ad3a9..e546203f5662 100644 --- a/net/ipv4/netfilter/ip_nat_helper_pptp.c +++ b/net/ipv4/netfilter/ip_nat_helper_pptp.c | |||
@@ -73,6 +73,7 @@ static void pptp_nat_expected(struct ip_conntrack *ct, | |||
73 | struct ip_conntrack_tuple t; | 73 | struct ip_conntrack_tuple t; |
74 | struct ip_ct_pptp_master *ct_pptp_info; | 74 | struct ip_ct_pptp_master *ct_pptp_info; |
75 | struct ip_nat_pptp *nat_pptp_info; | 75 | struct ip_nat_pptp *nat_pptp_info; |
76 | struct ip_nat_range range; | ||
76 | 77 | ||
77 | ct_pptp_info = &master->help.ct_pptp_info; | 78 | ct_pptp_info = &master->help.ct_pptp_info; |
78 | nat_pptp_info = &master->nat.help.nat_pptp_info; | 79 | nat_pptp_info = &master->nat.help.nat_pptp_info; |
@@ -110,7 +111,30 @@ static void pptp_nat_expected(struct ip_conntrack *ct, | |||
110 | DEBUGP("not found!\n"); | 111 | DEBUGP("not found!\n"); |
111 | } | 112 | } |
112 | 113 | ||
113 | ip_nat_follow_master(ct, exp); | 114 | /* This must be a fresh one. */ |
115 | BUG_ON(ct->status & IPS_NAT_DONE_MASK); | ||
116 | |||
117 | /* Change src to where master sends to */ | ||
118 | range.flags = IP_NAT_RANGE_MAP_IPS; | ||
119 | range.min_ip = range.max_ip | ||
120 | = ct->master->tuplehash[!exp->dir].tuple.dst.ip; | ||
121 | if (exp->dir == IP_CT_DIR_ORIGINAL) { | ||
122 | range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; | ||
123 | range.min = range.max = exp->saved_proto; | ||
124 | } | ||
125 | /* hook doesn't matter, but it has to do source manip */ | ||
126 | ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); | ||
127 | |||
128 | /* For DST manip, map port here to where it's expected. */ | ||
129 | range.flags = IP_NAT_RANGE_MAP_IPS; | ||
130 | range.min_ip = range.max_ip | ||
131 | = ct->master->tuplehash[!exp->dir].tuple.src.ip; | ||
132 | if (exp->dir == IP_CT_DIR_REPLY) { | ||
133 | range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; | ||
134 | range.min = range.max = exp->saved_proto; | ||
135 | } | ||
136 | /* hook doesn't matter, but it has to do destination manip */ | ||
137 | ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); | ||
114 | } | 138 | } |
115 | 139 | ||
116 | /* outbound packets == from PNS to PAC */ | 140 | /* outbound packets == from PNS to PAC */ |
@@ -213,7 +237,7 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig, | |||
213 | 237 | ||
214 | /* alter expectation for PNS->PAC direction */ | 238 | /* alter expectation for PNS->PAC direction */ |
215 | invert_tuplepr(&inv_t, &expect_orig->tuple); | 239 | invert_tuplepr(&inv_t, &expect_orig->tuple); |
216 | expect_orig->saved_proto.gre.key = htons(nat_pptp_info->pac_call_id); | 240 | expect_orig->saved_proto.gre.key = htons(ct_pptp_info->pns_call_id); |
217 | expect_orig->tuple.src.u.gre.key = htons(nat_pptp_info->pns_call_id); | 241 | expect_orig->tuple.src.u.gre.key = htons(nat_pptp_info->pns_call_id); |
218 | expect_orig->tuple.dst.u.gre.key = htons(ct_pptp_info->pac_call_id); | 242 | expect_orig->tuple.dst.u.gre.key = htons(ct_pptp_info->pac_call_id); |
219 | expect_orig->dir = IP_CT_DIR_ORIGINAL; | 243 | expect_orig->dir = IP_CT_DIR_ORIGINAL; |
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 9bcb398fbc1f..45c52d8f4d99 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c | |||
@@ -29,7 +29,7 @@ | |||
29 | 29 | ||
30 | #include <linux/netfilter_ipv4/ip_tables.h> | 30 | #include <linux/netfilter_ipv4/ip_tables.h> |
31 | #include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> | 31 | #include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> |
32 | #include <linux/netfilter_ipv4/ip_conntrack.h> | 32 | #include <net/netfilter/nf_conntrack_compat.h> |
33 | 33 | ||
34 | #define CLUSTERIP_VERSION "0.8" | 34 | #define CLUSTERIP_VERSION "0.8" |
35 | 35 | ||
@@ -316,14 +316,14 @@ target(struct sk_buff **pskb, | |||
316 | { | 316 | { |
317 | const struct ipt_clusterip_tgt_info *cipinfo = targinfo; | 317 | const struct ipt_clusterip_tgt_info *cipinfo = targinfo; |
318 | enum ip_conntrack_info ctinfo; | 318 | enum ip_conntrack_info ctinfo; |
319 | struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); | 319 | u_int32_t *mark, hash; |
320 | u_int32_t hash; | ||
321 | 320 | ||
322 | /* don't need to clusterip_config_get() here, since refcount | 321 | /* don't need to clusterip_config_get() here, since refcount |
323 | * is only decremented by destroy() - and ip_tables guarantees | 322 | * is only decremented by destroy() - and ip_tables guarantees |
324 | * that the ->target() function isn't called after ->destroy() */ | 323 | * that the ->target() function isn't called after ->destroy() */ |
325 | 324 | ||
326 | if (!ct) { | 325 | mark = nf_ct_get_mark((*pskb), &ctinfo); |
326 | if (mark == NULL) { | ||
327 | printk(KERN_ERR "CLUSTERIP: no conntrack!\n"); | 327 | printk(KERN_ERR "CLUSTERIP: no conntrack!\n"); |
328 | /* FIXME: need to drop invalid ones, since replies | 328 | /* FIXME: need to drop invalid ones, since replies |
329 | * to outgoing connections of other nodes will be | 329 | * to outgoing connections of other nodes will be |
@@ -346,7 +346,7 @@ target(struct sk_buff **pskb, | |||
346 | 346 | ||
347 | switch (ctinfo) { | 347 | switch (ctinfo) { |
348 | case IP_CT_NEW: | 348 | case IP_CT_NEW: |
349 | ct->mark = hash; | 349 | *mark = hash; |
350 | break; | 350 | break; |
351 | case IP_CT_RELATED: | 351 | case IP_CT_RELATED: |
352 | case IP_CT_RELATED+IP_CT_IS_REPLY: | 352 | case IP_CT_RELATED+IP_CT_IS_REPLY: |
@@ -363,7 +363,7 @@ target(struct sk_buff **pskb, | |||
363 | #ifdef DEBUG_CLUSTERP | 363 | #ifdef DEBUG_CLUSTERP |
364 | DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); | 364 | DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); |
365 | #endif | 365 | #endif |
366 | DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark); | 366 | DEBUGP("hash=%u ct_hash=%u ", hash, *mark); |
367 | if (!clusterip_responsible(cipinfo->config, hash)) { | 367 | if (!clusterip_responsible(cipinfo->config, hash)) { |
368 | DEBUGP("not responsible\n"); | 368 | DEBUGP("not responsible\n"); |
369 | return NF_DROP; | 369 | return NF_DROP; |
diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c index 05d66ab59424..8acac5a40a92 100644 --- a/net/ipv4/netfilter/ipt_CONNMARK.c +++ b/net/ipv4/netfilter/ipt_CONNMARK.c | |||
@@ -29,7 +29,7 @@ MODULE_LICENSE("GPL"); | |||
29 | 29 | ||
30 | #include <linux/netfilter_ipv4/ip_tables.h> | 30 | #include <linux/netfilter_ipv4/ip_tables.h> |
31 | #include <linux/netfilter_ipv4/ipt_CONNMARK.h> | 31 | #include <linux/netfilter_ipv4/ipt_CONNMARK.h> |
32 | #include <linux/netfilter_ipv4/ip_conntrack.h> | 32 | #include <net/netfilter/nf_conntrack_compat.h> |
33 | 33 | ||
34 | static unsigned int | 34 | static unsigned int |
35 | target(struct sk_buff **pskb, | 35 | target(struct sk_buff **pskb, |
@@ -43,24 +43,24 @@ target(struct sk_buff **pskb, | |||
43 | u_int32_t diff; | 43 | u_int32_t diff; |
44 | u_int32_t nfmark; | 44 | u_int32_t nfmark; |
45 | u_int32_t newmark; | 45 | u_int32_t newmark; |
46 | u_int32_t ctinfo; | ||
47 | u_int32_t *ctmark = nf_ct_get_mark(*pskb, &ctinfo); | ||
46 | 48 | ||
47 | enum ip_conntrack_info ctinfo; | 49 | if (ctmark) { |
48 | struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); | ||
49 | if (ct) { | ||
50 | switch(markinfo->mode) { | 50 | switch(markinfo->mode) { |
51 | case IPT_CONNMARK_SET: | 51 | case IPT_CONNMARK_SET: |
52 | newmark = (ct->mark & ~markinfo->mask) | markinfo->mark; | 52 | newmark = (*ctmark & ~markinfo->mask) | markinfo->mark; |
53 | if (newmark != ct->mark) | 53 | if (newmark != *ctmark) |
54 | ct->mark = newmark; | 54 | *ctmark = newmark; |
55 | break; | 55 | break; |
56 | case IPT_CONNMARK_SAVE: | 56 | case IPT_CONNMARK_SAVE: |
57 | newmark = (ct->mark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask); | 57 | newmark = (*ctmark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask); |
58 | if (ct->mark != newmark) | 58 | if (*ctmark != newmark) |
59 | ct->mark = newmark; | 59 | *ctmark = newmark; |
60 | break; | 60 | break; |
61 | case IPT_CONNMARK_RESTORE: | 61 | case IPT_CONNMARK_RESTORE: |
62 | nfmark = (*pskb)->nfmark; | 62 | nfmark = (*pskb)->nfmark; |
63 | diff = (ct->mark ^ nfmark) & markinfo->mask; | 63 | diff = (*ctmark ^ nfmark) & markinfo->mask; |
64 | if (diff != 0) | 64 | if (diff != 0) |
65 | (*pskb)->nfmark = nfmark ^ diff; | 65 | (*pskb)->nfmark = nfmark ^ diff; |
66 | break; | 66 | break; |
diff --git a/net/ipv4/netfilter/ipt_NOTRACK.c b/net/ipv4/netfilter/ipt_NOTRACK.c index a4bb9b3bc292..e3c69d072c6e 100644 --- a/net/ipv4/netfilter/ipt_NOTRACK.c +++ b/net/ipv4/netfilter/ipt_NOTRACK.c | |||
@@ -5,7 +5,7 @@ | |||
5 | #include <linux/skbuff.h> | 5 | #include <linux/skbuff.h> |
6 | 6 | ||
7 | #include <linux/netfilter_ipv4/ip_tables.h> | 7 | #include <linux/netfilter_ipv4/ip_tables.h> |
8 | #include <linux/netfilter_ipv4/ip_conntrack.h> | 8 | #include <net/netfilter/nf_conntrack_compat.h> |
9 | 9 | ||
10 | static unsigned int | 10 | static unsigned int |
11 | target(struct sk_buff **pskb, | 11 | target(struct sk_buff **pskb, |
@@ -23,7 +23,7 @@ target(struct sk_buff **pskb, | |||
23 | If there is a real ct entry correspondig to this packet, | 23 | If there is a real ct entry correspondig to this packet, |
24 | it'll hang aroun till timing out. We don't deal with it | 24 | it'll hang aroun till timing out. We don't deal with it |
25 | for performance reasons. JK */ | 25 | for performance reasons. JK */ |
26 | (*pskb)->nfct = &ip_conntrack_untracked.ct_general; | 26 | nf_ct_untrack(*pskb); |
27 | (*pskb)->nfctinfo = IP_CT_NEW; | 27 | (*pskb)->nfctinfo = IP_CT_NEW; |
28 | nf_conntrack_get((*pskb)->nfct); | 28 | nf_conntrack_get((*pskb)->nfct); |
29 | 29 | ||
diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c index df4a42c6da22..d68a048b7176 100644 --- a/net/ipv4/netfilter/ipt_connbytes.c +++ b/net/ipv4/netfilter/ipt_connbytes.c | |||
@@ -10,7 +10,7 @@ | |||
10 | */ | 10 | */ |
11 | #include <linux/module.h> | 11 | #include <linux/module.h> |
12 | #include <linux/skbuff.h> | 12 | #include <linux/skbuff.h> |
13 | #include <linux/netfilter_ipv4/ip_conntrack.h> | 13 | #include <net/netfilter/nf_conntrack_compat.h> |
14 | #include <linux/netfilter_ipv4/ip_tables.h> | 14 | #include <linux/netfilter_ipv4/ip_tables.h> |
15 | #include <linux/netfilter_ipv4/ipt_connbytes.h> | 15 | #include <linux/netfilter_ipv4/ipt_connbytes.h> |
16 | 16 | ||
@@ -46,60 +46,59 @@ match(const struct sk_buff *skb, | |||
46 | int *hotdrop) | 46 | int *hotdrop) |
47 | { | 47 | { |
48 | const struct ipt_connbytes_info *sinfo = matchinfo; | 48 | const struct ipt_connbytes_info *sinfo = matchinfo; |
49 | enum ip_conntrack_info ctinfo; | ||
50 | struct ip_conntrack *ct; | ||
51 | u_int64_t what = 0; /* initialize to make gcc happy */ | 49 | u_int64_t what = 0; /* initialize to make gcc happy */ |
50 | const struct ip_conntrack_counter *counters; | ||
52 | 51 | ||
53 | if (!(ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo))) | 52 | if (!(counters = nf_ct_get_counters(skb))) |
54 | return 0; /* no match */ | 53 | return 0; /* no match */ |
55 | 54 | ||
56 | switch (sinfo->what) { | 55 | switch (sinfo->what) { |
57 | case IPT_CONNBYTES_PKTS: | 56 | case IPT_CONNBYTES_PKTS: |
58 | switch (sinfo->direction) { | 57 | switch (sinfo->direction) { |
59 | case IPT_CONNBYTES_DIR_ORIGINAL: | 58 | case IPT_CONNBYTES_DIR_ORIGINAL: |
60 | what = ct->counters[IP_CT_DIR_ORIGINAL].packets; | 59 | what = counters[IP_CT_DIR_ORIGINAL].packets; |
61 | break; | 60 | break; |
62 | case IPT_CONNBYTES_DIR_REPLY: | 61 | case IPT_CONNBYTES_DIR_REPLY: |
63 | what = ct->counters[IP_CT_DIR_REPLY].packets; | 62 | what = counters[IP_CT_DIR_REPLY].packets; |
64 | break; | 63 | break; |
65 | case IPT_CONNBYTES_DIR_BOTH: | 64 | case IPT_CONNBYTES_DIR_BOTH: |
66 | what = ct->counters[IP_CT_DIR_ORIGINAL].packets; | 65 | what = counters[IP_CT_DIR_ORIGINAL].packets; |
67 | what += ct->counters[IP_CT_DIR_REPLY].packets; | 66 | what += counters[IP_CT_DIR_REPLY].packets; |
68 | break; | 67 | break; |
69 | } | 68 | } |
70 | break; | 69 | break; |
71 | case IPT_CONNBYTES_BYTES: | 70 | case IPT_CONNBYTES_BYTES: |
72 | switch (sinfo->direction) { | 71 | switch (sinfo->direction) { |
73 | case IPT_CONNBYTES_DIR_ORIGINAL: | 72 | case IPT_CONNBYTES_DIR_ORIGINAL: |
74 | what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; | 73 | what = counters[IP_CT_DIR_ORIGINAL].bytes; |
75 | break; | 74 | break; |
76 | case IPT_CONNBYTES_DIR_REPLY: | 75 | case IPT_CONNBYTES_DIR_REPLY: |
77 | what = ct->counters[IP_CT_DIR_REPLY].bytes; | 76 | what = counters[IP_CT_DIR_REPLY].bytes; |
78 | break; | 77 | break; |
79 | case IPT_CONNBYTES_DIR_BOTH: | 78 | case IPT_CONNBYTES_DIR_BOTH: |
80 | what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; | 79 | what = counters[IP_CT_DIR_ORIGINAL].bytes; |
81 | what += ct->counters[IP_CT_DIR_REPLY].bytes; | 80 | what += counters[IP_CT_DIR_REPLY].bytes; |
82 | break; | 81 | break; |
83 | } | 82 | } |
84 | break; | 83 | break; |
85 | case IPT_CONNBYTES_AVGPKT: | 84 | case IPT_CONNBYTES_AVGPKT: |
86 | switch (sinfo->direction) { | 85 | switch (sinfo->direction) { |
87 | case IPT_CONNBYTES_DIR_ORIGINAL: | 86 | case IPT_CONNBYTES_DIR_ORIGINAL: |
88 | what = div64_64(ct->counters[IP_CT_DIR_ORIGINAL].bytes, | 87 | what = div64_64(counters[IP_CT_DIR_ORIGINAL].bytes, |
89 | ct->counters[IP_CT_DIR_ORIGINAL].packets); | 88 | counters[IP_CT_DIR_ORIGINAL].packets); |
90 | break; | 89 | break; |
91 | case IPT_CONNBYTES_DIR_REPLY: | 90 | case IPT_CONNBYTES_DIR_REPLY: |
92 | what = div64_64(ct->counters[IP_CT_DIR_REPLY].bytes, | 91 | what = div64_64(counters[IP_CT_DIR_REPLY].bytes, |
93 | ct->counters[IP_CT_DIR_REPLY].packets); | 92 | counters[IP_CT_DIR_REPLY].packets); |
94 | break; | 93 | break; |
95 | case IPT_CONNBYTES_DIR_BOTH: | 94 | case IPT_CONNBYTES_DIR_BOTH: |
96 | { | 95 | { |
97 | u_int64_t bytes; | 96 | u_int64_t bytes; |
98 | u_int64_t pkts; | 97 | u_int64_t pkts; |
99 | bytes = ct->counters[IP_CT_DIR_ORIGINAL].bytes + | 98 | bytes = counters[IP_CT_DIR_ORIGINAL].bytes + |
100 | ct->counters[IP_CT_DIR_REPLY].bytes; | 99 | counters[IP_CT_DIR_REPLY].bytes; |
101 | pkts = ct->counters[IP_CT_DIR_ORIGINAL].packets+ | 100 | pkts = counters[IP_CT_DIR_ORIGINAL].packets+ |
102 | ct->counters[IP_CT_DIR_REPLY].packets; | 101 | counters[IP_CT_DIR_REPLY].packets; |
103 | 102 | ||
104 | /* FIXME_THEORETICAL: what to do if sum | 103 | /* FIXME_THEORETICAL: what to do if sum |
105 | * overflows ? */ | 104 | * overflows ? */ |
diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c index bf8de47ce004..5306ef293b92 100644 --- a/net/ipv4/netfilter/ipt_connmark.c +++ b/net/ipv4/netfilter/ipt_connmark.c | |||
@@ -28,7 +28,7 @@ MODULE_LICENSE("GPL"); | |||
28 | 28 | ||
29 | #include <linux/netfilter_ipv4/ip_tables.h> | 29 | #include <linux/netfilter_ipv4/ip_tables.h> |
30 | #include <linux/netfilter_ipv4/ipt_connmark.h> | 30 | #include <linux/netfilter_ipv4/ipt_connmark.h> |
31 | #include <linux/netfilter_ipv4/ip_conntrack.h> | 31 | #include <net/netfilter/nf_conntrack_compat.h> |
32 | 32 | ||
33 | static int | 33 | static int |
34 | match(const struct sk_buff *skb, | 34 | match(const struct sk_buff *skb, |
@@ -39,12 +39,12 @@ match(const struct sk_buff *skb, | |||
39 | int *hotdrop) | 39 | int *hotdrop) |
40 | { | 40 | { |
41 | const struct ipt_connmark_info *info = matchinfo; | 41 | const struct ipt_connmark_info *info = matchinfo; |
42 | enum ip_conntrack_info ctinfo; | 42 | u_int32_t ctinfo; |
43 | struct ip_conntrack *ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo); | 43 | const u_int32_t *ctmark = nf_ct_get_mark(skb, &ctinfo); |
44 | if (!ct) | 44 | if (!ctmark) |
45 | return 0; | 45 | return 0; |
46 | 46 | ||
47 | return ((ct->mark & info->mask) == info->mark) ^ info->invert; | 47 | return (((*ctmark) & info->mask) == info->mark) ^ info->invert; |
48 | } | 48 | } |
49 | 49 | ||
50 | static int | 50 | static int |
diff --git a/net/ipv4/netfilter/ipt_conntrack.c b/net/ipv4/netfilter/ipt_conntrack.c index c1d22801b7cf..c8d18705469b 100644 --- a/net/ipv4/netfilter/ipt_conntrack.c +++ b/net/ipv4/netfilter/ipt_conntrack.c | |||
@@ -10,7 +10,14 @@ | |||
10 | 10 | ||
11 | #include <linux/module.h> | 11 | #include <linux/module.h> |
12 | #include <linux/skbuff.h> | 12 | #include <linux/skbuff.h> |
13 | |||
14 | #if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) | ||
13 | #include <linux/netfilter_ipv4/ip_conntrack.h> | 15 | #include <linux/netfilter_ipv4/ip_conntrack.h> |
16 | #include <linux/netfilter_ipv4/ip_conntrack_tuple.h> | ||
17 | #else | ||
18 | #include <net/netfilter/nf_conntrack.h> | ||
19 | #endif | ||
20 | |||
14 | #include <linux/netfilter_ipv4/ip_tables.h> | 21 | #include <linux/netfilter_ipv4/ip_tables.h> |
15 | #include <linux/netfilter_ipv4/ipt_conntrack.h> | 22 | #include <linux/netfilter_ipv4/ipt_conntrack.h> |
16 | 23 | ||
@@ -18,6 +25,8 @@ MODULE_LICENSE("GPL"); | |||
18 | MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); | 25 | MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); |
19 | MODULE_DESCRIPTION("iptables connection tracking match module"); | 26 | MODULE_DESCRIPTION("iptables connection tracking match module"); |
20 | 27 | ||
28 | #if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) | ||
29 | |||
21 | static int | 30 | static int |
22 | match(const struct sk_buff *skb, | 31 | match(const struct sk_buff *skb, |
23 | const struct net_device *in, | 32 | const struct net_device *in, |
@@ -102,6 +111,93 @@ match(const struct sk_buff *skb, | |||
102 | return 1; | 111 | return 1; |
103 | } | 112 | } |
104 | 113 | ||
114 | #else /* CONFIG_IP_NF_CONNTRACK */ | ||
115 | static int | ||
116 | match(const struct sk_buff *skb, | ||
117 | const struct net_device *in, | ||
118 | const struct net_device *out, | ||
119 | const void *matchinfo, | ||
120 | int offset, | ||
121 | int *hotdrop) | ||
122 | { | ||
123 | const struct ipt_conntrack_info *sinfo = matchinfo; | ||
124 | struct nf_conn *ct; | ||
125 | enum ip_conntrack_info ctinfo; | ||
126 | unsigned int statebit; | ||
127 | |||
128 | ct = nf_ct_get((struct sk_buff *)skb, &ctinfo); | ||
129 | |||
130 | #define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg)) | ||
131 | |||
132 | if (ct == &nf_conntrack_untracked) | ||
133 | statebit = IPT_CONNTRACK_STATE_UNTRACKED; | ||
134 | else if (ct) | ||
135 | statebit = IPT_CONNTRACK_STATE_BIT(ctinfo); | ||
136 | else | ||
137 | statebit = IPT_CONNTRACK_STATE_INVALID; | ||
138 | |||
139 | if(sinfo->flags & IPT_CONNTRACK_STATE) { | ||
140 | if (ct) { | ||
141 | if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip != | ||
142 | ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip) | ||
143 | statebit |= IPT_CONNTRACK_STATE_SNAT; | ||
144 | |||
145 | if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip != | ||
146 | ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip) | ||
147 | statebit |= IPT_CONNTRACK_STATE_DNAT; | ||
148 | } | ||
149 | |||
150 | if (FWINV((statebit & sinfo->statemask) == 0, IPT_CONNTRACK_STATE)) | ||
151 | return 0; | ||
152 | } | ||
153 | |||
154 | if(sinfo->flags & IPT_CONNTRACK_PROTO) { | ||
155 | if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, IPT_CONNTRACK_PROTO)) | ||
156 | return 0; | ||
157 | } | ||
158 | |||
159 | if(sinfo->flags & IPT_CONNTRACK_ORIGSRC) { | ||
160 | if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip&sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, IPT_CONNTRACK_ORIGSRC)) | ||
161 | return 0; | ||
162 | } | ||
163 | |||
164 | if(sinfo->flags & IPT_CONNTRACK_ORIGDST) { | ||
165 | if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip&sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, IPT_CONNTRACK_ORIGDST)) | ||
166 | return 0; | ||
167 | } | ||
168 | |||
169 | if(sinfo->flags & IPT_CONNTRACK_REPLSRC) { | ||
170 | if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip&sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, IPT_CONNTRACK_REPLSRC)) | ||
171 | return 0; | ||
172 | } | ||
173 | |||
174 | if(sinfo->flags & IPT_CONNTRACK_REPLDST) { | ||
175 | if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip&sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, IPT_CONNTRACK_REPLDST)) | ||
176 | return 0; | ||
177 | } | ||
178 | |||
179 | if(sinfo->flags & IPT_CONNTRACK_STATUS) { | ||
180 | if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, IPT_CONNTRACK_STATUS)) | ||
181 | return 0; | ||
182 | } | ||
183 | |||
184 | if(sinfo->flags & IPT_CONNTRACK_EXPIRES) { | ||
185 | unsigned long expires; | ||
186 | |||
187 | if(!ct) | ||
188 | return 0; | ||
189 | |||
190 | expires = timer_pending(&ct->timeout) ? (ct->timeout.expires - jiffies)/HZ : 0; | ||
191 | |||
192 | if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), IPT_CONNTRACK_EXPIRES)) | ||
193 | return 0; | ||
194 | } | ||
195 | |||
196 | return 1; | ||
197 | } | ||
198 | |||
199 | #endif /* CONFIG_NF_IP_CONNTRACK */ | ||
200 | |||
105 | static int check(const char *tablename, | 201 | static int check(const char *tablename, |
106 | const struct ipt_ip *ip, | 202 | const struct ipt_ip *ip, |
107 | void *matchinfo, | 203 | void *matchinfo, |
diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c index 3e7dd014de43..bf14e1c7798a 100644 --- a/net/ipv4/netfilter/ipt_helper.c +++ b/net/ipv4/netfilter/ipt_helper.c | |||
@@ -13,9 +13,15 @@ | |||
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/skbuff.h> | 14 | #include <linux/skbuff.h> |
15 | #include <linux/netfilter.h> | 15 | #include <linux/netfilter.h> |
16 | #if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) | ||
16 | #include <linux/netfilter_ipv4/ip_conntrack.h> | 17 | #include <linux/netfilter_ipv4/ip_conntrack.h> |
17 | #include <linux/netfilter_ipv4/ip_conntrack_core.h> | 18 | #include <linux/netfilter_ipv4/ip_conntrack_core.h> |
18 | #include <linux/netfilter_ipv4/ip_conntrack_helper.h> | 19 | #include <linux/netfilter_ipv4/ip_conntrack_helper.h> |
20 | #else | ||
21 | #include <net/netfilter/nf_conntrack.h> | ||
22 | #include <net/netfilter/nf_conntrack_core.h> | ||
23 | #include <net/netfilter/nf_conntrack_helper.h> | ||
24 | #endif | ||
19 | #include <linux/netfilter_ipv4/ip_tables.h> | 25 | #include <linux/netfilter_ipv4/ip_tables.h> |
20 | #include <linux/netfilter_ipv4/ipt_helper.h> | 26 | #include <linux/netfilter_ipv4/ipt_helper.h> |
21 | 27 | ||
@@ -29,6 +35,7 @@ MODULE_DESCRIPTION("iptables helper match module"); | |||
29 | #define DEBUGP(format, args...) | 35 | #define DEBUGP(format, args...) |
30 | #endif | 36 | #endif |
31 | 37 | ||
38 | #if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) | ||
32 | static int | 39 | static int |
33 | match(const struct sk_buff *skb, | 40 | match(const struct sk_buff *skb, |
34 | const struct net_device *in, | 41 | const struct net_device *in, |
@@ -73,6 +80,53 @@ out_unlock: | |||
73 | return ret; | 80 | return ret; |
74 | } | 81 | } |
75 | 82 | ||
83 | #else /* CONFIG_IP_NF_CONNTRACK */ | ||
84 | |||
85 | static int | ||
86 | match(const struct sk_buff *skb, | ||
87 | const struct net_device *in, | ||
88 | const struct net_device *out, | ||
89 | const void *matchinfo, | ||
90 | int offset, | ||
91 | int *hotdrop) | ||
92 | { | ||
93 | const struct ipt_helper_info *info = matchinfo; | ||
94 | struct nf_conn *ct; | ||
95 | enum ip_conntrack_info ctinfo; | ||
96 | int ret = info->invert; | ||
97 | |||
98 | ct = nf_ct_get((struct sk_buff *)skb, &ctinfo); | ||
99 | if (!ct) { | ||
100 | DEBUGP("ipt_helper: Eek! invalid conntrack?\n"); | ||
101 | return ret; | ||
102 | } | ||
103 | |||
104 | if (!ct->master) { | ||
105 | DEBUGP("ipt_helper: conntrack %p has no master\n", ct); | ||
106 | return ret; | ||
107 | } | ||
108 | |||
109 | read_lock_bh(&nf_conntrack_lock); | ||
110 | if (!ct->master->helper) { | ||
111 | DEBUGP("ipt_helper: master ct %p has no helper\n", | ||
112 | exp->expectant); | ||
113 | goto out_unlock; | ||
114 | } | ||
115 | |||
116 | DEBUGP("master's name = %s , info->name = %s\n", | ||
117 | ct->master->helper->name, info->name); | ||
118 | |||
119 | if (info->name[0] == '\0') | ||
120 | ret ^= 1; | ||
121 | else | ||
122 | ret ^= !strncmp(ct->master->helper->name, info->name, | ||
123 | strlen(ct->master->helper->name)); | ||
124 | out_unlock: | ||
125 | read_unlock_bh(&nf_conntrack_lock); | ||
126 | return ret; | ||
127 | } | ||
128 | #endif | ||
129 | |||
76 | static int check(const char *tablename, | 130 | static int check(const char *tablename, |
77 | const struct ipt_ip *ip, | 131 | const struct ipt_ip *ip, |
78 | void *matchinfo, | 132 | void *matchinfo, |
diff --git a/net/ipv4/netfilter/ipt_state.c b/net/ipv4/netfilter/ipt_state.c index b1511b97ea5f..4d7f16b70cec 100644 --- a/net/ipv4/netfilter/ipt_state.c +++ b/net/ipv4/netfilter/ipt_state.c | |||
@@ -10,7 +10,7 @@ | |||
10 | 10 | ||
11 | #include <linux/module.h> | 11 | #include <linux/module.h> |
12 | #include <linux/skbuff.h> | 12 | #include <linux/skbuff.h> |
13 | #include <linux/netfilter_ipv4/ip_conntrack.h> | 13 | #include <net/netfilter/nf_conntrack_compat.h> |
14 | #include <linux/netfilter_ipv4/ip_tables.h> | 14 | #include <linux/netfilter_ipv4/ip_tables.h> |
15 | #include <linux/netfilter_ipv4/ipt_state.h> | 15 | #include <linux/netfilter_ipv4/ipt_state.h> |
16 | 16 | ||
@@ -30,9 +30,9 @@ match(const struct sk_buff *skb, | |||
30 | enum ip_conntrack_info ctinfo; | 30 | enum ip_conntrack_info ctinfo; |
31 | unsigned int statebit; | 31 | unsigned int statebit; |
32 | 32 | ||
33 | if (skb->nfct == &ip_conntrack_untracked.ct_general) | 33 | if (nf_ct_is_untracked(skb)) |
34 | statebit = IPT_STATE_UNTRACKED; | 34 | statebit = IPT_STATE_UNTRACKED; |
35 | else if (!ip_conntrack_get(skb, &ctinfo)) | 35 | else if (!nf_ct_get_ctinfo(skb, &ctinfo)) |
36 | statebit = IPT_STATE_INVALID; | 36 | statebit = IPT_STATE_INVALID; |
37 | else | 37 | else |
38 | statebit = IPT_STATE_BIT(ctinfo); | 38 | statebit = IPT_STATE_BIT(ctinfo); |
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c new file mode 100644 index 000000000000..8202c1c0afad --- /dev/null +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | |||
@@ -0,0 +1,571 @@ | |||
1 | /* (C) 1999-2001 Paul `Rusty' Russell | ||
2 | * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> | ||
9 | * - move L3 protocol dependent part to this file. | ||
10 | * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> | ||
11 | * - add get_features() to support various size of conntrack | ||
12 | * structures. | ||
13 | * | ||
14 | * Derived from net/ipv4/netfilter/ip_conntrack_standalone.c | ||
15 | */ | ||
16 | |||
17 | #include <linux/config.h> | ||
18 | #include <linux/types.h> | ||
19 | #include <linux/ip.h> | ||
20 | #include <linux/netfilter.h> | ||
21 | #include <linux/module.h> | ||
22 | #include <linux/skbuff.h> | ||
23 | #include <linux/icmp.h> | ||
24 | #include <linux/sysctl.h> | ||
25 | #include <net/ip.h> | ||
26 | |||
27 | #include <linux/netfilter_ipv4.h> | ||
28 | #include <net/netfilter/nf_conntrack.h> | ||
29 | #include <net/netfilter/nf_conntrack_helper.h> | ||
30 | #include <net/netfilter/nf_conntrack_protocol.h> | ||
31 | #include <net/netfilter/nf_conntrack_l3proto.h> | ||
32 | #include <net/netfilter/nf_conntrack_core.h> | ||
33 | #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> | ||
34 | |||
35 | #if 0 | ||
36 | #define DEBUGP printk | ||
37 | #else | ||
38 | #define DEBUGP(format, args...) | ||
39 | #endif | ||
40 | |||
41 | DECLARE_PER_CPU(struct nf_conntrack_stat, nf_conntrack_stat); | ||
42 | |||
43 | static int ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, | ||
44 | struct nf_conntrack_tuple *tuple) | ||
45 | { | ||
46 | u_int32_t _addrs[2], *ap; | ||
47 | ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr), | ||
48 | sizeof(u_int32_t) * 2, _addrs); | ||
49 | if (ap == NULL) | ||
50 | return 0; | ||
51 | |||
52 | tuple->src.u3.ip = ap[0]; | ||
53 | tuple->dst.u3.ip = ap[1]; | ||
54 | |||
55 | return 1; | ||
56 | } | ||
57 | |||
58 | static int ipv4_invert_tuple(struct nf_conntrack_tuple *tuple, | ||
59 | const struct nf_conntrack_tuple *orig) | ||
60 | { | ||
61 | tuple->src.u3.ip = orig->dst.u3.ip; | ||
62 | tuple->dst.u3.ip = orig->src.u3.ip; | ||
63 | |||
64 | return 1; | ||
65 | } | ||
66 | |||
67 | static int ipv4_print_tuple(struct seq_file *s, | ||
68 | const struct nf_conntrack_tuple *tuple) | ||
69 | { | ||
70 | return seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ", | ||
71 | NIPQUAD(tuple->src.u3.ip), | ||
72 | NIPQUAD(tuple->dst.u3.ip)); | ||
73 | } | ||
74 | |||
75 | static int ipv4_print_conntrack(struct seq_file *s, | ||
76 | const struct nf_conn *conntrack) | ||
77 | { | ||
78 | return 0; | ||
79 | } | ||
80 | |||
81 | /* Returns new sk_buff, or NULL */ | ||
82 | static struct sk_buff * | ||
83 | nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) | ||
84 | { | ||
85 | skb_orphan(skb); | ||
86 | |||
87 | local_bh_disable(); | ||
88 | skb = ip_defrag(skb, user); | ||
89 | local_bh_enable(); | ||
90 | |||
91 | if (skb) | ||
92 | ip_send_check(skb->nh.iph); | ||
93 | |||
94 | return skb; | ||
95 | } | ||
96 | |||
97 | static int | ||
98 | ipv4_prepare(struct sk_buff **pskb, unsigned int hooknum, unsigned int *dataoff, | ||
99 | u_int8_t *protonum) | ||
100 | { | ||
101 | /* Never happen */ | ||
102 | if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) { | ||
103 | if (net_ratelimit()) { | ||
104 | printk(KERN_ERR "ipv4_prepare: Frag of proto %u (hook=%u)\n", | ||
105 | (*pskb)->nh.iph->protocol, hooknum); | ||
106 | } | ||
107 | return -NF_DROP; | ||
108 | } | ||
109 | |||
110 | *dataoff = (*pskb)->nh.raw - (*pskb)->data + (*pskb)->nh.iph->ihl*4; | ||
111 | *protonum = (*pskb)->nh.iph->protocol; | ||
112 | |||
113 | return NF_ACCEPT; | ||
114 | } | ||
115 | |||
116 | int nat_module_is_loaded = 0; | ||
117 | static u_int32_t ipv4_get_features(const struct nf_conntrack_tuple *tuple) | ||
118 | { | ||
119 | if (nat_module_is_loaded) | ||
120 | return NF_CT_F_NAT; | ||
121 | |||
122 | return NF_CT_F_BASIC; | ||
123 | } | ||
124 | |||
125 | static unsigned int ipv4_confirm(unsigned int hooknum, | ||
126 | struct sk_buff **pskb, | ||
127 | const struct net_device *in, | ||
128 | const struct net_device *out, | ||
129 | int (*okfn)(struct sk_buff *)) | ||
130 | { | ||
131 | /* We've seen it coming out the other side: confirm it */ | ||
132 | return nf_conntrack_confirm(pskb); | ||
133 | } | ||
134 | |||
135 | static unsigned int ipv4_conntrack_help(unsigned int hooknum, | ||
136 | struct sk_buff **pskb, | ||
137 | const struct net_device *in, | ||
138 | const struct net_device *out, | ||
139 | int (*okfn)(struct sk_buff *)) | ||
140 | { | ||
141 | struct nf_conn *ct; | ||
142 | enum ip_conntrack_info ctinfo; | ||
143 | |||
144 | /* This is where we call the helper: as the packet goes out. */ | ||
145 | ct = nf_ct_get(*pskb, &ctinfo); | ||
146 | if (ct && ct->helper) { | ||
147 | unsigned int ret; | ||
148 | ret = ct->helper->help(pskb, | ||
149 | (*pskb)->nh.raw - (*pskb)->data | ||
150 | + (*pskb)->nh.iph->ihl*4, | ||
151 | ct, ctinfo); | ||
152 | if (ret != NF_ACCEPT) | ||
153 | return ret; | ||
154 | } | ||
155 | return NF_ACCEPT; | ||
156 | } | ||
157 | |||
158 | static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, | ||
159 | struct sk_buff **pskb, | ||
160 | const struct net_device *in, | ||
161 | const struct net_device *out, | ||
162 | int (*okfn)(struct sk_buff *)) | ||
163 | { | ||
164 | #if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE) | ||
165 | /* Previously seen (loopback)? Ignore. Do this before | ||
166 | fragment check. */ | ||
167 | if ((*pskb)->nfct) | ||
168 | return NF_ACCEPT; | ||
169 | #endif | ||
170 | |||
171 | /* Gather fragments. */ | ||
172 | if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { | ||
173 | *pskb = nf_ct_ipv4_gather_frags(*pskb, | ||
174 | hooknum == NF_IP_PRE_ROUTING ? | ||
175 | IP_DEFRAG_CONNTRACK_IN : | ||
176 | IP_DEFRAG_CONNTRACK_OUT); | ||
177 | if (!*pskb) | ||
178 | return NF_STOLEN; | ||
179 | } | ||
180 | return NF_ACCEPT; | ||
181 | } | ||
182 | |||
183 | static unsigned int ipv4_refrag(unsigned int hooknum, | ||
184 | struct sk_buff **pskb, | ||
185 | const struct net_device *in, | ||
186 | const struct net_device *out, | ||
187 | int (*okfn)(struct sk_buff *)) | ||
188 | { | ||
189 | struct rtable *rt = (struct rtable *)(*pskb)->dst; | ||
190 | |||
191 | /* We've seen it coming out the other side: confirm */ | ||
192 | if (ipv4_confirm(hooknum, pskb, in, out, okfn) != NF_ACCEPT) | ||
193 | return NF_DROP; | ||
194 | |||
195 | /* Local packets are never produced too large for their | ||
196 | interface. We degfragment them at LOCAL_OUT, however, | ||
197 | so we have to refragment them here. */ | ||
198 | if ((*pskb)->len > dst_mtu(&rt->u.dst) && | ||
199 | !skb_shinfo(*pskb)->tso_size) { | ||
200 | /* No hook can be after us, so this should be OK. */ | ||
201 | ip_fragment(*pskb, okfn); | ||
202 | return NF_STOLEN; | ||
203 | } | ||
204 | return NF_ACCEPT; | ||
205 | } | ||
206 | |||
207 | static unsigned int ipv4_conntrack_in(unsigned int hooknum, | ||
208 | struct sk_buff **pskb, | ||
209 | const struct net_device *in, | ||
210 | const struct net_device *out, | ||
211 | int (*okfn)(struct sk_buff *)) | ||
212 | { | ||
213 | return nf_conntrack_in(PF_INET, hooknum, pskb); | ||
214 | } | ||
215 | |||
216 | static unsigned int ipv4_conntrack_local(unsigned int hooknum, | ||
217 | struct sk_buff **pskb, | ||
218 | const struct net_device *in, | ||
219 | const struct net_device *out, | ||
220 | int (*okfn)(struct sk_buff *)) | ||
221 | { | ||
222 | /* root is playing with raw sockets. */ | ||
223 | if ((*pskb)->len < sizeof(struct iphdr) | ||
224 | || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { | ||
225 | if (net_ratelimit()) | ||
226 | printk("ipt_hook: happy cracking.\n"); | ||
227 | return NF_ACCEPT; | ||
228 | } | ||
229 | return nf_conntrack_in(PF_INET, hooknum, pskb); | ||
230 | } | ||
231 | |||
232 | /* Connection tracking may drop packets, but never alters them, so | ||
233 | make it the first hook. */ | ||
234 | static struct nf_hook_ops ipv4_conntrack_defrag_ops = { | ||
235 | .hook = ipv4_conntrack_defrag, | ||
236 | .owner = THIS_MODULE, | ||
237 | .pf = PF_INET, | ||
238 | .hooknum = NF_IP_PRE_ROUTING, | ||
239 | .priority = NF_IP_PRI_CONNTRACK_DEFRAG, | ||
240 | }; | ||
241 | |||
242 | static struct nf_hook_ops ipv4_conntrack_in_ops = { | ||
243 | .hook = ipv4_conntrack_in, | ||
244 | .owner = THIS_MODULE, | ||
245 | .pf = PF_INET, | ||
246 | .hooknum = NF_IP_PRE_ROUTING, | ||
247 | .priority = NF_IP_PRI_CONNTRACK, | ||
248 | }; | ||
249 | |||
250 | static struct nf_hook_ops ipv4_conntrack_defrag_local_out_ops = { | ||
251 | .hook = ipv4_conntrack_defrag, | ||
252 | .owner = THIS_MODULE, | ||
253 | .pf = PF_INET, | ||
254 | .hooknum = NF_IP_LOCAL_OUT, | ||
255 | .priority = NF_IP_PRI_CONNTRACK_DEFRAG, | ||
256 | }; | ||
257 | |||
258 | static struct nf_hook_ops ipv4_conntrack_local_out_ops = { | ||
259 | .hook = ipv4_conntrack_local, | ||
260 | .owner = THIS_MODULE, | ||
261 | .pf = PF_INET, | ||
262 | .hooknum = NF_IP_LOCAL_OUT, | ||
263 | .priority = NF_IP_PRI_CONNTRACK, | ||
264 | }; | ||
265 | |||
266 | /* helpers */ | ||
267 | static struct nf_hook_ops ipv4_conntrack_helper_out_ops = { | ||
268 | .hook = ipv4_conntrack_help, | ||
269 | .owner = THIS_MODULE, | ||
270 | .pf = PF_INET, | ||
271 | .hooknum = NF_IP_POST_ROUTING, | ||
272 | .priority = NF_IP_PRI_CONNTRACK_HELPER, | ||
273 | }; | ||
274 | |||
275 | static struct nf_hook_ops ipv4_conntrack_helper_in_ops = { | ||
276 | .hook = ipv4_conntrack_help, | ||
277 | .owner = THIS_MODULE, | ||
278 | .pf = PF_INET, | ||
279 | .hooknum = NF_IP_LOCAL_IN, | ||
280 | .priority = NF_IP_PRI_CONNTRACK_HELPER, | ||
281 | }; | ||
282 | |||
283 | |||
284 | /* Refragmenter; last chance. */ | ||
285 | static struct nf_hook_ops ipv4_conntrack_out_ops = { | ||
286 | .hook = ipv4_refrag, | ||
287 | .owner = THIS_MODULE, | ||
288 | .pf = PF_INET, | ||
289 | .hooknum = NF_IP_POST_ROUTING, | ||
290 | .priority = NF_IP_PRI_CONNTRACK_CONFIRM, | ||
291 | }; | ||
292 | |||
293 | static struct nf_hook_ops ipv4_conntrack_local_in_ops = { | ||
294 | .hook = ipv4_confirm, | ||
295 | .owner = THIS_MODULE, | ||
296 | .pf = PF_INET, | ||
297 | .hooknum = NF_IP_LOCAL_IN, | ||
298 | .priority = NF_IP_PRI_CONNTRACK_CONFIRM, | ||
299 | }; | ||
300 | |||
301 | #ifdef CONFIG_SYSCTL | ||
302 | /* From nf_conntrack_proto_icmp.c */ | ||
303 | extern unsigned long nf_ct_icmp_timeout; | ||
304 | static struct ctl_table_header *nf_ct_ipv4_sysctl_header; | ||
305 | |||
306 | static ctl_table nf_ct_sysctl_table[] = { | ||
307 | { | ||
308 | .ctl_name = NET_NF_CONNTRACK_ICMP_TIMEOUT, | ||
309 | .procname = "nf_conntrack_icmp_timeout", | ||
310 | .data = &nf_ct_icmp_timeout, | ||
311 | .maxlen = sizeof(unsigned int), | ||
312 | .mode = 0644, | ||
313 | .proc_handler = &proc_dointvec_jiffies, | ||
314 | }, | ||
315 | { .ctl_name = 0 } | ||
316 | }; | ||
317 | |||
318 | static ctl_table nf_ct_netfilter_table[] = { | ||
319 | { | ||
320 | .ctl_name = NET_NETFILTER, | ||
321 | .procname = "netfilter", | ||
322 | .mode = 0555, | ||
323 | .child = nf_ct_sysctl_table, | ||
324 | }, | ||
325 | { .ctl_name = 0 } | ||
326 | }; | ||
327 | |||
328 | static ctl_table nf_ct_net_table[] = { | ||
329 | { | ||
330 | .ctl_name = CTL_NET, | ||
331 | .procname = "net", | ||
332 | .mode = 0555, | ||
333 | .child = nf_ct_netfilter_table, | ||
334 | }, | ||
335 | { .ctl_name = 0 } | ||
336 | }; | ||
337 | #endif | ||
338 | |||
339 | /* Fast function for those who don't want to parse /proc (and I don't | ||
340 | blame them). */ | ||
341 | /* Reversing the socket's dst/src point of view gives us the reply | ||
342 | mapping. */ | ||
343 | static int | ||
344 | getorigdst(struct sock *sk, int optval, void __user *user, int *len) | ||
345 | { | ||
346 | struct inet_sock *inet = inet_sk(sk); | ||
347 | struct nf_conntrack_tuple_hash *h; | ||
348 | struct nf_conntrack_tuple tuple; | ||
349 | |||
350 | NF_CT_TUPLE_U_BLANK(&tuple); | ||
351 | tuple.src.u3.ip = inet->rcv_saddr; | ||
352 | tuple.src.u.tcp.port = inet->sport; | ||
353 | tuple.dst.u3.ip = inet->daddr; | ||
354 | tuple.dst.u.tcp.port = inet->dport; | ||
355 | tuple.src.l3num = PF_INET; | ||
356 | tuple.dst.protonum = IPPROTO_TCP; | ||
357 | |||
358 | /* We only do TCP at the moment: is there a better way? */ | ||
359 | if (strcmp(sk->sk_prot->name, "TCP")) { | ||
360 | DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n"); | ||
361 | return -ENOPROTOOPT; | ||
362 | } | ||
363 | |||
364 | if ((unsigned int) *len < sizeof(struct sockaddr_in)) { | ||
365 | DEBUGP("SO_ORIGINAL_DST: len %u not %u\n", | ||
366 | *len, sizeof(struct sockaddr_in)); | ||
367 | return -EINVAL; | ||
368 | } | ||
369 | |||
370 | h = nf_conntrack_find_get(&tuple, NULL); | ||
371 | if (h) { | ||
372 | struct sockaddr_in sin; | ||
373 | struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); | ||
374 | |||
375 | sin.sin_family = AF_INET; | ||
376 | sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] | ||
377 | .tuple.dst.u.tcp.port; | ||
378 | sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] | ||
379 | .tuple.dst.u3.ip; | ||
380 | |||
381 | DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n", | ||
382 | NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); | ||
383 | nf_ct_put(ct); | ||
384 | if (copy_to_user(user, &sin, sizeof(sin)) != 0) | ||
385 | return -EFAULT; | ||
386 | else | ||
387 | return 0; | ||
388 | } | ||
389 | DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n", | ||
390 | NIPQUAD(tuple.src.u3.ip), ntohs(tuple.src.u.tcp.port), | ||
391 | NIPQUAD(tuple.dst.u3.ip), ntohs(tuple.dst.u.tcp.port)); | ||
392 | return -ENOENT; | ||
393 | } | ||
394 | |||
395 | static struct nf_sockopt_ops so_getorigdst = { | ||
396 | .pf = PF_INET, | ||
397 | .get_optmin = SO_ORIGINAL_DST, | ||
398 | .get_optmax = SO_ORIGINAL_DST+1, | ||
399 | .get = &getorigdst, | ||
400 | }; | ||
401 | |||
402 | struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { | ||
403 | .l3proto = PF_INET, | ||
404 | .name = "ipv4", | ||
405 | .pkt_to_tuple = ipv4_pkt_to_tuple, | ||
406 | .invert_tuple = ipv4_invert_tuple, | ||
407 | .print_tuple = ipv4_print_tuple, | ||
408 | .print_conntrack = ipv4_print_conntrack, | ||
409 | .prepare = ipv4_prepare, | ||
410 | .get_features = ipv4_get_features, | ||
411 | .me = THIS_MODULE, | ||
412 | }; | ||
413 | |||
414 | extern struct nf_conntrack_protocol nf_conntrack_protocol_tcp4; | ||
415 | extern struct nf_conntrack_protocol nf_conntrack_protocol_udp4; | ||
416 | extern struct nf_conntrack_protocol nf_conntrack_protocol_icmp; | ||
417 | static int init_or_cleanup(int init) | ||
418 | { | ||
419 | int ret = 0; | ||
420 | |||
421 | if (!init) goto cleanup; | ||
422 | |||
423 | ret = nf_register_sockopt(&so_getorigdst); | ||
424 | if (ret < 0) { | ||
425 | printk(KERN_ERR "Unable to register netfilter socket option\n"); | ||
426 | goto cleanup_nothing; | ||
427 | } | ||
428 | |||
429 | ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_tcp4); | ||
430 | if (ret < 0) { | ||
431 | printk("nf_conntrack_ipv4: can't register tcp.\n"); | ||
432 | goto cleanup_sockopt; | ||
433 | } | ||
434 | |||
435 | ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_udp4); | ||
436 | if (ret < 0) { | ||
437 | printk("nf_conntrack_ipv4: can't register udp.\n"); | ||
438 | goto cleanup_tcp; | ||
439 | } | ||
440 | |||
441 | ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_icmp); | ||
442 | if (ret < 0) { | ||
443 | printk("nf_conntrack_ipv4: can't register icmp.\n"); | ||
444 | goto cleanup_udp; | ||
445 | } | ||
446 | |||
447 | ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4); | ||
448 | if (ret < 0) { | ||
449 | printk("nf_conntrack_ipv4: can't register ipv4\n"); | ||
450 | goto cleanup_icmp; | ||
451 | } | ||
452 | |||
453 | ret = nf_register_hook(&ipv4_conntrack_defrag_ops); | ||
454 | if (ret < 0) { | ||
455 | printk("nf_conntrack_ipv4: can't register pre-routing defrag hook.\n"); | ||
456 | goto cleanup_ipv4; | ||
457 | } | ||
458 | ret = nf_register_hook(&ipv4_conntrack_defrag_local_out_ops); | ||
459 | if (ret < 0) { | ||
460 | printk("nf_conntrack_ipv4: can't register local_out defrag hook.\n"); | ||
461 | goto cleanup_defragops; | ||
462 | } | ||
463 | |||
464 | ret = nf_register_hook(&ipv4_conntrack_in_ops); | ||
465 | if (ret < 0) { | ||
466 | printk("nf_conntrack_ipv4: can't register pre-routing hook.\n"); | ||
467 | goto cleanup_defraglocalops; | ||
468 | } | ||
469 | |||
470 | ret = nf_register_hook(&ipv4_conntrack_local_out_ops); | ||
471 | if (ret < 0) { | ||
472 | printk("nf_conntrack_ipv4: can't register local out hook.\n"); | ||
473 | goto cleanup_inops; | ||
474 | } | ||
475 | |||
476 | ret = nf_register_hook(&ipv4_conntrack_helper_in_ops); | ||
477 | if (ret < 0) { | ||
478 | printk("nf_conntrack_ipv4: can't register local helper hook.\n"); | ||
479 | goto cleanup_inandlocalops; | ||
480 | } | ||
481 | |||
482 | ret = nf_register_hook(&ipv4_conntrack_helper_out_ops); | ||
483 | if (ret < 0) { | ||
484 | printk("nf_conntrack_ipv4: can't register postrouting helper hook.\n"); | ||
485 | goto cleanup_helperinops; | ||
486 | } | ||
487 | |||
488 | ret = nf_register_hook(&ipv4_conntrack_out_ops); | ||
489 | if (ret < 0) { | ||
490 | printk("nf_conntrack_ipv4: can't register post-routing hook.\n"); | ||
491 | goto cleanup_helperoutops; | ||
492 | } | ||
493 | |||
494 | ret = nf_register_hook(&ipv4_conntrack_local_in_ops); | ||
495 | if (ret < 0) { | ||
496 | printk("nf_conntrack_ipv4: can't register local in hook.\n"); | ||
497 | goto cleanup_inoutandlocalops; | ||
498 | } | ||
499 | |||
500 | #ifdef CONFIG_SYSCTL | ||
501 | nf_ct_ipv4_sysctl_header = register_sysctl_table(nf_ct_net_table, 0); | ||
502 | if (nf_ct_ipv4_sysctl_header == NULL) { | ||
503 | printk("nf_conntrack: can't register to sysctl.\n"); | ||
504 | ret = -ENOMEM; | ||
505 | goto cleanup_localinops; | ||
506 | } | ||
507 | #endif | ||
508 | |||
509 | /* For use by REJECT target */ | ||
510 | ip_ct_attach = __nf_conntrack_attach; | ||
511 | |||
512 | return ret; | ||
513 | |||
514 | cleanup: | ||
515 | synchronize_net(); | ||
516 | ip_ct_attach = NULL; | ||
517 | #ifdef CONFIG_SYSCTL | ||
518 | unregister_sysctl_table(nf_ct_ipv4_sysctl_header); | ||
519 | cleanup_localinops: | ||
520 | #endif | ||
521 | nf_unregister_hook(&ipv4_conntrack_local_in_ops); | ||
522 | cleanup_inoutandlocalops: | ||
523 | nf_unregister_hook(&ipv4_conntrack_out_ops); | ||
524 | cleanup_helperoutops: | ||
525 | nf_unregister_hook(&ipv4_conntrack_helper_out_ops); | ||
526 | cleanup_helperinops: | ||
527 | nf_unregister_hook(&ipv4_conntrack_helper_in_ops); | ||
528 | cleanup_inandlocalops: | ||
529 | nf_unregister_hook(&ipv4_conntrack_local_out_ops); | ||
530 | cleanup_inops: | ||
531 | nf_unregister_hook(&ipv4_conntrack_in_ops); | ||
532 | cleanup_defraglocalops: | ||
533 | nf_unregister_hook(&ipv4_conntrack_defrag_local_out_ops); | ||
534 | cleanup_defragops: | ||
535 | nf_unregister_hook(&ipv4_conntrack_defrag_ops); | ||
536 | cleanup_ipv4: | ||
537 | nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); | ||
538 | cleanup_icmp: | ||
539 | nf_conntrack_protocol_unregister(&nf_conntrack_protocol_icmp); | ||
540 | cleanup_udp: | ||
541 | nf_conntrack_protocol_unregister(&nf_conntrack_protocol_udp4); | ||
542 | cleanup_tcp: | ||
543 | nf_conntrack_protocol_unregister(&nf_conntrack_protocol_tcp4); | ||
544 | cleanup_sockopt: | ||
545 | nf_unregister_sockopt(&so_getorigdst); | ||
546 | cleanup_nothing: | ||
547 | return ret; | ||
548 | } | ||
549 | |||
550 | MODULE_LICENSE("GPL"); | ||
551 | |||
552 | static int __init init(void) | ||
553 | { | ||
554 | need_nf_conntrack(); | ||
555 | return init_or_cleanup(1); | ||
556 | } | ||
557 | |||
558 | static void __exit fini(void) | ||
559 | { | ||
560 | init_or_cleanup(0); | ||
561 | } | ||
562 | |||
563 | module_init(init); | ||
564 | module_exit(fini); | ||
565 | |||
566 | void need_ip_conntrack(void) | ||
567 | { | ||
568 | } | ||
569 | |||
570 | EXPORT_SYMBOL(need_ip_conntrack); | ||
571 | EXPORT_SYMBOL(nf_ct_ipv4_gather_frags); | ||
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c new file mode 100644 index 000000000000..7ddb5c08f7b8 --- /dev/null +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c | |||
@@ -0,0 +1,301 @@ | |||
1 | /* (C) 1999-2001 Paul `Rusty' Russell | ||
2 | * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> | ||
9 | * - enable working with Layer 3 protocol independent connection tracking. | ||
10 | * | ||
11 | * Derived from net/ipv4/netfilter/ip_conntrack_proto_icmp.c | ||
12 | */ | ||
13 | |||
14 | #include <linux/types.h> | ||
15 | #include <linux/sched.h> | ||
16 | #include <linux/timer.h> | ||
17 | #include <linux/netfilter.h> | ||
18 | #include <linux/in.h> | ||
19 | #include <linux/icmp.h> | ||
20 | #include <linux/seq_file.h> | ||
21 | #include <net/ip.h> | ||
22 | #include <net/checksum.h> | ||
23 | #include <linux/netfilter_ipv4.h> | ||
24 | #include <net/netfilter/nf_conntrack_tuple.h> | ||
25 | #include <net/netfilter/nf_conntrack_protocol.h> | ||
26 | #include <net/netfilter/nf_conntrack_core.h> | ||
27 | |||
28 | unsigned long nf_ct_icmp_timeout = 30*HZ; | ||
29 | |||
30 | #if 0 | ||
31 | #define DEBUGP printk | ||
32 | #else | ||
33 | #define DEBUGP(format, args...) | ||
34 | #endif | ||
35 | |||
36 | static int icmp_pkt_to_tuple(const struct sk_buff *skb, | ||
37 | unsigned int dataoff, | ||
38 | struct nf_conntrack_tuple *tuple) | ||
39 | { | ||
40 | struct icmphdr _hdr, *hp; | ||
41 | |||
42 | hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); | ||
43 | if (hp == NULL) | ||
44 | return 0; | ||
45 | |||
46 | tuple->dst.u.icmp.type = hp->type; | ||
47 | tuple->src.u.icmp.id = hp->un.echo.id; | ||
48 | tuple->dst.u.icmp.code = hp->code; | ||
49 | |||
50 | return 1; | ||
51 | } | ||
52 | |||
53 | static int icmp_invert_tuple(struct nf_conntrack_tuple *tuple, | ||
54 | const struct nf_conntrack_tuple *orig) | ||
55 | { | ||
56 | /* Add 1; spaces filled with 0. */ | ||
57 | static u_int8_t invmap[] | ||
58 | = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1, | ||
59 | [ICMP_ECHOREPLY] = ICMP_ECHO + 1, | ||
60 | [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, | ||
61 | [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, | ||
62 | [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, | ||
63 | [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, | ||
64 | [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, | ||
65 | [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1}; | ||
66 | |||
67 | if (orig->dst.u.icmp.type >= sizeof(invmap) | ||
68 | || !invmap[orig->dst.u.icmp.type]) | ||
69 | return 0; | ||
70 | |||
71 | tuple->src.u.icmp.id = orig->src.u.icmp.id; | ||
72 | tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; | ||
73 | tuple->dst.u.icmp.code = orig->dst.u.icmp.code; | ||
74 | return 1; | ||
75 | } | ||
76 | |||
77 | /* Print out the per-protocol part of the tuple. */ | ||
78 | static int icmp_print_tuple(struct seq_file *s, | ||
79 | const struct nf_conntrack_tuple *tuple) | ||
80 | { | ||
81 | return seq_printf(s, "type=%u code=%u id=%u ", | ||
82 | tuple->dst.u.icmp.type, | ||
83 | tuple->dst.u.icmp.code, | ||
84 | ntohs(tuple->src.u.icmp.id)); | ||
85 | } | ||
86 | |||
87 | /* Print out the private part of the conntrack. */ | ||
88 | static int icmp_print_conntrack(struct seq_file *s, | ||
89 | const struct nf_conn *conntrack) | ||
90 | { | ||
91 | return 0; | ||
92 | } | ||
93 | |||
94 | /* Returns verdict for packet, or -1 for invalid. */ | ||
95 | static int icmp_packet(struct nf_conn *ct, | ||
96 | const struct sk_buff *skb, | ||
97 | unsigned int dataoff, | ||
98 | enum ip_conntrack_info ctinfo, | ||
99 | int pf, | ||
100 | unsigned int hooknum) | ||
101 | { | ||
102 | /* Try to delete connection immediately after all replies: | ||
103 | won't actually vanish as we still have skb, and del_timer | ||
104 | means this will only run once even if count hits zero twice | ||
105 | (theoretically possible with SMP) */ | ||
106 | if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { | ||
107 | if (atomic_dec_and_test(&ct->proto.icmp.count) | ||
108 | && del_timer(&ct->timeout)) | ||
109 | ct->timeout.function((unsigned long)ct); | ||
110 | } else { | ||
111 | atomic_inc(&ct->proto.icmp.count); | ||
112 | nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); | ||
113 | nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout); | ||
114 | } | ||
115 | |||
116 | return NF_ACCEPT; | ||
117 | } | ||
118 | |||
119 | /* Called when a new connection for this protocol found. */ | ||
120 | static int icmp_new(struct nf_conn *conntrack, | ||
121 | const struct sk_buff *skb, unsigned int dataoff) | ||
122 | { | ||
123 | static u_int8_t valid_new[] | ||
124 | = { [ICMP_ECHO] = 1, | ||
125 | [ICMP_TIMESTAMP] = 1, | ||
126 | [ICMP_INFO_REQUEST] = 1, | ||
127 | [ICMP_ADDRESS] = 1 }; | ||
128 | |||
129 | if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) | ||
130 | || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { | ||
131 | /* Can't create a new ICMP `conn' with this. */ | ||
132 | DEBUGP("icmp: can't create new conn with type %u\n", | ||
133 | conntrack->tuplehash[0].tuple.dst.u.icmp.type); | ||
134 | NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple); | ||
135 | return 0; | ||
136 | } | ||
137 | atomic_set(&conntrack->proto.icmp.count, 0); | ||
138 | return 1; | ||
139 | } | ||
140 | |||
141 | extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; | ||
142 | /* Returns conntrack if it dealt with ICMP, and filled in skb fields */ | ||
143 | static int | ||
144 | icmp_error_message(struct sk_buff *skb, | ||
145 | enum ip_conntrack_info *ctinfo, | ||
146 | unsigned int hooknum) | ||
147 | { | ||
148 | struct nf_conntrack_tuple innertuple, origtuple; | ||
149 | struct { | ||
150 | struct icmphdr icmp; | ||
151 | struct iphdr ip; | ||
152 | } _in, *inside; | ||
153 | struct nf_conntrack_protocol *innerproto; | ||
154 | struct nf_conntrack_tuple_hash *h; | ||
155 | int dataoff; | ||
156 | |||
157 | NF_CT_ASSERT(skb->nfct == NULL); | ||
158 | |||
159 | /* Not enough header? */ | ||
160 | inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in); | ||
161 | if (inside == NULL) | ||
162 | return -NF_ACCEPT; | ||
163 | |||
164 | /* Ignore ICMP's containing fragments (shouldn't happen) */ | ||
165 | if (inside->ip.frag_off & htons(IP_OFFSET)) { | ||
166 | DEBUGP("icmp_error_message: fragment of proto %u\n", | ||
167 | inside->ip.protocol); | ||
168 | return -NF_ACCEPT; | ||
169 | } | ||
170 | |||
171 | innerproto = nf_ct_find_proto(PF_INET, inside->ip.protocol); | ||
172 | dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp); | ||
173 | /* Are they talking about one of our connections? */ | ||
174 | if (!nf_ct_get_tuple(skb, dataoff, dataoff + inside->ip.ihl*4, PF_INET, | ||
175 | inside->ip.protocol, &origtuple, | ||
176 | &nf_conntrack_l3proto_ipv4, innerproto)) { | ||
177 | DEBUGP("icmp_error_message: ! get_tuple p=%u", | ||
178 | inside->ip.protocol); | ||
179 | return -NF_ACCEPT; | ||
180 | } | ||
181 | |||
182 | /* Ordinarily, we'd expect the inverted tupleproto, but it's | ||
183 | been preserved inside the ICMP. */ | ||
184 | if (!nf_ct_invert_tuple(&innertuple, &origtuple, | ||
185 | &nf_conntrack_l3proto_ipv4, innerproto)) { | ||
186 | DEBUGP("icmp_error_message: no match\n"); | ||
187 | return -NF_ACCEPT; | ||
188 | } | ||
189 | |||
190 | *ctinfo = IP_CT_RELATED; | ||
191 | |||
192 | h = nf_conntrack_find_get(&innertuple, NULL); | ||
193 | if (!h) { | ||
194 | /* Locally generated ICMPs will match inverted if they | ||
195 | haven't been SNAT'ed yet */ | ||
196 | /* FIXME: NAT code has to handle half-done double NAT --RR */ | ||
197 | if (hooknum == NF_IP_LOCAL_OUT) | ||
198 | h = nf_conntrack_find_get(&origtuple, NULL); | ||
199 | |||
200 | if (!h) { | ||
201 | DEBUGP("icmp_error_message: no match\n"); | ||
202 | return -NF_ACCEPT; | ||
203 | } | ||
204 | |||
205 | /* Reverse direction from that found */ | ||
206 | if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) | ||
207 | *ctinfo += IP_CT_IS_REPLY; | ||
208 | } else { | ||
209 | if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) | ||
210 | *ctinfo += IP_CT_IS_REPLY; | ||
211 | } | ||
212 | |||
213 | /* Update skb to refer to this connection */ | ||
214 | skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; | ||
215 | skb->nfctinfo = *ctinfo; | ||
216 | return -NF_ACCEPT; | ||
217 | } | ||
218 | |||
219 | /* Small and modified version of icmp_rcv */ | ||
220 | static int | ||
221 | icmp_error(struct sk_buff *skb, unsigned int dataoff, | ||
222 | enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum) | ||
223 | { | ||
224 | struct icmphdr _ih, *icmph; | ||
225 | |||
226 | /* Not enough header? */ | ||
227 | icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); | ||
228 | if (icmph == NULL) { | ||
229 | if (LOG_INVALID(IPPROTO_ICMP)) | ||
230 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, | ||
231 | "nf_ct_icmp: short packet "); | ||
232 | return -NF_ACCEPT; | ||
233 | } | ||
234 | |||
235 | /* See ip_conntrack_proto_tcp.c */ | ||
236 | if (hooknum != NF_IP_PRE_ROUTING) | ||
237 | goto checksum_skipped; | ||
238 | |||
239 | switch (skb->ip_summed) { | ||
240 | case CHECKSUM_HW: | ||
241 | if (!(u16)csum_fold(skb->csum)) | ||
242 | break; | ||
243 | if (LOG_INVALID(IPPROTO_ICMP)) | ||
244 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, | ||
245 | "nf_ct_icmp: bad HW ICMP checksum "); | ||
246 | return -NF_ACCEPT; | ||
247 | case CHECKSUM_NONE: | ||
248 | if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { | ||
249 | if (LOG_INVALID(IPPROTO_ICMP)) | ||
250 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | ||
251 | NULL, | ||
252 | "nf_ct_icmp: bad ICMP checksum "); | ||
253 | return -NF_ACCEPT; | ||
254 | } | ||
255 | default: | ||
256 | break; | ||
257 | } | ||
258 | |||
259 | checksum_skipped: | ||
260 | /* | ||
261 | * 18 is the highest 'known' ICMP type. Anything else is a mystery | ||
262 | * | ||
263 | * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently | ||
264 | * discarded. | ||
265 | */ | ||
266 | if (icmph->type > NR_ICMP_TYPES) { | ||
267 | if (LOG_INVALID(IPPROTO_ICMP)) | ||
268 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, | ||
269 | "nf_ct_icmp: invalid ICMP type "); | ||
270 | return -NF_ACCEPT; | ||
271 | } | ||
272 | |||
273 | /* Need to track icmp error message? */ | ||
274 | if (icmph->type != ICMP_DEST_UNREACH | ||
275 | && icmph->type != ICMP_SOURCE_QUENCH | ||
276 | && icmph->type != ICMP_TIME_EXCEEDED | ||
277 | && icmph->type != ICMP_PARAMETERPROB | ||
278 | && icmph->type != ICMP_REDIRECT) | ||
279 | return NF_ACCEPT; | ||
280 | |||
281 | return icmp_error_message(skb, ctinfo, hooknum); | ||
282 | } | ||
283 | |||
284 | struct nf_conntrack_protocol nf_conntrack_protocol_icmp = | ||
285 | { | ||
286 | .list = { NULL, NULL }, | ||
287 | .l3proto = PF_INET, | ||
288 | .proto = IPPROTO_ICMP, | ||
289 | .name = "icmp", | ||
290 | .pkt_to_tuple = icmp_pkt_to_tuple, | ||
291 | .invert_tuple = icmp_invert_tuple, | ||
292 | .print_tuple = icmp_print_tuple, | ||
293 | .print_conntrack = icmp_print_conntrack, | ||
294 | .packet = icmp_packet, | ||
295 | .new = icmp_new, | ||
296 | .error = icmp_error, | ||
297 | .destroy = NULL, | ||
298 | .me = NULL | ||
299 | }; | ||
300 | |||
301 | EXPORT_SYMBOL(nf_conntrack_protocol_icmp); | ||
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 652685623519..01444a02b48b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -645,6 +645,14 @@ ctl_table ipv4_table[] = { | |||
645 | .proc_handler = &proc_tcp_congestion_control, | 645 | .proc_handler = &proc_tcp_congestion_control, |
646 | .strategy = &sysctl_tcp_congestion_control, | 646 | .strategy = &sysctl_tcp_congestion_control, |
647 | }, | 647 | }, |
648 | { | ||
649 | .ctl_name = NET_TCP_ABC, | ||
650 | .procname = "tcp_abc", | ||
651 | .data = &sysctl_tcp_abc, | ||
652 | .maxlen = sizeof(int), | ||
653 | .mode = 0644, | ||
654 | .proc_handler = &proc_dointvec, | ||
655 | }, | ||
648 | 656 | ||
649 | { .ctl_name = 0 } | 657 | { .ctl_name = 0 } |
650 | }; | 658 | }; |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 72b7c22e1ea5..9ac7a4f46bd8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -1640,7 +1640,7 @@ int tcp_disconnect(struct sock *sk, int flags) | |||
1640 | } else if (tcp_need_reset(old_state) || | 1640 | } else if (tcp_need_reset(old_state) || |
1641 | (tp->snd_nxt != tp->write_seq && | 1641 | (tp->snd_nxt != tp->write_seq && |
1642 | (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { | 1642 | (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { |
1643 | /* The last check adjusts for discrepance of Linux wrt. RFC | 1643 | /* The last check adjusts for discrepancy of Linux wrt. RFC |
1644 | * states | 1644 | * states |
1645 | */ | 1645 | */ |
1646 | tcp_send_active_reset(sk, gfp_any()); | 1646 | tcp_send_active_reset(sk, gfp_any()); |
@@ -1669,6 +1669,7 @@ int tcp_disconnect(struct sock *sk, int flags) | |||
1669 | tp->packets_out = 0; | 1669 | tp->packets_out = 0; |
1670 | tp->snd_ssthresh = 0x7fffffff; | 1670 | tp->snd_ssthresh = 0x7fffffff; |
1671 | tp->snd_cwnd_cnt = 0; | 1671 | tp->snd_cwnd_cnt = 0; |
1672 | tp->bytes_acked = 0; | ||
1672 | tcp_set_ca_state(sk, TCP_CA_Open); | 1673 | tcp_set_ca_state(sk, TCP_CA_Open); |
1673 | tcp_clear_retrans(tp); | 1674 | tcp_clear_retrans(tp); |
1674 | inet_csk_delack_init(sk); | 1675 | inet_csk_delack_init(sk); |
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index ae35e0609047..1d0cd86621b1 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c | |||
@@ -217,17 +217,15 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, | |||
217 | 217 | ||
218 | bictcp_low_utilization(sk, data_acked); | 218 | bictcp_low_utilization(sk, data_acked); |
219 | 219 | ||
220 | if (in_flight < tp->snd_cwnd) | 220 | if (!tcp_is_cwnd_limited(sk, in_flight)) |
221 | return; | 221 | return; |
222 | 222 | ||
223 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | 223 | if (tp->snd_cwnd <= tp->snd_ssthresh) |
224 | /* In "safe" area, increase. */ | 224 | tcp_slow_start(tp); |
225 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | 225 | else { |
226 | tp->snd_cwnd++; | ||
227 | } else { | ||
228 | bictcp_update(ca, tp->snd_cwnd); | 226 | bictcp_update(ca, tp->snd_cwnd); |
229 | 227 | ||
230 | /* In dangerous area, increase slowly. | 228 | /* In dangerous area, increase slowly. |
231 | * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd | 229 | * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd |
232 | */ | 230 | */ |
233 | if (tp->snd_cwnd_cnt >= ca->cnt) { | 231 | if (tp->snd_cwnd_cnt >= ca->cnt) { |
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index bbf2d6624e89..c7cc62c8dc12 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c | |||
@@ -186,24 +186,32 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, | |||
186 | { | 186 | { |
187 | struct tcp_sock *tp = tcp_sk(sk); | 187 | struct tcp_sock *tp = tcp_sk(sk); |
188 | 188 | ||
189 | if (in_flight < tp->snd_cwnd) | 189 | if (!tcp_is_cwnd_limited(sk, in_flight)) |
190 | return; | 190 | return; |
191 | 191 | ||
192 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | 192 | /* In "safe" area, increase. */ |
193 | /* In "safe" area, increase. */ | 193 | if (tp->snd_cwnd <= tp->snd_ssthresh) |
194 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | 194 | tcp_slow_start(tp); |
195 | tp->snd_cwnd++; | 195 | |
196 | } else { | 196 | /* In dangerous area, increase slowly. */ |
197 | /* In dangerous area, increase slowly. | 197 | else if (sysctl_tcp_abc) { |
198 | * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd | 198 | /* RFC3465: Apppriate Byte Count |
199 | */ | 199 | * increase once for each full cwnd acked |
200 | if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { | 200 | */ |
201 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | 201 | if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) { |
202 | tp->snd_cwnd++; | 202 | tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache; |
203 | tp->snd_cwnd_cnt = 0; | 203 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) |
204 | } else | 204 | tp->snd_cwnd++; |
205 | tp->snd_cwnd_cnt++; | 205 | } |
206 | } | 206 | } else { |
207 | /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ | ||
208 | if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { | ||
209 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
210 | tp->snd_cwnd++; | ||
211 | tp->snd_cwnd_cnt = 0; | ||
212 | } else | ||
213 | tp->snd_cwnd_cnt++; | ||
214 | } | ||
207 | } | 215 | } |
208 | EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); | 216 | EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); |
209 | 217 | ||
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 6acc04bde080..82b3c189bd7d 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c | |||
@@ -111,18 +111,17 @@ static void hstcp_init(struct sock *sk) | |||
111 | } | 111 | } |
112 | 112 | ||
113 | static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, | 113 | static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, |
114 | u32 in_flight, int good) | 114 | u32 in_flight, u32 pkts_acked) |
115 | { | 115 | { |
116 | struct tcp_sock *tp = tcp_sk(sk); | 116 | struct tcp_sock *tp = tcp_sk(sk); |
117 | struct hstcp *ca = inet_csk_ca(sk); | 117 | struct hstcp *ca = inet_csk_ca(sk); |
118 | 118 | ||
119 | if (in_flight < tp->snd_cwnd) | 119 | if (!tcp_is_cwnd_limited(sk, in_flight)) |
120 | return; | 120 | return; |
121 | 121 | ||
122 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | 122 | if (tp->snd_cwnd <= tp->snd_ssthresh) |
123 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | 123 | tcp_slow_start(tp); |
124 | tp->snd_cwnd++; | 124 | else { |
125 | } else { | ||
126 | /* Update AIMD parameters */ | 125 | /* Update AIMD parameters */ |
127 | if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { | 126 | if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { |
128 | while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && | 127 | while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && |
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index e47b37984e95..3284cfb993e6 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c | |||
@@ -207,14 +207,13 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, | |||
207 | struct tcp_sock *tp = tcp_sk(sk); | 207 | struct tcp_sock *tp = tcp_sk(sk); |
208 | struct htcp *ca = inet_csk_ca(sk); | 208 | struct htcp *ca = inet_csk_ca(sk); |
209 | 209 | ||
210 | if (in_flight < tp->snd_cwnd) | 210 | if (!tcp_is_cwnd_limited(sk, in_flight)) |
211 | return; | 211 | return; |
212 | 212 | ||
213 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | 213 | if (tp->snd_cwnd <= tp->snd_ssthresh) |
214 | /* In "safe" area, increase. */ | 214 | tcp_slow_start(tp); |
215 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | 215 | else { |
216 | tp->snd_cwnd++; | 216 | |
217 | } else { | ||
218 | measure_rtt(sk); | 217 | measure_rtt(sk); |
219 | 218 | ||
220 | /* keep track of number of round-trip times since last backoff event */ | 219 | /* keep track of number of round-trip times since last backoff event */ |
@@ -224,7 +223,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, | |||
224 | htcp_alpha_update(ca); | 223 | htcp_alpha_update(ca); |
225 | } | 224 | } |
226 | 225 | ||
227 | /* In dangerous area, increase slowly. | 226 | /* In dangerous area, increase slowly. |
228 | * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd | 227 | * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd |
229 | */ | 228 | */ |
230 | if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) { | 229 | if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) { |
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 77add63623df..40dbb3877510 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c | |||
@@ -100,12 +100,12 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt, | |||
100 | ca->minrtt = tp->srtt; | 100 | ca->minrtt = tp->srtt; |
101 | } | 101 | } |
102 | 102 | ||
103 | if (!tcp_is_cwnd_limited(sk, in_flight)) | ||
104 | return; | ||
105 | |||
103 | if (!ca->hybla_en) | 106 | if (!ca->hybla_en) |
104 | return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); | 107 | return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); |
105 | 108 | ||
106 | if (in_flight < tp->snd_cwnd) | ||
107 | return; | ||
108 | |||
109 | if (ca->rho == 0) | 109 | if (ca->rho == 0) |
110 | hybla_recalc_param(sk); | 110 | hybla_recalc_param(sk); |
111 | 111 | ||
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3e98b57578dc..40a26b7157b4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -42,7 +42,7 @@ | |||
42 | * Andi Kleen : Moved open_request checking here | 42 | * Andi Kleen : Moved open_request checking here |
43 | * and process RSTs for open_requests. | 43 | * and process RSTs for open_requests. |
44 | * Andi Kleen : Better prune_queue, and other fixes. | 44 | * Andi Kleen : Better prune_queue, and other fixes. |
45 | * Andrey Savochkin: Fix RTT measurements in the presnce of | 45 | * Andrey Savochkin: Fix RTT measurements in the presence of |
46 | * timestamps. | 46 | * timestamps. |
47 | * Andrey Savochkin: Check sequence numbers correctly when | 47 | * Andrey Savochkin: Check sequence numbers correctly when |
48 | * removing SACKs due to in sequence incoming | 48 | * removing SACKs due to in sequence incoming |
@@ -89,6 +89,7 @@ int sysctl_tcp_frto; | |||
89 | int sysctl_tcp_nometrics_save; | 89 | int sysctl_tcp_nometrics_save; |
90 | 90 | ||
91 | int sysctl_tcp_moderate_rcvbuf = 1; | 91 | int sysctl_tcp_moderate_rcvbuf = 1; |
92 | int sysctl_tcp_abc = 1; | ||
92 | 93 | ||
93 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ | 94 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ |
94 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ | 95 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ |
@@ -223,7 +224,7 @@ static void tcp_fixup_sndbuf(struct sock *sk) | |||
223 | * of receiver window. Check #2. | 224 | * of receiver window. Check #2. |
224 | * | 225 | * |
225 | * The scheme does not work when sender sends good segments opening | 226 | * The scheme does not work when sender sends good segments opening |
226 | * window and then starts to feed us spagetti. But it should work | 227 | * window and then starts to feed us spaghetti. But it should work |
227 | * in common situations. Otherwise, we have to rely on queue collapsing. | 228 | * in common situations. Otherwise, we have to rely on queue collapsing. |
228 | */ | 229 | */ |
229 | 230 | ||
@@ -233,7 +234,7 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, | |||
233 | { | 234 | { |
234 | /* Optimize this! */ | 235 | /* Optimize this! */ |
235 | int truesize = tcp_win_from_space(skb->truesize)/2; | 236 | int truesize = tcp_win_from_space(skb->truesize)/2; |
236 | int window = tcp_full_space(sk)/2; | 237 | int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2; |
237 | 238 | ||
238 | while (tp->rcv_ssthresh <= window) { | 239 | while (tp->rcv_ssthresh <= window) { |
239 | if (truesize <= skb->len) | 240 | if (truesize <= skb->len) |
@@ -277,7 +278,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) | |||
277 | int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); | 278 | int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); |
278 | 279 | ||
279 | /* Try to select rcvbuf so that 4 mss-sized segments | 280 | /* Try to select rcvbuf so that 4 mss-sized segments |
280 | * will fit to window and correspoding skbs will fit to our rcvbuf. | 281 | * will fit to window and corresponding skbs will fit to our rcvbuf. |
281 | * (was 3; 4 is minimum to allow fast retransmit to work.) | 282 | * (was 3; 4 is minimum to allow fast retransmit to work.) |
282 | */ | 283 | */ |
283 | while (tcp_win_from_space(rcvmem) < tp->advmss) | 284 | while (tcp_win_from_space(rcvmem) < tp->advmss) |
@@ -286,7 +287,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) | |||
286 | sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]); | 287 | sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]); |
287 | } | 288 | } |
288 | 289 | ||
289 | /* 4. Try to fixup all. It is made iimediately after connection enters | 290 | /* 4. Try to fixup all. It is made immediately after connection enters |
290 | * established state. | 291 | * established state. |
291 | */ | 292 | */ |
292 | static void tcp_init_buffer_space(struct sock *sk) | 293 | static void tcp_init_buffer_space(struct sock *sk) |
@@ -326,37 +327,18 @@ static void tcp_init_buffer_space(struct sock *sk) | |||
326 | static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) | 327 | static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) |
327 | { | 328 | { |
328 | struct inet_connection_sock *icsk = inet_csk(sk); | 329 | struct inet_connection_sock *icsk = inet_csk(sk); |
329 | struct sk_buff *skb; | ||
330 | unsigned int app_win = tp->rcv_nxt - tp->copied_seq; | ||
331 | int ofo_win = 0; | ||
332 | 330 | ||
333 | icsk->icsk_ack.quick = 0; | 331 | icsk->icsk_ack.quick = 0; |
334 | 332 | ||
335 | skb_queue_walk(&tp->out_of_order_queue, skb) { | 333 | if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && |
336 | ofo_win += skb->len; | 334 | !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && |
337 | } | 335 | !tcp_memory_pressure && |
338 | 336 | atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { | |
339 | /* If overcommit is due to out of order segments, | 337 | sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), |
340 | * do not clamp window. Try to expand rcvbuf instead. | 338 | sysctl_tcp_rmem[2]); |
341 | */ | ||
342 | if (ofo_win) { | ||
343 | if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && | ||
344 | !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && | ||
345 | !tcp_memory_pressure && | ||
346 | atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) | ||
347 | sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), | ||
348 | sysctl_tcp_rmem[2]); | ||
349 | } | 339 | } |
350 | if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { | 340 | if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) |
351 | app_win += ofo_win; | ||
352 | if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) | ||
353 | app_win >>= 1; | ||
354 | if (app_win > icsk->icsk_ack.rcv_mss) | ||
355 | app_win -= icsk->icsk_ack.rcv_mss; | ||
356 | app_win = max(app_win, 2U*tp->advmss); | ||
357 | |||
358 | tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); | 341 | tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); |
359 | } | ||
360 | } | 342 | } |
361 | 343 | ||
362 | /* Receiver "autotuning" code. | 344 | /* Receiver "autotuning" code. |
@@ -385,8 +367,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) | |||
385 | * are stalled on filesystem I/O. | 367 | * are stalled on filesystem I/O. |
386 | * | 368 | * |
387 | * Also, since we are only going for a minimum in the | 369 | * Also, since we are only going for a minimum in the |
388 | * non-timestamp case, we do not smoothe things out | 370 | * non-timestamp case, we do not smoother things out |
389 | * else with timestamps disabled convergance takes too | 371 | * else with timestamps disabled convergence takes too |
390 | * long. | 372 | * long. |
391 | */ | 373 | */ |
392 | if (!win_dep) { | 374 | if (!win_dep) { |
@@ -395,7 +377,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) | |||
395 | } else if (m < new_sample) | 377 | } else if (m < new_sample) |
396 | new_sample = m << 3; | 378 | new_sample = m << 3; |
397 | } else { | 379 | } else { |
398 | /* No previous mesaure. */ | 380 | /* No previous measure. */ |
399 | new_sample = m << 3; | 381 | new_sample = m << 3; |
400 | } | 382 | } |
401 | 383 | ||
@@ -524,7 +506,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ | |||
524 | if (icsk->icsk_ack.ato > icsk->icsk_rto) | 506 | if (icsk->icsk_ack.ato > icsk->icsk_rto) |
525 | icsk->icsk_ack.ato = icsk->icsk_rto; | 507 | icsk->icsk_ack.ato = icsk->icsk_rto; |
526 | } else if (m > icsk->icsk_rto) { | 508 | } else if (m > icsk->icsk_rto) { |
527 | /* Too long gap. Apparently sender falled to | 509 | /* Too long gap. Apparently sender failed to |
528 | * restart window, so that we send ACKs quickly. | 510 | * restart window, so that we send ACKs quickly. |
529 | */ | 511 | */ |
530 | tcp_incr_quickack(sk); | 512 | tcp_incr_quickack(sk); |
@@ -548,10 +530,9 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ | |||
548 | * To save cycles in the RFC 1323 implementation it was better to break | 530 | * To save cycles in the RFC 1323 implementation it was better to break |
549 | * it up into three procedures. -- erics | 531 | * it up into three procedures. -- erics |
550 | */ | 532 | */ |
551 | static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) | 533 | static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) |
552 | { | 534 | { |
553 | struct tcp_sock *tp = tcp_sk(sk); | 535 | struct tcp_sock *tp = tcp_sk(sk); |
554 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
555 | long m = mrtt; /* RTT */ | 536 | long m = mrtt; /* RTT */ |
556 | 537 | ||
557 | /* The following amusing code comes from Jacobson's | 538 | /* The following amusing code comes from Jacobson's |
@@ -565,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) | |||
565 | * | 546 | * |
566 | * Funny. This algorithm seems to be very broken. | 547 | * Funny. This algorithm seems to be very broken. |
567 | * These formulae increase RTO, when it should be decreased, increase | 548 | * These formulae increase RTO, when it should be decreased, increase |
568 | * too slowly, when it should be incresed fastly, decrease too fastly | 549 | * too slowly, when it should be increased fastly, decrease too fastly |
569 | * etc. I guess in BSD RTO takes ONE value, so that it is absolutely | 550 | * etc. I guess in BSD RTO takes ONE value, so that it is absolutely |
570 | * does not matter how to _calculate_ it. Seems, it was trap | 551 | * does not matter how to _calculate_ it. Seems, it was trap |
571 | * that VJ failed to avoid. 8) | 552 | * that VJ failed to avoid. 8) |
@@ -610,9 +591,6 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) | |||
610 | tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); | 591 | tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); |
611 | tp->rtt_seq = tp->snd_nxt; | 592 | tp->rtt_seq = tp->snd_nxt; |
612 | } | 593 | } |
613 | |||
614 | if (icsk->icsk_ca_ops->rtt_sample) | ||
615 | icsk->icsk_ca_ops->rtt_sample(sk, *usrtt); | ||
616 | } | 594 | } |
617 | 595 | ||
618 | /* Calculate rto without backoff. This is the second half of Van Jacobson's | 596 | /* Calculate rto without backoff. This is the second half of Van Jacobson's |
@@ -629,14 +607,14 @@ static inline void tcp_set_rto(struct sock *sk) | |||
629 | * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ | 607 | * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ |
630 | * to do with delayed acks, because at cwnd>2 true delack timeout | 608 | * to do with delayed acks, because at cwnd>2 true delack timeout |
631 | * is invisible. Actually, Linux-2.4 also generates erratic | 609 | * is invisible. Actually, Linux-2.4 also generates erratic |
632 | * ACKs in some curcumstances. | 610 | * ACKs in some circumstances. |
633 | */ | 611 | */ |
634 | inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; | 612 | inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; |
635 | 613 | ||
636 | /* 2. Fixups made earlier cannot be right. | 614 | /* 2. Fixups made earlier cannot be right. |
637 | * If we do not estimate RTO correctly without them, | 615 | * If we do not estimate RTO correctly without them, |
638 | * all the algo is pure shit and should be replaced | 616 | * all the algo is pure shit and should be replaced |
639 | * with correct one. It is exaclty, which we pretend to do. | 617 | * with correct one. It is exactly, which we pretend to do. |
640 | */ | 618 | */ |
641 | } | 619 | } |
642 | 620 | ||
@@ -794,7 +772,7 @@ static void tcp_init_metrics(struct sock *sk) | |||
794 | * to make it more realistic. | 772 | * to make it more realistic. |
795 | * | 773 | * |
796 | * A bit of theory. RTT is time passed after "normal" sized packet | 774 | * A bit of theory. RTT is time passed after "normal" sized packet |
797 | * is sent until it is ACKed. In normal curcumstances sending small | 775 | * is sent until it is ACKed. In normal circumstances sending small |
798 | * packets force peer to delay ACKs and calculation is correct too. | 776 | * packets force peer to delay ACKs and calculation is correct too. |
799 | * The algorithm is adaptive and, provided we follow specs, it | 777 | * The algorithm is adaptive and, provided we follow specs, it |
800 | * NEVER underestimate RTT. BUT! If peer tries to make some clever | 778 | * NEVER underestimate RTT. BUT! If peer tries to make some clever |
@@ -919,18 +897,32 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ | |||
919 | int prior_fackets; | 897 | int prior_fackets; |
920 | u32 lost_retrans = 0; | 898 | u32 lost_retrans = 0; |
921 | int flag = 0; | 899 | int flag = 0; |
900 | int dup_sack = 0; | ||
922 | int i; | 901 | int i; |
923 | 902 | ||
924 | if (!tp->sacked_out) | 903 | if (!tp->sacked_out) |
925 | tp->fackets_out = 0; | 904 | tp->fackets_out = 0; |
926 | prior_fackets = tp->fackets_out; | 905 | prior_fackets = tp->fackets_out; |
927 | 906 | ||
928 | for (i=0; i<num_sacks; i++, sp++) { | 907 | /* SACK fastpath: |
929 | struct sk_buff *skb; | 908 | * if the only SACK change is the increase of the end_seq of |
930 | __u32 start_seq = ntohl(sp->start_seq); | 909 | * the first block then only apply that SACK block |
931 | __u32 end_seq = ntohl(sp->end_seq); | 910 | * and use retrans queue hinting otherwise slowpath */ |
932 | int fack_count = 0; | 911 | flag = 1; |
933 | int dup_sack = 0; | 912 | for (i = 0; i< num_sacks; i++) { |
913 | __u32 start_seq = ntohl(sp[i].start_seq); | ||
914 | __u32 end_seq = ntohl(sp[i].end_seq); | ||
915 | |||
916 | if (i == 0){ | ||
917 | if (tp->recv_sack_cache[i].start_seq != start_seq) | ||
918 | flag = 0; | ||
919 | } else { | ||
920 | if ((tp->recv_sack_cache[i].start_seq != start_seq) || | ||
921 | (tp->recv_sack_cache[i].end_seq != end_seq)) | ||
922 | flag = 0; | ||
923 | } | ||
924 | tp->recv_sack_cache[i].start_seq = start_seq; | ||
925 | tp->recv_sack_cache[i].end_seq = end_seq; | ||
934 | 926 | ||
935 | /* Check for D-SACK. */ | 927 | /* Check for D-SACK. */ |
936 | if (i == 0) { | 928 | if (i == 0) { |
@@ -962,15 +954,58 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ | |||
962 | if (before(ack, prior_snd_una - tp->max_window)) | 954 | if (before(ack, prior_snd_una - tp->max_window)) |
963 | return 0; | 955 | return 0; |
964 | } | 956 | } |
957 | } | ||
958 | |||
959 | if (flag) | ||
960 | num_sacks = 1; | ||
961 | else { | ||
962 | int j; | ||
963 | tp->fastpath_skb_hint = NULL; | ||
964 | |||
965 | /* order SACK blocks to allow in order walk of the retrans queue */ | ||
966 | for (i = num_sacks-1; i > 0; i--) { | ||
967 | for (j = 0; j < i; j++){ | ||
968 | if (after(ntohl(sp[j].start_seq), | ||
969 | ntohl(sp[j+1].start_seq))){ | ||
970 | sp[j].start_seq = htonl(tp->recv_sack_cache[j+1].start_seq); | ||
971 | sp[j].end_seq = htonl(tp->recv_sack_cache[j+1].end_seq); | ||
972 | sp[j+1].start_seq = htonl(tp->recv_sack_cache[j].start_seq); | ||
973 | sp[j+1].end_seq = htonl(tp->recv_sack_cache[j].end_seq); | ||
974 | } | ||
975 | |||
976 | } | ||
977 | } | ||
978 | } | ||
979 | |||
980 | /* clear flag as used for different purpose in following code */ | ||
981 | flag = 0; | ||
982 | |||
983 | for (i=0; i<num_sacks; i++, sp++) { | ||
984 | struct sk_buff *skb; | ||
985 | __u32 start_seq = ntohl(sp->start_seq); | ||
986 | __u32 end_seq = ntohl(sp->end_seq); | ||
987 | int fack_count; | ||
988 | |||
989 | /* Use SACK fastpath hint if valid */ | ||
990 | if (tp->fastpath_skb_hint) { | ||
991 | skb = tp->fastpath_skb_hint; | ||
992 | fack_count = tp->fastpath_cnt_hint; | ||
993 | } else { | ||
994 | skb = sk->sk_write_queue.next; | ||
995 | fack_count = 0; | ||
996 | } | ||
965 | 997 | ||
966 | /* Event "B" in the comment above. */ | 998 | /* Event "B" in the comment above. */ |
967 | if (after(end_seq, tp->high_seq)) | 999 | if (after(end_seq, tp->high_seq)) |
968 | flag |= FLAG_DATA_LOST; | 1000 | flag |= FLAG_DATA_LOST; |
969 | 1001 | ||
970 | sk_stream_for_retrans_queue(skb, sk) { | 1002 | sk_stream_for_retrans_queue_from(skb, sk) { |
971 | int in_sack, pcount; | 1003 | int in_sack, pcount; |
972 | u8 sacked; | 1004 | u8 sacked; |
973 | 1005 | ||
1006 | tp->fastpath_skb_hint = skb; | ||
1007 | tp->fastpath_cnt_hint = fack_count; | ||
1008 | |||
974 | /* The retransmission queue is always in order, so | 1009 | /* The retransmission queue is always in order, so |
975 | * we can short-circuit the walk early. | 1010 | * we can short-circuit the walk early. |
976 | */ | 1011 | */ |
@@ -1045,6 +1080,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ | |||
1045 | TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); | 1080 | TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); |
1046 | tp->lost_out -= tcp_skb_pcount(skb); | 1081 | tp->lost_out -= tcp_skb_pcount(skb); |
1047 | tp->retrans_out -= tcp_skb_pcount(skb); | 1082 | tp->retrans_out -= tcp_skb_pcount(skb); |
1083 | |||
1084 | /* clear lost hint */ | ||
1085 | tp->retransmit_skb_hint = NULL; | ||
1048 | } | 1086 | } |
1049 | } else { | 1087 | } else { |
1050 | /* New sack for not retransmitted frame, | 1088 | /* New sack for not retransmitted frame, |
@@ -1057,6 +1095,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ | |||
1057 | if (sacked & TCPCB_LOST) { | 1095 | if (sacked & TCPCB_LOST) { |
1058 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; | 1096 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; |
1059 | tp->lost_out -= tcp_skb_pcount(skb); | 1097 | tp->lost_out -= tcp_skb_pcount(skb); |
1098 | |||
1099 | /* clear lost hint */ | ||
1100 | tp->retransmit_skb_hint = NULL; | ||
1060 | } | 1101 | } |
1061 | } | 1102 | } |
1062 | 1103 | ||
@@ -1080,6 +1121,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ | |||
1080 | (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { | 1121 | (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { |
1081 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; | 1122 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; |
1082 | tp->retrans_out -= tcp_skb_pcount(skb); | 1123 | tp->retrans_out -= tcp_skb_pcount(skb); |
1124 | tp->retransmit_skb_hint = NULL; | ||
1083 | } | 1125 | } |
1084 | } | 1126 | } |
1085 | } | 1127 | } |
@@ -1107,6 +1149,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ | |||
1107 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; | 1149 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; |
1108 | tp->retrans_out -= tcp_skb_pcount(skb); | 1150 | tp->retrans_out -= tcp_skb_pcount(skb); |
1109 | 1151 | ||
1152 | /* clear lost hint */ | ||
1153 | tp->retransmit_skb_hint = NULL; | ||
1154 | |||
1110 | if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) { | 1155 | if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) { |
1111 | tp->lost_out += tcp_skb_pcount(skb); | 1156 | tp->lost_out += tcp_skb_pcount(skb); |
1112 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; | 1157 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; |
@@ -1214,6 +1259,8 @@ static void tcp_enter_frto_loss(struct sock *sk) | |||
1214 | tcp_set_ca_state(sk, TCP_CA_Loss); | 1259 | tcp_set_ca_state(sk, TCP_CA_Loss); |
1215 | tp->high_seq = tp->frto_highmark; | 1260 | tp->high_seq = tp->frto_highmark; |
1216 | TCP_ECN_queue_cwr(tp); | 1261 | TCP_ECN_queue_cwr(tp); |
1262 | |||
1263 | clear_all_retrans_hints(tp); | ||
1217 | } | 1264 | } |
1218 | 1265 | ||
1219 | void tcp_clear_retrans(struct tcp_sock *tp) | 1266 | void tcp_clear_retrans(struct tcp_sock *tp) |
@@ -1251,6 +1298,7 @@ void tcp_enter_loss(struct sock *sk, int how) | |||
1251 | tp->snd_cwnd_cnt = 0; | 1298 | tp->snd_cwnd_cnt = 0; |
1252 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1299 | tp->snd_cwnd_stamp = tcp_time_stamp; |
1253 | 1300 | ||
1301 | tp->bytes_acked = 0; | ||
1254 | tcp_clear_retrans(tp); | 1302 | tcp_clear_retrans(tp); |
1255 | 1303 | ||
1256 | /* Push undo marker, if it was plain RTO and nothing | 1304 | /* Push undo marker, if it was plain RTO and nothing |
@@ -1279,6 +1327,8 @@ void tcp_enter_loss(struct sock *sk, int how) | |||
1279 | tcp_set_ca_state(sk, TCP_CA_Loss); | 1327 | tcp_set_ca_state(sk, TCP_CA_Loss); |
1280 | tp->high_seq = tp->snd_nxt; | 1328 | tp->high_seq = tp->snd_nxt; |
1281 | TCP_ECN_queue_cwr(tp); | 1329 | TCP_ECN_queue_cwr(tp); |
1330 | |||
1331 | clear_all_retrans_hints(tp); | ||
1282 | } | 1332 | } |
1283 | 1333 | ||
1284 | static int tcp_check_sack_reneging(struct sock *sk) | 1334 | static int tcp_check_sack_reneging(struct sock *sk) |
@@ -1503,17 +1553,37 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, | |||
1503 | int packets, u32 high_seq) | 1553 | int packets, u32 high_seq) |
1504 | { | 1554 | { |
1505 | struct sk_buff *skb; | 1555 | struct sk_buff *skb; |
1506 | int cnt = packets; | 1556 | int cnt; |
1507 | 1557 | ||
1508 | BUG_TRAP(cnt <= tp->packets_out); | 1558 | BUG_TRAP(packets <= tp->packets_out); |
1559 | if (tp->lost_skb_hint) { | ||
1560 | skb = tp->lost_skb_hint; | ||
1561 | cnt = tp->lost_cnt_hint; | ||
1562 | } else { | ||
1563 | skb = sk->sk_write_queue.next; | ||
1564 | cnt = 0; | ||
1565 | } | ||
1509 | 1566 | ||
1510 | sk_stream_for_retrans_queue(skb, sk) { | 1567 | sk_stream_for_retrans_queue_from(skb, sk) { |
1511 | cnt -= tcp_skb_pcount(skb); | 1568 | /* TODO: do this better */ |
1512 | if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq)) | 1569 | /* this is not the most efficient way to do this... */ |
1570 | tp->lost_skb_hint = skb; | ||
1571 | tp->lost_cnt_hint = cnt; | ||
1572 | cnt += tcp_skb_pcount(skb); | ||
1573 | if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq)) | ||
1513 | break; | 1574 | break; |
1514 | if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { | 1575 | if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { |
1515 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; | 1576 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; |
1516 | tp->lost_out += tcp_skb_pcount(skb); | 1577 | tp->lost_out += tcp_skb_pcount(skb); |
1578 | |||
1579 | /* clear xmit_retransmit_queue hints | ||
1580 | * if this is beyond hint */ | ||
1581 | if(tp->retransmit_skb_hint != NULL && | ||
1582 | before(TCP_SKB_CB(skb)->seq, | ||
1583 | TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) { | ||
1584 | |||
1585 | tp->retransmit_skb_hint = NULL; | ||
1586 | } | ||
1517 | } | 1587 | } |
1518 | } | 1588 | } |
1519 | tcp_sync_left_out(tp); | 1589 | tcp_sync_left_out(tp); |
@@ -1540,13 +1610,28 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) | |||
1540 | if (tcp_head_timedout(sk, tp)) { | 1610 | if (tcp_head_timedout(sk, tp)) { |
1541 | struct sk_buff *skb; | 1611 | struct sk_buff *skb; |
1542 | 1612 | ||
1543 | sk_stream_for_retrans_queue(skb, sk) { | 1613 | skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint |
1544 | if (tcp_skb_timedout(sk, skb) && | 1614 | : sk->sk_write_queue.next; |
1545 | !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { | 1615 | |
1616 | sk_stream_for_retrans_queue_from(skb, sk) { | ||
1617 | if (!tcp_skb_timedout(sk, skb)) | ||
1618 | break; | ||
1619 | |||
1620 | if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { | ||
1546 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; | 1621 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; |
1547 | tp->lost_out += tcp_skb_pcount(skb); | 1622 | tp->lost_out += tcp_skb_pcount(skb); |
1623 | |||
1624 | /* clear xmit_retrans hint */ | ||
1625 | if (tp->retransmit_skb_hint && | ||
1626 | before(TCP_SKB_CB(skb)->seq, | ||
1627 | TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) | ||
1628 | |||
1629 | tp->retransmit_skb_hint = NULL; | ||
1548 | } | 1630 | } |
1549 | } | 1631 | } |
1632 | |||
1633 | tp->scoreboard_skb_hint = skb; | ||
1634 | |||
1550 | tcp_sync_left_out(tp); | 1635 | tcp_sync_left_out(tp); |
1551 | } | 1636 | } |
1552 | } | 1637 | } |
@@ -1626,6 +1711,10 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) | |||
1626 | } | 1711 | } |
1627 | tcp_moderate_cwnd(tp); | 1712 | tcp_moderate_cwnd(tp); |
1628 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1713 | tp->snd_cwnd_stamp = tcp_time_stamp; |
1714 | |||
1715 | /* There is something screwy going on with the retrans hints after | ||
1716 | an undo */ | ||
1717 | clear_all_retrans_hints(tp); | ||
1629 | } | 1718 | } |
1630 | 1719 | ||
1631 | static inline int tcp_may_undo(struct tcp_sock *tp) | 1720 | static inline int tcp_may_undo(struct tcp_sock *tp) |
@@ -1709,6 +1798,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) | |||
1709 | sk_stream_for_retrans_queue(skb, sk) { | 1798 | sk_stream_for_retrans_queue(skb, sk) { |
1710 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; | 1799 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; |
1711 | } | 1800 | } |
1801 | |||
1802 | clear_all_retrans_hints(tp); | ||
1803 | |||
1712 | DBGUNDO(sk, tp, "partial loss"); | 1804 | DBGUNDO(sk, tp, "partial loss"); |
1713 | tp->lost_out = 0; | 1805 | tp->lost_out = 0; |
1714 | tp->left_out = tp->sacked_out; | 1806 | tp->left_out = tp->sacked_out; |
@@ -1908,6 +2000,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
1908 | TCP_ECN_queue_cwr(tp); | 2000 | TCP_ECN_queue_cwr(tp); |
1909 | } | 2001 | } |
1910 | 2002 | ||
2003 | tp->bytes_acked = 0; | ||
1911 | tp->snd_cwnd_cnt = 0; | 2004 | tp->snd_cwnd_cnt = 0; |
1912 | tcp_set_ca_state(sk, TCP_CA_Recovery); | 2005 | tcp_set_ca_state(sk, TCP_CA_Recovery); |
1913 | } | 2006 | } |
@@ -1919,9 +2012,9 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
1919 | } | 2012 | } |
1920 | 2013 | ||
1921 | /* Read draft-ietf-tcplw-high-performance before mucking | 2014 | /* Read draft-ietf-tcplw-high-performance before mucking |
1922 | * with this code. (Superceeds RFC1323) | 2015 | * with this code. (Supersedes RFC1323) |
1923 | */ | 2016 | */ |
1924 | static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) | 2017 | static void tcp_ack_saw_tstamp(struct sock *sk, int flag) |
1925 | { | 2018 | { |
1926 | /* RTTM Rule: A TSecr value received in a segment is used to | 2019 | /* RTTM Rule: A TSecr value received in a segment is used to |
1927 | * update the averaged RTT measurement only if the segment | 2020 | * update the averaged RTT measurement only if the segment |
@@ -1932,7 +2025,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) | |||
1932 | * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> | 2025 | * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> |
1933 | * | 2026 | * |
1934 | * Changed: reset backoff as soon as we see the first valid sample. | 2027 | * Changed: reset backoff as soon as we see the first valid sample. |
1935 | * If we do not, we get strongly overstimated rto. With timestamps | 2028 | * If we do not, we get strongly overestimated rto. With timestamps |
1936 | * samples are accepted even from very old segments: f.e., when rtt=1 | 2029 | * samples are accepted even from very old segments: f.e., when rtt=1 |
1937 | * increases to 8, we retransmit 5 times and after 8 seconds delayed | 2030 | * increases to 8, we retransmit 5 times and after 8 seconds delayed |
1938 | * answer arrives rto becomes 120 seconds! If at least one of segments | 2031 | * answer arrives rto becomes 120 seconds! If at least one of segments |
@@ -1940,13 +2033,13 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) | |||
1940 | */ | 2033 | */ |
1941 | struct tcp_sock *tp = tcp_sk(sk); | 2034 | struct tcp_sock *tp = tcp_sk(sk); |
1942 | const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; | 2035 | const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; |
1943 | tcp_rtt_estimator(sk, seq_rtt, usrtt); | 2036 | tcp_rtt_estimator(sk, seq_rtt); |
1944 | tcp_set_rto(sk); | 2037 | tcp_set_rto(sk); |
1945 | inet_csk(sk)->icsk_backoff = 0; | 2038 | inet_csk(sk)->icsk_backoff = 0; |
1946 | tcp_bound_rto(sk); | 2039 | tcp_bound_rto(sk); |
1947 | } | 2040 | } |
1948 | 2041 | ||
1949 | static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag) | 2042 | static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) |
1950 | { | 2043 | { |
1951 | /* We don't have a timestamp. Can only use | 2044 | /* We don't have a timestamp. Can only use |
1952 | * packets that are not retransmitted to determine | 2045 | * packets that are not retransmitted to determine |
@@ -1960,21 +2053,21 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag | |||
1960 | if (flag & FLAG_RETRANS_DATA_ACKED) | 2053 | if (flag & FLAG_RETRANS_DATA_ACKED) |
1961 | return; | 2054 | return; |
1962 | 2055 | ||
1963 | tcp_rtt_estimator(sk, seq_rtt, usrtt); | 2056 | tcp_rtt_estimator(sk, seq_rtt); |
1964 | tcp_set_rto(sk); | 2057 | tcp_set_rto(sk); |
1965 | inet_csk(sk)->icsk_backoff = 0; | 2058 | inet_csk(sk)->icsk_backoff = 0; |
1966 | tcp_bound_rto(sk); | 2059 | tcp_bound_rto(sk); |
1967 | } | 2060 | } |
1968 | 2061 | ||
1969 | static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, | 2062 | static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, |
1970 | const s32 seq_rtt, u32 *usrtt) | 2063 | const s32 seq_rtt) |
1971 | { | 2064 | { |
1972 | const struct tcp_sock *tp = tcp_sk(sk); | 2065 | const struct tcp_sock *tp = tcp_sk(sk); |
1973 | /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ | 2066 | /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ |
1974 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) | 2067 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) |
1975 | tcp_ack_saw_tstamp(sk, usrtt, flag); | 2068 | tcp_ack_saw_tstamp(sk, flag); |
1976 | else if (seq_rtt >= 0) | 2069 | else if (seq_rtt >= 0) |
1977 | tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag); | 2070 | tcp_ack_no_tstamp(sk, seq_rtt, flag); |
1978 | } | 2071 | } |
1979 | 2072 | ||
1980 | static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, | 2073 | static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, |
@@ -2054,20 +2147,27 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, | |||
2054 | return acked; | 2147 | return acked; |
2055 | } | 2148 | } |
2056 | 2149 | ||
2150 | static inline u32 tcp_usrtt(const struct sk_buff *skb) | ||
2151 | { | ||
2152 | struct timeval tv, now; | ||
2153 | |||
2154 | do_gettimeofday(&now); | ||
2155 | skb_get_timestamp(skb, &tv); | ||
2156 | return (now.tv_sec - tv.tv_sec) * 1000000 + (now.tv_usec - tv.tv_usec); | ||
2157 | } | ||
2057 | 2158 | ||
2058 | /* Remove acknowledged frames from the retransmission queue. */ | 2159 | /* Remove acknowledged frames from the retransmission queue. */ |
2059 | static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) | 2160 | static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) |
2060 | { | 2161 | { |
2061 | struct tcp_sock *tp = tcp_sk(sk); | 2162 | struct tcp_sock *tp = tcp_sk(sk); |
2163 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
2062 | struct sk_buff *skb; | 2164 | struct sk_buff *skb; |
2063 | __u32 now = tcp_time_stamp; | 2165 | __u32 now = tcp_time_stamp; |
2064 | int acked = 0; | 2166 | int acked = 0; |
2065 | __s32 seq_rtt = -1; | 2167 | __s32 seq_rtt = -1; |
2066 | struct timeval usnow; | ||
2067 | u32 pkts_acked = 0; | 2168 | u32 pkts_acked = 0; |
2068 | 2169 | void (*rtt_sample)(struct sock *sk, u32 usrtt) | |
2069 | if (seq_usrtt) | 2170 | = icsk->icsk_ca_ops->rtt_sample; |
2070 | do_gettimeofday(&usnow); | ||
2071 | 2171 | ||
2072 | while ((skb = skb_peek(&sk->sk_write_queue)) && | 2172 | while ((skb = skb_peek(&sk->sk_write_queue)) && |
2073 | skb != sk->sk_send_head) { | 2173 | skb != sk->sk_send_head) { |
@@ -2107,16 +2207,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt | |||
2107 | tp->retrans_out -= tcp_skb_pcount(skb); | 2207 | tp->retrans_out -= tcp_skb_pcount(skb); |
2108 | acked |= FLAG_RETRANS_DATA_ACKED; | 2208 | acked |= FLAG_RETRANS_DATA_ACKED; |
2109 | seq_rtt = -1; | 2209 | seq_rtt = -1; |
2110 | } else if (seq_rtt < 0) | 2210 | } else if (seq_rtt < 0) { |
2111 | seq_rtt = now - scb->when; | 2211 | seq_rtt = now - scb->when; |
2112 | if (seq_usrtt) { | 2212 | if (rtt_sample) |
2113 | struct timeval tv; | 2213 | (*rtt_sample)(sk, tcp_usrtt(skb)); |
2114 | |||
2115 | skb_get_timestamp(skb, &tv); | ||
2116 | *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000 | ||
2117 | + (usnow.tv_usec - tv.tv_usec); | ||
2118 | } | 2214 | } |
2119 | |||
2120 | if (sacked & TCPCB_SACKED_ACKED) | 2215 | if (sacked & TCPCB_SACKED_ACKED) |
2121 | tp->sacked_out -= tcp_skb_pcount(skb); | 2216 | tp->sacked_out -= tcp_skb_pcount(skb); |
2122 | if (sacked & TCPCB_LOST) | 2217 | if (sacked & TCPCB_LOST) |
@@ -2126,17 +2221,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt | |||
2126 | !before(scb->end_seq, tp->snd_up)) | 2221 | !before(scb->end_seq, tp->snd_up)) |
2127 | tp->urg_mode = 0; | 2222 | tp->urg_mode = 0; |
2128 | } | 2223 | } |
2129 | } else if (seq_rtt < 0) | 2224 | } else if (seq_rtt < 0) { |
2130 | seq_rtt = now - scb->when; | 2225 | seq_rtt = now - scb->when; |
2226 | if (rtt_sample) | ||
2227 | (*rtt_sample)(sk, tcp_usrtt(skb)); | ||
2228 | } | ||
2131 | tcp_dec_pcount_approx(&tp->fackets_out, skb); | 2229 | tcp_dec_pcount_approx(&tp->fackets_out, skb); |
2132 | tcp_packets_out_dec(tp, skb); | 2230 | tcp_packets_out_dec(tp, skb); |
2133 | __skb_unlink(skb, &sk->sk_write_queue); | 2231 | __skb_unlink(skb, &sk->sk_write_queue); |
2134 | sk_stream_free_skb(sk, skb); | 2232 | sk_stream_free_skb(sk, skb); |
2233 | clear_all_retrans_hints(tp); | ||
2135 | } | 2234 | } |
2136 | 2235 | ||
2137 | if (acked&FLAG_ACKED) { | 2236 | if (acked&FLAG_ACKED) { |
2138 | const struct inet_connection_sock *icsk = inet_csk(sk); | 2237 | tcp_ack_update_rtt(sk, acked, seq_rtt); |
2139 | tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt); | ||
2140 | tcp_ack_packets_out(sk, tp); | 2238 | tcp_ack_packets_out(sk, tp); |
2141 | 2239 | ||
2142 | if (icsk->icsk_ca_ops->pkts_acked) | 2240 | if (icsk->icsk_ca_ops->pkts_acked) |
@@ -2284,7 +2382,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) | |||
2284 | } | 2382 | } |
2285 | 2383 | ||
2286 | /* F-RTO affects on two new ACKs following RTO. | 2384 | /* F-RTO affects on two new ACKs following RTO. |
2287 | * At latest on third ACK the TCP behavor is back to normal. | 2385 | * At latest on third ACK the TCP behavior is back to normal. |
2288 | */ | 2386 | */ |
2289 | tp->frto_counter = (tp->frto_counter + 1) % 3; | 2387 | tp->frto_counter = (tp->frto_counter + 1) % 3; |
2290 | } | 2388 | } |
@@ -2299,7 +2397,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
2299 | u32 ack = TCP_SKB_CB(skb)->ack_seq; | 2397 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
2300 | u32 prior_in_flight; | 2398 | u32 prior_in_flight; |
2301 | s32 seq_rtt; | 2399 | s32 seq_rtt; |
2302 | s32 seq_usrtt = 0; | ||
2303 | int prior_packets; | 2400 | int prior_packets; |
2304 | 2401 | ||
2305 | /* If the ack is newer than sent or older than previous acks | 2402 | /* If the ack is newer than sent or older than previous acks |
@@ -2311,6 +2408,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
2311 | if (before(ack, prior_snd_una)) | 2408 | if (before(ack, prior_snd_una)) |
2312 | goto old_ack; | 2409 | goto old_ack; |
2313 | 2410 | ||
2411 | if (sysctl_tcp_abc && icsk->icsk_ca_state < TCP_CA_CWR) | ||
2412 | tp->bytes_acked += ack - prior_snd_una; | ||
2413 | |||
2314 | if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { | 2414 | if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { |
2315 | /* Window is constant, pure forward advance. | 2415 | /* Window is constant, pure forward advance. |
2316 | * No more checks are required. | 2416 | * No more checks are required. |
@@ -2352,14 +2452,13 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
2352 | prior_in_flight = tcp_packets_in_flight(tp); | 2452 | prior_in_flight = tcp_packets_in_flight(tp); |
2353 | 2453 | ||
2354 | /* See if we can take anything off of the retransmit queue. */ | 2454 | /* See if we can take anything off of the retransmit queue. */ |
2355 | flag |= tcp_clean_rtx_queue(sk, &seq_rtt, | 2455 | flag |= tcp_clean_rtx_queue(sk, &seq_rtt); |
2356 | icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL); | ||
2357 | 2456 | ||
2358 | if (tp->frto_counter) | 2457 | if (tp->frto_counter) |
2359 | tcp_process_frto(sk, prior_snd_una); | 2458 | tcp_process_frto(sk, prior_snd_una); |
2360 | 2459 | ||
2361 | if (tcp_ack_is_dubious(sk, flag)) { | 2460 | if (tcp_ack_is_dubious(sk, flag)) { |
2362 | /* Advanve CWND, if state allows this. */ | 2461 | /* Advance CWND, if state allows this. */ |
2363 | if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) | 2462 | if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) |
2364 | tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); | 2463 | tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); |
2365 | tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); | 2464 | tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); |
@@ -3148,7 +3247,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, | |||
3148 | { | 3247 | { |
3149 | struct sk_buff *skb; | 3248 | struct sk_buff *skb; |
3150 | 3249 | ||
3151 | /* First, check that queue is collapsable and find | 3250 | /* First, check that queue is collapsible and find |
3152 | * the point where collapsing can be useful. */ | 3251 | * the point where collapsing can be useful. */ |
3153 | for (skb = head; skb != tail; ) { | 3252 | for (skb = head; skb != tail; ) { |
3154 | /* No new bits? It is possible on ofo queue. */ | 3253 | /* No new bits? It is possible on ofo queue. */ |
@@ -3456,7 +3555,7 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk) | |||
3456 | 3555 | ||
3457 | /* | 3556 | /* |
3458 | * This routine is only called when we have urgent data | 3557 | * This routine is only called when we have urgent data |
3459 | * signalled. Its the 'slow' part of tcp_urg. It could be | 3558 | * signaled. Its the 'slow' part of tcp_urg. It could be |
3460 | * moved inline now as tcp_urg is only called from one | 3559 | * moved inline now as tcp_urg is only called from one |
3461 | * place. We handle URGent data wrong. We have to - as | 3560 | * place. We handle URGent data wrong. We have to - as |
3462 | * BSD still doesn't use the correction from RFC961. | 3561 | * BSD still doesn't use the correction from RFC961. |
@@ -3501,7 +3600,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) | |||
3501 | * urgent. To do this requires some care. We cannot just ignore | 3600 | * urgent. To do this requires some care. We cannot just ignore |
3502 | * tp->copied_seq since we would read the last urgent byte again | 3601 | * tp->copied_seq since we would read the last urgent byte again |
3503 | * as data, nor can we alter copied_seq until this data arrives | 3602 | * as data, nor can we alter copied_seq until this data arrives |
3504 | * or we break the sematics of SIOCATMARK (and thus sockatmark()) | 3603 | * or we break the semantics of SIOCATMARK (and thus sockatmark()) |
3505 | * | 3604 | * |
3506 | * NOTE. Double Dutch. Rendering to plain English: author of comment | 3605 | * NOTE. Double Dutch. Rendering to plain English: author of comment |
3507 | * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); | 3606 | * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); |
@@ -3646,7 +3745,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
3646 | tp->rx_opt.saw_tstamp = 0; | 3745 | tp->rx_opt.saw_tstamp = 0; |
3647 | 3746 | ||
3648 | /* pred_flags is 0xS?10 << 16 + snd_wnd | 3747 | /* pred_flags is 0xS?10 << 16 + snd_wnd |
3649 | * if header_predition is to be made | 3748 | * if header_prediction is to be made |
3650 | * 'S' will always be tp->tcp_header_len >> 2 | 3749 | * 'S' will always be tp->tcp_header_len >> 2 |
3651 | * '?' will be 0 for the fast path, otherwise pred_flags is 0 to | 3750 | * '?' will be 0 for the fast path, otherwise pred_flags is 0 to |
3652 | * turn it off (when there are holes in the receive | 3751 | * turn it off (when there are holes in the receive |
@@ -4242,7 +4341,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
4242 | */ | 4341 | */ |
4243 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && | 4342 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
4244 | !tp->srtt) | 4343 | !tp->srtt) |
4245 | tcp_ack_saw_tstamp(sk, NULL, 0); | 4344 | tcp_ack_saw_tstamp(sk, 0); |
4246 | 4345 | ||
4247 | if (tp->rx_opt.tstamp_ok) | 4346 | if (tp->rx_opt.tstamp_ok) |
4248 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; | 4347 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; |
@@ -4372,6 +4471,7 @@ discard: | |||
4372 | 4471 | ||
4373 | EXPORT_SYMBOL(sysctl_tcp_ecn); | 4472 | EXPORT_SYMBOL(sysctl_tcp_ecn); |
4374 | EXPORT_SYMBOL(sysctl_tcp_reordering); | 4473 | EXPORT_SYMBOL(sysctl_tcp_reordering); |
4474 | EXPORT_SYMBOL(sysctl_tcp_abc); | ||
4375 | EXPORT_SYMBOL(tcp_parse_options); | 4475 | EXPORT_SYMBOL(tcp_parse_options); |
4376 | EXPORT_SYMBOL(tcp_rcv_established); | 4476 | EXPORT_SYMBOL(tcp_rcv_established); |
4377 | EXPORT_SYMBOL(tcp_rcv_state_process); | 4477 | EXPORT_SYMBOL(tcp_rcv_state_process); |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 634dabb558fd..4d5021e1929b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -39,7 +39,7 @@ | |||
39 | * request_sock handling and moved | 39 | * request_sock handling and moved |
40 | * most of it into the af independent code. | 40 | * most of it into the af independent code. |
41 | * Added tail drop and some other bugfixes. | 41 | * Added tail drop and some other bugfixes. |
42 | * Added new listen sematics. | 42 | * Added new listen semantics. |
43 | * Mike McLagan : Routing by source | 43 | * Mike McLagan : Routing by source |
44 | * Juan Jose Ciarlante: ip_dynaddr bits | 44 | * Juan Jose Ciarlante: ip_dynaddr bits |
45 | * Andi Kleen: various fixes. | 45 | * Andi Kleen: various fixes. |
@@ -1110,24 +1110,18 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) | |||
1110 | static int tcp_v4_checksum_init(struct sk_buff *skb) | 1110 | static int tcp_v4_checksum_init(struct sk_buff *skb) |
1111 | { | 1111 | { |
1112 | if (skb->ip_summed == CHECKSUM_HW) { | 1112 | if (skb->ip_summed == CHECKSUM_HW) { |
1113 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
1114 | if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, | 1113 | if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, |
1115 | skb->nh.iph->daddr, skb->csum)) | 1114 | skb->nh.iph->daddr, skb->csum)) { |
1115 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
1116 | return 0; | 1116 | return 0; |
1117 | 1117 | } | |
1118 | LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n"); | ||
1119 | skb->ip_summed = CHECKSUM_NONE; | ||
1120 | } | 1118 | } |
1119 | |||
1120 | skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr, | ||
1121 | skb->len, IPPROTO_TCP, 0); | ||
1122 | |||
1121 | if (skb->len <= 76) { | 1123 | if (skb->len <= 76) { |
1122 | if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, | 1124 | return __skb_checksum_complete(skb); |
1123 | skb->nh.iph->daddr, | ||
1124 | skb_checksum(skb, 0, skb->len, 0))) | ||
1125 | return -1; | ||
1126 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
1127 | } else { | ||
1128 | skb->csum = ~tcp_v4_check(skb->h.th, skb->len, | ||
1129 | skb->nh.iph->saddr, | ||
1130 | skb->nh.iph->daddr, 0); | ||
1131 | } | 1125 | } |
1132 | return 0; | 1126 | return 0; |
1133 | } | 1127 | } |
@@ -1216,10 +1210,10 @@ int tcp_v4_rcv(struct sk_buff *skb) | |||
1216 | 1210 | ||
1217 | /* An explanation is required here, I think. | 1211 | /* An explanation is required here, I think. |
1218 | * Packet length and doff are validated by header prediction, | 1212 | * Packet length and doff are validated by header prediction, |
1219 | * provided case of th->doff==0 is elimineted. | 1213 | * provided case of th->doff==0 is eliminated. |
1220 | * So, we defer the checks. */ | 1214 | * So, we defer the checks. */ |
1221 | if ((skb->ip_summed != CHECKSUM_UNNECESSARY && | 1215 | if ((skb->ip_summed != CHECKSUM_UNNECESSARY && |
1222 | tcp_v4_checksum_init(skb) < 0)) | 1216 | tcp_v4_checksum_init(skb))) |
1223 | goto bad_packet; | 1217 | goto bad_packet; |
1224 | 1218 | ||
1225 | th = skb->h.th; | 1219 | th = skb->h.th; |
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b1a63b2c6b4a..1b66a2ac4321 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -158,7 +158,7 @@ kill_with_rst: | |||
158 | /* I am shamed, but failed to make it more elegant. | 158 | /* I am shamed, but failed to make it more elegant. |
159 | * Yes, it is direct reference to IP, which is impossible | 159 | * Yes, it is direct reference to IP, which is impossible |
160 | * to generalize to IPv6. Taking into account that IPv6 | 160 | * to generalize to IPv6. Taking into account that IPv6 |
161 | * do not undertsnad recycling in any case, it not | 161 | * do not understand recycling in any case, it not |
162 | * a big problem in practice. --ANK */ | 162 | * a big problem in practice. --ANK */ |
163 | if (tw->tw_family == AF_INET && | 163 | if (tw->tw_family == AF_INET && |
164 | tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && | 164 | tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && |
@@ -194,7 +194,7 @@ kill_with_rst: | |||
194 | /* In window segment, it may be only reset or bare ack. */ | 194 | /* In window segment, it may be only reset or bare ack. */ |
195 | 195 | ||
196 | if (th->rst) { | 196 | if (th->rst) { |
197 | /* This is TIME_WAIT assasination, in two flavors. | 197 | /* This is TIME_WAIT assassination, in two flavors. |
198 | * Oh well... nobody has a sufficient solution to this | 198 | * Oh well... nobody has a sufficient solution to this |
199 | * protocol bug yet. | 199 | * protocol bug yet. |
200 | */ | 200 | */ |
@@ -380,6 +380,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
380 | */ | 380 | */ |
381 | newtp->snd_cwnd = 2; | 381 | newtp->snd_cwnd = 2; |
382 | newtp->snd_cwnd_cnt = 0; | 382 | newtp->snd_cwnd_cnt = 0; |
383 | newtp->bytes_acked = 0; | ||
383 | 384 | ||
384 | newtp->frto_counter = 0; | 385 | newtp->frto_counter = 0; |
385 | newtp->frto_highmark = 0; | 386 | newtp->frto_highmark = 0; |
@@ -550,7 +551,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, | |||
550 | 551 | ||
551 | /* RFC793 page 36: "If the connection is in any non-synchronized state ... | 552 | /* RFC793 page 36: "If the connection is in any non-synchronized state ... |
552 | * and the incoming segment acknowledges something not yet | 553 | * and the incoming segment acknowledges something not yet |
553 | * sent (the segment carries an unaccaptable ACK) ... | 554 | * sent (the segment carries an unacceptable ACK) ... |
554 | * a reset is sent." | 555 | * a reset is sent." |
555 | * | 556 | * |
556 | * Invalid ACK: reset will be sent by listening socket | 557 | * Invalid ACK: reset will be sent by listening socket |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b907456a79f4..029c70dfb585 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -436,6 +436,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss | |||
436 | u16 flags; | 436 | u16 flags; |
437 | 437 | ||
438 | BUG_ON(len > skb->len); | 438 | BUG_ON(len > skb->len); |
439 | |||
440 | clear_all_retrans_hints(tp); | ||
439 | nsize = skb_headlen(skb) - len; | 441 | nsize = skb_headlen(skb) - len; |
440 | if (nsize < 0) | 442 | if (nsize < 0) |
441 | nsize = 0; | 443 | nsize = 0; |
@@ -599,7 +601,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) | |||
599 | for TCP options, but includes only bare TCP header. | 601 | for TCP options, but includes only bare TCP header. |
600 | 602 | ||
601 | tp->rx_opt.mss_clamp is mss negotiated at connection setup. | 603 | tp->rx_opt.mss_clamp is mss negotiated at connection setup. |
602 | It is minumum of user_mss and mss received with SYN. | 604 | It is minimum of user_mss and mss received with SYN. |
603 | It also does not include TCP options. | 605 | It also does not include TCP options. |
604 | 606 | ||
605 | tp->pmtu_cookie is last pmtu, seen by this function. | 607 | tp->pmtu_cookie is last pmtu, seen by this function. |
@@ -1171,7 +1173,7 @@ u32 __tcp_select_window(struct sock *sk) | |||
1171 | { | 1173 | { |
1172 | struct inet_connection_sock *icsk = inet_csk(sk); | 1174 | struct inet_connection_sock *icsk = inet_csk(sk); |
1173 | struct tcp_sock *tp = tcp_sk(sk); | 1175 | struct tcp_sock *tp = tcp_sk(sk); |
1174 | /* MSS for the peer's data. Previous verions used mss_clamp | 1176 | /* MSS for the peer's data. Previous versions used mss_clamp |
1175 | * here. I don't know if the value based on our guesses | 1177 | * here. I don't know if the value based on our guesses |
1176 | * of peer's MSS is better for the performance. It's more correct | 1178 | * of peer's MSS is better for the performance. It's more correct |
1177 | * but may be worse for the performance because of rcv_mss | 1179 | * but may be worse for the performance because of rcv_mss |
@@ -1260,7 +1262,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m | |||
1260 | BUG_ON(tcp_skb_pcount(skb) != 1 || | 1262 | BUG_ON(tcp_skb_pcount(skb) != 1 || |
1261 | tcp_skb_pcount(next_skb) != 1); | 1263 | tcp_skb_pcount(next_skb) != 1); |
1262 | 1264 | ||
1263 | /* Ok. We will be able to collapse the packet. */ | 1265 | /* changing transmit queue under us so clear hints */ |
1266 | clear_all_retrans_hints(tp); | ||
1267 | |||
1268 | /* Ok. We will be able to collapse the packet. */ | ||
1264 | __skb_unlink(next_skb, &sk->sk_write_queue); | 1269 | __skb_unlink(next_skb, &sk->sk_write_queue); |
1265 | 1270 | ||
1266 | memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); | 1271 | memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); |
@@ -1330,6 +1335,8 @@ void tcp_simple_retransmit(struct sock *sk) | |||
1330 | } | 1335 | } |
1331 | } | 1336 | } |
1332 | 1337 | ||
1338 | clear_all_retrans_hints(tp); | ||
1339 | |||
1333 | if (!lost) | 1340 | if (!lost) |
1334 | return; | 1341 | return; |
1335 | 1342 | ||
@@ -1361,7 +1368,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
1361 | int err; | 1368 | int err; |
1362 | 1369 | ||
1363 | /* Do not sent more than we queued. 1/4 is reserved for possible | 1370 | /* Do not sent more than we queued. 1/4 is reserved for possible |
1364 | * copying overhead: frgagmentation, tunneling, mangling etc. | 1371 | * copying overhead: fragmentation, tunneling, mangling etc. |
1365 | */ | 1372 | */ |
1366 | if (atomic_read(&sk->sk_wmem_alloc) > | 1373 | if (atomic_read(&sk->sk_wmem_alloc) > |
1367 | min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) | 1374 | min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) |
@@ -1468,13 +1475,25 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
1468 | const struct inet_connection_sock *icsk = inet_csk(sk); | 1475 | const struct inet_connection_sock *icsk = inet_csk(sk); |
1469 | struct tcp_sock *tp = tcp_sk(sk); | 1476 | struct tcp_sock *tp = tcp_sk(sk); |
1470 | struct sk_buff *skb; | 1477 | struct sk_buff *skb; |
1471 | int packet_cnt = tp->lost_out; | 1478 | int packet_cnt; |
1479 | |||
1480 | if (tp->retransmit_skb_hint) { | ||
1481 | skb = tp->retransmit_skb_hint; | ||
1482 | packet_cnt = tp->retransmit_cnt_hint; | ||
1483 | }else{ | ||
1484 | skb = sk->sk_write_queue.next; | ||
1485 | packet_cnt = 0; | ||
1486 | } | ||
1472 | 1487 | ||
1473 | /* First pass: retransmit lost packets. */ | 1488 | /* First pass: retransmit lost packets. */ |
1474 | if (packet_cnt) { | 1489 | if (tp->lost_out) { |
1475 | sk_stream_for_retrans_queue(skb, sk) { | 1490 | sk_stream_for_retrans_queue_from(skb, sk) { |
1476 | __u8 sacked = TCP_SKB_CB(skb)->sacked; | 1491 | __u8 sacked = TCP_SKB_CB(skb)->sacked; |
1477 | 1492 | ||
1493 | /* we could do better than to assign each time */ | ||
1494 | tp->retransmit_skb_hint = skb; | ||
1495 | tp->retransmit_cnt_hint = packet_cnt; | ||
1496 | |||
1478 | /* Assume this retransmit will generate | 1497 | /* Assume this retransmit will generate |
1479 | * only one packet for congestion window | 1498 | * only one packet for congestion window |
1480 | * calculation purposes. This works because | 1499 | * calculation purposes. This works because |
@@ -1485,10 +1504,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
1485 | if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) | 1504 | if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) |
1486 | return; | 1505 | return; |
1487 | 1506 | ||
1488 | if (sacked&TCPCB_LOST) { | 1507 | if (sacked & TCPCB_LOST) { |
1489 | if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { | 1508 | if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { |
1490 | if (tcp_retransmit_skb(sk, skb)) | 1509 | if (tcp_retransmit_skb(sk, skb)) { |
1510 | tp->retransmit_skb_hint = NULL; | ||
1491 | return; | 1511 | return; |
1512 | } | ||
1492 | if (icsk->icsk_ca_state != TCP_CA_Loss) | 1513 | if (icsk->icsk_ca_state != TCP_CA_Loss) |
1493 | NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); | 1514 | NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); |
1494 | else | 1515 | else |
@@ -1501,8 +1522,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
1501 | TCP_RTO_MAX); | 1522 | TCP_RTO_MAX); |
1502 | } | 1523 | } |
1503 | 1524 | ||
1504 | packet_cnt -= tcp_skb_pcount(skb); | 1525 | packet_cnt += tcp_skb_pcount(skb); |
1505 | if (packet_cnt <= 0) | 1526 | if (packet_cnt >= tp->lost_out) |
1506 | break; | 1527 | break; |
1507 | } | 1528 | } |
1508 | } | 1529 | } |
@@ -1528,9 +1549,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
1528 | if (tcp_may_send_now(sk, tp)) | 1549 | if (tcp_may_send_now(sk, tp)) |
1529 | return; | 1550 | return; |
1530 | 1551 | ||
1531 | packet_cnt = 0; | 1552 | if (tp->forward_skb_hint) { |
1553 | skb = tp->forward_skb_hint; | ||
1554 | packet_cnt = tp->forward_cnt_hint; | ||
1555 | } else{ | ||
1556 | skb = sk->sk_write_queue.next; | ||
1557 | packet_cnt = 0; | ||
1558 | } | ||
1559 | |||
1560 | sk_stream_for_retrans_queue_from(skb, sk) { | ||
1561 | tp->forward_cnt_hint = packet_cnt; | ||
1562 | tp->forward_skb_hint = skb; | ||
1532 | 1563 | ||
1533 | sk_stream_for_retrans_queue(skb, sk) { | ||
1534 | /* Similar to the retransmit loop above we | 1564 | /* Similar to the retransmit loop above we |
1535 | * can pretend that the retransmitted SKB | 1565 | * can pretend that the retransmitted SKB |
1536 | * we send out here will be composed of one | 1566 | * we send out here will be composed of one |
@@ -1547,8 +1577,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
1547 | continue; | 1577 | continue; |
1548 | 1578 | ||
1549 | /* Ok, retransmit it. */ | 1579 | /* Ok, retransmit it. */ |
1550 | if (tcp_retransmit_skb(sk, skb)) | 1580 | if (tcp_retransmit_skb(sk, skb)) { |
1581 | tp->forward_skb_hint = NULL; | ||
1551 | break; | 1582 | break; |
1583 | } | ||
1552 | 1584 | ||
1553 | if (skb == skb_peek(&sk->sk_write_queue)) | 1585 | if (skb == skb_peek(&sk->sk_write_queue)) |
1554 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | 1586 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
@@ -2058,3 +2090,4 @@ EXPORT_SYMBOL(tcp_connect); | |||
2058 | EXPORT_SYMBOL(tcp_make_synack); | 2090 | EXPORT_SYMBOL(tcp_make_synack); |
2059 | EXPORT_SYMBOL(tcp_simple_retransmit); | 2091 | EXPORT_SYMBOL(tcp_simple_retransmit); |
2060 | EXPORT_SYMBOL(tcp_sync_mss); | 2092 | EXPORT_SYMBOL(tcp_sync_mss); |
2093 | EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor); | ||
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 327770bf5522..26d7486ee501 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c | |||
@@ -20,20 +20,20 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, | |||
20 | u32 in_flight, int flag) | 20 | u32 in_flight, int flag) |
21 | { | 21 | { |
22 | struct tcp_sock *tp = tcp_sk(sk); | 22 | struct tcp_sock *tp = tcp_sk(sk); |
23 | if (in_flight < tp->snd_cwnd) | 23 | |
24 | if (!tcp_is_cwnd_limited(sk, in_flight)) | ||
24 | return; | 25 | return; |
25 | 26 | ||
26 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | 27 | if (tp->snd_cwnd <= tp->snd_ssthresh) |
27 | tp->snd_cwnd++; | 28 | tcp_slow_start(tp); |
28 | } else { | 29 | else { |
29 | tp->snd_cwnd_cnt++; | 30 | tp->snd_cwnd_cnt++; |
30 | if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ | 31 | if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ |
31 | tp->snd_cwnd++; | 32 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) |
33 | tp->snd_cwnd++; | ||
32 | tp->snd_cwnd_cnt = 0; | 34 | tp->snd_cwnd_cnt = 0; |
33 | } | 35 | } |
34 | } | 36 | } |
35 | tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); | ||
36 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
37 | } | 37 | } |
38 | 38 | ||
39 | static u32 tcp_scalable_ssthresh(struct sock *sk) | 39 | static u32 tcp_scalable_ssthresh(struct sock *sk) |
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 415ee47ac1c5..e1880959614a 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c | |||
@@ -58,7 +58,7 @@ static void tcp_write_err(struct sock *sk) | |||
58 | * to prevent DoS attacks. It is called when a retransmission timeout | 58 | * to prevent DoS attacks. It is called when a retransmission timeout |
59 | * or zero probe timeout occurs on orphaned socket. | 59 | * or zero probe timeout occurs on orphaned socket. |
60 | * | 60 | * |
61 | * Criterium is still not confirmed experimentally and may change. | 61 | * Criteria is still not confirmed experimentally and may change. |
62 | * We kill the socket, if: | 62 | * We kill the socket, if: |
63 | * 1. If number of orphaned sockets exceeds an administratively configured | 63 | * 1. If number of orphaned sockets exceeds an administratively configured |
64 | * limit. | 64 | * limit. |
@@ -132,7 +132,7 @@ static int tcp_write_timeout(struct sock *sk) | |||
132 | hole detection. :-( | 132 | hole detection. :-( |
133 | 133 | ||
134 | It is place to make it. It is not made. I do not want | 134 | It is place to make it. It is not made. I do not want |
135 | to make it. It is disguisting. It does not work in any | 135 | to make it. It is disgusting. It does not work in any |
136 | case. Let me to cite the same draft, which requires for | 136 | case. Let me to cite the same draft, which requires for |
137 | us to implement this: | 137 | us to implement this: |
138 | 138 | ||
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 93c5f92070f9..b7d296a8ac6d 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c | |||
@@ -236,8 +236,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, | |||
236 | /* We don't have enough RTT samples to do the Vegas | 236 | /* We don't have enough RTT samples to do the Vegas |
237 | * calculation, so we'll behave like Reno. | 237 | * calculation, so we'll behave like Reno. |
238 | */ | 238 | */ |
239 | if (tp->snd_cwnd > tp->snd_ssthresh) | 239 | tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); |
240 | tp->snd_cwnd++; | ||
241 | } else { | 240 | } else { |
242 | u32 rtt, target_cwnd, diff; | 241 | u32 rtt, target_cwnd, diff; |
243 | 242 | ||
@@ -275,7 +274,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, | |||
275 | */ | 274 | */ |
276 | diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; | 275 | diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; |
277 | 276 | ||
278 | if (tp->snd_cwnd < tp->snd_ssthresh) { | 277 | if (tp->snd_cwnd <= tp->snd_ssthresh) { |
279 | /* Slow start. */ | 278 | /* Slow start. */ |
280 | if (diff > gamma) { | 279 | if (diff > gamma) { |
281 | /* Going too fast. Time to slow down | 280 | /* Going too fast. Time to slow down |
@@ -295,6 +294,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, | |||
295 | V_PARAM_SHIFT)+1); | 294 | V_PARAM_SHIFT)+1); |
296 | 295 | ||
297 | } | 296 | } |
297 | tcp_slow_start(tp); | ||
298 | } else { | 298 | } else { |
299 | /* Congestion avoidance. */ | 299 | /* Congestion avoidance. */ |
300 | u32 next_snd_cwnd; | 300 | u32 next_snd_cwnd; |
@@ -327,37 +327,17 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, | |||
327 | else if (next_snd_cwnd < tp->snd_cwnd) | 327 | else if (next_snd_cwnd < tp->snd_cwnd) |
328 | tp->snd_cwnd--; | 328 | tp->snd_cwnd--; |
329 | } | 329 | } |
330 | } | ||
331 | 330 | ||
332 | /* Wipe the slate clean for the next RTT. */ | 331 | if (tp->snd_cwnd < 2) |
333 | vegas->cntRTT = 0; | 332 | tp->snd_cwnd = 2; |
334 | vegas->minRTT = 0x7fffffff; | 333 | else if (tp->snd_cwnd > tp->snd_cwnd_clamp) |
334 | tp->snd_cwnd = tp->snd_cwnd_clamp; | ||
335 | } | ||
335 | } | 336 | } |
336 | 337 | ||
337 | /* The following code is executed for every ack we receive, | 338 | /* Wipe the slate clean for the next RTT. */ |
338 | * except for conditions checked in should_advance_cwnd() | 339 | vegas->cntRTT = 0; |
339 | * before the call to tcp_cong_avoid(). Mainly this means that | 340 | vegas->minRTT = 0x7fffffff; |
340 | * we only execute this code if the ack actually acked some | ||
341 | * data. | ||
342 | */ | ||
343 | |||
344 | /* If we are in slow start, increase our cwnd in response to this ACK. | ||
345 | * (If we are not in slow start then we are in congestion avoidance, | ||
346 | * and adjust our congestion window only once per RTT. See the code | ||
347 | * above.) | ||
348 | */ | ||
349 | if (tp->snd_cwnd <= tp->snd_ssthresh) | ||
350 | tp->snd_cwnd++; | ||
351 | |||
352 | /* to keep cwnd from growing without bound */ | ||
353 | tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); | ||
354 | |||
355 | /* Make sure that we are never so timid as to reduce our cwnd below | ||
356 | * 2 MSS. | ||
357 | * | ||
358 | * Going below 2 MSS would risk huge delayed ACKs from our receiver. | ||
359 | */ | ||
360 | tp->snd_cwnd = max(tp->snd_cwnd, 2U); | ||
361 | } | 341 | } |
362 | 342 | ||
363 | /* Extract info for Tcp socket info provided via netlink. */ | 343 | /* Extract info for Tcp socket info provided via netlink. */ |
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e0bd1013cb0d..2422a5f7195d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
@@ -761,7 +761,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) | |||
761 | 761 | ||
762 | static __inline__ int __udp_checksum_complete(struct sk_buff *skb) | 762 | static __inline__ int __udp_checksum_complete(struct sk_buff *skb) |
763 | { | 763 | { |
764 | return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); | 764 | return __skb_checksum_complete(skb); |
765 | } | 765 | } |
766 | 766 | ||
767 | static __inline__ int udp_checksum_complete(struct sk_buff *skb) | 767 | static __inline__ int udp_checksum_complete(struct sk_buff *skb) |
@@ -1100,11 +1100,8 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, | |||
1100 | if (uh->check == 0) { | 1100 | if (uh->check == 0) { |
1101 | skb->ip_summed = CHECKSUM_UNNECESSARY; | 1101 | skb->ip_summed = CHECKSUM_UNNECESSARY; |
1102 | } else if (skb->ip_summed == CHECKSUM_HW) { | 1102 | } else if (skb->ip_summed == CHECKSUM_HW) { |
1103 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
1104 | if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) | 1103 | if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) |
1105 | return 0; | 1104 | skb->ip_summed = CHECKSUM_UNNECESSARY; |
1106 | LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n"); | ||
1107 | skb->ip_summed = CHECKSUM_NONE; | ||
1108 | } | 1105 | } |
1109 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) | 1106 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) |
1110 | skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); | 1107 | skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); |
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b7a5f51238b3..ddcf7754eec2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c | |||
@@ -1022,6 +1022,7 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev, | |||
1022 | continue; | 1022 | continue; |
1023 | } | 1023 | } |
1024 | 1024 | ||
1025 | #ifdef CONFIG_IPV6_PRIVACY | ||
1025 | /* Rule 7: Prefer public address | 1026 | /* Rule 7: Prefer public address |
1026 | * Note: prefer temprary address if use_tempaddr >= 2 | 1027 | * Note: prefer temprary address if use_tempaddr >= 2 |
1027 | */ | 1028 | */ |
@@ -1042,7 +1043,7 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev, | |||
1042 | if (hiscore.attrs & IPV6_SADDR_SCORE_PRIVACY) | 1043 | if (hiscore.attrs & IPV6_SADDR_SCORE_PRIVACY) |
1043 | continue; | 1044 | continue; |
1044 | } | 1045 | } |
1045 | 1046 | #endif | |
1046 | /* Rule 8: Use longest matching prefix */ | 1047 | /* Rule 8: Use longest matching prefix */ |
1047 | if (hiscore.rule < 8) | 1048 | if (hiscore.rule < 8) |
1048 | hiscore.matchlen = ipv6_addr_diff(&ifa_result->addr, daddr); | 1049 | hiscore.matchlen = ipv6_addr_diff(&ifa_result->addr, daddr); |
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 4f8795af2edb..c63b8ce0e1b5 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c | |||
@@ -699,12 +699,14 @@ static int __init inet6_init(void) | |||
699 | /* Register the family here so that the init calls below will | 699 | /* Register the family here so that the init calls below will |
700 | * be able to create sockets. (?? is this dangerous ??) | 700 | * be able to create sockets. (?? is this dangerous ??) |
701 | */ | 701 | */ |
702 | (void) sock_register(&inet6_family_ops); | 702 | err = sock_register(&inet6_family_ops); |
703 | if (err) | ||
704 | goto out_unregister_raw_proto; | ||
703 | 705 | ||
704 | /* Initialise ipv6 mibs */ | 706 | /* Initialise ipv6 mibs */ |
705 | err = init_ipv6_mibs(); | 707 | err = init_ipv6_mibs(); |
706 | if (err) | 708 | if (err) |
707 | goto out_unregister_raw_proto; | 709 | goto out_unregister_sock; |
708 | 710 | ||
709 | /* | 711 | /* |
710 | * ipngwg API draft makes clear that the correct semantics | 712 | * ipngwg API draft makes clear that the correct semantics |
@@ -796,6 +798,8 @@ icmp_fail: | |||
796 | ipv6_sysctl_unregister(); | 798 | ipv6_sysctl_unregister(); |
797 | #endif | 799 | #endif |
798 | cleanup_ipv6_mibs(); | 800 | cleanup_ipv6_mibs(); |
801 | out_unregister_sock: | ||
802 | sock_unregister(PF_INET6); | ||
799 | out_unregister_raw_proto: | 803 | out_unregister_raw_proto: |
800 | proto_unregister(&rawv6_prot); | 804 | proto_unregister(&rawv6_prot); |
801 | out_unregister_udp_proto: | 805 | out_unregister_udp_proto: |
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 23e540365a14..1bdf0fb8bf8a 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c | |||
@@ -585,17 +585,16 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) | |||
585 | daddr = &skb->nh.ipv6h->daddr; | 585 | daddr = &skb->nh.ipv6h->daddr; |
586 | 586 | ||
587 | /* Perform checksum. */ | 587 | /* Perform checksum. */ |
588 | if (skb->ip_summed == CHECKSUM_HW) { | 588 | switch (skb->ip_summed) { |
589 | skb->ip_summed = CHECKSUM_UNNECESSARY; | 589 | case CHECKSUM_HW: |
590 | if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, | 590 | if (!csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, |
591 | skb->csum)) { | 591 | skb->csum)) |
592 | LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 hw checksum failed\n"); | 592 | break; |
593 | skb->ip_summed = CHECKSUM_NONE; | 593 | /* fall through */ |
594 | } | 594 | case CHECKSUM_NONE: |
595 | } | 595 | skb->csum = ~csum_ipv6_magic(saddr, daddr, skb->len, |
596 | if (skb->ip_summed == CHECKSUM_NONE) { | 596 | IPPROTO_ICMPV6, 0); |
597 | if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, | 597 | if (__skb_checksum_complete(skb)) { |
598 | skb_checksum(skb, 0, skb->len, 0))) { | ||
599 | LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", | 598 | LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", |
600 | NIP6(*saddr), NIP6(*daddr)); | 599 | NIP6(*saddr), NIP6(*daddr)); |
601 | goto discard_it; | 600 | goto discard_it; |
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 6e3480426939..a6026d2787d2 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c | |||
@@ -176,6 +176,11 @@ resubmit: | |||
176 | if (ipprot->flags & INET6_PROTO_FINAL) { | 176 | if (ipprot->flags & INET6_PROTO_FINAL) { |
177 | struct ipv6hdr *hdr; | 177 | struct ipv6hdr *hdr; |
178 | 178 | ||
179 | /* Free reference early: we don't need it any more, | ||
180 | and it may hold ip_conntrack module loaded | ||
181 | indefinitely. */ | ||
182 | nf_reset(skb); | ||
183 | |||
179 | skb_postpull_rcsum(skb, skb->nh.raw, | 184 | skb_postpull_rcsum(skb, skb->nh.raw, |
180 | skb->h.raw - skb->nh.raw); | 185 | skb->h.raw - skb->nh.raw); |
181 | hdr = skb->nh.ipv6h; | 186 | hdr = skb->nh.ipv6h; |
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index dbd9767b32e4..c1fa693511a1 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c | |||
@@ -441,9 +441,15 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) | |||
441 | #ifdef CONFIG_NETFILTER | 441 | #ifdef CONFIG_NETFILTER |
442 | to->nfmark = from->nfmark; | 442 | to->nfmark = from->nfmark; |
443 | /* Connection association is same as pre-frag packet */ | 443 | /* Connection association is same as pre-frag packet */ |
444 | nf_conntrack_put(to->nfct); | ||
444 | to->nfct = from->nfct; | 445 | to->nfct = from->nfct; |
445 | nf_conntrack_get(to->nfct); | 446 | nf_conntrack_get(to->nfct); |
446 | to->nfctinfo = from->nfctinfo; | 447 | to->nfctinfo = from->nfctinfo; |
448 | #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) | ||
449 | nf_conntrack_put_reasm(to->nfct_reasm); | ||
450 | to->nfct_reasm = from->nfct_reasm; | ||
451 | nf_conntrack_get_reasm(to->nfct_reasm); | ||
452 | #endif | ||
447 | #ifdef CONFIG_BRIDGE_NETFILTER | 453 | #ifdef CONFIG_BRIDGE_NETFILTER |
448 | nf_bridge_put(to->nf_bridge); | 454 | nf_bridge_put(to->nf_bridge); |
449 | to->nf_bridge = from->nf_bridge; | 455 | to->nf_bridge = from->nf_bridge; |
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index e6b0e3954c02..e315d0f80af1 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c | |||
@@ -525,6 +525,7 @@ ip6ip6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) | |||
525 | 525 | ||
526 | if ((t = ip6ip6_tnl_lookup(&ipv6h->saddr, &ipv6h->daddr)) != NULL) { | 526 | if ((t = ip6ip6_tnl_lookup(&ipv6h->saddr, &ipv6h->daddr)) != NULL) { |
527 | if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { | 527 | if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { |
528 | read_unlock(&ip6ip6_lock); | ||
528 | kfree_skb(skb); | 529 | kfree_skb(skb); |
529 | return 0; | 530 | return 0; |
530 | } | 531 | } |
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index bb7ccfe33f23..971ba60bf6e9 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig | |||
@@ -278,5 +278,19 @@ config IP6_NF_RAW | |||
278 | If you want to compile it as a module, say M here and read | 278 | If you want to compile it as a module, say M here and read |
279 | <file:Documentation/modules.txt>. If unsure, say `N'. | 279 | <file:Documentation/modules.txt>. If unsure, say `N'. |
280 | 280 | ||
281 | config NF_CONNTRACK_IPV6 | ||
282 | tristate "IPv6 support for new connection tracking (EXPERIMENTAL)" | ||
283 | depends on EXPERIMENTAL && NF_CONNTRACK | ||
284 | ---help--- | ||
285 | Connection tracking keeps a record of what packets have passed | ||
286 | through your machine, in order to figure out how they are related | ||
287 | into connections. | ||
288 | |||
289 | This is IPv6 support on Layer 3 independent connection tracking. | ||
290 | Layer 3 independent connection tracking is experimental scheme | ||
291 | which generalize ip_conntrack to support other layer 3 protocols. | ||
292 | |||
293 | To compile it as a module, choose M here. If unsure, say N. | ||
294 | |||
281 | endmenu | 295 | endmenu |
282 | 296 | ||
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 2b2c370e8b1c..9ab5b2ca1f59 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile | |||
@@ -27,3 +27,9 @@ obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o | |||
27 | obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o | 27 | obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o |
28 | obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o | 28 | obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o |
29 | obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o | 29 | obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o |
30 | |||
31 | # objects for l3 independent conntrack | ||
32 | nf_conntrack_ipv6-objs := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o nf_conntrack_reasm.o | ||
33 | |||
34 | # l3 independent conntrack | ||
35 | obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o | ||
diff --git a/net/ipv6/netfilter/ip6t_MARK.c b/net/ipv6/netfilter/ip6t_MARK.c index 0c7584f92172..eab8fb864ee0 100644 --- a/net/ipv6/netfilter/ip6t_MARK.c +++ b/net/ipv6/netfilter/ip6t_MARK.c | |||
@@ -56,9 +56,9 @@ checkentry(const char *tablename, | |||
56 | return 1; | 56 | return 1; |
57 | } | 57 | } |
58 | 58 | ||
59 | static struct ip6t_target ip6t_mark_reg = { | 59 | static struct ip6t_target ip6t_mark_reg = { |
60 | .name = "MARK", | 60 | .name = "MARK", |
61 | .target = target, | 61 | .target = target, |
62 | .checkentry = checkentry, | 62 | .checkentry = checkentry, |
63 | .me = THIS_MODULE | 63 | .me = THIS_MODULE |
64 | }; | 64 | }; |
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c new file mode 100644 index 000000000000..e2c90b3a8074 --- /dev/null +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | |||
@@ -0,0 +1,556 @@ | |||
1 | /* | ||
2 | * Copyright (C)2004 USAGI/WIDE Project | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * Author: | ||
9 | * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> | ||
10 | * | ||
11 | * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> | ||
12 | * - support Layer 3 protocol independent connection tracking. | ||
13 | * Based on the original ip_conntrack code which had the following | ||
14 | * copyright information: | ||
15 | * (C) 1999-2001 Paul `Rusty' Russell | ||
16 | * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> | ||
17 | * | ||
18 | * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> | ||
19 | * - add get_features() to support various size of conntrack | ||
20 | * structures. | ||
21 | */ | ||
22 | |||
23 | #include <linux/config.h> | ||
24 | #include <linux/types.h> | ||
25 | #include <linux/ipv6.h> | ||
26 | #include <linux/in6.h> | ||
27 | #include <linux/netfilter.h> | ||
28 | #include <linux/module.h> | ||
29 | #include <linux/skbuff.h> | ||
30 | #include <linux/icmp.h> | ||
31 | #include <linux/sysctl.h> | ||
32 | #include <net/ipv6.h> | ||
33 | |||
34 | #include <linux/netfilter_ipv6.h> | ||
35 | #include <net/netfilter/nf_conntrack.h> | ||
36 | #include <net/netfilter/nf_conntrack_helper.h> | ||
37 | #include <net/netfilter/nf_conntrack_protocol.h> | ||
38 | #include <net/netfilter/nf_conntrack_l3proto.h> | ||
39 | #include <net/netfilter/nf_conntrack_core.h> | ||
40 | |||
41 | #if 0 | ||
42 | #define DEBUGP printk | ||
43 | #else | ||
44 | #define DEBUGP(format, args...) | ||
45 | #endif | ||
46 | |||
47 | DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); | ||
48 | |||
49 | static int ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, | ||
50 | struct nf_conntrack_tuple *tuple) | ||
51 | { | ||
52 | u_int32_t _addrs[8], *ap; | ||
53 | |||
54 | ap = skb_header_pointer(skb, nhoff + offsetof(struct ipv6hdr, saddr), | ||
55 | sizeof(_addrs), _addrs); | ||
56 | if (ap == NULL) | ||
57 | return 0; | ||
58 | |||
59 | memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); | ||
60 | memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); | ||
61 | |||
62 | return 1; | ||
63 | } | ||
64 | |||
65 | static int ipv6_invert_tuple(struct nf_conntrack_tuple *tuple, | ||
66 | const struct nf_conntrack_tuple *orig) | ||
67 | { | ||
68 | memcpy(tuple->src.u3.ip6, orig->dst.u3.ip6, sizeof(tuple->src.u3.ip6)); | ||
69 | memcpy(tuple->dst.u3.ip6, orig->src.u3.ip6, sizeof(tuple->dst.u3.ip6)); | ||
70 | |||
71 | return 1; | ||
72 | } | ||
73 | |||
74 | static int ipv6_print_tuple(struct seq_file *s, | ||
75 | const struct nf_conntrack_tuple *tuple) | ||
76 | { | ||
77 | return seq_printf(s, "src=%x:%x:%x:%x:%x:%x:%x:%x dst=%x:%x:%x:%x:%x:%x:%x:%x ", | ||
78 | NIP6(*((struct in6_addr *)tuple->src.u3.ip6)), | ||
79 | NIP6(*((struct in6_addr *)tuple->dst.u3.ip6))); | ||
80 | } | ||
81 | |||
82 | static int ipv6_print_conntrack(struct seq_file *s, | ||
83 | const struct nf_conn *conntrack) | ||
84 | { | ||
85 | return 0; | ||
86 | } | ||
87 | |||
88 | /* | ||
89 | * Based on ipv6_skip_exthdr() in net/ipv6/exthdr.c | ||
90 | * | ||
91 | * This function parses (probably truncated) exthdr set "hdr" | ||
92 | * of length "len". "nexthdrp" initially points to some place, | ||
93 | * where type of the first header can be found. | ||
94 | * | ||
95 | * It skips all well-known exthdrs, and returns pointer to the start | ||
96 | * of unparsable area i.e. the first header with unknown type. | ||
97 | * if success, *nexthdr is updated by type/protocol of this header. | ||
98 | * | ||
99 | * NOTES: - it may return pointer pointing beyond end of packet, | ||
100 | * if the last recognized header is truncated in the middle. | ||
101 | * - if packet is truncated, so that all parsed headers are skipped, | ||
102 | * it returns -1. | ||
103 | * - if packet is fragmented, return pointer of the fragment header. | ||
104 | * - ESP is unparsable for now and considered like | ||
105 | * normal payload protocol. | ||
106 | * - Note also special handling of AUTH header. Thanks to IPsec wizards. | ||
107 | */ | ||
108 | |||
109 | int nf_ct_ipv6_skip_exthdr(struct sk_buff *skb, int start, u8 *nexthdrp, | ||
110 | int len) | ||
111 | { | ||
112 | u8 nexthdr = *nexthdrp; | ||
113 | |||
114 | while (ipv6_ext_hdr(nexthdr)) { | ||
115 | struct ipv6_opt_hdr hdr; | ||
116 | int hdrlen; | ||
117 | |||
118 | if (len < (int)sizeof(struct ipv6_opt_hdr)) | ||
119 | return -1; | ||
120 | if (nexthdr == NEXTHDR_NONE) | ||
121 | break; | ||
122 | if (nexthdr == NEXTHDR_FRAGMENT) | ||
123 | break; | ||
124 | if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) | ||
125 | BUG(); | ||
126 | if (nexthdr == NEXTHDR_AUTH) | ||
127 | hdrlen = (hdr.hdrlen+2)<<2; | ||
128 | else | ||
129 | hdrlen = ipv6_optlen(&hdr); | ||
130 | |||
131 | nexthdr = hdr.nexthdr; | ||
132 | len -= hdrlen; | ||
133 | start += hdrlen; | ||
134 | } | ||
135 | |||
136 | *nexthdrp = nexthdr; | ||
137 | return start; | ||
138 | } | ||
139 | |||
140 | static int | ||
141 | ipv6_prepare(struct sk_buff **pskb, unsigned int hooknum, unsigned int *dataoff, | ||
142 | u_int8_t *protonum) | ||
143 | { | ||
144 | unsigned int extoff; | ||
145 | unsigned char pnum; | ||
146 | int protoff; | ||
147 | |||
148 | extoff = (u8*)((*pskb)->nh.ipv6h + 1) - (*pskb)->data; | ||
149 | pnum = (*pskb)->nh.ipv6h->nexthdr; | ||
150 | |||
151 | protoff = nf_ct_ipv6_skip_exthdr(*pskb, extoff, &pnum, | ||
152 | (*pskb)->len - extoff); | ||
153 | |||
154 | /* | ||
155 | * (protoff == (*pskb)->len) mean that the packet doesn't have no data | ||
156 | * except of IPv6 & ext headers. but it's tracked anyway. - YK | ||
157 | */ | ||
158 | if ((protoff < 0) || (protoff > (*pskb)->len)) { | ||
159 | DEBUGP("ip6_conntrack_core: can't find proto in pkt\n"); | ||
160 | NF_CT_STAT_INC(error); | ||
161 | NF_CT_STAT_INC(invalid); | ||
162 | return -NF_ACCEPT; | ||
163 | } | ||
164 | |||
165 | *dataoff = protoff; | ||
166 | *protonum = pnum; | ||
167 | return NF_ACCEPT; | ||
168 | } | ||
169 | |||
170 | static u_int32_t ipv6_get_features(const struct nf_conntrack_tuple *tuple) | ||
171 | { | ||
172 | return NF_CT_F_BASIC; | ||
173 | } | ||
174 | |||
175 | static unsigned int ipv6_confirm(unsigned int hooknum, | ||
176 | struct sk_buff **pskb, | ||
177 | const struct net_device *in, | ||
178 | const struct net_device *out, | ||
179 | int (*okfn)(struct sk_buff *)) | ||
180 | { | ||
181 | struct nf_conn *ct; | ||
182 | enum ip_conntrack_info ctinfo; | ||
183 | |||
184 | /* This is where we call the helper: as the packet goes out. */ | ||
185 | ct = nf_ct_get(*pskb, &ctinfo); | ||
186 | if (ct && ct->helper) { | ||
187 | unsigned int ret, protoff; | ||
188 | unsigned int extoff = (u8*)((*pskb)->nh.ipv6h + 1) | ||
189 | - (*pskb)->data; | ||
190 | unsigned char pnum = (*pskb)->nh.ipv6h->nexthdr; | ||
191 | |||
192 | protoff = nf_ct_ipv6_skip_exthdr(*pskb, extoff, &pnum, | ||
193 | (*pskb)->len - extoff); | ||
194 | if (protoff < 0 || protoff > (*pskb)->len || | ||
195 | pnum == NEXTHDR_FRAGMENT) { | ||
196 | DEBUGP("proto header not found\n"); | ||
197 | return NF_ACCEPT; | ||
198 | } | ||
199 | |||
200 | ret = ct->helper->help(pskb, protoff, ct, ctinfo); | ||
201 | if (ret != NF_ACCEPT) | ||
202 | return ret; | ||
203 | } | ||
204 | |||
205 | /* We've seen it coming out the other side: confirm it */ | ||
206 | |||
207 | return nf_conntrack_confirm(pskb); | ||
208 | } | ||
209 | |||
210 | extern struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb); | ||
211 | extern void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, | ||
212 | struct net_device *in, | ||
213 | struct net_device *out, | ||
214 | int (*okfn)(struct sk_buff *)); | ||
215 | static unsigned int ipv6_defrag(unsigned int hooknum, | ||
216 | struct sk_buff **pskb, | ||
217 | const struct net_device *in, | ||
218 | const struct net_device *out, | ||
219 | int (*okfn)(struct sk_buff *)) | ||
220 | { | ||
221 | struct sk_buff *reasm; | ||
222 | |||
223 | /* Previously seen (loopback)? */ | ||
224 | if ((*pskb)->nfct) | ||
225 | return NF_ACCEPT; | ||
226 | |||
227 | reasm = nf_ct_frag6_gather(*pskb); | ||
228 | |||
229 | /* queued */ | ||
230 | if (reasm == NULL) | ||
231 | return NF_STOLEN; | ||
232 | |||
233 | /* error occured or not fragmented */ | ||
234 | if (reasm == *pskb) | ||
235 | return NF_ACCEPT; | ||
236 | |||
237 | nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in, | ||
238 | (struct net_device *)out, okfn); | ||
239 | |||
240 | return NF_STOLEN; | ||
241 | } | ||
242 | |||
243 | static unsigned int ipv6_conntrack_in(unsigned int hooknum, | ||
244 | struct sk_buff **pskb, | ||
245 | const struct net_device *in, | ||
246 | const struct net_device *out, | ||
247 | int (*okfn)(struct sk_buff *)) | ||
248 | { | ||
249 | struct sk_buff *reasm = (*pskb)->nfct_reasm; | ||
250 | |||
251 | /* This packet is fragmented and has reassembled packet. */ | ||
252 | if (reasm) { | ||
253 | /* Reassembled packet isn't parsed yet ? */ | ||
254 | if (!reasm->nfct) { | ||
255 | unsigned int ret; | ||
256 | |||
257 | ret = nf_conntrack_in(PF_INET6, hooknum, &reasm); | ||
258 | if (ret != NF_ACCEPT) | ||
259 | return ret; | ||
260 | } | ||
261 | nf_conntrack_get(reasm->nfct); | ||
262 | (*pskb)->nfct = reasm->nfct; | ||
263 | return NF_ACCEPT; | ||
264 | } | ||
265 | |||
266 | return nf_conntrack_in(PF_INET6, hooknum, pskb); | ||
267 | } | ||
268 | |||
269 | static unsigned int ipv6_conntrack_local(unsigned int hooknum, | ||
270 | struct sk_buff **pskb, | ||
271 | const struct net_device *in, | ||
272 | const struct net_device *out, | ||
273 | int (*okfn)(struct sk_buff *)) | ||
274 | { | ||
275 | /* root is playing with raw sockets. */ | ||
276 | if ((*pskb)->len < sizeof(struct ipv6hdr)) { | ||
277 | if (net_ratelimit()) | ||
278 | printk("ipv6_conntrack_local: packet too short\n"); | ||
279 | return NF_ACCEPT; | ||
280 | } | ||
281 | return ipv6_conntrack_in(hooknum, pskb, in, out, okfn); | ||
282 | } | ||
283 | |||
284 | /* Connection tracking may drop packets, but never alters them, so | ||
285 | make it the first hook. */ | ||
286 | static struct nf_hook_ops ipv6_conntrack_defrag_ops = { | ||
287 | .hook = ipv6_defrag, | ||
288 | .owner = THIS_MODULE, | ||
289 | .pf = PF_INET6, | ||
290 | .hooknum = NF_IP6_PRE_ROUTING, | ||
291 | .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, | ||
292 | }; | ||
293 | |||
294 | static struct nf_hook_ops ipv6_conntrack_in_ops = { | ||
295 | .hook = ipv6_conntrack_in, | ||
296 | .owner = THIS_MODULE, | ||
297 | .pf = PF_INET6, | ||
298 | .hooknum = NF_IP6_PRE_ROUTING, | ||
299 | .priority = NF_IP6_PRI_CONNTRACK, | ||
300 | }; | ||
301 | |||
302 | static struct nf_hook_ops ipv6_conntrack_local_out_ops = { | ||
303 | .hook = ipv6_conntrack_local, | ||
304 | .owner = THIS_MODULE, | ||
305 | .pf = PF_INET6, | ||
306 | .hooknum = NF_IP6_LOCAL_OUT, | ||
307 | .priority = NF_IP6_PRI_CONNTRACK, | ||
308 | }; | ||
309 | |||
310 | static struct nf_hook_ops ipv6_conntrack_defrag_local_out_ops = { | ||
311 | .hook = ipv6_defrag, | ||
312 | .owner = THIS_MODULE, | ||
313 | .pf = PF_INET6, | ||
314 | .hooknum = NF_IP6_LOCAL_OUT, | ||
315 | .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, | ||
316 | }; | ||
317 | |||
318 | /* Refragmenter; last chance. */ | ||
319 | static struct nf_hook_ops ipv6_conntrack_out_ops = { | ||
320 | .hook = ipv6_confirm, | ||
321 | .owner = THIS_MODULE, | ||
322 | .pf = PF_INET6, | ||
323 | .hooknum = NF_IP6_POST_ROUTING, | ||
324 | .priority = NF_IP6_PRI_LAST, | ||
325 | }; | ||
326 | |||
327 | static struct nf_hook_ops ipv6_conntrack_local_in_ops = { | ||
328 | .hook = ipv6_confirm, | ||
329 | .owner = THIS_MODULE, | ||
330 | .pf = PF_INET6, | ||
331 | .hooknum = NF_IP6_LOCAL_IN, | ||
332 | .priority = NF_IP6_PRI_LAST-1, | ||
333 | }; | ||
334 | |||
335 | #ifdef CONFIG_SYSCTL | ||
336 | |||
337 | /* From nf_conntrack_proto_icmpv6.c */ | ||
338 | extern unsigned long nf_ct_icmpv6_timeout; | ||
339 | |||
340 | /* From nf_conntrack_frag6.c */ | ||
341 | extern unsigned long nf_ct_frag6_timeout; | ||
342 | extern unsigned long nf_ct_frag6_low_thresh; | ||
343 | extern unsigned long nf_ct_frag6_high_thresh; | ||
344 | |||
345 | static struct ctl_table_header *nf_ct_ipv6_sysctl_header; | ||
346 | |||
347 | static ctl_table nf_ct_sysctl_table[] = { | ||
348 | { | ||
349 | .ctl_name = NET_NF_CONNTRACK_ICMPV6_TIMEOUT, | ||
350 | .procname = "nf_conntrack_icmpv6_timeout", | ||
351 | .data = &nf_ct_icmpv6_timeout, | ||
352 | .maxlen = sizeof(unsigned int), | ||
353 | .mode = 0644, | ||
354 | .proc_handler = &proc_dointvec_jiffies, | ||
355 | }, | ||
356 | { | ||
357 | .ctl_name = NET_NF_CONNTRACK_FRAG6_TIMEOUT, | ||
358 | .procname = "nf_conntrack_frag6_timeout", | ||
359 | .data = &nf_ct_frag6_timeout, | ||
360 | .maxlen = sizeof(unsigned int), | ||
361 | .mode = 0644, | ||
362 | .proc_handler = &proc_dointvec_jiffies, | ||
363 | }, | ||
364 | { | ||
365 | .ctl_name = NET_NF_CONNTRACK_FRAG6_LOW_THRESH, | ||
366 | .procname = "nf_conntrack_frag6_low_thresh", | ||
367 | .data = &nf_ct_frag6_low_thresh, | ||
368 | .maxlen = sizeof(unsigned int), | ||
369 | .mode = 0644, | ||
370 | .proc_handler = &proc_dointvec_jiffies, | ||
371 | }, | ||
372 | { | ||
373 | .ctl_name = NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, | ||
374 | .procname = "nf_conntrack_frag6_high_thresh", | ||
375 | .data = &nf_ct_frag6_high_thresh, | ||
376 | .maxlen = sizeof(unsigned int), | ||
377 | .mode = 0644, | ||
378 | .proc_handler = &proc_dointvec_jiffies, | ||
379 | }, | ||
380 | { .ctl_name = 0 } | ||
381 | }; | ||
382 | |||
383 | static ctl_table nf_ct_netfilter_table[] = { | ||
384 | { | ||
385 | .ctl_name = NET_NETFILTER, | ||
386 | .procname = "netfilter", | ||
387 | .mode = 0555, | ||
388 | .child = nf_ct_sysctl_table, | ||
389 | }, | ||
390 | { .ctl_name = 0 } | ||
391 | }; | ||
392 | |||
393 | static ctl_table nf_ct_net_table[] = { | ||
394 | { | ||
395 | .ctl_name = CTL_NET, | ||
396 | .procname = "net", | ||
397 | .mode = 0555, | ||
398 | .child = nf_ct_netfilter_table, | ||
399 | }, | ||
400 | { .ctl_name = 0 } | ||
401 | }; | ||
402 | #endif | ||
403 | |||
404 | struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { | ||
405 | .l3proto = PF_INET6, | ||
406 | .name = "ipv6", | ||
407 | .pkt_to_tuple = ipv6_pkt_to_tuple, | ||
408 | .invert_tuple = ipv6_invert_tuple, | ||
409 | .print_tuple = ipv6_print_tuple, | ||
410 | .print_conntrack = ipv6_print_conntrack, | ||
411 | .prepare = ipv6_prepare, | ||
412 | .get_features = ipv6_get_features, | ||
413 | .me = THIS_MODULE, | ||
414 | }; | ||
415 | |||
416 | extern struct nf_conntrack_protocol nf_conntrack_protocol_tcp6; | ||
417 | extern struct nf_conntrack_protocol nf_conntrack_protocol_udp6; | ||
418 | extern struct nf_conntrack_protocol nf_conntrack_protocol_icmpv6; | ||
419 | extern int nf_ct_frag6_init(void); | ||
420 | extern void nf_ct_frag6_cleanup(void); | ||
421 | static int init_or_cleanup(int init) | ||
422 | { | ||
423 | int ret = 0; | ||
424 | |||
425 | if (!init) goto cleanup; | ||
426 | |||
427 | ret = nf_ct_frag6_init(); | ||
428 | if (ret < 0) { | ||
429 | printk("nf_conntrack_ipv6: can't initialize frag6.\n"); | ||
430 | goto cleanup_nothing; | ||
431 | } | ||
432 | ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_tcp6); | ||
433 | if (ret < 0) { | ||
434 | printk("nf_conntrack_ipv6: can't register tcp.\n"); | ||
435 | goto cleanup_frag6; | ||
436 | } | ||
437 | |||
438 | ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_udp6); | ||
439 | if (ret < 0) { | ||
440 | printk("nf_conntrack_ipv6: can't register udp.\n"); | ||
441 | goto cleanup_tcp; | ||
442 | } | ||
443 | |||
444 | ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_icmpv6); | ||
445 | if (ret < 0) { | ||
446 | printk("nf_conntrack_ipv6: can't register icmpv6.\n"); | ||
447 | goto cleanup_udp; | ||
448 | } | ||
449 | |||
450 | ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv6); | ||
451 | if (ret < 0) { | ||
452 | printk("nf_conntrack_ipv6: can't register ipv6\n"); | ||
453 | goto cleanup_icmpv6; | ||
454 | } | ||
455 | |||
456 | ret = nf_register_hook(&ipv6_conntrack_defrag_ops); | ||
457 | if (ret < 0) { | ||
458 | printk("nf_conntrack_ipv6: can't register pre-routing defrag " | ||
459 | "hook.\n"); | ||
460 | goto cleanup_ipv6; | ||
461 | } | ||
462 | |||
463 | ret = nf_register_hook(&ipv6_conntrack_defrag_local_out_ops); | ||
464 | if (ret < 0) { | ||
465 | printk("nf_conntrack_ipv6: can't register local_out defrag " | ||
466 | "hook.\n"); | ||
467 | goto cleanup_defragops; | ||
468 | } | ||
469 | |||
470 | ret = nf_register_hook(&ipv6_conntrack_in_ops); | ||
471 | if (ret < 0) { | ||
472 | printk("nf_conntrack_ipv6: can't register pre-routing hook.\n"); | ||
473 | goto cleanup_defraglocalops; | ||
474 | } | ||
475 | |||
476 | ret = nf_register_hook(&ipv6_conntrack_local_out_ops); | ||
477 | if (ret < 0) { | ||
478 | printk("nf_conntrack_ipv6: can't register local out hook.\n"); | ||
479 | goto cleanup_inops; | ||
480 | } | ||
481 | |||
482 | ret = nf_register_hook(&ipv6_conntrack_out_ops); | ||
483 | if (ret < 0) { | ||
484 | printk("nf_conntrack_ipv6: can't register post-routing hook.\n"); | ||
485 | goto cleanup_inandlocalops; | ||
486 | } | ||
487 | |||
488 | ret = nf_register_hook(&ipv6_conntrack_local_in_ops); | ||
489 | if (ret < 0) { | ||
490 | printk("nf_conntrack_ipv6: can't register local in hook.\n"); | ||
491 | goto cleanup_inoutandlocalops; | ||
492 | } | ||
493 | |||
494 | #ifdef CONFIG_SYSCTL | ||
495 | nf_ct_ipv6_sysctl_header = register_sysctl_table(nf_ct_net_table, 0); | ||
496 | if (nf_ct_ipv6_sysctl_header == NULL) { | ||
497 | printk("nf_conntrack: can't register to sysctl.\n"); | ||
498 | ret = -ENOMEM; | ||
499 | goto cleanup_localinops; | ||
500 | } | ||
501 | #endif | ||
502 | return ret; | ||
503 | |||
504 | cleanup: | ||
505 | synchronize_net(); | ||
506 | #ifdef CONFIG_SYSCTL | ||
507 | unregister_sysctl_table(nf_ct_ipv6_sysctl_header); | ||
508 | cleanup_localinops: | ||
509 | #endif | ||
510 | nf_unregister_hook(&ipv6_conntrack_local_in_ops); | ||
511 | cleanup_inoutandlocalops: | ||
512 | nf_unregister_hook(&ipv6_conntrack_out_ops); | ||
513 | cleanup_inandlocalops: | ||
514 | nf_unregister_hook(&ipv6_conntrack_local_out_ops); | ||
515 | cleanup_inops: | ||
516 | nf_unregister_hook(&ipv6_conntrack_in_ops); | ||
517 | cleanup_defraglocalops: | ||
518 | nf_unregister_hook(&ipv6_conntrack_defrag_local_out_ops); | ||
519 | cleanup_defragops: | ||
520 | nf_unregister_hook(&ipv6_conntrack_defrag_ops); | ||
521 | cleanup_ipv6: | ||
522 | nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6); | ||
523 | cleanup_icmpv6: | ||
524 | nf_conntrack_protocol_unregister(&nf_conntrack_protocol_icmpv6); | ||
525 | cleanup_udp: | ||
526 | nf_conntrack_protocol_unregister(&nf_conntrack_protocol_udp6); | ||
527 | cleanup_tcp: | ||
528 | nf_conntrack_protocol_unregister(&nf_conntrack_protocol_tcp6); | ||
529 | cleanup_frag6: | ||
530 | nf_ct_frag6_cleanup(); | ||
531 | cleanup_nothing: | ||
532 | return ret; | ||
533 | } | ||
534 | |||
535 | MODULE_LICENSE("GPL"); | ||
536 | MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI <yasuyuki.kozakai@toshiba.co.jp>"); | ||
537 | |||
538 | static int __init init(void) | ||
539 | { | ||
540 | need_nf_conntrack(); | ||
541 | return init_or_cleanup(1); | ||
542 | } | ||
543 | |||
544 | static void __exit fini(void) | ||
545 | { | ||
546 | init_or_cleanup(0); | ||
547 | } | ||
548 | |||
549 | module_init(init); | ||
550 | module_exit(fini); | ||
551 | |||
552 | void need_ip6_conntrack(void) | ||
553 | { | ||
554 | } | ||
555 | |||
556 | EXPORT_SYMBOL(need_ip6_conntrack); | ||
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c new file mode 100644 index 000000000000..c0f1da5497a9 --- /dev/null +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | |||
@@ -0,0 +1,272 @@ | |||
1 | /* | ||
2 | * Copyright (C)2003,2004 USAGI/WIDE Project | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * Author: | ||
9 | * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> | ||
10 | * | ||
11 | * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> | ||
12 | * - ICMPv6 tracking support. Derived from the original ip_conntrack code | ||
13 | * net/ipv4/netfilter/ip_conntrack_proto_icmp.c which had the following | ||
14 | * copyright information: | ||
15 | * (C) 1999-2001 Paul `Rusty' Russell | ||
16 | * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> | ||
17 | */ | ||
18 | |||
19 | #include <linux/types.h> | ||
20 | #include <linux/sched.h> | ||
21 | #include <linux/timer.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/netfilter.h> | ||
24 | #include <linux/in6.h> | ||
25 | #include <linux/icmpv6.h> | ||
26 | #include <linux/ipv6.h> | ||
27 | #include <net/ipv6.h> | ||
28 | #include <net/ip6_checksum.h> | ||
29 | #include <linux/seq_file.h> | ||
30 | #include <linux/netfilter_ipv6.h> | ||
31 | #include <net/netfilter/nf_conntrack_tuple.h> | ||
32 | #include <net/netfilter/nf_conntrack_protocol.h> | ||
33 | #include <net/netfilter/nf_conntrack_core.h> | ||
34 | #include <net/netfilter/ipv6/nf_conntrack_icmpv6.h> | ||
35 | |||
36 | unsigned long nf_ct_icmpv6_timeout = 30*HZ; | ||
37 | |||
38 | #if 0 | ||
39 | #define DEBUGP printk | ||
40 | #else | ||
41 | #define DEBUGP(format, args...) | ||
42 | #endif | ||
43 | |||
44 | static int icmpv6_pkt_to_tuple(const struct sk_buff *skb, | ||
45 | unsigned int dataoff, | ||
46 | struct nf_conntrack_tuple *tuple) | ||
47 | { | ||
48 | struct icmp6hdr _hdr, *hp; | ||
49 | |||
50 | hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); | ||
51 | if (hp == NULL) | ||
52 | return 0; | ||
53 | tuple->dst.u.icmp.type = hp->icmp6_type; | ||
54 | tuple->src.u.icmp.id = hp->icmp6_identifier; | ||
55 | tuple->dst.u.icmp.code = hp->icmp6_code; | ||
56 | |||
57 | return 1; | ||
58 | } | ||
59 | |||
60 | static int icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, | ||
61 | const struct nf_conntrack_tuple *orig) | ||
62 | { | ||
63 | /* Add 1; spaces filled with 0. */ | ||
64 | static u_int8_t invmap[] = { | ||
65 | [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, | ||
66 | [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, | ||
67 | [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_QUERY + 1, | ||
68 | [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_REPLY +1 | ||
69 | }; | ||
70 | |||
71 | __u8 type = orig->dst.u.icmp.type - 128; | ||
72 | if (type >= sizeof(invmap) || !invmap[type]) | ||
73 | return 0; | ||
74 | |||
75 | tuple->src.u.icmp.id = orig->src.u.icmp.id; | ||
76 | tuple->dst.u.icmp.type = invmap[type] - 1; | ||
77 | tuple->dst.u.icmp.code = orig->dst.u.icmp.code; | ||
78 | return 1; | ||
79 | } | ||
80 | |||
81 | /* Print out the per-protocol part of the tuple. */ | ||
82 | static int icmpv6_print_tuple(struct seq_file *s, | ||
83 | const struct nf_conntrack_tuple *tuple) | ||
84 | { | ||
85 | return seq_printf(s, "type=%u code=%u id=%u ", | ||
86 | tuple->dst.u.icmp.type, | ||
87 | tuple->dst.u.icmp.code, | ||
88 | ntohs(tuple->src.u.icmp.id)); | ||
89 | } | ||
90 | |||
91 | /* Print out the private part of the conntrack. */ | ||
92 | static int icmpv6_print_conntrack(struct seq_file *s, | ||
93 | const struct nf_conn *conntrack) | ||
94 | { | ||
95 | return 0; | ||
96 | } | ||
97 | |||
98 | /* Returns verdict for packet, or -1 for invalid. */ | ||
99 | static int icmpv6_packet(struct nf_conn *ct, | ||
100 | const struct sk_buff *skb, | ||
101 | unsigned int dataoff, | ||
102 | enum ip_conntrack_info ctinfo, | ||
103 | int pf, | ||
104 | unsigned int hooknum) | ||
105 | { | ||
106 | /* Try to delete connection immediately after all replies: | ||
107 | won't actually vanish as we still have skb, and del_timer | ||
108 | means this will only run once even if count hits zero twice | ||
109 | (theoretically possible with SMP) */ | ||
110 | if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { | ||
111 | if (atomic_dec_and_test(&ct->proto.icmp.count) | ||
112 | && del_timer(&ct->timeout)) | ||
113 | ct->timeout.function((unsigned long)ct); | ||
114 | } else { | ||
115 | atomic_inc(&ct->proto.icmp.count); | ||
116 | nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); | ||
117 | nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmpv6_timeout); | ||
118 | } | ||
119 | |||
120 | return NF_ACCEPT; | ||
121 | } | ||
122 | |||
123 | /* Called when a new connection for this protocol found. */ | ||
124 | static int icmpv6_new(struct nf_conn *conntrack, | ||
125 | const struct sk_buff *skb, | ||
126 | unsigned int dataoff) | ||
127 | { | ||
128 | static u_int8_t valid_new[] = { | ||
129 | [ICMPV6_ECHO_REQUEST - 128] = 1, | ||
130 | [ICMPV6_NI_QUERY - 128] = 1 | ||
131 | }; | ||
132 | |||
133 | if (conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128 >= sizeof(valid_new) | ||
134 | || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128]) { | ||
135 | /* Can't create a new ICMPv6 `conn' with this. */ | ||
136 | DEBUGP("icmp: can't create new conn with type %u\n", | ||
137 | conntrack->tuplehash[0].tuple.dst.u.icmp.type); | ||
138 | NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple); | ||
139 | return 0; | ||
140 | } | ||
141 | atomic_set(&conntrack->proto.icmp.count, 0); | ||
142 | return 1; | ||
143 | } | ||
144 | |||
145 | extern int | ||
146 | nf_ct_ipv6_skip_exthdr(struct sk_buff *skb, int start, u8 *nexthdrp, int len); | ||
147 | extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6; | ||
148 | static int | ||
149 | icmpv6_error_message(struct sk_buff *skb, | ||
150 | unsigned int icmp6off, | ||
151 | enum ip_conntrack_info *ctinfo, | ||
152 | unsigned int hooknum) | ||
153 | { | ||
154 | struct nf_conntrack_tuple intuple, origtuple; | ||
155 | struct nf_conntrack_tuple_hash *h; | ||
156 | struct icmp6hdr _hdr, *hp; | ||
157 | unsigned int inip6off; | ||
158 | struct nf_conntrack_protocol *inproto; | ||
159 | u_int8_t inprotonum; | ||
160 | unsigned int inprotoff; | ||
161 | |||
162 | NF_CT_ASSERT(skb->nfct == NULL); | ||
163 | |||
164 | hp = skb_header_pointer(skb, icmp6off, sizeof(_hdr), &_hdr); | ||
165 | if (hp == NULL) { | ||
166 | DEBUGP("icmpv6_error: Can't get ICMPv6 hdr.\n"); | ||
167 | return -NF_ACCEPT; | ||
168 | } | ||
169 | |||
170 | inip6off = icmp6off + sizeof(_hdr); | ||
171 | if (skb_copy_bits(skb, inip6off+offsetof(struct ipv6hdr, nexthdr), | ||
172 | &inprotonum, sizeof(inprotonum)) != 0) { | ||
173 | DEBUGP("icmpv6_error: Can't get nexthdr in inner IPv6 header.\n"); | ||
174 | return -NF_ACCEPT; | ||
175 | } | ||
176 | inprotoff = nf_ct_ipv6_skip_exthdr(skb, | ||
177 | inip6off + sizeof(struct ipv6hdr), | ||
178 | &inprotonum, | ||
179 | skb->len - inip6off | ||
180 | - sizeof(struct ipv6hdr)); | ||
181 | |||
182 | if ((inprotoff < 0) || (inprotoff > skb->len) || | ||
183 | (inprotonum == NEXTHDR_FRAGMENT)) { | ||
184 | DEBUGP("icmpv6_error: Can't get protocol header in ICMPv6 payload.\n"); | ||
185 | return -NF_ACCEPT; | ||
186 | } | ||
187 | |||
188 | inproto = nf_ct_find_proto(PF_INET6, inprotonum); | ||
189 | |||
190 | /* Are they talking about one of our connections? */ | ||
191 | if (!nf_ct_get_tuple(skb, inip6off, inprotoff, PF_INET6, inprotonum, | ||
192 | &origtuple, &nf_conntrack_l3proto_ipv6, inproto)) { | ||
193 | DEBUGP("icmpv6_error: Can't get tuple\n"); | ||
194 | return -NF_ACCEPT; | ||
195 | } | ||
196 | |||
197 | /* Ordinarily, we'd expect the inverted tupleproto, but it's | ||
198 | been preserved inside the ICMP. */ | ||
199 | if (!nf_ct_invert_tuple(&intuple, &origtuple, | ||
200 | &nf_conntrack_l3proto_ipv6, inproto)) { | ||
201 | DEBUGP("icmpv6_error: Can't invert tuple\n"); | ||
202 | return -NF_ACCEPT; | ||
203 | } | ||
204 | |||
205 | *ctinfo = IP_CT_RELATED; | ||
206 | |||
207 | h = nf_conntrack_find_get(&intuple, NULL); | ||
208 | if (!h) { | ||
209 | DEBUGP("icmpv6_error: no match\n"); | ||
210 | return -NF_ACCEPT; | ||
211 | } else { | ||
212 | if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) | ||
213 | *ctinfo += IP_CT_IS_REPLY; | ||
214 | } | ||
215 | |||
216 | /* Update skb to refer to this connection */ | ||
217 | skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; | ||
218 | skb->nfctinfo = *ctinfo; | ||
219 | return -NF_ACCEPT; | ||
220 | } | ||
221 | |||
222 | static int | ||
223 | icmpv6_error(struct sk_buff *skb, unsigned int dataoff, | ||
224 | enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum) | ||
225 | { | ||
226 | struct icmp6hdr _ih, *icmp6h; | ||
227 | |||
228 | icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); | ||
229 | if (icmp6h == NULL) { | ||
230 | if (LOG_INVALID(IPPROTO_ICMPV6)) | ||
231 | nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, | ||
232 | "nf_ct_icmpv6: short packet "); | ||
233 | return -NF_ACCEPT; | ||
234 | } | ||
235 | |||
236 | if (hooknum != NF_IP6_PRE_ROUTING) | ||
237 | goto skipped; | ||
238 | |||
239 | /* Ignore it if the checksum's bogus. */ | ||
240 | if (csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, | ||
241 | skb->len - dataoff, IPPROTO_ICMPV6, | ||
242 | skb_checksum(skb, dataoff, | ||
243 | skb->len - dataoff, 0))) { | ||
244 | nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, | ||
245 | "nf_ct_icmpv6: ICMPv6 checksum failed\n"); | ||
246 | return -NF_ACCEPT; | ||
247 | } | ||
248 | |||
249 | skipped: | ||
250 | |||
251 | /* is not error message ? */ | ||
252 | if (icmp6h->icmp6_type >= 128) | ||
253 | return NF_ACCEPT; | ||
254 | |||
255 | return icmpv6_error_message(skb, dataoff, ctinfo, hooknum); | ||
256 | } | ||
257 | |||
258 | struct nf_conntrack_protocol nf_conntrack_protocol_icmpv6 = | ||
259 | { | ||
260 | .l3proto = PF_INET6, | ||
261 | .proto = IPPROTO_ICMPV6, | ||
262 | .name = "icmpv6", | ||
263 | .pkt_to_tuple = icmpv6_pkt_to_tuple, | ||
264 | .invert_tuple = icmpv6_invert_tuple, | ||
265 | .print_tuple = icmpv6_print_tuple, | ||
266 | .print_conntrack = icmpv6_print_conntrack, | ||
267 | .packet = icmpv6_packet, | ||
268 | .new = icmpv6_new, | ||
269 | .error = icmpv6_error, | ||
270 | }; | ||
271 | |||
272 | EXPORT_SYMBOL(nf_conntrack_protocol_icmpv6); | ||
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c new file mode 100644 index 000000000000..7640b9bb7694 --- /dev/null +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c | |||
@@ -0,0 +1,885 @@ | |||
1 | /* | ||
2 | * IPv6 fragment reassembly for connection tracking | ||
3 | * | ||
4 | * Copyright (C)2004 USAGI/WIDE Project | ||
5 | * | ||
6 | * Author: | ||
7 | * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> | ||
8 | * | ||
9 | * Based on: net/ipv6/reassembly.c | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU General Public License | ||
13 | * as published by the Free Software Foundation; either version | ||
14 | * 2 of the License, or (at your option) any later version. | ||
15 | */ | ||
16 | |||
17 | #include <linux/config.h> | ||
18 | #include <linux/errno.h> | ||
19 | #include <linux/types.h> | ||
20 | #include <linux/string.h> | ||
21 | #include <linux/socket.h> | ||
22 | #include <linux/sockios.h> | ||
23 | #include <linux/jiffies.h> | ||
24 | #include <linux/net.h> | ||
25 | #include <linux/list.h> | ||
26 | #include <linux/netdevice.h> | ||
27 | #include <linux/in6.h> | ||
28 | #include <linux/ipv6.h> | ||
29 | #include <linux/icmpv6.h> | ||
30 | #include <linux/random.h> | ||
31 | #include <linux/jhash.h> | ||
32 | |||
33 | #include <net/sock.h> | ||
34 | #include <net/snmp.h> | ||
35 | |||
36 | #include <net/ipv6.h> | ||
37 | #include <net/protocol.h> | ||
38 | #include <net/transp_v6.h> | ||
39 | #include <net/rawv6.h> | ||
40 | #include <net/ndisc.h> | ||
41 | #include <net/addrconf.h> | ||
42 | #include <linux/sysctl.h> | ||
43 | #include <linux/netfilter.h> | ||
44 | #include <linux/netfilter_ipv6.h> | ||
45 | #include <linux/kernel.h> | ||
46 | #include <linux/module.h> | ||
47 | |||
48 | #if 0 | ||
49 | #define DEBUGP printk | ||
50 | #else | ||
51 | #define DEBUGP(format, args...) | ||
52 | #endif | ||
53 | |||
54 | #define NF_CT_FRAG6_HIGH_THRESH 262144 /* == 256*1024 */ | ||
55 | #define NF_CT_FRAG6_LOW_THRESH 196608 /* == 192*1024 */ | ||
56 | #define NF_CT_FRAG6_TIMEOUT IPV6_FRAG_TIMEOUT | ||
57 | |||
58 | int nf_ct_frag6_high_thresh = 256*1024; | ||
59 | int nf_ct_frag6_low_thresh = 192*1024; | ||
60 | int nf_ct_frag6_timeout = IPV6_FRAG_TIMEOUT; | ||
61 | |||
62 | struct nf_ct_frag6_skb_cb | ||
63 | { | ||
64 | struct inet6_skb_parm h; | ||
65 | int offset; | ||
66 | struct sk_buff *orig; | ||
67 | }; | ||
68 | |||
69 | #define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb*)((skb)->cb)) | ||
70 | |||
71 | struct nf_ct_frag6_queue | ||
72 | { | ||
73 | struct nf_ct_frag6_queue *next; | ||
74 | struct list_head lru_list; /* lru list member */ | ||
75 | |||
76 | __u32 id; /* fragment id */ | ||
77 | struct in6_addr saddr; | ||
78 | struct in6_addr daddr; | ||
79 | |||
80 | spinlock_t lock; | ||
81 | atomic_t refcnt; | ||
82 | struct timer_list timer; /* expire timer */ | ||
83 | struct sk_buff *fragments; | ||
84 | int len; | ||
85 | int meat; | ||
86 | struct timeval stamp; | ||
87 | unsigned int csum; | ||
88 | __u8 last_in; /* has first/last segment arrived? */ | ||
89 | #define COMPLETE 4 | ||
90 | #define FIRST_IN 2 | ||
91 | #define LAST_IN 1 | ||
92 | __u16 nhoffset; | ||
93 | struct nf_ct_frag6_queue **pprev; | ||
94 | }; | ||
95 | |||
96 | /* Hash table. */ | ||
97 | |||
98 | #define FRAG6Q_HASHSZ 64 | ||
99 | |||
100 | static struct nf_ct_frag6_queue *nf_ct_frag6_hash[FRAG6Q_HASHSZ]; | ||
101 | static rwlock_t nf_ct_frag6_lock = RW_LOCK_UNLOCKED; | ||
102 | static u32 nf_ct_frag6_hash_rnd; | ||
103 | static LIST_HEAD(nf_ct_frag6_lru_list); | ||
104 | int nf_ct_frag6_nqueues = 0; | ||
105 | |||
106 | static __inline__ void __fq_unlink(struct nf_ct_frag6_queue *fq) | ||
107 | { | ||
108 | if (fq->next) | ||
109 | fq->next->pprev = fq->pprev; | ||
110 | *fq->pprev = fq->next; | ||
111 | list_del(&fq->lru_list); | ||
112 | nf_ct_frag6_nqueues--; | ||
113 | } | ||
114 | |||
115 | static __inline__ void fq_unlink(struct nf_ct_frag6_queue *fq) | ||
116 | { | ||
117 | write_lock(&nf_ct_frag6_lock); | ||
118 | __fq_unlink(fq); | ||
119 | write_unlock(&nf_ct_frag6_lock); | ||
120 | } | ||
121 | |||
122 | static unsigned int ip6qhashfn(u32 id, struct in6_addr *saddr, | ||
123 | struct in6_addr *daddr) | ||
124 | { | ||
125 | u32 a, b, c; | ||
126 | |||
127 | a = saddr->s6_addr32[0]; | ||
128 | b = saddr->s6_addr32[1]; | ||
129 | c = saddr->s6_addr32[2]; | ||
130 | |||
131 | a += JHASH_GOLDEN_RATIO; | ||
132 | b += JHASH_GOLDEN_RATIO; | ||
133 | c += nf_ct_frag6_hash_rnd; | ||
134 | __jhash_mix(a, b, c); | ||
135 | |||
136 | a += saddr->s6_addr32[3]; | ||
137 | b += daddr->s6_addr32[0]; | ||
138 | c += daddr->s6_addr32[1]; | ||
139 | __jhash_mix(a, b, c); | ||
140 | |||
141 | a += daddr->s6_addr32[2]; | ||
142 | b += daddr->s6_addr32[3]; | ||
143 | c += id; | ||
144 | __jhash_mix(a, b, c); | ||
145 | |||
146 | return c & (FRAG6Q_HASHSZ - 1); | ||
147 | } | ||
148 | |||
149 | static struct timer_list nf_ct_frag6_secret_timer; | ||
150 | int nf_ct_frag6_secret_interval = 10 * 60 * HZ; | ||
151 | |||
152 | static void nf_ct_frag6_secret_rebuild(unsigned long dummy) | ||
153 | { | ||
154 | unsigned long now = jiffies; | ||
155 | int i; | ||
156 | |||
157 | write_lock(&nf_ct_frag6_lock); | ||
158 | get_random_bytes(&nf_ct_frag6_hash_rnd, sizeof(u32)); | ||
159 | for (i = 0; i < FRAG6Q_HASHSZ; i++) { | ||
160 | struct nf_ct_frag6_queue *q; | ||
161 | |||
162 | q = nf_ct_frag6_hash[i]; | ||
163 | while (q) { | ||
164 | struct nf_ct_frag6_queue *next = q->next; | ||
165 | unsigned int hval = ip6qhashfn(q->id, | ||
166 | &q->saddr, | ||
167 | &q->daddr); | ||
168 | |||
169 | if (hval != i) { | ||
170 | /* Unlink. */ | ||
171 | if (q->next) | ||
172 | q->next->pprev = q->pprev; | ||
173 | *q->pprev = q->next; | ||
174 | |||
175 | /* Relink to new hash chain. */ | ||
176 | if ((q->next = nf_ct_frag6_hash[hval]) != NULL) | ||
177 | q->next->pprev = &q->next; | ||
178 | nf_ct_frag6_hash[hval] = q; | ||
179 | q->pprev = &nf_ct_frag6_hash[hval]; | ||
180 | } | ||
181 | |||
182 | q = next; | ||
183 | } | ||
184 | } | ||
185 | write_unlock(&nf_ct_frag6_lock); | ||
186 | |||
187 | mod_timer(&nf_ct_frag6_secret_timer, now + nf_ct_frag6_secret_interval); | ||
188 | } | ||
189 | |||
190 | atomic_t nf_ct_frag6_mem = ATOMIC_INIT(0); | ||
191 | |||
192 | /* Memory Tracking Functions. */ | ||
193 | static inline void frag_kfree_skb(struct sk_buff *skb) | ||
194 | { | ||
195 | atomic_sub(skb->truesize, &nf_ct_frag6_mem); | ||
196 | if (NFCT_FRAG6_CB(skb)->orig) | ||
197 | kfree_skb(NFCT_FRAG6_CB(skb)->orig); | ||
198 | |||
199 | kfree_skb(skb); | ||
200 | } | ||
201 | |||
202 | static inline void frag_free_queue(struct nf_ct_frag6_queue *fq) | ||
203 | { | ||
204 | atomic_sub(sizeof(struct nf_ct_frag6_queue), &nf_ct_frag6_mem); | ||
205 | kfree(fq); | ||
206 | } | ||
207 | |||
208 | static inline struct nf_ct_frag6_queue *frag_alloc_queue(void) | ||
209 | { | ||
210 | struct nf_ct_frag6_queue *fq = kmalloc(sizeof(struct nf_ct_frag6_queue), GFP_ATOMIC); | ||
211 | |||
212 | if (!fq) | ||
213 | return NULL; | ||
214 | atomic_add(sizeof(struct nf_ct_frag6_queue), &nf_ct_frag6_mem); | ||
215 | return fq; | ||
216 | } | ||
217 | |||
218 | /* Destruction primitives. */ | ||
219 | |||
220 | /* Complete destruction of fq. */ | ||
221 | static void nf_ct_frag6_destroy(struct nf_ct_frag6_queue *fq) | ||
222 | { | ||
223 | struct sk_buff *fp; | ||
224 | |||
225 | BUG_TRAP(fq->last_in&COMPLETE); | ||
226 | BUG_TRAP(del_timer(&fq->timer) == 0); | ||
227 | |||
228 | /* Release all fragment data. */ | ||
229 | fp = fq->fragments; | ||
230 | while (fp) { | ||
231 | struct sk_buff *xp = fp->next; | ||
232 | |||
233 | frag_kfree_skb(fp); | ||
234 | fp = xp; | ||
235 | } | ||
236 | |||
237 | frag_free_queue(fq); | ||
238 | } | ||
239 | |||
240 | static __inline__ void fq_put(struct nf_ct_frag6_queue *fq) | ||
241 | { | ||
242 | if (atomic_dec_and_test(&fq->refcnt)) | ||
243 | nf_ct_frag6_destroy(fq); | ||
244 | } | ||
245 | |||
246 | /* Kill fq entry. It is not destroyed immediately, | ||
247 | * because caller (and someone more) holds reference count. | ||
248 | */ | ||
249 | static __inline__ void fq_kill(struct nf_ct_frag6_queue *fq) | ||
250 | { | ||
251 | if (del_timer(&fq->timer)) | ||
252 | atomic_dec(&fq->refcnt); | ||
253 | |||
254 | if (!(fq->last_in & COMPLETE)) { | ||
255 | fq_unlink(fq); | ||
256 | atomic_dec(&fq->refcnt); | ||
257 | fq->last_in |= COMPLETE; | ||
258 | } | ||
259 | } | ||
260 | |||
261 | static void nf_ct_frag6_evictor(void) | ||
262 | { | ||
263 | struct nf_ct_frag6_queue *fq; | ||
264 | struct list_head *tmp; | ||
265 | |||
266 | for (;;) { | ||
267 | if (atomic_read(&nf_ct_frag6_mem) <= nf_ct_frag6_low_thresh) | ||
268 | return; | ||
269 | read_lock(&nf_ct_frag6_lock); | ||
270 | if (list_empty(&nf_ct_frag6_lru_list)) { | ||
271 | read_unlock(&nf_ct_frag6_lock); | ||
272 | return; | ||
273 | } | ||
274 | tmp = nf_ct_frag6_lru_list.next; | ||
275 | fq = list_entry(tmp, struct nf_ct_frag6_queue, lru_list); | ||
276 | atomic_inc(&fq->refcnt); | ||
277 | read_unlock(&nf_ct_frag6_lock); | ||
278 | |||
279 | spin_lock(&fq->lock); | ||
280 | if (!(fq->last_in&COMPLETE)) | ||
281 | fq_kill(fq); | ||
282 | spin_unlock(&fq->lock); | ||
283 | |||
284 | fq_put(fq); | ||
285 | } | ||
286 | } | ||
287 | |||
288 | static void nf_ct_frag6_expire(unsigned long data) | ||
289 | { | ||
290 | struct nf_ct_frag6_queue *fq = (struct nf_ct_frag6_queue *) data; | ||
291 | |||
292 | spin_lock(&fq->lock); | ||
293 | |||
294 | if (fq->last_in & COMPLETE) | ||
295 | goto out; | ||
296 | |||
297 | fq_kill(fq); | ||
298 | |||
299 | out: | ||
300 | spin_unlock(&fq->lock); | ||
301 | fq_put(fq); | ||
302 | } | ||
303 | |||
304 | /* Creation primitives. */ | ||
305 | |||
306 | |||
307 | static struct nf_ct_frag6_queue *nf_ct_frag6_intern(unsigned int hash, | ||
308 | struct nf_ct_frag6_queue *fq_in) | ||
309 | { | ||
310 | struct nf_ct_frag6_queue *fq; | ||
311 | |||
312 | write_lock(&nf_ct_frag6_lock); | ||
313 | #ifdef CONFIG_SMP | ||
314 | for (fq = nf_ct_frag6_hash[hash]; fq; fq = fq->next) { | ||
315 | if (fq->id == fq_in->id && | ||
316 | !ipv6_addr_cmp(&fq_in->saddr, &fq->saddr) && | ||
317 | !ipv6_addr_cmp(&fq_in->daddr, &fq->daddr)) { | ||
318 | atomic_inc(&fq->refcnt); | ||
319 | write_unlock(&nf_ct_frag6_lock); | ||
320 | fq_in->last_in |= COMPLETE; | ||
321 | fq_put(fq_in); | ||
322 | return fq; | ||
323 | } | ||
324 | } | ||
325 | #endif | ||
326 | fq = fq_in; | ||
327 | |||
328 | if (!mod_timer(&fq->timer, jiffies + nf_ct_frag6_timeout)) | ||
329 | atomic_inc(&fq->refcnt); | ||
330 | |||
331 | atomic_inc(&fq->refcnt); | ||
332 | if ((fq->next = nf_ct_frag6_hash[hash]) != NULL) | ||
333 | fq->next->pprev = &fq->next; | ||
334 | nf_ct_frag6_hash[hash] = fq; | ||
335 | fq->pprev = &nf_ct_frag6_hash[hash]; | ||
336 | INIT_LIST_HEAD(&fq->lru_list); | ||
337 | list_add_tail(&fq->lru_list, &nf_ct_frag6_lru_list); | ||
338 | nf_ct_frag6_nqueues++; | ||
339 | write_unlock(&nf_ct_frag6_lock); | ||
340 | return fq; | ||
341 | } | ||
342 | |||
343 | |||
344 | static struct nf_ct_frag6_queue * | ||
345 | nf_ct_frag6_create(unsigned int hash, u32 id, struct in6_addr *src, struct in6_addr *dst) | ||
346 | { | ||
347 | struct nf_ct_frag6_queue *fq; | ||
348 | |||
349 | if ((fq = frag_alloc_queue()) == NULL) { | ||
350 | DEBUGP("Can't alloc new queue\n"); | ||
351 | goto oom; | ||
352 | } | ||
353 | |||
354 | memset(fq, 0, sizeof(struct nf_ct_frag6_queue)); | ||
355 | |||
356 | fq->id = id; | ||
357 | ipv6_addr_copy(&fq->saddr, src); | ||
358 | ipv6_addr_copy(&fq->daddr, dst); | ||
359 | |||
360 | init_timer(&fq->timer); | ||
361 | fq->timer.function = nf_ct_frag6_expire; | ||
362 | fq->timer.data = (long) fq; | ||
363 | fq->lock = SPIN_LOCK_UNLOCKED; | ||
364 | atomic_set(&fq->refcnt, 1); | ||
365 | |||
366 | return nf_ct_frag6_intern(hash, fq); | ||
367 | |||
368 | oom: | ||
369 | return NULL; | ||
370 | } | ||
371 | |||
372 | static __inline__ struct nf_ct_frag6_queue * | ||
373 | fq_find(u32 id, struct in6_addr *src, struct in6_addr *dst) | ||
374 | { | ||
375 | struct nf_ct_frag6_queue *fq; | ||
376 | unsigned int hash = ip6qhashfn(id, src, dst); | ||
377 | |||
378 | read_lock(&nf_ct_frag6_lock); | ||
379 | for (fq = nf_ct_frag6_hash[hash]; fq; fq = fq->next) { | ||
380 | if (fq->id == id && | ||
381 | !ipv6_addr_cmp(src, &fq->saddr) && | ||
382 | !ipv6_addr_cmp(dst, &fq->daddr)) { | ||
383 | atomic_inc(&fq->refcnt); | ||
384 | read_unlock(&nf_ct_frag6_lock); | ||
385 | return fq; | ||
386 | } | ||
387 | } | ||
388 | read_unlock(&nf_ct_frag6_lock); | ||
389 | |||
390 | return nf_ct_frag6_create(hash, id, src, dst); | ||
391 | } | ||
392 | |||
393 | |||
394 | static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, | ||
395 | struct frag_hdr *fhdr, int nhoff) | ||
396 | { | ||
397 | struct sk_buff *prev, *next; | ||
398 | int offset, end; | ||
399 | |||
400 | if (fq->last_in & COMPLETE) { | ||
401 | DEBUGP("Allready completed\n"); | ||
402 | goto err; | ||
403 | } | ||
404 | |||
405 | offset = ntohs(fhdr->frag_off) & ~0x7; | ||
406 | end = offset + (ntohs(skb->nh.ipv6h->payload_len) - | ||
407 | ((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1))); | ||
408 | |||
409 | if ((unsigned int)end > IPV6_MAXPLEN) { | ||
410 | DEBUGP("offset is too large.\n"); | ||
411 | return -1; | ||
412 | } | ||
413 | |||
414 | if (skb->ip_summed == CHECKSUM_HW) | ||
415 | skb->csum = csum_sub(skb->csum, | ||
416 | csum_partial(skb->nh.raw, | ||
417 | (u8*)(fhdr + 1) - skb->nh.raw, | ||
418 | 0)); | ||
419 | |||
420 | /* Is this the final fragment? */ | ||
421 | if (!(fhdr->frag_off & htons(IP6_MF))) { | ||
422 | /* If we already have some bits beyond end | ||
423 | * or have different end, the segment is corrupted. | ||
424 | */ | ||
425 | if (end < fq->len || | ||
426 | ((fq->last_in & LAST_IN) && end != fq->len)) { | ||
427 | DEBUGP("already received last fragment\n"); | ||
428 | goto err; | ||
429 | } | ||
430 | fq->last_in |= LAST_IN; | ||
431 | fq->len = end; | ||
432 | } else { | ||
433 | /* Check if the fragment is rounded to 8 bytes. | ||
434 | * Required by the RFC. | ||
435 | */ | ||
436 | if (end & 0x7) { | ||
437 | /* RFC2460 says always send parameter problem in | ||
438 | * this case. -DaveM | ||
439 | */ | ||
440 | DEBUGP("the end of this fragment is not rounded to 8 bytes.\n"); | ||
441 | return -1; | ||
442 | } | ||
443 | if (end > fq->len) { | ||
444 | /* Some bits beyond end -> corruption. */ | ||
445 | if (fq->last_in & LAST_IN) { | ||
446 | DEBUGP("last packet already reached.\n"); | ||
447 | goto err; | ||
448 | } | ||
449 | fq->len = end; | ||
450 | } | ||
451 | } | ||
452 | |||
453 | if (end == offset) | ||
454 | goto err; | ||
455 | |||
456 | /* Point into the IP datagram 'data' part. */ | ||
457 | if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) { | ||
458 | DEBUGP("queue: message is too short.\n"); | ||
459 | goto err; | ||
460 | } | ||
461 | if (end-offset < skb->len) { | ||
462 | if (pskb_trim(skb, end - offset)) { | ||
463 | DEBUGP("Can't trim\n"); | ||
464 | goto err; | ||
465 | } | ||
466 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) | ||
467 | skb->ip_summed = CHECKSUM_NONE; | ||
468 | } | ||
469 | |||
470 | /* Find out which fragments are in front and at the back of us | ||
471 | * in the chain of fragments so far. We must know where to put | ||
472 | * this fragment, right? | ||
473 | */ | ||
474 | prev = NULL; | ||
475 | for (next = fq->fragments; next != NULL; next = next->next) { | ||
476 | if (NFCT_FRAG6_CB(next)->offset >= offset) | ||
477 | break; /* bingo! */ | ||
478 | prev = next; | ||
479 | } | ||
480 | |||
481 | /* We found where to put this one. Check for overlap with | ||
482 | * preceding fragment, and, if needed, align things so that | ||
483 | * any overlaps are eliminated. | ||
484 | */ | ||
485 | if (prev) { | ||
486 | int i = (NFCT_FRAG6_CB(prev)->offset + prev->len) - offset; | ||
487 | |||
488 | if (i > 0) { | ||
489 | offset += i; | ||
490 | if (end <= offset) { | ||
491 | DEBUGP("overlap\n"); | ||
492 | goto err; | ||
493 | } | ||
494 | if (!pskb_pull(skb, i)) { | ||
495 | DEBUGP("Can't pull\n"); | ||
496 | goto err; | ||
497 | } | ||
498 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) | ||
499 | skb->ip_summed = CHECKSUM_NONE; | ||
500 | } | ||
501 | } | ||
502 | |||
503 | /* Look for overlap with succeeding segments. | ||
504 | * If we can merge fragments, do it. | ||
505 | */ | ||
506 | while (next && NFCT_FRAG6_CB(next)->offset < end) { | ||
507 | /* overlap is 'i' bytes */ | ||
508 | int i = end - NFCT_FRAG6_CB(next)->offset; | ||
509 | |||
510 | if (i < next->len) { | ||
511 | /* Eat head of the next overlapped fragment | ||
512 | * and leave the loop. The next ones cannot overlap. | ||
513 | */ | ||
514 | DEBUGP("Eat head of the overlapped parts.: %d", i); | ||
515 | if (!pskb_pull(next, i)) | ||
516 | goto err; | ||
517 | |||
518 | /* next fragment */ | ||
519 | NFCT_FRAG6_CB(next)->offset += i; | ||
520 | fq->meat -= i; | ||
521 | if (next->ip_summed != CHECKSUM_UNNECESSARY) | ||
522 | next->ip_summed = CHECKSUM_NONE; | ||
523 | break; | ||
524 | } else { | ||
525 | struct sk_buff *free_it = next; | ||
526 | |||
527 | /* Old fragmnet is completely overridden with | ||
528 | * new one drop it. | ||
529 | */ | ||
530 | next = next->next; | ||
531 | |||
532 | if (prev) | ||
533 | prev->next = next; | ||
534 | else | ||
535 | fq->fragments = next; | ||
536 | |||
537 | fq->meat -= free_it->len; | ||
538 | frag_kfree_skb(free_it); | ||
539 | } | ||
540 | } | ||
541 | |||
542 | NFCT_FRAG6_CB(skb)->offset = offset; | ||
543 | |||
544 | /* Insert this fragment in the chain of fragments. */ | ||
545 | skb->next = next; | ||
546 | if (prev) | ||
547 | prev->next = skb; | ||
548 | else | ||
549 | fq->fragments = skb; | ||
550 | |||
551 | skb->dev = NULL; | ||
552 | skb_get_timestamp(skb, &fq->stamp); | ||
553 | fq->meat += skb->len; | ||
554 | atomic_add(skb->truesize, &nf_ct_frag6_mem); | ||
555 | |||
556 | /* The first fragment. | ||
557 | * nhoffset is obtained from the first fragment, of course. | ||
558 | */ | ||
559 | if (offset == 0) { | ||
560 | fq->nhoffset = nhoff; | ||
561 | fq->last_in |= FIRST_IN; | ||
562 | } | ||
563 | write_lock(&nf_ct_frag6_lock); | ||
564 | list_move_tail(&fq->lru_list, &nf_ct_frag6_lru_list); | ||
565 | write_unlock(&nf_ct_frag6_lock); | ||
566 | return 0; | ||
567 | |||
568 | err: | ||
569 | return -1; | ||
570 | } | ||
571 | |||
572 | /* | ||
573 | * Check if this packet is complete. | ||
574 | * Returns NULL on failure by any reason, and pointer | ||
575 | * to current nexthdr field in reassembled frame. | ||
576 | * | ||
577 | * It is called with locked fq, and caller must check that | ||
578 | * queue is eligible for reassembly i.e. it is not COMPLETE, | ||
579 | * the last and the first frames arrived and all the bits are here. | ||
580 | */ | ||
581 | static struct sk_buff * | ||
582 | nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) | ||
583 | { | ||
584 | struct sk_buff *fp, *op, *head = fq->fragments; | ||
585 | int payload_len; | ||
586 | |||
587 | fq_kill(fq); | ||
588 | |||
589 | BUG_TRAP(head != NULL); | ||
590 | BUG_TRAP(NFCT_FRAG6_CB(head)->offset == 0); | ||
591 | |||
592 | /* Unfragmented part is taken from the first segment. */ | ||
593 | payload_len = (head->data - head->nh.raw) - sizeof(struct ipv6hdr) + fq->len - sizeof(struct frag_hdr); | ||
594 | if (payload_len > IPV6_MAXPLEN) { | ||
595 | DEBUGP("payload len is too large.\n"); | ||
596 | goto out_oversize; | ||
597 | } | ||
598 | |||
599 | /* Head of list must not be cloned. */ | ||
600 | if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) { | ||
601 | DEBUGP("skb is cloned but can't expand head"); | ||
602 | goto out_oom; | ||
603 | } | ||
604 | |||
605 | /* If the first fragment is fragmented itself, we split | ||
606 | * it to two chunks: the first with data and paged part | ||
607 | * and the second, holding only fragments. */ | ||
608 | if (skb_shinfo(head)->frag_list) { | ||
609 | struct sk_buff *clone; | ||
610 | int i, plen = 0; | ||
611 | |||
612 | if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) { | ||
613 | DEBUGP("Can't alloc skb\n"); | ||
614 | goto out_oom; | ||
615 | } | ||
616 | clone->next = head->next; | ||
617 | head->next = clone; | ||
618 | skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; | ||
619 | skb_shinfo(head)->frag_list = NULL; | ||
620 | for (i=0; i<skb_shinfo(head)->nr_frags; i++) | ||
621 | plen += skb_shinfo(head)->frags[i].size; | ||
622 | clone->len = clone->data_len = head->data_len - plen; | ||
623 | head->data_len -= clone->len; | ||
624 | head->len -= clone->len; | ||
625 | clone->csum = 0; | ||
626 | clone->ip_summed = head->ip_summed; | ||
627 | |||
628 | NFCT_FRAG6_CB(clone)->orig = NULL; | ||
629 | atomic_add(clone->truesize, &nf_ct_frag6_mem); | ||
630 | } | ||
631 | |||
632 | /* We have to remove fragment header from datagram and to relocate | ||
633 | * header in order to calculate ICV correctly. */ | ||
634 | head->nh.raw[fq->nhoffset] = head->h.raw[0]; | ||
635 | memmove(head->head + sizeof(struct frag_hdr), head->head, | ||
636 | (head->data - head->head) - sizeof(struct frag_hdr)); | ||
637 | head->mac.raw += sizeof(struct frag_hdr); | ||
638 | head->nh.raw += sizeof(struct frag_hdr); | ||
639 | |||
640 | skb_shinfo(head)->frag_list = head->next; | ||
641 | head->h.raw = head->data; | ||
642 | skb_push(head, head->data - head->nh.raw); | ||
643 | atomic_sub(head->truesize, &nf_ct_frag6_mem); | ||
644 | |||
645 | for (fp=head->next; fp; fp = fp->next) { | ||
646 | head->data_len += fp->len; | ||
647 | head->len += fp->len; | ||
648 | if (head->ip_summed != fp->ip_summed) | ||
649 | head->ip_summed = CHECKSUM_NONE; | ||
650 | else if (head->ip_summed == CHECKSUM_HW) | ||
651 | head->csum = csum_add(head->csum, fp->csum); | ||
652 | head->truesize += fp->truesize; | ||
653 | atomic_sub(fp->truesize, &nf_ct_frag6_mem); | ||
654 | } | ||
655 | |||
656 | head->next = NULL; | ||
657 | head->dev = dev; | ||
658 | skb_set_timestamp(head, &fq->stamp); | ||
659 | head->nh.ipv6h->payload_len = htons(payload_len); | ||
660 | |||
661 | /* Yes, and fold redundant checksum back. 8) */ | ||
662 | if (head->ip_summed == CHECKSUM_HW) | ||
663 | head->csum = csum_partial(head->nh.raw, head->h.raw-head->nh.raw, head->csum); | ||
664 | |||
665 | fq->fragments = NULL; | ||
666 | |||
667 | /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */ | ||
668 | fp = skb_shinfo(head)->frag_list; | ||
669 | if (NFCT_FRAG6_CB(fp)->orig == NULL) | ||
670 | /* at above code, head skb is divided into two skbs. */ | ||
671 | fp = fp->next; | ||
672 | |||
673 | op = NFCT_FRAG6_CB(head)->orig; | ||
674 | for (; fp; fp = fp->next) { | ||
675 | struct sk_buff *orig = NFCT_FRAG6_CB(fp)->orig; | ||
676 | |||
677 | op->next = orig; | ||
678 | op = orig; | ||
679 | NFCT_FRAG6_CB(fp)->orig = NULL; | ||
680 | } | ||
681 | |||
682 | return head; | ||
683 | |||
684 | out_oversize: | ||
685 | if (net_ratelimit()) | ||
686 | printk(KERN_DEBUG "nf_ct_frag6_reasm: payload len = %d\n", payload_len); | ||
687 | goto out_fail; | ||
688 | out_oom: | ||
689 | if (net_ratelimit()) | ||
690 | printk(KERN_DEBUG "nf_ct_frag6_reasm: no memory for reassembly\n"); | ||
691 | out_fail: | ||
692 | return NULL; | ||
693 | } | ||
694 | |||
695 | /* | ||
696 | * find the header just before Fragment Header. | ||
697 | * | ||
698 | * if success return 0 and set ... | ||
699 | * (*prevhdrp): the value of "Next Header Field" in the header | ||
700 | * just before Fragment Header. | ||
701 | * (*prevhoff): the offset of "Next Header Field" in the header | ||
702 | * just before Fragment Header. | ||
703 | * (*fhoff) : the offset of Fragment Header. | ||
704 | * | ||
705 | * Based on ipv6_skip_hdr() in net/ipv6/exthdr.c | ||
706 | * | ||
707 | */ | ||
708 | static int | ||
709 | find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) | ||
710 | { | ||
711 | u8 nexthdr = skb->nh.ipv6h->nexthdr; | ||
712 | u8 prev_nhoff = (u8 *)&skb->nh.ipv6h->nexthdr - skb->data; | ||
713 | int start = (u8 *)(skb->nh.ipv6h+1) - skb->data; | ||
714 | int len = skb->len - start; | ||
715 | u8 prevhdr = NEXTHDR_IPV6; | ||
716 | |||
717 | while (nexthdr != NEXTHDR_FRAGMENT) { | ||
718 | struct ipv6_opt_hdr hdr; | ||
719 | int hdrlen; | ||
720 | |||
721 | if (!ipv6_ext_hdr(nexthdr)) { | ||
722 | return -1; | ||
723 | } | ||
724 | if (len < (int)sizeof(struct ipv6_opt_hdr)) { | ||
725 | DEBUGP("too short\n"); | ||
726 | return -1; | ||
727 | } | ||
728 | if (nexthdr == NEXTHDR_NONE) { | ||
729 | DEBUGP("next header is none\n"); | ||
730 | return -1; | ||
731 | } | ||
732 | if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) | ||
733 | BUG(); | ||
734 | if (nexthdr == NEXTHDR_AUTH) | ||
735 | hdrlen = (hdr.hdrlen+2)<<2; | ||
736 | else | ||
737 | hdrlen = ipv6_optlen(&hdr); | ||
738 | |||
739 | prevhdr = nexthdr; | ||
740 | prev_nhoff = start; | ||
741 | |||
742 | nexthdr = hdr.nexthdr; | ||
743 | len -= hdrlen; | ||
744 | start += hdrlen; | ||
745 | } | ||
746 | |||
747 | if (len < 0) | ||
748 | return -1; | ||
749 | |||
750 | *prevhdrp = prevhdr; | ||
751 | *prevhoff = prev_nhoff; | ||
752 | *fhoff = start; | ||
753 | |||
754 | return 0; | ||
755 | } | ||
756 | |||
757 | struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) | ||
758 | { | ||
759 | struct sk_buff *clone; | ||
760 | struct net_device *dev = skb->dev; | ||
761 | struct frag_hdr *fhdr; | ||
762 | struct nf_ct_frag6_queue *fq; | ||
763 | struct ipv6hdr *hdr; | ||
764 | int fhoff, nhoff; | ||
765 | u8 prevhdr; | ||
766 | struct sk_buff *ret_skb = NULL; | ||
767 | |||
768 | /* Jumbo payload inhibits frag. header */ | ||
769 | if (skb->nh.ipv6h->payload_len == 0) { | ||
770 | DEBUGP("payload len = 0\n"); | ||
771 | return skb; | ||
772 | } | ||
773 | |||
774 | if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0) | ||
775 | return skb; | ||
776 | |||
777 | clone = skb_clone(skb, GFP_ATOMIC); | ||
778 | if (clone == NULL) { | ||
779 | DEBUGP("Can't clone skb\n"); | ||
780 | return skb; | ||
781 | } | ||
782 | |||
783 | NFCT_FRAG6_CB(clone)->orig = skb; | ||
784 | |||
785 | if (!pskb_may_pull(clone, fhoff + sizeof(*fhdr))) { | ||
786 | DEBUGP("message is too short.\n"); | ||
787 | goto ret_orig; | ||
788 | } | ||
789 | |||
790 | clone->h.raw = clone->data + fhoff; | ||
791 | hdr = clone->nh.ipv6h; | ||
792 | fhdr = (struct frag_hdr *)clone->h.raw; | ||
793 | |||
794 | if (!(fhdr->frag_off & htons(0xFFF9))) { | ||
795 | DEBUGP("Invalid fragment offset\n"); | ||
796 | /* It is not a fragmented frame */ | ||
797 | goto ret_orig; | ||
798 | } | ||
799 | |||
800 | if (atomic_read(&nf_ct_frag6_mem) > nf_ct_frag6_high_thresh) | ||
801 | nf_ct_frag6_evictor(); | ||
802 | |||
803 | fq = fq_find(fhdr->identification, &hdr->saddr, &hdr->daddr); | ||
804 | if (fq == NULL) { | ||
805 | DEBUGP("Can't find and can't create new queue\n"); | ||
806 | goto ret_orig; | ||
807 | } | ||
808 | |||
809 | spin_lock(&fq->lock); | ||
810 | |||
811 | if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) { | ||
812 | spin_unlock(&fq->lock); | ||
813 | DEBUGP("Can't insert skb to queue\n"); | ||
814 | fq_put(fq); | ||
815 | goto ret_orig; | ||
816 | } | ||
817 | |||
818 | if (fq->last_in == (FIRST_IN|LAST_IN) && fq->meat == fq->len) { | ||
819 | ret_skb = nf_ct_frag6_reasm(fq, dev); | ||
820 | if (ret_skb == NULL) | ||
821 | DEBUGP("Can't reassemble fragmented packets\n"); | ||
822 | } | ||
823 | spin_unlock(&fq->lock); | ||
824 | |||
825 | fq_put(fq); | ||
826 | return ret_skb; | ||
827 | |||
828 | ret_orig: | ||
829 | kfree_skb(clone); | ||
830 | return skb; | ||
831 | } | ||
832 | |||
833 | void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, | ||
834 | struct net_device *in, struct net_device *out, | ||
835 | int (*okfn)(struct sk_buff *)) | ||
836 | { | ||
837 | struct sk_buff *s, *s2; | ||
838 | |||
839 | for (s = NFCT_FRAG6_CB(skb)->orig; s;) { | ||
840 | nf_conntrack_put_reasm(s->nfct_reasm); | ||
841 | nf_conntrack_get_reasm(skb); | ||
842 | s->nfct_reasm = skb; | ||
843 | |||
844 | s2 = s->next; | ||
845 | NF_HOOK_THRESH(PF_INET6, hooknum, s, in, out, okfn, | ||
846 | NF_IP6_PRI_CONNTRACK_DEFRAG + 1); | ||
847 | s = s2; | ||
848 | } | ||
849 | nf_conntrack_put_reasm(skb); | ||
850 | } | ||
851 | |||
852 | int nf_ct_frag6_kfree_frags(struct sk_buff *skb) | ||
853 | { | ||
854 | struct sk_buff *s, *s2; | ||
855 | |||
856 | for (s = NFCT_FRAG6_CB(skb)->orig; s; s = s2) { | ||
857 | |||
858 | s2 = s->next; | ||
859 | kfree_skb(s); | ||
860 | } | ||
861 | |||
862 | kfree_skb(skb); | ||
863 | |||
864 | return 0; | ||
865 | } | ||
866 | |||
867 | int nf_ct_frag6_init(void) | ||
868 | { | ||
869 | nf_ct_frag6_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ | ||
870 | (jiffies ^ (jiffies >> 6))); | ||
871 | |||
872 | init_timer(&nf_ct_frag6_secret_timer); | ||
873 | nf_ct_frag6_secret_timer.function = nf_ct_frag6_secret_rebuild; | ||
874 | nf_ct_frag6_secret_timer.expires = jiffies | ||
875 | + nf_ct_frag6_secret_interval; | ||
876 | add_timer(&nf_ct_frag6_secret_timer); | ||
877 | |||
878 | return 0; | ||
879 | } | ||
880 | |||
881 | void nf_ct_frag6_cleanup(void) | ||
882 | { | ||
883 | del_timer(&nf_ct_frag6_secret_timer); | ||
884 | nf_ct_frag6_evictor(); | ||
885 | } | ||
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index a1265a320b11..8e9628f1c4c5 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c | |||
@@ -174,8 +174,10 @@ int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) | |||
174 | struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); | 174 | struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); |
175 | 175 | ||
176 | /* Not releasing hash table! */ | 176 | /* Not releasing hash table! */ |
177 | if (clone) | 177 | if (clone) { |
178 | nf_reset(clone); | ||
178 | rawv6_rcv(sk, clone); | 179 | rawv6_rcv(sk, clone); |
180 | } | ||
179 | } | 181 | } |
180 | sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr, | 182 | sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr, |
181 | IP6CB(skb)->iif); | 183 | IP6CB(skb)->iif); |
@@ -296,13 +298,10 @@ void rawv6_err(struct sock *sk, struct sk_buff *skb, | |||
296 | static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) | 298 | static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) |
297 | { | 299 | { |
298 | if ((raw6_sk(sk)->checksum || sk->sk_filter) && | 300 | if ((raw6_sk(sk)->checksum || sk->sk_filter) && |
299 | skb->ip_summed != CHECKSUM_UNNECESSARY) { | 301 | skb_checksum_complete(skb)) { |
300 | if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { | 302 | /* FIXME: increment a raw6 drops counter here */ |
301 | /* FIXME: increment a raw6 drops counter here */ | 303 | kfree_skb(skb); |
302 | kfree_skb(skb); | 304 | return 0; |
303 | return 0; | ||
304 | } | ||
305 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
306 | } | 305 | } |
307 | 306 | ||
308 | /* Charge it to the socket. */ | 307 | /* Charge it to the socket. */ |
@@ -335,32 +334,25 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) | |||
335 | if (!rp->checksum) | 334 | if (!rp->checksum) |
336 | skb->ip_summed = CHECKSUM_UNNECESSARY; | 335 | skb->ip_summed = CHECKSUM_UNNECESSARY; |
337 | 336 | ||
338 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) { | 337 | if (skb->ip_summed == CHECKSUM_HW) { |
339 | if (skb->ip_summed == CHECKSUM_HW) { | 338 | skb_postpull_rcsum(skb, skb->nh.raw, |
340 | skb_postpull_rcsum(skb, skb->nh.raw, | 339 | skb->h.raw - skb->nh.raw); |
341 | skb->h.raw - skb->nh.raw); | 340 | if (!csum_ipv6_magic(&skb->nh.ipv6h->saddr, |
341 | &skb->nh.ipv6h->daddr, | ||
342 | skb->len, inet->num, skb->csum)) | ||
342 | skb->ip_summed = CHECKSUM_UNNECESSARY; | 343 | skb->ip_summed = CHECKSUM_UNNECESSARY; |
343 | if (csum_ipv6_magic(&skb->nh.ipv6h->saddr, | ||
344 | &skb->nh.ipv6h->daddr, | ||
345 | skb->len, inet->num, skb->csum)) { | ||
346 | LIMIT_NETDEBUG(KERN_DEBUG "raw v6 hw csum failure.\n"); | ||
347 | skb->ip_summed = CHECKSUM_NONE; | ||
348 | } | ||
349 | } | ||
350 | if (skb->ip_summed == CHECKSUM_NONE) | ||
351 | skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr, | ||
352 | &skb->nh.ipv6h->daddr, | ||
353 | skb->len, inet->num, 0); | ||
354 | } | 344 | } |
345 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) | ||
346 | skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr, | ||
347 | &skb->nh.ipv6h->daddr, | ||
348 | skb->len, inet->num, 0); | ||
355 | 349 | ||
356 | if (inet->hdrincl) { | 350 | if (inet->hdrincl) { |
357 | if (skb->ip_summed != CHECKSUM_UNNECESSARY && | 351 | if (skb_checksum_complete(skb)) { |
358 | (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { | ||
359 | /* FIXME: increment a raw6 drops counter here */ | 352 | /* FIXME: increment a raw6 drops counter here */ |
360 | kfree_skb(skb); | 353 | kfree_skb(skb); |
361 | return 0; | 354 | return 0; |
362 | } | 355 | } |
363 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
364 | } | 356 | } |
365 | 357 | ||
366 | rawv6_rcv_skb(sk, skb); | 358 | rawv6_rcv_skb(sk, skb); |
@@ -405,7 +397,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, | |||
405 | if (skb->ip_summed==CHECKSUM_UNNECESSARY) { | 397 | if (skb->ip_summed==CHECKSUM_UNNECESSARY) { |
406 | err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); | 398 | err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); |
407 | } else if (msg->msg_flags&MSG_TRUNC) { | 399 | } else if (msg->msg_flags&MSG_TRUNC) { |
408 | if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) | 400 | if (__skb_checksum_complete(skb)) |
409 | goto csum_copy_err; | 401 | goto csum_copy_err; |
410 | err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); | 402 | err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); |
411 | } else { | 403 | } else { |
diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 227e99ed510c..f7f42c3e96cb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c | |||
@@ -1710,7 +1710,7 @@ static void fib6_dump_end(struct netlink_callback *cb) | |||
1710 | static int fib6_dump_done(struct netlink_callback *cb) | 1710 | static int fib6_dump_done(struct netlink_callback *cb) |
1711 | { | 1711 | { |
1712 | fib6_dump_end(cb); | 1712 | fib6_dump_end(cb); |
1713 | return cb->done(cb); | 1713 | return cb->done ? cb->done(cb) : 0; |
1714 | } | 1714 | } |
1715 | 1715 | ||
1716 | int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) | 1716 | int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) |
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d746d3b27efb..62c0e5bd931c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c | |||
@@ -1401,20 +1401,18 @@ out: | |||
1401 | static int tcp_v6_checksum_init(struct sk_buff *skb) | 1401 | static int tcp_v6_checksum_init(struct sk_buff *skb) |
1402 | { | 1402 | { |
1403 | if (skb->ip_summed == CHECKSUM_HW) { | 1403 | if (skb->ip_summed == CHECKSUM_HW) { |
1404 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
1405 | if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, | 1404 | if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, |
1406 | &skb->nh.ipv6h->daddr,skb->csum)) | 1405 | &skb->nh.ipv6h->daddr,skb->csum)) { |
1406 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
1407 | return 0; | 1407 | return 0; |
1408 | LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v6 csum failed\n"); | 1408 | } |
1409 | } | 1409 | } |
1410 | |||
1411 | skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, | ||
1412 | &skb->nh.ipv6h->daddr, 0); | ||
1413 | |||
1410 | if (skb->len <= 76) { | 1414 | if (skb->len <= 76) { |
1411 | if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, | 1415 | return __skb_checksum_complete(skb); |
1412 | &skb->nh.ipv6h->daddr,skb_checksum(skb, 0, skb->len, 0))) | ||
1413 | return -1; | ||
1414 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
1415 | } else { | ||
1416 | skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, | ||
1417 | &skb->nh.ipv6h->daddr,0); | ||
1418 | } | 1416 | } |
1419 | return 0; | 1417 | return 0; |
1420 | } | 1418 | } |
@@ -1575,7 +1573,7 @@ static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) | |||
1575 | goto discard_it; | 1573 | goto discard_it; |
1576 | 1574 | ||
1577 | if ((skb->ip_summed != CHECKSUM_UNNECESSARY && | 1575 | if ((skb->ip_summed != CHECKSUM_UNNECESSARY && |
1578 | tcp_v6_checksum_init(skb) < 0)) | 1576 | tcp_v6_checksum_init(skb))) |
1579 | goto bad_packet; | 1577 | goto bad_packet; |
1580 | 1578 | ||
1581 | th = skb->h.th; | 1579 | th = skb->h.th; |
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index bf9519341fd3..e671153b47b2 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c | |||
@@ -248,7 +248,7 @@ try_again: | |||
248 | err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, | 248 | err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, |
249 | copied); | 249 | copied); |
250 | } else if (msg->msg_flags&MSG_TRUNC) { | 250 | } else if (msg->msg_flags&MSG_TRUNC) { |
251 | if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) | 251 | if (__skb_checksum_complete(skb)) |
252 | goto csum_copy_err; | 252 | goto csum_copy_err; |
253 | err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, | 253 | err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, |
254 | copied); | 254 | copied); |
@@ -363,13 +363,10 @@ static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) | |||
363 | return -1; | 363 | return -1; |
364 | } | 364 | } |
365 | 365 | ||
366 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) { | 366 | if (skb_checksum_complete(skb)) { |
367 | if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { | 367 | UDP6_INC_STATS_BH(UDP_MIB_INERRORS); |
368 | UDP6_INC_STATS_BH(UDP_MIB_INERRORS); | 368 | kfree_skb(skb); |
369 | kfree_skb(skb); | 369 | return 0; |
370 | return 0; | ||
371 | } | ||
372 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
373 | } | 370 | } |
374 | 371 | ||
375 | if (sock_queue_rcv_skb(sk,skb)<0) { | 372 | if (sock_queue_rcv_skb(sk,skb)<0) { |
@@ -491,13 +488,10 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) | |||
491 | uh = skb->h.uh; | 488 | uh = skb->h.uh; |
492 | } | 489 | } |
493 | 490 | ||
494 | if (skb->ip_summed==CHECKSUM_HW) { | 491 | if (skb->ip_summed == CHECKSUM_HW && |
492 | !csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) | ||
495 | skb->ip_summed = CHECKSUM_UNNECESSARY; | 493 | skb->ip_summed = CHECKSUM_UNNECESSARY; |
496 | if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) { | 494 | |
497 | LIMIT_NETDEBUG(KERN_DEBUG "udp v6 hw csum failure.\n"); | ||
498 | skb->ip_summed = CHECKSUM_NONE; | ||
499 | } | ||
500 | } | ||
501 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) | 495 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) |
502 | skb->csum = ~csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, 0); | 496 | skb->csum = ~csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, 0); |
503 | 497 | ||
@@ -521,8 +515,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) | |||
521 | if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) | 515 | if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) |
522 | goto discard; | 516 | goto discard; |
523 | 517 | ||
524 | if (skb->ip_summed != CHECKSUM_UNNECESSARY && | 518 | if (skb_checksum_complete(skb)) |
525 | (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) | ||
526 | goto discard; | 519 | goto discard; |
527 | UDP6_INC_STATS_BH(UDP_MIB_NOPORTS); | 520 | UDP6_INC_STATS_BH(UDP_MIB_NOPORTS); |
528 | 521 | ||
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 8296b38bf270..a84f9221e5f0 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig | |||
@@ -1,3 +1,6 @@ | |||
1 | menu "Core Netfilter Configuration" | ||
2 | depends on NET && NETFILTER | ||
3 | |||
1 | config NETFILTER_NETLINK | 4 | config NETFILTER_NETLINK |
2 | tristate "Netfilter netlink interface" | 5 | tristate "Netfilter netlink interface" |
3 | help | 6 | help |
@@ -22,3 +25,74 @@ config NETFILTER_NETLINK_LOG | |||
22 | and is also scheduled to replace the old syslog-based ipt_LOG | 25 | and is also scheduled to replace the old syslog-based ipt_LOG |
23 | and ip6t_LOG modules. | 26 | and ip6t_LOG modules. |
24 | 27 | ||
28 | config NF_CONNTRACK | ||
29 | tristate "Layer 3 Independent Connection tracking (EXPERIMENTAL)" | ||
30 | depends on EXPERIMENTAL && IP_NF_CONNTRACK=n | ||
31 | default n | ||
32 | ---help--- | ||
33 | Connection tracking keeps a record of what packets have passed | ||
34 | through your machine, in order to figure out how they are related | ||
35 | into connections. | ||
36 | |||
37 | Layer 3 independent connection tracking is experimental scheme | ||
38 | which generalize ip_conntrack to support other layer 3 protocols. | ||
39 | |||
40 | To compile it as a module, choose M here. If unsure, say N. | ||
41 | |||
42 | config NF_CT_ACCT | ||
43 | bool "Connection tracking flow accounting" | ||
44 | depends on NF_CONNTRACK | ||
45 | help | ||
46 | If this option is enabled, the connection tracking code will | ||
47 | keep per-flow packet and byte counters. | ||
48 | |||
49 | Those counters can be used for flow-based accounting or the | ||
50 | `connbytes' match. | ||
51 | |||
52 | If unsure, say `N'. | ||
53 | |||
54 | config NF_CONNTRACK_MARK | ||
55 | bool 'Connection mark tracking support' | ||
56 | depends on NF_CONNTRACK | ||
57 | help | ||
58 | This option enables support for connection marks, used by the | ||
59 | `CONNMARK' target and `connmark' match. Similar to the mark value | ||
60 | of packets, but this mark value is kept in the conntrack session | ||
61 | instead of the individual packets. | ||
62 | |||
63 | config NF_CONNTRACK_EVENTS | ||
64 | bool "Connection tracking events" | ||
65 | depends on NF_CONNTRACK | ||
66 | help | ||
67 | If this option is enabled, the connection tracking code will | ||
68 | provide a notifier chain that can be used by other kernel code | ||
69 | to get notified aboutchanges in the connection tracking state. | ||
70 | |||
71 | If unsure, say `N'. | ||
72 | |||
73 | config NF_CT_PROTO_SCTP | ||
74 | tristate 'SCTP protocol on new connection tracking support (EXPERIMENTAL)' | ||
75 | depends on EXPERIMENTAL && NF_CONNTRACK | ||
76 | default n | ||
77 | help | ||
78 | With this option enabled, the layer 3 independent connection | ||
79 | tracking code will be able to do state tracking on SCTP connections. | ||
80 | |||
81 | If you want to compile it as a module, say M here and read | ||
82 | Documentation/modules.txt. If unsure, say `N'. | ||
83 | |||
84 | config NF_CONNTRACK_FTP | ||
85 | tristate "FTP support on new connection tracking (EXPERIMENTAL)" | ||
86 | depends on EXPERIMENTAL && NF_CONNTRACK | ||
87 | help | ||
88 | Tracking FTP connections is problematic: special helpers are | ||
89 | required for tracking them, and doing masquerading and other forms | ||
90 | of Network Address Translation on them. | ||
91 | |||
92 | This is FTP support on Layer 3 independent connection tracking. | ||
93 | Layer 3 independent connection tracking is experimental scheme | ||
94 | which generalize ip_conntrack to support other layer 3 protocols. | ||
95 | |||
96 | To compile it as a module, choose M here. If unsure, say N. | ||
97 | |||
98 | endmenu | ||
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index b3b44f8b415a..55f019ad2c08 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile | |||
@@ -5,3 +5,11 @@ obj-$(CONFIG_NETFILTER) = netfilter.o | |||
5 | obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o | 5 | obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o |
6 | obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o | 6 | obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o |
7 | obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o | 7 | obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o |
8 | |||
9 | nf_conntrack-objs := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o | ||
10 | |||
11 | obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o | ||
12 | obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o | ||
13 | |||
14 | # SCTP protocol connection tracking | ||
15 | obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o | ||
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c new file mode 100644 index 000000000000..9a67c796b385 --- /dev/null +++ b/net/netfilter/nf_conntrack_core.c | |||
@@ -0,0 +1,1538 @@ | |||
1 | /* Connection state tracking for netfilter. This is separated from, | ||
2 | but required by, the NAT layer; it can also be used by an iptables | ||
3 | extension. */ | ||
4 | |||
5 | /* (C) 1999-2001 Paul `Rusty' Russell | ||
6 | * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org> | ||
7 | * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License version 2 as | ||
11 | * published by the Free Software Foundation. | ||
12 | * | ||
13 | * 23 Apr 2001: Harald Welte <laforge@gnumonks.org> | ||
14 | * - new API and handling of conntrack/nat helpers | ||
15 | * - now capable of multiple expectations for one master | ||
16 | * 16 Jul 2002: Harald Welte <laforge@gnumonks.org> | ||
17 | * - add usage/reference counts to ip_conntrack_expect | ||
18 | * - export ip_conntrack[_expect]_{find_get,put} functions | ||
19 | * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> | ||
20 | * - generalize L3 protocol denendent part. | ||
21 | * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> | ||
22 | * - add support various size of conntrack structures. | ||
23 | * | ||
24 | * Derived from net/ipv4/netfilter/ip_conntrack_core.c | ||
25 | */ | ||
26 | |||
27 | #include <linux/config.h> | ||
28 | #include <linux/types.h> | ||
29 | #include <linux/netfilter.h> | ||
30 | #include <linux/module.h> | ||
31 | #include <linux/skbuff.h> | ||
32 | #include <linux/proc_fs.h> | ||
33 | #include <linux/vmalloc.h> | ||
34 | #include <linux/stddef.h> | ||
35 | #include <linux/slab.h> | ||
36 | #include <linux/random.h> | ||
37 | #include <linux/jhash.h> | ||
38 | #include <linux/err.h> | ||
39 | #include <linux/percpu.h> | ||
40 | #include <linux/moduleparam.h> | ||
41 | #include <linux/notifier.h> | ||
42 | #include <linux/kernel.h> | ||
43 | #include <linux/netdevice.h> | ||
44 | #include <linux/socket.h> | ||
45 | |||
46 | /* This rwlock protects the main hash table, protocol/helper/expected | ||
47 | registrations, conntrack timers*/ | ||
48 | #define ASSERT_READ_LOCK(x) | ||
49 | #define ASSERT_WRITE_LOCK(x) | ||
50 | |||
51 | #include <net/netfilter/nf_conntrack.h> | ||
52 | #include <net/netfilter/nf_conntrack_l3proto.h> | ||
53 | #include <net/netfilter/nf_conntrack_protocol.h> | ||
54 | #include <net/netfilter/nf_conntrack_helper.h> | ||
55 | #include <net/netfilter/nf_conntrack_core.h> | ||
56 | #include <linux/netfilter_ipv4/listhelp.h> | ||
57 | |||
58 | #define NF_CONNTRACK_VERSION "0.4.1" | ||
59 | |||
60 | #if 0 | ||
61 | #define DEBUGP printk | ||
62 | #else | ||
63 | #define DEBUGP(format, args...) | ||
64 | #endif | ||
65 | |||
66 | DEFINE_RWLOCK(nf_conntrack_lock); | ||
67 | |||
68 | /* nf_conntrack_standalone needs this */ | ||
69 | atomic_t nf_conntrack_count = ATOMIC_INIT(0); | ||
70 | |||
71 | void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL; | ||
72 | LIST_HEAD(nf_conntrack_expect_list); | ||
73 | struct nf_conntrack_protocol **nf_ct_protos[PF_MAX]; | ||
74 | struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX]; | ||
75 | static LIST_HEAD(helpers); | ||
76 | unsigned int nf_conntrack_htable_size = 0; | ||
77 | int nf_conntrack_max; | ||
78 | struct list_head *nf_conntrack_hash; | ||
79 | static kmem_cache_t *nf_conntrack_expect_cachep; | ||
80 | struct nf_conn nf_conntrack_untracked; | ||
81 | unsigned int nf_ct_log_invalid; | ||
82 | static LIST_HEAD(unconfirmed); | ||
83 | static int nf_conntrack_vmalloc; | ||
84 | |||
85 | #ifdef CONFIG_NF_CONNTRACK_EVENTS | ||
86 | struct notifier_block *nf_conntrack_chain; | ||
87 | struct notifier_block *nf_conntrack_expect_chain; | ||
88 | |||
89 | DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache); | ||
90 | |||
91 | /* deliver cached events and clear cache entry - must be called with locally | ||
92 | * disabled softirqs */ | ||
93 | static inline void | ||
94 | __nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache) | ||
95 | { | ||
96 | DEBUGP("ecache: delivering events for %p\n", ecache->ct); | ||
97 | if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct) | ||
98 | && ecache->events) | ||
99 | notifier_call_chain(&nf_conntrack_chain, ecache->events, | ||
100 | ecache->ct); | ||
101 | |||
102 | ecache->events = 0; | ||
103 | nf_ct_put(ecache->ct); | ||
104 | ecache->ct = NULL; | ||
105 | } | ||
106 | |||
107 | /* Deliver all cached events for a particular conntrack. This is called | ||
108 | * by code prior to async packet handling for freeing the skb */ | ||
109 | void nf_ct_deliver_cached_events(const struct nf_conn *ct) | ||
110 | { | ||
111 | struct nf_conntrack_ecache *ecache; | ||
112 | |||
113 | local_bh_disable(); | ||
114 | ecache = &__get_cpu_var(nf_conntrack_ecache); | ||
115 | if (ecache->ct == ct) | ||
116 | __nf_ct_deliver_cached_events(ecache); | ||
117 | local_bh_enable(); | ||
118 | } | ||
119 | |||
120 | /* Deliver cached events for old pending events, if current conntrack != old */ | ||
121 | void __nf_ct_event_cache_init(struct nf_conn *ct) | ||
122 | { | ||
123 | struct nf_conntrack_ecache *ecache; | ||
124 | |||
125 | /* take care of delivering potentially old events */ | ||
126 | ecache = &__get_cpu_var(nf_conntrack_ecache); | ||
127 | BUG_ON(ecache->ct == ct); | ||
128 | if (ecache->ct) | ||
129 | __nf_ct_deliver_cached_events(ecache); | ||
130 | /* initialize for this conntrack/packet */ | ||
131 | ecache->ct = ct; | ||
132 | nf_conntrack_get(&ct->ct_general); | ||
133 | } | ||
134 | |||
135 | /* flush the event cache - touches other CPU's data and must not be called | ||
136 | * while packets are still passing through the code */ | ||
137 | static void nf_ct_event_cache_flush(void) | ||
138 | { | ||
139 | struct nf_conntrack_ecache *ecache; | ||
140 | int cpu; | ||
141 | |||
142 | for_each_cpu(cpu) { | ||
143 | ecache = &per_cpu(nf_conntrack_ecache, cpu); | ||
144 | if (ecache->ct) | ||
145 | nf_ct_put(ecache->ct); | ||
146 | } | ||
147 | } | ||
148 | #else | ||
149 | static inline void nf_ct_event_cache_flush(void) {} | ||
150 | #endif /* CONFIG_NF_CONNTRACK_EVENTS */ | ||
151 | |||
152 | DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); | ||
153 | EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat); | ||
154 | |||
155 | /* | ||
156 | * This scheme offers various size of "struct nf_conn" dependent on | ||
157 | * features(helper, nat, ...) | ||
158 | */ | ||
159 | |||
160 | #define NF_CT_FEATURES_NAMELEN 256 | ||
161 | static struct { | ||
162 | /* name of slab cache. printed in /proc/slabinfo */ | ||
163 | char *name; | ||
164 | |||
165 | /* size of slab cache */ | ||
166 | size_t size; | ||
167 | |||
168 | /* slab cache pointer */ | ||
169 | kmem_cache_t *cachep; | ||
170 | |||
171 | /* allocated slab cache + modules which uses this slab cache */ | ||
172 | int use; | ||
173 | |||
174 | /* Initialization */ | ||
175 | int (*init_conntrack)(struct nf_conn *, u_int32_t); | ||
176 | |||
177 | } nf_ct_cache[NF_CT_F_NUM]; | ||
178 | |||
179 | /* protect members of nf_ct_cache except of "use" */ | ||
180 | DEFINE_RWLOCK(nf_ct_cache_lock); | ||
181 | |||
182 | /* This avoids calling kmem_cache_create() with same name simultaneously */ | ||
183 | DECLARE_MUTEX(nf_ct_cache_mutex); | ||
184 | |||
185 | extern struct nf_conntrack_protocol nf_conntrack_generic_protocol; | ||
186 | struct nf_conntrack_protocol * | ||
187 | nf_ct_find_proto(u_int16_t l3proto, u_int8_t protocol) | ||
188 | { | ||
189 | if (unlikely(nf_ct_protos[l3proto] == NULL)) | ||
190 | return &nf_conntrack_generic_protocol; | ||
191 | |||
192 | return nf_ct_protos[l3proto][protocol]; | ||
193 | } | ||
194 | |||
195 | static int nf_conntrack_hash_rnd_initted; | ||
196 | static unsigned int nf_conntrack_hash_rnd; | ||
197 | |||
198 | static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, | ||
199 | unsigned int size, unsigned int rnd) | ||
200 | { | ||
201 | unsigned int a, b; | ||
202 | a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all), | ||
203 | ((tuple->src.l3num) << 16) | tuple->dst.protonum); | ||
204 | b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all), | ||
205 | (tuple->src.u.all << 16) | tuple->dst.u.all); | ||
206 | |||
207 | return jhash_2words(a, b, rnd) % size; | ||
208 | } | ||
209 | |||
210 | static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple) | ||
211 | { | ||
212 | return __hash_conntrack(tuple, nf_conntrack_htable_size, | ||
213 | nf_conntrack_hash_rnd); | ||
214 | } | ||
215 | |||
216 | /* Initialize "struct nf_conn" which has spaces for helper */ | ||
217 | static int | ||
218 | init_conntrack_for_helper(struct nf_conn *conntrack, u_int32_t features) | ||
219 | { | ||
220 | |||
221 | conntrack->help = (union nf_conntrack_help *) | ||
222 | (((unsigned long)conntrack->data | ||
223 | + (__alignof__(union nf_conntrack_help) - 1)) | ||
224 | & (~((unsigned long)(__alignof__(union nf_conntrack_help) -1)))); | ||
225 | return 0; | ||
226 | } | ||
227 | |||
228 | int nf_conntrack_register_cache(u_int32_t features, const char *name, | ||
229 | size_t size, | ||
230 | int (*init)(struct nf_conn *, u_int32_t)) | ||
231 | { | ||
232 | int ret = 0; | ||
233 | char *cache_name; | ||
234 | kmem_cache_t *cachep; | ||
235 | |||
236 | DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n", | ||
237 | features, name, size); | ||
238 | |||
239 | if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) { | ||
240 | DEBUGP("nf_conntrack_register_cache: invalid features.: 0x%x\n", | ||
241 | features); | ||
242 | return -EINVAL; | ||
243 | } | ||
244 | |||
245 | down(&nf_ct_cache_mutex); | ||
246 | |||
247 | write_lock_bh(&nf_ct_cache_lock); | ||
248 | /* e.g: multiple helpers are loaded */ | ||
249 | if (nf_ct_cache[features].use > 0) { | ||
250 | DEBUGP("nf_conntrack_register_cache: already resisterd.\n"); | ||
251 | if ((!strncmp(nf_ct_cache[features].name, name, | ||
252 | NF_CT_FEATURES_NAMELEN)) | ||
253 | && nf_ct_cache[features].size == size | ||
254 | && nf_ct_cache[features].init_conntrack == init) { | ||
255 | DEBUGP("nf_conntrack_register_cache: reusing.\n"); | ||
256 | nf_ct_cache[features].use++; | ||
257 | ret = 0; | ||
258 | } else | ||
259 | ret = -EBUSY; | ||
260 | |||
261 | write_unlock_bh(&nf_ct_cache_lock); | ||
262 | up(&nf_ct_cache_mutex); | ||
263 | return ret; | ||
264 | } | ||
265 | write_unlock_bh(&nf_ct_cache_lock); | ||
266 | |||
267 | /* | ||
268 | * The memory space for name of slab cache must be alive until | ||
269 | * cache is destroyed. | ||
270 | */ | ||
271 | cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC); | ||
272 | if (cache_name == NULL) { | ||
273 | DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n"); | ||
274 | ret = -ENOMEM; | ||
275 | goto out_up_mutex; | ||
276 | } | ||
277 | |||
278 | if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN) | ||
279 | >= NF_CT_FEATURES_NAMELEN) { | ||
280 | printk("nf_conntrack_register_cache: name too long\n"); | ||
281 | ret = -EINVAL; | ||
282 | goto out_free_name; | ||
283 | } | ||
284 | |||
285 | cachep = kmem_cache_create(cache_name, size, 0, 0, | ||
286 | NULL, NULL); | ||
287 | if (!cachep) { | ||
288 | printk("nf_conntrack_register_cache: Can't create slab cache " | ||
289 | "for the features = 0x%x\n", features); | ||
290 | ret = -ENOMEM; | ||
291 | goto out_free_name; | ||
292 | } | ||
293 | |||
294 | write_lock_bh(&nf_ct_cache_lock); | ||
295 | nf_ct_cache[features].use = 1; | ||
296 | nf_ct_cache[features].size = size; | ||
297 | nf_ct_cache[features].init_conntrack = init; | ||
298 | nf_ct_cache[features].cachep = cachep; | ||
299 | nf_ct_cache[features].name = cache_name; | ||
300 | write_unlock_bh(&nf_ct_cache_lock); | ||
301 | |||
302 | goto out_up_mutex; | ||
303 | |||
304 | out_free_name: | ||
305 | kfree(cache_name); | ||
306 | out_up_mutex: | ||
307 | up(&nf_ct_cache_mutex); | ||
308 | return ret; | ||
309 | } | ||
310 | |||
311 | /* FIXME: In the current, only nf_conntrack_cleanup() can call this function. */ | ||
312 | void nf_conntrack_unregister_cache(u_int32_t features) | ||
313 | { | ||
314 | kmem_cache_t *cachep; | ||
315 | char *name; | ||
316 | |||
317 | /* | ||
318 | * This assures that kmem_cache_create() isn't called before destroying | ||
319 | * slab cache. | ||
320 | */ | ||
321 | DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features); | ||
322 | down(&nf_ct_cache_mutex); | ||
323 | |||
324 | write_lock_bh(&nf_ct_cache_lock); | ||
325 | if (--nf_ct_cache[features].use > 0) { | ||
326 | write_unlock_bh(&nf_ct_cache_lock); | ||
327 | up(&nf_ct_cache_mutex); | ||
328 | return; | ||
329 | } | ||
330 | cachep = nf_ct_cache[features].cachep; | ||
331 | name = nf_ct_cache[features].name; | ||
332 | nf_ct_cache[features].cachep = NULL; | ||
333 | nf_ct_cache[features].name = NULL; | ||
334 | nf_ct_cache[features].init_conntrack = NULL; | ||
335 | nf_ct_cache[features].size = 0; | ||
336 | write_unlock_bh(&nf_ct_cache_lock); | ||
337 | |||
338 | synchronize_net(); | ||
339 | |||
340 | kmem_cache_destroy(cachep); | ||
341 | kfree(name); | ||
342 | |||
343 | up(&nf_ct_cache_mutex); | ||
344 | } | ||
345 | |||
346 | int | ||
347 | nf_ct_get_tuple(const struct sk_buff *skb, | ||
348 | unsigned int nhoff, | ||
349 | unsigned int dataoff, | ||
350 | u_int16_t l3num, | ||
351 | u_int8_t protonum, | ||
352 | struct nf_conntrack_tuple *tuple, | ||
353 | const struct nf_conntrack_l3proto *l3proto, | ||
354 | const struct nf_conntrack_protocol *protocol) | ||
355 | { | ||
356 | NF_CT_TUPLE_U_BLANK(tuple); | ||
357 | |||
358 | tuple->src.l3num = l3num; | ||
359 | if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0) | ||
360 | return 0; | ||
361 | |||
362 | tuple->dst.protonum = protonum; | ||
363 | tuple->dst.dir = IP_CT_DIR_ORIGINAL; | ||
364 | |||
365 | return protocol->pkt_to_tuple(skb, dataoff, tuple); | ||
366 | } | ||
367 | |||
368 | int | ||
369 | nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, | ||
370 | const struct nf_conntrack_tuple *orig, | ||
371 | const struct nf_conntrack_l3proto *l3proto, | ||
372 | const struct nf_conntrack_protocol *protocol) | ||
373 | { | ||
374 | NF_CT_TUPLE_U_BLANK(inverse); | ||
375 | |||
376 | inverse->src.l3num = orig->src.l3num; | ||
377 | if (l3proto->invert_tuple(inverse, orig) == 0) | ||
378 | return 0; | ||
379 | |||
380 | inverse->dst.dir = !orig->dst.dir; | ||
381 | |||
382 | inverse->dst.protonum = orig->dst.protonum; | ||
383 | return protocol->invert_tuple(inverse, orig); | ||
384 | } | ||
385 | |||
386 | /* nf_conntrack_expect helper functions */ | ||
387 | static void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) | ||
388 | { | ||
389 | ASSERT_WRITE_LOCK(&nf_conntrack_lock); | ||
390 | NF_CT_ASSERT(!timer_pending(&exp_timeout)); | ||
391 | list_del(&exp->list); | ||
392 | NF_CT_STAT_INC(expect_delete); | ||
393 | exp->master->expecting--; | ||
394 | nf_conntrack_expect_put(exp); | ||
395 | } | ||
396 | |||
397 | static void expectation_timed_out(unsigned long ul_expect) | ||
398 | { | ||
399 | struct nf_conntrack_expect *exp = (void *)ul_expect; | ||
400 | |||
401 | write_lock_bh(&nf_conntrack_lock); | ||
402 | nf_ct_unlink_expect(exp); | ||
403 | write_unlock_bh(&nf_conntrack_lock); | ||
404 | nf_conntrack_expect_put(exp); | ||
405 | } | ||
406 | |||
407 | /* If an expectation for this connection is found, it gets delete from | ||
408 | * global list then returned. */ | ||
409 | static struct nf_conntrack_expect * | ||
410 | find_expectation(const struct nf_conntrack_tuple *tuple) | ||
411 | { | ||
412 | struct nf_conntrack_expect *i; | ||
413 | |||
414 | list_for_each_entry(i, &nf_conntrack_expect_list, list) { | ||
415 | /* If master is not in hash table yet (ie. packet hasn't left | ||
416 | this machine yet), how can other end know about expected? | ||
417 | Hence these are not the droids you are looking for (if | ||
418 | master ct never got confirmed, we'd hold a reference to it | ||
419 | and weird things would happen to future packets). */ | ||
420 | if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) | ||
421 | && nf_ct_is_confirmed(i->master)) { | ||
422 | if (i->flags & NF_CT_EXPECT_PERMANENT) { | ||
423 | atomic_inc(&i->use); | ||
424 | return i; | ||
425 | } else if (del_timer(&i->timeout)) { | ||
426 | nf_ct_unlink_expect(i); | ||
427 | return i; | ||
428 | } | ||
429 | } | ||
430 | } | ||
431 | return NULL; | ||
432 | } | ||
433 | |||
434 | /* delete all expectations for this conntrack */ | ||
435 | static void remove_expectations(struct nf_conn *ct) | ||
436 | { | ||
437 | struct nf_conntrack_expect *i, *tmp; | ||
438 | |||
439 | /* Optimization: most connection never expect any others. */ | ||
440 | if (ct->expecting == 0) | ||
441 | return; | ||
442 | |||
443 | list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) { | ||
444 | if (i->master == ct && del_timer(&i->timeout)) { | ||
445 | nf_ct_unlink_expect(i); | ||
446 | nf_conntrack_expect_put(i); | ||
447 | } | ||
448 | } | ||
449 | } | ||
450 | |||
451 | static void | ||
452 | clean_from_lists(struct nf_conn *ct) | ||
453 | { | ||
454 | unsigned int ho, hr; | ||
455 | |||
456 | DEBUGP("clean_from_lists(%p)\n", ct); | ||
457 | ASSERT_WRITE_LOCK(&nf_conntrack_lock); | ||
458 | |||
459 | ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); | ||
460 | hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); | ||
461 | LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]); | ||
462 | LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); | ||
463 | |||
464 | /* Destroy all pending expectations */ | ||
465 | remove_expectations(ct); | ||
466 | } | ||
467 | |||
468 | static void | ||
469 | destroy_conntrack(struct nf_conntrack *nfct) | ||
470 | { | ||
471 | struct nf_conn *ct = (struct nf_conn *)nfct; | ||
472 | struct nf_conntrack_l3proto *l3proto; | ||
473 | struct nf_conntrack_protocol *proto; | ||
474 | |||
475 | DEBUGP("destroy_conntrack(%p)\n", ct); | ||
476 | NF_CT_ASSERT(atomic_read(&nfct->use) == 0); | ||
477 | NF_CT_ASSERT(!timer_pending(&ct->timeout)); | ||
478 | |||
479 | nf_conntrack_event(IPCT_DESTROY, ct); | ||
480 | set_bit(IPS_DYING_BIT, &ct->status); | ||
481 | |||
482 | /* To make sure we don't get any weird locking issues here: | ||
483 | * destroy_conntrack() MUST NOT be called with a write lock | ||
484 | * to nf_conntrack_lock!!! -HW */ | ||
485 | l3proto = nf_ct_find_l3proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num); | ||
486 | if (l3proto && l3proto->destroy) | ||
487 | l3proto->destroy(ct); | ||
488 | |||
489 | proto = nf_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, | ||
490 | ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); | ||
491 | if (proto && proto->destroy) | ||
492 | proto->destroy(ct); | ||
493 | |||
494 | if (nf_conntrack_destroyed) | ||
495 | nf_conntrack_destroyed(ct); | ||
496 | |||
497 | write_lock_bh(&nf_conntrack_lock); | ||
498 | /* Expectations will have been removed in clean_from_lists, | ||
499 | * except TFTP can create an expectation on the first packet, | ||
500 | * before connection is in the list, so we need to clean here, | ||
501 | * too. */ | ||
502 | remove_expectations(ct); | ||
503 | |||
504 | /* We overload first tuple to link into unconfirmed list. */ | ||
505 | if (!nf_ct_is_confirmed(ct)) { | ||
506 | BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list)); | ||
507 | list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); | ||
508 | } | ||
509 | |||
510 | NF_CT_STAT_INC(delete); | ||
511 | write_unlock_bh(&nf_conntrack_lock); | ||
512 | |||
513 | if (ct->master) | ||
514 | nf_ct_put(ct->master); | ||
515 | |||
516 | DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); | ||
517 | nf_conntrack_free(ct); | ||
518 | } | ||
519 | |||
520 | static void death_by_timeout(unsigned long ul_conntrack) | ||
521 | { | ||
522 | struct nf_conn *ct = (void *)ul_conntrack; | ||
523 | |||
524 | write_lock_bh(&nf_conntrack_lock); | ||
525 | /* Inside lock so preempt is disabled on module removal path. | ||
526 | * Otherwise we can get spurious warnings. */ | ||
527 | NF_CT_STAT_INC(delete_list); | ||
528 | clean_from_lists(ct); | ||
529 | write_unlock_bh(&nf_conntrack_lock); | ||
530 | nf_ct_put(ct); | ||
531 | } | ||
532 | |||
533 | static inline int | ||
534 | conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i, | ||
535 | const struct nf_conntrack_tuple *tuple, | ||
536 | const struct nf_conn *ignored_conntrack) | ||
537 | { | ||
538 | ASSERT_READ_LOCK(&nf_conntrack_lock); | ||
539 | return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack | ||
540 | && nf_ct_tuple_equal(tuple, &i->tuple); | ||
541 | } | ||
542 | |||
543 | static struct nf_conntrack_tuple_hash * | ||
544 | __nf_conntrack_find(const struct nf_conntrack_tuple *tuple, | ||
545 | const struct nf_conn *ignored_conntrack) | ||
546 | { | ||
547 | struct nf_conntrack_tuple_hash *h; | ||
548 | unsigned int hash = hash_conntrack(tuple); | ||
549 | |||
550 | ASSERT_READ_LOCK(&nf_conntrack_lock); | ||
551 | list_for_each_entry(h, &nf_conntrack_hash[hash], list) { | ||
552 | if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) { | ||
553 | NF_CT_STAT_INC(found); | ||
554 | return h; | ||
555 | } | ||
556 | NF_CT_STAT_INC(searched); | ||
557 | } | ||
558 | |||
559 | return NULL; | ||
560 | } | ||
561 | |||
562 | /* Find a connection corresponding to a tuple. */ | ||
563 | struct nf_conntrack_tuple_hash * | ||
564 | nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple, | ||
565 | const struct nf_conn *ignored_conntrack) | ||
566 | { | ||
567 | struct nf_conntrack_tuple_hash *h; | ||
568 | |||
569 | read_lock_bh(&nf_conntrack_lock); | ||
570 | h = __nf_conntrack_find(tuple, ignored_conntrack); | ||
571 | if (h) | ||
572 | atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use); | ||
573 | read_unlock_bh(&nf_conntrack_lock); | ||
574 | |||
575 | return h; | ||
576 | } | ||
577 | |||
578 | /* Confirm a connection given skb; places it in hash table */ | ||
579 | int | ||
580 | __nf_conntrack_confirm(struct sk_buff **pskb) | ||
581 | { | ||
582 | unsigned int hash, repl_hash; | ||
583 | struct nf_conn *ct; | ||
584 | enum ip_conntrack_info ctinfo; | ||
585 | |||
586 | ct = nf_ct_get(*pskb, &ctinfo); | ||
587 | |||
588 | /* ipt_REJECT uses nf_conntrack_attach to attach related | ||
589 | ICMP/TCP RST packets in other direction. Actual packet | ||
590 | which created connection will be IP_CT_NEW or for an | ||
591 | expected connection, IP_CT_RELATED. */ | ||
592 | if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) | ||
593 | return NF_ACCEPT; | ||
594 | |||
595 | hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); | ||
596 | repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); | ||
597 | |||
598 | /* We're not in hash table, and we refuse to set up related | ||
599 | connections for unconfirmed conns. But packet copies and | ||
600 | REJECT will give spurious warnings here. */ | ||
601 | /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */ | ||
602 | |||
603 | /* No external references means noone else could have | ||
604 | confirmed us. */ | ||
605 | NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); | ||
606 | DEBUGP("Confirming conntrack %p\n", ct); | ||
607 | |||
608 | write_lock_bh(&nf_conntrack_lock); | ||
609 | |||
610 | /* See if there's one in the list already, including reverse: | ||
611 | NAT could have grabbed it without realizing, since we're | ||
612 | not in the hash. If there is, we lost race. */ | ||
613 | if (!LIST_FIND(&nf_conntrack_hash[hash], | ||
614 | conntrack_tuple_cmp, | ||
615 | struct nf_conntrack_tuple_hash *, | ||
616 | &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL) | ||
617 | && !LIST_FIND(&nf_conntrack_hash[repl_hash], | ||
618 | conntrack_tuple_cmp, | ||
619 | struct nf_conntrack_tuple_hash *, | ||
620 | &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) { | ||
621 | /* Remove from unconfirmed list */ | ||
622 | list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); | ||
623 | |||
624 | list_prepend(&nf_conntrack_hash[hash], | ||
625 | &ct->tuplehash[IP_CT_DIR_ORIGINAL]); | ||
626 | list_prepend(&nf_conntrack_hash[repl_hash], | ||
627 | &ct->tuplehash[IP_CT_DIR_REPLY]); | ||
628 | /* Timer relative to confirmation time, not original | ||
629 | setting time, otherwise we'd get timer wrap in | ||
630 | weird delay cases. */ | ||
631 | ct->timeout.expires += jiffies; | ||
632 | add_timer(&ct->timeout); | ||
633 | atomic_inc(&ct->ct_general.use); | ||
634 | set_bit(IPS_CONFIRMED_BIT, &ct->status); | ||
635 | NF_CT_STAT_INC(insert); | ||
636 | write_unlock_bh(&nf_conntrack_lock); | ||
637 | if (ct->helper) | ||
638 | nf_conntrack_event_cache(IPCT_HELPER, *pskb); | ||
639 | #ifdef CONFIG_NF_NAT_NEEDED | ||
640 | if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) || | ||
641 | test_bit(IPS_DST_NAT_DONE_BIT, &ct->status)) | ||
642 | nf_conntrack_event_cache(IPCT_NATINFO, *pskb); | ||
643 | #endif | ||
644 | nf_conntrack_event_cache(master_ct(ct) ? | ||
645 | IPCT_RELATED : IPCT_NEW, *pskb); | ||
646 | return NF_ACCEPT; | ||
647 | } | ||
648 | |||
649 | NF_CT_STAT_INC(insert_failed); | ||
650 | write_unlock_bh(&nf_conntrack_lock); | ||
651 | return NF_DROP; | ||
652 | } | ||
653 | |||
654 | /* Returns true if a connection correspondings to the tuple (required | ||
655 | for NAT). */ | ||
656 | int | ||
657 | nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, | ||
658 | const struct nf_conn *ignored_conntrack) | ||
659 | { | ||
660 | struct nf_conntrack_tuple_hash *h; | ||
661 | |||
662 | read_lock_bh(&nf_conntrack_lock); | ||
663 | h = __nf_conntrack_find(tuple, ignored_conntrack); | ||
664 | read_unlock_bh(&nf_conntrack_lock); | ||
665 | |||
666 | return h != NULL; | ||
667 | } | ||
668 | |||
669 | /* There's a small race here where we may free a just-assured | ||
670 | connection. Too bad: we're in trouble anyway. */ | ||
671 | static inline int unreplied(const struct nf_conntrack_tuple_hash *i) | ||
672 | { | ||
673 | return !(test_bit(IPS_ASSURED_BIT, | ||
674 | &nf_ct_tuplehash_to_ctrack(i)->status)); | ||
675 | } | ||
676 | |||
677 | static int early_drop(struct list_head *chain) | ||
678 | { | ||
679 | /* Traverse backwards: gives us oldest, which is roughly LRU */ | ||
680 | struct nf_conntrack_tuple_hash *h; | ||
681 | struct nf_conn *ct = NULL; | ||
682 | int dropped = 0; | ||
683 | |||
684 | read_lock_bh(&nf_conntrack_lock); | ||
685 | h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *); | ||
686 | if (h) { | ||
687 | ct = nf_ct_tuplehash_to_ctrack(h); | ||
688 | atomic_inc(&ct->ct_general.use); | ||
689 | } | ||
690 | read_unlock_bh(&nf_conntrack_lock); | ||
691 | |||
692 | if (!ct) | ||
693 | return dropped; | ||
694 | |||
695 | if (del_timer(&ct->timeout)) { | ||
696 | death_by_timeout((unsigned long)ct); | ||
697 | dropped = 1; | ||
698 | NF_CT_STAT_INC(early_drop); | ||
699 | } | ||
700 | nf_ct_put(ct); | ||
701 | return dropped; | ||
702 | } | ||
703 | |||
704 | static inline int helper_cmp(const struct nf_conntrack_helper *i, | ||
705 | const struct nf_conntrack_tuple *rtuple) | ||
706 | { | ||
707 | return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask); | ||
708 | } | ||
709 | |||
710 | static struct nf_conntrack_helper * | ||
711 | nf_ct_find_helper(const struct nf_conntrack_tuple *tuple) | ||
712 | { | ||
713 | return LIST_FIND(&helpers, helper_cmp, | ||
714 | struct nf_conntrack_helper *, | ||
715 | tuple); | ||
716 | } | ||
717 | |||
718 | static struct nf_conn * | ||
719 | __nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, | ||
720 | const struct nf_conntrack_tuple *repl, | ||
721 | const struct nf_conntrack_l3proto *l3proto) | ||
722 | { | ||
723 | struct nf_conn *conntrack = NULL; | ||
724 | u_int32_t features = 0; | ||
725 | |||
726 | if (!nf_conntrack_hash_rnd_initted) { | ||
727 | get_random_bytes(&nf_conntrack_hash_rnd, 4); | ||
728 | nf_conntrack_hash_rnd_initted = 1; | ||
729 | } | ||
730 | |||
731 | if (nf_conntrack_max | ||
732 | && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) { | ||
733 | unsigned int hash = hash_conntrack(orig); | ||
734 | /* Try dropping from this hash chain. */ | ||
735 | if (!early_drop(&nf_conntrack_hash[hash])) { | ||
736 | if (net_ratelimit()) | ||
737 | printk(KERN_WARNING | ||
738 | "nf_conntrack: table full, dropping" | ||
739 | " packet.\n"); | ||
740 | return ERR_PTR(-ENOMEM); | ||
741 | } | ||
742 | } | ||
743 | |||
744 | /* find features needed by this conntrack. */ | ||
745 | features = l3proto->get_features(orig); | ||
746 | read_lock_bh(&nf_conntrack_lock); | ||
747 | if (nf_ct_find_helper(repl) != NULL) | ||
748 | features |= NF_CT_F_HELP; | ||
749 | read_unlock_bh(&nf_conntrack_lock); | ||
750 | |||
751 | DEBUGP("nf_conntrack_alloc: features=0x%x\n", features); | ||
752 | |||
753 | read_lock_bh(&nf_ct_cache_lock); | ||
754 | |||
755 | if (!nf_ct_cache[features].use) { | ||
756 | DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n", | ||
757 | features); | ||
758 | goto out; | ||
759 | } | ||
760 | |||
761 | conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC); | ||
762 | if (conntrack == NULL) { | ||
763 | DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n"); | ||
764 | goto out; | ||
765 | } | ||
766 | |||
767 | memset(conntrack, 0, nf_ct_cache[features].size); | ||
768 | conntrack->features = features; | ||
769 | if (nf_ct_cache[features].init_conntrack && | ||
770 | nf_ct_cache[features].init_conntrack(conntrack, features) < 0) { | ||
771 | DEBUGP("nf_conntrack_alloc: failed to init\n"); | ||
772 | kmem_cache_free(nf_ct_cache[features].cachep, conntrack); | ||
773 | conntrack = NULL; | ||
774 | goto out; | ||
775 | } | ||
776 | |||
777 | atomic_set(&conntrack->ct_general.use, 1); | ||
778 | conntrack->ct_general.destroy = destroy_conntrack; | ||
779 | conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; | ||
780 | conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; | ||
781 | /* Don't set timer yet: wait for confirmation */ | ||
782 | init_timer(&conntrack->timeout); | ||
783 | conntrack->timeout.data = (unsigned long)conntrack; | ||
784 | conntrack->timeout.function = death_by_timeout; | ||
785 | |||
786 | atomic_inc(&nf_conntrack_count); | ||
787 | out: | ||
788 | read_unlock_bh(&nf_ct_cache_lock); | ||
789 | return conntrack; | ||
790 | } | ||
791 | |||
792 | struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, | ||
793 | const struct nf_conntrack_tuple *repl) | ||
794 | { | ||
795 | struct nf_conntrack_l3proto *l3proto; | ||
796 | |||
797 | l3proto = nf_ct_find_l3proto(orig->src.l3num); | ||
798 | return __nf_conntrack_alloc(orig, repl, l3proto); | ||
799 | } | ||
800 | |||
801 | void nf_conntrack_free(struct nf_conn *conntrack) | ||
802 | { | ||
803 | u_int32_t features = conntrack->features; | ||
804 | NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM); | ||
805 | DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features, | ||
806 | conntrack); | ||
807 | kmem_cache_free(nf_ct_cache[features].cachep, conntrack); | ||
808 | atomic_dec(&nf_conntrack_count); | ||
809 | } | ||
810 | |||
811 | /* Allocate a new conntrack: we return -ENOMEM if classification | ||
812 | failed due to stress. Otherwise it really is unclassifiable. */ | ||
813 | static struct nf_conntrack_tuple_hash * | ||
814 | init_conntrack(const struct nf_conntrack_tuple *tuple, | ||
815 | struct nf_conntrack_l3proto *l3proto, | ||
816 | struct nf_conntrack_protocol *protocol, | ||
817 | struct sk_buff *skb, | ||
818 | unsigned int dataoff) | ||
819 | { | ||
820 | struct nf_conn *conntrack; | ||
821 | struct nf_conntrack_tuple repl_tuple; | ||
822 | struct nf_conntrack_expect *exp; | ||
823 | |||
824 | if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) { | ||
825 | DEBUGP("Can't invert tuple.\n"); | ||
826 | return NULL; | ||
827 | } | ||
828 | |||
829 | conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto); | ||
830 | if (conntrack == NULL || IS_ERR(conntrack)) { | ||
831 | DEBUGP("Can't allocate conntrack.\n"); | ||
832 | return (struct nf_conntrack_tuple_hash *)conntrack; | ||
833 | } | ||
834 | |||
835 | if (!protocol->new(conntrack, skb, dataoff)) { | ||
836 | nf_conntrack_free(conntrack); | ||
837 | DEBUGP("init conntrack: can't track with proto module\n"); | ||
838 | return NULL; | ||
839 | } | ||
840 | |||
841 | write_lock_bh(&nf_conntrack_lock); | ||
842 | exp = find_expectation(tuple); | ||
843 | |||
844 | if (exp) { | ||
845 | DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n", | ||
846 | conntrack, exp); | ||
847 | /* Welcome, Mr. Bond. We've been expecting you... */ | ||
848 | __set_bit(IPS_EXPECTED_BIT, &conntrack->status); | ||
849 | conntrack->master = exp->master; | ||
850 | #ifdef CONFIG_NF_CONNTRACK_MARK | ||
851 | conntrack->mark = exp->master->mark; | ||
852 | #endif | ||
853 | nf_conntrack_get(&conntrack->master->ct_general); | ||
854 | NF_CT_STAT_INC(expect_new); | ||
855 | } else { | ||
856 | conntrack->helper = nf_ct_find_helper(&repl_tuple); | ||
857 | |||
858 | NF_CT_STAT_INC(new); | ||
859 | } | ||
860 | |||
861 | /* Overload tuple linked list to put us in unconfirmed list. */ | ||
862 | list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); | ||
863 | |||
864 | write_unlock_bh(&nf_conntrack_lock); | ||
865 | |||
866 | if (exp) { | ||
867 | if (exp->expectfn) | ||
868 | exp->expectfn(conntrack, exp); | ||
869 | nf_conntrack_expect_put(exp); | ||
870 | } | ||
871 | |||
872 | return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]; | ||
873 | } | ||
874 | |||
875 | /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ | ||
876 | static inline struct nf_conn * | ||
877 | resolve_normal_ct(struct sk_buff *skb, | ||
878 | unsigned int dataoff, | ||
879 | u_int16_t l3num, | ||
880 | u_int8_t protonum, | ||
881 | struct nf_conntrack_l3proto *l3proto, | ||
882 | struct nf_conntrack_protocol *proto, | ||
883 | int *set_reply, | ||
884 | enum ip_conntrack_info *ctinfo) | ||
885 | { | ||
886 | struct nf_conntrack_tuple tuple; | ||
887 | struct nf_conntrack_tuple_hash *h; | ||
888 | struct nf_conn *ct; | ||
889 | |||
890 | if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data), | ||
891 | dataoff, l3num, protonum, &tuple, l3proto, | ||
892 | proto)) { | ||
893 | DEBUGP("resolve_normal_ct: Can't get tuple\n"); | ||
894 | return NULL; | ||
895 | } | ||
896 | |||
897 | /* look for tuple match */ | ||
898 | h = nf_conntrack_find_get(&tuple, NULL); | ||
899 | if (!h) { | ||
900 | h = init_conntrack(&tuple, l3proto, proto, skb, dataoff); | ||
901 | if (!h) | ||
902 | return NULL; | ||
903 | if (IS_ERR(h)) | ||
904 | return (void *)h; | ||
905 | } | ||
906 | ct = nf_ct_tuplehash_to_ctrack(h); | ||
907 | |||
908 | /* It exists; we have (non-exclusive) reference. */ | ||
909 | if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { | ||
910 | *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY; | ||
911 | /* Please set reply bit if this packet OK */ | ||
912 | *set_reply = 1; | ||
913 | } else { | ||
914 | /* Once we've had two way comms, always ESTABLISHED. */ | ||
915 | if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { | ||
916 | DEBUGP("nf_conntrack_in: normal packet for %p\n", ct); | ||
917 | *ctinfo = IP_CT_ESTABLISHED; | ||
918 | } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { | ||
919 | DEBUGP("nf_conntrack_in: related packet for %p\n", ct); | ||
920 | *ctinfo = IP_CT_RELATED; | ||
921 | } else { | ||
922 | DEBUGP("nf_conntrack_in: new packet for %p\n", ct); | ||
923 | *ctinfo = IP_CT_NEW; | ||
924 | } | ||
925 | *set_reply = 0; | ||
926 | } | ||
927 | skb->nfct = &ct->ct_general; | ||
928 | skb->nfctinfo = *ctinfo; | ||
929 | return ct; | ||
930 | } | ||
931 | |||
932 | unsigned int | ||
933 | nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb) | ||
934 | { | ||
935 | struct nf_conn *ct; | ||
936 | enum ip_conntrack_info ctinfo; | ||
937 | struct nf_conntrack_l3proto *l3proto; | ||
938 | struct nf_conntrack_protocol *proto; | ||
939 | unsigned int dataoff; | ||
940 | u_int8_t protonum; | ||
941 | int set_reply = 0; | ||
942 | int ret; | ||
943 | |||
944 | /* Previously seen (loopback or untracked)? Ignore. */ | ||
945 | if ((*pskb)->nfct) { | ||
946 | NF_CT_STAT_INC(ignore); | ||
947 | return NF_ACCEPT; | ||
948 | } | ||
949 | |||
950 | l3proto = nf_ct_find_l3proto((u_int16_t)pf); | ||
951 | if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) { | ||
952 | DEBUGP("not prepared to track yet or error occured\n"); | ||
953 | return -ret; | ||
954 | } | ||
955 | |||
956 | proto = nf_ct_find_proto((u_int16_t)pf, protonum); | ||
957 | |||
958 | /* It may be an special packet, error, unclean... | ||
959 | * inverse of the return code tells to the netfilter | ||
960 | * core what to do with the packet. */ | ||
961 | if (proto->error != NULL && | ||
962 | (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) { | ||
963 | NF_CT_STAT_INC(error); | ||
964 | NF_CT_STAT_INC(invalid); | ||
965 | return -ret; | ||
966 | } | ||
967 | |||
968 | ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto, | ||
969 | &set_reply, &ctinfo); | ||
970 | if (!ct) { | ||
971 | /* Not valid part of a connection */ | ||
972 | NF_CT_STAT_INC(invalid); | ||
973 | return NF_ACCEPT; | ||
974 | } | ||
975 | |||
976 | if (IS_ERR(ct)) { | ||
977 | /* Too stressed to deal. */ | ||
978 | NF_CT_STAT_INC(drop); | ||
979 | return NF_DROP; | ||
980 | } | ||
981 | |||
982 | NF_CT_ASSERT((*pskb)->nfct); | ||
983 | |||
984 | ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum); | ||
985 | if (ret < 0) { | ||
986 | /* Invalid: inverse of the return code tells | ||
987 | * the netfilter core what to do */ | ||
988 | DEBUGP("nf_conntrack_in: Can't track with proto module\n"); | ||
989 | nf_conntrack_put((*pskb)->nfct); | ||
990 | (*pskb)->nfct = NULL; | ||
991 | NF_CT_STAT_INC(invalid); | ||
992 | return -ret; | ||
993 | } | ||
994 | |||
995 | if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) | ||
996 | nf_conntrack_event_cache(IPCT_STATUS, *pskb); | ||
997 | |||
998 | return ret; | ||
999 | } | ||
1000 | |||
1001 | int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse, | ||
1002 | const struct nf_conntrack_tuple *orig) | ||
1003 | { | ||
1004 | return nf_ct_invert_tuple(inverse, orig, | ||
1005 | nf_ct_find_l3proto(orig->src.l3num), | ||
1006 | nf_ct_find_proto(orig->src.l3num, | ||
1007 | orig->dst.protonum)); | ||
1008 | } | ||
1009 | |||
1010 | /* Would two expected things clash? */ | ||
1011 | static inline int expect_clash(const struct nf_conntrack_expect *a, | ||
1012 | const struct nf_conntrack_expect *b) | ||
1013 | { | ||
1014 | /* Part covered by intersection of masks must be unequal, | ||
1015 | otherwise they clash */ | ||
1016 | struct nf_conntrack_tuple intersect_mask; | ||
1017 | int count; | ||
1018 | |||
1019 | intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num; | ||
1020 | intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all; | ||
1021 | intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all; | ||
1022 | intersect_mask.dst.protonum = a->mask.dst.protonum | ||
1023 | & b->mask.dst.protonum; | ||
1024 | |||
1025 | for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){ | ||
1026 | intersect_mask.src.u3.all[count] = | ||
1027 | a->mask.src.u3.all[count] & b->mask.src.u3.all[count]; | ||
1028 | } | ||
1029 | |||
1030 | for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){ | ||
1031 | intersect_mask.dst.u3.all[count] = | ||
1032 | a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count]; | ||
1033 | } | ||
1034 | |||
1035 | return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask); | ||
1036 | } | ||
1037 | |||
1038 | static inline int expect_matches(const struct nf_conntrack_expect *a, | ||
1039 | const struct nf_conntrack_expect *b) | ||
1040 | { | ||
1041 | return a->master == b->master | ||
1042 | && nf_ct_tuple_equal(&a->tuple, &b->tuple) | ||
1043 | && nf_ct_tuple_equal(&a->mask, &b->mask); | ||
1044 | } | ||
1045 | |||
1046 | /* Generally a bad idea to call this: could have matched already. */ | ||
1047 | void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp) | ||
1048 | { | ||
1049 | struct nf_conntrack_expect *i; | ||
1050 | |||
1051 | write_lock_bh(&nf_conntrack_lock); | ||
1052 | /* choose the the oldest expectation to evict */ | ||
1053 | list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) { | ||
1054 | if (expect_matches(i, exp) && del_timer(&i->timeout)) { | ||
1055 | nf_ct_unlink_expect(i); | ||
1056 | write_unlock_bh(&nf_conntrack_lock); | ||
1057 | nf_conntrack_expect_put(i); | ||
1058 | return; | ||
1059 | } | ||
1060 | } | ||
1061 | write_unlock_bh(&nf_conntrack_lock); | ||
1062 | } | ||
1063 | |||
1064 | /* We don't increase the master conntrack refcount for non-fulfilled | ||
1065 | * conntracks. During the conntrack destruction, the expectations are | ||
1066 | * always killed before the conntrack itself */ | ||
1067 | struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me) | ||
1068 | { | ||
1069 | struct nf_conntrack_expect *new; | ||
1070 | |||
1071 | new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC); | ||
1072 | if (!new) { | ||
1073 | DEBUGP("expect_related: OOM allocating expect\n"); | ||
1074 | return NULL; | ||
1075 | } | ||
1076 | new->master = me; | ||
1077 | atomic_set(&new->use, 1); | ||
1078 | return new; | ||
1079 | } | ||
1080 | |||
1081 | void nf_conntrack_expect_put(struct nf_conntrack_expect *exp) | ||
1082 | { | ||
1083 | if (atomic_dec_and_test(&exp->use)) | ||
1084 | kmem_cache_free(nf_conntrack_expect_cachep, exp); | ||
1085 | } | ||
1086 | |||
1087 | static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp) | ||
1088 | { | ||
1089 | atomic_inc(&exp->use); | ||
1090 | exp->master->expecting++; | ||
1091 | list_add(&exp->list, &nf_conntrack_expect_list); | ||
1092 | |||
1093 | init_timer(&exp->timeout); | ||
1094 | exp->timeout.data = (unsigned long)exp; | ||
1095 | exp->timeout.function = expectation_timed_out; | ||
1096 | exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ; | ||
1097 | add_timer(&exp->timeout); | ||
1098 | |||
1099 | atomic_inc(&exp->use); | ||
1100 | NF_CT_STAT_INC(expect_create); | ||
1101 | } | ||
1102 | |||
1103 | /* Race with expectations being used means we could have none to find; OK. */ | ||
1104 | static void evict_oldest_expect(struct nf_conn *master) | ||
1105 | { | ||
1106 | struct nf_conntrack_expect *i; | ||
1107 | |||
1108 | list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) { | ||
1109 | if (i->master == master) { | ||
1110 | if (del_timer(&i->timeout)) { | ||
1111 | nf_ct_unlink_expect(i); | ||
1112 | nf_conntrack_expect_put(i); | ||
1113 | } | ||
1114 | break; | ||
1115 | } | ||
1116 | } | ||
1117 | } | ||
1118 | |||
1119 | static inline int refresh_timer(struct nf_conntrack_expect *i) | ||
1120 | { | ||
1121 | if (!del_timer(&i->timeout)) | ||
1122 | return 0; | ||
1123 | |||
1124 | i->timeout.expires = jiffies + i->master->helper->timeout*HZ; | ||
1125 | add_timer(&i->timeout); | ||
1126 | return 1; | ||
1127 | } | ||
1128 | |||
1129 | int nf_conntrack_expect_related(struct nf_conntrack_expect *expect) | ||
1130 | { | ||
1131 | struct nf_conntrack_expect *i; | ||
1132 | int ret; | ||
1133 | |||
1134 | DEBUGP("nf_conntrack_expect_related %p\n", related_to); | ||
1135 | DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple); | ||
1136 | DEBUGP("mask: "); NF_CT_DUMP_TUPLE(&expect->mask); | ||
1137 | |||
1138 | write_lock_bh(&nf_conntrack_lock); | ||
1139 | list_for_each_entry(i, &nf_conntrack_expect_list, list) { | ||
1140 | if (expect_matches(i, expect)) { | ||
1141 | /* Refresh timer: if it's dying, ignore.. */ | ||
1142 | if (refresh_timer(i)) { | ||
1143 | ret = 0; | ||
1144 | goto out; | ||
1145 | } | ||
1146 | } else if (expect_clash(i, expect)) { | ||
1147 | ret = -EBUSY; | ||
1148 | goto out; | ||
1149 | } | ||
1150 | } | ||
1151 | /* Will be over limit? */ | ||
1152 | if (expect->master->helper->max_expected && | ||
1153 | expect->master->expecting >= expect->master->helper->max_expected) | ||
1154 | evict_oldest_expect(expect->master); | ||
1155 | |||
1156 | nf_conntrack_expect_insert(expect); | ||
1157 | nf_conntrack_expect_event(IPEXP_NEW, expect); | ||
1158 | ret = 0; | ||
1159 | out: | ||
1160 | write_unlock_bh(&nf_conntrack_lock); | ||
1161 | return ret; | ||
1162 | } | ||
1163 | |||
1164 | /* Alter reply tuple (maybe alter helper). This is for NAT, and is | ||
1165 | implicitly racy: see __nf_conntrack_confirm */ | ||
1166 | void nf_conntrack_alter_reply(struct nf_conn *conntrack, | ||
1167 | const struct nf_conntrack_tuple *newreply) | ||
1168 | { | ||
1169 | write_lock_bh(&nf_conntrack_lock); | ||
1170 | /* Should be unconfirmed, so not in hash table yet */ | ||
1171 | NF_CT_ASSERT(!nf_ct_is_confirmed(conntrack)); | ||
1172 | |||
1173 | DEBUGP("Altering reply tuple of %p to ", conntrack); | ||
1174 | NF_CT_DUMP_TUPLE(newreply); | ||
1175 | |||
1176 | conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; | ||
1177 | if (!conntrack->master && conntrack->expecting == 0) | ||
1178 | conntrack->helper = nf_ct_find_helper(newreply); | ||
1179 | write_unlock_bh(&nf_conntrack_lock); | ||
1180 | } | ||
1181 | |||
1182 | int nf_conntrack_helper_register(struct nf_conntrack_helper *me) | ||
1183 | { | ||
1184 | int ret; | ||
1185 | BUG_ON(me->timeout == 0); | ||
1186 | |||
1187 | ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help", | ||
1188 | sizeof(struct nf_conn) | ||
1189 | + sizeof(union nf_conntrack_help) | ||
1190 | + __alignof__(union nf_conntrack_help), | ||
1191 | init_conntrack_for_helper); | ||
1192 | if (ret < 0) { | ||
1193 | printk(KERN_ERR "nf_conntrack_helper_reigster: Unable to create slab cache for conntracks\n"); | ||
1194 | return ret; | ||
1195 | } | ||
1196 | write_lock_bh(&nf_conntrack_lock); | ||
1197 | list_prepend(&helpers, me); | ||
1198 | write_unlock_bh(&nf_conntrack_lock); | ||
1199 | |||
1200 | return 0; | ||
1201 | } | ||
1202 | |||
1203 | static inline int unhelp(struct nf_conntrack_tuple_hash *i, | ||
1204 | const struct nf_conntrack_helper *me) | ||
1205 | { | ||
1206 | if (nf_ct_tuplehash_to_ctrack(i)->helper == me) { | ||
1207 | nf_conntrack_event(IPCT_HELPER, nf_ct_tuplehash_to_ctrack(i)); | ||
1208 | nf_ct_tuplehash_to_ctrack(i)->helper = NULL; | ||
1209 | } | ||
1210 | return 0; | ||
1211 | } | ||
1212 | |||
1213 | void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) | ||
1214 | { | ||
1215 | unsigned int i; | ||
1216 | struct nf_conntrack_expect *exp, *tmp; | ||
1217 | |||
1218 | /* Need write lock here, to delete helper. */ | ||
1219 | write_lock_bh(&nf_conntrack_lock); | ||
1220 | LIST_DELETE(&helpers, me); | ||
1221 | |||
1222 | /* Get rid of expectations */ | ||
1223 | list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) { | ||
1224 | if (exp->master->helper == me && del_timer(&exp->timeout)) { | ||
1225 | nf_ct_unlink_expect(exp); | ||
1226 | nf_conntrack_expect_put(exp); | ||
1227 | } | ||
1228 | } | ||
1229 | |||
1230 | /* Get rid of expecteds, set helpers to NULL. */ | ||
1231 | LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me); | ||
1232 | for (i = 0; i < nf_conntrack_htable_size; i++) | ||
1233 | LIST_FIND_W(&nf_conntrack_hash[i], unhelp, | ||
1234 | struct nf_conntrack_tuple_hash *, me); | ||
1235 | write_unlock_bh(&nf_conntrack_lock); | ||
1236 | |||
1237 | /* Someone could be still looking at the helper in a bh. */ | ||
1238 | synchronize_net(); | ||
1239 | } | ||
1240 | |||
1241 | /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ | ||
1242 | void __nf_ct_refresh_acct(struct nf_conn *ct, | ||
1243 | enum ip_conntrack_info ctinfo, | ||
1244 | const struct sk_buff *skb, | ||
1245 | unsigned long extra_jiffies, | ||
1246 | int do_acct) | ||
1247 | { | ||
1248 | int event = 0; | ||
1249 | |||
1250 | NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); | ||
1251 | NF_CT_ASSERT(skb); | ||
1252 | |||
1253 | write_lock_bh(&nf_conntrack_lock); | ||
1254 | |||
1255 | /* If not in hash table, timer will not be active yet */ | ||
1256 | if (!nf_ct_is_confirmed(ct)) { | ||
1257 | ct->timeout.expires = extra_jiffies; | ||
1258 | event = IPCT_REFRESH; | ||
1259 | } else { | ||
1260 | /* Need del_timer for race avoidance (may already be dying). */ | ||
1261 | if (del_timer(&ct->timeout)) { | ||
1262 | ct->timeout.expires = jiffies + extra_jiffies; | ||
1263 | add_timer(&ct->timeout); | ||
1264 | event = IPCT_REFRESH; | ||
1265 | } | ||
1266 | } | ||
1267 | |||
1268 | #ifdef CONFIG_NF_CT_ACCT | ||
1269 | if (do_acct) { | ||
1270 | ct->counters[CTINFO2DIR(ctinfo)].packets++; | ||
1271 | ct->counters[CTINFO2DIR(ctinfo)].bytes += | ||
1272 | skb->len - (unsigned int)(skb->nh.raw - skb->data); | ||
1273 | if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000) | ||
1274 | || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000)) | ||
1275 | event |= IPCT_COUNTER_FILLING; | ||
1276 | } | ||
1277 | #endif | ||
1278 | |||
1279 | write_unlock_bh(&nf_conntrack_lock); | ||
1280 | |||
1281 | /* must be unlocked when calling event cache */ | ||
1282 | if (event) | ||
1283 | nf_conntrack_event_cache(event, skb); | ||
1284 | } | ||
1285 | |||
1286 | /* Used by ipt_REJECT and ip6t_REJECT. */ | ||
1287 | void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) | ||
1288 | { | ||
1289 | struct nf_conn *ct; | ||
1290 | enum ip_conntrack_info ctinfo; | ||
1291 | |||
1292 | /* This ICMP is in reverse direction to the packet which caused it */ | ||
1293 | ct = nf_ct_get(skb, &ctinfo); | ||
1294 | if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) | ||
1295 | ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY; | ||
1296 | else | ||
1297 | ctinfo = IP_CT_RELATED; | ||
1298 | |||
1299 | /* Attach to new skbuff, and increment count */ | ||
1300 | nskb->nfct = &ct->ct_general; | ||
1301 | nskb->nfctinfo = ctinfo; | ||
1302 | nf_conntrack_get(nskb->nfct); | ||
1303 | } | ||
1304 | |||
1305 | static inline int | ||
1306 | do_iter(const struct nf_conntrack_tuple_hash *i, | ||
1307 | int (*iter)(struct nf_conn *i, void *data), | ||
1308 | void *data) | ||
1309 | { | ||
1310 | return iter(nf_ct_tuplehash_to_ctrack(i), data); | ||
1311 | } | ||
1312 | |||
1313 | /* Bring out ya dead! */ | ||
1314 | static struct nf_conntrack_tuple_hash * | ||
1315 | get_next_corpse(int (*iter)(struct nf_conn *i, void *data), | ||
1316 | void *data, unsigned int *bucket) | ||
1317 | { | ||
1318 | struct nf_conntrack_tuple_hash *h = NULL; | ||
1319 | |||
1320 | write_lock_bh(&nf_conntrack_lock); | ||
1321 | for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { | ||
1322 | h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter, | ||
1323 | struct nf_conntrack_tuple_hash *, iter, data); | ||
1324 | if (h) | ||
1325 | break; | ||
1326 | } | ||
1327 | if (!h) | ||
1328 | h = LIST_FIND_W(&unconfirmed, do_iter, | ||
1329 | struct nf_conntrack_tuple_hash *, iter, data); | ||
1330 | if (h) | ||
1331 | atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use); | ||
1332 | write_unlock_bh(&nf_conntrack_lock); | ||
1333 | |||
1334 | return h; | ||
1335 | } | ||
1336 | |||
1337 | void | ||
1338 | nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data) | ||
1339 | { | ||
1340 | struct nf_conntrack_tuple_hash *h; | ||
1341 | unsigned int bucket = 0; | ||
1342 | |||
1343 | while ((h = get_next_corpse(iter, data, &bucket)) != NULL) { | ||
1344 | struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); | ||
1345 | /* Time to push up daises... */ | ||
1346 | if (del_timer(&ct->timeout)) | ||
1347 | death_by_timeout((unsigned long)ct); | ||
1348 | /* ... else the timer will get him soon. */ | ||
1349 | |||
1350 | nf_ct_put(ct); | ||
1351 | } | ||
1352 | } | ||
1353 | |||
1354 | static int kill_all(struct nf_conn *i, void *data) | ||
1355 | { | ||
1356 | return 1; | ||
1357 | } | ||
1358 | |||
1359 | static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size) | ||
1360 | { | ||
1361 | if (vmalloced) | ||
1362 | vfree(hash); | ||
1363 | else | ||
1364 | free_pages((unsigned long)hash, | ||
1365 | get_order(sizeof(struct list_head) * size)); | ||
1366 | } | ||
1367 | |||
1368 | /* Mishearing the voices in his head, our hero wonders how he's | ||
1369 | supposed to kill the mall. */ | ||
1370 | void nf_conntrack_cleanup(void) | ||
1371 | { | ||
1372 | int i; | ||
1373 | |||
1374 | /* This makes sure all current packets have passed through | ||
1375 | netfilter framework. Roll on, two-stage module | ||
1376 | delete... */ | ||
1377 | synchronize_net(); | ||
1378 | |||
1379 | nf_ct_event_cache_flush(); | ||
1380 | i_see_dead_people: | ||
1381 | nf_ct_iterate_cleanup(kill_all, NULL); | ||
1382 | if (atomic_read(&nf_conntrack_count) != 0) { | ||
1383 | schedule(); | ||
1384 | goto i_see_dead_people; | ||
1385 | } | ||
1386 | |||
1387 | for (i = 0; i < NF_CT_F_NUM; i++) { | ||
1388 | if (nf_ct_cache[i].use == 0) | ||
1389 | continue; | ||
1390 | |||
1391 | NF_CT_ASSERT(nf_ct_cache[i].use == 1); | ||
1392 | nf_ct_cache[i].use = 1; | ||
1393 | nf_conntrack_unregister_cache(i); | ||
1394 | } | ||
1395 | kmem_cache_destroy(nf_conntrack_expect_cachep); | ||
1396 | free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc, | ||
1397 | nf_conntrack_htable_size); | ||
1398 | } | ||
1399 | |||
1400 | static struct list_head *alloc_hashtable(int size, int *vmalloced) | ||
1401 | { | ||
1402 | struct list_head *hash; | ||
1403 | unsigned int i; | ||
1404 | |||
1405 | *vmalloced = 0; | ||
1406 | hash = (void*)__get_free_pages(GFP_KERNEL, | ||
1407 | get_order(sizeof(struct list_head) | ||
1408 | * size)); | ||
1409 | if (!hash) { | ||
1410 | *vmalloced = 1; | ||
1411 | printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n"); | ||
1412 | hash = vmalloc(sizeof(struct list_head) * size); | ||
1413 | } | ||
1414 | |||
1415 | if (hash) | ||
1416 | for (i = 0; i < size; i++) | ||
1417 | INIT_LIST_HEAD(&hash[i]); | ||
1418 | |||
1419 | return hash; | ||
1420 | } | ||
1421 | |||
1422 | int set_hashsize(const char *val, struct kernel_param *kp) | ||
1423 | { | ||
1424 | int i, bucket, hashsize, vmalloced; | ||
1425 | int old_vmalloced, old_size; | ||
1426 | int rnd; | ||
1427 | struct list_head *hash, *old_hash; | ||
1428 | struct nf_conntrack_tuple_hash *h; | ||
1429 | |||
1430 | /* On boot, we can set this without any fancy locking. */ | ||
1431 | if (!nf_conntrack_htable_size) | ||
1432 | return param_set_uint(val, kp); | ||
1433 | |||
1434 | hashsize = simple_strtol(val, NULL, 0); | ||
1435 | if (!hashsize) | ||
1436 | return -EINVAL; | ||
1437 | |||
1438 | hash = alloc_hashtable(hashsize, &vmalloced); | ||
1439 | if (!hash) | ||
1440 | return -ENOMEM; | ||
1441 | |||
1442 | /* We have to rehahs for the new table anyway, so we also can | ||
1443 | * use a newrandom seed */ | ||
1444 | get_random_bytes(&rnd, 4); | ||
1445 | |||
1446 | write_lock_bh(&nf_conntrack_lock); | ||
1447 | for (i = 0; i < nf_conntrack_htable_size; i++) { | ||
1448 | while (!list_empty(&nf_conntrack_hash[i])) { | ||
1449 | h = list_entry(nf_conntrack_hash[i].next, | ||
1450 | struct nf_conntrack_tuple_hash, list); | ||
1451 | list_del(&h->list); | ||
1452 | bucket = __hash_conntrack(&h->tuple, hashsize, rnd); | ||
1453 | list_add_tail(&h->list, &hash[bucket]); | ||
1454 | } | ||
1455 | } | ||
1456 | old_size = nf_conntrack_htable_size; | ||
1457 | old_vmalloced = nf_conntrack_vmalloc; | ||
1458 | old_hash = nf_conntrack_hash; | ||
1459 | |||
1460 | nf_conntrack_htable_size = hashsize; | ||
1461 | nf_conntrack_vmalloc = vmalloced; | ||
1462 | nf_conntrack_hash = hash; | ||
1463 | nf_conntrack_hash_rnd = rnd; | ||
1464 | write_unlock_bh(&nf_conntrack_lock); | ||
1465 | |||
1466 | free_conntrack_hash(old_hash, old_vmalloced, old_size); | ||
1467 | return 0; | ||
1468 | } | ||
1469 | |||
1470 | module_param_call(hashsize, set_hashsize, param_get_uint, | ||
1471 | &nf_conntrack_htable_size, 0600); | ||
1472 | |||
1473 | int __init nf_conntrack_init(void) | ||
1474 | { | ||
1475 | unsigned int i; | ||
1476 | int ret; | ||
1477 | |||
1478 | /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB | ||
1479 | * machine has 256 buckets. >= 1GB machines have 8192 buckets. */ | ||
1480 | if (!nf_conntrack_htable_size) { | ||
1481 | nf_conntrack_htable_size | ||
1482 | = (((num_physpages << PAGE_SHIFT) / 16384) | ||
1483 | / sizeof(struct list_head)); | ||
1484 | if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) | ||
1485 | nf_conntrack_htable_size = 8192; | ||
1486 | if (nf_conntrack_htable_size < 16) | ||
1487 | nf_conntrack_htable_size = 16; | ||
1488 | } | ||
1489 | nf_conntrack_max = 8 * nf_conntrack_htable_size; | ||
1490 | |||
1491 | printk("nf_conntrack version %s (%u buckets, %d max)\n", | ||
1492 | NF_CONNTRACK_VERSION, nf_conntrack_htable_size, | ||
1493 | nf_conntrack_max); | ||
1494 | |||
1495 | nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size, | ||
1496 | &nf_conntrack_vmalloc); | ||
1497 | if (!nf_conntrack_hash) { | ||
1498 | printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); | ||
1499 | goto err_out; | ||
1500 | } | ||
1501 | |||
1502 | ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic", | ||
1503 | sizeof(struct nf_conn), NULL); | ||
1504 | if (ret < 0) { | ||
1505 | printk(KERN_ERR "Unable to create nf_conn slab cache\n"); | ||
1506 | goto err_free_hash; | ||
1507 | } | ||
1508 | |||
1509 | nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect", | ||
1510 | sizeof(struct nf_conntrack_expect), | ||
1511 | 0, 0, NULL, NULL); | ||
1512 | if (!nf_conntrack_expect_cachep) { | ||
1513 | printk(KERN_ERR "Unable to create nf_expect slab cache\n"); | ||
1514 | goto err_free_conntrack_slab; | ||
1515 | } | ||
1516 | |||
1517 | /* Don't NEED lock here, but good form anyway. */ | ||
1518 | write_lock_bh(&nf_conntrack_lock); | ||
1519 | for (i = 0; i < PF_MAX; i++) | ||
1520 | nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto; | ||
1521 | write_unlock_bh(&nf_conntrack_lock); | ||
1522 | |||
1523 | /* Set up fake conntrack: | ||
1524 | - to never be deleted, not in any hashes */ | ||
1525 | atomic_set(&nf_conntrack_untracked.ct_general.use, 1); | ||
1526 | /* - and look it like as a confirmed connection */ | ||
1527 | set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status); | ||
1528 | |||
1529 | return ret; | ||
1530 | |||
1531 | err_free_conntrack_slab: | ||
1532 | nf_conntrack_unregister_cache(NF_CT_F_BASIC); | ||
1533 | err_free_hash: | ||
1534 | free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc, | ||
1535 | nf_conntrack_htable_size); | ||
1536 | err_out: | ||
1537 | return -ENOMEM; | ||
1538 | } | ||
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c new file mode 100644 index 000000000000..65080e269f27 --- /dev/null +++ b/net/netfilter/nf_conntrack_ftp.c | |||
@@ -0,0 +1,698 @@ | |||
1 | /* FTP extension for connection tracking. */ | ||
2 | |||
3 | /* (C) 1999-2001 Paul `Rusty' Russell | ||
4 | * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> | ||
5 | * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | * | ||
11 | * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> | ||
12 | * - enable working with Layer 3 protocol independent connection tracking. | ||
13 | * - track EPRT and EPSV commands with IPv6 address. | ||
14 | * | ||
15 | * Derived from net/ipv4/netfilter/ip_conntrack_ftp.c | ||
16 | */ | ||
17 | |||
18 | #include <linux/config.h> | ||
19 | #include <linux/module.h> | ||
20 | #include <linux/moduleparam.h> | ||
21 | #include <linux/netfilter.h> | ||
22 | #include <linux/ip.h> | ||
23 | #include <linux/ipv6.h> | ||
24 | #include <linux/ctype.h> | ||
25 | #include <net/checksum.h> | ||
26 | #include <net/tcp.h> | ||
27 | |||
28 | #include <net/netfilter/nf_conntrack.h> | ||
29 | #include <net/netfilter/nf_conntrack_helper.h> | ||
30 | #include <linux/netfilter/nf_conntrack_ftp.h> | ||
31 | |||
32 | MODULE_LICENSE("GPL"); | ||
33 | MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); | ||
34 | MODULE_DESCRIPTION("ftp connection tracking helper"); | ||
35 | |||
36 | /* This is slow, but it's simple. --RR */ | ||
37 | static char *ftp_buffer; | ||
38 | |||
39 | static DEFINE_SPINLOCK(nf_ftp_lock); | ||
40 | |||
41 | #define MAX_PORTS 8 | ||
42 | static u_int16_t ports[MAX_PORTS]; | ||
43 | static unsigned int ports_c; | ||
44 | module_param_array(ports, ushort, &ports_c, 0400); | ||
45 | |||
46 | static int loose; | ||
47 | module_param(loose, int, 0600); | ||
48 | |||
49 | unsigned int (*nf_nat_ftp_hook)(struct sk_buff **pskb, | ||
50 | enum ip_conntrack_info ctinfo, | ||
51 | enum ip_ct_ftp_type type, | ||
52 | unsigned int matchoff, | ||
53 | unsigned int matchlen, | ||
54 | struct nf_conntrack_expect *exp, | ||
55 | u32 *seq); | ||
56 | EXPORT_SYMBOL_GPL(nf_nat_ftp_hook); | ||
57 | |||
58 | #if 0 | ||
59 | #define DEBUGP printk | ||
60 | #else | ||
61 | #define DEBUGP(format, args...) | ||
62 | #endif | ||
63 | |||
64 | static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, char); | ||
65 | static int try_eprt(const char *, size_t, struct nf_conntrack_man *, char); | ||
66 | static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *, | ||
67 | char); | ||
68 | |||
69 | static struct ftp_search { | ||
70 | enum ip_conntrack_dir dir; | ||
71 | const char *pattern; | ||
72 | size_t plen; | ||
73 | char skip; | ||
74 | char term; | ||
75 | enum ip_ct_ftp_type ftptype; | ||
76 | int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char); | ||
77 | } search[] = { | ||
78 | { | ||
79 | IP_CT_DIR_ORIGINAL, | ||
80 | "PORT", sizeof("PORT") - 1, ' ', '\r', | ||
81 | IP_CT_FTP_PORT, | ||
82 | try_rfc959, | ||
83 | }, | ||
84 | { | ||
85 | IP_CT_DIR_REPLY, | ||
86 | "227 ", sizeof("227 ") - 1, '(', ')', | ||
87 | IP_CT_FTP_PASV, | ||
88 | try_rfc959, | ||
89 | }, | ||
90 | { | ||
91 | IP_CT_DIR_ORIGINAL, | ||
92 | "EPRT", sizeof("EPRT") - 1, ' ', '\r', | ||
93 | IP_CT_FTP_EPRT, | ||
94 | try_eprt, | ||
95 | }, | ||
96 | { | ||
97 | IP_CT_DIR_REPLY, | ||
98 | "229 ", sizeof("229 ") - 1, '(', ')', | ||
99 | IP_CT_FTP_EPSV, | ||
100 | try_epsv_response, | ||
101 | }, | ||
102 | }; | ||
103 | |||
104 | /* This code is based on inet_pton() in glibc-2.2.4 */ | ||
105 | static int | ||
106 | get_ipv6_addr(const char *src, size_t dlen, struct in6_addr *dst, u_int8_t term) | ||
107 | { | ||
108 | static const char xdigits[] = "0123456789abcdef"; | ||
109 | u_int8_t tmp[16], *tp, *endp, *colonp; | ||
110 | int ch, saw_xdigit; | ||
111 | u_int32_t val; | ||
112 | size_t clen = 0; | ||
113 | |||
114 | tp = memset(tmp, '\0', sizeof(tmp)); | ||
115 | endp = tp + sizeof(tmp); | ||
116 | colonp = NULL; | ||
117 | |||
118 | /* Leading :: requires some special handling. */ | ||
119 | if (*src == ':'){ | ||
120 | if (*++src != ':') { | ||
121 | DEBUGP("invalid \":\" at the head of addr\n"); | ||
122 | return 0; | ||
123 | } | ||
124 | clen++; | ||
125 | } | ||
126 | |||
127 | saw_xdigit = 0; | ||
128 | val = 0; | ||
129 | while ((clen < dlen) && (*src != term)) { | ||
130 | const char *pch; | ||
131 | |||
132 | ch = tolower(*src++); | ||
133 | clen++; | ||
134 | |||
135 | pch = strchr(xdigits, ch); | ||
136 | if (pch != NULL) { | ||
137 | val <<= 4; | ||
138 | val |= (pch - xdigits); | ||
139 | if (val > 0xffff) | ||
140 | return 0; | ||
141 | |||
142 | saw_xdigit = 1; | ||
143 | continue; | ||
144 | } | ||
145 | if (ch != ':') { | ||
146 | DEBUGP("get_ipv6_addr: invalid char. \'%c\'\n", ch); | ||
147 | return 0; | ||
148 | } | ||
149 | |||
150 | if (!saw_xdigit) { | ||
151 | if (colonp) { | ||
152 | DEBUGP("invalid location of \"::\".\n"); | ||
153 | return 0; | ||
154 | } | ||
155 | colonp = tp; | ||
156 | continue; | ||
157 | } else if (*src == term) { | ||
158 | DEBUGP("trancated IPv6 addr\n"); | ||
159 | return 0; | ||
160 | } | ||
161 | |||
162 | if (tp + 2 > endp) | ||
163 | return 0; | ||
164 | *tp++ = (u_int8_t) (val >> 8) & 0xff; | ||
165 | *tp++ = (u_int8_t) val & 0xff; | ||
166 | |||
167 | saw_xdigit = 0; | ||
168 | val = 0; | ||
169 | continue; | ||
170 | } | ||
171 | if (saw_xdigit) { | ||
172 | if (tp + 2 > endp) | ||
173 | return 0; | ||
174 | *tp++ = (u_int8_t) (val >> 8) & 0xff; | ||
175 | *tp++ = (u_int8_t) val & 0xff; | ||
176 | } | ||
177 | if (colonp != NULL) { | ||
178 | /* | ||
179 | * Since some memmove()'s erroneously fail to handle | ||
180 | * overlapping regions, we'll do the shift by hand. | ||
181 | */ | ||
182 | const int n = tp - colonp; | ||
183 | int i; | ||
184 | |||
185 | if (tp == endp) | ||
186 | return 0; | ||
187 | |||
188 | for (i = 1; i <= n; i++) { | ||
189 | endp[- i] = colonp[n - i]; | ||
190 | colonp[n - i] = 0; | ||
191 | } | ||
192 | tp = endp; | ||
193 | } | ||
194 | if (tp != endp || (*src != term)) | ||
195 | return 0; | ||
196 | |||
197 | memcpy(dst->s6_addr, tmp, sizeof(dst->s6_addr)); | ||
198 | return clen; | ||
199 | } | ||
200 | |||
201 | static int try_number(const char *data, size_t dlen, u_int32_t array[], | ||
202 | int array_size, char sep, char term) | ||
203 | { | ||
204 | u_int32_t i, len; | ||
205 | |||
206 | memset(array, 0, sizeof(array[0])*array_size); | ||
207 | |||
208 | /* Keep data pointing at next char. */ | ||
209 | for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) { | ||
210 | if (*data >= '0' && *data <= '9') { | ||
211 | array[i] = array[i]*10 + *data - '0'; | ||
212 | } | ||
213 | else if (*data == sep) | ||
214 | i++; | ||
215 | else { | ||
216 | /* Unexpected character; true if it's the | ||
217 | terminator and we're finished. */ | ||
218 | if (*data == term && i == array_size - 1) | ||
219 | return len; | ||
220 | |||
221 | DEBUGP("Char %u (got %u nums) `%u' unexpected\n", | ||
222 | len, i, *data); | ||
223 | return 0; | ||
224 | } | ||
225 | } | ||
226 | DEBUGP("Failed to fill %u numbers separated by %c\n", array_size, sep); | ||
227 | |||
228 | return 0; | ||
229 | } | ||
230 | |||
231 | /* Returns 0, or length of numbers: 192,168,1,1,5,6 */ | ||
232 | static int try_rfc959(const char *data, size_t dlen, | ||
233 | struct nf_conntrack_man *cmd, char term) | ||
234 | { | ||
235 | int length; | ||
236 | u_int32_t array[6]; | ||
237 | |||
238 | length = try_number(data, dlen, array, 6, ',', term); | ||
239 | if (length == 0) | ||
240 | return 0; | ||
241 | |||
242 | cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) | | ||
243 | (array[2] << 8) | array[3]); | ||
244 | cmd->u.tcp.port = htons((array[4] << 8) | array[5]); | ||
245 | return length; | ||
246 | } | ||
247 | |||
248 | /* Grab port: number up to delimiter */ | ||
249 | static int get_port(const char *data, int start, size_t dlen, char delim, | ||
250 | u_int16_t *port) | ||
251 | { | ||
252 | u_int16_t tmp_port = 0; | ||
253 | int i; | ||
254 | |||
255 | for (i = start; i < dlen; i++) { | ||
256 | /* Finished? */ | ||
257 | if (data[i] == delim) { | ||
258 | if (tmp_port == 0) | ||
259 | break; | ||
260 | *port = htons(tmp_port); | ||
261 | DEBUGP("get_port: return %d\n", tmp_port); | ||
262 | return i + 1; | ||
263 | } | ||
264 | else if (data[i] >= '0' && data[i] <= '9') | ||
265 | tmp_port = tmp_port*10 + data[i] - '0'; | ||
266 | else { /* Some other crap */ | ||
267 | DEBUGP("get_port: invalid char.\n"); | ||
268 | break; | ||
269 | } | ||
270 | } | ||
271 | return 0; | ||
272 | } | ||
273 | |||
274 | /* Returns 0, or length of numbers: |1|132.235.1.2|6275| or |2|3ffe::1|6275| */ | ||
275 | static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd, | ||
276 | char term) | ||
277 | { | ||
278 | char delim; | ||
279 | int length; | ||
280 | |||
281 | /* First character is delimiter, then "1" for IPv4 or "2" for IPv6, | ||
282 | then delimiter again. */ | ||
283 | if (dlen <= 3) { | ||
284 | DEBUGP("EPRT: too short\n"); | ||
285 | return 0; | ||
286 | } | ||
287 | delim = data[0]; | ||
288 | if (isdigit(delim) || delim < 33 || delim > 126 || data[2] != delim) { | ||
289 | DEBUGP("try_eprt: invalid delimitter.\n"); | ||
290 | return 0; | ||
291 | } | ||
292 | |||
293 | if ((cmd->l3num == PF_INET && data[1] != '1') || | ||
294 | (cmd->l3num == PF_INET6 && data[1] != '2')) { | ||
295 | DEBUGP("EPRT: invalid protocol number.\n"); | ||
296 | return 0; | ||
297 | } | ||
298 | |||
299 | DEBUGP("EPRT: Got %c%c%c\n", delim, data[1], delim); | ||
300 | |||
301 | if (data[1] == '1') { | ||
302 | u_int32_t array[4]; | ||
303 | |||
304 | /* Now we have IP address. */ | ||
305 | length = try_number(data + 3, dlen - 3, array, 4, '.', delim); | ||
306 | if (length != 0) | ||
307 | cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) | ||
308 | | (array[2] << 8) | array[3]); | ||
309 | } else { | ||
310 | /* Now we have IPv6 address. */ | ||
311 | length = get_ipv6_addr(data + 3, dlen - 3, | ||
312 | (struct in6_addr *)cmd->u3.ip6, delim); | ||
313 | } | ||
314 | |||
315 | if (length == 0) | ||
316 | return 0; | ||
317 | DEBUGP("EPRT: Got IP address!\n"); | ||
318 | /* Start offset includes initial "|1|", and trailing delimiter */ | ||
319 | return get_port(data, 3 + length + 1, dlen, delim, &cmd->u.tcp.port); | ||
320 | } | ||
321 | |||
322 | /* Returns 0, or length of numbers: |||6446| */ | ||
323 | static int try_epsv_response(const char *data, size_t dlen, | ||
324 | struct nf_conntrack_man *cmd, char term) | ||
325 | { | ||
326 | char delim; | ||
327 | |||
328 | /* Three delimiters. */ | ||
329 | if (dlen <= 3) return 0; | ||
330 | delim = data[0]; | ||
331 | if (isdigit(delim) || delim < 33 || delim > 126 | ||
332 | || data[1] != delim || data[2] != delim) | ||
333 | return 0; | ||
334 | |||
335 | return get_port(data, 3, dlen, delim, &cmd->u.tcp.port); | ||
336 | } | ||
337 | |||
338 | /* Return 1 for match, 0 for accept, -1 for partial. */ | ||
339 | static int find_pattern(const char *data, size_t dlen, | ||
340 | const char *pattern, size_t plen, | ||
341 | char skip, char term, | ||
342 | unsigned int *numoff, | ||
343 | unsigned int *numlen, | ||
344 | struct nf_conntrack_man *cmd, | ||
345 | int (*getnum)(const char *, size_t, | ||
346 | struct nf_conntrack_man *, char)) | ||
347 | { | ||
348 | size_t i; | ||
349 | |||
350 | DEBUGP("find_pattern `%s': dlen = %u\n", pattern, dlen); | ||
351 | if (dlen == 0) | ||
352 | return 0; | ||
353 | |||
354 | if (dlen <= plen) { | ||
355 | /* Short packet: try for partial? */ | ||
356 | if (strnicmp(data, pattern, dlen) == 0) | ||
357 | return -1; | ||
358 | else return 0; | ||
359 | } | ||
360 | |||
361 | if (strnicmp(data, pattern, plen) != 0) { | ||
362 | #if 0 | ||
363 | size_t i; | ||
364 | |||
365 | DEBUGP("ftp: string mismatch\n"); | ||
366 | for (i = 0; i < plen; i++) { | ||
367 | DEBUGP("ftp:char %u `%c'(%u) vs `%c'(%u)\n", | ||
368 | i, data[i], data[i], | ||
369 | pattern[i], pattern[i]); | ||
370 | } | ||
371 | #endif | ||
372 | return 0; | ||
373 | } | ||
374 | |||
375 | DEBUGP("Pattern matches!\n"); | ||
376 | /* Now we've found the constant string, try to skip | ||
377 | to the 'skip' character */ | ||
378 | for (i = plen; data[i] != skip; i++) | ||
379 | if (i == dlen - 1) return -1; | ||
380 | |||
381 | /* Skip over the last character */ | ||
382 | i++; | ||
383 | |||
384 | DEBUGP("Skipped up to `%c'!\n", skip); | ||
385 | |||
386 | *numoff = i; | ||
387 | *numlen = getnum(data + i, dlen - i, cmd, term); | ||
388 | if (!*numlen) | ||
389 | return -1; | ||
390 | |||
391 | DEBUGP("Match succeeded!\n"); | ||
392 | return 1; | ||
393 | } | ||
394 | |||
395 | /* Look up to see if we're just after a \n. */ | ||
396 | static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir) | ||
397 | { | ||
398 | unsigned int i; | ||
399 | |||
400 | for (i = 0; i < info->seq_aft_nl_num[dir]; i++) | ||
401 | if (info->seq_aft_nl[dir][i] == seq) | ||
402 | return 1; | ||
403 | return 0; | ||
404 | } | ||
405 | |||
406 | /* We don't update if it's older than what we have. */ | ||
407 | static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir, | ||
408 | struct sk_buff *skb) | ||
409 | { | ||
410 | unsigned int i, oldest = NUM_SEQ_TO_REMEMBER; | ||
411 | |||
412 | /* Look for oldest: if we find exact match, we're done. */ | ||
413 | for (i = 0; i < info->seq_aft_nl_num[dir]; i++) { | ||
414 | if (info->seq_aft_nl[dir][i] == nl_seq) | ||
415 | return; | ||
416 | |||
417 | if (oldest == info->seq_aft_nl_num[dir] | ||
418 | || before(info->seq_aft_nl[dir][i], oldest)) | ||
419 | oldest = i; | ||
420 | } | ||
421 | |||
422 | if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) { | ||
423 | info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; | ||
424 | nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); | ||
425 | } else if (oldest != NUM_SEQ_TO_REMEMBER) { | ||
426 | info->seq_aft_nl[dir][oldest] = nl_seq; | ||
427 | nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); | ||
428 | } | ||
429 | } | ||
430 | |||
431 | static int help(struct sk_buff **pskb, | ||
432 | unsigned int protoff, | ||
433 | struct nf_conn *ct, | ||
434 | enum ip_conntrack_info ctinfo) | ||
435 | { | ||
436 | unsigned int dataoff, datalen; | ||
437 | struct tcphdr _tcph, *th; | ||
438 | char *fb_ptr; | ||
439 | int ret; | ||
440 | u32 seq; | ||
441 | int dir = CTINFO2DIR(ctinfo); | ||
442 | unsigned int matchlen, matchoff; | ||
443 | struct ip_ct_ftp_master *ct_ftp_info = &ct->help->ct_ftp_info; | ||
444 | struct nf_conntrack_expect *exp; | ||
445 | struct nf_conntrack_man cmd = {}; | ||
446 | |||
447 | unsigned int i; | ||
448 | int found = 0, ends_in_nl; | ||
449 | |||
450 | /* Until there's been traffic both ways, don't look in packets. */ | ||
451 | if (ctinfo != IP_CT_ESTABLISHED | ||
452 | && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) { | ||
453 | DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo); | ||
454 | return NF_ACCEPT; | ||
455 | } | ||
456 | |||
457 | th = skb_header_pointer(*pskb, protoff, sizeof(_tcph), &_tcph); | ||
458 | if (th == NULL) | ||
459 | return NF_ACCEPT; | ||
460 | |||
461 | dataoff = protoff + th->doff * 4; | ||
462 | /* No data? */ | ||
463 | if (dataoff >= (*pskb)->len) { | ||
464 | DEBUGP("ftp: dataoff(%u) >= skblen(%u)\n", dataoff, | ||
465 | (*pskb)->len); | ||
466 | return NF_ACCEPT; | ||
467 | } | ||
468 | datalen = (*pskb)->len - dataoff; | ||
469 | |||
470 | spin_lock_bh(&nf_ftp_lock); | ||
471 | fb_ptr = skb_header_pointer(*pskb, dataoff, datalen, ftp_buffer); | ||
472 | BUG_ON(fb_ptr == NULL); | ||
473 | |||
474 | ends_in_nl = (fb_ptr[datalen - 1] == '\n'); | ||
475 | seq = ntohl(th->seq) + datalen; | ||
476 | |||
477 | /* Look up to see if we're just after a \n. */ | ||
478 | if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) { | ||
479 | /* Now if this ends in \n, update ftp info. */ | ||
480 | DEBUGP("nf_conntrack_ftp_help: wrong seq pos %s(%u) or %s(%u)\n", | ||
481 | ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET)", | ||
482 | ct_ftp_info->seq_aft_nl[dir][0], | ||
483 | ct_ftp_info->seq_aft_nl_num[dir] > 1 ? "" : "(UNSET)", | ||
484 | ct_ftp_info->seq_aft_nl[dir][1]); | ||
485 | ret = NF_ACCEPT; | ||
486 | goto out_update_nl; | ||
487 | } | ||
488 | |||
489 | /* Initialize IP/IPv6 addr to expected address (it's not mentioned | ||
490 | in EPSV responses) */ | ||
491 | cmd.l3num = ct->tuplehash[dir].tuple.src.l3num; | ||
492 | memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, | ||
493 | sizeof(cmd.u3.all)); | ||
494 | |||
495 | for (i = 0; i < ARRAY_SIZE(search); i++) { | ||
496 | if (search[i].dir != dir) continue; | ||
497 | |||
498 | found = find_pattern(fb_ptr, datalen, | ||
499 | search[i].pattern, | ||
500 | search[i].plen, | ||
501 | search[i].skip, | ||
502 | search[i].term, | ||
503 | &matchoff, &matchlen, | ||
504 | &cmd, | ||
505 | search[i].getnum); | ||
506 | if (found) break; | ||
507 | } | ||
508 | if (found == -1) { | ||
509 | /* We don't usually drop packets. After all, this is | ||
510 | connection tracking, not packet filtering. | ||
511 | However, it is necessary for accurate tracking in | ||
512 | this case. */ | ||
513 | if (net_ratelimit()) | ||
514 | printk("conntrack_ftp: partial %s %u+%u\n", | ||
515 | search[i].pattern, | ||
516 | ntohl(th->seq), datalen); | ||
517 | ret = NF_DROP; | ||
518 | goto out; | ||
519 | } else if (found == 0) { /* No match */ | ||
520 | ret = NF_ACCEPT; | ||
521 | goto out_update_nl; | ||
522 | } | ||
523 | |||
524 | DEBUGP("conntrack_ftp: match `%.*s' (%u bytes at %u)\n", | ||
525 | (int)matchlen, fb_ptr + matchoff, | ||
526 | matchlen, ntohl(th->seq) + matchoff); | ||
527 | |||
528 | exp = nf_conntrack_expect_alloc(ct); | ||
529 | if (exp == NULL) { | ||
530 | ret = NF_DROP; | ||
531 | goto out; | ||
532 | } | ||
533 | |||
534 | /* We refer to the reverse direction ("!dir") tuples here, | ||
535 | * because we're expecting something in the other direction. | ||
536 | * Doesn't matter unless NAT is happening. */ | ||
537 | exp->tuple.dst.u3 = ct->tuplehash[!dir].tuple.dst.u3; | ||
538 | |||
539 | /* Update the ftp info */ | ||
540 | if ((cmd.l3num == ct->tuplehash[dir].tuple.src.l3num) && | ||
541 | memcmp(&cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, | ||
542 | sizeof(cmd.u3.all))) { | ||
543 | /* Enrico Scholz's passive FTP to partially RNAT'd ftp | ||
544 | server: it really wants us to connect to a | ||
545 | different IP address. Simply don't record it for | ||
546 | NAT. */ | ||
547 | if (cmd.l3num == PF_INET) { | ||
548 | DEBUGP("conntrack_ftp: NOT RECORDING: %u,%u,%u,%u != %u.%u.%u.%u\n", | ||
549 | NIPQUAD(cmd.u3.ip), | ||
550 | NIPQUAD(ct->tuplehash[dir].tuple.src.u3.ip)); | ||
551 | } else { | ||
552 | DEBUGP("conntrack_ftp: NOT RECORDING: %x:%x:%x:%x:%x:%x:%x:%x != %x:%x:%x:%x:%x:%x:%x:%x\n", | ||
553 | NIP6(*((struct in6_addr *)cmd.u3.ip6)), | ||
554 | NIP6(*((struct in6_addr *)ct->tuplehash[dir] | ||
555 | .tuple.src.u3.ip6))); | ||
556 | } | ||
557 | |||
558 | /* Thanks to Cristiano Lincoln Mattos | ||
559 | <lincoln@cesar.org.br> for reporting this potential | ||
560 | problem (DMZ machines opening holes to internal | ||
561 | networks, or the packet filter itself). */ | ||
562 | if (!loose) { | ||
563 | ret = NF_ACCEPT; | ||
564 | goto out_put_expect; | ||
565 | } | ||
566 | memcpy(&exp->tuple.dst.u3, &cmd.u3.all, | ||
567 | sizeof(exp->tuple.dst.u3)); | ||
568 | } | ||
569 | |||
570 | exp->tuple.src.u3 = ct->tuplehash[!dir].tuple.src.u3; | ||
571 | exp->tuple.src.l3num = cmd.l3num; | ||
572 | exp->tuple.src.u.tcp.port = 0; | ||
573 | exp->tuple.dst.u.tcp.port = cmd.u.tcp.port; | ||
574 | exp->tuple.dst.protonum = IPPROTO_TCP; | ||
575 | |||
576 | exp->mask = (struct nf_conntrack_tuple) | ||
577 | { .src = { .l3num = 0xFFFF, | ||
578 | .u = { .tcp = { 0 }}, | ||
579 | }, | ||
580 | .dst = { .protonum = 0xFF, | ||
581 | .u = { .tcp = { 0xFFFF }}, | ||
582 | }, | ||
583 | }; | ||
584 | if (cmd.l3num == PF_INET) { | ||
585 | exp->mask.src.u3.ip = 0xFFFFFFFF; | ||
586 | exp->mask.dst.u3.ip = 0xFFFFFFFF; | ||
587 | } else { | ||
588 | memset(exp->mask.src.u3.ip6, 0xFF, | ||
589 | sizeof(exp->mask.src.u3.ip6)); | ||
590 | memset(exp->mask.dst.u3.ip6, 0xFF, | ||
591 | sizeof(exp->mask.src.u3.ip6)); | ||
592 | } | ||
593 | |||
594 | exp->expectfn = NULL; | ||
595 | exp->flags = 0; | ||
596 | |||
597 | /* Now, NAT might want to mangle the packet, and register the | ||
598 | * (possibly changed) expectation itself. */ | ||
599 | if (nf_nat_ftp_hook) | ||
600 | ret = nf_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, | ||
601 | matchoff, matchlen, exp, &seq); | ||
602 | else { | ||
603 | /* Can't expect this? Best to drop packet now. */ | ||
604 | if (nf_conntrack_expect_related(exp) != 0) | ||
605 | ret = NF_DROP; | ||
606 | else | ||
607 | ret = NF_ACCEPT; | ||
608 | } | ||
609 | |||
610 | out_put_expect: | ||
611 | nf_conntrack_expect_put(exp); | ||
612 | |||
613 | out_update_nl: | ||
614 | /* Now if this ends in \n, update ftp info. Seq may have been | ||
615 | * adjusted by NAT code. */ | ||
616 | if (ends_in_nl) | ||
617 | update_nl_seq(seq, ct_ftp_info, dir, *pskb); | ||
618 | out: | ||
619 | spin_unlock_bh(&nf_ftp_lock); | ||
620 | return ret; | ||
621 | } | ||
622 | |||
623 | static struct nf_conntrack_helper ftp[MAX_PORTS][2]; | ||
624 | static char ftp_names[MAX_PORTS][2][sizeof("ftp-65535")]; | ||
625 | |||
626 | /* don't make this __exit, since it's called from __init ! */ | ||
627 | static void fini(void) | ||
628 | { | ||
629 | int i, j; | ||
630 | for (i = 0; i < ports_c; i++) { | ||
631 | for (j = 0; j < 2; j++) { | ||
632 | if (ftp[i][j].me == NULL) | ||
633 | continue; | ||
634 | |||
635 | DEBUGP("nf_ct_ftp: unregistering helper for pf: %d " | ||
636 | "port: %d\n", | ||
637 | ftp[i][j].tuple.src.l3num, ports[i]); | ||
638 | nf_conntrack_helper_unregister(&ftp[i][j]); | ||
639 | } | ||
640 | } | ||
641 | |||
642 | kfree(ftp_buffer); | ||
643 | } | ||
644 | |||
645 | static int __init init(void) | ||
646 | { | ||
647 | int i, j = -1, ret = 0; | ||
648 | char *tmpname; | ||
649 | |||
650 | ftp_buffer = kmalloc(65536, GFP_KERNEL); | ||
651 | if (!ftp_buffer) | ||
652 | return -ENOMEM; | ||
653 | |||
654 | if (ports_c == 0) | ||
655 | ports[ports_c++] = FTP_PORT; | ||
656 | |||
657 | /* FIXME should be configurable whether IPv4 and IPv6 FTP connections | ||
658 | are tracked or not - YK */ | ||
659 | for (i = 0; i < ports_c; i++) { | ||
660 | memset(&ftp[i], 0, sizeof(struct nf_conntrack_helper)); | ||
661 | |||
662 | ftp[i][0].tuple.src.l3num = PF_INET; | ||
663 | ftp[i][1].tuple.src.l3num = PF_INET6; | ||
664 | for (j = 0; j < 2; j++) { | ||
665 | ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]); | ||
666 | ftp[i][j].tuple.dst.protonum = IPPROTO_TCP; | ||
667 | ftp[i][j].mask.src.u.tcp.port = 0xFFFF; | ||
668 | ftp[i][j].mask.dst.protonum = 0xFF; | ||
669 | ftp[i][j].max_expected = 1; | ||
670 | ftp[i][j].timeout = 5 * 60; /* 5 Minutes */ | ||
671 | ftp[i][j].me = THIS_MODULE; | ||
672 | ftp[i][j].help = help; | ||
673 | tmpname = &ftp_names[i][j][0]; | ||
674 | if (ports[i] == FTP_PORT) | ||
675 | sprintf(tmpname, "ftp"); | ||
676 | else | ||
677 | sprintf(tmpname, "ftp-%d", ports[i]); | ||
678 | ftp[i][j].name = tmpname; | ||
679 | |||
680 | DEBUGP("nf_ct_ftp: registering helper for pf: %d " | ||
681 | "port: %d\n", | ||
682 | ftp[i][j].tuple.src.l3num, ports[i]); | ||
683 | ret = nf_conntrack_helper_register(&ftp[i][j]); | ||
684 | if (ret) { | ||
685 | printk("nf_ct_ftp: failed to register helper " | ||
686 | " for pf: %d port: %d\n", | ||
687 | ftp[i][j].tuple.src.l3num, ports[i]); | ||
688 | fini(); | ||
689 | return ret; | ||
690 | } | ||
691 | } | ||
692 | } | ||
693 | |||
694 | return 0; | ||
695 | } | ||
696 | |||
697 | module_init(init); | ||
698 | module_exit(fini); | ||
diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c new file mode 100644 index 000000000000..7de4f06c63c5 --- /dev/null +++ b/net/netfilter/nf_conntrack_l3proto_generic.c | |||
@@ -0,0 +1,98 @@ | |||
1 | /* | ||
2 | * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> | ||
3 | * | ||
4 | * Based largely upon the original ip_conntrack code which | ||
5 | * had the following copyright information: | ||
6 | * | ||
7 | * (C) 1999-2001 Paul `Rusty' Russell | ||
8 | * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify | ||
11 | * it under the terms of the GNU General Public License version 2 as | ||
12 | * published by the Free Software Foundation. | ||
13 | * | ||
14 | * Author: | ||
15 | * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> | ||
16 | */ | ||
17 | |||
18 | #include <linux/config.h> | ||
19 | #include <linux/types.h> | ||
20 | #include <linux/ip.h> | ||
21 | #include <linux/netfilter.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/skbuff.h> | ||
24 | #include <linux/icmp.h> | ||
25 | #include <linux/sysctl.h> | ||
26 | #include <net/ip.h> | ||
27 | |||
28 | #include <linux/netfilter_ipv4.h> | ||
29 | #include <net/netfilter/nf_conntrack.h> | ||
30 | #include <net/netfilter/nf_conntrack_protocol.h> | ||
31 | #include <net/netfilter/nf_conntrack_l3proto.h> | ||
32 | #include <net/netfilter/nf_conntrack_core.h> | ||
33 | #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> | ||
34 | |||
35 | #if 0 | ||
36 | #define DEBUGP printk | ||
37 | #else | ||
38 | #define DEBUGP(format, args...) | ||
39 | #endif | ||
40 | |||
41 | DECLARE_PER_CPU(struct nf_conntrack_stat, nf_conntrack_stat); | ||
42 | |||
43 | static int generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, | ||
44 | struct nf_conntrack_tuple *tuple) | ||
45 | { | ||
46 | memset(&tuple->src.u3, 0, sizeof(tuple->src.u3)); | ||
47 | memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3)); | ||
48 | |||
49 | return 1; | ||
50 | } | ||
51 | |||
52 | static int generic_invert_tuple(struct nf_conntrack_tuple *tuple, | ||
53 | const struct nf_conntrack_tuple *orig) | ||
54 | { | ||
55 | memset(&tuple->src.u3, 0, sizeof(tuple->src.u3)); | ||
56 | memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3)); | ||
57 | |||
58 | return 1; | ||
59 | } | ||
60 | |||
61 | static int generic_print_tuple(struct seq_file *s, | ||
62 | const struct nf_conntrack_tuple *tuple) | ||
63 | { | ||
64 | return 0; | ||
65 | } | ||
66 | |||
67 | static int generic_print_conntrack(struct seq_file *s, | ||
68 | const struct nf_conn *conntrack) | ||
69 | { | ||
70 | return 0; | ||
71 | } | ||
72 | |||
73 | static int | ||
74 | generic_prepare(struct sk_buff **pskb, unsigned int hooknum, | ||
75 | unsigned int *dataoff, u_int8_t *protonum) | ||
76 | { | ||
77 | /* Never track !!! */ | ||
78 | return -NF_ACCEPT; | ||
79 | } | ||
80 | |||
81 | |||
82 | static u_int32_t generic_get_features(const struct nf_conntrack_tuple *tuple) | ||
83 | |||
84 | { | ||
85 | return NF_CT_F_BASIC; | ||
86 | } | ||
87 | |||
88 | struct nf_conntrack_l3proto nf_conntrack_generic_l3proto = { | ||
89 | .l3proto = PF_UNSPEC, | ||
90 | .name = "unknown", | ||
91 | .pkt_to_tuple = generic_pkt_to_tuple, | ||
92 | .invert_tuple = generic_invert_tuple, | ||
93 | .print_tuple = generic_print_tuple, | ||
94 | .print_conntrack = generic_print_conntrack, | ||
95 | .prepare = generic_prepare, | ||
96 | .get_features = generic_get_features, | ||
97 | .me = THIS_MODULE, | ||
98 | }; | ||
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c new file mode 100644 index 000000000000..36425f6c833f --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_generic.c | |||
@@ -0,0 +1,85 @@ | |||
1 | /* (C) 1999-2001 Paul `Rusty' Russell | ||
2 | * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> | ||
9 | * - enable working with L3 protocol independent connection tracking. | ||
10 | * | ||
11 | * Derived from net/ipv4/netfilter/ip_conntrack_proto_generic.c | ||
12 | */ | ||
13 | |||
14 | #include <linux/types.h> | ||
15 | #include <linux/sched.h> | ||
16 | #include <linux/timer.h> | ||
17 | #include <linux/netfilter.h> | ||
18 | #include <net/netfilter/nf_conntrack_protocol.h> | ||
19 | |||
20 | unsigned long nf_ct_generic_timeout = 600*HZ; | ||
21 | |||
22 | static int generic_pkt_to_tuple(const struct sk_buff *skb, | ||
23 | unsigned int dataoff, | ||
24 | struct nf_conntrack_tuple *tuple) | ||
25 | { | ||
26 | tuple->src.u.all = 0; | ||
27 | tuple->dst.u.all = 0; | ||
28 | |||
29 | return 1; | ||
30 | } | ||
31 | |||
32 | static int generic_invert_tuple(struct nf_conntrack_tuple *tuple, | ||
33 | const struct nf_conntrack_tuple *orig) | ||
34 | { | ||
35 | tuple->src.u.all = 0; | ||
36 | tuple->dst.u.all = 0; | ||
37 | |||
38 | return 1; | ||
39 | } | ||
40 | |||
41 | /* Print out the per-protocol part of the tuple. */ | ||
42 | static int generic_print_tuple(struct seq_file *s, | ||
43 | const struct nf_conntrack_tuple *tuple) | ||
44 | { | ||
45 | return 0; | ||
46 | } | ||
47 | |||
48 | /* Print out the private part of the conntrack. */ | ||
49 | static int generic_print_conntrack(struct seq_file *s, | ||
50 | const struct nf_conn *state) | ||
51 | { | ||
52 | return 0; | ||
53 | } | ||
54 | |||
55 | /* Returns verdict for packet, or -1 for invalid. */ | ||
56 | static int packet(struct nf_conn *conntrack, | ||
57 | const struct sk_buff *skb, | ||
58 | unsigned int dataoff, | ||
59 | enum ip_conntrack_info ctinfo, | ||
60 | int pf, | ||
61 | unsigned int hooknum) | ||
62 | { | ||
63 | nf_ct_refresh_acct(conntrack, ctinfo, skb, nf_ct_generic_timeout); | ||
64 | return NF_ACCEPT; | ||
65 | } | ||
66 | |||
67 | /* Called when a new connection for this protocol found. */ | ||
68 | static int new(struct nf_conn *conntrack, const struct sk_buff *skb, | ||
69 | unsigned int dataoff) | ||
70 | { | ||
71 | return 1; | ||
72 | } | ||
73 | |||
74 | struct nf_conntrack_protocol nf_conntrack_generic_protocol = | ||
75 | { | ||
76 | .l3proto = PF_UNSPEC, | ||
77 | .proto = 0, | ||
78 | .name = "unknown", | ||
79 | .pkt_to_tuple = generic_pkt_to_tuple, | ||
80 | .invert_tuple = generic_invert_tuple, | ||
81 | .print_tuple = generic_print_tuple, | ||
82 | .print_conntrack = generic_print_conntrack, | ||
83 | .packet = packet, | ||
84 | .new = new, | ||
85 | }; | ||
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c new file mode 100644 index 000000000000..3a600f77b4e0 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_sctp.c | |||
@@ -0,0 +1,670 @@ | |||
1 | /* | ||
2 | * Connection tracking protocol helper module for SCTP. | ||
3 | * | ||
4 | * SCTP is defined in RFC 2960. References to various sections in this code | ||
5 | * are to this RFC. | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | * | ||
11 | * 17 Oct 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> | ||
12 | * - enable working with L3 protocol independent connection tracking. | ||
13 | * | ||
14 | * Derived from net/ipv4/ip_conntrack_sctp.c | ||
15 | */ | ||
16 | |||
17 | /* | ||
18 | * Added support for proc manipulation of timeouts. | ||
19 | */ | ||
20 | |||
21 | #include <linux/types.h> | ||
22 | #include <linux/sched.h> | ||
23 | #include <linux/timer.h> | ||
24 | #include <linux/netfilter.h> | ||
25 | #include <linux/module.h> | ||
26 | #include <linux/in.h> | ||
27 | #include <linux/ip.h> | ||
28 | #include <linux/sctp.h> | ||
29 | #include <linux/string.h> | ||
30 | #include <linux/seq_file.h> | ||
31 | |||
32 | #include <net/netfilter/nf_conntrack.h> | ||
33 | #include <net/netfilter/nf_conntrack_protocol.h> | ||
34 | |||
35 | #if 0 | ||
36 | #define DEBUGP(format, ...) printk(format, ## __VA_ARGS__) | ||
37 | #else | ||
38 | #define DEBUGP(format, args...) | ||
39 | #endif | ||
40 | |||
41 | /* Protects conntrack->proto.sctp */ | ||
42 | static DEFINE_RWLOCK(sctp_lock); | ||
43 | |||
44 | /* FIXME: Examine ipfilter's timeouts and conntrack transitions more | ||
45 | closely. They're more complex. --RR | ||
46 | |||
47 | And so for me for SCTP :D -Kiran */ | ||
48 | |||
49 | static const char *sctp_conntrack_names[] = { | ||
50 | "NONE", | ||
51 | "CLOSED", | ||
52 | "COOKIE_WAIT", | ||
53 | "COOKIE_ECHOED", | ||
54 | "ESTABLISHED", | ||
55 | "SHUTDOWN_SENT", | ||
56 | "SHUTDOWN_RECD", | ||
57 | "SHUTDOWN_ACK_SENT", | ||
58 | }; | ||
59 | |||
60 | #define SECS * HZ | ||
61 | #define MINS * 60 SECS | ||
62 | #define HOURS * 60 MINS | ||
63 | #define DAYS * 24 HOURS | ||
64 | |||
65 | static unsigned long nf_ct_sctp_timeout_closed = 10 SECS; | ||
66 | static unsigned long nf_ct_sctp_timeout_cookie_wait = 3 SECS; | ||
67 | static unsigned long nf_ct_sctp_timeout_cookie_echoed = 3 SECS; | ||
68 | static unsigned long nf_ct_sctp_timeout_established = 5 DAYS; | ||
69 | static unsigned long nf_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000; | ||
70 | static unsigned long nf_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000; | ||
71 | static unsigned long nf_ct_sctp_timeout_shutdown_ack_sent = 3 SECS; | ||
72 | |||
73 | static unsigned long * sctp_timeouts[] | ||
74 | = { NULL, /* SCTP_CONNTRACK_NONE */ | ||
75 | &nf_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */ | ||
76 | &nf_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */ | ||
77 | &nf_ct_sctp_timeout_cookie_echoed, /* SCTP_CONNTRACK_COOKIE_ECHOED */ | ||
78 | &nf_ct_sctp_timeout_established, /* SCTP_CONNTRACK_ESTABLISHED */ | ||
79 | &nf_ct_sctp_timeout_shutdown_sent, /* SCTP_CONNTRACK_SHUTDOWN_SENT */ | ||
80 | &nf_ct_sctp_timeout_shutdown_recd, /* SCTP_CONNTRACK_SHUTDOWN_RECD */ | ||
81 | &nf_ct_sctp_timeout_shutdown_ack_sent /* SCTP_CONNTRACK_SHUTDOWN_ACK_SENT */ | ||
82 | }; | ||
83 | |||
84 | #define sNO SCTP_CONNTRACK_NONE | ||
85 | #define sCL SCTP_CONNTRACK_CLOSED | ||
86 | #define sCW SCTP_CONNTRACK_COOKIE_WAIT | ||
87 | #define sCE SCTP_CONNTRACK_COOKIE_ECHOED | ||
88 | #define sES SCTP_CONNTRACK_ESTABLISHED | ||
89 | #define sSS SCTP_CONNTRACK_SHUTDOWN_SENT | ||
90 | #define sSR SCTP_CONNTRACK_SHUTDOWN_RECD | ||
91 | #define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT | ||
92 | #define sIV SCTP_CONNTRACK_MAX | ||
93 | |||
94 | /* | ||
95 | These are the descriptions of the states: | ||
96 | |||
97 | NOTE: These state names are tantalizingly similar to the states of an | ||
98 | SCTP endpoint. But the interpretation of the states is a little different, | ||
99 | considering that these are the states of the connection and not of an end | ||
100 | point. Please note the subtleties. -Kiran | ||
101 | |||
102 | NONE - Nothing so far. | ||
103 | COOKIE WAIT - We have seen an INIT chunk in the original direction, or also | ||
104 | an INIT_ACK chunk in the reply direction. | ||
105 | COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. | ||
106 | ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. | ||
107 | SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. | ||
108 | SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin. | ||
109 | SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite | ||
110 | to that of the SHUTDOWN chunk. | ||
111 | CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of | ||
112 | the SHUTDOWN chunk. Connection is closed. | ||
113 | */ | ||
114 | |||
115 | /* TODO | ||
116 | - I have assumed that the first INIT is in the original direction. | ||
117 | This messes things when an INIT comes in the reply direction in CLOSED | ||
118 | state. | ||
119 | - Check the error type in the reply dir before transitioning from | ||
120 | cookie echoed to closed. | ||
121 | - Sec 5.2.4 of RFC 2960 | ||
122 | - Multi Homing support. | ||
123 | */ | ||
124 | |||
125 | /* SCTP conntrack state transitions */ | ||
126 | static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { | ||
127 | { | ||
128 | /* ORIGINAL */ | ||
129 | /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ | ||
130 | /* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA}, | ||
131 | /* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA}, | ||
132 | /* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, | ||
133 | /* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA}, | ||
134 | /* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA}, | ||
135 | /* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant have Stale cookie*/ | ||
136 | /* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */ | ||
137 | /* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in orig dir */ | ||
138 | /* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL} | ||
139 | }, | ||
140 | { | ||
141 | /* REPLY */ | ||
142 | /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ | ||
143 | /* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */ | ||
144 | /* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA}, | ||
145 | /* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, | ||
146 | /* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA}, | ||
147 | /* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA}, | ||
148 | /* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA}, | ||
149 | /* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in reply dir */ | ||
150 | /* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA}, | ||
151 | /* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL} | ||
152 | } | ||
153 | }; | ||
154 | |||
155 | static int sctp_pkt_to_tuple(const struct sk_buff *skb, | ||
156 | unsigned int dataoff, | ||
157 | struct nf_conntrack_tuple *tuple) | ||
158 | { | ||
159 | sctp_sctphdr_t _hdr, *hp; | ||
160 | |||
161 | DEBUGP(__FUNCTION__); | ||
162 | DEBUGP("\n"); | ||
163 | |||
164 | /* Actually only need first 8 bytes. */ | ||
165 | hp = skb_header_pointer(skb, dataoff, 8, &_hdr); | ||
166 | if (hp == NULL) | ||
167 | return 0; | ||
168 | |||
169 | tuple->src.u.sctp.port = hp->source; | ||
170 | tuple->dst.u.sctp.port = hp->dest; | ||
171 | return 1; | ||
172 | } | ||
173 | |||
174 | static int sctp_invert_tuple(struct nf_conntrack_tuple *tuple, | ||
175 | const struct nf_conntrack_tuple *orig) | ||
176 | { | ||
177 | DEBUGP(__FUNCTION__); | ||
178 | DEBUGP("\n"); | ||
179 | |||
180 | tuple->src.u.sctp.port = orig->dst.u.sctp.port; | ||
181 | tuple->dst.u.sctp.port = orig->src.u.sctp.port; | ||
182 | return 1; | ||
183 | } | ||
184 | |||
185 | /* Print out the per-protocol part of the tuple. */ | ||
186 | static int sctp_print_tuple(struct seq_file *s, | ||
187 | const struct nf_conntrack_tuple *tuple) | ||
188 | { | ||
189 | DEBUGP(__FUNCTION__); | ||
190 | DEBUGP("\n"); | ||
191 | |||
192 | return seq_printf(s, "sport=%hu dport=%hu ", | ||
193 | ntohs(tuple->src.u.sctp.port), | ||
194 | ntohs(tuple->dst.u.sctp.port)); | ||
195 | } | ||
196 | |||
197 | /* Print out the private part of the conntrack. */ | ||
198 | static int sctp_print_conntrack(struct seq_file *s, | ||
199 | const struct nf_conn *conntrack) | ||
200 | { | ||
201 | enum sctp_conntrack state; | ||
202 | |||
203 | DEBUGP(__FUNCTION__); | ||
204 | DEBUGP("\n"); | ||
205 | |||
206 | read_lock_bh(&sctp_lock); | ||
207 | state = conntrack->proto.sctp.state; | ||
208 | read_unlock_bh(&sctp_lock); | ||
209 | |||
210 | return seq_printf(s, "%s ", sctp_conntrack_names[state]); | ||
211 | } | ||
212 | |||
213 | #define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ | ||
214 | for (offset = dataoff + sizeof(sctp_sctphdr_t), count = 0; \ | ||
215 | offset < skb->len && \ | ||
216 | (sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch)); \ | ||
217 | offset += (htons(sch->length) + 3) & ~3, count++) | ||
218 | |||
219 | /* Some validity checks to make sure the chunks are fine */ | ||
220 | static int do_basic_checks(struct nf_conn *conntrack, | ||
221 | const struct sk_buff *skb, | ||
222 | unsigned int dataoff, | ||
223 | char *map) | ||
224 | { | ||
225 | u_int32_t offset, count; | ||
226 | sctp_chunkhdr_t _sch, *sch; | ||
227 | int flag; | ||
228 | |||
229 | DEBUGP(__FUNCTION__); | ||
230 | DEBUGP("\n"); | ||
231 | |||
232 | flag = 0; | ||
233 | |||
234 | for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { | ||
235 | DEBUGP("Chunk Num: %d Type: %d\n", count, sch->type); | ||
236 | |||
237 | if (sch->type == SCTP_CID_INIT | ||
238 | || sch->type == SCTP_CID_INIT_ACK | ||
239 | || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { | ||
240 | flag = 1; | ||
241 | } | ||
242 | |||
243 | /* Cookie Ack/Echo chunks not the first OR | ||
244 | Init / Init Ack / Shutdown compl chunks not the only chunks */ | ||
245 | if ((sch->type == SCTP_CID_COOKIE_ACK | ||
246 | || sch->type == SCTP_CID_COOKIE_ECHO | ||
247 | || flag) | ||
248 | && count !=0 ) { | ||
249 | DEBUGP("Basic checks failed\n"); | ||
250 | return 1; | ||
251 | } | ||
252 | |||
253 | if (map) { | ||
254 | set_bit(sch->type, (void *)map); | ||
255 | } | ||
256 | } | ||
257 | |||
258 | DEBUGP("Basic checks passed\n"); | ||
259 | return 0; | ||
260 | } | ||
261 | |||
262 | static int new_state(enum ip_conntrack_dir dir, | ||
263 | enum sctp_conntrack cur_state, | ||
264 | int chunk_type) | ||
265 | { | ||
266 | int i; | ||
267 | |||
268 | DEBUGP(__FUNCTION__); | ||
269 | DEBUGP("\n"); | ||
270 | |||
271 | DEBUGP("Chunk type: %d\n", chunk_type); | ||
272 | |||
273 | switch (chunk_type) { | ||
274 | case SCTP_CID_INIT: | ||
275 | DEBUGP("SCTP_CID_INIT\n"); | ||
276 | i = 0; break; | ||
277 | case SCTP_CID_INIT_ACK: | ||
278 | DEBUGP("SCTP_CID_INIT_ACK\n"); | ||
279 | i = 1; break; | ||
280 | case SCTP_CID_ABORT: | ||
281 | DEBUGP("SCTP_CID_ABORT\n"); | ||
282 | i = 2; break; | ||
283 | case SCTP_CID_SHUTDOWN: | ||
284 | DEBUGP("SCTP_CID_SHUTDOWN\n"); | ||
285 | i = 3; break; | ||
286 | case SCTP_CID_SHUTDOWN_ACK: | ||
287 | DEBUGP("SCTP_CID_SHUTDOWN_ACK\n"); | ||
288 | i = 4; break; | ||
289 | case SCTP_CID_ERROR: | ||
290 | DEBUGP("SCTP_CID_ERROR\n"); | ||
291 | i = 5; break; | ||
292 | case SCTP_CID_COOKIE_ECHO: | ||
293 | DEBUGP("SCTP_CID_COOKIE_ECHO\n"); | ||
294 | i = 6; break; | ||
295 | case SCTP_CID_COOKIE_ACK: | ||
296 | DEBUGP("SCTP_CID_COOKIE_ACK\n"); | ||
297 | i = 7; break; | ||
298 | case SCTP_CID_SHUTDOWN_COMPLETE: | ||
299 | DEBUGP("SCTP_CID_SHUTDOWN_COMPLETE\n"); | ||
300 | i = 8; break; | ||
301 | default: | ||
302 | /* Other chunks like DATA, SACK, HEARTBEAT and | ||
303 | its ACK do not cause a change in state */ | ||
304 | DEBUGP("Unknown chunk type, Will stay in %s\n", | ||
305 | sctp_conntrack_names[cur_state]); | ||
306 | return cur_state; | ||
307 | } | ||
308 | |||
309 | DEBUGP("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", | ||
310 | dir, sctp_conntrack_names[cur_state], chunk_type, | ||
311 | sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]); | ||
312 | |||
313 | return sctp_conntracks[dir][i][cur_state]; | ||
314 | } | ||
315 | |||
316 | /* Returns verdict for packet, or -1 for invalid. */ | ||
317 | static int sctp_packet(struct nf_conn *conntrack, | ||
318 | const struct sk_buff *skb, | ||
319 | unsigned int dataoff, | ||
320 | enum ip_conntrack_info ctinfo, | ||
321 | int pf, | ||
322 | unsigned int hooknum) | ||
323 | { | ||
324 | enum sctp_conntrack newconntrack, oldsctpstate; | ||
325 | sctp_sctphdr_t _sctph, *sh; | ||
326 | sctp_chunkhdr_t _sch, *sch; | ||
327 | u_int32_t offset, count; | ||
328 | char map[256 / sizeof (char)] = {0}; | ||
329 | |||
330 | DEBUGP(__FUNCTION__); | ||
331 | DEBUGP("\n"); | ||
332 | |||
333 | sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); | ||
334 | if (sh == NULL) | ||
335 | return -1; | ||
336 | |||
337 | if (do_basic_checks(conntrack, skb, dataoff, map) != 0) | ||
338 | return -1; | ||
339 | |||
340 | /* Check the verification tag (Sec 8.5) */ | ||
341 | if (!test_bit(SCTP_CID_INIT, (void *)map) | ||
342 | && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, (void *)map) | ||
343 | && !test_bit(SCTP_CID_COOKIE_ECHO, (void *)map) | ||
344 | && !test_bit(SCTP_CID_ABORT, (void *)map) | ||
345 | && !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map) | ||
346 | && (sh->vtag != conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { | ||
347 | DEBUGP("Verification tag check failed\n"); | ||
348 | return -1; | ||
349 | } | ||
350 | |||
351 | oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX; | ||
352 | for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { | ||
353 | write_lock_bh(&sctp_lock); | ||
354 | |||
355 | /* Special cases of Verification tag check (Sec 8.5.1) */ | ||
356 | if (sch->type == SCTP_CID_INIT) { | ||
357 | /* Sec 8.5.1 (A) */ | ||
358 | if (sh->vtag != 0) { | ||
359 | write_unlock_bh(&sctp_lock); | ||
360 | return -1; | ||
361 | } | ||
362 | } else if (sch->type == SCTP_CID_ABORT) { | ||
363 | /* Sec 8.5.1 (B) */ | ||
364 | if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) | ||
365 | && !(sh->vtag == conntrack->proto.sctp.vtag | ||
366 | [1 - CTINFO2DIR(ctinfo)])) { | ||
367 | write_unlock_bh(&sctp_lock); | ||
368 | return -1; | ||
369 | } | ||
370 | } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { | ||
371 | /* Sec 8.5.1 (C) */ | ||
372 | if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) | ||
373 | && !(sh->vtag == conntrack->proto.sctp.vtag | ||
374 | [1 - CTINFO2DIR(ctinfo)] | ||
375 | && (sch->flags & 1))) { | ||
376 | write_unlock_bh(&sctp_lock); | ||
377 | return -1; | ||
378 | } | ||
379 | } else if (sch->type == SCTP_CID_COOKIE_ECHO) { | ||
380 | /* Sec 8.5.1 (D) */ | ||
381 | if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { | ||
382 | write_unlock_bh(&sctp_lock); | ||
383 | return -1; | ||
384 | } | ||
385 | } | ||
386 | |||
387 | oldsctpstate = conntrack->proto.sctp.state; | ||
388 | newconntrack = new_state(CTINFO2DIR(ctinfo), oldsctpstate, sch->type); | ||
389 | |||
390 | /* Invalid */ | ||
391 | if (newconntrack == SCTP_CONNTRACK_MAX) { | ||
392 | DEBUGP("nf_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n", | ||
393 | CTINFO2DIR(ctinfo), sch->type, oldsctpstate); | ||
394 | write_unlock_bh(&sctp_lock); | ||
395 | return -1; | ||
396 | } | ||
397 | |||
398 | /* If it is an INIT or an INIT ACK note down the vtag */ | ||
399 | if (sch->type == SCTP_CID_INIT | ||
400 | || sch->type == SCTP_CID_INIT_ACK) { | ||
401 | sctp_inithdr_t _inithdr, *ih; | ||
402 | |||
403 | ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), | ||
404 | sizeof(_inithdr), &_inithdr); | ||
405 | if (ih == NULL) { | ||
406 | write_unlock_bh(&sctp_lock); | ||
407 | return -1; | ||
408 | } | ||
409 | DEBUGP("Setting vtag %x for dir %d\n", | ||
410 | ih->init_tag, !CTINFO2DIR(ctinfo)); | ||
411 | conntrack->proto.sctp.vtag[!CTINFO2DIR(ctinfo)] = ih->init_tag; | ||
412 | } | ||
413 | |||
414 | conntrack->proto.sctp.state = newconntrack; | ||
415 | if (oldsctpstate != newconntrack) | ||
416 | nf_conntrack_event_cache(IPCT_PROTOINFO, skb); | ||
417 | write_unlock_bh(&sctp_lock); | ||
418 | } | ||
419 | |||
420 | nf_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]); | ||
421 | |||
422 | if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED | ||
423 | && CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY | ||
424 | && newconntrack == SCTP_CONNTRACK_ESTABLISHED) { | ||
425 | DEBUGP("Setting assured bit\n"); | ||
426 | set_bit(IPS_ASSURED_BIT, &conntrack->status); | ||
427 | nf_conntrack_event_cache(IPCT_STATUS, skb); | ||
428 | } | ||
429 | |||
430 | return NF_ACCEPT; | ||
431 | } | ||
432 | |||
433 | /* Called when a new connection for this protocol found. */ | ||
434 | static int sctp_new(struct nf_conn *conntrack, const struct sk_buff *skb, | ||
435 | unsigned int dataoff) | ||
436 | { | ||
437 | enum sctp_conntrack newconntrack; | ||
438 | sctp_sctphdr_t _sctph, *sh; | ||
439 | sctp_chunkhdr_t _sch, *sch; | ||
440 | u_int32_t offset, count; | ||
441 | char map[256 / sizeof (char)] = {0}; | ||
442 | |||
443 | DEBUGP(__FUNCTION__); | ||
444 | DEBUGP("\n"); | ||
445 | |||
446 | sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); | ||
447 | if (sh == NULL) | ||
448 | return 0; | ||
449 | |||
450 | if (do_basic_checks(conntrack, skb, dataoff, map) != 0) | ||
451 | return 0; | ||
452 | |||
453 | /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ | ||
454 | if ((test_bit (SCTP_CID_ABORT, (void *)map)) | ||
455 | || (test_bit (SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)) | ||
456 | || (test_bit (SCTP_CID_COOKIE_ACK, (void *)map))) { | ||
457 | return 0; | ||
458 | } | ||
459 | |||
460 | newconntrack = SCTP_CONNTRACK_MAX; | ||
461 | for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { | ||
462 | /* Don't need lock here: this conntrack not in circulation yet */ | ||
463 | newconntrack = new_state(IP_CT_DIR_ORIGINAL, | ||
464 | SCTP_CONNTRACK_NONE, sch->type); | ||
465 | |||
466 | /* Invalid: delete conntrack */ | ||
467 | if (newconntrack == SCTP_CONNTRACK_MAX) { | ||
468 | DEBUGP("nf_conntrack_sctp: invalid new deleting.\n"); | ||
469 | return 0; | ||
470 | } | ||
471 | |||
472 | /* Copy the vtag into the state info */ | ||
473 | if (sch->type == SCTP_CID_INIT) { | ||
474 | if (sh->vtag == 0) { | ||
475 | sctp_inithdr_t _inithdr, *ih; | ||
476 | |||
477 | ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), | ||
478 | sizeof(_inithdr), &_inithdr); | ||
479 | if (ih == NULL) | ||
480 | return 0; | ||
481 | |||
482 | DEBUGP("Setting vtag %x for new conn\n", | ||
483 | ih->init_tag); | ||
484 | |||
485 | conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = | ||
486 | ih->init_tag; | ||
487 | } else { | ||
488 | /* Sec 8.5.1 (A) */ | ||
489 | return 0; | ||
490 | } | ||
491 | } | ||
492 | /* If it is a shutdown ack OOTB packet, we expect a return | ||
493 | shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ | ||
494 | else { | ||
495 | DEBUGP("Setting vtag %x for new conn OOTB\n", | ||
496 | sh->vtag); | ||
497 | conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; | ||
498 | } | ||
499 | |||
500 | conntrack->proto.sctp.state = newconntrack; | ||
501 | } | ||
502 | |||
503 | return 1; | ||
504 | } | ||
505 | |||
506 | struct nf_conntrack_protocol nf_conntrack_protocol_sctp4 = { | ||
507 | .l3proto = PF_INET, | ||
508 | .proto = IPPROTO_SCTP, | ||
509 | .name = "sctp", | ||
510 | .pkt_to_tuple = sctp_pkt_to_tuple, | ||
511 | .invert_tuple = sctp_invert_tuple, | ||
512 | .print_tuple = sctp_print_tuple, | ||
513 | .print_conntrack = sctp_print_conntrack, | ||
514 | .packet = sctp_packet, | ||
515 | .new = sctp_new, | ||
516 | .destroy = NULL, | ||
517 | .me = THIS_MODULE | ||
518 | }; | ||
519 | |||
520 | struct nf_conntrack_protocol nf_conntrack_protocol_sctp6 = { | ||
521 | .l3proto = PF_INET6, | ||
522 | .proto = IPPROTO_SCTP, | ||
523 | .name = "sctp", | ||
524 | .pkt_to_tuple = sctp_pkt_to_tuple, | ||
525 | .invert_tuple = sctp_invert_tuple, | ||
526 | .print_tuple = sctp_print_tuple, | ||
527 | .print_conntrack = sctp_print_conntrack, | ||
528 | .packet = sctp_packet, | ||
529 | .new = sctp_new, | ||
530 | .destroy = NULL, | ||
531 | .me = THIS_MODULE | ||
532 | }; | ||
533 | |||
534 | #ifdef CONFIG_SYSCTL | ||
535 | static ctl_table nf_ct_sysctl_table[] = { | ||
536 | { | ||
537 | .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, | ||
538 | .procname = "nf_conntrack_sctp_timeout_closed", | ||
539 | .data = &nf_ct_sctp_timeout_closed, | ||
540 | .maxlen = sizeof(unsigned int), | ||
541 | .mode = 0644, | ||
542 | .proc_handler = &proc_dointvec_jiffies, | ||
543 | }, | ||
544 | { | ||
545 | .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, | ||
546 | .procname = "nf_conntrack_sctp_timeout_cookie_wait", | ||
547 | .data = &nf_ct_sctp_timeout_cookie_wait, | ||
548 | .maxlen = sizeof(unsigned int), | ||
549 | .mode = 0644, | ||
550 | .proc_handler = &proc_dointvec_jiffies, | ||
551 | }, | ||
552 | { | ||
553 | .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, | ||
554 | .procname = "nf_conntrack_sctp_timeout_cookie_echoed", | ||
555 | .data = &nf_ct_sctp_timeout_cookie_echoed, | ||
556 | .maxlen = sizeof(unsigned int), | ||
557 | .mode = 0644, | ||
558 | .proc_handler = &proc_dointvec_jiffies, | ||
559 | }, | ||
560 | { | ||
561 | .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, | ||
562 | .procname = "nf_conntrack_sctp_timeout_established", | ||
563 | .data = &nf_ct_sctp_timeout_established, | ||
564 | .maxlen = sizeof(unsigned int), | ||
565 | .mode = 0644, | ||
566 | .proc_handler = &proc_dointvec_jiffies, | ||
567 | }, | ||
568 | { | ||
569 | .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, | ||
570 | .procname = "nf_conntrack_sctp_timeout_shutdown_sent", | ||
571 | .data = &nf_ct_sctp_timeout_shutdown_sent, | ||
572 | .maxlen = sizeof(unsigned int), | ||
573 | .mode = 0644, | ||
574 | .proc_handler = &proc_dointvec_jiffies, | ||
575 | }, | ||
576 | { | ||
577 | .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, | ||
578 | .procname = "nf_conntrack_sctp_timeout_shutdown_recd", | ||
579 | .data = &nf_ct_sctp_timeout_shutdown_recd, | ||
580 | .maxlen = sizeof(unsigned int), | ||
581 | .mode = 0644, | ||
582 | .proc_handler = &proc_dointvec_jiffies, | ||
583 | }, | ||
584 | { | ||
585 | .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, | ||
586 | .procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent", | ||
587 | .data = &nf_ct_sctp_timeout_shutdown_ack_sent, | ||
588 | .maxlen = sizeof(unsigned int), | ||
589 | .mode = 0644, | ||
590 | .proc_handler = &proc_dointvec_jiffies, | ||
591 | }, | ||
592 | { .ctl_name = 0 } | ||
593 | }; | ||
594 | |||
595 | static ctl_table nf_ct_netfilter_table[] = { | ||
596 | { | ||
597 | .ctl_name = NET_NETFILTER, | ||
598 | .procname = "netfilter", | ||
599 | .mode = 0555, | ||
600 | .child = nf_ct_sysctl_table, | ||
601 | }, | ||
602 | { .ctl_name = 0 } | ||
603 | }; | ||
604 | |||
605 | static ctl_table nf_ct_net_table[] = { | ||
606 | { | ||
607 | .ctl_name = CTL_NET, | ||
608 | .procname = "net", | ||
609 | .mode = 0555, | ||
610 | .child = nf_ct_netfilter_table, | ||
611 | }, | ||
612 | { .ctl_name = 0 } | ||
613 | }; | ||
614 | |||
615 | static struct ctl_table_header *nf_ct_sysctl_header; | ||
616 | #endif | ||
617 | |||
618 | int __init init(void) | ||
619 | { | ||
620 | int ret; | ||
621 | |||
622 | ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_sctp4); | ||
623 | if (ret) { | ||
624 | printk("nf_conntrack_proto_sctp4: protocol register failed\n"); | ||
625 | goto out; | ||
626 | } | ||
627 | ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_sctp6); | ||
628 | if (ret) { | ||
629 | printk("nf_conntrack_proto_sctp6: protocol register failed\n"); | ||
630 | goto cleanup_sctp4; | ||
631 | } | ||
632 | |||
633 | #ifdef CONFIG_SYSCTL | ||
634 | nf_ct_sysctl_header = register_sysctl_table(nf_ct_net_table, 0); | ||
635 | if (nf_ct_sysctl_header == NULL) { | ||
636 | printk("nf_conntrack_proto_sctp: can't register to sysctl.\n"); | ||
637 | goto cleanup; | ||
638 | } | ||
639 | #endif | ||
640 | |||
641 | return ret; | ||
642 | |||
643 | #ifdef CONFIG_SYSCTL | ||
644 | cleanup: | ||
645 | nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp6); | ||
646 | #endif | ||
647 | cleanup_sctp4: | ||
648 | nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp4); | ||
649 | out: | ||
650 | DEBUGP("SCTP conntrack module loading %s\n", | ||
651 | ret ? "failed": "succeeded"); | ||
652 | return ret; | ||
653 | } | ||
654 | |||
655 | void __exit fini(void) | ||
656 | { | ||
657 | nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp6); | ||
658 | nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp4); | ||
659 | #ifdef CONFIG_SYSCTL | ||
660 | unregister_sysctl_table(nf_ct_sysctl_header); | ||
661 | #endif | ||
662 | DEBUGP("SCTP conntrack module unloaded\n"); | ||
663 | } | ||
664 | |||
665 | module_init(init); | ||
666 | module_exit(fini); | ||
667 | |||
668 | MODULE_LICENSE("GPL"); | ||
669 | MODULE_AUTHOR("Kiran Kumar Immidi"); | ||
670 | MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP"); | ||
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c new file mode 100644 index 000000000000..83d90dd624f0 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_tcp.c | |||
@@ -0,0 +1,1162 @@ | |||
1 | /* (C) 1999-2001 Paul `Rusty' Russell | ||
2 | * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>: | ||
9 | * - Real stateful connection tracking | ||
10 | * - Modified state transitions table | ||
11 | * - Window scaling support added | ||
12 | * - SACK support added | ||
13 | * | ||
14 | * Willy Tarreau: | ||
15 | * - State table bugfixes | ||
16 | * - More robust state changes | ||
17 | * - Tuning timer parameters | ||
18 | * | ||
19 | * 27 Oct 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> | ||
20 | * - genelized Layer 3 protocol part. | ||
21 | * | ||
22 | * Derived from net/ipv4/netfilter/ip_conntrack_proto_tcp.c | ||
23 | * | ||
24 | * version 2.2 | ||
25 | */ | ||
26 | |||
27 | #include <linux/config.h> | ||
28 | #include <linux/types.h> | ||
29 | #include <linux/sched.h> | ||
30 | #include <linux/timer.h> | ||
31 | #include <linux/netfilter.h> | ||
32 | #include <linux/module.h> | ||
33 | #include <linux/in.h> | ||
34 | #include <linux/tcp.h> | ||
35 | #include <linux/spinlock.h> | ||
36 | #include <linux/skbuff.h> | ||
37 | #include <linux/ipv6.h> | ||
38 | #include <net/ip6_checksum.h> | ||
39 | |||
40 | #include <net/tcp.h> | ||
41 | |||
42 | #include <linux/netfilter.h> | ||
43 | #include <linux/netfilter_ipv4.h> | ||
44 | #include <linux/netfilter_ipv6.h> | ||
45 | #include <net/netfilter/nf_conntrack.h> | ||
46 | #include <net/netfilter/nf_conntrack_protocol.h> | ||
47 | |||
48 | #if 0 | ||
49 | #define DEBUGP printk | ||
50 | #define DEBUGP_VARS | ||
51 | #else | ||
52 | #define DEBUGP(format, args...) | ||
53 | #endif | ||
54 | |||
55 | /* Protects conntrack->proto.tcp */ | ||
56 | static DEFINE_RWLOCK(tcp_lock); | ||
57 | |||
58 | /* "Be conservative in what you do, | ||
59 | be liberal in what you accept from others." | ||
60 | If it's non-zero, we mark only out of window RST segments as INVALID. */ | ||
61 | int nf_ct_tcp_be_liberal = 0; | ||
62 | |||
63 | /* When connection is picked up from the middle, how many packets are required | ||
64 | to pass in each direction when we assume we are in sync - if any side uses | ||
65 | window scaling, we lost the game. | ||
66 | If it is set to zero, we disable picking up already established | ||
67 | connections. */ | ||
68 | int nf_ct_tcp_loose = 3; | ||
69 | |||
70 | /* Max number of the retransmitted packets without receiving an (acceptable) | ||
71 | ACK from the destination. If this number is reached, a shorter timer | ||
72 | will be started. */ | ||
73 | int nf_ct_tcp_max_retrans = 3; | ||
74 | |||
75 | /* FIXME: Examine ipfilter's timeouts and conntrack transitions more | ||
76 | closely. They're more complex. --RR */ | ||
77 | |||
78 | static const char *tcp_conntrack_names[] = { | ||
79 | "NONE", | ||
80 | "SYN_SENT", | ||
81 | "SYN_RECV", | ||
82 | "ESTABLISHED", | ||
83 | "FIN_WAIT", | ||
84 | "CLOSE_WAIT", | ||
85 | "LAST_ACK", | ||
86 | "TIME_WAIT", | ||
87 | "CLOSE", | ||
88 | "LISTEN" | ||
89 | }; | ||
90 | |||
91 | #define SECS * HZ | ||
92 | #define MINS * 60 SECS | ||
93 | #define HOURS * 60 MINS | ||
94 | #define DAYS * 24 HOURS | ||
95 | |||
96 | unsigned long nf_ct_tcp_timeout_syn_sent = 2 MINS; | ||
97 | unsigned long nf_ct_tcp_timeout_syn_recv = 60 SECS; | ||
98 | unsigned long nf_ct_tcp_timeout_established = 5 DAYS; | ||
99 | unsigned long nf_ct_tcp_timeout_fin_wait = 2 MINS; | ||
100 | unsigned long nf_ct_tcp_timeout_close_wait = 60 SECS; | ||
101 | unsigned long nf_ct_tcp_timeout_last_ack = 30 SECS; | ||
102 | unsigned long nf_ct_tcp_timeout_time_wait = 2 MINS; | ||
103 | unsigned long nf_ct_tcp_timeout_close = 10 SECS; | ||
104 | |||
105 | /* RFC1122 says the R2 limit should be at least 100 seconds. | ||
106 | Linux uses 15 packets as limit, which corresponds | ||
107 | to ~13-30min depending on RTO. */ | ||
108 | unsigned long nf_ct_tcp_timeout_max_retrans = 5 MINS; | ||
109 | |||
110 | static unsigned long * tcp_timeouts[] | ||
111 | = { NULL, /* TCP_CONNTRACK_NONE */ | ||
112 | &nf_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ | ||
113 | &nf_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ | ||
114 | &nf_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */ | ||
115 | &nf_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */ | ||
116 | &nf_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */ | ||
117 | &nf_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */ | ||
118 | &nf_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */ | ||
119 | &nf_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */ | ||
120 | NULL, /* TCP_CONNTRACK_LISTEN */ | ||
121 | }; | ||
122 | |||
123 | #define sNO TCP_CONNTRACK_NONE | ||
124 | #define sSS TCP_CONNTRACK_SYN_SENT | ||
125 | #define sSR TCP_CONNTRACK_SYN_RECV | ||
126 | #define sES TCP_CONNTRACK_ESTABLISHED | ||
127 | #define sFW TCP_CONNTRACK_FIN_WAIT | ||
128 | #define sCW TCP_CONNTRACK_CLOSE_WAIT | ||
129 | #define sLA TCP_CONNTRACK_LAST_ACK | ||
130 | #define sTW TCP_CONNTRACK_TIME_WAIT | ||
131 | #define sCL TCP_CONNTRACK_CLOSE | ||
132 | #define sLI TCP_CONNTRACK_LISTEN | ||
133 | #define sIV TCP_CONNTRACK_MAX | ||
134 | #define sIG TCP_CONNTRACK_IGNORE | ||
135 | |||
136 | /* What TCP flags are set from RST/SYN/FIN/ACK. */ | ||
137 | enum tcp_bit_set { | ||
138 | TCP_SYN_SET, | ||
139 | TCP_SYNACK_SET, | ||
140 | TCP_FIN_SET, | ||
141 | TCP_ACK_SET, | ||
142 | TCP_RST_SET, | ||
143 | TCP_NONE_SET, | ||
144 | }; | ||
145 | |||
146 | /* | ||
147 | * The TCP state transition table needs a few words... | ||
148 | * | ||
149 | * We are the man in the middle. All the packets go through us | ||
150 | * but might get lost in transit to the destination. | ||
151 | * It is assumed that the destinations can't receive segments | ||
152 | * we haven't seen. | ||
153 | * | ||
154 | * The checked segment is in window, but our windows are *not* | ||
155 | * equivalent with the ones of the sender/receiver. We always | ||
156 | * try to guess the state of the current sender. | ||
157 | * | ||
158 | * The meaning of the states are: | ||
159 | * | ||
160 | * NONE: initial state | ||
161 | * SYN_SENT: SYN-only packet seen | ||
162 | * SYN_RECV: SYN-ACK packet seen | ||
163 | * ESTABLISHED: ACK packet seen | ||
164 | * FIN_WAIT: FIN packet seen | ||
165 | * CLOSE_WAIT: ACK seen (after FIN) | ||
166 | * LAST_ACK: FIN seen (after FIN) | ||
167 | * TIME_WAIT: last ACK seen | ||
168 | * CLOSE: closed connection | ||
169 | * | ||
170 | * LISTEN state is not used. | ||
171 | * | ||
172 | * Packets marked as IGNORED (sIG): | ||
173 | * if they may be either invalid or valid | ||
174 | * and the receiver may send back a connection | ||
175 | * closing RST or a SYN/ACK. | ||
176 | * | ||
177 | * Packets marked as INVALID (sIV): | ||
178 | * if they are invalid | ||
179 | * or we do not support the request (simultaneous open) | ||
180 | */ | ||
181 | static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { | ||
182 | { | ||
183 | /* ORIGINAL */ | ||
184 | /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ | ||
185 | /*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV }, | ||
186 | /* | ||
187 | * sNO -> sSS Initialize a new connection | ||
188 | * sSS -> sSS Retransmitted SYN | ||
189 | * sSR -> sIG Late retransmitted SYN? | ||
190 | * sES -> sIG Error: SYNs in window outside the SYN_SENT state | ||
191 | * are errors. Receiver will reply with RST | ||
192 | * and close the connection. | ||
193 | * Or we are not in sync and hold a dead connection. | ||
194 | * sFW -> sIG | ||
195 | * sCW -> sIG | ||
196 | * sLA -> sIG | ||
197 | * sTW -> sSS Reopened connection (RFC 1122). | ||
198 | * sCL -> sSS | ||
199 | */ | ||
200 | /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ | ||
201 | /*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, | ||
202 | /* | ||
203 | * A SYN/ACK from the client is always invalid: | ||
204 | * - either it tries to set up a simultaneous open, which is | ||
205 | * not supported; | ||
206 | * - or the firewall has just been inserted between the two hosts | ||
207 | * during the session set-up. The SYN will be retransmitted | ||
208 | * by the true client (or it'll time out). | ||
209 | */ | ||
210 | /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ | ||
211 | /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, | ||
212 | /* | ||
213 | * sNO -> sIV Too late and no reason to do anything... | ||
214 | * sSS -> sIV Client migth not send FIN in this state: | ||
215 | * we enforce waiting for a SYN/ACK reply first. | ||
216 | * sSR -> sFW Close started. | ||
217 | * sES -> sFW | ||
218 | * sFW -> sLA FIN seen in both directions, waiting for | ||
219 | * the last ACK. | ||
220 | * Migth be a retransmitted FIN as well... | ||
221 | * sCW -> sLA | ||
222 | * sLA -> sLA Retransmitted FIN. Remain in the same state. | ||
223 | * sTW -> sTW | ||
224 | * sCL -> sCL | ||
225 | */ | ||
226 | /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ | ||
227 | /*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, | ||
228 | /* | ||
229 | * sNO -> sES Assumed. | ||
230 | * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. | ||
231 | * sSR -> sES Established state is reached. | ||
232 | * sES -> sES :-) | ||
233 | * sFW -> sCW Normal close request answered by ACK. | ||
234 | * sCW -> sCW | ||
235 | * sLA -> sTW Last ACK detected. | ||
236 | * sTW -> sTW Retransmitted last ACK. Remain in the same state. | ||
237 | * sCL -> sCL | ||
238 | */ | ||
239 | /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ | ||
240 | /*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, | ||
241 | /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } | ||
242 | }, | ||
243 | { | ||
244 | /* REPLY */ | ||
245 | /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ | ||
246 | /*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, | ||
247 | /* | ||
248 | * sNO -> sIV Never reached. | ||
249 | * sSS -> sIV Simultaneous open, not supported | ||
250 | * sSR -> sIV Simultaneous open, not supported. | ||
251 | * sES -> sIV Server may not initiate a connection. | ||
252 | * sFW -> sIV | ||
253 | * sCW -> sIV | ||
254 | * sLA -> sIV | ||
255 | * sTW -> sIV Reopened connection, but server may not do it. | ||
256 | * sCL -> sIV | ||
257 | */ | ||
258 | /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ | ||
259 | /*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV }, | ||
260 | /* | ||
261 | * sSS -> sSR Standard open. | ||
262 | * sSR -> sSR Retransmitted SYN/ACK. | ||
263 | * sES -> sIG Late retransmitted SYN/ACK? | ||
264 | * sFW -> sIG Might be SYN/ACK answering ignored SYN | ||
265 | * sCW -> sIG | ||
266 | * sLA -> sIG | ||
267 | * sTW -> sIG | ||
268 | * sCL -> sIG | ||
269 | */ | ||
270 | /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ | ||
271 | /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, | ||
272 | /* | ||
273 | * sSS -> sIV Server might not send FIN in this state. | ||
274 | * sSR -> sFW Close started. | ||
275 | * sES -> sFW | ||
276 | * sFW -> sLA FIN seen in both directions. | ||
277 | * sCW -> sLA | ||
278 | * sLA -> sLA Retransmitted FIN. | ||
279 | * sTW -> sTW | ||
280 | * sCL -> sCL | ||
281 | */ | ||
282 | /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ | ||
283 | /*ack*/ { sIV, sIV, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, | ||
284 | /* | ||
285 | * sSS -> sIV Might be a half-open connection. | ||
286 | * sSR -> sSR Might answer late resent SYN. | ||
287 | * sES -> sES :-) | ||
288 | * sFW -> sCW Normal close request answered by ACK. | ||
289 | * sCW -> sCW | ||
290 | * sLA -> sTW Last ACK detected. | ||
291 | * sTW -> sTW Retransmitted last ACK. | ||
292 | * sCL -> sCL | ||
293 | */ | ||
294 | /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ | ||
295 | /*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, | ||
296 | /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } | ||
297 | } | ||
298 | }; | ||
299 | |||
300 | static int tcp_pkt_to_tuple(const struct sk_buff *skb, | ||
301 | unsigned int dataoff, | ||
302 | struct nf_conntrack_tuple *tuple) | ||
303 | { | ||
304 | struct tcphdr _hdr, *hp; | ||
305 | |||
306 | /* Actually only need first 8 bytes. */ | ||
307 | hp = skb_header_pointer(skb, dataoff, 8, &_hdr); | ||
308 | if (hp == NULL) | ||
309 | return 0; | ||
310 | |||
311 | tuple->src.u.tcp.port = hp->source; | ||
312 | tuple->dst.u.tcp.port = hp->dest; | ||
313 | |||
314 | return 1; | ||
315 | } | ||
316 | |||
317 | static int tcp_invert_tuple(struct nf_conntrack_tuple *tuple, | ||
318 | const struct nf_conntrack_tuple *orig) | ||
319 | { | ||
320 | tuple->src.u.tcp.port = orig->dst.u.tcp.port; | ||
321 | tuple->dst.u.tcp.port = orig->src.u.tcp.port; | ||
322 | return 1; | ||
323 | } | ||
324 | |||
325 | /* Print out the per-protocol part of the tuple. */ | ||
326 | static int tcp_print_tuple(struct seq_file *s, | ||
327 | const struct nf_conntrack_tuple *tuple) | ||
328 | { | ||
329 | return seq_printf(s, "sport=%hu dport=%hu ", | ||
330 | ntohs(tuple->src.u.tcp.port), | ||
331 | ntohs(tuple->dst.u.tcp.port)); | ||
332 | } | ||
333 | |||
334 | /* Print out the private part of the conntrack. */ | ||
335 | static int tcp_print_conntrack(struct seq_file *s, | ||
336 | const struct nf_conn *conntrack) | ||
337 | { | ||
338 | enum tcp_conntrack state; | ||
339 | |||
340 | read_lock_bh(&tcp_lock); | ||
341 | state = conntrack->proto.tcp.state; | ||
342 | read_unlock_bh(&tcp_lock); | ||
343 | |||
344 | return seq_printf(s, "%s ", tcp_conntrack_names[state]); | ||
345 | } | ||
346 | |||
347 | static unsigned int get_conntrack_index(const struct tcphdr *tcph) | ||
348 | { | ||
349 | if (tcph->rst) return TCP_RST_SET; | ||
350 | else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET); | ||
351 | else if (tcph->fin) return TCP_FIN_SET; | ||
352 | else if (tcph->ack) return TCP_ACK_SET; | ||
353 | else return TCP_NONE_SET; | ||
354 | } | ||
355 | |||
356 | /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering | ||
357 | in IP Filter' by Guido van Rooij. | ||
358 | |||
359 | http://www.nluug.nl/events/sane2000/papers.html | ||
360 | http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz | ||
361 | |||
362 | The boundaries and the conditions are changed according to RFC793: | ||
363 | the packet must intersect the window (i.e. segments may be | ||
364 | after the right or before the left edge) and thus receivers may ACK | ||
365 | segments after the right edge of the window. | ||
366 | |||
367 | td_maxend = max(sack + max(win,1)) seen in reply packets | ||
368 | td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets | ||
369 | td_maxwin += seq + len - sender.td_maxend | ||
370 | if seq + len > sender.td_maxend | ||
371 | td_end = max(seq + len) seen in sent packets | ||
372 | |||
373 | I. Upper bound for valid data: seq <= sender.td_maxend | ||
374 | II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin | ||
375 | III. Upper bound for valid ack: sack <= receiver.td_end | ||
376 | IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW | ||
377 | |||
378 | where sack is the highest right edge of sack block found in the packet. | ||
379 | |||
380 | The upper bound limit for a valid ack is not ignored - | ||
381 | we doesn't have to deal with fragments. | ||
382 | */ | ||
383 | |||
384 | static inline __u32 segment_seq_plus_len(__u32 seq, | ||
385 | size_t len, | ||
386 | unsigned int dataoff, | ||
387 | struct tcphdr *tcph) | ||
388 | { | ||
389 | /* XXX Should I use payload length field in IP/IPv6 header ? | ||
390 | * - YK */ | ||
391 | return (seq + len - dataoff - tcph->doff*4 | ||
392 | + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0)); | ||
393 | } | ||
394 | |||
395 | /* Fixme: what about big packets? */ | ||
396 | #define MAXACKWINCONST 66000 | ||
397 | #define MAXACKWINDOW(sender) \ | ||
398 | ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \ | ||
399 | : MAXACKWINCONST) | ||
400 | |||
401 | /* | ||
402 | * Simplified tcp_parse_options routine from tcp_input.c | ||
403 | */ | ||
404 | static void tcp_options(const struct sk_buff *skb, | ||
405 | unsigned int dataoff, | ||
406 | struct tcphdr *tcph, | ||
407 | struct ip_ct_tcp_state *state) | ||
408 | { | ||
409 | unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; | ||
410 | unsigned char *ptr; | ||
411 | int length = (tcph->doff*4) - sizeof(struct tcphdr); | ||
412 | |||
413 | if (!length) | ||
414 | return; | ||
415 | |||
416 | ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), | ||
417 | length, buff); | ||
418 | BUG_ON(ptr == NULL); | ||
419 | |||
420 | state->td_scale = | ||
421 | state->flags = 0; | ||
422 | |||
423 | while (length > 0) { | ||
424 | int opcode=*ptr++; | ||
425 | int opsize; | ||
426 | |||
427 | switch (opcode) { | ||
428 | case TCPOPT_EOL: | ||
429 | return; | ||
430 | case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ | ||
431 | length--; | ||
432 | continue; | ||
433 | default: | ||
434 | opsize=*ptr++; | ||
435 | if (opsize < 2) /* "silly options" */ | ||
436 | return; | ||
437 | if (opsize > length) | ||
438 | break; /* don't parse partial options */ | ||
439 | |||
440 | if (opcode == TCPOPT_SACK_PERM | ||
441 | && opsize == TCPOLEN_SACK_PERM) | ||
442 | state->flags |= IP_CT_TCP_FLAG_SACK_PERM; | ||
443 | else if (opcode == TCPOPT_WINDOW | ||
444 | && opsize == TCPOLEN_WINDOW) { | ||
445 | state->td_scale = *(u_int8_t *)ptr; | ||
446 | |||
447 | if (state->td_scale > 14) { | ||
448 | /* See RFC1323 */ | ||
449 | state->td_scale = 14; | ||
450 | } | ||
451 | state->flags |= | ||
452 | IP_CT_TCP_FLAG_WINDOW_SCALE; | ||
453 | } | ||
454 | ptr += opsize - 2; | ||
455 | length -= opsize; | ||
456 | } | ||
457 | } | ||
458 | } | ||
459 | |||
460 | static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, | ||
461 | struct tcphdr *tcph, __u32 *sack) | ||
462 | { | ||
463 | unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; | ||
464 | unsigned char *ptr; | ||
465 | int length = (tcph->doff*4) - sizeof(struct tcphdr); | ||
466 | __u32 tmp; | ||
467 | |||
468 | if (!length) | ||
469 | return; | ||
470 | |||
471 | ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), | ||
472 | length, buff); | ||
473 | BUG_ON(ptr == NULL); | ||
474 | |||
475 | /* Fast path for timestamp-only option */ | ||
476 | if (length == TCPOLEN_TSTAMP_ALIGNED*4 | ||
477 | && *(__u32 *)ptr == | ||
478 | __constant_ntohl((TCPOPT_NOP << 24) | ||
479 | | (TCPOPT_NOP << 16) | ||
480 | | (TCPOPT_TIMESTAMP << 8) | ||
481 | | TCPOLEN_TIMESTAMP)) | ||
482 | return; | ||
483 | |||
484 | while (length > 0) { | ||
485 | int opcode = *ptr++; | ||
486 | int opsize, i; | ||
487 | |||
488 | switch (opcode) { | ||
489 | case TCPOPT_EOL: | ||
490 | return; | ||
491 | case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ | ||
492 | length--; | ||
493 | continue; | ||
494 | default: | ||
495 | opsize = *ptr++; | ||
496 | if (opsize < 2) /* "silly options" */ | ||
497 | return; | ||
498 | if (opsize > length) | ||
499 | break; /* don't parse partial options */ | ||
500 | |||
501 | if (opcode == TCPOPT_SACK | ||
502 | && opsize >= (TCPOLEN_SACK_BASE | ||
503 | + TCPOLEN_SACK_PERBLOCK) | ||
504 | && !((opsize - TCPOLEN_SACK_BASE) | ||
505 | % TCPOLEN_SACK_PERBLOCK)) { | ||
506 | for (i = 0; | ||
507 | i < (opsize - TCPOLEN_SACK_BASE); | ||
508 | i += TCPOLEN_SACK_PERBLOCK) { | ||
509 | memcpy(&tmp, (__u32 *)(ptr + i) + 1, | ||
510 | sizeof(__u32)); | ||
511 | tmp = ntohl(tmp); | ||
512 | |||
513 | if (after(tmp, *sack)) | ||
514 | *sack = tmp; | ||
515 | } | ||
516 | return; | ||
517 | } | ||
518 | ptr += opsize - 2; | ||
519 | length -= opsize; | ||
520 | } | ||
521 | } | ||
522 | } | ||
523 | |||
524 | static int tcp_in_window(struct ip_ct_tcp *state, | ||
525 | enum ip_conntrack_dir dir, | ||
526 | unsigned int index, | ||
527 | const struct sk_buff *skb, | ||
528 | unsigned int dataoff, | ||
529 | struct tcphdr *tcph, | ||
530 | int pf) | ||
531 | { | ||
532 | struct ip_ct_tcp_state *sender = &state->seen[dir]; | ||
533 | struct ip_ct_tcp_state *receiver = &state->seen[!dir]; | ||
534 | __u32 seq, ack, sack, end, win, swin; | ||
535 | int res; | ||
536 | |||
537 | /* | ||
538 | * Get the required data from the packet. | ||
539 | */ | ||
540 | seq = ntohl(tcph->seq); | ||
541 | ack = sack = ntohl(tcph->ack_seq); | ||
542 | win = ntohs(tcph->window); | ||
543 | end = segment_seq_plus_len(seq, skb->len, dataoff, tcph); | ||
544 | |||
545 | if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) | ||
546 | tcp_sack(skb, dataoff, tcph, &sack); | ||
547 | |||
548 | DEBUGP("tcp_in_window: START\n"); | ||
549 | DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " | ||
550 | "seq=%u ack=%u sack=%u win=%u end=%u\n", | ||
551 | NIPQUAD(iph->saddr), ntohs(tcph->source), | ||
552 | NIPQUAD(iph->daddr), ntohs(tcph->dest), | ||
553 | seq, ack, sack, win, end); | ||
554 | DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " | ||
555 | "receiver end=%u maxend=%u maxwin=%u scale=%i\n", | ||
556 | sender->td_end, sender->td_maxend, sender->td_maxwin, | ||
557 | sender->td_scale, | ||
558 | receiver->td_end, receiver->td_maxend, receiver->td_maxwin, | ||
559 | receiver->td_scale); | ||
560 | |||
561 | if (sender->td_end == 0) { | ||
562 | /* | ||
563 | * Initialize sender data. | ||
564 | */ | ||
565 | if (tcph->syn && tcph->ack) { | ||
566 | /* | ||
567 | * Outgoing SYN-ACK in reply to a SYN. | ||
568 | */ | ||
569 | sender->td_end = | ||
570 | sender->td_maxend = end; | ||
571 | sender->td_maxwin = (win == 0 ? 1 : win); | ||
572 | |||
573 | tcp_options(skb, dataoff, tcph, sender); | ||
574 | /* | ||
575 | * RFC 1323: | ||
576 | * Both sides must send the Window Scale option | ||
577 | * to enable window scaling in either direction. | ||
578 | */ | ||
579 | if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE | ||
580 | && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) | ||
581 | sender->td_scale = | ||
582 | receiver->td_scale = 0; | ||
583 | } else { | ||
584 | /* | ||
585 | * We are in the middle of a connection, | ||
586 | * its history is lost for us. | ||
587 | * Let's try to use the data from the packet. | ||
588 | */ | ||
589 | sender->td_end = end; | ||
590 | sender->td_maxwin = (win == 0 ? 1 : win); | ||
591 | sender->td_maxend = end + sender->td_maxwin; | ||
592 | } | ||
593 | } else if (((state->state == TCP_CONNTRACK_SYN_SENT | ||
594 | && dir == IP_CT_DIR_ORIGINAL) | ||
595 | || (state->state == TCP_CONNTRACK_SYN_RECV | ||
596 | && dir == IP_CT_DIR_REPLY)) | ||
597 | && after(end, sender->td_end)) { | ||
598 | /* | ||
599 | * RFC 793: "if a TCP is reinitialized ... then it need | ||
600 | * not wait at all; it must only be sure to use sequence | ||
601 | * numbers larger than those recently used." | ||
602 | */ | ||
603 | sender->td_end = | ||
604 | sender->td_maxend = end; | ||
605 | sender->td_maxwin = (win == 0 ? 1 : win); | ||
606 | |||
607 | tcp_options(skb, dataoff, tcph, sender); | ||
608 | } | ||
609 | |||
610 | if (!(tcph->ack)) { | ||
611 | /* | ||
612 | * If there is no ACK, just pretend it was set and OK. | ||
613 | */ | ||
614 | ack = sack = receiver->td_end; | ||
615 | } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) == | ||
616 | (TCP_FLAG_ACK|TCP_FLAG_RST)) | ||
617 | && (ack == 0)) { | ||
618 | /* | ||
619 | * Broken TCP stacks, that set ACK in RST packets as well | ||
620 | * with zero ack value. | ||
621 | */ | ||
622 | ack = sack = receiver->td_end; | ||
623 | } | ||
624 | |||
625 | if (seq == end | ||
626 | && (!tcph->rst | ||
627 | || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT))) | ||
628 | /* | ||
629 | * Packets contains no data: we assume it is valid | ||
630 | * and check the ack value only. | ||
631 | * However RST segments are always validated by their | ||
632 | * SEQ number, except when seq == 0 (reset sent answering | ||
633 | * SYN. | ||
634 | */ | ||
635 | seq = end = sender->td_end; | ||
636 | |||
637 | DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " | ||
638 | "seq=%u ack=%u sack =%u win=%u end=%u\n", | ||
639 | NIPQUAD(iph->saddr), ntohs(tcph->source), | ||
640 | NIPQUAD(iph->daddr), ntohs(tcph->dest), | ||
641 | seq, ack, sack, win, end); | ||
642 | DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " | ||
643 | "receiver end=%u maxend=%u maxwin=%u scale=%i\n", | ||
644 | sender->td_end, sender->td_maxend, sender->td_maxwin, | ||
645 | sender->td_scale, | ||
646 | receiver->td_end, receiver->td_maxend, receiver->td_maxwin, | ||
647 | receiver->td_scale); | ||
648 | |||
649 | DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n", | ||
650 | before(seq, sender->td_maxend + 1), | ||
651 | after(end, sender->td_end - receiver->td_maxwin - 1), | ||
652 | before(sack, receiver->td_end + 1), | ||
653 | after(ack, receiver->td_end - MAXACKWINDOW(sender))); | ||
654 | |||
655 | if (sender->loose || receiver->loose || | ||
656 | (before(seq, sender->td_maxend + 1) && | ||
657 | after(end, sender->td_end - receiver->td_maxwin - 1) && | ||
658 | before(sack, receiver->td_end + 1) && | ||
659 | after(ack, receiver->td_end - MAXACKWINDOW(sender)))) { | ||
660 | /* | ||
661 | * Take into account window scaling (RFC 1323). | ||
662 | */ | ||
663 | if (!tcph->syn) | ||
664 | win <<= sender->td_scale; | ||
665 | |||
666 | /* | ||
667 | * Update sender data. | ||
668 | */ | ||
669 | swin = win + (sack - ack); | ||
670 | if (sender->td_maxwin < swin) | ||
671 | sender->td_maxwin = swin; | ||
672 | if (after(end, sender->td_end)) | ||
673 | sender->td_end = end; | ||
674 | /* | ||
675 | * Update receiver data. | ||
676 | */ | ||
677 | if (after(end, sender->td_maxend)) | ||
678 | receiver->td_maxwin += end - sender->td_maxend; | ||
679 | if (after(sack + win, receiver->td_maxend - 1)) { | ||
680 | receiver->td_maxend = sack + win; | ||
681 | if (win == 0) | ||
682 | receiver->td_maxend++; | ||
683 | } | ||
684 | |||
685 | /* | ||
686 | * Check retransmissions. | ||
687 | */ | ||
688 | if (index == TCP_ACK_SET) { | ||
689 | if (state->last_dir == dir | ||
690 | && state->last_seq == seq | ||
691 | && state->last_ack == ack | ||
692 | && state->last_end == end) | ||
693 | state->retrans++; | ||
694 | else { | ||
695 | state->last_dir = dir; | ||
696 | state->last_seq = seq; | ||
697 | state->last_ack = ack; | ||
698 | state->last_end = end; | ||
699 | state->retrans = 0; | ||
700 | } | ||
701 | } | ||
702 | /* | ||
703 | * Close the window of disabled window tracking :-) | ||
704 | */ | ||
705 | if (sender->loose) | ||
706 | sender->loose--; | ||
707 | |||
708 | res = 1; | ||
709 | } else { | ||
710 | if (LOG_INVALID(IPPROTO_TCP)) | ||
711 | nf_log_packet(pf, 0, skb, NULL, NULL, NULL, | ||
712 | "nf_ct_tcp: %s ", | ||
713 | before(seq, sender->td_maxend + 1) ? | ||
714 | after(end, sender->td_end - receiver->td_maxwin - 1) ? | ||
715 | before(sack, receiver->td_end + 1) ? | ||
716 | after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG" | ||
717 | : "ACK is under the lower bound (possible overly delayed ACK)" | ||
718 | : "ACK is over the upper bound (ACKed data not seen yet)" | ||
719 | : "SEQ is under the lower bound (already ACKed data retransmitted)" | ||
720 | : "SEQ is over the upper bound (over the window of the receiver)"); | ||
721 | |||
722 | res = nf_ct_tcp_be_liberal; | ||
723 | } | ||
724 | |||
725 | DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u " | ||
726 | "receiver end=%u maxend=%u maxwin=%u\n", | ||
727 | res, sender->td_end, sender->td_maxend, sender->td_maxwin, | ||
728 | receiver->td_end, receiver->td_maxend, receiver->td_maxwin); | ||
729 | |||
730 | return res; | ||
731 | } | ||
732 | |||
733 | #ifdef CONFIG_IP_NF_NAT_NEEDED | ||
734 | /* Update sender->td_end after NAT successfully mangled the packet */ | ||
735 | /* Caller must linearize skb at tcp header. */ | ||
736 | void nf_conntrack_tcp_update(struct sk_buff *skb, | ||
737 | unsigned int dataoff, | ||
738 | struct nf_conn *conntrack, | ||
739 | int dir) | ||
740 | { | ||
741 | struct tcphdr *tcph = (void *)skb->data + dataoff; | ||
742 | __u32 end; | ||
743 | #ifdef DEBUGP_VARS | ||
744 | struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir]; | ||
745 | struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir]; | ||
746 | #endif | ||
747 | |||
748 | end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph); | ||
749 | |||
750 | write_lock_bh(&tcp_lock); | ||
751 | /* | ||
752 | * We have to worry for the ack in the reply packet only... | ||
753 | */ | ||
754 | if (after(end, conntrack->proto.tcp.seen[dir].td_end)) | ||
755 | conntrack->proto.tcp.seen[dir].td_end = end; | ||
756 | conntrack->proto.tcp.last_end = end; | ||
757 | write_unlock_bh(&tcp_lock); | ||
758 | DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " | ||
759 | "receiver end=%u maxend=%u maxwin=%u scale=%i\n", | ||
760 | sender->td_end, sender->td_maxend, sender->td_maxwin, | ||
761 | sender->td_scale, | ||
762 | receiver->td_end, receiver->td_maxend, receiver->td_maxwin, | ||
763 | receiver->td_scale); | ||
764 | } | ||
765 | |||
766 | #endif | ||
767 | |||
768 | #define TH_FIN 0x01 | ||
769 | #define TH_SYN 0x02 | ||
770 | #define TH_RST 0x04 | ||
771 | #define TH_PUSH 0x08 | ||
772 | #define TH_ACK 0x10 | ||
773 | #define TH_URG 0x20 | ||
774 | #define TH_ECE 0x40 | ||
775 | #define TH_CWR 0x80 | ||
776 | |||
777 | /* table of valid flag combinations - ECE and CWR are always valid */ | ||
778 | static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = | ||
779 | { | ||
780 | [TH_SYN] = 1, | ||
781 | [TH_SYN|TH_ACK] = 1, | ||
782 | [TH_SYN|TH_ACK|TH_PUSH] = 1, | ||
783 | [TH_RST] = 1, | ||
784 | [TH_RST|TH_ACK] = 1, | ||
785 | [TH_RST|TH_ACK|TH_PUSH] = 1, | ||
786 | [TH_FIN|TH_ACK] = 1, | ||
787 | [TH_ACK] = 1, | ||
788 | [TH_ACK|TH_PUSH] = 1, | ||
789 | [TH_ACK|TH_URG] = 1, | ||
790 | [TH_ACK|TH_URG|TH_PUSH] = 1, | ||
791 | [TH_FIN|TH_ACK|TH_PUSH] = 1, | ||
792 | [TH_FIN|TH_ACK|TH_URG] = 1, | ||
793 | [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1, | ||
794 | }; | ||
795 | |||
796 | /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ | ||
797 | static int tcp_error(struct sk_buff *skb, | ||
798 | unsigned int dataoff, | ||
799 | enum ip_conntrack_info *ctinfo, | ||
800 | int pf, | ||
801 | unsigned int hooknum, | ||
802 | int(*csum)(const struct sk_buff *,unsigned int)) | ||
803 | { | ||
804 | struct tcphdr _tcph, *th; | ||
805 | unsigned int tcplen = skb->len - dataoff; | ||
806 | u_int8_t tcpflags; | ||
807 | |||
808 | /* Smaller that minimal TCP header? */ | ||
809 | th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); | ||
810 | if (th == NULL) { | ||
811 | if (LOG_INVALID(IPPROTO_TCP)) | ||
812 | nf_log_packet(pf, 0, skb, NULL, NULL, NULL, | ||
813 | "nf_ct_tcp: short packet "); | ||
814 | return -NF_ACCEPT; | ||
815 | } | ||
816 | |||
817 | /* Not whole TCP header or malformed packet */ | ||
818 | if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { | ||
819 | if (LOG_INVALID(IPPROTO_TCP)) | ||
820 | nf_log_packet(pf, 0, skb, NULL, NULL, NULL, | ||
821 | "nf_ct_tcp: truncated/malformed packet "); | ||
822 | return -NF_ACCEPT; | ||
823 | } | ||
824 | |||
825 | /* Checksum invalid? Ignore. | ||
826 | * We skip checking packets on the outgoing path | ||
827 | * because the semantic of CHECKSUM_HW is different there | ||
828 | * and moreover root might send raw packets. | ||
829 | */ | ||
830 | /* FIXME: Source route IP option packets --RR */ | ||
831 | if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || | ||
832 | (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) | ||
833 | && skb->ip_summed != CHECKSUM_UNNECESSARY | ||
834 | && csum(skb, dataoff)) { | ||
835 | if (LOG_INVALID(IPPROTO_TCP)) | ||
836 | nf_log_packet(pf, 0, skb, NULL, NULL, NULL, | ||
837 | "nf_ct_tcp: bad TCP checksum "); | ||
838 | return -NF_ACCEPT; | ||
839 | } | ||
840 | |||
841 | /* Check TCP flags. */ | ||
842 | tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR)); | ||
843 | if (!tcp_valid_flags[tcpflags]) { | ||
844 | if (LOG_INVALID(IPPROTO_TCP)) | ||
845 | nf_log_packet(pf, 0, skb, NULL, NULL, NULL, | ||
846 | "nf_ct_tcp: invalid TCP flag combination "); | ||
847 | return -NF_ACCEPT; | ||
848 | } | ||
849 | |||
850 | return NF_ACCEPT; | ||
851 | } | ||
852 | |||
853 | static int csum4(const struct sk_buff *skb, unsigned int dataoff) | ||
854 | { | ||
855 | return csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr, | ||
856 | skb->len - dataoff, IPPROTO_TCP, | ||
857 | skb->ip_summed == CHECKSUM_HW ? skb->csum | ||
858 | : skb_checksum(skb, dataoff, | ||
859 | skb->len - dataoff, 0)); | ||
860 | } | ||
861 | |||
862 | static int csum6(const struct sk_buff *skb, unsigned int dataoff) | ||
863 | { | ||
864 | return csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, | ||
865 | skb->len - dataoff, IPPROTO_TCP, | ||
866 | skb->ip_summed == CHECKSUM_HW ? skb->csum | ||
867 | : skb_checksum(skb, dataoff, skb->len - dataoff, | ||
868 | 0)); | ||
869 | } | ||
870 | |||
871 | static int tcp_error4(struct sk_buff *skb, | ||
872 | unsigned int dataoff, | ||
873 | enum ip_conntrack_info *ctinfo, | ||
874 | int pf, | ||
875 | unsigned int hooknum) | ||
876 | { | ||
877 | return tcp_error(skb, dataoff, ctinfo, pf, hooknum, csum4); | ||
878 | } | ||
879 | |||
880 | static int tcp_error6(struct sk_buff *skb, | ||
881 | unsigned int dataoff, | ||
882 | enum ip_conntrack_info *ctinfo, | ||
883 | int pf, | ||
884 | unsigned int hooknum) | ||
885 | { | ||
886 | return tcp_error(skb, dataoff, ctinfo, pf, hooknum, csum6); | ||
887 | } | ||
888 | |||
889 | /* Returns verdict for packet, or -1 for invalid. */ | ||
890 | static int tcp_packet(struct nf_conn *conntrack, | ||
891 | const struct sk_buff *skb, | ||
892 | unsigned int dataoff, | ||
893 | enum ip_conntrack_info ctinfo, | ||
894 | int pf, | ||
895 | unsigned int hooknum) | ||
896 | { | ||
897 | enum tcp_conntrack new_state, old_state; | ||
898 | enum ip_conntrack_dir dir; | ||
899 | struct tcphdr *th, _tcph; | ||
900 | unsigned long timeout; | ||
901 | unsigned int index; | ||
902 | |||
903 | th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); | ||
904 | BUG_ON(th == NULL); | ||
905 | |||
906 | write_lock_bh(&tcp_lock); | ||
907 | old_state = conntrack->proto.tcp.state; | ||
908 | dir = CTINFO2DIR(ctinfo); | ||
909 | index = get_conntrack_index(th); | ||
910 | new_state = tcp_conntracks[dir][index][old_state]; | ||
911 | |||
912 | switch (new_state) { | ||
913 | case TCP_CONNTRACK_IGNORE: | ||
914 | /* Either SYN in ORIGINAL | ||
915 | * or SYN/ACK in REPLY. */ | ||
916 | if (index == TCP_SYNACK_SET | ||
917 | && conntrack->proto.tcp.last_index == TCP_SYN_SET | ||
918 | && conntrack->proto.tcp.last_dir != dir | ||
919 | && ntohl(th->ack_seq) == | ||
920 | conntrack->proto.tcp.last_end) { | ||
921 | /* This SYN/ACK acknowledges a SYN that we earlier | ||
922 | * ignored as invalid. This means that the client and | ||
923 | * the server are both in sync, while the firewall is | ||
924 | * not. We kill this session and block the SYN/ACK so | ||
925 | * that the client cannot but retransmit its SYN and | ||
926 | * thus initiate a clean new session. | ||
927 | */ | ||
928 | write_unlock_bh(&tcp_lock); | ||
929 | if (LOG_INVALID(IPPROTO_TCP)) | ||
930 | nf_log_packet(pf, 0, skb, NULL, NULL, NULL, | ||
931 | "nf_ct_tcp: killing out of sync session "); | ||
932 | if (del_timer(&conntrack->timeout)) | ||
933 | conntrack->timeout.function((unsigned long) | ||
934 | conntrack); | ||
935 | return -NF_DROP; | ||
936 | } | ||
937 | conntrack->proto.tcp.last_index = index; | ||
938 | conntrack->proto.tcp.last_dir = dir; | ||
939 | conntrack->proto.tcp.last_seq = ntohl(th->seq); | ||
940 | conntrack->proto.tcp.last_end = | ||
941 | segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); | ||
942 | |||
943 | write_unlock_bh(&tcp_lock); | ||
944 | if (LOG_INVALID(IPPROTO_TCP)) | ||
945 | nf_log_packet(pf, 0, skb, NULL, NULL, NULL, | ||
946 | "nf_ct_tcp: invalid packed ignored "); | ||
947 | return NF_ACCEPT; | ||
948 | case TCP_CONNTRACK_MAX: | ||
949 | /* Invalid packet */ | ||
950 | DEBUGP("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", | ||
951 | dir, get_conntrack_index(th), | ||
952 | old_state); | ||
953 | write_unlock_bh(&tcp_lock); | ||
954 | if (LOG_INVALID(IPPROTO_TCP)) | ||
955 | nf_log_packet(pf, 0, skb, NULL, NULL, NULL, | ||
956 | "nf_ct_tcp: invalid state "); | ||
957 | return -NF_ACCEPT; | ||
958 | case TCP_CONNTRACK_SYN_SENT: | ||
959 | if (old_state < TCP_CONNTRACK_TIME_WAIT) | ||
960 | break; | ||
961 | if ((conntrack->proto.tcp.seen[dir].flags & | ||
962 | IP_CT_TCP_FLAG_CLOSE_INIT) | ||
963 | || after(ntohl(th->seq), | ||
964 | conntrack->proto.tcp.seen[dir].td_end)) { | ||
965 | /* Attempt to reopen a closed connection. | ||
966 | * Delete this connection and look up again. */ | ||
967 | write_unlock_bh(&tcp_lock); | ||
968 | if (del_timer(&conntrack->timeout)) | ||
969 | conntrack->timeout.function((unsigned long) | ||
970 | conntrack); | ||
971 | return -NF_REPEAT; | ||
972 | } | ||
973 | case TCP_CONNTRACK_CLOSE: | ||
974 | if (index == TCP_RST_SET | ||
975 | && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) | ||
976 | && conntrack->proto.tcp.last_index == TCP_SYN_SET | ||
977 | && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) { | ||
978 | /* RST sent to invalid SYN we had let trough | ||
979 | * SYN was in window then, tear down connection. | ||
980 | * We skip window checking, because packet might ACK | ||
981 | * segments we ignored in the SYN. */ | ||
982 | goto in_window; | ||
983 | } | ||
984 | /* Just fall trough */ | ||
985 | default: | ||
986 | /* Keep compilers happy. */ | ||
987 | break; | ||
988 | } | ||
989 | |||
990 | if (!tcp_in_window(&conntrack->proto.tcp, dir, index, | ||
991 | skb, dataoff, th, pf)) { | ||
992 | write_unlock_bh(&tcp_lock); | ||
993 | return -NF_ACCEPT; | ||
994 | } | ||
995 | in_window: | ||
996 | /* From now on we have got in-window packets */ | ||
997 | conntrack->proto.tcp.last_index = index; | ||
998 | |||
999 | DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " | ||
1000 | "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", | ||
1001 | NIPQUAD(iph->saddr), ntohs(th->source), | ||
1002 | NIPQUAD(iph->daddr), ntohs(th->dest), | ||
1003 | (th->syn ? 1 : 0), (th->ack ? 1 : 0), | ||
1004 | (th->fin ? 1 : 0), (th->rst ? 1 : 0), | ||
1005 | old_state, new_state); | ||
1006 | |||
1007 | conntrack->proto.tcp.state = new_state; | ||
1008 | if (old_state != new_state | ||
1009 | && (new_state == TCP_CONNTRACK_FIN_WAIT | ||
1010 | || new_state == TCP_CONNTRACK_CLOSE)) | ||
1011 | conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; | ||
1012 | timeout = conntrack->proto.tcp.retrans >= nf_ct_tcp_max_retrans | ||
1013 | && *tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans | ||
1014 | ? nf_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; | ||
1015 | write_unlock_bh(&tcp_lock); | ||
1016 | |||
1017 | nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); | ||
1018 | if (new_state != old_state) | ||
1019 | nf_conntrack_event_cache(IPCT_PROTOINFO, skb); | ||
1020 | |||
1021 | if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { | ||
1022 | /* If only reply is a RST, we can consider ourselves not to | ||
1023 | have an established connection: this is a fairly common | ||
1024 | problem case, so we can delete the conntrack | ||
1025 | immediately. --RR */ | ||
1026 | if (th->rst) { | ||
1027 | if (del_timer(&conntrack->timeout)) | ||
1028 | conntrack->timeout.function((unsigned long) | ||
1029 | conntrack); | ||
1030 | return NF_ACCEPT; | ||
1031 | } | ||
1032 | } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status) | ||
1033 | && (old_state == TCP_CONNTRACK_SYN_RECV | ||
1034 | || old_state == TCP_CONNTRACK_ESTABLISHED) | ||
1035 | && new_state == TCP_CONNTRACK_ESTABLISHED) { | ||
1036 | /* Set ASSURED if we see see valid ack in ESTABLISHED | ||
1037 | after SYN_RECV or a valid answer for a picked up | ||
1038 | connection. */ | ||
1039 | set_bit(IPS_ASSURED_BIT, &conntrack->status); | ||
1040 | nf_conntrack_event_cache(IPCT_STATUS, skb); | ||
1041 | } | ||
1042 | nf_ct_refresh_acct(conntrack, ctinfo, skb, timeout); | ||
1043 | |||
1044 | return NF_ACCEPT; | ||
1045 | } | ||
1046 | |||
1047 | /* Called when a new connection for this protocol found. */ | ||
1048 | static int tcp_new(struct nf_conn *conntrack, | ||
1049 | const struct sk_buff *skb, | ||
1050 | unsigned int dataoff) | ||
1051 | { | ||
1052 | enum tcp_conntrack new_state; | ||
1053 | struct tcphdr *th, _tcph; | ||
1054 | #ifdef DEBUGP_VARS | ||
1055 | struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0]; | ||
1056 | struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1]; | ||
1057 | #endif | ||
1058 | |||
1059 | th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); | ||
1060 | BUG_ON(th == NULL); | ||
1061 | |||
1062 | /* Don't need lock here: this conntrack not in circulation yet */ | ||
1063 | new_state | ||
1064 | = tcp_conntracks[0][get_conntrack_index(th)] | ||
1065 | [TCP_CONNTRACK_NONE]; | ||
1066 | |||
1067 | /* Invalid: delete conntrack */ | ||
1068 | if (new_state >= TCP_CONNTRACK_MAX) { | ||
1069 | DEBUGP("nf_ct_tcp: invalid new deleting.\n"); | ||
1070 | return 0; | ||
1071 | } | ||
1072 | |||
1073 | if (new_state == TCP_CONNTRACK_SYN_SENT) { | ||
1074 | /* SYN packet */ | ||
1075 | conntrack->proto.tcp.seen[0].td_end = | ||
1076 | segment_seq_plus_len(ntohl(th->seq), skb->len, | ||
1077 | dataoff, th); | ||
1078 | conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); | ||
1079 | if (conntrack->proto.tcp.seen[0].td_maxwin == 0) | ||
1080 | conntrack->proto.tcp.seen[0].td_maxwin = 1; | ||
1081 | conntrack->proto.tcp.seen[0].td_maxend = | ||
1082 | conntrack->proto.tcp.seen[0].td_end; | ||
1083 | |||
1084 | tcp_options(skb, dataoff, th, &conntrack->proto.tcp.seen[0]); | ||
1085 | conntrack->proto.tcp.seen[1].flags = 0; | ||
1086 | conntrack->proto.tcp.seen[0].loose = | ||
1087 | conntrack->proto.tcp.seen[1].loose = 0; | ||
1088 | } else if (nf_ct_tcp_loose == 0) { | ||
1089 | /* Don't try to pick up connections. */ | ||
1090 | return 0; | ||
1091 | } else { | ||
1092 | /* | ||
1093 | * We are in the middle of a connection, | ||
1094 | * its history is lost for us. | ||
1095 | * Let's try to use the data from the packet. | ||
1096 | */ | ||
1097 | conntrack->proto.tcp.seen[0].td_end = | ||
1098 | segment_seq_plus_len(ntohl(th->seq), skb->len, | ||
1099 | dataoff, th); | ||
1100 | conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); | ||
1101 | if (conntrack->proto.tcp.seen[0].td_maxwin == 0) | ||
1102 | conntrack->proto.tcp.seen[0].td_maxwin = 1; | ||
1103 | conntrack->proto.tcp.seen[0].td_maxend = | ||
1104 | conntrack->proto.tcp.seen[0].td_end + | ||
1105 | conntrack->proto.tcp.seen[0].td_maxwin; | ||
1106 | conntrack->proto.tcp.seen[0].td_scale = 0; | ||
1107 | |||
1108 | /* We assume SACK. Should we assume window scaling too? */ | ||
1109 | conntrack->proto.tcp.seen[0].flags = | ||
1110 | conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM; | ||
1111 | conntrack->proto.tcp.seen[0].loose = | ||
1112 | conntrack->proto.tcp.seen[1].loose = nf_ct_tcp_loose; | ||
1113 | } | ||
1114 | |||
1115 | conntrack->proto.tcp.seen[1].td_end = 0; | ||
1116 | conntrack->proto.tcp.seen[1].td_maxend = 0; | ||
1117 | conntrack->proto.tcp.seen[1].td_maxwin = 1; | ||
1118 | conntrack->proto.tcp.seen[1].td_scale = 0; | ||
1119 | |||
1120 | /* tcp_packet will set them */ | ||
1121 | conntrack->proto.tcp.state = TCP_CONNTRACK_NONE; | ||
1122 | conntrack->proto.tcp.last_index = TCP_NONE_SET; | ||
1123 | |||
1124 | DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " | ||
1125 | "receiver end=%u maxend=%u maxwin=%u scale=%i\n", | ||
1126 | sender->td_end, sender->td_maxend, sender->td_maxwin, | ||
1127 | sender->td_scale, | ||
1128 | receiver->td_end, receiver->td_maxend, receiver->td_maxwin, | ||
1129 | receiver->td_scale); | ||
1130 | return 1; | ||
1131 | } | ||
1132 | |||
1133 | struct nf_conntrack_protocol nf_conntrack_protocol_tcp4 = | ||
1134 | { | ||
1135 | .l3proto = PF_INET, | ||
1136 | .proto = IPPROTO_TCP, | ||
1137 | .name = "tcp", | ||
1138 | .pkt_to_tuple = tcp_pkt_to_tuple, | ||
1139 | .invert_tuple = tcp_invert_tuple, | ||
1140 | .print_tuple = tcp_print_tuple, | ||
1141 | .print_conntrack = tcp_print_conntrack, | ||
1142 | .packet = tcp_packet, | ||
1143 | .new = tcp_new, | ||
1144 | .error = tcp_error4, | ||
1145 | }; | ||
1146 | |||
1147 | struct nf_conntrack_protocol nf_conntrack_protocol_tcp6 = | ||
1148 | { | ||
1149 | .l3proto = PF_INET6, | ||
1150 | .proto = IPPROTO_TCP, | ||
1151 | .name = "tcp", | ||
1152 | .pkt_to_tuple = tcp_pkt_to_tuple, | ||
1153 | .invert_tuple = tcp_invert_tuple, | ||
1154 | .print_tuple = tcp_print_tuple, | ||
1155 | .print_conntrack = tcp_print_conntrack, | ||
1156 | .packet = tcp_packet, | ||
1157 | .new = tcp_new, | ||
1158 | .error = tcp_error6, | ||
1159 | }; | ||
1160 | |||
1161 | EXPORT_SYMBOL(nf_conntrack_protocol_tcp4); | ||
1162 | EXPORT_SYMBOL(nf_conntrack_protocol_tcp6); | ||
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c new file mode 100644 index 000000000000..3cae7ce420dd --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_udp.c | |||
@@ -0,0 +1,216 @@ | |||
1 | /* (C) 1999-2001 Paul `Rusty' Russell | ||
2 | * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> | ||
9 | * - enable working with Layer 3 protocol independent connection tracking. | ||
10 | * | ||
11 | * Derived from net/ipv4/netfilter/ip_conntrack_proto_udp.c | ||
12 | */ | ||
13 | |||
14 | #include <linux/types.h> | ||
15 | #include <linux/sched.h> | ||
16 | #include <linux/timer.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/netfilter.h> | ||
19 | #include <linux/udp.h> | ||
20 | #include <linux/seq_file.h> | ||
21 | #include <linux/skbuff.h> | ||
22 | #include <linux/ipv6.h> | ||
23 | #include <net/ip6_checksum.h> | ||
24 | #include <net/checksum.h> | ||
25 | #include <linux/netfilter.h> | ||
26 | #include <linux/netfilter_ipv4.h> | ||
27 | #include <linux/netfilter_ipv6.h> | ||
28 | #include <net/netfilter/nf_conntrack_protocol.h> | ||
29 | |||
30 | unsigned long nf_ct_udp_timeout = 30*HZ; | ||
31 | unsigned long nf_ct_udp_timeout_stream = 180*HZ; | ||
32 | |||
33 | static int udp_pkt_to_tuple(const struct sk_buff *skb, | ||
34 | unsigned int dataoff, | ||
35 | struct nf_conntrack_tuple *tuple) | ||
36 | { | ||
37 | struct udphdr _hdr, *hp; | ||
38 | |||
39 | /* Actually only need first 8 bytes. */ | ||
40 | hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); | ||
41 | if (hp == NULL) | ||
42 | return 0; | ||
43 | |||
44 | tuple->src.u.udp.port = hp->source; | ||
45 | tuple->dst.u.udp.port = hp->dest; | ||
46 | |||
47 | return 1; | ||
48 | } | ||
49 | |||
50 | static int udp_invert_tuple(struct nf_conntrack_tuple *tuple, | ||
51 | const struct nf_conntrack_tuple *orig) | ||
52 | { | ||
53 | tuple->src.u.udp.port = orig->dst.u.udp.port; | ||
54 | tuple->dst.u.udp.port = orig->src.u.udp.port; | ||
55 | return 1; | ||
56 | } | ||
57 | |||
58 | /* Print out the per-protocol part of the tuple. */ | ||
59 | static int udp_print_tuple(struct seq_file *s, | ||
60 | const struct nf_conntrack_tuple *tuple) | ||
61 | { | ||
62 | return seq_printf(s, "sport=%hu dport=%hu ", | ||
63 | ntohs(tuple->src.u.udp.port), | ||
64 | ntohs(tuple->dst.u.udp.port)); | ||
65 | } | ||
66 | |||
67 | /* Print out the private part of the conntrack. */ | ||
68 | static int udp_print_conntrack(struct seq_file *s, | ||
69 | const struct nf_conn *conntrack) | ||
70 | { | ||
71 | return 0; | ||
72 | } | ||
73 | |||
74 | /* Returns verdict for packet, and may modify conntracktype */ | ||
75 | static int udp_packet(struct nf_conn *conntrack, | ||
76 | const struct sk_buff *skb, | ||
77 | unsigned int dataoff, | ||
78 | enum ip_conntrack_info ctinfo, | ||
79 | int pf, | ||
80 | unsigned int hooknum) | ||
81 | { | ||
82 | /* If we've seen traffic both ways, this is some kind of UDP | ||
83 | stream. Extend timeout. */ | ||
84 | if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { | ||
85 | nf_ct_refresh_acct(conntrack, ctinfo, skb, | ||
86 | nf_ct_udp_timeout_stream); | ||
87 | /* Also, more likely to be important, and not a probe */ | ||
88 | if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status)) | ||
89 | nf_conntrack_event_cache(IPCT_STATUS, skb); | ||
90 | } else | ||
91 | nf_ct_refresh_acct(conntrack, ctinfo, skb, nf_ct_udp_timeout); | ||
92 | |||
93 | return NF_ACCEPT; | ||
94 | } | ||
95 | |||
96 | /* Called when a new connection for this protocol found. */ | ||
97 | static int udp_new(struct nf_conn *conntrack, const struct sk_buff *skb, | ||
98 | unsigned int dataoff) | ||
99 | { | ||
100 | return 1; | ||
101 | } | ||
102 | |||
103 | static int udp_error(struct sk_buff *skb, unsigned int dataoff, | ||
104 | enum ip_conntrack_info *ctinfo, | ||
105 | int pf, | ||
106 | unsigned int hooknum, | ||
107 | int (*csum)(const struct sk_buff *, unsigned int)) | ||
108 | { | ||
109 | unsigned int udplen = skb->len - dataoff; | ||
110 | struct udphdr _hdr, *hdr; | ||
111 | |||
112 | /* Header is too small? */ | ||
113 | hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); | ||
114 | if (hdr == NULL) { | ||
115 | if (LOG_INVALID(IPPROTO_UDP)) | ||
116 | nf_log_packet(pf, 0, skb, NULL, NULL, NULL, | ||
117 | "nf_ct_udp: short packet "); | ||
118 | return -NF_ACCEPT; | ||
119 | } | ||
120 | |||
121 | /* Truncated/malformed packets */ | ||
122 | if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { | ||
123 | if (LOG_INVALID(IPPROTO_UDP)) | ||
124 | nf_log_packet(pf, 0, skb, NULL, NULL, NULL, | ||
125 | "nf_ct_udp: truncated/malformed packet "); | ||
126 | return -NF_ACCEPT; | ||
127 | } | ||
128 | |||
129 | /* Packet with no checksum */ | ||
130 | if (!hdr->check) | ||
131 | return NF_ACCEPT; | ||
132 | |||
133 | /* Checksum invalid? Ignore. | ||
134 | * We skip checking packets on the outgoing path | ||
135 | * because the semantic of CHECKSUM_HW is different there | ||
136 | * and moreover root might send raw packets. | ||
137 | * FIXME: Source route IP option packets --RR */ | ||
138 | if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || | ||
139 | (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) | ||
140 | && skb->ip_summed != CHECKSUM_UNNECESSARY | ||
141 | && csum(skb, dataoff)) { | ||
142 | if (LOG_INVALID(IPPROTO_UDP)) | ||
143 | nf_log_packet(pf, 0, skb, NULL, NULL, NULL, | ||
144 | "nf_ct_udp: bad UDP checksum "); | ||
145 | return -NF_ACCEPT; | ||
146 | } | ||
147 | |||
148 | return NF_ACCEPT; | ||
149 | } | ||
150 | |||
151 | static int csum4(const struct sk_buff *skb, unsigned int dataoff) | ||
152 | { | ||
153 | return csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr, | ||
154 | skb->len - dataoff, IPPROTO_UDP, | ||
155 | skb->ip_summed == CHECKSUM_HW ? skb->csum | ||
156 | : skb_checksum(skb, dataoff, | ||
157 | skb->len - dataoff, 0)); | ||
158 | } | ||
159 | |||
160 | static int csum6(const struct sk_buff *skb, unsigned int dataoff) | ||
161 | { | ||
162 | return csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, | ||
163 | skb->len - dataoff, IPPROTO_UDP, | ||
164 | skb->ip_summed == CHECKSUM_HW ? skb->csum | ||
165 | : skb_checksum(skb, dataoff, skb->len - dataoff, | ||
166 | 0)); | ||
167 | } | ||
168 | |||
169 | static int udp_error4(struct sk_buff *skb, | ||
170 | unsigned int dataoff, | ||
171 | enum ip_conntrack_info *ctinfo, | ||
172 | int pf, | ||
173 | unsigned int hooknum) | ||
174 | { | ||
175 | return udp_error(skb, dataoff, ctinfo, pf, hooknum, csum4); | ||
176 | } | ||
177 | |||
178 | static int udp_error6(struct sk_buff *skb, | ||
179 | unsigned int dataoff, | ||
180 | enum ip_conntrack_info *ctinfo, | ||
181 | int pf, | ||
182 | unsigned int hooknum) | ||
183 | { | ||
184 | return udp_error(skb, dataoff, ctinfo, pf, hooknum, csum6); | ||
185 | } | ||
186 | |||
187 | struct nf_conntrack_protocol nf_conntrack_protocol_udp4 = | ||
188 | { | ||
189 | .l3proto = PF_INET, | ||
190 | .proto = IPPROTO_UDP, | ||
191 | .name = "udp", | ||
192 | .pkt_to_tuple = udp_pkt_to_tuple, | ||
193 | .invert_tuple = udp_invert_tuple, | ||
194 | .print_tuple = udp_print_tuple, | ||
195 | .print_conntrack = udp_print_conntrack, | ||
196 | .packet = udp_packet, | ||
197 | .new = udp_new, | ||
198 | .error = udp_error4, | ||
199 | }; | ||
200 | |||
201 | struct nf_conntrack_protocol nf_conntrack_protocol_udp6 = | ||
202 | { | ||
203 | .l3proto = PF_INET6, | ||
204 | .proto = IPPROTO_UDP, | ||
205 | .name = "udp", | ||
206 | .pkt_to_tuple = udp_pkt_to_tuple, | ||
207 | .invert_tuple = udp_invert_tuple, | ||
208 | .print_tuple = udp_print_tuple, | ||
209 | .print_conntrack = udp_print_conntrack, | ||
210 | .packet = udp_packet, | ||
211 | .new = udp_new, | ||
212 | .error = udp_error6, | ||
213 | }; | ||
214 | |||
215 | EXPORT_SYMBOL(nf_conntrack_protocol_udp4); | ||
216 | EXPORT_SYMBOL(nf_conntrack_protocol_udp6); | ||
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c new file mode 100644 index 000000000000..45224db4fe2f --- /dev/null +++ b/net/netfilter/nf_conntrack_standalone.c | |||
@@ -0,0 +1,869 @@ | |||
1 | /* This file contains all the functions required for the standalone | ||
2 | nf_conntrack module. | ||
3 | |||
4 | These are not required by the compatibility layer. | ||
5 | */ | ||
6 | |||
7 | /* (C) 1999-2001 Paul `Rusty' Russell | ||
8 | * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify | ||
11 | * it under the terms of the GNU General Public License version 2 as | ||
12 | * published by the Free Software Foundation. | ||
13 | * | ||
14 | * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> | ||
15 | * - generalize L3 protocol dependent part. | ||
16 | * | ||
17 | * Derived from net/ipv4/netfilter/ip_conntrack_standalone.c | ||
18 | */ | ||
19 | |||
20 | #include <linux/config.h> | ||
21 | #include <linux/types.h> | ||
22 | #include <linux/netfilter.h> | ||
23 | #include <linux/module.h> | ||
24 | #include <linux/skbuff.h> | ||
25 | #include <linux/proc_fs.h> | ||
26 | #include <linux/seq_file.h> | ||
27 | #include <linux/percpu.h> | ||
28 | #include <linux/netdevice.h> | ||
29 | #ifdef CONFIG_SYSCTL | ||
30 | #include <linux/sysctl.h> | ||
31 | #endif | ||
32 | |||
33 | #define ASSERT_READ_LOCK(x) | ||
34 | #define ASSERT_WRITE_LOCK(x) | ||
35 | |||
36 | #include <net/netfilter/nf_conntrack.h> | ||
37 | #include <net/netfilter/nf_conntrack_l3proto.h> | ||
38 | #include <net/netfilter/nf_conntrack_protocol.h> | ||
39 | #include <net/netfilter/nf_conntrack_core.h> | ||
40 | #include <net/netfilter/nf_conntrack_helper.h> | ||
41 | #include <linux/netfilter_ipv4/listhelp.h> | ||
42 | |||
43 | #if 0 | ||
44 | #define DEBUGP printk | ||
45 | #else | ||
46 | #define DEBUGP(format, args...) | ||
47 | #endif | ||
48 | |||
49 | MODULE_LICENSE("GPL"); | ||
50 | |||
51 | extern atomic_t nf_conntrack_count; | ||
52 | DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); | ||
53 | |||
54 | static int kill_l3proto(struct nf_conn *i, void *data) | ||
55 | { | ||
56 | return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num == | ||
57 | ((struct nf_conntrack_l3proto *)data)->l3proto); | ||
58 | } | ||
59 | |||
60 | static int kill_proto(struct nf_conn *i, void *data) | ||
61 | { | ||
62 | struct nf_conntrack_protocol *proto; | ||
63 | proto = (struct nf_conntrack_protocol *)data; | ||
64 | return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum == | ||
65 | proto->proto) && | ||
66 | (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num == | ||
67 | proto->l3proto); | ||
68 | } | ||
69 | |||
70 | #ifdef CONFIG_PROC_FS | ||
71 | static int | ||
72 | print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, | ||
73 | struct nf_conntrack_l3proto *l3proto, | ||
74 | struct nf_conntrack_protocol *proto) | ||
75 | { | ||
76 | return l3proto->print_tuple(s, tuple) || proto->print_tuple(s, tuple); | ||
77 | } | ||
78 | |||
79 | #ifdef CONFIG_NF_CT_ACCT | ||
80 | static unsigned int | ||
81 | seq_print_counters(struct seq_file *s, | ||
82 | const struct ip_conntrack_counter *counter) | ||
83 | { | ||
84 | return seq_printf(s, "packets=%llu bytes=%llu ", | ||
85 | (unsigned long long)counter->packets, | ||
86 | (unsigned long long)counter->bytes); | ||
87 | } | ||
88 | #else | ||
89 | #define seq_print_counters(x, y) 0 | ||
90 | #endif | ||
91 | |||
92 | struct ct_iter_state { | ||
93 | unsigned int bucket; | ||
94 | }; | ||
95 | |||
96 | static struct list_head *ct_get_first(struct seq_file *seq) | ||
97 | { | ||
98 | struct ct_iter_state *st = seq->private; | ||
99 | |||
100 | for (st->bucket = 0; | ||
101 | st->bucket < nf_conntrack_htable_size; | ||
102 | st->bucket++) { | ||
103 | if (!list_empty(&nf_conntrack_hash[st->bucket])) | ||
104 | return nf_conntrack_hash[st->bucket].next; | ||
105 | } | ||
106 | return NULL; | ||
107 | } | ||
108 | |||
109 | static struct list_head *ct_get_next(struct seq_file *seq, struct list_head *head) | ||
110 | { | ||
111 | struct ct_iter_state *st = seq->private; | ||
112 | |||
113 | head = head->next; | ||
114 | while (head == &nf_conntrack_hash[st->bucket]) { | ||
115 | if (++st->bucket >= nf_conntrack_htable_size) | ||
116 | return NULL; | ||
117 | head = nf_conntrack_hash[st->bucket].next; | ||
118 | } | ||
119 | return head; | ||
120 | } | ||
121 | |||
122 | static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos) | ||
123 | { | ||
124 | struct list_head *head = ct_get_first(seq); | ||
125 | |||
126 | if (head) | ||
127 | while (pos && (head = ct_get_next(seq, head))) | ||
128 | pos--; | ||
129 | return pos ? NULL : head; | ||
130 | } | ||
131 | |||
132 | static void *ct_seq_start(struct seq_file *seq, loff_t *pos) | ||
133 | { | ||
134 | read_lock_bh(&nf_conntrack_lock); | ||
135 | return ct_get_idx(seq, *pos); | ||
136 | } | ||
137 | |||
138 | static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) | ||
139 | { | ||
140 | (*pos)++; | ||
141 | return ct_get_next(s, v); | ||
142 | } | ||
143 | |||
144 | static void ct_seq_stop(struct seq_file *s, void *v) | ||
145 | { | ||
146 | read_unlock_bh(&nf_conntrack_lock); | ||
147 | } | ||
148 | |||
149 | /* return 0 on success, 1 in case of error */ | ||
150 | static int ct_seq_show(struct seq_file *s, void *v) | ||
151 | { | ||
152 | const struct nf_conntrack_tuple_hash *hash = v; | ||
153 | const struct nf_conn *conntrack = nf_ct_tuplehash_to_ctrack(hash); | ||
154 | struct nf_conntrack_l3proto *l3proto; | ||
155 | struct nf_conntrack_protocol *proto; | ||
156 | |||
157 | ASSERT_READ_LOCK(&nf_conntrack_lock); | ||
158 | NF_CT_ASSERT(conntrack); | ||
159 | |||
160 | /* we only want to print DIR_ORIGINAL */ | ||
161 | if (NF_CT_DIRECTION(hash)) | ||
162 | return 0; | ||
163 | |||
164 | l3proto = nf_ct_find_l3proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] | ||
165 | .tuple.src.l3num); | ||
166 | |||
167 | NF_CT_ASSERT(l3proto); | ||
168 | proto = nf_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] | ||
169 | .tuple.src.l3num, | ||
170 | conntrack->tuplehash[IP_CT_DIR_ORIGINAL] | ||
171 | .tuple.dst.protonum); | ||
172 | NF_CT_ASSERT(proto); | ||
173 | |||
174 | if (seq_printf(s, "%-8s %u %-8s %u %ld ", | ||
175 | l3proto->name, | ||
176 | conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num, | ||
177 | proto->name, | ||
178 | conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum, | ||
179 | timer_pending(&conntrack->timeout) | ||
180 | ? (long)(conntrack->timeout.expires - jiffies)/HZ : 0) != 0) | ||
181 | return -ENOSPC; | ||
182 | |||
183 | if (l3proto->print_conntrack(s, conntrack)) | ||
184 | return -ENOSPC; | ||
185 | |||
186 | if (proto->print_conntrack(s, conntrack)) | ||
187 | return -ENOSPC; | ||
188 | |||
189 | if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, | ||
190 | l3proto, proto)) | ||
191 | return -ENOSPC; | ||
192 | |||
193 | if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_ORIGINAL])) | ||
194 | return -ENOSPC; | ||
195 | |||
196 | if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status))) | ||
197 | if (seq_printf(s, "[UNREPLIED] ")) | ||
198 | return -ENOSPC; | ||
199 | |||
200 | if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple, | ||
201 | l3proto, proto)) | ||
202 | return -ENOSPC; | ||
203 | |||
204 | if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_REPLY])) | ||
205 | return -ENOSPC; | ||
206 | |||
207 | if (test_bit(IPS_ASSURED_BIT, &conntrack->status)) | ||
208 | if (seq_printf(s, "[ASSURED] ")) | ||
209 | return -ENOSPC; | ||
210 | |||
211 | #if defined(CONFIG_NF_CONNTRACK_MARK) | ||
212 | if (seq_printf(s, "mark=%u ", conntrack->mark)) | ||
213 | return -ENOSPC; | ||
214 | #endif | ||
215 | |||
216 | if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) | ||
217 | return -ENOSPC; | ||
218 | |||
219 | return 0; | ||
220 | } | ||
221 | |||
222 | static struct seq_operations ct_seq_ops = { | ||
223 | .start = ct_seq_start, | ||
224 | .next = ct_seq_next, | ||
225 | .stop = ct_seq_stop, | ||
226 | .show = ct_seq_show | ||
227 | }; | ||
228 | |||
229 | static int ct_open(struct inode *inode, struct file *file) | ||
230 | { | ||
231 | struct seq_file *seq; | ||
232 | struct ct_iter_state *st; | ||
233 | int ret; | ||
234 | |||
235 | st = kmalloc(sizeof(struct ct_iter_state), GFP_KERNEL); | ||
236 | if (st == NULL) | ||
237 | return -ENOMEM; | ||
238 | ret = seq_open(file, &ct_seq_ops); | ||
239 | if (ret) | ||
240 | goto out_free; | ||
241 | seq = file->private_data; | ||
242 | seq->private = st; | ||
243 | memset(st, 0, sizeof(struct ct_iter_state)); | ||
244 | return ret; | ||
245 | out_free: | ||
246 | kfree(st); | ||
247 | return ret; | ||
248 | } | ||
249 | |||
250 | static struct file_operations ct_file_ops = { | ||
251 | .owner = THIS_MODULE, | ||
252 | .open = ct_open, | ||
253 | .read = seq_read, | ||
254 | .llseek = seq_lseek, | ||
255 | .release = seq_release_private, | ||
256 | }; | ||
257 | |||
258 | /* expects */ | ||
259 | static void *exp_seq_start(struct seq_file *s, loff_t *pos) | ||
260 | { | ||
261 | struct list_head *e = &nf_conntrack_expect_list; | ||
262 | loff_t i; | ||
263 | |||
264 | /* strange seq_file api calls stop even if we fail, | ||
265 | * thus we need to grab lock since stop unlocks */ | ||
266 | read_lock_bh(&nf_conntrack_lock); | ||
267 | |||
268 | if (list_empty(e)) | ||
269 | return NULL; | ||
270 | |||
271 | for (i = 0; i <= *pos; i++) { | ||
272 | e = e->next; | ||
273 | if (e == &nf_conntrack_expect_list) | ||
274 | return NULL; | ||
275 | } | ||
276 | return e; | ||
277 | } | ||
278 | |||
279 | static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos) | ||
280 | { | ||
281 | struct list_head *e = v; | ||
282 | |||
283 | ++*pos; | ||
284 | e = e->next; | ||
285 | |||
286 | if (e == &nf_conntrack_expect_list) | ||
287 | return NULL; | ||
288 | |||
289 | return e; | ||
290 | } | ||
291 | |||
292 | static void exp_seq_stop(struct seq_file *s, void *v) | ||
293 | { | ||
294 | read_unlock_bh(&nf_conntrack_lock); | ||
295 | } | ||
296 | |||
297 | static int exp_seq_show(struct seq_file *s, void *v) | ||
298 | { | ||
299 | struct nf_conntrack_expect *expect = v; | ||
300 | |||
301 | if (expect->timeout.function) | ||
302 | seq_printf(s, "%ld ", timer_pending(&expect->timeout) | ||
303 | ? (long)(expect->timeout.expires - jiffies)/HZ : 0); | ||
304 | else | ||
305 | seq_printf(s, "- "); | ||
306 | seq_printf(s, "l3proto = %u proto=%u ", | ||
307 | expect->tuple.src.l3num, | ||
308 | expect->tuple.dst.protonum); | ||
309 | print_tuple(s, &expect->tuple, | ||
310 | nf_ct_find_l3proto(expect->tuple.src.l3num), | ||
311 | nf_ct_find_proto(expect->tuple.src.l3num, | ||
312 | expect->tuple.dst.protonum)); | ||
313 | return seq_putc(s, '\n'); | ||
314 | } | ||
315 | |||
316 | static struct seq_operations exp_seq_ops = { | ||
317 | .start = exp_seq_start, | ||
318 | .next = exp_seq_next, | ||
319 | .stop = exp_seq_stop, | ||
320 | .show = exp_seq_show | ||
321 | }; | ||
322 | |||
323 | static int exp_open(struct inode *inode, struct file *file) | ||
324 | { | ||
325 | return seq_open(file, &exp_seq_ops); | ||
326 | } | ||
327 | |||
328 | static struct file_operations exp_file_ops = { | ||
329 | .owner = THIS_MODULE, | ||
330 | .open = exp_open, | ||
331 | .read = seq_read, | ||
332 | .llseek = seq_lseek, | ||
333 | .release = seq_release | ||
334 | }; | ||
335 | |||
336 | static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) | ||
337 | { | ||
338 | int cpu; | ||
339 | |||
340 | if (*pos == 0) | ||
341 | return SEQ_START_TOKEN; | ||
342 | |||
343 | for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) { | ||
344 | if (!cpu_possible(cpu)) | ||
345 | continue; | ||
346 | *pos = cpu + 1; | ||
347 | return &per_cpu(nf_conntrack_stat, cpu); | ||
348 | } | ||
349 | |||
350 | return NULL; | ||
351 | } | ||
352 | |||
353 | static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) | ||
354 | { | ||
355 | int cpu; | ||
356 | |||
357 | for (cpu = *pos; cpu < NR_CPUS; ++cpu) { | ||
358 | if (!cpu_possible(cpu)) | ||
359 | continue; | ||
360 | *pos = cpu + 1; | ||
361 | return &per_cpu(nf_conntrack_stat, cpu); | ||
362 | } | ||
363 | |||
364 | return NULL; | ||
365 | } | ||
366 | |||
367 | static void ct_cpu_seq_stop(struct seq_file *seq, void *v) | ||
368 | { | ||
369 | } | ||
370 | |||
371 | static int ct_cpu_seq_show(struct seq_file *seq, void *v) | ||
372 | { | ||
373 | unsigned int nr_conntracks = atomic_read(&nf_conntrack_count); | ||
374 | struct ip_conntrack_stat *st = v; | ||
375 | |||
376 | if (v == SEQ_START_TOKEN) { | ||
377 | seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n"); | ||
378 | return 0; | ||
379 | } | ||
380 | |||
381 | seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " | ||
382 | "%08x %08x %08x %08x %08x %08x %08x %08x \n", | ||
383 | nr_conntracks, | ||
384 | st->searched, | ||
385 | st->found, | ||
386 | st->new, | ||
387 | st->invalid, | ||
388 | st->ignore, | ||
389 | st->delete, | ||
390 | st->delete_list, | ||
391 | st->insert, | ||
392 | st->insert_failed, | ||
393 | st->drop, | ||
394 | st->early_drop, | ||
395 | st->error, | ||
396 | |||
397 | st->expect_new, | ||
398 | st->expect_create, | ||
399 | st->expect_delete | ||
400 | ); | ||
401 | return 0; | ||
402 | } | ||
403 | |||
404 | static struct seq_operations ct_cpu_seq_ops = { | ||
405 | .start = ct_cpu_seq_start, | ||
406 | .next = ct_cpu_seq_next, | ||
407 | .stop = ct_cpu_seq_stop, | ||
408 | .show = ct_cpu_seq_show, | ||
409 | }; | ||
410 | |||
411 | static int ct_cpu_seq_open(struct inode *inode, struct file *file) | ||
412 | { | ||
413 | return seq_open(file, &ct_cpu_seq_ops); | ||
414 | } | ||
415 | |||
416 | static struct file_operations ct_cpu_seq_fops = { | ||
417 | .owner = THIS_MODULE, | ||
418 | .open = ct_cpu_seq_open, | ||
419 | .read = seq_read, | ||
420 | .llseek = seq_lseek, | ||
421 | .release = seq_release_private, | ||
422 | }; | ||
423 | #endif /* CONFIG_PROC_FS */ | ||
424 | |||
425 | /* Sysctl support */ | ||
426 | |||
427 | #ifdef CONFIG_SYSCTL | ||
428 | |||
429 | /* From nf_conntrack_core.c */ | ||
430 | extern int nf_conntrack_max; | ||
431 | extern unsigned int nf_conntrack_htable_size; | ||
432 | |||
433 | /* From nf_conntrack_proto_tcp.c */ | ||
434 | extern unsigned long nf_ct_tcp_timeout_syn_sent; | ||
435 | extern unsigned long nf_ct_tcp_timeout_syn_recv; | ||
436 | extern unsigned long nf_ct_tcp_timeout_established; | ||
437 | extern unsigned long nf_ct_tcp_timeout_fin_wait; | ||
438 | extern unsigned long nf_ct_tcp_timeout_close_wait; | ||
439 | extern unsigned long nf_ct_tcp_timeout_last_ack; | ||
440 | extern unsigned long nf_ct_tcp_timeout_time_wait; | ||
441 | extern unsigned long nf_ct_tcp_timeout_close; | ||
442 | extern unsigned long nf_ct_tcp_timeout_max_retrans; | ||
443 | extern int nf_ct_tcp_loose; | ||
444 | extern int nf_ct_tcp_be_liberal; | ||
445 | extern int nf_ct_tcp_max_retrans; | ||
446 | |||
447 | /* From nf_conntrack_proto_udp.c */ | ||
448 | extern unsigned long nf_ct_udp_timeout; | ||
449 | extern unsigned long nf_ct_udp_timeout_stream; | ||
450 | |||
451 | /* From nf_conntrack_proto_generic.c */ | ||
452 | extern unsigned long nf_ct_generic_timeout; | ||
453 | |||
454 | /* Log invalid packets of a given protocol */ | ||
455 | static int log_invalid_proto_min = 0; | ||
456 | static int log_invalid_proto_max = 255; | ||
457 | |||
458 | static struct ctl_table_header *nf_ct_sysctl_header; | ||
459 | |||
460 | static ctl_table nf_ct_sysctl_table[] = { | ||
461 | { | ||
462 | .ctl_name = NET_NF_CONNTRACK_MAX, | ||
463 | .procname = "nf_conntrack_max", | ||
464 | .data = &nf_conntrack_max, | ||
465 | .maxlen = sizeof(int), | ||
466 | .mode = 0644, | ||
467 | .proc_handler = &proc_dointvec, | ||
468 | }, | ||
469 | { | ||
470 | .ctl_name = NET_NF_CONNTRACK_COUNT, | ||
471 | .procname = "nf_conntrack_count", | ||
472 | .data = &nf_conntrack_count, | ||
473 | .maxlen = sizeof(int), | ||
474 | .mode = 0444, | ||
475 | .proc_handler = &proc_dointvec, | ||
476 | }, | ||
477 | { | ||
478 | .ctl_name = NET_NF_CONNTRACK_BUCKETS, | ||
479 | .procname = "nf_conntrack_buckets", | ||
480 | .data = &nf_conntrack_htable_size, | ||
481 | .maxlen = sizeof(unsigned int), | ||
482 | .mode = 0444, | ||
483 | .proc_handler = &proc_dointvec, | ||
484 | }, | ||
485 | { | ||
486 | .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, | ||
487 | .procname = "nf_conntrack_tcp_timeout_syn_sent", | ||
488 | .data = &nf_ct_tcp_timeout_syn_sent, | ||
489 | .maxlen = sizeof(unsigned int), | ||
490 | .mode = 0644, | ||
491 | .proc_handler = &proc_dointvec_jiffies, | ||
492 | }, | ||
493 | { | ||
494 | .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, | ||
495 | .procname = "nf_conntrack_tcp_timeout_syn_recv", | ||
496 | .data = &nf_ct_tcp_timeout_syn_recv, | ||
497 | .maxlen = sizeof(unsigned int), | ||
498 | .mode = 0644, | ||
499 | .proc_handler = &proc_dointvec_jiffies, | ||
500 | }, | ||
501 | { | ||
502 | .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, | ||
503 | .procname = "nf_conntrack_tcp_timeout_established", | ||
504 | .data = &nf_ct_tcp_timeout_established, | ||
505 | .maxlen = sizeof(unsigned int), | ||
506 | .mode = 0644, | ||
507 | .proc_handler = &proc_dointvec_jiffies, | ||
508 | }, | ||
509 | { | ||
510 | .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, | ||
511 | .procname = "nf_conntrack_tcp_timeout_fin_wait", | ||
512 | .data = &nf_ct_tcp_timeout_fin_wait, | ||
513 | .maxlen = sizeof(unsigned int), | ||
514 | .mode = 0644, | ||
515 | .proc_handler = &proc_dointvec_jiffies, | ||
516 | }, | ||
517 | { | ||
518 | .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, | ||
519 | .procname = "nf_conntrack_tcp_timeout_close_wait", | ||
520 | .data = &nf_ct_tcp_timeout_close_wait, | ||
521 | .maxlen = sizeof(unsigned int), | ||
522 | .mode = 0644, | ||
523 | .proc_handler = &proc_dointvec_jiffies, | ||
524 | }, | ||
525 | { | ||
526 | .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, | ||
527 | .procname = "nf_conntrack_tcp_timeout_last_ack", | ||
528 | .data = &nf_ct_tcp_timeout_last_ack, | ||
529 | .maxlen = sizeof(unsigned int), | ||
530 | .mode = 0644, | ||
531 | .proc_handler = &proc_dointvec_jiffies, | ||
532 | }, | ||
533 | { | ||
534 | .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, | ||
535 | .procname = "nf_conntrack_tcp_timeout_time_wait", | ||
536 | .data = &nf_ct_tcp_timeout_time_wait, | ||
537 | .maxlen = sizeof(unsigned int), | ||
538 | .mode = 0644, | ||
539 | .proc_handler = &proc_dointvec_jiffies, | ||
540 | }, | ||
541 | { | ||
542 | .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, | ||
543 | .procname = "nf_conntrack_tcp_timeout_close", | ||
544 | .data = &nf_ct_tcp_timeout_close, | ||
545 | .maxlen = sizeof(unsigned int), | ||
546 | .mode = 0644, | ||
547 | .proc_handler = &proc_dointvec_jiffies, | ||
548 | }, | ||
549 | { | ||
550 | .ctl_name = NET_NF_CONNTRACK_UDP_TIMEOUT, | ||
551 | .procname = "nf_conntrack_udp_timeout", | ||
552 | .data = &nf_ct_udp_timeout, | ||
553 | .maxlen = sizeof(unsigned int), | ||
554 | .mode = 0644, | ||
555 | .proc_handler = &proc_dointvec_jiffies, | ||
556 | }, | ||
557 | { | ||
558 | .ctl_name = NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM, | ||
559 | .procname = "nf_conntrack_udp_timeout_stream", | ||
560 | .data = &nf_ct_udp_timeout_stream, | ||
561 | .maxlen = sizeof(unsigned int), | ||
562 | .mode = 0644, | ||
563 | .proc_handler = &proc_dointvec_jiffies, | ||
564 | }, | ||
565 | { | ||
566 | .ctl_name = NET_NF_CONNTRACK_GENERIC_TIMEOUT, | ||
567 | .procname = "nf_conntrack_generic_timeout", | ||
568 | .data = &nf_ct_generic_timeout, | ||
569 | .maxlen = sizeof(unsigned int), | ||
570 | .mode = 0644, | ||
571 | .proc_handler = &proc_dointvec_jiffies, | ||
572 | }, | ||
573 | { | ||
574 | .ctl_name = NET_NF_CONNTRACK_LOG_INVALID, | ||
575 | .procname = "nf_conntrack_log_invalid", | ||
576 | .data = &nf_ct_log_invalid, | ||
577 | .maxlen = sizeof(unsigned int), | ||
578 | .mode = 0644, | ||
579 | .proc_handler = &proc_dointvec_minmax, | ||
580 | .strategy = &sysctl_intvec, | ||
581 | .extra1 = &log_invalid_proto_min, | ||
582 | .extra2 = &log_invalid_proto_max, | ||
583 | }, | ||
584 | { | ||
585 | .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, | ||
586 | .procname = "nf_conntrack_tcp_timeout_max_retrans", | ||
587 | .data = &nf_ct_tcp_timeout_max_retrans, | ||
588 | .maxlen = sizeof(unsigned int), | ||
589 | .mode = 0644, | ||
590 | .proc_handler = &proc_dointvec_jiffies, | ||
591 | }, | ||
592 | { | ||
593 | .ctl_name = NET_NF_CONNTRACK_TCP_LOOSE, | ||
594 | .procname = "nf_conntrack_tcp_loose", | ||
595 | .data = &nf_ct_tcp_loose, | ||
596 | .maxlen = sizeof(unsigned int), | ||
597 | .mode = 0644, | ||
598 | .proc_handler = &proc_dointvec, | ||
599 | }, | ||
600 | { | ||
601 | .ctl_name = NET_NF_CONNTRACK_TCP_BE_LIBERAL, | ||
602 | .procname = "nf_conntrack_tcp_be_liberal", | ||
603 | .data = &nf_ct_tcp_be_liberal, | ||
604 | .maxlen = sizeof(unsigned int), | ||
605 | .mode = 0644, | ||
606 | .proc_handler = &proc_dointvec, | ||
607 | }, | ||
608 | { | ||
609 | .ctl_name = NET_NF_CONNTRACK_TCP_MAX_RETRANS, | ||
610 | .procname = "nf_conntrack_tcp_max_retrans", | ||
611 | .data = &nf_ct_tcp_max_retrans, | ||
612 | .maxlen = sizeof(unsigned int), | ||
613 | .mode = 0644, | ||
614 | .proc_handler = &proc_dointvec, | ||
615 | }, | ||
616 | |||
617 | { .ctl_name = 0 } | ||
618 | }; | ||
619 | |||
620 | #define NET_NF_CONNTRACK_MAX 2089 | ||
621 | |||
622 | static ctl_table nf_ct_netfilter_table[] = { | ||
623 | { | ||
624 | .ctl_name = NET_NETFILTER, | ||
625 | .procname = "netfilter", | ||
626 | .mode = 0555, | ||
627 | .child = nf_ct_sysctl_table, | ||
628 | }, | ||
629 | { | ||
630 | .ctl_name = NET_NF_CONNTRACK_MAX, | ||
631 | .procname = "nf_conntrack_max", | ||
632 | .data = &nf_conntrack_max, | ||
633 | .maxlen = sizeof(int), | ||
634 | .mode = 0644, | ||
635 | .proc_handler = &proc_dointvec, | ||
636 | }, | ||
637 | { .ctl_name = 0 } | ||
638 | }; | ||
639 | |||
640 | static ctl_table nf_ct_net_table[] = { | ||
641 | { | ||
642 | .ctl_name = CTL_NET, | ||
643 | .procname = "net", | ||
644 | .mode = 0555, | ||
645 | .child = nf_ct_netfilter_table, | ||
646 | }, | ||
647 | { .ctl_name = 0 } | ||
648 | }; | ||
649 | EXPORT_SYMBOL(nf_ct_log_invalid); | ||
650 | #endif /* CONFIG_SYSCTL */ | ||
651 | |||
652 | static int init_or_cleanup(int init) | ||
653 | { | ||
654 | #ifdef CONFIG_PROC_FS | ||
655 | struct proc_dir_entry *proc, *proc_exp, *proc_stat; | ||
656 | #endif | ||
657 | int ret = 0; | ||
658 | |||
659 | if (!init) goto cleanup; | ||
660 | |||
661 | ret = nf_conntrack_init(); | ||
662 | if (ret < 0) | ||
663 | goto cleanup_nothing; | ||
664 | |||
665 | #ifdef CONFIG_PROC_FS | ||
666 | proc = proc_net_fops_create("nf_conntrack", 0440, &ct_file_ops); | ||
667 | if (!proc) goto cleanup_init; | ||
668 | |||
669 | proc_exp = proc_net_fops_create("nf_conntrack_expect", 0440, | ||
670 | &exp_file_ops); | ||
671 | if (!proc_exp) goto cleanup_proc; | ||
672 | |||
673 | proc_stat = create_proc_entry("nf_conntrack", S_IRUGO, proc_net_stat); | ||
674 | if (!proc_stat) | ||
675 | goto cleanup_proc_exp; | ||
676 | |||
677 | proc_stat->proc_fops = &ct_cpu_seq_fops; | ||
678 | proc_stat->owner = THIS_MODULE; | ||
679 | #endif | ||
680 | #ifdef CONFIG_SYSCTL | ||
681 | nf_ct_sysctl_header = register_sysctl_table(nf_ct_net_table, 0); | ||
682 | if (nf_ct_sysctl_header == NULL) { | ||
683 | printk("nf_conntrack: can't register to sysctl.\n"); | ||
684 | ret = -ENOMEM; | ||
685 | goto cleanup_proc_stat; | ||
686 | } | ||
687 | #endif | ||
688 | |||
689 | return ret; | ||
690 | |||
691 | cleanup: | ||
692 | #ifdef CONFIG_SYSCTL | ||
693 | unregister_sysctl_table(nf_ct_sysctl_header); | ||
694 | cleanup_proc_stat: | ||
695 | #endif | ||
696 | #ifdef CONFIG_PROC_FS | ||
697 | proc_net_remove("nf_conntrack_stat"); | ||
698 | cleanup_proc_exp: | ||
699 | proc_net_remove("nf_conntrack_expect"); | ||
700 | cleanup_proc: | ||
701 | proc_net_remove("nf_conntrack"); | ||
702 | cleanup_init: | ||
703 | #endif /* CNFIG_PROC_FS */ | ||
704 | nf_conntrack_cleanup(); | ||
705 | cleanup_nothing: | ||
706 | return ret; | ||
707 | } | ||
708 | |||
709 | int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto) | ||
710 | { | ||
711 | int ret = 0; | ||
712 | |||
713 | write_lock_bh(&nf_conntrack_lock); | ||
714 | if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_generic_l3proto) { | ||
715 | ret = -EBUSY; | ||
716 | goto out; | ||
717 | } | ||
718 | nf_ct_l3protos[proto->l3proto] = proto; | ||
719 | out: | ||
720 | write_unlock_bh(&nf_conntrack_lock); | ||
721 | |||
722 | return ret; | ||
723 | } | ||
724 | |||
725 | void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto) | ||
726 | { | ||
727 | write_lock_bh(&nf_conntrack_lock); | ||
728 | nf_ct_l3protos[proto->l3proto] = &nf_conntrack_generic_l3proto; | ||
729 | write_unlock_bh(&nf_conntrack_lock); | ||
730 | |||
731 | /* Somebody could be still looking at the proto in bh. */ | ||
732 | synchronize_net(); | ||
733 | |||
734 | /* Remove all contrack entries for this protocol */ | ||
735 | nf_ct_iterate_cleanup(kill_l3proto, proto); | ||
736 | } | ||
737 | |||
738 | /* FIXME: Allow NULL functions and sub in pointers to generic for | ||
739 | them. --RR */ | ||
740 | int nf_conntrack_protocol_register(struct nf_conntrack_protocol *proto) | ||
741 | { | ||
742 | int ret = 0; | ||
743 | |||
744 | retry: | ||
745 | write_lock_bh(&nf_conntrack_lock); | ||
746 | if (nf_ct_protos[proto->l3proto]) { | ||
747 | if (nf_ct_protos[proto->l3proto][proto->proto] | ||
748 | != &nf_conntrack_generic_protocol) { | ||
749 | ret = -EBUSY; | ||
750 | goto out_unlock; | ||
751 | } | ||
752 | } else { | ||
753 | /* l3proto may be loaded latter. */ | ||
754 | struct nf_conntrack_protocol **proto_array; | ||
755 | int i; | ||
756 | |||
757 | write_unlock_bh(&nf_conntrack_lock); | ||
758 | |||
759 | proto_array = (struct nf_conntrack_protocol **) | ||
760 | kmalloc(MAX_NF_CT_PROTO * | ||
761 | sizeof(struct nf_conntrack_protocol *), | ||
762 | GFP_KERNEL); | ||
763 | if (proto_array == NULL) { | ||
764 | ret = -ENOMEM; | ||
765 | goto out; | ||
766 | } | ||
767 | for (i = 0; i < MAX_NF_CT_PROTO; i++) | ||
768 | proto_array[i] = &nf_conntrack_generic_protocol; | ||
769 | |||
770 | write_lock_bh(&nf_conntrack_lock); | ||
771 | if (nf_ct_protos[proto->l3proto]) { | ||
772 | /* bad timing, but no problem */ | ||
773 | write_unlock_bh(&nf_conntrack_lock); | ||
774 | kfree(proto_array); | ||
775 | } else { | ||
776 | nf_ct_protos[proto->l3proto] = proto_array; | ||
777 | write_unlock_bh(&nf_conntrack_lock); | ||
778 | } | ||
779 | |||
780 | /* | ||
781 | * Just once because array is never freed until unloading | ||
782 | * nf_conntrack.ko | ||
783 | */ | ||
784 | goto retry; | ||
785 | } | ||
786 | |||
787 | nf_ct_protos[proto->l3proto][proto->proto] = proto; | ||
788 | |||
789 | out_unlock: | ||
790 | write_unlock_bh(&nf_conntrack_lock); | ||
791 | out: | ||
792 | return ret; | ||
793 | } | ||
794 | |||
795 | void nf_conntrack_protocol_unregister(struct nf_conntrack_protocol *proto) | ||
796 | { | ||
797 | write_lock_bh(&nf_conntrack_lock); | ||
798 | nf_ct_protos[proto->l3proto][proto->proto] | ||
799 | = &nf_conntrack_generic_protocol; | ||
800 | write_unlock_bh(&nf_conntrack_lock); | ||
801 | |||
802 | /* Somebody could be still looking at the proto in bh. */ | ||
803 | synchronize_net(); | ||
804 | |||
805 | /* Remove all contrack entries for this protocol */ | ||
806 | nf_ct_iterate_cleanup(kill_proto, proto); | ||
807 | } | ||
808 | |||
809 | static int __init init(void) | ||
810 | { | ||
811 | return init_or_cleanup(1); | ||
812 | } | ||
813 | |||
814 | static void __exit fini(void) | ||
815 | { | ||
816 | init_or_cleanup(0); | ||
817 | } | ||
818 | |||
819 | module_init(init); | ||
820 | module_exit(fini); | ||
821 | |||
822 | /* Some modules need us, but don't depend directly on any symbol. | ||
823 | They should call this. */ | ||
824 | void need_nf_conntrack(void) | ||
825 | { | ||
826 | } | ||
827 | |||
828 | #ifdef CONFIG_NF_CONNTRACK_EVENTS | ||
829 | EXPORT_SYMBOL_GPL(nf_conntrack_chain); | ||
830 | EXPORT_SYMBOL_GPL(nf_conntrack_expect_chain); | ||
831 | EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); | ||
832 | EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); | ||
833 | EXPORT_SYMBOL_GPL(__nf_ct_event_cache_init); | ||
834 | EXPORT_PER_CPU_SYMBOL_GPL(nf_conntrack_ecache); | ||
835 | EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); | ||
836 | #endif | ||
837 | EXPORT_SYMBOL(nf_conntrack_l3proto_register); | ||
838 | EXPORT_SYMBOL(nf_conntrack_l3proto_unregister); | ||
839 | EXPORT_SYMBOL(nf_conntrack_protocol_register); | ||
840 | EXPORT_SYMBOL(nf_conntrack_protocol_unregister); | ||
841 | EXPORT_SYMBOL(nf_ct_invert_tuplepr); | ||
842 | EXPORT_SYMBOL(nf_conntrack_alter_reply); | ||
843 | EXPORT_SYMBOL(nf_conntrack_destroyed); | ||
844 | EXPORT_SYMBOL(need_nf_conntrack); | ||
845 | EXPORT_SYMBOL(nf_conntrack_helper_register); | ||
846 | EXPORT_SYMBOL(nf_conntrack_helper_unregister); | ||
847 | EXPORT_SYMBOL(nf_ct_iterate_cleanup); | ||
848 | EXPORT_SYMBOL(__nf_ct_refresh_acct); | ||
849 | EXPORT_SYMBOL(nf_ct_protos); | ||
850 | EXPORT_SYMBOL(nf_ct_find_proto); | ||
851 | EXPORT_SYMBOL(nf_ct_l3protos); | ||
852 | EXPORT_SYMBOL(nf_conntrack_expect_alloc); | ||
853 | EXPORT_SYMBOL(nf_conntrack_expect_put); | ||
854 | EXPORT_SYMBOL(nf_conntrack_expect_related); | ||
855 | EXPORT_SYMBOL(nf_conntrack_unexpect_related); | ||
856 | EXPORT_SYMBOL(nf_conntrack_tuple_taken); | ||
857 | EXPORT_SYMBOL(nf_conntrack_htable_size); | ||
858 | EXPORT_SYMBOL(nf_conntrack_lock); | ||
859 | EXPORT_SYMBOL(nf_conntrack_hash); | ||
860 | EXPORT_SYMBOL(nf_conntrack_untracked); | ||
861 | EXPORT_SYMBOL_GPL(nf_conntrack_find_get); | ||
862 | #ifdef CONFIG_IP_NF_NAT_NEEDED | ||
863 | EXPORT_SYMBOL(nf_conntrack_tcp_update); | ||
864 | #endif | ||
865 | EXPORT_SYMBOL(__nf_conntrack_confirm); | ||
866 | EXPORT_SYMBOL(nf_ct_get_tuple); | ||
867 | EXPORT_SYMBOL(nf_ct_invert_tuple); | ||
868 | EXPORT_SYMBOL(nf_conntrack_in); | ||
869 | EXPORT_SYMBOL(__nf_conntrack_attach); | ||
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 4bc27a6334c1..83f4c53030fc 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c | |||
@@ -128,7 +128,7 @@ void __nfa_fill(struct sk_buff *skb, int attrtype, int attrlen, | |||
128 | memset(NFA_DATA(nfa) + attrlen, 0, NFA_ALIGN(size) - size); | 128 | memset(NFA_DATA(nfa) + attrlen, 0, NFA_ALIGN(size) - size); |
129 | } | 129 | } |
130 | 130 | ||
131 | int nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len) | 131 | void nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len) |
132 | { | 132 | { |
133 | memset(tb, 0, sizeof(struct nfattr *) * maxattr); | 133 | memset(tb, 0, sizeof(struct nfattr *) * maxattr); |
134 | 134 | ||
@@ -138,8 +138,6 @@ int nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len) | |||
138 | tb[flavor-1] = nfa; | 138 | tb[flavor-1] = nfa; |
139 | nfa = NFA_NEXT(nfa, len); | 139 | nfa = NFA_NEXT(nfa, len); |
140 | } | 140 | } |
141 | |||
142 | return 0; | ||
143 | } | 141 | } |
144 | 142 | ||
145 | /** | 143 | /** |
@@ -242,15 +240,18 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb, | |||
242 | ss = nfnetlink_get_subsys(type); | 240 | ss = nfnetlink_get_subsys(type); |
243 | if (!ss) { | 241 | if (!ss) { |
244 | #ifdef CONFIG_KMOD | 242 | #ifdef CONFIG_KMOD |
245 | /* don't call nfnl_shunlock, since it would reenter | 243 | if (cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) { |
246 | * with further packet processing */ | 244 | /* don't call nfnl_shunlock, since it would reenter |
247 | up(&nfnl_sem); | 245 | * with further packet processing */ |
248 | request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type)); | 246 | up(&nfnl_sem); |
249 | nfnl_shlock(); | 247 | request_module("nfnetlink-subsys-%d", |
250 | ss = nfnetlink_get_subsys(type); | 248 | NFNL_SUBSYS_ID(type)); |
249 | nfnl_shlock(); | ||
250 | ss = nfnetlink_get_subsys(type); | ||
251 | } | ||
251 | if (!ss) | 252 | if (!ss) |
252 | #endif | 253 | #endif |
253 | goto err_inval; | 254 | goto err_inval; |
254 | } | 255 | } |
255 | 256 | ||
256 | nc = nfnetlink_find_client(type, ss); | 257 | nc = nfnetlink_find_client(type, ss); |
diff --git a/net/netlink/Makefile b/net/netlink/Makefile index 39d9c2dcd03c..e3589c2de49e 100644 --- a/net/netlink/Makefile +++ b/net/netlink/Makefile | |||
@@ -2,4 +2,4 @@ | |||
2 | # Makefile for the netlink driver. | 2 | # Makefile for the netlink driver. |
3 | # | 3 | # |
4 | 4 | ||
5 | obj-y := af_netlink.o | 5 | obj-y := af_netlink.o attr.o genetlink.o |
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5ca283537bc6..8c38ee6d255e 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c | |||
@@ -58,6 +58,7 @@ | |||
58 | 58 | ||
59 | #include <net/sock.h> | 59 | #include <net/sock.h> |
60 | #include <net/scm.h> | 60 | #include <net/scm.h> |
61 | #include <net/netlink.h> | ||
61 | 62 | ||
62 | #define Nprintk(a...) | 63 | #define Nprintk(a...) |
63 | #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) | 64 | #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) |
@@ -427,7 +428,8 @@ static int netlink_release(struct socket *sock) | |||
427 | 428 | ||
428 | spin_lock(&nlk->cb_lock); | 429 | spin_lock(&nlk->cb_lock); |
429 | if (nlk->cb) { | 430 | if (nlk->cb) { |
430 | nlk->cb->done(nlk->cb); | 431 | if (nlk->cb->done) |
432 | nlk->cb->done(nlk->cb); | ||
431 | netlink_destroy_callback(nlk->cb); | 433 | netlink_destroy_callback(nlk->cb); |
432 | nlk->cb = NULL; | 434 | nlk->cb = NULL; |
433 | } | 435 | } |
@@ -1322,7 +1324,8 @@ static int netlink_dump(struct sock *sk) | |||
1322 | skb_queue_tail(&sk->sk_receive_queue, skb); | 1324 | skb_queue_tail(&sk->sk_receive_queue, skb); |
1323 | sk->sk_data_ready(sk, skb->len); | 1325 | sk->sk_data_ready(sk, skb->len); |
1324 | 1326 | ||
1325 | cb->done(cb); | 1327 | if (cb->done) |
1328 | cb->done(cb); | ||
1326 | nlk->cb = NULL; | 1329 | nlk->cb = NULL; |
1327 | spin_unlock(&nlk->cb_lock); | 1330 | spin_unlock(&nlk->cb_lock); |
1328 | 1331 | ||
@@ -1409,6 +1412,94 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) | |||
1409 | netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); | 1412 | netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); |
1410 | } | 1413 | } |
1411 | 1414 | ||
1415 | static int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, | ||
1416 | struct nlmsghdr *, int *)) | ||
1417 | { | ||
1418 | unsigned int total_len; | ||
1419 | struct nlmsghdr *nlh; | ||
1420 | int err; | ||
1421 | |||
1422 | while (skb->len >= nlmsg_total_size(0)) { | ||
1423 | nlh = (struct nlmsghdr *) skb->data; | ||
1424 | |||
1425 | if (skb->len < nlh->nlmsg_len) | ||
1426 | return 0; | ||
1427 | |||
1428 | total_len = min(NLMSG_ALIGN(nlh->nlmsg_len), skb->len); | ||
1429 | |||
1430 | if (cb(skb, nlh, &err) < 0) { | ||
1431 | /* Not an error, but we have to interrupt processing | ||
1432 | * here. Note: that in this case we do not pull | ||
1433 | * message from skb, it will be processed later. | ||
1434 | */ | ||
1435 | if (err == 0) | ||
1436 | return -1; | ||
1437 | netlink_ack(skb, nlh, err); | ||
1438 | } else if (nlh->nlmsg_flags & NLM_F_ACK) | ||
1439 | netlink_ack(skb, nlh, 0); | ||
1440 | |||
1441 | skb_pull(skb, total_len); | ||
1442 | } | ||
1443 | |||
1444 | return 0; | ||
1445 | } | ||
1446 | |||
1447 | /** | ||
1448 | * nelink_run_queue - Process netlink receive queue. | ||
1449 | * @sk: Netlink socket containing the queue | ||
1450 | * @qlen: Place to store queue length upon entry | ||
1451 | * @cb: Callback function invoked for each netlink message found | ||
1452 | * | ||
1453 | * Processes as much as there was in the queue upon entry and invokes | ||
1454 | * a callback function for each netlink message found. The callback | ||
1455 | * function may refuse a message by returning a negative error code | ||
1456 | * but setting the error pointer to 0 in which case this function | ||
1457 | * returns with a qlen != 0. | ||
1458 | * | ||
1459 | * qlen must be initialized to 0 before the initial entry, afterwards | ||
1460 | * the function may be called repeatedly until qlen reaches 0. | ||
1461 | */ | ||
1462 | void netlink_run_queue(struct sock *sk, unsigned int *qlen, | ||
1463 | int (*cb)(struct sk_buff *, struct nlmsghdr *, int *)) | ||
1464 | { | ||
1465 | struct sk_buff *skb; | ||
1466 | |||
1467 | if (!*qlen || *qlen > skb_queue_len(&sk->sk_receive_queue)) | ||
1468 | *qlen = skb_queue_len(&sk->sk_receive_queue); | ||
1469 | |||
1470 | for (; *qlen; (*qlen)--) { | ||
1471 | skb = skb_dequeue(&sk->sk_receive_queue); | ||
1472 | if (netlink_rcv_skb(skb, cb)) { | ||
1473 | if (skb->len) | ||
1474 | skb_queue_head(&sk->sk_receive_queue, skb); | ||
1475 | else { | ||
1476 | kfree_skb(skb); | ||
1477 | (*qlen)--; | ||
1478 | } | ||
1479 | break; | ||
1480 | } | ||
1481 | |||
1482 | kfree_skb(skb); | ||
1483 | } | ||
1484 | } | ||
1485 | |||
1486 | /** | ||
1487 | * netlink_queue_skip - Skip netlink message while processing queue. | ||
1488 | * @nlh: Netlink message to be skipped | ||
1489 | * @skb: Socket buffer containing the netlink messages. | ||
1490 | * | ||
1491 | * Pulls the given netlink message off the socket buffer so the next | ||
1492 | * call to netlink_queue_run() will not reconsider the message. | ||
1493 | */ | ||
1494 | void netlink_queue_skip(struct nlmsghdr *nlh, struct sk_buff *skb) | ||
1495 | { | ||
1496 | int msglen = NLMSG_ALIGN(nlh->nlmsg_len); | ||
1497 | |||
1498 | if (msglen > skb->len) | ||
1499 | msglen = skb->len; | ||
1500 | |||
1501 | skb_pull(skb, msglen); | ||
1502 | } | ||
1412 | 1503 | ||
1413 | #ifdef CONFIG_PROC_FS | 1504 | #ifdef CONFIG_PROC_FS |
1414 | struct nl_seq_iter { | 1505 | struct nl_seq_iter { |
@@ -1657,6 +1748,8 @@ out: | |||
1657 | core_initcall(netlink_proto_init); | 1748 | core_initcall(netlink_proto_init); |
1658 | 1749 | ||
1659 | EXPORT_SYMBOL(netlink_ack); | 1750 | EXPORT_SYMBOL(netlink_ack); |
1751 | EXPORT_SYMBOL(netlink_run_queue); | ||
1752 | EXPORT_SYMBOL(netlink_queue_skip); | ||
1660 | EXPORT_SYMBOL(netlink_broadcast); | 1753 | EXPORT_SYMBOL(netlink_broadcast); |
1661 | EXPORT_SYMBOL(netlink_dump_start); | 1754 | EXPORT_SYMBOL(netlink_dump_start); |
1662 | EXPORT_SYMBOL(netlink_kernel_create); | 1755 | EXPORT_SYMBOL(netlink_kernel_create); |
diff --git a/net/netlink/attr.c b/net/netlink/attr.c new file mode 100644 index 000000000000..fffef4ab276f --- /dev/null +++ b/net/netlink/attr.c | |||
@@ -0,0 +1,328 @@ | |||
1 | /* | ||
2 | * NETLINK Netlink attributes | ||
3 | * | ||
4 | * Authors: Thomas Graf <tgraf@suug.ch> | ||
5 | * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> | ||
6 | */ | ||
7 | |||
8 | #include <linux/config.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/errno.h> | ||
12 | #include <linux/jiffies.h> | ||
13 | #include <linux/netdevice.h> | ||
14 | #include <linux/skbuff.h> | ||
15 | #include <linux/string.h> | ||
16 | #include <linux/types.h> | ||
17 | #include <net/netlink.h> | ||
18 | |||
19 | static u16 nla_attr_minlen[NLA_TYPE_MAX+1] __read_mostly = { | ||
20 | [NLA_U8] = sizeof(u8), | ||
21 | [NLA_U16] = sizeof(u16), | ||
22 | [NLA_U32] = sizeof(u32), | ||
23 | [NLA_U64] = sizeof(u64), | ||
24 | [NLA_STRING] = 1, | ||
25 | [NLA_NESTED] = NLA_HDRLEN, | ||
26 | }; | ||
27 | |||
28 | static int validate_nla(struct nlattr *nla, int maxtype, | ||
29 | struct nla_policy *policy) | ||
30 | { | ||
31 | struct nla_policy *pt; | ||
32 | int minlen = 0; | ||
33 | |||
34 | if (nla->nla_type <= 0 || nla->nla_type > maxtype) | ||
35 | return 0; | ||
36 | |||
37 | pt = &policy[nla->nla_type]; | ||
38 | |||
39 | BUG_ON(pt->type > NLA_TYPE_MAX); | ||
40 | |||
41 | if (pt->minlen) | ||
42 | minlen = pt->minlen; | ||
43 | else if (pt->type != NLA_UNSPEC) | ||
44 | minlen = nla_attr_minlen[pt->type]; | ||
45 | |||
46 | if (pt->type == NLA_FLAG && nla_len(nla) > 0) | ||
47 | return -ERANGE; | ||
48 | |||
49 | if (nla_len(nla) < minlen) | ||
50 | return -ERANGE; | ||
51 | |||
52 | return 0; | ||
53 | } | ||
54 | |||
55 | /** | ||
56 | * nla_validate - Validate a stream of attributes | ||
57 | * @head: head of attribute stream | ||
58 | * @len: length of attribute stream | ||
59 | * @maxtype: maximum attribute type to be expected | ||
60 | * @policy: validation policy | ||
61 | * | ||
62 | * Validates all attributes in the specified attribute stream against the | ||
63 | * specified policy. Attributes with a type exceeding maxtype will be | ||
64 | * ignored. See documenation of struct nla_policy for more details. | ||
65 | * | ||
66 | * Returns 0 on success or a negative error code. | ||
67 | */ | ||
68 | int nla_validate(struct nlattr *head, int len, int maxtype, | ||
69 | struct nla_policy *policy) | ||
70 | { | ||
71 | struct nlattr *nla; | ||
72 | int rem, err; | ||
73 | |||
74 | nla_for_each_attr(nla, head, len, rem) { | ||
75 | err = validate_nla(nla, maxtype, policy); | ||
76 | if (err < 0) | ||
77 | goto errout; | ||
78 | } | ||
79 | |||
80 | err = 0; | ||
81 | errout: | ||
82 | return err; | ||
83 | } | ||
84 | |||
85 | /** | ||
86 | * nla_parse - Parse a stream of attributes into a tb buffer | ||
87 | * @tb: destination array with maxtype+1 elements | ||
88 | * @maxtype: maximum attribute type to be expected | ||
89 | * @head: head of attribute stream | ||
90 | * @len: length of attribute stream | ||
91 | * | ||
92 | * Parses a stream of attributes and stores a pointer to each attribute in | ||
93 | * the tb array accessable via the attribute type. Attributes with a type | ||
94 | * exceeding maxtype will be silently ignored for backwards compatibility | ||
95 | * reasons. policy may be set to NULL if no validation is required. | ||
96 | * | ||
97 | * Returns 0 on success or a negative error code. | ||
98 | */ | ||
99 | int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, | ||
100 | struct nla_policy *policy) | ||
101 | { | ||
102 | struct nlattr *nla; | ||
103 | int rem, err; | ||
104 | |||
105 | memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); | ||
106 | |||
107 | nla_for_each_attr(nla, head, len, rem) { | ||
108 | u16 type = nla->nla_type; | ||
109 | |||
110 | if (type > 0 && type <= maxtype) { | ||
111 | if (policy) { | ||
112 | err = validate_nla(nla, maxtype, policy); | ||
113 | if (err < 0) | ||
114 | goto errout; | ||
115 | } | ||
116 | |||
117 | tb[type] = nla; | ||
118 | } | ||
119 | } | ||
120 | |||
121 | if (unlikely(rem > 0)) | ||
122 | printk(KERN_WARNING "netlink: %d bytes leftover after parsing " | ||
123 | "attributes.\n", rem); | ||
124 | |||
125 | err = 0; | ||
126 | errout: | ||
127 | return err; | ||
128 | } | ||
129 | |||
130 | /** | ||
131 | * nla_find - Find a specific attribute in a stream of attributes | ||
132 | * @head: head of attribute stream | ||
133 | * @len: length of attribute stream | ||
134 | * @attrtype: type of attribute to look for | ||
135 | * | ||
136 | * Returns the first attribute in the stream matching the specified type. | ||
137 | */ | ||
138 | struct nlattr *nla_find(struct nlattr *head, int len, int attrtype) | ||
139 | { | ||
140 | struct nlattr *nla; | ||
141 | int rem; | ||
142 | |||
143 | nla_for_each_attr(nla, head, len, rem) | ||
144 | if (nla->nla_type == attrtype) | ||
145 | return nla; | ||
146 | |||
147 | return NULL; | ||
148 | } | ||
149 | |||
150 | /** | ||
151 | * nla_strlcpy - Copy string attribute payload into a sized buffer | ||
152 | * @dst: where to copy the string to | ||
153 | * @src: attribute to copy the string from | ||
154 | * @dstsize: size of destination buffer | ||
155 | * | ||
156 | * Copies at most dstsize - 1 bytes into the destination buffer. | ||
157 | * The result is always a valid NUL-terminated string. Unlike | ||
158 | * strlcpy the destination buffer is always padded out. | ||
159 | * | ||
160 | * Returns the length of the source buffer. | ||
161 | */ | ||
162 | size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize) | ||
163 | { | ||
164 | size_t srclen = nla_len(nla); | ||
165 | char *src = nla_data(nla); | ||
166 | |||
167 | if (srclen > 0 && src[srclen - 1] == '\0') | ||
168 | srclen--; | ||
169 | |||
170 | if (dstsize > 0) { | ||
171 | size_t len = (srclen >= dstsize) ? dstsize - 1 : srclen; | ||
172 | |||
173 | memset(dst, 0, dstsize); | ||
174 | memcpy(dst, src, len); | ||
175 | } | ||
176 | |||
177 | return srclen; | ||
178 | } | ||
179 | |||
180 | /** | ||
181 | * nla_memcpy - Copy a netlink attribute into another memory area | ||
182 | * @dest: where to copy to memcpy | ||
183 | * @src: netlink attribute to copy from | ||
184 | * @count: size of the destination area | ||
185 | * | ||
186 | * Note: The number of bytes copied is limited by the length of | ||
187 | * attribute's payload. memcpy | ||
188 | * | ||
189 | * Returns the number of bytes copied. | ||
190 | */ | ||
191 | int nla_memcpy(void *dest, struct nlattr *src, int count) | ||
192 | { | ||
193 | int minlen = min_t(int, count, nla_len(src)); | ||
194 | |||
195 | memcpy(dest, nla_data(src), minlen); | ||
196 | |||
197 | return minlen; | ||
198 | } | ||
199 | |||
200 | /** | ||
201 | * nla_memcmp - Compare an attribute with sized memory area | ||
202 | * @nla: netlink attribute | ||
203 | * @data: memory area | ||
204 | * @size: size of memory area | ||
205 | */ | ||
206 | int nla_memcmp(const struct nlattr *nla, const void *data, | ||
207 | size_t size) | ||
208 | { | ||
209 | int d = nla_len(nla) - size; | ||
210 | |||
211 | if (d == 0) | ||
212 | d = memcmp(nla_data(nla), data, size); | ||
213 | |||
214 | return d; | ||
215 | } | ||
216 | |||
217 | /** | ||
218 | * nla_strcmp - Compare a string attribute against a string | ||
219 | * @nla: netlink string attribute | ||
220 | * @str: another string | ||
221 | */ | ||
222 | int nla_strcmp(const struct nlattr *nla, const char *str) | ||
223 | { | ||
224 | int len = strlen(str) + 1; | ||
225 | int d = nla_len(nla) - len; | ||
226 | |||
227 | if (d == 0) | ||
228 | d = memcmp(nla_data(nla), str, len); | ||
229 | |||
230 | return d; | ||
231 | } | ||
232 | |||
233 | /** | ||
234 | * __nla_reserve - reserve room for attribute on the skb | ||
235 | * @skb: socket buffer to reserve room on | ||
236 | * @attrtype: attribute type | ||
237 | * @attrlen: length of attribute payload | ||
238 | * | ||
239 | * Adds a netlink attribute header to a socket buffer and reserves | ||
240 | * room for the payload but does not copy it. | ||
241 | * | ||
242 | * The caller is responsible to ensure that the skb provides enough | ||
243 | * tailroom for the attribute header and payload. | ||
244 | */ | ||
245 | struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) | ||
246 | { | ||
247 | struct nlattr *nla; | ||
248 | |||
249 | nla = (struct nlattr *) skb_put(skb, nla_total_size(attrlen)); | ||
250 | nla->nla_type = attrtype; | ||
251 | nla->nla_len = nla_attr_size(attrlen); | ||
252 | |||
253 | memset((unsigned char *) nla + nla->nla_len, 0, nla_padlen(attrlen)); | ||
254 | |||
255 | return nla; | ||
256 | } | ||
257 | |||
258 | /** | ||
259 | * nla_reserve - reserve room for attribute on the skb | ||
260 | * @skb: socket buffer to reserve room on | ||
261 | * @attrtype: attribute type | ||
262 | * @attrlen: length of attribute payload | ||
263 | * | ||
264 | * Adds a netlink attribute header to a socket buffer and reserves | ||
265 | * room for the payload but does not copy it. | ||
266 | * | ||
267 | * Returns NULL if the tailroom of the skb is insufficient to store | ||
268 | * the attribute header and payload. | ||
269 | */ | ||
270 | struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) | ||
271 | { | ||
272 | if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) | ||
273 | return NULL; | ||
274 | |||
275 | return __nla_reserve(skb, attrtype, attrlen); | ||
276 | } | ||
277 | |||
278 | /** | ||
279 | * __nla_put - Add a netlink attribute to a socket buffer | ||
280 | * @skb: socket buffer to add attribute to | ||
281 | * @attrtype: attribute type | ||
282 | * @attrlen: length of attribute payload | ||
283 | * @data: head of attribute payload | ||
284 | * | ||
285 | * The caller is responsible to ensure that the skb provides enough | ||
286 | * tailroom for the attribute header and payload. | ||
287 | */ | ||
288 | void __nla_put(struct sk_buff *skb, int attrtype, int attrlen, | ||
289 | const void *data) | ||
290 | { | ||
291 | struct nlattr *nla; | ||
292 | |||
293 | nla = __nla_reserve(skb, attrtype, attrlen); | ||
294 | memcpy(nla_data(nla), data, attrlen); | ||
295 | } | ||
296 | |||
297 | |||
298 | /** | ||
299 | * nla_put - Add a netlink attribute to a socket buffer | ||
300 | * @skb: socket buffer to add attribute to | ||
301 | * @attrtype: attribute type | ||
302 | * @attrlen: length of attribute payload | ||
303 | * @data: head of attribute payload | ||
304 | * | ||
305 | * Returns -1 if the tailroom of the skb is insufficient to store | ||
306 | * the attribute header and payload. | ||
307 | */ | ||
308 | int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) | ||
309 | { | ||
310 | if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) | ||
311 | return -1; | ||
312 | |||
313 | __nla_put(skb, attrtype, attrlen, data); | ||
314 | return 0; | ||
315 | } | ||
316 | |||
317 | |||
318 | EXPORT_SYMBOL(nla_validate); | ||
319 | EXPORT_SYMBOL(nla_parse); | ||
320 | EXPORT_SYMBOL(nla_find); | ||
321 | EXPORT_SYMBOL(nla_strlcpy); | ||
322 | EXPORT_SYMBOL(__nla_reserve); | ||
323 | EXPORT_SYMBOL(nla_reserve); | ||
324 | EXPORT_SYMBOL(__nla_put); | ||
325 | EXPORT_SYMBOL(nla_put); | ||
326 | EXPORT_SYMBOL(nla_memcpy); | ||
327 | EXPORT_SYMBOL(nla_memcmp); | ||
328 | EXPORT_SYMBOL(nla_strcmp); | ||
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c new file mode 100644 index 000000000000..287cfcc56951 --- /dev/null +++ b/net/netlink/genetlink.c | |||
@@ -0,0 +1,579 @@ | |||
1 | /* | ||
2 | * NETLINK Generic Netlink Family | ||
3 | * | ||
4 | * Authors: Jamal Hadi Salim | ||
5 | * Thomas Graf <tgraf@suug.ch> | ||
6 | */ | ||
7 | |||
8 | #include <linux/config.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/errno.h> | ||
12 | #include <linux/types.h> | ||
13 | #include <linux/socket.h> | ||
14 | #include <linux/string.h> | ||
15 | #include <linux/skbuff.h> | ||
16 | #include <net/sock.h> | ||
17 | #include <net/genetlink.h> | ||
18 | |||
19 | struct sock *genl_sock = NULL; | ||
20 | |||
21 | static DECLARE_MUTEX(genl_sem); /* serialization of message processing */ | ||
22 | |||
23 | static void genl_lock(void) | ||
24 | { | ||
25 | down(&genl_sem); | ||
26 | } | ||
27 | |||
28 | static int genl_trylock(void) | ||
29 | { | ||
30 | return down_trylock(&genl_sem); | ||
31 | } | ||
32 | |||
33 | static void genl_unlock(void) | ||
34 | { | ||
35 | up(&genl_sem); | ||
36 | |||
37 | if (genl_sock && genl_sock->sk_receive_queue.qlen) | ||
38 | genl_sock->sk_data_ready(genl_sock, 0); | ||
39 | } | ||
40 | |||
41 | #define GENL_FAM_TAB_SIZE 16 | ||
42 | #define GENL_FAM_TAB_MASK (GENL_FAM_TAB_SIZE - 1) | ||
43 | |||
44 | static struct list_head family_ht[GENL_FAM_TAB_SIZE]; | ||
45 | |||
46 | static int genl_ctrl_event(int event, void *data); | ||
47 | |||
48 | static inline unsigned int genl_family_hash(unsigned int id) | ||
49 | { | ||
50 | return id & GENL_FAM_TAB_MASK; | ||
51 | } | ||
52 | |||
53 | static inline struct list_head *genl_family_chain(unsigned int id) | ||
54 | { | ||
55 | return &family_ht[genl_family_hash(id)]; | ||
56 | } | ||
57 | |||
58 | static struct genl_family *genl_family_find_byid(unsigned int id) | ||
59 | { | ||
60 | struct genl_family *f; | ||
61 | |||
62 | list_for_each_entry(f, genl_family_chain(id), family_list) | ||
63 | if (f->id == id) | ||
64 | return f; | ||
65 | |||
66 | return NULL; | ||
67 | } | ||
68 | |||
69 | static struct genl_family *genl_family_find_byname(char *name) | ||
70 | { | ||
71 | struct genl_family *f; | ||
72 | int i; | ||
73 | |||
74 | for (i = 0; i < GENL_FAM_TAB_SIZE; i++) | ||
75 | list_for_each_entry(f, genl_family_chain(i), family_list) | ||
76 | if (strcmp(f->name, name) == 0) | ||
77 | return f; | ||
78 | |||
79 | return NULL; | ||
80 | } | ||
81 | |||
82 | static struct genl_ops *genl_get_cmd(u8 cmd, struct genl_family *family) | ||
83 | { | ||
84 | struct genl_ops *ops; | ||
85 | |||
86 | list_for_each_entry(ops, &family->ops_list, ops_list) | ||
87 | if (ops->cmd == cmd) | ||
88 | return ops; | ||
89 | |||
90 | return NULL; | ||
91 | } | ||
92 | |||
93 | /* Of course we are going to have problems once we hit | ||
94 | * 2^16 alive types, but that can only happen by year 2K | ||
95 | */ | ||
96 | static inline u16 genl_generate_id(void) | ||
97 | { | ||
98 | static u16 id_gen_idx; | ||
99 | int overflowed = 0; | ||
100 | |||
101 | do { | ||
102 | if (id_gen_idx == 0) | ||
103 | id_gen_idx = GENL_MIN_ID; | ||
104 | |||
105 | if (++id_gen_idx > GENL_MAX_ID) { | ||
106 | if (!overflowed) { | ||
107 | overflowed = 1; | ||
108 | id_gen_idx = 0; | ||
109 | continue; | ||
110 | } else | ||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | } while (genl_family_find_byid(id_gen_idx)); | ||
115 | |||
116 | return id_gen_idx; | ||
117 | } | ||
118 | |||
119 | /** | ||
120 | * genl_register_ops - register generic netlink operations | ||
121 | * @family: generic netlink family | ||
122 | * @ops: operations to be registered | ||
123 | * | ||
124 | * Registers the specified operations and assigns them to the specified | ||
125 | * family. Either a doit or dumpit callback must be specified or the | ||
126 | * operation will fail. Only one operation structure per command | ||
127 | * identifier may be registered. | ||
128 | * | ||
129 | * See include/net/genetlink.h for more documenation on the operations | ||
130 | * structure. | ||
131 | * | ||
132 | * Returns 0 on success or a negative error code. | ||
133 | */ | ||
134 | int genl_register_ops(struct genl_family *family, struct genl_ops *ops) | ||
135 | { | ||
136 | int err = -EINVAL; | ||
137 | |||
138 | if (ops->dumpit == NULL && ops->doit == NULL) | ||
139 | goto errout; | ||
140 | |||
141 | if (genl_get_cmd(ops->cmd, family)) { | ||
142 | err = -EEXIST; | ||
143 | goto errout; | ||
144 | } | ||
145 | |||
146 | genl_lock(); | ||
147 | list_add_tail(&ops->ops_list, &family->ops_list); | ||
148 | genl_unlock(); | ||
149 | |||
150 | genl_ctrl_event(CTRL_CMD_NEWOPS, ops); | ||
151 | err = 0; | ||
152 | errout: | ||
153 | return err; | ||
154 | } | ||
155 | |||
156 | /** | ||
157 | * genl_unregister_ops - unregister generic netlink operations | ||
158 | * @family: generic netlink family | ||
159 | * @ops: operations to be unregistered | ||
160 | * | ||
161 | * Unregisters the specified operations and unassigns them from the | ||
162 | * specified family. The operation blocks until the current message | ||
163 | * processing has finished and doesn't start again until the | ||
164 | * unregister process has finished. | ||
165 | * | ||
166 | * Note: It is not necessary to unregister all operations before | ||
167 | * unregistering the family, unregistering the family will cause | ||
168 | * all assigned operations to be unregistered automatically. | ||
169 | * | ||
170 | * Returns 0 on success or a negative error code. | ||
171 | */ | ||
172 | int genl_unregister_ops(struct genl_family *family, struct genl_ops *ops) | ||
173 | { | ||
174 | struct genl_ops *rc; | ||
175 | |||
176 | genl_lock(); | ||
177 | list_for_each_entry(rc, &family->ops_list, ops_list) { | ||
178 | if (rc == ops) { | ||
179 | list_del(&ops->ops_list); | ||
180 | genl_unlock(); | ||
181 | genl_ctrl_event(CTRL_CMD_DELOPS, ops); | ||
182 | return 0; | ||
183 | } | ||
184 | } | ||
185 | genl_unlock(); | ||
186 | |||
187 | return -ENOENT; | ||
188 | } | ||
189 | |||
190 | /** | ||
191 | * genl_register_family - register a generic netlink family | ||
192 | * @family: generic netlink family | ||
193 | * | ||
194 | * Registers the specified family after validating it first. Only one | ||
195 | * family may be registered with the same family name or identifier. | ||
196 | * The family id may equal GENL_ID_GENERATE causing an unique id to | ||
197 | * be automatically generated and assigned. | ||
198 | * | ||
199 | * Return 0 on success or a negative error code. | ||
200 | */ | ||
201 | int genl_register_family(struct genl_family *family) | ||
202 | { | ||
203 | int err = -EINVAL; | ||
204 | |||
205 | if (family->id && family->id < GENL_MIN_ID) | ||
206 | goto errout; | ||
207 | |||
208 | if (family->id > GENL_MAX_ID) | ||
209 | goto errout; | ||
210 | |||
211 | INIT_LIST_HEAD(&family->ops_list); | ||
212 | |||
213 | genl_lock(); | ||
214 | |||
215 | if (genl_family_find_byname(family->name)) { | ||
216 | err = -EEXIST; | ||
217 | goto errout_locked; | ||
218 | } | ||
219 | |||
220 | if (genl_family_find_byid(family->id)) { | ||
221 | err = -EEXIST; | ||
222 | goto errout_locked; | ||
223 | } | ||
224 | |||
225 | if (!try_module_get(family->owner)) { | ||
226 | err = -EBUSY; | ||
227 | goto errout_locked; | ||
228 | } | ||
229 | |||
230 | if (family->id == GENL_ID_GENERATE) { | ||
231 | u16 newid = genl_generate_id(); | ||
232 | |||
233 | if (!newid) { | ||
234 | err = -ENOMEM; | ||
235 | goto errout_locked; | ||
236 | } | ||
237 | |||
238 | family->id = newid; | ||
239 | } | ||
240 | |||
241 | if (family->maxattr) { | ||
242 | family->attrbuf = kmalloc((family->maxattr+1) * | ||
243 | sizeof(struct nlattr *), GFP_KERNEL); | ||
244 | if (family->attrbuf == NULL) { | ||
245 | err = -ENOMEM; | ||
246 | goto errout; | ||
247 | } | ||
248 | } else | ||
249 | family->attrbuf = NULL; | ||
250 | |||
251 | list_add_tail(&family->family_list, genl_family_chain(family->id)); | ||
252 | genl_unlock(); | ||
253 | |||
254 | genl_ctrl_event(CTRL_CMD_NEWFAMILY, family); | ||
255 | |||
256 | return 0; | ||
257 | |||
258 | errout_locked: | ||
259 | genl_unlock(); | ||
260 | errout: | ||
261 | return err; | ||
262 | } | ||
263 | |||
264 | /** | ||
265 | * genl_unregister_family - unregister generic netlink family | ||
266 | * @family: generic netlink family | ||
267 | * | ||
268 | * Unregisters the specified family. | ||
269 | * | ||
270 | * Returns 0 on success or a negative error code. | ||
271 | */ | ||
272 | int genl_unregister_family(struct genl_family *family) | ||
273 | { | ||
274 | struct genl_family *rc; | ||
275 | |||
276 | genl_lock(); | ||
277 | |||
278 | list_for_each_entry(rc, genl_family_chain(family->id), family_list) { | ||
279 | if (family->id != rc->id || strcmp(rc->name, family->name)) | ||
280 | continue; | ||
281 | |||
282 | list_del(&rc->family_list); | ||
283 | INIT_LIST_HEAD(&family->ops_list); | ||
284 | genl_unlock(); | ||
285 | |||
286 | module_put(family->owner); | ||
287 | kfree(family->attrbuf); | ||
288 | genl_ctrl_event(CTRL_CMD_DELFAMILY, family); | ||
289 | return 0; | ||
290 | } | ||
291 | |||
292 | genl_unlock(); | ||
293 | |||
294 | return -ENOENT; | ||
295 | } | ||
296 | |||
297 | static inline int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, | ||
298 | int *errp) | ||
299 | { | ||
300 | struct genl_ops *ops; | ||
301 | struct genl_family *family; | ||
302 | struct genl_info info; | ||
303 | struct genlmsghdr *hdr = nlmsg_data(nlh); | ||
304 | int hdrlen, err = -EINVAL; | ||
305 | |||
306 | if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) | ||
307 | goto ignore; | ||
308 | |||
309 | if (nlh->nlmsg_type < NLMSG_MIN_TYPE) | ||
310 | goto ignore; | ||
311 | |||
312 | family = genl_family_find_byid(nlh->nlmsg_type); | ||
313 | if (family == NULL) { | ||
314 | err = -ENOENT; | ||
315 | goto errout; | ||
316 | } | ||
317 | |||
318 | hdrlen = GENL_HDRLEN + family->hdrsize; | ||
319 | if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) | ||
320 | goto errout; | ||
321 | |||
322 | ops = genl_get_cmd(hdr->cmd, family); | ||
323 | if (ops == NULL) { | ||
324 | err = -EOPNOTSUPP; | ||
325 | goto errout; | ||
326 | } | ||
327 | |||
328 | if ((ops->flags & GENL_ADMIN_PERM) && security_netlink_recv(skb)) { | ||
329 | err = -EPERM; | ||
330 | goto errout; | ||
331 | } | ||
332 | |||
333 | if (nlh->nlmsg_flags & NLM_F_DUMP) { | ||
334 | if (ops->dumpit == NULL) { | ||
335 | err = -EOPNOTSUPP; | ||
336 | goto errout; | ||
337 | } | ||
338 | |||
339 | *errp = err = netlink_dump_start(genl_sock, skb, nlh, | ||
340 | ops->dumpit, NULL); | ||
341 | if (err == 0) | ||
342 | skb_pull(skb, min(NLMSG_ALIGN(nlh->nlmsg_len), | ||
343 | skb->len)); | ||
344 | return -1; | ||
345 | } | ||
346 | |||
347 | if (ops->doit == NULL) { | ||
348 | err = -EOPNOTSUPP; | ||
349 | goto errout; | ||
350 | } | ||
351 | |||
352 | if (family->attrbuf) { | ||
353 | err = nlmsg_parse(nlh, hdrlen, family->attrbuf, family->maxattr, | ||
354 | ops->policy); | ||
355 | if (err < 0) | ||
356 | goto errout; | ||
357 | } | ||
358 | |||
359 | info.snd_seq = nlh->nlmsg_seq; | ||
360 | info.snd_pid = NETLINK_CB(skb).pid; | ||
361 | info.nlhdr = nlh; | ||
362 | info.genlhdr = nlmsg_data(nlh); | ||
363 | info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; | ||
364 | info.attrs = family->attrbuf; | ||
365 | |||
366 | *errp = err = ops->doit(skb, &info); | ||
367 | return err; | ||
368 | |||
369 | ignore: | ||
370 | return 0; | ||
371 | |||
372 | errout: | ||
373 | *errp = err; | ||
374 | return -1; | ||
375 | } | ||
376 | |||
377 | static void genl_rcv(struct sock *sk, int len) | ||
378 | { | ||
379 | unsigned int qlen = 0; | ||
380 | |||
381 | do { | ||
382 | if (genl_trylock()) | ||
383 | return; | ||
384 | netlink_run_queue(sk, &qlen, &genl_rcv_msg); | ||
385 | genl_unlock(); | ||
386 | } while (qlen && genl_sock && genl_sock->sk_receive_queue.qlen); | ||
387 | } | ||
388 | |||
389 | /************************************************************************** | ||
390 | * Controller | ||
391 | **************************************************************************/ | ||
392 | |||
393 | static int ctrl_fill_info(struct genl_family *family, u32 pid, u32 seq, | ||
394 | u32 flags, struct sk_buff *skb, u8 cmd) | ||
395 | { | ||
396 | void *hdr; | ||
397 | |||
398 | hdr = genlmsg_put(skb, pid, seq, GENL_ID_CTRL, 0, flags, cmd, | ||
399 | family->version); | ||
400 | if (hdr == NULL) | ||
401 | return -1; | ||
402 | |||
403 | NLA_PUT_STRING(skb, CTRL_ATTR_FAMILY_NAME, family->name); | ||
404 | NLA_PUT_U16(skb, CTRL_ATTR_FAMILY_ID, family->id); | ||
405 | |||
406 | return genlmsg_end(skb, hdr); | ||
407 | |||
408 | nla_put_failure: | ||
409 | return genlmsg_cancel(skb, hdr); | ||
410 | } | ||
411 | |||
412 | static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) | ||
413 | { | ||
414 | |||
415 | int i, n = 0; | ||
416 | struct genl_family *rt; | ||
417 | int chains_to_skip = cb->args[0]; | ||
418 | int fams_to_skip = cb->args[1]; | ||
419 | |||
420 | for (i = 0; i < GENL_FAM_TAB_SIZE; i++) { | ||
421 | if (i < chains_to_skip) | ||
422 | continue; | ||
423 | n = 0; | ||
424 | list_for_each_entry(rt, genl_family_chain(i), family_list) { | ||
425 | if (++n < fams_to_skip) | ||
426 | continue; | ||
427 | if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).pid, | ||
428 | cb->nlh->nlmsg_seq, NLM_F_MULTI, | ||
429 | skb, CTRL_CMD_NEWFAMILY) < 0) | ||
430 | goto errout; | ||
431 | } | ||
432 | |||
433 | fams_to_skip = 0; | ||
434 | } | ||
435 | |||
436 | errout: | ||
437 | cb->args[0] = i; | ||
438 | cb->args[1] = n; | ||
439 | |||
440 | return skb->len; | ||
441 | } | ||
442 | |||
443 | static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid, | ||
444 | int seq, int cmd) | ||
445 | { | ||
446 | struct sk_buff *skb; | ||
447 | int err; | ||
448 | |||
449 | skb = nlmsg_new(NLMSG_GOODSIZE); | ||
450 | if (skb == NULL) | ||
451 | return ERR_PTR(-ENOBUFS); | ||
452 | |||
453 | err = ctrl_fill_info(family, pid, seq, 0, skb, cmd); | ||
454 | if (err < 0) { | ||
455 | nlmsg_free(skb); | ||
456 | return ERR_PTR(err); | ||
457 | } | ||
458 | |||
459 | return skb; | ||
460 | } | ||
461 | |||
462 | static struct nla_policy ctrl_policy[CTRL_ATTR_MAX+1] __read_mostly = { | ||
463 | [CTRL_ATTR_FAMILY_ID] = { .type = NLA_U16 }, | ||
464 | [CTRL_ATTR_FAMILY_NAME] = { .type = NLA_STRING }, | ||
465 | }; | ||
466 | |||
467 | static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info) | ||
468 | { | ||
469 | struct sk_buff *msg; | ||
470 | struct genl_family *res = NULL; | ||
471 | int err = -EINVAL; | ||
472 | |||
473 | if (info->attrs[CTRL_ATTR_FAMILY_ID]) { | ||
474 | u16 id = nla_get_u16(info->attrs[CTRL_ATTR_FAMILY_ID]); | ||
475 | res = genl_family_find_byid(id); | ||
476 | } | ||
477 | |||
478 | if (info->attrs[CTRL_ATTR_FAMILY_NAME]) { | ||
479 | char name[GENL_NAMSIZ]; | ||
480 | |||
481 | if (nla_strlcpy(name, info->attrs[CTRL_ATTR_FAMILY_NAME], | ||
482 | GENL_NAMSIZ) >= GENL_NAMSIZ) | ||
483 | goto errout; | ||
484 | |||
485 | res = genl_family_find_byname(name); | ||
486 | } | ||
487 | |||
488 | if (res == NULL) { | ||
489 | err = -ENOENT; | ||
490 | goto errout; | ||
491 | } | ||
492 | |||
493 | msg = ctrl_build_msg(res, info->snd_pid, info->snd_seq, | ||
494 | CTRL_CMD_NEWFAMILY); | ||
495 | if (IS_ERR(msg)) { | ||
496 | err = PTR_ERR(msg); | ||
497 | goto errout; | ||
498 | } | ||
499 | |||
500 | err = genlmsg_unicast(msg, info->snd_pid); | ||
501 | errout: | ||
502 | return err; | ||
503 | } | ||
504 | |||
505 | static int genl_ctrl_event(int event, void *data) | ||
506 | { | ||
507 | struct sk_buff *msg; | ||
508 | |||
509 | if (genl_sock == NULL) | ||
510 | return 0; | ||
511 | |||
512 | switch (event) { | ||
513 | case CTRL_CMD_NEWFAMILY: | ||
514 | case CTRL_CMD_DELFAMILY: | ||
515 | msg = ctrl_build_msg(data, 0, 0, event); | ||
516 | if (IS_ERR(msg)) | ||
517 | return PTR_ERR(msg); | ||
518 | |||
519 | genlmsg_multicast(msg, 0, GENL_ID_CTRL); | ||
520 | break; | ||
521 | } | ||
522 | |||
523 | return 0; | ||
524 | } | ||
525 | |||
526 | static struct genl_ops genl_ctrl_ops = { | ||
527 | .cmd = CTRL_CMD_GETFAMILY, | ||
528 | .doit = ctrl_getfamily, | ||
529 | .dumpit = ctrl_dumpfamily, | ||
530 | .policy = ctrl_policy, | ||
531 | }; | ||
532 | |||
533 | static struct genl_family genl_ctrl = { | ||
534 | .id = GENL_ID_CTRL, | ||
535 | .name = "nlctrl", | ||
536 | .version = 0x1, | ||
537 | .maxattr = CTRL_ATTR_MAX, | ||
538 | .owner = THIS_MODULE, | ||
539 | }; | ||
540 | |||
541 | static int __init genl_init(void) | ||
542 | { | ||
543 | int i, err; | ||
544 | |||
545 | for (i = 0; i < GENL_FAM_TAB_SIZE; i++) | ||
546 | INIT_LIST_HEAD(&family_ht[i]); | ||
547 | |||
548 | err = genl_register_family(&genl_ctrl); | ||
549 | if (err < 0) | ||
550 | goto errout; | ||
551 | |||
552 | err = genl_register_ops(&genl_ctrl, &genl_ctrl_ops); | ||
553 | if (err < 0) | ||
554 | goto errout_register; | ||
555 | |||
556 | netlink_set_nonroot(NETLINK_GENERIC, NL_NONROOT_RECV); | ||
557 | genl_sock = netlink_kernel_create(NETLINK_GENERIC, GENL_MAX_ID, | ||
558 | genl_rcv, THIS_MODULE); | ||
559 | if (genl_sock == NULL) { | ||
560 | panic("GENL: Cannot initialize generic netlink\n"); | ||
561 | return -ENOMEM; | ||
562 | } | ||
563 | |||
564 | return 0; | ||
565 | |||
566 | errout_register: | ||
567 | genl_unregister_family(&genl_ctrl); | ||
568 | errout: | ||
569 | panic("GENL: Cannot register controller: %d\n", err); | ||
570 | return err; | ||
571 | } | ||
572 | |||
573 | subsys_initcall(genl_init); | ||
574 | |||
575 | EXPORT_SYMBOL(genl_sock); | ||
576 | EXPORT_SYMBOL(genl_register_ops); | ||
577 | EXPORT_SYMBOL(genl_unregister_ops); | ||
578 | EXPORT_SYMBOL(genl_register_family); | ||
579 | EXPORT_SYMBOL(genl_unregister_family); | ||
diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c index 122c086ee2db..dbe6105e83a5 100644 --- a/net/rxrpc/transport.c +++ b/net/rxrpc/transport.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/in.h> | 23 | #include <linux/in.h> |
24 | #include <linux/in6.h> | 24 | #include <linux/in6.h> |
25 | #include <linux/icmp.h> | 25 | #include <linux/icmp.h> |
26 | #include <linux/skbuff.h> | ||
26 | #include <net/sock.h> | 27 | #include <net/sock.h> |
27 | #include <net/ip.h> | 28 | #include <net/ip.h> |
28 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) | 29 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) |
@@ -475,15 +476,11 @@ void rxrpc_trans_receive_packet(struct rxrpc_transport *trans) | |||
475 | 476 | ||
476 | /* we'll probably need to checksum it (didn't call | 477 | /* we'll probably need to checksum it (didn't call |
477 | * sock_recvmsg) */ | 478 | * sock_recvmsg) */ |
478 | if (pkt->ip_summed != CHECKSUM_UNNECESSARY) { | 479 | if (skb_checksum_complete(pkt)) { |
479 | if ((unsigned short) | 480 | kfree_skb(pkt); |
480 | csum_fold(skb_checksum(pkt, 0, pkt->len, | 481 | rxrpc_krxiod_queue_transport(trans); |
481 | pkt->csum))) { | 482 | _leave(" CSUM failed"); |
482 | kfree_skb(pkt); | 483 | return; |
483 | rxrpc_krxiod_queue_transport(trans); | ||
484 | _leave(" CSUM failed"); | ||
485 | return; | ||
486 | } | ||
487 | } | 484 | } |
488 | 485 | ||
489 | addr = pkt->nh.iph->saddr; | 486 | addr = pkt->nh.iph->saddr; |
diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 8c8ddf7f9b61..dec68a604773 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c | |||
@@ -128,9 +128,29 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a | |||
128 | */ | 128 | */ |
129 | asoc->max_burst = sctp_max_burst; | 129 | asoc->max_burst = sctp_max_burst; |
130 | 130 | ||
131 | /* Copy things from the endpoint. */ | 131 | /* initialize association timers */ |
132 | asoc->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0; | ||
133 | asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial; | ||
134 | asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial; | ||
135 | asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = asoc->rto_initial; | ||
136 | asoc->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0; | ||
137 | asoc->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = 0; | ||
138 | |||
139 | /* sctpimpguide Section 2.12.2 | ||
140 | * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the | ||
141 | * recommended value of 5 times 'RTO.Max'. | ||
142 | */ | ||
143 | asoc->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] | ||
144 | = 5 * asoc->rto_max; | ||
145 | |||
146 | asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; | ||
147 | asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = | ||
148 | SCTP_DEFAULT_TIMEOUT_SACK; | ||
149 | asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = | ||
150 | sp->autoclose * HZ; | ||
151 | |||
152 | /* Initilizes the timers */ | ||
132 | for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) { | 153 | for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) { |
133 | asoc->timeouts[i] = ep->timeouts[i]; | ||
134 | init_timer(&asoc->timers[i]); | 154 | init_timer(&asoc->timers[i]); |
135 | asoc->timers[i].function = sctp_timer_events[i]; | 155 | asoc->timers[i].function = sctp_timer_events[i]; |
136 | asoc->timers[i].data = (unsigned long) asoc; | 156 | asoc->timers[i].data = (unsigned long) asoc; |
@@ -157,10 +177,10 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a | |||
157 | * RFC 6 - A SCTP receiver MUST be able to receive a minimum of | 177 | * RFC 6 - A SCTP receiver MUST be able to receive a minimum of |
158 | * 1500 bytes in one SCTP packet. | 178 | * 1500 bytes in one SCTP packet. |
159 | */ | 179 | */ |
160 | if (sk->sk_rcvbuf < SCTP_DEFAULT_MINWINDOW) | 180 | if ((sk->sk_rcvbuf/2) < SCTP_DEFAULT_MINWINDOW) |
161 | asoc->rwnd = SCTP_DEFAULT_MINWINDOW; | 181 | asoc->rwnd = SCTP_DEFAULT_MINWINDOW; |
162 | else | 182 | else |
163 | asoc->rwnd = sk->sk_rcvbuf; | 183 | asoc->rwnd = sk->sk_rcvbuf/2; |
164 | 184 | ||
165 | asoc->a_rwnd = asoc->rwnd; | 185 | asoc->a_rwnd = asoc->rwnd; |
166 | 186 | ||
@@ -172,6 +192,9 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a | |||
172 | /* Set the sndbuf size for transmit. */ | 192 | /* Set the sndbuf size for transmit. */ |
173 | asoc->sndbuf_used = 0; | 193 | asoc->sndbuf_used = 0; |
174 | 194 | ||
195 | /* Initialize the receive memory counter */ | ||
196 | atomic_set(&asoc->rmem_alloc, 0); | ||
197 | |||
175 | init_waitqueue_head(&asoc->wait); | 198 | init_waitqueue_head(&asoc->wait); |
176 | 199 | ||
177 | asoc->c.my_vtag = sctp_generate_tag(ep); | 200 | asoc->c.my_vtag = sctp_generate_tag(ep); |
@@ -380,6 +403,8 @@ static void sctp_association_destroy(struct sctp_association *asoc) | |||
380 | spin_unlock_bh(&sctp_assocs_id_lock); | 403 | spin_unlock_bh(&sctp_assocs_id_lock); |
381 | } | 404 | } |
382 | 405 | ||
406 | BUG_TRAP(!atomic_read(&asoc->rmem_alloc)); | ||
407 | |||
383 | if (asoc->base.malloced) { | 408 | if (asoc->base.malloced) { |
384 | kfree(asoc); | 409 | kfree(asoc); |
385 | SCTP_DBG_OBJCNT_DEC(assoc); | 410 | SCTP_DBG_OBJCNT_DEC(assoc); |
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 96984f7a2d69..67bd53070ee0 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c | |||
@@ -70,7 +70,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, | |||
70 | struct sock *sk, | 70 | struct sock *sk, |
71 | gfp_t gfp) | 71 | gfp_t gfp) |
72 | { | 72 | { |
73 | struct sctp_sock *sp = sctp_sk(sk); | ||
74 | memset(ep, 0, sizeof(struct sctp_endpoint)); | 73 | memset(ep, 0, sizeof(struct sctp_endpoint)); |
75 | 74 | ||
76 | /* Initialize the base structure. */ | 75 | /* Initialize the base structure. */ |
@@ -100,33 +99,14 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, | |||
100 | /* Create the lists of associations. */ | 99 | /* Create the lists of associations. */ |
101 | INIT_LIST_HEAD(&ep->asocs); | 100 | INIT_LIST_HEAD(&ep->asocs); |
102 | 101 | ||
103 | /* Set up the base timeout information. */ | ||
104 | ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0; | ||
105 | ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = | ||
106 | msecs_to_jiffies(sp->rtoinfo.srto_initial); | ||
107 | ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = | ||
108 | msecs_to_jiffies(sp->rtoinfo.srto_initial); | ||
109 | ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = | ||
110 | msecs_to_jiffies(sp->rtoinfo.srto_initial); | ||
111 | ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0; | ||
112 | ep->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = 0; | ||
113 | |||
114 | /* sctpimpguide-05 Section 2.12.2 | ||
115 | * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the | ||
116 | * recommended value of 5 times 'RTO.Max'. | ||
117 | */ | ||
118 | ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] | ||
119 | = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max); | ||
120 | |||
121 | ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; | ||
122 | ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = sctp_sack_timeout; | ||
123 | ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; | ||
124 | |||
125 | /* Use SCTP specific send buffer space queues. */ | 102 | /* Use SCTP specific send buffer space queues. */ |
126 | ep->sndbuf_policy = sctp_sndbuf_policy; | 103 | ep->sndbuf_policy = sctp_sndbuf_policy; |
127 | sk->sk_write_space = sctp_write_space; | 104 | sk->sk_write_space = sctp_write_space; |
128 | sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); | 105 | sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); |
129 | 106 | ||
107 | /* Get the receive buffer policy for this endpoint */ | ||
108 | ep->rcvbuf_policy = sctp_rcvbuf_policy; | ||
109 | |||
130 | /* Initialize the secret key used with cookie. */ | 110 | /* Initialize the secret key used with cookie. */ |
131 | get_random_bytes(&ep->secret_key[0], SCTP_SECRET_SIZE); | 111 | get_random_bytes(&ep->secret_key[0], SCTP_SECRET_SIZE); |
132 | ep->last_key = ep->current_key = 0; | 112 | ep->last_key = ep->current_key = 0; |
diff --git a/net/sctp/input.c b/net/sctp/input.c index 28f32243397f..b24ff2c1aef5 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c | |||
@@ -100,21 +100,6 @@ static inline int sctp_rcv_checksum(struct sk_buff *skb) | |||
100 | return 0; | 100 | return 0; |
101 | } | 101 | } |
102 | 102 | ||
103 | /* The free routine for skbuffs that sctp receives */ | ||
104 | static void sctp_rfree(struct sk_buff *skb) | ||
105 | { | ||
106 | atomic_sub(sizeof(struct sctp_chunk),&skb->sk->sk_rmem_alloc); | ||
107 | sock_rfree(skb); | ||
108 | } | ||
109 | |||
110 | /* The ownership wrapper routine to do receive buffer accounting */ | ||
111 | static void sctp_rcv_set_owner_r(struct sk_buff *skb, struct sock *sk) | ||
112 | { | ||
113 | skb_set_owner_r(skb,sk); | ||
114 | skb->destructor = sctp_rfree; | ||
115 | atomic_add(sizeof(struct sctp_chunk),&sk->sk_rmem_alloc); | ||
116 | } | ||
117 | |||
118 | struct sctp_input_cb { | 103 | struct sctp_input_cb { |
119 | union { | 104 | union { |
120 | struct inet_skb_parm h4; | 105 | struct inet_skb_parm h4; |
@@ -217,9 +202,6 @@ int sctp_rcv(struct sk_buff *skb) | |||
217 | rcvr = &ep->base; | 202 | rcvr = &ep->base; |
218 | } | 203 | } |
219 | 204 | ||
220 | if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) | ||
221 | goto discard_release; | ||
222 | |||
223 | /* | 205 | /* |
224 | * RFC 2960, 8.4 - Handle "Out of the blue" Packets. | 206 | * RFC 2960, 8.4 - Handle "Out of the blue" Packets. |
225 | * An SCTP packet is called an "out of the blue" (OOTB) | 207 | * An SCTP packet is called an "out of the blue" (OOTB) |
@@ -256,8 +238,6 @@ int sctp_rcv(struct sk_buff *skb) | |||
256 | } | 238 | } |
257 | SCTP_INPUT_CB(skb)->chunk = chunk; | 239 | SCTP_INPUT_CB(skb)->chunk = chunk; |
258 | 240 | ||
259 | sctp_rcv_set_owner_r(skb,sk); | ||
260 | |||
261 | /* Remember what endpoint is to handle this packet. */ | 241 | /* Remember what endpoint is to handle this packet. */ |
262 | chunk->rcvr = rcvr; | 242 | chunk->rcvr = rcvr; |
263 | 243 | ||
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 26de4d3e1bd9..f775d78aa59d 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c | |||
@@ -530,6 +530,9 @@ static void sctp_v4_get_saddr(struct sctp_association *asoc, | |||
530 | { | 530 | { |
531 | struct rtable *rt = (struct rtable *)dst; | 531 | struct rtable *rt = (struct rtable *)dst; |
532 | 532 | ||
533 | if (!asoc) | ||
534 | return; | ||
535 | |||
533 | if (rt) { | 536 | if (rt) { |
534 | saddr->v4.sin_family = AF_INET; | 537 | saddr->v4.sin_family = AF_INET; |
535 | saddr->v4.sin_port = asoc->base.bind_addr.port; | 538 | saddr->v4.sin_port = asoc->base.bind_addr.port; |
@@ -1047,6 +1050,9 @@ SCTP_STATIC __init int sctp_init(void) | |||
1047 | /* Sendbuffer growth - do per-socket accounting */ | 1050 | /* Sendbuffer growth - do per-socket accounting */ |
1048 | sctp_sndbuf_policy = 0; | 1051 | sctp_sndbuf_policy = 0; |
1049 | 1052 | ||
1053 | /* Rcvbuffer growth - do per-socket accounting */ | ||
1054 | sctp_rcvbuf_policy = 0; | ||
1055 | |||
1050 | /* HB.interval - 30 seconds */ | 1056 | /* HB.interval - 30 seconds */ |
1051 | sctp_hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT; | 1057 | sctp_hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT; |
1052 | 1058 | ||
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index f84173ea8ec1..823947170a33 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c | |||
@@ -385,7 +385,7 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = { | |||
385 | NULL, | 385 | NULL, |
386 | sctp_generate_t4_rto_event, | 386 | sctp_generate_t4_rto_event, |
387 | sctp_generate_t5_shutdown_guard_event, | 387 | sctp_generate_t5_shutdown_guard_event, |
388 | sctp_generate_heartbeat_event, | 388 | NULL, |
389 | sctp_generate_sack_event, | 389 | sctp_generate_sack_event, |
390 | sctp_generate_autoclose_event, | 390 | sctp_generate_autoclose_event, |
391 | }; | 391 | }; |
@@ -689,9 +689,9 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds, | |||
689 | * increased due to timer expirations. | 689 | * increased due to timer expirations. |
690 | */ | 690 | */ |
691 | asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = | 691 | asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = |
692 | asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT]; | 692 | asoc->rto_initial; |
693 | asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = | 693 | asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = |
694 | asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE]; | 694 | asoc->rto_initial; |
695 | } | 695 | } |
696 | 696 | ||
697 | if (sctp_state(asoc, ESTABLISHED) || | 697 | if (sctp_state(asoc, ESTABLISHED) || |
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 505c7de10c50..475bfb4972d9 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c | |||
@@ -5160,6 +5160,8 @@ static int sctp_eat_data(const struct sctp_association *asoc, | |||
5160 | sctp_verb_t deliver; | 5160 | sctp_verb_t deliver; |
5161 | int tmp; | 5161 | int tmp; |
5162 | __u32 tsn; | 5162 | __u32 tsn; |
5163 | int account_value; | ||
5164 | struct sock *sk = asoc->base.sk; | ||
5163 | 5165 | ||
5164 | data_hdr = chunk->subh.data_hdr = (sctp_datahdr_t *)chunk->skb->data; | 5166 | data_hdr = chunk->subh.data_hdr = (sctp_datahdr_t *)chunk->skb->data; |
5165 | skb_pull(chunk->skb, sizeof(sctp_datahdr_t)); | 5167 | skb_pull(chunk->skb, sizeof(sctp_datahdr_t)); |
@@ -5169,6 +5171,26 @@ static int sctp_eat_data(const struct sctp_association *asoc, | |||
5169 | 5171 | ||
5170 | /* ASSERT: Now skb->data is really the user data. */ | 5172 | /* ASSERT: Now skb->data is really the user data. */ |
5171 | 5173 | ||
5174 | /* | ||
5175 | * if we are established, and we have used up our receive | ||
5176 | * buffer memory, drop the frame | ||
5177 | */ | ||
5178 | if (asoc->state == SCTP_STATE_ESTABLISHED) { | ||
5179 | /* | ||
5180 | * If the receive buffer policy is 1, then each | ||
5181 | * association can allocate up to sk_rcvbuf bytes | ||
5182 | * otherwise, all the associations in aggregate | ||
5183 | * may allocate up to sk_rcvbuf bytes | ||
5184 | */ | ||
5185 | if (asoc->ep->rcvbuf_policy) | ||
5186 | account_value = atomic_read(&asoc->rmem_alloc); | ||
5187 | else | ||
5188 | account_value = atomic_read(&sk->sk_rmem_alloc); | ||
5189 | |||
5190 | if (account_value > sk->sk_rcvbuf) | ||
5191 | return SCTP_IERROR_IGNORE_TSN; | ||
5192 | } | ||
5193 | |||
5172 | /* Process ECN based congestion. | 5194 | /* Process ECN based congestion. |
5173 | * | 5195 | * |
5174 | * Since the chunk structure is reused for all chunks within | 5196 | * Since the chunk structure is reused for all chunks within |
diff --git a/net/sctp/socket.c b/net/sctp/socket.c index b529af5e6f2a..abab81f3818f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c | |||
@@ -1932,7 +1932,6 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, | |||
1932 | if (copy_from_user(&sp->autoclose, optval, optlen)) | 1932 | if (copy_from_user(&sp->autoclose, optval, optlen)) |
1933 | return -EFAULT; | 1933 | return -EFAULT; |
1934 | 1934 | ||
1935 | sp->ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; | ||
1936 | return 0; | 1935 | return 0; |
1937 | } | 1936 | } |
1938 | 1937 | ||
@@ -5115,8 +5114,10 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, | |||
5115 | sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) { | 5114 | sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) { |
5116 | event = sctp_skb2event(skb); | 5115 | event = sctp_skb2event(skb); |
5117 | if (event->asoc == assoc) { | 5116 | if (event->asoc == assoc) { |
5117 | sock_rfree(skb); | ||
5118 | __skb_unlink(skb, &oldsk->sk_receive_queue); | 5118 | __skb_unlink(skb, &oldsk->sk_receive_queue); |
5119 | __skb_queue_tail(&newsk->sk_receive_queue, skb); | 5119 | __skb_queue_tail(&newsk->sk_receive_queue, skb); |
5120 | skb_set_owner_r(skb, newsk); | ||
5120 | } | 5121 | } |
5121 | } | 5122 | } |
5122 | 5123 | ||
@@ -5144,8 +5145,10 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, | |||
5144 | sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) { | 5145 | sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) { |
5145 | event = sctp_skb2event(skb); | 5146 | event = sctp_skb2event(skb); |
5146 | if (event->asoc == assoc) { | 5147 | if (event->asoc == assoc) { |
5148 | sock_rfree(skb); | ||
5147 | __skb_unlink(skb, &oldsp->pd_lobby); | 5149 | __skb_unlink(skb, &oldsp->pd_lobby); |
5148 | __skb_queue_tail(queue, skb); | 5150 | __skb_queue_tail(queue, skb); |
5151 | skb_set_owner_r(skb, newsk); | ||
5149 | } | 5152 | } |
5150 | } | 5153 | } |
5151 | 5154 | ||
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 75b28dd634fe..fcd7096c953d 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c | |||
@@ -121,6 +121,14 @@ static ctl_table sctp_table[] = { | |||
121 | .proc_handler = &proc_dointvec | 121 | .proc_handler = &proc_dointvec |
122 | }, | 122 | }, |
123 | { | 123 | { |
124 | .ctl_name = NET_SCTP_RCVBUF_POLICY, | ||
125 | .procname = "rcvbuf_policy", | ||
126 | .data = &sctp_rcvbuf_policy, | ||
127 | .maxlen = sizeof(int), | ||
128 | .mode = 0644, | ||
129 | .proc_handler = &proc_dointvec | ||
130 | }, | ||
131 | { | ||
124 | .ctl_name = NET_SCTP_PATH_MAX_RETRANS, | 132 | .ctl_name = NET_SCTP_PATH_MAX_RETRANS, |
125 | .procname = "path_max_retrans", | 133 | .procname = "path_max_retrans", |
126 | .data = &sctp_max_retrans_path, | 134 | .data = &sctp_max_retrans_path, |
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index e049f41faa47..ba97f974f57c 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c | |||
@@ -52,19 +52,6 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, | |||
52 | struct sctp_association *asoc); | 52 | struct sctp_association *asoc); |
53 | static void sctp_ulpevent_release_data(struct sctp_ulpevent *event); | 53 | static void sctp_ulpevent_release_data(struct sctp_ulpevent *event); |
54 | 54 | ||
55 | /* Stub skb destructor. */ | ||
56 | static void sctp_stub_rfree(struct sk_buff *skb) | ||
57 | { | ||
58 | /* WARNING: This function is just a warning not to use the | ||
59 | * skb destructor. If the skb is shared, we may get the destructor | ||
60 | * callback on some processor that does not own the sock_lock. This | ||
61 | * was occuring with PACKET socket applications that were monitoring | ||
62 | * our skbs. We can't take the sock_lock, because we can't risk | ||
63 | * recursing if we do really own the sock lock. Instead, do all | ||
64 | * of our rwnd manipulation while we own the sock_lock outright. | ||
65 | */ | ||
66 | } | ||
67 | |||
68 | /* Initialize an ULP event from an given skb. */ | 55 | /* Initialize an ULP event from an given skb. */ |
69 | SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags) | 56 | SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags) |
70 | { | 57 | { |
@@ -111,15 +98,19 @@ static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event, | |||
111 | */ | 98 | */ |
112 | sctp_association_hold((struct sctp_association *)asoc); | 99 | sctp_association_hold((struct sctp_association *)asoc); |
113 | skb = sctp_event2skb(event); | 100 | skb = sctp_event2skb(event); |
114 | skb->sk = asoc->base.sk; | ||
115 | event->asoc = (struct sctp_association *)asoc; | 101 | event->asoc = (struct sctp_association *)asoc; |
116 | skb->destructor = sctp_stub_rfree; | 102 | atomic_add(skb->truesize, &event->asoc->rmem_alloc); |
103 | skb_set_owner_r(skb, asoc->base.sk); | ||
117 | } | 104 | } |
118 | 105 | ||
119 | /* A simple destructor to give up the reference to the association. */ | 106 | /* A simple destructor to give up the reference to the association. */ |
120 | static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event) | 107 | static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event) |
121 | { | 108 | { |
122 | sctp_association_put(event->asoc); | 109 | struct sctp_association *asoc = event->asoc; |
110 | struct sk_buff *skb = sctp_event2skb(event); | ||
111 | |||
112 | atomic_sub(skb->truesize, &asoc->rmem_alloc); | ||
113 | sctp_association_put(asoc); | ||
123 | } | 114 | } |
124 | 115 | ||
125 | /* Create and initialize an SCTP_ASSOC_CHANGE event. | 116 | /* Create and initialize an SCTP_ASSOC_CHANGE event. |
@@ -922,7 +913,6 @@ done: | |||
922 | /* Free a ulpevent that has an owner. It includes releasing the reference | 913 | /* Free a ulpevent that has an owner. It includes releasing the reference |
923 | * to the owner, updating the rwnd in case of a DATA event and freeing the | 914 | * to the owner, updating the rwnd in case of a DATA event and freeing the |
924 | * skb. | 915 | * skb. |
925 | * See comments in sctp_stub_rfree(). | ||
926 | */ | 916 | */ |
927 | void sctp_ulpevent_free(struct sctp_ulpevent *event) | 917 | void sctp_ulpevent_free(struct sctp_ulpevent *event) |
928 | { | 918 | { |
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 702ede309b06..61c3abeaccae 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c | |||
@@ -55,6 +55,7 @@ static void call_bind(struct rpc_task *task); | |||
55 | static void call_bind_status(struct rpc_task *task); | 55 | static void call_bind_status(struct rpc_task *task); |
56 | static void call_transmit(struct rpc_task *task); | 56 | static void call_transmit(struct rpc_task *task); |
57 | static void call_status(struct rpc_task *task); | 57 | static void call_status(struct rpc_task *task); |
58 | static void call_transmit_status(struct rpc_task *task); | ||
58 | static void call_refresh(struct rpc_task *task); | 59 | static void call_refresh(struct rpc_task *task); |
59 | static void call_refreshresult(struct rpc_task *task); | 60 | static void call_refreshresult(struct rpc_task *task); |
60 | static void call_timeout(struct rpc_task *task); | 61 | static void call_timeout(struct rpc_task *task); |
@@ -672,6 +673,18 @@ call_allocate(struct rpc_task *task) | |||
672 | rpc_exit(task, -ERESTARTSYS); | 673 | rpc_exit(task, -ERESTARTSYS); |
673 | } | 674 | } |
674 | 675 | ||
676 | static inline int | ||
677 | rpc_task_need_encode(struct rpc_task *task) | ||
678 | { | ||
679 | return task->tk_rqstp->rq_snd_buf.len == 0; | ||
680 | } | ||
681 | |||
682 | static inline void | ||
683 | rpc_task_force_reencode(struct rpc_task *task) | ||
684 | { | ||
685 | task->tk_rqstp->rq_snd_buf.len = 0; | ||
686 | } | ||
687 | |||
675 | /* | 688 | /* |
676 | * 3. Encode arguments of an RPC call | 689 | * 3. Encode arguments of an RPC call |
677 | */ | 690 | */ |
@@ -867,12 +880,14 @@ call_transmit(struct rpc_task *task) | |||
867 | if (task->tk_status != 0) | 880 | if (task->tk_status != 0) |
868 | return; | 881 | return; |
869 | /* Encode here so that rpcsec_gss can use correct sequence number. */ | 882 | /* Encode here so that rpcsec_gss can use correct sequence number. */ |
870 | if (task->tk_rqstp->rq_bytes_sent == 0) { | 883 | if (rpc_task_need_encode(task)) { |
884 | task->tk_rqstp->rq_bytes_sent = 0; | ||
871 | call_encode(task); | 885 | call_encode(task); |
872 | /* Did the encode result in an error condition? */ | 886 | /* Did the encode result in an error condition? */ |
873 | if (task->tk_status != 0) | 887 | if (task->tk_status != 0) |
874 | goto out_nosend; | 888 | goto out_nosend; |
875 | } | 889 | } |
890 | task->tk_action = call_transmit_status; | ||
876 | xprt_transmit(task); | 891 | xprt_transmit(task); |
877 | if (task->tk_status < 0) | 892 | if (task->tk_status < 0) |
878 | return; | 893 | return; |
@@ -884,6 +899,7 @@ call_transmit(struct rpc_task *task) | |||
884 | out_nosend: | 899 | out_nosend: |
885 | /* release socket write lock before attempting to handle error */ | 900 | /* release socket write lock before attempting to handle error */ |
886 | xprt_abort_transmit(task); | 901 | xprt_abort_transmit(task); |
902 | rpc_task_force_reencode(task); | ||
887 | } | 903 | } |
888 | 904 | ||
889 | /* | 905 | /* |
@@ -915,7 +931,6 @@ call_status(struct rpc_task *task) | |||
915 | break; | 931 | break; |
916 | case -ECONNREFUSED: | 932 | case -ECONNREFUSED: |
917 | case -ENOTCONN: | 933 | case -ENOTCONN: |
918 | req->rq_bytes_sent = 0; | ||
919 | if (clnt->cl_autobind) | 934 | if (clnt->cl_autobind) |
920 | clnt->cl_port = 0; | 935 | clnt->cl_port = 0; |
921 | task->tk_action = call_bind; | 936 | task->tk_action = call_bind; |
@@ -937,7 +952,18 @@ call_status(struct rpc_task *task) | |||
937 | } | 952 | } |
938 | 953 | ||
939 | /* | 954 | /* |
940 | * 6a. Handle RPC timeout | 955 | * 6a. Handle transmission errors. |
956 | */ | ||
957 | static void | ||
958 | call_transmit_status(struct rpc_task *task) | ||
959 | { | ||
960 | if (task->tk_status != -EAGAIN) | ||
961 | rpc_task_force_reencode(task); | ||
962 | call_status(task); | ||
963 | } | ||
964 | |||
965 | /* | ||
966 | * 6b. Handle RPC timeout | ||
941 | * We do not release the request slot, so we keep using the | 967 | * We do not release the request slot, so we keep using the |
942 | * same XID for all retransmits. | 968 | * same XID for all retransmits. |
943 | */ | 969 | */ |
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 4f188d0a5d11..81e00a6c19de 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c | |||
@@ -603,7 +603,7 @@ rpc_lookup_negative(char *path, struct nameidata *nd) | |||
603 | return ERR_PTR(error); | 603 | return ERR_PTR(error); |
604 | dir = nd->dentry->d_inode; | 604 | dir = nd->dentry->d_inode; |
605 | down(&dir->i_sem); | 605 | down(&dir->i_sem); |
606 | dentry = lookup_hash(&nd->last, nd->dentry); | 606 | dentry = lookup_hash(nd); |
607 | if (IS_ERR(dentry)) | 607 | if (IS_ERR(dentry)) |
608 | goto out_err; | 608 | goto out_err; |
609 | if (dentry->d_inode) { | 609 | if (dentry->d_inode) { |
@@ -665,7 +665,7 @@ rpc_rmdir(char *path) | |||
665 | return error; | 665 | return error; |
666 | dir = nd.dentry->d_inode; | 666 | dir = nd.dentry->d_inode; |
667 | down(&dir->i_sem); | 667 | down(&dir->i_sem); |
668 | dentry = lookup_hash(&nd.last, nd.dentry); | 668 | dentry = lookup_hash(&nd); |
669 | if (IS_ERR(dentry)) { | 669 | if (IS_ERR(dentry)) { |
670 | error = PTR_ERR(dentry); | 670 | error = PTR_ERR(dentry); |
671 | goto out_release; | 671 | goto out_release; |
@@ -726,7 +726,7 @@ rpc_unlink(char *path) | |||
726 | return error; | 726 | return error; |
727 | dir = nd.dentry->d_inode; | 727 | dir = nd.dentry->d_inode; |
728 | down(&dir->i_sem); | 728 | down(&dir->i_sem); |
729 | dentry = lookup_hash(&nd.last, nd.dentry); | 729 | dentry = lookup_hash(&nd); |
730 | if (IS_ERR(dentry)) { | 730 | if (IS_ERR(dentry)) { |
731 | error = PTR_ERR(dentry); | 731 | error = PTR_ERR(dentry); |
732 | goto out_release; | 732 | goto out_release; |
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 8f97e90f36c8..eb330d4f66d6 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c | |||
@@ -6,6 +6,9 @@ | |||
6 | * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> | 6 | * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/compiler.h> | ||
10 | #include <linux/netdevice.h> | ||
11 | #include <linux/skbuff.h> | ||
9 | #include <linux/types.h> | 12 | #include <linux/types.h> |
10 | #include <linux/pagemap.h> | 13 | #include <linux/pagemap.h> |
11 | #include <linux/udp.h> | 14 | #include <linux/udp.h> |
@@ -165,6 +168,8 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) | |||
165 | return -1; | 168 | return -1; |
166 | if ((unsigned short)csum_fold(desc.csum)) | 169 | if ((unsigned short)csum_fold(desc.csum)) |
167 | return -1; | 170 | return -1; |
171 | if (unlikely(skb->ip_summed == CHECKSUM_HW)) | ||
172 | netdev_rx_csum_fault(skb->dev); | ||
168 | return 0; | 173 | return 0; |
169 | no_checksum: | 174 | no_checksum: |
170 | if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0) | 175 | if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0) |
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index f16e7cdd6150..e50e7cf43737 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c | |||
@@ -623,12 +623,9 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) | |||
623 | /* we can use it in-place */ | 623 | /* we can use it in-place */ |
624 | rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); | 624 | rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); |
625 | rqstp->rq_arg.head[0].iov_len = len; | 625 | rqstp->rq_arg.head[0].iov_len = len; |
626 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) { | 626 | if (skb_checksum_complete(skb)) { |
627 | if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { | 627 | skb_free_datagram(svsk->sk_sk, skb); |
628 | skb_free_datagram(svsk->sk_sk, skb); | 628 | return 0; |
629 | return 0; | ||
630 | } | ||
631 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
632 | } | 629 | } |
633 | rqstp->rq_skbuff = skb; | 630 | rqstp->rq_skbuff = skb; |
634 | } | 631 | } |
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 41feca3bef86..acc73ba8bade 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c | |||
@@ -676,7 +676,7 @@ static struct sock *unix_find_other(struct sockaddr_un *sunname, int len, | |||
676 | err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd); | 676 | err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd); |
677 | if (err) | 677 | if (err) |
678 | goto fail; | 678 | goto fail; |
679 | err = permission(nd.dentry->d_inode,MAY_WRITE, &nd); | 679 | err = vfs_permission(&nd, MAY_WRITE); |
680 | if (err) | 680 | if (err) |
681 | goto put_fail; | 681 | goto put_fail; |
682 | 682 | ||
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index c35336a0f71b..0cdd9a07e043 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c | |||
@@ -18,7 +18,6 @@ | |||
18 | #include <linux/string.h> | 18 | #include <linux/string.h> |
19 | #include <linux/net.h> | 19 | #include <linux/net.h> |
20 | #include <linux/skbuff.h> | 20 | #include <linux/skbuff.h> |
21 | #include <linux/netlink.h> | ||
22 | #include <linux/rtnetlink.h> | 21 | #include <linux/rtnetlink.h> |
23 | #include <linux/pfkeyv2.h> | 22 | #include <linux/pfkeyv2.h> |
24 | #include <linux/ipsec.h> | 23 | #include <linux/ipsec.h> |
@@ -26,6 +25,7 @@ | |||
26 | #include <linux/security.h> | 25 | #include <linux/security.h> |
27 | #include <net/sock.h> | 26 | #include <net/sock.h> |
28 | #include <net/xfrm.h> | 27 | #include <net/xfrm.h> |
28 | #include <net/netlink.h> | ||
29 | #include <asm/uaccess.h> | 29 | #include <asm/uaccess.h> |
30 | 30 | ||
31 | static struct sock *xfrm_nl; | 31 | static struct sock *xfrm_nl; |
@@ -948,11 +948,6 @@ static struct xfrm_link { | |||
948 | [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_flush_policy }, | 948 | [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_flush_policy }, |
949 | }; | 949 | }; |
950 | 950 | ||
951 | static int xfrm_done(struct netlink_callback *cb) | ||
952 | { | ||
953 | return 0; | ||
954 | } | ||
955 | |||
956 | static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) | 951 | static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) |
957 | { | 952 | { |
958 | struct rtattr *xfrma[XFRMA_MAX]; | 953 | struct rtattr *xfrma[XFRMA_MAX]; |
@@ -984,20 +979,15 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *err | |||
984 | if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) || | 979 | if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) || |
985 | type == (XFRM_MSG_GETPOLICY - XFRM_MSG_BASE)) && | 980 | type == (XFRM_MSG_GETPOLICY - XFRM_MSG_BASE)) && |
986 | (nlh->nlmsg_flags & NLM_F_DUMP)) { | 981 | (nlh->nlmsg_flags & NLM_F_DUMP)) { |
987 | u32 rlen; | ||
988 | |||
989 | if (link->dump == NULL) | 982 | if (link->dump == NULL) |
990 | goto err_einval; | 983 | goto err_einval; |
991 | 984 | ||
992 | if ((*errp = netlink_dump_start(xfrm_nl, skb, nlh, | 985 | if ((*errp = netlink_dump_start(xfrm_nl, skb, nlh, |
993 | link->dump, | 986 | link->dump, NULL)) != 0) { |
994 | xfrm_done)) != 0) { | ||
995 | return -1; | 987 | return -1; |
996 | } | 988 | } |
997 | rlen = NLMSG_ALIGN(nlh->nlmsg_len); | 989 | |
998 | if (rlen > skb->len) | 990 | netlink_queue_skip(nlh, skb); |
999 | rlen = skb->len; | ||
1000 | skb_pull(skb, rlen); | ||
1001 | return -1; | 991 | return -1; |
1002 | } | 992 | } |
1003 | 993 | ||
@@ -1032,60 +1022,13 @@ err_einval: | |||
1032 | return -1; | 1022 | return -1; |
1033 | } | 1023 | } |
1034 | 1024 | ||
1035 | static int xfrm_user_rcv_skb(struct sk_buff *skb) | ||
1036 | { | ||
1037 | int err; | ||
1038 | struct nlmsghdr *nlh; | ||
1039 | |||
1040 | while (skb->len >= NLMSG_SPACE(0)) { | ||
1041 | u32 rlen; | ||
1042 | |||
1043 | nlh = (struct nlmsghdr *) skb->data; | ||
1044 | if (nlh->nlmsg_len < sizeof(*nlh) || | ||
1045 | skb->len < nlh->nlmsg_len) | ||
1046 | return 0; | ||
1047 | rlen = NLMSG_ALIGN(nlh->nlmsg_len); | ||
1048 | if (rlen > skb->len) | ||
1049 | rlen = skb->len; | ||
1050 | if (xfrm_user_rcv_msg(skb, nlh, &err) < 0) { | ||
1051 | if (err == 0) | ||
1052 | return -1; | ||
1053 | netlink_ack(skb, nlh, err); | ||
1054 | } else if (nlh->nlmsg_flags & NLM_F_ACK) | ||
1055 | netlink_ack(skb, nlh, 0); | ||
1056 | skb_pull(skb, rlen); | ||
1057 | } | ||
1058 | |||
1059 | return 0; | ||
1060 | } | ||
1061 | |||
1062 | static void xfrm_netlink_rcv(struct sock *sk, int len) | 1025 | static void xfrm_netlink_rcv(struct sock *sk, int len) |
1063 | { | 1026 | { |
1064 | unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); | 1027 | unsigned int qlen = 0; |
1065 | 1028 | ||
1066 | do { | 1029 | do { |
1067 | struct sk_buff *skb; | ||
1068 | |||
1069 | down(&xfrm_cfg_sem); | 1030 | down(&xfrm_cfg_sem); |
1070 | 1031 | netlink_run_queue(sk, &qlen, &xfrm_user_rcv_msg); | |
1071 | if (qlen > skb_queue_len(&sk->sk_receive_queue)) | ||
1072 | qlen = skb_queue_len(&sk->sk_receive_queue); | ||
1073 | |||
1074 | for (; qlen; qlen--) { | ||
1075 | skb = skb_dequeue(&sk->sk_receive_queue); | ||
1076 | if (xfrm_user_rcv_skb(skb)) { | ||
1077 | if (skb->len) | ||
1078 | skb_queue_head(&sk->sk_receive_queue, | ||
1079 | skb); | ||
1080 | else { | ||
1081 | kfree_skb(skb); | ||
1082 | qlen--; | ||
1083 | } | ||
1084 | break; | ||
1085 | } | ||
1086 | kfree_skb(skb); | ||
1087 | } | ||
1088 | |||
1089 | up(&xfrm_cfg_sem); | 1032 | up(&xfrm_cfg_sem); |
1090 | 1033 | ||
1091 | } while (qlen); | 1034 | } while (qlen); |