diff options
Diffstat (limited to 'net/ipv4')
63 files changed, 1330 insertions, 651 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index e55136ae09f4..011cca7ae02b 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig | |||
@@ -456,6 +456,14 @@ config TCP_CONG_BIC | |||
456 | increase provides TCP friendliness. | 456 | increase provides TCP friendliness. |
457 | See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ | 457 | See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ |
458 | 458 | ||
459 | config TCP_CONG_CUBIC | ||
460 | tristate "CUBIC TCP" | ||
461 | default m | ||
462 | ---help--- | ||
463 | This is version 2.0 of BIC-TCP which uses a cubic growth function | ||
464 | among other techniques. | ||
465 | See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf | ||
466 | |||
459 | config TCP_CONG_WESTWOOD | 467 | config TCP_CONG_WESTWOOD |
460 | tristate "TCP Westwood+" | 468 | tristate "TCP Westwood+" |
461 | default m | 469 | default m |
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index f0435d00db6b..c54edd76de09 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile | |||
@@ -34,6 +34,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o | |||
34 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o | 34 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o |
35 | obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o | 35 | obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o |
36 | obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o | 36 | obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o |
37 | obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o | ||
37 | obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o | 38 | obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o |
38 | obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o | 39 | obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o |
39 | obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o | 40 | obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o |
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d368cf249000..966a071a408c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c | |||
@@ -93,6 +93,7 @@ | |||
93 | #include <linux/smp_lock.h> | 93 | #include <linux/smp_lock.h> |
94 | #include <linux/inet.h> | 94 | #include <linux/inet.h> |
95 | #include <linux/igmp.h> | 95 | #include <linux/igmp.h> |
96 | #include <linux/inetdevice.h> | ||
96 | #include <linux/netdevice.h> | 97 | #include <linux/netdevice.h> |
97 | #include <net/ip.h> | 98 | #include <net/ip.h> |
98 | #include <net/protocol.h> | 99 | #include <net/protocol.h> |
@@ -302,6 +303,7 @@ lookup_protocol: | |||
302 | sk->sk_reuse = 1; | 303 | sk->sk_reuse = 1; |
303 | 304 | ||
304 | inet = inet_sk(sk); | 305 | inet = inet_sk(sk); |
306 | inet->is_icsk = INET_PROTOSW_ICSK & answer_flags; | ||
305 | 307 | ||
306 | if (SOCK_RAW == sock->type) { | 308 | if (SOCK_RAW == sock->type) { |
307 | inet->num = protocol; | 309 | inet->num = protocol; |
@@ -775,16 +777,16 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) | |||
775 | err = devinet_ioctl(cmd, (void __user *)arg); | 777 | err = devinet_ioctl(cmd, (void __user *)arg); |
776 | break; | 778 | break; |
777 | default: | 779 | default: |
778 | if (!sk->sk_prot->ioctl || | 780 | if (sk->sk_prot->ioctl) |
779 | (err = sk->sk_prot->ioctl(sk, cmd, arg)) == | 781 | err = sk->sk_prot->ioctl(sk, cmd, arg); |
780 | -ENOIOCTLCMD) | 782 | else |
781 | err = dev_ioctl(cmd, (void __user *)arg); | 783 | err = -ENOIOCTLCMD; |
782 | break; | 784 | break; |
783 | } | 785 | } |
784 | return err; | 786 | return err; |
785 | } | 787 | } |
786 | 788 | ||
787 | struct proto_ops inet_stream_ops = { | 789 | const struct proto_ops inet_stream_ops = { |
788 | .family = PF_INET, | 790 | .family = PF_INET, |
789 | .owner = THIS_MODULE, | 791 | .owner = THIS_MODULE, |
790 | .release = inet_release, | 792 | .release = inet_release, |
@@ -805,7 +807,7 @@ struct proto_ops inet_stream_ops = { | |||
805 | .sendpage = tcp_sendpage | 807 | .sendpage = tcp_sendpage |
806 | }; | 808 | }; |
807 | 809 | ||
808 | struct proto_ops inet_dgram_ops = { | 810 | const struct proto_ops inet_dgram_ops = { |
809 | .family = PF_INET, | 811 | .family = PF_INET, |
810 | .owner = THIS_MODULE, | 812 | .owner = THIS_MODULE, |
811 | .release = inet_release, | 813 | .release = inet_release, |
@@ -830,7 +832,7 @@ struct proto_ops inet_dgram_ops = { | |||
830 | * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without | 832 | * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without |
831 | * udp_poll | 833 | * udp_poll |
832 | */ | 834 | */ |
833 | static struct proto_ops inet_sockraw_ops = { | 835 | static const struct proto_ops inet_sockraw_ops = { |
834 | .family = PF_INET, | 836 | .family = PF_INET, |
835 | .owner = THIS_MODULE, | 837 | .owner = THIS_MODULE, |
836 | .release = inet_release, | 838 | .release = inet_release, |
@@ -869,7 +871,8 @@ static struct inet_protosw inetsw_array[] = | |||
869 | .ops = &inet_stream_ops, | 871 | .ops = &inet_stream_ops, |
870 | .capability = -1, | 872 | .capability = -1, |
871 | .no_check = 0, | 873 | .no_check = 0, |
872 | .flags = INET_PROTOSW_PERMANENT, | 874 | .flags = INET_PROTOSW_PERMANENT | |
875 | INET_PROTOSW_ICSK, | ||
873 | }, | 876 | }, |
874 | 877 | ||
875 | { | 878 | { |
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 035ad2c9e1ba..aed537fa2c88 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c | |||
@@ -6,6 +6,7 @@ | |||
6 | #include <linux/crypto.h> | 6 | #include <linux/crypto.h> |
7 | #include <linux/pfkeyv2.h> | 7 | #include <linux/pfkeyv2.h> |
8 | #include <net/icmp.h> | 8 | #include <net/icmp.h> |
9 | #include <net/protocol.h> | ||
9 | #include <asm/scatterlist.h> | 10 | #include <asm/scatterlist.h> |
10 | 11 | ||
11 | 12 | ||
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index b425748f02d7..37432088fe6d 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c | |||
@@ -86,6 +86,7 @@ | |||
86 | #include <linux/in.h> | 86 | #include <linux/in.h> |
87 | #include <linux/mm.h> | 87 | #include <linux/mm.h> |
88 | #include <linux/inet.h> | 88 | #include <linux/inet.h> |
89 | #include <linux/inetdevice.h> | ||
89 | #include <linux/netdevice.h> | 90 | #include <linux/netdevice.h> |
90 | #include <linux/etherdevice.h> | 91 | #include <linux/etherdevice.h> |
91 | #include <linux/fddidevice.h> | 92 | #include <linux/fddidevice.h> |
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 04a6fe3e95a2..7b9bb28e2ee9 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c | |||
@@ -58,6 +58,7 @@ | |||
58 | #endif | 58 | #endif |
59 | #include <linux/kmod.h> | 59 | #include <linux/kmod.h> |
60 | 60 | ||
61 | #include <net/arp.h> | ||
61 | #include <net/ip.h> | 62 | #include <net/ip.h> |
62 | #include <net/route.h> | 63 | #include <net/route.h> |
63 | #include <net/ip_fib.h> | 64 | #include <net/ip_fib.h> |
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 1b18ce66e7b7..73bfcae8af9c 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/pfkeyv2.h> | 9 | #include <linux/pfkeyv2.h> |
10 | #include <linux/random.h> | 10 | #include <linux/random.h> |
11 | #include <net/icmp.h> | 11 | #include <net/icmp.h> |
12 | #include <net/protocol.h> | ||
12 | #include <net/udp.h> | 13 | #include <net/udp.h> |
13 | 14 | ||
14 | /* decapsulation data for use when post-processing */ | 15 | /* decapsulation data for use when post-processing */ |
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 19b1b984d687..18f5e509281a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/errno.h> | 30 | #include <linux/errno.h> |
31 | #include <linux/in.h> | 31 | #include <linux/in.h> |
32 | #include <linux/inet.h> | 32 | #include <linux/inet.h> |
33 | #include <linux/inetdevice.h> | ||
33 | #include <linux/netdevice.h> | 34 | #include <linux/netdevice.h> |
34 | #include <linux/if_arp.h> | 35 | #include <linux/if_arp.h> |
35 | #include <linux/skbuff.h> | 36 | #include <linux/skbuff.h> |
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 7ea0209cb169..e2890ec8159e 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/errno.h> | 29 | #include <linux/errno.h> |
30 | #include <linux/in.h> | 30 | #include <linux/in.h> |
31 | #include <linux/inet.h> | 31 | #include <linux/inet.h> |
32 | #include <linux/inetdevice.h> | ||
32 | #include <linux/netdevice.h> | 33 | #include <linux/netdevice.h> |
33 | #include <linux/if_arp.h> | 34 | #include <linux/if_arp.h> |
34 | #include <linux/proc_fs.h> | 35 | #include <linux/proc_fs.h> |
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 0b298bbc1518..0dd4d06e456d 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/errno.h> | 33 | #include <linux/errno.h> |
34 | #include <linux/in.h> | 34 | #include <linux/in.h> |
35 | #include <linux/inet.h> | 35 | #include <linux/inet.h> |
36 | #include <linux/inetdevice.h> | ||
36 | #include <linux/netdevice.h> | 37 | #include <linux/netdevice.h> |
37 | #include <linux/if_arp.h> | 38 | #include <linux/if_arp.h> |
38 | #include <linux/proc_fs.h> | 39 | #include <linux/proc_fs.h> |
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 6d2a6ac070e3..ef4724de7350 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/errno.h> | 29 | #include <linux/errno.h> |
30 | #include <linux/in.h> | 30 | #include <linux/in.h> |
31 | #include <linux/inet.h> | 31 | #include <linux/inet.h> |
32 | #include <linux/inetdevice.h> | ||
32 | #include <linux/netdevice.h> | 33 | #include <linux/netdevice.h> |
33 | #include <linux/if_arp.h> | 34 | #include <linux/if_arp.h> |
34 | #include <linux/proc_fs.h> | 35 | #include <linux/proc_fs.h> |
@@ -36,6 +37,7 @@ | |||
36 | #include <linux/netlink.h> | 37 | #include <linux/netlink.h> |
37 | #include <linux/init.h> | 38 | #include <linux/init.h> |
38 | 39 | ||
40 | #include <net/arp.h> | ||
39 | #include <net/ip.h> | 41 | #include <net/ip.h> |
40 | #include <net/protocol.h> | 42 | #include <net/protocol.h> |
41 | #include <net/route.h> | 43 | #include <net/route.h> |
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 705e3ce86df9..e320b32373e5 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c | |||
@@ -41,6 +41,13 @@ | |||
41 | * modify it under the terms of the GNU General Public License | 41 | * modify it under the terms of the GNU General Public License |
42 | * as published by the Free Software Foundation; either version | 42 | * as published by the Free Software Foundation; either version |
43 | * 2 of the License, or (at your option) any later version. | 43 | * 2 of the License, or (at your option) any later version. |
44 | * | ||
45 | * Substantial contributions to this work comes from: | ||
46 | * | ||
47 | * David S. Miller, <davem@davemloft.net> | ||
48 | * Stephen Hemminger <shemminger@osdl.org> | ||
49 | * Paul E. McKenney <paulmck@us.ibm.com> | ||
50 | * Patrick McHardy <kaber@trash.net> | ||
44 | */ | 51 | */ |
45 | 52 | ||
46 | #define VERSION "0.404" | 53 | #define VERSION "0.404" |
@@ -59,6 +66,7 @@ | |||
59 | #include <linux/errno.h> | 66 | #include <linux/errno.h> |
60 | #include <linux/in.h> | 67 | #include <linux/in.h> |
61 | #include <linux/inet.h> | 68 | #include <linux/inet.h> |
69 | #include <linux/inetdevice.h> | ||
62 | #include <linux/netdevice.h> | 70 | #include <linux/netdevice.h> |
63 | #include <linux/if_arp.h> | 71 | #include <linux/if_arp.h> |
64 | #include <linux/proc_fs.h> | 72 | #include <linux/proc_fs.h> |
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 92e23b2ad4d2..be5a519cd2f8 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c | |||
@@ -73,6 +73,7 @@ | |||
73 | #include <linux/socket.h> | 73 | #include <linux/socket.h> |
74 | #include <linux/in.h> | 74 | #include <linux/in.h> |
75 | #include <linux/inet.h> | 75 | #include <linux/inet.h> |
76 | #include <linux/inetdevice.h> | ||
76 | #include <linux/netdevice.h> | 77 | #include <linux/netdevice.h> |
77 | #include <linux/string.h> | 78 | #include <linux/string.h> |
78 | #include <linux/netfilter_ipv4.h> | 79 | #include <linux/netfilter_ipv4.h> |
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 4a195c724f01..34758118c10c 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c | |||
@@ -91,6 +91,8 @@ | |||
91 | #include <linux/if_arp.h> | 91 | #include <linux/if_arp.h> |
92 | #include <linux/rtnetlink.h> | 92 | #include <linux/rtnetlink.h> |
93 | #include <linux/times.h> | 93 | #include <linux/times.h> |
94 | |||
95 | #include <net/arp.h> | ||
94 | #include <net/ip.h> | 96 | #include <net/ip.h> |
95 | #include <net/protocol.h> | 97 | #include <net/protocol.h> |
96 | #include <net/route.h> | 98 | #include <net/route.h> |
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 3fe021f1a566..ae20281d8deb 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c | |||
@@ -37,7 +37,8 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); | |||
37 | */ | 37 | */ |
38 | int sysctl_local_port_range[2] = { 1024, 4999 }; | 38 | int sysctl_local_port_range[2] = { 1024, 4999 }; |
39 | 39 | ||
40 | static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) | 40 | int inet_csk_bind_conflict(const struct sock *sk, |
41 | const struct inet_bind_bucket *tb) | ||
41 | { | 42 | { |
42 | const u32 sk_rcv_saddr = inet_rcv_saddr(sk); | 43 | const u32 sk_rcv_saddr = inet_rcv_saddr(sk); |
43 | struct sock *sk2; | 44 | struct sock *sk2; |
@@ -62,11 +63,15 @@ static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucke | |||
62 | return node != NULL; | 63 | return node != NULL; |
63 | } | 64 | } |
64 | 65 | ||
66 | EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); | ||
67 | |||
65 | /* Obtain a reference to a local port for the given sock, | 68 | /* Obtain a reference to a local port for the given sock, |
66 | * if snum is zero it means select any available local port. | 69 | * if snum is zero it means select any available local port. |
67 | */ | 70 | */ |
68 | int inet_csk_get_port(struct inet_hashinfo *hashinfo, | 71 | int inet_csk_get_port(struct inet_hashinfo *hashinfo, |
69 | struct sock *sk, unsigned short snum) | 72 | struct sock *sk, unsigned short snum, |
73 | int (*bind_conflict)(const struct sock *sk, | ||
74 | const struct inet_bind_bucket *tb)) | ||
70 | { | 75 | { |
71 | struct inet_bind_hashbucket *head; | 76 | struct inet_bind_hashbucket *head; |
72 | struct hlist_node *node; | 77 | struct hlist_node *node; |
@@ -125,7 +130,7 @@ tb_found: | |||
125 | goto success; | 130 | goto success; |
126 | } else { | 131 | } else { |
127 | ret = 1; | 132 | ret = 1; |
128 | if (inet_csk_bind_conflict(sk, tb)) | 133 | if (bind_conflict(sk, tb)) |
129 | goto fail_unlock; | 134 | goto fail_unlock; |
130 | } | 135 | } |
131 | } | 136 | } |
@@ -380,7 +385,7 @@ struct request_sock *inet_csk_search_req(const struct sock *sk, | |||
380 | EXPORT_SYMBOL_GPL(inet_csk_search_req); | 385 | EXPORT_SYMBOL_GPL(inet_csk_search_req); |
381 | 386 | ||
382 | void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, | 387 | void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, |
383 | const unsigned timeout) | 388 | unsigned long timeout) |
384 | { | 389 | { |
385 | struct inet_connection_sock *icsk = inet_csk(sk); | 390 | struct inet_connection_sock *icsk = inet_csk(sk); |
386 | struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; | 391 | struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; |
@@ -631,3 +636,15 @@ void inet_csk_listen_stop(struct sock *sk) | |||
631 | } | 636 | } |
632 | 637 | ||
633 | EXPORT_SYMBOL_GPL(inet_csk_listen_stop); | 638 | EXPORT_SYMBOL_GPL(inet_csk_listen_stop); |
639 | |||
640 | void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) | ||
641 | { | ||
642 | struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; | ||
643 | const struct inet_sock *inet = inet_sk(sk); | ||
644 | |||
645 | sin->sin_family = AF_INET; | ||
646 | sin->sin_addr.s_addr = inet->daddr; | ||
647 | sin->sin_port = inet->dport; | ||
648 | } | ||
649 | |||
650 | EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); | ||
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 39061ed53cfd..c49908192047 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c | |||
@@ -112,12 +112,12 @@ static int inet_diag_fill(struct sk_buff *skb, struct sock *sk, | |||
112 | r->idiag_inode = 0; | 112 | r->idiag_inode = 0; |
113 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) | 113 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) |
114 | if (r->idiag_family == AF_INET6) { | 114 | if (r->idiag_family == AF_INET6) { |
115 | const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); | 115 | const struct inet6_timewait_sock *tw6 = inet6_twsk(sk); |
116 | 116 | ||
117 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, | 117 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, |
118 | &tcp6tw->tw_v6_rcv_saddr); | 118 | &tw6->tw_v6_rcv_saddr); |
119 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, | 119 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, |
120 | &tcp6tw->tw_v6_daddr); | 120 | &tw6->tw_v6_daddr); |
121 | } | 121 | } |
122 | #endif | 122 | #endif |
123 | nlh->nlmsg_len = skb->tail - b; | 123 | nlh->nlmsg_len = skb->tail - b; |
@@ -489,9 +489,9 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, | |||
489 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) | 489 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) |
490 | if (r->idiag_family == AF_INET6) { | 490 | if (r->idiag_family == AF_INET6) { |
491 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, | 491 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, |
492 | &tcp6_rsk(req)->loc_addr); | 492 | &inet6_rsk(req)->loc_addr); |
493 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, | 493 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, |
494 | &tcp6_rsk(req)->rmt_addr); | 494 | &inet6_rsk(req)->rmt_addr); |
495 | } | 495 | } |
496 | #endif | 496 | #endif |
497 | nlh->nlmsg_len = skb->tail - b; | 497 | nlh->nlmsg_len = skb->tail - b; |
@@ -553,13 +553,13 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, | |||
553 | entry.saddr = | 553 | entry.saddr = |
554 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) | 554 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) |
555 | (entry.family == AF_INET6) ? | 555 | (entry.family == AF_INET6) ? |
556 | tcp6_rsk(req)->loc_addr.s6_addr32 : | 556 | inet6_rsk(req)->loc_addr.s6_addr32 : |
557 | #endif | 557 | #endif |
558 | &ireq->loc_addr; | 558 | &ireq->loc_addr; |
559 | entry.daddr = | 559 | entry.daddr = |
560 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) | 560 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) |
561 | (entry.family == AF_INET6) ? | 561 | (entry.family == AF_INET6) ? |
562 | tcp6_rsk(req)->rmt_addr.s6_addr32 : | 562 | inet6_rsk(req)->rmt_addr.s6_addr32 : |
563 | #endif | 563 | #endif |
564 | &ireq->rmt_addr; | 564 | &ireq->rmt_addr; |
565 | entry.dport = ntohs(ireq->rmt_port); | 565 | entry.dport = ntohs(ireq->rmt_port); |
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index e8d29fe736d2..33228115cda4 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c | |||
@@ -15,12 +15,14 @@ | |||
15 | 15 | ||
16 | #include <linux/config.h> | 16 | #include <linux/config.h> |
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | #include <linux/random.h> | ||
18 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
19 | #include <linux/slab.h> | 20 | #include <linux/slab.h> |
20 | #include <linux/wait.h> | 21 | #include <linux/wait.h> |
21 | 22 | ||
22 | #include <net/inet_connection_sock.h> | 23 | #include <net/inet_connection_sock.h> |
23 | #include <net/inet_hashtables.h> | 24 | #include <net/inet_hashtables.h> |
25 | #include <net/ip.h> | ||
24 | 26 | ||
25 | /* | 27 | /* |
26 | * Allocate and initialize a new local port bind bucket. | 28 | * Allocate and initialize a new local port bind bucket. |
@@ -163,3 +165,179 @@ struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 dad | |||
163 | } | 165 | } |
164 | 166 | ||
165 | EXPORT_SYMBOL_GPL(__inet_lookup_listener); | 167 | EXPORT_SYMBOL_GPL(__inet_lookup_listener); |
168 | |||
169 | /* called with local bh disabled */ | ||
170 | static int __inet_check_established(struct inet_timewait_death_row *death_row, | ||
171 | struct sock *sk, __u16 lport, | ||
172 | struct inet_timewait_sock **twp) | ||
173 | { | ||
174 | struct inet_hashinfo *hinfo = death_row->hashinfo; | ||
175 | struct inet_sock *inet = inet_sk(sk); | ||
176 | u32 daddr = inet->rcv_saddr; | ||
177 | u32 saddr = inet->daddr; | ||
178 | int dif = sk->sk_bound_dev_if; | ||
179 | INET_ADDR_COOKIE(acookie, saddr, daddr) | ||
180 | const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); | ||
181 | unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); | ||
182 | struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); | ||
183 | struct sock *sk2; | ||
184 | const struct hlist_node *node; | ||
185 | struct inet_timewait_sock *tw; | ||
186 | |||
187 | prefetch(head->chain.first); | ||
188 | write_lock(&head->lock); | ||
189 | |||
190 | /* Check TIME-WAIT sockets first. */ | ||
191 | sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) { | ||
192 | tw = inet_twsk(sk2); | ||
193 | |||
194 | if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) { | ||
195 | if (twsk_unique(sk, sk2, twp)) | ||
196 | goto unique; | ||
197 | else | ||
198 | goto not_unique; | ||
199 | } | ||
200 | } | ||
201 | tw = NULL; | ||
202 | |||
203 | /* And established part... */ | ||
204 | sk_for_each(sk2, node, &head->chain) { | ||
205 | if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) | ||
206 | goto not_unique; | ||
207 | } | ||
208 | |||
209 | unique: | ||
210 | /* Must record num and sport now. Otherwise we will see | ||
211 | * in hash table socket with a funny identity. */ | ||
212 | inet->num = lport; | ||
213 | inet->sport = htons(lport); | ||
214 | sk->sk_hash = hash; | ||
215 | BUG_TRAP(sk_unhashed(sk)); | ||
216 | __sk_add_node(sk, &head->chain); | ||
217 | sock_prot_inc_use(sk->sk_prot); | ||
218 | write_unlock(&head->lock); | ||
219 | |||
220 | if (twp) { | ||
221 | *twp = tw; | ||
222 | NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); | ||
223 | } else if (tw) { | ||
224 | /* Silly. Should hash-dance instead... */ | ||
225 | inet_twsk_deschedule(tw, death_row); | ||
226 | NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); | ||
227 | |||
228 | inet_twsk_put(tw); | ||
229 | } | ||
230 | |||
231 | return 0; | ||
232 | |||
233 | not_unique: | ||
234 | write_unlock(&head->lock); | ||
235 | return -EADDRNOTAVAIL; | ||
236 | } | ||
237 | |||
238 | static inline u32 inet_sk_port_offset(const struct sock *sk) | ||
239 | { | ||
240 | const struct inet_sock *inet = inet_sk(sk); | ||
241 | return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr, | ||
242 | inet->dport); | ||
243 | } | ||
244 | |||
245 | /* | ||
246 | * Bind a port for a connect operation and hash it. | ||
247 | */ | ||
248 | int inet_hash_connect(struct inet_timewait_death_row *death_row, | ||
249 | struct sock *sk) | ||
250 | { | ||
251 | struct inet_hashinfo *hinfo = death_row->hashinfo; | ||
252 | const unsigned short snum = inet_sk(sk)->num; | ||
253 | struct inet_bind_hashbucket *head; | ||
254 | struct inet_bind_bucket *tb; | ||
255 | int ret; | ||
256 | |||
257 | if (!snum) { | ||
258 | int low = sysctl_local_port_range[0]; | ||
259 | int high = sysctl_local_port_range[1]; | ||
260 | int range = high - low; | ||
261 | int i; | ||
262 | int port; | ||
263 | static u32 hint; | ||
264 | u32 offset = hint + inet_sk_port_offset(sk); | ||
265 | struct hlist_node *node; | ||
266 | struct inet_timewait_sock *tw = NULL; | ||
267 | |||
268 | local_bh_disable(); | ||
269 | for (i = 1; i <= range; i++) { | ||
270 | port = low + (i + offset) % range; | ||
271 | head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; | ||
272 | spin_lock(&head->lock); | ||
273 | |||
274 | /* Does not bother with rcv_saddr checks, | ||
275 | * because the established check is already | ||
276 | * unique enough. | ||
277 | */ | ||
278 | inet_bind_bucket_for_each(tb, node, &head->chain) { | ||
279 | if (tb->port == port) { | ||
280 | BUG_TRAP(!hlist_empty(&tb->owners)); | ||
281 | if (tb->fastreuse >= 0) | ||
282 | goto next_port; | ||
283 | if (!__inet_check_established(death_row, | ||
284 | sk, port, | ||
285 | &tw)) | ||
286 | goto ok; | ||
287 | goto next_port; | ||
288 | } | ||
289 | } | ||
290 | |||
291 | tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port); | ||
292 | if (!tb) { | ||
293 | spin_unlock(&head->lock); | ||
294 | break; | ||
295 | } | ||
296 | tb->fastreuse = -1; | ||
297 | goto ok; | ||
298 | |||
299 | next_port: | ||
300 | spin_unlock(&head->lock); | ||
301 | } | ||
302 | local_bh_enable(); | ||
303 | |||
304 | return -EADDRNOTAVAIL; | ||
305 | |||
306 | ok: | ||
307 | hint += i; | ||
308 | |||
309 | /* Head lock still held and bh's disabled */ | ||
310 | inet_bind_hash(sk, tb, port); | ||
311 | if (sk_unhashed(sk)) { | ||
312 | inet_sk(sk)->sport = htons(port); | ||
313 | __inet_hash(hinfo, sk, 0); | ||
314 | } | ||
315 | spin_unlock(&head->lock); | ||
316 | |||
317 | if (tw) { | ||
318 | inet_twsk_deschedule(tw, death_row);; | ||
319 | inet_twsk_put(tw); | ||
320 | } | ||
321 | |||
322 | ret = 0; | ||
323 | goto out; | ||
324 | } | ||
325 | |||
326 | head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; | ||
327 | tb = inet_csk(sk)->icsk_bind_hash; | ||
328 | spin_lock_bh(&head->lock); | ||
329 | if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { | ||
330 | __inet_hash(hinfo, sk, 0); | ||
331 | spin_unlock_bh(&head->lock); | ||
332 | return 0; | ||
333 | } else { | ||
334 | spin_unlock(&head->lock); | ||
335 | /* No definite answer... Walk to established hash table */ | ||
336 | ret = __inet_check_established(death_row, sk, snum, NULL); | ||
337 | out: | ||
338 | local_bh_enable(); | ||
339 | return ret; | ||
340 | } | ||
341 | } | ||
342 | |||
343 | EXPORT_SYMBOL_GPL(inet_hash_connect); | ||
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index a010e9a68811..417f126c749e 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c | |||
@@ -90,8 +90,9 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); | |||
90 | 90 | ||
91 | struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) | 91 | struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) |
92 | { | 92 | { |
93 | struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, | 93 | struct inet_timewait_sock *tw = |
94 | SLAB_ATOMIC); | 94 | kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, |
95 | SLAB_ATOMIC); | ||
95 | if (tw != NULL) { | 96 | if (tw != NULL) { |
96 | const struct inet_sock *inet = inet_sk(sk); | 97 | const struct inet_sock *inet = inet_sk(sk); |
97 | 98 | ||
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 2fc3fd38924f..ce5fe3f74a3d 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c | |||
@@ -401,6 +401,7 @@ struct inet_peer *inet_getpeer(__u32 daddr, int create) | |||
401 | return NULL; | 401 | return NULL; |
402 | n->v4daddr = daddr; | 402 | n->v4daddr = daddr; |
403 | atomic_set(&n->refcnt, 1); | 403 | atomic_set(&n->refcnt, 1); |
404 | atomic_set(&n->rid, 0); | ||
404 | n->ip_id_count = secure_ip_id(daddr); | 405 | n->ip_id_count = secure_ip_id(daddr); |
405 | n->tcp_ts_stamp = 0; | 406 | n->tcp_ts_stamp = 0; |
406 | 407 | ||
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 8ce0ce2ee48e..ce2b70ce4018 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c | |||
@@ -22,6 +22,7 @@ | |||
22 | * Patrick McHardy : LRU queue of frag heads for evictor. | 22 | * Patrick McHardy : LRU queue of frag heads for evictor. |
23 | */ | 23 | */ |
24 | 24 | ||
25 | #include <linux/compiler.h> | ||
25 | #include <linux/config.h> | 26 | #include <linux/config.h> |
26 | #include <linux/module.h> | 27 | #include <linux/module.h> |
27 | #include <linux/types.h> | 28 | #include <linux/types.h> |
@@ -38,6 +39,7 @@ | |||
38 | #include <net/ip.h> | 39 | #include <net/ip.h> |
39 | #include <net/icmp.h> | 40 | #include <net/icmp.h> |
40 | #include <net/checksum.h> | 41 | #include <net/checksum.h> |
42 | #include <net/inetpeer.h> | ||
41 | #include <linux/tcp.h> | 43 | #include <linux/tcp.h> |
42 | #include <linux/udp.h> | 44 | #include <linux/udp.h> |
43 | #include <linux/inet.h> | 45 | #include <linux/inet.h> |
@@ -56,6 +58,8 @@ | |||
56 | int sysctl_ipfrag_high_thresh = 256*1024; | 58 | int sysctl_ipfrag_high_thresh = 256*1024; |
57 | int sysctl_ipfrag_low_thresh = 192*1024; | 59 | int sysctl_ipfrag_low_thresh = 192*1024; |
58 | 60 | ||
61 | int sysctl_ipfrag_max_dist = 64; | ||
62 | |||
59 | /* Important NOTE! Fragment queue must be destroyed before MSL expires. | 63 | /* Important NOTE! Fragment queue must be destroyed before MSL expires. |
60 | * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL. | 64 | * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL. |
61 | */ | 65 | */ |
@@ -89,8 +93,10 @@ struct ipq { | |||
89 | spinlock_t lock; | 93 | spinlock_t lock; |
90 | atomic_t refcnt; | 94 | atomic_t refcnt; |
91 | struct timer_list timer; /* when will this queue expire? */ | 95 | struct timer_list timer; /* when will this queue expire? */ |
92 | int iif; | ||
93 | struct timeval stamp; | 96 | struct timeval stamp; |
97 | int iif; | ||
98 | unsigned int rid; | ||
99 | struct inet_peer *peer; | ||
94 | }; | 100 | }; |
95 | 101 | ||
96 | /* Hash table. */ | 102 | /* Hash table. */ |
@@ -195,6 +201,9 @@ static void ip_frag_destroy(struct ipq *qp, int *work) | |||
195 | BUG_TRAP(qp->last_in&COMPLETE); | 201 | BUG_TRAP(qp->last_in&COMPLETE); |
196 | BUG_TRAP(del_timer(&qp->timer) == 0); | 202 | BUG_TRAP(del_timer(&qp->timer) == 0); |
197 | 203 | ||
204 | if (qp->peer) | ||
205 | inet_putpeer(qp->peer); | ||
206 | |||
198 | /* Release all fragment data. */ | 207 | /* Release all fragment data. */ |
199 | fp = qp->fragments; | 208 | fp = qp->fragments; |
200 | while (fp) { | 209 | while (fp) { |
@@ -353,6 +362,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user) | |||
353 | qp->meat = 0; | 362 | qp->meat = 0; |
354 | qp->fragments = NULL; | 363 | qp->fragments = NULL; |
355 | qp->iif = 0; | 364 | qp->iif = 0; |
365 | qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL; | ||
356 | 366 | ||
357 | /* Initialize a timer for this entry. */ | 367 | /* Initialize a timer for this entry. */ |
358 | init_timer(&qp->timer); | 368 | init_timer(&qp->timer); |
@@ -398,6 +408,56 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user) | |||
398 | return ip_frag_create(hash, iph, user); | 408 | return ip_frag_create(hash, iph, user); |
399 | } | 409 | } |
400 | 410 | ||
411 | /* Is the fragment too far ahead to be part of ipq? */ | ||
412 | static inline int ip_frag_too_far(struct ipq *qp) | ||
413 | { | ||
414 | struct inet_peer *peer = qp->peer; | ||
415 | unsigned int max = sysctl_ipfrag_max_dist; | ||
416 | unsigned int start, end; | ||
417 | |||
418 | int rc; | ||
419 | |||
420 | if (!peer || !max) | ||
421 | return 0; | ||
422 | |||
423 | start = qp->rid; | ||
424 | end = atomic_inc_return(&peer->rid); | ||
425 | qp->rid = end; | ||
426 | |||
427 | rc = qp->fragments && (end - start) > max; | ||
428 | |||
429 | if (rc) { | ||
430 | IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); | ||
431 | } | ||
432 | |||
433 | return rc; | ||
434 | } | ||
435 | |||
436 | static int ip_frag_reinit(struct ipq *qp) | ||
437 | { | ||
438 | struct sk_buff *fp; | ||
439 | |||
440 | if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) { | ||
441 | atomic_inc(&qp->refcnt); | ||
442 | return -ETIMEDOUT; | ||
443 | } | ||
444 | |||
445 | fp = qp->fragments; | ||
446 | do { | ||
447 | struct sk_buff *xp = fp->next; | ||
448 | frag_kfree_skb(fp, NULL); | ||
449 | fp = xp; | ||
450 | } while (fp); | ||
451 | |||
452 | qp->last_in = 0; | ||
453 | qp->len = 0; | ||
454 | qp->meat = 0; | ||
455 | qp->fragments = NULL; | ||
456 | qp->iif = 0; | ||
457 | |||
458 | return 0; | ||
459 | } | ||
460 | |||
401 | /* Add new segment to existing queue. */ | 461 | /* Add new segment to existing queue. */ |
402 | static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) | 462 | static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) |
403 | { | 463 | { |
@@ -408,6 +468,12 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) | |||
408 | if (qp->last_in & COMPLETE) | 468 | if (qp->last_in & COMPLETE) |
409 | goto err; | 469 | goto err; |
410 | 470 | ||
471 | if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && | ||
472 | unlikely(ip_frag_too_far(qp)) && unlikely(ip_frag_reinit(qp))) { | ||
473 | ipq_kill(qp); | ||
474 | goto err; | ||
475 | } | ||
476 | |||
411 | offset = ntohs(skb->nh.iph->frag_off); | 477 | offset = ntohs(skb->nh.iph->frag_off); |
412 | flags = offset & ~IP_OFFSET; | 478 | flags = offset & ~IP_OFFSET; |
413 | offset &= IP_OFFSET; | 479 | offset &= IP_OFFSET; |
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 473d0f2b2e0d..e45846ae570b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c | |||
@@ -128,6 +128,7 @@ | |||
128 | #include <linux/sockios.h> | 128 | #include <linux/sockios.h> |
129 | #include <linux/in.h> | 129 | #include <linux/in.h> |
130 | #include <linux/inet.h> | 130 | #include <linux/inet.h> |
131 | #include <linux/inetdevice.h> | ||
131 | #include <linux/netdevice.h> | 132 | #include <linux/netdevice.h> |
132 | #include <linux/etherdevice.h> | 133 | #include <linux/etherdevice.h> |
133 | 134 | ||
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index dbe12da8d8b3..d3f6c468faf4 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <net/sock.h> | 22 | #include <net/sock.h> |
23 | #include <net/ip.h> | 23 | #include <net/ip.h> |
24 | #include <net/icmp.h> | 24 | #include <net/icmp.h> |
25 | #include <net/route.h> | ||
25 | 26 | ||
26 | /* | 27 | /* |
27 | * Write options to IP header, record destination address to | 28 | * Write options to IP header, record destination address to |
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index eba64e2bd397..2a830de3a699 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c | |||
@@ -445,6 +445,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) | |||
445 | 445 | ||
446 | hlen = iph->ihl * 4; | 446 | hlen = iph->ihl * 4; |
447 | mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ | 447 | mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ |
448 | IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; | ||
448 | 449 | ||
449 | /* When frag_list is given, use it. First, check its validity: | 450 | /* When frag_list is given, use it. First, check its validity: |
450 | * some transformers could create wrong frag_list or break existing | 451 | * some transformers could create wrong frag_list or break existing |
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 4f2d87257309..6986e11d65cc 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c | |||
@@ -25,12 +25,12 @@ | |||
25 | #include <linux/skbuff.h> | 25 | #include <linux/skbuff.h> |
26 | #include <linux/ip.h> | 26 | #include <linux/ip.h> |
27 | #include <linux/icmp.h> | 27 | #include <linux/icmp.h> |
28 | #include <linux/inetdevice.h> | ||
28 | #include <linux/netdevice.h> | 29 | #include <linux/netdevice.h> |
29 | #include <net/sock.h> | 30 | #include <net/sock.h> |
30 | #include <net/ip.h> | 31 | #include <net/ip.h> |
31 | #include <net/icmp.h> | 32 | #include <net/icmp.h> |
32 | #include <net/tcp.h> | 33 | #include <net/tcp_states.h> |
33 | #include <linux/tcp.h> | ||
34 | #include <linux/udp.h> | 34 | #include <linux/udp.h> |
35 | #include <linux/igmp.h> | 35 | #include <linux/igmp.h> |
36 | #include <linux/netfilter.h> | 36 | #include <linux/netfilter.h> |
@@ -427,8 +427,8 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
427 | err = ip_options_get_from_user(&opt, optval, optlen); | 427 | err = ip_options_get_from_user(&opt, optval, optlen); |
428 | if (err) | 428 | if (err) |
429 | break; | 429 | break; |
430 | if (sk->sk_type == SOCK_STREAM) { | 430 | if (inet->is_icsk) { |
431 | struct tcp_sock *tp = tcp_sk(sk); | 431 | struct inet_connection_sock *icsk = inet_csk(sk); |
432 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 432 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
433 | if (sk->sk_family == PF_INET || | 433 | if (sk->sk_family == PF_INET || |
434 | (!((1 << sk->sk_state) & | 434 | (!((1 << sk->sk_state) & |
@@ -436,10 +436,10 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
436 | inet->daddr != LOOPBACK4_IPV6)) { | 436 | inet->daddr != LOOPBACK4_IPV6)) { |
437 | #endif | 437 | #endif |
438 | if (inet->opt) | 438 | if (inet->opt) |
439 | tp->ext_header_len -= inet->opt->optlen; | 439 | icsk->icsk_ext_hdr_len -= inet->opt->optlen; |
440 | if (opt) | 440 | if (opt) |
441 | tp->ext_header_len += opt->optlen; | 441 | icsk->icsk_ext_hdr_len += opt->optlen; |
442 | tcp_sync_mss(sk, tp->pmtu_cookie); | 442 | icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); |
443 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 443 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
444 | } | 444 | } |
445 | #endif | 445 | #endif |
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index fc718df17b40..d64e2ec8da7b 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <net/xfrm.h> | 28 | #include <net/xfrm.h> |
29 | #include <net/icmp.h> | 29 | #include <net/icmp.h> |
30 | #include <net/ipcomp.h> | 30 | #include <net/ipcomp.h> |
31 | #include <net/protocol.h> | ||
31 | 32 | ||
32 | struct ipcomp_tfms { | 33 | struct ipcomp_tfms { |
33 | struct list_head list; | 34 | struct list_head list; |
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index e8674baaa8d9..bb3613ec448c 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c | |||
@@ -42,6 +42,7 @@ | |||
42 | #include <linux/in.h> | 42 | #include <linux/in.h> |
43 | #include <linux/if.h> | 43 | #include <linux/if.h> |
44 | #include <linux/inet.h> | 44 | #include <linux/inet.h> |
45 | #include <linux/inetdevice.h> | ||
45 | #include <linux/netdevice.h> | 46 | #include <linux/netdevice.h> |
46 | #include <linux/if_arp.h> | 47 | #include <linux/if_arp.h> |
47 | #include <linux/skbuff.h> | 48 | #include <linux/skbuff.h> |
@@ -58,6 +59,7 @@ | |||
58 | #include <net/arp.h> | 59 | #include <net/arp.h> |
59 | #include <net/ip.h> | 60 | #include <net/ip.h> |
60 | #include <net/ipconfig.h> | 61 | #include <net/ipconfig.h> |
62 | #include <net/route.h> | ||
61 | 63 | ||
62 | #include <asm/uaccess.h> | 64 | #include <asm/uaccess.h> |
63 | #include <net/checksum.h> | 65 | #include <net/checksum.h> |
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 302b7eb507c9..caa3b7d2e48a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c | |||
@@ -52,6 +52,7 @@ | |||
52 | #include <net/ip.h> | 52 | #include <net/ip.h> |
53 | #include <net/protocol.h> | 53 | #include <net/protocol.h> |
54 | #include <linux/skbuff.h> | 54 | #include <linux/skbuff.h> |
55 | #include <net/route.h> | ||
55 | #include <net/sock.h> | 56 | #include <net/sock.h> |
56 | #include <net/icmp.h> | 57 | #include <net/icmp.h> |
57 | #include <net/udp.h> | 58 | #include <net/udp.h> |
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index d7eb680101c2..9b176a942ac5 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c | |||
@@ -224,34 +224,6 @@ void unregister_ip_vs_app(struct ip_vs_app *app) | |||
224 | } | 224 | } |
225 | 225 | ||
226 | 226 | ||
227 | #if 0000 | ||
228 | /* | ||
229 | * Get reference to app by name (called from user context) | ||
230 | */ | ||
231 | struct ip_vs_app *ip_vs_app_get_by_name(char *appname) | ||
232 | { | ||
233 | struct ip_vs_app *app, *a = NULL; | ||
234 | |||
235 | down(&__ip_vs_app_mutex); | ||
236 | |||
237 | list_for_each_entry(ent, &ip_vs_app_list, a_list) { | ||
238 | if (strcmp(app->name, appname)) | ||
239 | continue; | ||
240 | |||
241 | /* softirq may call ip_vs_app_get too, so the caller | ||
242 | must disable softirq on the current CPU */ | ||
243 | if (ip_vs_app_get(app)) | ||
244 | a = app; | ||
245 | break; | ||
246 | } | ||
247 | |||
248 | up(&__ip_vs_app_mutex); | ||
249 | |||
250 | return a; | ||
251 | } | ||
252 | #endif | ||
253 | |||
254 | |||
255 | /* | 227 | /* |
256 | * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) | 228 | * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) |
257 | */ | 229 | */ |
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 2a3a8c59c655..81d90354c928 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c | |||
@@ -24,7 +24,10 @@ | |||
24 | * | 24 | * |
25 | */ | 25 | */ |
26 | 26 | ||
27 | #include <linux/in.h> | ||
28 | #include <linux/net.h> | ||
27 | #include <linux/kernel.h> | 29 | #include <linux/kernel.h> |
30 | #include <linux/module.h> | ||
28 | #include <linux/vmalloc.h> | 31 | #include <linux/vmalloc.h> |
29 | #include <linux/proc_fs.h> /* for proc_net_* */ | 32 | #include <linux/proc_fs.h> /* for proc_net_* */ |
30 | #include <linux/seq_file.h> | 33 | #include <linux/seq_file.h> |
@@ -219,7 +222,7 @@ struct ip_vs_conn *ip_vs_conn_in_get | |||
219 | if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) | 222 | if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) |
220 | cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); | 223 | cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); |
221 | 224 | ||
222 | IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", | 225 | IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", |
223 | ip_vs_proto_name(protocol), | 226 | ip_vs_proto_name(protocol), |
224 | NIPQUAD(s_addr), ntohs(s_port), | 227 | NIPQUAD(s_addr), ntohs(s_port), |
225 | NIPQUAD(d_addr), ntohs(d_port), | 228 | NIPQUAD(d_addr), ntohs(d_port), |
@@ -254,7 +257,7 @@ struct ip_vs_conn *ip_vs_ct_in_get | |||
254 | out: | 257 | out: |
255 | ct_read_unlock(hash); | 258 | ct_read_unlock(hash); |
256 | 259 | ||
257 | IP_VS_DBG(7, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", | 260 | IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", |
258 | ip_vs_proto_name(protocol), | 261 | ip_vs_proto_name(protocol), |
259 | NIPQUAD(s_addr), ntohs(s_port), | 262 | NIPQUAD(s_addr), ntohs(s_port), |
260 | NIPQUAD(d_addr), ntohs(d_port), | 263 | NIPQUAD(d_addr), ntohs(d_port), |
@@ -295,7 +298,7 @@ struct ip_vs_conn *ip_vs_conn_out_get | |||
295 | 298 | ||
296 | ct_read_unlock(hash); | 299 | ct_read_unlock(hash); |
297 | 300 | ||
298 | IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", | 301 | IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", |
299 | ip_vs_proto_name(protocol), | 302 | ip_vs_proto_name(protocol), |
300 | NIPQUAD(s_addr), ntohs(s_port), | 303 | NIPQUAD(s_addr), ntohs(s_port), |
301 | NIPQUAD(d_addr), ntohs(d_port), | 304 | NIPQUAD(d_addr), ntohs(d_port), |
@@ -391,8 +394,9 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) | |||
391 | cp->flags |= atomic_read(&dest->conn_flags); | 394 | cp->flags |= atomic_read(&dest->conn_flags); |
392 | cp->dest = dest; | 395 | cp->dest = dest; |
393 | 396 | ||
394 | IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " | 397 | IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " |
395 | "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", | 398 | "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " |
399 | "dest->refcnt:%d\n", | ||
396 | ip_vs_proto_name(cp->protocol), | 400 | ip_vs_proto_name(cp->protocol), |
397 | NIPQUAD(cp->caddr), ntohs(cp->cport), | 401 | NIPQUAD(cp->caddr), ntohs(cp->cport), |
398 | NIPQUAD(cp->vaddr), ntohs(cp->vport), | 402 | NIPQUAD(cp->vaddr), ntohs(cp->vport), |
@@ -430,8 +434,9 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) | |||
430 | if (!dest) | 434 | if (!dest) |
431 | return; | 435 | return; |
432 | 436 | ||
433 | IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " | 437 | IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " |
434 | "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", | 438 | "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " |
439 | "dest->refcnt:%d\n", | ||
435 | ip_vs_proto_name(cp->protocol), | 440 | ip_vs_proto_name(cp->protocol), |
436 | NIPQUAD(cp->caddr), ntohs(cp->cport), | 441 | NIPQUAD(cp->caddr), ntohs(cp->cport), |
437 | NIPQUAD(cp->vaddr), ntohs(cp->vport), | 442 | NIPQUAD(cp->vaddr), ntohs(cp->vport), |
@@ -571,7 +576,7 @@ static void ip_vs_conn_expire(unsigned long data) | |||
571 | ip_vs_conn_hash(cp); | 576 | ip_vs_conn_hash(cp); |
572 | 577 | ||
573 | expire_later: | 578 | expire_later: |
574 | IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n", | 579 | IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", |
575 | atomic_read(&cp->refcnt)-1, | 580 | atomic_read(&cp->refcnt)-1, |
576 | atomic_read(&cp->n_control)); | 581 | atomic_read(&cp->n_control)); |
577 | 582 | ||
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 1a0843cd58a9..1aca94a9fd8b 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c | |||
@@ -426,7 +426,7 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | |||
426 | return NULL; | 426 | return NULL; |
427 | 427 | ||
428 | IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u " | 428 | IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u " |
429 | "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n", | 429 | "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n", |
430 | ip_vs_fwd_tag(cp), | 430 | ip_vs_fwd_tag(cp), |
431 | NIPQUAD(cp->caddr), ntohs(cp->cport), | 431 | NIPQUAD(cp->caddr), ntohs(cp->cport), |
432 | NIPQUAD(cp->vaddr), ntohs(cp->vport), | 432 | NIPQUAD(cp->vaddr), ntohs(cp->vport), |
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 9bdcf31b760e..c935c5086d33 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/netfilter_ipv4.h> | 35 | #include <linux/netfilter_ipv4.h> |
36 | 36 | ||
37 | #include <net/ip.h> | 37 | #include <net/ip.h> |
38 | #include <net/route.h> | ||
38 | #include <net/sock.h> | 39 | #include <net/sock.h> |
39 | 40 | ||
40 | #include <asm/uaccess.h> | 41 | #include <asm/uaccess.h> |
@@ -447,7 +448,7 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport) | |||
447 | out: | 448 | out: |
448 | read_unlock(&__ip_vs_svc_lock); | 449 | read_unlock(&__ip_vs_svc_lock); |
449 | 450 | ||
450 | IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", | 451 | IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", |
451 | fwmark, ip_vs_proto_name(protocol), | 452 | fwmark, ip_vs_proto_name(protocol), |
452 | NIPQUAD(vaddr), ntohs(vport), | 453 | NIPQUAD(vaddr), ntohs(vport), |
453 | svc?"hit":"not hit"); | 454 | svc?"hit":"not hit"); |
@@ -597,7 +598,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport) | |||
597 | */ | 598 | */ |
598 | list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { | 599 | list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { |
599 | IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, " | 600 | IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, " |
600 | "refcnt=%d\n", | 601 | "dest->refcnt=%d\n", |
601 | dest->vfwmark, | 602 | dest->vfwmark, |
602 | NIPQUAD(dest->addr), ntohs(dest->port), | 603 | NIPQUAD(dest->addr), ntohs(dest->port), |
603 | atomic_read(&dest->refcnt)); | 604 | atomic_read(&dest->refcnt)); |
@@ -804,7 +805,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) | |||
804 | dest = ip_vs_trash_get_dest(svc, daddr, dport); | 805 | dest = ip_vs_trash_get_dest(svc, daddr, dport); |
805 | if (dest != NULL) { | 806 | if (dest != NULL) { |
806 | IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " | 807 | IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " |
807 | "refcnt=%d, service %u/%u.%u.%u.%u:%u\n", | 808 | "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n", |
808 | NIPQUAD(daddr), ntohs(dport), | 809 | NIPQUAD(daddr), ntohs(dport), |
809 | atomic_read(&dest->refcnt), | 810 | atomic_read(&dest->refcnt), |
810 | dest->vfwmark, | 811 | dest->vfwmark, |
@@ -949,7 +950,8 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest) | |||
949 | atomic_dec(&dest->svc->refcnt); | 950 | atomic_dec(&dest->svc->refcnt); |
950 | kfree(dest); | 951 | kfree(dest); |
951 | } else { | 952 | } else { |
952 | IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n", | 953 | IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, " |
954 | "dest->refcnt=%d\n", | ||
953 | NIPQUAD(dest->addr), ntohs(dest->port), | 955 | NIPQUAD(dest->addr), ntohs(dest->port), |
954 | atomic_read(&dest->refcnt)); | 956 | atomic_read(&dest->refcnt)); |
955 | list_add(&dest->n_list, &ip_vs_dest_trash); | 957 | list_add(&dest->n_list, &ip_vs_dest_trash); |
diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c index f3bc320dce93..9fee19c4c617 100644 --- a/net/ipv4/ipvs/ip_vs_dh.c +++ b/net/ipv4/ipvs/ip_vs_dh.c | |||
@@ -37,8 +37,10 @@ | |||
37 | * | 37 | * |
38 | */ | 38 | */ |
39 | 39 | ||
40 | #include <linux/ip.h> | ||
40 | #include <linux/module.h> | 41 | #include <linux/module.h> |
41 | #include <linux/kernel.h> | 42 | #include <linux/kernel.h> |
43 | #include <linux/skbuff.h> | ||
42 | 44 | ||
43 | #include <net/ip_vs.h> | 45 | #include <net/ip_vs.h> |
44 | 46 | ||
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c index 67b3e2fc1fa1..e7004741ac73 100644 --- a/net/ipv4/ipvs/ip_vs_est.c +++ b/net/ipv4/ipvs/ip_vs_est.c | |||
@@ -13,7 +13,10 @@ | |||
13 | * Changes: | 13 | * Changes: |
14 | * | 14 | * |
15 | */ | 15 | */ |
16 | #include <linux/config.h> | ||
16 | #include <linux/kernel.h> | 17 | #include <linux/kernel.h> |
18 | #include <linux/jiffies.h> | ||
19 | #include <linux/slab.h> | ||
17 | #include <linux/types.h> | 20 | #include <linux/types.h> |
18 | 21 | ||
19 | #include <net/ip_vs.h> | 22 | #include <net/ip_vs.h> |
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index 561cda326fa8..6e5cb92a5c83 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c | |||
@@ -41,8 +41,10 @@ | |||
41 | * me to write this module. | 41 | * me to write this module. |
42 | */ | 42 | */ |
43 | 43 | ||
44 | #include <linux/ip.h> | ||
44 | #include <linux/module.h> | 45 | #include <linux/module.h> |
45 | #include <linux/kernel.h> | 46 | #include <linux/kernel.h> |
47 | #include <linux/skbuff.h> | ||
46 | 48 | ||
47 | /* for sysctl */ | 49 | /* for sysctl */ |
48 | #include <linux/fs.h> | 50 | #include <linux/fs.h> |
@@ -228,33 +230,6 @@ ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) | |||
228 | } | 230 | } |
229 | 231 | ||
230 | 232 | ||
231 | #if 0000 | ||
232 | /* | ||
233 | * Unhash ip_vs_lblc_entry from ip_vs_lblc_table. | ||
234 | * returns bool success. | ||
235 | */ | ||
236 | static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl, | ||
237 | struct ip_vs_lblc_entry *en) | ||
238 | { | ||
239 | if (list_empty(&en->list)) { | ||
240 | IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, " | ||
241 | "called from %p\n", __builtin_return_address(0)); | ||
242 | return 0; | ||
243 | } | ||
244 | |||
245 | /* | ||
246 | * Remove it from the table | ||
247 | */ | ||
248 | write_lock(&tbl->lock); | ||
249 | list_del(&en->list); | ||
250 | INIT_LIST_HEAD(&en->list); | ||
251 | write_unlock(&tbl->lock); | ||
252 | |||
253 | return 1; | ||
254 | } | ||
255 | #endif | ||
256 | |||
257 | |||
258 | /* | 233 | /* |
259 | * Get ip_vs_lblc_entry associated with supplied parameters. | 234 | * Get ip_vs_lblc_entry associated with supplied parameters. |
260 | */ | 235 | */ |
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index ce456dbf09a5..32ba37ba72d8 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c | |||
@@ -39,8 +39,10 @@ | |||
39 | * | 39 | * |
40 | */ | 40 | */ |
41 | 41 | ||
42 | #include <linux/ip.h> | ||
42 | #include <linux/module.h> | 43 | #include <linux/module.h> |
43 | #include <linux/kernel.h> | 44 | #include <linux/kernel.h> |
45 | #include <linux/skbuff.h> | ||
44 | 46 | ||
45 | /* for sysctl */ | 47 | /* for sysctl */ |
46 | #include <linux/fs.h> | 48 | #include <linux/fs.h> |
@@ -414,33 +416,6 @@ ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) | |||
414 | } | 416 | } |
415 | 417 | ||
416 | 418 | ||
417 | #if 0000 | ||
418 | /* | ||
419 | * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table. | ||
420 | * returns bool success. | ||
421 | */ | ||
422 | static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl, | ||
423 | struct ip_vs_lblcr_entry *en) | ||
424 | { | ||
425 | if (list_empty(&en->list)) { | ||
426 | IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, " | ||
427 | "called from %p\n", __builtin_return_address(0)); | ||
428 | return 0; | ||
429 | } | ||
430 | |||
431 | /* | ||
432 | * Remove it from the table | ||
433 | */ | ||
434 | write_lock(&tbl->lock); | ||
435 | list_del(&en->list); | ||
436 | INIT_LIST_HEAD(&en->list); | ||
437 | write_unlock(&tbl->lock); | ||
438 | |||
439 | return 1; | ||
440 | } | ||
441 | #endif | ||
442 | |||
443 | |||
444 | /* | 419 | /* |
445 | * Get ip_vs_lblcr_entry associated with supplied parameters. | 420 | * Get ip_vs_lblcr_entry associated with supplied parameters. |
446 | */ | 421 | */ |
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c index 453e94a0bbd7..8b0505b09317 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah.c | |||
@@ -12,6 +12,8 @@ | |||
12 | * | 12 | * |
13 | */ | 13 | */ |
14 | 14 | ||
15 | #include <linux/in.h> | ||
16 | #include <linux/ip.h> | ||
15 | #include <linux/module.h> | 17 | #include <linux/module.h> |
16 | #include <linux/kernel.h> | 18 | #include <linux/kernel.h> |
17 | #include <linux/netfilter.h> | 19 | #include <linux/netfilter.h> |
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c index 478e5c7c7e8e..c36ccf057a19 100644 --- a/net/ipv4/ipvs/ip_vs_proto_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_esp.c | |||
@@ -12,6 +12,8 @@ | |||
12 | * | 12 | * |
13 | */ | 13 | */ |
14 | 14 | ||
15 | #include <linux/in.h> | ||
16 | #include <linux/ip.h> | ||
15 | #include <linux/module.h> | 17 | #include <linux/module.h> |
16 | #include <linux/kernel.h> | 18 | #include <linux/kernel.h> |
17 | #include <linux/netfilter.h> | 19 | #include <linux/netfilter.h> |
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 0e878fd6215c..bc28b1160a3a 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c | |||
@@ -275,28 +275,6 @@ static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { | |||
275 | [IP_VS_TCP_S_LAST] = 2*HZ, | 275 | [IP_VS_TCP_S_LAST] = 2*HZ, |
276 | }; | 276 | }; |
277 | 277 | ||
278 | |||
279 | #if 0 | ||
280 | |||
281 | /* FIXME: This is going to die */ | ||
282 | |||
283 | static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = { | ||
284 | [IP_VS_TCP_S_NONE] = 2*HZ, | ||
285 | [IP_VS_TCP_S_ESTABLISHED] = 8*60*HZ, | ||
286 | [IP_VS_TCP_S_SYN_SENT] = 60*HZ, | ||
287 | [IP_VS_TCP_S_SYN_RECV] = 10*HZ, | ||
288 | [IP_VS_TCP_S_FIN_WAIT] = 60*HZ, | ||
289 | [IP_VS_TCP_S_TIME_WAIT] = 60*HZ, | ||
290 | [IP_VS_TCP_S_CLOSE] = 10*HZ, | ||
291 | [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, | ||
292 | [IP_VS_TCP_S_LAST_ACK] = 30*HZ, | ||
293 | [IP_VS_TCP_S_LISTEN] = 2*60*HZ, | ||
294 | [IP_VS_TCP_S_SYNACK] = 100*HZ, | ||
295 | [IP_VS_TCP_S_LAST] = 2*HZ, | ||
296 | }; | ||
297 | |||
298 | #endif | ||
299 | |||
300 | static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { | 278 | static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { |
301 | [IP_VS_TCP_S_NONE] = "NONE", | 279 | [IP_VS_TCP_S_NONE] = "NONE", |
302 | [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", | 280 | [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", |
@@ -448,7 +426,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, | |||
448 | struct ip_vs_dest *dest = cp->dest; | 426 | struct ip_vs_dest *dest = cp->dest; |
449 | 427 | ||
450 | IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->" | 428 | IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->" |
451 | "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n", | 429 | "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n", |
452 | pp->name, | 430 | pp->name, |
453 | (state_off==TCP_DIR_OUTPUT)?"output ":"input ", | 431 | (state_off==TCP_DIR_OUTPUT)?"output ":"input ", |
454 | th->syn? 'S' : '.', | 432 | th->syn? 'S' : '.', |
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index 8ae5f2e0aefa..89d9175d8f28 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c | |||
@@ -15,8 +15,11 @@ | |||
15 | * | 15 | * |
16 | */ | 16 | */ |
17 | 17 | ||
18 | #include <linux/in.h> | ||
19 | #include <linux/ip.h> | ||
18 | #include <linux/kernel.h> | 20 | #include <linux/kernel.h> |
19 | #include <linux/netfilter_ipv4.h> | 21 | #include <linux/netfilter_ipv4.h> |
22 | #include <linux/udp.h> | ||
20 | 23 | ||
21 | #include <net/ip_vs.h> | 24 | #include <net/ip_vs.h> |
22 | 25 | ||
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c index 6f7c50e44a39..7775e6cc68be 100644 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ b/net/ipv4/ipvs/ip_vs_sh.c | |||
@@ -34,8 +34,10 @@ | |||
34 | * | 34 | * |
35 | */ | 35 | */ |
36 | 36 | ||
37 | #include <linux/ip.h> | ||
37 | #include <linux/module.h> | 38 | #include <linux/module.h> |
38 | #include <linux/kernel.h> | 39 | #include <linux/kernel.h> |
40 | #include <linux/skbuff.h> | ||
39 | 41 | ||
40 | #include <net/ip_vs.h> | 42 | #include <net/ip_vs.h> |
41 | 43 | ||
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 2e5ced3d8062..1bca714bda3d 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c | |||
@@ -21,12 +21,14 @@ | |||
21 | 21 | ||
22 | #include <linux/module.h> | 22 | #include <linux/module.h> |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include <linux/inetdevice.h> | ||
24 | #include <linux/net.h> | 25 | #include <linux/net.h> |
25 | #include <linux/completion.h> | 26 | #include <linux/completion.h> |
26 | #include <linux/delay.h> | 27 | #include <linux/delay.h> |
27 | #include <linux/skbuff.h> | 28 | #include <linux/skbuff.h> |
28 | #include <linux/in.h> | 29 | #include <linux/in.h> |
29 | #include <linux/igmp.h> /* for ip_mc_join_group */ | 30 | #include <linux/igmp.h> /* for ip_mc_join_group */ |
31 | #include <linux/udp.h> | ||
30 | 32 | ||
31 | #include <net/ip.h> | 33 | #include <net/ip.h> |
32 | #include <net/sock.h> | 34 | #include <net/sock.h> |
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 3c2e9639bba6..bba156304695 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c | |||
@@ -68,19 +68,14 @@ struct arpt_table_info { | |||
68 | unsigned int initial_entries; | 68 | unsigned int initial_entries; |
69 | unsigned int hook_entry[NF_ARP_NUMHOOKS]; | 69 | unsigned int hook_entry[NF_ARP_NUMHOOKS]; |
70 | unsigned int underflow[NF_ARP_NUMHOOKS]; | 70 | unsigned int underflow[NF_ARP_NUMHOOKS]; |
71 | char entries[0] __attribute__((aligned(SMP_CACHE_BYTES))); | 71 | void *entries[NR_CPUS]; |
72 | }; | 72 | }; |
73 | 73 | ||
74 | static LIST_HEAD(arpt_target); | 74 | static LIST_HEAD(arpt_target); |
75 | static LIST_HEAD(arpt_tables); | 75 | static LIST_HEAD(arpt_tables); |
76 | #define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0) | ||
76 | #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) | 77 | #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) |
77 | 78 | ||
78 | #ifdef CONFIG_SMP | ||
79 | #define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) | ||
80 | #else | ||
81 | #define TABLE_OFFSET(t,p) 0 | ||
82 | #endif | ||
83 | |||
84 | static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, | 79 | static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, |
85 | char *hdr_addr, int len) | 80 | char *hdr_addr, int len) |
86 | { | 81 | { |
@@ -269,9 +264,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb, | |||
269 | outdev = out ? out->name : nulldevname; | 264 | outdev = out ? out->name : nulldevname; |
270 | 265 | ||
271 | read_lock_bh(&table->lock); | 266 | read_lock_bh(&table->lock); |
272 | table_base = (void *)table->private->entries | 267 | table_base = (void *)table->private->entries[smp_processor_id()]; |
273 | + TABLE_OFFSET(table->private, | ||
274 | smp_processor_id()); | ||
275 | e = get_entry(table_base, table->private->hook_entry[hook]); | 268 | e = get_entry(table_base, table->private->hook_entry[hook]); |
276 | back = get_entry(table_base, table->private->underflow[hook]); | 269 | back = get_entry(table_base, table->private->underflow[hook]); |
277 | 270 | ||
@@ -462,7 +455,8 @@ static inline int unconditional(const struct arpt_arp *arp) | |||
462 | /* Figures out from what hook each rule can be called: returns 0 if | 455 | /* Figures out from what hook each rule can be called: returns 0 if |
463 | * there are loops. Puts hook bitmask in comefrom. | 456 | * there are loops. Puts hook bitmask in comefrom. |
464 | */ | 457 | */ |
465 | static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int valid_hooks) | 458 | static int mark_source_chains(struct arpt_table_info *newinfo, |
459 | unsigned int valid_hooks, void *entry0) | ||
466 | { | 460 | { |
467 | unsigned int hook; | 461 | unsigned int hook; |
468 | 462 | ||
@@ -472,7 +466,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali | |||
472 | for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) { | 466 | for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) { |
473 | unsigned int pos = newinfo->hook_entry[hook]; | 467 | unsigned int pos = newinfo->hook_entry[hook]; |
474 | struct arpt_entry *e | 468 | struct arpt_entry *e |
475 | = (struct arpt_entry *)(newinfo->entries + pos); | 469 | = (struct arpt_entry *)(entry0 + pos); |
476 | 470 | ||
477 | if (!(valid_hooks & (1 << hook))) | 471 | if (!(valid_hooks & (1 << hook))) |
478 | continue; | 472 | continue; |
@@ -514,13 +508,13 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali | |||
514 | goto next; | 508 | goto next; |
515 | 509 | ||
516 | e = (struct arpt_entry *) | 510 | e = (struct arpt_entry *) |
517 | (newinfo->entries + pos); | 511 | (entry0 + pos); |
518 | } while (oldpos == pos + e->next_offset); | 512 | } while (oldpos == pos + e->next_offset); |
519 | 513 | ||
520 | /* Move along one */ | 514 | /* Move along one */ |
521 | size = e->next_offset; | 515 | size = e->next_offset; |
522 | e = (struct arpt_entry *) | 516 | e = (struct arpt_entry *) |
523 | (newinfo->entries + pos + size); | 517 | (entry0 + pos + size); |
524 | e->counters.pcnt = pos; | 518 | e->counters.pcnt = pos; |
525 | pos += size; | 519 | pos += size; |
526 | } else { | 520 | } else { |
@@ -537,7 +531,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali | |||
537 | newpos = pos + e->next_offset; | 531 | newpos = pos + e->next_offset; |
538 | } | 532 | } |
539 | e = (struct arpt_entry *) | 533 | e = (struct arpt_entry *) |
540 | (newinfo->entries + newpos); | 534 | (entry0 + newpos); |
541 | e->counters.pcnt = pos; | 535 | e->counters.pcnt = pos; |
542 | pos = newpos; | 536 | pos = newpos; |
543 | } | 537 | } |
@@ -689,6 +683,7 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) | |||
689 | static int translate_table(const char *name, | 683 | static int translate_table(const char *name, |
690 | unsigned int valid_hooks, | 684 | unsigned int valid_hooks, |
691 | struct arpt_table_info *newinfo, | 685 | struct arpt_table_info *newinfo, |
686 | void *entry0, | ||
692 | unsigned int size, | 687 | unsigned int size, |
693 | unsigned int number, | 688 | unsigned int number, |
694 | const unsigned int *hook_entries, | 689 | const unsigned int *hook_entries, |
@@ -710,11 +705,11 @@ static int translate_table(const char *name, | |||
710 | i = 0; | 705 | i = 0; |
711 | 706 | ||
712 | /* Walk through entries, checking offsets. */ | 707 | /* Walk through entries, checking offsets. */ |
713 | ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, | 708 | ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, |
714 | check_entry_size_and_hooks, | 709 | check_entry_size_and_hooks, |
715 | newinfo, | 710 | newinfo, |
716 | newinfo->entries, | 711 | entry0, |
717 | newinfo->entries + size, | 712 | entry0 + size, |
718 | hook_entries, underflows, &i); | 713 | hook_entries, underflows, &i); |
719 | duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); | 714 | duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); |
720 | if (ret != 0) | 715 | if (ret != 0) |
@@ -743,29 +738,26 @@ static int translate_table(const char *name, | |||
743 | } | 738 | } |
744 | } | 739 | } |
745 | 740 | ||
746 | if (!mark_source_chains(newinfo, valid_hooks)) { | 741 | if (!mark_source_chains(newinfo, valid_hooks, entry0)) { |
747 | duprintf("Looping hook\n"); | 742 | duprintf("Looping hook\n"); |
748 | return -ELOOP; | 743 | return -ELOOP; |
749 | } | 744 | } |
750 | 745 | ||
751 | /* Finally, each sanity check must pass */ | 746 | /* Finally, each sanity check must pass */ |
752 | i = 0; | 747 | i = 0; |
753 | ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, | 748 | ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, |
754 | check_entry, name, size, &i); | 749 | check_entry, name, size, &i); |
755 | 750 | ||
756 | if (ret != 0) { | 751 | if (ret != 0) { |
757 | ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, | 752 | ARPT_ENTRY_ITERATE(entry0, newinfo->size, |
758 | cleanup_entry, &i); | 753 | cleanup_entry, &i); |
759 | return ret; | 754 | return ret; |
760 | } | 755 | } |
761 | 756 | ||
762 | /* And one copy for every other CPU */ | 757 | /* And one copy for every other CPU */ |
763 | for_each_cpu(i) { | 758 | for_each_cpu(i) { |
764 | if (i == 0) | 759 | if (newinfo->entries[i] && newinfo->entries[i] != entry0) |
765 | continue; | 760 | memcpy(newinfo->entries[i], entry0, newinfo->size); |
766 | memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i, | ||
767 | newinfo->entries, | ||
768 | SMP_ALIGN(newinfo->size)); | ||
769 | } | 761 | } |
770 | 762 | ||
771 | return ret; | 763 | return ret; |
@@ -807,15 +799,42 @@ static inline int add_entry_to_counter(const struct arpt_entry *e, | |||
807 | return 0; | 799 | return 0; |
808 | } | 800 | } |
809 | 801 | ||
802 | static inline int set_entry_to_counter(const struct arpt_entry *e, | ||
803 | struct arpt_counters total[], | ||
804 | unsigned int *i) | ||
805 | { | ||
806 | SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); | ||
807 | |||
808 | (*i)++; | ||
809 | return 0; | ||
810 | } | ||
811 | |||
810 | static void get_counters(const struct arpt_table_info *t, | 812 | static void get_counters(const struct arpt_table_info *t, |
811 | struct arpt_counters counters[]) | 813 | struct arpt_counters counters[]) |
812 | { | 814 | { |
813 | unsigned int cpu; | 815 | unsigned int cpu; |
814 | unsigned int i; | 816 | unsigned int i; |
817 | unsigned int curcpu; | ||
818 | |||
819 | /* Instead of clearing (by a previous call to memset()) | ||
820 | * the counters and using adds, we set the counters | ||
821 | * with data used by 'current' CPU | ||
822 | * We dont care about preemption here. | ||
823 | */ | ||
824 | curcpu = raw_smp_processor_id(); | ||
825 | |||
826 | i = 0; | ||
827 | ARPT_ENTRY_ITERATE(t->entries[curcpu], | ||
828 | t->size, | ||
829 | set_entry_to_counter, | ||
830 | counters, | ||
831 | &i); | ||
815 | 832 | ||
816 | for_each_cpu(cpu) { | 833 | for_each_cpu(cpu) { |
834 | if (cpu == curcpu) | ||
835 | continue; | ||
817 | i = 0; | 836 | i = 0; |
818 | ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), | 837 | ARPT_ENTRY_ITERATE(t->entries[cpu], |
819 | t->size, | 838 | t->size, |
820 | add_entry_to_counter, | 839 | add_entry_to_counter, |
821 | counters, | 840 | counters, |
@@ -831,6 +850,7 @@ static int copy_entries_to_user(unsigned int total_size, | |||
831 | struct arpt_entry *e; | 850 | struct arpt_entry *e; |
832 | struct arpt_counters *counters; | 851 | struct arpt_counters *counters; |
833 | int ret = 0; | 852 | int ret = 0; |
853 | void *loc_cpu_entry; | ||
834 | 854 | ||
835 | /* We need atomic snapshot of counters: rest doesn't change | 855 | /* We need atomic snapshot of counters: rest doesn't change |
836 | * (other than comefrom, which userspace doesn't care | 856 | * (other than comefrom, which userspace doesn't care |
@@ -843,13 +863,13 @@ static int copy_entries_to_user(unsigned int total_size, | |||
843 | return -ENOMEM; | 863 | return -ENOMEM; |
844 | 864 | ||
845 | /* First, sum counters... */ | 865 | /* First, sum counters... */ |
846 | memset(counters, 0, countersize); | ||
847 | write_lock_bh(&table->lock); | 866 | write_lock_bh(&table->lock); |
848 | get_counters(table->private, counters); | 867 | get_counters(table->private, counters); |
849 | write_unlock_bh(&table->lock); | 868 | write_unlock_bh(&table->lock); |
850 | 869 | ||
851 | /* ... then copy entire thing from CPU 0... */ | 870 | loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; |
852 | if (copy_to_user(userptr, table->private->entries, total_size) != 0) { | 871 | /* ... then copy entire thing ... */ |
872 | if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { | ||
853 | ret = -EFAULT; | 873 | ret = -EFAULT; |
854 | goto free_counters; | 874 | goto free_counters; |
855 | } | 875 | } |
@@ -859,7 +879,7 @@ static int copy_entries_to_user(unsigned int total_size, | |||
859 | for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ | 879 | for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ |
860 | struct arpt_entry_target *t; | 880 | struct arpt_entry_target *t; |
861 | 881 | ||
862 | e = (struct arpt_entry *)(table->private->entries + off); | 882 | e = (struct arpt_entry *)(loc_cpu_entry + off); |
863 | if (copy_to_user(userptr + off | 883 | if (copy_to_user(userptr + off |
864 | + offsetof(struct arpt_entry, counters), | 884 | + offsetof(struct arpt_entry, counters), |
865 | &counters[num], | 885 | &counters[num], |
@@ -911,6 +931,47 @@ static int get_entries(const struct arpt_get_entries *entries, | |||
911 | return ret; | 931 | return ret; |
912 | } | 932 | } |
913 | 933 | ||
934 | static void free_table_info(struct arpt_table_info *info) | ||
935 | { | ||
936 | int cpu; | ||
937 | for_each_cpu(cpu) { | ||
938 | if (info->size <= PAGE_SIZE) | ||
939 | kfree(info->entries[cpu]); | ||
940 | else | ||
941 | vfree(info->entries[cpu]); | ||
942 | } | ||
943 | kfree(info); | ||
944 | } | ||
945 | |||
946 | static struct arpt_table_info *alloc_table_info(unsigned int size) | ||
947 | { | ||
948 | struct arpt_table_info *newinfo; | ||
949 | int cpu; | ||
950 | |||
951 | newinfo = kzalloc(sizeof(struct arpt_table_info), GFP_KERNEL); | ||
952 | if (!newinfo) | ||
953 | return NULL; | ||
954 | |||
955 | newinfo->size = size; | ||
956 | |||
957 | for_each_cpu(cpu) { | ||
958 | if (size <= PAGE_SIZE) | ||
959 | newinfo->entries[cpu] = kmalloc_node(size, | ||
960 | GFP_KERNEL, | ||
961 | cpu_to_node(cpu)); | ||
962 | else | ||
963 | newinfo->entries[cpu] = vmalloc_node(size, | ||
964 | cpu_to_node(cpu)); | ||
965 | |||
966 | if (newinfo->entries[cpu] == NULL) { | ||
967 | free_table_info(newinfo); | ||
968 | return NULL; | ||
969 | } | ||
970 | } | ||
971 | |||
972 | return newinfo; | ||
973 | } | ||
974 | |||
914 | static int do_replace(void __user *user, unsigned int len) | 975 | static int do_replace(void __user *user, unsigned int len) |
915 | { | 976 | { |
916 | int ret; | 977 | int ret; |
@@ -918,6 +979,7 @@ static int do_replace(void __user *user, unsigned int len) | |||
918 | struct arpt_table *t; | 979 | struct arpt_table *t; |
919 | struct arpt_table_info *newinfo, *oldinfo; | 980 | struct arpt_table_info *newinfo, *oldinfo; |
920 | struct arpt_counters *counters; | 981 | struct arpt_counters *counters; |
982 | void *loc_cpu_entry, *loc_cpu_old_entry; | ||
921 | 983 | ||
922 | if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) | 984 | if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) |
923 | return -EFAULT; | 985 | return -EFAULT; |
@@ -930,13 +992,13 @@ static int do_replace(void __user *user, unsigned int len) | |||
930 | if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) | 992 | if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) |
931 | return -ENOMEM; | 993 | return -ENOMEM; |
932 | 994 | ||
933 | newinfo = vmalloc(sizeof(struct arpt_table_info) | 995 | newinfo = alloc_table_info(tmp.size); |
934 | + SMP_ALIGN(tmp.size) * | ||
935 | (highest_possible_processor_id()+1)); | ||
936 | if (!newinfo) | 996 | if (!newinfo) |
937 | return -ENOMEM; | 997 | return -ENOMEM; |
938 | 998 | ||
939 | if (copy_from_user(newinfo->entries, user + sizeof(tmp), | 999 | /* choose the copy that is on our node/cpu */ |
1000 | loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; | ||
1001 | if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), | ||
940 | tmp.size) != 0) { | 1002 | tmp.size) != 0) { |
941 | ret = -EFAULT; | 1003 | ret = -EFAULT; |
942 | goto free_newinfo; | 1004 | goto free_newinfo; |
@@ -947,10 +1009,9 @@ static int do_replace(void __user *user, unsigned int len) | |||
947 | ret = -ENOMEM; | 1009 | ret = -ENOMEM; |
948 | goto free_newinfo; | 1010 | goto free_newinfo; |
949 | } | 1011 | } |
950 | memset(counters, 0, tmp.num_counters * sizeof(struct arpt_counters)); | ||
951 | 1012 | ||
952 | ret = translate_table(tmp.name, tmp.valid_hooks, | 1013 | ret = translate_table(tmp.name, tmp.valid_hooks, |
953 | newinfo, tmp.size, tmp.num_entries, | 1014 | newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, |
954 | tmp.hook_entry, tmp.underflow); | 1015 | tmp.hook_entry, tmp.underflow); |
955 | if (ret != 0) | 1016 | if (ret != 0) |
956 | goto free_newinfo_counters; | 1017 | goto free_newinfo_counters; |
@@ -989,8 +1050,10 @@ static int do_replace(void __user *user, unsigned int len) | |||
989 | /* Get the old counters. */ | 1050 | /* Get the old counters. */ |
990 | get_counters(oldinfo, counters); | 1051 | get_counters(oldinfo, counters); |
991 | /* Decrease module usage counts and free resource */ | 1052 | /* Decrease module usage counts and free resource */ |
992 | ARPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); | 1053 | loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; |
993 | vfree(oldinfo); | 1054 | ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); |
1055 | |||
1056 | free_table_info(oldinfo); | ||
994 | if (copy_to_user(tmp.counters, counters, | 1057 | if (copy_to_user(tmp.counters, counters, |
995 | sizeof(struct arpt_counters) * tmp.num_counters) != 0) | 1058 | sizeof(struct arpt_counters) * tmp.num_counters) != 0) |
996 | ret = -EFAULT; | 1059 | ret = -EFAULT; |
@@ -1002,11 +1065,11 @@ static int do_replace(void __user *user, unsigned int len) | |||
1002 | module_put(t->me); | 1065 | module_put(t->me); |
1003 | up(&arpt_mutex); | 1066 | up(&arpt_mutex); |
1004 | free_newinfo_counters_untrans: | 1067 | free_newinfo_counters_untrans: |
1005 | ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL); | 1068 | ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); |
1006 | free_newinfo_counters: | 1069 | free_newinfo_counters: |
1007 | vfree(counters); | 1070 | vfree(counters); |
1008 | free_newinfo: | 1071 | free_newinfo: |
1009 | vfree(newinfo); | 1072 | free_table_info(newinfo); |
1010 | return ret; | 1073 | return ret; |
1011 | } | 1074 | } |
1012 | 1075 | ||
@@ -1030,6 +1093,7 @@ static int do_add_counters(void __user *user, unsigned int len) | |||
1030 | struct arpt_counters_info tmp, *paddc; | 1093 | struct arpt_counters_info tmp, *paddc; |
1031 | struct arpt_table *t; | 1094 | struct arpt_table *t; |
1032 | int ret = 0; | 1095 | int ret = 0; |
1096 | void *loc_cpu_entry; | ||
1033 | 1097 | ||
1034 | if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) | 1098 | if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) |
1035 | return -EFAULT; | 1099 | return -EFAULT; |
@@ -1059,7 +1123,9 @@ static int do_add_counters(void __user *user, unsigned int len) | |||
1059 | } | 1123 | } |
1060 | 1124 | ||
1061 | i = 0; | 1125 | i = 0; |
1062 | ARPT_ENTRY_ITERATE(t->private->entries, | 1126 | /* Choose the copy that is on our node */ |
1127 | loc_cpu_entry = t->private->entries[smp_processor_id()]; | ||
1128 | ARPT_ENTRY_ITERATE(loc_cpu_entry, | ||
1063 | t->private->size, | 1129 | t->private->size, |
1064 | add_counter_to_entry, | 1130 | add_counter_to_entry, |
1065 | paddc->counters, | 1131 | paddc->counters, |
@@ -1220,30 +1286,32 @@ int arpt_register_table(struct arpt_table *table, | |||
1220 | struct arpt_table_info *newinfo; | 1286 | struct arpt_table_info *newinfo; |
1221 | static struct arpt_table_info bootstrap | 1287 | static struct arpt_table_info bootstrap |
1222 | = { 0, 0, 0, { 0 }, { 0 }, { } }; | 1288 | = { 0, 0, 0, { 0 }, { 0 }, { } }; |
1289 | void *loc_cpu_entry; | ||
1223 | 1290 | ||
1224 | newinfo = vmalloc(sizeof(struct arpt_table_info) | 1291 | newinfo = alloc_table_info(repl->size); |
1225 | + SMP_ALIGN(repl->size) * | ||
1226 | (highest_possible_processor_id()+1)); | ||
1227 | if (!newinfo) { | 1292 | if (!newinfo) { |
1228 | ret = -ENOMEM; | 1293 | ret = -ENOMEM; |
1229 | return ret; | 1294 | return ret; |
1230 | } | 1295 | } |
1231 | memcpy(newinfo->entries, repl->entries, repl->size); | 1296 | |
1297 | /* choose the copy on our node/cpu */ | ||
1298 | loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; | ||
1299 | memcpy(loc_cpu_entry, repl->entries, repl->size); | ||
1232 | 1300 | ||
1233 | ret = translate_table(table->name, table->valid_hooks, | 1301 | ret = translate_table(table->name, table->valid_hooks, |
1234 | newinfo, repl->size, | 1302 | newinfo, loc_cpu_entry, repl->size, |
1235 | repl->num_entries, | 1303 | repl->num_entries, |
1236 | repl->hook_entry, | 1304 | repl->hook_entry, |
1237 | repl->underflow); | 1305 | repl->underflow); |
1238 | duprintf("arpt_register_table: translate table gives %d\n", ret); | 1306 | duprintf("arpt_register_table: translate table gives %d\n", ret); |
1239 | if (ret != 0) { | 1307 | if (ret != 0) { |
1240 | vfree(newinfo); | 1308 | free_table_info(newinfo); |
1241 | return ret; | 1309 | return ret; |
1242 | } | 1310 | } |
1243 | 1311 | ||
1244 | ret = down_interruptible(&arpt_mutex); | 1312 | ret = down_interruptible(&arpt_mutex); |
1245 | if (ret != 0) { | 1313 | if (ret != 0) { |
1246 | vfree(newinfo); | 1314 | free_table_info(newinfo); |
1247 | return ret; | 1315 | return ret; |
1248 | } | 1316 | } |
1249 | 1317 | ||
@@ -1272,20 +1340,23 @@ int arpt_register_table(struct arpt_table *table, | |||
1272 | return ret; | 1340 | return ret; |
1273 | 1341 | ||
1274 | free_unlock: | 1342 | free_unlock: |
1275 | vfree(newinfo); | 1343 | free_table_info(newinfo); |
1276 | goto unlock; | 1344 | goto unlock; |
1277 | } | 1345 | } |
1278 | 1346 | ||
1279 | void arpt_unregister_table(struct arpt_table *table) | 1347 | void arpt_unregister_table(struct arpt_table *table) |
1280 | { | 1348 | { |
1349 | void *loc_cpu_entry; | ||
1350 | |||
1281 | down(&arpt_mutex); | 1351 | down(&arpt_mutex); |
1282 | LIST_DELETE(&arpt_tables, table); | 1352 | LIST_DELETE(&arpt_tables, table); |
1283 | up(&arpt_mutex); | 1353 | up(&arpt_mutex); |
1284 | 1354 | ||
1285 | /* Decrease module usage counts and free resources */ | 1355 | /* Decrease module usage counts and free resources */ |
1286 | ARPT_ENTRY_ITERATE(table->private->entries, table->private->size, | 1356 | loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; |
1357 | ARPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size, | ||
1287 | cleanup_entry, NULL); | 1358 | cleanup_entry, NULL); |
1288 | vfree(table->private); | 1359 | free_table_info(table->private); |
1289 | } | 1360 | } |
1290 | 1361 | ||
1291 | /* The built-in targets: standard (NULL) and error. */ | 1362 | /* The built-in targets: standard (NULL) and error. */ |
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index e52847fa10f5..0366eedb4d70 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c | |||
@@ -18,11 +18,13 @@ | |||
18 | * | 18 | * |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/in.h> | ||
21 | #include <linux/kernel.h> | 22 | #include <linux/kernel.h> |
22 | #include <linux/module.h> | 23 | #include <linux/module.h> |
23 | #include <linux/netfilter.h> | 24 | #include <linux/netfilter.h> |
24 | #include <linux/ip.h> | 25 | #include <linux/ip.h> |
25 | #include <linux/moduleparam.h> | 26 | #include <linux/moduleparam.h> |
27 | #include <linux/udp.h> | ||
26 | #include <net/checksum.h> | 28 | #include <net/checksum.h> |
27 | #include <net/udp.h> | 29 | #include <net/udp.h> |
28 | 30 | ||
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c index 744abb9d377a..57956dee60c8 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/ip.h> | 31 | #include <linux/ip.h> |
32 | #include <linux/in.h> | 32 | #include <linux/in.h> |
33 | #include <linux/list.h> | 33 | #include <linux/list.h> |
34 | #include <linux/seq_file.h> | ||
34 | 35 | ||
35 | static DEFINE_RWLOCK(ip_ct_gre_lock); | 36 | static DEFINE_RWLOCK(ip_ct_gre_lock); |
36 | #define ASSERT_READ_LOCK(x) | 37 | #define ASSERT_READ_LOCK(x) |
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index f2dcac7c7660..46becbe4fe58 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/timer.h> | 11 | #include <linux/timer.h> |
12 | #include <linux/netfilter.h> | 12 | #include <linux/netfilter.h> |
13 | #include <linux/in.h> | 13 | #include <linux/in.h> |
14 | #include <linux/ip.h> | ||
14 | #include <linux/udp.h> | 15 | #include <linux/udp.h> |
15 | #include <linux/seq_file.h> | 16 | #include <linux/seq_file.h> |
16 | #include <net/checksum.h> | 17 | #include <net/checksum.h> |
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index dd476b191f4b..a88bcc551244 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #endif | 27 | #endif |
28 | #include <net/checksum.h> | 28 | #include <net/checksum.h> |
29 | #include <net/ip.h> | 29 | #include <net/ip.h> |
30 | #include <net/route.h> | ||
30 | 31 | ||
31 | #define ASSERT_READ_LOCK(x) | 32 | #define ASSERT_READ_LOCK(x) |
32 | #define ASSERT_WRITE_LOCK(x) | 33 | #define ASSERT_WRITE_LOCK(x) |
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index 8acb7ed40b47..4f95d477805c 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c | |||
@@ -44,6 +44,7 @@ | |||
44 | * | 44 | * |
45 | */ | 45 | */ |
46 | #include <linux/config.h> | 46 | #include <linux/config.h> |
47 | #include <linux/in.h> | ||
47 | #include <linux/module.h> | 48 | #include <linux/module.h> |
48 | #include <linux/types.h> | 49 | #include <linux/types.h> |
49 | #include <linux/kernel.h> | 50 | #include <linux/kernel.h> |
@@ -53,6 +54,7 @@ | |||
53 | #include <linux/netfilter_ipv4/ip_conntrack_helper.h> | 54 | #include <linux/netfilter_ipv4/ip_conntrack_helper.h> |
54 | #include <linux/netfilter_ipv4/ip_nat_helper.h> | 55 | #include <linux/netfilter_ipv4/ip_nat_helper.h> |
55 | #include <linux/ip.h> | 56 | #include <linux/ip.h> |
57 | #include <linux/udp.h> | ||
56 | #include <net/checksum.h> | 58 | #include <net/checksum.h> |
57 | #include <net/udp.h> | 59 | #include <net/udp.h> |
58 | #include <asm/uaccess.h> | 60 | #include <asm/uaccess.h> |
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 45886c8475e8..2a26d167e149 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c | |||
@@ -83,11 +83,6 @@ static DECLARE_MUTEX(ipt_mutex); | |||
83 | context stops packets coming through and allows user context to read | 83 | context stops packets coming through and allows user context to read |
84 | the counters or update the rules. | 84 | the counters or update the rules. |
85 | 85 | ||
86 | To be cache friendly on SMP, we arrange them like so: | ||
87 | [ n-entries ] | ||
88 | ... cache-align padding ... | ||
89 | [ n-entries ] | ||
90 | |||
91 | Hence the start of any table is given by get_table() below. */ | 86 | Hence the start of any table is given by get_table() below. */ |
92 | 87 | ||
93 | /* The table itself */ | 88 | /* The table itself */ |
@@ -105,20 +100,15 @@ struct ipt_table_info | |||
105 | unsigned int underflow[NF_IP_NUMHOOKS]; | 100 | unsigned int underflow[NF_IP_NUMHOOKS]; |
106 | 101 | ||
107 | /* ipt_entry tables: one per CPU */ | 102 | /* ipt_entry tables: one per CPU */ |
108 | char entries[0] ____cacheline_aligned; | 103 | void *entries[NR_CPUS]; |
109 | }; | 104 | }; |
110 | 105 | ||
111 | static LIST_HEAD(ipt_target); | 106 | static LIST_HEAD(ipt_target); |
112 | static LIST_HEAD(ipt_match); | 107 | static LIST_HEAD(ipt_match); |
113 | static LIST_HEAD(ipt_tables); | 108 | static LIST_HEAD(ipt_tables); |
109 | #define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0) | ||
114 | #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) | 110 | #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) |
115 | 111 | ||
116 | #ifdef CONFIG_SMP | ||
117 | #define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) | ||
118 | #else | ||
119 | #define TABLE_OFFSET(t,p) 0 | ||
120 | #endif | ||
121 | |||
122 | #if 0 | 112 | #if 0 |
123 | #define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) | 113 | #define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) |
124 | #define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) | 114 | #define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) |
@@ -290,8 +280,7 @@ ipt_do_table(struct sk_buff **pskb, | |||
290 | 280 | ||
291 | read_lock_bh(&table->lock); | 281 | read_lock_bh(&table->lock); |
292 | IP_NF_ASSERT(table->valid_hooks & (1 << hook)); | 282 | IP_NF_ASSERT(table->valid_hooks & (1 << hook)); |
293 | table_base = (void *)table->private->entries | 283 | table_base = (void *)table->private->entries[smp_processor_id()]; |
294 | + TABLE_OFFSET(table->private, smp_processor_id()); | ||
295 | e = get_entry(table_base, table->private->hook_entry[hook]); | 284 | e = get_entry(table_base, table->private->hook_entry[hook]); |
296 | 285 | ||
297 | #ifdef CONFIG_NETFILTER_DEBUG | 286 | #ifdef CONFIG_NETFILTER_DEBUG |
@@ -563,7 +552,8 @@ unconditional(const struct ipt_ip *ip) | |||
563 | /* Figures out from what hook each rule can be called: returns 0 if | 552 | /* Figures out from what hook each rule can be called: returns 0 if |
564 | there are loops. Puts hook bitmask in comefrom. */ | 553 | there are loops. Puts hook bitmask in comefrom. */ |
565 | static int | 554 | static int |
566 | mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) | 555 | mark_source_chains(struct ipt_table_info *newinfo, |
556 | unsigned int valid_hooks, void *entry0) | ||
567 | { | 557 | { |
568 | unsigned int hook; | 558 | unsigned int hook; |
569 | 559 | ||
@@ -572,7 +562,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) | |||
572 | for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) { | 562 | for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) { |
573 | unsigned int pos = newinfo->hook_entry[hook]; | 563 | unsigned int pos = newinfo->hook_entry[hook]; |
574 | struct ipt_entry *e | 564 | struct ipt_entry *e |
575 | = (struct ipt_entry *)(newinfo->entries + pos); | 565 | = (struct ipt_entry *)(entry0 + pos); |
576 | 566 | ||
577 | if (!(valid_hooks & (1 << hook))) | 567 | if (!(valid_hooks & (1 << hook))) |
578 | continue; | 568 | continue; |
@@ -622,13 +612,13 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) | |||
622 | goto next; | 612 | goto next; |
623 | 613 | ||
624 | e = (struct ipt_entry *) | 614 | e = (struct ipt_entry *) |
625 | (newinfo->entries + pos); | 615 | (entry0 + pos); |
626 | } while (oldpos == pos + e->next_offset); | 616 | } while (oldpos == pos + e->next_offset); |
627 | 617 | ||
628 | /* Move along one */ | 618 | /* Move along one */ |
629 | size = e->next_offset; | 619 | size = e->next_offset; |
630 | e = (struct ipt_entry *) | 620 | e = (struct ipt_entry *) |
631 | (newinfo->entries + pos + size); | 621 | (entry0 + pos + size); |
632 | e->counters.pcnt = pos; | 622 | e->counters.pcnt = pos; |
633 | pos += size; | 623 | pos += size; |
634 | } else { | 624 | } else { |
@@ -645,7 +635,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) | |||
645 | newpos = pos + e->next_offset; | 635 | newpos = pos + e->next_offset; |
646 | } | 636 | } |
647 | e = (struct ipt_entry *) | 637 | e = (struct ipt_entry *) |
648 | (newinfo->entries + newpos); | 638 | (entry0 + newpos); |
649 | e->counters.pcnt = pos; | 639 | e->counters.pcnt = pos; |
650 | pos = newpos; | 640 | pos = newpos; |
651 | } | 641 | } |
@@ -855,6 +845,7 @@ static int | |||
855 | translate_table(const char *name, | 845 | translate_table(const char *name, |
856 | unsigned int valid_hooks, | 846 | unsigned int valid_hooks, |
857 | struct ipt_table_info *newinfo, | 847 | struct ipt_table_info *newinfo, |
848 | void *entry0, | ||
858 | unsigned int size, | 849 | unsigned int size, |
859 | unsigned int number, | 850 | unsigned int number, |
860 | const unsigned int *hook_entries, | 851 | const unsigned int *hook_entries, |
@@ -875,11 +866,11 @@ translate_table(const char *name, | |||
875 | duprintf("translate_table: size %u\n", newinfo->size); | 866 | duprintf("translate_table: size %u\n", newinfo->size); |
876 | i = 0; | 867 | i = 0; |
877 | /* Walk through entries, checking offsets. */ | 868 | /* Walk through entries, checking offsets. */ |
878 | ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, | 869 | ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, |
879 | check_entry_size_and_hooks, | 870 | check_entry_size_and_hooks, |
880 | newinfo, | 871 | newinfo, |
881 | newinfo->entries, | 872 | entry0, |
882 | newinfo->entries + size, | 873 | entry0 + size, |
883 | hook_entries, underflows, &i); | 874 | hook_entries, underflows, &i); |
884 | if (ret != 0) | 875 | if (ret != 0) |
885 | return ret; | 876 | return ret; |
@@ -907,27 +898,24 @@ translate_table(const char *name, | |||
907 | } | 898 | } |
908 | } | 899 | } |
909 | 900 | ||
910 | if (!mark_source_chains(newinfo, valid_hooks)) | 901 | if (!mark_source_chains(newinfo, valid_hooks, entry0)) |
911 | return -ELOOP; | 902 | return -ELOOP; |
912 | 903 | ||
913 | /* Finally, each sanity check must pass */ | 904 | /* Finally, each sanity check must pass */ |
914 | i = 0; | 905 | i = 0; |
915 | ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, | 906 | ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, |
916 | check_entry, name, size, &i); | 907 | check_entry, name, size, &i); |
917 | 908 | ||
918 | if (ret != 0) { | 909 | if (ret != 0) { |
919 | IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, | 910 | IPT_ENTRY_ITERATE(entry0, newinfo->size, |
920 | cleanup_entry, &i); | 911 | cleanup_entry, &i); |
921 | return ret; | 912 | return ret; |
922 | } | 913 | } |
923 | 914 | ||
924 | /* And one copy for every other CPU */ | 915 | /* And one copy for every other CPU */ |
925 | for_each_cpu(i) { | 916 | for_each_cpu(i) { |
926 | if (i == 0) | 917 | if (newinfo->entries[i] && newinfo->entries[i] != entry0) |
927 | continue; | 918 | memcpy(newinfo->entries[i], entry0, newinfo->size); |
928 | memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i, | ||
929 | newinfo->entries, | ||
930 | SMP_ALIGN(newinfo->size)); | ||
931 | } | 919 | } |
932 | 920 | ||
933 | return ret; | 921 | return ret; |
@@ -943,15 +931,12 @@ replace_table(struct ipt_table *table, | |||
943 | 931 | ||
944 | #ifdef CONFIG_NETFILTER_DEBUG | 932 | #ifdef CONFIG_NETFILTER_DEBUG |
945 | { | 933 | { |
946 | struct ipt_entry *table_base; | 934 | int cpu; |
947 | unsigned int i; | ||
948 | 935 | ||
949 | for_each_cpu(i) { | 936 | for_each_cpu(cpu) { |
950 | table_base = | 937 | struct ipt_entry *table_base = newinfo->entries[cpu]; |
951 | (void *)newinfo->entries | 938 | if (table_base) |
952 | + TABLE_OFFSET(newinfo, i); | 939 | table_base->comefrom = 0xdead57ac; |
953 | |||
954 | table_base->comefrom = 0xdead57ac; | ||
955 | } | 940 | } |
956 | } | 941 | } |
957 | #endif | 942 | #endif |
@@ -986,16 +971,44 @@ add_entry_to_counter(const struct ipt_entry *e, | |||
986 | return 0; | 971 | return 0; |
987 | } | 972 | } |
988 | 973 | ||
974 | static inline int | ||
975 | set_entry_to_counter(const struct ipt_entry *e, | ||
976 | struct ipt_counters total[], | ||
977 | unsigned int *i) | ||
978 | { | ||
979 | SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); | ||
980 | |||
981 | (*i)++; | ||
982 | return 0; | ||
983 | } | ||
984 | |||
989 | static void | 985 | static void |
990 | get_counters(const struct ipt_table_info *t, | 986 | get_counters(const struct ipt_table_info *t, |
991 | struct ipt_counters counters[]) | 987 | struct ipt_counters counters[]) |
992 | { | 988 | { |
993 | unsigned int cpu; | 989 | unsigned int cpu; |
994 | unsigned int i; | 990 | unsigned int i; |
991 | unsigned int curcpu; | ||
992 | |||
993 | /* Instead of clearing (by a previous call to memset()) | ||
994 | * the counters and using adds, we set the counters | ||
995 | * with data used by 'current' CPU | ||
996 | * We dont care about preemption here. | ||
997 | */ | ||
998 | curcpu = raw_smp_processor_id(); | ||
999 | |||
1000 | i = 0; | ||
1001 | IPT_ENTRY_ITERATE(t->entries[curcpu], | ||
1002 | t->size, | ||
1003 | set_entry_to_counter, | ||
1004 | counters, | ||
1005 | &i); | ||
995 | 1006 | ||
996 | for_each_cpu(cpu) { | 1007 | for_each_cpu(cpu) { |
1008 | if (cpu == curcpu) | ||
1009 | continue; | ||
997 | i = 0; | 1010 | i = 0; |
998 | IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), | 1011 | IPT_ENTRY_ITERATE(t->entries[cpu], |
999 | t->size, | 1012 | t->size, |
1000 | add_entry_to_counter, | 1013 | add_entry_to_counter, |
1001 | counters, | 1014 | counters, |
@@ -1012,24 +1025,29 @@ copy_entries_to_user(unsigned int total_size, | |||
1012 | struct ipt_entry *e; | 1025 | struct ipt_entry *e; |
1013 | struct ipt_counters *counters; | 1026 | struct ipt_counters *counters; |
1014 | int ret = 0; | 1027 | int ret = 0; |
1028 | void *loc_cpu_entry; | ||
1015 | 1029 | ||
1016 | /* We need atomic snapshot of counters: rest doesn't change | 1030 | /* We need atomic snapshot of counters: rest doesn't change |
1017 | (other than comefrom, which userspace doesn't care | 1031 | (other than comefrom, which userspace doesn't care |
1018 | about). */ | 1032 | about). */ |
1019 | countersize = sizeof(struct ipt_counters) * table->private->number; | 1033 | countersize = sizeof(struct ipt_counters) * table->private->number; |
1020 | counters = vmalloc(countersize); | 1034 | counters = vmalloc_node(countersize, numa_node_id()); |
1021 | 1035 | ||
1022 | if (counters == NULL) | 1036 | if (counters == NULL) |
1023 | return -ENOMEM; | 1037 | return -ENOMEM; |
1024 | 1038 | ||
1025 | /* First, sum counters... */ | 1039 | /* First, sum counters... */ |
1026 | memset(counters, 0, countersize); | ||
1027 | write_lock_bh(&table->lock); | 1040 | write_lock_bh(&table->lock); |
1028 | get_counters(table->private, counters); | 1041 | get_counters(table->private, counters); |
1029 | write_unlock_bh(&table->lock); | 1042 | write_unlock_bh(&table->lock); |
1030 | 1043 | ||
1031 | /* ... then copy entire thing from CPU 0... */ | 1044 | /* choose the copy that is on our node/cpu, ... |
1032 | if (copy_to_user(userptr, table->private->entries, total_size) != 0) { | 1045 | * This choice is lazy (because current thread is |
1046 | * allowed to migrate to another cpu) | ||
1047 | */ | ||
1048 | loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; | ||
1049 | /* ... then copy entire thing ... */ | ||
1050 | if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { | ||
1033 | ret = -EFAULT; | 1051 | ret = -EFAULT; |
1034 | goto free_counters; | 1052 | goto free_counters; |
1035 | } | 1053 | } |
@@ -1041,7 +1059,7 @@ copy_entries_to_user(unsigned int total_size, | |||
1041 | struct ipt_entry_match *m; | 1059 | struct ipt_entry_match *m; |
1042 | struct ipt_entry_target *t; | 1060 | struct ipt_entry_target *t; |
1043 | 1061 | ||
1044 | e = (struct ipt_entry *)(table->private->entries + off); | 1062 | e = (struct ipt_entry *)(loc_cpu_entry + off); |
1045 | if (copy_to_user(userptr + off | 1063 | if (copy_to_user(userptr + off |
1046 | + offsetof(struct ipt_entry, counters), | 1064 | + offsetof(struct ipt_entry, counters), |
1047 | &counters[num], | 1065 | &counters[num], |
@@ -1110,6 +1128,45 @@ get_entries(const struct ipt_get_entries *entries, | |||
1110 | return ret; | 1128 | return ret; |
1111 | } | 1129 | } |
1112 | 1130 | ||
1131 | static void free_table_info(struct ipt_table_info *info) | ||
1132 | { | ||
1133 | int cpu; | ||
1134 | for_each_cpu(cpu) { | ||
1135 | if (info->size <= PAGE_SIZE) | ||
1136 | kfree(info->entries[cpu]); | ||
1137 | else | ||
1138 | vfree(info->entries[cpu]); | ||
1139 | } | ||
1140 | kfree(info); | ||
1141 | } | ||
1142 | |||
1143 | static struct ipt_table_info *alloc_table_info(unsigned int size) | ||
1144 | { | ||
1145 | struct ipt_table_info *newinfo; | ||
1146 | int cpu; | ||
1147 | |||
1148 | newinfo = kzalloc(sizeof(struct ipt_table_info), GFP_KERNEL); | ||
1149 | if (!newinfo) | ||
1150 | return NULL; | ||
1151 | |||
1152 | newinfo->size = size; | ||
1153 | |||
1154 | for_each_cpu(cpu) { | ||
1155 | if (size <= PAGE_SIZE) | ||
1156 | newinfo->entries[cpu] = kmalloc_node(size, | ||
1157 | GFP_KERNEL, | ||
1158 | cpu_to_node(cpu)); | ||
1159 | else | ||
1160 | newinfo->entries[cpu] = vmalloc_node(size, cpu_to_node(cpu)); | ||
1161 | if (newinfo->entries[cpu] == 0) { | ||
1162 | free_table_info(newinfo); | ||
1163 | return NULL; | ||
1164 | } | ||
1165 | } | ||
1166 | |||
1167 | return newinfo; | ||
1168 | } | ||
1169 | |||
1113 | static int | 1170 | static int |
1114 | do_replace(void __user *user, unsigned int len) | 1171 | do_replace(void __user *user, unsigned int len) |
1115 | { | 1172 | { |
@@ -1118,6 +1175,7 @@ do_replace(void __user *user, unsigned int len) | |||
1118 | struct ipt_table *t; | 1175 | struct ipt_table *t; |
1119 | struct ipt_table_info *newinfo, *oldinfo; | 1176 | struct ipt_table_info *newinfo, *oldinfo; |
1120 | struct ipt_counters *counters; | 1177 | struct ipt_counters *counters; |
1178 | void *loc_cpu_entry, *loc_cpu_old_entry; | ||
1121 | 1179 | ||
1122 | if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) | 1180 | if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) |
1123 | return -EFAULT; | 1181 | return -EFAULT; |
@@ -1130,13 +1188,13 @@ do_replace(void __user *user, unsigned int len) | |||
1130 | if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) | 1188 | if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) |
1131 | return -ENOMEM; | 1189 | return -ENOMEM; |
1132 | 1190 | ||
1133 | newinfo = vmalloc(sizeof(struct ipt_table_info) | 1191 | newinfo = alloc_table_info(tmp.size); |
1134 | + SMP_ALIGN(tmp.size) * | ||
1135 | (highest_possible_processor_id()+1)); | ||
1136 | if (!newinfo) | 1192 | if (!newinfo) |
1137 | return -ENOMEM; | 1193 | return -ENOMEM; |
1138 | 1194 | ||
1139 | if (copy_from_user(newinfo->entries, user + sizeof(tmp), | 1195 | /* choose the copy that is our node/cpu */ |
1196 | loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; | ||
1197 | if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), | ||
1140 | tmp.size) != 0) { | 1198 | tmp.size) != 0) { |
1141 | ret = -EFAULT; | 1199 | ret = -EFAULT; |
1142 | goto free_newinfo; | 1200 | goto free_newinfo; |
@@ -1147,10 +1205,9 @@ do_replace(void __user *user, unsigned int len) | |||
1147 | ret = -ENOMEM; | 1205 | ret = -ENOMEM; |
1148 | goto free_newinfo; | 1206 | goto free_newinfo; |
1149 | } | 1207 | } |
1150 | memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters)); | ||
1151 | 1208 | ||
1152 | ret = translate_table(tmp.name, tmp.valid_hooks, | 1209 | ret = translate_table(tmp.name, tmp.valid_hooks, |
1153 | newinfo, tmp.size, tmp.num_entries, | 1210 | newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, |
1154 | tmp.hook_entry, tmp.underflow); | 1211 | tmp.hook_entry, tmp.underflow); |
1155 | if (ret != 0) | 1212 | if (ret != 0) |
1156 | goto free_newinfo_counters; | 1213 | goto free_newinfo_counters; |
@@ -1189,8 +1246,9 @@ do_replace(void __user *user, unsigned int len) | |||
1189 | /* Get the old counters. */ | 1246 | /* Get the old counters. */ |
1190 | get_counters(oldinfo, counters); | 1247 | get_counters(oldinfo, counters); |
1191 | /* Decrease module usage counts and free resource */ | 1248 | /* Decrease module usage counts and free resource */ |
1192 | IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); | 1249 | loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; |
1193 | vfree(oldinfo); | 1250 | IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); |
1251 | free_table_info(oldinfo); | ||
1194 | if (copy_to_user(tmp.counters, counters, | 1252 | if (copy_to_user(tmp.counters, counters, |
1195 | sizeof(struct ipt_counters) * tmp.num_counters) != 0) | 1253 | sizeof(struct ipt_counters) * tmp.num_counters) != 0) |
1196 | ret = -EFAULT; | 1254 | ret = -EFAULT; |
@@ -1202,11 +1260,11 @@ do_replace(void __user *user, unsigned int len) | |||
1202 | module_put(t->me); | 1260 | module_put(t->me); |
1203 | up(&ipt_mutex); | 1261 | up(&ipt_mutex); |
1204 | free_newinfo_counters_untrans: | 1262 | free_newinfo_counters_untrans: |
1205 | IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); | 1263 | IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); |
1206 | free_newinfo_counters: | 1264 | free_newinfo_counters: |
1207 | vfree(counters); | 1265 | vfree(counters); |
1208 | free_newinfo: | 1266 | free_newinfo: |
1209 | vfree(newinfo); | 1267 | free_table_info(newinfo); |
1210 | return ret; | 1268 | return ret; |
1211 | } | 1269 | } |
1212 | 1270 | ||
@@ -1239,6 +1297,7 @@ do_add_counters(void __user *user, unsigned int len) | |||
1239 | struct ipt_counters_info tmp, *paddc; | 1297 | struct ipt_counters_info tmp, *paddc; |
1240 | struct ipt_table *t; | 1298 | struct ipt_table *t; |
1241 | int ret = 0; | 1299 | int ret = 0; |
1300 | void *loc_cpu_entry; | ||
1242 | 1301 | ||
1243 | if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) | 1302 | if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) |
1244 | return -EFAULT; | 1303 | return -EFAULT; |
@@ -1246,7 +1305,7 @@ do_add_counters(void __user *user, unsigned int len) | |||
1246 | if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters)) | 1305 | if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters)) |
1247 | return -EINVAL; | 1306 | return -EINVAL; |
1248 | 1307 | ||
1249 | paddc = vmalloc(len); | 1308 | paddc = vmalloc_node(len, numa_node_id()); |
1250 | if (!paddc) | 1309 | if (!paddc) |
1251 | return -ENOMEM; | 1310 | return -ENOMEM; |
1252 | 1311 | ||
@@ -1268,7 +1327,9 @@ do_add_counters(void __user *user, unsigned int len) | |||
1268 | } | 1327 | } |
1269 | 1328 | ||
1270 | i = 0; | 1329 | i = 0; |
1271 | IPT_ENTRY_ITERATE(t->private->entries, | 1330 | /* Choose the copy that is on our node */ |
1331 | loc_cpu_entry = t->private->entries[raw_smp_processor_id()]; | ||
1332 | IPT_ENTRY_ITERATE(loc_cpu_entry, | ||
1272 | t->private->size, | 1333 | t->private->size, |
1273 | add_counter_to_entry, | 1334 | add_counter_to_entry, |
1274 | paddc->counters, | 1335 | paddc->counters, |
@@ -1460,28 +1521,31 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl) | |||
1460 | struct ipt_table_info *newinfo; | 1521 | struct ipt_table_info *newinfo; |
1461 | static struct ipt_table_info bootstrap | 1522 | static struct ipt_table_info bootstrap |
1462 | = { 0, 0, 0, { 0 }, { 0 }, { } }; | 1523 | = { 0, 0, 0, { 0 }, { 0 }, { } }; |
1524 | void *loc_cpu_entry; | ||
1463 | 1525 | ||
1464 | newinfo = vmalloc(sizeof(struct ipt_table_info) | 1526 | newinfo = alloc_table_info(repl->size); |
1465 | + SMP_ALIGN(repl->size) * | ||
1466 | (highest_possible_processor_id()+1)); | ||
1467 | if (!newinfo) | 1527 | if (!newinfo) |
1468 | return -ENOMEM; | 1528 | return -ENOMEM; |
1469 | 1529 | ||
1470 | memcpy(newinfo->entries, repl->entries, repl->size); | 1530 | /* choose the copy on our node/cpu |
1531 | * but dont care of preemption | ||
1532 | */ | ||
1533 | loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; | ||
1534 | memcpy(loc_cpu_entry, repl->entries, repl->size); | ||
1471 | 1535 | ||
1472 | ret = translate_table(table->name, table->valid_hooks, | 1536 | ret = translate_table(table->name, table->valid_hooks, |
1473 | newinfo, repl->size, | 1537 | newinfo, loc_cpu_entry, repl->size, |
1474 | repl->num_entries, | 1538 | repl->num_entries, |
1475 | repl->hook_entry, | 1539 | repl->hook_entry, |
1476 | repl->underflow); | 1540 | repl->underflow); |
1477 | if (ret != 0) { | 1541 | if (ret != 0) { |
1478 | vfree(newinfo); | 1542 | free_table_info(newinfo); |
1479 | return ret; | 1543 | return ret; |
1480 | } | 1544 | } |
1481 | 1545 | ||
1482 | ret = down_interruptible(&ipt_mutex); | 1546 | ret = down_interruptible(&ipt_mutex); |
1483 | if (ret != 0) { | 1547 | if (ret != 0) { |
1484 | vfree(newinfo); | 1548 | free_table_info(newinfo); |
1485 | return ret; | 1549 | return ret; |
1486 | } | 1550 | } |
1487 | 1551 | ||
@@ -1510,20 +1574,23 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl) | |||
1510 | return ret; | 1574 | return ret; |
1511 | 1575 | ||
1512 | free_unlock: | 1576 | free_unlock: |
1513 | vfree(newinfo); | 1577 | free_table_info(newinfo); |
1514 | goto unlock; | 1578 | goto unlock; |
1515 | } | 1579 | } |
1516 | 1580 | ||
1517 | void ipt_unregister_table(struct ipt_table *table) | 1581 | void ipt_unregister_table(struct ipt_table *table) |
1518 | { | 1582 | { |
1583 | void *loc_cpu_entry; | ||
1584 | |||
1519 | down(&ipt_mutex); | 1585 | down(&ipt_mutex); |
1520 | LIST_DELETE(&ipt_tables, table); | 1586 | LIST_DELETE(&ipt_tables, table); |
1521 | up(&ipt_mutex); | 1587 | up(&ipt_mutex); |
1522 | 1588 | ||
1523 | /* Decrease module usage counts and free resources */ | 1589 | /* Decrease module usage counts and free resources */ |
1524 | IPT_ENTRY_ITERATE(table->private->entries, table->private->size, | 1590 | loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; |
1591 | IPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size, | ||
1525 | cleanup_entry, NULL); | 1592 | cleanup_entry, NULL); |
1526 | vfree(table->private); | 1593 | free_table_info(table->private); |
1527 | } | 1594 | } |
1528 | 1595 | ||
1529 | /* Returns 1 if the port is matched by the range, 0 otherwise */ | 1596 | /* Returns 1 if the port is matched by the range, 0 otherwise */ |
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 275a174c6fe6..27860510ca6d 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c | |||
@@ -11,6 +11,7 @@ | |||
11 | 11 | ||
12 | #include <linux/config.h> | 12 | #include <linux/config.h> |
13 | #include <linux/types.h> | 13 | #include <linux/types.h> |
14 | #include <linux/inetdevice.h> | ||
14 | #include <linux/ip.h> | 15 | #include <linux/ip.h> |
15 | #include <linux/timer.h> | 16 | #include <linux/timer.h> |
16 | #include <linux/module.h> | 17 | #include <linux/module.h> |
@@ -18,6 +19,7 @@ | |||
18 | #include <net/protocol.h> | 19 | #include <net/protocol.h> |
19 | #include <net/ip.h> | 20 | #include <net/ip.h> |
20 | #include <net/checksum.h> | 21 | #include <net/checksum.h> |
22 | #include <net/route.h> | ||
21 | #include <linux/netfilter_ipv4.h> | 23 | #include <linux/netfilter_ipv4.h> |
22 | #include <linux/netfilter_ipv4/ip_nat_rule.h> | 24 | #include <linux/netfilter_ipv4/ip_nat_rule.h> |
23 | #include <linux/netfilter_ipv4/ip_tables.h> | 25 | #include <linux/netfilter_ipv4/ip_tables.h> |
diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c index 1a53924041fc..03f554857a4d 100644 --- a/net/ipv4/netfilter/ipt_physdev.c +++ b/net/ipv4/netfilter/ipt_physdev.c | |||
@@ -9,6 +9,7 @@ | |||
9 | */ | 9 | */ |
10 | 10 | ||
11 | #include <linux/module.h> | 11 | #include <linux/module.h> |
12 | #include <linux/netdevice.h> | ||
12 | #include <linux/skbuff.h> | 13 | #include <linux/skbuff.h> |
13 | #include <linux/netfilter_ipv4/ipt_physdev.h> | 14 | #include <linux/netfilter_ipv4/ipt_physdev.h> |
14 | #include <linux/netfilter_ipv4/ip_tables.h> | 15 | #include <linux/netfilter_ipv4/ip_tables.h> |
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 0d7dc668db46..39d49dc333a7 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c | |||
@@ -38,6 +38,7 @@ | |||
38 | #include <net/protocol.h> | 38 | #include <net/protocol.h> |
39 | #include <net/tcp.h> | 39 | #include <net/tcp.h> |
40 | #include <net/udp.h> | 40 | #include <net/udp.h> |
41 | #include <linux/inetdevice.h> | ||
41 | #include <linux/proc_fs.h> | 42 | #include <linux/proc_fs.h> |
42 | #include <linux/seq_file.h> | 43 | #include <linux/seq_file.h> |
43 | #include <net/sock.h> | 44 | #include <net/sock.h> |
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index a34e60ea48a1..e20be3331f67 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c | |||
@@ -173,10 +173,10 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, | |||
173 | struct request_sock *req, | 173 | struct request_sock *req, |
174 | struct dst_entry *dst) | 174 | struct dst_entry *dst) |
175 | { | 175 | { |
176 | struct tcp_sock *tp = tcp_sk(sk); | 176 | struct inet_connection_sock *icsk = inet_csk(sk); |
177 | struct sock *child; | 177 | struct sock *child; |
178 | 178 | ||
179 | child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); | 179 | child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); |
180 | if (child) | 180 | if (child) |
181 | inet_csk_reqsk_queue_add(sk, req, child); | 181 | inet_csk_reqsk_queue_add(sk, req, child); |
182 | else | 182 | else |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 01444a02b48b..16984d4a8a06 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/sysctl.h> | 12 | #include <linux/sysctl.h> |
13 | #include <linux/config.h> | 13 | #include <linux/config.h> |
14 | #include <linux/igmp.h> | 14 | #include <linux/igmp.h> |
15 | #include <linux/inetdevice.h> | ||
15 | #include <net/snmp.h> | 16 | #include <net/snmp.h> |
16 | #include <net/icmp.h> | 17 | #include <net/icmp.h> |
17 | #include <net/ip.h> | 18 | #include <net/ip.h> |
@@ -22,6 +23,7 @@ | |||
22 | extern int sysctl_ip_nonlocal_bind; | 23 | extern int sysctl_ip_nonlocal_bind; |
23 | 24 | ||
24 | #ifdef CONFIG_SYSCTL | 25 | #ifdef CONFIG_SYSCTL |
26 | static int zero; | ||
25 | static int tcp_retr1_max = 255; | 27 | static int tcp_retr1_max = 255; |
26 | static int ip_local_port_range_min[] = { 1, 1 }; | 28 | static int ip_local_port_range_min[] = { 1, 1 }; |
27 | static int ip_local_port_range_max[] = { 65535, 65535 }; | 29 | static int ip_local_port_range_max[] = { 65535, 65535 }; |
@@ -614,6 +616,15 @@ ctl_table ipv4_table[] = { | |||
614 | .strategy = &sysctl_jiffies | 616 | .strategy = &sysctl_jiffies |
615 | }, | 617 | }, |
616 | { | 618 | { |
619 | .ctl_name = NET_IPV4_IPFRAG_MAX_DIST, | ||
620 | .procname = "ipfrag_max_dist", | ||
621 | .data = &sysctl_ipfrag_max_dist, | ||
622 | .maxlen = sizeof(int), | ||
623 | .mode = 0644, | ||
624 | .proc_handler = &proc_dointvec_minmax, | ||
625 | .extra1 = &zero | ||
626 | }, | ||
627 | { | ||
617 | .ctl_name = NET_TCP_NO_METRICS_SAVE, | 628 | .ctl_name = NET_TCP_NO_METRICS_SAVE, |
618 | .procname = "tcp_no_metrics_save", | 629 | .procname = "tcp_no_metrics_save", |
619 | .data = &sysctl_tcp_nometrics_save, | 630 | .data = &sysctl_tcp_nometrics_save, |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ef98b14ac56d..00aa80e93243 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -1696,8 +1696,8 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
1696 | int err = 0; | 1696 | int err = 0; |
1697 | 1697 | ||
1698 | if (level != SOL_TCP) | 1698 | if (level != SOL_TCP) |
1699 | return tp->af_specific->setsockopt(sk, level, optname, | 1699 | return icsk->icsk_af_ops->setsockopt(sk, level, optname, |
1700 | optval, optlen); | 1700 | optval, optlen); |
1701 | 1701 | ||
1702 | /* This is a string value all the others are int's */ | 1702 | /* This is a string value all the others are int's */ |
1703 | if (optname == TCP_CONGESTION) { | 1703 | if (optname == TCP_CONGESTION) { |
@@ -1914,7 +1914,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) | |||
1914 | info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); | 1914 | info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); |
1915 | info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); | 1915 | info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); |
1916 | 1916 | ||
1917 | info->tcpi_pmtu = tp->pmtu_cookie; | 1917 | info->tcpi_pmtu = icsk->icsk_pmtu_cookie; |
1918 | info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; | 1918 | info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; |
1919 | info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; | 1919 | info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; |
1920 | info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; | 1920 | info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; |
@@ -1939,8 +1939,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
1939 | int val, len; | 1939 | int val, len; |
1940 | 1940 | ||
1941 | if (level != SOL_TCP) | 1941 | if (level != SOL_TCP) |
1942 | return tp->af_specific->getsockopt(sk, level, optname, | 1942 | return icsk->icsk_af_ops->getsockopt(sk, level, optname, |
1943 | optval, optlen); | 1943 | optval, optlen); |
1944 | 1944 | ||
1945 | if (get_user(len, optlen)) | 1945 | if (get_user(len, optlen)) |
1946 | return -EFAULT; | 1946 | return -EFAULT; |
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 1d0cd86621b1..035f2092d73a 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c | |||
@@ -30,8 +30,6 @@ static int fast_convergence = 1; | |||
30 | static int max_increment = 16; | 30 | static int max_increment = 16; |
31 | static int low_window = 14; | 31 | static int low_window = 14; |
32 | static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ | 32 | static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ |
33 | static int low_utilization_threshold = 153; | ||
34 | static int low_utilization_period = 2; | ||
35 | static int initial_ssthresh = 100; | 33 | static int initial_ssthresh = 100; |
36 | static int smooth_part = 20; | 34 | static int smooth_part = 20; |
37 | 35 | ||
@@ -43,10 +41,6 @@ module_param(low_window, int, 0644); | |||
43 | MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); | 41 | MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); |
44 | module_param(beta, int, 0644); | 42 | module_param(beta, int, 0644); |
45 | MODULE_PARM_DESC(beta, "beta for multiplicative increase"); | 43 | MODULE_PARM_DESC(beta, "beta for multiplicative increase"); |
46 | module_param(low_utilization_threshold, int, 0644); | ||
47 | MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode"); | ||
48 | module_param(low_utilization_period, int, 0644); | ||
49 | MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then goto to low utilization mode (seconds)"); | ||
50 | module_param(initial_ssthresh, int, 0644); | 44 | module_param(initial_ssthresh, int, 0644); |
51 | MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); | 45 | MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); |
52 | module_param(smooth_part, int, 0644); | 46 | module_param(smooth_part, int, 0644); |
@@ -60,11 +54,6 @@ struct bictcp { | |||
60 | u32 loss_cwnd; /* congestion window at last loss */ | 54 | u32 loss_cwnd; /* congestion window at last loss */ |
61 | u32 last_cwnd; /* the last snd_cwnd */ | 55 | u32 last_cwnd; /* the last snd_cwnd */ |
62 | u32 last_time; /* time when updated last_cwnd */ | 56 | u32 last_time; /* time when updated last_cwnd */ |
63 | u32 delay_min; /* min delay */ | ||
64 | u32 delay_max; /* max delay */ | ||
65 | u32 last_delay; | ||
66 | u8 low_utilization;/* 0: high; 1: low */ | ||
67 | u32 low_utilization_start; /* starting time of low utilization detection*/ | ||
68 | u32 epoch_start; /* beginning of an epoch */ | 57 | u32 epoch_start; /* beginning of an epoch */ |
69 | #define ACK_RATIO_SHIFT 4 | 58 | #define ACK_RATIO_SHIFT 4 |
70 | u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ | 59 | u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ |
@@ -77,11 +66,6 @@ static inline void bictcp_reset(struct bictcp *ca) | |||
77 | ca->loss_cwnd = 0; | 66 | ca->loss_cwnd = 0; |
78 | ca->last_cwnd = 0; | 67 | ca->last_cwnd = 0; |
79 | ca->last_time = 0; | 68 | ca->last_time = 0; |
80 | ca->delay_min = 0; | ||
81 | ca->delay_max = 0; | ||
82 | ca->last_delay = 0; | ||
83 | ca->low_utilization = 0; | ||
84 | ca->low_utilization_start = 0; | ||
85 | ca->epoch_start = 0; | 69 | ca->epoch_start = 0; |
86 | ca->delayed_ack = 2 << ACK_RATIO_SHIFT; | 70 | ca->delayed_ack = 2 << ACK_RATIO_SHIFT; |
87 | } | 71 | } |
@@ -143,8 +127,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) | |||
143 | } | 127 | } |
144 | 128 | ||
145 | /* if in slow start or link utilization is very low */ | 129 | /* if in slow start or link utilization is very low */ |
146 | if ( ca->loss_cwnd == 0 || | 130 | if (ca->loss_cwnd == 0) { |
147 | (cwnd > ca->loss_cwnd && ca->low_utilization)) { | ||
148 | if (ca->cnt > 20) /* increase cwnd 5% per RTT */ | 131 | if (ca->cnt > 20) /* increase cwnd 5% per RTT */ |
149 | ca->cnt = 20; | 132 | ca->cnt = 20; |
150 | } | 133 | } |
@@ -154,69 +137,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) | |||
154 | ca->cnt = 1; | 137 | ca->cnt = 1; |
155 | } | 138 | } |
156 | 139 | ||
157 | |||
158 | /* Detect low utilization in congestion avoidance */ | ||
159 | static inline void bictcp_low_utilization(struct sock *sk, int flag) | ||
160 | { | ||
161 | const struct tcp_sock *tp = tcp_sk(sk); | ||
162 | struct bictcp *ca = inet_csk_ca(sk); | ||
163 | u32 dist, delay; | ||
164 | |||
165 | /* No time stamp */ | ||
166 | if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) || | ||
167 | /* Discard delay samples right after fast recovery */ | ||
168 | tcp_time_stamp < ca->epoch_start + HZ || | ||
169 | /* this delay samples may not be accurate */ | ||
170 | flag == 0) { | ||
171 | ca->last_delay = 0; | ||
172 | goto notlow; | ||
173 | } | ||
174 | |||
175 | delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/ | ||
176 | ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr; | ||
177 | if (delay == 0) /* no previous delay sample */ | ||
178 | goto notlow; | ||
179 | |||
180 | /* first time call or link delay decreases */ | ||
181 | if (ca->delay_min == 0 || ca->delay_min > delay) { | ||
182 | ca->delay_min = ca->delay_max = delay; | ||
183 | goto notlow; | ||
184 | } | ||
185 | |||
186 | if (ca->delay_max < delay) | ||
187 | ca->delay_max = delay; | ||
188 | |||
189 | /* utilization is low, if avg delay < dist*threshold | ||
190 | for checking_period time */ | ||
191 | dist = ca->delay_max - ca->delay_min; | ||
192 | if (dist <= ca->delay_min>>6 || | ||
193 | tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10) | ||
194 | goto notlow; | ||
195 | |||
196 | if (ca->low_utilization_start == 0) { | ||
197 | ca->low_utilization = 0; | ||
198 | ca->low_utilization_start = tcp_time_stamp; | ||
199 | } else if ((s32)(tcp_time_stamp - ca->low_utilization_start) | ||
200 | > low_utilization_period*HZ) { | ||
201 | ca->low_utilization = 1; | ||
202 | } | ||
203 | |||
204 | return; | ||
205 | |||
206 | notlow: | ||
207 | ca->low_utilization = 0; | ||
208 | ca->low_utilization_start = 0; | ||
209 | |||
210 | } | ||
211 | |||
212 | static void bictcp_cong_avoid(struct sock *sk, u32 ack, | 140 | static void bictcp_cong_avoid(struct sock *sk, u32 ack, |
213 | u32 seq_rtt, u32 in_flight, int data_acked) | 141 | u32 seq_rtt, u32 in_flight, int data_acked) |
214 | { | 142 | { |
215 | struct tcp_sock *tp = tcp_sk(sk); | 143 | struct tcp_sock *tp = tcp_sk(sk); |
216 | struct bictcp *ca = inet_csk_ca(sk); | 144 | struct bictcp *ca = inet_csk_ca(sk); |
217 | 145 | ||
218 | bictcp_low_utilization(sk, data_acked); | ||
219 | |||
220 | if (!tcp_is_cwnd_limited(sk, in_flight)) | 146 | if (!tcp_is_cwnd_limited(sk, in_flight)) |
221 | return; | 147 | return; |
222 | 148 | ||
@@ -249,11 +175,6 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk) | |||
249 | 175 | ||
250 | ca->epoch_start = 0; /* end of epoch */ | 176 | ca->epoch_start = 0; /* end of epoch */ |
251 | 177 | ||
252 | /* in case of wrong delay_max*/ | ||
253 | if (ca->delay_min > 0 && ca->delay_max > ca->delay_min) | ||
254 | ca->delay_max = ca->delay_min | ||
255 | + ((ca->delay_max - ca->delay_min)* 90) / 100; | ||
256 | |||
257 | /* Wmax and fast convergence */ | 178 | /* Wmax and fast convergence */ |
258 | if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) | 179 | if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) |
259 | ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) | 180 | ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) |
@@ -289,14 +210,14 @@ static void bictcp_state(struct sock *sk, u8 new_state) | |||
289 | bictcp_reset(inet_csk_ca(sk)); | 210 | bictcp_reset(inet_csk_ca(sk)); |
290 | } | 211 | } |
291 | 212 | ||
292 | /* Track delayed acknowledgement ratio using sliding window | 213 | /* Track delayed acknowledgment ratio using sliding window |
293 | * ratio = (15*ratio + sample) / 16 | 214 | * ratio = (15*ratio + sample) / 16 |
294 | */ | 215 | */ |
295 | static void bictcp_acked(struct sock *sk, u32 cnt) | 216 | static void bictcp_acked(struct sock *sk, u32 cnt) |
296 | { | 217 | { |
297 | const struct inet_connection_sock *icsk = inet_csk(sk); | 218 | const struct inet_connection_sock *icsk = inet_csk(sk); |
298 | 219 | ||
299 | if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { | 220 | if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { |
300 | struct bictcp *ca = inet_csk_ca(sk); | 221 | struct bictcp *ca = inet_csk_ca(sk); |
301 | cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; | 222 | cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; |
302 | ca->delayed_ack += cnt; | 223 | ca->delayed_ack += cnt; |
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index c7cc62c8dc12..e688c687d62d 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c | |||
@@ -174,6 +174,34 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) | |||
174 | return err; | 174 | return err; |
175 | } | 175 | } |
176 | 176 | ||
177 | |||
178 | /* | ||
179 | * Linear increase during slow start | ||
180 | */ | ||
181 | void tcp_slow_start(struct tcp_sock *tp) | ||
182 | { | ||
183 | if (sysctl_tcp_abc) { | ||
184 | /* RFC3465: Slow Start | ||
185 | * TCP sender SHOULD increase cwnd by the number of | ||
186 | * previously unacknowledged bytes ACKed by each incoming | ||
187 | * acknowledgment, provided the increase is not more than L | ||
188 | */ | ||
189 | if (tp->bytes_acked < tp->mss_cache) | ||
190 | return; | ||
191 | |||
192 | /* We MAY increase by 2 if discovered delayed ack */ | ||
193 | if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) { | ||
194 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
195 | tp->snd_cwnd++; | ||
196 | } | ||
197 | } | ||
198 | tp->bytes_acked = 0; | ||
199 | |||
200 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
201 | tp->snd_cwnd++; | ||
202 | } | ||
203 | EXPORT_SYMBOL_GPL(tcp_slow_start); | ||
204 | |||
177 | /* | 205 | /* |
178 | * TCP Reno congestion control | 206 | * TCP Reno congestion control |
179 | * This is special case used for fallback as well. | 207 | * This is special case used for fallback as well. |
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c new file mode 100644 index 000000000000..31a4986dfbf7 --- /dev/null +++ b/net/ipv4/tcp_cubic.c | |||
@@ -0,0 +1,411 @@ | |||
1 | /* | ||
2 | * TCP CUBIC: Binary Increase Congestion control for TCP v2.0 | ||
3 | * | ||
4 | * This is from the implementation of CUBIC TCP in | ||
5 | * Injong Rhee, Lisong Xu. | ||
6 | * "CUBIC: A New TCP-Friendly High-Speed TCP Variant" | |
7 | * in PFLDnet 2005 | ||
8 | * Available from: | ||
9 | * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf | ||
10 | * | ||
11 | * Unless CUBIC is enabled and congestion window is large | ||
12 | * this behaves the same as the original Reno. | ||
13 | */ | ||
14 | |||
15 | #include <linux/config.h> | ||
16 | #include <linux/mm.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <net/tcp.h> | ||
19 | #include <asm/div64.h> | ||
20 | |||
21 | #define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation | ||
22 | * max_cwnd = snd_cwnd * beta | ||
23 | */ | ||
24 | #define BICTCP_B 4 /* | ||
25 | * In binary search, | ||
26 | * go to point (max+min)/N | ||
27 | */ | ||
28 | #define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */ | ||
29 | |||
30 | static int fast_convergence = 1; | ||
31 | static int max_increment = 16; | ||
32 | static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ | ||
33 | static int initial_ssthresh = 100; | ||
34 | static int bic_scale = 41; | ||
35 | static int tcp_friendliness = 1; | ||
36 | |||
37 | static u32 cube_rtt_scale; | ||
38 | static u32 beta_scale; | ||
39 | static u64 cube_factor; | ||
40 | |||
41 | /* Note parameters that are used for precomputing scale factors are read-only */ | ||
42 | module_param(fast_convergence, int, 0644); | ||
43 | MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); | ||
44 | module_param(max_increment, int, 0644); | ||
45 | MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search"); | ||
46 | module_param(beta, int, 0444); | ||
47 | MODULE_PARM_DESC(beta, "beta for multiplicative increase"); | ||
48 | module_param(initial_ssthresh, int, 0644); | ||
49 | MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); | ||
50 | module_param(bic_scale, int, 0444); | ||
51 | MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)"); | ||
52 | module_param(tcp_friendliness, int, 0644); | ||
53 | MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness"); | ||
54 | |||
55 | #include <asm/div64.h> | ||
56 | |||
57 | /* BIC TCP Parameters */ | ||
58 | struct bictcp { | ||
59 | u32 cnt; /* increase cwnd by 1 after ACKs */ | ||
60 | u32 last_max_cwnd; /* last maximum snd_cwnd */ | ||
61 | u32 loss_cwnd; /* congestion window at last loss */ | ||
62 | u32 last_cwnd; /* the last snd_cwnd */ | ||
63 | u32 last_time; /* time when updated last_cwnd */ | ||
64 | u32 bic_origin_point;/* origin point of bic function */ | ||
65 | u32 bic_K; /* time to origin point from the beginning of the current epoch */ | ||
66 | u32 delay_min; /* min delay */ | ||
67 | u32 epoch_start; /* beginning of an epoch */ | ||
68 | u32 ack_cnt; /* number of acks */ | ||
69 | u32 tcp_cwnd; /* estimated tcp cwnd */ | ||
70 | #define ACK_RATIO_SHIFT 4 | ||
71 | u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ | ||
72 | }; | ||
73 | |||
74 | static inline void bictcp_reset(struct bictcp *ca) | ||
75 | { | ||
76 | ca->cnt = 0; | ||
77 | ca->last_max_cwnd = 0; | ||
78 | ca->loss_cwnd = 0; | ||
79 | ca->last_cwnd = 0; | ||
80 | ca->last_time = 0; | ||
81 | ca->bic_origin_point = 0; | ||
82 | ca->bic_K = 0; | ||
83 | ca->delay_min = 0; | ||
84 | ca->epoch_start = 0; | ||
85 | ca->delayed_ack = 2 << ACK_RATIO_SHIFT; | ||
86 | ca->ack_cnt = 0; | ||
87 | ca->tcp_cwnd = 0; | ||
88 | } | ||
89 | |||
90 | static void bictcp_init(struct sock *sk) | ||
91 | { | ||
92 | bictcp_reset(inet_csk_ca(sk)); | ||
93 | if (initial_ssthresh) | ||
94 | tcp_sk(sk)->snd_ssthresh = initial_ssthresh; | ||
95 | } | ||
96 | |||
97 | /* 64bit divisor, dividend and result. dynamic precision */ | ||
98 | static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor) | ||
99 | { | ||
100 | u_int32_t d = divisor; | ||
101 | |||
102 | if (divisor > 0xffffffffULL) { | ||
103 | unsigned int shift = fls(divisor >> 32); | ||
104 | |||
105 | d = divisor >> shift; | ||
106 | dividend >>= shift; | ||
107 | } | ||
108 | |||
109 | /* avoid 64 bit division if possible */ | ||
110 | if (dividend >> 32) | ||
111 | do_div(dividend, d); | ||
112 | else | ||
113 | dividend = (uint32_t) dividend / d; | ||
114 | |||
115 | return dividend; | ||
116 | } | ||
117 | |||
118 | /* | ||
119 | * calculate the cube root of a using Newton-Raphson | |
120 | */ | ||
121 | static u32 cubic_root(u64 a) | ||
122 | { | ||
123 | u32 x, x1; | ||
124 | |||
125 | /* Initial estimate is based on: | ||
126 | * cbrt(x) = exp(log(x) / 3) | ||
127 | */ | ||
128 | x = 1u << (fls64(a)/3); | ||
129 | |||
130 | /* | ||
131 | * Iteration based on: | ||
132 | * 2 | ||
133 | * x = ( 2 * x + a / x ) / 3 | ||
134 | * k+1 k k | ||
135 | */ | ||
136 | do { | ||
137 | x1 = x; | ||
138 | x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3; | ||
139 | } while (abs(x1 - x) > 1); | ||
140 | |||
141 | return x; | ||
142 | } | ||
143 | |||
144 | /* | ||
145 | * Compute congestion window to use. | ||
146 | */ | ||
147 | static inline void bictcp_update(struct bictcp *ca, u32 cwnd) | ||
148 | { | ||
149 | u64 offs; | ||
150 | u32 delta, t, bic_target, min_cnt, max_cnt; | ||
151 | |||
152 | ca->ack_cnt++; /* count the number of ACKs */ | ||
153 | |||
154 | if (ca->last_cwnd == cwnd && | ||
155 | (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) | ||
156 | return; | ||
157 | |||
158 | ca->last_cwnd = cwnd; | ||
159 | ca->last_time = tcp_time_stamp; | ||
160 | |||
161 | if (ca->epoch_start == 0) { | ||
162 | ca->epoch_start = tcp_time_stamp; /* record the beginning of an epoch */ | ||
163 | ca->ack_cnt = 1; /* start counting */ | ||
164 | ca->tcp_cwnd = cwnd; /* sync with cubic */ | |
165 | |||
166 | if (ca->last_max_cwnd <= cwnd) { | ||
167 | ca->bic_K = 0; | ||
168 | ca->bic_origin_point = cwnd; | ||
169 | } else { | ||
170 | /* Compute new K based on | ||
171 | * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ) | ||
172 | */ | ||
173 | ca->bic_K = cubic_root(cube_factor | ||
174 | * (ca->last_max_cwnd - cwnd)); | ||
175 | ca->bic_origin_point = ca->last_max_cwnd; | ||
176 | } | ||
177 | } | ||
178 | |||
179 | /* cubic function - calc*/ | ||
180 | /* calculate c * time^3 / rtt, | ||
181 | * while considering overflow in calculation of time^3 | ||
182 | * (so time^3 is done by using 64 bit) | ||
183 | * and without the support of division of 64bit numbers | ||
184 | * (so all divisions are done by using 32 bit) | ||
185 | * also NOTE the units of those variables | |
186 | * time = (t - K) / 2^bictcp_HZ | ||
187 | * c = bic_scale >> 10 | ||
188 | * rtt = (srtt >> 3) / HZ | ||
189 | * !!! The following code does not have overflow problems, | ||
190 | * if the cwnd < 1 million packets !!! | ||
191 | */ | ||
192 | |||
193 | /* change the unit from HZ to bictcp_HZ */ | ||
194 | t = ((tcp_time_stamp + ca->delay_min - ca->epoch_start) | ||
195 | << BICTCP_HZ) / HZ; | ||
196 | |||
197 | if (t < ca->bic_K) /* t - K */ | ||
198 | offs = ca->bic_K - t; | ||
199 | else | ||
200 | offs = t - ca->bic_K; | ||
201 | |||
202 | /* c/rtt * (t-K)^3 */ | ||
203 | delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ); | ||
204 | if (t < ca->bic_K) /* below origin*/ | ||
205 | bic_target = ca->bic_origin_point - delta; | ||
206 | else /* above origin*/ | ||
207 | bic_target = ca->bic_origin_point + delta; | ||
208 | |||
209 | /* cubic function - calc bictcp_cnt*/ | ||
210 | if (bic_target > cwnd) { | ||
211 | ca->cnt = cwnd / (bic_target - cwnd); | ||
212 | } else { | ||
213 | ca->cnt = 100 * cwnd; /* very small increment*/ | ||
214 | } | ||
215 | |||
216 | if (ca->delay_min > 0) { | ||
217 | /* max increment = Smax * rtt / 0.1 */ | ||
218 | min_cnt = (cwnd * HZ * 8)/(10 * max_increment * ca->delay_min); | ||
219 | if (ca->cnt < min_cnt) | ||
220 | ca->cnt = min_cnt; | ||
221 | } | ||
222 | |||
223 | /* slow start and low utilization */ | ||
224 | if (ca->loss_cwnd == 0) /* could be aggressive in slow start */ | ||
225 | ca->cnt = 50; | ||
226 | |||
227 | /* TCP Friendly */ | ||
228 | if (tcp_friendliness) { | ||
229 | u32 scale = beta_scale; | ||
230 | delta = (cwnd * scale) >> 3; | ||
231 | while (ca->ack_cnt > delta) { /* update tcp cwnd */ | ||
232 | ca->ack_cnt -= delta; | ||
233 | ca->tcp_cwnd++; | ||
234 | } | ||
235 | |||
236 | if (ca->tcp_cwnd > cwnd){ /* if bic is slower than tcp */ | ||
237 | delta = ca->tcp_cwnd - cwnd; | ||
238 | max_cnt = cwnd / delta; | ||
239 | if (ca->cnt > max_cnt) | ||
240 | ca->cnt = max_cnt; | ||
241 | } | ||
242 | } | ||
243 | |||
244 | ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; | ||
245 | if (ca->cnt == 0) /* cannot be zero */ | ||
246 | ca->cnt = 1; | ||
247 | } | ||
248 | |||
249 | |||
250 | /* Keep track of minimum rtt */ | ||
251 | static inline void measure_delay(struct sock *sk) | ||
252 | { | ||
253 | const struct tcp_sock *tp = tcp_sk(sk); | ||
254 | struct bictcp *ca = inet_csk_ca(sk); | ||
255 | u32 delay; | ||
256 | |||
257 | /* No time stamp */ | ||
258 | if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) || | ||
259 | /* Discard delay samples right after fast recovery */ | ||
260 | (s32)(tcp_time_stamp - ca->epoch_start) < HZ) | ||
261 | return; | ||
262 | |||
263 | delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr; | ||
264 | if (delay == 0) | ||
265 | delay = 1; | ||
266 | |||
267 | /* first time call or link delay decreases */ | ||
268 | if (ca->delay_min == 0 || ca->delay_min > delay) | ||
269 | ca->delay_min = delay; | ||
270 | } | ||
271 | |||
272 | static void bictcp_cong_avoid(struct sock *sk, u32 ack, | ||
273 | u32 seq_rtt, u32 in_flight, int data_acked) | ||
274 | { | ||
275 | struct tcp_sock *tp = tcp_sk(sk); | ||
276 | struct bictcp *ca = inet_csk_ca(sk); | ||
277 | |||
278 | if (data_acked) | ||
279 | measure_delay(sk); | ||
280 | |||
281 | if (!tcp_is_cwnd_limited(sk, in_flight)) | ||
282 | return; | ||
283 | |||
284 | if (tp->snd_cwnd <= tp->snd_ssthresh) | ||
285 | tcp_slow_start(tp); | ||
286 | else { | ||
287 | bictcp_update(ca, tp->snd_cwnd); | ||
288 | |||
289 | /* In dangerous area, increase slowly. | ||
290 | * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd | ||
291 | */ | ||
292 | if (tp->snd_cwnd_cnt >= ca->cnt) { | ||
293 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
294 | tp->snd_cwnd++; | ||
295 | tp->snd_cwnd_cnt = 0; | ||
296 | } else | ||
297 | tp->snd_cwnd_cnt++; | ||
298 | } | ||
299 | |||
300 | } | ||
301 | |||
302 | static u32 bictcp_recalc_ssthresh(struct sock *sk) | ||
303 | { | ||
304 | const struct tcp_sock *tp = tcp_sk(sk); | ||
305 | struct bictcp *ca = inet_csk_ca(sk); | ||
306 | |||
307 | ca->epoch_start = 0; /* end of epoch */ | ||
308 | |||
309 | /* Wmax and fast convergence */ | ||
310 | if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) | ||
311 | ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) | ||
312 | / (2 * BICTCP_BETA_SCALE); | ||
313 | else | ||
314 | ca->last_max_cwnd = tp->snd_cwnd; | ||
315 | |||
316 | ca->loss_cwnd = tp->snd_cwnd; | ||
317 | |||
318 | return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); | ||
319 | } | ||
320 | |||
321 | static u32 bictcp_undo_cwnd(struct sock *sk) | ||
322 | { | ||
323 | struct bictcp *ca = inet_csk_ca(sk); | ||
324 | |||
325 | return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); | ||
326 | } | ||
327 | |||
328 | static u32 bictcp_min_cwnd(struct sock *sk) | ||
329 | { | ||
330 | return tcp_sk(sk)->snd_ssthresh; | ||
331 | } | ||
332 | |||
333 | static void bictcp_state(struct sock *sk, u8 new_state) | ||
334 | { | ||
335 | if (new_state == TCP_CA_Loss) | ||
336 | bictcp_reset(inet_csk_ca(sk)); | ||
337 | } | ||
338 | |||
339 | /* Track delayed acknowledgment ratio using sliding window | ||
340 | * ratio = (15*ratio + sample) / 16 | ||
341 | */ | ||
342 | static void bictcp_acked(struct sock *sk, u32 cnt) | ||
343 | { | ||
344 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
345 | |||
346 | if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { | ||
347 | struct bictcp *ca = inet_csk_ca(sk); | ||
348 | cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; | ||
349 | ca->delayed_ack += cnt; | ||
350 | } | ||
351 | } | ||
352 | |||
353 | |||
354 | static struct tcp_congestion_ops cubictcp = { | ||
355 | .init = bictcp_init, | ||
356 | .ssthresh = bictcp_recalc_ssthresh, | ||
357 | .cong_avoid = bictcp_cong_avoid, | ||
358 | .set_state = bictcp_state, | ||
359 | .undo_cwnd = bictcp_undo_cwnd, | ||
360 | .min_cwnd = bictcp_min_cwnd, | ||
361 | .pkts_acked = bictcp_acked, | ||
362 | .owner = THIS_MODULE, | ||
363 | .name = "cubic", | ||
364 | }; | ||
365 | |||
366 | static int __init cubictcp_register(void) | ||
367 | { | ||
368 | BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); | ||
369 | |||
370 | /* Precompute a bunch of the scaling factors that are used per-packet | ||
371 | * based on SRTT of 100ms | ||
372 | */ | ||
373 | |||
374 | beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta); | ||
375 | |||
376 | cube_rtt_scale = (bic_scale << 3) / 10; /* 1024*c/rtt */ | ||
377 | |||
378 | /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3 | ||
379 | * so K = cubic_root( (wmax-cwnd)*rtt/c ) | ||
380 | * the unit of K is bictcp_HZ=2^10, not HZ | ||
381 | * | ||
382 | * c = bic_scale >> 10 | ||
383 | * rtt = 100ms | ||
384 | * | ||
385 | * the following code has been designed and tested for | ||
386 | * cwnd < 1 million packets | ||
387 | * RTT < 100 seconds | ||
388 | * HZ < 1,000,00 (corresponding to 10 nano-second) | ||
389 | */ | ||
390 | |||
391 | /* 1/c * 2^(3*bictcp_HZ) * srtt */ | |
392 | cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */ | ||
393 | |||
394 | /* divide by bic_scale and by constant Srtt (100ms) */ | ||
395 | do_div(cube_factor, bic_scale * 10); | ||
396 | |||
397 | return tcp_register_congestion_control(&cubictcp); | ||
398 | } | ||
399 | |||
400 | static void __exit cubictcp_unregister(void) | ||
401 | { | ||
402 | tcp_unregister_congestion_control(&cubictcp); | ||
403 | } | ||
404 | |||
405 | module_init(cubictcp_register); | ||
406 | module_exit(cubictcp_unregister); | ||
407 | |||
408 | MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger"); | ||
409 | MODULE_LICENSE("GPL"); | ||
410 | MODULE_DESCRIPTION("CUBIC TCP"); | ||
411 | MODULE_VERSION("2.0"); | ||
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bf2e23086bce..0a461232329f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -115,8 +115,8 @@ int sysctl_tcp_abc = 1; | |||
115 | /* Adapt the MSS value used to make delayed ack decision to the | 115 | /* Adapt the MSS value used to make delayed ack decision to the |
116 | * real world. | 116 | * real world. |
117 | */ | 117 | */ |
118 | static inline void tcp_measure_rcv_mss(struct sock *sk, | 118 | static void tcp_measure_rcv_mss(struct sock *sk, |
119 | const struct sk_buff *skb) | 119 | const struct sk_buff *skb) |
120 | { | 120 | { |
121 | struct inet_connection_sock *icsk = inet_csk(sk); | 121 | struct inet_connection_sock *icsk = inet_csk(sk); |
122 | const unsigned int lss = icsk->icsk_ack.last_seg_size; | 122 | const unsigned int lss = icsk->icsk_ack.last_seg_size; |
@@ -246,8 +246,8 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, | |||
246 | return 0; | 246 | return 0; |
247 | } | 247 | } |
248 | 248 | ||
249 | static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, | 249 | static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, |
250 | struct sk_buff *skb) | 250 | struct sk_buff *skb) |
251 | { | 251 | { |
252 | /* Check #1 */ | 252 | /* Check #1 */ |
253 | if (tp->rcv_ssthresh < tp->window_clamp && | 253 | if (tp->rcv_ssthresh < tp->window_clamp && |
@@ -341,6 +341,26 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) | |||
341 | tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); | 341 | tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); |
342 | } | 342 | } |
343 | 343 | ||
344 | |||
345 | /* Initialize RCV_MSS value. | ||
346 | * RCV_MSS is our guess about the MSS used by the peer. | |
347 | * We haven't any direct information about the MSS. | ||
348 | * It's better to underestimate the RCV_MSS rather than overestimate. | ||
349 | * Overestimations make us ACKing less frequently than needed. | ||
350 | * Underestimations are easier to detect and fix by tcp_measure_rcv_mss(). | |
351 | */ | ||
352 | void tcp_initialize_rcv_mss(struct sock *sk) | ||
353 | { | ||
354 | struct tcp_sock *tp = tcp_sk(sk); | ||
355 | unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); | ||
356 | |||
357 | hint = min(hint, tp->rcv_wnd/2); | ||
358 | hint = min(hint, TCP_MIN_RCVMSS); | ||
359 | hint = max(hint, TCP_MIN_MSS); | ||
360 | |||
361 | inet_csk(sk)->icsk_ack.rcv_mss = hint; | ||
362 | } | ||
363 | |||
344 | /* Receiver "autotuning" code. | 364 | /* Receiver "autotuning" code. |
345 | * | 365 | * |
346 | * The algorithm for RTT estimation w/o timestamps is based on | 366 | * The algorithm for RTT estimation w/o timestamps is based on |
@@ -735,6 +755,27 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) | |||
735 | return min_t(__u32, cwnd, tp->snd_cwnd_clamp); | 755 | return min_t(__u32, cwnd, tp->snd_cwnd_clamp); |
736 | } | 756 | } |
737 | 757 | ||
758 | /* Set slow start threshold and cwnd not falling to slow start */ | ||
759 | void tcp_enter_cwr(struct sock *sk) | ||
760 | { | ||
761 | struct tcp_sock *tp = tcp_sk(sk); | ||
762 | |||
763 | tp->prior_ssthresh = 0; | ||
764 | tp->bytes_acked = 0; | ||
765 | if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { | ||
766 | tp->undo_marker = 0; | ||
767 | tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); | ||
768 | tp->snd_cwnd = min(tp->snd_cwnd, | ||
769 | tcp_packets_in_flight(tp) + 1U); | ||
770 | tp->snd_cwnd_cnt = 0; | ||
771 | tp->high_seq = tp->snd_nxt; | ||
772 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
773 | TCP_ECN_queue_cwr(tp); | ||
774 | |||
775 | tcp_set_ca_state(sk, TCP_CA_CWR); | ||
776 | } | ||
777 | } | ||
778 | |||
738 | /* Initialize metrics on socket. */ | 779 | /* Initialize metrics on socket. */ |
739 | 780 | ||
740 | static void tcp_init_metrics(struct sock *sk) | 781 | static void tcp_init_metrics(struct sock *sk) |
@@ -2070,8 +2111,8 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, | |||
2070 | tcp_ack_no_tstamp(sk, seq_rtt, flag); | 2111 | tcp_ack_no_tstamp(sk, seq_rtt, flag); |
2071 | } | 2112 | } |
2072 | 2113 | ||
2073 | static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, | 2114 | static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, |
2074 | u32 in_flight, int good) | 2115 | u32 in_flight, int good) |
2075 | { | 2116 | { |
2076 | const struct inet_connection_sock *icsk = inet_csk(sk); | 2117 | const struct inet_connection_sock *icsk = inet_csk(sk); |
2077 | icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good); | 2118 | icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good); |
@@ -2082,7 +2123,7 @@ static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, | |||
2082 | * RFC2988 recommends to restart timer to now+rto. | 2123 | * RFC2988 recommends to restart timer to now+rto. |
2083 | */ | 2124 | */ |
2084 | 2125 | ||
2085 | static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) | 2126 | static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) |
2086 | { | 2127 | { |
2087 | if (!tp->packets_out) { | 2128 | if (!tp->packets_out) { |
2088 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); | 2129 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); |
@@ -2147,7 +2188,7 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, | |||
2147 | return acked; | 2188 | return acked; |
2148 | } | 2189 | } |
2149 | 2190 | ||
2150 | static inline u32 tcp_usrtt(const struct sk_buff *skb) | 2191 | static u32 tcp_usrtt(const struct sk_buff *skb) |
2151 | { | 2192 | { |
2152 | struct timeval tv, now; | 2193 | struct timeval tv, now; |
2153 | 2194 | ||
@@ -2342,7 +2383,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, | |||
2342 | 2383 | ||
2343 | if (nwin > tp->max_window) { | 2384 | if (nwin > tp->max_window) { |
2344 | tp->max_window = nwin; | 2385 | tp->max_window = nwin; |
2345 | tcp_sync_mss(sk, tp->pmtu_cookie); | 2386 | tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie); |
2346 | } | 2387 | } |
2347 | } | 2388 | } |
2348 | } | 2389 | } |
@@ -2583,8 +2624,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, | |||
2583 | /* Fast parse options. This hopes to only see timestamps. | 2624 | /* Fast parse options. This hopes to only see timestamps. |
2584 | * If it is wrong it falls back on tcp_parse_options(). | 2625 | * If it is wrong it falls back on tcp_parse_options(). |
2585 | */ | 2626 | */ |
2586 | static inline int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, | 2627 | static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, |
2587 | struct tcp_sock *tp) | 2628 | struct tcp_sock *tp) |
2588 | { | 2629 | { |
2589 | if (th->doff == sizeof(struct tcphdr)>>2) { | 2630 | if (th->doff == sizeof(struct tcphdr)>>2) { |
2590 | tp->rx_opt.saw_tstamp = 0; | 2631 | tp->rx_opt.saw_tstamp = 0; |
@@ -2804,8 +2845,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) | |||
2804 | } | 2845 | } |
2805 | } | 2846 | } |
2806 | 2847 | ||
2807 | static __inline__ int | 2848 | static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) |
2808 | tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) | ||
2809 | { | 2849 | { |
2810 | if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { | 2850 | if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { |
2811 | if (before(seq, sp->start_seq)) | 2851 | if (before(seq, sp->start_seq)) |
@@ -2817,7 +2857,7 @@ tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) | |||
2817 | return 0; | 2857 | return 0; |
2818 | } | 2858 | } |
2819 | 2859 | ||
2820 | static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) | 2860 | static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) |
2821 | { | 2861 | { |
2822 | if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { | 2862 | if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { |
2823 | if (before(seq, tp->rcv_nxt)) | 2863 | if (before(seq, tp->rcv_nxt)) |
@@ -2832,7 +2872,7 @@ static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) | |||
2832 | } | 2872 | } |
2833 | } | 2873 | } |
2834 | 2874 | ||
2835 | static inline void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq) | 2875 | static void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq) |
2836 | { | 2876 | { |
2837 | if (!tp->rx_opt.dsack) | 2877 | if (!tp->rx_opt.dsack) |
2838 | tcp_dsack_set(tp, seq, end_seq); | 2878 | tcp_dsack_set(tp, seq, end_seq); |
@@ -2890,7 +2930,7 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) | |||
2890 | } | 2930 | } |
2891 | } | 2931 | } |
2892 | 2932 | ||
2893 | static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) | 2933 | static inline void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) |
2894 | { | 2934 | { |
2895 | __u32 tmp; | 2935 | __u32 tmp; |
2896 | 2936 | ||
@@ -3455,7 +3495,7 @@ void tcp_cwnd_application_limited(struct sock *sk) | |||
3455 | tp->snd_cwnd_stamp = tcp_time_stamp; | 3495 | tp->snd_cwnd_stamp = tcp_time_stamp; |
3456 | } | 3496 | } |
3457 | 3497 | ||
3458 | static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) | 3498 | static int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) |
3459 | { | 3499 | { |
3460 | /* If the user specified a specific send buffer setting, do | 3500 | /* If the user specified a specific send buffer setting, do |
3461 | * not modify it. | 3501 | * not modify it. |
@@ -3502,7 +3542,7 @@ static void tcp_new_space(struct sock *sk) | |||
3502 | sk->sk_write_space(sk); | 3542 | sk->sk_write_space(sk); |
3503 | } | 3543 | } |
3504 | 3544 | ||
3505 | static inline void tcp_check_space(struct sock *sk) | 3545 | static void tcp_check_space(struct sock *sk) |
3506 | { | 3546 | { |
3507 | if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { | 3547 | if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { |
3508 | sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); | 3548 | sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); |
@@ -3512,7 +3552,7 @@ static inline void tcp_check_space(struct sock *sk) | |||
3512 | } | 3552 | } |
3513 | } | 3553 | } |
3514 | 3554 | ||
3515 | static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) | 3555 | static inline void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) |
3516 | { | 3556 | { |
3517 | tcp_push_pending_frames(sk, tp); | 3557 | tcp_push_pending_frames(sk, tp); |
3518 | tcp_check_space(sk); | 3558 | tcp_check_space(sk); |
@@ -3544,7 +3584,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) | |||
3544 | } | 3584 | } |
3545 | } | 3585 | } |
3546 | 3586 | ||
3547 | static __inline__ void tcp_ack_snd_check(struct sock *sk) | 3587 | static inline void tcp_ack_snd_check(struct sock *sk) |
3548 | { | 3588 | { |
3549 | if (!inet_csk_ack_scheduled(sk)) { | 3589 | if (!inet_csk_ack_scheduled(sk)) { |
3550 | /* We sent a data segment already. */ | 3590 | /* We sent a data segment already. */ |
@@ -3692,8 +3732,7 @@ static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) | |||
3692 | return result; | 3732 | return result; |
3693 | } | 3733 | } |
3694 | 3734 | ||
3695 | static __inline__ int | 3735 | static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) |
3696 | tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) | ||
3697 | { | 3736 | { |
3698 | return skb->ip_summed != CHECKSUM_UNNECESSARY && | 3737 | return skb->ip_summed != CHECKSUM_UNNECESSARY && |
3699 | __tcp_checksum_complete_user(sk, skb); | 3738 | __tcp_checksum_complete_user(sk, skb); |
@@ -3967,12 +4006,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
3967 | struct tcphdr *th, unsigned len) | 4006 | struct tcphdr *th, unsigned len) |
3968 | { | 4007 | { |
3969 | struct tcp_sock *tp = tcp_sk(sk); | 4008 | struct tcp_sock *tp = tcp_sk(sk); |
4009 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
3970 | int saved_clamp = tp->rx_opt.mss_clamp; | 4010 | int saved_clamp = tp->rx_opt.mss_clamp; |
3971 | 4011 | ||
3972 | tcp_parse_options(skb, &tp->rx_opt, 0); | 4012 | tcp_parse_options(skb, &tp->rx_opt, 0); |
3973 | 4013 | ||
3974 | if (th->ack) { | 4014 | if (th->ack) { |
3975 | struct inet_connection_sock *icsk; | ||
3976 | /* rfc793: | 4015 | /* rfc793: |
3977 | * "If the state is SYN-SENT then | 4016 | * "If the state is SYN-SENT then |
3978 | * first check the ACK bit | 4017 | * first check the ACK bit |
@@ -4061,7 +4100,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
4061 | if (tp->rx_opt.sack_ok && sysctl_tcp_fack) | 4100 | if (tp->rx_opt.sack_ok && sysctl_tcp_fack) |
4062 | tp->rx_opt.sack_ok |= 2; | 4101 | tp->rx_opt.sack_ok |= 2; |
4063 | 4102 | ||
4064 | tcp_sync_mss(sk, tp->pmtu_cookie); | 4103 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); |
4065 | tcp_initialize_rcv_mss(sk); | 4104 | tcp_initialize_rcv_mss(sk); |
4066 | 4105 | ||
4067 | /* Remember, tcp_poll() does not lock socket! | 4106 | /* Remember, tcp_poll() does not lock socket! |
@@ -4072,7 +4111,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
4072 | tcp_set_state(sk, TCP_ESTABLISHED); | 4111 | tcp_set_state(sk, TCP_ESTABLISHED); |
4073 | 4112 | ||
4074 | /* Make sure socket is routed, for correct metrics. */ | 4113 | /* Make sure socket is routed, for correct metrics. */ |
4075 | tp->af_specific->rebuild_header(sk); | 4114 | icsk->icsk_af_ops->rebuild_header(sk); |
4076 | 4115 | ||
4077 | tcp_init_metrics(sk); | 4116 | tcp_init_metrics(sk); |
4078 | 4117 | ||
@@ -4098,8 +4137,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
4098 | sk_wake_async(sk, 0, POLL_OUT); | 4137 | sk_wake_async(sk, 0, POLL_OUT); |
4099 | } | 4138 | } |
4100 | 4139 | ||
4101 | icsk = inet_csk(sk); | ||
4102 | |||
4103 | if (sk->sk_write_pending || | 4140 | if (sk->sk_write_pending || |
4104 | icsk->icsk_accept_queue.rskq_defer_accept || | 4141 | icsk->icsk_accept_queue.rskq_defer_accept || |
4105 | icsk->icsk_ack.pingpong) { | 4142 | icsk->icsk_ack.pingpong) { |
@@ -4173,7 +4210,7 @@ discard: | |||
4173 | if (tp->ecn_flags&TCP_ECN_OK) | 4210 | if (tp->ecn_flags&TCP_ECN_OK) |
4174 | sock_set_flag(sk, SOCK_NO_LARGESEND); | 4211 | sock_set_flag(sk, SOCK_NO_LARGESEND); |
4175 | 4212 | ||
4176 | tcp_sync_mss(sk, tp->pmtu_cookie); | 4213 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); |
4177 | tcp_initialize_rcv_mss(sk); | 4214 | tcp_initialize_rcv_mss(sk); |
4178 | 4215 | ||
4179 | 4216 | ||
@@ -4220,6 +4257,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
4220 | struct tcphdr *th, unsigned len) | 4257 | struct tcphdr *th, unsigned len) |
4221 | { | 4258 | { |
4222 | struct tcp_sock *tp = tcp_sk(sk); | 4259 | struct tcp_sock *tp = tcp_sk(sk); |
4260 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
4223 | int queued = 0; | 4261 | int queued = 0; |
4224 | 4262 | ||
4225 | tp->rx_opt.saw_tstamp = 0; | 4263 | tp->rx_opt.saw_tstamp = 0; |
@@ -4236,7 +4274,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
4236 | goto discard; | 4274 | goto discard; |
4237 | 4275 | ||
4238 | if(th->syn) { | 4276 | if(th->syn) { |
4239 | if(tp->af_specific->conn_request(sk, skb) < 0) | 4277 | if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) |
4240 | return 1; | 4278 | return 1; |
4241 | 4279 | ||
4242 | /* Now we have several options: In theory there is | 4280 | /* Now we have several options: In theory there is |
@@ -4349,7 +4387,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
4349 | /* Make sure socket is routed, for | 4387 | /* Make sure socket is routed, for |
4350 | * correct metrics. | 4388 | * correct metrics. |
4351 | */ | 4389 | */ |
4352 | tp->af_specific->rebuild_header(sk); | 4390 | icsk->icsk_af_ops->rebuild_header(sk); |
4353 | 4391 | ||
4354 | tcp_init_metrics(sk); | 4392 | tcp_init_metrics(sk); |
4355 | 4393 | ||
@@ -4475,3 +4513,4 @@ EXPORT_SYMBOL(sysctl_tcp_abc); | |||
4475 | EXPORT_SYMBOL(tcp_parse_options); | 4513 | EXPORT_SYMBOL(tcp_parse_options); |
4476 | EXPORT_SYMBOL(tcp_rcv_established); | 4514 | EXPORT_SYMBOL(tcp_rcv_established); |
4477 | EXPORT_SYMBOL(tcp_rcv_state_process); | 4515 | EXPORT_SYMBOL(tcp_rcv_state_process); |
4516 | EXPORT_SYMBOL(tcp_initialize_rcv_mss); | ||
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4d5021e1929b..e9f83e5b28ce 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -69,6 +69,7 @@ | |||
69 | #include <net/transp_v6.h> | 69 | #include <net/transp_v6.h> |
70 | #include <net/ipv6.h> | 70 | #include <net/ipv6.h> |
71 | #include <net/inet_common.h> | 71 | #include <net/inet_common.h> |
72 | #include <net/timewait_sock.h> | ||
72 | #include <net/xfrm.h> | 73 | #include <net/xfrm.h> |
73 | 74 | ||
74 | #include <linux/inet.h> | 75 | #include <linux/inet.h> |
@@ -86,8 +87,7 @@ int sysctl_tcp_low_latency; | |||
86 | /* Socket used for sending RSTs */ | 87 | /* Socket used for sending RSTs */ |
87 | static struct socket *tcp_socket; | 88 | static struct socket *tcp_socket; |
88 | 89 | ||
89 | void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, | 90 | void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb); |
90 | struct sk_buff *skb); | ||
91 | 91 | ||
92 | struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { | 92 | struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { |
93 | .lhash_lock = RW_LOCK_UNLOCKED, | 93 | .lhash_lock = RW_LOCK_UNLOCKED, |
@@ -97,7 +97,8 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { | |||
97 | 97 | ||
98 | static int tcp_v4_get_port(struct sock *sk, unsigned short snum) | 98 | static int tcp_v4_get_port(struct sock *sk, unsigned short snum) |
99 | { | 99 | { |
100 | return inet_csk_get_port(&tcp_hashinfo, sk, snum); | 100 | return inet_csk_get_port(&tcp_hashinfo, sk, snum, |
101 | inet_csk_bind_conflict); | ||
101 | } | 102 | } |
102 | 103 | ||
103 | static void tcp_v4_hash(struct sock *sk) | 104 | static void tcp_v4_hash(struct sock *sk) |
@@ -118,202 +119,38 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) | |||
118 | skb->h.th->source); | 119 | skb->h.th->source); |
119 | } | 120 | } |
120 | 121 | ||
121 | /* called with local bh disabled */ | 122 | int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) |
122 | static int __tcp_v4_check_established(struct sock *sk, __u16 lport, | ||
123 | struct inet_timewait_sock **twp) | ||
124 | { | 123 | { |
125 | struct inet_sock *inet = inet_sk(sk); | 124 | const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); |
126 | u32 daddr = inet->rcv_saddr; | 125 | struct tcp_sock *tp = tcp_sk(sk); |
127 | u32 saddr = inet->daddr; | ||
128 | int dif = sk->sk_bound_dev_if; | ||
129 | INET_ADDR_COOKIE(acookie, saddr, daddr) | ||
130 | const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); | ||
131 | unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); | ||
132 | struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash); | ||
133 | struct sock *sk2; | ||
134 | const struct hlist_node *node; | ||
135 | struct inet_timewait_sock *tw; | ||
136 | |||
137 | prefetch(head->chain.first); | ||
138 | write_lock(&head->lock); | ||
139 | |||
140 | /* Check TIME-WAIT sockets first. */ | ||
141 | sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { | ||
142 | tw = inet_twsk(sk2); | ||
143 | |||
144 | if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) { | ||
145 | const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); | ||
146 | struct tcp_sock *tp = tcp_sk(sk); | ||
147 | |||
148 | /* With PAWS, it is safe from the viewpoint | ||
149 | of data integrity. Even without PAWS it | ||
150 | is safe provided sequence spaces do not | ||
151 | overlap i.e. at data rates <= 80Mbit/sec. | ||
152 | |||
153 | Actually, the idea is close to VJ's one, | ||
154 | only timestamp cache is held not per host, | ||
155 | but per port pair and TW bucket is used | ||
156 | as state holder. | ||
157 | 126 | ||
158 | If TW bucket has been already destroyed we | 127 | /* With PAWS, it is safe from the viewpoint |
159 | fall back to VJ's scheme and use initial | 128 | of data integrity. Even without PAWS it is safe provided sequence |
160 | timestamp retrieved from peer table. | 129 | spaces do not overlap i.e. at data rates <= 80Mbit/sec. |
161 | */ | ||
162 | if (tcptw->tw_ts_recent_stamp && | ||
163 | (!twp || (sysctl_tcp_tw_reuse && | ||
164 | xtime.tv_sec - | ||
165 | tcptw->tw_ts_recent_stamp > 1))) { | ||
166 | tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; | ||
167 | if (tp->write_seq == 0) | ||
168 | tp->write_seq = 1; | ||
169 | tp->rx_opt.ts_recent = tcptw->tw_ts_recent; | ||
170 | tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; | ||
171 | sock_hold(sk2); | ||
172 | goto unique; | ||
173 | } else | ||
174 | goto not_unique; | ||
175 | } | ||
176 | } | ||
177 | tw = NULL; | ||
178 | 130 | ||
179 | /* And established part... */ | 131 | Actually, the idea is close to VJ's one, only timestamp cache is |
180 | sk_for_each(sk2, node, &head->chain) { | 132 | held not per host, but per port pair and TW bucket is used as state |
181 | if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) | 133 | holder. |
182 | goto not_unique; | ||
183 | } | ||
184 | 134 | ||
185 | unique: | 135 | If TW bucket has been already destroyed we fall back to VJ's scheme |
186 | /* Must record num and sport now. Otherwise we will see | 136 | and use initial timestamp retrieved from peer table. |
187 | * in hash table socket with a funny identity. */ | 137 | */ |
188 | inet->num = lport; | 138 | if (tcptw->tw_ts_recent_stamp && |
189 | inet->sport = htons(lport); | 139 | (twp == NULL || (sysctl_tcp_tw_reuse && |
190 | sk->sk_hash = hash; | 140 | xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) { |
191 | BUG_TRAP(sk_unhashed(sk)); | 141 | tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; |
192 | __sk_add_node(sk, &head->chain); | 142 | if (tp->write_seq == 0) |
193 | sock_prot_inc_use(sk->sk_prot); | 143 | tp->write_seq = 1; |
194 | write_unlock(&head->lock); | 144 | tp->rx_opt.ts_recent = tcptw->tw_ts_recent; |
195 | 145 | tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; | |
196 | if (twp) { | 146 | sock_hold(sktw); |
197 | *twp = tw; | 147 | return 1; |
198 | NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); | ||
199 | } else if (tw) { | ||
200 | /* Silly. Should hash-dance instead... */ | ||
201 | inet_twsk_deschedule(tw, &tcp_death_row); | ||
202 | NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); | ||
203 | |||
204 | inet_twsk_put(tw); | ||
205 | } | 148 | } |
206 | 149 | ||
207 | return 0; | 150 | return 0; |
208 | |||
209 | not_unique: | ||
210 | write_unlock(&head->lock); | ||
211 | return -EADDRNOTAVAIL; | ||
212 | } | 151 | } |
213 | 152 | ||
214 | static inline u32 connect_port_offset(const struct sock *sk) | 153 | EXPORT_SYMBOL_GPL(tcp_twsk_unique); |
215 | { | ||
216 | const struct inet_sock *inet = inet_sk(sk); | ||
217 | |||
218 | return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, | ||
219 | inet->dport); | ||
220 | } | ||
221 | |||
222 | /* | ||
223 | * Bind a port for a connect operation and hash it. | ||
224 | */ | ||
225 | static inline int tcp_v4_hash_connect(struct sock *sk) | ||
226 | { | ||
227 | const unsigned short snum = inet_sk(sk)->num; | ||
228 | struct inet_bind_hashbucket *head; | ||
229 | struct inet_bind_bucket *tb; | ||
230 | int ret; | ||
231 | |||
232 | if (!snum) { | ||
233 | int low = sysctl_local_port_range[0]; | ||
234 | int high = sysctl_local_port_range[1]; | ||
235 | int range = high - low; | ||
236 | int i; | ||
237 | int port; | ||
238 | static u32 hint; | ||
239 | u32 offset = hint + connect_port_offset(sk); | ||
240 | struct hlist_node *node; | ||
241 | struct inet_timewait_sock *tw = NULL; | ||
242 | |||
243 | local_bh_disable(); | ||
244 | for (i = 1; i <= range; i++) { | ||
245 | port = low + (i + offset) % range; | ||
246 | head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; | ||
247 | spin_lock(&head->lock); | ||
248 | |||
249 | /* Does not bother with rcv_saddr checks, | ||
250 | * because the established check is already | ||
251 | * unique enough. | ||
252 | */ | ||
253 | inet_bind_bucket_for_each(tb, node, &head->chain) { | ||
254 | if (tb->port == port) { | ||
255 | BUG_TRAP(!hlist_empty(&tb->owners)); | ||
256 | if (tb->fastreuse >= 0) | ||
257 | goto next_port; | ||
258 | if (!__tcp_v4_check_established(sk, | ||
259 | port, | ||
260 | &tw)) | ||
261 | goto ok; | ||
262 | goto next_port; | ||
263 | } | ||
264 | } | ||
265 | |||
266 | tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); | ||
267 | if (!tb) { | ||
268 | spin_unlock(&head->lock); | ||
269 | break; | ||
270 | } | ||
271 | tb->fastreuse = -1; | ||
272 | goto ok; | ||
273 | |||
274 | next_port: | ||
275 | spin_unlock(&head->lock); | ||
276 | } | ||
277 | local_bh_enable(); | ||
278 | |||
279 | return -EADDRNOTAVAIL; | ||
280 | |||
281 | ok: | ||
282 | hint += i; | ||
283 | |||
284 | /* Head lock still held and bh's disabled */ | ||
285 | inet_bind_hash(sk, tb, port); | ||
286 | if (sk_unhashed(sk)) { | ||
287 | inet_sk(sk)->sport = htons(port); | ||
288 | __inet_hash(&tcp_hashinfo, sk, 0); | ||
289 | } | ||
290 | spin_unlock(&head->lock); | ||
291 | |||
292 | if (tw) { | ||
293 | inet_twsk_deschedule(tw, &tcp_death_row);; | ||
294 | inet_twsk_put(tw); | ||
295 | } | ||
296 | |||
297 | ret = 0; | ||
298 | goto out; | ||
299 | } | ||
300 | |||
301 | head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; | ||
302 | tb = inet_csk(sk)->icsk_bind_hash; | ||
303 | spin_lock_bh(&head->lock); | ||
304 | if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { | ||
305 | __inet_hash(&tcp_hashinfo, sk, 0); | ||
306 | spin_unlock_bh(&head->lock); | ||
307 | return 0; | ||
308 | } else { | ||
309 | spin_unlock(&head->lock); | ||
310 | /* No definite answer... Walk to established hash table */ | ||
311 | ret = __tcp_v4_check_established(sk, snum, NULL); | ||
312 | out: | ||
313 | local_bh_enable(); | ||
314 | return ret; | ||
315 | } | ||
316 | } | ||
317 | 154 | ||
318 | /* This will initiate an outgoing connection. */ | 155 | /* This will initiate an outgoing connection. */ |
319 | int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | 156 | int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) |
@@ -383,9 +220,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |||
383 | inet->dport = usin->sin_port; | 220 | inet->dport = usin->sin_port; |
384 | inet->daddr = daddr; | 221 | inet->daddr = daddr; |
385 | 222 | ||
386 | tp->ext_header_len = 0; | 223 | inet_csk(sk)->icsk_ext_hdr_len = 0; |
387 | if (inet->opt) | 224 | if (inet->opt) |
388 | tp->ext_header_len = inet->opt->optlen; | 225 | inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; |
389 | 226 | ||
390 | tp->rx_opt.mss_clamp = 536; | 227 | tp->rx_opt.mss_clamp = 536; |
391 | 228 | ||
@@ -395,7 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |||
395 | * complete initialization after this. | 232 | * complete initialization after this. |
396 | */ | 233 | */ |
397 | tcp_set_state(sk, TCP_SYN_SENT); | 234 | tcp_set_state(sk, TCP_SYN_SENT); |
398 | err = tcp_v4_hash_connect(sk); | 235 | err = inet_hash_connect(&tcp_death_row, sk); |
399 | if (err) | 236 | if (err) |
400 | goto failure; | 237 | goto failure; |
401 | 238 | ||
@@ -433,12 +270,10 @@ failure: | |||
433 | /* | 270 | /* |
434 | * This routine does path mtu discovery as defined in RFC1191. | 271 | * This routine does path mtu discovery as defined in RFC1191. |
435 | */ | 272 | */ |
436 | static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, | 273 | static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu) |
437 | u32 mtu) | ||
438 | { | 274 | { |
439 | struct dst_entry *dst; | 275 | struct dst_entry *dst; |
440 | struct inet_sock *inet = inet_sk(sk); | 276 | struct inet_sock *inet = inet_sk(sk); |
441 | struct tcp_sock *tp = tcp_sk(sk); | ||
442 | 277 | ||
443 | /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs | 278 | /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs |
444 | * send out by Linux are always <576bytes so they should go through | 279 | * send out by Linux are always <576bytes so they should go through |
@@ -467,7 +302,7 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, | |||
467 | mtu = dst_mtu(dst); | 302 | mtu = dst_mtu(dst); |
468 | 303 | ||
469 | if (inet->pmtudisc != IP_PMTUDISC_DONT && | 304 | if (inet->pmtudisc != IP_PMTUDISC_DONT && |
470 | tp->pmtu_cookie > mtu) { | 305 | inet_csk(sk)->icsk_pmtu_cookie > mtu) { |
471 | tcp_sync_mss(sk, mtu); | 306 | tcp_sync_mss(sk, mtu); |
472 | 307 | ||
473 | /* Resend the TCP packet because it's | 308 | /* Resend the TCP packet because it's |
@@ -644,10 +479,10 @@ out: | |||
644 | } | 479 | } |
645 | 480 | ||
646 | /* This routine computes an IPv4 TCP checksum. */ | 481 | /* This routine computes an IPv4 TCP checksum. */ |
647 | void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, | 482 | void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) |
648 | struct sk_buff *skb) | ||
649 | { | 483 | { |
650 | struct inet_sock *inet = inet_sk(sk); | 484 | struct inet_sock *inet = inet_sk(sk); |
485 | struct tcphdr *th = skb->h.th; | ||
651 | 486 | ||
652 | if (skb->ip_summed == CHECKSUM_HW) { | 487 | if (skb->ip_summed == CHECKSUM_HW) { |
653 | th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0); | 488 | th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0); |
@@ -826,7 +661,8 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) | |||
826 | kfree(inet_rsk(req)->opt); | 661 | kfree(inet_rsk(req)->opt); |
827 | } | 662 | } |
828 | 663 | ||
829 | static inline void syn_flood_warning(struct sk_buff *skb) | 664 | #ifdef CONFIG_SYN_COOKIES |
665 | static void syn_flood_warning(struct sk_buff *skb) | ||
830 | { | 666 | { |
831 | static unsigned long warntime; | 667 | static unsigned long warntime; |
832 | 668 | ||
@@ -837,12 +673,13 @@ static inline void syn_flood_warning(struct sk_buff *skb) | |||
837 | ntohs(skb->h.th->dest)); | 673 | ntohs(skb->h.th->dest)); |
838 | } | 674 | } |
839 | } | 675 | } |
676 | #endif | ||
840 | 677 | ||
841 | /* | 678 | /* |
842 | * Save and compile IPv4 options into the request_sock if needed. | 679 | * Save and compile IPv4 options into the request_sock if needed. |
843 | */ | 680 | */ |
844 | static inline struct ip_options *tcp_v4_save_options(struct sock *sk, | 681 | static struct ip_options *tcp_v4_save_options(struct sock *sk, |
845 | struct sk_buff *skb) | 682 | struct sk_buff *skb) |
846 | { | 683 | { |
847 | struct ip_options *opt = &(IPCB(skb)->opt); | 684 | struct ip_options *opt = &(IPCB(skb)->opt); |
848 | struct ip_options *dopt = NULL; | 685 | struct ip_options *dopt = NULL; |
@@ -869,6 +706,11 @@ struct request_sock_ops tcp_request_sock_ops = { | |||
869 | .send_reset = tcp_v4_send_reset, | 706 | .send_reset = tcp_v4_send_reset, |
870 | }; | 707 | }; |
871 | 708 | ||
709 | static struct timewait_sock_ops tcp_timewait_sock_ops = { | ||
710 | .twsk_obj_size = sizeof(struct tcp_timewait_sock), | ||
711 | .twsk_unique = tcp_twsk_unique, | ||
712 | }; | ||
713 | |||
872 | int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | 714 | int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) |
873 | { | 715 | { |
874 | struct inet_request_sock *ireq; | 716 | struct inet_request_sock *ireq; |
@@ -1053,9 +895,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
1053 | ireq->opt = NULL; | 895 | ireq->opt = NULL; |
1054 | newinet->mc_index = inet_iif(skb); | 896 | newinet->mc_index = inet_iif(skb); |
1055 | newinet->mc_ttl = skb->nh.iph->ttl; | 897 | newinet->mc_ttl = skb->nh.iph->ttl; |
1056 | newtp->ext_header_len = 0; | 898 | inet_csk(newsk)->icsk_ext_hdr_len = 0; |
1057 | if (newinet->opt) | 899 | if (newinet->opt) |
1058 | newtp->ext_header_len = newinet->opt->optlen; | 900 | inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; |
1059 | newinet->id = newtp->write_seq ^ jiffies; | 901 | newinet->id = newtp->write_seq ^ jiffies; |
1060 | 902 | ||
1061 | tcp_sync_mss(newsk, dst_mtu(dst)); | 903 | tcp_sync_mss(newsk, dst_mtu(dst)); |
@@ -1314,16 +1156,6 @@ do_time_wait: | |||
1314 | goto discard_it; | 1156 | goto discard_it; |
1315 | } | 1157 | } |
1316 | 1158 | ||
1317 | static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) | ||
1318 | { | ||
1319 | struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; | ||
1320 | struct inet_sock *inet = inet_sk(sk); | ||
1321 | |||
1322 | sin->sin_family = AF_INET; | ||
1323 | sin->sin_addr.s_addr = inet->daddr; | ||
1324 | sin->sin_port = inet->dport; | ||
1325 | } | ||
1326 | |||
1327 | /* VJ's idea. Save last timestamp seen from this destination | 1159 | /* VJ's idea. Save last timestamp seen from this destination |
1328 | * and hold it at least for normal timewait interval to use for duplicate | 1160 | * and hold it at least for normal timewait interval to use for duplicate |
1329 | * segment detection in subsequent connections, before they enter synchronized | 1161 | * segment detection in subsequent connections, before they enter synchronized |
@@ -1382,7 +1214,7 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) | |||
1382 | return 0; | 1214 | return 0; |
1383 | } | 1215 | } |
1384 | 1216 | ||
1385 | struct tcp_func ipv4_specific = { | 1217 | struct inet_connection_sock_af_ops ipv4_specific = { |
1386 | .queue_xmit = ip_queue_xmit, | 1218 | .queue_xmit = ip_queue_xmit, |
1387 | .send_check = tcp_v4_send_check, | 1219 | .send_check = tcp_v4_send_check, |
1388 | .rebuild_header = inet_sk_rebuild_header, | 1220 | .rebuild_header = inet_sk_rebuild_header, |
@@ -1392,7 +1224,7 @@ struct tcp_func ipv4_specific = { | |||
1392 | .net_header_len = sizeof(struct iphdr), | 1224 | .net_header_len = sizeof(struct iphdr), |
1393 | .setsockopt = ip_setsockopt, | 1225 | .setsockopt = ip_setsockopt, |
1394 | .getsockopt = ip_getsockopt, | 1226 | .getsockopt = ip_getsockopt, |
1395 | .addr2sockaddr = v4_addr2sockaddr, | 1227 | .addr2sockaddr = inet_csk_addr2sockaddr, |
1396 | .sockaddr_len = sizeof(struct sockaddr_in), | 1228 | .sockaddr_len = sizeof(struct sockaddr_in), |
1397 | }; | 1229 | }; |
1398 | 1230 | ||
@@ -1433,7 +1265,8 @@ static int tcp_v4_init_sock(struct sock *sk) | |||
1433 | sk->sk_write_space = sk_stream_write_space; | 1265 | sk->sk_write_space = sk_stream_write_space; |
1434 | sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); | 1266 | sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); |
1435 | 1267 | ||
1436 | tp->af_specific = &ipv4_specific; | 1268 | icsk->icsk_af_ops = &ipv4_specific; |
1269 | icsk->icsk_sync_mss = tcp_sync_mss; | ||
1437 | 1270 | ||
1438 | sk->sk_sndbuf = sysctl_tcp_wmem[1]; | 1271 | sk->sk_sndbuf = sysctl_tcp_wmem[1]; |
1439 | sk->sk_rcvbuf = sysctl_tcp_rmem[1]; | 1272 | sk->sk_rcvbuf = sysctl_tcp_rmem[1]; |
@@ -1989,7 +1822,7 @@ struct proto tcp_prot = { | |||
1989 | .sysctl_rmem = sysctl_tcp_rmem, | 1822 | .sysctl_rmem = sysctl_tcp_rmem, |
1990 | .max_header = MAX_TCP_HEADER, | 1823 | .max_header = MAX_TCP_HEADER, |
1991 | .obj_size = sizeof(struct tcp_sock), | 1824 | .obj_size = sizeof(struct tcp_sock), |
1992 | .twsk_obj_size = sizeof(struct tcp_timewait_sock), | 1825 | .twsk_prot = &tcp_timewait_sock_ops, |
1993 | .rsk_prot = &tcp_request_sock_ops, | 1826 | .rsk_prot = &tcp_request_sock_ops, |
1994 | }; | 1827 | }; |
1995 | 1828 | ||
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1b66a2ac4321..2b9b7f6c7f7c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -274,18 +274,18 @@ kill: | |||
274 | void tcp_time_wait(struct sock *sk, int state, int timeo) | 274 | void tcp_time_wait(struct sock *sk, int state, int timeo) |
275 | { | 275 | { |
276 | struct inet_timewait_sock *tw = NULL; | 276 | struct inet_timewait_sock *tw = NULL; |
277 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
277 | const struct tcp_sock *tp = tcp_sk(sk); | 278 | const struct tcp_sock *tp = tcp_sk(sk); |
278 | int recycle_ok = 0; | 279 | int recycle_ok = 0; |
279 | 280 | ||
280 | if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) | 281 | if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) |
281 | recycle_ok = tp->af_specific->remember_stamp(sk); | 282 | recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); |
282 | 283 | ||
283 | if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) | 284 | if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) |
284 | tw = inet_twsk_alloc(sk, state); | 285 | tw = inet_twsk_alloc(sk, state); |
285 | 286 | ||
286 | if (tw != NULL) { | 287 | if (tw != NULL) { |
287 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); | 288 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); |
288 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
289 | const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); | 289 | const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); |
290 | 290 | ||
291 | tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; | 291 | tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; |
@@ -298,10 +298,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) | |||
298 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 298 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
299 | if (tw->tw_family == PF_INET6) { | 299 | if (tw->tw_family == PF_INET6) { |
300 | struct ipv6_pinfo *np = inet6_sk(sk); | 300 | struct ipv6_pinfo *np = inet6_sk(sk); |
301 | struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); | 301 | struct inet6_timewait_sock *tw6; |
302 | 302 | ||
303 | ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr); | 303 | tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); |
304 | ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr); | 304 | tw6 = inet6_twsk((struct sock *)tw); |
305 | ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); | ||
306 | ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); | ||
305 | tw->tw_ipv6only = np->ipv6only; | 307 | tw->tw_ipv6only = np->ipv6only; |
306 | } | 308 | } |
307 | #endif | 309 | #endif |
@@ -456,7 +458,6 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, | |||
456 | struct request_sock **prev) | 458 | struct request_sock **prev) |
457 | { | 459 | { |
458 | struct tcphdr *th = skb->h.th; | 460 | struct tcphdr *th = skb->h.th; |
459 | struct tcp_sock *tp = tcp_sk(sk); | ||
460 | u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); | 461 | u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); |
461 | int paws_reject = 0; | 462 | int paws_reject = 0; |
462 | struct tcp_options_received tmp_opt; | 463 | struct tcp_options_received tmp_opt; |
@@ -613,7 +614,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, | |||
613 | * ESTABLISHED STATE. If it will be dropped after | 614 | * ESTABLISHED STATE. If it will be dropped after |
614 | * socket is created, wait for troubles. | 615 | * socket is created, wait for troubles. |
615 | */ | 616 | */ |
616 | child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); | 617 | child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, |
618 | req, NULL); | ||
617 | if (child == NULL) | 619 | if (child == NULL) |
618 | goto listen_overflow; | 620 | goto listen_overflow; |
619 | 621 | ||
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b7325e0b406a..a7623ead39a8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -51,8 +51,8 @@ int sysctl_tcp_retrans_collapse = 1; | |||
51 | */ | 51 | */ |
52 | int sysctl_tcp_tso_win_divisor = 3; | 52 | int sysctl_tcp_tso_win_divisor = 3; |
53 | 53 | ||
54 | static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, | 54 | static void update_send_head(struct sock *sk, struct tcp_sock *tp, |
55 | struct sk_buff *skb) | 55 | struct sk_buff *skb) |
56 | { | 56 | { |
57 | sk->sk_send_head = skb->next; | 57 | sk->sk_send_head = skb->next; |
58 | if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) | 58 | if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) |
@@ -124,8 +124,8 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) | |||
124 | tp->snd_cwnd_used = 0; | 124 | tp->snd_cwnd_used = 0; |
125 | } | 125 | } |
126 | 126 | ||
127 | static inline void tcp_event_data_sent(struct tcp_sock *tp, | 127 | static void tcp_event_data_sent(struct tcp_sock *tp, |
128 | struct sk_buff *skb, struct sock *sk) | 128 | struct sk_buff *skb, struct sock *sk) |
129 | { | 129 | { |
130 | struct inet_connection_sock *icsk = inet_csk(sk); | 130 | struct inet_connection_sock *icsk = inet_csk(sk); |
131 | const u32 now = tcp_time_stamp; | 131 | const u32 now = tcp_time_stamp; |
@@ -142,7 +142,7 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp, | |||
142 | icsk->icsk_ack.pingpong = 1; | 142 | icsk->icsk_ack.pingpong = 1; |
143 | } | 143 | } |
144 | 144 | ||
145 | static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) | 145 | static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) |
146 | { | 146 | { |
147 | tcp_dec_quickack_mode(sk, pkts); | 147 | tcp_dec_quickack_mode(sk, pkts); |
148 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); | 148 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); |
@@ -212,7 +212,7 @@ void tcp_select_initial_window(int __space, __u32 mss, | |||
212 | * value can be stuffed directly into th->window for an outgoing | 212 | * value can be stuffed directly into th->window for an outgoing |
213 | * frame. | 213 | * frame. |
214 | */ | 214 | */ |
215 | static __inline__ u16 tcp_select_window(struct sock *sk) | 215 | static u16 tcp_select_window(struct sock *sk) |
216 | { | 216 | { |
217 | struct tcp_sock *tp = tcp_sk(sk); | 217 | struct tcp_sock *tp = tcp_sk(sk); |
218 | u32 cur_win = tcp_receive_window(tp); | 218 | u32 cur_win = tcp_receive_window(tp); |
@@ -250,6 +250,75 @@ static __inline__ u16 tcp_select_window(struct sock *sk) | |||
250 | return new_win; | 250 | return new_win; |
251 | } | 251 | } |
252 | 252 | ||
253 | static void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp, | ||
254 | __u32 tstamp) | ||
255 | { | ||
256 | if (tp->rx_opt.tstamp_ok) { | ||
257 | *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | | ||
258 | (TCPOPT_NOP << 16) | | ||
259 | (TCPOPT_TIMESTAMP << 8) | | ||
260 | TCPOLEN_TIMESTAMP); | ||
261 | *ptr++ = htonl(tstamp); | ||
262 | *ptr++ = htonl(tp->rx_opt.ts_recent); | ||
263 | } | ||
264 | if (tp->rx_opt.eff_sacks) { | ||
265 | struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks; | ||
266 | int this_sack; | ||
267 | |||
268 | *ptr++ = htonl((TCPOPT_NOP << 24) | | ||
269 | (TCPOPT_NOP << 16) | | ||
270 | (TCPOPT_SACK << 8) | | ||
271 | (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks * | ||
272 | TCPOLEN_SACK_PERBLOCK))); | ||
273 | for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { | ||
274 | *ptr++ = htonl(sp[this_sack].start_seq); | ||
275 | *ptr++ = htonl(sp[this_sack].end_seq); | ||
276 | } | ||
277 | if (tp->rx_opt.dsack) { | ||
278 | tp->rx_opt.dsack = 0; | ||
279 | tp->rx_opt.eff_sacks--; | ||
280 | } | ||
281 | } | ||
282 | } | ||
283 | |||
284 | /* Construct a tcp options header for a SYN or SYN_ACK packet. | ||
285 | * If this is every changed make sure to change the definition of | ||
286 | * MAX_SYN_SIZE to match the new maximum number of options that you | ||
287 | * can generate. | ||
288 | */ | ||
289 | static void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack, | ||
290 | int offer_wscale, int wscale, __u32 tstamp, | ||
291 | __u32 ts_recent) | ||
292 | { | ||
293 | /* We always get an MSS option. | ||
294 | * The option bytes which will be seen in normal data | ||
295 | * packets should timestamps be used, must be in the MSS | ||
296 | * advertised. But we subtract them from tp->mss_cache so | ||
297 | * that calculations in tcp_sendmsg are simpler etc. | ||
298 | * So account for this fact here if necessary. If we | ||
299 | * don't do this correctly, as a receiver we won't | ||
300 | * recognize data packets as being full sized when we | ||
301 | * should, and thus we won't abide by the delayed ACK | ||
302 | * rules correctly. | ||
303 | * SACKs don't matter, we never delay an ACK when we | ||
304 | * have any of those going out. | ||
305 | */ | ||
306 | *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); | ||
307 | if (ts) { | ||
308 | if(sack) | ||
309 | *ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | | ||
310 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); | ||
311 | else | ||
312 | *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | | ||
313 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); | ||
314 | *ptr++ = htonl(tstamp); /* TSVAL */ | ||
315 | *ptr++ = htonl(ts_recent); /* TSECR */ | ||
316 | } else if(sack) | ||
317 | *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | | ||
318 | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); | ||
319 | if (offer_wscale) | ||
320 | *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale)); | ||
321 | } | ||
253 | 322 | ||
254 | /* This routine actually transmits TCP packets queued in by | 323 | /* This routine actually transmits TCP packets queued in by |
255 | * tcp_do_sendmsg(). This is used by both the initial | 324 | * tcp_do_sendmsg(). This is used by both the initial |
@@ -371,7 +440,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
371 | TCP_ECN_send(sk, tp, skb, tcp_header_size); | 440 | TCP_ECN_send(sk, tp, skb, tcp_header_size); |
372 | } | 441 | } |
373 | 442 | ||
374 | tp->af_specific->send_check(sk, th, skb->len, skb); | 443 | icsk->icsk_af_ops->send_check(sk, skb->len, skb); |
375 | 444 | ||
376 | if (likely(tcb->flags & TCPCB_FLAG_ACK)) | 445 | if (likely(tcb->flags & TCPCB_FLAG_ACK)) |
377 | tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); | 446 | tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); |
@@ -381,7 +450,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
381 | 450 | ||
382 | TCP_INC_STATS(TCP_MIB_OUTSEGS); | 451 | TCP_INC_STATS(TCP_MIB_OUTSEGS); |
383 | 452 | ||
384 | err = tp->af_specific->queue_xmit(skb, 0); | 453 | err = icsk->icsk_af_ops->queue_xmit(skb, 0); |
385 | if (unlikely(err <= 0)) | 454 | if (unlikely(err <= 0)) |
386 | return err; | 455 | return err; |
387 | 456 | ||
@@ -621,7 +690,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) | |||
621 | It is minimum of user_mss and mss received with SYN. | 690 | It is minimum of user_mss and mss received with SYN. |
622 | It also does not include TCP options. | 691 | It also does not include TCP options. |
623 | 692 | ||
624 | tp->pmtu_cookie is last pmtu, seen by this function. | 693 | inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function. |
625 | 694 | ||
626 | tp->mss_cache is current effective sending mss, including | 695 | tp->mss_cache is current effective sending mss, including |
627 | all tcp options except for SACKs. It is evaluated, | 696 | all tcp options except for SACKs. It is evaluated, |
@@ -631,26 +700,26 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) | |||
631 | NOTE1. rfc1122 clearly states that advertised MSS | 700 | NOTE1. rfc1122 clearly states that advertised MSS |
632 | DOES NOT include either tcp or ip options. | 701 | DOES NOT include either tcp or ip options. |
633 | 702 | ||
634 | NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside | 703 | NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache |
635 | this function. --ANK (980731) | 704 | are READ ONLY outside this function. --ANK (980731) |
636 | */ | 705 | */ |
637 | 706 | ||
638 | unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) | 707 | unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) |
639 | { | 708 | { |
640 | struct tcp_sock *tp = tcp_sk(sk); | 709 | struct tcp_sock *tp = tcp_sk(sk); |
641 | int mss_now; | 710 | struct inet_connection_sock *icsk = inet_csk(sk); |
642 | |||
643 | /* Calculate base mss without TCP options: | 711 | /* Calculate base mss without TCP options: |
644 | It is MMS_S - sizeof(tcphdr) of rfc1122 | 712 | It is MMS_S - sizeof(tcphdr) of rfc1122 |
645 | */ | 713 | */ |
646 | mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr); | 714 | int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len - |
715 | sizeof(struct tcphdr)); | ||
647 | 716 | ||
648 | /* Clamp it (mss_clamp does not include tcp options) */ | 717 | /* Clamp it (mss_clamp does not include tcp options) */ |
649 | if (mss_now > tp->rx_opt.mss_clamp) | 718 | if (mss_now > tp->rx_opt.mss_clamp) |
650 | mss_now = tp->rx_opt.mss_clamp; | 719 | mss_now = tp->rx_opt.mss_clamp; |
651 | 720 | ||
652 | /* Now subtract optional transport overhead */ | 721 | /* Now subtract optional transport overhead */ |
653 | mss_now -= tp->ext_header_len; | 722 | mss_now -= icsk->icsk_ext_hdr_len; |
654 | 723 | ||
655 | /* Then reserve room for full set of TCP options and 8 bytes of data */ | 724 | /* Then reserve room for full set of TCP options and 8 bytes of data */ |
656 | if (mss_now < 48) | 725 | if (mss_now < 48) |
@@ -664,7 +733,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) | |||
664 | mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len); | 733 | mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len); |
665 | 734 | ||
666 | /* And store cached results */ | 735 | /* And store cached results */ |
667 | tp->pmtu_cookie = pmtu; | 736 | icsk->icsk_pmtu_cookie = pmtu; |
668 | tp->mss_cache = mss_now; | 737 | tp->mss_cache = mss_now; |
669 | 738 | ||
670 | return mss_now; | 739 | return mss_now; |
@@ -694,7 +763,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) | |||
694 | 763 | ||
695 | if (dst) { | 764 | if (dst) { |
696 | u32 mtu = dst_mtu(dst); | 765 | u32 mtu = dst_mtu(dst); |
697 | if (mtu != tp->pmtu_cookie) | 766 | if (mtu != inet_csk(sk)->icsk_pmtu_cookie) |
698 | mss_now = tcp_sync_mss(sk, mtu); | 767 | mss_now = tcp_sync_mss(sk, mtu); |
699 | } | 768 | } |
700 | 769 | ||
@@ -705,9 +774,10 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) | |||
705 | xmit_size_goal = mss_now; | 774 | xmit_size_goal = mss_now; |
706 | 775 | ||
707 | if (doing_tso) { | 776 | if (doing_tso) { |
708 | xmit_size_goal = 65535 - | 777 | xmit_size_goal = (65535 - |
709 | tp->af_specific->net_header_len - | 778 | inet_csk(sk)->icsk_af_ops->net_header_len - |
710 | tp->ext_header_len - tp->tcp_header_len; | 779 | inet_csk(sk)->icsk_ext_hdr_len - |
780 | tp->tcp_header_len); | ||
711 | 781 | ||
712 | if (tp->max_window && | 782 | if (tp->max_window && |
713 | (xmit_size_goal > (tp->max_window >> 1))) | 783 | (xmit_size_goal > (tp->max_window >> 1))) |
@@ -723,7 +793,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) | |||
723 | 793 | ||
724 | /* Congestion window validation. (RFC2861) */ | 794 | /* Congestion window validation. (RFC2861) */ |
725 | 795 | ||
726 | static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) | 796 | static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) |
727 | { | 797 | { |
728 | __u32 packets_out = tp->packets_out; | 798 | __u32 packets_out = tp->packets_out; |
729 | 799 | ||
@@ -772,7 +842,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk | |||
772 | /* This must be invoked the first time we consider transmitting | 842 | /* This must be invoked the first time we consider transmitting |
773 | * SKB onto the wire. | 843 | * SKB onto the wire. |
774 | */ | 844 | */ |
775 | static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) | 845 | static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) |
776 | { | 846 | { |
777 | int tso_segs = tcp_skb_pcount(skb); | 847 | int tso_segs = tcp_skb_pcount(skb); |
778 | 848 | ||
@@ -1422,7 +1492,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
1422 | (sysctl_tcp_retrans_collapse != 0)) | 1492 | (sysctl_tcp_retrans_collapse != 0)) |
1423 | tcp_retrans_try_collapse(sk, skb, cur_mss); | 1493 | tcp_retrans_try_collapse(sk, skb, cur_mss); |
1424 | 1494 | ||
1425 | if(tp->af_specific->rebuild_header(sk)) | 1495 | if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) |
1426 | return -EHOSTUNREACH; /* Routing failure or similar. */ | 1496 | return -EHOSTUNREACH; /* Routing failure or similar. */ |
1427 | 1497 | ||
1428 | /* Some Solaris stacks overoptimize and ignore the FIN on a | 1498 | /* Some Solaris stacks overoptimize and ignore the FIN on a |
@@ -1793,7 +1863,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
1793 | /* | 1863 | /* |
1794 | * Do all connect socket setups that can be done AF independent. | 1864 | * Do all connect socket setups that can be done AF independent. |
1795 | */ | 1865 | */ |
1796 | static inline void tcp_connect_init(struct sock *sk) | 1866 | static void tcp_connect_init(struct sock *sk) |
1797 | { | 1867 | { |
1798 | struct dst_entry *dst = __sk_dst_get(sk); | 1868 | struct dst_entry *dst = __sk_dst_get(sk); |
1799 | struct tcp_sock *tp = tcp_sk(sk); | 1869 | struct tcp_sock *tp = tcp_sk(sk); |
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 13e7e6e8df16..3b7403495052 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c | |||
@@ -330,6 +330,10 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, | |||
330 | vegas->cntRTT = 0; | 330 | vegas->cntRTT = 0; |
331 | vegas->minRTT = 0x7fffffff; | 331 | vegas->minRTT = 0x7fffffff; |
332 | } | 332 | } |
333 | /* Use normal slow start */ | ||
334 | else if (tp->snd_cwnd <= tp->snd_ssthresh) | ||
335 | tcp_slow_start(tp); | ||
336 | |||
333 | } | 337 | } |
334 | 338 | ||
335 | /* Extract info for Tcp socket info provided via netlink. */ | 339 | /* Extract info for Tcp socket info provided via netlink. */ |
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2422a5f7195d..223abaa72bc5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
@@ -86,6 +86,7 @@ | |||
86 | #include <linux/module.h> | 86 | #include <linux/module.h> |
87 | #include <linux/socket.h> | 87 | #include <linux/socket.h> |
88 | #include <linux/sockios.h> | 88 | #include <linux/sockios.h> |
89 | #include <linux/igmp.h> | ||
89 | #include <linux/in.h> | 90 | #include <linux/in.h> |
90 | #include <linux/errno.h> | 91 | #include <linux/errno.h> |
91 | #include <linux/timer.h> | 92 | #include <linux/timer.h> |
@@ -846,20 +847,7 @@ out: | |||
846 | csum_copy_err: | 847 | csum_copy_err: |
847 | UDP_INC_STATS_BH(UDP_MIB_INERRORS); | 848 | UDP_INC_STATS_BH(UDP_MIB_INERRORS); |
848 | 849 | ||
849 | /* Clear queue. */ | 850 | skb_kill_datagram(sk, skb, flags); |
850 | if (flags&MSG_PEEK) { | ||
851 | int clear = 0; | ||
852 | spin_lock_bh(&sk->sk_receive_queue.lock); | ||
853 | if (skb == skb_peek(&sk->sk_receive_queue)) { | ||
854 | __skb_unlink(skb, &sk->sk_receive_queue); | ||
855 | clear = 1; | ||
856 | } | ||
857 | spin_unlock_bh(&sk->sk_receive_queue.lock); | ||
858 | if (clear) | ||
859 | kfree_skb(skb); | ||
860 | } | ||
861 | |||
862 | skb_free_datagram(sk, skb); | ||
863 | 851 | ||
864 | if (noblock) | 852 | if (noblock) |
865 | return -EAGAIN; | 853 | return -EAGAIN; |
@@ -1094,7 +1082,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh, | |||
1094 | * Otherwise, csum completion requires checksumming packet body, | 1082 | * Otherwise, csum completion requires checksumming packet body, |
1095 | * including udp header and folding it to skb->csum. | 1083 | * including udp header and folding it to skb->csum. |
1096 | */ | 1084 | */ |
1097 | static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, | 1085 | static void udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, |
1098 | unsigned short ulen, u32 saddr, u32 daddr) | 1086 | unsigned short ulen, u32 saddr, u32 daddr) |
1099 | { | 1087 | { |
1100 | if (uh->check == 0) { | 1088 | if (uh->check == 0) { |
@@ -1108,7 +1096,6 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, | |||
1108 | /* Probably, we should checksum udp header (it should be in cache | 1096 | /* Probably, we should checksum udp header (it should be in cache |
1109 | * in any case) and data in tiny packets (< rx copybreak). | 1097 | * in any case) and data in tiny packets (< rx copybreak). |
1110 | */ | 1098 | */ |
1111 | return 0; | ||
1112 | } | 1099 | } |
1113 | 1100 | ||
1114 | /* | 1101 | /* |
@@ -1141,8 +1128,7 @@ int udp_rcv(struct sk_buff *skb) | |||
1141 | if (pskb_trim_rcsum(skb, ulen)) | 1128 | if (pskb_trim_rcsum(skb, ulen)) |
1142 | goto short_packet; | 1129 | goto short_packet; |
1143 | 1130 | ||
1144 | if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) | 1131 | udp_checksum_init(skb, uh, ulen, saddr, daddr); |
1145 | goto csum_error; | ||
1146 | 1132 | ||
1147 | if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) | 1133 | if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) |
1148 | return udp_v4_mcast_deliver(skb, uh, saddr, daddr); | 1134 | return udp_v4_mcast_deliver(skb, uh, saddr, daddr); |