| author | Jeff Garzik <jgarzik@pobox.com> | 2005-08-30 03:48:57 -0400 |
|---|---|---|
| committer | Jeff Garzik <jgarzik@pobox.com> | 2005-08-30 03:48:57 -0400 |
| commit | 2fcf522509cceea524b6e7ece8fd6759b682175a (patch) | |
| tree | d356e87307e451cce5497ad8daeeeb047befe489 /net/ipv4 | |
| parent | da61396d24e37258817e42537c482e962b4742f7 (diff) | |
| parent | 1fdab81e675c6ef76a49b8aabb7eaf4be51d1b80 (diff) | |
Merge /spare/repo/libata-dev branch 'master'
Diffstat (limited to 'net/ipv4')
105 files changed, 7415 insertions, 4538 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 0b3d9f1d8069..e55136ae09f4 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -413,20 +413,19 @@ config INET_TUNNEL
 
 	  If unsure, say Y.
 
-config IP_TCPDIAG
-	tristate "IP: TCP socket monitoring interface"
+config INET_DIAG
+	tristate "INET: socket monitoring interface"
 	default y
 	---help---
-	  Support for TCP socket monitoring interface used by native Linux
-	  tools such as ss. ss is included in iproute2, currently downloadable
-	  at <http://developer.osdl.org/dev/iproute2>. If you want IPv6 support
-	  and have selected IPv6 as a module, you need to build this as a
-	  module too.
+	  Support for INET (TCP, DCCP, etc) socket monitoring interface used by
+	  native Linux tools such as ss. ss is included in iproute2, currently
+	  downloadable at <http://developer.osdl.org/dev/iproute2>.
 
 	  If unsure, say Y.
 
-config IP_TCPDIAG_IPV6
-	def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
+config INET_TCP_DIAG
+	depends on INET_DIAG
+	def_tristate INET_DIAG
 
 config TCP_CONG_ADVANCED
 	bool "TCP: advanced congestion control"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 55dc6cca1e7b..f0435d00db6b 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -4,11 +4,12 @@
 
 obj-y     := route.o inetpeer.o protocol.o \
 	     ip_input.o ip_fragment.o ip_forward.o ip_options.o \
-	     ip_output.o ip_sockglue.o \
+	     ip_output.o ip_sockglue.o inet_hashtables.o \
+	     inet_timewait_sock.o inet_connection_sock.o \
 	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
 	     tcp_minisocks.o tcp_cong.o \
 	     datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
-	     sysctl_net_ipv4.o fib_frontend.o fib_semantics.o
+	     sysctl_net_ipv4.o fib_frontend.o fib_semantics.o netfilter.o
 
 obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
 obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
@@ -29,8 +30,9 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o
 obj-$(CONFIG_NETFILTER) += netfilter/
 obj-$(CONFIG_IP_VS) += ipvs/
-obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o
+obj-$(CONFIG_INET_DIAG) += inet_diag.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
+obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
 obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 163ae4068b5f..bf147f8db399 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -99,6 +99,7 @@
 #include <net/arp.h>
 #include <net/route.h>
 #include <net/ip_fib.h>
+#include <net/inet_connection_sock.h>
 #include <net/tcp.h>
 #include <net/udp.h>
 #include <linux/skbuff.h>
@@ -112,11 +113,7 @@
 #include <linux/mroute.h>
 #endif
 
-DEFINE_SNMP_STAT(struct linux_mib, net_statistics);
-
-#ifdef INET_REFCNT_DEBUG
-atomic_t inet_sock_nr;
-#endif
+DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly;
 
 extern void ip_mc_drop_socket(struct sock *sk);
 
@@ -153,11 +150,7 @@ void inet_sock_destruct(struct sock *sk)
 	if (inet->opt)
 		kfree(inet->opt);
 	dst_release(sk->sk_dst_cache);
-#ifdef INET_REFCNT_DEBUG
-	atomic_dec(&inet_sock_nr);
-	printk(KERN_DEBUG "INET socket %p released, %d are still alive\n",
-	       sk, atomic_read(&inet_sock_nr));
-#endif
+	sk_refcnt_debug_dec(sk);
 }
 
 /*
@@ -210,7 +203,7 @@ int inet_listen(struct socket *sock, int backlog)
 	 * we can only allow the backlog to be adjusted.
 	 */
 	if (old_state != TCP_LISTEN) {
-		err = tcp_listen_start(sk);
+		err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
 		if (err)
 			goto out;
 	}
@@ -235,12 +228,14 @@ static int inet_create(struct socket *sock, int protocol)
 	struct proto *answer_prot;
 	unsigned char answer_flags;
 	char answer_no_check;
-	int err;
+	int try_loading_module = 0;
+	int err = -ESOCKTNOSUPPORT;
 
 	sock->state = SS_UNCONNECTED;
 
 	/* Look for the requested type/protocol pair. */
 	answer = NULL;
+lookup_protocol:
 	rcu_read_lock();
 	list_for_each_rcu(p, &inetsw[sock->type]) {
 		answer = list_entry(p, struct inet_protosw, list);
@@ -261,9 +256,28 @@ static int inet_create(struct socket *sock, int protocol)
 		answer = NULL;
 	}
 
-	err = -ESOCKTNOSUPPORT;
-	if (!answer)
-		goto out_rcu_unlock;
+	if (unlikely(answer == NULL)) {
+		if (try_loading_module < 2) {
+			rcu_read_unlock();
+			/*
+			 * Be more specific, e.g. net-pf-2-proto-132-type-1
+			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
+			 */
+			if (++try_loading_module == 1)
+				request_module("net-pf-%d-proto-%d-type-%d",
+					       PF_INET, protocol, sock->type);
+			/*
+			 * Fall back to generic, e.g. net-pf-2-proto-132
+			 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
+			 */
+			else
+				request_module("net-pf-%d-proto-%d",
+					       PF_INET, protocol);
+			goto lookup_protocol;
+		} else
+			goto out_rcu_unlock;
+	}
+
 	err = -EPERM;
 	if (answer->capability > 0 && !capable(answer->capability))
 		goto out_rcu_unlock;
@@ -317,9 +331,7 @@ static int inet_create(struct socket *sock, int protocol)
 	inet->mc_index = 0;
 	inet->mc_list = NULL;
 
-#ifdef INET_REFCNT_DEBUG
-	atomic_inc(&inet_sock_nr);
-#endif
+	sk_refcnt_debug_inc(sk);
 
 	if (inet->num) {
 		/* It assumes that any protocol which allows
@@ -847,10 +859,6 @@ static struct net_proto_family inet_family_ops = {
 	.owner	= THIS_MODULE,
 };
 
-
-extern void tcp_init(void);
-extern void tcp_v4_init(struct net_proto_family *);
-
 /* Upon startup we insert all the elements in inetsw_array[] into
  * the linked list inetsw.
  */
@@ -961,6 +969,119 @@ void inet_unregister_protosw(struct inet_protosw *p)
 	}
 }
 
+/*
+ *      Shall we try to damage output packets if routing dev changes?
+ */
+
+int sysctl_ip_dynaddr;
+
+static int inet_sk_reselect_saddr(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	int err;
+	struct rtable *rt;
+	__u32 old_saddr = inet->saddr;
+	__u32 new_saddr;
+	__u32 daddr = inet->daddr;
+
+	if (inet->opt && inet->opt->srr)
+		daddr = inet->opt->faddr;
+
+	/* Query new route. */
+	err = ip_route_connect(&rt, daddr, 0,
+			       RT_CONN_FLAGS(sk),
+			       sk->sk_bound_dev_if,
+			       sk->sk_protocol,
+			       inet->sport, inet->dport, sk);
+	if (err)
+		return err;
+
+	sk_setup_caps(sk, &rt->u.dst);
+
+	new_saddr = rt->rt_src;
+
+	if (new_saddr == old_saddr)
+		return 0;
+
+	if (sysctl_ip_dynaddr > 1) {
+		printk(KERN_INFO "%s(): shifting inet->"
+				 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
+		       __FUNCTION__,
+		       NIPQUAD(old_saddr),
+		       NIPQUAD(new_saddr));
+	}
+
+	inet->saddr = inet->rcv_saddr = new_saddr;
+
+	/*
+	 * XXX The only one ugly spot where we need to
+	 * XXX really change the sockets identity after
+	 * XXX it has entered the hashes. -DaveM
+	 *
+	 * Besides that, it does not check for connection
+	 * uniqueness. Wait for troubles.
+	 */
+	__sk_prot_rehash(sk);
+	return 0;
+}
+
+int inet_sk_rebuild_header(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
+	u32 daddr;
+	int err;
+
+	/* Route is OK, nothing to do. */
+	if (rt)
+		return 0;
+
+	/* Reroute. */
+	daddr = inet->daddr;
+	if (inet->opt && inet->opt->srr)
+		daddr = inet->opt->faddr;
+	{
+		struct flowi fl = {
+			.oif = sk->sk_bound_dev_if,
+			.nl_u = {
+				.ip4_u = {
+					.daddr	= daddr,
+					.saddr	= inet->saddr,
+					.tos	= RT_CONN_FLAGS(sk),
+				},
+			},
+			.proto = sk->sk_protocol,
+			.uli_u = {
+				.ports = {
+					.sport = inet->sport,
+					.dport = inet->dport,
+				},
+			},
+		};
+
+		err = ip_route_output_flow(&rt, &fl, sk, 0);
+	}
+	if (!err)
+		sk_setup_caps(sk, &rt->u.dst);
+	else {
+		/* Routing failed... */
+		sk->sk_route_caps = 0;
+		/*
+		 * Other protocols have to map its equivalent state to TCP_SYN_SENT.
+		 * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
+		 */
+		if (!sysctl_ip_dynaddr ||
+		    sk->sk_state != TCP_SYN_SENT ||
+		    (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
+		    (err = inet_sk_reselect_saddr(sk)) != 0)
+			sk->sk_err_soft = -err;
+	}
+
+	return err;
+}
+
+EXPORT_SYMBOL(inet_sk_rebuild_header);
+
 #ifdef CONFIG_IP_MULTICAST
 static struct net_protocol igmp_protocol = {
 	.handler =	igmp_rcv,
@@ -1007,7 +1128,6 @@ static int __init init_ipv4_mibs(void)
 }
 
 static int ipv4_proc_init(void);
-extern void ipfrag_init(void);
 
 /*
  *	IP protocol layer initialiser
@@ -1128,19 +1248,10 @@ module_init(inet_init);
 /* ------------------------------------------------------------------------ */
 
 #ifdef CONFIG_PROC_FS
-extern int  fib_proc_init(void);
-extern void fib_proc_exit(void);
 #ifdef CONFIG_IP_FIB_TRIE
 extern int  fib_stat_proc_init(void);
 extern void fib_stat_proc_exit(void);
 #endif
-extern int  ip_misc_proc_init(void);
-extern int  raw_proc_init(void);
-extern void raw_proc_exit(void);
-extern int  tcp4_proc_init(void);
-extern void tcp4_proc_exit(void);
-extern int  udp4_proc_init(void);
-extern void udp4_proc_exit(void);
 
 static int __init ipv4_proc_init(void)
 {
@@ -1205,7 +1316,3 @@ EXPORT_SYMBOL(inet_stream_ops);
 EXPORT_SYMBOL(inet_unregister_protosw);
 EXPORT_SYMBOL(net_statistics);
 EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
-
-#ifdef INET_REFCNT_DEBUG
-EXPORT_SYMBOL(inet_sock_nr);
-#endif
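The retry loop added to inet_create() only works if protocol modules export the aliases that request_module() asks for. A minimal sketch of the module side, assuming a hypothetical out-of-tree protocol module; the init/exit bodies are placeholders, and the alias strings are the ones spelled out in the patch comments (PF_INET=2, IPPROTO_SCTP=132, SOCK_STREAM=1):

```c
/*
 * Hedged sketch, not from the patch: how a protocol module makes
 * itself loadable via the request_module() calls added to
 * inet_create() above.
 */
#include <linux/module.h>
#include <linux/init.h>

static int __init example_proto_init(void)
{
	/* would call inet_register_protosw() for its inet_protosw entries */
	return 0;
}

static void __exit example_proto_exit(void)
{
	/* would call inet_unregister_protosw() here */
}

module_init(example_proto_init);
module_exit(example_proto_exit);

/* matched by request_module("net-pf-%d-proto-%d-type-%d", ...) */
MODULE_ALIAS("net-pf-2-proto-132-type-1");
/* matched by the generic fallback request_module("net-pf-%d-proto-%d", ...) */
MODULE_ALIAS("net-pf-2-proto-132");
MODULE_LICENSE("GPL");
```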
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index a642fd612853..8bf312bdea13 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -700,7 +700,7 @@ void arp_send(int type, int ptype, u32 dest_ip,
 static void parp_redo(struct sk_buff *skb)
 {
 	nf_reset(skb);
-	arp_rcv(skb, skb->dev, NULL);
+	arp_rcv(skb, skb->dev, NULL, skb->dev);
 }
 
 /*
@@ -865,7 +865,7 @@ static int arp_process(struct sk_buff *skb)
 	if (n)
 		neigh_release(n);
 
-	if (skb->stamp.tv_sec == LOCALLY_ENQUEUED ||
+	if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
 	    skb->pkt_type == PACKET_HOST ||
 	    in_dev->arp_parms->proxy_delay == 0) {
 		arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
@@ -927,7 +927,7 @@ out:
  *	Receive an arp request from the device layer.
  */
 
-int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct arphdr *arp;
 
@@ -948,6 +948,8 @@ int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
 	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
 		goto out_of_mem;
 
+	memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
+
 	return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
 
 freeskb:
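The arp.c hunks stop overloading skb->stamp and instead keep the LOCALLY_ENQUEUED flag in the skb control buffer via NEIGH_CB(). A sketch of that pattern, assuming a neighbour_cb layout like the one this series adds to <net/neighbour.h>; field names and the mark_locally_enqueued() helper are illustrative:

```c
/*
 * Hedged sketch of the skb control-buffer pattern the arp.c hunks
 * switch to: scratch state rides in skb->cb instead of abusing
 * skb->stamp. Treat the layout and helper as illustrative.
 */
#include <linux/skbuff.h>
#include <linux/string.h>

struct neighbour_cb {
	unsigned long sched_next;
	unsigned int flags;
};

#define LOCALLY_ENQUEUED 0x1

#define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb)

static void mark_locally_enqueued(struct sk_buff *skb)
{
	/* arp_rcv() zeroes the cb area first, as the hunk above shows */
	memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
	NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED;
}
```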
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index b1db561f2542..c1b42b5257f8 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -16,9 +16,10 @@
 #include <linux/module.h>
 #include <linux/ip.h>
 #include <linux/in.h>
+#include <net/ip.h>
 #include <net/sock.h>
-#include <net/tcp.h>
 #include <net/route.h>
+#include <net/tcp_states.h>
 
 int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d8a10e3dd77d..ba2895ae8151 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1111,13 +1111,12 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa)
 	struct sk_buff *skb = alloc_skb(size, GFP_KERNEL);
 
 	if (!skb)
-		netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS);
+		netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, ENOBUFS);
 	else if (inet_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) {
 		kfree_skb(skb);
-		netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL);
+		netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, EINVAL);
 	} else {
-		NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR;
-		netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL);
+		netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV4_IFADDR, GFP_KERNEL);
 	}
 }
 
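The RTMGRP_* to RTNLGRP_* switch here tracks rtnetlink's move from a 32-bit group bitmask to numbered multicast groups, which is why netlink_broadcast() now takes a plain group id. A hedged side-by-side of the two encodings; constant values follow the kernel headers of this period but are shown only for illustration:

```c
/*
 * Hedged sketch of the two rtnetlink group encodings this hunk
 * converts between.
 */

/* Old style: one bit per multicast group, so at most 32 groups. */
#define RTMGRP_LINK		0x1
#define RTMGRP_NOTIFY		0x2
#define RTMGRP_NEIGH		0x4
#define RTMGRP_TC		0x8
#define RTMGRP_IPV4_IFADDR	0x10

/* New style: groups are sequential ids, lifting the 32-group limit. */
enum rtnetlink_groups {
	RTNLGRP_NONE,
	RTNLGRP_LINK,
	RTNLGRP_NOTIFY,
	RTNLGRP_NEIGH,
	RTNLGRP_TC,
	RTNLGRP_IPV4_IFADDR,	/* a plain id (5), not a mask bit */
};
```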
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index ba57446d5d1f..b31ffc5053d2 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -331,8 +331,8 @@ static void esp4_err(struct sk_buff *skb, u32 info)
 	x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
 	if (!x)
 		return;
-	NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
-			ntohl(esph->spi), ntohl(iph->daddr)));
+	NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
+		 ntohl(esph->spi), ntohl(iph->daddr));
 	xfrm_state_put(x);
 }
 
@@ -395,10 +395,10 @@ static int esp_init_state(struct xfrm_state *x)
 
 		if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
 		    crypto_tfm_alg_digestsize(esp->auth.tfm)) {
-			NETDEBUG(printk(KERN_INFO "ESP: %s digestsize %u != %hu\n",
-				x->aalg->alg_name,
-				crypto_tfm_alg_digestsize(esp->auth.tfm),
-				aalg_desc->uinfo.auth.icv_fullbits/8));
+			NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n",
+				 x->aalg->alg_name,
+				 crypto_tfm_alg_digestsize(esp->auth.tfm),
+				 aalg_desc->uinfo.auth.icv_fullbits/8);
 			goto error;
 		}
 
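These esp4.c call sites change because NETDEBUG() now takes printk-style arguments itself rather than wrapping a complete printk() statement. A plausible before/after shape for the macro; the real definition lives in <net/ip.h> and may additionally be gated on a sysctl or compiled out:

```c
/*
 * Hedged sketch of the NETDEBUG() change implied by the esp4.c hunks;
 * treat both shapes as approximations of the real <net/ip.h> macro.
 */
#include <linux/kernel.h>

/* old shape: caller passes a whole statement, NETDEBUG(printk(...)); */
#define NETDEBUG_OLD(x) do { x; } while (0)

/* new shape: caller passes format and arguments, like printk() itself */
#define NETDEBUG(fmt, args...) printk(fmt, ##args)
```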
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index cd8e45ab9580..4e1379f71269 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -558,16 +558,15 @@ static void nl_fib_input(struct sock *sk, int len)
 	nl_fib_lookup(frn, tb);
 
 	pid = nlh->nlmsg_pid;           /*pid of sending process */
-	NETLINK_CB(skb).groups = 0;     /* not in mcast group */
 	NETLINK_CB(skb).pid = 0;        /* from kernel */
 	NETLINK_CB(skb).dst_pid = pid;
-	NETLINK_CB(skb).dst_groups = 0; /* unicast */
+	NETLINK_CB(skb).dst_group = 0;  /* unicast */
 	netlink_unicast(sk, skb, pid, MSG_DONTWAIT);
 }
 
 static void nl_fib_lookup_init(void)
 {
-	netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input);
+	netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, THIS_MODULE);
 }
 
 static void fib_disable_ip(struct net_device *dev, int force)
@@ -662,5 +661,4 @@ void __init ip_fib_init(void)
 }
 
 EXPORT_SYMBOL(inet_addr_type);
-EXPORT_SYMBOL(ip_dev_find);
 EXPORT_SYMBOL(ip_rt_ioctl);
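netlink_kernel_create() gains two arguments in this series: a multicast group count and an owning module. A minimal usage sketch mirroring the nl_fib_lookup_init() call above; example_input and example_nl_init are hypothetical names:

```c
/*
 * Hedged usage sketch of the expanded netlink_kernel_create()
 * signature seen in the fib_frontend.c hunk: 0 multicast groups,
 * THIS_MODULE as owner.
 */
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/netlink.h>
#include <net/sock.h>

static struct sock *example_nl_sock;

static void example_input(struct sock *sk, int len)
{
	/* drain sk->sk_receive_queue and answer requests here */
}

static int __init example_nl_init(void)
{
	example_nl_sock = netlink_kernel_create(NETLINK_FIB_LOOKUP, 0,
						example_input, THIS_MODULE);
	return example_nl_sock ? 0 : -ENOMEM;
}
```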
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index b10d6bb5ef3d..2a8c9afc3695 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -45,8 +45,8 @@
 
 #include "fib_lookup.h"
 
-static kmem_cache_t *fn_hash_kmem;
-static kmem_cache_t *fn_alias_kmem;
+static kmem_cache_t *fn_hash_kmem __read_mostly;
+static kmem_cache_t *fn_alias_kmem __read_mostly;
 
 struct fib_node {
 	struct hlist_node fn_hash;
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index b729d97cfa93..ef6609ea0eb7 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -7,6 +7,7 @@
 
 struct fib_alias {
 	struct list_head	fa_list;
+	struct rcu_head rcu;
 	struct fib_info		*fa_info;
 	u8			fa_tos;
 	u8			fa_type;
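The rcu_head embedded into struct fib_alias is what allows the fib_trie.c hunks below to free aliases with call_rcu() while lockless readers may still be traversing the list. The general deferred-free pattern, sketched on a stand-in struct item rather than code from the patch:

```c
/*
 * Sketch of the RCU deferred-free pattern that the new rcu_head field
 * enables; the same shape appears in the fib_trie.c hunks further down.
 */
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct item {
	struct list_head list;
	struct rcu_head rcu;
	int value;
};

static void __item_free(struct rcu_head *head)
{
	/* runs only after all pre-existing RCU readers have finished */
	kfree(container_of(head, struct item, rcu));
}

static void item_free_rcu(struct item *it)
{
	list_del_rcu(&it->list);	/* unlink first, under the writer lock */
	call_rcu(&it->rcu, __item_free);
}
```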
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e278cb9d0075..d41219e8037c 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -290,10 +290,10 @@ void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
 		kfree_skb(skb);
 		return;
 	}
-	NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE;
+	NETLINK_CB(skb).dst_group = RTNLGRP_IPV4_ROUTE;
 	if (n->nlmsg_flags&NLM_F_ECHO)
 		atomic_inc(&skb->users);
-	netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL);
+	netlink_broadcast(rtnl, skb, pid, RTNLGRP_IPV4_ROUTE, GFP_KERNEL);
 	if (n->nlmsg_flags&NLM_F_ECHO)
 		netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
 }
@@ -854,6 +854,7 @@ failure:
 	return NULL;
 }
 
+/* Note! fib_semantic_match intentionally uses RCU list functions. */
 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
 		       struct fib_result *res, __u32 zone, __u32 mask,
 			int prefixlen)
@@ -861,7 +862,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
 	struct fib_alias *fa;
 	int nh_sel = 0;
 
-	list_for_each_entry(fa, head, fa_list) {
+	list_for_each_entry_rcu(fa, head, fa_list) {
 		int err;
 
 		if (fa->fa_tos &&
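list_for_each_entry_rcu() is only safe between rcu_read_lock() and rcu_read_unlock(), a contract fib_semantic_match() now pushes onto its callers (the fib_trie.c comment below says the same). A reader-side sketch with a stand-in struct and a hypothetical find_tos() helper:

```c
/*
 * Hedged reader-side sketch of the contract behind the
 * list_for_each_entry_rcu() conversion above. The struct and helper
 * are stand-ins, not code from the patch. (list_for_each_entry_rcu
 * lived in <linux/list.h> in this era; later kernels use
 * <linux/rculist.h>.)
 */
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/types.h>

struct alias_like {
	struct list_head fa_list;
	u8 fa_tos;
};

static int find_tos(struct list_head *head, u8 tos)
{
	struct alias_like *fa;
	int found = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(fa, head, fa_list) {
		if (fa->fa_tos == tos) {
			found = 1;
			break;
		}
	}
	rcu_read_unlock();
	return found;
}
```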
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 45efd5f4741b..b2dea4e5da77 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -43,7 +43,7 @@
  *		2 of the License, or (at your option) any later version.
  */
 
-#define VERSION "0.325"
+#define VERSION "0.402"
 
 #include <linux/config.h>
 #include <asm/uaccess.h>
@@ -62,6 +62,7 @@
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/init.h>
@@ -77,56 +78,55 @@
 #undef CONFIG_IP_FIB_TRIE_STATS
 #define MAX_CHILDS 16384
 
-#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
 #define KEYLENGTH (8*sizeof(t_key))
 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
 
-static DEFINE_RWLOCK(fib_lock);
-
 typedef unsigned int t_key;
 
 #define T_TNODE 0
 #define T_LEAF  1
 #define NODE_TYPE_MASK	0x1UL
-#define NODE_PARENT(_node) \
-	((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
-#define NODE_SET_PARENT(_node, _ptr) \
-	((_node)->_parent = (((unsigned long)(_ptr)) | \
-		     ((_node)->_parent & NODE_TYPE_MASK)))
-#define NODE_INIT_PARENT(_node, _type) \
-	((_node)->_parent = (_type))
-#define NODE_TYPE(_node) \
-	((_node)->_parent & NODE_TYPE_MASK)
-
-#define IS_TNODE(n) (!(n->_parent & T_LEAF))
-#define IS_LEAF(n) (n->_parent & T_LEAF)
+#define NODE_PARENT(node) \
+	((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
+
+#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
+
+#define NODE_SET_PARENT(node, ptr)		\
+	rcu_assign_pointer((node)->parent,	\
+			   ((unsigned long)(ptr)) | NODE_TYPE(node))
+
+#define IS_TNODE(n) (!(n->parent & T_LEAF))
+#define IS_LEAF(n) (n->parent & T_LEAF)
 
 struct node {
 	t_key key;
-	unsigned long _parent;
+	unsigned long parent;
 };
 
 struct leaf {
 	t_key key;
-	unsigned long _parent;
+	unsigned long parent;
 	struct hlist_head list;
+	struct rcu_head rcu;
 };
 
 struct leaf_info {
 	struct hlist_node hlist;
+	struct rcu_head rcu;
 	int plen;
 	struct list_head falh;
 };
 
 struct tnode {
 	t_key key;
-	unsigned long _parent;
+	unsigned long parent;
 	unsigned short pos:5;		/* 2log(KEYLENGTH) bits needed */
 	unsigned short bits:5;		/* 2log(KEYLENGTH) bits needed */
 	unsigned short full_children;	/* KEYLENGTH bits needed */
 	unsigned short empty_children;	/* KEYLENGTH bits needed */
+	struct rcu_head rcu;
 	struct node *child[0];
 };
 
 #ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -150,77 +150,45 @@ struct trie_stat {
 };
 
 struct trie {
 	struct node *trie;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 	struct trie_use_stats stats;
 #endif
 	int size;
 	unsigned int revision;
 };
 
-static int trie_debug = 0;
-
-static int tnode_full(struct tnode *tn, struct node *n);
 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
-static int tnode_child_length(struct tnode *tn);
 static struct node *resize(struct trie *t, struct tnode *tn);
-static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
-static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
+static struct tnode *inflate(struct trie *t, struct tnode *tn);
+static struct tnode *halve(struct trie *t, struct tnode *tn);
 static void tnode_free(struct tnode *tn);
 static void trie_dump_seq(struct seq_file *seq, struct trie *t);
-extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
-extern int fib_detect_death(struct fib_info *fi, int order,
-			    struct fib_info **last_resort, int *last_idx, int *dflt);
-
-extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
-		      struct nlmsghdr *n, struct netlink_skb_parms *req);
 
-static kmem_cache_t *fn_alias_kmem;
+static kmem_cache_t *fn_alias_kmem __read_mostly;
 static struct trie *trie_local = NULL, *trie_main = NULL;
 
-static void trie_bug(char *err)
-{
-	printk("Trie Bug: %s\n", err);
-	BUG();
-}
+
+/* rcu_read_lock needs to be hold by caller from readside */
 
 static inline struct node *tnode_get_child(struct tnode *tn, int i)
 {
-	if (i >= 1<<tn->bits)
-		trie_bug("tnode_get_child");
+	BUG_ON(i >= 1 << tn->bits);
 
-	return tn->child[i];
+	return rcu_dereference(tn->child[i]);
 }
 
-static inline int tnode_child_length(struct tnode *tn)
+static inline int tnode_child_length(const struct tnode *tn)
 {
-	return 1<<tn->bits;
+	return 1 << tn->bits;
 }
 
-/*
-  _________________________________________________________________
-  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
-  ----------------------------------------------------------------
-    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
-
-  _________________________________________________________________
-  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
-  -----------------------------------------------------------------
-   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
-
-  tp->pos = 7
-  tp->bits = 3
-  n->pos = 15
-  n->bits=4
-  KEYLENGTH=32
-*/
-
 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
 {
 	if (offset < KEYLENGTH)
 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
 	else
 		return 0;
 }
 
@@ -233,8 +201,8 @@ static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
 {
 	if (bits == 0 || offset >= KEYLENGTH)
 		return 1;
-        bits = bits > KEYLENGTH ? KEYLENGTH : bits;
-        return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
+	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
+	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
 }
 
 static inline int tkey_mismatch(t_key a, int offset, t_key b)
@@ -249,14 +217,6 @@ static inline int tkey_mismatch(t_key a, int offset, t_key b)
 	return i;
 }
 
-/* Candiate for fib_semantics */
-
-static void fn_free_alias(struct fib_alias *fa)
-{
-	fib_release_info(fa->fa_info);
-	kmem_cache_free(fn_alias_kmem, fa);
-}
-
 /*
   To understand this stuff, an understanding of keys and all their bits is
   necessary. Every node in the trie has a key associated with it, but not
@@ -295,7 +255,7 @@ static void fn_free_alias(struct fib_alias *fa)
   tp->pos = 7
   tp->bits = 3
   n->pos = 15
-  n->bits=4
+  n->bits = 4
 
   First, let's just ignore the bits that come before the parent tp, that is
   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
@@ -320,60 +280,65 @@ static void fn_free_alias(struct fib_alias *fa)
 
 */
 
-static void check_tnode(struct tnode *tn)
+static inline void check_tnode(const struct tnode *tn)
 {
-	if (tn && tn->pos+tn->bits > 32) {
-		printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
-	}
+	WARN_ON(tn && tn->pos+tn->bits > 32);
 }
 
 static int halve_threshold = 25;
 static int inflate_threshold = 50;
 
-static struct leaf *leaf_new(void)
+
+static void __alias_free_mem(struct rcu_head *head)
 {
-	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
-	if (l) {
-		NODE_INIT_PARENT(l, T_LEAF);
-		INIT_HLIST_HEAD(&l->list);
-	}
-	return l;
+	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
+	kmem_cache_free(fn_alias_kmem, fa);
 }
 
-static struct leaf_info *leaf_info_new(int plen)
+static inline void alias_free_mem_rcu(struct fib_alias *fa)
 {
-	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
-	if (li) {
-		li->plen = plen;
-		INIT_LIST_HEAD(&li->falh);
-	}
-	return li;
+	call_rcu(&fa->rcu, __alias_free_mem);
+}
+
+static void __leaf_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct leaf, rcu));
+}
+
+static inline void free_leaf(struct leaf *leaf)
+{
+	call_rcu(&leaf->rcu, __leaf_free_rcu);
 }
 
-static inline void free_leaf(struct leaf *l)
+static void __leaf_info_free_rcu(struct rcu_head *head)
 {
-	kfree(l);
+	kfree(container_of(head, struct leaf_info, rcu));
 }
 
-static inline void free_leaf_info(struct leaf_info *li)
+static inline void free_leaf_info(struct leaf_info *leaf)
 {
-	kfree(li);
+	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
 }
 
 static struct tnode *tnode_alloc(unsigned int size)
 {
-	if (size <= PAGE_SIZE) {
-		return kmalloc(size, GFP_KERNEL);
-	} else {
-		return (struct tnode *)
-		       __get_free_pages(GFP_KERNEL, get_order(size));
-	}
+	struct page *pages;
+
+	if (size <= PAGE_SIZE)
+		return kcalloc(size, 1, GFP_KERNEL);
+
+	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
+	if (!pages)
+		return NULL;
+
+	return page_address(pages);
 }
 
-static void __tnode_free(struct tnode *tn)
+static void __tnode_free_rcu(struct rcu_head *head)
 {
+	struct tnode *tn = container_of(head, struct tnode, rcu);
 	unsigned int size = sizeof(struct tnode) +
-			    (1<<tn->bits) * sizeof(struct node *);
+		(1 << tn->bits) * sizeof(struct node *);
 
 	if (size <= PAGE_SIZE)
 		kfree(tn);
@@ -381,15 +346,40 @@ static void __tnode_free(struct tnode *tn)
 		free_pages((unsigned long)tn, get_order(size));
 }
 
+static inline void tnode_free(struct tnode *tn)
+{
+	call_rcu(&tn->rcu, __tnode_free_rcu);
+}
+
+static struct leaf *leaf_new(void)
+{
+	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
+	if (l) {
+		l->parent = T_LEAF;
+		INIT_HLIST_HEAD(&l->list);
+	}
+	return l;
+}
+
+static struct leaf_info *leaf_info_new(int plen)
+{
+	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
+	if (li) {
+		li->plen = plen;
+		INIT_LIST_HEAD(&li->falh);
+	}
+	return li;
+}
+
 static struct tnode* tnode_new(t_key key, int pos, int bits)
 {
 	int nchildren = 1<<bits;
 	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
 	struct tnode *tn = tnode_alloc(sz);
 
 	if (tn) {
 		memset(tn, 0, sz);
-		NODE_INIT_PARENT(tn, T_TNODE);
+		tn->parent = T_TNODE;
 		tn->pos = pos;
 		tn->bits = bits;
 		tn->key = key;
@@ -397,38 +387,17 @@ static struct tnode* tnode_new(t_key key, int pos, int bits)
 		tn->empty_children = 1<<bits;
 	}
 
-	if (trie_debug > 0)
-		printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
-		       (unsigned int) (sizeof(struct node) * 1<<bits));
+	pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
+		 (unsigned int) (sizeof(struct node) * 1<<bits));
 	return tn;
 }
 
-static void tnode_free(struct tnode *tn)
-{
-	if (!tn) {
-		trie_bug("tnode_free\n");
-	}
-	if (IS_LEAF(tn)) {
-		free_leaf((struct leaf *)tn);
-		if (trie_debug > 0 )
-			printk("FL %p \n", tn);
-	}
-	else if (IS_TNODE(tn)) {
-		__tnode_free(tn);
-		if (trie_debug > 0 )
-			printk("FT %p \n", tn);
-	}
-	else {
-		trie_bug("tnode_free\n");
-	}
-}
-
 /*
  * Check whether a tnode 'n' is "full", i.e. it is an internal node
  * and no bits are skipped. See discussion in dyntree paper p. 6
  */
 
-static inline int tnode_full(struct tnode *tn, struct node *n)
+static inline int tnode_full(const struct tnode *tn, const struct node *n)
 {
 	if (n == NULL || IS_LEAF(n))
 		return 0;
@@ -448,15 +417,11 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, struct nod
 
 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
 {
-	struct node *chi;
+	struct node *chi = tn->child[i];
 	int isfull;
 
-	if (i >= 1<<tn->bits) {
-		printk("bits=%d, i=%d\n", tn->bits, i);
-		trie_bug("tnode_put_child_reorg bits");
-	}
-	write_lock_bh(&fib_lock);
-	chi = tn->child[i];
+	BUG_ON(i >= 1<<tn->bits);
+
 
 	/* update emptyChildren */
 	if (n == NULL && chi != NULL)
@@ -465,33 +430,32 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w
 		tn->empty_children--;
 
 	/* update fullChildren */
 	if (wasfull == -1)
 		wasfull = tnode_full(tn, chi);
 
 	isfull = tnode_full(tn, n);
 	if (wasfull && !isfull)
 		tn->full_children--;
-
 	else if (!wasfull && isfull)
 		tn->full_children++;
+
 	if (n)
 		NODE_SET_PARENT(n, tn);
 
-	tn->child[i] = n;
-	write_unlock_bh(&fib_lock);
+	rcu_assign_pointer(tn->child[i], n);
 }
 
 static struct node *resize(struct trie *t, struct tnode *tn)
 {
 	int i;
 	int err = 0;
+	struct tnode *old_tn;
 
 	if (!tn)
 		return NULL;
 
-	if (trie_debug)
-		printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
-		       tn, inflate_threshold, halve_threshold);
+	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
+		 tn, inflate_threshold, halve_threshold);
 
 	/* No children */
 	if (tn->empty_children == tnode_child_length(tn)) {
@@ -501,20 +465,16 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	/* One child */
 	if (tn->empty_children == tnode_child_length(tn) - 1)
 		for (i = 0; i < tnode_child_length(tn); i++) {
+			struct node *n;
 
-			write_lock_bh(&fib_lock);
-			if (tn->child[i] != NULL) {
-
-				/* compress one level */
-				struct node *n = tn->child[i];
-				if (n)
-					NODE_INIT_PARENT(n, NODE_TYPE(n));
-
-				write_unlock_bh(&fib_lock);
-				tnode_free(tn);
-				return n;
-			}
-			write_unlock_bh(&fib_lock);
+			n = tn->child[i];
+			if (!n)
+				continue;
+
+			/* compress one level */
+			NODE_SET_PARENT(n, NULL);
+			tnode_free(tn);
+			return n;
 		}
 	/*
 	 * Double as long as the resulting node has a number of
@@ -566,16 +526,16 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	 *
 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
-	 *      tn->full_children ) >= inflate_threshold * new_child_length
+	 *	  tn->full_children) >= inflate_threshold * new_child_length
 	 *
 	 * expand new_child_length:
 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
-	 *      tn->full_children ) >=
+	 *	  tn->full_children) >=
 	 *      inflate_threshold * tnode_child_length(tn) * 2
 	 *
 	 * shorten again:
 	 * 50 * (tn->full_children + tnode_child_length(tn) -
-	 *    tn->empty_children ) >= inflate_threshold *
+	 *	  tn->empty_children) >= inflate_threshold *
 	 *    tnode_child_length(tn)
 	 *
 	 */
@@ -587,9 +547,10 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
 				inflate_threshold * tnode_child_length(tn))) {
 
-		tn = inflate(t, tn, &err);
-
-		if (err) {
+		old_tn = tn;
+		tn = inflate(t, tn);
+		if (IS_ERR(tn)) {
+			tn = old_tn;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			t->stats.resize_node_skipped++;
 #endif
@@ -609,9 +570,10 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	       100 * (tnode_child_length(tn) - tn->empty_children) <
 				halve_threshold * tnode_child_length(tn)) {
 
-		tn = halve(t, tn, &err);
-
-		if (err) {
+		old_tn = tn;
+		tn = halve(t, tn);
+		if (IS_ERR(tn)) {
+			tn = old_tn;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			t->stats.resize_node_skipped++;
 #endif
@@ -621,44 +583,37 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 
 
 	/* Only one child remains */
-
 	if (tn->empty_children == tnode_child_length(tn) - 1)
 		for (i = 0; i < tnode_child_length(tn); i++) {
+			struct node *n;
 
-			write_lock_bh(&fib_lock);
-			if (tn->child[i] != NULL) {
-				/* compress one level */
-				struct node *n = tn->child[i];
-
-				if (n)
-					NODE_INIT_PARENT(n, NODE_TYPE(n));
-
-				write_unlock_bh(&fib_lock);
-				tnode_free(tn);
-				return n;
-			}
-			write_unlock_bh(&fib_lock);
+			n = tn->child[i];
+			if (!n)
+				continue;
+
+			/* compress one level */
+
+			NODE_SET_PARENT(n, NULL);
+			tnode_free(tn);
+			return n;
 		}
 
 	return (struct node *) tn;
 }
 
-static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
+static struct tnode *inflate(struct trie *t, struct tnode *tn)
 {
 	struct tnode *inode;
 	struct tnode *oldtnode = tn;
 	int olen = tnode_child_length(tn);
 	int i;
 
-	if (trie_debug)
-		printk("In inflate\n");
+	pr_debug("In inflate\n");
 
 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
 
-	if (!tn) {
-		*err = -ENOMEM;
-		return oldtnode;
-	}
+	if (!tn)
+		return ERR_PTR(-ENOMEM);
 
 	/*
 	 * Preallocate and store tnodes before the actual work so we
@@ -666,8 +621,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
 	 * fails. In case of failure we return the oldnode and inflate
 	 * of tnode is ignored.
 	 */
 
-	for(i = 0; i < olen; i++) {
+	for (i = 0; i < olen; i++) {
 		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
 
 		if (inode &&
@@ -675,46 +630,30 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
 		    inode->pos == oldtnode->pos + oldtnode->bits &&
 		    inode->bits > 1) {
 			struct tnode *left, *right;
-
 			t_key m = TKEY_GET_MASK(inode->pos, 1);
 
 			left = tnode_new(inode->key&(~m), inode->pos + 1,
 					 inode->bits - 1);
+			if (!left)
+				goto nomem;
 
-			if (!left) {
-				*err = -ENOMEM;
-				break;
-			}
-
 			right = tnode_new(inode->key|m, inode->pos + 1,
 					  inode->bits - 1);
 
 			if (!right) {
-				*err = -ENOMEM;
-				break;
+				tnode_free(left);
+				goto nomem;
 			}
 
 			put_child(t, tn, 2*i, (struct node *) left);
 			put_child(t, tn, 2*i+1, (struct node *) right);
 		}
 	}
 
-	if (*err) {
-		int size = tnode_child_length(tn);
-		int j;
-
-		for(j = 0; j < size; j++)
-			if (tn->child[j])
-				tnode_free((struct tnode *)tn->child[j]);
-
-		tnode_free(tn);
-
-		*err = -ENOMEM;
-		return oldtnode;
-	}
-
-	for(i = 0; i < olen; i++) {
+	for (i = 0; i < olen; i++) {
 		struct node *node = tnode_get_child(oldtnode, i);
+		struct tnode *left, *right;
+		int size, j;
 
 		/* An empty child */
 		if (node == NULL)
| @@ -740,76 +679,82 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) | |||
| 740 | put_child(t, tn, 2*i+1, inode->child[1]); | 679 | put_child(t, tn, 2*i+1, inode->child[1]); |
| 741 | 680 | ||
| 742 | tnode_free(inode); | 681 | tnode_free(inode); |
| 682 | continue; | ||
| 743 | } | 683 | } |
| 744 | 684 | ||
| 745 | /* An internal node with more than two children */ | 685 | /* An internal node with more than two children */ |
| 746 | else { | 686 | |
| 747 | struct tnode *left, *right; | 687 | /* We will replace this node 'inode' with two new |
| 748 | int size, j; | 688 | * ones, 'left' and 'right', each with half of the |
| 749 | 689 | * original children. The two new nodes will have | |
| 750 | /* We will replace this node 'inode' with two new | 690 | * a position one bit further down the key and this |
| 751 | * ones, 'left' and 'right', each with half of the | 691 | * means that the "significant" part of their keys |
| 752 | * original children. The two new nodes will have | 692 | * (see the discussion near the top of this file) |
| 753 | * a position one bit further down the key and this | 693 | * will differ by one bit, which will be "0" in |
| 754 | * means that the "significant" part of their keys | 694 | * left's key and "1" in right's key. Since we are |
| 755 | * (see the discussion near the top of this file) | 695 | * moving the key position by one step, the bit that |
| 756 | * will differ by one bit, which will be "0" in | 696 | * we are moving away from - the bit at position |
| 757 | * left's key and "1" in right's key. Since we are | 697 | * (inode->pos) - is the one that will differ between |
| 758 | * moving the key position by one step, the bit that | 698 | * left and right. So... we synthesize that bit in the |
| 759 | * we are moving away from - the bit at position | 699 | * two new keys. |
| 760 | * (inode->pos) - is the one that will differ between | 700 | * The mask 'm' below will be a single "one" bit at |
| 761 | * left and right. So... we synthesize that bit in the | 701 | * the position (inode->pos) |
| 762 | * two new keys. | 702 | */ |
| 763 | * The mask 'm' below will be a single "one" bit at | ||
| 764 | * the position (inode->pos) | ||
| 765 | */ | ||
| 766 | |||
| 767 | /* Use the old key, but set the new significant | ||
| 768 | * bit to zero. | ||
| 769 | */ | ||
| 770 | 703 | ||
| 771 | left = (struct tnode *) tnode_get_child(tn, 2*i); | 704 | /* Use the old key, but set the new significant |
| 772 | put_child(t, tn, 2*i, NULL); | 705 | * bit to zero. |
| 706 | */ | ||
| 773 | 707 | ||
| 774 | if (!left) | 708 | left = (struct tnode *) tnode_get_child(tn, 2*i); |
| 775 | BUG(); | 709 | put_child(t, tn, 2*i, NULL); |
| 776 | 710 | ||
| 777 | right = (struct tnode *) tnode_get_child(tn, 2*i+1); | 711 | BUG_ON(!left); |
| 778 | put_child(t, tn, 2*i+1, NULL); | ||
| 779 | 712 | ||
| 780 | if (!right) | 713 | right = (struct tnode *) tnode_get_child(tn, 2*i+1); |
| 781 | BUG(); | 714 | put_child(t, tn, 2*i+1, NULL); |
| 782 | 715 | ||
| 783 | size = tnode_child_length(left); | 716 | BUG_ON(!right); |
| 784 | for(j = 0; j < size; j++) { | ||
| 785 | put_child(t, left, j, inode->child[j]); | ||
| 786 | put_child(t, right, j, inode->child[j + size]); | ||
| 787 | } | ||
| 788 | put_child(t, tn, 2*i, resize(t, left)); | ||
| 789 | put_child(t, tn, 2*i+1, resize(t, right)); | ||
| 790 | 717 | ||
| 791 | tnode_free(inode); | 718 | size = tnode_child_length(left); |
| 719 | for (j = 0; j < size; j++) { | ||
| 720 | put_child(t, left, j, inode->child[j]); | ||
| 721 | put_child(t, right, j, inode->child[j + size]); | ||
| 792 | } | 722 | } |
| 723 | put_child(t, tn, 2*i, resize(t, left)); | ||
| 724 | put_child(t, tn, 2*i+1, resize(t, right)); | ||
| 725 | |||
| 726 | tnode_free(inode); | ||
| 793 | } | 727 | } |
| 794 | tnode_free(oldtnode); | 728 | tnode_free(oldtnode); |
| 795 | return tn; | 729 | return tn; |
| 730 | nomem: | ||
| 731 | { | ||
| 732 | int size = tnode_child_length(tn); | ||
| 733 | int j; | ||
| 734 | |||
| 735 | for (j = 0; j < size; j++) | ||
| 736 | if (tn->child[j]) | ||
| 737 | tnode_free((struct tnode *)tn->child[j]); | ||
| 738 | |||
| 739 | tnode_free(tn); | ||
| 740 | |||
| 741 | return ERR_PTR(-ENOMEM); | ||
| 742 | } | ||
| 796 | } | 743 | } |
| 797 | 744 | ||
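The rewritten inflate() grows a node from 2^bits to 2^(bits+1) children, so a child at slot i of the old node fans out to slots 2*i and 2*i+1 of the new one: widening the index by one bit simply appends the next key bit below it. A minimal userspace sketch of that index arithmetic follows; `tkey_extract_bits` is re-created here from its visible call sites, so treat the exact definition as an assumption rather than the kernel's source.

```c
#include <stdio.h>
#include <stdint.h>

#define KEYLENGTH 32 /* IPv4 keys are 32 bits wide, as in fib_trie.c */

/* Illustrative re-creation: take 'bits' bits of 'key' starting at bit
 * 'offset', counting from the most significant bit. Assumes
 * 0 < bits && offset + bits <= KEYLENGTH. */
static uint32_t tkey_extract_bits(uint32_t key, int offset, int bits)
{
	return (key << offset) >> (KEYLENGTH - bits);
}

int main(void)
{
	uint32_t key = 0xc0a80100; /* 192.168.1.0 */
	int pos = 8, bits = 3;

	/* Index into the parent before inflation... */
	uint32_t i = tkey_extract_bits(key, pos, bits);
	/* ...and after: one more bit is consumed, so old slot i
	 * fans out to new slots 2*i and 2*i + 1. */
	uint32_t j = tkey_extract_bits(key, pos, bits + 1);

	printf("old index %u -> new index %u (2*i = %u, 2*i+1 = %u)\n",
	       i, j, 2 * i, 2 * i + 1);
	return 0;
}
```

With this sample key the 3-bit index 5 widens to the 4-bit index 10, which is 2*i; slot 2*i+1 is where inflate() parks the other half of a split child.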
| 798 | static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) | 745 | static struct tnode *halve(struct trie *t, struct tnode *tn) |
| 799 | { | 746 | { |
| 800 | struct tnode *oldtnode = tn; | 747 | struct tnode *oldtnode = tn; |
| 801 | struct node *left, *right; | 748 | struct node *left, *right; |
| 802 | int i; | 749 | int i; |
| 803 | int olen = tnode_child_length(tn); | 750 | int olen = tnode_child_length(tn); |
| 804 | 751 | ||
| 805 | if (trie_debug) printk("In halve\n"); | 752 | pr_debug("In halve\n"); |
| 806 | 753 | ||
| 807 | tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); | 754 | tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); |
| 808 | 755 | ||
| 809 | if (!tn) { | 756 | if (!tn) |
| 810 | *err = -ENOMEM; | 757 | return ERR_PTR(-ENOMEM); |
| 811 | return oldtnode; | ||
| 812 | } | ||
| 813 | 758 | ||
| 814 | /* | 759 | /* |
| 815 | * Preallocate and store tnodes before the actual work so we | 760 | * Preallocate and store tnodes before the actual work so we |
| @@ -818,38 +763,27 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) | |||
| 818 | * of tnode is ignored. | 763 | * of tnode is ignored. |
| 819 | */ | 764 | */ |
| 820 | 765 | ||
| 821 | for(i = 0; i < olen; i += 2) { | 766 | for (i = 0; i < olen; i += 2) { |
| 822 | left = tnode_get_child(oldtnode, i); | 767 | left = tnode_get_child(oldtnode, i); |
| 823 | right = tnode_get_child(oldtnode, i+1); | 768 | right = tnode_get_child(oldtnode, i+1); |
| 824 | 769 | ||
| 825 | /* Two nonempty children */ | 770 | /* Two nonempty children */ |
| 826 | if (left && right) { | 771 | if (left && right) { |
| 827 | struct tnode *newBinNode = | 772 | struct tnode *newn; |
| 828 | tnode_new(left->key, tn->pos + tn->bits, 1); | ||
| 829 | 773 | ||
| 830 | if (!newBinNode) { | 774 | newn = tnode_new(left->key, tn->pos + tn->bits, 1); |
| 831 | *err = -ENOMEM; | ||
| 832 | break; | ||
| 833 | } | ||
| 834 | put_child(t, tn, i/2, (struct node *)newBinNode); | ||
| 835 | } | ||
| 836 | } | ||
| 837 | 775 | ||
| 838 | if (*err) { | 776 | if (!newn) |
| 839 | int size = tnode_child_length(tn); | 777 | goto nomem; |
| 840 | int j; | ||
| 841 | 778 | ||
| 842 | for(j = 0; j < size; j++) | 779 | put_child(t, tn, i/2, (struct node *)newn); |
| 843 | if (tn->child[j]) | 780 | } |
| 844 | tnode_free((struct tnode *)tn->child[j]); | ||
| 845 | 781 | ||
| 846 | tnode_free(tn); | ||
| 847 | |||
| 848 | *err = -ENOMEM; | ||
| 849 | return oldtnode; | ||
| 850 | } | 782 | } |
| 851 | 783 | ||
| 852 | for(i = 0; i < olen; i += 2) { | 784 | for (i = 0; i < olen; i += 2) { |
| 785 | struct tnode *newBinNode; | ||
| 786 | |||
| 853 | left = tnode_get_child(oldtnode, i); | 787 | left = tnode_get_child(oldtnode, i); |
| 854 | right = tnode_get_child(oldtnode, i+1); | 788 | right = tnode_get_child(oldtnode, i+1); |
| 855 | 789 | ||
| @@ -858,88 +792,99 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) | |||
| 858 | if (right == NULL) /* Both are empty */ | 792 | if (right == NULL) /* Both are empty */ |
| 859 | continue; | 793 | continue; |
| 860 | put_child(t, tn, i/2, right); | 794 | put_child(t, tn, i/2, right); |
| 861 | } else if (right == NULL) | 795 | continue; |
| 796 | } | ||
| 797 | |||
| 798 | if (right == NULL) { | ||
| 862 | put_child(t, tn, i/2, left); | 799 | put_child(t, tn, i/2, left); |
| 800 | continue; | ||
| 801 | } | ||
| 863 | 802 | ||
| 864 | /* Two nonempty children */ | 803 | /* Two nonempty children */ |
| 865 | else { | 804 | newBinNode = (struct tnode *) tnode_get_child(tn, i/2); |
| 866 | struct tnode *newBinNode = | 805 | put_child(t, tn, i/2, NULL); |
| 867 | (struct tnode *) tnode_get_child(tn, i/2); | 806 | put_child(t, newBinNode, 0, left); |
| 868 | put_child(t, tn, i/2, NULL); | 807 | put_child(t, newBinNode, 1, right); |
| 869 | 808 | put_child(t, tn, i/2, resize(t, newBinNode)); | |
| 870 | if (!newBinNode) | ||
| 871 | BUG(); | ||
| 872 | |||
| 873 | put_child(t, newBinNode, 0, left); | ||
| 874 | put_child(t, newBinNode, 1, right); | ||
| 875 | put_child(t, tn, i/2, resize(t, newBinNode)); | ||
| 876 | } | ||
| 877 | } | 809 | } |
| 878 | tnode_free(oldtnode); | 810 | tnode_free(oldtnode); |
| 879 | return tn; | 811 | return tn; |
| 812 | nomem: | ||
| 813 | { | ||
| 814 | int size = tnode_child_length(tn); | ||
| 815 | int j; | ||
| 816 | |||
| 817 | for (j = 0; j < size; j++) | ||
| 818 | if (tn->child[j]) | ||
| 819 | tnode_free((struct tnode *)tn->child[j]); | ||
| 820 | |||
| 821 | tnode_free(tn); | ||
| 822 | |||
| 823 | return ERR_PTR(-ENOMEM); | ||
| 824 | } | ||
| 880 | } | 825 | } |
| 881 | 826 | ||
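Both inflate() and halve() now signal allocation failure as ERR_PTR(-ENOMEM) instead of filling the removed *err out-parameter, so resize() can tell "new node" from "error" with a single return value. A hedged standalone re-creation of that encoding follows; the kernel's real macros live in linux/err.h, and this only mirrors the idea.

```c
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

/* Error codes occupy the top 4095 values of the address space, so a
 * pointer whose unsigned value falls in [-4095UL, -1UL] is "an error"
 * and anything else is a valid object. */
#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Stand-in for tnode_new(): returns a node or an encoded errno. */
static void *make_node(int fail)
{
	if (fail)
		return ERR_PTR(-ENOMEM);
	return malloc(16);
}

int main(void)
{
	void *n = make_node(1);

	if (IS_ERR(n))
		printf("allocation failed: %ld\n", PTR_ERR(n)); /* -12 */
	else
		free(n);
	return 0;
}
```

This is what lets the `int *err` plumbing disappear from the inflate/halve signatures in the hunks above.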
| 882 | static void *trie_init(struct trie *t) | 827 | static void trie_init(struct trie *t) |
| 883 | { | 828 | { |
| 884 | if (t) { | 829 | if (!t) |
| 885 | t->size = 0; | 830 | return; |
| 886 | t->trie = NULL; | 831 | |
| 887 | t->revision = 0; | 832 | t->size = 0; |
| 833 | rcu_assign_pointer(t->trie, NULL); | ||
| 834 | t->revision = 0; | ||
| 888 | #ifdef CONFIG_IP_FIB_TRIE_STATS | 835 | #ifdef CONFIG_IP_FIB_TRIE_STATS |
| 889 | memset(&t->stats, 0, sizeof(struct trie_use_stats)); | 836 | memset(&t->stats, 0, sizeof(struct trie_use_stats)); |
| 890 | #endif | 837 | #endif |
| 891 | } | ||
| 892 | return t; | ||
| 893 | } | 838 | } |
| 894 | 839 | ||
| 840 | /* the read side must use rcu_read_lock; currently only the dump | ||
| 841 | routines do, via get_fa_head and dump */ | ||
| 842 | |||
| 895 | static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen) | 843 | static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen) |
| 896 | { | 844 | { |
| 897 | struct hlist_node *node; | 845 | struct hlist_node *node; |
| 898 | struct leaf_info *li; | 846 | struct leaf_info *li; |
| 899 | 847 | ||
| 900 | hlist_for_each_entry(li, node, head, hlist) { | 848 | hlist_for_each_entry_rcu(li, node, head, hlist) |
| 901 | if (li->plen == plen) | 849 | if (li->plen == plen) |
| 902 | return li; | 850 | return li; |
| 903 | } | 851 | |
| 904 | return NULL; | 852 | return NULL; |
| 905 | } | 853 | } |
| 906 | 854 | ||
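The comment above find_leaf_info() marks the read side of the new locking scheme: lockless readers walk the structure under rcu_read_lock() using rcu_dereference(), while updaters publish with rcu_assign_pointer(). As a rough single-threaded analogy (not the kernel primitives, just the release/acquire publication ordering they provide), in portable C11:

```c
#include <stdatomic.h>
#include <stdio.h>

struct leaf_info { int plen; };

/* A shared slot: the writer publishes, readers subscribe. */
static _Atomic(struct leaf_info *) slot;

/* Writer: fully initialize the object, then publish it. The release
 * store plays the role of rcu_assign_pointer(). */
static void publish(struct leaf_info *li, int plen)
{
	li->plen = plen;
	atomic_store_explicit(&slot, li, memory_order_release);
}

/* Reader: the acquire load plays the role of rcu_dereference(); if we
 * see the pointer, we also see the fields written before it. */
static const struct leaf_info *subscribe(void)
{
	return atomic_load_explicit(&slot, memory_order_acquire);
}

int main(void)
{
	static struct leaf_info li;

	publish(&li, 24);
	const struct leaf_info *p = subscribe();
	if (p)
		printf("plen=%d\n", p->plen);
	return 0;
}
```

What the analogy leaves out is the grace period: a real updater may free a replaced object only once all pre-existing readers are done, which is why the deletion paths below defer frees through helpers like alias_free_mem_rcu().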
| 907 | static inline struct list_head * get_fa_head(struct leaf *l, int plen) | 855 | static inline struct list_head * get_fa_head(struct leaf *l, int plen) |
| 908 | { | 856 | { |
| 909 | struct list_head *fa_head = NULL; | ||
| 910 | struct leaf_info *li = find_leaf_info(&l->list, plen); | 857 | struct leaf_info *li = find_leaf_info(&l->list, plen); |
| 911 | 858 | ||
| 912 | if (li) | 859 | if (!li) |
| 913 | fa_head = &li->falh; | 860 | return NULL; |
| 914 | 861 | ||
| 915 | return fa_head; | 862 | return &li->falh; |
| 916 | } | 863 | } |
| 917 | 864 | ||
| 918 | static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) | 865 | static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) |
| 919 | { | 866 | { |
| 920 | struct leaf_info *li = NULL, *last = NULL; | 867 | struct leaf_info *li = NULL, *last = NULL; |
| 921 | struct hlist_node *node, *tmp; | 868 | struct hlist_node *node; |
| 922 | 869 | ||
| 923 | write_lock_bh(&fib_lock); | 870 | if (hlist_empty(head)) { |
| 924 | 871 | hlist_add_head_rcu(&new->hlist, head); | |
| 925 | if (hlist_empty(head)) | 872 | } else { |
| 926 | hlist_add_head(&new->hlist, head); | 873 | hlist_for_each_entry(li, node, head, hlist) { |
| 927 | else { | 874 | if (new->plen > li->plen) |
| 928 | hlist_for_each_entry_safe(li, node, tmp, head, hlist) { | 875 | break; |
| 929 | 876 | ||
| 930 | if (new->plen > li->plen) | 877 | last = li; |
| 931 | break; | 878 | } |
| 932 | 879 | if (last) | |
| 933 | last = li; | 880 | hlist_add_after_rcu(&last->hlist, &new->hlist); |
| 934 | } | 881 | else |
| 935 | if (last) | 882 | hlist_add_before_rcu(&new->hlist, &li->hlist); |
| 936 | hlist_add_after(&last->hlist, &new->hlist); | 883 | } |
| 937 | else | ||
| 938 | hlist_add_before(&new->hlist, &li->hlist); | ||
| 939 | } | ||
| 940 | write_unlock_bh(&fib_lock); | ||
| 941 | } | 884 | } |
| 942 | 885 | ||
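insert_leaf_info() keeps each leaf's list of leaf_info entries sorted by descending prefix length, remembering the last entry still longer than the new one and splicing in after it. The same bookkeeping on a plain singly linked list, as an illustration only; the kernel version uses an hlist and the _rcu insert helpers so that concurrent readers never observe a half-linked entry.

```c
#include <stdio.h>

struct li { int plen; struct li *next; };

/* Insert so the list stays sorted by descending plen, mirroring the
 * last/break bookkeeping in insert_leaf_info(). */
static void insert_sorted(struct li **head, struct li *new)
{
	struct li *last = NULL, *cur;

	for (cur = *head; cur; cur = cur->next) {
		if (new->plen > cur->plen)
			break;      /* new entry belongs before 'cur' */
		last = cur;         /* last entry with plen >= new's  */
	}

	if (last) {
		new->next = last->next; /* add after 'last' */
		last->next = new;
	} else {
		new->next = *head;      /* add at the head */
		*head = new;
	}
}

int main(void)
{
	struct li a = {24, 0}, b = {16, 0}, c = {28, 0};
	struct li *head = 0, *p;

	insert_sorted(&head, &a);
	insert_sorted(&head, &b);
	insert_sorted(&head, &c);

	for (p = head; p; p = p->next)
		printf("/%d ", p->plen); /* prints: /28 /24 /16 */
	printf("\n");
	return 0;
}
```

Most-specific-first ordering is what lets the lookup side stop at the first matching prefix length.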
| 886 | /* rcu_read_lock needs to be held by the caller on the read side */ | ||
| 887 | |||
| 943 | static struct leaf * | 888 | static struct leaf * |
| 944 | fib_find_node(struct trie *t, u32 key) | 889 | fib_find_node(struct trie *t, u32 key) |
| 945 | { | 890 | { |
| @@ -948,61 +893,43 @@ fib_find_node(struct trie *t, u32 key) | |||
| 948 | struct node *n; | 893 | struct node *n; |
| 949 | 894 | ||
| 950 | pos = 0; | 895 | pos = 0; |
| 951 | n = t->trie; | 896 | n = rcu_dereference(t->trie); |
| 952 | 897 | ||
| 953 | while (n != NULL && NODE_TYPE(n) == T_TNODE) { | 898 | while (n != NULL && NODE_TYPE(n) == T_TNODE) { |
| 954 | tn = (struct tnode *) n; | 899 | tn = (struct tnode *) n; |
| 955 | 900 | ||
| 956 | check_tnode(tn); | 901 | check_tnode(tn); |
| 957 | 902 | ||
| 958 | if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { | 903 | if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { |
| 959 | pos=tn->pos + tn->bits; | 904 | pos = tn->pos + tn->bits; |
| 960 | n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); | 905 | n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); |
| 961 | } | 906 | } else |
| 962 | else | ||
| 963 | break; | 907 | break; |
| 964 | } | 908 | } |
| 965 | /* Case: we have found a leaf. Compare prefixes */ | 909 | /* Case: we have found a leaf. Compare prefixes */ |
| 966 | 910 | ||
| 967 | if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { | 911 | if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) |
| 968 | struct leaf *l = (struct leaf *) n; | 912 | return (struct leaf *)n; |
| 969 | return l; | 913 | |
| 970 | } | ||
| 971 | return NULL; | 914 | return NULL; |
| 972 | } | 915 | } |
| 973 | 916 | ||
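fib_find_node() descends only while the bits skipped by path compression agree with the search key, then indexes the child array with the next tn->bits bits. A toy version with two-way nodes (bits == 1) follows; the node layout and helpers here are reconstructed from their uses and are simplifications, not the kernel's exact definitions.

```c
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define KEYLENGTH 32

struct tnode {
	uint32_t key;        /* bits above 'pos' are significant */
	int pos, bits;       /* this node indexes key[pos..pos+bits-1] */
	bool is_leaf;
	struct tnode *child[2];
};

static uint32_t extract(uint32_t key, int off, int bits)
{
	return (key << off) >> (KEYLENGTH - bits);
}

/* Do 'a' and 'b' agree on the skipped bits [off, off+len)? */
static bool sub_equals(uint32_t a, int off, int len, uint32_t b)
{
	return len == 0 || extract(a, off, len) == extract(b, off, len);
}

static struct tnode *find(struct tnode *n, uint32_t key)
{
	int pos = 0;

	while (n && !n->is_leaf) {
		if (!sub_equals(n->key, pos, n->pos - pos, key))
			return NULL;  /* mismatch in the skipped bits */
		pos = n->pos + n->bits;
		n = n->child[extract(key, n->pos, n->bits)];
	}
	return (n && n->key == key) ? n : NULL;
}

int main(void)
{
	struct tnode leaf = { .key = 0x80000000u, .is_leaf = true };
	struct tnode root = { .key = 0, .pos = 0, .bits = 1,
			      .child = { NULL, &leaf } };

	printf("%s\n", find(&root, 0x80000000u) ? "hit" : "miss");
	return 0;
}
```

The real tnodes hold 2^bits children, but the skip-check/descend rhythm is the same.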
| 974 | static struct node *trie_rebalance(struct trie *t, struct tnode *tn) | 917 | static struct node *trie_rebalance(struct trie *t, struct tnode *tn) |
| 975 | { | 918 | { |
| 976 | int i = 0; | ||
| 977 | int wasfull; | 919 | int wasfull; |
| 978 | t_key cindex, key; | 920 | t_key cindex, key; |
| 979 | struct tnode *tp = NULL; | 921 | struct tnode *tp = NULL; |
| 980 | 922 | ||
| 981 | if (!tn) | ||
| 982 | BUG(); | ||
| 983 | |||
| 984 | key = tn->key; | 923 | key = tn->key; |
| 985 | i = 0; | ||
| 986 | 924 | ||
| 987 | while (tn != NULL && NODE_PARENT(tn) != NULL) { | 925 | while (tn != NULL && NODE_PARENT(tn) != NULL) { |
| 988 | 926 | ||
| 989 | if (i > 10) { | ||
| 990 | printk("Rebalance tn=%p \n", tn); | ||
| 991 | if (tn) printk("tn->parent=%p \n", NODE_PARENT(tn)); | ||
| 992 | |||
| 993 | printk("Rebalance tp=%p \n", tp); | ||
| 994 | if (tp) printk("tp->parent=%p \n", NODE_PARENT(tp)); | ||
| 995 | } | ||
| 996 | |||
| 997 | if (i > 12) BUG(); | ||
| 998 | i++; | ||
| 999 | |||
| 1000 | tp = NODE_PARENT(tn); | 927 | tp = NODE_PARENT(tn); |
| 1001 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); | 928 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); |
| 1002 | wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); | 929 | wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); |
| 1003 | tn = (struct tnode *) resize (t, (struct tnode *)tn); | 930 | tn = (struct tnode *) resize (t, (struct tnode *)tn); |
| 1004 | tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); | 931 | tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); |
| 1005 | 932 | ||
| 1006 | if (!NODE_PARENT(tn)) | 933 | if (!NODE_PARENT(tn)) |
| 1007 | break; | 934 | break; |
| 1008 | 935 | ||
| @@ -1015,6 +942,8 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn) | |||
| 1015 | return (struct node*) tn; | 942 | return (struct node*) tn; |
| 1016 | } | 943 | } |
| 1017 | 944 | ||
| 945 | /* only used from the updater side */ | ||
| 946 | |||
| 1018 | static struct list_head * | 947 | static struct list_head * |
| 1019 | fib_insert_node(struct trie *t, int *err, u32 key, int plen) | 948 | fib_insert_node(struct trie *t, int *err, u32 key, int plen) |
| 1020 | { | 949 | { |
| @@ -1050,20 +979,16 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) | |||
| 1050 | 979 | ||
| 1051 | while (n != NULL && NODE_TYPE(n) == T_TNODE) { | 980 | while (n != NULL && NODE_TYPE(n) == T_TNODE) { |
| 1052 | tn = (struct tnode *) n; | 981 | tn = (struct tnode *) n; |
| 1053 | 982 | ||
| 1054 | check_tnode(tn); | 983 | check_tnode(tn); |
| 1055 | 984 | ||
| 1056 | if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { | 985 | if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { |
| 1057 | tp = tn; | 986 | tp = tn; |
| 1058 | pos=tn->pos + tn->bits; | 987 | pos = tn->pos + tn->bits; |
| 1059 | n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); | 988 | n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); |
| 1060 | 989 | ||
| 1061 | if (n && NODE_PARENT(n) != tn) { | 990 | BUG_ON(n && NODE_PARENT(n) != tn); |
| 1062 | printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); | 991 | } else |
| 1063 | BUG(); | ||
| 1064 | } | ||
| 1065 | } | ||
| 1066 | else | ||
| 1067 | break; | 992 | break; |
| 1068 | } | 993 | } |
| 1069 | 994 | ||
| @@ -1073,17 +998,15 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) | |||
| 1073 | * tp is n's (parent) ----> NULL or TNODE | 998 | * tp is n's (parent) ----> NULL or TNODE |
| 1074 | */ | 999 | */ |
| 1075 | 1000 | ||
| 1076 | if (tp && IS_LEAF(tp)) | 1001 | BUG_ON(tp && IS_LEAF(tp)); |
| 1077 | BUG(); | ||
| 1078 | |||
| 1079 | 1002 | ||
| 1080 | /* Case 1: n is a leaf. Compare prefixes */ | 1003 | /* Case 1: n is a leaf. Compare prefixes */ |
| 1081 | 1004 | ||
| 1082 | if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { | 1005 | if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { |
| 1083 | struct leaf *l = ( struct leaf *) n; | 1006 | struct leaf *l = (struct leaf *) n; |
| 1084 | 1007 | ||
| 1085 | li = leaf_info_new(plen); | 1008 | li = leaf_info_new(plen); |
| 1086 | 1009 | ||
| 1087 | if (!li) { | 1010 | if (!li) { |
| 1088 | *err = -ENOMEM; | 1011 | *err = -ENOMEM; |
| 1089 | goto err; | 1012 | goto err; |
| @@ -1113,35 +1036,29 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) | |||
| 1113 | fa_head = &li->falh; | 1036 | fa_head = &li->falh; |
| 1114 | insert_leaf_info(&l->list, li); | 1037 | insert_leaf_info(&l->list, li); |
| 1115 | 1038 | ||
| 1116 | /* Case 2: n is NULL, and will just insert a new leaf */ | ||
| 1117 | if (t->trie && n == NULL) { | 1039 | if (t->trie && n == NULL) { |
| 1040 | /* Case 2: n is NULL, and will just insert a new leaf */ | ||
| 1118 | 1041 | ||
| 1119 | NODE_SET_PARENT(l, tp); | 1042 | NODE_SET_PARENT(l, tp); |
| 1120 | |||
| 1121 | if (!tp) | ||
| 1122 | BUG(); | ||
| 1123 | 1043 | ||
| 1124 | else { | 1044 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); |
| 1125 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); | 1045 | put_child(t, (struct tnode *)tp, cindex, (struct node *)l); |
| 1126 | put_child(t, (struct tnode *)tp, cindex, (struct node *)l); | 1046 | } else { |
| 1127 | } | 1047 | /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ |
| 1128 | } | ||
| 1129 | /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ | ||
| 1130 | else { | ||
| 1131 | /* | 1048 | /* |
| 1132 | * Add a new tnode here | 1049 | * Add a new tnode here |
| 1133 | * the first tnode needs some special handling | 1050 | * the first tnode needs some special handling |
| 1134 | */ | 1051 | */ |
| 1135 | 1052 | ||
| 1136 | if (tp) | 1053 | if (tp) |
| 1137 | pos=tp->pos+tp->bits; | 1054 | pos = tp->pos+tp->bits; |
| 1138 | else | 1055 | else |
| 1139 | pos=0; | 1056 | pos = 0; |
| 1057 | |||
| 1140 | if (n) { | 1058 | if (n) { |
| 1141 | newpos = tkey_mismatch(key, pos, n->key); | 1059 | newpos = tkey_mismatch(key, pos, n->key); |
| 1142 | tn = tnode_new(n->key, newpos, 1); | 1060 | tn = tnode_new(n->key, newpos, 1); |
| 1143 | } | 1061 | } else { |
| 1144 | else { | ||
| 1145 | newpos = 0; | 1062 | newpos = 0; |
| 1146 | tn = tnode_new(key, newpos, 1); /* First tnode */ | 1063 | tn = tnode_new(key, newpos, 1); /* First tnode */ |
| 1147 | } | 1064 | } |
| @@ -1151,32 +1068,33 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) | |||
| 1151 | tnode_free((struct tnode *) l); | 1068 | tnode_free((struct tnode *) l); |
| 1152 | *err = -ENOMEM; | 1069 | *err = -ENOMEM; |
| 1153 | goto err; | 1070 | goto err; |
| 1154 | } | 1071 | } |
| 1155 | 1072 | ||
| 1156 | NODE_SET_PARENT(tn, tp); | 1073 | NODE_SET_PARENT(tn, tp); |
| 1157 | 1074 | ||
| 1158 | missbit=tkey_extract_bits(key, newpos, 1); | 1075 | missbit = tkey_extract_bits(key, newpos, 1); |
| 1159 | put_child(t, tn, missbit, (struct node *)l); | 1076 | put_child(t, tn, missbit, (struct node *)l); |
| 1160 | put_child(t, tn, 1-missbit, n); | 1077 | put_child(t, tn, 1-missbit, n); |
| 1161 | 1078 | ||
| 1162 | if (tp) { | 1079 | if (tp) { |
| 1163 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); | 1080 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); |
| 1164 | put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); | 1081 | put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); |
| 1165 | } | 1082 | } else { |
| 1166 | else { | 1083 | rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */ |
| 1167 | t->trie = (struct node*) tn; /* First tnode */ | ||
| 1168 | tp = tn; | 1084 | tp = tn; |
| 1169 | } | 1085 | } |
| 1170 | } | 1086 | } |
| 1171 | if (tp && tp->pos+tp->bits > 32) { | 1087 | |
| 1088 | if (tp && tp->pos + tp->bits > 32) | ||
| 1172 | printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", | 1089 | printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", |
| 1173 | tp, tp->pos, tp->bits, key, plen); | 1090 | tp, tp->pos, tp->bits, key, plen); |
| 1174 | } | 1091 | |
| 1175 | /* Rebalance the trie */ | 1092 | /* Rebalance the trie */ |
| 1176 | t->trie = trie_rebalance(t, tp); | 1093 | |
| 1094 | rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); | ||
| 1177 | done: | 1095 | done: |
| 1178 | t->revision++; | 1096 | t->revision++; |
| 1179 | err:; | 1097 | err: |
| 1180 | return fa_head; | 1098 | return fa_head; |
| 1181 | } | 1099 | } |
| 1182 | 1100 | ||
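When fib_insert_node() has to split, tkey_mismatch(key, pos, n->key) locates the first bit at or after pos where the new key diverges from the existing node, and a one-bit tnode is planted at that position with the old and new nodes as its two children. A standalone sketch of that computation; key_mismatch below is a hypothetical stand-in for the file's tkey_mismatch, reconstructed from how it is used.

```c
#include <stdio.h>
#include <stdint.h>

#define KEYLENGTH 32

/* Index, counting from the most significant bit, of the first bit at
 * or after 'pos' where a and b differ; KEYLENGTH if they agree.
 * Assumes 0 <= pos < KEYLENGTH. This mirrors how fib_insert_node()
 * picks 'newpos'. */
static int key_mismatch(uint32_t a, int pos, uint32_t b)
{
	uint32_t diff = (a ^ b) << pos; /* drop already-matched bits */
	int i = pos;

	while (diff && !(diff & 0x80000000u)) {
		diff <<= 1;
		i++;
	}
	return diff ? i : KEYLENGTH;
}

int main(void)
{
	/* 10.0.0.0 and 10.128.0.0 first differ at bit 8 (MSB = bit 0),
	 * so the splitting tnode would sit at pos 8 with bits = 1. */
	printf("%d\n", key_mismatch(0x0a000000u, 0, 0x0a800000u));
	return 0;
}
```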
| @@ -1204,17 +1122,18 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | |||
| 1204 | 1122 | ||
| 1205 | key = ntohl(key); | 1123 | key = ntohl(key); |
| 1206 | 1124 | ||
| 1207 | if (trie_debug) | 1125 | pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen); |
| 1208 | printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen); | ||
| 1209 | 1126 | ||
| 1210 | mask = ntohl( inet_make_mask(plen) ); | 1127 | mask = ntohl(inet_make_mask(plen)); |
| 1211 | 1128 | ||
| 1212 | if (key & ~mask) | 1129 | if (key & ~mask) |
| 1213 | return -EINVAL; | 1130 | return -EINVAL; |
| 1214 | 1131 | ||
| 1215 | key = key & mask; | 1132 | key = key & mask; |
| 1216 | 1133 | ||
| 1217 | if ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL) | 1134 | fi = fib_create_info(r, rta, nlhdr, &err); |
| 1135 | |||
| 1136 | if (!fi) | ||
| 1218 | goto err; | 1137 | goto err; |
| 1219 | 1138 | ||
| 1220 | l = fib_find_node(t, key); | 1139 | l = fib_find_node(t, key); |
| @@ -1236,8 +1155,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | |||
| 1236 | * and we need to allocate a new one of those as well. | 1155 | * and we need to allocate a new one of those as well. |
| 1237 | */ | 1156 | */ |
| 1238 | 1157 | ||
| 1239 | if (fa && | 1158 | if (fa && fa->fa_info->fib_priority == fi->fib_priority) { |
| 1240 | fa->fa_info->fib_priority == fi->fib_priority) { | ||
| 1241 | struct fib_alias *fa_orig; | 1159 | struct fib_alias *fa_orig; |
| 1242 | 1160 | ||
| 1243 | err = -EEXIST; | 1161 | err = -EEXIST; |
| @@ -1248,22 +1166,27 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | |||
| 1248 | struct fib_info *fi_drop; | 1166 | struct fib_info *fi_drop; |
| 1249 | u8 state; | 1167 | u8 state; |
| 1250 | 1168 | ||
| 1251 | write_lock_bh(&fib_lock); | 1169 | err = -ENOBUFS; |
| 1170 | new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL); | ||
| 1171 | if (new_fa == NULL) | ||
| 1172 | goto out; | ||
| 1252 | 1173 | ||
| 1253 | fi_drop = fa->fa_info; | 1174 | fi_drop = fa->fa_info; |
| 1254 | fa->fa_info = fi; | 1175 | new_fa->fa_tos = fa->fa_tos; |
| 1255 | fa->fa_type = type; | 1176 | new_fa->fa_info = fi; |
| 1256 | fa->fa_scope = r->rtm_scope; | 1177 | new_fa->fa_type = type; |
| 1178 | new_fa->fa_scope = r->rtm_scope; | ||
| 1257 | state = fa->fa_state; | 1179 | state = fa->fa_state; |
| 1258 | fa->fa_state &= ~FA_S_ACCESSED; | 1180 | new_fa->fa_state = state & ~FA_S_ACCESSED; |
| 1259 | 1181 | ||
| 1260 | write_unlock_bh(&fib_lock); | 1182 | list_replace_rcu(&fa->fa_list, &new_fa->fa_list); |
| 1183 | alias_free_mem_rcu(fa); | ||
| 1261 | 1184 | ||
| 1262 | fib_release_info(fi_drop); | 1185 | fib_release_info(fi_drop); |
| 1263 | if (state & FA_S_ACCESSED) | 1186 | if (state & FA_S_ACCESSED) |
| 1264 | rt_cache_flush(-1); | 1187 | rt_cache_flush(-1); |
| 1265 | 1188 | ||
| 1266 | goto succeeded; | 1189 | goto succeeded; |
| 1267 | } | 1190 | } |
| 1268 | /* Error if we find a perfect match which | 1191 | /* Error if we find a perfect match which |
| 1269 | * uses the same scope, type, and nexthop | 1192 | * uses the same scope, type, and nexthop |
| @@ -1285,7 +1208,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | |||
| 1285 | fa = fa_orig; | 1208 | fa = fa_orig; |
| 1286 | } | 1209 | } |
| 1287 | err = -ENOENT; | 1210 | err = -ENOENT; |
| 1288 | if (!(nlhdr->nlmsg_flags&NLM_F_CREATE)) | 1211 | if (!(nlhdr->nlmsg_flags & NLM_F_CREATE)) |
| 1289 | goto out; | 1212 | goto out; |
| 1290 | 1213 | ||
| 1291 | err = -ENOBUFS; | 1214 | err = -ENOBUFS; |
| @@ -1298,9 +1221,6 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | |||
| 1298 | new_fa->fa_type = type; | 1221 | new_fa->fa_type = type; |
| 1299 | new_fa->fa_scope = r->rtm_scope; | 1222 | new_fa->fa_scope = r->rtm_scope; |
| 1300 | new_fa->fa_state = 0; | 1223 | new_fa->fa_state = 0; |
| 1301 | #if 0 | ||
| 1302 | new_fa->dst = NULL; | ||
| 1303 | #endif | ||
| 1304 | /* | 1224 | /* |
| 1305 | * Insert new entry to the list. | 1225 | * Insert new entry to the list. |
| 1306 | */ | 1226 | */ |
| @@ -1312,12 +1232,8 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | |||
| 1312 | goto out_free_new_fa; | 1232 | goto out_free_new_fa; |
| 1313 | } | 1233 | } |
| 1314 | 1234 | ||
| 1315 | write_lock_bh(&fib_lock); | 1235 | list_add_tail_rcu(&new_fa->fa_list, |
| 1316 | 1236 | (fa ? &fa->fa_list : fa_head)); | |
| 1317 | list_add_tail(&new_fa->fa_list, | ||
| 1318 | (fa ? &fa->fa_list : fa_head)); | ||
| 1319 | |||
| 1320 | write_unlock_bh(&fib_lock); | ||
| 1321 | 1237 | ||
| 1322 | rt_cache_flush(-1); | 1238 | rt_cache_flush(-1); |
| 1323 | rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req); | 1239 | rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req); |
| @@ -1328,11 +1244,14 @@ out_free_new_fa: | |||
| 1328 | kmem_cache_free(fn_alias_kmem, new_fa); | 1244 | kmem_cache_free(fn_alias_kmem, new_fa); |
| 1329 | out: | 1245 | out: |
| 1330 | fib_release_info(fi); | 1246 | fib_release_info(fi); |
| 1331 | err:; | 1247 | err: |
| 1332 | return err; | 1248 | return err; |
| 1333 | } | 1249 | } |
| 1334 | 1250 | ||
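The NLM_F_REPLACE branch above illustrates the RCU update discipline adopted throughout this patch: a live fib_alias is never modified in place, since a reader may be traversing it; instead a fully initialized copy is built, swapped in with list_replace_rcu(), and the old one freed only after a grace period. A single-threaded sketch of copy-then-publish, where C11 atomics stand in for the kernel primitives and the immediate free() stands in for the deferred alias_free_mem_rcu():

```c
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct alias { int type, scope; };

static _Atomic(struct alias *) current;

static int replace(int type, int scope)
{
	struct alias *old = atomic_load_explicit(&current,
						 memory_order_acquire);
	struct alias *new = malloc(sizeof(*new));

	if (!new)
		return -1;           /* the kernel path returns -ENOBUFS */
	*new = *old;                 /* carry over untouched fields */
	new->type = type;
	new->scope = scope;

	/* Publish the finished copy; readers see old or new, never a
	 * half-updated object. */
	atomic_store_explicit(&current, new, memory_order_release);
	free(old); /* kernel: defer until all readers are done */
	return 0;
}

int main(void)
{
	struct alias *first = malloc(sizeof(*first));

	if (!first)
		return 1;
	first->type = 1;
	first->scope = 0;
	atomic_store(&current, first);

	replace(2, 253);
	printf("type=%d scope=%d\n", atomic_load(&current)->type,
	       atomic_load(&current)->scope);
	free(atomic_load(&current));
	return 0;
}
```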
| 1335 | static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp, | 1251 | |
| 1252 | /* should be called with rcu_read_lock */ | ||
| 1253 | static inline int check_leaf(struct trie *t, struct leaf *l, | ||
| 1254 | t_key key, int *plen, const struct flowi *flp, | ||
| 1336 | struct fib_result *res) | 1255 | struct fib_result *res) |
| 1337 | { | 1256 | { |
| 1338 | int err, i; | 1257 | int err, i; |
| @@ -1341,8 +1260,7 @@ static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *pl | |||
| 1341 | struct hlist_head *hhead = &l->list; | 1260 | struct hlist_head *hhead = &l->list; |
| 1342 | struct hlist_node *node; | 1261 | struct hlist_node *node; |
| 1343 | 1262 | ||
| 1344 | hlist_for_each_entry(li, node, hhead, hlist) { | 1263 | hlist_for_each_entry_rcu(li, node, hhead, hlist) { |
| 1345 | |||
| 1346 | i = li->plen; | 1264 | i = li->plen; |
| 1347 | mask = ntohl(inet_make_mask(i)); | 1265 | mask = ntohl(inet_make_mask(i)); |
| 1348 | if (l->key != (key & mask)) | 1266 | if (l->key != (key & mask)) |
| @@ -1370,13 +1288,17 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result | |||
| 1370 | struct node *n; | 1288 | struct node *n; |
| 1371 | struct tnode *pn; | 1289 | struct tnode *pn; |
| 1372 | int pos, bits; | 1290 | int pos, bits; |
| 1373 | t_key key=ntohl(flp->fl4_dst); | 1291 | t_key key = ntohl(flp->fl4_dst); |
| 1374 | int chopped_off; | 1292 | int chopped_off; |
| 1375 | t_key cindex = 0; | 1293 | t_key cindex = 0; |
| 1376 | int current_prefix_length = KEYLENGTH; | 1294 | int current_prefix_length = KEYLENGTH; |
| 1377 | n = t->trie; | 1295 | struct tnode *cn; |
| 1296 | t_key node_prefix, key_prefix, pref_mismatch; | ||
| 1297 | int mp; | ||
| 1298 | |||
| 1299 | rcu_read_lock(); | ||
| 1378 | 1300 | ||
| 1379 | read_lock(&fib_lock); | 1301 | n = rcu_dereference(t->trie); |
| 1380 | if (!n) | 1302 | if (!n) |
| 1381 | goto failed; | 1303 | goto failed; |
| 1382 | 1304 | ||
| @@ -1393,8 +1315,7 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result | |||
| 1393 | pn = (struct tnode *) n; | 1315 | pn = (struct tnode *) n; |
| 1394 | chopped_off = 0; | 1316 | chopped_off = 0; |
| 1395 | 1317 | ||
| 1396 | while (pn) { | 1318 | while (pn) { |
| 1397 | |||
| 1398 | pos = pn->pos; | 1319 | pos = pn->pos; |
| 1399 | bits = pn->bits; | 1320 | bits = pn->bits; |
| 1400 | 1321 | ||
| @@ -1410,130 +1331,129 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result | |||
| 1410 | goto backtrace; | 1331 | goto backtrace; |
| 1411 | } | 1332 | } |
| 1412 | 1333 | ||
| 1413 | if (IS_TNODE(n)) { | 1334 | if (IS_LEAF(n)) { |
| 1335 | if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) | ||
| 1336 | goto found; | ||
| 1337 | else | ||
| 1338 | goto backtrace; | ||
| 1339 | } | ||
| 1340 | |||
| 1414 | #define HL_OPTIMIZE | 1341 | #define HL_OPTIMIZE |
| 1415 | #ifdef HL_OPTIMIZE | 1342 | #ifdef HL_OPTIMIZE |
| 1416 | struct tnode *cn = (struct tnode *)n; | 1343 | cn = (struct tnode *)n; |
| 1417 | t_key node_prefix, key_prefix, pref_mismatch; | ||
| 1418 | int mp; | ||
| 1419 | 1344 | ||
| 1420 | /* | 1345 | /* |
| 1421 | * It's a tnode, and we can do some extra checks here if we | 1346 | * It's a tnode, and we can do some extra checks here if we |
| 1422 | * like, to avoid descending into a dead-end branch. | 1347 | * like, to avoid descending into a dead-end branch. |
| 1423 | * This tnode is in the parent's child array at index | 1348 | * This tnode is in the parent's child array at index |
| 1424 | * key[p_pos..p_pos+p_bits] but potentially with some bits | 1349 | * key[p_pos..p_pos+p_bits] but potentially with some bits |
| 1425 | * chopped off, so in reality the index may be just a | 1350 | * chopped off, so in reality the index may be just a |
| 1426 | * subprefix, padded with zero at the end. | 1351 | * subprefix, padded with zero at the end. |
| 1427 | * We can also take a look at any skipped bits in this | 1352 | * We can also take a look at any skipped bits in this |
| 1428 | * tnode - everything up to p_pos is supposed to be ok, | 1353 | * tnode - everything up to p_pos is supposed to be ok, |
| 1429 | * and the non-chopped bits of the index (see previous | 1354 | * and the non-chopped bits of the index (see previous |
| 1430 | * paragraph) are also guaranteed ok, but the rest is | 1355 | * paragraph) are also guaranteed ok, but the rest is |
| 1431 | * considered unknown. | 1356 | * considered unknown. |
| 1432 | * | 1357 | * |
| 1433 | * The skipped bits are key[pos+bits..cn->pos]. | 1358 | * The skipped bits are key[pos+bits..cn->pos]. |
| 1434 | */ | 1359 | */ |
| 1435 | |||
| 1436 | /* If current_prefix_length < pos+bits, we are already doing | ||
| 1437 | * actual prefix matching, which means everything from | ||
| 1438 | * pos+(bits-chopped_off) onward must be zero along some | ||
| 1439 | * branch of this subtree - otherwise there is *no* valid | ||
| 1440 | * prefix present. Here we can only check the skipped | ||
| 1441 | * bits. Remember, since we have already indexed into the | ||
| 1442 | * parent's child array, we know that the bits we chopped off | ||
| 1443 | * *are* zero. | ||
| 1444 | */ | ||
| 1445 | 1360 | ||
| 1446 | /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ | 1361 | /* If current_prefix_length < pos+bits, we are already doing |
| 1447 | 1362 | * actual prefix matching, which means everything from | |
| 1448 | if (current_prefix_length < pos+bits) { | 1363 | * pos+(bits-chopped_off) onward must be zero along some |
| 1449 | if (tkey_extract_bits(cn->key, current_prefix_length, | 1364 | * branch of this subtree - otherwise there is *no* valid |
| 1450 | cn->pos - current_prefix_length) != 0 || | 1365 | * prefix present. Here we can only check the skipped |
| 1451 | !(cn->child[0])) | 1366 | * bits. Remember, since we have already indexed into the |
| 1452 | goto backtrace; | 1367 | * parent's child array, we know that the bits we chopped off |
| 1453 | } | 1368 | * *are* zero. |
| 1369 | */ | ||
| 1454 | 1370 | ||
| 1455 | /* | 1371 | /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ |
| 1456 | * If chopped_off=0, the index is fully validated and we | ||
| 1457 | * only need to look at the skipped bits for this, the new, | ||
| 1458 | * tnode. What we actually want to do is to find out if | ||
| 1459 | * these skipped bits match our key perfectly, or if we will | ||
| 1460 | * have to count on finding a matching prefix further down, | ||
| 1461 | * because if we do, we would like to have some way of | ||
| 1462 | * verifying the existence of such a prefix at this point. | ||
| 1463 | */ | ||
| 1464 | 1372 | ||
| 1465 | /* The only thing we can do at this point is to verify that | 1373 | if (current_prefix_length < pos+bits) { |
| 1466 | * any such matching prefix can indeed be a prefix to our | 1374 | if (tkey_extract_bits(cn->key, current_prefix_length, |
| 1467 | * key, and if the bits in the node we are inspecting that | 1375 | cn->pos - current_prefix_length) != 0 || |
| 1468 | * do not match our key are not ZERO, this cannot be true. | 1376 | !(cn->child[0])) |
| 1469 | * Thus, find out where there is a mismatch (before cn->pos) | 1377 | goto backtrace; |
| 1470 | * and verify that all the mismatching bits are zero in the | 1378 | } |
| 1471 | * new tnode's key. | ||
| 1472 | */ | ||
| 1473 | 1379 | ||
| 1474 | /* Note: We aren't very concerned about the piece of the key | 1380 | /* |
| 1475 | * that precedes pn->pos+pn->bits, since these have already been | 1381 | * If chopped_off=0, the index is fully validated and we |
| 1476 | * checked. The bits after cn->pos aren't checked since these are | 1382 | * only need to look at the skipped bits for this, the new, |
| 1477 | * by definition "unknown" at this point. Thus, what we want to | 1383 | * tnode. What we actually want to do is to find out if |
| 1478 | * see is if we are about to enter the "prefix matching" state, | 1384 | * these skipped bits match our key perfectly, or if we will |
| 1479 | * and in that case verify that the skipped bits that will prevail | 1385 | * have to count on finding a matching prefix further down, |
| 1480 | * throughout this subtree are zero, as they have to be if we are | 1386 | * because if we do, we would like to have some way of |
| 1481 | * to find a matching prefix. | 1387 | * verifying the existence of such a prefix at this point. |
| 1482 | */ | 1388 | */ |
| 1483 | 1389 | ||
| 1484 | node_prefix = MASK_PFX(cn->key, cn->pos); | 1390 | /* The only thing we can do at this point is to verify that |
| 1485 | key_prefix = MASK_PFX(key, cn->pos); | 1391 | * any such matching prefix can indeed be a prefix to our |
| 1486 | pref_mismatch = key_prefix^node_prefix; | 1392 | * key, and if the bits in the node we are inspecting that |
| 1487 | mp = 0; | 1393 | * do not match our key are not ZERO, this cannot be true. |
| 1394 | * Thus, find out where there is a mismatch (before cn->pos) | ||
| 1395 | * and verify that all the mismatching bits are zero in the | ||
| 1396 | * new tnode's key. | ||
| 1397 | */ | ||
| 1488 | 1398 | ||
| 1489 | /* In short: If skipped bits in this node do not match the search | 1399 | /* Note: We aren't very concerned about the piece of the key |
| 1490 | * key, enter the "prefix matching" state.directly. | 1400 | * that precede pn->pos+pn->bits, since these have already been |
| 1491 | */ | 1401 | * checked. The bits after cn->pos aren't checked since these are |
| 1492 | if (pref_mismatch) { | 1402 | * by definition "unknown" at this point. Thus, what we want to |
| 1493 | while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { | 1403 | * see is if we are about to enter the "prefix matching" state, |
| 1494 | mp++; | 1404 | * and in that case verify that the skipped bits that will prevail |
| 1495 | pref_mismatch = pref_mismatch <<1; | 1405 | * throughout this subtree are zero, as they have to be if we are |
| 1496 | } | 1406 | * to find a matching prefix. |
| 1497 | key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); | 1407 | */ |
| 1498 | 1408 | ||
| 1499 | if (key_prefix != 0) | 1409 | node_prefix = MASK_PFX(cn->key, cn->pos); |
| 1500 | goto backtrace; | 1410 | key_prefix = MASK_PFX(key, cn->pos); |
| 1501 | 1411 | pref_mismatch = key_prefix^node_prefix; | |
| 1502 | if (current_prefix_length >= cn->pos) | 1412 | mp = 0; |
| 1503 | current_prefix_length=mp; | 1413 | |
| 1504 | } | 1414 | /* In short: If skipped bits in this node do not match the search |
| 1505 | #endif | 1415 | * key, enter the "prefix matching" state directly. |
| 1506 | pn = (struct tnode *)n; /* Descend */ | 1416 | */ |
| 1507 | chopped_off = 0; | 1417 | if (pref_mismatch) { |
| 1508 | continue; | 1418 | while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { |
| 1419 | mp++; | ||
| 1420 | pref_mismatch = pref_mismatch <<1; | ||
| 1421 | } | ||
| 1422 | key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); | ||
| 1423 | |||
| 1424 | if (key_prefix != 0) | ||
| 1425 | goto backtrace; | ||
| 1426 | |||
| 1427 | if (current_prefix_length >= cn->pos) | ||
| 1428 | current_prefix_length = mp; | ||
| 1509 | } | 1429 | } |
| 1510 | if (IS_LEAF(n)) { | 1430 | #endif |
| 1511 | if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) | 1431 | pn = (struct tnode *)n; /* Descend */ |
| 1512 | goto found; | 1432 | chopped_off = 0; |
| 1513 | } | 1433 | continue; |
| 1434 | |||
| 1514 | backtrace: | 1435 | backtrace: |
| 1515 | chopped_off++; | 1436 | chopped_off++; |
| 1516 | 1437 | ||
| 1517 | /* As zeros don't change the child key (cindex) */ | 1438 | /* As zeros don't change the child key (cindex) */ |
| 1518 | while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) { | 1439 | while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) |
| 1519 | chopped_off++; | 1440 | chopped_off++; |
| 1520 | } | ||
| 1521 | 1441 | ||
| 1522 | /* Decrease current_... with bits chopped off */ | 1442 | /* Decrease current_... with bits chopped off */ |
| 1523 | if (current_prefix_length > pn->pos + pn->bits - chopped_off) | 1443 | if (current_prefix_length > pn->pos + pn->bits - chopped_off) |
| 1524 | current_prefix_length = pn->pos + pn->bits - chopped_off; | 1444 | current_prefix_length = pn->pos + pn->bits - chopped_off; |
| 1525 | 1445 | ||
| 1526 | /* | 1446 | /* |
| 1527 | * Either we do the actual chop off, or, if we have | 1447 | * Either we do the actual chop off, or, if we have |
| 1528 | * chopped off all bits in this tnode, we walk up to our parent. | 1448 | * chopped off all bits in this tnode, we walk up to our parent. |
| 1529 | */ | 1449 | */ |
| 1530 | 1450 | ||
| 1531 | if (chopped_off <= pn->bits) | 1451 | if (chopped_off <= pn->bits) { |
| 1532 | cindex &= ~(1 << (chopped_off-1)); | 1452 | cindex &= ~(1 << (chopped_off-1)); |
| 1533 | else { | 1453 | } else { |
| 1534 | if (NODE_PARENT(pn) == NULL) | 1454 | if (NODE_PARENT(pn) == NULL) |
| 1535 | goto failed; | 1455 | goto failed; |
| 1536 | 1456 | ||
| 1537 | /* Get Child's index */ | 1457 | /* Get Child's index */ |
| 1538 | cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits); | 1458 | cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits); |
| 1539 | pn = NODE_PARENT(pn); | 1459 | pn = NODE_PARENT(pn); |
| @@ -1548,10 +1468,11 @@ backtrace: | |||
| 1548 | failed: | 1468 | failed: |
| 1549 | ret = 1; | 1469 | ret = 1; |
| 1550 | found: | 1470 | found: |
| 1551 | read_unlock(&fib_lock); | 1471 | rcu_read_unlock(); |
| 1552 | return ret; | 1472 | return ret; |
| 1553 | } | 1473 | } |
| 1554 | 1474 | ||
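In the HL_OPTIMIZE block of fn_trie_lookup(), pref_mismatch isolates the bits where the search key and the node's prefix disagree, and the shift loop walks the highest set bit up to the MSB to find its index mp. A small standalone check of that arithmetic; the __builtin_clz line is a GCC/Clang-specific shortcut mentioned only as an aside, not something the kernel code relies on here.

```c
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t node_prefix = 0xc0a80000u; /* node's significant bits */
	uint32_t key_prefix  = 0xc0a90000u; /* search key's same bits  */
	uint32_t pref_mismatch = key_prefix ^ node_prefix;
	int mp = 0;

	/* The loop from the lookup above: shift until the highest
	 * mismatching bit reaches the MSB; mp is then its index. */
	if (pref_mismatch) {
		while (!(pref_mismatch & (1u << 31))) {
			mp++;
			pref_mismatch <<= 1;
		}
	}

	/* Equivalent count-leading-zeros form (nonzero input only). */
	int mp2 = __builtin_clz(key_prefix ^ node_prefix);

	printf("mp=%d mp2=%d\n", mp, mp2); /* both print 15 */
	return 0;
}
```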
| 1475 | /* only called from updater side */ | ||
| 1555 | static int trie_leaf_remove(struct trie *t, t_key key) | 1476 | static int trie_leaf_remove(struct trie *t, t_key key) |
| 1556 | { | 1477 | { |
| 1557 | t_key cindex; | 1478 | t_key cindex; |
| @@ -1559,24 +1480,20 @@ static int trie_leaf_remove(struct trie *t, t_key key) | |||
| 1559 | struct node *n = t->trie; | 1480 | struct node *n = t->trie; |
| 1560 | struct leaf *l; | 1481 | struct leaf *l; |
| 1561 | 1482 | ||
| 1562 | if (trie_debug) | 1483 | pr_debug("entering trie_leaf_remove(%p)\n", n); |
| 1563 | printk("entering trie_leaf_remove(%p)\n", n); | ||
| 1564 | 1484 | ||
| 1565 | /* Note that in the case skipped bits, those bits are *not* checked! | 1485 | /* Note that in the case skipped bits, those bits are *not* checked! |
| 1566 | * When we finish this, we will have NULL or a T_LEAF, and the | 1486 | * When we finish this, we will have NULL or a T_LEAF, and the |
| 1567 | * T_LEAF may or may not match our key. | 1487 | * T_LEAF may or may not match our key. |
| 1568 | */ | 1488 | */ |
| 1569 | 1489 | ||
| 1570 | while (n != NULL && IS_TNODE(n)) { | 1490 | while (n != NULL && IS_TNODE(n)) { |
| 1571 | struct tnode *tn = (struct tnode *) n; | 1491 | struct tnode *tn = (struct tnode *) n; |
| 1572 | check_tnode(tn); | 1492 | check_tnode(tn); |
| 1573 | n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); | 1493 | n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); |
| 1574 | 1494 | ||
| 1575 | if (n && NODE_PARENT(n) != tn) { | 1495 | BUG_ON(n && NODE_PARENT(n) != tn); |
| 1576 | printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); | 1496 | } |
| 1577 | BUG(); | ||
| 1578 | } | ||
| 1579 | } | ||
| 1580 | l = (struct leaf *) n; | 1497 | l = (struct leaf *) n; |
| 1581 | 1498 | ||
| 1582 | if (!n || !tkey_equals(l->key, key)) | 1499 | if (!n || !tkey_equals(l->key, key)) |
| @@ -1590,23 +1507,24 @@ static int trie_leaf_remove(struct trie *t, t_key key) | |||
| 1590 | t->revision++; | 1507 | t->revision++; |
| 1591 | t->size--; | 1508 | t->size--; |
| 1592 | 1509 | ||
| 1510 | preempt_disable(); | ||
| 1593 | tp = NODE_PARENT(n); | 1511 | tp = NODE_PARENT(n); |
| 1594 | tnode_free((struct tnode *) n); | 1512 | tnode_free((struct tnode *) n); |
| 1595 | 1513 | ||
| 1596 | if (tp) { | 1514 | if (tp) { |
| 1597 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); | 1515 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); |
| 1598 | put_child(t, (struct tnode *)tp, cindex, NULL); | 1516 | put_child(t, (struct tnode *)tp, cindex, NULL); |
| 1599 | t->trie = trie_rebalance(t, tp); | 1517 | rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); |
| 1600 | } | 1518 | } else |
| 1601 | else | 1519 | rcu_assign_pointer(t->trie, NULL); |
| 1602 | t->trie = NULL; | 1520 | preempt_enable(); |
| 1603 | 1521 | ||
| 1604 | return 1; | 1522 | return 1; |
| 1605 | } | 1523 | } |
| 1606 | 1524 | ||
| 1607 | static int | 1525 | static int |
| 1608 | fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | 1526 | fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, |
| 1609 | struct nlmsghdr *nlhdr, struct netlink_skb_parms *req) | 1527 | struct nlmsghdr *nlhdr, struct netlink_skb_parms *req) |
| 1610 | { | 1528 | { |
| 1611 | struct trie *t = (struct trie *) tb->tb_data; | 1529 | struct trie *t = (struct trie *) tb->tb_data; |
| 1612 | u32 key, mask; | 1530 | u32 key, mask; |
| @@ -1615,6 +1533,8 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | |||
| 1615 | struct fib_alias *fa, *fa_to_delete; | 1533 | struct fib_alias *fa, *fa_to_delete; |
| 1616 | struct list_head *fa_head; | 1534 | struct list_head *fa_head; |
| 1617 | struct leaf *l; | 1535 | struct leaf *l; |
| 1536 | struct leaf_info *li; | ||
| 1537 | |||
| 1618 | 1538 | ||
| 1619 | if (plen > 32) | 1539 | if (plen > 32) |
| 1620 | return -EINVAL; | 1540 | return -EINVAL; |
| @@ -1624,7 +1544,7 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | |||
| 1624 | memcpy(&key, rta->rta_dst, 4); | 1544 | memcpy(&key, rta->rta_dst, 4); |
| 1625 | 1545 | ||
| 1626 | key = ntohl(key); | 1546 | key = ntohl(key); |
| 1627 | mask = ntohl( inet_make_mask(plen) ); | 1547 | mask = ntohl(inet_make_mask(plen)); |
| 1628 | 1548 | ||
| 1629 | if (key & ~mask) | 1549 | if (key & ~mask) |
| 1630 | return -EINVAL; | 1550 | return -EINVAL; |
| @@ -1641,11 +1561,11 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | |||
| 1641 | if (!fa) | 1561 | if (!fa) |
| 1642 | return -ESRCH; | 1562 | return -ESRCH; |
| 1643 | 1563 | ||
| 1644 | if (trie_debug) | 1564 | pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); |
| 1645 | printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); | ||
| 1646 | 1565 | ||
| 1647 | fa_to_delete = NULL; | 1566 | fa_to_delete = NULL; |
| 1648 | fa_head = fa->fa_list.prev; | 1567 | fa_head = fa->fa_list.prev; |
| 1568 | |||
| 1649 | list_for_each_entry(fa, fa_head, fa_list) { | 1569 | list_for_each_entry(fa, fa_head, fa_list) { |
| 1650 | struct fib_info *fi = fa->fa_info; | 1570 | struct fib_info *fi = fa->fa_info; |
| 1651 | 1571 | ||
| @@ -1664,39 +1584,31 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | |||
| 1664 | } | 1584 | } |
| 1665 | } | 1585 | } |
| 1666 | 1586 | ||
| 1667 | if (fa_to_delete) { | 1587 | if (!fa_to_delete) |
| 1668 | int kill_li = 0; | 1588 | return -ESRCH; |
| 1669 | struct leaf_info *li; | ||
| 1670 | |||
| 1671 | fa = fa_to_delete; | ||
| 1672 | rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req); | ||
| 1673 | 1589 | ||
| 1674 | l = fib_find_node(t, key); | 1590 | fa = fa_to_delete; |
| 1675 | li = find_leaf_info(&l->list, plen); | 1591 | rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req); |
| 1676 | 1592 | ||
| 1677 | write_lock_bh(&fib_lock); | 1593 | l = fib_find_node(t, key); |
| 1594 | li = find_leaf_info(&l->list, plen); | ||
| 1678 | 1595 | ||
| 1679 | list_del(&fa->fa_list); | 1596 | list_del_rcu(&fa->fa_list); |
| 1680 | 1597 | ||
| 1681 | if (list_empty(fa_head)) { | 1598 | if (list_empty(fa_head)) { |
| 1682 | hlist_del(&li->hlist); | 1599 | hlist_del_rcu(&li->hlist); |
| 1683 | kill_li = 1; | 1600 | free_leaf_info(li); |
| 1684 | } | 1601 | } |
| 1685 | write_unlock_bh(&fib_lock); | ||
| 1686 | |||
| 1687 | if (kill_li) | ||
| 1688 | free_leaf_info(li); | ||
| 1689 | 1602 | ||
| 1690 | if (hlist_empty(&l->list)) | 1603 | if (hlist_empty(&l->list)) |
| 1691 | trie_leaf_remove(t, key); | 1604 | trie_leaf_remove(t, key); |
| 1692 | 1605 | ||
| 1693 | if (fa->fa_state & FA_S_ACCESSED) | 1606 | if (fa->fa_state & FA_S_ACCESSED) |
| 1694 | rt_cache_flush(-1); | 1607 | rt_cache_flush(-1); |
| 1695 | 1608 | ||
| 1696 | fn_free_alias(fa); | 1609 | fib_release_info(fa->fa_info); |
| 1697 | return 0; | 1610 | alias_free_mem_rcu(fa); |
| 1698 | } | 1611 | return 0; |
| 1699 | return -ESRCH; | ||
| 1700 | } | 1612 | } |
| 1701 | 1613 | ||
| 1702 | static int trie_flush_list(struct trie *t, struct list_head *head) | 1614 | static int trie_flush_list(struct trie *t, struct list_head *head) |
| @@ -1706,14 +1618,11 @@ static int trie_flush_list(struct trie *t, struct list_head *head) | |||
| 1706 | 1618 | ||
| 1707 | list_for_each_entry_safe(fa, fa_node, head, fa_list) { | 1619 | list_for_each_entry_safe(fa, fa_node, head, fa_list) { |
| 1708 | struct fib_info *fi = fa->fa_info; | 1620 | struct fib_info *fi = fa->fa_info; |
| 1709 | |||
| 1710 | if (fi && (fi->fib_flags&RTNH_F_DEAD)) { | ||
| 1711 | |||
| 1712 | write_lock_bh(&fib_lock); | ||
| 1713 | list_del(&fa->fa_list); | ||
| 1714 | write_unlock_bh(&fib_lock); | ||
| 1715 | 1621 | ||
| 1716 | fn_free_alias(fa); | 1622 | if (fi && (fi->fib_flags & RTNH_F_DEAD)) { |
| 1623 | list_del_rcu(&fa->fa_list); | ||
| 1624 | fib_release_info(fa->fa_info); | ||
| 1625 | alias_free_mem_rcu(fa); | ||
| 1717 | found++; | 1626 | found++; |
| 1718 | } | 1627 | } |
| 1719 | } | 1628 | } |
| @@ -1728,37 +1637,34 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l) | |||
| 1728 | struct leaf_info *li = NULL; | 1637 | struct leaf_info *li = NULL; |
| 1729 | 1638 | ||
| 1730 | hlist_for_each_entry_safe(li, node, tmp, lih, hlist) { | 1639 | hlist_for_each_entry_safe(li, node, tmp, lih, hlist) { |
| 1731 | |||
| 1732 | found += trie_flush_list(t, &li->falh); | 1640 | found += trie_flush_list(t, &li->falh); |
| 1733 | 1641 | ||
| 1734 | if (list_empty(&li->falh)) { | 1642 | if (list_empty(&li->falh)) { |
| 1735 | 1643 | hlist_del_rcu(&li->hlist); | |
| 1736 | write_lock_bh(&fib_lock); | ||
| 1737 | hlist_del(&li->hlist); | ||
| 1738 | write_unlock_bh(&fib_lock); | ||
| 1739 | |||
| 1740 | free_leaf_info(li); | 1644 | free_leaf_info(li); |
| 1741 | } | 1645 | } |
| 1742 | } | 1646 | } |
| 1743 | return found; | 1647 | return found; |
| 1744 | } | 1648 | } |
| 1745 | 1649 | ||
| 1650 | /* rcu_read_lock needs to be held by the caller on the read side */ | ||
| 1651 | |||
| 1746 | static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) | 1652 | static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) |
| 1747 | { | 1653 | { |
| 1748 | struct node *c = (struct node *) thisleaf; | 1654 | struct node *c = (struct node *) thisleaf; |
| 1749 | struct tnode *p; | 1655 | struct tnode *p; |
| 1750 | int idx; | 1656 | int idx; |
| 1657 | struct node *trie = rcu_dereference(t->trie); | ||
| 1751 | 1658 | ||
| 1752 | if (c == NULL) { | 1659 | if (c == NULL) { |
| 1753 | if (t->trie == NULL) | 1660 | if (trie == NULL) |
| 1754 | return NULL; | 1661 | return NULL; |
| 1755 | 1662 | ||
| 1756 | if (IS_LEAF(t->trie)) /* trie w. just a leaf */ | 1663 | if (IS_LEAF(trie)) /* trie w. just a leaf */ |
| 1757 | return (struct leaf *) t->trie; | 1664 | return (struct leaf *) trie; |
| 1758 | 1665 | ||
| 1759 | p = (struct tnode*) t->trie; /* Start */ | 1666 | p = (struct tnode*) trie; /* Start */ |
| 1760 | } | 1667 | } else |
| 1761 | else | ||
| 1762 | p = (struct tnode *) NODE_PARENT(c); | 1668 | p = (struct tnode *) NODE_PARENT(c); |
| 1763 | 1669 | ||
| 1764 | while (p) { | 1670 | while (p) { |
| @@ -1771,29 +1677,31 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) | |||
| 1771 | pos = 0; | 1677 | pos = 0; |
| 1772 | 1678 | ||
| 1773 | last = 1 << p->bits; | 1679 | last = 1 << p->bits; |
| 1774 | for(idx = pos; idx < last ; idx++) { | 1680 | for (idx = pos; idx < last ; idx++) { |
| 1775 | if (p->child[idx]) { | 1681 | c = rcu_dereference(p->child[idx]); |
| 1776 | 1682 | ||
| 1777 | /* Descend if tnode */ | 1683 | if (!c) |
| 1778 | 1684 | continue; | |
| 1779 | while (IS_TNODE(p->child[idx])) { | 1685 | |
| 1780 | p = (struct tnode*) p->child[idx]; | 1686 | /* Descend if tnode */ |
| 1781 | idx = 0; | 1687 | while (IS_TNODE(c)) { |
| 1782 | 1688 | p = (struct tnode *) c; | |
| 1783 | /* Rightmost non-NULL branch */ | 1689 | idx = 0; |
| 1784 | if (p && IS_TNODE(p)) | 1690 | |
| 1785 | while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++; | 1691 | /* Rightmost non-NULL branch */ |
| 1786 | 1692 | if (p && IS_TNODE(p)) | |
| 1787 | /* Done with this tnode? */ | 1693 | while (!(c = rcu_dereference(p->child[idx])) |
| 1788 | if (idx >= (1 << p->bits) || p->child[idx] == NULL ) | 1694 | && idx < (1<<p->bits)) idx++; |
| 1789 | goto up; | 1695 | |
| 1790 | } | 1696 | /* Done with this tnode? */ |
| 1791 | return (struct leaf*) p->child[idx]; | 1697 | if (idx >= (1 << p->bits) || !c) |
| 1698 | goto up; | ||
| 1792 | } | 1699 | } |
| 1700 | return (struct leaf *) c; | ||
| 1793 | } | 1701 | } |
| 1794 | up: | 1702 | up: |
| 1795 | /* No more children go up one step */ | 1703 | /* No more children go up one step */ |
| 1796 | c = (struct node*) p; | 1704 | c = (struct node *) p; |
| 1797 | p = (struct tnode *) NODE_PARENT(p); | 1705 | p = (struct tnode *) NODE_PARENT(p); |
| 1798 | } | 1706 | } |
| 1799 | return NULL; /* Ready. Root of trie */ | 1707 | return NULL; /* Ready. Root of trie */ |
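nextleaf() walks to the next leaf without recursion or an explicit stack: climb to the parent, resume scanning its child array just past the slot we came from, and dive to the leftmost leaf of the next non-empty subtree. The same shape on a toy k-ary tree with parent pointers; the RCU child dereferences and the start-from-root case for c == NULL are deliberately left out.

```c
#include <stdio.h>

#define FANOUT 4

struct node {
	struct node *parent;
	struct node *child[FANOUT];
	int idx_in_parent; /* which parent slot holds us */
	int is_leaf;
	int value;
};

/* Leftmost leaf under n, or NULL if the subtree is empty. */
static struct node *leftmost_leaf(struct node *n)
{
	while (n && !n->is_leaf) {
		struct node *next = NULL;
		for (int j = 0; j < FANOUT; j++)
			if (n->child[j]) { next = n->child[j]; break; }
		n = next;
	}
	return n;
}

/* Next leaf in left-to-right order, stackless like nextleaf(). */
static struct node *next_leaf(struct node *c)
{
	struct node *p = c->parent;
	int idx = c->idx_in_parent + 1;

	while (p) {
		for (; idx < FANOUT; idx++) {
			struct node *l = leftmost_leaf(p->child[idx]);
			if (l)
				return l;
		}
		/* Exhausted this node: resume one level up, just past
		 * p's own slot in its parent. */
		idx = p->idx_in_parent + 1;
		p = p->parent;
	}
	return NULL; /* past the last leaf */
}

int main(void)
{
	struct node root = { 0 };
	struct node a = { .parent = &root, .idx_in_parent = 0,
			  .is_leaf = 1, .value = 1 };
	struct node b = { .parent = &root, .idx_in_parent = 2,
			  .is_leaf = 1, .value = 2 };

	root.child[0] = &a;
	root.child[2] = &b;

	for (struct node *l = &a; l; l = next_leaf(l))
		printf("%d ", l->value); /* prints: 1 2 */
	printf("\n");
	return 0;
}
```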
| @@ -1807,23 +1715,24 @@ static int fn_trie_flush(struct fib_table *tb) | |||
| 1807 | 1715 | ||
| 1808 | t->revision++; | 1716 | t->revision++; |
| 1809 | 1717 | ||
| 1810 | for (h=0; (l = nextleaf(t, l)) != NULL; h++) { | 1718 | rcu_read_lock(); |
| 1719 | for (h = 0; (l = nextleaf(t, l)) != NULL; h++) { | ||
| 1811 | found += trie_flush_leaf(t, l); | 1720 | found += trie_flush_leaf(t, l); |
| 1812 | 1721 | ||
| 1813 | if (ll && hlist_empty(&ll->list)) | 1722 | if (ll && hlist_empty(&ll->list)) |
| 1814 | trie_leaf_remove(t, ll->key); | 1723 | trie_leaf_remove(t, ll->key); |
| 1815 | ll = l; | 1724 | ll = l; |
| 1816 | } | 1725 | } |
| 1726 | rcu_read_unlock(); | ||
| 1817 | 1727 | ||
| 1818 | if (ll && hlist_empty(&ll->list)) | 1728 | if (ll && hlist_empty(&ll->list)) |
| 1819 | trie_leaf_remove(t, ll->key); | 1729 | trie_leaf_remove(t, ll->key); |
| 1820 | 1730 | ||
| 1821 | if (trie_debug) | 1731 | pr_debug("trie_flush found=%d\n", found); |
| 1822 | printk("trie_flush found=%d\n", found); | ||
| 1823 | return found; | 1732 | return found; |
| 1824 | } | 1733 | } |
| 1825 | 1734 | ||
| 1826 | static int trie_last_dflt=-1; | 1735 | static int trie_last_dflt = -1; |
| 1827 | 1736 | ||
| 1828 | static void | 1737 | static void |
| 1829 | fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) | 1738 | fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) |
| @@ -1840,7 +1749,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib | |||
| 1840 | last_resort = NULL; | 1749 | last_resort = NULL; |
| 1841 | order = -1; | 1750 | order = -1; |
| 1842 | 1751 | ||
| 1843 | read_lock(&fib_lock); | 1752 | rcu_read_lock(); |
| 1844 | 1753 | ||
| 1845 | l = fib_find_node(t, 0); | 1754 | l = fib_find_node(t, 0); |
| 1846 | if (!l) | 1755 | if (!l) |
| @@ -1853,20 +1762,20 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib | |||
| 1853 | if (list_empty(fa_head)) | 1762 | if (list_empty(fa_head)) |
| 1854 | goto out; | 1763 | goto out; |
| 1855 | 1764 | ||
| 1856 | list_for_each_entry(fa, fa_head, fa_list) { | 1765 | list_for_each_entry_rcu(fa, fa_head, fa_list) { |
| 1857 | struct fib_info *next_fi = fa->fa_info; | 1766 | struct fib_info *next_fi = fa->fa_info; |
| 1858 | 1767 | ||
| 1859 | if (fa->fa_scope != res->scope || | 1768 | if (fa->fa_scope != res->scope || |
| 1860 | fa->fa_type != RTN_UNICAST) | 1769 | fa->fa_type != RTN_UNICAST) |
| 1861 | continue; | 1770 | continue; |
| 1862 | 1771 | ||
| 1863 | if (next_fi->fib_priority > res->fi->fib_priority) | 1772 | if (next_fi->fib_priority > res->fi->fib_priority) |
| 1864 | break; | 1773 | break; |
| 1865 | if (!next_fi->fib_nh[0].nh_gw || | 1774 | if (!next_fi->fib_nh[0].nh_gw || |
| 1866 | next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) | 1775 | next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) |
| 1867 | continue; | 1776 | continue; |
| 1868 | fa->fa_state |= FA_S_ACCESSED; | 1777 | fa->fa_state |= FA_S_ACCESSED; |
| 1869 | 1778 | ||
| 1870 | if (fi == NULL) { | 1779 | if (fi == NULL) { |
| 1871 | if (next_fi != res->fi) | 1780 | if (next_fi != res->fi) |
| 1872 | break; | 1781 | break; |
| @@ -1904,7 +1813,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib | |||
| 1904 | } | 1813 | } |
| 1905 | trie_last_dflt = last_idx; | 1814 | trie_last_dflt = last_idx; |
| 1906 | out:; | 1815 | out:; |
| 1907 | read_unlock(&fib_lock); | 1816 | rcu_read_unlock(); |
| 1908 | } | 1817 | } |
| 1909 | 1818 | ||
| 1910 | static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, | 1819 | static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, |
| @@ -1913,12 +1822,14 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi | |||
| 1913 | int i, s_i; | 1822 | int i, s_i; |
| 1914 | struct fib_alias *fa; | 1823 | struct fib_alias *fa; |
| 1915 | 1824 | ||
| 1916 | u32 xkey=htonl(key); | 1825 | u32 xkey = htonl(key); |
| 1917 | 1826 | ||
| 1918 | s_i=cb->args[3]; | 1827 | s_i = cb->args[3]; |
| 1919 | i = 0; | 1828 | i = 0; |
| 1920 | 1829 | ||
| 1921 | list_for_each_entry(fa, fah, fa_list) { | 1830 | /* rcu_read_lock is held by the caller */ |
| 1831 | |||
| 1832 | list_for_each_entry_rcu(fa, fah, fa_list) { | ||
| 1922 | if (i < s_i) { | 1833 | if (i < s_i) { |
| 1923 | i++; | 1834 | i++; |
| 1924 | continue; | 1835 | continue; |
| @@ -1946,10 +1857,10 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi | |||
| 1946 | fa->fa_info, 0) < 0) { | 1857 | fa->fa_info, 0) < 0) { |
| 1947 | cb->args[3] = i; | 1858 | cb->args[3] = i; |
| 1948 | return -1; | 1859 | return -1; |
| 1949 | } | 1860 | } |
| 1950 | i++; | 1861 | i++; |
| 1951 | } | 1862 | } |
| 1952 | cb->args[3]=i; | 1863 | cb->args[3] = i; |
| 1953 | return skb->len; | 1864 | return skb->len; |
| 1954 | } | 1865 | } |
| 1955 | 1866 | ||
| @@ -1959,10 +1870,10 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str | |||
| 1959 | int h, s_h; | 1870 | int h, s_h; |
| 1960 | struct list_head *fa_head; | 1871 | struct list_head *fa_head; |
| 1961 | struct leaf *l = NULL; | 1872 | struct leaf *l = NULL; |
| 1962 | s_h=cb->args[2]; | ||
| 1963 | 1873 | ||
| 1964 | for (h=0; (l = nextleaf(t, l)) != NULL; h++) { | 1874 | s_h = cb->args[2]; |
| 1965 | 1875 | ||
| 1876 | for (h = 0; (l = nextleaf(t, l)) != NULL; h++) { | ||
| 1966 | if (h < s_h) | 1877 | if (h < s_h) |
| 1967 | continue; | 1878 | continue; |
| 1968 | if (h > s_h) | 1879 | if (h > s_h) |
| @@ -1970,7 +1881,7 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str | |||
| 1970 | sizeof(cb->args) - 3*sizeof(cb->args[0])); | 1881 | sizeof(cb->args) - 3*sizeof(cb->args[0])); |
| 1971 | 1882 | ||
| 1972 | fa_head = get_fa_head(l, plen); | 1883 | fa_head = get_fa_head(l, plen); |
| 1973 | 1884 | ||
| 1974 | if (!fa_head) | 1885 | if (!fa_head) |
| 1975 | continue; | 1886 | continue; |
| 1976 | 1887 | ||
| @@ -1978,11 +1889,11 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str | |||
| 1978 | continue; | 1889 | continue; |
| 1979 | 1890 | ||
| 1980 | if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) { | 1891 | if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) { |
| 1981 | cb->args[2]=h; | 1892 | cb->args[2] = h; |
| 1982 | return -1; | 1893 | return -1; |
| 1983 | } | 1894 | } |
| 1984 | } | 1895 | } |
| 1985 | cb->args[2]=h; | 1896 | cb->args[2] = h; |
| 1986 | return skb->len; | 1897 | return skb->len; |
| 1987 | } | 1898 | } |
| 1988 | 1899 | ||
| @@ -1993,25 +1904,24 @@ static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin | |||
| 1993 | 1904 | ||
| 1994 | s_m = cb->args[1]; | 1905 | s_m = cb->args[1]; |
| 1995 | 1906 | ||
| 1996 | read_lock(&fib_lock); | 1907 | rcu_read_lock(); |
| 1997 | for (m=0; m<=32; m++) { | 1908 | for (m = 0; m <= 32; m++) { |
| 1998 | |||
| 1999 | if (m < s_m) | 1909 | if (m < s_m) |
| 2000 | continue; | 1910 | continue; |
| 2001 | if (m > s_m) | 1911 | if (m > s_m) |
| 2002 | memset(&cb->args[2], 0, | 1912 | memset(&cb->args[2], 0, |
| 2003 | sizeof(cb->args) - 2*sizeof(cb->args[0])); | 1913 | sizeof(cb->args) - 2*sizeof(cb->args[0])); |
| 2004 | 1914 | ||
| 2005 | if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) { | 1915 | if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) { |
| 2006 | cb->args[1] = m; | 1916 | cb->args[1] = m; |
| 2007 | goto out; | 1917 | goto out; |
| 2008 | } | 1918 | } |
| 2009 | } | 1919 | } |
| 2010 | read_unlock(&fib_lock); | 1920 | rcu_read_unlock(); |
| 2011 | cb->args[1] = m; | 1921 | cb->args[1] = m; |
| 2012 | return skb->len; | 1922 | return skb->len; |
| 2013 | out: | 1923 | out: |
| 2014 | read_unlock(&fib_lock); | 1924 | rcu_read_unlock(); |
| 2015 | return -1; | 1925 | return -1; |
| 2016 | } | 1926 | } |
| 2017 | 1927 | ||
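All of the hunks above follow one conversion recipe: the read side drops read_lock(&fib_lock) in favour of rcu_read_lock(), fetches shared pointers through rcu_dereference(), and walks alias lists with list_for_each_entry_rcu(), so lookups proceed without taking any lock while writers publish updates and defer frees for a grace period. A minimal sketch of that read-side recipe, with an illustrative struct rather than the fib_trie types:

    #include <linux/list.h>
    #include <linux/rcupdate.h>

    struct item {
            int value;
            struct list_head list;          /* linked into an RCU-protected list */
    };

    /* Lockless reader: entries seen inside the read-side critical
     * section stay valid until a grace period has elapsed. */
    static int sum_items(struct list_head *head)
    {
            struct item *it;
            int sum = 0;

            rcu_read_lock();
            list_for_each_entry_rcu(it, head, list)
                    sum += it->value;
            rcu_read_unlock();
            return sum;
    }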
| @@ -2051,9 +1961,9 @@ struct fib_table * __init fib_hash_init(int id) | |||
| 2051 | trie_init(t); | 1961 | trie_init(t); |
| 2052 | 1962 | ||
| 2053 | if (id == RT_TABLE_LOCAL) | 1963 | if (id == RT_TABLE_LOCAL) |
| 2054 | trie_local = t; | 1964 | trie_local = t; |
| 2055 | else if (id == RT_TABLE_MAIN) | 1965 | else if (id == RT_TABLE_MAIN) |
| 2056 | trie_main = t; | 1966 | trie_main = t; |
| 2057 | 1967 | ||
| 2058 | if (id == RT_TABLE_LOCAL) | 1968 | if (id == RT_TABLE_LOCAL) |
| 2059 | printk("IPv4 FIB: Using LC-trie version %s\n", VERSION); | 1969 | printk("IPv4 FIB: Using LC-trie version %s\n", VERSION); |
| @@ -2065,7 +1975,8 @@ struct fib_table * __init fib_hash_init(int id) | |||
| 2065 | 1975 | ||
| 2066 | static void putspace_seq(struct seq_file *seq, int n) | 1976 | static void putspace_seq(struct seq_file *seq, int n) |
| 2067 | { | 1977 | { |
| 2068 | while (n--) seq_printf(seq, " "); | 1978 | while (n--) |
| 1979 | seq_printf(seq, " "); | ||
| 2069 | } | 1980 | } |
| 2070 | 1981 | ||
| 2071 | static void printbin_seq(struct seq_file *seq, unsigned int v, int bits) | 1982 | static void printbin_seq(struct seq_file *seq, unsigned int v, int bits) |
| @@ -2086,29 +1997,22 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, | |||
| 2086 | seq_printf(seq, "%d/", cindex); | 1997 | seq_printf(seq, "%d/", cindex); |
| 2087 | printbin_seq(seq, cindex, bits); | 1998 | printbin_seq(seq, cindex, bits); |
| 2088 | seq_printf(seq, ": "); | 1999 | seq_printf(seq, ": "); |
| 2089 | } | 2000 | } else |
| 2090 | else | ||
| 2091 | seq_printf(seq, "<root>: "); | 2001 | seq_printf(seq, "<root>: "); |
| 2092 | seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n); | 2002 | seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n); |
| 2093 | 2003 | ||
| 2094 | if (IS_LEAF(n)) | ||
| 2095 | seq_printf(seq, "key=%d.%d.%d.%d\n", | ||
| 2096 | n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256); | ||
| 2097 | else { | ||
| 2098 | int plen = ((struct tnode *)n)->pos; | ||
| 2099 | t_key prf=MASK_PFX(n->key, plen); | ||
| 2100 | seq_printf(seq, "key=%d.%d.%d.%d/%d\n", | ||
| 2101 | prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen); | ||
| 2102 | } | ||
| 2103 | if (IS_LEAF(n)) { | 2004 | if (IS_LEAF(n)) { |
| 2104 | struct leaf *l=(struct leaf *)n; | 2005 | struct leaf *l = (struct leaf *)n; |
| 2105 | struct fib_alias *fa; | 2006 | struct fib_alias *fa; |
| 2106 | int i; | 2007 | int i; |
| 2107 | for (i=32; i>=0; i--) | 2008 | |
| 2108 | if (find_leaf_info(&l->list, i)) { | 2009 | seq_printf(seq, "key=%d.%d.%d.%d\n", |
| 2109 | 2010 | n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256); | |
| 2011 | |||
| 2012 | for (i = 32; i >= 0; i--) | ||
| 2013 | if (find_leaf_info(&l->list, i)) { | ||
| 2110 | struct list_head *fa_head = get_fa_head(l, i); | 2014 | struct list_head *fa_head = get_fa_head(l, i); |
| 2111 | 2015 | ||
| 2112 | if (!fa_head) | 2016 | if (!fa_head) |
| 2113 | continue; | 2017 | continue; |
| 2114 | 2018 | ||
| @@ -2118,17 +2022,16 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, | |||
| 2118 | putspace_seq(seq, indent+2); | 2022 | putspace_seq(seq, indent+2); |
| 2119 | seq_printf(seq, "{/%d...dumping}\n", i); | 2023 | seq_printf(seq, "{/%d...dumping}\n", i); |
| 2120 | 2024 | ||
| 2121 | 2025 | list_for_each_entry_rcu(fa, fa_head, fa_list) { | |
| 2122 | list_for_each_entry(fa, fa_head, fa_list) { | ||
| 2123 | putspace_seq(seq, indent+2); | 2026 | putspace_seq(seq, indent+2); |
| 2124 | if (fa->fa_info->fib_nh == NULL) { | ||
| 2125 | seq_printf(seq, "Error _fib_nh=NULL\n"); | ||
| 2126 | continue; | ||
| 2127 | } | ||
| 2128 | if (fa->fa_info == NULL) { | 2027 | if (fa->fa_info == NULL) { |
| 2129 | seq_printf(seq, "Error fa_info=NULL\n"); | 2028 | seq_printf(seq, "Error fa_info=NULL\n"); |
| 2130 | continue; | 2029 | continue; |
| 2131 | } | 2030 | } |
| 2031 | if (fa->fa_info->fib_nh == NULL) { | ||
| 2032 | seq_printf(seq, "Error _fib_nh=NULL\n"); | ||
| 2033 | continue; | ||
| 2034 | } | ||
| 2132 | 2035 | ||
| 2133 | seq_printf(seq, "{type=%d scope=%d TOS=%d}\n", | 2036 | seq_printf(seq, "{type=%d scope=%d TOS=%d}\n", |
| 2134 | fa->fa_type, | 2037 | fa->fa_type, |
| @@ -2136,11 +2039,16 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, | |||
| 2136 | fa->fa_tos); | 2039 | fa->fa_tos); |
| 2137 | } | 2040 | } |
| 2138 | } | 2041 | } |
| 2139 | } | 2042 | } else { |
| 2140 | else if (IS_TNODE(n)) { | ||
| 2141 | struct tnode *tn = (struct tnode *)n; | 2043 | struct tnode *tn = (struct tnode *)n; |
| 2044 | int plen = ((struct tnode *)n)->pos; | ||
| 2045 | t_key prf = MASK_PFX(n->key, plen); | ||
| 2046 | |||
| 2047 | seq_printf(seq, "key=%d.%d.%d.%d/%d\n", | ||
| 2048 | prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen); | ||
| 2049 | |||
| 2142 | putspace_seq(seq, indent); seq_printf(seq, "| "); | 2050 | putspace_seq(seq, indent); seq_printf(seq, "| "); |
| 2143 | seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos)); | 2051 | seq_printf(seq, "{key prefix=%08x/", tn->key & TKEY_GET_MASK(0, tn->pos)); |
| 2144 | printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos); | 2052 | printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos); |
| 2145 | seq_printf(seq, "}\n"); | 2053 | seq_printf(seq, "}\n"); |
| 2146 | putspace_seq(seq, indent); seq_printf(seq, "| "); | 2054 | putspace_seq(seq, indent); seq_printf(seq, "| "); |
| @@ -2154,194 +2062,196 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, | |||
| 2154 | 2062 | ||
| 2155 | static void trie_dump_seq(struct seq_file *seq, struct trie *t) | 2063 | static void trie_dump_seq(struct seq_file *seq, struct trie *t) |
| 2156 | { | 2064 | { |
| 2157 | struct node *n = t->trie; | 2065 | struct node *n; |
| 2158 | int cindex=0; | 2066 | int cindex = 0; |
| 2159 | int indent=1; | 2067 | int indent = 1; |
| 2160 | int pend=0; | 2068 | int pend = 0; |
| 2161 | int depth = 0; | 2069 | int depth = 0; |
| 2070 | struct tnode *tn; | ||
| 2162 | 2071 | ||
| 2163 | read_lock(&fib_lock); | 2072 | rcu_read_lock(); |
| 2164 | 2073 | n = rcu_dereference(t->trie); | |
| 2165 | seq_printf(seq, "------ trie_dump of t=%p ------\n", t); | 2074 | seq_printf(seq, "------ trie_dump of t=%p ------\n", t); |
| 2166 | if (n) { | ||
| 2167 | printnode_seq(seq, indent, n, pend, cindex, 0); | ||
| 2168 | if (IS_TNODE(n)) { | ||
| 2169 | struct tnode *tn = (struct tnode *)n; | ||
| 2170 | pend = tn->pos+tn->bits; | ||
| 2171 | putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); | ||
| 2172 | indent += 3; | ||
| 2173 | depth++; | ||
| 2174 | |||
| 2175 | while (tn && cindex < (1 << tn->bits)) { | ||
| 2176 | if (tn->child[cindex]) { | ||
| 2177 | |||
| 2178 | /* Got a child */ | ||
| 2179 | |||
| 2180 | printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits); | ||
| 2181 | if (IS_LEAF(tn->child[cindex])) { | ||
| 2182 | cindex++; | ||
| 2183 | |||
| 2184 | } | ||
| 2185 | else { | ||
| 2186 | /* | ||
| 2187 | * New tnode. Descend one level | ||
| 2188 | */ | ||
| 2189 | |||
| 2190 | depth++; | ||
| 2191 | n = tn->child[cindex]; | ||
| 2192 | tn = (struct tnode *)n; | ||
| 2193 | pend = tn->pos+tn->bits; | ||
| 2194 | putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); | ||
| 2195 | indent+=3; | ||
| 2196 | cindex=0; | ||
| 2197 | } | ||
| 2198 | } | ||
| 2199 | else | ||
| 2200 | cindex++; | ||
| 2201 | 2075 | ||
| 2076 | if (!n) { | ||
| 2077 | seq_printf(seq, "------ trie is empty\n"); | ||
| 2078 | |||
| 2079 | rcu_read_unlock(); | ||
| 2080 | return; | ||
| 2081 | } | ||
| 2082 | |||
| 2083 | printnode_seq(seq, indent, n, pend, cindex, 0); | ||
| 2084 | |||
| 2085 | if (!IS_TNODE(n)) { | ||
| 2086 | rcu_read_unlock(); | ||
| 2087 | return; | ||
| 2088 | } | ||
| 2089 | |||
| 2090 | tn = (struct tnode *)n; | ||
| 2091 | pend = tn->pos+tn->bits; | ||
| 2092 | putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); | ||
| 2093 | indent += 3; | ||
| 2094 | depth++; | ||
| 2095 | |||
| 2096 | while (tn && cindex < (1 << tn->bits)) { | ||
| 2097 | struct node *child = rcu_dereference(tn->child[cindex]); | ||
| 2098 | if (!child) | ||
| 2099 | cindex++; | ||
| 2100 | else { | ||
| 2101 | /* Got a child */ | ||
| 2102 | printnode_seq(seq, indent, child, pend, | ||
| 2103 | cindex, tn->bits); | ||
| 2104 | |||
| 2105 | if (IS_LEAF(child)) | ||
| 2106 | cindex++; | ||
| 2107 | |||
| 2108 | else { | ||
| 2202 | /* | 2109 | /* |
| 2203 | * Test if we are done | 2110 | * New tnode. Descend one level |
| 2204 | */ | 2111 | */ |
| 2205 | |||
| 2206 | while (cindex >= (1 << tn->bits)) { | ||
| 2207 | 2112 | ||
| 2208 | /* | 2113 | depth++; |
| 2209 | * Move upwards and test for root | 2114 | n = child; |
| 2210 | * pop off all traversed nodes | 2115 | tn = (struct tnode *)n; |
| 2211 | */ | 2116 | pend = tn->pos+tn->bits; |
| 2212 | 2117 | putspace_seq(seq, indent); | |
| 2213 | if (NODE_PARENT(tn) == NULL) { | 2118 | seq_printf(seq, "\\--\n"); |
| 2214 | tn = NULL; | 2119 | indent += 3; |
| 2215 | n = NULL; | 2120 | cindex = 0; |
| 2216 | break; | ||
| 2217 | } | ||
| 2218 | else { | ||
| 2219 | cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); | ||
| 2220 | tn = NODE_PARENT(tn); | ||
| 2221 | cindex++; | ||
| 2222 | n = (struct node *)tn; | ||
| 2223 | pend = tn->pos+tn->bits; | ||
| 2224 | indent-=3; | ||
| 2225 | depth--; | ||
| 2226 | } | ||
| 2227 | } | ||
| 2228 | } | 2121 | } |
| 2229 | } | 2122 | } |
| 2230 | else n = NULL; | ||
| 2231 | } | ||
| 2232 | else seq_printf(seq, "------ trie is empty\n"); | ||
| 2233 | 2123 | ||
| 2234 | read_unlock(&fib_lock); | 2124 | /* |
| 2125 | * Test if we are done | ||
| 2126 | */ | ||
| 2127 | |||
| 2128 | while (cindex >= (1 << tn->bits)) { | ||
| 2129 | /* | ||
| 2130 | * Move upwards and test for root | ||
| 2131 | * pop off all traversed nodes | ||
| 2132 | */ | ||
| 2133 | |||
| 2134 | if (NODE_PARENT(tn) == NULL) { | ||
| 2135 | tn = NULL; | ||
| 2136 | break; | ||
| 2137 | } | ||
| 2138 | |||
| 2139 | cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); | ||
| 2140 | cindex++; | ||
| 2141 | tn = NODE_PARENT(tn); | ||
| 2142 | pend = tn->pos + tn->bits; | ||
| 2143 | indent -= 3; | ||
| 2144 | depth--; | ||
| 2145 | } | ||
| 2146 | } | ||
| 2147 | rcu_read_unlock(); | ||
| 2235 | } | 2148 | } |
| 2236 | 2149 | ||
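trie_dump_seq() above (and trie_collect_stats() below) share one traversal idea: walk the trie without recursion by descending into child slots and, when a tnode's slots are exhausted, recovering the child index within the parent so the walk can pop upward and continue with the next sibling. fib_trie.c recovers that index from key bits via tkey_extract_bits(); the sketch below, with hypothetical walk_node/visit names, recovers it by scanning the parent's slots, which is simpler but equivalent in effect:

    #include <stddef.h>

    struct walk_node {
            struct walk_node *parent;
            int nchildren;                  /* 0 for a leaf */
            struct walk_node **child;       /* NULL slots allowed */
    };

    /* Visit every node without recursion or an explicit stack. */
    static void walk(struct walk_node *root, void (*visit)(struct walk_node *))
    {
            struct walk_node *tn = root;
            int cindex = 0;

            visit(root);
            while (tn && cindex < tn->nchildren) {
                    struct walk_node *child = tn->child[cindex];

                    if (child) {
                            visit(child);
                            if (child->nchildren) {     /* descend one level */
                                    tn = child;
                                    cindex = 0;
                                    continue;
                            }
                    }
                    cindex++;
                    while (tn && cindex >= tn->nchildren) {
                            struct walk_node *parent = tn->parent;
                            int i = 0;

                            if (parent)                 /* recover our slot */
                                    while (parent->child[i] != tn)
                                            i++;
                            cindex = i + 1;             /* next sibling */
                            tn = parent;
                    }
            }
    }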
| 2237 | static struct trie_stat *trie_stat_new(void) | 2150 | static struct trie_stat *trie_stat_new(void) |
| 2238 | { | 2151 | { |
| 2239 | struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL); | 2152 | struct trie_stat *s; |
| 2240 | int i; | 2153 | int i; |
| 2241 | 2154 | ||
| 2242 | if (s) { | 2155 | s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL); |
| 2243 | s->totdepth = 0; | 2156 | if (!s) |
| 2244 | s->maxdepth = 0; | 2157 | return NULL; |
| 2245 | s->tnodes = 0; | 2158 | |
| 2246 | s->leaves = 0; | 2159 | s->totdepth = 0; |
| 2247 | s->nullpointers = 0; | 2160 | s->maxdepth = 0; |
| 2248 | 2161 | s->tnodes = 0; | |
| 2249 | for(i=0; i< MAX_CHILDS; i++) | 2162 | s->leaves = 0; |
| 2250 | s->nodesizes[i] = 0; | 2163 | s->nullpointers = 0; |
| 2251 | } | 2164 | |
| 2165 | for (i = 0; i < MAX_CHILDS; i++) | ||
| 2166 | s->nodesizes[i] = 0; | ||
| 2167 | |||
| 2252 | return s; | 2168 | return s; |
| 2253 | } | 2169 | } |
| 2254 | 2170 | ||
| 2255 | static struct trie_stat *trie_collect_stats(struct trie *t) | 2171 | static struct trie_stat *trie_collect_stats(struct trie *t) |
| 2256 | { | 2172 | { |
| 2257 | struct node *n = t->trie; | 2173 | struct node *n; |
| 2258 | struct trie_stat *s = trie_stat_new(); | 2174 | struct trie_stat *s = trie_stat_new(); |
| 2259 | int cindex = 0; | 2175 | int cindex = 0; |
| 2260 | int indent = 1; | ||
| 2261 | int pend = 0; | 2176 | int pend = 0; |
| 2262 | int depth = 0; | 2177 | int depth = 0; |
| 2263 | 2178 | ||
| 2264 | read_lock(&fib_lock); | 2179 | if (!s) |
| 2180 | return NULL; | ||
| 2265 | 2181 | ||
| 2266 | if (s) { | 2182 | rcu_read_lock(); |
| 2267 | if (n) { | 2183 | n = rcu_dereference(t->trie); |
| 2268 | if (IS_TNODE(n)) { | ||
| 2269 | struct tnode *tn = (struct tnode *)n; | ||
| 2270 | pend = tn->pos+tn->bits; | ||
| 2271 | indent += 3; | ||
| 2272 | s->nodesizes[tn->bits]++; | ||
| 2273 | depth++; | ||
| 2274 | 2184 | ||
| 2275 | while (tn && cindex < (1 << tn->bits)) { | 2185 | if (!n) |
| 2276 | if (tn->child[cindex]) { | 2186 | return s; |
| 2277 | /* Got a child */ | 2187 | |
| 2278 | 2188 | if (IS_TNODE(n)) { | |
| 2279 | if (IS_LEAF(tn->child[cindex])) { | 2189 | struct tnode *tn = (struct tnode *)n; |
| 2280 | cindex++; | 2190 | pend = tn->pos+tn->bits; |
| 2281 | 2191 | s->nodesizes[tn->bits]++; | |
| 2282 | /* stats */ | 2192 | depth++; |
| 2283 | if (depth > s->maxdepth) | 2193 | |
| 2284 | s->maxdepth = depth; | 2194 | while (tn && cindex < (1 << tn->bits)) { |
| 2285 | s->totdepth += depth; | 2195 | struct node *ch = rcu_dereference(tn->child[cindex]); |
| 2286 | s->leaves++; | 2196 | if (ch) { |
| 2287 | } | ||
| 2288 | |||
| 2289 | else { | ||
| 2290 | /* | ||
| 2291 | * New tnode. Descend one level | ||
| 2292 | */ | ||
| 2293 | |||
| 2294 | s->tnodes++; | ||
| 2295 | s->nodesizes[tn->bits]++; | ||
| 2296 | depth++; | ||
| 2297 | |||
| 2298 | n = tn->child[cindex]; | ||
| 2299 | tn = (struct tnode *)n; | ||
| 2300 | pend = tn->pos+tn->bits; | ||
| 2301 | |||
| 2302 | indent += 3; | ||
| 2303 | cindex = 0; | ||
| 2304 | } | ||
| 2305 | } | ||
| 2306 | else { | ||
| 2307 | cindex++; | ||
| 2308 | s->nullpointers++; | ||
| 2309 | } | ||
| 2310 | 2197 | ||
| 2198 | /* Got a child */ | ||
| 2199 | |||
| 2200 | if (IS_LEAF(tn->child[cindex])) { | ||
| 2201 | cindex++; | ||
| 2202 | |||
| 2203 | /* stats */ | ||
| 2204 | if (depth > s->maxdepth) | ||
| 2205 | s->maxdepth = depth; | ||
| 2206 | s->totdepth += depth; | ||
| 2207 | s->leaves++; | ||
| 2208 | } else { | ||
| 2311 | /* | 2209 | /* |
| 2312 | * Test if we are done | 2210 | * New tnode. Descend one level |
| 2313 | */ | 2211 | */ |
| 2314 | 2212 | ||
| 2315 | while (cindex >= (1 << tn->bits)) { | 2213 | s->tnodes++; |
| 2316 | 2214 | s->nodesizes[tn->bits]++; | |
| 2317 | /* | 2215 | depth++; |
| 2318 | * Move upwards and test for root | 2216 | |
| 2319 | * pop off all traversed nodes | 2217 | n = ch; |
| 2320 | */ | 2218 | tn = (struct tnode *)n; |
| 2321 | 2219 | pend = tn->pos+tn->bits; | |
| 2322 | 2220 | ||
| 2323 | if (NODE_PARENT(tn) == NULL) { | 2221 | cindex = 0; |
| 2324 | tn = NULL; | ||
| 2325 | n = NULL; | ||
| 2326 | break; | ||
| 2327 | } | ||
| 2328 | else { | ||
| 2329 | cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); | ||
| 2330 | tn = NODE_PARENT(tn); | ||
| 2331 | cindex++; | ||
| 2332 | n = (struct node *)tn; | ||
| 2333 | pend = tn->pos+tn->bits; | ||
| 2334 | indent -= 3; | ||
| 2335 | depth--; | ||
| 2336 | } | ||
| 2337 | } | ||
| 2338 | } | 2222 | } |
| 2223 | } else { | ||
| 2224 | cindex++; | ||
| 2225 | s->nullpointers++; | ||
| 2339 | } | 2226 | } |
| 2340 | else n = NULL; | 2227 | |
| 2228 | /* | ||
| 2229 | * Test if we are done | ||
| 2230 | */ | ||
| 2231 | |||
| 2232 | while (cindex >= (1 << tn->bits)) { | ||
| 2233 | /* | ||
| 2234 | * Move upwards and test for root | ||
| 2235 | * pop off all traversed nodes | ||
| 2236 | */ | ||
| 2237 | |||
| 2238 | if (NODE_PARENT(tn) == NULL) { | ||
| 2239 | tn = NULL; | ||
| 2240 | n = NULL; | ||
| 2241 | break; | ||
| 2242 | } | ||
| 2243 | |||
| 2244 | cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); | ||
| 2245 | tn = NODE_PARENT(tn); | ||
| 2246 | cindex++; | ||
| 2247 | n = (struct node *)tn; | ||
| 2248 | pend = tn->pos+tn->bits; | ||
| 2249 | depth--; | ||
| 2250 | } | ||
| 2341 | } | 2251 | } |
| 2342 | } | 2252 | } |
| 2343 | 2253 | ||
| 2344 | read_unlock(&fib_lock); | 2254 | rcu_read_unlock(); |
| 2345 | return s; | 2255 | return s; |
| 2346 | } | 2256 | } |
| 2347 | 2257 | ||
| @@ -2359,17 +2269,22 @@ static struct fib_alias *fib_triestat_get_next(struct seq_file *seq) | |||
| 2359 | 2269 | ||
| 2360 | static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos) | 2270 | static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos) |
| 2361 | { | 2271 | { |
| 2362 | void *v = NULL; | 2272 | if (!ip_fib_main_table) |
| 2273 | return NULL; | ||
| 2363 | 2274 | ||
| 2364 | if (ip_fib_main_table) | 2275 | if (*pos) |
| 2365 | v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN; | 2276 | return fib_triestat_get_next(seq); |
| 2366 | return v; | 2277 | else |
| 2278 | return SEQ_START_TOKEN; | ||
| 2367 | } | 2279 | } |
| 2368 | 2280 | ||
| 2369 | static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos) | 2281 | static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
| 2370 | { | 2282 | { |
| 2371 | ++*pos; | 2283 | ++*pos; |
| 2372 | return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq); | 2284 | if (v == SEQ_START_TOKEN) |
| 2285 | return fib_triestat_get_first(seq); | ||
| 2286 | else | ||
| 2287 | return fib_triestat_get_next(seq); | ||
| 2373 | } | 2288 | } |
| 2374 | 2289 | ||
| 2375 | static void fib_triestat_seq_stop(struct seq_file *seq, void *v) | 2290 | static void fib_triestat_seq_stop(struct seq_file *seq, void *v) |
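These four callbacks implement the standard seq_file iterator contract: start() positions the cursor (returning SEQ_START_TOKEN so show() can emit a header first), next() advances it, stop() releases any state, and show() prints one record. The general shape of that contract, sketched with hypothetical ex_* helpers standing in for a driver's own cursor functions:

    #include <linux/seq_file.h>

    static void *ex_get_first(struct seq_file *seq);
    static void *ex_get_next(struct seq_file *seq);
    static int ex_seq_show(struct seq_file *seq, void *v);

    static void *ex_seq_start(struct seq_file *seq, loff_t *pos)
    {
            /* Header row on the first call, otherwise resume. */
            return *pos ? ex_get_next(seq) : SEQ_START_TOKEN;
    }

    static void *ex_seq_next(struct seq_file *seq, void *v, loff_t *pos)
    {
            ++*pos;
            return v == SEQ_START_TOKEN ? ex_get_first(seq) : ex_get_next(seq);
    }

    static void ex_seq_stop(struct seq_file *seq, void *v)
    {
            /* Nothing to release in this sketch. */
    }

    static struct seq_operations ex_seq_ops = {
            .start = ex_seq_start,
            .next  = ex_seq_next,
            .stop  = ex_seq_stop,
            .show  = ex_seq_show,   /* prints the header or one record */
    };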
| @@ -2388,22 +2303,22 @@ static void collect_and_show(struct trie *t, struct seq_file *seq) | |||
| 2388 | { | 2303 | { |
| 2389 | int bytes = 0; /* How many bytes are used, a ref is 4 bytes */ | 2304 | int bytes = 0; /* How many bytes are used, a ref is 4 bytes */ |
| 2390 | int i, max, pointers; | 2305 | int i, max, pointers; |
| 2391 | struct trie_stat *stat; | 2306 | struct trie_stat *stat; |
| 2392 | int avdepth; | 2307 | int avdepth; |
| 2393 | 2308 | ||
| 2394 | stat = trie_collect_stats(t); | 2309 | stat = trie_collect_stats(t); |
| 2395 | 2310 | ||
| 2396 | bytes=0; | 2311 | bytes = 0; |
| 2397 | seq_printf(seq, "trie=%p\n", t); | 2312 | seq_printf(seq, "trie=%p\n", t); |
| 2398 | 2313 | ||
| 2399 | if (stat) { | 2314 | if (stat) { |
| 2400 | if (stat->leaves) | 2315 | if (stat->leaves) |
| 2401 | avdepth=stat->totdepth*100 / stat->leaves; | 2316 | avdepth = stat->totdepth*100 / stat->leaves; |
| 2402 | else | 2317 | else |
| 2403 | avdepth=0; | 2318 | avdepth = 0; |
| 2404 | seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 ); | 2319 | seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100); |
| 2405 | seq_printf(seq, "Max depth: %4d\n", stat->maxdepth); | 2320 | seq_printf(seq, "Max depth: %4d\n", stat->maxdepth); |
| 2406 | 2321 | ||
| 2407 | seq_printf(seq, "Leaves: %d\n", stat->leaves); | 2322 | seq_printf(seq, "Leaves: %d\n", stat->leaves); |
| 2408 | bytes += sizeof(struct leaf) * stat->leaves; | 2323 | bytes += sizeof(struct leaf) * stat->leaves; |
| 2409 | seq_printf(seq, "Internal nodes: %d\n", stat->tnodes); | 2324 | seq_printf(seq, "Internal nodes: %d\n", stat->tnodes); |
| @@ -2455,11 +2370,9 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) | |||
| 2455 | 2370 | ||
| 2456 | if (trie_main) | 2371 | if (trie_main) |
| 2457 | collect_and_show(trie_main, seq); | 2372 | collect_and_show(trie_main, seq); |
| 2458 | } | 2373 | } else { |
| 2459 | else { | 2374 | snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400); |
| 2460 | snprintf(bf, sizeof(bf), | 2375 | |
| 2461 | "*\t%08X\t%08X", 200, 400); | ||
| 2462 | |||
| 2463 | seq_printf(seq, "%-127s\n", bf); | 2376 | seq_printf(seq, "%-127s\n", bf); |
| 2464 | } | 2377 | } |
| 2465 | return 0; | 2378 | return 0; |
| @@ -2520,22 +2433,27 @@ static struct fib_alias *fib_trie_get_next(struct seq_file *seq) | |||
| 2520 | 2433 | ||
| 2521 | static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) | 2434 | static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) |
| 2522 | { | 2435 | { |
| 2523 | void *v = NULL; | 2436 | if (!ip_fib_main_table) |
| 2437 | return NULL; | ||
| 2524 | 2438 | ||
| 2525 | if (ip_fib_main_table) | 2439 | if (*pos) |
| 2526 | v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN; | 2440 | return fib_trie_get_next(seq); |
| 2527 | return v; | 2441 | else |
| 2442 | return SEQ_START_TOKEN; | ||
| 2528 | } | 2443 | } |
| 2529 | 2444 | ||
| 2530 | static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) | 2445 | static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
| 2531 | { | 2446 | { |
| 2532 | ++*pos; | 2447 | ++*pos; |
| 2533 | return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq); | 2448 | if (v == SEQ_START_TOKEN) |
| 2449 | return fib_trie_get_first(seq); | ||
| 2450 | else | ||
| 2451 | return fib_trie_get_next(seq); | ||
| 2452 | |||
| 2534 | } | 2453 | } |
| 2535 | 2454 | ||
| 2536 | static void fib_trie_seq_stop(struct seq_file *seq, void *v) | 2455 | static void fib_trie_seq_stop(struct seq_file *seq, void *v) |
| 2537 | { | 2456 | { |
| 2538 | |||
| 2539 | } | 2457 | } |
| 2540 | 2458 | ||
| 2541 | /* | 2459 | /* |
| @@ -2555,9 +2473,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) | |||
| 2555 | 2473 | ||
| 2556 | if (trie_main) | 2474 | if (trie_main) |
| 2557 | trie_dump_seq(seq, trie_main); | 2475 | trie_dump_seq(seq, trie_main); |
| 2558 | } | 2476 | } else { |
| 2559 | |||
| 2560 | else { | ||
| 2561 | snprintf(bf, sizeof(bf), | 2477 | snprintf(bf, sizeof(bf), |
| 2562 | "*\t%08X\t%08X", 200, 400); | 2478 | "*\t%08X\t%08X", 200, 400); |
| 2563 | seq_printf(seq, "%-127s\n", bf); | 2479 | seq_printf(seq, "%-127s\n", bf); |
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index badfc5849973..24eb56ae1b5a 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c | |||
| @@ -114,7 +114,7 @@ struct icmp_bxm { | |||
| 114 | /* | 114 | /* |
| 115 | * Statistics | 115 | * Statistics |
| 116 | */ | 116 | */ |
| 117 | DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics); | 117 | DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly; |
| 118 | 118 | ||
| 119 | /* An array of errno for error messages from dest unreach. */ | 119 | /* An array of errno for error messages from dest unreach. */ |
| 120 | /* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ | 120 | /* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ |
| @@ -627,11 +627,10 @@ static void icmp_unreach(struct sk_buff *skb) | |||
| 627 | break; | 627 | break; |
| 628 | case ICMP_FRAG_NEEDED: | 628 | case ICMP_FRAG_NEEDED: |
| 629 | if (ipv4_config.no_pmtu_disc) { | 629 | if (ipv4_config.no_pmtu_disc) { |
| 630 | LIMIT_NETDEBUG( | 630 | LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: " |
| 631 | printk(KERN_INFO "ICMP: %u.%u.%u.%u: " | ||
| 632 | "fragmentation needed " | 631 | "fragmentation needed " |
| 633 | "and DF set.\n", | 632 | "and DF set.\n", |
| 634 | NIPQUAD(iph->daddr))); | 633 | NIPQUAD(iph->daddr)); |
| 635 | } else { | 634 | } else { |
| 636 | info = ip_rt_frag_needed(iph, | 635 | info = ip_rt_frag_needed(iph, |
| 637 | ntohs(icmph->un.frag.mtu)); | 636 | ntohs(icmph->un.frag.mtu)); |
| @@ -640,10 +639,9 @@ static void icmp_unreach(struct sk_buff *skb) | |||
| 640 | } | 639 | } |
| 641 | break; | 640 | break; |
| 642 | case ICMP_SR_FAILED: | 641 | case ICMP_SR_FAILED: |
| 643 | LIMIT_NETDEBUG( | 642 | LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: Source " |
| 644 | printk(KERN_INFO "ICMP: %u.%u.%u.%u: Source " | ||
| 645 | "Route Failed.\n", | 643 | "Route Failed.\n", |
| 646 | NIPQUAD(iph->daddr))); | 644 | NIPQUAD(iph->daddr)); |
| 647 | break; | 645 | break; |
| 648 | default: | 646 | default: |
| 649 | break; | 647 | break; |
| @@ -936,7 +934,7 @@ int icmp_rcv(struct sk_buff *skb) | |||
| 936 | case CHECKSUM_HW: | 934 | case CHECKSUM_HW: |
| 937 | if (!(u16)csum_fold(skb->csum)) | 935 | if (!(u16)csum_fold(skb->csum)) |
| 938 | break; | 936 | break; |
| 939 | LIMIT_NETDEBUG(printk(KERN_DEBUG "icmp v4 hw csum failure\n")); | 937 | LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n"); |
| 940 | case CHECKSUM_NONE: | 938 | case CHECKSUM_NONE: |
| 941 | if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) | 939 | if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) |
| 942 | goto error; | 940 | goto error; |
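The icmp.c hunks above (and the igmp.c one below) reflect a change in the debug macros themselves: LIMIT_NETDEBUG() and NETDEBUG() now take a format string and arguments directly instead of wrapping a whole printk() statement. That implies a variadic definition, plausibly along these lines (a sketch, not quoted from this patch):

    #include <linux/kernel.h>
    #include <linux/net.h>          /* net_ratelimit() */

    /* Rate-limited debug printk: drop the message entirely when the
     * caller is emitting too fast, instead of flooding the log. */
    #define LIMIT_NETDEBUG(fmt, args...) \
            do { if (net_ratelimit()) printk(fmt, ##args); } while (0)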
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 5088f90835ae..44607f4767b8 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c | |||
| @@ -904,7 +904,7 @@ int igmp_rcv(struct sk_buff *skb) | |||
| 904 | case IGMP_MTRACE_RESP: | 904 | case IGMP_MTRACE_RESP: |
| 905 | break; | 905 | break; |
| 906 | default: | 906 | default: |
| 907 | NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type)); | 907 | NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type); |
| 908 | } | 908 | } |
| 909 | in_dev_put(in_dev); | 909 | in_dev_put(in_dev); |
| 910 | kfree_skb(skb); | 910 | kfree_skb(skb); |
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c new file mode 100644 index 000000000000..fe3c6d3d0c91 --- /dev/null +++ b/net/ipv4/inet_connection_sock.c | |||
| @@ -0,0 +1,641 @@ | |||
| 1 | /* | ||
| 2 | * INET An implementation of the TCP/IP protocol suite for the LINUX | ||
| 3 | * operating system. INET is implemented using the BSD Socket | ||
| 4 | * interface as the means of communication with the user level. | ||
| 5 | * | ||
| 6 | * Support for INET connection-oriented protocols. | ||
| 7 | * | ||
| 8 | * Authors: See the TCP sources | ||
| 9 | * | ||
| 10 | * This program is free software; you can redistribute it and/or | ||
| 11 | * modify it under the terms of the GNU General Public License | ||
| 12 | * as published by the Free Software Foundation; either version | ||
| 13 | * 2 of the License, or (at your option) any later version. | ||
| 14 | */ | ||
| 15 | |||
| 16 | #include <linux/config.h> | ||
| 17 | #include <linux/module.h> | ||
| 18 | #include <linux/jhash.h> | ||
| 19 | |||
| 20 | #include <net/inet_connection_sock.h> | ||
| 21 | #include <net/inet_hashtables.h> | ||
| 22 | #include <net/inet_timewait_sock.h> | ||
| 23 | #include <net/ip.h> | ||
| 24 | #include <net/route.h> | ||
| 25 | #include <net/tcp_states.h> | ||
| 26 | #include <net/xfrm.h> | ||
| 27 | |||
| 28 | #ifdef INET_CSK_DEBUG | ||
| 29 | const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; | ||
| 30 | EXPORT_SYMBOL(inet_csk_timer_bug_msg); | ||
| 31 | #endif | ||
| 32 | |||
| 33 | /* | ||
| 34 | * This array holds the first and last local port number. | ||
| 35 | * For high-usage systems, use sysctl to change this to | ||
| 36 | * 32768-61000 | ||
| 37 | */ | ||
| 38 | int sysctl_local_port_range[2] = { 1024, 4999 }; | ||
| 39 | |||
| 40 | static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) | ||
| 41 | { | ||
| 42 | const u32 sk_rcv_saddr = inet_rcv_saddr(sk); | ||
| 43 | struct sock *sk2; | ||
| 44 | struct hlist_node *node; | ||
| 45 | int reuse = sk->sk_reuse; | ||
| 46 | |||
| 47 | sk_for_each_bound(sk2, node, &tb->owners) { | ||
| 48 | if (sk != sk2 && | ||
| 49 | !inet_v6_ipv6only(sk2) && | ||
| 50 | (!sk->sk_bound_dev_if || | ||
| 51 | !sk2->sk_bound_dev_if || | ||
| 52 | sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { | ||
| 53 | if (!reuse || !sk2->sk_reuse || | ||
| 54 | sk2->sk_state == TCP_LISTEN) { | ||
| 55 | const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); | ||
| 56 | if (!sk2_rcv_saddr || !sk_rcv_saddr || | ||
| 57 | sk2_rcv_saddr == sk_rcv_saddr) | ||
| 58 | break; | ||
| 59 | } | ||
| 60 | } | ||
| 61 | } | ||
| 62 | return node != NULL; | ||
| 63 | } | ||
| 64 | |||
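inet_csk_bind_conflict() encodes the SO_REUSEADDR rules as seen from user space: two sockets may bind the same local port only if their addresses or bound devices do not overlap, or if both set SO_REUSEADDR and the already-bound one is not listening. A hedged user-space illustration of the permissive case (the port number is arbitrary):

    #include <arpa/inet.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    /* Two binds to the same addr:port succeed when both sockets set
     * SO_REUSEADDR and neither is listening, mirroring the kernel check. */
    int main(void)
    {
            int one = 1, i, fd[2];
            struct sockaddr_in sa;

            memset(&sa, 0, sizeof(sa));
            sa.sin_family = AF_INET;
            sa.sin_port = htons(5555);
            sa.sin_addr.s_addr = htonl(INADDR_ANY);

            for (i = 0; i < 2; i++) {
                    fd[i] = socket(AF_INET, SOCK_STREAM, 0);
                    setsockopt(fd[i], SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
                    if (bind(fd[i], (struct sockaddr *)&sa, sizeof(sa)) < 0)
                            perror("bind");
            }
            close(fd[0]);
            close(fd[1]);
            return 0;
    }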
| 65 | /* Obtain a reference to a local port for the given sock, | ||
| 66 | * if snum is zero it means select any available local port. | ||
| 67 | */ | ||
| 68 | int inet_csk_get_port(struct inet_hashinfo *hashinfo, | ||
| 69 | struct sock *sk, unsigned short snum) | ||
| 70 | { | ||
| 71 | struct inet_bind_hashbucket *head; | ||
| 72 | struct hlist_node *node; | ||
| 73 | struct inet_bind_bucket *tb; | ||
| 74 | int ret; | ||
| 75 | |||
| 76 | local_bh_disable(); | ||
| 77 | if (!snum) { | ||
| 78 | int low = sysctl_local_port_range[0]; | ||
| 79 | int high = sysctl_local_port_range[1]; | ||
| 80 | int remaining = (high - low) + 1; | ||
| 81 | int rover; | ||
| 82 | |||
| 83 | spin_lock(&hashinfo->portalloc_lock); | ||
| 84 | if (hashinfo->port_rover < low) | ||
| 85 | rover = low; | ||
| 86 | else | ||
| 87 | rover = hashinfo->port_rover; | ||
| 88 | do { | ||
| 89 | rover++; | ||
| 90 | if (rover > high) | ||
| 91 | rover = low; | ||
| 92 | head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; | ||
| 93 | spin_lock(&head->lock); | ||
| 94 | inet_bind_bucket_for_each(tb, node, &head->chain) | ||
| 95 | if (tb->port == rover) | ||
| 96 | goto next; | ||
| 97 | break; | ||
| 98 | next: | ||
| 99 | spin_unlock(&head->lock); | ||
| 100 | } while (--remaining > 0); | ||
| 101 | hashinfo->port_rover = rover; | ||
| 102 | spin_unlock(&hashinfo->portalloc_lock); | ||
| 103 | |||
| 104 | /* Exhausted local port range during search? It is not | ||
| 105 | * possible for us to be holding one of the bind hash | ||
| 106 | * locks if this test triggers, because if 'remaining' | ||
| 107 | * drops to zero, we broke out of the do/while loop at | ||
| 108 | * the top level, not from the 'break;' statement. | ||
| 109 | */ | ||
| 110 | ret = 1; | ||
| 111 | if (remaining <= 0) | ||
| 112 | goto fail; | ||
| 113 | |||
| 114 | /* OK, here is the one we will use. HEAD is | ||
| 115 | * non-NULL and we hold its mutex. | ||
| 116 | */ | ||
| 117 | snum = rover; | ||
| 118 | } else { | ||
| 119 | head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; | ||
| 120 | spin_lock(&head->lock); | ||
| 121 | inet_bind_bucket_for_each(tb, node, &head->chain) | ||
| 122 | if (tb->port == snum) | ||
| 123 | goto tb_found; | ||
| 124 | } | ||
| 125 | tb = NULL; | ||
| 126 | goto tb_not_found; | ||
| 127 | tb_found: | ||
| 128 | if (!hlist_empty(&tb->owners)) { | ||
| 129 | if (sk->sk_reuse > 1) | ||
| 130 | goto success; | ||
| 131 | if (tb->fastreuse > 0 && | ||
| 132 | sk->sk_reuse && sk->sk_state != TCP_LISTEN) { | ||
| 133 | goto success; | ||
| 134 | } else { | ||
| 135 | ret = 1; | ||
| 136 | if (inet_csk_bind_conflict(sk, tb)) | ||
| 137 | goto fail_unlock; | ||
| 138 | } | ||
| 139 | } | ||
| 140 | tb_not_found: | ||
| 141 | ret = 1; | ||
| 142 | if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL) | ||
| 143 | goto fail_unlock; | ||
| 144 | if (hlist_empty(&tb->owners)) { | ||
| 145 | if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) | ||
| 146 | tb->fastreuse = 1; | ||
| 147 | else | ||
| 148 | tb->fastreuse = 0; | ||
| 149 | } else if (tb->fastreuse && | ||
| 150 | (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) | ||
| 151 | tb->fastreuse = 0; | ||
| 152 | success: | ||
| 153 | if (!inet_csk(sk)->icsk_bind_hash) | ||
| 154 | inet_bind_hash(sk, tb, snum); | ||
| 155 | BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); | ||
| 156 | ret = 0; | ||
| 157 | |||
| 158 | fail_unlock: | ||
| 159 | spin_unlock(&head->lock); | ||
| 160 | fail: | ||
| 161 | local_bh_enable(); | ||
| 162 | return ret; | ||
| 163 | } | ||
| 164 | |||
| 165 | EXPORT_SYMBOL_GPL(inet_csk_get_port); | ||
| 166 | |||
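The snum == 0 path above is a rover search: remember the last port handed out, scan forward through [low, high] with wraparound, and take the first port whose bind-hash chain has no bucket. Stripped of the locking and hash-chain details, the search reduces to this (port_in_use() is a stand-in for the inet_bind_bucket lookup):

    /* Hedged sketch of the rover scan in inet_csk_get_port(). */
    static int pick_local_port(int *rover, int low, int high,
                               int (*port_in_use)(int))
    {
            int remaining = (high - low) + 1;
            int port = *rover < low ? low : *rover;

            do {
                    if (++port > high)
                            port = low;             /* wrap around the range */
                    if (!port_in_use(port)) {
                            *rover = port;          /* resume here next time */
                            return port;
                    }
            } while (--remaining > 0);
            return -1;                              /* range exhausted */
    }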
| 167 | /* | ||
| 168 | * Wait for an incoming connection, avoid race conditions. This must be called | ||
| 169 | * with the socket locked. | ||
| 170 | */ | ||
| 171 | static int inet_csk_wait_for_connect(struct sock *sk, long timeo) | ||
| 172 | { | ||
| 173 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 174 | DEFINE_WAIT(wait); | ||
| 175 | int err; | ||
| 176 | |||
| 177 | /* | ||
| 178 | * True wake-one mechanism for incoming connections: only | ||
| 179 | * one process gets woken up, not the 'whole herd'. | ||
| 180 | * Since we do not 'race & poll' for established sockets | ||
| 181 | * anymore, the common case will execute the loop only once. | ||
| 182 | * | ||
| 183 | * Subtle issue: "add_wait_queue_exclusive()" will be added | ||
| 184 | * after any current non-exclusive waiters, and we know that | ||
| 185 | * it will always _stay_ after any new non-exclusive waiters | ||
| 186 | * because all non-exclusive waiters are added at the | ||
| 187 | * beginning of the wait-queue. As such, it's ok to "drop" | ||
| 188 | * our exclusiveness temporarily when we get woken up without | ||
| 189 | * having to remove and re-insert us on the wait queue. | ||
| 190 | */ | ||
| 191 | for (;;) { | ||
| 192 | prepare_to_wait_exclusive(sk->sk_sleep, &wait, | ||
| 193 | TASK_INTERRUPTIBLE); | ||
| 194 | release_sock(sk); | ||
| 195 | if (reqsk_queue_empty(&icsk->icsk_accept_queue)) | ||
| 196 | timeo = schedule_timeout(timeo); | ||
| 197 | lock_sock(sk); | ||
| 198 | err = 0; | ||
| 199 | if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) | ||
| 200 | break; | ||
| 201 | err = -EINVAL; | ||
| 202 | if (sk->sk_state != TCP_LISTEN) | ||
| 203 | break; | ||
| 204 | err = sock_intr_errno(timeo); | ||
| 205 | if (signal_pending(current)) | ||
| 206 | break; | ||
| 207 | err = -EAGAIN; | ||
| 208 | if (!timeo) | ||
| 209 | break; | ||
| 210 | } | ||
| 211 | finish_wait(sk->sk_sleep, &wait); | ||
| 212 | return err; | ||
| 213 | } | ||
| 214 | |||
| 215 | /* | ||
| 216 | * This will accept the next outstanding connection. | ||
| 217 | */ | ||
| 218 | struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) | ||
| 219 | { | ||
| 220 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 221 | struct sock *newsk; | ||
| 222 | int error; | ||
| 223 | |||
| 224 | lock_sock(sk); | ||
| 225 | |||
| 226 | /* We need to make sure that this socket is listening, | ||
| 227 | * and that it has something pending. | ||
| 228 | */ | ||
| 229 | error = -EINVAL; | ||
| 230 | if (sk->sk_state != TCP_LISTEN) | ||
| 231 | goto out_err; | ||
| 232 | |||
| 233 | /* Find already established connection */ | ||
| 234 | if (reqsk_queue_empty(&icsk->icsk_accept_queue)) { | ||
| 235 | long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); | ||
| 236 | |||
| 237 | /* If this is a non-blocking socket, don't sleep */ | ||
| 238 | error = -EAGAIN; | ||
| 239 | if (!timeo) | ||
| 240 | goto out_err; | ||
| 241 | |||
| 242 | error = inet_csk_wait_for_connect(sk, timeo); | ||
| 243 | if (error) | ||
| 244 | goto out_err; | ||
| 245 | } | ||
| 246 | |||
| 247 | newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); | ||
| 248 | BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); | ||
| 249 | out: | ||
| 250 | release_sock(sk); | ||
| 251 | return newsk; | ||
| 252 | out_err: | ||
| 253 | newsk = NULL; | ||
| 254 | *err = error; | ||
| 255 | goto out; | ||
| 256 | } | ||
| 257 | |||
| 258 | EXPORT_SYMBOL(inet_csk_accept); | ||
| 259 | |||
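inet_csk_accept() is the kernel side of accept(2) on a TCP socket: with an empty accept queue it either joins the wake-one wait queue described above or, on a non-blocking socket, returns -EAGAIN at once. A hedged user-space illustration of the non-blocking case:

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/socket.h>

    /* Non-blocking accept: an empty accept queue yields EAGAIN
     * instead of putting the caller to sleep. */
    static int try_accept(int listen_fd)
    {
            int c;

            fcntl(listen_fd, F_SETFL,
                  fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
            c = accept(listen_fd, NULL, NULL);
            if (c < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
                    printf("no pending connection yet\n");
            return c;
    }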
| 260 | /* | ||
| 261 | * Using different timers for retransmit, delayed acks and probes | ||
| 262 | * We may wish to use just one timer maintaining a list of expire jiffies | ||
| 263 | * to optimize. | ||
| 264 | */ | ||
| 265 | void inet_csk_init_xmit_timers(struct sock *sk, | ||
| 266 | void (*retransmit_handler)(unsigned long), | ||
| 267 | void (*delack_handler)(unsigned long), | ||
| 268 | void (*keepalive_handler)(unsigned long)) | ||
| 269 | { | ||
| 270 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 271 | |||
| 272 | init_timer(&icsk->icsk_retransmit_timer); | ||
| 273 | init_timer(&icsk->icsk_delack_timer); | ||
| 274 | init_timer(&sk->sk_timer); | ||
| 275 | |||
| 276 | icsk->icsk_retransmit_timer.function = retransmit_handler; | ||
| 277 | icsk->icsk_delack_timer.function = delack_handler; | ||
| 278 | sk->sk_timer.function = keepalive_handler; | ||
| 279 | |||
| 280 | icsk->icsk_retransmit_timer.data = | ||
| 281 | icsk->icsk_delack_timer.data = | ||
| 282 | sk->sk_timer.data = (unsigned long)sk; | ||
| 283 | |||
| 284 | icsk->icsk_pending = icsk->icsk_ack.pending = 0; | ||
| 285 | } | ||
| 286 | |||
| 287 | EXPORT_SYMBOL(inet_csk_init_xmit_timers); | ||
| 288 | |||
| 289 | void inet_csk_clear_xmit_timers(struct sock *sk) | ||
| 290 | { | ||
| 291 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 292 | |||
| 293 | icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0; | ||
| 294 | |||
| 295 | sk_stop_timer(sk, &icsk->icsk_retransmit_timer); | ||
| 296 | sk_stop_timer(sk, &icsk->icsk_delack_timer); | ||
| 297 | sk_stop_timer(sk, &sk->sk_timer); | ||
| 298 | } | ||
| 299 | |||
| 300 | EXPORT_SYMBOL(inet_csk_clear_xmit_timers); | ||
| 301 | |||
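A connection-oriented protocol wires its three callbacks in from its socket-init path; TCP is the obvious caller. A hedged sketch of such a call site (the my_* handler names are hypothetical, not from this patch):

    #include <net/inet_connection_sock.h>

    static void my_retransmit_timer(unsigned long data) { /* resend segment */ }
    static void my_delack_timer(unsigned long data)     { /* flush delayed ACK */ }
    static void my_keepalive_timer(unsigned long data)  { /* probe the peer */ }

    static void my_proto_init_sock(struct sock *sk)
    {
            inet_csk_init_xmit_timers(sk, my_retransmit_timer,
                                      my_delack_timer, my_keepalive_timer);
    }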
| 302 | void inet_csk_delete_keepalive_timer(struct sock *sk) | ||
| 303 | { | ||
| 304 | sk_stop_timer(sk, &sk->sk_timer); | ||
| 305 | } | ||
| 306 | |||
| 307 | EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); | ||
| 308 | |||
| 309 | void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) | ||
| 310 | { | ||
| 311 | sk_reset_timer(sk, &sk->sk_timer, jiffies + len); | ||
| 312 | } | ||
| 313 | |||
| 314 | EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); | ||
| 315 | |||
| 316 | struct dst_entry* inet_csk_route_req(struct sock *sk, | ||
| 317 | const struct request_sock *req) | ||
| 318 | { | ||
| 319 | struct rtable *rt; | ||
| 320 | const struct inet_request_sock *ireq = inet_rsk(req); | ||
| 321 | struct ip_options *opt = inet_rsk(req)->opt; | ||
| 322 | struct flowi fl = { .oif = sk->sk_bound_dev_if, | ||
| 323 | .nl_u = { .ip4_u = | ||
| 324 | { .daddr = ((opt && opt->srr) ? | ||
| 325 | opt->faddr : | ||
| 326 | ireq->rmt_addr), | ||
| 327 | .saddr = ireq->loc_addr, | ||
| 328 | .tos = RT_CONN_FLAGS(sk) } }, | ||
| 329 | .proto = sk->sk_protocol, | ||
| 330 | .uli_u = { .ports = | ||
| 331 | { .sport = inet_sk(sk)->sport, | ||
| 332 | .dport = ireq->rmt_port } } }; | ||
| 333 | |||
| 334 | if (ip_route_output_flow(&rt, &fl, sk, 0)) { | ||
| 335 | IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); | ||
| 336 | return NULL; | ||
| 337 | } | ||
| 338 | if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { | ||
| 339 | ip_rt_put(rt); | ||
| 340 | IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); | ||
| 341 | return NULL; | ||
| 342 | } | ||
| 343 | return &rt->u.dst; | ||
| 344 | } | ||
| 345 | |||
| 346 | EXPORT_SYMBOL_GPL(inet_csk_route_req); | ||
| 347 | |||
| 348 | static inline u32 inet_synq_hash(const u32 raddr, const u16 rport, | ||
| 349 | const u32 rnd, const u16 synq_hsize) | ||
| 350 | { | ||
| 351 | return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1); | ||
| 352 | } | ||
| 353 | |||
| 354 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | ||
| 355 | #define AF_INET_FAMILY(fam) ((fam) == AF_INET) | ||
| 356 | #else | ||
| 357 | #define AF_INET_FAMILY(fam) 1 | ||
| 358 | #endif | ||
| 359 | |||
| 360 | struct request_sock *inet_csk_search_req(const struct sock *sk, | ||
| 361 | struct request_sock ***prevp, | ||
| 362 | const __u16 rport, const __u32 raddr, | ||
| 363 | const __u32 laddr) | ||
| 364 | { | ||
| 365 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 366 | struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; | ||
| 367 | struct request_sock *req, **prev; | ||
| 368 | |||
| 369 | for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd, | ||
| 370 | lopt->nr_table_entries)]; | ||
| 371 | (req = *prev) != NULL; | ||
| 372 | prev = &req->dl_next) { | ||
| 373 | const struct inet_request_sock *ireq = inet_rsk(req); | ||
| 374 | |||
| 375 | if (ireq->rmt_port == rport && | ||
| 376 | ireq->rmt_addr == raddr && | ||
| 377 | ireq->loc_addr == laddr && | ||
| 378 | AF_INET_FAMILY(req->rsk_ops->family)) { | ||
| 379 | BUG_TRAP(!req->sk); | ||
| 380 | *prevp = prev; | ||
| 381 | break; | ||
| 382 | } | ||
| 383 | } | ||
| 384 | |||
| 385 | return req; | ||
| 386 | } | ||
| 387 | |||
| 388 | EXPORT_SYMBOL_GPL(inet_csk_search_req); | ||
| 389 | |||
| 390 | void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, | ||
| 391 | const unsigned timeout) | ||
| 392 | { | ||
| 393 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 394 | struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; | ||
| 395 | const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, | ||
| 396 | lopt->hash_rnd, lopt->nr_table_entries); | ||
| 397 | |||
| 398 | reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); | ||
| 399 | inet_csk_reqsk_queue_added(sk, timeout); | ||
| 400 | } | ||
| 401 | |||
| 402 | /* Only thing we need from tcp.h */ | ||
| 403 | extern int sysctl_tcp_synack_retries; | ||
| 404 | |||
| 405 | EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); | ||
| 406 | |||
| 407 | void inet_csk_reqsk_queue_prune(struct sock *parent, | ||
| 408 | const unsigned long interval, | ||
| 409 | const unsigned long timeout, | ||
| 410 | const unsigned long max_rto) | ||
| 411 | { | ||
| 412 | struct inet_connection_sock *icsk = inet_csk(parent); | ||
| 413 | struct request_sock_queue *queue = &icsk->icsk_accept_queue; | ||
| 414 | struct listen_sock *lopt = queue->listen_opt; | ||
| 415 | int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; | ||
| 416 | int thresh = max_retries; | ||
| 417 | unsigned long now = jiffies; | ||
| 418 | struct request_sock **reqp, *req; | ||
| 419 | int i, budget; | ||
| 420 | |||
| 421 | if (lopt == NULL || lopt->qlen == 0) | ||
| 422 | return; | ||
| 423 | |||
| 424 | /* Normally all the openreqs are young and become mature | ||
| 425 | * (i.e. converted to an established socket) within the first timeout. | ||
| 426 | * If the synack was not acknowledged for 3 seconds, it means | ||
| 427 | * one of the following: the synack was lost, the ack was lost, | ||
| 428 | * the rtt is high, or nobody planned to ack (i.e. synflood). | ||
| 429 | * When the server is a bit loaded, the queue is populated with old | ||
| 430 | * open requests, reducing the effective size of the queue. | ||
| 431 | * When the server is heavily loaded, queue size reduces to zero | ||
| 432 | * after several minutes of work. That is not a synflood, | ||
| 433 | * it is normal operation. The solution is to prune entries | ||
| 434 | * that are too old, overriding the normal timeout, when the | ||
| 435 | * situation becomes dangerous. | ||
| 436 | * | ||
| 437 | * Essentially, we reserve half of the room for young | ||
| 438 | * embryos, and abort old ones without pity if the old | ||
| 439 | * ones are about to clog our table. | ||
| 440 | */ | ||
| 441 | if (lopt->qlen>>(lopt->max_qlen_log-1)) { | ||
| 442 | int young = (lopt->qlen_young<<1); | ||
| 443 | |||
| 444 | while (thresh > 2) { | ||
| 445 | if (lopt->qlen < young) | ||
| 446 | break; | ||
| 447 | thresh--; | ||
| 448 | young <<= 1; | ||
| 449 | } | ||
| 450 | } | ||
| 451 | |||
| 452 | if (queue->rskq_defer_accept) | ||
| 453 | max_retries = queue->rskq_defer_accept; | ||
| 454 | |||
| 455 | budget = 2 * (lopt->nr_table_entries / (timeout / interval)); | ||
| 456 | i = lopt->clock_hand; | ||
| 457 | |||
| 458 | do { | ||
| 459 | reqp=&lopt->syn_table[i]; | ||
| 460 | while ((req = *reqp) != NULL) { | ||
| 461 | if (time_after_eq(now, req->expires)) { | ||
| 462 | if ((req->retrans < thresh || | ||
| 463 | (inet_rsk(req)->acked && req->retrans < max_retries)) | ||
| 464 | && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) { | ||
| 465 | unsigned long timeo; | ||
| 466 | |||
| 467 | if (req->retrans++ == 0) | ||
| 468 | lopt->qlen_young--; | ||
| 469 | timeo = min((timeout << req->retrans), max_rto); | ||
| 470 | req->expires = now + timeo; | ||
| 471 | reqp = &req->dl_next; | ||
| 472 | continue; | ||
| 473 | } | ||
| 474 | |||
| 475 | /* Drop this request */ | ||
| 476 | inet_csk_reqsk_queue_unlink(parent, req, reqp); | ||
| 477 | reqsk_queue_removed(queue, req); | ||
| 478 | reqsk_free(req); | ||
| 479 | continue; | ||
| 480 | } | ||
| 481 | reqp = &req->dl_next; | ||
| 482 | } | ||
| 483 | |||
| 484 | i = (i + 1) & (lopt->nr_table_entries - 1); | ||
| 485 | |||
| 486 | } while (--budget > 0); | ||
| 487 | |||
| 488 | lopt->clock_hand = i; | ||
| 489 | |||
| 490 | if (lopt->qlen) | ||
| 491 | inet_csk_reset_keepalive_timer(parent, interval); | ||
| 492 | } | ||
| 493 | |||
| 494 | EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); | ||
| 495 | |||
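The adaptive threshold in inet_csk_reqsk_queue_prune() is easier to see in isolation: once the queue is at least half full, old entries lose one permitted retransmission for every doubling of the young-entry count that the queue length still exceeds, with a floor of 2. As a standalone function (a restatement of the loop above, not new logic):

    /* Returns how many SYN-ACK retransmissions an old request may
     * still get before being pruned. */
    static int synq_thresh(int qlen, int qlen_young, int max_qlen_log,
                           int max_retries)
    {
            int thresh = max_retries;

            if (qlen >> (max_qlen_log - 1)) {       /* at least half full */
                    int young = qlen_young << 1;

                    while (thresh > 2 && qlen >= young) {
                            thresh--;
                            young <<= 1;
                    }
            }
            return thresh;
    }

For example, with a 256-slot table (max_qlen_log = 8), qlen = 128, qlen_young = 16 and max_retries = 5, the threshold drops to 2, so stale requests are retried at most twice before being dropped.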
| 496 | struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, | ||
| 497 | const unsigned int __nocast priority) | ||
| 498 | { | ||
| 499 | struct sock *newsk = sk_clone(sk, priority); | ||
| 500 | |||
| 501 | if (newsk != NULL) { | ||
| 502 | struct inet_connection_sock *newicsk = inet_csk(newsk); | ||
| 503 | |||
| 504 | newsk->sk_state = TCP_SYN_RECV; | ||
| 505 | newicsk->icsk_bind_hash = NULL; | ||
| 506 | |||
| 507 | inet_sk(newsk)->dport = inet_rsk(req)->rmt_port; | ||
| 508 | newsk->sk_write_space = sk_stream_write_space; | ||
| 509 | |||
| 510 | newicsk->icsk_retransmits = 0; | ||
| 511 | newicsk->icsk_backoff = 0; | ||
| 512 | newicsk->icsk_probes_out = 0; | ||
| 513 | |||
| 514 | /* Deinitialize accept_queue to trap illegal accesses. */ | ||
| 515 | memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); | ||
| 516 | } | ||
| 517 | return newsk; | ||
| 518 | } | ||
| 519 | |||
| 520 | EXPORT_SYMBOL_GPL(inet_csk_clone); | ||
| 521 | |||
| 522 | /* | ||
| 523 | * At this point, there should be no process reference to this | ||
| 524 | * socket, and thus no user references at all. Therefore we | ||
| 525 | * can assume the socket waitqueue is inactive and nobody will | ||
| 526 | * try to jump onto it. | ||
| 527 | */ | ||
| 528 | void inet_csk_destroy_sock(struct sock *sk) | ||
| 529 | { | ||
| 530 | BUG_TRAP(sk->sk_state == TCP_CLOSE); | ||
| 531 | BUG_TRAP(sock_flag(sk, SOCK_DEAD)); | ||
| 532 | |||
| 533 | /* It cannot be in hash table! */ | ||
| 534 | BUG_TRAP(sk_unhashed(sk)); | ||
| 535 | |||
| 536 | /* If inet_sk(sk)->num is not 0, it must be bound */ | ||
| 537 | BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash); | ||
| 538 | |||
| 539 | sk->sk_prot->destroy(sk); | ||
| 540 | |||
| 541 | sk_stream_kill_queues(sk); | ||
| 542 | |||
| 543 | xfrm_sk_free_policy(sk); | ||
| 544 | |||
| 545 | sk_refcnt_debug_release(sk); | ||
| 546 | |||
| 547 | atomic_dec(sk->sk_prot->orphan_count); | ||
| 548 | sock_put(sk); | ||
| 549 | } | ||
| 550 | |||
| 551 | EXPORT_SYMBOL(inet_csk_destroy_sock); | ||
| 552 | |||
| 553 | int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) | ||
| 554 | { | ||
| 555 | struct inet_sock *inet = inet_sk(sk); | ||
| 556 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 557 | int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); | ||
| 558 | |||
| 559 | if (rc != 0) | ||
| 560 | return rc; | ||
| 561 | |||
| 562 | sk->sk_max_ack_backlog = 0; | ||
| 563 | sk->sk_ack_backlog = 0; | ||
| 564 | inet_csk_delack_init(sk); | ||
| 565 | |||
| 566 | /* There is a race window here: we announce ourselves as listening, | ||
| 567 | * but this transition is still not validated by get_port(). | ||
| 568 | * It is OK, because this socket enters the hash table only | ||
| 569 | * after validation is complete. | ||
| 570 | */ | ||
| 571 | sk->sk_state = TCP_LISTEN; | ||
| 572 | if (!sk->sk_prot->get_port(sk, inet->num)) { | ||
| 573 | inet->sport = htons(inet->num); | ||
| 574 | |||
| 575 | sk_dst_reset(sk); | ||
| 576 | sk->sk_prot->hash(sk); | ||
| 577 | |||
| 578 | return 0; | ||
| 579 | } | ||
| 580 | |||
| 581 | sk->sk_state = TCP_CLOSE; | ||
| 582 | __reqsk_queue_destroy(&icsk->icsk_accept_queue); | ||
| 583 | return -EADDRINUSE; | ||
| 584 | } | ||
| 585 | |||
| 586 | EXPORT_SYMBOL_GPL(inet_csk_listen_start); | ||
| 587 | |||
| 588 | /* | ||
| 589 | * This routine closes sockets which have been at least partially | ||
| 590 | * opened, but not yet accepted. | ||
| 591 | */ | ||
| 592 | void inet_csk_listen_stop(struct sock *sk) | ||
| 593 | { | ||
| 594 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 595 | struct request_sock *acc_req; | ||
| 596 | struct request_sock *req; | ||
| 597 | |||
| 598 | inet_csk_delete_keepalive_timer(sk); | ||
| 599 | |||
| 600 | /* make all the listen_opt local to us */ | ||
| 601 | acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue); | ||
| 602 | |||
| 603 | /* Following specs, it would be better either to send FIN | ||
| 604 | * (and enter FIN-WAIT-1, it is normal close) | ||
| 605 | * or to send active reset (abort). | ||
| 606 | * Certainly, it is pretty dangerous during a synflood, but it is | ||
| 607 | * a bad justification for our negligence 8) | ||
| 608 | * To be honest, we are not able to make either | ||
| 609 | * of the variants now. --ANK | ||
| 610 | */ | ||
| 611 | reqsk_queue_destroy(&icsk->icsk_accept_queue); | ||
| 612 | |||
| 613 | while ((req = acc_req) != NULL) { | ||
| 614 | struct sock *child = req->sk; | ||
| 615 | |||
| 616 | acc_req = req->dl_next; | ||
| 617 | |||
| 618 | local_bh_disable(); | ||
| 619 | bh_lock_sock(child); | ||
| 620 | BUG_TRAP(!sock_owned_by_user(child)); | ||
| 621 | sock_hold(child); | ||
| 622 | |||
| 623 | sk->sk_prot->disconnect(child, O_NONBLOCK); | ||
| 624 | |||
| 625 | sock_orphan(child); | ||
| 626 | |||
| 627 | atomic_inc(sk->sk_prot->orphan_count); | ||
| 628 | |||
| 629 | inet_csk_destroy_sock(child); | ||
| 630 | |||
| 631 | bh_unlock_sock(child); | ||
| 632 | local_bh_enable(); | ||
| 633 | sock_put(child); | ||
| 634 | |||
| 635 | sk_acceptq_removed(sk); | ||
| 636 | __reqsk_free(req); | ||
| 637 | } | ||
| 638 | BUG_TRAP(!sk->sk_ack_backlog); | ||
| 639 | } | ||
| 640 | |||
| 641 | EXPORT_SYMBOL_GPL(inet_csk_listen_stop); | ||
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c new file mode 100644 index 000000000000..71f3c7350c6e --- /dev/null +++ b/net/ipv4/inet_diag.c | |||
| @@ -0,0 +1,868 @@ | |||
| 1 | /* | ||
| 2 | * inet_diag.c Module for monitoring INET transport protocol sockets. | ||
| 3 | * | ||
| 4 | * Version: $Id: inet_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ | ||
| 5 | * | ||
| 6 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
| 7 | * | ||
| 8 | * This program is free software; you can redistribute it and/or | ||
| 9 | * modify it under the terms of the GNU General Public License | ||
| 10 | * as published by the Free Software Foundation; either version | ||
| 11 | * 2 of the License, or (at your option) any later version. | ||
| 12 | */ | ||
| 13 | |||
| 14 | #include <linux/config.h> | ||
| 15 | #include <linux/module.h> | ||
| 16 | #include <linux/types.h> | ||
| 17 | #include <linux/fcntl.h> | ||
| 18 | #include <linux/random.h> | ||
| 19 | #include <linux/cache.h> | ||
| 20 | #include <linux/init.h> | ||
| 21 | #include <linux/time.h> | ||
| 22 | |||
| 23 | #include <net/icmp.h> | ||
| 24 | #include <net/tcp.h> | ||
| 25 | #include <net/ipv6.h> | ||
| 26 | #include <net/inet_common.h> | ||
| 27 | #include <net/inet_connection_sock.h> | ||
| 28 | #include <net/inet_hashtables.h> | ||
| 29 | #include <net/inet_timewait_sock.h> | ||
| 30 | #include <net/inet6_hashtables.h> | ||
| 31 | |||
| 32 | #include <linux/inet.h> | ||
| 33 | #include <linux/stddef.h> | ||
| 34 | |||
| 35 | #include <linux/inet_diag.h> | ||
| 36 | |||
| 37 | static const struct inet_diag_handler **inet_diag_table; | ||
| 38 | |||
| 39 | struct inet_diag_entry { | ||
| 40 | u32 *saddr; | ||
| 41 | u32 *daddr; | ||
| 42 | u16 sport; | ||
| 43 | u16 dport; | ||
| 44 | u16 family; | ||
| 45 | u16 userlocks; | ||
| 46 | }; | ||
| 47 | |||
| 48 | static struct sock *idiagnl; | ||
| 49 | |||
| 50 | #define INET_DIAG_PUT(skb, attrtype, attrlen) \ | ||
| 51 | RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) | ||
| 52 | |||
| 53 | static int inet_diag_fill(struct sk_buff *skb, struct sock *sk, | ||
| 54 | int ext, u32 pid, u32 seq, u16 nlmsg_flags, | ||
| 55 | const struct nlmsghdr *unlh) | ||
| 56 | { | ||
| 57 | const struct inet_sock *inet = inet_sk(sk); | ||
| 58 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 59 | struct inet_diag_msg *r; | ||
| 60 | struct nlmsghdr *nlh; | ||
| 61 | void *info = NULL; | ||
| 62 | struct inet_diag_meminfo *minfo = NULL; | ||
| 63 | unsigned char *b = skb->tail; | ||
| 64 | const struct inet_diag_handler *handler; | ||
| 65 | |||
| 66 | handler = inet_diag_table[unlh->nlmsg_type]; | ||
| 67 | BUG_ON(handler == NULL); | ||
| 68 | |||
| 69 | nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); | ||
| 70 | nlh->nlmsg_flags = nlmsg_flags; | ||
| 71 | |||
| 72 | r = NLMSG_DATA(nlh); | ||
| 73 | if (sk->sk_state != TCP_TIME_WAIT) { | ||
| 74 | if (ext & (1 << (INET_DIAG_MEMINFO - 1))) | ||
| 75 | minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, | ||
| 76 | sizeof(*minfo)); | ||
| 77 | if (ext & (1 << (INET_DIAG_INFO - 1))) | ||
| 78 | info = INET_DIAG_PUT(skb, INET_DIAG_INFO, | ||
| 79 | handler->idiag_info_size); | ||
| 80 | |||
| 81 | if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) { | ||
| 82 | size_t len = strlen(icsk->icsk_ca_ops->name); | ||
| 83 | strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1), | ||
| 84 | icsk->icsk_ca_ops->name); | ||
| 85 | } | ||
| 86 | } | ||
| 87 | r->idiag_family = sk->sk_family; | ||
| 88 | r->idiag_state = sk->sk_state; | ||
| 89 | r->idiag_timer = 0; | ||
| 90 | r->idiag_retrans = 0; | ||
| 91 | |||
| 92 | r->id.idiag_if = sk->sk_bound_dev_if; | ||
| 93 | r->id.idiag_cookie[0] = (u32)(unsigned long)sk; | ||
| 94 | r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); | ||
| 95 | |||
| 96 | if (r->idiag_state == TCP_TIME_WAIT) { | ||
| 97 | const struct inet_timewait_sock *tw = inet_twsk(sk); | ||
| 98 | long tmo = tw->tw_ttd - jiffies; | ||
| 99 | if (tmo < 0) | ||
| 100 | tmo = 0; | ||
| 101 | |||
| 102 | r->id.idiag_sport = tw->tw_sport; | ||
| 103 | r->id.idiag_dport = tw->tw_dport; | ||
| 104 | r->id.idiag_src[0] = tw->tw_rcv_saddr; | ||
| 105 | r->id.idiag_dst[0] = tw->tw_daddr; | ||
| 106 | r->idiag_state = tw->tw_substate; | ||
| 107 | r->idiag_timer = 3; | ||
| 108 | r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ; | ||
| 109 | r->idiag_rqueue = 0; | ||
| 110 | r->idiag_wqueue = 0; | ||
| 111 | r->idiag_uid = 0; | ||
| 112 | r->idiag_inode = 0; | ||
| 113 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) | ||
| 114 | if (r->idiag_family == AF_INET6) { | ||
| 115 | const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); | ||
| 116 | |||
| 117 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, | ||
| 118 | &tcp6tw->tw_v6_rcv_saddr); | ||
| 119 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, | ||
| 120 | &tcp6tw->tw_v6_daddr); | ||
| 121 | } | ||
| 122 | #endif | ||
| 123 | nlh->nlmsg_len = skb->tail - b; | ||
| 124 | return skb->len; | ||
| 125 | } | ||
| 126 | |||
| 127 | r->id.idiag_sport = inet->sport; | ||
| 128 | r->id.idiag_dport = inet->dport; | ||
| 129 | r->id.idiag_src[0] = inet->rcv_saddr; | ||
| 130 | r->id.idiag_dst[0] = inet->daddr; | ||
| 131 | |||
| 132 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) | ||
| 133 | if (r->idiag_family == AF_INET6) { | ||
| 134 | struct ipv6_pinfo *np = inet6_sk(sk); | ||
| 135 | |||
| 136 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, | ||
| 137 | &np->rcv_saddr); | ||
| 138 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, | ||
| 139 | &np->daddr); | ||
| 140 | } | ||
| 141 | #endif | ||
| 142 | |||
| 143 | #define EXPIRES_IN_MS(tmo) ((((tmo) - jiffies) * 1000 + HZ - 1) / HZ) | ||
| 144 | |||
| 145 | if (icsk->icsk_pending == ICSK_TIME_RETRANS) { | ||
| 146 | r->idiag_timer = 1; | ||
| 147 | r->idiag_retrans = icsk->icsk_retransmits; | ||
| 148 | r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); | ||
| 149 | } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { | ||
| 150 | r->idiag_timer = 4; | ||
| 151 | r->idiag_retrans = icsk->icsk_probes_out; | ||
| 152 | r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); | ||
| 153 | } else if (timer_pending(&sk->sk_timer)) { | ||
| 154 | r->idiag_timer = 2; | ||
| 155 | r->idiag_retrans = icsk->icsk_probes_out; | ||
| 156 | r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires); | ||
| 157 | } else { | ||
| 158 | r->idiag_timer = 0; | ||
| 159 | r->idiag_expires = 0; | ||
| 160 | } | ||
| 161 | #undef EXPIRES_IN_MS | ||
| 162 | |||
| 163 | r->idiag_uid = sock_i_uid(sk); | ||
| 164 | r->idiag_inode = sock_i_ino(sk); | ||
| 165 | |||
| 166 | if (minfo) { | ||
| 167 | minfo->idiag_rmem = atomic_read(&sk->sk_rmem_alloc); | ||
| 168 | minfo->idiag_wmem = sk->sk_wmem_queued; | ||
| 169 | minfo->idiag_fmem = sk->sk_forward_alloc; | ||
| 170 | minfo->idiag_tmem = atomic_read(&sk->sk_wmem_alloc); | ||
| 171 | } | ||
| 172 | |||
| 173 | handler->idiag_get_info(sk, r, info); | ||
| 174 | |||
| 175 | if (sk->sk_state < TCP_TIME_WAIT && | ||
| 176 | icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) | ||
| 177 | icsk->icsk_ca_ops->get_info(sk, ext, skb); | ||
| 178 | |||
| 179 | nlh->nlmsg_len = skb->tail - b; | ||
| 180 | return skb->len; | ||
| 181 | |||
| 182 | rtattr_failure: | ||
| 183 | nlmsg_failure: | ||
| 184 | skb_trim(skb, b - skb->data); | ||
| 185 | return -1; | ||
| 186 | } | ||
| 187 | |||
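An editorial note on the cookie scheme used just above: idiag_cookie[] is simply the kernel address of the socket split into two 32-bit halves, and the ">> 31 >> 1" idiom is a portable ">> 32" (a single 32-bit shift of a 32-bit unsigned long would be undefined behaviour). A minimal standalone sketch of the encode/verify pair, assuming INET_DIAG_NOCOOKIE is the all-ones wildcard it is in the header:

    #include <stdint.h>

    /* Encode a kernel pointer into the two-word cookie, as
     * inet_diag_fill() does above. */
    static void cookie_encode(const void *ptr, uint32_t cookie[2])
    {
            cookie[0] = (uint32_t)(unsigned long)ptr;
            cookie[1] = (uint32_t)(((unsigned long)ptr >> 31) >> 1);
    }

    /* Mirrors the -ESTALE check in inet_diag_get_exact() below:
     * a cookie of two INET_DIAG_NOCOOKIE words means "don't verify". */
    static int cookie_matches(const void *ptr, const uint32_t cookie[2],
                              uint32_t nocookie)
    {
            uint32_t c[2];

            if (cookie[0] == nocookie && cookie[1] == nocookie)
                    return 1;
            cookie_encode(ptr, c);
            return c[0] == cookie[0] && c[1] == cookie[1];
    }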
| 188 | static int inet_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) | ||
| 189 | { | ||
| 190 | int err; | ||
| 191 | struct sock *sk; | ||
| 192 | struct inet_diag_req *req = NLMSG_DATA(nlh); | ||
| 193 | struct sk_buff *rep; | ||
| 194 | struct inet_hashinfo *hashinfo; | ||
| 195 | const struct inet_diag_handler *handler; | ||
| 196 | |||
| 197 | handler = inet_diag_table[nlh->nlmsg_type]; | ||
| 198 | BUG_ON(handler == NULL); | ||
| 199 | hashinfo = handler->idiag_hashinfo; | ||
| 200 | |||
| 201 | if (req->idiag_family == AF_INET) { | ||
| 202 | sk = inet_lookup(hashinfo, req->id.idiag_dst[0], | ||
| 203 | req->id.idiag_dport, req->id.idiag_src[0], | ||
| 204 | req->id.idiag_sport, req->id.idiag_if); | ||
| 205 | } | ||
| 206 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) | ||
| 207 | else if (req->idiag_family == AF_INET6) { | ||
| 208 | sk = inet6_lookup(hashinfo, | ||
| 209 | (struct in6_addr *)req->id.idiag_dst, | ||
| 210 | req->id.idiag_dport, | ||
| 211 | (struct in6_addr *)req->id.idiag_src, | ||
| 212 | req->id.idiag_sport, | ||
| 213 | req->id.idiag_if); | ||
| 214 | } | ||
| 215 | #endif | ||
| 216 | else { | ||
| 217 | return -EINVAL; | ||
| 218 | } | ||
| 219 | |||
| 220 | if (sk == NULL) | ||
| 221 | return -ENOENT; | ||
| 222 | |||
| 223 | err = -ESTALE; | ||
| 224 | if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE || | ||
| 225 | req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) && | ||
| 226 | ((u32)(unsigned long)sk != req->id.idiag_cookie[0] || | ||
| 227 | (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1])) | ||
| 228 | goto out; | ||
| 229 | |||
| 230 | err = -ENOMEM; | ||
| 231 | rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) + | ||
| 232 | sizeof(struct inet_diag_meminfo) + | ||
| 233 | handler->idiag_info_size + 64)), | ||
| 234 | GFP_KERNEL); | ||
| 235 | if (!rep) | ||
| 236 | goto out; | ||
| 237 | |||
| 238 | if (inet_diag_fill(rep, sk, req->idiag_ext, | ||
| 239 | NETLINK_CB(in_skb).pid, | ||
| 240 | nlh->nlmsg_seq, 0, nlh) <= 0) | ||
| 241 | BUG(); | ||
| 242 | |||
| 243 | err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid, | ||
| 244 | MSG_DONTWAIT); | ||
| 245 | if (err > 0) | ||
| 246 | err = 0; | ||
| 247 | |||
| 248 | out: | ||
| 249 | if (sk) { | ||
| 250 | if (sk->sk_state == TCP_TIME_WAIT) | ||
| 251 | inet_twsk_put((struct inet_timewait_sock *)sk); | ||
| 252 | else | ||
| 253 | sock_put(sk); | ||
| 254 | } | ||
| 255 | return err; | ||
| 256 | } | ||
| 257 | |||
| 258 | static int bitstring_match(const u32 *a1, const u32 *a2, int bits) | ||
| 259 | { | ||
| 260 | int words = bits >> 5; | ||
| 261 | |||
| 262 | bits &= 0x1f; | ||
| 263 | |||
| 264 | if (words) { | ||
| 265 | if (memcmp(a1, a2, words << 2)) | ||
| 266 | return 0; | ||
| 267 | } | ||
| 268 | if (bits) { | ||
| 269 | __u32 w1, w2; | ||
| 270 | __u32 mask; | ||
| 271 | |||
| 272 | w1 = a1[words]; | ||
| 273 | w2 = a2[words]; | ||
| 274 | |||
| 275 | mask = htonl((0xffffffff) << (32 - bits)); | ||
| 276 | |||
| 277 | if ((w1 ^ w2) & mask) | ||
| 278 | return 0; | ||
| 279 | } | ||
| 280 | |||
| 281 | return 1; | ||
| 282 | } | ||
| 283 | |||
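bitstring_match() above is a plain CIDR-style prefix compare: whole 32-bit words first, then a mask over the leftover bits, with htonl() building the mask in the network byte order the addresses are stored in. A worked fragment with made-up addresses:

    /* Does 10.1.2.3 fall inside 10.1.0.0/20?
     * words = 20 >> 5 = 0 and bits = 20, so only the masked compare runs:
     * mask = htonl(0xffffffff << 12) = htonl(0xfffff000), and
     * (w1 ^ w2) & mask == 0 here, so the result is 1 (match). */
    uint32_t addr   = htonl(0x0a010203);        /* 10.1.2.3 */
    uint32_t prefix = htonl(0x0a010000);        /* 10.1.0.0 */
    int hit = bitstring_match(&addr, &prefix, 20);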
| 284 | |||
| 285 | static int inet_diag_bc_run(const void *bc, int len, | ||
| 286 | const struct inet_diag_entry *entry) | ||
| 287 | { | ||
| 288 | while (len > 0) { | ||
| 289 | int yes = 1; | ||
| 290 | const struct inet_diag_bc_op *op = bc; | ||
| 291 | |||
| 292 | switch (op->code) { | ||
| 293 | case INET_DIAG_BC_NOP: | ||
| 294 | break; | ||
| 295 | case INET_DIAG_BC_JMP: | ||
| 296 | yes = 0; | ||
| 297 | break; | ||
| 298 | case INET_DIAG_BC_S_GE: | ||
| 299 | yes = entry->sport >= op[1].no; | ||
| 300 | break; | ||
| 301 | case INET_DIAG_BC_S_LE: | ||
| 302 | yes = entry->sport <= op[1].no; | ||
| 303 | break; | ||
| 304 | case INET_DIAG_BC_D_GE: | ||
| 305 | yes = entry->dport >= op[1].no; | ||
| 306 | break; | ||
| 307 | case INET_DIAG_BC_D_LE: | ||
| 308 | yes = entry->dport <= op[1].no; | ||
| 309 | break; | ||
| 310 | case INET_DIAG_BC_AUTO: | ||
| 311 | yes = !(entry->userlocks & SOCK_BINDPORT_LOCK); | ||
| 312 | break; | ||
| 313 | case INET_DIAG_BC_S_COND: | ||
| 314 | case INET_DIAG_BC_D_COND: { | ||
| 315 | struct inet_diag_hostcond *cond; | ||
| 316 | u32 *addr; | ||
| 317 | |||
| 318 | cond = (struct inet_diag_hostcond *)(op + 1); | ||
| 319 | if (cond->port != -1 && | ||
| 320 | cond->port != (op->code == INET_DIAG_BC_S_COND ? | ||
| 321 | entry->sport : entry->dport)) { | ||
| 322 | yes = 0; | ||
| 323 | break; | ||
| 324 | } | ||
| 325 | |||
| 326 | if (cond->prefix_len == 0) | ||
| 327 | break; | ||
| 328 | |||
| 329 | if (op->code == INET_DIAG_BC_S_COND) | ||
| 330 | addr = entry->saddr; | ||
| 331 | else | ||
| 332 | addr = entry->daddr; | ||
| 333 | |||
| 334 | if (bitstring_match(addr, cond->addr, cond->prefix_len)) | ||
| 335 | break; | ||
| 336 | if (entry->family == AF_INET6 && | ||
| 337 | cond->family == AF_INET) { | ||
| 338 | if (addr[0] == 0 && addr[1] == 0 && | ||
| 339 | addr[2] == htonl(0xffff) && | ||
| 340 | bitstring_match(addr + 3, cond->addr, | ||
| 341 | cond->prefix_len)) | ||
| 342 | break; | ||
| 343 | } | ||
| 344 | yes = 0; | ||
| 345 | break; | ||
| 346 | } | ||
| 347 | } | ||
| 348 | |||
| 349 | if (yes) { | ||
| 350 | len -= op->yes; | ||
| 351 | bc += op->yes; | ||
| 352 | } else { | ||
| 353 | len -= op->no; | ||
| 354 | bc += op->no; | ||
| 355 | } | ||
| 356 | } | ||
| 357 | return (len == 0); | ||
| 358 | } | ||
| 359 | |||
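inet_diag_bc_run() above interprets a tiny forward-only bytecode program supplied by userspace (ss compiles its filter expression into this form); each op carries relative "yes"/"no" byte offsets, and falling exactly off the end (len == 0) means accept, while jumping past it means reject. A hedged sketch of a two-slot program meaning "source port >= 1024", using the 4-byte op layout from linux/inet_diag.h:

    /* yes = 8:  on success, skip both slots and fall off the end
     *           (len reaches 0, which the interpreter accepts).
     * no  = 12: on failure, jump 4 bytes past the end; len goes
     *           negative and the walk exits with len != 0 (reject). */
    struct inet_diag_bc_op prog[2] = {
            { .code = INET_DIAG_BC_S_GE, .yes = 8, .no = 12 },
            { .no = 1024 },         /* the port operand rides in op[1].no */
    };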
| 360 | static int valid_cc(const void *bc, int len, int cc) | ||
| 361 | { | ||
| 362 | while (len >= 0) { | ||
| 363 | const struct inet_diag_bc_op *op = bc; | ||
| 364 | |||
| 365 | if (cc > len) | ||
| 366 | return 0; | ||
| 367 | if (cc == len) | ||
| 368 | return 1; | ||
| 369 | if (op->yes < 4) | ||
| 370 | return 0; | ||
| 371 | len -= op->yes; | ||
| 372 | bc += op->yes; | ||
| 373 | } | ||
| 374 | return 0; | ||
| 375 | } | ||
| 376 | |||
| 377 | static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) | ||
| 378 | { | ||
| 379 | const unsigned char *bc = bytecode; | ||
| 380 | int len = bytecode_len; | ||
| 381 | |||
| 382 | while (len > 0) { | ||
| 383 | struct inet_diag_bc_op *op = (struct inet_diag_bc_op *)bc; | ||
| 384 | |||
| 385 | //printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); | ||
| 386 | switch (op->code) { | ||
| 387 | case INET_DIAG_BC_AUTO: | ||
| 388 | case INET_DIAG_BC_S_COND: | ||
| 389 | case INET_DIAG_BC_D_COND: | ||
| 390 | case INET_DIAG_BC_S_GE: | ||
| 391 | case INET_DIAG_BC_S_LE: | ||
| 392 | case INET_DIAG_BC_D_GE: | ||
| 393 | case INET_DIAG_BC_D_LE: | ||
| 394 | if (op->yes < 4 || op->yes > len + 4) | ||
| 395 | return -EINVAL; | ||
| 396 | case INET_DIAG_BC_JMP: | ||
| 397 | if (op->no < 4 || op->no > len + 4) | ||
| 398 | return -EINVAL; | ||
| 399 | if (op->no < len && | ||
| 400 | !valid_cc(bytecode, bytecode_len, len - op->no)) | ||
| 401 | return -EINVAL; | ||
| 402 | break; | ||
| 403 | case INET_DIAG_BC_NOP: | ||
| 404 | if (op->yes < 4 || op->yes > len + 4) | ||
| 405 | return -EINVAL; | ||
| 406 | break; | ||
| 407 | default: | ||
| 408 | return -EINVAL; | ||
| 409 | } | ||
| 410 | bc += op->yes; | ||
| 411 | len -= op->yes; | ||
| 412 | } | ||
| 413 | return len == 0 ? 0 : -EINVAL; | ||
| 414 | } | ||
| 415 | |||
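inet_diag_bc_audit() walks the program the same way the interpreter does and rejects anything that could loop or escape: every "yes" offset must advance by at least one 4-byte op, a "no" offset may point at most 4 bytes past the end (the reject encoding above), and valid_cc() checks that every in-range "no" target lands on an op boundary reachable from the start. Note the fall-through from the condition cases into INET_DIAG_BC_JMP is deliberate: conditions get both of their offsets validated. For instance:

    /* Rejected by the audit: yes = 0 would never advance, so the
     * interpreter could spin on this op forever. */
    struct inet_diag_bc_op bad = { .code = INET_DIAG_BC_NOP, .yes = 0, .no = 4 };
    /* inet_diag_bc_audit(&bad, sizeof(bad)) -> -EINVAL (op->yes < 4) */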
| 416 | static int inet_diag_dump_sock(struct sk_buff *skb, struct sock *sk, | ||
| 417 | struct netlink_callback *cb) | ||
| 418 | { | ||
| 419 | struct inet_diag_req *r = NLMSG_DATA(cb->nlh); | ||
| 420 | |||
| 421 | if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { | ||
| 422 | struct inet_diag_entry entry; | ||
| 423 | struct rtattr *bc = (struct rtattr *)(r + 1); | ||
| 424 | struct inet_sock *inet = inet_sk(sk); | ||
| 425 | |||
| 426 | entry.family = sk->sk_family; | ||
| 427 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) | ||
| 428 | if (entry.family == AF_INET6) { | ||
| 429 | struct ipv6_pinfo *np = inet6_sk(sk); | ||
| 430 | |||
| 431 | entry.saddr = np->rcv_saddr.s6_addr32; | ||
| 432 | entry.daddr = np->daddr.s6_addr32; | ||
| 433 | } else | ||
| 434 | #endif | ||
| 435 | { | ||
| 436 | entry.saddr = &inet->rcv_saddr; | ||
| 437 | entry.daddr = &inet->daddr; | ||
| 438 | } | ||
| 439 | entry.sport = inet->num; | ||
| 440 | entry.dport = ntohs(inet->dport); | ||
| 441 | entry.userlocks = sk->sk_userlocks; | ||
| 442 | |||
| 443 | if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) | ||
| 444 | return 0; | ||
| 445 | } | ||
| 446 | |||
| 447 | return inet_diag_fill(skb, sk, r->idiag_ext, NETLINK_CB(cb->skb).pid, | ||
| 448 | cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); | ||
| 449 | } | ||
| 450 | |||
| 451 | static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, | ||
| 452 | struct request_sock *req, | ||
| 453 | u32 pid, u32 seq, | ||
| 454 | const struct nlmsghdr *unlh) | ||
| 455 | { | ||
| 456 | const struct inet_request_sock *ireq = inet_rsk(req); | ||
| 457 | struct inet_sock *inet = inet_sk(sk); | ||
| 458 | unsigned char *b = skb->tail; | ||
| 459 | struct inet_diag_msg *r; | ||
| 460 | struct nlmsghdr *nlh; | ||
| 461 | long tmo; | ||
| 462 | |||
| 463 | nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); | ||
| 464 | nlh->nlmsg_flags = NLM_F_MULTI; | ||
| 465 | r = NLMSG_DATA(nlh); | ||
| 466 | |||
| 467 | r->idiag_family = sk->sk_family; | ||
| 468 | r->idiag_state = TCP_SYN_RECV; | ||
| 469 | r->idiag_timer = 1; | ||
| 470 | r->idiag_retrans = req->retrans; | ||
| 471 | |||
| 472 | r->id.idiag_if = sk->sk_bound_dev_if; | ||
| 473 | r->id.idiag_cookie[0] = (u32)(unsigned long)req; | ||
| 474 | r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1); | ||
| 475 | |||
| 476 | tmo = req->expires - jiffies; | ||
| 477 | if (tmo < 0) | ||
| 478 | tmo = 0; | ||
| 479 | |||
| 480 | r->id.idiag_sport = inet->sport; | ||
| 481 | r->id.idiag_dport = ireq->rmt_port; | ||
| 482 | r->id.idiag_src[0] = ireq->loc_addr; | ||
| 483 | r->id.idiag_dst[0] = ireq->rmt_addr; | ||
| 484 | r->idiag_expires = jiffies_to_msecs(tmo); | ||
| 485 | r->idiag_rqueue = 0; | ||
| 486 | r->idiag_wqueue = 0; | ||
| 487 | r->idiag_uid = sock_i_uid(sk); | ||
| 488 | r->idiag_inode = 0; | ||
| 489 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) | ||
| 490 | if (r->idiag_family == AF_INET6) { | ||
| 491 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, | ||
| 492 | &tcp6_rsk(req)->loc_addr); | ||
| 493 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, | ||
| 494 | &tcp6_rsk(req)->rmt_addr); | ||
| 495 | } | ||
| 496 | #endif | ||
| 497 | nlh->nlmsg_len = skb->tail - b; | ||
| 498 | |||
| 499 | return skb->len; | ||
| 500 | |||
| 501 | nlmsg_failure: | ||
| 502 | skb_trim(skb, b - skb->data); | ||
| 503 | return -1; | ||
| 504 | } | ||
| 505 | |||
| 506 | static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, | ||
| 507 | struct netlink_callback *cb) | ||
| 508 | { | ||
| 509 | struct inet_diag_entry entry; | ||
| 510 | struct inet_diag_req *r = NLMSG_DATA(cb->nlh); | ||
| 511 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 512 | struct listen_sock *lopt; | ||
| 513 | struct rtattr *bc = NULL; | ||
| 514 | struct inet_sock *inet = inet_sk(sk); | ||
| 515 | int j, s_j; | ||
| 516 | int reqnum, s_reqnum; | ||
| 517 | int err = 0; | ||
| 518 | |||
| 519 | s_j = cb->args[3]; | ||
| 520 | s_reqnum = cb->args[4]; | ||
| 521 | |||
| 522 | if (s_j > 0) | ||
| 523 | s_j--; | ||
| 524 | |||
| 525 | entry.family = sk->sk_family; | ||
| 526 | |||
| 527 | read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); | ||
| 528 | |||
| 529 | lopt = icsk->icsk_accept_queue.listen_opt; | ||
| 530 | if (!lopt || !lopt->qlen) | ||
| 531 | goto out; | ||
| 532 | |||
| 533 | if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { | ||
| 534 | bc = (struct rtattr *)(r + 1); | ||
| 535 | entry.sport = inet->num; | ||
| 536 | entry.userlocks = sk->sk_userlocks; | ||
| 537 | } | ||
| 538 | |||
| 539 | for (j = s_j; j < lopt->nr_table_entries; j++) { | ||
| 540 | struct request_sock *req, *head = lopt->syn_table[j]; | ||
| 541 | |||
| 542 | reqnum = 0; | ||
| 543 | for (req = head; req; reqnum++, req = req->dl_next) { | ||
| 544 | struct inet_request_sock *ireq = inet_rsk(req); | ||
| 545 | |||
| 546 | if (reqnum < s_reqnum) | ||
| 547 | continue; | ||
| 548 | if (r->id.idiag_dport != ireq->rmt_port && | ||
| 549 | r->id.idiag_dport) | ||
| 550 | continue; | ||
| 551 | |||
| 552 | if (bc) { | ||
| 553 | entry.saddr = | ||
| 554 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) | ||
| 555 | (entry.family == AF_INET6) ? | ||
| 556 | tcp6_rsk(req)->loc_addr.s6_addr32 : | ||
| 557 | #endif | ||
| 558 | &ireq->loc_addr; | ||
| 559 | entry.daddr = | ||
| 560 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) | ||
| 561 | (entry.family == AF_INET6) ? | ||
| 562 | tcp6_rsk(req)->rmt_addr.s6_addr32 : | ||
| 563 | #endif | ||
| 564 | &ireq->rmt_addr; | ||
| 565 | entry.dport = ntohs(ireq->rmt_port); | ||
| 566 | |||
| 567 | if (!inet_diag_bc_run(RTA_DATA(bc), | ||
| 568 | RTA_PAYLOAD(bc), &entry)) | ||
| 569 | continue; | ||
| 570 | } | ||
| 571 | |||
| 572 | err = inet_diag_fill_req(skb, sk, req, | ||
| 573 | NETLINK_CB(cb->skb).pid, | ||
| 574 | cb->nlh->nlmsg_seq, cb->nlh); | ||
| 575 | if (err < 0) { | ||
| 576 | cb->args[3] = j + 1; | ||
| 577 | cb->args[4] = reqnum; | ||
| 578 | goto out; | ||
| 579 | } | ||
| 580 | } | ||
| 581 | |||
| 582 | s_reqnum = 0; | ||
| 583 | } | ||
| 584 | |||
| 585 | out: | ||
| 586 | read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); | ||
| 587 | |||
| 588 | return err; | ||
| 589 | } | ||
| 590 | |||
| 591 | static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) | ||
| 592 | { | ||
| 593 | int i, num; | ||
| 594 | int s_i, s_num; | ||
| 595 | struct inet_diag_req *r = NLMSG_DATA(cb->nlh); | ||
| 596 | const struct inet_diag_handler *handler; | ||
| 597 | struct inet_hashinfo *hashinfo; | ||
| 598 | |||
| 599 | handler = inet_diag_table[cb->nlh->nlmsg_type]; | ||
| 600 | BUG_ON(handler == NULL); | ||
| 601 | hashinfo = handler->idiag_hashinfo; | ||
| 602 | |||
| 603 | s_i = cb->args[1]; | ||
| 604 | s_num = num = cb->args[2]; | ||
| 605 | |||
| 606 | if (cb->args[0] == 0) { | ||
| 607 | if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) | ||
| 608 | goto skip_listen_ht; | ||
| 609 | |||
| 610 | inet_listen_lock(hashinfo); | ||
| 611 | for (i = s_i; i < INET_LHTABLE_SIZE; i++) { | ||
| 612 | struct sock *sk; | ||
| 613 | struct hlist_node *node; | ||
| 614 | |||
| 615 | num = 0; | ||
| 616 | sk_for_each(sk, node, &hashinfo->listening_hash[i]) { | ||
| 617 | struct inet_sock *inet = inet_sk(sk); | ||
| 618 | |||
| 619 | if (num < s_num) { | ||
| 620 | num++; | ||
| 621 | continue; | ||
| 622 | } | ||
| 623 | |||
| 624 | if (r->id.idiag_sport != inet->sport && | ||
| 625 | r->id.idiag_sport) | ||
| 626 | goto next_listen; | ||
| 627 | |||
| 628 | if (!(r->idiag_states & TCPF_LISTEN) || | ||
| 629 | r->id.idiag_dport || | ||
| 630 | cb->args[3] > 0) | ||
| 631 | goto syn_recv; | ||
| 632 | |||
| 633 | if (inet_diag_dump_sock(skb, sk, cb) < 0) { | ||
| 634 | inet_listen_unlock(hashinfo); | ||
| 635 | goto done; | ||
| 636 | } | ||
| 637 | |||
| 638 | syn_recv: | ||
| 639 | if (!(r->idiag_states & TCPF_SYN_RECV)) | ||
| 640 | goto next_listen; | ||
| 641 | |||
| 642 | if (inet_diag_dump_reqs(skb, sk, cb) < 0) { | ||
| 643 | inet_listen_unlock(hashinfo); | ||
| 644 | goto done; | ||
| 645 | } | ||
| 646 | |||
| 647 | next_listen: | ||
| 648 | cb->args[3] = 0; | ||
| 649 | cb->args[4] = 0; | ||
| 650 | ++num; | ||
| 651 | } | ||
| 652 | |||
| 653 | s_num = 0; | ||
| 654 | cb->args[3] = 0; | ||
| 655 | cb->args[4] = 0; | ||
| 656 | } | ||
| 657 | inet_listen_unlock(hashinfo); | ||
| 658 | skip_listen_ht: | ||
| 659 | cb->args[0] = 1; | ||
| 660 | s_i = num = s_num = 0; | ||
| 661 | } | ||
| 662 | |||
| 663 | if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) | ||
| 664 | return skb->len; | ||
| 665 | |||
| 666 | for (i = s_i; i < hashinfo->ehash_size; i++) { | ||
| 667 | struct inet_ehash_bucket *head = &hashinfo->ehash[i]; | ||
| 668 | struct sock *sk; | ||
| 669 | struct hlist_node *node; | ||
| 670 | |||
| 671 | if (i > s_i) | ||
| 672 | s_num = 0; | ||
| 673 | |||
| 674 | read_lock_bh(&head->lock); | ||
| 675 | |||
| 676 | num = 0; | ||
| 677 | sk_for_each(sk, node, &head->chain) { | ||
| 678 | struct inet_sock *inet = inet_sk(sk); | ||
| 679 | |||
| 680 | if (num < s_num) | ||
| 681 | goto next_normal; | ||
| 682 | if (!(r->idiag_states & (1 << sk->sk_state))) | ||
| 683 | goto next_normal; | ||
| 684 | if (r->id.idiag_sport != inet->sport && | ||
| 685 | r->id.idiag_sport) | ||
| 686 | goto next_normal; | ||
| 687 | if (r->id.idiag_dport != inet->dport && r->id.idiag_dport) | ||
| 688 | goto next_normal; | ||
| 689 | if (inet_diag_dump_sock(skb, sk, cb) < 0) { | ||
| 690 | read_unlock_bh(&head->lock); | ||
| 691 | goto done; | ||
| 692 | } | ||
| 693 | next_normal: | ||
| 694 | ++num; | ||
| 695 | } | ||
| 696 | |||
| 697 | if (r->idiag_states & TCPF_TIME_WAIT) { | ||
| 698 | sk_for_each(sk, node, | ||
| 699 | &hashinfo->ehash[i + hashinfo->ehash_size].chain) { | ||
| 700 | struct inet_sock *inet = inet_sk(sk); | ||
| 701 | |||
| 702 | if (num < s_num) | ||
| 703 | goto next_dying; | ||
| 704 | if (r->id.idiag_sport != inet->sport && | ||
| 705 | r->id.idiag_sport) | ||
| 706 | goto next_dying; | ||
| 707 | if (r->id.idiag_dport != inet->dport && | ||
| 708 | r->id.idiag_dport) | ||
| 709 | goto next_dying; | ||
| 710 | if (inet_diag_dump_sock(skb, sk, cb) < 0) { | ||
| 711 | read_unlock_bh(&head->lock); | ||
| 712 | goto done; | ||
| 713 | } | ||
| 714 | next_dying: | ||
| 715 | ++num; | ||
| 716 | } | ||
| 717 | } | ||
| 718 | read_unlock_bh(&head->lock); | ||
| 719 | } | ||
| 720 | |||
| 721 | done: | ||
| 722 | cb->args[1] = i; | ||
| 723 | cb->args[2] = num; | ||
| 724 | return skb->len; | ||
| 725 | } | ||
| 726 | |||
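Because a netlink dump spans multiple response skbs, inet_diag_dump() keeps its cursor in cb->args[]. The layout, as used above (a reading aid, not an ABI):

    /* cb->args[0]  0 = still in listening_hash, 1 = ehash phase
     * cb->args[1]  current hash bucket index
     * cb->args[2]  sockets already emitted from that bucket
     * cb->args[3]  1 + syn_table bucket inside the current listener
     *              (0 means "not mid-listener"; hence the s_j-- in
     *              inet_diag_dump_reqs())
     * cb->args[4]  request index within that syn_table bucket */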
| 727 | static int inet_diag_dump_done(struct netlink_callback *cb) | ||
| 728 | { | ||
| 729 | return 0; | ||
| 730 | } | ||
| 731 | |||
| 732 | |||
| 733 | static __inline__ int | ||
| 734 | inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | ||
| 735 | { | ||
| 736 | if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) | ||
| 737 | return 0; | ||
| 738 | |||
| 739 | if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX) | ||
| 740 | goto err_inval; | ||
| 741 | |||
| 742 | if (inet_diag_table[nlh->nlmsg_type] == NULL) | ||
| 743 | return -ENOENT; | ||
| 744 | |||
| 745 | if (NLMSG_LENGTH(sizeof(struct inet_diag_req)) > skb->len) | ||
| 746 | goto err_inval; | ||
| 747 | |||
| 748 | if (nlh->nlmsg_flags&NLM_F_DUMP) { | ||
| 749 | if (nlh->nlmsg_len > | ||
| 750 | (4 + NLMSG_SPACE(sizeof(struct inet_diag_req)))) { | ||
| 751 | struct rtattr *rta = (void *)(NLMSG_DATA(nlh) + | ||
| 752 | sizeof(struct inet_diag_req)); | ||
| 753 | if (rta->rta_type != INET_DIAG_REQ_BYTECODE || | ||
| 754 | rta->rta_len < 8 || | ||
| 755 | rta->rta_len > | ||
| 756 | (nlh->nlmsg_len - | ||
| 757 | NLMSG_SPACE(sizeof(struct inet_diag_req)))) | ||
| 758 | goto err_inval; | ||
| 759 | if (inet_diag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta))) | ||
| 760 | goto err_inval; | ||
| 761 | } | ||
| 762 | return netlink_dump_start(idiagnl, skb, nlh, | ||
| 763 | inet_diag_dump, | ||
| 764 | inet_diag_dump_done); | ||
| 765 | } else { | ||
| 766 | return inet_diag_get_exact(skb, nlh); | ||
| 767 | } | ||
| 768 | |||
| 769 | err_inval: | ||
| 770 | return -EINVAL; | ||
| 771 | } | ||
| 772 | |||
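So a request must carry NLM_F_REQUEST; with NLM_F_DUMP it starts a filtered dump (after auditing any attached bytecode), otherwise it is answered as a single exact lookup. A hedged sketch of the userspace side of such an exact query, assuming the TCPDIAG_GETSOCK message type the TCP handler registers:

    #include <sys/socket.h>
    #include <linux/netlink.h>
    #include <linux/inet_diag.h>

    int main(void)
    {
            struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
            struct {
                    struct nlmsghdr      nlh;
                    struct inet_diag_req req;
            } q = {
                    .nlh = {
                            .nlmsg_len   = sizeof(q),
                            .nlmsg_type  = TCPDIAG_GETSOCK,
                            .nlmsg_flags = NLM_F_REQUEST,
                    },
                    .req = {
                            .idiag_family = AF_INET,
                            /* fill id.idiag_{src,dst,sport,dport,if};
                             * set both id.idiag_cookie words to
                             * INET_DIAG_NOCOOKIE to skip the -ESTALE
                             * cookie check */
                    },
            };
            int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_INET_DIAG);

            /* One inet_diag_req out, one inet_diag_msg back;
             * error handling omitted, illustrative only. */
            sendto(fd, &q, sizeof(q), 0,
                   (struct sockaddr *)&kernel, sizeof(kernel));
            return 0;
    }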
| 773 | |||
| 774 | static inline void inet_diag_rcv_skb(struct sk_buff *skb) | ||
| 775 | { | ||
| 776 | int err; | ||
| 777 | struct nlmsghdr * nlh; | ||
| 778 | |||
| 779 | if (skb->len >= NLMSG_SPACE(0)) { | ||
| 780 | nlh = (struct nlmsghdr *)skb->data; | ||
| 781 | if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) | ||
| 782 | return; | ||
| 783 | err = inet_diag_rcv_msg(skb, nlh); | ||
| 784 | if (err || nlh->nlmsg_flags & NLM_F_ACK) | ||
| 785 | netlink_ack(skb, nlh, err); | ||
| 786 | } | ||
| 787 | } | ||
| 788 | |||
| 789 | static void inet_diag_rcv(struct sock *sk, int len) | ||
| 790 | { | ||
| 791 | struct sk_buff *skb; | ||
| 792 | unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); | ||
| 793 | |||
| 794 | while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) { | ||
| 795 | inet_diag_rcv_skb(skb); | ||
| 796 | kfree_skb(skb); | ||
| 797 | } | ||
| 798 | } | ||
| 799 | |||
| 800 | static DEFINE_SPINLOCK(inet_diag_register_lock); | ||
| 801 | |||
| 802 | int inet_diag_register(const struct inet_diag_handler *h) | ||
| 803 | { | ||
| 804 | const __u16 type = h->idiag_type; | ||
| 805 | int err = -EINVAL; | ||
| 806 | |||
| 807 | if (type >= INET_DIAG_GETSOCK_MAX) | ||
| 808 | goto out; | ||
| 809 | |||
| 810 | spin_lock(&inet_diag_register_lock); | ||
| 811 | err = -EEXIST; | ||
| 812 | if (inet_diag_table[type] == NULL) { | ||
| 813 | inet_diag_table[type] = h; | ||
| 814 | err = 0; | ||
| 815 | } | ||
| 816 | spin_unlock(&inet_diag_register_lock); | ||
| 817 | out: | ||
| 818 | return err; | ||
| 819 | } | ||
| 820 | EXPORT_SYMBOL_GPL(inet_diag_register); | ||
| 821 | |||
| 822 | void inet_diag_unregister(const struct inet_diag_handler *h) | ||
| 823 | { | ||
| 824 | const __u16 type = h->idiag_type; | ||
| 825 | |||
| 826 | if (type >= INET_DIAG_GETSOCK_MAX) | ||
| 827 | return; | ||
| 828 | |||
| 829 | spin_lock(&inet_diag_register_lock); | ||
| 830 | inet_diag_table[type] = NULL; | ||
| 831 | spin_unlock(&inet_diag_register_lock); | ||
| 832 | |||
| 833 | synchronize_rcu(); | ||
| 834 | } | ||
| 835 | EXPORT_SYMBOL_GPL(inet_diag_unregister); | ||
| 836 | |||
| 837 | static int __init inet_diag_init(void) | ||
| 838 | { | ||
| 839 | const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX * | ||
| 840 | sizeof(struct inet_diag_handler *)); | ||
| 841 | int err = -ENOMEM; | ||
| 842 | |||
| 843 | inet_diag_table = kmalloc(inet_diag_table_size, GFP_KERNEL); | ||
| 844 | if (!inet_diag_table) | ||
| 845 | goto out; | ||
| 846 | |||
| 847 | memset(inet_diag_table, 0, inet_diag_table_size); | ||
| 848 | idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, 0, inet_diag_rcv, | ||
| 849 | THIS_MODULE); | ||
| 850 | if (idiagnl == NULL) | ||
| 851 | goto out_free_table; | ||
| 852 | err = 0; | ||
| 853 | out: | ||
| 854 | return err; | ||
| 855 | out_free_table: | ||
| 856 | kfree(inet_diag_table); | ||
| 857 | goto out; | ||
| 858 | } | ||
| 859 | |||
| 860 | static void __exit inet_diag_exit(void) | ||
| 861 | { | ||
| 862 | sock_release(idiagnl->sk_socket); | ||
| 863 | kfree(inet_diag_table); | ||
| 864 | } | ||
| 865 | |||
| 866 | module_init(inet_diag_init); | ||
| 867 | module_exit(inet_diag_exit); | ||
| 868 | MODULE_LICENSE("GPL"); | ||
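Protocols plug into this module through inet_diag_register(). A sketch of a handler registration, modelled on what net/ipv4/tcp_diag.c does in this same commit; the exact tcp_hashinfo/tcp_diag_get_info/tcp_info names are taken from the TCP case and should be read as illustrative:

    static struct inet_diag_handler tcp_diag_handler = {
            .idiag_hashinfo  = &tcp_hashinfo,       /* ehash/bhash/lhash tables */
            .idiag_get_info  = tcp_diag_get_info,   /* fills INET_DIAG_INFO */
            .idiag_type      = TCPDIAG_GETSOCK,     /* nlmsg_type it answers */
            .idiag_info_size = sizeof(struct tcp_info),
    };

    static int __init tcp_diag_init(void)
    {
            return inet_diag_register(&tcp_diag_handler);
    }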
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c new file mode 100644 index 000000000000..e8d29fe736d2 --- /dev/null +++ b/net/ipv4/inet_hashtables.c | |||
| @@ -0,0 +1,165 @@ | |||
| 1 | /* | ||
| 2 | * INET An implementation of the TCP/IP protocol suite for the LINUX | ||
| 3 | * operating system. INET is implemented using the BSD Socket | ||
| 4 | * interface as the means of communication with the user level. | ||
| 5 | * | ||
| 6 | * Generic INET transport hashtables | ||
| 7 | * | ||
| 8 | * Authors: Lotsa people, from code originally in tcp | ||
| 9 | * | ||
| 10 | * This program is free software; you can redistribute it and/or | ||
| 11 | * modify it under the terms of the GNU General Public License | ||
| 12 | * as published by the Free Software Foundation; either version | ||
| 13 | * 2 of the License, or (at your option) any later version. | ||
| 14 | */ | ||
| 15 | |||
| 16 | #include <linux/config.h> | ||
| 17 | #include <linux/module.h> | ||
| 18 | #include <linux/sched.h> | ||
| 19 | #include <linux/slab.h> | ||
| 20 | #include <linux/wait.h> | ||
| 21 | |||
| 22 | #include <net/inet_connection_sock.h> | ||
| 23 | #include <net/inet_hashtables.h> | ||
| 24 | |||
| 25 | /* | ||
| 26 | * Allocate and initialize a new local port bind bucket. | ||
| 27 | * The bindhash mutex for snum's hash chain must be held here. | ||
| 28 | */ | ||
| 29 | struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep, | ||
| 30 | struct inet_bind_hashbucket *head, | ||
| 31 | const unsigned short snum) | ||
| 32 | { | ||
| 33 | struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC); | ||
| 34 | |||
| 35 | if (tb != NULL) { | ||
| 36 | tb->port = snum; | ||
| 37 | tb->fastreuse = 0; | ||
| 38 | INIT_HLIST_HEAD(&tb->owners); | ||
| 39 | hlist_add_head(&tb->node, &head->chain); | ||
| 40 | } | ||
| 41 | return tb; | ||
| 42 | } | ||
| 43 | |||
| 44 | EXPORT_SYMBOL(inet_bind_bucket_create); | ||
| 45 | |||
| 46 | /* | ||
| 47 | * Caller must hold hashbucket lock for this tb with local BH disabled | ||
| 48 | */ | ||
| 49 | void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb) | ||
| 50 | { | ||
| 51 | if (hlist_empty(&tb->owners)) { | ||
| 52 | __hlist_del(&tb->node); | ||
| 53 | kmem_cache_free(cachep, tb); | ||
| 54 | } | ||
| 55 | } | ||
| 56 | |||
| 57 | void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, | ||
| 58 | const unsigned short snum) | ||
| 59 | { | ||
| 60 | inet_sk(sk)->num = snum; | ||
| 61 | sk_add_bind_node(sk, &tb->owners); | ||
| 62 | inet_csk(sk)->icsk_bind_hash = tb; | ||
| 63 | } | ||
| 64 | |||
| 65 | EXPORT_SYMBOL(inet_bind_hash); | ||
| 66 | |||
| 67 | /* | ||
| 68 | * Get rid of any references to a local port held by the given sock. | ||
| 69 | */ | ||
| 70 | static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) | ||
| 71 | { | ||
| 72 | const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size); | ||
| 73 | struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; | ||
| 74 | struct inet_bind_bucket *tb; | ||
| 75 | |||
| 76 | spin_lock(&head->lock); | ||
| 77 | tb = inet_csk(sk)->icsk_bind_hash; | ||
| 78 | __sk_del_bind_node(sk); | ||
| 79 | inet_csk(sk)->icsk_bind_hash = NULL; | ||
| 80 | inet_sk(sk)->num = 0; | ||
| 81 | inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); | ||
| 82 | spin_unlock(&head->lock); | ||
| 83 | } | ||
| 84 | |||
| 85 | void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) | ||
| 86 | { | ||
| 87 | local_bh_disable(); | ||
| 88 | __inet_put_port(hashinfo, sk); | ||
| 89 | local_bh_enable(); | ||
| 90 | } | ||
| 91 | |||
| 92 | EXPORT_SYMBOL(inet_put_port); | ||
| 93 | |||
| 94 | /* | ||
| 95 | * Waiting without WQ_FLAG_EXCLUSIVE is fine on UP but can be very bad on SMP. | ||
| 96 | * When several writers sleep and the reader wakes them up, all but one | ||
| 97 | * immediately hit the write lock and grab all the CPUs. Exclusive sleep solves | ||
| 98 | * this, _but_ remember that it adds useless work on UP machines (a wakeup on | ||
| 99 | * each exclusive lock release). It really should be ifdefed. | ||
| 100 | */ | ||
| 101 | void inet_listen_wlock(struct inet_hashinfo *hashinfo) | ||
| 102 | { | ||
| 103 | write_lock(&hashinfo->lhash_lock); | ||
| 104 | |||
| 105 | if (atomic_read(&hashinfo->lhash_users)) { | ||
| 106 | DEFINE_WAIT(wait); | ||
| 107 | |||
| 108 | for (;;) { | ||
| 109 | prepare_to_wait_exclusive(&hashinfo->lhash_wait, | ||
| 110 | &wait, TASK_UNINTERRUPTIBLE); | ||
| 111 | if (!atomic_read(&hashinfo->lhash_users)) | ||
| 112 | break; | ||
| 113 | write_unlock_bh(&hashinfo->lhash_lock); | ||
| 114 | schedule(); | ||
| 115 | write_lock_bh(&hashinfo->lhash_lock); | ||
| 116 | } | ||
| 117 | |||
| 118 | finish_wait(&hashinfo->lhash_wait, &wait); | ||
| 119 | } | ||
| 120 | } | ||
| 121 | |||
| 122 | EXPORT_SYMBOL(inet_listen_wlock); | ||
| 123 | |||
| 124 | /* | ||
| 125 | * Don't inline this cruft. There are some nice properties to exploit here. The | ||
| 126 | * BSD API does not allow a listening sock to specify the remote port nor the | ||
| 127 | * remote address for the connection. So always assume those are both | ||
| 128 | * wildcarded during the search since they can never be otherwise. | ||
| 129 | */ | ||
| 130 | struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr, | ||
| 131 | const unsigned short hnum, const int dif) | ||
| 132 | { | ||
| 133 | struct sock *result = NULL, *sk; | ||
| 134 | const struct hlist_node *node; | ||
| 135 | int hiscore = -1; | ||
| 136 | |||
| 137 | sk_for_each(sk, node, head) { | ||
| 138 | const struct inet_sock *inet = inet_sk(sk); | ||
| 139 | |||
| 140 | if (inet->num == hnum && !ipv6_only_sock(sk)) { | ||
| 141 | const __u32 rcv_saddr = inet->rcv_saddr; | ||
| 142 | int score = sk->sk_family == PF_INET ? 1 : 0; | ||
| 143 | |||
| 144 | if (rcv_saddr) { | ||
| 145 | if (rcv_saddr != daddr) | ||
| 146 | continue; | ||
| 147 | score += 2; | ||
| 148 | } | ||
| 149 | if (sk->sk_bound_dev_if) { | ||
| 150 | if (sk->sk_bound_dev_if != dif) | ||
| 151 | continue; | ||
| 152 | score += 2; | ||
| 153 | } | ||
| 154 | if (score == 5) | ||
| 155 | return sk; | ||
| 156 | if (score > hiscore) { | ||
| 157 | hiscore = score; | ||
| 158 | result = sk; | ||
| 159 | } | ||
| 160 | } | ||
| 161 | } | ||
| 162 | return result; | ||
| 163 | } | ||
| 164 | |||
| 165 | EXPORT_SYMBOL_GPL(__inet_lookup_listener); | ||
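__inet_lookup_listener() scores candidates rather than demanding exact matches: 1 point for an AF_INET socket, 2 for a matching bound address, 2 for a matching bound device; a perfect 5 returns immediately, otherwise the best score so far wins, so a listener bound to a specific address beats a wildcard one. A worked example (hypothetical addresses):

    /* Two listeners on port 80: one bound to 0.0.0.0, one to
     * 192.168.0.1. For a packet to 192.168.0.1:80 (no bound device):
     *   wildcard listener:  score 1          (kept as fallback)
     *   specific listener:  score 1 + 2 = 3  (higher, wins)
     * Only family + address + device all matching reaches 5 and
     * short-circuits the walk. */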
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c new file mode 100644 index 000000000000..4d1502a49852 --- /dev/null +++ b/net/ipv4/inet_timewait_sock.c | |||
| @@ -0,0 +1,384 @@ | |||
| 1 | /* | ||
| 2 | * INET An implementation of the TCP/IP protocol suite for the LINUX | ||
| 3 | * operating system. INET is implemented using the BSD Socket | ||
| 4 | * interface as the means of communication with the user level. | ||
| 5 | * | ||
| 6 | * Generic TIME_WAIT sockets functions | ||
| 7 | * | ||
| 8 | * From code originally in TCP | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include <linux/config.h> | ||
| 12 | |||
| 13 | #include <net/inet_hashtables.h> | ||
| 14 | #include <net/inet_timewait_sock.h> | ||
| 15 | #include <net/ip.h> | ||
| 16 | |||
| 17 | /* Must be called with locally disabled BHs. */ | ||
| 18 | void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) | ||
| 19 | { | ||
| 20 | struct inet_bind_hashbucket *bhead; | ||
| 21 | struct inet_bind_bucket *tb; | ||
| 22 | /* Unlink from established hashes. */ | ||
| 23 | struct inet_ehash_bucket *ehead = &hashinfo->ehash[tw->tw_hashent]; | ||
| 24 | |||
| 25 | write_lock(&ehead->lock); | ||
| 26 | if (hlist_unhashed(&tw->tw_node)) { | ||
| 27 | write_unlock(&ehead->lock); | ||
| 28 | return; | ||
| 29 | } | ||
| 30 | __hlist_del(&tw->tw_node); | ||
| 31 | sk_node_init(&tw->tw_node); | ||
| 32 | write_unlock(&ehead->lock); | ||
| 33 | |||
| 34 | /* Disassociate with bind bucket. */ | ||
| 35 | bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)]; | ||
| 36 | spin_lock(&bhead->lock); | ||
| 37 | tb = tw->tw_tb; | ||
| 38 | __hlist_del(&tw->tw_bind_node); | ||
| 39 | tw->tw_tb = NULL; | ||
| 40 | inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); | ||
| 41 | spin_unlock(&bhead->lock); | ||
| 42 | #ifdef SOCK_REFCNT_DEBUG | ||
| 43 | if (atomic_read(&tw->tw_refcnt) != 1) { | ||
| 44 | printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n", | ||
| 45 | tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); | ||
| 46 | } | ||
| 47 | #endif | ||
| 48 | inet_twsk_put(tw); | ||
| 49 | } | ||
| 50 | |||
| 51 | EXPORT_SYMBOL_GPL(__inet_twsk_kill); | ||
| 52 | |||
| 53 | /* | ||
| 54 | * Enter the time wait state. This is called with locally disabled BH. | ||
| 55 | * Essentially we whip up a timewait bucket, copy the relevant info into it | ||
| 56 | * from the SK, and mess with hash chains and list linkage. | ||
| 57 | */ | ||
| 58 | void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, | ||
| 59 | struct inet_hashinfo *hashinfo) | ||
| 60 | { | ||
| 61 | const struct inet_sock *inet = inet_sk(sk); | ||
| 62 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 63 | struct inet_ehash_bucket *ehead = &hashinfo->ehash[sk->sk_hashent]; | ||
| 64 | struct inet_bind_hashbucket *bhead; | ||
| 65 | /* Step 1: Put TW into bind hash. Original socket stays there too. | ||
| 66 | Note that any socket with inet->num != 0 MUST be bound in | ||
| 67 | binding cache, even if it is closed. | ||
| 68 | */ | ||
| 69 | bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)]; | ||
| 70 | spin_lock(&bhead->lock); | ||
| 71 | tw->tw_tb = icsk->icsk_bind_hash; | ||
| 72 | BUG_TRAP(icsk->icsk_bind_hash); | ||
| 73 | inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); | ||
| 74 | spin_unlock(&bhead->lock); | ||
| 75 | |||
| 76 | write_lock(&ehead->lock); | ||
| 77 | |||
| 78 | /* Step 2: Remove SK from established hash. */ | ||
| 79 | if (__sk_del_node_init(sk)) | ||
| 80 | sock_prot_dec_use(sk->sk_prot); | ||
| 81 | |||
| 82 | /* Step 3: Hash TW into TIMEWAIT half of established hash table. */ | ||
| 83 | inet_twsk_add_node(tw, &(ehead + hashinfo->ehash_size)->chain); | ||
| 84 | atomic_inc(&tw->tw_refcnt); | ||
| 85 | |||
| 86 | write_unlock(&ehead->lock); | ||
| 87 | } | ||
| 88 | |||
| 89 | EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); | ||
| 90 | |||
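A detail worth flagging in step 3 above: the established table is allocated at twice ehash_size, so bucket i holds live connections while bucket i + ehash_size holds the TIME_WAIT sockets that hash to the same slot (inet_diag_dump() walks that second half for TCPF_TIME_WAIT). In pointer terms:

    struct inet_ehash_bucket *live = &hashinfo->ehash[sk->sk_hashent];
    struct inet_ehash_bucket *tw   = live + hashinfo->ehash_size;   /* same slot, TW half */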
| 91 | struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) | ||
| 92 | { | ||
| 93 | struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, | ||
| 94 | SLAB_ATOMIC); | ||
| 95 | if (tw != NULL) { | ||
| 96 | const struct inet_sock *inet = inet_sk(sk); | ||
| 97 | |||
| 98 | /* Give us an identity. */ | ||
| 99 | tw->tw_daddr = inet->daddr; | ||
| 100 | tw->tw_rcv_saddr = inet->rcv_saddr; | ||
| 101 | tw->tw_bound_dev_if = sk->sk_bound_dev_if; | ||
| 102 | tw->tw_num = inet->num; | ||
| 103 | tw->tw_state = TCP_TIME_WAIT; | ||
| 104 | tw->tw_substate = state; | ||
| 105 | tw->tw_sport = inet->sport; | ||
| 106 | tw->tw_dport = inet->dport; | ||
| 107 | tw->tw_family = sk->sk_family; | ||
| 108 | tw->tw_reuse = sk->sk_reuse; | ||
| 109 | tw->tw_hashent = sk->sk_hashent; | ||
| 110 | tw->tw_ipv6only = 0; | ||
| 111 | tw->tw_prot = sk->sk_prot_creator; | ||
| 112 | atomic_set(&tw->tw_refcnt, 1); | ||
| 113 | inet_twsk_dead_node_init(tw); | ||
| 114 | } | ||
| 115 | |||
| 116 | return tw; | ||
| 117 | } | ||
| 118 | |||
| 119 | EXPORT_SYMBOL_GPL(inet_twsk_alloc); | ||
| 120 | |||
| 121 | /* Returns non-zero if quota exceeded. */ | ||
| 122 | static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr, | ||
| 123 | const int slot) | ||
| 124 | { | ||
| 125 | struct inet_timewait_sock *tw; | ||
| 126 | struct hlist_node *node; | ||
| 127 | unsigned int killed; | ||
| 128 | int ret; | ||
| 129 | |||
| 130 | /* NOTE: compare this to the previous version, where the lock | ||
| 131 | * was released after detaching the chain. That was racy, | ||
| 132 | * because tw buckets are scheduled in a non-serialized context | ||
| 133 | * in 2.3 (with netfilter), and with softnet this is common, because | ||
| 134 | * soft irqs are not sequenced. | ||
| 135 | */ | ||
| 136 | killed = 0; | ||
| 137 | ret = 0; | ||
| 138 | rescan: | ||
| 139 | inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) { | ||
| 140 | __inet_twsk_del_dead_node(tw); | ||
| 141 | spin_unlock(&twdr->death_lock); | ||
| 142 | __inet_twsk_kill(tw, twdr->hashinfo); | ||
| 143 | inet_twsk_put(tw); | ||
| 144 | killed++; | ||
| 145 | spin_lock(&twdr->death_lock); | ||
| 146 | if (killed > INET_TWDR_TWKILL_QUOTA) { | ||
| 147 | ret = 1; | ||
| 148 | break; | ||
| 149 | } | ||
| 150 | |||
| 151 | /* While we dropped twdr->death_lock, another cpu may have | ||
| 152 | * killed off the next TW bucket in the list, therefore | ||
| 153 | * do a fresh re-read of the hlist head node with the | ||
| 154 | * lock reacquired. We still use the hlist traversal | ||
| 155 | * macro in order to get the prefetches. | ||
| 156 | */ | ||
| 157 | goto rescan; | ||
| 158 | } | ||
| 159 | |||
| 160 | twdr->tw_count -= killed; | ||
| 161 | NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed); | ||
| 162 | |||
| 163 | return ret; | ||
| 164 | } | ||
| 165 | |||
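inet_twdr_do_twkill_work() cannot hold death_lock across __inet_twsk_kill() (which takes the hash-bucket locks), so it drops the lock per victim and, via the trailing goto rescan, restarts the walk from the list head each time; the quota keeps a single run from hogging the timer. The general shape of the pattern, as an illustrative sketch rather than the kernel code:

    spin_lock(&lock);
    rescan:
    if ((victim = first_entry(&list)) != NULL) {
            unlink(victim);
            spin_unlock(&lock);     /* slow teardown without the lock */
            destroy(victim);
            spin_lock(&lock);
            if (++killed > quota)
                    goto done;      /* caller will reschedule the rest */
            goto rescan;            /* the head may have changed meanwhile */
    }
    done:
    spin_unlock(&lock);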
| 166 | void inet_twdr_hangman(unsigned long data) | ||
| 167 | { | ||
| 168 | struct inet_timewait_death_row *twdr; | ||
| 169 | unsigned int need_timer; | ||
| 170 | |||
| 171 | twdr = (struct inet_timewait_death_row *)data; | ||
| 172 | spin_lock(&twdr->death_lock); | ||
| 173 | |||
| 174 | if (twdr->tw_count == 0) | ||
| 175 | goto out; | ||
| 176 | |||
| 177 | need_timer = 0; | ||
| 178 | if (inet_twdr_do_twkill_work(twdr, twdr->slot)) { | ||
| 179 | twdr->thread_slots |= (1 << twdr->slot); | ||
| 180 | mb(); | ||
| 181 | schedule_work(&twdr->twkill_work); | ||
| 182 | need_timer = 1; | ||
| 183 | } else { | ||
| 184 | /* We purged the entire slot, anything left? */ | ||
| 185 | if (twdr->tw_count) | ||
| 186 | need_timer = 1; | ||
| 187 | } | ||
| 188 | twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); | ||
| 189 | if (need_timer) | ||
| 190 | mod_timer(&twdr->tw_timer, jiffies + twdr->period); | ||
| 191 | out: | ||
| 192 | spin_unlock(&twdr->death_lock); | ||
| 193 | } | ||
| 194 | |||
| 195 | EXPORT_SYMBOL_GPL(inet_twdr_hangman); | ||
| 196 | |||
| 197 | extern void twkill_slots_invalid(void); | ||
| 198 | |||
| 199 | void inet_twdr_twkill_work(void *data) | ||
| 200 | { | ||
| 201 | struct inet_timewait_death_row *twdr = data; | ||
| 202 | int i; | ||
| 203 | |||
| 204 | if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8)) | ||
| 205 | twkill_slots_invalid(); | ||
| 206 | |||
| 207 | while (twdr->thread_slots) { | ||
| 208 | spin_lock_bh(&twdr->death_lock); | ||
| 209 | for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) { | ||
| 210 | if (!(twdr->thread_slots & (1 << i))) | ||
| 211 | continue; | ||
| 212 | |||
| 213 | while (inet_twdr_do_twkill_work(twdr, i) != 0) { | ||
| 214 | if (need_resched()) { | ||
| 215 | spin_unlock_bh(&twdr->death_lock); | ||
| 216 | schedule(); | ||
| 217 | spin_lock_bh(&twdr->death_lock); | ||
| 218 | } | ||
| 219 | } | ||
| 220 | |||
| 221 | twdr->thread_slots &= ~(1 << i); | ||
| 222 | } | ||
| 223 | spin_unlock_bh(&twdr->death_lock); | ||
| 224 | } | ||
| 225 | } | ||
| 226 | |||
| 227 | EXPORT_SYMBOL_GPL(inet_twdr_twkill_work); | ||
| 228 | |||
| 229 | /* These are always called from BH context. See callers in | ||
| 230 | * tcp_input.c to verify this. | ||
| 231 | */ | ||
| 232 | |||
| 233 | /* This is for handling early-kills of TIME_WAIT sockets. */ | ||
| 234 | void inet_twsk_deschedule(struct inet_timewait_sock *tw, | ||
| 235 | struct inet_timewait_death_row *twdr) | ||
| 236 | { | ||
| 237 | spin_lock(&twdr->death_lock); | ||
| 238 | if (inet_twsk_del_dead_node(tw)) { | ||
| 239 | inet_twsk_put(tw); | ||
| 240 | if (--twdr->tw_count == 0) | ||
| 241 | del_timer(&twdr->tw_timer); | ||
| 242 | } | ||
| 243 | spin_unlock(&twdr->death_lock); | ||
| 244 | __inet_twsk_kill(tw, twdr->hashinfo); | ||
| 245 | } | ||
| 246 | |||
| 247 | EXPORT_SYMBOL(inet_twsk_deschedule); | ||
| 248 | |||
| 249 | void inet_twsk_schedule(struct inet_timewait_sock *tw, | ||
| 250 | struct inet_timewait_death_row *twdr, | ||
| 251 | const int timeo, const int timewait_len) | ||
| 252 | { | ||
| 253 | struct hlist_head *list; | ||
| 254 | int slot; | ||
| 255 | |||
| 256 | /* timeout := RTO * 3.5 | ||
| 257 | * | ||
| 258 | * 3.5 = 1+2+0.5 to wait for two retransmits. | ||
| 259 | * | ||
| 260 | * RATIONALE: if a FIN arrived and we entered TIME-WAIT state, | ||
| 261 | * our ACK acking that FIN can be lost. If N subsequent retransmitted | ||
| 262 | * FINs (or previous segments) are lost (the probability of such an event | ||
| 263 | * is p^(N+1), where p is the probability of losing a single packet, and | ||
| 264 | * the time to detect the loss is about RTO*(2^N - 1) with exponential | ||
| 265 | * backoff). The normal timewait length is calculated so that we | ||
| 266 | * wait at least for one retransmitted FIN (the maximal RTO is 120sec). | ||
| 267 | * [ BTW Linux, following BSD, violates this requirement by waiting | ||
| 268 | * only for 60sec; we should wait at least 240 secs. | ||
| 269 | * Well, 240 consumes too many resources 8) | ||
| 270 | * ] | ||
| 271 | * This interval is not reduced to catch old duplicates and | ||
| 272 | * responses to our wandering segments living for two MSLs. | ||
| 273 | * However, if we use PAWS to detect | ||
| 274 | * old duplicates, we can reduce the interval to bounds required | ||
| 275 | * by RTO, rather than MSL. So, if peer understands PAWS, we | ||
| 276 | * kill tw bucket after 3.5*RTO (it is important that this number | ||
| 277 | * is greater than TS tick!) and detect old duplicates with help | ||
| 278 | * of PAWS. | ||
| 279 | */ | ||
| 280 | slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK; | ||
| 281 | |||
| 282 | spin_lock(&twdr->death_lock); | ||
| 283 | |||
| 284 | /* Unlink it, if it was scheduled */ | ||
| 285 | if (inet_twsk_del_dead_node(tw)) | ||
| 286 | twdr->tw_count--; | ||
| 287 | else | ||
| 288 | atomic_inc(&tw->tw_refcnt); | ||
| 289 | |||
| 290 | if (slot >= INET_TWDR_RECYCLE_SLOTS) { | ||
| 291 | /* Schedule to slow timer */ | ||
| 292 | if (timeo >= timewait_len) { | ||
| 293 | slot = INET_TWDR_TWKILL_SLOTS - 1; | ||
| 294 | } else { | ||
| 295 | slot = (timeo + twdr->period - 1) / twdr->period; | ||
| 296 | if (slot >= INET_TWDR_TWKILL_SLOTS) | ||
| 297 | slot = INET_TWDR_TWKILL_SLOTS - 1; | ||
| 298 | } | ||
| 299 | tw->tw_ttd = jiffies + timeo; | ||
| 300 | slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1); | ||
| 301 | list = &twdr->cells[slot]; | ||
| 302 | } else { | ||
| 303 | tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK); | ||
| 304 | |||
| 305 | if (twdr->twcal_hand < 0) { | ||
| 306 | twdr->twcal_hand = 0; | ||
| 307 | twdr->twcal_jiffie = jiffies; | ||
| 308 | twdr->twcal_timer.expires = twdr->twcal_jiffie + | ||
| 309 | (slot << INET_TWDR_RECYCLE_TICK); | ||
| 310 | add_timer(&twdr->twcal_timer); | ||
| 311 | } else { | ||
| 312 | if (time_after(twdr->twcal_timer.expires, | ||
| 313 | jiffies + (slot << INET_TWDR_RECYCLE_TICK))) | ||
| 314 | mod_timer(&twdr->twcal_timer, | ||
| 315 | jiffies + (slot << INET_TWDR_RECYCLE_TICK)); | ||
| 316 | slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1); | ||
| 317 | } | ||
| 318 | list = &twdr->twcal_row[slot]; | ||
| 319 | } | ||
| 320 | |||
| 321 | hlist_add_head(&tw->tw_death_node, list); | ||
| 322 | |||
| 323 | if (twdr->tw_count++ == 0) | ||
| 324 | mod_timer(&twdr->tw_timer, jiffies + twdr->period); | ||
| 325 | spin_unlock(&twdr->death_lock); | ||
| 326 | } | ||
| 327 | |||
| 328 | EXPORT_SYMBOL_GPL(inet_twsk_schedule); | ||
| 329 | |||
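inet_twsk_schedule() rounds the timeout up to a whole recycle tick to choose a wheel slot, falling back to the coarse cells[] wheel when the slot does not fit. A worked example under assumed constants (INET_TWDR_RECYCLE_TICK = 8, i.e. 256-jiffy ticks, and 32 recycle slots; the real values are config- and HZ-dependent):

    /* timeo = 700 jiffies (say, 3.5 * RTO with rtt recycling on):
     *   slot = (700 + 256 - 1) >> 8 = 3       -> twcal_row[] wheel,
     *   tw_ttd = jiffies + (3 << 8).
     *
     * timeo = 60000 jiffies (a full 60 s TIME_WAIT at HZ = 1000):
     *   slot = (60000 + 255) >> 8 = 235 >= 32 -> coarse cells[] wheel,
     *   rescheduled in units of twdr->period instead. */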
| 330 | void inet_twdr_twcal_tick(unsigned long data) | ||
| 331 | { | ||
| 332 | struct inet_timewait_death_row *twdr; | ||
| 333 | int n, slot; | ||
| 334 | unsigned long j; | ||
| 335 | unsigned long now = jiffies; | ||
| 336 | int killed = 0; | ||
| 337 | int adv = 0; | ||
| 338 | |||
| 339 | twdr = (struct inet_timewait_death_row *)data; | ||
| 340 | |||
| 341 | spin_lock(&twdr->death_lock); | ||
| 342 | if (twdr->twcal_hand < 0) | ||
| 343 | goto out; | ||
| 344 | |||
| 345 | slot = twdr->twcal_hand; | ||
| 346 | j = twdr->twcal_jiffie; | ||
| 347 | |||
| 348 | for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) { | ||
| 349 | if (time_before_eq(j, now)) { | ||
| 350 | struct hlist_node *node, *safe; | ||
| 351 | struct inet_timewait_sock *tw; | ||
| 352 | |||
| 353 | inet_twsk_for_each_inmate_safe(tw, node, safe, | ||
| 354 | &twdr->twcal_row[slot]) { | ||
| 355 | __inet_twsk_del_dead_node(tw); | ||
| 356 | __inet_twsk_kill(tw, twdr->hashinfo); | ||
| 357 | inet_twsk_put(tw); | ||
| 358 | killed++; | ||
| 359 | } | ||
| 360 | } else { | ||
| 361 | if (!adv) { | ||
| 362 | adv = 1; | ||
| 363 | twdr->twcal_jiffie = j; | ||
| 364 | twdr->twcal_hand = slot; | ||
| 365 | } | ||
| 366 | |||
| 367 | if (!hlist_empty(&twdr->twcal_row[slot])) { | ||
| 368 | mod_timer(&twdr->twcal_timer, j); | ||
| 369 | goto out; | ||
| 370 | } | ||
| 371 | } | ||
| 372 | j += 1 << INET_TWDR_RECYCLE_TICK; | ||
| 373 | slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1); | ||
| 374 | } | ||
| 375 | twdr->twcal_hand = -1; | ||
| 376 | |||
| 377 | out: | ||
| 378 | if ((twdr->tw_count -= killed) == 0) | ||
| 379 | del_timer(&twdr->tw_timer); | ||
| 380 | NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed); | ||
| 381 | spin_unlock(&twdr->death_lock); | ||
| 382 | } | ||
| 383 | |||
| 384 | EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick); | ||
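Taken together, the death row drives two timer wheels; roughly (timer names from struct inet_timewait_death_row):

    /* tw_timer    -> inet_twdr_hangman():    fires every twdr->period,
     *                reaps one cells[] slot, and defers overflow past
     *                the kill quota to inet_twdr_twkill_work() via
     *                schedule_work().
     * twcal_timer -> inet_twdr_twcal_tick(): fine-grained recycle
     *                wheel (twcal_row[]) for PAWS-shortened timeouts. */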
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index ab18a853d7ce..f84ba9c96551 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/kernel.h> | 20 | #include <linux/kernel.h> |
| 21 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
| 22 | #include <linux/net.h> | 22 | #include <linux/net.h> |
| 23 | #include <net/ip.h> | ||
| 23 | #include <net/inetpeer.h> | 24 | #include <net/inetpeer.h> |
| 24 | 25 | ||
| 25 | /* | 26 | /* |
| @@ -72,7 +73,7 @@ | |||
| 72 | /* Exported for inet_getid inline function. */ | 73 | /* Exported for inet_getid inline function. */ |
| 73 | DEFINE_SPINLOCK(inet_peer_idlock); | 74 | DEFINE_SPINLOCK(inet_peer_idlock); |
| 74 | 75 | ||
| 75 | static kmem_cache_t *peer_cachep; | 76 | static kmem_cache_t *peer_cachep __read_mostly; |
| 76 | 77 | ||
| 77 | #define node_height(x) x->avl_height | 78 | #define node_height(x) x->avl_height |
| 78 | static struct inet_peer peer_fake_node = { | 79 | static struct inet_peer peer_fake_node = { |
| @@ -459,5 +460,3 @@ static void peer_check_expire(unsigned long dummy) | |||
| 459 | peer_total / inet_peer_threshold * HZ; | 460 | peer_total / inet_peer_threshold * HZ; |
| 460 | add_timer(&peer_periodic_timer); | 461 | add_timer(&peer_periodic_timer); |
| 461 | } | 462 | } |
| 462 | |||
| 463 | EXPORT_SYMBOL(inet_peer_idlock); | ||
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 77094aac6c28..0923add122b4 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c | |||
| @@ -76,16 +76,12 @@ int ip_forward(struct sk_buff *skb) | |||
| 76 | * that reaches zero, we must reply an ICMP control message telling | 76 | * that reaches zero, we must reply an ICMP control message telling |
| 77 | * that the packet's lifetime expired. | 77 | * that the packet's lifetime expired. |
| 78 | */ | 78 | */ |
| 79 | 79 | if (skb->nh.iph->ttl <= 1) | |
| 80 | iph = skb->nh.iph; | ||
| 81 | |||
| 82 | if (iph->ttl <= 1) | ||
| 83 | goto too_many_hops; | 80 | goto too_many_hops; |
| 84 | 81 | ||
| 85 | if (!xfrm4_route_forward(skb)) | 82 | if (!xfrm4_route_forward(skb)) |
| 86 | goto drop; | 83 | goto drop; |
| 87 | 84 | ||
| 88 | iph = skb->nh.iph; | ||
| 89 | rt = (struct rtable*)skb->dst; | 85 | rt = (struct rtable*)skb->dst; |
| 90 | 86 | ||
| 91 | if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) | 87 | if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) |
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index eb377ae15305..9e6e683cc34d 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c | |||
| @@ -377,7 +377,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user) | |||
| 377 | return ip_frag_intern(hash, qp); | 377 | return ip_frag_intern(hash, qp); |
| 378 | 378 | ||
| 379 | out_nomem: | 379 | out_nomem: |
| 380 | LIMIT_NETDEBUG(printk(KERN_ERR "ip_frag_create: no memory left !\n")); | 380 | LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n"); |
| 381 | return NULL; | 381 | return NULL; |
| 382 | } | 382 | } |
| 383 | 383 | ||
| @@ -533,7 +533,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) | |||
| 533 | if (skb->dev) | 533 | if (skb->dev) |
| 534 | qp->iif = skb->dev->ifindex; | 534 | qp->iif = skb->dev->ifindex; |
| 535 | skb->dev = NULL; | 535 | skb->dev = NULL; |
| 536 | qp->stamp = skb->stamp; | 536 | skb_get_timestamp(skb, &qp->stamp); |
| 537 | qp->meat += skb->len; | 537 | qp->meat += skb->len; |
| 538 | atomic_add(skb->truesize, &ip_frag_mem); | 538 | atomic_add(skb->truesize, &ip_frag_mem); |
| 539 | if (offset == 0) | 539 | if (offset == 0) |
| @@ -615,7 +615,7 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) | |||
| 615 | 615 | ||
| 616 | head->next = NULL; | 616 | head->next = NULL; |
| 617 | head->dev = dev; | 617 | head->dev = dev; |
| 618 | head->stamp = qp->stamp; | 618 | skb_set_timestamp(head, &qp->stamp); |
| 619 | 619 | ||
| 620 | iph = head->nh.iph; | 620 | iph = head->nh.iph; |
| 621 | iph->frag_off = 0; | 621 | iph->frag_off = 0; |
| @@ -625,8 +625,8 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) | |||
| 625 | return head; | 625 | return head; |
| 626 | 626 | ||
| 627 | out_nomem: | 627 | out_nomem: |
| 628 | LIMIT_NETDEBUG(printk(KERN_ERR "IP: queue_glue: no memory for gluing " | 628 | LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing " |
| 629 | "queue %p\n", qp)); | 629 | "queue %p\n", qp); |
| 630 | goto out_fail; | 630 | goto out_fail; |
| 631 | out_oversize: | 631 | out_oversize: |
| 632 | if (net_ratelimit()) | 632 | if (net_ratelimit()) |
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index c703528e0bcd..473d0f2b2e0d 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c | |||
| @@ -150,7 +150,7 @@ | |||
| 150 | * SNMP management statistics | 150 | * SNMP management statistics |
| 151 | */ | 151 | */ |
| 152 | 152 | ||
| 153 | DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics); | 153 | DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics) __read_mostly; |
| 154 | 154 | ||
| 155 | /* | 155 | /* |
| 156 | * Process Router Attention IP option | 156 | * Process Router Attention IP option |
| @@ -225,8 +225,8 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb) | |||
| 225 | /* If there may be a raw socket we must check - if not we | 225 | /* If there may be a raw socket we must check - if not we
| 226 | * couldn't care less | 226 | * couldn't care less
| 227 | */ | 227 | */ |
| 228 | if (raw_sk) | 228 | if (raw_sk && !raw_v4_input(skb, skb->nh.iph, hash)) |
| 229 | raw_v4_input(skb, skb->nh.iph, hash); | 229 | raw_sk = NULL; |
| 230 | 230 | ||
| 231 | if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) { | 231 | if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) { |
| 232 | int ret; | 232 | int ret; |
| @@ -279,18 +279,70 @@ int ip_local_deliver(struct sk_buff *skb) | |||
| 279 | ip_local_deliver_finish); | 279 | ip_local_deliver_finish); |
| 280 | } | 280 | } |
| 281 | 281 | ||
| 282 | static inline int ip_rcv_finish(struct sk_buff *skb) | 282 | static inline int ip_rcv_options(struct sk_buff *skb) |
| 283 | { | 283 | { |
| 284 | struct ip_options *opt; | ||
| 285 | struct iphdr *iph; | ||
| 284 | struct net_device *dev = skb->dev; | 286 | struct net_device *dev = skb->dev; |
| 287 | |||
| 288 | /* It looks like overkill, because not all | ||
| 289 | IP options require packet mangling. | ||
| 290 | But it is the easiest for now, especially taking | ||
| 291 | into account that the combination of IP options | ||
| 292 | and a running sniffer is an extremely rare condition. | ||
| 293 | --ANK (980813) | ||
| 294 | */ | ||
| 295 | if (skb_cow(skb, skb_headroom(skb))) { | ||
| 296 | IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); | ||
| 297 | goto drop; | ||
| 298 | } | ||
| 299 | |||
| 300 | iph = skb->nh.iph; | ||
| 301 | |||
| 302 | if (ip_options_compile(NULL, skb)) { | ||
| 303 | IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); | ||
| 304 | goto drop; | ||
| 305 | } | ||
| 306 | |||
| 307 | opt = &(IPCB(skb)->opt); | ||
| 308 | if (unlikely(opt->srr)) { | ||
| 309 | struct in_device *in_dev = in_dev_get(dev); | ||
| 310 | if (in_dev) { | ||
| 311 | if (!IN_DEV_SOURCE_ROUTE(in_dev)) { | ||
| 312 | if (IN_DEV_LOG_MARTIANS(in_dev) && | ||
| 313 | net_ratelimit()) | ||
| 314 | printk(KERN_INFO "source route option " | ||
| 315 | "%u.%u.%u.%u -> %u.%u.%u.%u\n", | ||
| 316 | NIPQUAD(iph->saddr), | ||
| 317 | NIPQUAD(iph->daddr)); | ||
| 318 | in_dev_put(in_dev); | ||
| 319 | goto drop; | ||
| 320 | } | ||
| 321 | |||
| 322 | in_dev_put(in_dev); | ||
| 323 | } | ||
| 324 | |||
| 325 | if (ip_options_rcv_srr(skb)) | ||
| 326 | goto drop; | ||
| 327 | } | ||
| 328 | |||
| 329 | return 0; | ||
| 330 | drop: | ||
| 331 | return -1; | ||
| 332 | } | ||
| 333 | |||
| 334 | static inline int ip_rcv_finish(struct sk_buff *skb) | ||
| 335 | { | ||
| 285 | struct iphdr *iph = skb->nh.iph; | 336 | struct iphdr *iph = skb->nh.iph; |
| 286 | int err; | ||
| 287 | 337 | ||
| 288 | /* | 338 | /* |
| 289 | * Initialise the virtual path cache for the packet. It describes | 339 | * Initialise the virtual path cache for the packet. It describes |
| 290 | * how the packet travels inside Linux networking. | 340 | * how the packet travels inside Linux networking. |
| 291 | */ | 341 | */ |
| 292 | if (skb->dst == NULL) { | 342 | if (likely(skb->dst == NULL)) { |
| 293 | if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { | 343 | int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, |
| 344 | skb->dev); | ||
| 345 | if (unlikely(err)) { | ||
| 294 | if (err == -EHOSTUNREACH) | 346 | if (err == -EHOSTUNREACH) |
| 295 | IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); | 347 | IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); |
| 296 | goto drop; | 348 | goto drop; |
| @@ -298,7 +350,7 @@ static inline int ip_rcv_finish(struct sk_buff *skb) | |||
| 298 | } | 350 | } |
| 299 | 351 | ||
| 300 | #ifdef CONFIG_NET_CLS_ROUTE | 352 | #ifdef CONFIG_NET_CLS_ROUTE |
| 301 | if (skb->dst->tclassid) { | 353 | if (unlikely(skb->dst->tclassid)) { |
| 302 | struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id(); | 354 | struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id(); |
| 303 | u32 idx = skb->dst->tclassid; | 355 | u32 idx = skb->dst->tclassid; |
| 304 | st[idx&0xFF].o_packets++; | 356 | st[idx&0xFF].o_packets++; |
| @@ -308,48 +360,11 @@ static inline int ip_rcv_finish(struct sk_buff *skb) | |||
| 308 | } | 360 | } |
| 309 | #endif | 361 | #endif |
| 310 | 362 | ||
| 311 | if (iph->ihl > 5) { | 363 | if (iph->ihl > 5 && ip_rcv_options(skb)) |
| 312 | struct ip_options *opt; | 364 | goto drop; |
| 313 | |||
| 314 | /* It looks as overkill, because not all | ||
| 315 | IP options require packet mangling. | ||
| 316 | But it is the easiest for now, especially taking | ||
| 317 | into account that combination of IP options | ||
| 318 | and running sniffer is extremely rare condition. | ||
| 319 | --ANK (980813) | ||
| 320 | */ | ||
| 321 | |||
| 322 | if (skb_cow(skb, skb_headroom(skb))) { | ||
| 323 | IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); | ||
| 324 | goto drop; | ||
| 325 | } | ||
| 326 | iph = skb->nh.iph; | ||
| 327 | |||
| 328 | if (ip_options_compile(NULL, skb)) | ||
| 329 | goto inhdr_error; | ||
| 330 | |||
| 331 | opt = &(IPCB(skb)->opt); | ||
| 332 | if (opt->srr) { | ||
| 333 | struct in_device *in_dev = in_dev_get(dev); | ||
| 334 | if (in_dev) { | ||
| 335 | if (!IN_DEV_SOURCE_ROUTE(in_dev)) { | ||
| 336 | if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) | ||
| 337 | printk(KERN_INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u\n", | ||
| 338 | NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); | ||
| 339 | in_dev_put(in_dev); | ||
| 340 | goto drop; | ||
| 341 | } | ||
| 342 | in_dev_put(in_dev); | ||
| 343 | } | ||
| 344 | if (ip_options_rcv_srr(skb)) | ||
| 345 | goto drop; | ||
| 346 | } | ||
| 347 | } | ||
| 348 | 365 | ||
| 349 | return dst_input(skb); | 366 | return dst_input(skb); |
| 350 | 367 | ||
| 351 | inhdr_error: | ||
| 352 | IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); | ||
| 353 | drop: | 368 | drop: |
| 354 | kfree_skb(skb); | 369 | kfree_skb(skb); |
| 355 | return NET_RX_DROP; | 370 | return NET_RX_DROP; |
| @@ -358,9 +373,10 @@ drop: | |||
| 358 | /* | 373 | /* |
| 359 | * Main IP Receive routine. | 374 | * Main IP Receive routine. |
| 360 | */ | 375 | */ |
| 361 | int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) | 376 | int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) |
| 362 | { | 377 | { |
| 363 | struct iphdr *iph; | 378 | struct iphdr *iph; |
| 379 | u32 len; | ||
| 364 | 380 | ||
| 365 | /* When the interface is in promisc. mode, drop all the crap | 381 | /* When the interface is in promisc. mode, drop all the crap |
| 366 | * that it receives, do not try to analyse it. | 382 | * that it receives, do not try to analyse it. |
| @@ -392,29 +408,27 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) | |||
| 392 | */ | 408 | */ |
| 393 | 409 | ||
| 394 | if (iph->ihl < 5 || iph->version != 4) | 410 | if (iph->ihl < 5 || iph->version != 4) |
| 395 | goto inhdr_error; | 411 | goto inhdr_error; |
| 396 | 412 | ||
| 397 | if (!pskb_may_pull(skb, iph->ihl*4)) | 413 | if (!pskb_may_pull(skb, iph->ihl*4)) |
| 398 | goto inhdr_error; | 414 | goto inhdr_error; |
| 399 | 415 | ||
| 400 | iph = skb->nh.iph; | 416 | iph = skb->nh.iph; |
| 401 | 417 | ||
| 402 | if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) | 418 | if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) |
| 403 | goto inhdr_error; | 419 | goto inhdr_error; |
| 404 | 420 | ||
| 405 | { | 421 | len = ntohs(iph->tot_len); |
| 406 | __u32 len = ntohs(iph->tot_len); | 422 | if (skb->len < len || len < (iph->ihl*4)) |
| 407 | if (skb->len < len || len < (iph->ihl<<2)) | 423 | goto inhdr_error; |
| 408 | goto inhdr_error; | ||
| 409 | 424 | ||
| 410 | /* Our transport medium may have padded the buffer out. Now we know it | 425 | /* Our transport medium may have padded the buffer out. Now we know it |
| 411 | * is IP we can trim to the true length of the frame. | 426 | * is IP we can trim to the true length of the frame. |
| 412 | * Note this now means skb->len holds ntohs(iph->tot_len). | 427 | * Note this now means skb->len holds ntohs(iph->tot_len). |
| 413 | */ | 428 | */ |
| 414 | if (pskb_trim_rcsum(skb, len)) { | 429 | if (pskb_trim_rcsum(skb, len)) { |
| 415 | IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); | 430 | IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); |
| 416 | goto drop; | 431 | goto drop; |
| 417 | } | ||
| 418 | } | 432 | } |
| 419 | 433 | ||
| 420 | return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL, | 434 | return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL, |
| @@ -428,5 +442,4 @@ out: | |||
| 428 | return NET_RX_DROP; | 442 | return NET_RX_DROP; |
| 429 | } | 443 | } |
| 430 | 444 | ||
| 431 | EXPORT_SYMBOL(ip_rcv); | ||
| 432 | EXPORT_SYMBOL(ip_statistics); | 445 | EXPORT_SYMBOL(ip_statistics); |
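
The reworked ip_rcv() above concentrates all header validation up front: version and ihl sanity, pulling the full header, verifying the checksum, and cross-checking tot_len against the buffer before trimming link-layer padding. A minimal userspace sketch of the same checks, assuming a hypothetical capture buffer; struct and function names are illustrative, not kernel code:

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <arpa/inet.h>		/* ntohs() */

struct ipv4hdr {
	uint8_t  ver_ihl;	/* version in the high nibble, ihl in the low */
	uint8_t  tos;
	uint16_t tot_len;	/* total datagram length, network byte order */
	/* remaining header fields omitted; not needed for these checks */
};

/* One's-complement sum over the header; a valid header (checksum field
 * included) folds to 0xffff.  Endianness does not affect this test. */
static int header_checksum_ok(const unsigned char *pkt, unsigned int ihl)
{
	uint32_t sum = 0;
	unsigned int i;
	uint16_t word;

	for (i = 0; i < ihl * 4; i += 2) {
		memcpy(&word, pkt + i, 2);	/* avoid unaligned access */
		sum += word;
	}
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return sum == 0xffff;
}

/* Mirror of the checks ip_rcv() makes before handing the packet on. */
static int ipv4_header_ok(const unsigned char *pkt, size_t caplen)
{
	struct ipv4hdr iph;
	unsigned int version, ihl, tot_len;

	if (caplen < sizeof(iph))
		return 0;
	memcpy(&iph, pkt, sizeof(iph));
	version = iph.ver_ihl >> 4;
	ihl = iph.ver_ihl & 0x0f;
	if (ihl < 5 || version != 4)		/* malformed header */
		return 0;
	if (caplen < ihl * 4)			/* options truncated */
		return 0;
	if (!header_checksum_ok(pkt, ihl))
		return 0;
	tot_len = ntohs(iph.tot_len);
	if (caplen < tot_len || tot_len < ihl * 4)
		return 0;			/* short or inconsistent */
	return 1;
}
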
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 6d89f3f3e701..bce4e875193b 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c | |||
| @@ -489,23 +489,18 @@ void ip_options_undo(struct ip_options * opt) | |||
| 489 | } | 489 | } |
| 490 | } | 490 | } |
| 491 | 491 | ||
| 492 | int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user) | 492 | static struct ip_options *ip_options_get_alloc(const int optlen) |
| 493 | { | 493 | { |
| 494 | struct ip_options *opt; | 494 | struct ip_options *opt = kmalloc(sizeof(*opt) + ((optlen + 3) & ~3), |
| 495 | GFP_KERNEL); | ||
| 496 | if (opt) | ||
| 497 | memset(opt, 0, sizeof(*opt)); | ||
| 498 | return opt; | ||
| 499 | } | ||
| 495 | 500 | ||
| 496 | opt = kmalloc(sizeof(struct ip_options)+((optlen+3)&~3), GFP_KERNEL); | 501 | static int ip_options_get_finish(struct ip_options **optp, |
| 497 | if (!opt) | 502 | struct ip_options *opt, int optlen) |
| 498 | return -ENOMEM; | 503 | { |
| 499 | memset(opt, 0, sizeof(struct ip_options)); | ||
| 500 | if (optlen) { | ||
| 501 | if (user) { | ||
| 502 | if (copy_from_user(opt->__data, data, optlen)) { | ||
| 503 | kfree(opt); | ||
| 504 | return -EFAULT; | ||
| 505 | } | ||
| 506 | } else | ||
| 507 | memcpy(opt->__data, data, optlen); | ||
| 508 | } | ||
| 509 | while (optlen & 3) | 504 | while (optlen & 3) |
| 510 | opt->__data[optlen++] = IPOPT_END; | 505 | opt->__data[optlen++] = IPOPT_END; |
| 511 | opt->optlen = optlen; | 506 | opt->optlen = optlen; |
| @@ -521,6 +516,30 @@ int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, in | |||
| 521 | return 0; | 516 | return 0; |
| 522 | } | 517 | } |
| 523 | 518 | ||
| 519 | int ip_options_get_from_user(struct ip_options **optp, unsigned char __user *data, int optlen) | ||
| 520 | { | ||
| 521 | struct ip_options *opt = ip_options_get_alloc(optlen); | ||
| 522 | |||
| 523 | if (!opt) | ||
| 524 | return -ENOMEM; | ||
| 525 | if (optlen && copy_from_user(opt->__data, data, optlen)) { | ||
| 526 | kfree(opt); | ||
| 527 | return -EFAULT; | ||
| 528 | } | ||
| 529 | return ip_options_get_finish(optp, opt, optlen); | ||
| 530 | } | ||
| 531 | |||
| 532 | int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen) | ||
| 533 | { | ||
| 534 | struct ip_options *opt = ip_options_get_alloc(optlen); | ||
| 535 | |||
| 536 | if (!opt) | ||
| 537 | return -ENOMEM; | ||
| 538 | if (optlen) | ||
| 539 | memcpy(opt->__data, data, optlen); | ||
| 540 | return ip_options_get_finish(optp, opt, optlen); | ||
| 541 | } | ||
| 542 | |||
| 524 | void ip_forward_options(struct sk_buff *skb) | 543 | void ip_forward_options(struct sk_buff *skb) |
| 525 | { | 544 | { |
| 526 | struct ip_options * opt = &(IPCB(skb)->opt); | 545 | struct ip_options * opt = &(IPCB(skb)->opt); |
| @@ -620,6 +639,3 @@ int ip_options_rcv_srr(struct sk_buff *skb) | |||
| 620 | } | 639 | } |
| 621 | return 0; | 640 | return 0; |
| 622 | } | 641 | } |
| 623 | |||
| 624 | EXPORT_SYMBOL(ip_options_compile); | ||
| 625 | EXPORT_SYMBOL(ip_options_undo); | ||
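
The ip_options_get() split above factors the common allocate-and-pad work out of the user-copy and kernel-copy paths. A userspace sketch of the same shape, with hypothetical names; the 40-byte cap and the IPOPT_END (value 0) padding match what the IP options code enforces:

#include <stdlib.h>
#include <string.h>

#define IPOPT_END 0	/* real padding byte value for IP options */

struct opts {
	int len;
	unsigned char data[40];	/* IP options never exceed 40 bytes */
};

static struct opts *opts_alloc(void)
{
	return calloc(1, sizeof(struct opts));	/* zeroed, like kmalloc+memset */
}

/* Shared tail: pad the option bytes out to a 4-byte boundary. */
static void opts_finish(struct opts *o, int len)
{
	while (len & 3)
		o->data[len++] = IPOPT_END;
	o->len = len;
}

/* Kernel-buffer flavour; a user-buffer flavour would differ only in the
 * copy step (copy_from_user() instead of memcpy()). */
static struct opts *opts_from_buf(const unsigned char *src, int len)
{
	struct opts *o;

	if (len < 0 || len > 40)	/* caller-side cap, as in ip_setsockopt() */
		return NULL;
	o = opts_alloc();
	if (!o)
		return NULL;
	if (len)
		memcpy(o->data, src, len);
	opts_finish(o, len);
	return o;
}

The two copy flavours differ only in the middle step, so the allocator and the padding tail can be shared between them.
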
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 80d13103b2b0..3f1a263e1249 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c | |||
| @@ -69,13 +69,10 @@ | |||
| 69 | #include <net/ip.h> | 69 | #include <net/ip.h> |
| 70 | #include <net/protocol.h> | 70 | #include <net/protocol.h> |
| 71 | #include <net/route.h> | 71 | #include <net/route.h> |
| 72 | #include <net/tcp.h> | ||
| 73 | #include <net/udp.h> | ||
| 74 | #include <linux/skbuff.h> | 72 | #include <linux/skbuff.h> |
| 75 | #include <net/sock.h> | 73 | #include <net/sock.h> |
| 76 | #include <net/arp.h> | 74 | #include <net/arp.h> |
| 77 | #include <net/icmp.h> | 75 | #include <net/icmp.h> |
| 78 | #include <net/raw.h> | ||
| 79 | #include <net/checksum.h> | 76 | #include <net/checksum.h> |
| 80 | #include <net/inetpeer.h> | 77 | #include <net/inetpeer.h> |
| 81 | #include <net/checksum.h> | 78 | #include <net/checksum.h> |
| @@ -84,12 +81,8 @@ | |||
| 84 | #include <linux/netfilter_bridge.h> | 81 | #include <linux/netfilter_bridge.h> |
| 85 | #include <linux/mroute.h> | 82 | #include <linux/mroute.h> |
| 86 | #include <linux/netlink.h> | 83 | #include <linux/netlink.h> |
| 84 | #include <linux/tcp.h> | ||
| 87 | 85 | ||
| 88 | /* | ||
| 89 | * Shall we try to damage output packets if routing dev changes? | ||
| 90 | */ | ||
| 91 | |||
| 92 | int sysctl_ip_dynaddr; | ||
| 93 | int sysctl_ip_default_ttl = IPDEFTTL; | 86 | int sysctl_ip_default_ttl = IPDEFTTL; |
| 94 | 87 | ||
| 95 | /* Generate a checksum for an outgoing IP datagram. */ | 88 | /* Generate a checksum for an outgoing IP datagram. */ |
| @@ -165,6 +158,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, | |||
| 165 | dst_output); | 158 | dst_output); |
| 166 | } | 159 | } |
| 167 | 160 | ||
| 161 | EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); | ||
| 162 | |||
| 168 | static inline int ip_finish_output2(struct sk_buff *skb) | 163 | static inline int ip_finish_output2(struct sk_buff *skb) |
| 169 | { | 164 | { |
| 170 | struct dst_entry *dst = skb->dst; | 165 | struct dst_entry *dst = skb->dst; |
| @@ -205,7 +200,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) | |||
| 205 | return -EINVAL; | 200 | return -EINVAL; |
| 206 | } | 201 | } |
| 207 | 202 | ||
| 208 | int ip_finish_output(struct sk_buff *skb) | 203 | static inline int ip_finish_output(struct sk_buff *skb) |
| 209 | { | 204 | { |
| 210 | struct net_device *dev = skb->dst->dev; | 205 | struct net_device *dev = skb->dst->dev; |
| 211 | 206 | ||
| @@ -329,8 +324,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) | |||
| 329 | if (ip_route_output_flow(&rt, &fl, sk, 0)) | 324 | if (ip_route_output_flow(&rt, &fl, sk, 0)) |
| 330 | goto no_route; | 325 | goto no_route; |
| 331 | } | 326 | } |
| 332 | __sk_dst_set(sk, &rt->u.dst); | 327 | sk_setup_caps(sk, &rt->u.dst); |
| 333 | tcp_v4_setup_caps(sk, &rt->u.dst); | ||
| 334 | } | 328 | } |
| 335 | skb->dst = dst_clone(&rt->u.dst); | 329 | skb->dst = dst_clone(&rt->u.dst); |
| 336 | 330 | ||
| @@ -392,7 +386,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) | |||
| 392 | #endif | 386 | #endif |
| 393 | #ifdef CONFIG_NETFILTER | 387 | #ifdef CONFIG_NETFILTER |
| 394 | to->nfmark = from->nfmark; | 388 | to->nfmark = from->nfmark; |
| 395 | to->nfcache = from->nfcache; | ||
| 396 | /* Connection association is same as pre-frag packet */ | 389 | /* Connection association is same as pre-frag packet */ |
| 397 | nf_conntrack_put(to->nfct); | 390 | nf_conntrack_put(to->nfct); |
| 398 | to->nfct = from->nfct; | 391 | to->nfct = from->nfct; |
| @@ -580,7 +573,7 @@ slow_path: | |||
| 580 | */ | 573 | */ |
| 581 | 574 | ||
| 582 | if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { | 575 | if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { |
| 583 | NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n")); | 576 | NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n"); |
| 584 | err = -ENOMEM; | 577 | err = -ENOMEM; |
| 585 | goto fail; | 578 | goto fail; |
| 586 | } | 579 | } |
| @@ -1329,12 +1322,7 @@ void __init ip_init(void) | |||
| 1329 | #endif | 1322 | #endif |
| 1330 | } | 1323 | } |
| 1331 | 1324 | ||
| 1332 | EXPORT_SYMBOL(ip_finish_output); | ||
| 1333 | EXPORT_SYMBOL(ip_fragment); | 1325 | EXPORT_SYMBOL(ip_fragment); |
| 1334 | EXPORT_SYMBOL(ip_generic_getfrag); | 1326 | EXPORT_SYMBOL(ip_generic_getfrag); |
| 1335 | EXPORT_SYMBOL(ip_queue_xmit); | 1327 | EXPORT_SYMBOL(ip_queue_xmit); |
| 1336 | EXPORT_SYMBOL(ip_send_check); | 1328 | EXPORT_SYMBOL(ip_send_check); |
| 1337 | |||
| 1338 | #ifdef CONFIG_SYSCTL | ||
| 1339 | EXPORT_SYMBOL(sysctl_ip_default_ttl); | ||
| 1340 | #endif | ||
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ff4bd067b397..2f0b47da5b37 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c | |||
| @@ -153,7 +153,7 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc) | |||
| 153 | switch (cmsg->cmsg_type) { | 153 | switch (cmsg->cmsg_type) { |
| 154 | case IP_RETOPTS: | 154 | case IP_RETOPTS: |
| 155 | err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); | 155 | err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); |
| 156 | err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0); | 156 | err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40); |
| 157 | if (err) | 157 | if (err) |
| 158 | return err; | 158 | return err; |
| 159 | break; | 159 | break; |
| @@ -425,7 +425,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
| 425 | struct ip_options * opt = NULL; | 425 | struct ip_options * opt = NULL; |
| 426 | if (optlen > 40 || optlen < 0) | 426 | if (optlen > 40 || optlen < 0) |
| 427 | goto e_inval; | 427 | goto e_inval; |
| 428 | err = ip_options_get(&opt, optval, optlen, 1); | 428 | err = ip_options_get_from_user(&opt, optval, optlen); |
| 429 | if (err) | 429 | if (err) |
| 430 | break; | 430 | break; |
| 431 | if (sk->sk_type == SOCK_STREAM) { | 431 | if (sk->sk_type == SOCK_STREAM) { |
| @@ -614,7 +614,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
| 614 | } | 614 | } |
| 615 | case IP_MSFILTER: | 615 | case IP_MSFILTER: |
| 616 | { | 616 | { |
| 617 | extern int sysctl_optmem_max; | ||
| 618 | extern int sysctl_igmp_max_msf; | 617 | extern int sysctl_igmp_max_msf; |
| 619 | struct ip_msfilter *msf; | 618 | struct ip_msfilter *msf; |
| 620 | 619 | ||
| @@ -769,7 +768,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
| 769 | } | 768 | } |
| 770 | case MCAST_MSFILTER: | 769 | case MCAST_MSFILTER: |
| 771 | { | 770 | { |
| 772 | extern int sysctl_optmem_max; | ||
| 773 | extern int sysctl_igmp_max_msf; | 771 | extern int sysctl_igmp_max_msf; |
| 774 | struct sockaddr_in *psin; | 772 | struct sockaddr_in *psin; |
| 775 | struct ip_msfilter *msf = NULL; | 773 | struct ip_msfilter *msf = NULL; |
| @@ -1090,7 +1088,5 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
| 1090 | 1088 | ||
| 1091 | EXPORT_SYMBOL(ip_cmsg_recv); | 1089 | EXPORT_SYMBOL(ip_cmsg_recv); |
| 1092 | 1090 | ||
| 1093 | #ifdef CONFIG_IP_SCTP_MODULE | ||
| 1094 | EXPORT_SYMBOL(ip_getsockopt); | 1091 | EXPORT_SYMBOL(ip_getsockopt); |
| 1095 | EXPORT_SYMBOL(ip_setsockopt); | 1092 | EXPORT_SYMBOL(ip_setsockopt); |
| 1096 | #endif | ||
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 7ded6e60f43a..dcb7ee6c4858 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c | |||
| @@ -214,8 +214,8 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) | |||
| 214 | spi, IPPROTO_COMP, AF_INET); | 214 | spi, IPPROTO_COMP, AF_INET); |
| 215 | if (!x) | 215 | if (!x) |
| 216 | return; | 216 | return; |
| 217 | NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n", | 217 | NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n", |
| 218 | spi, NIPQUAD(iph->daddr))); | 218 | spi, NIPQUAD(iph->daddr)); |
| 219 | xfrm_state_put(x); | 219 | xfrm_state_put(x); |
| 220 | } | 220 | } |
| 221 | 221 | ||
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index d2bf8e1930a3..63e106605f28 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c | |||
| @@ -393,7 +393,7 @@ static int __init ic_defaults(void) | |||
| 393 | 393 | ||
| 394 | #ifdef IPCONFIG_RARP | 394 | #ifdef IPCONFIG_RARP |
| 395 | 395 | ||
| 396 | static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); | 396 | static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); |
| 397 | 397 | ||
| 398 | static struct packet_type rarp_packet_type __initdata = { | 398 | static struct packet_type rarp_packet_type __initdata = { |
| 399 | .type = __constant_htons(ETH_P_RARP), | 399 | .type = __constant_htons(ETH_P_RARP), |
| @@ -414,7 +414,7 @@ static inline void ic_rarp_cleanup(void) | |||
| 414 | * Process received RARP packet. | 414 | * Process received RARP packet. |
| 415 | */ | 415 | */ |
| 416 | static int __init | 416 | static int __init |
| 417 | ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) | 417 | ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) |
| 418 | { | 418 | { |
| 419 | struct arphdr *rarp; | 419 | struct arphdr *rarp; |
| 420 | unsigned char *rarp_ptr; | 420 | unsigned char *rarp_ptr; |
| @@ -555,7 +555,7 @@ struct bootp_pkt { /* BOOTP packet format */ | |||
| 555 | #define DHCPRELEASE 7 | 555 | #define DHCPRELEASE 7 |
| 556 | #define DHCPINFORM 8 | 556 | #define DHCPINFORM 8 |
| 557 | 557 | ||
| 558 | static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); | 558 | static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); |
| 559 | 559 | ||
| 560 | static struct packet_type bootp_packet_type __initdata = { | 560 | static struct packet_type bootp_packet_type __initdata = { |
| 561 | .type = __constant_htons(ETH_P_IP), | 561 | .type = __constant_htons(ETH_P_IP), |
| @@ -823,7 +823,7 @@ static void __init ic_do_bootp_ext(u8 *ext) | |||
| 823 | /* | 823 | /* |
| 824 | * Receive BOOTP reply. | 824 | * Receive BOOTP reply. |
| 825 | */ | 825 | */ |
| 826 | static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) | 826 | static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) |
| 827 | { | 827 | { |
| 828 | struct bootp_pkt *b; | 828 | struct bootp_pkt *b; |
| 829 | struct iphdr *h; | 829 | struct iphdr *h; |
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index dc806b578427..9dbf5909f3a6 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c | |||
| @@ -103,7 +103,7 @@ static DEFINE_SPINLOCK(mfc_unres_lock); | |||
| 103 | In this case data path is free of exclusive locks at all. | 103 | In this case data path is free of exclusive locks at all. |
| 104 | */ | 104 | */ |
| 105 | 105 | ||
| 106 | static kmem_cache_t *mrt_cachep; | 106 | static kmem_cache_t *mrt_cachep __read_mostly; |
| 107 | 107 | ||
| 108 | static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local); | 108 | static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local); |
| 109 | static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert); | 109 | static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert); |
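
Several hunks in this section (ip_statistics, mrt_cachep, the IPVS and conntrack caches) gain a __read_mostly annotation. On architectures that support it, this is a section attribute that groups rarely-written globals together so they do not share cache lines with write-hot data. A hedged sketch of what it expands to; the exact section name is architecture-specific and the macro is a no-op where unsupported:

/* Assumed expansion, for illustration only. */
#define __read_mostly __attribute__((__section__(".data.read_mostly")))

static int lookup_table_size __read_mostly = 1024;	/* written once at boot */
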
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index d9212addd193..6e092dadb388 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c | |||
| @@ -26,6 +26,7 @@ | |||
| 26 | #include <linux/in.h> | 26 | #include <linux/in.h> |
| 27 | #include <linux/ip.h> | 27 | #include <linux/ip.h> |
| 28 | #include <net/protocol.h> | 28 | #include <net/protocol.h> |
| 29 | #include <net/tcp.h> | ||
| 29 | #include <asm/system.h> | 30 | #include <asm/system.h> |
| 30 | #include <linux/stat.h> | 31 | #include <linux/stat.h> |
| 31 | #include <linux/proc_fs.h> | 32 | #include <linux/proc_fs.h> |
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index d0145a8b1551..e11952ea17af 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c | |||
| @@ -40,7 +40,7 @@ | |||
| 40 | static struct list_head *ip_vs_conn_tab; | 40 | static struct list_head *ip_vs_conn_tab; |
| 41 | 41 | ||
| 42 | /* SLAB cache for IPVS connections */ | 42 | /* SLAB cache for IPVS connections */ |
| 43 | static kmem_cache_t *ip_vs_conn_cachep; | 43 | static kmem_cache_t *ip_vs_conn_cachep __read_mostly; |
| 44 | 44 | ||
| 45 | /* counter for current IPVS connections */ | 45 | /* counter for current IPVS connections */ |
| 46 | static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); | 46 | static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); |
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 5fb257dd07cb..3ac7eeca04ac 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | * | 22 | * |
| 23 | * Changes: | 23 | * Changes: |
| 24 | * Paul `Rusty' Russell properly handle non-linear skbs | 24 | * Paul `Rusty' Russell properly handle non-linear skbs |
| 25 | * Harald Welte don't use nfcache | ||
| 25 | * | 26 | * |
| 26 | */ | 27 | */ |
| 27 | 28 | ||
| @@ -529,7 +530,7 @@ static unsigned int ip_vs_post_routing(unsigned int hooknum, | |||
| 529 | const struct net_device *out, | 530 | const struct net_device *out, |
| 530 | int (*okfn)(struct sk_buff *)) | 531 | int (*okfn)(struct sk_buff *)) |
| 531 | { | 532 | { |
| 532 | if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY)) | 533 | if (!((*pskb)->ipvs_property)) |
| 533 | return NF_ACCEPT; | 534 | return NF_ACCEPT; |
| 534 | 535 | ||
| 535 | /* The packet was sent from IPVS, exit this chain */ | 536 | /* The packet was sent from IPVS, exit this chain */ |
| @@ -701,7 +702,7 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related) | |||
| 701 | /* do the statistics and put it back */ | 702 | /* do the statistics and put it back */ |
| 702 | ip_vs_out_stats(cp, skb); | 703 | ip_vs_out_stats(cp, skb); |
| 703 | 704 | ||
| 704 | skb->nfcache |= NFC_IPVS_PROPERTY; | 705 | skb->ipvs_property = 1; |
| 705 | verdict = NF_ACCEPT; | 706 | verdict = NF_ACCEPT; |
| 706 | 707 | ||
| 707 | out: | 708 | out: |
| @@ -739,7 +740,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, | |||
| 739 | 740 | ||
| 740 | EnterFunction(11); | 741 | EnterFunction(11); |
| 741 | 742 | ||
| 742 | if (skb->nfcache & NFC_IPVS_PROPERTY) | 743 | if (skb->ipvs_property) |
| 743 | return NF_ACCEPT; | 744 | return NF_ACCEPT; |
| 744 | 745 | ||
| 745 | iph = skb->nh.iph; | 746 | iph = skb->nh.iph; |
| @@ -821,7 +822,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, | |||
| 821 | ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); | 822 | ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); |
| 822 | ip_vs_conn_put(cp); | 823 | ip_vs_conn_put(cp); |
| 823 | 824 | ||
| 824 | skb->nfcache |= NFC_IPVS_PROPERTY; | 825 | skb->ipvs_property = 1; |
| 825 | 826 | ||
| 826 | LeaveFunction(11); | 827 | LeaveFunction(11); |
| 827 | return NF_ACCEPT; | 828 | return NF_ACCEPT; |
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 7d99ede2ef79..2d66848e7aa0 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c | |||
| @@ -1598,7 +1598,7 @@ static ctl_table vs_table[] = { | |||
| 1598 | { .ctl_name = 0 } | 1598 | { .ctl_name = 0 } |
| 1599 | }; | 1599 | }; |
| 1600 | 1600 | ||
| 1601 | static ctl_table ipv4_table[] = { | 1601 | static ctl_table ipvs_ipv4_table[] = { |
| 1602 | { | 1602 | { |
| 1603 | .ctl_name = NET_IPV4, | 1603 | .ctl_name = NET_IPV4, |
| 1604 | .procname = "ipv4", | 1604 | .procname = "ipv4", |
| @@ -1613,7 +1613,7 @@ static ctl_table vs_root_table[] = { | |||
| 1613 | .ctl_name = CTL_NET, | 1613 | .ctl_name = CTL_NET, |
| 1614 | .procname = "net", | 1614 | .procname = "net", |
| 1615 | .mode = 0555, | 1615 | .mode = 0555, |
| 1616 | .child = ipv4_table, | 1616 | .child = ipvs_ipv4_table, |
| 1617 | }, | 1617 | }, |
| 1618 | { .ctl_name = 0 } | 1618 | { .ctl_name = 0 } |
| 1619 | }; | 1619 | }; |
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index c035838b780a..561cda326fa8 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c | |||
| @@ -131,7 +131,7 @@ static ctl_table vs_table[] = { | |||
| 131 | { .ctl_name = 0 } | 131 | { .ctl_name = 0 } |
| 132 | }; | 132 | }; |
| 133 | 133 | ||
| 134 | static ctl_table ipv4_table[] = { | 134 | static ctl_table ipvs_ipv4_table[] = { |
| 135 | { | 135 | { |
| 136 | .ctl_name = NET_IPV4, | 136 | .ctl_name = NET_IPV4, |
| 137 | .procname = "ipv4", | 137 | .procname = "ipv4", |
| @@ -146,7 +146,7 @@ static ctl_table lblc_root_table[] = { | |||
| 146 | .ctl_name = CTL_NET, | 146 | .ctl_name = CTL_NET, |
| 147 | .procname = "net", | 147 | .procname = "net", |
| 148 | .mode = 0555, | 148 | .mode = 0555, |
| 149 | .child = ipv4_table | 149 | .child = ipvs_ipv4_table |
| 150 | }, | 150 | }, |
| 151 | { .ctl_name = 0 } | 151 | { .ctl_name = 0 } |
| 152 | }; | 152 | }; |
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index 22b5dd55d271..ce456dbf09a5 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c | |||
| @@ -320,7 +320,7 @@ static ctl_table vs_table[] = { | |||
| 320 | { .ctl_name = 0 } | 320 | { .ctl_name = 0 } |
| 321 | }; | 321 | }; |
| 322 | 322 | ||
| 323 | static ctl_table ipv4_table[] = { | 323 | static ctl_table ipvs_ipv4_table[] = { |
| 324 | { | 324 | { |
| 325 | .ctl_name = NET_IPV4, | 325 | .ctl_name = NET_IPV4, |
| 326 | .procname = "ipv4", | 326 | .procname = "ipv4", |
| @@ -335,7 +335,7 @@ static ctl_table lblcr_root_table[] = { | |||
| 335 | .ctl_name = CTL_NET, | 335 | .ctl_name = CTL_NET, |
| 336 | .procname = "net", | 336 | .procname = "net", |
| 337 | .mode = 0555, | 337 | .mode = 0555, |
| 338 | .child = ipv4_table | 338 | .child = ipvs_ipv4_table |
| 339 | }, | 339 | }, |
| 340 | { .ctl_name = 0 } | 340 | { .ctl_name = 0 } |
| 341 | }; | 341 | }; |
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index e65de675da74..c19408973c09 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c | |||
| @@ -604,14 +604,14 @@ void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) | |||
| 604 | } | 604 | } |
| 605 | 605 | ||
| 606 | 606 | ||
| 607 | static void tcp_init(struct ip_vs_protocol *pp) | 607 | static void ip_vs_tcp_init(struct ip_vs_protocol *pp) |
| 608 | { | 608 | { |
| 609 | IP_VS_INIT_HASH_TABLE(tcp_apps); | 609 | IP_VS_INIT_HASH_TABLE(tcp_apps); |
| 610 | pp->timeout_table = tcp_timeouts; | 610 | pp->timeout_table = tcp_timeouts; |
| 611 | } | 611 | } |
| 612 | 612 | ||
| 613 | 613 | ||
| 614 | static void tcp_exit(struct ip_vs_protocol *pp) | 614 | static void ip_vs_tcp_exit(struct ip_vs_protocol *pp) |
| 615 | { | 615 | { |
| 616 | } | 616 | } |
| 617 | 617 | ||
| @@ -621,8 +621,8 @@ struct ip_vs_protocol ip_vs_protocol_tcp = { | |||
| 621 | .protocol = IPPROTO_TCP, | 621 | .protocol = IPPROTO_TCP, |
| 622 | .dont_defrag = 0, | 622 | .dont_defrag = 0, |
| 623 | .appcnt = ATOMIC_INIT(0), | 623 | .appcnt = ATOMIC_INIT(0), |
| 624 | .init = tcp_init, | 624 | .init = ip_vs_tcp_init, |
| 625 | .exit = tcp_exit, | 625 | .exit = ip_vs_tcp_exit, |
| 626 | .register_app = tcp_register_app, | 626 | .register_app = tcp_register_app, |
| 627 | .unregister_app = tcp_unregister_app, | 627 | .unregister_app = tcp_unregister_app, |
| 628 | .conn_schedule = tcp_conn_schedule, | 628 | .conn_schedule = tcp_conn_schedule, |
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index a8512a3fd08a..3b87482049cf 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c | |||
| @@ -127,7 +127,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) | |||
| 127 | 127 | ||
| 128 | #define IP_VS_XMIT(skb, rt) \ | 128 | #define IP_VS_XMIT(skb, rt) \ |
| 129 | do { \ | 129 | do { \ |
| 130 | (skb)->nfcache |= NFC_IPVS_PROPERTY; \ | 130 | (skb)->ipvs_property = 1; \ |
| 131 | (skb)->ip_summed = CHECKSUM_NONE; \ | 131 | (skb)->ip_summed = CHECKSUM_NONE; \ |
| 132 | NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ | 132 | NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ |
| 133 | (rt)->u.dst.dev, dst_output); \ | 133 | (rt)->u.dst.dev, dst_output); \ |
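
The IP_VS_XMIT change, together with the ip_vs_core.c hunks above, replaces the NFC_IPVS_PROPERTY bit that IPVS used to steal from skb->nfcache with a dedicated skb field. An illustrative before/after using a stand-in struct rather than struct sk_buff; the flag value shown is made up:

struct pkt {
	unsigned long nfcache;		/* old: flag word shared with netfilter */
	unsigned int  ipvs_property:1;	/* new: bit private to IPVS */
};

#define NFC_IPVS_PROPERTY 0x10000UL	/* illustrative value only */

static void mark_old(struct pkt *p) { p->nfcache |= NFC_IPVS_PROPERTY; }
static void mark_new(struct pkt *p) { p->ipvs_property = 1; }
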
diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c index c9cf8726051d..db67373f9b34 100644 --- a/net/ipv4/multipath_drr.c +++ b/net/ipv4/multipath_drr.c | |||
| @@ -107,7 +107,7 @@ static int drr_dev_event(struct notifier_block *this, | |||
| 107 | return NOTIFY_DONE; | 107 | return NOTIFY_DONE; |
| 108 | } | 108 | } |
| 109 | 109 | ||
| 110 | struct notifier_block drr_dev_notifier = { | 110 | static struct notifier_block drr_dev_notifier = { |
| 111 | .notifier_call = drr_dev_event, | 111 | .notifier_call = drr_dev_event, |
| 112 | }; | 112 | }; |
| 113 | 113 | ||
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c new file mode 100644 index 000000000000..ae0779d82c5d --- /dev/null +++ b/net/ipv4/netfilter.c | |||
| @@ -0,0 +1,139 @@ | |||
| 1 | /* IPv4 specific functions of netfilter core */ | ||
| 2 | |||
| 3 | #include <linux/config.h> | ||
| 4 | #ifdef CONFIG_NETFILTER | ||
| 5 | |||
| 6 | #include <linux/kernel.h> | ||
| 7 | #include <linux/netfilter.h> | ||
| 8 | #include <linux/netfilter_ipv4.h> | ||
| 9 | |||
| 10 | #include <linux/tcp.h> | ||
| 11 | #include <linux/udp.h> | ||
| 12 | #include <linux/icmp.h> | ||
| 13 | #include <net/route.h> | ||
| 14 | #include <linux/ip.h> | ||
| 15 | |||
| 16 | /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ | ||
| 17 | int ip_route_me_harder(struct sk_buff **pskb) | ||
| 18 | { | ||
| 19 | struct iphdr *iph = (*pskb)->nh.iph; | ||
| 20 | struct rtable *rt; | ||
| 21 | struct flowi fl = {}; | ||
| 22 | struct dst_entry *odst; | ||
| 23 | unsigned int hh_len; | ||
| 24 | |||
| 25 | /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause | ||
| 26 | * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook. | ||
| 27 | */ | ||
| 28 | if (inet_addr_type(iph->saddr) == RTN_LOCAL) { | ||
| 29 | fl.nl_u.ip4_u.daddr = iph->daddr; | ||
| 30 | fl.nl_u.ip4_u.saddr = iph->saddr; | ||
| 31 | fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); | ||
| 32 | fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0; | ||
| 33 | #ifdef CONFIG_IP_ROUTE_FWMARK | ||
| 34 | fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark; | ||
| 35 | #endif | ||
| 36 | fl.proto = iph->protocol; | ||
| 37 | if (ip_route_output_key(&rt, &fl) != 0) | ||
| 38 | return -1; | ||
| 39 | |||
| 40 | /* Drop old route. */ | ||
| 41 | dst_release((*pskb)->dst); | ||
| 42 | (*pskb)->dst = &rt->u.dst; | ||
| 43 | } else { | ||
| 44 | /* non-local src, find a valid iif to satisfy | ||
| 45 | * rp-filter when calling ip_route_input. */ | ||
| 46 | fl.nl_u.ip4_u.daddr = iph->saddr; | ||
| 47 | if (ip_route_output_key(&rt, &fl) != 0) | ||
| 48 | return -1; | ||
| 49 | |||
| 50 | odst = (*pskb)->dst; | ||
| 51 | if (ip_route_input(*pskb, iph->daddr, iph->saddr, | ||
| 52 | RT_TOS(iph->tos), rt->u.dst.dev) != 0) { | ||
| 53 | dst_release(&rt->u.dst); | ||
| 54 | return -1; | ||
| 55 | } | ||
| 56 | dst_release(&rt->u.dst); | ||
| 57 | dst_release(odst); | ||
| 58 | } | ||
| 59 | |||
| 60 | if ((*pskb)->dst->error) | ||
| 61 | return -1; | ||
| 62 | |||
| 63 | /* Change in oif may mean change in hh_len. */ | ||
| 64 | hh_len = (*pskb)->dst->dev->hard_header_len; | ||
| 65 | if (skb_headroom(*pskb) < hh_len) { | ||
| 66 | struct sk_buff *nskb; | ||
| 67 | |||
| 68 | nskb = skb_realloc_headroom(*pskb, hh_len); | ||
| 69 | if (!nskb) | ||
| 70 | return -1; | ||
| 71 | if ((*pskb)->sk) | ||
| 72 | skb_set_owner_w(nskb, (*pskb)->sk); | ||
| 73 | kfree_skb(*pskb); | ||
| 74 | *pskb = nskb; | ||
| 75 | } | ||
| 76 | |||
| 77 | return 0; | ||
| 78 | } | ||
| 79 | EXPORT_SYMBOL(ip_route_me_harder); | ||
| 80 | |||
| 81 | /* | ||
| 82 | * Extra routing may be needed on local out, as the QUEUE target never | ||
| 83 | * returns control to the table. | ||
| 84 | */ | ||
| 85 | |||
| 86 | struct ip_rt_info { | ||
| 87 | u_int32_t daddr; | ||
| 88 | u_int32_t saddr; | ||
| 89 | u_int8_t tos; | ||
| 90 | }; | ||
| 91 | |||
| 92 | static void queue_save(const struct sk_buff *skb, struct nf_info *info) | ||
| 93 | { | ||
| 94 | struct ip_rt_info *rt_info = nf_info_reroute(info); | ||
| 95 | |||
| 96 | if (info->hook == NF_IP_LOCAL_OUT) { | ||
| 97 | const struct iphdr *iph = skb->nh.iph; | ||
| 98 | |||
| 99 | rt_info->tos = iph->tos; | ||
| 100 | rt_info->daddr = iph->daddr; | ||
| 101 | rt_info->saddr = iph->saddr; | ||
| 102 | } | ||
| 103 | } | ||
| 104 | |||
| 105 | static int queue_reroute(struct sk_buff **pskb, const struct nf_info *info) | ||
| 106 | { | ||
| 107 | const struct ip_rt_info *rt_info = nf_info_reroute(info); | ||
| 108 | |||
| 109 | if (info->hook == NF_IP_LOCAL_OUT) { | ||
| 110 | struct iphdr *iph = (*pskb)->nh.iph; | ||
| 111 | |||
| 112 | if (!(iph->tos == rt_info->tos | ||
| 113 | && iph->daddr == rt_info->daddr | ||
| 114 | && iph->saddr == rt_info->saddr)) | ||
| 115 | return ip_route_me_harder(pskb); | ||
| 116 | } | ||
| 117 | return 0; | ||
| 118 | } | ||
| 119 | |||
| 120 | static struct nf_queue_rerouter ip_reroute = { | ||
| 121 | .rer_size = sizeof(struct ip_rt_info), | ||
| 122 | .save = queue_save, | ||
| 123 | .reroute = queue_reroute, | ||
| 124 | }; | ||
| 125 | |||
| 126 | static int init(void) | ||
| 127 | { | ||
| 128 | return nf_register_queue_rerouter(PF_INET, &ip_reroute); | ||
| 129 | } | ||
| 130 | |||
| 131 | static void fini(void) | ||
| 132 | { | ||
| 133 | nf_unregister_queue_rerouter(PF_INET); | ||
| 134 | } | ||
| 135 | |||
| 136 | module_init(init); | ||
| 137 | module_exit(fini); | ||
| 138 | |||
| 139 | #endif /* CONFIG_NETFILTER */ | ||
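
The new queue rerouter above saves the routing-relevant header fields before a packet is handed to userspace and re-routes only if they come back modified. A userspace sketch of that save/compare pattern, with illustrative types; the kernel version additionally calls ip_route_me_harder() when a reroute is required:

#include <stdint.h>
#include <stdbool.h>

/* Fields whose change would invalidate the route chosen earlier. */
struct hdr_key {
	uint32_t daddr;
	uint32_t saddr;
	uint8_t  tos;
};

/* Analogue of queue_save(): snapshot before the packet is queued. */
static void key_save(struct hdr_key *key, const struct hdr_key *hdr)
{
	*key = *hdr;
}

/* Analogue of queue_reroute(): true when a fresh lookup is required. */
static bool key_needs_reroute(const struct hdr_key *key,
			      const struct hdr_key *hdr)
{
	return hdr->tos != key->tos ||
	       hdr->daddr != key->daddr ||
	       hdr->saddr != key->saddr;
}
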
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 46d4cb1c06f0..e046f5521814 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig | |||
| @@ -40,6 +40,16 @@ config IP_NF_CONNTRACK_MARK | |||
| 40 | of packets, but this mark value is kept in the conntrack session | 40 | of packets, but this mark value is kept in the conntrack session |
| 41 | instead of the individual packets. | 41 | instead of the individual packets. |
| 42 | 42 | ||
| 43 | config IP_NF_CONNTRACK_EVENTS | ||
| 44 | bool "Connection tracking events" | ||
| 45 | depends on IP_NF_CONNTRACK | ||
| 46 | help | ||
| 47 | If this option is enabled, the connection tracking code will | ||
| 48 | provide a notifier chain that can be used by other kernel code | ||
| 49 | to get notified about changes in the connection tracking state. | ||
| 50 | |||
| 51 | If unsure, say `N'. | ||
| 52 | |||
| 43 | config IP_NF_CT_PROTO_SCTP | 53 | config IP_NF_CT_PROTO_SCTP |
| 44 | tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' | 54 | tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' |
| 45 | depends on IP_NF_CONNTRACK && EXPERIMENTAL | 55 | depends on IP_NF_CONNTRACK && EXPERIMENTAL |
| @@ -100,11 +110,15 @@ config IP_NF_AMANDA | |||
| 100 | To compile it as a module, choose M here. If unsure, say Y. | 110 | To compile it as a module, choose M here. If unsure, say Y. |
| 101 | 111 | ||
| 102 | config IP_NF_QUEUE | 112 | config IP_NF_QUEUE |
| 103 | tristate "Userspace queueing via NETLINK" | 113 | tristate "IP Userspace queueing via NETLINK (OBSOLETE)" |
| 104 | help | 114 | help |
| 105 | Netfilter has the ability to queue packets to user space: the | 115 | Netfilter has the ability to queue packets to user space: the |
| 106 | netlink device can be used to access them using this driver. | 116 | netlink device can be used to access them using this driver. |
| 107 | 117 | ||
| 118 | This option enables the old IPv4-only "ip_queue" implementation | ||
| 119 | which has been obsoleted by the new "nfnetlink_queue" code (see | ||
| 120 | CONFIG_NETFILTER_NETLINK_QUEUE). | ||
| 121 | |||
| 108 | To compile it as a module, choose M here. If unsure, say N. | 122 | To compile it as a module, choose M here. If unsure, say N. |
| 109 | 123 | ||
| 110 | config IP_NF_IPTABLES | 124 | config IP_NF_IPTABLES |
| @@ -340,6 +354,17 @@ config IP_NF_MATCH_SCTP | |||
| 340 | If you want to compile it as a module, say M here and read | 354 | If you want to compile it as a module, say M here and read |
| 341 | <file:Documentation/modules.txt>. If unsure, say `N'. | 355 | <file:Documentation/modules.txt>. If unsure, say `N'. |
| 342 | 356 | ||
| 357 | config IP_NF_MATCH_DCCP | ||
| 358 | tristate 'DCCP protocol match support' | ||
| 359 | depends on IP_NF_IPTABLES | ||
| 360 | help | ||
| 361 | With this option enabled, you will be able to use the iptables | ||
| 362 | `dccp' match in order to match on DCCP source/destination ports | ||
| 363 | and DCCP flags. | ||
| 364 | |||
| 365 | If you want to compile it as a module, say M here and read | ||
| 366 | <file:Documentation/modules.txt>. If unsure, say `N'. | ||
| 367 | |||
| 343 | config IP_NF_MATCH_COMMENT | 368 | config IP_NF_MATCH_COMMENT |
| 344 | tristate 'comment match support' | 369 | tristate 'comment match support' |
| 345 | depends on IP_NF_IPTABLES | 370 | depends on IP_NF_IPTABLES |
| @@ -361,6 +386,16 @@ config IP_NF_MATCH_CONNMARK | |||
| 361 | <file:Documentation/modules.txt>. The module will be called | 386 | <file:Documentation/modules.txt>. The module will be called |
| 362 | ipt_connmark.o. If unsure, say `N'. | 387 | ipt_connmark.o. If unsure, say `N'. |
| 363 | 388 | ||
| 389 | config IP_NF_MATCH_CONNBYTES | ||
| 390 | tristate 'Connection byte/packet counter match support' | ||
| 391 | depends on IP_NF_CT_ACCT && IP_NF_IPTABLES | ||
| 392 | help | ||
| 393 | This option adds a `connbytes' match, which allows you to match the | ||
| 394 | number of bytes and/or packets for each direction within a connection. | ||
| 395 | |||
| 396 | If you want to compile it as a module, say M here and read | ||
| 397 | <file:Documentation/modules.txt>. If unsure, say `N'. | ||
| 398 | |||
| 364 | config IP_NF_MATCH_HASHLIMIT | 399 | config IP_NF_MATCH_HASHLIMIT |
| 365 | tristate 'hashlimit match support' | 400 | tristate 'hashlimit match support' |
| 366 | depends on IP_NF_IPTABLES | 401 | depends on IP_NF_IPTABLES |
| @@ -375,6 +410,19 @@ config IP_NF_MATCH_HASHLIMIT | |||
| 375 | destination IP' or `500pps from any given source IP' with a single | 410 | destination IP' or `500pps from any given source IP' with a single |
| 376 | IPtables rule. | 411 | IPtables rule. |
| 377 | 412 | ||
| 413 | config IP_NF_MATCH_STRING | ||
| 414 | tristate 'string match support' | ||
| 415 | depends on IP_NF_IPTABLES | ||
| 416 | select TEXTSEARCH | ||
| 417 | select TEXTSEARCH_KMP | ||
| 418 | select TEXTSEARCH_BM | ||
| 419 | select TEXTSEARCH_FSM | ||
| 420 | help | ||
| 421 | This option adds a `string' match, which allows you to look for | ||
| 422 | string patterns in packets. | ||
| 423 | |||
| 424 | To compile it as a module, choose M here. If unsure, say N. | ||
| 425 | |||
| 378 | # `filter', generic and specific targets | 426 | # `filter', generic and specific targets |
| 379 | config IP_NF_FILTER | 427 | config IP_NF_FILTER |
| 380 | tristate "Packet filtering" | 428 | tristate "Packet filtering" |
| @@ -616,6 +664,20 @@ config IP_NF_TARGET_CLASSIFY | |||
| 616 | 664 | ||
| 617 | To compile it as a module, choose M here. If unsure, say N. | 665 | To compile it as a module, choose M here. If unsure, say N. |
| 618 | 666 | ||
| 667 | config IP_NF_TARGET_TTL | ||
| 668 | tristate 'TTL target support' | ||
| 669 | depends on IP_NF_MANGLE | ||
| 670 | help | ||
| 671 | This option adds a `TTL' target, which enables the user to modify | ||
| 672 | the TTL value of the IP header. | ||
| 673 | |||
| 674 | While it is safe to decrement/lower the TTL, this target also enables | ||
| 675 | functionality to increment and set the TTL value of the IP header to | ||
| 676 | arbitrary values. This is EXTREMELY DANGEROUS since you can easily | ||
| 677 | create immortal packets that loop forever on the network. | ||
| 678 | |||
| 679 | To compile it as a module, choose M here. If unsure, say N. | ||
| 680 | |||
| 619 | config IP_NF_TARGET_CONNMARK | 681 | config IP_NF_TARGET_CONNMARK |
| 620 | tristate 'CONNMARK target support' | 682 | tristate 'CONNMARK target support' |
| 621 | depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE | 683 | depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE |
| @@ -692,5 +754,11 @@ config IP_NF_ARP_MANGLE | |||
| 692 | Allows altering the ARP packet payload: source and destination | 754 | Allows altering the ARP packet payload: source and destination |
| 693 | hardware and network addresses. | 755 | hardware and network addresses. |
| 694 | 756 | ||
| 757 | config IP_NF_CONNTRACK_NETLINK | ||
| 758 | tristate 'Connection tracking netlink interface' | ||
| 759 | depends on IP_NF_CONNTRACK && NETFILTER_NETLINK | ||
| 760 | help | ||
| 761 | This option enables support for a netlink-based userspace interface. | ||
| 762 | |||
| 695 | endmenu | 763 | endmenu |
| 696 | 764 | ||
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 45796d5924dd..a7bd38f50522 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile | |||
| @@ -9,6 +9,10 @@ iptable_nat-objs := ip_nat_standalone.o ip_nat_rule.o ip_nat_core.o ip_nat_helpe | |||
| 9 | # connection tracking | 9 | # connection tracking |
| 10 | obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o | 10 | obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o |
| 11 | 11 | ||
| 12 | # conntrack netlink interface | ||
| 13 | obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o | ||
| 14 | |||
| 12 | # SCTP protocol connection tracking | 16 | # SCTP protocol connection tracking |
| 13 | obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o | 17 | obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o |
| 14 | 18 | ||
| @@ -38,6 +42,7 @@ obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o | |||
| 38 | obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o | 42 | obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o |
| 39 | obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o | 43 | obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o |
| 40 | obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o | 44 | obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o |
| 45 | obj-$(CONFIG_IP_NF_MATCH_DCCP) += ipt_dccp.o | ||
| 41 | obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o | 46 | obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o |
| 42 | obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o | 47 | obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o |
| 43 | obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o | 48 | obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o |
| @@ -54,11 +59,13 @@ obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o | |||
| 54 | obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o | 59 | obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o |
| 55 | obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o | 60 | obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o |
| 56 | obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o | 61 | obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o |
| 62 | obj-$(CONFIG_IP_NF_MATCH_CONNBYTES) += ipt_connbytes.o | ||
| 57 | obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o | 63 | obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o |
| 58 | obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o | 64 | obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o |
| 59 | obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o | 65 | obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o |
| 60 | obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o | 66 | obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o |
| 61 | obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o | 67 | obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o |
| 68 | obj-$(CONFIG_IP_NF_MATCH_STRING) += ipt_string.o | ||
| 62 | 69 | ||
| 63 | # targets | 70 | # targets |
| 64 | obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o | 71 | obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o |
| @@ -78,6 +85,7 @@ obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o | |||
| 78 | obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o | 85 | obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o |
| 79 | obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o | 86 | obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o |
| 80 | obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o | 87 | obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o |
| 88 | obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o | ||
| 81 | 89 | ||
| 82 | # generic ARP tables | 90 | # generic ARP tables |
| 83 | obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o | 91 | obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o |
| @@ -87,3 +95,4 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o | |||
| 87 | obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o | 95 | obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o |
| 88 | 96 | ||
| 89 | obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o | 97 | obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o |
| 98 | obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += ipt_NFQUEUE.o | ||
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index 01e1b58322a9..be4c9eb3243f 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c | |||
| @@ -40,7 +40,7 @@ MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); | |||
| 40 | static char *conns[] = { "DATA ", "MESG ", "INDEX " }; | 40 | static char *conns[] = { "DATA ", "MESG ", "INDEX " }; |
| 41 | 41 | ||
| 42 | /* This is slow, but it's simple. --RR */ | 42 | /* This is slow, but it's simple. --RR */ |
| 43 | static char amanda_buffer[65536]; | 43 | static char *amanda_buffer; |
| 44 | static DEFINE_SPINLOCK(amanda_buffer_lock); | 44 | static DEFINE_SPINLOCK(amanda_buffer_lock); |
| 45 | 45 | ||
| 46 | unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, | 46 | unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, |
| @@ -153,11 +153,25 @@ static struct ip_conntrack_helper amanda_helper = { | |||
| 153 | static void __exit fini(void) | 153 | static void __exit fini(void) |
| 154 | { | 154 | { |
| 155 | ip_conntrack_helper_unregister(&amanda_helper); | 155 | ip_conntrack_helper_unregister(&amanda_helper); |
| 156 | kfree(amanda_buffer); | ||
| 156 | } | 157 | } |
| 157 | 158 | ||
| 158 | static int __init init(void) | 159 | static int __init init(void) |
| 159 | { | 160 | { |
| 160 | return ip_conntrack_helper_register(&amanda_helper); | 161 | int ret; |
| 162 | |||
| 163 | amanda_buffer = kmalloc(65536, GFP_KERNEL); | ||
| 164 | if (!amanda_buffer) | ||
| 165 | return -ENOMEM; | ||
| 166 | |||
| 167 | ret = ip_conntrack_helper_register(&amanda_helper); | ||
| 168 | if (ret < 0) { | ||
| 169 | kfree(amanda_buffer); | ||
| 170 | return ret; | ||
| 171 | } | ||
| 172 | return 0; | ||
| 161 | } | 175 | } |
| 162 | 176 | ||
| 163 | module_init(init); | 177 | module_init(init); |
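
The amanda helper change is a standard conversion from a 64 KB static buffer to an allocation made at module load, which obliges init() to unwind on registration failure and fini() to free the buffer. A userspace sketch of the same init/fini discipline, with stand-in register/unregister hooks:

#include <stdlib.h>

static char *amanda_buf;

/* Stand-ins for the helper (un)registration API. */
static int helper_register(void) { return 0; }
static void helper_unregister(void) { }

static int module_load(void)
{
	int ret;

	amanda_buf = malloc(65536);
	if (!amanda_buf)
		return -1;			/* -ENOMEM in the kernel */

	ret = helper_register();
	if (ret < 0) {
		free(amanda_buf);		/* unwind: no leak on failure */
		return ret;
	}
	return 0;
}

static void module_unload(void)
{
	helper_unregister();
	free(amanda_buf);
}
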
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index a7f0c821a9b2..a0648600190e 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c | |||
| @@ -37,6 +37,7 @@ | |||
| 37 | #include <linux/err.h> | 37 | #include <linux/err.h> |
| 38 | #include <linux/percpu.h> | 38 | #include <linux/percpu.h> |
| 39 | #include <linux/moduleparam.h> | 39 | #include <linux/moduleparam.h> |
| 40 | #include <linux/notifier.h> | ||
| 40 | 41 | ||
| 41 | /* ip_conntrack_lock protects the main hash table, protocol/helper/expected | 42 | /* ip_conntrack_lock protects the main hash table, protocol/helper/expected |
| 42 | registrations, conntrack timers*/ | 43 | registrations, conntrack timers*/ |
| @@ -49,7 +50,7 @@ | |||
| 49 | #include <linux/netfilter_ipv4/ip_conntrack_core.h> | 50 | #include <linux/netfilter_ipv4/ip_conntrack_core.h> |
| 50 | #include <linux/netfilter_ipv4/listhelp.h> | 51 | #include <linux/netfilter_ipv4/listhelp.h> |
| 51 | 52 | ||
| 52 | #define IP_CONNTRACK_VERSION "2.1" | 53 | #define IP_CONNTRACK_VERSION "2.3" |
| 53 | 54 | ||
| 54 | #if 0 | 55 | #if 0 |
| 55 | #define DEBUGP printk | 56 | #define DEBUGP printk |
| @@ -69,22 +70,81 @@ static LIST_HEAD(helpers); | |||
| 69 | unsigned int ip_conntrack_htable_size = 0; | 70 | unsigned int ip_conntrack_htable_size = 0; |
| 70 | int ip_conntrack_max; | 71 | int ip_conntrack_max; |
| 71 | struct list_head *ip_conntrack_hash; | 72 | struct list_head *ip_conntrack_hash; |
| 72 | static kmem_cache_t *ip_conntrack_cachep; | 73 | static kmem_cache_t *ip_conntrack_cachep __read_mostly; |
| 73 | static kmem_cache_t *ip_conntrack_expect_cachep; | 74 | static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly; |
| 74 | struct ip_conntrack ip_conntrack_untracked; | 75 | struct ip_conntrack ip_conntrack_untracked; |
| 75 | unsigned int ip_ct_log_invalid; | 76 | unsigned int ip_ct_log_invalid; |
| 76 | static LIST_HEAD(unconfirmed); | 77 | static LIST_HEAD(unconfirmed); |
| 77 | static int ip_conntrack_vmalloc; | 78 | static int ip_conntrack_vmalloc; |
| 78 | 79 | ||
| 79 | DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); | 80 | static unsigned int ip_conntrack_next_id = 1; |
| 81 | static unsigned int ip_conntrack_expect_next_id = 1; | ||
| 82 | #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS | ||
| 83 | struct notifier_block *ip_conntrack_chain; | ||
| 84 | struct notifier_block *ip_conntrack_expect_chain; | ||
| 85 | |||
| 86 | DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache); | ||
| 80 | 87 | ||
| 81 | void | 88 | /* deliver cached events and clear cache entry - must be called with locally |
| 82 | ip_conntrack_put(struct ip_conntrack *ct) | 89 | * disabled softirqs */ |
| 90 | static inline void | ||
| 91 | __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache) | ||
| 83 | { | 92 | { |
| 84 | IP_NF_ASSERT(ct); | 93 | DEBUGP("ecache: delivering events for %p\n", ecache->ct); |
| 85 | nf_conntrack_put(&ct->ct_general); | 94 | if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events) |
| 95 | notifier_call_chain(&ip_conntrack_chain, ecache->events, | ||
| 96 | ecache->ct); | ||
| 97 | ecache->events = 0; | ||
| 98 | ip_conntrack_put(ecache->ct); | ||
| 99 | ecache->ct = NULL; | ||
| 86 | } | 100 | } |
| 87 | 101 | ||
| 102 | /* Deliver all cached events for a particular conntrack. This is called | ||
| 103 | * by code prior to async packet handling or freeing the skb */ | ||
| 104 | void ip_ct_deliver_cached_events(const struct ip_conntrack *ct) | ||
| 105 | { | ||
| 106 | struct ip_conntrack_ecache *ecache; | ||
| 107 | |||
| 108 | local_bh_disable(); | ||
| 109 | ecache = &__get_cpu_var(ip_conntrack_ecache); | ||
| 110 | if (ecache->ct == ct) | ||
| 111 | __ip_ct_deliver_cached_events(ecache); | ||
| 112 | local_bh_enable(); | ||
| 113 | } | ||
| 114 | |||
| 115 | void __ip_ct_event_cache_init(struct ip_conntrack *ct) | ||
| 116 | { | ||
| 117 | struct ip_conntrack_ecache *ecache; | ||
| 118 | |||
| 119 | /* take care of delivering potentially old events */ | ||
| 120 | ecache = &__get_cpu_var(ip_conntrack_ecache); | ||
| 121 | BUG_ON(ecache->ct == ct); | ||
| 122 | if (ecache->ct) | ||
| 123 | __ip_ct_deliver_cached_events(ecache); | ||
| 124 | /* initialize for this conntrack/packet */ | ||
| 125 | ecache->ct = ct; | ||
| 126 | nf_conntrack_get(&ct->ct_general); | ||
| 127 | } | ||
| 128 | |||
| 129 | /* flush the event cache - touches other CPUs' data and must not be called while | ||
| 130 | * packets are still passing through the code */ | ||
| 131 | static void ip_ct_event_cache_flush(void) | ||
| 132 | { | ||
| 133 | struct ip_conntrack_ecache *ecache; | ||
| 134 | int cpu; | ||
| 135 | |||
| 136 | for_each_cpu(cpu) { | ||
| 137 | ecache = &per_cpu(ip_conntrack_ecache, cpu); | ||
| 138 | if (ecache->ct) | ||
| 139 | ip_conntrack_put(ecache->ct); | ||
| 140 | } | ||
| 141 | } | ||
| 142 | #else | ||
| 143 | static inline void ip_ct_event_cache_flush(void) {} | ||
| 144 | #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ | ||
| 145 | |||
| 146 | DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); | ||
| 147 | |||
| 88 | static int ip_conntrack_hash_rnd_initted; | 148 | static int ip_conntrack_hash_rnd_initted; |
| 89 | static unsigned int ip_conntrack_hash_rnd; | 149 | static unsigned int ip_conntrack_hash_rnd; |
| 90 | 150 | ||
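
The event cache added above batches conntrack events per CPU: events for the connection currently being processed accumulate in a bitmask and are delivered in one notifier call when processing switches to another connection (or explicitly via ip_ct_deliver_cached_events). A single-threaded sketch of that accumulate-and-flush pattern, with hypothetical types and no locking:

#include <stdio.h>

struct conn;	/* opaque stand-in for struct ip_conntrack */

struct ecache {
	struct conn *ct;	/* connection the cached events belong to */
	unsigned int events;	/* bitmask of pending event types */
};

static struct ecache cache;	/* one instance per CPU in the kernel */

static void notify(struct conn *ct, unsigned int events)
{
	printf("deliver events %#x for connection %p\n", events, (void *)ct);
}

/* Mirror of __ip_ct_deliver_cached_events(): flush and reset the cache. */
static void ecache_deliver(void)
{
	if (cache.ct && cache.events)
		notify(cache.ct, cache.events);
	cache.events = 0;
	cache.ct = NULL;
}

/* Record an event; switching to a different connection first delivers the
 * previous batch, as __ip_ct_event_cache_init() does. */
static void ecache_event(struct conn *ct, unsigned int event)
{
	if (cache.ct != ct)
		ecache_deliver();
	cache.ct = ct;
	cache.events |= event;
}
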
| @@ -144,6 +204,13 @@ static void unlink_expect(struct ip_conntrack_expect *exp) | |||
| 144 | list_del(&exp->list); | 204 | list_del(&exp->list); |
| 145 | CONNTRACK_STAT_INC(expect_delete); | 205 | CONNTRACK_STAT_INC(expect_delete); |
| 146 | exp->master->expecting--; | 206 | exp->master->expecting--; |
| 207 | ip_conntrack_expect_put(exp); | ||
| 208 | } | ||
| 209 | |||
| 210 | void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp) | ||
| 211 | { | ||
| 212 | unlink_expect(exp); | ||
| 213 | ip_conntrack_expect_put(exp); | ||
| 147 | } | 214 | } |
| 148 | 215 | ||
| 149 | static void expectation_timed_out(unsigned long ul_expect) | 216 | static void expectation_timed_out(unsigned long ul_expect) |
| @@ -156,6 +223,33 @@ static void expectation_timed_out(unsigned long ul_expect) | |||
| 156 | ip_conntrack_expect_put(exp); | 223 | ip_conntrack_expect_put(exp); |
| 157 | } | 224 | } |
| 158 | 225 | ||
| 226 | struct ip_conntrack_expect * | ||
| 227 | __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple) | ||
| 228 | { | ||
| 229 | struct ip_conntrack_expect *i; | ||
| 230 | |||
| 231 | list_for_each_entry(i, &ip_conntrack_expect_list, list) { | ||
| 232 | if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) { | ||
| 233 | atomic_inc(&i->use); | ||
| 234 | return i; | ||
| 235 | } | ||
| 236 | } | ||
| 237 | return NULL; | ||
| 238 | } | ||
| 239 | |||
| 240 | /* Just find an expectation corresponding to a tuple. */ | ||
| 241 | struct ip_conntrack_expect * | ||
| 242 | ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple) | ||
| 243 | { | ||
| 244 | struct ip_conntrack_expect *i; | ||
| 245 | |||
| 246 | read_lock_bh(&ip_conntrack_lock); | ||
| 247 | i = __ip_conntrack_expect_find(tuple); | ||
| 248 | read_unlock_bh(&ip_conntrack_lock); | ||
| 249 | |||
| 250 | return i; | ||
| 251 | } | ||
| 252 | |||
| 159 | /* If an expectation for this connection is found, it gets deleted from | 253 | /* If an expectation for this connection is found, it gets deleted from |
| 160 | * the global list, then returned. */ | 254 | * the global list, then returned. */ |
| 161 | static struct ip_conntrack_expect * | 255 | static struct ip_conntrack_expect * |
| @@ -180,7 +274,7 @@ find_expectation(const struct ip_conntrack_tuple *tuple) | |||
| 180 | } | 274 | } |
| 181 | 275 | ||
| 182 | /* delete all expectations for this conntrack */ | 276 | /* delete all expectations for this conntrack */ |
| 183 | static void remove_expectations(struct ip_conntrack *ct) | 277 | void ip_ct_remove_expectations(struct ip_conntrack *ct) |
| 184 | { | 278 | { |
| 185 | struct ip_conntrack_expect *i, *tmp; | 279 | struct ip_conntrack_expect *i, *tmp; |
| 186 | 280 | ||
| @@ -210,7 +304,7 @@ clean_from_lists(struct ip_conntrack *ct) | |||
| 210 | LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); | 304 | LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); |
| 211 | 305 | ||
| 212 | /* Destroy all pending expectations */ | 306 | /* Destroy all pending expectations */ |
| 213 | remove_expectations(ct); | 307 | ip_ct_remove_expectations(ct); |
| 214 | } | 308 | } |
| 215 | 309 | ||
| 216 | static void | 310 | static void |
| @@ -223,10 +317,13 @@ destroy_conntrack(struct nf_conntrack *nfct) | |||
| 223 | IP_NF_ASSERT(atomic_read(&nfct->use) == 0); | 317 | IP_NF_ASSERT(atomic_read(&nfct->use) == 0); |
| 224 | IP_NF_ASSERT(!timer_pending(&ct->timeout)); | 318 | IP_NF_ASSERT(!timer_pending(&ct->timeout)); |
| 225 | 319 | ||
| 320 | ip_conntrack_event(IPCT_DESTROY, ct); | ||
| 321 | set_bit(IPS_DYING_BIT, &ct->status); | ||
| 322 | |||
| 226 | /* To make sure we don't get any weird locking issues here: | 323 | /* To make sure we don't get any weird locking issues here: |
| 227 | * destroy_conntrack() MUST NOT be called with a write lock | 324 | * destroy_conntrack() MUST NOT be called with a write lock |
| 228 | * to ip_conntrack_lock!!! -HW */ | 325 | * to ip_conntrack_lock!!! -HW */ |
| 229 | proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); | 326 | proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); |
| 230 | if (proto && proto->destroy) | 327 | if (proto && proto->destroy) |
| 231 | proto->destroy(ct); | 328 | proto->destroy(ct); |
| 232 | 329 | ||
| @@ -238,7 +335,7 @@ destroy_conntrack(struct nf_conntrack *nfct) | |||
| 238 | * except TFTP can create an expectation on the first packet, | 335 | * except TFTP can create an expectation on the first packet, |
| 239 | * before connection is in the list, so we need to clean here, | 336 | * before connection is in the list, so we need to clean here, |
| 240 | * too. */ | 337 | * too. */ |
| 241 | remove_expectations(ct); | 338 | ip_ct_remove_expectations(ct); |
| 242 | 339 | ||
| 243 | /* We overload first tuple to link into unconfirmed list. */ | 340 | /* We overload first tuple to link into unconfirmed list. */ |
| 244 | if (!is_confirmed(ct)) { | 341 | if (!is_confirmed(ct)) { |
| @@ -253,8 +350,7 @@ destroy_conntrack(struct nf_conntrack *nfct) | |||
| 253 | ip_conntrack_put(ct->master); | 350 | ip_conntrack_put(ct->master); |
| 254 | 351 | ||
| 255 | DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); | 352 | DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); |
| 256 | kmem_cache_free(ip_conntrack_cachep, ct); | 353 | ip_conntrack_free(ct); |
| 257 | atomic_dec(&ip_conntrack_count); | ||
| 258 | } | 354 | } |
| 259 | 355 | ||
| 260 | static void death_by_timeout(unsigned long ul_conntrack) | 356 | static void death_by_timeout(unsigned long ul_conntrack) |
| @@ -280,7 +376,7 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i, | |||
| 280 | && ip_ct_tuple_equal(tuple, &i->tuple); | 376 | && ip_ct_tuple_equal(tuple, &i->tuple); |
| 281 | } | 377 | } |
| 282 | 378 | ||
| 283 | static struct ip_conntrack_tuple_hash * | 379 | struct ip_conntrack_tuple_hash * |
| 284 | __ip_conntrack_find(const struct ip_conntrack_tuple *tuple, | 380 | __ip_conntrack_find(const struct ip_conntrack_tuple *tuple, |
| 285 | const struct ip_conntrack *ignored_conntrack) | 381 | const struct ip_conntrack *ignored_conntrack) |
| 286 | { | 382 | { |
| @@ -315,6 +411,29 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple, | |||
| 315 | return h; | 411 | return h; |
| 316 | } | 412 | } |
| 317 | 413 | ||
| 414 | static void __ip_conntrack_hash_insert(struct ip_conntrack *ct, | ||
| 415 | unsigned int hash, | ||
| 416 | unsigned int repl_hash) | ||
| 417 | { | ||
| 418 | ct->id = ++ip_conntrack_next_id; | ||
| 419 | list_prepend(&ip_conntrack_hash[hash], | ||
| 420 | &ct->tuplehash[IP_CT_DIR_ORIGINAL].list); | ||
| 421 | list_prepend(&ip_conntrack_hash[repl_hash], | ||
| 422 | &ct->tuplehash[IP_CT_DIR_REPLY].list); | ||
| 423 | } | ||
| 424 | |||
| 425 | void ip_conntrack_hash_insert(struct ip_conntrack *ct) | ||
| 426 | { | ||
| 427 | unsigned int hash, repl_hash; | ||
| 428 | |||
| 429 | hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); | ||
| 430 | repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); | ||
| 431 | |||
| 432 | write_lock_bh(&ip_conntrack_lock); | ||
| 433 | __ip_conntrack_hash_insert(ct, hash, repl_hash); | ||
| 434 | write_unlock_bh(&ip_conntrack_lock); | ||
| 435 | } | ||
| 436 | |||
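This is the usual locked/unlocked split: `__ip_conntrack_hash_insert()` expects the caller to hold `ip_conntrack_lock` for writing (as `__ip_conntrack_confirm()` below does), while `ip_conntrack_hash_insert()` takes the lock itself for out-of-line callers. A sketch of such a caller, assuming both tuples are already filled in and using only functions exported by this patch:

```c
/* Sketch: creating and inserting an entry from outside the packet
 * path (e.g. a netlink create request). Error handling abbreviated. */
static int create_and_insert(struct ip_conntrack_tuple *orig,
			     struct ip_conntrack_tuple *repl)
{
	struct ip_conntrack *ct;

	ct = ip_conntrack_alloc(orig, repl);
	if (ct == NULL || IS_ERR(ct))
		return -ENOMEM;

	ip_conntrack_hash_insert(ct);	/* takes ip_conntrack_lock */
	return 0;
}
```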
| 318 | /* Confirm a connection given skb; places it in hash table */ | 437 | /* Confirm a connection given skb; places it in hash table */ |
| 319 | int | 438 | int |
| 320 | __ip_conntrack_confirm(struct sk_buff **pskb) | 439 | __ip_conntrack_confirm(struct sk_buff **pskb) |
| @@ -361,10 +480,7 @@ __ip_conntrack_confirm(struct sk_buff **pskb) | |||
| 361 | /* Remove from unconfirmed list */ | 480 | /* Remove from unconfirmed list */ |
| 362 | list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); | 481 | list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); |
| 363 | 482 | ||
| 364 | list_prepend(&ip_conntrack_hash[hash], | 483 | __ip_conntrack_hash_insert(ct, hash, repl_hash); |
| 365 | &ct->tuplehash[IP_CT_DIR_ORIGINAL]); | ||
| 366 | list_prepend(&ip_conntrack_hash[repl_hash], | ||
| 367 | &ct->tuplehash[IP_CT_DIR_REPLY]); | ||
| 368 | /* Timer relative to confirmation time, not original | 484 | /* Timer relative to confirmation time, not original |
| 369 | setting time, otherwise we'd get timer wrap in | 485 | setting time, otherwise we'd get timer wrap in |
| 370 | weird delay cases. */ | 486 | weird delay cases. */ |
| @@ -374,6 +490,16 @@ __ip_conntrack_confirm(struct sk_buff **pskb) | |||
| 374 | set_bit(IPS_CONFIRMED_BIT, &ct->status); | 490 | set_bit(IPS_CONFIRMED_BIT, &ct->status); |
| 375 | CONNTRACK_STAT_INC(insert); | 491 | CONNTRACK_STAT_INC(insert); |
| 376 | write_unlock_bh(&ip_conntrack_lock); | 492 | write_unlock_bh(&ip_conntrack_lock); |
| 493 | if (ct->helper) | ||
| 494 | ip_conntrack_event_cache(IPCT_HELPER, *pskb); | ||
| 495 | #ifdef CONFIG_IP_NF_NAT_NEEDED | ||
| 496 | if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) || | ||
| 497 | test_bit(IPS_DST_NAT_DONE_BIT, &ct->status)) | ||
| 498 | ip_conntrack_event_cache(IPCT_NATINFO, *pskb); | ||
| 499 | #endif | ||
| 500 | ip_conntrack_event_cache(master_ct(ct) ? | ||
| 501 | IPCT_RELATED : IPCT_NEW, *pskb); | ||
| 502 | |||
| 377 | return NF_ACCEPT; | 503 | return NF_ACCEPT; |
| 378 | } | 504 | } |
| 379 | 505 | ||
| @@ -438,34 +564,84 @@ static inline int helper_cmp(const struct ip_conntrack_helper *i, | |||
| 438 | return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask); | 564 | return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask); |
| 439 | } | 565 | } |
| 440 | 566 | ||
| 441 | static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple) | 567 | static struct ip_conntrack_helper * |
| 568 | __ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple) | ||
| 442 | { | 569 | { |
| 443 | return LIST_FIND(&helpers, helper_cmp, | 570 | return LIST_FIND(&helpers, helper_cmp, |
| 444 | struct ip_conntrack_helper *, | 571 | struct ip_conntrack_helper *, |
| 445 | tuple); | 572 | tuple); |
| 446 | } | 573 | } |
| 447 | 574 | ||
| 448 | /* Allocate a new conntrack: we return -ENOMEM if classification | 575 | struct ip_conntrack_helper * |
| 449 | failed due to stress. Otherwise it really is unclassifiable. */ | 576 | ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple) |
| 450 | static struct ip_conntrack_tuple_hash * | 577 | { |
| 451 | init_conntrack(const struct ip_conntrack_tuple *tuple, | 578 | struct ip_conntrack_helper *helper; |
| 452 | struct ip_conntrack_protocol *protocol, | 579 | |
| 453 | struct sk_buff *skb) | 580 | /* need ip_conntrack_lock to ensure that the helper exists until
| 581 | * try_module_get() is called */ | ||
| 582 | read_lock_bh(&ip_conntrack_lock); | ||
| 583 | |||
| 584 | helper = __ip_conntrack_helper_find(tuple); | ||
| 585 | if (helper) { | ||
| 586 | /* need to increase module usage count to ensure the helper will | ||
| 587 | * not go away while the caller is e.g. busy putting a | ||
| 588 | * conntrack in the hash that uses the helper */ | ||
| 589 | if (!try_module_get(helper->me)) | ||
| 590 | helper = NULL; | ||
| 591 | } | ||
| 592 | |||
| 593 | read_unlock_bh(&ip_conntrack_lock); | ||
| 594 | |||
| 595 | return helper; | ||
| 596 | } | ||
| 597 | |||
| 598 | void ip_conntrack_helper_put(struct ip_conntrack_helper *helper) | ||
| 599 | { | ||
| 600 | module_put(helper->me); | ||
| 601 | } | ||
| 602 | |||
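Pairing rule for the new helper getter: a successful `ip_conntrack_helper_find_get()` pins the helper module via `try_module_get()`, and `ip_conntrack_helper_put()` is the only correct way to release it. A minimal sketch (the debug function is hypothetical):

```c
/* Hypothetical debug helper: report which helper would claim a reply
 * tuple. The module reference taken by find_get() is dropped before
 * returning, so the helper module may unload afterwards. */
static void report_helper(const struct ip_conntrack_tuple *reply)
{
	struct ip_conntrack_helper *h;

	h = ip_conntrack_helper_find_get(reply);
	if (!h)
		return;

	printk(KERN_DEBUG "tuple matches helper %s\n", h->name);
	ip_conntrack_helper_put(h);
}
```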
| 603 | struct ip_conntrack_protocol * | ||
| 604 | __ip_conntrack_proto_find(u_int8_t protocol) | ||
| 605 | { | ||
| 606 | return ip_ct_protos[protocol]; | ||
| 607 | } | ||
| 608 | |||
| 609 | /* this is guaranteed to always return a valid protocol helper, since | ||
| 610 | * it falls back to generic_protocol */ | ||
| 611 | struct ip_conntrack_protocol * | ||
| 612 | ip_conntrack_proto_find_get(u_int8_t protocol) | ||
| 613 | { | ||
| 614 | struct ip_conntrack_protocol *p; | ||
| 615 | |||
| 616 | preempt_disable(); | ||
| 617 | p = __ip_conntrack_proto_find(protocol); | ||
| 618 | if (p) { | ||
| 619 | if (!try_module_get(p->me)) | ||
| 620 | p = &ip_conntrack_generic_protocol; | ||
| 621 | } | ||
| 622 | preempt_enable(); | ||
| 623 | |||
| 624 | return p; | ||
| 625 | } | ||
| 626 | |||
| 627 | void ip_conntrack_proto_put(struct ip_conntrack_protocol *p) | ||
| 628 | { | ||
| 629 | module_put(p->me); | ||
| 630 | } | ||
| 631 | |||
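The protocol getter follows the same get/put discipline, with the twist documented above: it falls back to `ip_conntrack_generic_protocol` when `try_module_get()` fails, so callers in this patch treat the result as always usable. A minimal sketch:

```c
/* Sketch: ask whether a protocol tracker supports the ctnetlink
 * attribute conversion added by this patch. The reference must be
 * released with ip_conntrack_proto_put() in all cases. */
static int proto_supports_nfattr(u_int8_t protonum)
{
	struct ip_conntrack_protocol *proto;
	int ret;

	proto = ip_conntrack_proto_find_get(protonum);
	ret = (proto->tuple_to_nfattr != NULL);
	ip_conntrack_proto_put(proto);

	return ret;
}
```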
| 632 | struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig, | ||
| 633 | struct ip_conntrack_tuple *repl) | ||
| 454 | { | 634 | { |
| 455 | struct ip_conntrack *conntrack; | 635 | struct ip_conntrack *conntrack; |
| 456 | struct ip_conntrack_tuple repl_tuple; | ||
| 457 | size_t hash; | ||
| 458 | struct ip_conntrack_expect *exp; | ||
| 459 | 636 | ||
| 460 | if (!ip_conntrack_hash_rnd_initted) { | 637 | if (!ip_conntrack_hash_rnd_initted) { |
| 461 | get_random_bytes(&ip_conntrack_hash_rnd, 4); | 638 | get_random_bytes(&ip_conntrack_hash_rnd, 4); |
| 462 | ip_conntrack_hash_rnd_initted = 1; | 639 | ip_conntrack_hash_rnd_initted = 1; |
| 463 | } | 640 | } |
| 464 | 641 | ||
| 465 | hash = hash_conntrack(tuple); | ||
| 466 | |||
| 467 | if (ip_conntrack_max | 642 | if (ip_conntrack_max |
| 468 | && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { | 643 | && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { |
| 644 | unsigned int hash = hash_conntrack(orig); | ||
| 469 | /* Try dropping from this hash chain. */ | 645 | /* Try dropping from this hash chain. */ |
| 470 | if (!early_drop(&ip_conntrack_hash[hash])) { | 646 | if (!early_drop(&ip_conntrack_hash[hash])) { |
| 471 | if (net_ratelimit()) | 647 | if (net_ratelimit()) |
| @@ -476,11 +652,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, | |||
| 476 | } | 652 | } |
| 477 | } | 653 | } |
| 478 | 654 | ||
| 479 | if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { | ||
| 480 | DEBUGP("Can't invert tuple.\n"); | ||
| 481 | return NULL; | ||
| 482 | } | ||
| 483 | |||
| 484 | conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); | 655 | conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); |
| 485 | if (!conntrack) { | 656 | if (!conntrack) { |
| 486 | DEBUGP("Can't allocate conntrack.\n"); | 657 | DEBUGP("Can't allocate conntrack.\n"); |
| @@ -490,17 +661,50 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, | |||
| 490 | memset(conntrack, 0, sizeof(*conntrack)); | 661 | memset(conntrack, 0, sizeof(*conntrack)); |
| 491 | atomic_set(&conntrack->ct_general.use, 1); | 662 | atomic_set(&conntrack->ct_general.use, 1); |
| 492 | conntrack->ct_general.destroy = destroy_conntrack; | 663 | conntrack->ct_general.destroy = destroy_conntrack; |
| 493 | conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple; | 664 | conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; |
| 494 | conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple; | 665 | conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; |
| 495 | if (!protocol->new(conntrack, skb)) { | ||
| 496 | kmem_cache_free(ip_conntrack_cachep, conntrack); | ||
| 497 | return NULL; | ||
| 498 | } | ||
| 499 | /* Don't set timer yet: wait for confirmation */ | 666 | /* Don't set timer yet: wait for confirmation */ |
| 500 | init_timer(&conntrack->timeout); | 667 | init_timer(&conntrack->timeout); |
| 501 | conntrack->timeout.data = (unsigned long)conntrack; | 668 | conntrack->timeout.data = (unsigned long)conntrack; |
| 502 | conntrack->timeout.function = death_by_timeout; | 669 | conntrack->timeout.function = death_by_timeout; |
| 503 | 670 | ||
| 671 | atomic_inc(&ip_conntrack_count); | ||
| 672 | |||
| 673 | return conntrack; | ||
| 674 | } | ||
| 675 | |||
| 676 | void | ||
| 677 | ip_conntrack_free(struct ip_conntrack *conntrack) | ||
| 678 | { | ||
| 679 | atomic_dec(&ip_conntrack_count); | ||
| 680 | kmem_cache_free(ip_conntrack_cachep, conntrack); | ||
| 681 | } | ||
| 682 | |||
| 683 | /* Allocate a new conntrack: we return -ENOMEM if classification | ||
| 684 | * failed due to stress. Otherwise it really is unclassifiable */ | ||
| 685 | static struct ip_conntrack_tuple_hash * | ||
| 686 | init_conntrack(struct ip_conntrack_tuple *tuple, | ||
| 687 | struct ip_conntrack_protocol *protocol, | ||
| 688 | struct sk_buff *skb) | ||
| 689 | { | ||
| 690 | struct ip_conntrack *conntrack; | ||
| 691 | struct ip_conntrack_tuple repl_tuple; | ||
| 692 | struct ip_conntrack_expect *exp; | ||
| 693 | |||
| 694 | if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { | ||
| 695 | DEBUGP("Can't invert tuple.\n"); | ||
| 696 | return NULL; | ||
| 697 | } | ||
| 698 | |||
| 699 | conntrack = ip_conntrack_alloc(tuple, &repl_tuple); | ||
| 700 | if (conntrack == NULL || IS_ERR(conntrack)) | ||
| 701 | return (struct ip_conntrack_tuple_hash *)conntrack; | ||
| 702 | |||
| 703 | if (!protocol->new(conntrack, skb)) { | ||
| 704 | ip_conntrack_free(conntrack); | ||
| 705 | return NULL; | ||
| 706 | } | ||
| 707 | |||
| 504 | write_lock_bh(&ip_conntrack_lock); | 708 | write_lock_bh(&ip_conntrack_lock); |
| 505 | exp = find_expectation(tuple); | 709 | exp = find_expectation(tuple); |
| 506 | 710 | ||
| @@ -521,7 +725,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, | |||
| 521 | nf_conntrack_get(&conntrack->master->ct_general); | 725 | nf_conntrack_get(&conntrack->master->ct_general); |
| 522 | CONNTRACK_STAT_INC(expect_new); | 726 | CONNTRACK_STAT_INC(expect_new); |
| 523 | } else { | 727 | } else { |
| 524 | conntrack->helper = ip_ct_find_helper(&repl_tuple); | 728 | conntrack->helper = __ip_conntrack_helper_find(&repl_tuple); |
| 525 | 729 | ||
| 526 | CONNTRACK_STAT_INC(new); | 730 | CONNTRACK_STAT_INC(new); |
| 527 | } | 731 | } |
| @@ -529,7 +733,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, | |||
| 529 | /* Overload tuple linked list to put us in unconfirmed list. */ | 733 | /* Overload tuple linked list to put us in unconfirmed list. */ |
| 530 | list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); | 734 | list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); |
| 531 | 735 | ||
| 532 | atomic_inc(&ip_conntrack_count); | ||
| 533 | write_unlock_bh(&ip_conntrack_lock); | 736 | write_unlock_bh(&ip_conntrack_lock); |
| 534 | 737 | ||
| 535 | if (exp) { | 738 | if (exp) { |
| @@ -607,7 +810,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum, | |||
| 607 | struct ip_conntrack *ct; | 810 | struct ip_conntrack *ct; |
| 608 | enum ip_conntrack_info ctinfo; | 811 | enum ip_conntrack_info ctinfo; |
| 609 | struct ip_conntrack_protocol *proto; | 812 | struct ip_conntrack_protocol *proto; |
| 610 | int set_reply; | 813 | int set_reply = 0; |
| 611 | int ret; | 814 | int ret; |
| 612 | 815 | ||
| 613 | /* Previously seen (loopback or untracked)? Ignore. */ | 816 | /* Previously seen (loopback or untracked)? Ignore. */ |
| @@ -625,9 +828,6 @@ unsigned int ip_conntrack_in(unsigned int hooknum, | |||
| 625 | return NF_DROP; | 828 | return NF_DROP; |
| 626 | } | 829 | } |
| 627 | 830 | ||
| 628 | /* FIXME: Do this right please. --RR */ | ||
| 629 | (*pskb)->nfcache |= NFC_UNKNOWN; | ||
| 630 | |||
| 631 | /* Doesn't cover locally-generated broadcast, so not worth it. */ | 831 | /* Doesn't cover locally-generated broadcast, so not worth it. */ |
| 632 | #if 0 | 832 | #if 0 |
| 633 | /* Ignore broadcast: no `connection'. */ | 833 | /* Ignore broadcast: no `connection'. */ |
| @@ -643,7 +843,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum, | |||
| 643 | } | 843 | } |
| 644 | #endif | 844 | #endif |
| 645 | 845 | ||
| 646 | proto = ip_ct_find_proto((*pskb)->nh.iph->protocol); | 846 | proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol); |
| 647 | 847 | ||
| 648 | /* It may be a special packet, error, unclean... | 848 |
| 649 | * inverse of the return code tells the netfilter | 849 |
| @@ -679,8 +879,8 @@ unsigned int ip_conntrack_in(unsigned int hooknum, | |||
| 679 | return -ret; | 879 | return -ret; |
| 680 | } | 880 | } |
| 681 | 881 | ||
| 682 | if (set_reply) | 882 | if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) |
| 683 | set_bit(IPS_SEEN_REPLY_BIT, &ct->status); | 883 | ip_conntrack_event_cache(IPCT_STATUS, *pskb); |
| 684 | 884 | ||
| 685 | return ret; | 885 | return ret; |
| 686 | } | 886 | } |
| @@ -689,7 +889,7 @@ int invert_tuplepr(struct ip_conntrack_tuple *inverse, | |||
| 689 | const struct ip_conntrack_tuple *orig) | 889 | const struct ip_conntrack_tuple *orig) |
| 690 | { | 890 | { |
| 691 | return ip_ct_invert_tuple(inverse, orig, | 891 | return ip_ct_invert_tuple(inverse, orig, |
| 692 | ip_ct_find_proto(orig->dst.protonum)); | 892 | __ip_conntrack_proto_find(orig->dst.protonum)); |
| 693 | } | 893 | } |
| 694 | 894 | ||
| 695 | /* Would two expected things clash? */ | 895 | /* Would two expected things clash? */ |
| @@ -769,6 +969,8 @@ static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) | |||
| 769 | exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ; | 969 | exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ; |
| 770 | add_timer(&exp->timeout); | 970 | add_timer(&exp->timeout); |
| 771 | 971 | ||
| 972 | exp->id = ++ip_conntrack_expect_next_id; | ||
| 973 | atomic_inc(&exp->use); | ||
| 772 | CONNTRACK_STAT_INC(expect_create); | 974 | CONNTRACK_STAT_INC(expect_create); |
| 773 | } | 975 | } |
| 774 | 976 | ||
| @@ -827,6 +1029,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect) | |||
| 827 | evict_oldest_expect(expect->master); | 1029 | evict_oldest_expect(expect->master); |
| 828 | 1030 | ||
| 829 | ip_conntrack_expect_insert(expect); | 1031 | ip_conntrack_expect_insert(expect); |
| 1032 | ip_conntrack_expect_event(IPEXP_NEW, expect); | ||
| 830 | ret = 0; | 1033 | ret = 0; |
| 831 | out: | 1034 | out: |
| 832 | write_unlock_bh(&ip_conntrack_lock); | 1035 | write_unlock_bh(&ip_conntrack_lock); |
| @@ -847,7 +1050,7 @@ void ip_conntrack_alter_reply(struct ip_conntrack *conntrack, | |||
| 847 | 1050 | ||
| 848 | conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; | 1051 | conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; |
| 849 | if (!conntrack->master && conntrack->expecting == 0) | 1052 | if (!conntrack->master && conntrack->expecting == 0) |
| 850 | conntrack->helper = ip_ct_find_helper(newreply); | 1053 | conntrack->helper = __ip_conntrack_helper_find(newreply); |
| 851 | write_unlock_bh(&ip_conntrack_lock); | 1054 | write_unlock_bh(&ip_conntrack_lock); |
| 852 | } | 1055 | } |
| 853 | 1056 | ||
| @@ -861,11 +1064,26 @@ int ip_conntrack_helper_register(struct ip_conntrack_helper *me) | |||
| 861 | return 0; | 1064 | return 0; |
| 862 | } | 1065 | } |
| 863 | 1066 | ||
| 1067 | struct ip_conntrack_helper * | ||
| 1068 | __ip_conntrack_helper_find_byname(const char *name) | ||
| 1069 | { | ||
| 1070 | struct ip_conntrack_helper *h; | ||
| 1071 | |||
| 1072 | list_for_each_entry(h, &helpers, list) { | ||
| 1073 | if (!strcmp(h->name, name)) | ||
| 1074 | return h; | ||
| 1075 | } | ||
| 1076 | |||
| 1077 | return NULL; | ||
| 1078 | } | ||
| 1079 | |||
| 864 | static inline int unhelp(struct ip_conntrack_tuple_hash *i, | 1080 | static inline int unhelp(struct ip_conntrack_tuple_hash *i, |
| 865 | const struct ip_conntrack_helper *me) | 1081 | const struct ip_conntrack_helper *me) |
| 866 | { | 1082 | { |
| 867 | if (tuplehash_to_ctrack(i)->helper == me) | 1083 | if (tuplehash_to_ctrack(i)->helper == me) { |
| 1084 | ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i)); | ||
| 868 | tuplehash_to_ctrack(i)->helper = NULL; | 1085 | tuplehash_to_ctrack(i)->helper = NULL; |
| 1086 | } | ||
| 869 | return 0; | 1087 | return 0; |
| 870 | } | 1088 | } |
| 871 | 1089 | ||
| @@ -927,12 +1145,46 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct, | |||
| 927 | if (del_timer(&ct->timeout)) { | 1145 | if (del_timer(&ct->timeout)) { |
| 928 | ct->timeout.expires = jiffies + extra_jiffies; | 1146 | ct->timeout.expires = jiffies + extra_jiffies; |
| 929 | add_timer(&ct->timeout); | 1147 | add_timer(&ct->timeout); |
| 1148 | ip_conntrack_event_cache(IPCT_REFRESH, skb); | ||
| 930 | } | 1149 | } |
| 931 | ct_add_counters(ct, ctinfo, skb); | 1150 | ct_add_counters(ct, ctinfo, skb); |
| 932 | write_unlock_bh(&ip_conntrack_lock); | 1151 | write_unlock_bh(&ip_conntrack_lock); |
| 933 | } | 1152 | } |
| 934 | } | 1153 | } |
| 935 | 1154 | ||
| 1155 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
| 1156 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
| 1157 | /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be | ||
| 1158 | * in ip_conntrack_core, since we don't want the protocols to autoload | ||
| 1159 | * or depend on ctnetlink */ | ||
| 1160 | int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb, | ||
| 1161 | const struct ip_conntrack_tuple *tuple) | ||
| 1162 | { | ||
| 1163 | NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t), | ||
| 1164 | &tuple->src.u.tcp.port); | ||
| 1165 | NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t), | ||
| 1166 | &tuple->dst.u.tcp.port); | ||
| 1167 | return 0; | ||
| 1168 | |||
| 1169 | nfattr_failure: | ||
| 1170 | return -1; | ||
| 1171 | } | ||
| 1172 | |||
| 1173 | int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[], | ||
| 1174 | struct ip_conntrack_tuple *t) | ||
| 1175 | { | ||
| 1176 | if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1]) | ||
| 1177 | return -EINVAL; | ||
| 1178 | |||
| 1179 | t->src.u.tcp.port = | ||
| 1180 | *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]); | ||
| 1181 | t->dst.u.tcp.port = | ||
| 1182 | *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]); | ||
| 1183 | |||
| 1184 | return 0; | ||
| 1185 | } | ||
| 1186 | #endif | ||
| 1187 | |||
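Elsewhere in this patch the TCP and UDP trackers point their conversion hooks at these two generic functions; roughly (a sketch with a hypothetical tracker, field names as used by ctnetlink below):

```c
/* Sketch: how a port-based L4 tracker wires up the shared converters.
 * The structure and other callbacks are abbreviated; only the two
 * fields relevant to ctnetlink are shown. */
static struct ip_conntrack_protocol hypothetical_port_proto = {
	.proto		 = IPPROTO_UDP,
	.name		 = "example",
	/* ...pkt_to_tuple, packet, new, etc. elided... */
	.tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
	.nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
};
```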
| 936 | /* Returns new sk_buff, or NULL */ | 1188 | /* Returns new sk_buff, or NULL */ |
| 937 | struct sk_buff * | 1189 | struct sk_buff * |
| 938 | ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) | 1190 | ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) |
| @@ -943,10 +1195,8 @@ ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) | |||
| 943 | skb = ip_defrag(skb, user); | 1195 | skb = ip_defrag(skb, user); |
| 944 | local_bh_enable(); | 1196 | local_bh_enable(); |
| 945 | 1197 | ||
| 946 | if (skb) { | 1198 | if (skb) |
| 947 | ip_send_check(skb->nh.iph); | 1199 | ip_send_check(skb->nh.iph); |
| 948 | skb->nfcache |= NFC_ALTERED; | ||
| 949 | } | ||
| 950 | return skb; | 1200 | return skb; |
| 951 | } | 1201 | } |
| 952 | 1202 | ||
| @@ -1096,16 +1346,14 @@ static void free_conntrack_hash(void) | |||
| 1096 | * ip_conntrack_htable_size)); | 1346 | * ip_conntrack_htable_size)); |
| 1097 | } | 1347 | } |
| 1098 | 1348 | ||
| 1099 | /* Mishearing the voices in his head, our hero wonders how he's | 1349 | void ip_conntrack_flush() |
| 1100 | supposed to kill the mall. */ | ||
| 1101 | void ip_conntrack_cleanup(void) | ||
| 1102 | { | 1350 | { |
| 1103 | ip_ct_attach = NULL; | ||
| 1104 | /* This makes sure all current packets have passed through | 1351 | /* This makes sure all current packets have passed through |
| 1105 | netfilter framework. Roll on, two-stage module | 1352 | netfilter framework. Roll on, two-stage module |
| 1106 | delete... */ | 1353 | delete... */ |
| 1107 | synchronize_net(); | 1354 | synchronize_net(); |
| 1108 | 1355 | ||
| 1356 | ip_ct_event_cache_flush(); | ||
| 1109 | i_see_dead_people: | 1357 | i_see_dead_people: |
| 1110 | ip_ct_iterate_cleanup(kill_all, NULL); | 1358 | ip_ct_iterate_cleanup(kill_all, NULL); |
| 1111 | if (atomic_read(&ip_conntrack_count) != 0) { | 1359 | if (atomic_read(&ip_conntrack_count) != 0) { |
| @@ -1115,7 +1363,14 @@ void ip_conntrack_cleanup(void) | |||
| 1115 | /* wait until all references to ip_conntrack_untracked are dropped */ | 1363 | /* wait until all references to ip_conntrack_untracked are dropped */ |
| 1116 | while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) | 1364 | while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) |
| 1117 | schedule(); | 1365 | schedule(); |
| 1366 | } | ||
| 1118 | 1367 | ||
| 1368 | /* Mishearing the voices in his head, our hero wonders how he's | ||
| 1369 | supposed to kill the mall. */ | ||
| 1370 | void ip_conntrack_cleanup(void) | ||
| 1371 | { | ||
| 1372 | ip_ct_attach = NULL; | ||
| 1373 | ip_conntrack_flush(); | ||
| 1119 | kmem_cache_destroy(ip_conntrack_cachep); | 1374 | kmem_cache_destroy(ip_conntrack_cachep); |
| 1120 | kmem_cache_destroy(ip_conntrack_expect_cachep); | 1375 | kmem_cache_destroy(ip_conntrack_expect_cachep); |
| 1121 | free_conntrack_hash(); | 1376 | free_conntrack_hash(); |
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index 7a3b773be3f9..3a2627db1729 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c | |||
| @@ -25,8 +25,7 @@ MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); | |||
| 25 | MODULE_DESCRIPTION("ftp connection tracking helper"); | 25 | MODULE_DESCRIPTION("ftp connection tracking helper"); |
| 26 | 26 | ||
| 27 | /* This is slow, but it's simple. --RR */ | 27 | /* This is slow, but it's simple. --RR */ |
| 28 | static char ftp_buffer[65536]; | 28 | static char *ftp_buffer; |
| 29 | |||
| 30 | static DEFINE_SPINLOCK(ip_ftp_lock); | 29 | static DEFINE_SPINLOCK(ip_ftp_lock); |
| 31 | 30 | ||
| 32 | #define MAX_PORTS 8 | 31 | #define MAX_PORTS 8 |
| @@ -262,7 +261,8 @@ static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir) | |||
| 262 | } | 261 | } |
| 263 | 262 | ||
| 264 | /* We don't update if it's older than what we have. */ | 263 | /* We don't update if it's older than what we have. */ |
| 265 | static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir) | 264 | static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir, |
| 265 | struct sk_buff *skb) | ||
| 266 | { | 266 | { |
| 267 | unsigned int i, oldest = NUM_SEQ_TO_REMEMBER; | 267 | unsigned int i, oldest = NUM_SEQ_TO_REMEMBER; |
| 268 | 268 | ||
| @@ -276,10 +276,13 @@ static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir) | |||
| 276 | oldest = i; | 276 | oldest = i; |
| 277 | } | 277 | } |
| 278 | 278 | ||
| 279 | if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) | 279 | if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) { |
| 280 | info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; | 280 | info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; |
| 281 | else if (oldest != NUM_SEQ_TO_REMEMBER) | 281 | ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); |
| 282 | } else if (oldest != NUM_SEQ_TO_REMEMBER) { | ||
| 282 | info->seq_aft_nl[dir][oldest] = nl_seq; | 283 | info->seq_aft_nl[dir][oldest] = nl_seq; |
| 284 | ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); | ||
| 285 | } | ||
| 283 | } | 286 | } |
| 284 | 287 | ||
| 285 | static int help(struct sk_buff **pskb, | 288 | static int help(struct sk_buff **pskb, |
| @@ -439,7 +442,7 @@ out_update_nl: | |||
| 439 | /* Now if this ends in \n, update ftp info. Seq may have been | 442 | /* Now if this ends in \n, update ftp info. Seq may have been |
| 440 | * adjusted by NAT code. */ | 443 | * adjusted by NAT code. */ |
| 441 | if (ends_in_nl) | 444 | if (ends_in_nl) |
| 442 | update_nl_seq(seq, ct_ftp_info, dir); | 445 | update_nl_seq(seq, ct_ftp_info, dir, *pskb); |
| 443 | out: | 446 | out: |
| 444 | spin_unlock_bh(&ip_ftp_lock); | 447 | spin_unlock_bh(&ip_ftp_lock); |
| 445 | return ret; | 448 | return ret; |
| @@ -457,6 +460,8 @@ static void fini(void) | |||
| 457 | ports[i]); | 460 | ports[i]); |
| 458 | ip_conntrack_helper_unregister(&ftp[i]); | 461 | ip_conntrack_helper_unregister(&ftp[i]); |
| 459 | } | 462 | } |
| 463 | |||
| 464 | kfree(ftp_buffer); | ||
| 460 | } | 465 | } |
| 461 | 466 | ||
| 462 | static int __init init(void) | 467 | static int __init init(void) |
| @@ -464,6 +469,10 @@ static int __init init(void) | |||
| 464 | int i, ret; | 469 | int i, ret; |
| 465 | char *tmpname; | 470 | char *tmpname; |
| 466 | 471 | ||
| 472 | ftp_buffer = kmalloc(65536, GFP_KERNEL); | ||
| 473 | if (!ftp_buffer) | ||
| 474 | return -ENOMEM; | ||
| 475 | |||
| 467 | if (ports_c == 0) | 476 | if (ports_c == 0) |
| 468 | ports[ports_c++] = FTP_PORT; | 477 | ports[ports_c++] = FTP_PORT; |
| 469 | 478 | ||
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c index 4a28f297d502..25438eec21a1 100644 --- a/net/ipv4/netfilter/ip_conntrack_irc.c +++ b/net/ipv4/netfilter/ip_conntrack_irc.c | |||
| @@ -39,7 +39,7 @@ static int ports_c; | |||
| 39 | static int max_dcc_channels = 8; | 39 | static int max_dcc_channels = 8; |
| 40 | static unsigned int dcc_timeout = 300; | 40 | static unsigned int dcc_timeout = 300; |
| 41 | /* This is slow, but it's simple. --RR */ | 41 | /* This is slow, but it's simple. --RR */ |
| 42 | static char irc_buffer[65536]; | 42 | static char *irc_buffer; |
| 43 | static DEFINE_SPINLOCK(irc_buffer_lock); | 43 | static DEFINE_SPINLOCK(irc_buffer_lock); |
| 44 | 44 | ||
| 45 | unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, | 45 | unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, |
| @@ -257,6 +257,10 @@ static int __init init(void) | |||
| 257 | printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n"); | 257 | printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n"); |
| 258 | return -EBUSY; | 258 | return -EBUSY; |
| 259 | } | 259 | } |
| 260 | |||
| 261 | irc_buffer = kmalloc(65536, GFP_KERNEL); | ||
| 262 | if (!irc_buffer) | ||
| 263 | return -ENOMEM; | ||
| 260 | 264 | ||
| 261 | /* If no port given, default to standard irc port */ | 265 | /* If no port given, default to standard irc port */ |
| 262 | if (ports_c == 0) | 266 | if (ports_c == 0) |
| @@ -304,6 +308,7 @@ static void fini(void) | |||
| 304 | ports[i]); | 308 | ports[i]); |
| 305 | ip_conntrack_helper_unregister(&irc_helpers[i]); | 309 | ip_conntrack_helper_unregister(&irc_helpers[i]); |
| 306 | } | 310 | } |
| 311 | kfree(irc_buffer); | ||
| 307 | } | 312 | } |
| 308 | 313 | ||
| 309 | module_init(init); | 314 | module_init(init); |
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c new file mode 100644 index 000000000000..a4e9278db4ed --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c | |||
| @@ -0,0 +1,1579 @@ | |||
| 1 | /* Connection tracking via netlink socket. Allows for user space | ||
| 2 | * protocol helpers and general troublemaking from userspace. | ||
| 3 | * | ||
| 4 | * (C) 2001 by Jay Schulist <jschlst@samba.org> | ||
| 5 | * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org> | ||
| 6 | * (C) 2003 by Patrick Mchardy <kaber@trash.net> | ||
| 7 | * (C) 2005 by Pablo Neira Ayuso <pablo@eurodev.net> | ||
| 8 | * | ||
| 9 | * I've reworked this stuff to use attributes instead of conntrack | ||
| 10 | * structures. 5.44 am. I need more tea. --pablo 05/07/11. | ||
| 11 | * | ||
| 12 | * Initial connection tracking via netlink development funded and | ||
| 13 | * generally made possible by Network Robots, Inc. (www.networkrobots.com) | ||
| 14 | * | ||
| 15 | * Further development of this code funded by Astaro AG (http://www.astaro.com) | ||
| 16 | * | ||
| 17 | * This software may be used and distributed according to the terms | ||
| 18 | * of the GNU General Public License, incorporated herein by reference. | ||
| 19 | */ | ||
| 20 | |||
| 21 | #include <linux/init.h> | ||
| 22 | #include <linux/module.h> | ||
| 23 | #include <linux/kernel.h> | ||
| 24 | #include <linux/types.h> | ||
| 25 | #include <linux/timer.h> | ||
| 26 | #include <linux/skbuff.h> | ||
| 27 | #include <linux/errno.h> | ||
| 28 | #include <linux/netlink.h> | ||
| 29 | #include <linux/spinlock.h> | ||
| 30 | #include <linux/notifier.h> | ||
| 31 | #include <linux/rtnetlink.h> | ||
| 32 | |||
| 33 | #include <linux/netfilter.h> | ||
| 34 | #include <linux/netfilter_ipv4.h> | ||
| 35 | #include <linux/netfilter_ipv4/ip_tables.h> | ||
| 36 | #include <linux/netfilter_ipv4/ip_conntrack.h> | ||
| 37 | #include <linux/netfilter_ipv4/ip_conntrack_core.h> | ||
| 38 | #include <linux/netfilter_ipv4/ip_conntrack_helper.h> | ||
| 39 | #include <linux/netfilter_ipv4/ip_conntrack_protocol.h> | ||
| 40 | #include <linux/netfilter_ipv4/ip_nat_protocol.h> | ||
| 41 | |||
| 42 | #include <linux/netfilter/nfnetlink.h> | ||
| 43 | #include <linux/netfilter/nfnetlink_conntrack.h> | ||
| 44 | |||
| 45 | MODULE_LICENSE("GPL"); | ||
| 46 | |||
| 47 | static char __initdata version[] = "0.90"; | ||
| 48 | |||
| 49 | #if 0 | ||
| 50 | #define DEBUGP printk | ||
| 51 | #else | ||
| 52 | #define DEBUGP(format, args...) | ||
| 53 | #endif | ||
| 54 | |||
| 55 | |||
| 56 | static inline int | ||
| 57 | ctnetlink_dump_tuples_proto(struct sk_buff *skb, | ||
| 58 | const struct ip_conntrack_tuple *tuple) | ||
| 59 | { | ||
| 60 | struct ip_conntrack_protocol *proto; | ||
| 61 | |||
| 62 | NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum); | ||
| 63 | |||
| 64 | proto = ip_conntrack_proto_find_get(tuple->dst.protonum); | ||
| 65 | if (proto && proto->tuple_to_nfattr) | ||
| 66 | return proto->tuple_to_nfattr(skb, tuple); | ||
| 67 | |||
| 68 | return 0; | ||
| 69 | |||
| 70 | nfattr_failure: | ||
| 71 | return -1; | ||
| 72 | } | ||
| 73 | |||
| 74 | static inline int | ||
| 75 | ctnetlink_dump_tuples(struct sk_buff *skb, | ||
| 76 | const struct ip_conntrack_tuple *tuple) | ||
| 77 | { | ||
| 78 | struct nfattr *nest_parms; | ||
| 79 | |||
| 80 | nest_parms = NFA_NEST(skb, CTA_TUPLE_IP); | ||
| 81 | NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), &tuple->src.ip); | ||
| 82 | NFA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t), &tuple->dst.ip); | ||
| 83 | NFA_NEST_END(skb, nest_parms); | ||
| 84 | |||
| 85 | nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO); | ||
| 86 | ctnetlink_dump_tuples_proto(skb, tuple); | ||
| 87 | NFA_NEST_END(skb, nest_parms); | ||
| 88 | |||
| 89 | return 0; | ||
| 90 | |||
| 91 | nfattr_failure: | ||
| 92 | return -1; | ||
| 93 | } | ||
| 94 | |||
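The nesting built above is the wire layout that ctnetlink_parse_tuple() later in this file undoes; schematically:

```
CTA_TUPLE_ORIG / CTA_TUPLE_REPLY
├─ CTA_TUPLE_IP
│  ├─ CTA_IP_V4_SRC
│  └─ CTA_IP_V4_DST
└─ CTA_TUPLE_PROTO
   ├─ CTA_PROTO_NUM
   └─ per-protocol attributes (e.g. CTA_PROTO_SRC_PORT / CTA_PROTO_DST_PORT)
```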
| 95 | static inline int | ||
| 96 | ctnetlink_dump_status(struct sk_buff *skb, const struct ip_conntrack *ct) | ||
| 97 | { | ||
| 98 | u_int32_t status = htonl((u_int32_t) ct->status); | ||
| 99 | NFA_PUT(skb, CTA_STATUS, sizeof(status), &status); | ||
| 100 | return 0; | ||
| 101 | |||
| 102 | nfattr_failure: | ||
| 103 | return -1; | ||
| 104 | } | ||
| 105 | |||
| 106 | static inline int | ||
| 107 | ctnetlink_dump_timeout(struct sk_buff *skb, const struct ip_conntrack *ct) | ||
| 108 | { | ||
| 109 | long timeout_l = ct->timeout.expires - jiffies; | ||
| 110 | u_int32_t timeout; | ||
| 111 | |||
| 112 | if (timeout_l < 0) | ||
| 113 | timeout = 0; | ||
| 114 | else | ||
| 115 | timeout = htonl(timeout_l / HZ); | ||
| 116 | |||
| 117 | NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout); | ||
| 118 | return 0; | ||
| 119 | |||
| 120 | nfattr_failure: | ||
| 121 | return -1; | ||
| 122 | } | ||
| 123 | |||
| 124 | static inline int | ||
| 125 | ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct ip_conntrack *ct) | ||
| 126 | { | ||
| 127 | struct ip_conntrack_protocol *proto = ip_conntrack_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); | ||
| 128 | |||
| 129 | struct nfattr *nest_proto; | ||
| 130 | int ret; | ||
| 131 | |||
| 132 | if (!proto || !proto->to_nfattr) | ||
| 133 | return 0; | ||
| 134 | |||
| 135 | nest_proto = NFA_NEST(skb, CTA_PROTOINFO); | ||
| 136 | |||
| 137 | ret = proto->to_nfattr(skb, nest_proto, ct); | ||
| 138 | |||
| 139 | ip_conntrack_proto_put(proto); | ||
| 140 | |||
| 141 | NFA_NEST_END(skb, nest_proto); | ||
| 142 | |||
| 143 | return ret; | ||
| 144 | |||
| 145 | nfattr_failure: | ||
| 146 | return -1; | ||
| 147 | } | ||
| 148 | |||
| 149 | static inline int | ||
| 150 | ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct ip_conntrack *ct) | ||
| 151 | { | ||
| 152 | struct nfattr *nest_helper; | ||
| 153 | |||
| 154 | if (!ct->helper) | ||
| 155 | return 0; | ||
| 156 | |||
| 157 | nest_helper = NFA_NEST(skb, CTA_HELP); | ||
| 158 | NFA_PUT(skb, CTA_HELP_NAME, CTA_HELP_MAXNAMESIZE, ct->helper->name); | ||
| 159 | |||
| 160 | if (ct->helper->to_nfattr) | ||
| 161 | ct->helper->to_nfattr(skb, ct); | ||
| 162 | |||
| 163 | NFA_NEST_END(skb, nest_helper); | ||
| 164 | |||
| 165 | return 0; | ||
| 166 | |||
| 167 | nfattr_failure: | ||
| 168 | return -1; | ||
| 169 | } | ||
| 170 | |||
| 171 | #ifdef CONFIG_IP_NF_CT_ACCT | ||
| 172 | static inline int | ||
| 173 | ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct, | ||
| 174 | enum ip_conntrack_dir dir) | ||
| 175 | { | ||
| 176 | enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; | ||
| 177 | struct nfattr *nest_count = NFA_NEST(skb, type); | ||
| 178 | u_int64_t tmp; | ||
| 179 | |||
| 180 | tmp = cpu_to_be64(ct->counters[dir].packets); | ||
| 181 | NFA_PUT(skb, CTA_COUNTERS_PACKETS, sizeof(u_int64_t), &tmp); | ||
| 182 | |||
| 183 | tmp = cpu_to_be64(ct->counters[dir].bytes); | ||
| 184 | NFA_PUT(skb, CTA_COUNTERS_BYTES, sizeof(u_int64_t), &tmp); | ||
| 185 | |||
| 186 | NFA_NEST_END(skb, nest_count); | ||
| 187 | |||
| 188 | return 0; | ||
| 189 | |||
| 190 | nfattr_failure: | ||
| 191 | return -1; | ||
| 192 | } | ||
| 193 | #else | ||
| 194 | #define ctnetlink_dump_counters(a, b, c) (0) | ||
| 195 | #endif | ||
| 196 | |||
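The `(0)` stubs are the standard trick for keeping callers free of #ifdefs: when the config option is off, the call sites in ctnetlink_fill_info() below reduce to constant comparisons the compiler drops. Illustratively:

```c
/* With CONFIG_IP_NF_CT_ACCT unset, this line in ctnetlink_fill_info()
 *
 *     ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
 *
 * preprocesses to
 *
 *     (0) < 0 ||
 *
 * which is always false and costs nothing at runtime. */
```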
| 197 | #ifdef CONFIG_IP_NF_CONNTRACK_MARK | ||
| 198 | static inline int | ||
| 199 | ctnetlink_dump_mark(struct sk_buff *skb, const struct ip_conntrack *ct) | ||
| 200 | { | ||
| 201 | u_int32_t mark = htonl(ct->mark); | ||
| 202 | |||
| 203 | NFA_PUT(skb, CTA_MARK, sizeof(u_int32_t), &mark); | ||
| 204 | return 0; | ||
| 205 | |||
| 206 | nfattr_failure: | ||
| 207 | return -1; | ||
| 208 | } | ||
| 209 | #else | ||
| 210 | #define ctnetlink_dump_mark(a, b) (0) | ||
| 211 | #endif | ||
| 212 | |||
| 213 | static inline int | ||
| 214 | ctnetlink_dump_id(struct sk_buff *skb, const struct ip_conntrack *ct) | ||
| 215 | { | ||
| 216 | u_int32_t id = htonl(ct->id); | ||
| 217 | NFA_PUT(skb, CTA_ID, sizeof(u_int32_t), &id); | ||
| 218 | return 0; | ||
| 219 | |||
| 220 | nfattr_failure: | ||
| 221 | return -1; | ||
| 222 | } | ||
| 223 | |||
| 224 | static inline int | ||
| 225 | ctnetlink_dump_use(struct sk_buff *skb, const struct ip_conntrack *ct) | ||
| 226 | { | ||
| 227 | unsigned int use = htonl(atomic_read(&ct->ct_general.use)); | ||
| 228 | |||
| 229 | NFA_PUT(skb, CTA_USE, sizeof(u_int32_t), &use); | ||
| 230 | return 0; | ||
| 231 | |||
| 232 | nfattr_failure: | ||
| 233 | return -1; | ||
| 234 | } | ||
| 235 | |||
| 236 | #define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple) | ||
| 237 | |||
| 238 | static int | ||
| 239 | ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, | ||
| 240 | int event, int nowait, | ||
| 241 | const struct ip_conntrack *ct) | ||
| 242 | { | ||
| 243 | struct nlmsghdr *nlh; | ||
| 244 | struct nfgenmsg *nfmsg; | ||
| 245 | struct nfattr *nest_parms; | ||
| 246 | unsigned char *b; | ||
| 247 | |||
| 248 | b = skb->tail; | ||
| 249 | |||
| 250 | event |= NFNL_SUBSYS_CTNETLINK << 8; | ||
| 251 | nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); | ||
| 252 | nfmsg = NLMSG_DATA(nlh); | ||
| 253 | |||
| 254 | nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0; | ||
| 255 | nfmsg->nfgen_family = AF_INET; | ||
| 256 | nfmsg->version = NFNETLINK_V0; | ||
| 257 | nfmsg->res_id = 0; | ||
| 258 | |||
| 259 | nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); | ||
| 260 | if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) | ||
| 261 | goto nfattr_failure; | ||
| 262 | NFA_NEST_END(skb, nest_parms); | ||
| 263 | |||
| 264 | nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); | ||
| 265 | if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) | ||
| 266 | goto nfattr_failure; | ||
| 267 | NFA_NEST_END(skb, nest_parms); | ||
| 268 | |||
| 269 | if (ctnetlink_dump_status(skb, ct) < 0 || | ||
| 270 | ctnetlink_dump_timeout(skb, ct) < 0 || | ||
| 271 | ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || | ||
| 272 | ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 || | ||
| 273 | ctnetlink_dump_protoinfo(skb, ct) < 0 || | ||
| 274 | ctnetlink_dump_helpinfo(skb, ct) < 0 || | ||
| 275 | ctnetlink_dump_mark(skb, ct) < 0 || | ||
| 276 | ctnetlink_dump_id(skb, ct) < 0 || | ||
| 277 | ctnetlink_dump_use(skb, ct) < 0) | ||
| 278 | goto nfattr_failure; | ||
| 279 | |||
| 280 | nlh->nlmsg_len = skb->tail - b; | ||
| 281 | return skb->len; | ||
| 282 | |||
| 283 | nlmsg_failure: | ||
| 284 | nfattr_failure: | ||
| 285 | skb_trim(skb, b - skb->data); | ||
| 286 | return -1; | ||
| 287 | } | ||
| 288 | |||
| 289 | #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS | ||
| 290 | static int ctnetlink_conntrack_event(struct notifier_block *this, | ||
| 291 | unsigned long events, void *ptr) | ||
| 292 | { | ||
| 293 | struct nlmsghdr *nlh; | ||
| 294 | struct nfgenmsg *nfmsg; | ||
| 295 | struct nfattr *nest_parms; | ||
| 296 | struct ip_conntrack *ct = (struct ip_conntrack *)ptr; | ||
| 297 | struct sk_buff *skb; | ||
| 298 | unsigned int type; | ||
| 299 | unsigned char *b; | ||
| 300 | unsigned int flags = 0, group; | ||
| 301 | |||
| 302 | /* ignore our fake conntrack entry */ | ||
| 303 | if (ct == &ip_conntrack_untracked) | ||
| 304 | return NOTIFY_DONE; | ||
| 305 | |||
| 306 | if (events & IPCT_DESTROY) { | ||
| 307 | type = IPCTNL_MSG_CT_DELETE; | ||
| 308 | group = NFNLGRP_CONNTRACK_DESTROY; | ||
| 309 | goto alloc_skb; | ||
| 310 | } | ||
| 311 | if (events & (IPCT_NEW | IPCT_RELATED)) { | ||
| 312 | type = IPCTNL_MSG_CT_NEW; | ||
| 313 | flags = NLM_F_CREATE|NLM_F_EXCL; | ||
| 314 | /* dump everything */ | ||
| 315 | events = ~0UL; | ||
| 316 | group = NFNLGRP_CONNTRACK_NEW; | ||
| 317 | goto alloc_skb; | ||
| 318 | } | ||
| 319 | if (events & (IPCT_STATUS | | ||
| 320 | IPCT_PROTOINFO | | ||
| 321 | IPCT_HELPER | | ||
| 322 | IPCT_HELPINFO | | ||
| 323 | IPCT_NATINFO)) { | ||
| 324 | type = IPCTNL_MSG_CT_NEW; | ||
| 325 | group = NFNLGRP_CONNTRACK_UPDATE; | ||
| 326 | goto alloc_skb; | ||
| 327 | } | ||
| 328 | |||
| 329 | return NOTIFY_DONE; | ||
| 330 | |||
| 331 | alloc_skb: | ||
| 332 | /* FIXME: Check whether there are any listeners first, to avoid hurting performance */ | ||
| 333 | |||
| 334 | skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); | ||
| 335 | if (!skb) | ||
| 336 | return NOTIFY_DONE; | ||
| 337 | |||
| 338 | b = skb->tail; | ||
| 339 | |||
| 340 | type |= NFNL_SUBSYS_CTNETLINK << 8; | ||
| 341 | nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); | ||
| 342 | nfmsg = NLMSG_DATA(nlh); | ||
| 343 | |||
| 344 | nlh->nlmsg_flags = flags; | ||
| 345 | nfmsg->nfgen_family = AF_INET; | ||
| 346 | nfmsg->version = NFNETLINK_V0; | ||
| 347 | nfmsg->res_id = 0; | ||
| 348 | |||
| 349 | nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); | ||
| 350 | if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) | ||
| 351 | goto nfattr_failure; | ||
| 352 | NFA_NEST_END(skb, nest_parms); | ||
| 353 | |||
| 354 | nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); | ||
| 355 | if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) | ||
| 356 | goto nfattr_failure; | ||
| 357 | NFA_NEST_END(skb, nest_parms); | ||
| 358 | |||
| 359 | /* NAT stuff is now a status flag */ | ||
| 360 | if ((events & IPCT_STATUS || events & IPCT_NATINFO) | ||
| 361 | && ctnetlink_dump_status(skb, ct) < 0) | ||
| 362 | goto nfattr_failure; | ||
| 363 | if (events & IPCT_REFRESH | ||
| 364 | && ctnetlink_dump_timeout(skb, ct) < 0) | ||
| 365 | goto nfattr_failure; | ||
| 366 | if (events & IPCT_PROTOINFO | ||
| 367 | && ctnetlink_dump_protoinfo(skb, ct) < 0) | ||
| 368 | goto nfattr_failure; | ||
| 369 | if (events & IPCT_HELPINFO | ||
| 370 | && ctnetlink_dump_helpinfo(skb, ct) < 0) | ||
| 371 | goto nfattr_failure; | ||
| 372 | |||
| 373 | if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || | ||
| 374 | ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0) | ||
| 375 | goto nfattr_failure; | ||
| 376 | |||
| 377 | nlh->nlmsg_len = skb->tail - b; | ||
| 378 | nfnetlink_send(skb, 0, group, 0); | ||
| 379 | return NOTIFY_DONE; | ||
| 380 | |||
| 381 | nlmsg_failure: | ||
| 382 | nfattr_failure: | ||
| 383 | kfree_skb(skb); | ||
| 384 | return NOTIFY_DONE; | ||
| 385 | } | ||
| 386 | #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ | ||
| 387 | |||
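The handler above has the standard notifier_block signature; the module's init path (outside this excerpt) is what hooks it into the conntrack event chain. A hedged sketch of that registration, assuming ip_conntrack_register_notifier() is the interface added alongside the event cache:

```c
/* Sketch: wiring the event handler into the conntrack notifier chain.
 * ip_conntrack_register_notifier() is assumed to be the registration
 * interface provided by the event-cache part of this patch. */
static struct notifier_block event_nb = {
	.notifier_call = ctnetlink_conntrack_event,
};

static int __init register_events(void)
{
	return ip_conntrack_register_notifier(&event_nb);
}
```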
| 388 | static int ctnetlink_done(struct netlink_callback *cb) | ||
| 389 | { | ||
| 390 | DEBUGP("entered %s\n", __FUNCTION__); | ||
| 391 | return 0; | ||
| 392 | } | ||
| 393 | |||
| 394 | static int | ||
| 395 | ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) | ||
| 396 | { | ||
| 397 | struct ip_conntrack *ct = NULL; | ||
| 398 | struct ip_conntrack_tuple_hash *h; | ||
| 399 | struct list_head *i; | ||
| 400 | u_int32_t *id = (u_int32_t *) &cb->args[1]; | ||
| 401 | |||
| 402 | DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, | ||
| 403 | cb->args[0], *id); | ||
| 404 | |||
| 405 | read_lock_bh(&ip_conntrack_lock); | ||
| 406 | for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { | ||
| 407 | list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { | ||
| 408 | h = (struct ip_conntrack_tuple_hash *) i; | ||
| 409 | if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) | ||
| 410 | continue; | ||
| 411 | ct = tuplehash_to_ctrack(h); | ||
| 412 | if (ct->id <= *id) | ||
| 413 | continue; | ||
| 414 | if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, | ||
| 415 | cb->nlh->nlmsg_seq, | ||
| 416 | IPCTNL_MSG_CT_NEW, | ||
| 417 | 1, ct) < 0) | ||
| 418 | goto out; | ||
| 419 | *id = ct->id; | ||
| 420 | } | ||
| 421 | } | ||
| 422 | out: | ||
| 423 | read_unlock_bh(&ip_conntrack_lock); | ||
| 424 | |||
| 425 | DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); | ||
| 426 | |||
| 427 | return skb->len; | ||
| 428 | } | ||
| 429 | |||
| 430 | #ifdef CONFIG_IP_NF_CT_ACCT | ||
| 431 | static int | ||
| 432 | ctnetlink_dump_table_w(struct sk_buff *skb, struct netlink_callback *cb) | ||
| 433 | { | ||
| 434 | struct ip_conntrack *ct = NULL; | ||
| 435 | struct ip_conntrack_tuple_hash *h; | ||
| 436 | struct list_head *i; | ||
| 437 | u_int32_t *id = (u_int32_t *) &cb->args[1]; | ||
| 438 | |||
| 439 | DEBUGP("entered %s, last bucket=%u id=%u\n", __FUNCTION__, | ||
| 440 | cb->args[0], *id); | ||
| 441 | |||
| 442 | write_lock_bh(&ip_conntrack_lock); | ||
| 443 | for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { | ||
| 444 | list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { | ||
| 445 | h = (struct ip_conntrack_tuple_hash *) i; | ||
| 446 | if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) | ||
| 447 | continue; | ||
| 448 | ct = tuplehash_to_ctrack(h); | ||
| 449 | if (ct->id <= *id) | ||
| 450 | continue; | ||
| 451 | if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, | ||
| 452 | cb->nlh->nlmsg_seq, | ||
| 453 | IPCTNL_MSG_CT_NEW, | ||
| 454 | 1, ct) < 0) | ||
| 455 | goto out; | ||
| 456 | *id = ct->id; | ||
| 457 | |||
| 458 | memset(&ct->counters, 0, sizeof(ct->counters)); | ||
| 459 | } | ||
| 460 | } | ||
| 461 | out: | ||
| 462 | write_unlock_bh(&ip_conntrack_lock); | ||
| 463 | |||
| 464 | DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); | ||
| 465 | |||
| 466 | return skb->len; | ||
| 467 | } | ||
| 468 | #endif | ||
| 469 | |||
| 470 | static const int cta_min_ip[CTA_IP_MAX] = { | ||
| 471 | [CTA_IP_V4_SRC-1] = sizeof(u_int32_t), | ||
| 472 | [CTA_IP_V4_DST-1] = sizeof(u_int32_t), | ||
| 473 | }; | ||
| 474 | |||
| 475 | static inline int | ||
| 476 | ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) | ||
| 477 | { | ||
| 478 | struct nfattr *tb[CTA_IP_MAX]; | ||
| 479 | |||
| 480 | DEBUGP("entered %s\n", __FUNCTION__); | ||
| 481 | |||
| 482 | |||
| 483 | if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0) | ||
| 484 | goto nfattr_failure; | ||
| 485 | |||
| 486 | if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) | ||
| 487 | return -EINVAL; | ||
| 488 | |||
| 489 | if (!tb[CTA_IP_V4_SRC-1]) | ||
| 490 | return -EINVAL; | ||
| 491 | tuple->src.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_SRC-1]); | ||
| 492 | |||
| 493 | if (!tb[CTA_IP_V4_DST-1]) | ||
| 494 | return -EINVAL; | ||
| 495 | tuple->dst.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_DST-1]); | ||
| 496 | |||
| 497 | DEBUGP("leaving\n"); | ||
| 498 | |||
| 499 | return 0; | ||
| 500 | |||
| 501 | nfattr_failure: | ||
| 502 | return -1; | ||
| 503 | } | ||
| 504 | |||
| 505 | static const int cta_min_proto[CTA_PROTO_MAX] = { | ||
| 506 | [CTA_PROTO_NUM-1] = sizeof(u_int16_t), | ||
| 507 | [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t), | ||
| 508 | [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t), | ||
| 509 | [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t), | ||
| 510 | [CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t), | ||
| 511 | [CTA_PROTO_ICMP_ID-1] = sizeof(u_int16_t), | ||
| 512 | }; | ||
| 513 | |||
| 514 | static inline int | ||
| 515 | ctnetlink_parse_tuple_proto(struct nfattr *attr, | ||
| 516 | struct ip_conntrack_tuple *tuple) | ||
| 517 | { | ||
| 518 | struct nfattr *tb[CTA_PROTO_MAX]; | ||
| 519 | struct ip_conntrack_protocol *proto; | ||
| 520 | int ret = 0; | ||
| 521 | |||
| 522 | DEBUGP("entered %s\n", __FUNCTION__); | ||
| 523 | |||
| 524 | if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0) | ||
| 525 | goto nfattr_failure; | ||
| 526 | |||
| 527 | if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) | ||
| 528 | return -EINVAL; | ||
| 529 | |||
| 530 | if (!tb[CTA_PROTO_NUM-1]) | ||
| 531 | return -EINVAL; | ||
| 532 | tuple->dst.protonum = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); | ||
| 533 | |||
| 534 | proto = ip_conntrack_proto_find_get(tuple->dst.protonum); | ||
| 535 | |||
| 536 | if (likely(proto && proto->nfattr_to_tuple)) { | ||
| 537 | ret = proto->nfattr_to_tuple(tb, tuple); | ||
| 538 | ip_conntrack_proto_put(proto); | ||
| 539 | } | ||
| 540 | |||
| 541 | return ret; | ||
| 542 | |||
| 543 | nfattr_failure: | ||
| 544 | return -1; | ||
| 545 | } | ||
| 546 | |||
| 547 | static inline int | ||
| 548 | ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, | ||
| 549 | enum ctattr_tuple type) | ||
| 550 | { | ||
| 551 | struct nfattr *tb[CTA_TUPLE_MAX]; | ||
| 552 | int err; | ||
| 553 | |||
| 554 | DEBUGP("entered %s\n", __FUNCTION__); | ||
| 555 | |||
| 556 | memset(tuple, 0, sizeof(*tuple)); | ||
| 557 | |||
| 558 | if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0) | ||
| 559 | goto nfattr_failure; | ||
| 560 | |||
| 561 | if (!tb[CTA_TUPLE_IP-1]) | ||
| 562 | return -EINVAL; | ||
| 563 | |||
| 564 | err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple); | ||
| 565 | if (err < 0) | ||
| 566 | return err; | ||
| 567 | |||
| 568 | if (!tb[CTA_TUPLE_PROTO-1]) | ||
| 569 | return -EINVAL; | ||
| 570 | |||
| 571 | err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple); | ||
| 572 | if (err < 0) | ||
| 573 | return err; | ||
| 574 | |||
| 575 | /* orig and expect tuples get DIR_ORIGINAL */ | ||
| 576 | if (type == CTA_TUPLE_REPLY) | ||
| 577 | tuple->dst.dir = IP_CT_DIR_REPLY; | ||
| 578 | else | ||
| 579 | tuple->dst.dir = IP_CT_DIR_ORIGINAL; | ||
| 580 | |||
| 581 | DUMP_TUPLE(tuple); | ||
| 582 | |||
| 583 | DEBUGP("leaving\n"); | ||
| 584 | |||
| 585 | return 0; | ||
| 586 | |||
| 587 | nfattr_failure: | ||
| 588 | return -1; | ||
| 589 | } | ||
| 590 | |||
| 591 | #ifdef CONFIG_IP_NF_NAT_NEEDED | ||
| 592 | static const int cta_min_protonat[CTA_PROTONAT_MAX] = { | ||
| 593 | [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t), | ||
| 594 | [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t), | ||
| 595 | }; | ||
| 596 | |||
| 597 | static int ctnetlink_parse_nat_proto(struct nfattr *attr, | ||
| 598 | const struct ip_conntrack *ct, | ||
| 599 | struct ip_nat_range *range) | ||
| 600 | { | ||
| 601 | struct nfattr *tb[CTA_PROTONAT_MAX]; | ||
| 602 | struct ip_nat_protocol *npt; | ||
| 603 | |||
| 604 | DEBUGP("entered %s\n", __FUNCTION__); | ||
| 605 | |||
| 606 | if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0) | ||
| 607 | goto nfattr_failure; | ||
| 608 | |||
| 609 | if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat)) | ||
| 610 | goto nfattr_failure; | ||
| 611 | |||
| 612 | npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); | ||
| 613 | if (!npt) | ||
| 614 | return 0; | ||
| 615 | |||
| 616 | if (!npt->nfattr_to_range) { | ||
| 617 | ip_nat_proto_put(npt); | ||
| 618 | return 0; | ||
| 619 | } | ||
| 620 | |||
| 621 | /* nfattr_to_range returns 1 if it parsed, 0 if not, neg. on error */ | ||
| 622 | if (npt->nfattr_to_range(tb, range) > 0) | ||
| 623 | range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED; | ||
| 624 | |||
| 625 | ip_nat_proto_put(npt); | ||
| 626 | |||
| 627 | DEBUGP("leaving\n"); | ||
| 628 | return 0; | ||
| 629 | |||
| 630 | nfattr_failure: | ||
| 631 | return -1; | ||
| 632 | } | ||
| 633 | |||
| 634 | static inline int | ||
| 635 | ctnetlink_parse_nat(struct nfattr *cda[], | ||
| 636 | const struct ip_conntrack *ct, struct ip_nat_range *range) | ||
| 637 | { | ||
| 638 | struct nfattr *tb[CTA_NAT_MAX]; | ||
| 639 | int err; | ||
| 640 | |||
| 641 | DEBUGP("entered %s\n", __FUNCTION__); | ||
| 642 | |||
| 643 | memset(range, 0, sizeof(*range)); | ||
| 644 | |||
| 645 | if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0) | ||
| 646 | goto nfattr_failure; | ||
| 647 | |||
| 648 | if (tb[CTA_NAT_MINIP-1]) | ||
| 649 | range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]); | ||
| 650 | |||
| 651 | if (!tb[CTA_NAT_MAXIP-1]) | ||
| 652 | range->max_ip = range->min_ip; | ||
| 653 | else | ||
| 654 | range->max_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MAXIP-1]); | ||
| 655 | |||
| 656 | if (range->min_ip) | ||
| 657 | range->flags |= IP_NAT_RANGE_MAP_IPS; | ||
| 658 | |||
| 659 | if (!tb[CTA_NAT_PROTO-1]) | ||
| 660 | return 0; | ||
| 661 | |||
| 662 | err = ctnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range); | ||
| 663 | if (err < 0) | ||
| 664 | return err; | ||
| 665 | |||
| 666 | DEBUGP("leaving\n"); | ||
| 667 | return 0; | ||
| 668 | |||
| 669 | nfattr_failure: | ||
| 670 | return -1; | ||
| 671 | } | ||
| 672 | #endif | ||
| 673 | |||
| 674 | static inline int | ||
| 675 | ctnetlink_parse_help(struct nfattr *attr, char **helper_name) | ||
| 676 | { | ||
| 677 | struct nfattr *tb[CTA_HELP_MAX]; | ||
| 678 | |||
| 679 | DEBUGP("entered %s\n", __FUNCTION__); | ||
| 680 | |||
| 681 | if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0) | ||
| 682 | goto nfattr_failure; | ||
| 683 | |||
| 684 | if (!tb[CTA_HELP_NAME-1]) | ||
| 685 | return -EINVAL; | ||
| 686 | |||
| 687 | *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]); | ||
| 688 | |||
| 689 | return 0; | ||
| 690 | |||
| 691 | nfattr_failure: | ||
| 692 | return -1; | ||
| 693 | } | ||
| 694 | |||
| 695 | static int | ||
| 696 | ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, | ||
| 697 | struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) | ||
| 698 | { | ||
| 699 | struct ip_conntrack_tuple_hash *h; | ||
| 700 | struct ip_conntrack_tuple tuple; | ||
| 701 | struct ip_conntrack *ct; | ||
| 702 | int err = 0; | ||
| 703 | |||
| 704 | DEBUGP("entered %s\n", __FUNCTION__); | ||
| 705 | |||
| 706 | if (cda[CTA_TUPLE_ORIG-1]) | ||
| 707 | err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); | ||
| 708 | else if (cda[CTA_TUPLE_REPLY-1]) | ||
| 709 | err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); | ||
| 710 | else { | ||
| 711 | /* Flush the whole table */ | ||
| 712 | ip_conntrack_flush(); | ||
| 713 | return 0; | ||
| 714 | } | ||
| 715 | |||
| 716 | if (err < 0) | ||
| 717 | return err; | ||
| 718 | |||
| 719 | h = ip_conntrack_find_get(&tuple, NULL); | ||
| 720 | if (!h) { | ||
| 721 | DEBUGP("tuple not found in conntrack hash\n"); | ||
| 722 | return -ENOENT; | ||
| 723 | } | ||
| 724 | |||
| 725 | ct = tuplehash_to_ctrack(h); | ||
| 726 | |||
| 727 | if (cda[CTA_ID-1]) { | ||
| 728 | u_int32_t id = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_ID-1])); | ||
| 729 | if (ct->id != id) { | ||
| 730 | ip_conntrack_put(ct); | ||
| 731 | return -ENOENT; | ||
| 732 | } | ||
| 733 | } | ||
| 734 | if (del_timer(&ct->timeout)) { | ||
| 735 | ip_conntrack_put(ct); | ||
| 736 | ct->timeout.function((unsigned long)ct); | ||
| 737 | return 0; | ||
| 738 | } | ||
| 739 | ip_conntrack_put(ct); | ||
| 740 | DEBUGP("leaving\n"); | ||
| 741 | |||
| 742 | return 0; | ||
| 743 | } | ||
| 744 | |||
| 745 | static int | ||
| 746 | ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, | ||
| 747 | struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) | ||
| 748 | { | ||
| 749 | struct ip_conntrack_tuple_hash *h; | ||
| 750 | struct ip_conntrack_tuple tuple; | ||
| 751 | struct ip_conntrack *ct; | ||
| 752 | struct sk_buff *skb2 = NULL; | ||
| 753 | int err = 0; | ||
| 754 | |||
| 755 | DEBUGP("entered %s\n", __FUNCTION__); | ||
| 756 | |||
| 757 | if (nlh->nlmsg_flags & NLM_F_DUMP) { | ||
| 758 | struct nfgenmsg *msg = NLMSG_DATA(nlh); | ||
| 759 | u32 rlen; | ||
| 760 | |||
| 761 | if (msg->nfgen_family != AF_INET) | ||
| 762 | return -EAFNOSUPPORT; | ||
| 763 | |||
| 764 | if (NFNL_MSG_TYPE(nlh->nlmsg_type) == | ||
| 765 | IPCTNL_MSG_CT_GET_CTRZERO) { | ||
| 766 | #ifdef CONFIG_IP_NF_CT_ACCT | ||
| 767 | if ((*errp = netlink_dump_start(ctnl, skb, nlh, | ||
| 768 | ctnetlink_dump_table_w, | ||
| 769 | ctnetlink_done)) != 0) | ||
| 770 | return -EINVAL; | ||
| 771 | #else | ||
| 772 | return -EOPNOTSUPP; | ||
| 773 | #endif | ||
| 774 | } else { | ||
| 775 | if ((*errp = netlink_dump_start(ctnl, skb, nlh, | ||
| 776 | ctnetlink_dump_table, | ||
| 777 | ctnetlink_done)) != 0) | ||
| 778 | return -EINVAL; | ||
| 779 | } | ||
| 780 | |||
| 781 | rlen = NLMSG_ALIGN(nlh->nlmsg_len); | ||
| 782 | if (rlen > skb->len) | ||
| 783 | rlen = skb->len; | ||
| 784 | skb_pull(skb, rlen); | ||
| 785 | return 0; | ||
| 786 | } | ||
| 787 | |||
| 788 | if (cda[CTA_TUPLE_ORIG-1]) | ||
| 789 | err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); | ||
| 790 | else if (cda[CTA_TUPLE_REPLY-1]) | ||
| 791 | err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); | ||
| 792 | else | ||
| 793 | return -EINVAL; | ||
| 794 | |||
| 795 | if (err < 0) | ||
| 796 | return err; | ||
| 797 | |||
| 798 | h = ip_conntrack_find_get(&tuple, NULL); | ||
| 799 | if (!h) { | ||
| 800 | DEBUGP("tuple not found in conntrack hash\n"); | ||
| 801 | return -ENOENT; | ||
| 802 | } | ||
| 803 | DEBUGP("tuple found\n"); | ||
| 804 | ct = tuplehash_to_ctrack(h); | ||
| 805 | |||
| 806 | err = -ENOMEM; | ||
| 807 | skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); | ||
| 808 | if (!skb2) { | ||
| 809 | ip_conntrack_put(ct); | ||
| 810 | return -ENOMEM; | ||
| 811 | } | ||
| 812 | NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid; | ||
| 813 | |||
| 814 | err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, | ||
| 815 | IPCTNL_MSG_CT_NEW, 1, ct); | ||
| 816 | ip_conntrack_put(ct); | ||
| 817 | if (err <= 0) | ||
| 818 | goto out; | ||
| 819 | |||
| 820 | err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); | ||
| 821 | if (err < 0) /* netlink_unicast() already freed skb2 */ | ||
| 822 | return err; | ||
| 823 | |||
| 824 | DEBUGP("leaving\n"); | ||
| 825 | return 0; | ||
| 826 | |||
| 827 | out: | ||
| 828 | if (skb2) | ||
| 829 | kfree_skb(skb2); | ||
| 830 | return err; | ||
| 831 | } | ||
| 832 | |||
| 833 | static inline int | ||
| 834 | ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[]) | ||
| 835 | { | ||
| 836 | unsigned long d, status = *(u_int32_t *)NFA_DATA(cda[CTA_STATUS-1]); | ||
| 837 | d = ct->status ^ status; | ||
| 838 | |||
| 839 | if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) | ||
| 840 | /* unchangeable */ | ||
| 841 | return -EINVAL; | ||
| 842 | |||
| 843 | if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) | ||
| 844 | /* SEEN_REPLY bit can only be set */ | ||
| 845 | return -EINVAL; | ||
| 846 | |||
| 847 | |||
| 848 | if (d & IPS_ASSURED && !(status & IPS_ASSURED)) | ||
| 849 | /* ASSURED bit can only be set */ | ||
| 850 | return -EINVAL; | ||
| 851 | |||
| 852 | if (cda[CTA_NAT-1]) { | ||
| 853 | #ifndef CONFIG_IP_NF_NAT_NEEDED | ||
| 854 | return -EINVAL; | ||
| 855 | #else | ||
| 856 | unsigned int hooknum; | ||
| 857 | struct ip_nat_range range; | ||
| 858 | |||
| 859 | if (ctnetlink_parse_nat(cda, ct, &range) < 0) | ||
| 860 | return -EINVAL; | ||
| 861 | |||
| 862 | DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", | ||
| 863 | NIPQUAD(range.min_ip), NIPQUAD(range.max_ip), | ||
| 864 | htons(range.min.all), htons(range.max.all)); | ||
| 865 | |||
| 866 | /* This is tricky but it works. ip_nat_setup_info needs the | ||
| 867 | * hook number as parameter, so let's do the correct | ||
| 868 | * conversion and run away */ | ||
| 869 | if (status & IPS_SRC_NAT_DONE) | ||
| 870 | hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */ | ||
| 871 | else if (status & IPS_DST_NAT_DONE) | ||
| 872 | hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */ | ||
| 873 | else | ||
| 874 | return -EINVAL; /* Missing NAT flags */ | ||
| 875 | |||
| 876 | DEBUGP("NAT status: %lu\n", | ||
| 877 | status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); | ||
| 878 | |||
| 879 | if (ip_nat_initialized(ct, hooknum)) | ||
| 880 | return -EEXIST; | ||
| 881 | ip_nat_setup_info(ct, &range, hooknum); | ||
| 882 | |||
| 883 | DEBUGP("NAT status after setup_info: %lu\n", | ||
| 884 | ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); | ||
| 885 | #endif | ||
| 886 | } | ||
| 887 | |||
| 888 | /* Be careful here, modifying NAT bits can screw up things, | ||
| 889 | * so don't let users modify them directly if they don't pass | ||
| 890 | * ip_nat_range. */ | ||
| 891 | ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK); | ||
| 892 | return 0; | ||
| 893 | } | ||
| 894 | |||
| 895 | |||
| 896 | static inline int | ||
| 897 | ctnetlink_change_helper(struct ip_conntrack *ct, struct nfattr *cda[]) | ||
| 898 | { | ||
| 899 | struct ip_conntrack_helper *helper; | ||
| 900 | char *helpname; | ||
| 901 | int err; | ||
| 902 | |||
| 903 | DEBUGP("entered %s\n", __FUNCTION__); | ||
| 904 | |||
| 905 | /* don't change helper of sibling connections */ | ||
| 906 | if (ct->master) | ||
| 907 | return -EINVAL; | ||
| 908 | |||
| 909 | err = ctnetlink_parse_help(cda[CTA_HELP-1], &helpname); | ||
| 910 | if (err < 0) | ||
| 911 | return err; | ||
| 912 | |||
| 913 | helper = __ip_conntrack_helper_find_byname(helpname); | ||
| 914 | if (!helper) { | ||
| 915 | if (!strcmp(helpname, "")) | ||
| 916 | helper = NULL; | ||
| 917 | else | ||
| 918 | return -EINVAL; | ||
| 919 | } | ||
| 920 | |||
| 921 | if (ct->helper) { | ||
| 922 | if (!helper) { | ||
| 923 | /* we had a helper before ... */ | ||
| 924 | ip_ct_remove_expectations(ct); | ||
| 925 | ct->helper = NULL; | ||
| 926 | } else { | ||
| 927 | /* need to zero data of old helper */ | ||
| 928 | memset(&ct->help, 0, sizeof(ct->help)); | ||
| 929 | } | ||
| 930 | } | ||
| 931 | |||
| 932 | ct->helper = helper; | ||
| 933 | |||
| 934 | return 0; | ||
| 935 | } | ||
| 936 | |||
| 937 | static inline int | ||
| 938 | ctnetlink_change_timeout(struct ip_conntrack *ct, struct nfattr *cda[]) | ||
| 939 | { | ||
| 940 | u_int32_t timeout = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1])); | ||
| 941 | |||
| 942 | if (!del_timer(&ct->timeout)) | ||
| 943 | return -ETIME; | ||
| 944 | |||
| 945 | ct->timeout.expires = jiffies + timeout * HZ; | ||
| 946 | add_timer(&ct->timeout); | ||
| 947 | |||
| 948 | return 0; | ||
| 949 | } | ||
| 950 | |||
| 951 | static int | ||
| 952 | ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[]) | ||
| 953 | { | ||
| 954 | int err; | ||
| 955 | |||
| 956 | DEBUGP("entered %s\n", __FUNCTION__); | ||
| 957 | |||
| 958 | if (cda[CTA_HELP-1]) { | ||
| 959 | err = ctnetlink_change_helper(ct, cda); | ||
| 960 | if (err < 0) | ||
| 961 | return err; | ||
| 962 | } | ||
| 963 | |||
| 964 | if (cda[CTA_TIMEOUT-1]) { | ||
| 965 | err = ctnetlink_change_timeout(ct, cda); | ||
| 966 | if (err < 0) | ||
| 967 | return err; | ||
| 968 | } | ||
| 969 | |||
| 970 | if (cda[CTA_STATUS-1]) { | ||
| 971 | err = ctnetlink_change_status(ct, cda); | ||
| 972 | if (err < 0) | ||
| 973 | return err; | ||
| 974 | } | ||
| 975 | |||
| 976 | DEBUGP("all done\n"); | ||
| 977 | return 0; | ||
| 978 | } | ||
| 979 | |||
| 980 | static int | ||
| 981 | ctnetlink_create_conntrack(struct nfattr *cda[], | ||
| 982 | struct ip_conntrack_tuple *otuple, | ||
| 983 | struct ip_conntrack_tuple *rtuple) | ||
| 984 | { | ||
| 985 | struct ip_conntrack *ct; | ||
| 986 | int err = -EINVAL; | ||
| 987 | |||
| 988 | DEBUGP("entered %s\n", __FUNCTION__); | ||
| 989 | |||
| 990 | ct = ip_conntrack_alloc(otuple, rtuple); | ||
| 991 | if (ct == NULL || IS_ERR(ct)) | ||
| 992 | return -ENOMEM; | ||
| 993 | |||
| 994 | if (!cda[CTA_TIMEOUT-1]) | ||
| 995 | goto err; | ||
| 996 | ct->timeout.expires = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1])); | ||
| 997 | |||
| 998 | ct->timeout.expires = jiffies + ct->timeout.expires * HZ; | ||
| 999 | ct->status |= IPS_CONFIRMED; | ||
| 1000 | |||
| 1001 | err = ctnetlink_change_status(ct, cda); | ||
| 1002 | if (err < 0) | ||
| 1003 | goto err; | ||
| 1004 | |||
| 1005 | ct->helper = ip_conntrack_helper_find_get(rtuple); | ||
| 1006 | |||
| 1007 | add_timer(&ct->timeout); | ||
| 1008 | ip_conntrack_hash_insert(ct); | ||
| 1009 | |||
| 1010 | if (ct->helper) | ||
| 1011 | ip_conntrack_helper_put(ct->helper); | ||
| 1012 | |||
| 1013 | DEBUGP("conntrack with id %u inserted\n", ct->id); | ||
| 1014 | return 0; | ||
| 1015 | |||
| 1016 | err: | ||
| 1017 | ip_conntrack_free(ct); | ||
| 1018 | return err; | ||
| 1019 | } | ||
| 1020 | |||
| 1021 | static int | ||
| 1022 | ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, | ||
| 1023 | struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) | ||
| 1024 | { | ||
| 1025 | struct ip_conntrack_tuple otuple, rtuple; | ||
| 1026 | struct ip_conntrack_tuple_hash *h = NULL; | ||
| 1027 | int err = 0; | ||
| 1028 | |||
| 1029 | DEBUGP("entered %s\n", __FUNCTION__); | ||
| 1030 | |||
| 1031 | if (cda[CTA_TUPLE_ORIG-1]) { | ||
| 1032 | err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG); | ||
| 1033 | if (err < 0) | ||
| 1034 | return err; | ||
| 1035 | } | ||
| 1036 | |||
| 1037 | if (cda[CTA_TUPLE_REPLY-1]) { | ||
| 1038 | err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY); | ||
| 1039 | if (err < 0) | ||
| 1040 | return err; | ||
| 1041 | } | ||
| 1042 | |||
| 1043 | write_lock_bh(&ip_conntrack_lock); | ||
| 1044 | if (cda[CTA_TUPLE_ORIG-1]) | ||
| 1045 | h = __ip_conntrack_find(&otuple, NULL); | ||
| 1046 | else if (cda[CTA_TUPLE_REPLY-1]) | ||
| 1047 | h = __ip_conntrack_find(&rtuple, NULL); | ||
| 1048 | |||
| 1049 | if (h == NULL) { | ||
| 1050 | write_unlock_bh(&ip_conntrack_lock); | ||
| 1051 | DEBUGP("no such conntrack, create new\n"); | ||
| 1052 | err = -ENOENT; | ||
| 1053 | if (nlh->nlmsg_flags & NLM_F_CREATE) | ||
| 1054 | err = ctnetlink_create_conntrack(cda, &otuple, &rtuple); | ||
| 1055 | return err; | ||
| 1056 | } | ||
| 1057 | /* implicit 'else' */ | ||
| 1058 | |||
| 1059 | /* we only allow nat config for new conntracks */ | ||
| 1060 | if (cda[CTA_NAT-1]) { | ||
| 1061 | err = -EINVAL; | ||
| 1062 | goto out_unlock; | ||
| 1063 | } | ||
| 1064 | |||
| 1065 | /* We manipulate the conntrack inside the global conntrack table lock, | ||
| 1066 | * so there's no need to increase the refcount */ | ||
| 1067 | DEBUGP("conntrack found\n"); | ||
| 1068 | err = -EEXIST; | ||
| 1069 | if (!(nlh->nlmsg_flags & NLM_F_EXCL)) | ||
| 1070 | err = ctnetlink_change_conntrack(tuplehash_to_ctrack(h), cda); | ||
| 1071 | |||
| 1072 | out_unlock: | ||
| 1073 | write_unlock_bh(&ip_conntrack_lock); | ||
| 1074 | return err; | ||
| 1075 | } | ||
| 1076 | |||
| 1077 | /*********************************************************************** | ||
| 1078 | * EXPECT | ||
| 1079 | ***********************************************************************/ | ||
| 1080 | |||
| 1081 | static inline int | ||
| 1082 | ctnetlink_exp_dump_tuple(struct sk_buff *skb, | ||
| 1083 | const struct ip_conntrack_tuple *tuple, | ||
| 1084 | enum ctattr_expect type) | ||
| 1085 | { | ||
| 1086 | struct nfattr *nest_parms = NFA_NEST(skb, type); | ||
| 1087 | |||
| 1088 | if (ctnetlink_dump_tuples(skb, tuple) < 0) | ||
| 1089 | goto nfattr_failure; | ||
| 1090 | |||
| 1091 | NFA_NEST_END(skb, nest_parms); | ||
| 1092 | |||
| 1093 | return 0; | ||
| 1094 | |||
| 1095 | nfattr_failure: | ||
| 1096 | return -1; | ||
| 1097 | } | ||
| 1098 | |||
| 1099 | static inline int | ||
| 1100 | ctnetlink_exp_dump_expect(struct sk_buff *skb, | ||
| 1101 | const struct ip_conntrack_expect *exp) | ||
| 1102 | { | ||
| 1103 | struct ip_conntrack *master = exp->master; | ||
| 1104 | u_int32_t timeout = htonl((exp->timeout.expires - jiffies) / HZ); | ||
| 1105 | u_int32_t id = htonl(exp->id); | ||
| 1106 | |||
| 1107 | if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0) | ||
| 1108 | goto nfattr_failure; | ||
| 1109 | if (ctnetlink_exp_dump_tuple(skb, &exp->mask, CTA_EXPECT_MASK) < 0) | ||
| 1110 | goto nfattr_failure; | ||
| 1111 | if (ctnetlink_exp_dump_tuple(skb, | ||
| 1112 | &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple, | ||
| 1113 | CTA_EXPECT_MASTER) < 0) | ||
| 1114 | goto nfattr_failure; | ||
| 1115 | |||
| 1116 | NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout); | ||
| 1117 | NFA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id); | ||
| 1118 | |||
| 1119 | return 0; | ||
| 1120 | |||
| 1121 | nfattr_failure: | ||
| 1122 | return -1; | ||
| 1123 | } | ||
| 1124 | |||
| 1125 | static int | ||
| 1126 | ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq, | ||
| 1127 | int event, | ||
| 1128 | int nowait, | ||
| 1129 | const struct ip_conntrack_expect *exp) | ||
| 1130 | { | ||
| 1131 | struct nlmsghdr *nlh; | ||
| 1132 | struct nfgenmsg *nfmsg; | ||
| 1133 | unsigned char *b; | ||
| 1134 | |||
| 1135 | b = skb->tail; | ||
| 1136 | |||
| 1137 | event |= NFNL_SUBSYS_CTNETLINK_EXP << 8; | ||
| 1138 | nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); | ||
| 1139 | nfmsg = NLMSG_DATA(nlh); | ||
| 1140 | |||
| 1141 | nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0; | ||
| 1142 | nfmsg->nfgen_family = AF_INET; | ||
| 1143 | nfmsg->version = NFNETLINK_V0; | ||
| 1144 | nfmsg->res_id = 0; | ||
| 1145 | |||
| 1146 | if (ctnetlink_exp_dump_expect(skb, exp) < 0) | ||
| 1147 | goto nfattr_failure; | ||
| 1148 | |||
| 1149 | nlh->nlmsg_len = skb->tail - b; | ||
| 1150 | return skb->len; | ||
| 1151 | |||
| 1152 | nlmsg_failure: | ||
| 1153 | nfattr_failure: | ||
| 1154 | skb_trim(skb, b - skb->data); | ||
| 1155 | return -1; | ||
| 1156 | } | ||
| 1157 | |||
| 1158 | #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS | ||
| 1159 | static int ctnetlink_expect_event(struct notifier_block *this, | ||
| 1160 | unsigned long events, void *ptr) | ||
| 1161 | { | ||
| 1162 | struct nlmsghdr *nlh; | ||
| 1163 | struct nfgenmsg *nfmsg; | ||
| 1164 | struct ip_conntrack_expect *exp = (struct ip_conntrack_expect *)ptr; | ||
| 1165 | struct sk_buff *skb; | ||
| 1166 | unsigned int type; | ||
| 1167 | unsigned char *b; | ||
| 1168 | int flags = 0; | ||
| 1169 | u16 proto; | ||
| 1170 | |||
| 1171 | if (events & IPEXP_NEW) { | ||
| 1172 | type = IPCTNL_MSG_EXP_NEW; | ||
| 1173 | flags = NLM_F_CREATE|NLM_F_EXCL; | ||
| 1174 | } else | ||
| 1175 | return NOTIFY_DONE; | ||
| 1176 | |||
| 1177 | skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); | ||
| 1178 | if (!skb) | ||
| 1179 | return NOTIFY_DONE; | ||
| 1180 | |||
| 1181 | b = skb->tail; | ||
| 1182 | |||
| 1183 | type |= NFNL_SUBSYS_CTNETLINK << 8; | ||
| 1184 | nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); | ||
| 1185 | nfmsg = NLMSG_DATA(nlh); | ||
| 1186 | |||
| 1187 | nlh->nlmsg_flags = flags; | ||
| 1188 | nfmsg->nfgen_family = AF_INET; | ||
| 1189 | nfmsg->version = NFNETLINK_V0; | ||
| 1190 | nfmsg->res_id = 0; | ||
| 1191 | |||
| 1192 | if (ctnetlink_exp_dump_expect(skb, exp) < 0) | ||
| 1193 | goto nfattr_failure; | ||
| 1194 | |||
| 1195 | nlh->nlmsg_len = skb->tail - b; | ||
| 1196 | proto = exp->tuple.dst.protonum; | ||
| 1197 | nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0); | ||
| 1198 | return NOTIFY_DONE; | ||
| 1199 | |||
| 1200 | nlmsg_failure: | ||
| 1201 | nfattr_failure: | ||
| 1202 | kfree_skb(skb); | ||
| 1203 | return NOTIFY_DONE; | ||
| 1204 | } | ||
| 1205 | #endif | ||
| 1206 | |||
| 1207 | static int | ||
| 1208 | ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) | ||
| 1209 | { | ||
| 1210 | struct ip_conntrack_expect *exp = NULL; | ||
| 1211 | struct list_head *i; | ||
| 1212 | u_int32_t *id = (u_int32_t *) &cb->args[0]; | ||
| 1213 | |||
| 1214 | DEBUGP("entered %s, last id=%u\n", __FUNCTION__, *id); | ||
| 1215 | |||
| 1216 | read_lock_bh(&ip_conntrack_lock); | ||
| 1217 | list_for_each_prev(i, &ip_conntrack_expect_list) { | ||
| 1218 | exp = (struct ip_conntrack_expect *) i; | ||
| 1219 | if (exp->id <= *id) | ||
| 1220 | continue; | ||
| 1221 | if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid, | ||
| 1222 | cb->nlh->nlmsg_seq, | ||
| 1223 | IPCTNL_MSG_EXP_NEW, | ||
| 1224 | 1, exp) < 0) | ||
| 1225 | goto out; | ||
| 1226 | *id = exp->id; | ||
| 1227 | } | ||
| 1228 | out: | ||
| 1229 | read_unlock_bh(&ip_conntrack_lock); | ||
| 1230 | |||
| 1231 | DEBUGP("leaving, last id=%u\n", *id); | ||
| 1232 | |||
| 1233 | return skb->len; | ||
| 1234 | } | ||
| 1235 | |||
| 1236 | static int | ||
| 1237 | ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, | ||
| 1238 | struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) | ||
| 1239 | { | ||
| 1240 | struct ip_conntrack_tuple tuple; | ||
| 1241 | struct ip_conntrack_expect *exp; | ||
| 1242 | struct sk_buff *skb2; | ||
| 1243 | int err = 0; | ||
| 1244 | |||
| 1245 | DEBUGP("entered %s\n", __FUNCTION__); | ||
| 1246 | |||
| 1247 | if (nlh->nlmsg_flags & NLM_F_DUMP) { | ||
| 1248 | struct nfgenmsg *msg = NLMSG_DATA(nlh); | ||
| 1249 | u32 rlen; | ||
| 1250 | |||
| 1251 | if (msg->nfgen_family != AF_INET) | ||
| 1252 | return -EAFNOSUPPORT; | ||
| 1253 | |||
| 1254 | if ((*errp = netlink_dump_start(ctnl, skb, nlh, | ||
| 1255 | ctnetlink_exp_dump_table, | ||
| 1256 | ctnetlink_done)) != 0) | ||
| 1257 | return -EINVAL; | ||
| 1258 | rlen = NLMSG_ALIGN(nlh->nlmsg_len); | ||
| 1259 | if (rlen > skb->len) | ||
| 1260 | rlen = skb->len; | ||
| 1261 | skb_pull(skb, rlen); | ||
| 1262 | return 0; | ||
| 1263 | } | ||
| 1264 | |||
| 1265 | if (cda[CTA_EXPECT_MASTER-1]) | ||
| 1266 | err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER); | ||
| 1267 | else | ||
| 1268 | return -EINVAL; | ||
| 1269 | |||
| 1270 | if (err < 0) | ||
| 1271 | return err; | ||
| 1272 | |||
| 1273 | exp = ip_conntrack_expect_find_get(&tuple); | ||
| 1274 | if (!exp) | ||
| 1275 | return -ENOENT; | ||
| 1276 | |||
| 1277 | err = -ENOMEM; | ||
| 1278 | skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); | ||
| 1279 | if (!skb2) | ||
| 1280 | goto out; | ||
| 1281 | NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid; | ||
| 1282 | |||
| 1283 | err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid, | ||
| 1284 | nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, | ||
| 1285 | 1, exp); | ||
| 1286 | if (err <= 0) | ||
| 1287 | goto out; | ||
| 1288 | |||
| 1289 | ip_conntrack_expect_put(exp); | ||
| 1290 | |||
| 1291 | err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); | ||
| 1292 | if (err < 0) | ||
| 1293 | goto free; | ||
| 1294 | |||
| 1295 | return err; | ||
| 1296 | |||
| 1297 | out: | ||
| 1298 | ip_conntrack_expect_put(exp); | ||
| 1299 | free: | ||
| 1300 | if (skb2) | ||
| 1301 | kfree_skb(skb2); | ||
| 1302 | return err; | ||
| 1303 | } | ||
| 1304 | |||
| 1305 | static int | ||
| 1306 | ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, | ||
| 1307 | struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) | ||
| 1308 | { | ||
| 1309 | struct ip_conntrack_expect *exp, *tmp; | ||
| 1310 | struct ip_conntrack_tuple tuple; | ||
| 1311 | struct ip_conntrack_helper *h; | ||
| 1312 | int err; | ||
| 1313 | |||
| 1314 | if (cda[CTA_EXPECT_TUPLE-1]) { | ||
| 1315 | /* delete a single expect by tuple */ | ||
| 1316 | err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); | ||
| 1317 | if (err < 0) | ||
| 1318 | return err; | ||
| 1319 | |||
| 1320 | /* bump usage count to 2 */ | ||
| 1321 | exp = ip_conntrack_expect_find_get(&tuple); | ||
| 1322 | if (!exp) | ||
| 1323 | return -ENOENT; | ||
| 1324 | |||
| 1325 | if (cda[CTA_EXPECT_ID-1]) { | ||
| 1326 | u_int32_t id = | ||
| 1327 | *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]); | ||
| 1328 | if (exp->id != ntohl(id)) { | ||
| 1329 | ip_conntrack_expect_put(exp); | ||
| 1330 | return -ENOENT; | ||
| 1331 | } | ||
| 1332 | } | ||
| 1333 | |||
| 1334 | /* after list removal, usage count == 1 */ | ||
| 1335 | ip_conntrack_unexpect_related(exp); | ||
| 1336 | /* have to put what we 'get' above. | ||
| 1337 | * after this line usage count == 0 */ | ||
| 1338 | ip_conntrack_expect_put(exp); | ||
| 1339 | } else if (cda[CTA_EXPECT_HELP_NAME-1]) { | ||
| 1340 | char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]); | ||
| 1341 | |||
| 1342 | /* delete all expectations for this helper */ | ||
| 1343 | write_lock_bh(&ip_conntrack_lock); | ||
| 1344 | h = __ip_conntrack_helper_find_byname(name); | ||
| 1345 | if (!h) { | ||
| 1346 | write_unlock_bh(&ip_conntrack_lock); | ||
| 1347 | return -EINVAL; | ||
| 1348 | } | ||
| 1349 | list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, | ||
| 1350 | list) { | ||
| 1351 | if (exp->master->helper == h | ||
| 1352 | && del_timer(&exp->timeout)) | ||
| 1353 | __ip_ct_expect_unlink_destroy(exp); | ||
| 1354 | } | ||
| 1355 | write_unlock_bh(&ip_conntrack_lock); | ||
| 1356 | } else { | ||
| 1357 | /* This basically means we have to flush everything */ | ||
| 1358 | write_lock_bh(&ip_conntrack_lock); | ||
| 1359 | list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, | ||
| 1360 | list) { | ||
| 1361 | if (del_timer(&exp->timeout)) | ||
| 1362 | __ip_ct_expect_unlink_destroy(exp); | ||
| 1363 | } | ||
| 1364 | write_unlock_bh(&ip_conntrack_lock); | ||
| 1365 | } | ||
| 1366 | |||
| 1367 | return 0; | ||
| 1368 | } | ||
| 1369 | static int | ||
| 1370 | ctnetlink_change_expect(struct ip_conntrack_expect *x, struct nfattr *cda[]) | ||
| 1371 | { | ||
| 1372 | return -EOPNOTSUPP; | ||
| 1373 | } | ||
| 1374 | |||
| 1375 | static int | ||
| 1376 | ctnetlink_create_expect(struct nfattr *cda[]) | ||
| 1377 | { | ||
| 1378 | struct ip_conntrack_tuple tuple, mask, master_tuple; | ||
| 1379 | struct ip_conntrack_tuple_hash *h = NULL; | ||
| 1380 | struct ip_conntrack_expect *exp; | ||
| 1381 | struct ip_conntrack *ct; | ||
| 1382 | int err = 0; | ||
| 1383 | |||
| 1384 | DEBUGP("entered %s\n", __FUNCTION__); | ||
| 1385 | |||
| 1386 | /* caller guarantees that those three CTA_EXPECT_* exist */ | ||
| 1387 | err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); | ||
| 1388 | if (err < 0) | ||
| 1389 | return err; | ||
| 1390 | err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK); | ||
| 1391 | if (err < 0) | ||
| 1392 | return err; | ||
| 1393 | err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER); | ||
| 1394 | if (err < 0) | ||
| 1395 | return err; | ||
| 1396 | |||
| 1397 | /* Look for master conntrack of this expectation */ | ||
| 1398 | h = ip_conntrack_find_get(&master_tuple, NULL); | ||
| 1399 | if (!h) | ||
| 1400 | return -ENOENT; | ||
| 1401 | ct = tuplehash_to_ctrack(h); | ||
| 1402 | |||
| 1403 | if (!ct->helper) { | ||
| 1404 | /* such conntrack hasn't got any helper, abort */ | ||
| 1405 | err = -EINVAL; | ||
| 1406 | goto out; | ||
| 1407 | } | ||
| 1408 | |||
| 1409 | exp = ip_conntrack_expect_alloc(ct); | ||
| 1410 | if (!exp) { | ||
| 1411 | err = -ENOMEM; | ||
| 1412 | goto out; | ||
| 1413 | } | ||
| 1414 | |||
| 1415 | exp->expectfn = NULL; | ||
| 1416 | exp->master = ct; | ||
| 1417 | memcpy(&exp->tuple, &tuple, sizeof(struct ip_conntrack_tuple)); | ||
| 1418 | memcpy(&exp->mask, &mask, sizeof(struct ip_conntrack_tuple)); | ||
| 1419 | |||
| 1420 | err = ip_conntrack_expect_related(exp); | ||
| 1421 | ip_conntrack_expect_put(exp); | ||
| 1422 | |||
| 1423 | out: | ||
| 1424 | ip_conntrack_put(tuplehash_to_ctrack(h)); | ||
| 1425 | return err; | ||
| 1426 | } | ||
| 1427 | |||
| 1428 | static int | ||
| 1429 | ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, | ||
| 1430 | struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) | ||
| 1431 | { | ||
| 1432 | struct ip_conntrack_tuple tuple; | ||
| 1433 | struct ip_conntrack_expect *exp; | ||
| 1434 | int err = 0; | ||
| 1435 | |||
| 1436 | DEBUGP("entered %s\n", __FUNCTION__); | ||
| 1437 | |||
| 1438 | if (!cda[CTA_EXPECT_TUPLE-1] | ||
| 1439 | || !cda[CTA_EXPECT_MASK-1] | ||
| 1440 | || !cda[CTA_EXPECT_MASTER-1]) | ||
| 1441 | return -EINVAL; | ||
| 1442 | |||
| 1443 | err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); | ||
| 1444 | if (err < 0) | ||
| 1445 | return err; | ||
| 1446 | |||
| 1447 | write_lock_bh(&ip_conntrack_lock); | ||
| 1448 | exp = __ip_conntrack_expect_find(&tuple); | ||
| 1449 | |||
| 1450 | if (!exp) { | ||
| 1451 | write_unlock_bh(&ip_conntrack_lock); | ||
| 1452 | err = -ENOENT; | ||
| 1453 | if (nlh->nlmsg_flags & NLM_F_CREATE) | ||
| 1454 | err = ctnetlink_create_expect(cda); | ||
| 1455 | return err; | ||
| 1456 | } | ||
| 1457 | |||
| 1458 | err = -EEXIST; | ||
| 1459 | if (!(nlh->nlmsg_flags & NLM_F_EXCL)) | ||
| 1460 | err = ctnetlink_change_expect(exp, cda); | ||
| 1461 | write_unlock_bh(&ip_conntrack_lock); | ||
| 1462 | |||
| 1463 | DEBUGP("leaving\n"); | ||
| 1464 | |||
| 1465 | return err; | ||
| 1466 | } | ||
| 1467 | |||
| 1468 | #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS | ||
| 1469 | static struct notifier_block ctnl_notifier = { | ||
| 1470 | .notifier_call = ctnetlink_conntrack_event, | ||
| 1471 | }; | ||
| 1472 | |||
| 1473 | static struct notifier_block ctnl_notifier_exp = { | ||
| 1474 | .notifier_call = ctnetlink_expect_event, | ||
| 1475 | }; | ||
| 1476 | #endif | ||
| 1477 | |||
| 1478 | static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { | ||
| 1479 | [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, | ||
| 1480 | .attr_count = CTA_MAX, | ||
| 1481 | .cap_required = CAP_NET_ADMIN }, | ||
| 1482 | [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, | ||
| 1483 | .attr_count = CTA_MAX, | ||
| 1484 | .cap_required = CAP_NET_ADMIN }, | ||
| 1485 | [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, | ||
| 1486 | .attr_count = CTA_MAX, | ||
| 1487 | .cap_required = CAP_NET_ADMIN }, | ||
| 1488 | [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, | ||
| 1489 | .attr_count = CTA_MAX, | ||
| 1490 | .cap_required = CAP_NET_ADMIN }, | ||
| 1491 | }; | ||
| 1492 | |||
| 1493 | static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { | ||
| 1494 | [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, | ||
| 1495 | .attr_count = CTA_EXPECT_MAX, | ||
| 1496 | .cap_required = CAP_NET_ADMIN }, | ||
| 1497 | [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, | ||
| 1498 | .attr_count = CTA_EXPECT_MAX, | ||
| 1499 | .cap_required = CAP_NET_ADMIN }, | ||
| 1500 | [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, | ||
| 1501 | .attr_count = CTA_EXPECT_MAX, | ||
| 1502 | .cap_required = CAP_NET_ADMIN }, | ||
| 1503 | }; | ||
| 1504 | |||
| 1505 | static struct nfnetlink_subsystem ctnl_subsys = { | ||
| 1506 | .name = "conntrack", | ||
| 1507 | .subsys_id = NFNL_SUBSYS_CTNETLINK, | ||
| 1508 | .cb_count = IPCTNL_MSG_MAX, | ||
| 1509 | .cb = ctnl_cb, | ||
| 1510 | }; | ||
| 1511 | |||
| 1512 | static struct nfnetlink_subsystem ctnl_exp_subsys = { | ||
| 1513 | .name = "conntrack_expect", | ||
| 1514 | .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP, | ||
| 1515 | .cb_count = IPCTNL_MSG_EXP_MAX, | ||
| 1516 | .cb = ctnl_exp_cb, | ||
| 1517 | }; | ||
| 1518 | |||
| 1519 | static int __init ctnetlink_init(void) | ||
| 1520 | { | ||
| 1521 | int ret; | ||
| 1522 | |||
| 1523 | printk("ctnetlink v%s: registering with nfnetlink.\n", version); | ||
| 1524 | ret = nfnetlink_subsys_register(&ctnl_subsys); | ||
| 1525 | if (ret < 0) { | ||
| 1526 | printk("ctnetlink_init: cannot register with nfnetlink.\n"); | ||
| 1527 | goto err_out; | ||
| 1528 | } | ||
| 1529 | |||
| 1530 | ret = nfnetlink_subsys_register(&ctnl_exp_subsys); | ||
| 1531 | if (ret < 0) { | ||
| 1532 | printk("ctnetlink_init: cannot register exp with nfnetlink.\n"); | ||
| 1533 | goto err_unreg_subsys; | ||
| 1534 | } | ||
| 1535 | |||
| 1536 | #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS | ||
| 1537 | ret = ip_conntrack_register_notifier(&ctnl_notifier); | ||
| 1538 | if (ret < 0) { | ||
| 1539 | printk("ctnetlink_init: cannot register notifier.\n"); | ||
| 1540 | goto err_unreg_exp_subsys; | ||
| 1541 | } | ||
| 1542 | |||
| 1543 | ret = ip_conntrack_expect_register_notifier(&ctnl_notifier_exp); | ||
| 1544 | if (ret < 0) { | ||
| 1545 | printk("ctnetlink_init: cannot register expect notifier.\n"); | ||
| 1546 | goto err_unreg_notifier; | ||
| 1547 | } | ||
| 1548 | #endif | ||
| 1549 | |||
| 1550 | return 0; | ||
| 1551 | |||
| 1552 | #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS | ||
| 1553 | err_unreg_notifier: | ||
| 1554 | ip_conntrack_unregister_notifier(&ctnl_notifier); | ||
| 1555 | err_unreg_exp_subsys: | ||
| 1556 | nfnetlink_subsys_unregister(&ctnl_exp_subsys); | ||
| 1557 | #endif | ||
| 1558 | err_unreg_subsys: | ||
| 1559 | nfnetlink_subsys_unregister(&ctnl_subsys); | ||
| 1560 | err_out: | ||
| 1561 | return ret; | ||
| 1562 | } | ||
| 1563 | |||
| 1564 | static void __exit ctnetlink_exit(void) | ||
| 1565 | { | ||
| 1566 | printk("ctnetlink: unregistering from nfnetlink.\n"); | ||
| 1567 | |||
| 1568 | #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS | ||
| 1569 | ip_conntrack_expect_unregister_notifier(&ctnl_notifier_exp); | ||
| 1570 | ip_conntrack_unregister_notifier(&ctnl_notifier); | ||
| 1571 | #endif | ||
| 1572 | |||
| 1573 | nfnetlink_subsys_unregister(&ctnl_exp_subsys); | ||
| 1574 | nfnetlink_subsys_unregister(&ctnl_subsys); | ||
| 1575 | return; | ||
| 1576 | } | ||
| 1577 | |||
| 1578 | module_init(ctnetlink_init); | ||
| 1579 | module_exit(ctnetlink_exit); | ||
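
The new file above is driven entirely from userspace over netlink. As a rough illustration of the NLM_F_DUMP path handled by ctnetlink_get_conntrack(), the following userspace sketch (not part of the patch) requests a full IPv4 conntrack dump. It assumes the nfnetlink headers introduced alongside this code (<linux/netfilter/nfnetlink.h>, <linux/netfilter/nfnetlink_conntrack.h>) and the NETLINK_NETFILTER socket protocol; error handling and attribute decoding are deliberately minimal.

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <linux/netlink.h>
    #include <linux/netfilter/nfnetlink.h>
    #include <linux/netfilter/nfnetlink_conntrack.h>

    int main(void)
    {
            struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
            struct {
                    struct nlmsghdr nlh;
                    struct nfgenmsg nfmsg;
            } req;
            char buf[8192];
            ssize_t n;
            int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_NETFILTER);

            if (fd < 0)
                    return 1;

            memset(&req, 0, sizeof(req));
            req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct nfgenmsg));
            /* subsystem id in the upper byte, message type in the lower,
             * the same split ctnetlink_fill_info() uses when replying */
            req.nlh.nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_GET;
            req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
            req.nfmsg.nfgen_family = AF_INET; /* checked by ctnetlink_get_conntrack() */
            req.nfmsg.version = NFNETLINK_V0;

            if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
                       (struct sockaddr *)&kernel, sizeof(kernel)) < 0)
                    return 1;

            /* the dump arrives as a stream of IPCTNL_MSG_CT_NEW messages,
             * terminated by NLMSG_DONE */
            while ((n = recv(fd, buf, sizeof(buf), 0)) > 0) {
                    struct nlmsghdr *nlh = (struct nlmsghdr *)buf;

                    for (; NLMSG_OK(nlh, n); nlh = NLMSG_NEXT(nlh, n)) {
                            if (nlh->nlmsg_type == NLMSG_DONE ||
                                nlh->nlmsg_type == NLMSG_ERROR)
                                    goto out;
                            printf("conntrack entry, %u attribute bytes\n",
                                   (unsigned)(nlh->nlmsg_len -
                                              NLMSG_LENGTH(sizeof(struct nfgenmsg))));
                    }
            }
    out:
            close(fd);
            return 0;
    }
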
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 602c74db3252..838d1d69b36e 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c | |||
| @@ -102,22 +102,24 @@ static int icmp_packet(struct ip_conntrack *ct, | |||
| 102 | ct->timeout.function((unsigned long)ct); | 102 | ct->timeout.function((unsigned long)ct); |
| 103 | } else { | 103 | } else { |
| 104 | atomic_inc(&ct->proto.icmp.count); | 104 | atomic_inc(&ct->proto.icmp.count); |
| 105 | ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); | ||
| 105 | ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout); | 106 | ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout); |
| 106 | } | 107 | } |
| 107 | 108 | ||
| 108 | return NF_ACCEPT; | 109 | return NF_ACCEPT; |
| 109 | } | 110 | } |
| 110 | 111 | ||
| 112 | static u_int8_t valid_new[] = { | ||
| 113 | [ICMP_ECHO] = 1, | ||
| 114 | [ICMP_TIMESTAMP] = 1, | ||
| 115 | [ICMP_INFO_REQUEST] = 1, | ||
| 116 | [ICMP_ADDRESS] = 1 | ||
| 117 | }; | ||
| 118 | |||
| 111 | /* Called when a new connection for this protocol is found. */ | 119 | /* Called when a new connection for this protocol is found. */
| 112 | static int icmp_new(struct ip_conntrack *conntrack, | 120 | static int icmp_new(struct ip_conntrack *conntrack, |
| 113 | const struct sk_buff *skb) | 121 | const struct sk_buff *skb) |
| 114 | { | 122 | { |
| 115 | static u_int8_t valid_new[] | ||
| 116 | = { [ICMP_ECHO] = 1, | ||
| 117 | [ICMP_TIMESTAMP] = 1, | ||
| 118 | [ICMP_INFO_REQUEST] = 1, | ||
| 119 | [ICMP_ADDRESS] = 1 }; | ||
| 120 | |||
| 121 | if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) | 123 | if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) |
| 122 | || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { | 124 | || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { |
| 123 | /* Can't create a new ICMP `conn' with this. */ | 125 | /* Can't create a new ICMP `conn' with this. */ |
| @@ -158,11 +160,12 @@ icmp_error_message(struct sk_buff *skb, | |||
| 158 | return NF_ACCEPT; | 160 | return NF_ACCEPT; |
| 159 | } | 161 | } |
| 160 | 162 | ||
| 161 | innerproto = ip_ct_find_proto(inside->ip.protocol); | 163 | innerproto = ip_conntrack_proto_find_get(inside->ip.protocol); |
| 162 | dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4; | 164 | dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4; |
| 163 | /* Are they talking about one of our connections? */ | 165 | /* Are they talking about one of our connections? */ |
| 164 | if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) { | 166 | if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) { |
| 165 | DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol); | 167 | DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol); |
| 168 | ip_conntrack_proto_put(innerproto); | ||
| 166 | return NF_ACCEPT; | 169 | return NF_ACCEPT; |
| 167 | } | 170 | } |
| 168 | 171 | ||
| @@ -170,8 +173,10 @@ icmp_error_message(struct sk_buff *skb, | |||
| 170 | been preserved inside the ICMP. */ | 173 | been preserved inside the ICMP. */ |
| 171 | if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { | 174 | if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { |
| 172 | DEBUGP("icmp_error_track: Can't invert tuple\n"); | 175 | DEBUGP("icmp_error_track: Can't invert tuple\n"); |
| 176 | ip_conntrack_proto_put(innerproto); | ||
| 173 | return NF_ACCEPT; | 177 | return NF_ACCEPT; |
| 174 | } | 178 | } |
| 179 | ip_conntrack_proto_put(innerproto); | ||
| 175 | 180 | ||
| 176 | *ctinfo = IP_CT_RELATED; | 181 | *ctinfo = IP_CT_RELATED; |
| 177 | 182 | ||
| @@ -212,7 +217,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, | |||
| 212 | icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); | 217 | icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); |
| 213 | if (icmph == NULL) { | 218 | if (icmph == NULL) { |
| 214 | if (LOG_INVALID(IPPROTO_ICMP)) | 219 | if (LOG_INVALID(IPPROTO_ICMP)) |
| 215 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 220 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
| 216 | "ip_ct_icmp: short packet "); | 221 | "ip_ct_icmp: short packet "); |
| 217 | return -NF_ACCEPT; | 222 | return -NF_ACCEPT; |
| 218 | } | 223 | } |
| @@ -226,13 +231,13 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, | |||
| 226 | if (!(u16)csum_fold(skb->csum)) | 231 | if (!(u16)csum_fold(skb->csum)) |
| 227 | break; | 232 | break; |
| 228 | if (LOG_INVALID(IPPROTO_ICMP)) | 233 | if (LOG_INVALID(IPPROTO_ICMP)) |
| 229 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 234 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
| 230 | "ip_ct_icmp: bad HW ICMP checksum "); | 235 | "ip_ct_icmp: bad HW ICMP checksum "); |
| 231 | return -NF_ACCEPT; | 236 | return -NF_ACCEPT; |
| 232 | case CHECKSUM_NONE: | 237 | case CHECKSUM_NONE: |
| 233 | if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { | 238 | if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { |
| 234 | if (LOG_INVALID(IPPROTO_ICMP)) | 239 | if (LOG_INVALID(IPPROTO_ICMP)) |
| 235 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 240 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
| 236 | "ip_ct_icmp: bad ICMP checksum "); | 241 | "ip_ct_icmp: bad ICMP checksum "); |
| 237 | return -NF_ACCEPT; | 242 | return -NF_ACCEPT; |
| 238 | } | 243 | } |
| @@ -249,7 +254,7 @@ checksum_skipped: | |||
| 249 | */ | 254 | */ |
| 250 | if (icmph->type > NR_ICMP_TYPES) { | 255 | if (icmph->type > NR_ICMP_TYPES) { |
| 251 | if (LOG_INVALID(IPPROTO_ICMP)) | 256 | if (LOG_INVALID(IPPROTO_ICMP)) |
| 252 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 257 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
| 253 | "ip_ct_icmp: invalid ICMP type "); | 258 | "ip_ct_icmp: invalid ICMP type "); |
| 254 | return -NF_ACCEPT; | 259 | return -NF_ACCEPT; |
| 255 | } | 260 | } |
| @@ -265,6 +270,47 @@ checksum_skipped: | |||
| 265 | return icmp_error_message(skb, ctinfo, hooknum); | 270 | return icmp_error_message(skb, ctinfo, hooknum); |
| 266 | } | 271 | } |
| 267 | 272 | ||
| 273 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
| 274 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
| 275 | static int icmp_tuple_to_nfattr(struct sk_buff *skb, | ||
| 276 | const struct ip_conntrack_tuple *t) | ||
| 277 | { | ||
| 278 | NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t), | ||
| 279 | &t->src.u.icmp.id); | ||
| 280 | NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t), | ||
| 281 | &t->dst.u.icmp.type); | ||
| 282 | NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t), | ||
| 283 | &t->dst.u.icmp.code); | ||
| 284 | |||
| 285 | if (t->dst.u.icmp.type >= sizeof(valid_new) | ||
| 286 | || !valid_new[t->dst.u.icmp.type]) | ||
| 287 | return -EINVAL; | ||
| 288 | |||
| 289 | return 0; | ||
| 290 | |||
| 291 | nfattr_failure: | ||
| 292 | return -1; | ||
| 293 | } | ||
| 294 | |||
| 295 | static int icmp_nfattr_to_tuple(struct nfattr *tb[], | ||
| 296 | struct ip_conntrack_tuple *tuple) | ||
| 297 | { | ||
| 298 | if (!tb[CTA_PROTO_ICMP_TYPE-1] | ||
| 299 | || !tb[CTA_PROTO_ICMP_CODE-1] | ||
| 300 | || !tb[CTA_PROTO_ICMP_ID-1]) | ||
| 301 | return -1; | ||
| 302 | |||
| 303 | tuple->dst.u.icmp.type = | ||
| 304 | *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]); | ||
| 305 | tuple->dst.u.icmp.code = | ||
| 306 | *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]); | ||
| 307 | tuple->src.u.icmp.id = | ||
| 308 | *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); | ||
| 309 | |||
| 310 | return 0; | ||
| 311 | } | ||
| 312 | #endif | ||
| 313 | |||
| 268 | struct ip_conntrack_protocol ip_conntrack_protocol_icmp = | 314 | struct ip_conntrack_protocol ip_conntrack_protocol_icmp = |
| 269 | { | 315 | { |
| 270 | .proto = IPPROTO_ICMP, | 316 | .proto = IPPROTO_ICMP, |
| @@ -276,4 +322,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_icmp = | |||
| 276 | .packet = icmp_packet, | 322 | .packet = icmp_packet, |
| 277 | .new = icmp_new, | 323 | .new = icmp_new, |
| 278 | .error = icmp_error, | 324 | .error = icmp_error, |
| 325 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
| 326 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
| 327 | .tuple_to_nfattr = icmp_tuple_to_nfattr, | ||
| 328 | .nfattr_to_tuple = icmp_nfattr_to_tuple, | ||
| 329 | #endif | ||
| 279 | }; | 330 | }; |
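
Both converters hinge on the valid_new[] table, which this hunk moves to file scope so that icmp_tuple_to_nfattr() can reuse it. Restated as a predicate (illustrative only, not part of the patch; type constants from <linux/icmp.h>), the rule is that only the request half of a request/reply pair may create a conntrack entry:

    /* Illustrative restatement of the valid_new[] gate above. */
    static int icmp_type_may_create(u_int8_t type)
    {
            switch (type) {
            case ICMP_ECHO:         /* echo request */
            case ICMP_TIMESTAMP:    /* timestamp request */
            case ICMP_INFO_REQUEST: /* information request */
            case ICMP_ADDRESS:      /* address mask request */
                    return 1;
            default:
                    return 0;       /* replies and errors never open a flow */
            }
    }
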
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c index 31d75390bf12..a875f35e576d 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c | |||
| @@ -404,6 +404,8 @@ static int sctp_packet(struct ip_conntrack *conntrack, | |||
| 404 | } | 404 | } |
| 405 | 405 | ||
| 406 | conntrack->proto.sctp.state = newconntrack; | 406 | conntrack->proto.sctp.state = newconntrack; |
| 407 | if (oldsctpstate != newconntrack) | ||
| 408 | ip_conntrack_event_cache(IPCT_PROTOINFO, skb); | ||
| 407 | write_unlock_bh(&sctp_lock); | 409 | write_unlock_bh(&sctp_lock); |
| 408 | } | 410 | } |
| 409 | 411 | ||
| @@ -503,7 +505,12 @@ static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = { | |||
| 503 | .packet = sctp_packet, | 505 | .packet = sctp_packet, |
| 504 | .new = sctp_new, | 506 | .new = sctp_new, |
| 505 | .destroy = NULL, | 507 | .destroy = NULL, |
| 506 | .me = THIS_MODULE | 508 | .me = THIS_MODULE, |
| 509 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
| 510 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
| 511 | .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, | ||
| 512 | .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, | ||
| 513 | #endif | ||
| 507 | }; | 514 | }; |
| 508 | 515 | ||
| 509 | #ifdef CONFIG_SYSCTL | 516 | #ifdef CONFIG_SYSCTL |
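
SCTP defines no converters of its own; like TCP and UDP below, it plugs in the shared port-based helpers. Their definitions live in ip_conntrack_core.c, outside this diff (they are exported near the end of ip_conntrack_standalone.c further down), so the following is only a sketch of their presumed shape, assuming the CTA_PROTO_SRC_PORT/CTA_PROTO_DST_PORT attribute pair:

    /* Sketch, not a copy: approximate shape of the shared port helpers. */
    int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
                                   const struct ip_conntrack_tuple *t)
    {
            NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
                    &t->src.u.tcp.port);
            NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
                    &t->dst.u.tcp.port);
            return 0;

    nfattr_failure:
            return -1;
    }

    int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
                                   struct ip_conntrack_tuple *t)
    {
            if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
                    return -EINVAL;

            t->src.u.tcp.port = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
            t->dst.u.tcp.port = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
            return 0;
    }

The port fields for TCP, UDP and SCTP share one layout in the tuple union, which is why a single pair of helpers can serve all three protocols.
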
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 809dfed766d4..f23ef1f88c46 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c | |||
| @@ -336,6 +336,23 @@ static int tcp_print_conntrack(struct seq_file *s, | |||
| 336 | return seq_printf(s, "%s ", tcp_conntrack_names[state]); | 336 | return seq_printf(s, "%s ", tcp_conntrack_names[state]); |
| 337 | } | 337 | } |
| 338 | 338 | ||
| 339 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
| 340 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
| 341 | static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa, | ||
| 342 | const struct ip_conntrack *ct) | ||
| 343 | { | ||
| 344 | read_lock_bh(&tcp_lock); | ||
| 345 | NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t), | ||
| 346 | &ct->proto.tcp.state); | ||
| 347 | read_unlock_bh(&tcp_lock); | ||
| 348 | |||
| 349 | return 0; | ||
| 350 | |||
| 351 | nfattr_failure: | ||
| 352 | return -1; | ||
| 353 | } | ||
| 354 | #endif | ||
| 355 | |||
| 339 | static unsigned int get_conntrack_index(const struct tcphdr *tcph) | 356 | static unsigned int get_conntrack_index(const struct tcphdr *tcph) |
| 340 | { | 357 | { |
| 341 | if (tcph->rst) return TCP_RST_SET; | 358 | if (tcph->rst) return TCP_RST_SET; |
| @@ -699,7 +716,7 @@ static int tcp_in_window(struct ip_ct_tcp *state, | |||
| 699 | res = 1; | 716 | res = 1; |
| 700 | } else { | 717 | } else { |
| 701 | if (LOG_INVALID(IPPROTO_TCP)) | 718 | if (LOG_INVALID(IPPROTO_TCP)) |
| 702 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 719 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
| 703 | "ip_ct_tcp: %s ", | 720 | "ip_ct_tcp: %s ", |
| 704 | before(seq, sender->td_maxend + 1) ? | 721 | before(seq, sender->td_maxend + 1) ? |
| 705 | after(end, sender->td_end - receiver->td_maxwin - 1) ? | 722 | after(end, sender->td_end - receiver->td_maxwin - 1) ? |
| @@ -798,7 +815,7 @@ static int tcp_error(struct sk_buff *skb, | |||
| 798 | sizeof(_tcph), &_tcph); | 815 | sizeof(_tcph), &_tcph); |
| 799 | if (th == NULL) { | 816 | if (th == NULL) { |
| 800 | if (LOG_INVALID(IPPROTO_TCP)) | 817 | if (LOG_INVALID(IPPROTO_TCP)) |
| 801 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 818 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
| 802 | "ip_ct_tcp: short packet "); | 819 | "ip_ct_tcp: short packet "); |
| 803 | return -NF_ACCEPT; | 820 | return -NF_ACCEPT; |
| 804 | } | 821 | } |
| @@ -806,7 +823,7 @@ static int tcp_error(struct sk_buff *skb, | |||
| 806 | /* Not whole TCP header or malformed packet */ | 823 | /* Not whole TCP header or malformed packet */ |
| 807 | if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { | 824 | if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { |
| 808 | if (LOG_INVALID(IPPROTO_TCP)) | 825 | if (LOG_INVALID(IPPROTO_TCP)) |
| 809 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 826 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
| 810 | "ip_ct_tcp: truncated/malformed packet "); | 827 | "ip_ct_tcp: truncated/malformed packet "); |
| 811 | return -NF_ACCEPT; | 828 | return -NF_ACCEPT; |
| 812 | } | 829 | } |
| @@ -823,7 +840,7 @@ static int tcp_error(struct sk_buff *skb, | |||
| 823 | skb->ip_summed == CHECKSUM_HW ? skb->csum | 840 | skb->ip_summed == CHECKSUM_HW ? skb->csum |
| 824 | : skb_checksum(skb, iph->ihl*4, tcplen, 0))) { | 841 | : skb_checksum(skb, iph->ihl*4, tcplen, 0))) { |
| 825 | if (LOG_INVALID(IPPROTO_TCP)) | 842 | if (LOG_INVALID(IPPROTO_TCP)) |
| 826 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 843 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
| 827 | "ip_ct_tcp: bad TCP checksum "); | 844 | "ip_ct_tcp: bad TCP checksum "); |
| 828 | return -NF_ACCEPT; | 845 | return -NF_ACCEPT; |
| 829 | } | 846 | } |
| @@ -832,7 +849,7 @@ static int tcp_error(struct sk_buff *skb, | |||
| 832 | tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR)); | 849 | tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR)); |
| 833 | if (!tcp_valid_flags[tcpflags]) { | 850 | if (!tcp_valid_flags[tcpflags]) { |
| 834 | if (LOG_INVALID(IPPROTO_TCP)) | 851 | if (LOG_INVALID(IPPROTO_TCP)) |
| 835 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 852 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
| 836 | "ip_ct_tcp: invalid TCP flag combination "); | 853 | "ip_ct_tcp: invalid TCP flag combination "); |
| 837 | return -NF_ACCEPT; | 854 | return -NF_ACCEPT; |
| 838 | } | 855 | } |
| @@ -880,8 +897,9 @@ static int tcp_packet(struct ip_conntrack *conntrack, | |||
| 880 | */ | 897 | */ |
| 881 | write_unlock_bh(&tcp_lock); | 898 | write_unlock_bh(&tcp_lock); |
| 882 | if (LOG_INVALID(IPPROTO_TCP)) | 899 | if (LOG_INVALID(IPPROTO_TCP)) |
| 883 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 900 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, |
| 884 | "ip_ct_tcp: killing out of sync session "); | 901 | NULL, "ip_ct_tcp: " |
| 902 | "killing out of sync session "); | ||
| 885 | if (del_timer(&conntrack->timeout)) | 903 | if (del_timer(&conntrack->timeout)) |
| 886 | conntrack->timeout.function((unsigned long) | 904 | conntrack->timeout.function((unsigned long) |
| 887 | conntrack); | 905 | conntrack); |
| @@ -895,7 +913,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, | |||
| 895 | 913 | ||
| 896 | write_unlock_bh(&tcp_lock); | 914 | write_unlock_bh(&tcp_lock); |
| 897 | if (LOG_INVALID(IPPROTO_TCP)) | 915 | if (LOG_INVALID(IPPROTO_TCP)) |
| 898 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 916 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
| 899 | "ip_ct_tcp: invalid packet ignored "); | 917 | "ip_ct_tcp: invalid packet ignored "); |
| 900 | return NF_ACCEPT; | 918 | return NF_ACCEPT; |
| 901 | case TCP_CONNTRACK_MAX: | 919 | case TCP_CONNTRACK_MAX: |
| @@ -905,7 +923,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, | |||
| 905 | old_state); | 923 | old_state); |
| 906 | write_unlock_bh(&tcp_lock); | 924 | write_unlock_bh(&tcp_lock); |
| 907 | if (LOG_INVALID(IPPROTO_TCP)) | 925 | if (LOG_INVALID(IPPROTO_TCP)) |
| 908 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 926 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
| 909 | "ip_ct_tcp: invalid state "); | 927 | "ip_ct_tcp: invalid state "); |
| 910 | return -NF_ACCEPT; | 928 | return -NF_ACCEPT; |
| 911 | case TCP_CONNTRACK_SYN_SENT: | 929 | case TCP_CONNTRACK_SYN_SENT: |
| @@ -926,7 +944,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, | |||
| 926 | write_unlock_bh(&tcp_lock); | 944 | write_unlock_bh(&tcp_lock); |
| 927 | if (LOG_INVALID(IPPROTO_TCP)) | 945 | if (LOG_INVALID(IPPROTO_TCP)) |
| 928 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 946 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, |
| 929 | "ip_ct_tcp: invalid SYN"); | 947 | NULL, "ip_ct_tcp: invalid SYN"); |
| 930 | return -NF_ACCEPT; | 948 | return -NF_ACCEPT; |
| 931 | } | 949 | } |
| 932 | case TCP_CONNTRACK_CLOSE: | 950 | case TCP_CONNTRACK_CLOSE: |
| @@ -973,6 +991,10 @@ static int tcp_packet(struct ip_conntrack *conntrack, | |||
| 973 | ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; | 991 | ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; |
| 974 | write_unlock_bh(&tcp_lock); | 992 | write_unlock_bh(&tcp_lock); |
| 975 | 993 | ||
| 994 | ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); | ||
| 995 | if (new_state != old_state) | ||
| 996 | ip_conntrack_event_cache(IPCT_PROTOINFO, skb); | ||
| 997 | |||
| 976 | if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { | 998 | if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { |
| 977 | /* If only reply is a RST, we can consider ourselves not to | 999 | /* If only reply is a RST, we can consider ourselves not to |
| 978 | have an established connection: this is a fairly common | 1000 | have an established connection: this is a fairly common |
| @@ -1096,4 +1118,10 @@ struct ip_conntrack_protocol ip_conntrack_protocol_tcp = | |||
| 1096 | .packet = tcp_packet, | 1118 | .packet = tcp_packet, |
| 1097 | .new = tcp_new, | 1119 | .new = tcp_new, |
| 1098 | .error = tcp_error, | 1120 | .error = tcp_error, |
| 1121 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
| 1122 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
| 1123 | .to_nfattr = tcp_to_nfattr, | ||
| 1124 | .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, | ||
| 1125 | .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, | ||
| 1126 | #endif | ||
| 1099 | }; | 1127 | }; |
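
tcp_to_nfattr() snapshots the state byte under tcp_lock and emits it as CTA_PROTOINFO_TCP_STATE inside the CTA_PROTOINFO nest built by its caller. A hypothetical userspace decoder (not part of the patch) could recover it with the NFA_* walking macros from <linux/netfilter/nfnetlink.h>; note that in this version the state attribute sits directly inside CTA_PROTOINFO:

    /* Hypothetical helper: scan one message's attribute area for the TCP
     * state exported by tcp_to_nfattr().  'nfa' points at the first
     * attribute after the nfgenmsg header, 'len' is the bytes remaining. */
    static int find_tcp_state(struct nfattr *nfa, int len, u_int8_t *state)
    {
            while (NFA_OK(nfa, len)) {
                    if (NFA_TYPE(nfa) == CTA_PROTOINFO) {
                            struct nfattr *in = NFA_DATA(nfa);
                            int ilen = NFA_PAYLOAD(nfa);

                            while (NFA_OK(in, ilen)) {
                                    if (NFA_TYPE(in) == CTA_PROTOINFO_TCP_STATE) {
                                            *state = *(u_int8_t *)NFA_DATA(in);
                                            return 0;
                                    }
                                    in = NFA_NEXT(in, ilen);
                            }
                    }
                    nfa = NFA_NEXT(nfa, len);
            }
            return -1;      /* absent, e.g. a non-TCP entry */
    }
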
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index 8c1eaba098d4..f2dcac7c7660 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c | |||
| @@ -73,7 +73,8 @@ static int udp_packet(struct ip_conntrack *conntrack, | |||
| 73 | ip_ct_refresh_acct(conntrack, ctinfo, skb, | 73 | ip_ct_refresh_acct(conntrack, ctinfo, skb, |
| 74 | ip_ct_udp_timeout_stream); | 74 | ip_ct_udp_timeout_stream); |
| 75 | /* Also, more likely to be important, and not a probe */ | 75 | /* Also, more likely to be important, and not a probe */ |
| 76 | set_bit(IPS_ASSURED_BIT, &conntrack->status); | 76 | if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status)) |
| 77 | ip_conntrack_event_cache(IPCT_STATUS, skb); | ||
| 77 | } else | 78 | } else |
| 78 | ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout); | 79 | ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout); |
| 79 | 80 | ||
| @@ -97,7 +98,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, | |||
| 97 | hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr); | 98 | hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr); |
| 98 | if (hdr == NULL) { | 99 | if (hdr == NULL) { |
| 99 | if (LOG_INVALID(IPPROTO_UDP)) | 100 | if (LOG_INVALID(IPPROTO_UDP)) |
| 100 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 101 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
| 101 | "ip_ct_udp: short packet "); | 102 | "ip_ct_udp: short packet "); |
| 102 | return -NF_ACCEPT; | 103 | return -NF_ACCEPT; |
| 103 | } | 104 | } |
| @@ -105,7 +106,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, | |||
| 105 | /* Truncated/malformed packets */ | 106 | /* Truncated/malformed packets */ |
| 106 | if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { | 107 | if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { |
| 107 | if (LOG_INVALID(IPPROTO_UDP)) | 108 | if (LOG_INVALID(IPPROTO_UDP)) |
| 108 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 109 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
| 109 | "ip_ct_udp: truncated/malformed packet "); | 110 | "ip_ct_udp: truncated/malformed packet "); |
| 110 | return -NF_ACCEPT; | 111 | return -NF_ACCEPT; |
| 111 | } | 112 | } |
| @@ -125,7 +126,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, | |||
| 125 | skb->ip_summed == CHECKSUM_HW ? skb->csum | 126 | skb->ip_summed == CHECKSUM_HW ? skb->csum |
| 126 | : skb_checksum(skb, iph->ihl*4, udplen, 0))) { | 127 | : skb_checksum(skb, iph->ihl*4, udplen, 0))) { |
| 127 | if (LOG_INVALID(IPPROTO_UDP)) | 128 | if (LOG_INVALID(IPPROTO_UDP)) |
| 128 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 129 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
| 129 | "ip_ct_udp: bad UDP checksum "); | 130 | "ip_ct_udp: bad UDP checksum "); |
| 130 | return -NF_ACCEPT; | 131 | return -NF_ACCEPT; |
| 131 | } | 132 | } |
| @@ -144,4 +145,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_udp = | |||
| 144 | .packet = udp_packet, | 145 | .packet = udp_packet, |
| 145 | .new = udp_new, | 146 | .new = udp_new, |
| 146 | .error = udp_error, | 147 | .error = udp_error, |
| 148 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
| 149 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
| 150 | .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, | ||
| 151 | .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, | ||
| 152 | #endif | ||
| 147 | }; | 153 | }; |
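
The UDP change is the smallest instance of the fire-once event pattern used throughout this patch: test_and_set_bit() returns the previous bit value, so the status event is generated exactly on the 0-to-1 transition of IPS_ASSURED and never again for the stream's later packets. Isolated as a sketch (names as in this patch):

    /* Sketch of the fire-once idiom: report a status change exactly once. */
    static inline void mark_assured(struct ip_conntrack *ct,
                                    const struct sk_buff *skb)
    {
            /* test_and_set_bit() returns the old bit, so only the first
             * caller sees 0 and queues the IPCT_STATUS event */
            if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
                    ip_conntrack_event_cache(IPCT_STATUS, skb);
    }
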
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index 61798c46e91d..ee5895afd0c3 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c | |||
| @@ -5,7 +5,7 @@ | |||
| 5 | */ | 5 | */ |
| 6 | 6 | ||
| 7 | /* (C) 1999-2001 Paul `Rusty' Russell | 7 | /* (C) 1999-2001 Paul `Rusty' Russell |
| 8 | * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> | 8 | * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org> |
| 9 | * | 9 | * |
| 10 | * This program is free software; you can redistribute it and/or modify | 10 | * This program is free software; you can redistribute it and/or modify |
| 11 | * it under the terms of the GNU General Public License version 2 as | 11 | * it under the terms of the GNU General Public License version 2 as |
| @@ -147,8 +147,7 @@ static int ct_seq_show(struct seq_file *s, void *v) | |||
| 147 | if (DIRECTION(hash)) | 147 | if (DIRECTION(hash)) |
| 148 | return 0; | 148 | return 0; |
| 149 | 149 | ||
| 150 | proto = ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] | 150 | proto = __ip_conntrack_proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); |
| 151 | .tuple.dst.protonum); | ||
| 152 | IP_NF_ASSERT(proto); | 151 | IP_NF_ASSERT(proto); |
| 153 | 152 | ||
| 154 | if (seq_printf(s, "%-8s %u %ld ", | 153 | if (seq_printf(s, "%-8s %u %ld ", |
| @@ -185,7 +184,7 @@ static int ct_seq_show(struct seq_file *s, void *v) | |||
| 185 | return -ENOSPC; | 184 | return -ENOSPC; |
| 186 | 185 | ||
| 187 | #if defined(CONFIG_IP_NF_CONNTRACK_MARK) | 186 | #if defined(CONFIG_IP_NF_CONNTRACK_MARK) |
| 188 | if (seq_printf(s, "mark=%lu ", conntrack->mark)) | 187 | if (seq_printf(s, "mark=%u ", conntrack->mark)) |
| 189 | return -ENOSPC; | 188 | return -ENOSPC; |
| 190 | #endif | 189 | #endif |
| 191 | 190 | ||
| @@ -283,7 +282,7 @@ static int exp_seq_show(struct seq_file *s, void *v) | |||
| 283 | seq_printf(s, "proto=%u ", expect->tuple.dst.protonum); | 282 | seq_printf(s, "proto=%u ", expect->tuple.dst.protonum); |
| 284 | 283 | ||
| 285 | print_tuple(s, &expect->tuple, | 284 | print_tuple(s, &expect->tuple, |
| 286 | ip_ct_find_proto(expect->tuple.dst.protonum)); | 285 | __ip_conntrack_proto_find(expect->tuple.dst.protonum)); |
| 287 | return seq_putc(s, '\n'); | 286 | return seq_putc(s, '\n'); |
| 288 | } | 287 | } |
| 289 | 288 | ||
| @@ -889,6 +888,7 @@ static int init_or_cleanup(int init) | |||
| 889 | return ret; | 888 | return ret; |
| 890 | 889 | ||
| 891 | cleanup: | 890 | cleanup: |
| 891 | synchronize_net(); | ||
| 892 | #ifdef CONFIG_SYSCTL | 892 | #ifdef CONFIG_SYSCTL |
| 893 | unregister_sysctl_table(ip_ct_sysctl_header); | 893 | unregister_sysctl_table(ip_ct_sysctl_header); |
| 894 | cleanup_localinops: | 894 | cleanup_localinops: |
| @@ -971,6 +971,14 @@ void need_ip_conntrack(void) | |||
| 971 | { | 971 | { |
| 972 | } | 972 | } |
| 973 | 973 | ||
| 974 | #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS | ||
| 975 | EXPORT_SYMBOL_GPL(ip_conntrack_chain); | ||
| 976 | EXPORT_SYMBOL_GPL(ip_conntrack_expect_chain); | ||
| 977 | EXPORT_SYMBOL_GPL(ip_conntrack_register_notifier); | ||
| 978 | EXPORT_SYMBOL_GPL(ip_conntrack_unregister_notifier); | ||
| 979 | EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init); | ||
| 980 | EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache); | ||
| 981 | #endif | ||
| 974 | EXPORT_SYMBOL(ip_conntrack_protocol_register); | 982 | EXPORT_SYMBOL(ip_conntrack_protocol_register); |
| 975 | EXPORT_SYMBOL(ip_conntrack_protocol_unregister); | 983 | EXPORT_SYMBOL(ip_conntrack_protocol_unregister); |
| 976 | EXPORT_SYMBOL(ip_ct_get_tuple); | 984 | EXPORT_SYMBOL(ip_ct_get_tuple); |
| @@ -982,12 +990,16 @@ EXPORT_SYMBOL(ip_conntrack_helper_register); | |||
| 982 | EXPORT_SYMBOL(ip_conntrack_helper_unregister); | 990 | EXPORT_SYMBOL(ip_conntrack_helper_unregister); |
| 983 | EXPORT_SYMBOL(ip_ct_iterate_cleanup); | 991 | EXPORT_SYMBOL(ip_ct_iterate_cleanup); |
| 984 | EXPORT_SYMBOL(ip_ct_refresh_acct); | 992 | EXPORT_SYMBOL(ip_ct_refresh_acct); |
| 985 | EXPORT_SYMBOL(ip_ct_protos); | 993 | |
| 986 | EXPORT_SYMBOL(ip_ct_find_proto); | ||
| 987 | EXPORT_SYMBOL(ip_conntrack_expect_alloc); | 994 | EXPORT_SYMBOL(ip_conntrack_expect_alloc); |
| 988 | EXPORT_SYMBOL(ip_conntrack_expect_put); | 995 | EXPORT_SYMBOL(ip_conntrack_expect_put); |
| 996 | EXPORT_SYMBOL_GPL(ip_conntrack_expect_find_get); | ||
| 989 | EXPORT_SYMBOL(ip_conntrack_expect_related); | 997 | EXPORT_SYMBOL(ip_conntrack_expect_related); |
| 990 | EXPORT_SYMBOL(ip_conntrack_unexpect_related); | 998 | EXPORT_SYMBOL(ip_conntrack_unexpect_related); |
| 999 | EXPORT_SYMBOL_GPL(ip_conntrack_expect_list); | ||
| 1000 | EXPORT_SYMBOL_GPL(__ip_conntrack_expect_find); | ||
| 1001 | EXPORT_SYMBOL_GPL(__ip_ct_expect_unlink_destroy); | ||
| 1002 | |||
| 991 | EXPORT_SYMBOL(ip_conntrack_tuple_taken); | 1003 | EXPORT_SYMBOL(ip_conntrack_tuple_taken); |
| 992 | EXPORT_SYMBOL(ip_ct_gather_frags); | 1004 | EXPORT_SYMBOL(ip_ct_gather_frags); |
| 993 | EXPORT_SYMBOL(ip_conntrack_htable_size); | 1005 | EXPORT_SYMBOL(ip_conntrack_htable_size); |
| @@ -995,7 +1007,28 @@ EXPORT_SYMBOL(ip_conntrack_lock); | |||
| 995 | EXPORT_SYMBOL(ip_conntrack_hash); | 1007 | EXPORT_SYMBOL(ip_conntrack_hash); |
| 996 | EXPORT_SYMBOL(ip_conntrack_untracked); | 1008 | EXPORT_SYMBOL(ip_conntrack_untracked); |
| 997 | EXPORT_SYMBOL_GPL(ip_conntrack_find_get); | 1009 | EXPORT_SYMBOL_GPL(ip_conntrack_find_get); |
| 998 | EXPORT_SYMBOL_GPL(ip_conntrack_put); | ||
| 999 | #ifdef CONFIG_IP_NF_NAT_NEEDED | 1010 | #ifdef CONFIG_IP_NF_NAT_NEEDED |
| 1000 | EXPORT_SYMBOL(ip_conntrack_tcp_update); | 1011 | EXPORT_SYMBOL(ip_conntrack_tcp_update); |
| 1001 | #endif | 1012 | #endif |
| 1013 | |||
| 1014 | EXPORT_SYMBOL_GPL(ip_conntrack_flush); | ||
| 1015 | EXPORT_SYMBOL_GPL(__ip_conntrack_find); | ||
| 1016 | |||
| 1017 | EXPORT_SYMBOL_GPL(ip_conntrack_alloc); | ||
| 1018 | EXPORT_SYMBOL_GPL(ip_conntrack_free); | ||
| 1019 | EXPORT_SYMBOL_GPL(ip_conntrack_hash_insert); | ||
| 1020 | |||
| 1021 | EXPORT_SYMBOL_GPL(ip_ct_remove_expectations); | ||
| 1022 | |||
| 1023 | EXPORT_SYMBOL_GPL(ip_conntrack_helper_find_get); | ||
| 1024 | EXPORT_SYMBOL_GPL(ip_conntrack_helper_put); | ||
| 1025 | EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname); | ||
| 1026 | |||
| 1027 | EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get); | ||
| 1028 | EXPORT_SYMBOL_GPL(ip_conntrack_proto_put); | ||
| 1029 | EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find); | ||
| 1030 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
| 1031 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
| 1032 | EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr); | ||
| 1033 | EXPORT_SYMBOL_GPL(ip_ct_port_nfattr_to_tuple); | ||
| 1034 | #endif | ||
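
The block of `EXPORT_SYMBOL_GPL` additions above opens the conntrack core to the new ctnetlink code: the event chains, expectation list, and helper/protocol lookups become callable from GPL modules. A minimal sketch of a consumer of the newly exported event chain (hypothetical module; callback shape assumed from the 2.6-era notifier_block convention):

```c
/* Hypothetical consumer of the newly exported conntrack event chain.
 * Sketch only: assumes the 2.6-era notifier_block convention and the
 * IPCT_* event bits from <linux/netfilter_ipv4/ip_conntrack.h>. */
#include <linux/kernel.h>
#include <linux/notifier.h>
#include <linux/netfilter_ipv4/ip_conntrack.h>

static int my_ct_event(struct notifier_block *nb,
                       unsigned long events, void *ptr)
{
        struct ip_conntrack *ct = ptr;

        if (events & IPCT_NEW)
                printk(KERN_DEBUG "new connection tracked: %p\n", ct);
        return NOTIFY_DONE;
}

static struct notifier_block my_ct_notifier = {
        .notifier_call = my_ct_event,
};

/* module init/exit would then call:
 *      ip_conntrack_register_notifier(&my_ct_notifier);
 *      ip_conntrack_unregister_notifier(&my_ct_notifier);
 */
```
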
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index 739b6dde1c82..1adedb743f60 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c | |||
| @@ -47,8 +47,39 @@ DEFINE_RWLOCK(ip_nat_lock); | |||
| 47 | static unsigned int ip_nat_htable_size; | 47 | static unsigned int ip_nat_htable_size; |
| 48 | 48 | ||
| 49 | static struct list_head *bysource; | 49 | static struct list_head *bysource; |
| 50 | |||
| 51 | #define MAX_IP_NAT_PROTO 256 | ||
| 50 | struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; | 52 | struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; |
| 51 | 53 | ||
| 54 | static inline struct ip_nat_protocol * | ||
| 55 | __ip_nat_proto_find(u_int8_t protonum) | ||
| 56 | { | ||
| 57 | return ip_nat_protos[protonum]; | ||
| 58 | } | ||
| 59 | |||
| 60 | struct ip_nat_protocol * | ||
| 61 | ip_nat_proto_find_get(u_int8_t protonum) | ||
| 62 | { | ||
| 63 | struct ip_nat_protocol *p; | ||
| 64 | |||
| 65 | /* we need to disable preemption to make sure 'p' doesn't get | ||
| 66 | * removed until we've grabbed the reference */ | ||
| 67 | preempt_disable(); | ||
| 68 | p = __ip_nat_proto_find(protonum); | ||
| 69 | if (p) { | ||
| 70 | if (!try_module_get(p->me)) | ||
| 71 | p = &ip_nat_unknown_protocol; | ||
| 72 | } | ||
| 73 | preempt_enable(); | ||
| 74 | |||
| 75 | return p; | ||
| 76 | } | ||
| 77 | |||
| 78 | void | ||
| 79 | ip_nat_proto_put(struct ip_nat_protocol *p) | ||
| 80 | { | ||
| 81 | module_put(p->me); | ||
| 82 | } | ||
| 52 | 83 | ||
| 53 | /* We keep an extra hash for each conntrack, for fast searching. */ | 84 | /* We keep an extra hash for each conntrack, for fast searching. */ |
| 54 | static inline unsigned int | 85 | static inline unsigned int |
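
`ip_nat_proto_find_get()` and `ip_nat_proto_put()` wrap the raw table lookup in module reference counting, so a protocol module cannot be unloaded while a caller still holds its ops; if the reference cannot be taken, the lookup degrades to `ip_nat_unknown_protocol`. A sketch of the intended get/use/put pattern (hypothetical caller, mirroring how the patch itself uses the pair below):

```c
/* Hypothetical caller: the get/use/put discipline the new helpers
 * establish. Assumes <linux/netfilter_ipv4/ip_nat_protocol.h>. */
static int example_in_range(const struct ip_conntrack_tuple *tuple,
                            enum ip_nat_manip_type maniptype,
                            const struct ip_nat_range *range)
{
        struct ip_nat_protocol *p;
        int ret;

        p = ip_nat_proto_find_get(tuple->dst.protonum); /* takes module ref */
        ret = p->in_range(tuple, maniptype, &range->min, &range->max);
        ip_nat_proto_put(p);                            /* drops module ref */
        return ret;
}
```
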
| @@ -103,7 +134,8 @@ static int | |||
| 103 | in_range(const struct ip_conntrack_tuple *tuple, | 134 | in_range(const struct ip_conntrack_tuple *tuple, |
| 104 | const struct ip_nat_range *range) | 135 | const struct ip_nat_range *range) |
| 105 | { | 136 | { |
| 106 | struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum); | 137 | struct ip_nat_protocol *proto = |
| 138 | __ip_nat_proto_find(tuple->dst.protonum); | ||
| 107 | 139 | ||
| 108 | /* If we are supposed to map IPs, then we must be in the | 140 | /* If we are supposed to map IPs, then we must be in the |
| 109 | range specified, otherwise let this drag us onto a new src IP. */ | 141 | range specified, otherwise let this drag us onto a new src IP. */ |
| @@ -216,8 +248,7 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple, | |||
| 216 | struct ip_conntrack *conntrack, | 248 | struct ip_conntrack *conntrack, |
| 217 | enum ip_nat_manip_type maniptype) | 249 | enum ip_nat_manip_type maniptype) |
| 218 | { | 250 | { |
| 219 | struct ip_nat_protocol *proto | 251 | struct ip_nat_protocol *proto; |
| 220 | = ip_nat_find_proto(orig_tuple->dst.protonum); | ||
| 221 | 252 | ||
| 222 | /* 1) If this srcip/proto/src-proto-part is currently mapped, | 253 | /* 1) If this srcip/proto/src-proto-part is currently mapped, |
| 223 | and that same mapping gives a unique tuple within the given | 254 | and that same mapping gives a unique tuple within the given |
| @@ -242,14 +273,20 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple, | |||
| 242 | /* 3) The per-protocol part of the manip is made to map into | 273 | /* 3) The per-protocol part of the manip is made to map into |
| 243 | the range to make a unique tuple. */ | 274 | the range to make a unique tuple. */ |
| 244 | 275 | ||
| 276 | proto = ip_nat_proto_find_get(orig_tuple->dst.protonum); | ||
| 277 | |||
| 245 | /* Only bother mapping if it's not already in range and unique */ | 278 | /* Only bother mapping if it's not already in range and unique */ |
| 246 | if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) | 279 | if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) |
| 247 | || proto->in_range(tuple, maniptype, &range->min, &range->max)) | 280 | || proto->in_range(tuple, maniptype, &range->min, &range->max)) |
| 248 | && !ip_nat_used_tuple(tuple, conntrack)) | 281 | && !ip_nat_used_tuple(tuple, conntrack)) { |
| 282 | ip_nat_proto_put(proto); | ||
| 249 | return; | 283 | return; |
| 284 | } | ||
| 250 | 285 | ||
| 251 | /* Last chance: get protocol to try to obtain unique tuple. */ | 286 | /* Last chance: get protocol to try to obtain unique tuple. */ |
| 252 | proto->unique_tuple(tuple, range, maniptype, conntrack); | 287 | proto->unique_tuple(tuple, range, maniptype, conntrack); |
| 288 | |||
| 289 | ip_nat_proto_put(proto); | ||
| 253 | } | 290 | } |
| 254 | 291 | ||
| 255 | unsigned int | 292 | unsigned int |
| @@ -320,17 +357,20 @@ manip_pkt(u_int16_t proto, | |||
| 320 | enum ip_nat_manip_type maniptype) | 357 | enum ip_nat_manip_type maniptype) |
| 321 | { | 358 | { |
| 322 | struct iphdr *iph; | 359 | struct iphdr *iph; |
| 360 | struct ip_nat_protocol *p; | ||
| 323 | 361 | ||
| 324 | (*pskb)->nfcache |= NFC_ALTERED; | 362 | if (!skb_make_writable(pskb, iphdroff + sizeof(*iph))) |
| 325 | if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph))) | ||
| 326 | return 0; | 363 | return 0; |
| 327 | 364 | ||
| 328 | iph = (void *)(*pskb)->data + iphdroff; | 365 | iph = (void *)(*pskb)->data + iphdroff; |
| 329 | 366 | ||
| 330 | /* Manipulate protocol part. */ | 367 | /* Manipulate protocol part. */ |
| 331 | if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff, | 368 | p = ip_nat_proto_find_get(proto); |
| 332 | target, maniptype)) | 369 | if (!p->manip_pkt(pskb, iphdroff, target, maniptype)) { |
| 370 | ip_nat_proto_put(p); | ||
| 333 | return 0; | 371 | return 0; |
| 372 | } | ||
| 373 | ip_nat_proto_put(p); | ||
| 334 | 374 | ||
| 335 | iph = (void *)(*pskb)->data + iphdroff; | 375 | iph = (void *)(*pskb)->data + iphdroff; |
| 336 | 376 | ||
| @@ -391,7 +431,7 @@ int icmp_reply_translation(struct sk_buff **pskb, | |||
| 391 | struct ip_conntrack_tuple inner, target; | 431 | struct ip_conntrack_tuple inner, target; |
| 392 | int hdrlen = (*pskb)->nh.iph->ihl * 4; | 432 | int hdrlen = (*pskb)->nh.iph->ihl * 4; |
| 393 | 433 | ||
| 394 | if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside))) | 434 | if (!skb_make_writable(pskb, hdrlen + sizeof(*inside))) |
| 395 | return 0; | 435 | return 0; |
| 396 | 436 | ||
| 397 | inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; | 437 | inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; |
| @@ -426,7 +466,8 @@ int icmp_reply_translation(struct sk_buff **pskb, | |||
| 426 | 466 | ||
| 427 | if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 + | 467 | if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 + |
| 428 | sizeof(struct icmphdr) + inside->ip.ihl*4, | 468 | sizeof(struct icmphdr) + inside->ip.ihl*4, |
| 429 | &inner, ip_ct_find_proto(inside->ip.protocol))) | 469 | &inner, |
| 470 | __ip_conntrack_proto_find(inside->ip.protocol))) | ||
| 430 | return 0; | 471 | return 0; |
| 431 | 472 | ||
| 432 | /* Change inner back to look like incoming packet. We do the | 473 | /* Change inner back to look like incoming packet. We do the |
| @@ -496,6 +537,49 @@ void ip_nat_protocol_unregister(struct ip_nat_protocol *proto) | |||
| 496 | synchronize_net(); | 537 | synchronize_net(); |
| 497 | } | 538 | } |
| 498 | 539 | ||
| 540 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
| 541 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
| 542 | int | ||
| 543 | ip_nat_port_range_to_nfattr(struct sk_buff *skb, | ||
| 544 | const struct ip_nat_range *range) | ||
| 545 | { | ||
| 546 | NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(u_int16_t), | ||
| 547 | &range->min.tcp.port); | ||
| 548 | NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(u_int16_t), | ||
| 549 | &range->max.tcp.port); | ||
| 550 | |||
| 551 | return 0; | ||
| 552 | |||
| 553 | nfattr_failure: | ||
| 554 | return -1; | ||
| 555 | } | ||
| 556 | |||
| 557 | int | ||
| 558 | ip_nat_port_nfattr_to_range(struct nfattr *tb[], struct ip_nat_range *range) | ||
| 559 | { | ||
| 560 | int ret = 0; | ||
| 561 | |||
| 562 | /* we have to return whether we actually parsed something or not */ | ||
| 563 | |||
| 564 | if (tb[CTA_PROTONAT_PORT_MIN-1]) { | ||
| 565 | ret = 1; | ||
| 566 | range->min.tcp.port = | ||
| 567 | *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]); | ||
| 568 | } | ||
| 569 | |||
| 570 | if (!tb[CTA_PROTONAT_PORT_MAX-1]) { | ||
| 571 | if (ret) | ||
| 572 | range->max.tcp.port = range->min.tcp.port; | ||
| 573 | } else { | ||
| 574 | ret = 1; | ||
| 575 | range->max.tcp.port = | ||
| 576 | *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]); | ||
| 577 | } | ||
| 578 | |||
| 579 | return ret; | ||
| 580 | } | ||
| 581 | #endif | ||
| 582 | |||
| 499 | int __init ip_nat_init(void) | 583 | int __init ip_nat_init(void) |
| 500 | { | 584 | { |
| 501 | size_t i; | 585 | size_t i; |
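
The two helpers above become the shared netlink glue for the port-based NAT protocols: `ip_nat_port_range_to_nfattr()` serialises a range into `CTA_PROTONAT_PORT_{MIN,MAX}` attributes, while `ip_nat_port_nfattr_to_range()` returns 1 only when at least one attribute was actually present, mirroring a lone minimum into the maximum. A hedged sketch of the parse side (hypothetical caller; `tb[]` assumed already filled by nested-attribute parsing, ports in network byte order as in the tuples themselves):

```c
/* Hypothetical parse path for ip_nat_port_nfattr_to_range(). */
static void example_parse(struct nfattr *tb[])
{
        struct ip_nat_range range = { .flags = IP_NAT_RANGE_PROTO_SPECIFIED };

        if (ip_nat_port_nfattr_to_range(tb, &range))
                /* at least one attribute was present; a lone PORT_MIN
                 * has already been mirrored into range.max */
                printk(KERN_DEBUG "NAT port range %u-%u\n",
                       ntohs(range.min.tcp.port), ntohs(range.max.tcp.port));
        else
                /* nothing given: fall back to an unspecified range */
                range.flags &= ~IP_NAT_RANGE_PROTO_SPECIFIED;
}
```
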
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c index 158f34f32c04..d2dd5d313556 100644 --- a/net/ipv4/netfilter/ip_nat_helper.c +++ b/net/ipv4/netfilter/ip_nat_helper.c | |||
| @@ -168,7 +168,7 @@ ip_nat_mangle_tcp_packet(struct sk_buff **pskb, | |||
| 168 | struct tcphdr *tcph; | 168 | struct tcphdr *tcph; |
| 169 | int datalen; | 169 | int datalen; |
| 170 | 170 | ||
| 171 | if (!skb_ip_make_writable(pskb, (*pskb)->len)) | 171 | if (!skb_make_writable(pskb, (*pskb)->len)) |
| 172 | return 0; | 172 | return 0; |
| 173 | 173 | ||
| 174 | if (rep_len > match_len | 174 | if (rep_len > match_len |
| @@ -228,7 +228,7 @@ ip_nat_mangle_udp_packet(struct sk_buff **pskb, | |||
| 228 | match_offset + match_len) | 228 | match_offset + match_len) |
| 229 | return 0; | 229 | return 0; |
| 230 | 230 | ||
| 231 | if (!skb_ip_make_writable(pskb, (*pskb)->len)) | 231 | if (!skb_make_writable(pskb, (*pskb)->len)) |
| 232 | return 0; | 232 | return 0; |
| 233 | 233 | ||
| 234 | if (rep_len > match_len | 234 | if (rep_len > match_len |
| @@ -315,7 +315,7 @@ ip_nat_sack_adjust(struct sk_buff **pskb, | |||
| 315 | optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr); | 315 | optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr); |
| 316 | optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4; | 316 | optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4; |
| 317 | 317 | ||
| 318 | if (!skb_ip_make_writable(pskb, optend)) | 318 | if (!skb_make_writable(pskb, optend)) |
| 319 | return 0; | 319 | return 0; |
| 320 | 320 | ||
| 321 | dir = CTINFO2DIR(ctinfo); | 321 | dir = CTINFO2DIR(ctinfo); |
| @@ -363,7 +363,7 @@ ip_nat_seq_adjust(struct sk_buff **pskb, | |||
| 363 | this_way = &ct->nat.info.seq[dir]; | 363 | this_way = &ct->nat.info.seq[dir]; |
| 364 | other_way = &ct->nat.info.seq[!dir]; | 364 | other_way = &ct->nat.info.seq[!dir]; |
| 365 | 365 | ||
| 366 | if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) | 366 | if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) |
| 367 | return 0; | 367 | return 0; |
| 368 | 368 | ||
| 369 | tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; | 369 | tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; |
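
All of the `skb_ip_make_writable()` calls in this file (and in the protocol modules below) become the generic `skb_make_writable()`. The calling convention is unchanged: the function guarantees the first `len` bytes are in private, writable, linear memory, and may replace `*pskb` in the process, which is why every caller passes the double pointer and re-derives its header pointers afterwards. A small sketch of the discipline (hypothetical helper):

```c
/* Hypothetical helper showing the skb_make_writable() discipline used
 * throughout this patch: pass the double pointer, then re-read any
 * cached header pointers, since the skb may have been copied. */
static int example_decrement_ttl(struct sk_buff **pskb)
{
        struct iphdr *iph;

        if (!skb_make_writable(pskb, sizeof(struct iphdr)))
                return 0;               /* copy failed: caller drops */

        iph = (*pskb)->nh.iph;          /* re-derive after possible copy */
        iph->ttl--;                     /* checksum fix-up omitted here */
        return 1;
}
```
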
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c index 6596c9ee1655..938719043999 100644 --- a/net/ipv4/netfilter/ip_nat_proto_icmp.c +++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c | |||
| @@ -62,7 +62,7 @@ icmp_manip_pkt(struct sk_buff **pskb, | |||
| 62 | struct icmphdr *hdr; | 62 | struct icmphdr *hdr; |
| 63 | unsigned int hdroff = iphdroff + iph->ihl*4; | 63 | unsigned int hdroff = iphdroff + iph->ihl*4; |
| 64 | 64 | ||
| 65 | if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) | 65 | if (!skb_make_writable(pskb, hdroff + sizeof(*hdr))) |
| 66 | return 0; | 66 | return 0; |
| 67 | 67 | ||
| 68 | hdr = (struct icmphdr *)((*pskb)->data + hdroff); | 68 | hdr = (struct icmphdr *)((*pskb)->data + hdroff); |
| @@ -106,11 +106,18 @@ icmp_print_range(char *buffer, const struct ip_nat_range *range) | |||
| 106 | else return 0; | 106 | else return 0; |
| 107 | } | 107 | } |
| 108 | 108 | ||
| 109 | struct ip_nat_protocol ip_nat_protocol_icmp | 109 | struct ip_nat_protocol ip_nat_protocol_icmp = { |
| 110 | = { "ICMP", IPPROTO_ICMP, | 110 | .name = "ICMP", |
| 111 | icmp_manip_pkt, | 111 | .protonum = IPPROTO_ICMP, |
| 112 | icmp_in_range, | 112 | .me = THIS_MODULE, |
| 113 | icmp_unique_tuple, | 113 | .manip_pkt = icmp_manip_pkt, |
| 114 | icmp_print, | 114 | .in_range = icmp_in_range, |
| 115 | icmp_print_range | 115 | .unique_tuple = icmp_unique_tuple, |
| 116 | .print = icmp_print, | ||
| 117 | .print_range = icmp_print_range, | ||
| 118 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
| 119 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
| 120 | .range_to_nfattr = ip_nat_port_range_to_nfattr, | ||
| 121 | .nfattr_to_range = ip_nat_port_nfattr_to_range, | ||
| 122 | #endif | ||
| 116 | }; | 123 | }; |
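
The positional initializer for `ip_nat_protocol_icmp` is rewritten with C99 designated initializers, and the same conversion is applied to the TCP, UDP, and unknown protocol structures below; fields can then be reordered or added (here `.me` and the netlink callbacks) without silently shifting every later entry. A standalone toy of the idiom:

```c
/* Standalone toy (not a kernel type): why designated initializers are
 * safer than positional ones for ops-style structs. */
struct ops_example {
        const char *name;
        int (*handler)(int);
        int flags;              /* newly added field */
};

static int echo(int x) { return x; }

static struct ops_example example_ops = {
        .name    = "echo",      /* order-independent, self-documenting */
        .handler = echo,
        /* .flags left out: implicitly zero-initialized */
};
```
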
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c index a98e36d2b3c6..1d381bf68574 100644 --- a/net/ipv4/netfilter/ip_nat_proto_tcp.c +++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c | |||
| @@ -12,6 +12,7 @@ | |||
| 12 | #include <linux/ip.h> | 12 | #include <linux/ip.h> |
| 13 | #include <linux/tcp.h> | 13 | #include <linux/tcp.h> |
| 14 | #include <linux/if.h> | 14 | #include <linux/if.h> |
| 15 | #include <linux/netfilter/nfnetlink_conntrack.h> | ||
| 15 | #include <linux/netfilter_ipv4/ip_nat.h> | 16 | #include <linux/netfilter_ipv4/ip_nat.h> |
| 16 | #include <linux/netfilter_ipv4/ip_nat_rule.h> | 17 | #include <linux/netfilter_ipv4/ip_nat_rule.h> |
| 17 | #include <linux/netfilter_ipv4/ip_nat_protocol.h> | 18 | #include <linux/netfilter_ipv4/ip_nat_protocol.h> |
| @@ -102,7 +103,7 @@ tcp_manip_pkt(struct sk_buff **pskb, | |||
| 102 | if ((*pskb)->len >= hdroff + sizeof(struct tcphdr)) | 103 | if ((*pskb)->len >= hdroff + sizeof(struct tcphdr)) |
| 103 | hdrsize = sizeof(struct tcphdr); | 104 | hdrsize = sizeof(struct tcphdr); |
| 104 | 105 | ||
| 105 | if (!skb_ip_make_writable(pskb, hdroff + hdrsize)) | 106 | if (!skb_make_writable(pskb, hdroff + hdrsize)) |
| 106 | return 0; | 107 | return 0; |
| 107 | 108 | ||
| 108 | iph = (struct iphdr *)((*pskb)->data + iphdroff); | 109 | iph = (struct iphdr *)((*pskb)->data + iphdroff); |
| @@ -169,11 +170,18 @@ tcp_print_range(char *buffer, const struct ip_nat_range *range) | |||
| 169 | else return 0; | 170 | else return 0; |
| 170 | } | 171 | } |
| 171 | 172 | ||
| 172 | struct ip_nat_protocol ip_nat_protocol_tcp | 173 | struct ip_nat_protocol ip_nat_protocol_tcp = { |
| 173 | = { "TCP", IPPROTO_TCP, | 174 | .name = "TCP", |
| 174 | tcp_manip_pkt, | 175 | .protonum = IPPROTO_TCP, |
| 175 | tcp_in_range, | 176 | .me = THIS_MODULE, |
| 176 | tcp_unique_tuple, | 177 | .manip_pkt = tcp_manip_pkt, |
| 177 | tcp_print, | 178 | .in_range = tcp_in_range, |
| 178 | tcp_print_range | 179 | .unique_tuple = tcp_unique_tuple, |
| 180 | .print = tcp_print, | ||
| 181 | .print_range = tcp_print_range, | ||
| 182 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
| 183 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
| 184 | .range_to_nfattr = ip_nat_port_range_to_nfattr, | ||
| 185 | .nfattr_to_range = ip_nat_port_nfattr_to_range, | ||
| 186 | #endif | ||
| 179 | }; | 187 | }; |
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c index 9f66e5625664..c4906e1aa24a 100644 --- a/net/ipv4/netfilter/ip_nat_proto_udp.c +++ b/net/ipv4/netfilter/ip_nat_proto_udp.c | |||
| @@ -94,7 +94,7 @@ udp_manip_pkt(struct sk_buff **pskb, | |||
| 94 | u32 oldip, newip; | 94 | u32 oldip, newip; |
| 95 | u16 *portptr, newport; | 95 | u16 *portptr, newport; |
| 96 | 96 | ||
| 97 | if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) | 97 | if (!skb_make_writable(pskb, hdroff + sizeof(*hdr))) |
| 98 | return 0; | 98 | return 0; |
| 99 | 99 | ||
| 100 | iph = (struct iphdr *)((*pskb)->data + iphdroff); | 100 | iph = (struct iphdr *)((*pskb)->data + iphdroff); |
| @@ -156,11 +156,18 @@ udp_print_range(char *buffer, const struct ip_nat_range *range) | |||
| 156 | else return 0; | 156 | else return 0; |
| 157 | } | 157 | } |
| 158 | 158 | ||
| 159 | struct ip_nat_protocol ip_nat_protocol_udp | 159 | struct ip_nat_protocol ip_nat_protocol_udp = { |
| 160 | = { "UDP", IPPROTO_UDP, | 160 | .name = "UDP", |
| 161 | udp_manip_pkt, | 161 | .protonum = IPPROTO_UDP, |
| 162 | udp_in_range, | 162 | .me = THIS_MODULE, |
| 163 | udp_unique_tuple, | 163 | .manip_pkt = udp_manip_pkt, |
| 164 | udp_print, | 164 | .in_range = udp_in_range, |
| 165 | udp_print_range | 165 | .unique_tuple = udp_unique_tuple, |
| 166 | .print = udp_print, | ||
| 167 | .print_range = udp_print_range, | ||
| 168 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
| 169 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
| 170 | .range_to_nfattr = ip_nat_port_range_to_nfattr, | ||
| 171 | .nfattr_to_range = ip_nat_port_nfattr_to_range, | ||
| 172 | #endif | ||
| 166 | }; | 173 | }; |
diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c index f5525bd58d16..99bbef56f84e 100644 --- a/net/ipv4/netfilter/ip_nat_proto_unknown.c +++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c | |||
| @@ -61,10 +61,11 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range) | |||
| 61 | } | 61 | } |
| 62 | 62 | ||
| 63 | struct ip_nat_protocol ip_nat_unknown_protocol = { | 63 | struct ip_nat_protocol ip_nat_unknown_protocol = { |
| 64 | "unknown", 0, | 64 | .name = "unknown", |
| 65 | unknown_manip_pkt, | 65 | .me = THIS_MODULE, |
| 66 | unknown_in_range, | 66 | .manip_pkt = unknown_manip_pkt, |
| 67 | unknown_unique_tuple, | 67 | .in_range = unknown_in_range, |
| 68 | unknown_print, | 68 | .unique_tuple = unknown_unique_tuple, |
| 69 | unknown_print_range | 69 | .print = unknown_print, |
| 70 | .print_range = unknown_print_range | ||
| 70 | }; | 71 | }; |
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index 2a48b6e635ae..93b2c5111bb2 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c | |||
| @@ -1275,7 +1275,7 @@ static int help(struct sk_buff **pskb, | |||
| 1275 | return NF_DROP; | 1275 | return NF_DROP; |
| 1276 | } | 1276 | } |
| 1277 | 1277 | ||
| 1278 | if (!skb_ip_make_writable(pskb, (*pskb)->len)) | 1278 | if (!skb_make_writable(pskb, (*pskb)->len)) |
| 1279 | return NF_DROP; | 1279 | return NF_DROP; |
| 1280 | 1280 | ||
| 1281 | spin_lock_bh(&snmp_lock); | 1281 | spin_lock_bh(&snmp_lock); |
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index 91d5ea1dbbc9..89db052add81 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c | |||
| @@ -73,8 +73,6 @@ ip_nat_fn(unsigned int hooknum, | |||
| 73 | IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off | 73 | IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off |
| 74 | & htons(IP_MF|IP_OFFSET))); | 74 | & htons(IP_MF|IP_OFFSET))); |
| 75 | 75 | ||
| 76 | (*pskb)->nfcache |= NFC_UNKNOWN; | ||
| 77 | |||
| 78 | /* If we had a hardware checksum before, it's now invalid */ | 76 | /* If we had a hardware checksum before, it's now invalid */ |
| 79 | if ((*pskb)->ip_summed == CHECKSUM_HW) | 77 | if ((*pskb)->ip_summed == CHECKSUM_HW) |
| 80 | if (skb_checksum_help(*pskb, (out == NULL))) | 78 | if (skb_checksum_help(*pskb, (out == NULL))) |
| @@ -396,6 +394,8 @@ module_exit(fini); | |||
| 396 | EXPORT_SYMBOL(ip_nat_setup_info); | 394 | EXPORT_SYMBOL(ip_nat_setup_info); |
| 397 | EXPORT_SYMBOL(ip_nat_protocol_register); | 395 | EXPORT_SYMBOL(ip_nat_protocol_register); |
| 398 | EXPORT_SYMBOL(ip_nat_protocol_unregister); | 396 | EXPORT_SYMBOL(ip_nat_protocol_unregister); |
| 397 | EXPORT_SYMBOL_GPL(ip_nat_proto_find_get); | ||
| 398 | EXPORT_SYMBOL_GPL(ip_nat_proto_put); | ||
| 399 | EXPORT_SYMBOL(ip_nat_cheat_check); | 399 | EXPORT_SYMBOL(ip_nat_cheat_check); |
| 400 | EXPORT_SYMBOL(ip_nat_mangle_tcp_packet); | 400 | EXPORT_SYMBOL(ip_nat_mangle_tcp_packet); |
| 401 | EXPORT_SYMBOL(ip_nat_mangle_udp_packet); | 401 | EXPORT_SYMBOL(ip_nat_mangle_udp_packet); |
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index c6baa8174389..d54f14d926f6 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c | |||
| @@ -43,17 +43,10 @@ | |||
| 43 | #define NET_IPQ_QMAX 2088 | 43 | #define NET_IPQ_QMAX 2088 |
| 44 | #define NET_IPQ_QMAX_NAME "ip_queue_maxlen" | 44 | #define NET_IPQ_QMAX_NAME "ip_queue_maxlen" |
| 45 | 45 | ||
| 46 | struct ipq_rt_info { | ||
| 47 | __u8 tos; | ||
| 48 | __u32 daddr; | ||
| 49 | __u32 saddr; | ||
| 50 | }; | ||
| 51 | |||
| 52 | struct ipq_queue_entry { | 46 | struct ipq_queue_entry { |
| 53 | struct list_head list; | 47 | struct list_head list; |
| 54 | struct nf_info *info; | 48 | struct nf_info *info; |
| 55 | struct sk_buff *skb; | 49 | struct sk_buff *skb; |
| 56 | struct ipq_rt_info rt_info; | ||
| 57 | }; | 50 | }; |
| 58 | 51 | ||
| 59 | typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); | 52 | typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); |
| @@ -247,8 +240,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) | |||
| 247 | 240 | ||
| 248 | pmsg->packet_id = (unsigned long )entry; | 241 | pmsg->packet_id = (unsigned long )entry; |
| 249 | pmsg->data_len = data_len; | 242 | pmsg->data_len = data_len; |
| 250 | pmsg->timestamp_sec = entry->skb->stamp.tv_sec; | 243 | pmsg->timestamp_sec = skb_tv_base.tv_sec + entry->skb->tstamp.off_sec; |
| 251 | pmsg->timestamp_usec = entry->skb->stamp.tv_usec; | 244 | pmsg->timestamp_usec = skb_tv_base.tv_usec + entry->skb->tstamp.off_usec; |
| 252 | pmsg->mark = entry->skb->nfmark; | 245 | pmsg->mark = entry->skb->nfmark; |
| 253 | pmsg->hook = entry->info->hook; | 246 | pmsg->hook = entry->info->hook; |
| 254 | pmsg->hw_protocol = entry->skb->protocol; | 247 | pmsg->hw_protocol = entry->skb->protocol; |
| @@ -287,7 +280,8 @@ nlmsg_failure: | |||
| 287 | } | 280 | } |
| 288 | 281 | ||
| 289 | static int | 282 | static int |
| 290 | ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) | 283 | ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, |
| 284 | unsigned int queuenum, void *data) | ||
| 291 | { | 285 | { |
| 292 | int status = -EINVAL; | 286 | int status = -EINVAL; |
| 293 | struct sk_buff *nskb; | 287 | struct sk_buff *nskb; |
| @@ -305,14 +299,6 @@ ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) | |||
| 305 | entry->info = info; | 299 | entry->info = info; |
| 306 | entry->skb = skb; | 300 | entry->skb = skb; |
| 307 | 301 | ||
| 308 | if (entry->info->hook == NF_IP_LOCAL_OUT) { | ||
| 309 | struct iphdr *iph = skb->nh.iph; | ||
| 310 | |||
| 311 | entry->rt_info.tos = iph->tos; | ||
| 312 | entry->rt_info.daddr = iph->daddr; | ||
| 313 | entry->rt_info.saddr = iph->saddr; | ||
| 314 | } | ||
| 315 | |||
| 316 | nskb = ipq_build_packet_message(entry, &status); | 302 | nskb = ipq_build_packet_message(entry, &status); |
| 317 | if (nskb == NULL) | 303 | if (nskb == NULL) |
| 318 | goto err_out_free; | 304 | goto err_out_free; |
| @@ -388,24 +374,11 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) | |||
| 388 | } | 374 | } |
| 389 | skb_put(e->skb, diff); | 375 | skb_put(e->skb, diff); |
| 390 | } | 376 | } |
| 391 | if (!skb_ip_make_writable(&e->skb, v->data_len)) | 377 | if (!skb_make_writable(&e->skb, v->data_len)) |
| 392 | return -ENOMEM; | 378 | return -ENOMEM; |
| 393 | memcpy(e->skb->data, v->payload, v->data_len); | 379 | memcpy(e->skb->data, v->payload, v->data_len); |
| 394 | e->skb->ip_summed = CHECKSUM_NONE; | 380 | e->skb->ip_summed = CHECKSUM_NONE; |
| 395 | e->skb->nfcache |= NFC_ALTERED; | 381 | |
| 396 | |||
| 397 | /* | ||
| 398 | * Extra routing may be needed on local out, as the QUEUE target never | ||
| 399 | * returns control to the table. | ||
| 400 | */ | ||
| 401 | if (e->info->hook == NF_IP_LOCAL_OUT) { | ||
| 402 | struct iphdr *iph = e->skb->nh.iph; | ||
| 403 | |||
| 404 | if (!(iph->tos == e->rt_info.tos | ||
| 405 | && iph->daddr == e->rt_info.daddr | ||
| 406 | && iph->saddr == e->rt_info.saddr)) | ||
| 407 | return ip_route_me_harder(&e->skb); | ||
| 408 | } | ||
| 409 | return 0; | 382 | return 0; |
| 410 | } | 383 | } |
| 411 | 384 | ||
| @@ -683,6 +656,11 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length) | |||
| 683 | } | 656 | } |
| 684 | #endif /* CONFIG_PROC_FS */ | 657 | #endif /* CONFIG_PROC_FS */ |
| 685 | 658 | ||
| 659 | static struct nf_queue_handler nfqh = { | ||
| 660 | .name = "ip_queue", | ||
| 661 | .outfn = &ipq_enqueue_packet, | ||
| 662 | }; | ||
| 663 | |||
| 686 | static int | 664 | static int |
| 687 | init_or_cleanup(int init) | 665 | init_or_cleanup(int init) |
| 688 | { | 666 | { |
| @@ -693,7 +671,8 @@ init_or_cleanup(int init) | |||
| 693 | goto cleanup; | 671 | goto cleanup; |
| 694 | 672 | ||
| 695 | netlink_register_notifier(&ipq_nl_notifier); | 673 | netlink_register_notifier(&ipq_nl_notifier); |
| 696 | ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk); | 674 | ipqnl = netlink_kernel_create(NETLINK_FIREWALL, 0, ipq_rcv_sk, |
| 675 | THIS_MODULE); | ||
| 697 | if (ipqnl == NULL) { | 676 | if (ipqnl == NULL) { |
| 698 | printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); | 677 | printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); |
| 699 | goto cleanup_netlink_notifier; | 678 | goto cleanup_netlink_notifier; |
| @@ -710,7 +689,7 @@ init_or_cleanup(int init) | |||
| 710 | register_netdevice_notifier(&ipq_dev_notifier); | 689 | register_netdevice_notifier(&ipq_dev_notifier); |
| 711 | ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0); | 690 | ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0); |
| 712 | 691 | ||
| 713 | status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL); | 692 | status = nf_register_queue_handler(PF_INET, &nfqh); |
| 714 | if (status < 0) { | 693 | if (status < 0) { |
| 715 | printk(KERN_ERR "ip_queue: failed to register queue handler\n"); | 694 | printk(KERN_ERR "ip_queue: failed to register queue handler\n"); |
| 716 | goto cleanup_sysctl; | 695 | goto cleanup_sysctl; |
| @@ -718,7 +697,7 @@ init_or_cleanup(int init) | |||
| 718 | return status; | 697 | return status; |
| 719 | 698 | ||
| 720 | cleanup: | 699 | cleanup: |
| 721 | nf_unregister_queue_handler(PF_INET); | 700 | nf_unregister_queue_handlers(&nfqh); |
| 722 | synchronize_net(); | 701 | synchronize_net(); |
| 723 | ipq_flush(NF_DROP); | 702 | ipq_flush(NF_DROP); |
| 724 | 703 | ||
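
ip_queue now registers through the new `struct nf_queue_handler`, which carries a name and lets the core pass the target queue number to the callback; unregistration correspondingly switches to `nf_unregister_queue_handlers()`, which removes the handler everywhere it was registered. A sketch of the registration pattern (hypothetical handler; `outfn` signature as in `ipq_enqueue_packet` above):

```c
/* Hypothetical queue handler: sketch of the registration API this
 * patch moves ip_queue onto. */
static int example_outfn(struct sk_buff *skb, struct nf_info *info,
                         unsigned int queuenum, void *data)
{
        /* hand the packet to userspace, or reinject via nf_reinject() */
        return 0;
}

static struct nf_queue_handler example_qh = {
        .name  = "example",
        .outfn = &example_outfn,
};

/* init:  nf_register_queue_handler(PF_INET, &example_qh);
 * exit:  nf_unregister_queue_handlers(&example_qh);
 */
```
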
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index c88dfcd38c56..eef99a1b5de6 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c | |||
| @@ -312,7 +312,6 @@ ipt_do_table(struct sk_buff **pskb, | |||
| 312 | do { | 312 | do { |
| 313 | IP_NF_ASSERT(e); | 313 | IP_NF_ASSERT(e); |
| 314 | IP_NF_ASSERT(back); | 314 | IP_NF_ASSERT(back); |
| 315 | (*pskb)->nfcache |= e->nfcache; | ||
| 316 | if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) { | 315 | if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) { |
| 317 | struct ipt_entry_target *t; | 316 | struct ipt_entry_target *t; |
| 318 | 317 | ||
| @@ -341,8 +340,8 @@ ipt_do_table(struct sk_buff **pskb, | |||
| 341 | back->comefrom); | 340 | back->comefrom); |
| 342 | continue; | 341 | continue; |
| 343 | } | 342 | } |
| 344 | if (table_base + v | 343 | if (table_base + v != (void *)e + e->next_offset |
| 345 | != (void *)e + e->next_offset) { | 344 | && !(e->ip.flags & IPT_F_GOTO)) { |
| 346 | /* Save old back ptr in next entry */ | 345 | /* Save old back ptr in next entry */ |
| 347 | struct ipt_entry *next | 346 | struct ipt_entry *next |
| 348 | = (void *)e + e->next_offset; | 347 | = (void *)e + e->next_offset; |
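
The added `IPT_F_GOTO` test implements iptables' `-g/--goto` semantics: a plain jump (`-j chain`) saves a return address in the next entry so a later RETURN resumes there, while a goto leaves the saved back pointer untouched, so RETURN unwinds one level further up. A standalone toy model of the two dispatch modes (hypothetical names, not kernel code):

```c
/* Toy model of jump vs. goto chain dispatch. */
#include <stdio.h>

enum dispatch { JUMP, GOTO };

static const char *back = "caller-of-caller";

static void enter_chain(enum dispatch how)
{
        if (how == JUMP)
                back = "next-rule-after-jump"; /* RETURN resumes here */
        /* with GOTO, 'back' is left alone, exactly like the skipped
         * back-pointer update in ipt_do_table() above */
        printf("RETURN goes to: %s\n", back);
}

int main(void)
{
        enter_chain(GOTO); /* prints "caller-of-caller" */
        enter_chain(JUMP); /* prints "next-rule-after-jump" */
        return 0;
}
```
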
diff --git a/net/ipv4/netfilter/ipt_CLASSIFY.c b/net/ipv4/netfilter/ipt_CLASSIFY.c index 9842e6e23184..dab78d8bd494 100644 --- a/net/ipv4/netfilter/ipt_CLASSIFY.c +++ b/net/ipv4/netfilter/ipt_CLASSIFY.c | |||
| @@ -32,10 +32,8 @@ target(struct sk_buff **pskb, | |||
| 32 | { | 32 | { |
| 33 | const struct ipt_classify_target_info *clinfo = targinfo; | 33 | const struct ipt_classify_target_info *clinfo = targinfo; |
| 34 | 34 | ||
| 35 | if((*pskb)->priority != clinfo->priority) { | 35 | if((*pskb)->priority != clinfo->priority) |
| 36 | (*pskb)->priority = clinfo->priority; | 36 | (*pskb)->priority = clinfo->priority; |
| 37 | (*pskb)->nfcache |= NFC_ALTERED; | ||
| 38 | } | ||
| 39 | 37 | ||
| 40 | return IPT_CONTINUE; | 38 | return IPT_CONTINUE; |
| 41 | } | 39 | } |
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 6706d3a1bc4f..2d05cafec221 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c | |||
| @@ -367,7 +367,7 @@ target(struct sk_buff **pskb, | |||
| 367 | #ifdef DEBUG_CLUSTERP | 367 | #ifdef DEBUG_CLUSTERP |
| 368 | DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); | 368 | DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); |
| 369 | #endif | 369 | #endif |
| 370 | DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark); | 370 | DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark); |
| 371 | if (!clusterip_responsible(cipinfo->config, hash)) { | 371 | if (!clusterip_responsible(cipinfo->config, hash)) { |
| 372 | DEBUGP("not responsible\n"); | 372 | DEBUGP("not responsible\n"); |
| 373 | return NF_DROP; | 373 | return NF_DROP; |
diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c index 30ddd3e18eb7..134638021339 100644 --- a/net/ipv4/netfilter/ipt_CONNMARK.c +++ b/net/ipv4/netfilter/ipt_CONNMARK.c | |||
| @@ -40,9 +40,9 @@ target(struct sk_buff **pskb, | |||
| 40 | void *userinfo) | 40 | void *userinfo) |
| 41 | { | 41 | { |
| 42 | const struct ipt_connmark_target_info *markinfo = targinfo; | 42 | const struct ipt_connmark_target_info *markinfo = targinfo; |
| 43 | unsigned long diff; | 43 | u_int32_t diff; |
| 44 | unsigned long nfmark; | 44 | u_int32_t nfmark; |
| 45 | unsigned long newmark; | 45 | u_int32_t newmark; |
| 46 | 46 | ||
| 47 | enum ip_conntrack_info ctinfo; | 47 | enum ip_conntrack_info ctinfo; |
| 48 | struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); | 48 | struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); |
| @@ -61,10 +61,8 @@ target(struct sk_buff **pskb, | |||
| 61 | case IPT_CONNMARK_RESTORE: | 61 | case IPT_CONNMARK_RESTORE: |
| 62 | nfmark = (*pskb)->nfmark; | 62 | nfmark = (*pskb)->nfmark; |
| 63 | diff = (ct->mark ^ nfmark) & markinfo->mask; | 63 | diff = (ct->mark ^ nfmark) & markinfo->mask; |
| 64 | if (diff != 0) { | 64 | if (diff != 0) |
| 65 | (*pskb)->nfmark = nfmark ^ diff; | 65 | (*pskb)->nfmark = nfmark ^ diff; |
| 66 | (*pskb)->nfcache |= NFC_ALTERED; | ||
| 67 | } | ||
| 68 | break; | 66 | break; |
| 69 | } | 67 | } |
| 70 | } | 68 | } |
| @@ -94,6 +92,11 @@ checkentry(const char *tablename, | |||
| 94 | } | 92 | } |
| 95 | } | 93 | } |
| 96 | 94 | ||
| 95 | if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) { | ||
| 96 | printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n"); | ||
| 97 | return 0; | ||
| 98 | } | ||
| 99 | |||
| 97 | return 1; | 100 | return 1; |
| 98 | } | 101 | } |
| 99 | 102 | ||
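
The new `checkentry()` guard is not redundant: the userspace-facing `mark`/`mask` fields are `unsigned long`, so on a 64-bit kernel they can carry values that no longer fit in the now strictly 32-bit `skb->nfmark`. A standalone toy of the truncation the check rejects:

```c
/* Standalone toy: the silent truncation that the > 0xffffffff check
 * rejects on 64-bit builds (where unsigned long is 64 bits wide). */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        unsigned long user_mark = 0x1ffffffffUL; /* 33 significant bits */
        uint32_t nfmark = (uint32_t)user_mark;   /* high bit silently lost */

        printf("requested 0x%lx, stored 0x%x\n",
               user_mark, (unsigned int)nfmark);
        return 0;
}
```
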
diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c index 3ea4509099f9..6e319570a28c 100644 --- a/net/ipv4/netfilter/ipt_DSCP.c +++ b/net/ipv4/netfilter/ipt_DSCP.c | |||
| @@ -39,7 +39,7 @@ target(struct sk_buff **pskb, | |||
| 39 | if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) { | 39 | if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) { |
| 40 | u_int16_t diffs[2]; | 40 | u_int16_t diffs[2]; |
| 41 | 41 | ||
| 42 | if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) | 42 | if (!skb_make_writable(pskb, sizeof(struct iphdr))) |
| 43 | return NF_DROP; | 43 | return NF_DROP; |
| 44 | 44 | ||
| 45 | diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; | 45 | diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; |
| @@ -51,7 +51,6 @@ target(struct sk_buff **pskb, | |||
| 51 | sizeof(diffs), | 51 | sizeof(diffs), |
| 52 | (*pskb)->nh.iph->check | 52 | (*pskb)->nh.iph->check |
| 53 | ^ 0xFFFF)); | 53 | ^ 0xFFFF)); |
| 54 | (*pskb)->nfcache |= NFC_ALTERED; | ||
| 55 | } | 54 | } |
| 56 | return IPT_CONTINUE; | 55 | return IPT_CONTINUE; |
| 57 | } | 56 | } |
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index 94a0ce1c1c9d..a1319693f648 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c | |||
| @@ -31,7 +31,7 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo) | |||
| 31 | != (einfo->ip_ect & IPT_ECN_IP_MASK)) { | 31 | != (einfo->ip_ect & IPT_ECN_IP_MASK)) { |
| 32 | u_int16_t diffs[2]; | 32 | u_int16_t diffs[2]; |
| 33 | 33 | ||
| 34 | if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) | 34 | if (!skb_make_writable(pskb, sizeof(struct iphdr))) |
| 35 | return 0; | 35 | return 0; |
| 36 | 36 | ||
| 37 | diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; | 37 | diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; |
| @@ -43,7 +43,6 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo) | |||
| 43 | sizeof(diffs), | 43 | sizeof(diffs), |
| 44 | (*pskb)->nh.iph->check | 44 | (*pskb)->nh.iph->check |
| 45 | ^0xFFFF)); | 45 | ^0xFFFF)); |
| 46 | (*pskb)->nfcache |= NFC_ALTERED; | ||
| 47 | } | 46 | } |
| 48 | return 1; | 47 | return 1; |
| 49 | } | 48 | } |
| @@ -67,7 +66,7 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward) | |||
| 67 | tcph->cwr == einfo->proto.tcp.cwr))) | 66 | tcph->cwr == einfo->proto.tcp.cwr))) |
| 68 | return 1; | 67 | return 1; |
| 69 | 68 | ||
| 70 | if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) | 69 | if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) |
| 71 | return 0; | 70 | return 0; |
| 72 | tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4; | 71 | tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4; |
| 73 | 72 | ||
| @@ -87,7 +86,6 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward) | |||
| 87 | tcph->check = csum_fold(csum_partial((char *)diffs, | 86 | tcph->check = csum_fold(csum_partial((char *)diffs, |
| 88 | sizeof(diffs), | 87 | sizeof(diffs), |
| 89 | tcph->check^0xFFFF)); | 88 | tcph->check^0xFFFF)); |
| 90 | (*pskb)->nfcache |= NFC_ALTERED; | ||
| 91 | return 1; | 89 | return 1; |
| 92 | } | 90 | } |
| 93 | 91 | ||
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index ef08733d26da..92ed050fac69 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c | |||
| @@ -27,10 +27,6 @@ MODULE_LICENSE("GPL"); | |||
| 27 | MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); | 27 | MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); |
| 28 | MODULE_DESCRIPTION("iptables syslog logging module"); | 28 | MODULE_DESCRIPTION("iptables syslog logging module"); |
| 29 | 29 | ||
| 30 | static unsigned int nflog = 1; | ||
| 31 | module_param(nflog, int, 0400); | ||
| 32 | MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); | ||
| 33 | |||
| 34 | #if 0 | 30 | #if 0 |
| 35 | #define DEBUGP printk | 31 | #define DEBUGP printk |
| 36 | #else | 32 | #else |
| @@ -41,11 +37,17 @@ MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); | |||
| 41 | static DEFINE_SPINLOCK(log_lock); | 37 | static DEFINE_SPINLOCK(log_lock); |
| 42 | 38 | ||
| 43 | /* One level of recursion won't kill us */ | 39 | /* One level of recursion won't kill us */ |
| 44 | static void dump_packet(const struct ipt_log_info *info, | 40 | static void dump_packet(const struct nf_loginfo *info, |
| 45 | const struct sk_buff *skb, | 41 | const struct sk_buff *skb, |
| 46 | unsigned int iphoff) | 42 | unsigned int iphoff) |
| 47 | { | 43 | { |
| 48 | struct iphdr _iph, *ih; | 44 | struct iphdr _iph, *ih; |
| 45 | unsigned int logflags; | ||
| 46 | |||
| 47 | if (info->type == NF_LOG_TYPE_LOG) | ||
| 48 | logflags = info->u.log.logflags; | ||
| 49 | else | ||
| 50 | logflags = NF_LOG_MASK; | ||
| 49 | 51 | ||
| 50 | ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); | 52 | ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); |
| 51 | if (ih == NULL) { | 53 | if (ih == NULL) { |
| @@ -76,7 +78,7 @@ static void dump_packet(const struct ipt_log_info *info, | |||
| 76 | if (ntohs(ih->frag_off) & IP_OFFSET) | 78 | if (ntohs(ih->frag_off) & IP_OFFSET) |
| 77 | printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); | 79 | printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); |
| 78 | 80 | ||
| 79 | if ((info->logflags & IPT_LOG_IPOPT) | 81 | if ((logflags & IPT_LOG_IPOPT) |
| 80 | && ih->ihl * 4 > sizeof(struct iphdr)) { | 82 | && ih->ihl * 4 > sizeof(struct iphdr)) { |
| 81 | unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op; | 83 | unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op; |
| 82 | unsigned int i, optsize; | 84 | unsigned int i, optsize; |
| @@ -119,7 +121,7 @@ static void dump_packet(const struct ipt_log_info *info, | |||
| 119 | printk("SPT=%u DPT=%u ", | 121 | printk("SPT=%u DPT=%u ", |
| 120 | ntohs(th->source), ntohs(th->dest)); | 122 | ntohs(th->source), ntohs(th->dest)); |
| 121 | /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ | 123 | /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ |
| 122 | if (info->logflags & IPT_LOG_TCPSEQ) | 124 | if (logflags & IPT_LOG_TCPSEQ) |
| 123 | printk("SEQ=%u ACK=%u ", | 125 | printk("SEQ=%u ACK=%u ", |
| 124 | ntohl(th->seq), ntohl(th->ack_seq)); | 126 | ntohl(th->seq), ntohl(th->ack_seq)); |
| 125 | /* Max length: 13 "WINDOW=65535 " */ | 127 | /* Max length: 13 "WINDOW=65535 " */ |
| @@ -146,7 +148,7 @@ static void dump_packet(const struct ipt_log_info *info, | |||
| 146 | /* Max length: 11 "URGP=65535 " */ | 148 | /* Max length: 11 "URGP=65535 " */ |
| 147 | printk("URGP=%u ", ntohs(th->urg_ptr)); | 149 | printk("URGP=%u ", ntohs(th->urg_ptr)); |
| 148 | 150 | ||
| 149 | if ((info->logflags & IPT_LOG_TCPOPT) | 151 | if ((logflags & IPT_LOG_TCPOPT) |
| 150 | && th->doff * 4 > sizeof(struct tcphdr)) { | 152 | && th->doff * 4 > sizeof(struct tcphdr)) { |
| 151 | unsigned char _opt[4 * 15 - sizeof(struct tcphdr)]; | 153 | unsigned char _opt[4 * 15 - sizeof(struct tcphdr)]; |
| 152 | unsigned char *op; | 154 | unsigned char *op; |
| @@ -328,7 +330,7 @@ static void dump_packet(const struct ipt_log_info *info, | |||
| 328 | } | 330 | } |
| 329 | 331 | ||
| 330 | /* Max length: 15 "UID=4294967295 " */ | 332 | /* Max length: 15 "UID=4294967295 " */ |
| 331 | if ((info->logflags & IPT_LOG_UID) && !iphoff && skb->sk) { | 333 | if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { |
| 332 | read_lock_bh(&skb->sk->sk_callback_lock); | 334 | read_lock_bh(&skb->sk->sk_callback_lock); |
| 333 | if (skb->sk->sk_socket && skb->sk->sk_socket->file) | 335 | if (skb->sk->sk_socket && skb->sk->sk_socket->file) |
| 334 | printk("UID=%u ", skb->sk->sk_socket->file->f_uid); | 336 | printk("UID=%u ", skb->sk->sk_socket->file->f_uid); |
| @@ -349,19 +351,31 @@ static void dump_packet(const struct ipt_log_info *info, | |||
| 349 | /* maxlen = 230+ 91 + 230 + 252 = 803 */ | 351 | /* maxlen = 230+ 91 + 230 + 252 = 803 */ |
| 350 | } | 352 | } |
| 351 | 353 | ||
| 354 | struct nf_loginfo default_loginfo = { | ||
| 355 | .type = NF_LOG_TYPE_LOG, | ||
| 356 | .u = { | ||
| 357 | .log = { | ||
| 358 | .level = 0, | ||
| 359 | .logflags = NF_LOG_MASK, | ||
| 360 | }, | ||
| 361 | }, | ||
| 362 | }; | ||
| 363 | |||
| 352 | static void | 364 | static void |
| 353 | ipt_log_packet(unsigned int hooknum, | 365 | ipt_log_packet(unsigned int pf, |
| 366 | unsigned int hooknum, | ||
| 354 | const struct sk_buff *skb, | 367 | const struct sk_buff *skb, |
| 355 | const struct net_device *in, | 368 | const struct net_device *in, |
| 356 | const struct net_device *out, | 369 | const struct net_device *out, |
| 357 | const struct ipt_log_info *loginfo, | 370 | const struct nf_loginfo *loginfo, |
| 358 | const char *level_string, | ||
| 359 | const char *prefix) | 371 | const char *prefix) |
| 360 | { | 372 | { |
| 373 | if (!loginfo) | ||
| 374 | loginfo = &default_loginfo; | ||
| 375 | |||
| 361 | spin_lock_bh(&log_lock); | 376 | spin_lock_bh(&log_lock); |
| 362 | printk(level_string); | 377 | printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, |
| 363 | printk("%sIN=%s OUT=%s ", | 378 | prefix, |
| 364 | prefix == NULL ? loginfo->prefix : prefix, | ||
| 365 | in ? in->name : "", | 379 | in ? in->name : "", |
| 366 | out ? out->name : ""); | 380 | out ? out->name : ""); |
| 367 | #ifdef CONFIG_BRIDGE_NETFILTER | 381 | #ifdef CONFIG_BRIDGE_NETFILTER |
| @@ -405,28 +419,15 @@ ipt_log_target(struct sk_buff **pskb, | |||
| 405 | void *userinfo) | 419 | void *userinfo) |
| 406 | { | 420 | { |
| 407 | const struct ipt_log_info *loginfo = targinfo; | 421 | const struct ipt_log_info *loginfo = targinfo; |
| 408 | char level_string[4] = "< >"; | 422 | struct nf_loginfo li; |
| 409 | 423 | ||
| 410 | level_string[1] = '0' + (loginfo->level % 8); | 424 | li.type = NF_LOG_TYPE_LOG; |
| 411 | ipt_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL); | 425 | li.u.log.level = loginfo->level; |
| 426 | li.u.log.logflags = loginfo->logflags; | ||
| 412 | 427 | ||
| 413 | return IPT_CONTINUE; | 428 | nf_log_packet(PF_INET, hooknum, *pskb, in, out, &li, loginfo->prefix); |
| 414 | } | ||
| 415 | 429 | ||
| 416 | static void | 430 | return IPT_CONTINUE; |
| 417 | ipt_logfn(unsigned int hooknum, | ||
| 418 | const struct sk_buff *skb, | ||
| 419 | const struct net_device *in, | ||
| 420 | const struct net_device *out, | ||
| 421 | const char *prefix) | ||
| 422 | { | ||
| 423 | struct ipt_log_info loginfo = { | ||
| 424 | .level = 0, | ||
| 425 | .logflags = IPT_LOG_MASK, | ||
| 426 | .prefix = "" | ||
| 427 | }; | ||
| 428 | |||
| 429 | ipt_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix); | ||
| 430 | } | 431 | } |
| 431 | 432 | ||
| 432 | static int ipt_log_checkentry(const char *tablename, | 433 | static int ipt_log_checkentry(const char *tablename, |
| @@ -464,20 +465,29 @@ static struct ipt_target ipt_log_reg = { | |||
| 464 | .me = THIS_MODULE, | 465 | .me = THIS_MODULE, |
| 465 | }; | 466 | }; |
| 466 | 467 | ||
| 468 | static struct nf_logger ipt_log_logger ={ | ||
| 469 | .name = "ipt_LOG", | ||
| 470 | .logfn = &ipt_log_packet, | ||
| 471 | .me = THIS_MODULE, | ||
| 472 | }; | ||
| 473 | |||
| 467 | static int __init init(void) | 474 | static int __init init(void) |
| 468 | { | 475 | { |
| 469 | if (ipt_register_target(&ipt_log_reg)) | 476 | if (ipt_register_target(&ipt_log_reg)) |
| 470 | return -EINVAL; | 477 | return -EINVAL; |
| 471 | if (nflog) | 478 | if (nf_log_register(PF_INET, &ipt_log_logger) < 0) { |
| 472 | nf_log_register(PF_INET, &ipt_logfn); | 479 | printk(KERN_WARNING "ipt_LOG: not logging via system console " |
| 480 | "since somebody else already registered for PF_INET\n"); | ||
| 481 | /* we cannot make module load fail here, since otherwise | ||
| 482 | * iptables userspace would abort */ | ||
| 483 | } | ||
| 473 | 484 | ||
| 474 | return 0; | 485 | return 0; |
| 475 | } | 486 | } |
| 476 | 487 | ||
| 477 | static void __exit fini(void) | 488 | static void __exit fini(void) |
| 478 | { | 489 | { |
| 479 | if (nflog) | 490 | nf_log_unregister_logger(&ipt_log_logger); |
| 480 | nf_log_unregister(PF_INET, &ipt_logfn); | ||
| 481 | ipt_unregister_target(&ipt_log_reg); | 491 | ipt_unregister_target(&ipt_log_reg); |
| 482 | } | 492 | } |
| 483 | 493 | ||
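
With `ipt_LOG` registered as a `struct nf_logger`, any kernel code can emit through whichever backend owns the family by calling `nf_log_packet()` with a `struct nf_loginfo`, exactly as `ipt_log_target()` now does. A hedged sketch of such a caller (hypothetical values; the prefix string is passed as the format argument, as in the hunk above):

```c
/* Hypothetical caller of the nf_log_packet() path used above. */
static void example_log(unsigned int hooknum, struct sk_buff *skb,
                        const struct net_device *in,
                        const struct net_device *out)
{
        struct nf_loginfo li = {
                .type = NF_LOG_TYPE_LOG,
                .u = {
                        .log = {
                                .level    = 4,           /* printk level */
                                .logflags = NF_LOG_MASK, /* log everything */
                        },
                },
        };

        nf_log_packet(PF_INET, hooknum, skb, in, out, &li, "example: ");
}
```
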
diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c index 33c6f9b63b8d..52b4f2c296bf 100644 --- a/net/ipv4/netfilter/ipt_MARK.c +++ b/net/ipv4/netfilter/ipt_MARK.c | |||
| @@ -29,10 +29,9 @@ target_v0(struct sk_buff **pskb, | |||
| 29 | { | 29 | { |
| 30 | const struct ipt_mark_target_info *markinfo = targinfo; | 30 | const struct ipt_mark_target_info *markinfo = targinfo; |
| 31 | 31 | ||
| 32 | if((*pskb)->nfmark != markinfo->mark) { | 32 | if((*pskb)->nfmark != markinfo->mark) |
| 33 | (*pskb)->nfmark = markinfo->mark; | 33 | (*pskb)->nfmark = markinfo->mark; |
| 34 | (*pskb)->nfcache |= NFC_ALTERED; | 34 | |
| 35 | } | ||
| 36 | return IPT_CONTINUE; | 35 | return IPT_CONTINUE; |
| 37 | } | 36 | } |
| 38 | 37 | ||
| @@ -61,10 +60,9 @@ target_v1(struct sk_buff **pskb, | |||
| 61 | break; | 60 | break; |
| 62 | } | 61 | } |
| 63 | 62 | ||
| 64 | if((*pskb)->nfmark != mark) { | 63 | if((*pskb)->nfmark != mark) |
| 65 | (*pskb)->nfmark = mark; | 64 | (*pskb)->nfmark = mark; |
| 66 | (*pskb)->nfcache |= NFC_ALTERED; | 65 | |
| 67 | } | ||
| 68 | return IPT_CONTINUE; | 66 | return IPT_CONTINUE; |
| 69 | } | 67 | } |
| 70 | 68 | ||
| @@ -76,6 +74,8 @@ checkentry_v0(const char *tablename, | |||
| 76 | unsigned int targinfosize, | 74 | unsigned int targinfosize, |
| 77 | unsigned int hook_mask) | 75 | unsigned int hook_mask) |
| 78 | { | 76 | { |
| 77 | struct ipt_mark_target_info *markinfo = targinfo; | ||
| 78 | |||
| 79 | if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) { | 79 | if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) { |
| 80 | printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", | 80 | printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", |
| 81 | targinfosize, | 81 | targinfosize, |
| @@ -88,6 +88,11 @@ checkentry_v0(const char *tablename, | |||
| 88 | return 0; | 88 | return 0; |
| 89 | } | 89 | } |
| 90 | 90 | ||
| 91 | if (markinfo->mark > 0xffffffff) { | ||
| 92 | printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); | ||
| 93 | return 0; | ||
| 94 | } | ||
| 95 | |||
| 91 | return 1; | 96 | return 1; |
| 92 | } | 97 | } |
| 93 | 98 | ||
| @@ -120,6 +125,11 @@ checkentry_v1(const char *tablename, | |||
| 120 | return 0; | 125 | return 0; |
| 121 | } | 126 | } |
| 122 | 127 | ||
| 128 | if (markinfo->mark > 0xffffffff) { | ||
| 129 | printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); | ||
| 130 | return 0; | ||
| 131 | } | ||
| 132 | |||
| 123 | return 1; | 133 | return 1; |
| 124 | } | 134 | } |
| 125 | 135 | ||
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 91e74502c3d3..2f3e181c8e97 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c | |||
| @@ -86,11 +86,6 @@ masquerade_target(struct sk_buff **pskb, | |||
| 86 | 86 | ||
| 87 | IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); | 87 | IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); |
| 88 | 88 | ||
| 89 | /* FIXME: For the moment, don't do local packets, breaks | ||
| 90 | testsuite for 2.3.49 --RR */ | ||
| 91 | if ((*pskb)->sk) | ||
| 92 | return NF_ACCEPT; | ||
| 93 | |||
| 94 | ct = ip_conntrack_get(*pskb, &ctinfo); | 89 | ct = ip_conntrack_get(*pskb, &ctinfo); |
| 95 | IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED | 90 | IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED |
| 96 | || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); | 91 | || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); |
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c index 06254b29d034..e6e7b6095363 100644 --- a/net/ipv4/netfilter/ipt_NETMAP.c +++ b/net/ipv4/netfilter/ipt_NETMAP.c | |||
| @@ -46,7 +46,8 @@ check(const char *tablename, | |||
| 46 | DEBUGP(MODULENAME":check: size %u.\n", targinfosize); | 46 | DEBUGP(MODULENAME":check: size %u.\n", targinfosize); |
| 47 | return 0; | 47 | return 0; |
| 48 | } | 48 | } |
| 49 | if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING))) { | 49 | if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING) | |
| 50 | (1 << NF_IP_LOCAL_OUT))) { | ||
| 50 | DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask); | 51 | DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask); |
| 51 | return 0; | 52 | return 0; |
| 52 | } | 53 | } |
| @@ -76,12 +77,13 @@ target(struct sk_buff **pskb, | |||
| 76 | struct ip_nat_range newrange; | 77 | struct ip_nat_range newrange; |
| 77 | 78 | ||
| 78 | IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING | 79 | IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING |
| 79 | || hooknum == NF_IP_POST_ROUTING); | 80 | || hooknum == NF_IP_POST_ROUTING |
| 81 | || hooknum == NF_IP_LOCAL_OUT); | ||
| 80 | ct = ip_conntrack_get(*pskb, &ctinfo); | 82 | ct = ip_conntrack_get(*pskb, &ctinfo); |
| 81 | 83 | ||
| 82 | netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); | 84 | netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); |
| 83 | 85 | ||
| 84 | if (hooknum == NF_IP_PRE_ROUTING) | 86 | if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT) |
| 85 | new_ip = (*pskb)->nh.iph->daddr & ~netmask; | 87 | new_ip = (*pskb)->nh.iph->daddr & ~netmask; |
| 86 | else | 88 | else |
| 87 | new_ip = (*pskb)->nh.iph->saddr & ~netmask; | 89 | new_ip = (*pskb)->nh.iph->saddr & ~netmask; |
diff --git a/net/ipv4/netfilter/ipt_NFQUEUE.c b/net/ipv4/netfilter/ipt_NFQUEUE.c new file mode 100644 index 000000000000..3cedc9be8807 --- /dev/null +++ b/net/ipv4/netfilter/ipt_NFQUEUE.c | |||
| @@ -0,0 +1,70 @@ | |||
| 1 | /* iptables module for using new netfilter netlink queue | ||
| 2 | * | ||
| 3 | * (C) 2005 by Harald Welte <laforge@netfilter.org> | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or modify | ||
| 6 | * it under the terms of the GNU General Public License version 2 as | ||
| 7 | * published by the Free Software Foundation. | ||
| 8 | * | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include <linux/module.h> | ||
| 12 | #include <linux/skbuff.h> | ||
| 13 | |||
| 14 | #include <linux/netfilter.h> | ||
| 15 | #include <linux/netfilter_ipv4/ip_tables.h> | ||
| 16 | #include <linux/netfilter_ipv4/ipt_NFQUEUE.h> | ||
| 17 | |||
| 18 | MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); | ||
| 19 | MODULE_DESCRIPTION("iptables NFQUEUE target"); | ||
| 20 | MODULE_LICENSE("GPL"); | ||
| 21 | |||
| 22 | static unsigned int | ||
| 23 | target(struct sk_buff **pskb, | ||
| 24 | const struct net_device *in, | ||
| 25 | const struct net_device *out, | ||
| 26 | unsigned int hooknum, | ||
| 27 | const void *targinfo, | ||
| 28 | void *userinfo) | ||
| 29 | { | ||
| 30 | const struct ipt_NFQ_info *tinfo = targinfo; | ||
| 31 | |||
| 32 | return NF_QUEUE_NR(tinfo->queuenum); | ||
| 33 | } | ||
| 34 | |||
| 35 | static int | ||
| 36 | checkentry(const char *tablename, | ||
| 37 | const struct ipt_entry *e, | ||
| 38 | void *targinfo, | ||
| 39 | unsigned int targinfosize, | ||
| 40 | unsigned int hook_mask) | ||
| 41 | { | ||
| 42 | if (targinfosize != IPT_ALIGN(sizeof(struct ipt_NFQ_info))) { | ||
| 43 | printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n", | ||
| 44 | targinfosize, | ||
| 45 | IPT_ALIGN(sizeof(struct ipt_NFQ_info))); | ||
| 46 | return 0; | ||
| 47 | } | ||
| 48 | |||
| 49 | return 1; | ||
| 50 | } | ||
| 51 | |||
| 52 | static struct ipt_target ipt_NFQ_reg = { | ||
| 53 | .name = "NFQUEUE", | ||
| 54 | .target = target, | ||
| 55 | .checkentry = checkentry, | ||
| 56 | .me = THIS_MODULE, | ||
| 57 | }; | ||
| 58 | |||
| 59 | static int __init init(void) | ||
| 60 | { | ||
| 61 | return ipt_register_target(&ipt_NFQ_reg); | ||
| 62 | } | ||
| 63 | |||
| 64 | static void __exit fini(void) | ||
| 65 | { | ||
| 66 | ipt_unregister_target(&ipt_NFQ_reg); | ||
| 67 | } | ||
| 68 | |||
| 69 | module_init(init); | ||
| 70 | module_exit(fini); | ||
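
The target is deliberately tiny: the whole policy decision is the verdict `NF_QUEUE_NR(tinfo->queuenum)`, which packs the configured queue number into the verdict word so the netfilter core can route the packet to the matching handler's `outfn` (the `queuenum` argument added to `ipq_enqueue_packet()` earlier in this patch). Userspace selects the queue with the corresponding rule, e.g. `iptables -A INPUT -j NFQUEUE --queue-num 0`, assuming an iptables build that ships the NFQUEUE extension.
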
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 915696446020..f115a84a4ac6 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c | |||
| @@ -156,7 +156,6 @@ static void send_reset(struct sk_buff *oldskb, int hook) | |||
| 156 | 156 | ||
| 157 | /* This packet will not be the same as the other: clear nf fields */ | 157 | /* This packet will not be the same as the other: clear nf fields */ |
| 158 | nf_reset(nskb); | 158 | nf_reset(nskb); |
| 159 | nskb->nfcache = 0; | ||
| 160 | nskb->nfmark = 0; | 159 | nskb->nfmark = 0; |
| 161 | #ifdef CONFIG_BRIDGE_NETFILTER | 160 | #ifdef CONFIG_BRIDGE_NETFILTER |
| 162 | nf_bridge_put(nskb->nf_bridge); | 161 | nf_bridge_put(nskb->nf_bridge); |
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c index 7b84a254440e..8db70d6908c3 100644 --- a/net/ipv4/netfilter/ipt_TCPMSS.c +++ b/net/ipv4/netfilter/ipt_TCPMSS.c | |||
| @@ -58,7 +58,7 @@ ipt_tcpmss_target(struct sk_buff **pskb, | |||
| 58 | unsigned int i; | 58 | unsigned int i; |
| 59 | u_int8_t *opt; | 59 | u_int8_t *opt; |
| 60 | 60 | ||
| 61 | if (!skb_ip_make_writable(pskb, (*pskb)->len)) | 61 | if (!skb_make_writable(pskb, (*pskb)->len)) |
| 62 | return NF_DROP; | 62 | return NF_DROP; |
| 63 | 63 | ||
| 64 | if ((*pskb)->ip_summed == CHECKSUM_HW && | 64 | if ((*pskb)->ip_summed == CHECKSUM_HW && |
| @@ -190,7 +190,6 @@ ipt_tcpmss_target(struct sk_buff **pskb, | |||
| 190 | newmss); | 190 | newmss); |
| 191 | 191 | ||
| 192 | retmodified: | 192 | retmodified: |
| 193 | (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED; | ||
| 194 | return IPT_CONTINUE; | 193 | return IPT_CONTINUE; |
| 195 | } | 194 | } |
| 196 | 195 | ||
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c index 85c70d240f8b..deadb36d4428 100644 --- a/net/ipv4/netfilter/ipt_TOS.c +++ b/net/ipv4/netfilter/ipt_TOS.c | |||
| @@ -33,7 +33,7 @@ target(struct sk_buff **pskb, | |||
| 33 | if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) { | 33 | if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) { |
| 34 | u_int16_t diffs[2]; | 34 | u_int16_t diffs[2]; |
| 35 | 35 | ||
| 36 | if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) | 36 | if (!skb_make_writable(pskb, sizeof(struct iphdr))) |
| 37 | return NF_DROP; | 37 | return NF_DROP; |
| 38 | 38 | ||
| 39 | diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; | 39 | diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; |
| @@ -46,7 +46,6 @@ target(struct sk_buff **pskb, | |||
| 46 | sizeof(diffs), | 46 | sizeof(diffs), |
| 47 | (*pskb)->nh.iph->check | 47 | (*pskb)->nh.iph->check |
| 48 | ^0xFFFF)); | 48 | ^0xFFFF)); |
| 49 | (*pskb)->nfcache |= NFC_ALTERED; | ||
| 50 | } | 49 | } |
| 51 | return IPT_CONTINUE; | 50 | return IPT_CONTINUE; |
| 52 | } | 51 | } |
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c new file mode 100644 index 000000000000..b9ae6a9382f3 --- /dev/null +++ b/net/ipv4/netfilter/ipt_TTL.c | |||
| @@ -0,0 +1,119 @@ | |||
| 1 | /* TTL modification target for IP tables | ||
| 2 | * (C) 2000,2005 by Harald Welte <laforge@netfilter.org> | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License version 2 as | ||
| 6 | * published by the Free Software Foundation. | ||
| 7 | * | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/module.h> | ||
| 11 | #include <linux/skbuff.h> | ||
| 12 | #include <linux/ip.h> | ||
| 13 | #include <net/checksum.h> | ||
| 14 | |||
| 15 | #include <linux/netfilter_ipv4/ip_tables.h> | ||
| 16 | #include <linux/netfilter_ipv4/ipt_TTL.h> | ||
| 17 | |||
| 18 | MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); | ||
| 19 | MODULE_DESCRIPTION("IP tables TTL modification module"); | ||
| 20 | MODULE_LICENSE("GPL"); | ||
| 21 | |||
| 22 | static unsigned int | ||
| 23 | ipt_ttl_target(struct sk_buff **pskb, const struct net_device *in, | ||
| 24 | const struct net_device *out, unsigned int hooknum, | ||
| 25 | const void *targinfo, void *userinfo) | ||
| 26 | { | ||
| 27 | struct iphdr *iph; | ||
| 28 | const struct ipt_TTL_info *info = targinfo; | ||
| 29 | u_int16_t diffs[2]; | ||
| 30 | int new_ttl; | ||
| 31 | |||
| 32 | if (!skb_make_writable(pskb, (*pskb)->len)) | ||
| 33 | return NF_DROP; | ||
| 34 | |||
| 35 | iph = (*pskb)->nh.iph; | ||
| 36 | |||
| 37 | switch (info->mode) { | ||
| 38 | case IPT_TTL_SET: | ||
| 39 | new_ttl = info->ttl; | ||
| 40 | break; | ||
| 41 | case IPT_TTL_INC: | ||
| 42 | new_ttl = iph->ttl + info->ttl; | ||
| 43 | if (new_ttl > 255) | ||
| 44 | new_ttl = 255; | ||
| 45 | break; | ||
| 46 | case IPT_TTL_DEC: | ||
| 47 | new_ttl = iph->ttl - info->ttl; | ||
| 48 | if (new_ttl < 0) | ||
| 49 | new_ttl = 0; | ||
| 50 | break; | ||
| 51 | default: | ||
| 52 | new_ttl = iph->ttl; | ||
| 53 | break; | ||
| 54 | } | ||
| 55 | |||
| 56 | if (new_ttl != iph->ttl) { | ||
| 57 | diffs[0] = htons(((unsigned)iph->ttl) << 8) ^ 0xFFFF; | ||
| 58 | iph->ttl = new_ttl; | ||
| 59 | diffs[1] = htons(((unsigned)iph->ttl) << 8); | ||
| 60 | iph->check = csum_fold(csum_partial((char *)diffs, | ||
| 61 | sizeof(diffs), | ||
| 62 | iph->check^0xFFFF)); | ||
| 63 | } | ||
| 64 | |||
| 65 | return IPT_CONTINUE; | ||
| 66 | } | ||
| 67 | |||
| 68 | static int ipt_ttl_checkentry(const char *tablename, | ||
| 69 | const struct ipt_entry *e, | ||
| 70 | void *targinfo, | ||
| 71 | unsigned int targinfosize, | ||
| 72 | unsigned int hook_mask) | ||
| 73 | { | ||
| 74 | struct ipt_TTL_info *info = targinfo; | ||
| 75 | |||
| 76 | if (targinfosize != IPT_ALIGN(sizeof(struct ipt_TTL_info))) { | ||
| 77 | printk(KERN_WARNING "ipt_TTL: targinfosize %u != %Zu\n", | ||
| 78 | targinfosize, | ||
| 79 | IPT_ALIGN(sizeof(struct ipt_TTL_info))); | ||
| 80 | return 0; | ||
| 81 | } | ||
| 82 | |||
| 83 | if (strcmp(tablename, "mangle")) { | ||
| 84 | printk(KERN_WARNING "ipt_TTL: can only be called from " | ||
| 85 | "\"mangle\" table, not \"%s\"\n", tablename); | ||
| 86 | return 0; | ||
| 87 | } | ||
| 88 | |||
| 89 | if (info->mode > IPT_TTL_MAXMODE) { | ||
| 90 | printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n", | ||
| 91 | info->mode); | ||
| 92 | return 0; | ||
| 93 | } | ||
| 94 | |||
| 95 | if ((info->mode != IPT_TTL_SET) && (info->ttl == 0)) | ||
| 96 | return 0; | ||
| 97 | |||
| 98 | return 1; | ||
| 99 | } | ||
| 100 | |||
| 101 | static struct ipt_target ipt_TTL = { | ||
| 102 | .name = "TTL", | ||
| 103 | .target = ipt_ttl_target, | ||
| 104 | .checkentry = ipt_ttl_checkentry, | ||
| 105 | .me = THIS_MODULE, | ||
| 106 | }; | ||
| 107 | |||
| 108 | static int __init init(void) | ||
| 109 | { | ||
| 110 | return ipt_register_target(&ipt_TTL); | ||
| 111 | } | ||
| 112 | |||
| 113 | static void __exit fini(void) | ||
| 114 | { | ||
| 115 | ipt_unregister_target(&ipt_TTL); | ||
| 116 | } | ||
| 117 | |||
| 118 | module_init(init); | ||
| 119 | module_exit(fini); | ||
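The diffs[]/csum_fold() sequence in ipt_ttl_target() is the standard incremental update of the IP header checksum: rather than recomputing over the whole header, it folds the one's-complemented old 16-bit word and the new word into the existing sum (the TTL shares its 16-bit word with the protocol field, hence the << 8). A stand-alone sketch of the same arithmetic in RFC 1624 form; the sample checksum value is made up for illustration:

    /* Stand-alone sketch of the incremental update (RFC 1624, eqn. 3):
     * HC' = ~(~HC + ~m + m'), all in 16-bit one's-complement arithmetic. */
    #include <stdint.h>
    #include <stdio.h>

    static uint16_t csum16_update(uint16_t check, uint16_t old_word, uint16_t new_word)
    {
            uint32_t sum = (uint16_t)~check;

            sum += (uint16_t)~old_word;
            sum += new_word;
            sum = (sum & 0xffff) + (sum >> 16);   /* fold the carries back in */
            sum = (sum & 0xffff) + (sum >> 16);
            return (uint16_t)~sum;
    }

    int main(void)
    {
            uint16_t check    = 0xb1e6;            /* made-up original checksum */
            uint16_t old_word = (64 << 8) | 6;     /* ttl=64 shares a word with protocol=6 */
            uint16_t new_word = (63 << 8) | 6;     /* ttl decremented by one */

            printf("updated check = 0x%04x\n", csum16_update(check, old_word, new_word));
            return 0;
    }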
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 52a0076302a7..e2c14f3cb2fc 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c | |||
| @@ -62,6 +62,7 @@ | |||
| 62 | MODULE_LICENSE("GPL"); | 62 | MODULE_LICENSE("GPL"); |
| 63 | MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); | 63 | MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); |
| 64 | MODULE_DESCRIPTION("iptables userspace logging module"); | 64 | MODULE_DESCRIPTION("iptables userspace logging module"); |
| 65 | MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG); | ||
| 65 | 66 | ||
| 66 | #define ULOG_NL_EVENT 111 /* Harald's favorite number */ | 67 | #define ULOG_NL_EVENT 111 /* Harald's favorite number */ |
| 67 | #define ULOG_MAXNLGROUPS 32 /* number of nlgroups */ | 68 | #define ULOG_MAXNLGROUPS 32 /* number of nlgroups */ |
| @@ -115,10 +116,10 @@ static void ulog_send(unsigned int nlgroupnum) | |||
| 115 | if (ub->qlen > 1) | 116 | if (ub->qlen > 1) |
| 116 | ub->lastnlh->nlmsg_type = NLMSG_DONE; | 117 | ub->lastnlh->nlmsg_type = NLMSG_DONE; |
| 117 | 118 | ||
| 118 | NETLINK_CB(ub->skb).dst_groups = (1 << nlgroupnum); | 119 | NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1; |
| 119 | DEBUGP("ipt_ULOG: throwing %d packets to netlink mask %u\n", | 120 | DEBUGP("ipt_ULOG: throwing %d packets to netlink group %u\n", |
| 120 | ub->qlen, nlgroupnum); | 121 | ub->qlen, nlgroupnum + 1); |
| 121 | netlink_broadcast(nflognl, ub->skb, 0, (1 << nlgroupnum), GFP_ATOMIC); | 122 | netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC); |
| 122 | 123 | ||
| 123 | ub->qlen = 0; | 124 | ub->qlen = 0; |
| 124 | ub->skb = NULL; | 125 | ub->skb = NULL; |
| @@ -219,13 +220,13 @@ static void ipt_ulog_packet(unsigned int hooknum, | |||
| 219 | pm = NLMSG_DATA(nlh); | 220 | pm = NLMSG_DATA(nlh); |
| 220 | 221 | ||
| 221 | /* We might not have a timestamp, get one */ | 222 | /* We might not have a timestamp, get one */ |
| 222 | if (skb->stamp.tv_sec == 0) | 223 | if (skb->tstamp.off_sec == 0) |
| 223 | do_gettimeofday((struct timeval *)&skb->stamp); | 224 | __net_timestamp((struct sk_buff *)skb); |
| 224 | 225 | ||
| 225 | /* copy hook, prefix, timestamp, payload, etc. */ | 226 | /* copy hook, prefix, timestamp, payload, etc. */ |
| 226 | pm->data_len = copy_len; | 227 | pm->data_len = copy_len; |
| 227 | pm->timestamp_sec = skb->stamp.tv_sec; | 228 | pm->timestamp_sec = skb_tv_base.tv_sec + skb->tstamp.off_sec; |
| 228 | pm->timestamp_usec = skb->stamp.tv_usec; | 229 | pm->timestamp_usec = skb_tv_base.tv_usec + skb->tstamp.off_usec; |
| 229 | pm->mark = skb->nfmark; | 230 | pm->mark = skb->nfmark; |
| 230 | pm->hook = hooknum; | 231 | pm->hook = hooknum; |
| 231 | if (prefix != NULL) | 232 | if (prefix != NULL) |
| @@ -303,18 +304,27 @@ static unsigned int ipt_ulog_target(struct sk_buff **pskb, | |||
| 303 | return IPT_CONTINUE; | 304 | return IPT_CONTINUE; |
| 304 | } | 305 | } |
| 305 | 306 | ||
| 306 | static void ipt_logfn(unsigned int hooknum, | 307 | static void ipt_logfn(unsigned int pf, |
| 308 | unsigned int hooknum, | ||
| 307 | const struct sk_buff *skb, | 309 | const struct sk_buff *skb, |
| 308 | const struct net_device *in, | 310 | const struct net_device *in, |
| 309 | const struct net_device *out, | 311 | const struct net_device *out, |
| 312 | const struct nf_loginfo *li, | ||
| 310 | const char *prefix) | 313 | const char *prefix) |
| 311 | { | 314 | { |
| 312 | struct ipt_ulog_info loginfo = { | 315 | struct ipt_ulog_info loginfo; |
| 313 | .nl_group = ULOG_DEFAULT_NLGROUP, | 316 | |
| 314 | .copy_range = 0, | 317 | if (!li || li->type != NF_LOG_TYPE_ULOG) { |
| 315 | .qthreshold = ULOG_DEFAULT_QTHRESHOLD, | 318 | loginfo.nl_group = ULOG_DEFAULT_NLGROUP; |
| 316 | .prefix = "" | 319 | loginfo.copy_range = 0; |
| 317 | }; | 320 | loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD; |
| 321 | loginfo.prefix[0] = '\0'; | ||
| 322 | } else { | ||
| 323 | loginfo.nl_group = li->u.ulog.group; | ||
| 324 | loginfo.copy_range = li->u.ulog.copy_len; | ||
| 325 | loginfo.qthreshold = li->u.ulog.qthreshold; | ||
| 326 | strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); | ||
| 327 | } | ||
| 318 | 328 | ||
| 319 | ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); | 329 | ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); |
| 320 | } | 330 | } |
| @@ -354,6 +364,12 @@ static struct ipt_target ipt_ulog_reg = { | |||
| 354 | .me = THIS_MODULE, | 364 | .me = THIS_MODULE, |
| 355 | }; | 365 | }; |
| 356 | 366 | ||
| 367 | static struct nf_logger ipt_ulog_logger = { | ||
| 368 | .name = "ipt_ULOG", | ||
| 369 | .logfn = &ipt_logfn, | ||
| 370 | .me = THIS_MODULE, | ||
| 371 | }; | ||
| 372 | |||
| 357 | static int __init init(void) | 373 | static int __init init(void) |
| 358 | { | 374 | { |
| 359 | int i; | 375 | int i; |
| @@ -372,7 +388,8 @@ static int __init init(void) | |||
| 372 | ulog_buffers[i].timer.data = i; | 388 | ulog_buffers[i].timer.data = i; |
| 373 | } | 389 | } |
| 374 | 390 | ||
| 375 | nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL); | 391 | nflognl = netlink_kernel_create(NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, |
| 392 | THIS_MODULE); | ||
| 376 | if (!nflognl) | 393 | if (!nflognl) |
| 377 | return -ENOMEM; | 394 | return -ENOMEM; |
| 378 | 395 | ||
| @@ -381,7 +398,7 @@ static int __init init(void) | |||
| 381 | return -EINVAL; | 398 | return -EINVAL; |
| 382 | } | 399 | } |
| 383 | if (nflog) | 400 | if (nflog) |
| 384 | nf_log_register(PF_INET, &ipt_logfn); | 401 | nf_log_register(PF_INET, &ipt_ulog_logger); |
| 385 | 402 | ||
| 386 | return 0; | 403 | return 0; |
| 387 | } | 404 | } |
| @@ -394,7 +411,7 @@ static void __exit fini(void) | |||
| 394 | DEBUGP("ipt_ULOG: cleanup_module\n"); | 411 | DEBUGP("ipt_ULOG: cleanup_module\n"); |
| 395 | 412 | ||
| 396 | if (nflog) | 413 | if (nflog) |
| 397 | nf_log_unregister(PF_INET, &ipt_logfn); | 414 | nf_log_unregister_logger(&ipt_ulog_logger); |
| 398 | ipt_unregister_target(&ipt_ulog_reg); | 415 | ipt_unregister_target(&ipt_ulog_reg); |
| 399 | sock_release(nflognl->sk_socket); | 416 | sock_release(nflognl->sk_socket); |
| 400 | 417 | ||
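The ulog_send() hunk above tracks an in-kernel netlink API change: netlink_broadcast() now takes a plain group number (nlgroupnum + 1) where it previously took a bitmask (1 << nlgroupnum). User space, by contrast, still subscribes with a bitmask in sockaddr_nl.nl_groups. A hedged sketch of a minimal ULOG listener; the NETLINK_NFLOG value is assumed from the 2.6 headers and the buffer handling is deliberately simplistic:

    /* Hedged sketch of a userspace ULOG listener; error handling trimmed. */
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/types.h>
    #include <sys/socket.h>
    #include <linux/netlink.h>

    #ifndef NETLINK_NFLOG
    #define NETLINK_NFLOG 5            /* value assumed from the 2.6 headers */
    #endif

    int main(void)
    {
            struct sockaddr_nl addr;
            char buf[8192];
            int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_NFLOG);

            if (fd < 0)
                    return 1;

            memset(&addr, 0, sizeof(addr));
            addr.nl_family = AF_NETLINK;
            addr.nl_groups = 1 << (1 - 1);   /* userspace still uses a bitmask: group 1 */

            if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
                    return 1;

            ssize_t n = recv(fd, buf, sizeof(buf), 0);  /* one batch of ulog_packet_msg */
            printf("received %zd bytes\n", n);
            close(fd);
            return 0;
    }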
diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c new file mode 100644 index 000000000000..df4a42c6da22 --- /dev/null +++ b/net/ipv4/netfilter/ipt_connbytes.c | |||
| @@ -0,0 +1,162 @@ | |||
| 1 | /* Kernel module to match connection tracking byte counter. | ||
| 2 | * GPL (C) 2002 Martin Devera (devik@cdi.cz). | ||
| 3 | * | ||
| 4 | * 2004-07-20 Harald Welte <laforge@netfilter.org> | ||
| 5 | * - reimplemented to use per-connection accounting counters | ||
| 6 | * - add functionality to match number of packets | ||
| 7 | * - add functionality to match average packet size | ||
| 8 | * - add support to match directions separately | ||
| 9 | * | ||
| 10 | */ | ||
| 11 | #include <linux/module.h> | ||
| 12 | #include <linux/skbuff.h> | ||
| 13 | #include <linux/netfilter_ipv4/ip_conntrack.h> | ||
| 14 | #include <linux/netfilter_ipv4/ip_tables.h> | ||
| 15 | #include <linux/netfilter_ipv4/ipt_connbytes.h> | ||
| 16 | |||
| 17 | #include <asm/div64.h> | ||
| 18 | #include <asm/bitops.h> | ||
| 19 | |||
| 20 | MODULE_LICENSE("GPL"); | ||
| 21 | MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); | ||
| 22 | MODULE_DESCRIPTION("iptables match for number of pkts/bytes per connection"); | ||
| 23 | |||
| 24 | /* 64bit divisor, dividend and result. dynamic precision */ | ||
| 25 | static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor) | ||
| 26 | { | ||
| 27 | u_int32_t d = divisor; | ||
| 28 | |||
| 29 | if (divisor > 0xffffffffULL) { | ||
| 30 | unsigned int shift = fls(divisor >> 32); | ||
| 31 | |||
| 32 | d = divisor >> shift; | ||
| 33 | dividend >>= shift; | ||
| 34 | } | ||
| 35 | |||
| 36 | do_div(dividend, d); | ||
| 37 | return dividend; | ||
| 38 | } | ||
| 39 | |||
| 40 | static int | ||
| 41 | match(const struct sk_buff *skb, | ||
| 42 | const struct net_device *in, | ||
| 43 | const struct net_device *out, | ||
| 44 | const void *matchinfo, | ||
| 45 | int offset, | ||
| 46 | int *hotdrop) | ||
| 47 | { | ||
| 48 | const struct ipt_connbytes_info *sinfo = matchinfo; | ||
| 49 | enum ip_conntrack_info ctinfo; | ||
| 50 | struct ip_conntrack *ct; | ||
| 51 | u_int64_t what = 0; /* initialize to make gcc happy */ | ||
| 52 | |||
| 53 | if (!(ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo))) | ||
| 54 | return 0; /* no match */ | ||
| 55 | |||
| 56 | switch (sinfo->what) { | ||
| 57 | case IPT_CONNBYTES_PKTS: | ||
| 58 | switch (sinfo->direction) { | ||
| 59 | case IPT_CONNBYTES_DIR_ORIGINAL: | ||
| 60 | what = ct->counters[IP_CT_DIR_ORIGINAL].packets; | ||
| 61 | break; | ||
| 62 | case IPT_CONNBYTES_DIR_REPLY: | ||
| 63 | what = ct->counters[IP_CT_DIR_REPLY].packets; | ||
| 64 | break; | ||
| 65 | case IPT_CONNBYTES_DIR_BOTH: | ||
| 66 | what = ct->counters[IP_CT_DIR_ORIGINAL].packets; | ||
| 67 | what += ct->counters[IP_CT_DIR_REPLY].packets; | ||
| 68 | break; | ||
| 69 | } | ||
| 70 | break; | ||
| 71 | case IPT_CONNBYTES_BYTES: | ||
| 72 | switch (sinfo->direction) { | ||
| 73 | case IPT_CONNBYTES_DIR_ORIGINAL: | ||
| 74 | what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; | ||
| 75 | break; | ||
| 76 | case IPT_CONNBYTES_DIR_REPLY: | ||
| 77 | what = ct->counters[IP_CT_DIR_REPLY].bytes; | ||
| 78 | break; | ||
| 79 | case IPT_CONNBYTES_DIR_BOTH: | ||
| 80 | what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; | ||
| 81 | what += ct->counters[IP_CT_DIR_REPLY].bytes; | ||
| 82 | break; | ||
| 83 | } | ||
| 84 | break; | ||
| 85 | case IPT_CONNBYTES_AVGPKT: | ||
| 86 | switch (sinfo->direction) { | ||
| 87 | case IPT_CONNBYTES_DIR_ORIGINAL: | ||
| 88 | what = div64_64(ct->counters[IP_CT_DIR_ORIGINAL].bytes, | ||
| 89 | ct->counters[IP_CT_DIR_ORIGINAL].packets); | ||
| 90 | break; | ||
| 91 | case IPT_CONNBYTES_DIR_REPLY: | ||
| 92 | what = div64_64(ct->counters[IP_CT_DIR_REPLY].bytes, | ||
| 93 | ct->counters[IP_CT_DIR_REPLY].packets); | ||
| 94 | break; | ||
| 95 | case IPT_CONNBYTES_DIR_BOTH: | ||
| 96 | { | ||
| 97 | u_int64_t bytes; | ||
| 98 | u_int64_t pkts; | ||
| 99 | bytes = ct->counters[IP_CT_DIR_ORIGINAL].bytes + | ||
| 100 | ct->counters[IP_CT_DIR_REPLY].bytes; | ||
| 101 | pkts = ct->counters[IP_CT_DIR_ORIGINAL].packets+ | ||
| 102 | ct->counters[IP_CT_DIR_REPLY].packets; | ||
| 103 | |||
| 104 | /* FIXME_THEORETICAL: what to do if sum | ||
| 105 | * overflows ? */ | ||
| 106 | |||
| 107 | what = div64_64(bytes, pkts); | ||
| 108 | } | ||
| 109 | break; | ||
| 110 | } | ||
| 111 | break; | ||
| 112 | } | ||
| 113 | |||
| 114 | if (sinfo->count.to) | ||
| 115 | return (what <= sinfo->count.to && what >= sinfo->count.from); | ||
| 116 | else | ||
| 117 | return (what >= sinfo->count.from); | ||
| 118 | } | ||
| 119 | |||
| 120 | static int check(const char *tablename, | ||
| 121 | const struct ipt_ip *ip, | ||
| 122 | void *matchinfo, | ||
| 123 | unsigned int matchsize, | ||
| 124 | unsigned int hook_mask) | ||
| 125 | { | ||
| 126 | const struct ipt_connbytes_info *sinfo = matchinfo; | ||
| 127 | |||
| 128 | if (matchsize != IPT_ALIGN(sizeof(struct ipt_connbytes_info))) | ||
| 129 | return 0; | ||
| 130 | |||
| 131 | if (sinfo->what != IPT_CONNBYTES_PKTS && | ||
| 132 | sinfo->what != IPT_CONNBYTES_BYTES && | ||
| 133 | sinfo->what != IPT_CONNBYTES_AVGPKT) | ||
| 134 | return 0; | ||
| 135 | |||
| 136 | if (sinfo->direction != IPT_CONNBYTES_DIR_ORIGINAL && | ||
| 137 | sinfo->direction != IPT_CONNBYTES_DIR_REPLY && | ||
| 138 | sinfo->direction != IPT_CONNBYTES_DIR_BOTH) | ||
| 139 | return 0; | ||
| 140 | |||
| 141 | return 1; | ||
| 142 | } | ||
| 143 | |||
| 144 | static struct ipt_match state_match = { | ||
| 145 | .name = "connbytes", | ||
| 146 | .match = &match, | ||
| 147 | .checkentry = &check, | ||
| 148 | .me = THIS_MODULE | ||
| 149 | }; | ||
| 150 | |||
| 151 | static int __init init(void) | ||
| 152 | { | ||
| 153 | return ipt_register_match(&state_match); | ||
| 154 | } | ||
| 155 | |||
| 156 | static void __exit fini(void) | ||
| 157 | { | ||
| 158 | ipt_unregister_match(&state_match); | ||
| 159 | } | ||
| 160 | |||
| 161 | module_init(init); | ||
| 162 | module_exit(fini); | ||
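div64_64() above trades precision for the ability to reuse do_div(), which only divides a 64-bit dividend by a 32-bit divisor: when the divisor overflows 32 bits, both operands are shifted right by the same amount, and the discarded low bits make the quotient approximate, which is good enough for an average-packet-size comparison. A user-space replica, with fls32() standing in for the kernel's fls() and plain division for do_div(), that prints the approximate and exact quotients side by side:

    /* Userspace replica of div64_64(); fls32() stands in for fls(),
     * plain division for do_div(). */
    #include <stdint.h>
    #include <stdio.h>

    static unsigned int fls32(uint32_t x)
    {
            unsigned int r = 0;

            while (x) {
                    x >>= 1;
                    r++;
            }
            return r;
    }

    static uint64_t div64_64_sketch(uint64_t dividend, uint64_t divisor)
    {
            uint32_t d = (uint32_t)divisor;

            if (divisor > 0xffffffffULL) {
                    unsigned int shift = fls32((uint32_t)(divisor >> 32));

                    d = (uint32_t)(divisor >> shift);   /* low bits are dropped */
                    dividend >>= shift;
            }
            return dividend / d;
    }

    int main(void)
    {
            uint64_t bytes = 123456789012345ULL, pkts = 5000000000ULL;

            printf("approx=%llu exact=%llu\n",
                   (unsigned long long)div64_64_sketch(bytes, pkts),
                   (unsigned long long)(bytes / pkts));
            return 0;
    }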
diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c index 2706f96cea55..bf8de47ce004 100644 --- a/net/ipv4/netfilter/ipt_connmark.c +++ b/net/ipv4/netfilter/ipt_connmark.c | |||
| @@ -54,9 +54,16 @@ checkentry(const char *tablename, | |||
| 54 | unsigned int matchsize, | 54 | unsigned int matchsize, |
| 55 | unsigned int hook_mask) | 55 | unsigned int hook_mask) |
| 56 | { | 56 | { |
| 57 | struct ipt_connmark_info *cm = | ||
| 58 | (struct ipt_connmark_info *)matchinfo; | ||
| 57 | if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info))) | 59 | if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info))) |
| 58 | return 0; | 60 | return 0; |
| 59 | 61 | ||
| 62 | if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) { | ||
| 63 | printk(KERN_WARNING "connmark: only supports 32bit mark\n"); | ||
| 64 | return 0; | ||
| 65 | } | ||
| 66 | |||
| 60 | return 1; | 67 | return 1; |
| 61 | } | 68 | } |
| 62 | 69 | ||
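The new connmark check only has teeth on 64-bit kernels: the iptables info structure stores mark and mask as unsigned long, while the skb mark is 32 bits wide, so a value above 0xffffffff would silently truncate. A small sketch of the failure mode, assuming an LP64 system and an illustrative struct layout:

    /* Sketch of the truncation the check guards against; assumes an LP64
     * system where unsigned long is 64 bits, and an illustrative layout. */
    #include <stdio.h>

    struct connmark_info_example { unsigned long mark, mask; };

    int main(void)
    {
            struct connmark_info_example info = { .mark = 0x1ffffffffUL };
            unsigned int nfmark = (unsigned int)info.mark;  /* skb mark is 32-bit */

            if (info.mark > 0xffffffffUL)
                    printf("rejected: 0x%lx would truncate to 0x%x\n",
                           info.mark, nfmark);
            return 0;
    }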
diff --git a/net/ipv4/netfilter/ipt_dccp.c b/net/ipv4/netfilter/ipt_dccp.c new file mode 100644 index 000000000000..ad3278bba6c1 --- /dev/null +++ b/net/ipv4/netfilter/ipt_dccp.c | |||
| @@ -0,0 +1,176 @@ | |||
| 1 | /* | ||
| 2 | * iptables module for DCCP protocol header matching | ||
| 3 | * | ||
| 4 | * (C) 2005 by Harald Welte <laforge@netfilter.org> | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify | ||
| 7 | * it under the terms of the GNU General Public License version 2 as | ||
| 8 | * published by the Free Software Foundation. | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include <linux/module.h> | ||
| 12 | #include <linux/skbuff.h> | ||
| 13 | #include <linux/spinlock.h> | ||
| 14 | #include <net/ip.h> | ||
| 15 | #include <linux/dccp.h> | ||
| 16 | |||
| 17 | #include <linux/netfilter_ipv4/ip_tables.h> | ||
| 18 | #include <linux/netfilter_ipv4/ipt_dccp.h> | ||
| 19 | |||
| 20 | #define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \ | ||
| 21 | || (!!((invflag) & (option)) ^ (cond))) | ||
| 22 | |||
| 23 | static unsigned char *dccp_optbuf; | ||
| 24 | static DEFINE_SPINLOCK(dccp_buflock); | ||
| 25 | |||
| 26 | static inline int | ||
| 27 | dccp_find_option(u_int8_t option, | ||
| 28 | const struct sk_buff *skb, | ||
| 29 | const struct dccp_hdr *dh, | ||
| 30 | int *hotdrop) | ||
| 31 | { | ||
| 32 | /* dccph_doff is 8 bits, i.e. max 255 * 4 bytes of options */ | ||
| 33 | unsigned char *op; | ||
| 34 | unsigned int optoff = __dccp_hdr_len(dh); | ||
| 35 | unsigned int optlen = dh->dccph_doff*4 - __dccp_hdr_len(dh); | ||
| 36 | unsigned int i; | ||
| 37 | |||
| 38 | if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) { | ||
| 39 | *hotdrop = 1; | ||
| 40 | return 0; | ||
| 41 | } | ||
| 42 | |||
| 43 | if (!optlen) | ||
| 44 | return 0; | ||
| 45 | |||
| 46 | spin_lock_bh(&dccp_buflock); | ||
| 47 | op = skb_header_pointer(skb, | ||
| 48 | skb->nh.iph->ihl*4 + optoff, | ||
| 49 | optlen, dccp_optbuf); | ||
| 50 | if (op == NULL) { | ||
| 51 | /* If we don't have the whole header, drop packet. */ | ||
| 52 | spin_unlock_bh(&dccp_buflock); | ||
| 53 | *hotdrop = 1; | ||
| 54 | return 0; | ||
| 55 | } | ||
| 56 | |||
| 57 | for (i = 0; i < optlen; ) { | ||
| 58 | if (op[i] == option) { | ||
| 59 | spin_unlock_bh(&dccp_buflock); | ||
| 60 | return 1; | ||
| 61 | } | ||
| 62 | |||
| 63 | if (op[i] < 2) | ||
| 64 | i++; | ||
| 65 | else | ||
| 66 | i += op[i+1]?:1; | ||
| 67 | } | ||
| 68 | |||
| 69 | spin_unlock_bh(&dccp_buflock); | ||
| 70 | return 0; | ||
| 71 | } | ||
| 72 | |||
| 73 | |||
| 74 | static inline int | ||
| 75 | match_types(const struct dccp_hdr *dh, u_int16_t typemask) | ||
| 76 | { | ||
| 77 | return (typemask & (1 << dh->dccph_type)); | ||
| 78 | } | ||
| 79 | |||
| 80 | static inline int | ||
| 81 | match_option(u_int8_t option, const struct sk_buff *skb, | ||
| 82 | const struct dccp_hdr *dh, int *hotdrop) | ||
| 83 | { | ||
| 84 | return dccp_find_option(option, skb, dh, hotdrop); | ||
| 85 | } | ||
| 86 | |||
| 87 | static int | ||
| 88 | match(const struct sk_buff *skb, | ||
| 89 | const struct net_device *in, | ||
| 90 | const struct net_device *out, | ||
| 91 | const void *matchinfo, | ||
| 92 | int offset, | ||
| 93 | int *hotdrop) | ||
| 94 | { | ||
| 95 | const struct ipt_dccp_info *info = | ||
| 96 | (const struct ipt_dccp_info *)matchinfo; | ||
| 97 | struct dccp_hdr _dh, *dh; | ||
| 98 | |||
| 99 | if (offset) | ||
| 100 | return 0; | ||
| 101 | |||
| 102 | dh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_dh), &_dh); | ||
| 103 | if (dh == NULL) { | ||
| 104 | *hotdrop = 1; | ||
| 105 | return 0; | ||
| 106 | } | ||
| 107 | |||
| 108 | return DCCHECK(((ntohs(dh->dccph_sport) >= info->spts[0]) | ||
| 109 | && (ntohs(dh->dccph_sport) <= info->spts[1])), | ||
| 110 | IPT_DCCP_SRC_PORTS, info->flags, info->invflags) | ||
| 111 | && DCCHECK(((ntohs(dh->dccph_dport) >= info->dpts[0]) | ||
| 112 | && (ntohs(dh->dccph_dport) <= info->dpts[1])), | ||
| 113 | IPT_DCCP_DEST_PORTS, info->flags, info->invflags) | ||
| 114 | && DCCHECK(match_types(dh, info->typemask), | ||
| 115 | IPT_DCCP_TYPE, info->flags, info->invflags) | ||
| 116 | && DCCHECK(match_option(info->option, skb, dh, hotdrop), | ||
| 117 | IPT_DCCP_OPTION, info->flags, info->invflags); | ||
| 118 | } | ||
| 119 | |||
| 120 | static int | ||
| 121 | checkentry(const char *tablename, | ||
| 122 | const struct ipt_ip *ip, | ||
| 123 | void *matchinfo, | ||
| 124 | unsigned int matchsize, | ||
| 125 | unsigned int hook_mask) | ||
| 126 | { | ||
| 127 | const struct ipt_dccp_info *info; | ||
| 128 | |||
| 129 | info = (const struct ipt_dccp_info *)matchinfo; | ||
| 130 | |||
| 131 | return ip->proto == IPPROTO_DCCP | ||
| 132 | && !(ip->invflags & IPT_INV_PROTO) | ||
| 133 | && matchsize == IPT_ALIGN(sizeof(struct ipt_dccp_info)) | ||
| 134 | && !(info->flags & ~IPT_DCCP_VALID_FLAGS) | ||
| 135 | && !(info->invflags & ~IPT_DCCP_VALID_FLAGS) | ||
| 136 | && !(info->invflags & ~info->flags); | ||
| 137 | } | ||
| 138 | |||
| 139 | static struct ipt_match dccp_match = | ||
| 140 | { | ||
| 141 | .name = "dccp", | ||
| 142 | .match = &match, | ||
| 143 | .checkentry = &checkentry, | ||
| 144 | .me = THIS_MODULE, | ||
| 145 | }; | ||
| 146 | |||
| 147 | static int __init init(void) | ||
| 148 | { | ||
| 149 | int ret; | ||
| 150 | |||
| 151 | /* doff is 8 bits, so the maximum option size is (4*256). Don't put | ||
| 152 | * this in BSS since DaveM is worried about locked TLBs for kernel | ||
| 153 | * BSS. */ | ||
| 154 | dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL); | ||
| 155 | if (!dccp_optbuf) | ||
| 156 | return -ENOMEM; | ||
| 157 | ret = ipt_register_match(&dccp_match); | ||
| 158 | if (ret) | ||
| 159 | kfree(dccp_optbuf); | ||
| 160 | |||
| 161 | return ret; | ||
| 162 | } | ||
| 163 | |||
| 164 | static void __exit fini(void) | ||
| 165 | { | ||
| 166 | ipt_unregister_match(&dccp_match); | ||
| 167 | kfree(dccp_optbuf); | ||
| 168 | } | ||
| 169 | |||
| 170 | module_init(init); | ||
| 171 | module_exit(fini); | ||
| 172 | |||
| 173 | MODULE_LICENSE("GPL"); | ||
| 174 | MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); | ||
| 175 | MODULE_DESCRIPTION("Match for DCCP protocol packets"); | ||
| 176 | |||
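DCCHECK() above encodes the usual iptables sub-match semantics in one expression: the term is vacuously true when the option bit is absent from flags, and otherwise the raw condition is XORed with the corresponding inversion bit (the SCTP match uses the same idiom). A sketch that prints the full truth table:

    /* Truth table for the DCCHECK() idiom; OPT stands for one option bit. */
    #include <stdio.h>

    #define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \
                                      || (!!((invflag) & (option)) ^ (cond)))

    #define OPT 0x01

    int main(void)
    {
            int flags, invflags, cond;

            for (flags = 0; flags <= 1; flags++)
                    for (invflags = 0; invflags <= flags; invflags++)
                            for (cond = 0; cond <= 1; cond++)
                                    printf("requested=%d inverted=%d cond=%d -> match=%d\n",
                                           flags, invflags, cond,
                                           DCCHECK(cond, OPT, flags, invflags));
            return 0;
    }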
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index 564b49bfebcf..2dd1cccbdab9 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c | |||
| @@ -94,7 +94,7 @@ struct ipt_hashlimit_htable { | |||
| 94 | static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */ | 94 | static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */ |
| 95 | static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */ | 95 | static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */ |
| 96 | static HLIST_HEAD(hashlimit_htables); | 96 | static HLIST_HEAD(hashlimit_htables); |
| 97 | static kmem_cache_t *hashlimit_cachep; | 97 | static kmem_cache_t *hashlimit_cachep __read_mostly; |
| 98 | 98 | ||
| 99 | static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b) | 99 | static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b) |
| 100 | { | 100 | { |
diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c index 8955728127b9..00bef6cdd3f8 100644 --- a/net/ipv4/netfilter/ipt_mark.c +++ b/net/ipv4/netfilter/ipt_mark.c | |||
| @@ -37,9 +37,16 @@ checkentry(const char *tablename, | |||
| 37 | unsigned int matchsize, | 37 | unsigned int matchsize, |
| 38 | unsigned int hook_mask) | 38 | unsigned int hook_mask) |
| 39 | { | 39 | { |
| 40 | struct ipt_mark_info *minfo = (struct ipt_mark_info *) matchinfo; | ||
| 41 | |||
| 40 | if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info))) | 42 | if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info))) |
| 41 | return 0; | 43 | return 0; |
| 42 | 44 | ||
| 45 | if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) { | ||
| 46 | printk(KERN_WARNING "mark: only supports 32bit mark\n"); | ||
| 47 | return 0; | ||
| 48 | } | ||
| 49 | |||
| 43 | return 1; | 50 | return 1; |
| 44 | } | 51 | } |
| 45 | 52 | ||
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c index 3b9065e06381..c1889f88262b 100644 --- a/net/ipv4/netfilter/ipt_owner.c +++ b/net/ipv4/netfilter/ipt_owner.c | |||
| @@ -21,106 +21,6 @@ MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); | |||
| 21 | MODULE_DESCRIPTION("iptables owner match"); | 21 | MODULE_DESCRIPTION("iptables owner match"); |
| 22 | 22 | ||
| 23 | static int | 23 | static int |
| 24 | match_comm(const struct sk_buff *skb, const char *comm) | ||
| 25 | { | ||
| 26 | struct task_struct *g, *p; | ||
| 27 | struct files_struct *files; | ||
| 28 | int i; | ||
| 29 | |||
| 30 | read_lock(&tasklist_lock); | ||
| 31 | do_each_thread(g, p) { | ||
| 32 | if(strncmp(p->comm, comm, sizeof(p->comm))) | ||
| 33 | continue; | ||
| 34 | |||
| 35 | task_lock(p); | ||
| 36 | files = p->files; | ||
| 37 | if(files) { | ||
| 38 | spin_lock(&files->file_lock); | ||
| 39 | for (i=0; i < files->max_fds; i++) { | ||
| 40 | if (fcheck_files(files, i) == | ||
| 41 | skb->sk->sk_socket->file) { | ||
| 42 | spin_unlock(&files->file_lock); | ||
| 43 | task_unlock(p); | ||
| 44 | read_unlock(&tasklist_lock); | ||
| 45 | return 1; | ||
| 46 | } | ||
| 47 | } | ||
| 48 | spin_unlock(&files->file_lock); | ||
| 49 | } | ||
| 50 | task_unlock(p); | ||
| 51 | } while_each_thread(g, p); | ||
| 52 | read_unlock(&tasklist_lock); | ||
| 53 | return 0; | ||
| 54 | } | ||
| 55 | |||
| 56 | static int | ||
| 57 | match_pid(const struct sk_buff *skb, pid_t pid) | ||
| 58 | { | ||
| 59 | struct task_struct *p; | ||
| 60 | struct files_struct *files; | ||
| 61 | int i; | ||
| 62 | |||
| 63 | read_lock(&tasklist_lock); | ||
| 64 | p = find_task_by_pid(pid); | ||
| 65 | if (!p) | ||
| 66 | goto out; | ||
| 67 | task_lock(p); | ||
| 68 | files = p->files; | ||
| 69 | if(files) { | ||
| 70 | spin_lock(&files->file_lock); | ||
| 71 | for (i=0; i < files->max_fds; i++) { | ||
| 72 | if (fcheck_files(files, i) == | ||
| 73 | skb->sk->sk_socket->file) { | ||
| 74 | spin_unlock(&files->file_lock); | ||
| 75 | task_unlock(p); | ||
| 76 | read_unlock(&tasklist_lock); | ||
| 77 | return 1; | ||
| 78 | } | ||
| 79 | } | ||
| 80 | spin_unlock(&files->file_lock); | ||
| 81 | } | ||
| 82 | task_unlock(p); | ||
| 83 | out: | ||
| 84 | read_unlock(&tasklist_lock); | ||
| 85 | return 0; | ||
| 86 | } | ||
| 87 | |||
| 88 | static int | ||
| 89 | match_sid(const struct sk_buff *skb, pid_t sid) | ||
| 90 | { | ||
| 91 | struct task_struct *g, *p; | ||
| 92 | struct file *file = skb->sk->sk_socket->file; | ||
| 93 | int i, found=0; | ||
| 94 | |||
| 95 | read_lock(&tasklist_lock); | ||
| 96 | do_each_thread(g, p) { | ||
| 97 | struct files_struct *files; | ||
| 98 | if (p->signal->session != sid) | ||
| 99 | continue; | ||
| 100 | |||
| 101 | task_lock(p); | ||
| 102 | files = p->files; | ||
| 103 | if (files) { | ||
| 104 | spin_lock(&files->file_lock); | ||
| 105 | for (i=0; i < files->max_fds; i++) { | ||
| 106 | if (fcheck_files(files, i) == file) { | ||
| 107 | found = 1; | ||
| 108 | break; | ||
| 109 | } | ||
| 110 | } | ||
| 111 | spin_unlock(&files->file_lock); | ||
| 112 | } | ||
| 113 | task_unlock(p); | ||
| 114 | if (found) | ||
| 115 | goto out; | ||
| 116 | } while_each_thread(g, p); | ||
| 117 | out: | ||
| 118 | read_unlock(&tasklist_lock); | ||
| 119 | |||
| 120 | return found; | ||
| 121 | } | ||
| 122 | |||
| 123 | static int | ||
| 124 | match(const struct sk_buff *skb, | 24 | match(const struct sk_buff *skb, |
| 125 | const struct net_device *in, | 25 | const struct net_device *in, |
| 126 | const struct net_device *out, | 26 | const struct net_device *out, |
| @@ -145,24 +45,6 @@ match(const struct sk_buff *skb, | |||
| 145 | return 0; | 45 | return 0; |
| 146 | } | 46 | } |
| 147 | 47 | ||
| 148 | if(info->match & IPT_OWNER_PID) { | ||
| 149 | if (!match_pid(skb, info->pid) ^ | ||
| 150 | !!(info->invert & IPT_OWNER_PID)) | ||
| 151 | return 0; | ||
| 152 | } | ||
| 153 | |||
| 154 | if(info->match & IPT_OWNER_SID) { | ||
| 155 | if (!match_sid(skb, info->sid) ^ | ||
| 156 | !!(info->invert & IPT_OWNER_SID)) | ||
| 157 | return 0; | ||
| 158 | } | ||
| 159 | |||
| 160 | if(info->match & IPT_OWNER_COMM) { | ||
| 161 | if (!match_comm(skb, info->comm) ^ | ||
| 162 | !!(info->invert & IPT_OWNER_COMM)) | ||
| 163 | return 0; | ||
| 164 | } | ||
| 165 | |||
| 166 | return 1; | 48 | return 1; |
| 167 | } | 49 | } |
| 168 | 50 | ||
| @@ -173,6 +55,8 @@ checkentry(const char *tablename, | |||
| 173 | unsigned int matchsize, | 55 | unsigned int matchsize, |
| 174 | unsigned int hook_mask) | 56 | unsigned int hook_mask) |
| 175 | { | 57 | { |
| 58 | const struct ipt_owner_info *info = matchinfo; | ||
| 59 | |||
| 176 | if (hook_mask | 60 | if (hook_mask |
| 177 | & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) { | 61 | & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) { |
| 178 | printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n"); | 62 | printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n"); |
| @@ -184,15 +68,13 @@ checkentry(const char *tablename, | |||
| 184 | IPT_ALIGN(sizeof(struct ipt_owner_info))); | 68 | IPT_ALIGN(sizeof(struct ipt_owner_info))); |
| 185 | return 0; | 69 | return 0; |
| 186 | } | 70 | } |
| 187 | #ifdef CONFIG_SMP | 71 | |
| 188 | /* files->file_lock can not be used in a BH */ | 72 | if (info->match & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) { |
| 189 | if (((struct ipt_owner_info *)matchinfo)->match | 73 | printk("ipt_owner: pid, sid and command matching " |
| 190 | & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) { | 74 | "not supported anymore\n"); |
| 191 | printk("ipt_owner: pid, sid and command matching is broken " | ||
| 192 | "on SMP.\n"); | ||
| 193 | return 0; | 75 | return 0; |
| 194 | } | 76 | } |
| 195 | #endif | 77 | |
| 196 | return 1; | 78 | return 1; |
| 197 | } | 79 | } |
| 198 | 80 | ||
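The deleted pid/sid/comm matchers all relied on the same inversion idiom that still governs the surviving owner checks: '!result ^ !!(invert & FLAG)' is true exactly when the rule should not match. A tiny sketch of that idiom's truth table:

    /* Truth table for the '!result ^ !!invert' rejection test. */
    #include <stdio.h>

    static int rule_rejects(int matched, int inverted)
    {
            return !matched ^ !!inverted;
    }

    int main(void)
    {
            printf("%d %d %d %d\n",
                   rule_rejects(0, 0),   /* 1: no match, no inversion -> reject */
                   rule_rejects(1, 0),   /* 0: match                  -> accept */
                   rule_rejects(0, 1),   /* 0: no match, but inverted -> accept */
                   rule_rejects(1, 1));  /* 1: match, but inverted    -> reject */
            return 0;
    }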
diff --git a/net/ipv4/netfilter/ipt_string.c b/net/ipv4/netfilter/ipt_string.c new file mode 100644 index 000000000000..b5def204d798 --- /dev/null +++ b/net/ipv4/netfilter/ipt_string.c | |||
| @@ -0,0 +1,91 @@ | |||
| 1 | /* String matching match for iptables | ||
| 2 | * | ||
| 3 | * (C) 2005 Pablo Neira Ayuso <pablo@eurodev.net> | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or modify | ||
| 6 | * it under the terms of the GNU General Public License version 2 as | ||
| 7 | * published by the Free Software Foundation. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/init.h> | ||
| 11 | #include <linux/module.h> | ||
| 12 | #include <linux/kernel.h> | ||
| 13 | #include <linux/skbuff.h> | ||
| 14 | #include <linux/netfilter_ipv4/ip_tables.h> | ||
| 15 | #include <linux/netfilter_ipv4/ipt_string.h> | ||
| 16 | #include <linux/textsearch.h> | ||
| 17 | |||
| 18 | MODULE_AUTHOR("Pablo Neira Ayuso <pablo@eurodev.net>"); | ||
| 19 | MODULE_DESCRIPTION("IP tables string match module"); | ||
| 20 | MODULE_LICENSE("GPL"); | ||
| 21 | |||
| 22 | static int match(const struct sk_buff *skb, | ||
| 23 | const struct net_device *in, | ||
| 24 | const struct net_device *out, | ||
| 25 | const void *matchinfo, | ||
| 26 | int offset, | ||
| 27 | int *hotdrop) | ||
| 28 | { | ||
| 29 | struct ts_state state; | ||
| 30 | struct ipt_string_info *conf = (struct ipt_string_info *) matchinfo; | ||
| 31 | |||
| 32 | memset(&state, 0, sizeof(struct ts_state)); | ||
| 33 | |||
| 34 | return (skb_find_text((struct sk_buff *)skb, conf->from_offset, | ||
| 35 | conf->to_offset, conf->config, &state) | ||
| 36 | != UINT_MAX) ^ conf->invert; | ||
| 37 | } | ||
| 38 | |||
| 39 | #define STRING_TEXT_PRIV(m) ((struct ipt_string_info *) m) | ||
| 40 | |||
| 41 | static int checkentry(const char *tablename, | ||
| 42 | const struct ipt_ip *ip, | ||
| 43 | void *matchinfo, | ||
| 44 | unsigned int matchsize, | ||
| 45 | unsigned int hook_mask) | ||
| 46 | { | ||
| 47 | struct ipt_string_info *conf = matchinfo; | ||
| 48 | struct ts_config *ts_conf; | ||
| 49 | |||
| 50 | if (matchsize != IPT_ALIGN(sizeof(struct ipt_string_info))) | ||
| 51 | return 0; | ||
| 52 | |||
| 53 | /* Damn, can't handle this case properly with iptables... */ | ||
| 54 | if (conf->from_offset > conf->to_offset) | ||
| 55 | return 0; | ||
| 56 | |||
| 57 | ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen, | ||
| 58 | GFP_KERNEL, TS_AUTOLOAD); | ||
| 59 | if (IS_ERR(ts_conf)) | ||
| 60 | return 0; | ||
| 61 | |||
| 62 | conf->config = ts_conf; | ||
| 63 | |||
| 64 | return 1; | ||
| 65 | } | ||
| 66 | |||
| 67 | static void destroy(void *matchinfo, unsigned int matchsize) | ||
| 68 | { | ||
| 69 | textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config); | ||
| 70 | } | ||
| 71 | |||
| 72 | static struct ipt_match string_match = { | ||
| 73 | .name = "string", | ||
| 74 | .match = match, | ||
| 75 | .checkentry = checkentry, | ||
| 76 | .destroy = destroy, | ||
| 77 | .me = THIS_MODULE | ||
| 78 | }; | ||
| 79 | |||
| 80 | static int __init init(void) | ||
| 81 | { | ||
| 82 | return ipt_register_match(&string_match); | ||
| 83 | } | ||
| 84 | |||
| 85 | static void __exit fini(void) | ||
| 86 | { | ||
| 87 | ipt_unregister_match(&string_match); | ||
| 88 | } | ||
| 89 | |||
| 90 | module_init(init); | ||
| 91 | module_exit(fini); | ||
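checkentry() compiles the search pattern once with textsearch_prepare() and stores the resulting ts_config; match() then runs it per packet via skb_find_text(), and destroy() releases it. A minimal kernel-style sketch of the same API over a flat buffer; the "kmp" algorithm name and the sample pattern are assumptions, not taken from this patch:

    /* Kernel-style sketch of the textsearch API on a flat buffer; the
     * algorithm name "kmp" and the pattern are assumptions. */
    #include <linux/kernel.h>
    #include <linux/err.h>
    #include <linux/gfp.h>
    #include <linux/textsearch.h>

    static int find_pattern(const char *data, unsigned int len)
    {
            struct ts_config *conf;
            struct ts_state state;
            unsigned int pos;

            conf = textsearch_prepare("kmp", "GET /", 5, GFP_KERNEL, TS_AUTOLOAD);
            if (IS_ERR(conf))
                    return PTR_ERR(conf);

            /* skb_find_text() does the same walk over skb fragments */
            pos = textsearch_find_continuous(conf, &state, data, len);
            textsearch_destroy(conf);

            return pos != UINT_MAX;   /* 1 if the pattern occurs */
    }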
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 912bbcc7f415..f7943ba1f43c 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c | |||
| @@ -59,13 +59,10 @@ static int fold_prot_inuse(struct proto *proto) | |||
| 59 | */ | 59 | */ |
| 60 | static int sockstat_seq_show(struct seq_file *seq, void *v) | 60 | static int sockstat_seq_show(struct seq_file *seq, void *v) |
| 61 | { | 61 | { |
| 62 | /* From net/socket.c */ | ||
| 63 | extern void socket_seq_show(struct seq_file *seq); | ||
| 64 | |||
| 65 | socket_seq_show(seq); | 62 | socket_seq_show(seq); |
| 66 | seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", | 63 | seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", |
| 67 | fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), | 64 | fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), |
| 68 | tcp_tw_count, atomic_read(&tcp_sockets_allocated), | 65 | tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated), |
| 69 | atomic_read(&tcp_memory_allocated)); | 66 | atomic_read(&tcp_memory_allocated)); |
| 70 | seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot)); | 67 | seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot)); |
| 71 | seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot)); | 68 | seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot)); |
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 0db405a869f2..291831e792af 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c | |||
| @@ -40,7 +40,6 @@ | |||
| 40 | #include <linux/timer.h> | 40 | #include <linux/timer.h> |
| 41 | #include <net/ip.h> | 41 | #include <net/ip.h> |
| 42 | #include <net/protocol.h> | 42 | #include <net/protocol.h> |
| 43 | #include <net/tcp.h> | ||
| 44 | #include <linux/skbuff.h> | 43 | #include <linux/skbuff.h> |
| 45 | #include <net/sock.h> | 44 | #include <net/sock.h> |
| 46 | #include <net/icmp.h> | 45 | #include <net/icmp.h> |
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index d1835b1bc8c4..304bb0a1d4f0 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c | |||
| @@ -59,7 +59,6 @@ | |||
| 59 | #include <linux/netdevice.h> | 59 | #include <linux/netdevice.h> |
| 60 | #include <linux/in_route.h> | 60 | #include <linux/in_route.h> |
| 61 | #include <linux/route.h> | 61 | #include <linux/route.h> |
| 62 | #include <linux/tcp.h> | ||
| 63 | #include <linux/skbuff.h> | 62 | #include <linux/skbuff.h> |
| 64 | #include <net/dst.h> | 63 | #include <net/dst.h> |
| 65 | #include <net/sock.h> | 64 | #include <net/sock.h> |
| @@ -71,6 +70,7 @@ | |||
| 71 | #include <net/udp.h> | 70 | #include <net/udp.h> |
| 72 | #include <net/raw.h> | 71 | #include <net/raw.h> |
| 73 | #include <net/snmp.h> | 72 | #include <net/snmp.h> |
| 73 | #include <net/tcp_states.h> | ||
| 74 | #include <net/inet_common.h> | 74 | #include <net/inet_common.h> |
| 75 | #include <net/checksum.h> | 75 | #include <net/checksum.h> |
| 76 | #include <net/xfrm.h> | 76 | #include <net/xfrm.h> |
| @@ -150,10 +150,11 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) | |||
| 150 | * RFC 1122: SHOULD pass TOS value up to the transport layer. | 150 | * RFC 1122: SHOULD pass TOS value up to the transport layer. |
| 151 | * -> It does. And not only TOS, but all IP header. | 151 | * -> It does. And not only TOS, but all IP header. |
| 152 | */ | 152 | */ |
| 153 | void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) | 153 | int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) |
| 154 | { | 154 | { |
| 155 | struct sock *sk; | 155 | struct sock *sk; |
| 156 | struct hlist_head *head; | 156 | struct hlist_head *head; |
| 157 | int delivered = 0; | ||
| 157 | 158 | ||
| 158 | read_lock(&raw_v4_lock); | 159 | read_lock(&raw_v4_lock); |
| 159 | head = &raw_v4_htable[hash]; | 160 | head = &raw_v4_htable[hash]; |
| @@ -164,6 +165,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) | |||
| 164 | skb->dev->ifindex); | 165 | skb->dev->ifindex); |
| 165 | 166 | ||
| 166 | while (sk) { | 167 | while (sk) { |
| 168 | delivered = 1; | ||
| 167 | if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { | 169 | if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { |
| 168 | struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); | 170 | struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); |
| 169 | 171 | ||
| @@ -177,6 +179,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) | |||
| 177 | } | 179 | } |
| 178 | out: | 180 | out: |
| 179 | read_unlock(&raw_v4_lock); | 181 | read_unlock(&raw_v4_lock); |
| 182 | return delivered; | ||
| 180 | } | 183 | } |
| 181 | 184 | ||
| 182 | void raw_err (struct sock *sk, struct sk_buff *skb, u32 info) | 185 | void raw_err (struct sock *sk, struct sk_buff *skb, u32 info) |
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d675ff80b04d..8c0b14e3beec 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
| @@ -240,7 +240,9 @@ static unsigned rt_hash_mask; | |||
| 240 | static int rt_hash_log; | 240 | static int rt_hash_log; |
| 241 | static unsigned int rt_hash_rnd; | 241 | static unsigned int rt_hash_rnd; |
| 242 | 242 | ||
| 243 | struct rt_cache_stat *rt_cache_stat; | 243 | static struct rt_cache_stat *rt_cache_stat; |
| 244 | #define RT_CACHE_STAT_INC(field) \ | ||
| 245 | (per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++) | ||
| 244 | 246 | ||
| 245 | static int rt_intern_hash(unsigned hash, struct rtable *rth, | 247 | static int rt_intern_hash(unsigned hash, struct rtable *rth, |
| 246 | struct rtable **res); | 248 | struct rtable **res); |
| @@ -2600,6 +2602,8 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp) | |||
| 2600 | return ip_route_output_slow(rp, flp); | 2602 | return ip_route_output_slow(rp, flp); |
| 2601 | } | 2603 | } |
| 2602 | 2604 | ||
| 2605 | EXPORT_SYMBOL_GPL(__ip_route_output_key); | ||
| 2606 | |||
| 2603 | int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags) | 2607 | int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags) |
| 2604 | { | 2608 | { |
| 2605 | int err; | 2609 | int err; |
| @@ -2618,6 +2622,8 @@ int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, | |||
| 2618 | return 0; | 2622 | return 0; |
| 2619 | } | 2623 | } |
| 2620 | 2624 | ||
| 2625 | EXPORT_SYMBOL_GPL(ip_route_output_flow); | ||
| 2626 | |||
| 2621 | int ip_route_output_key(struct rtable **rp, struct flowi *flp) | 2627 | int ip_route_output_key(struct rtable **rp, struct flowi *flp) |
| 2622 | { | 2628 | { |
| 2623 | return ip_route_output_flow(rp, flp, NULL, 0); | 2629 | return ip_route_output_flow(rp, flp, NULL, 0); |
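The new RT_CACHE_STAT_INC above bumps a per-CPU counter through per_cpu_ptr()/raw_smp_processor_id(), so the hot path needs no lock; a reader folds the per-CPU values into one total. A user-space sketch of the pattern, with the per-CPU area modeled as a plain array:

    /* Userspace model of the per-CPU counter pattern: lock-free increments
     * on the "current CPU", a fold when read (as the /proc reader does). */
    #include <stdio.h>

    #define NR_CPUS 4

    struct rt_stat_sketch { unsigned long in_hit; };

    static struct rt_stat_sketch stats[NR_CPUS];

    #define STAT_INC(cpu, field) (stats[cpu].field++)

    int main(void)
    {
            unsigned long total = 0;
            int cpu;

            STAT_INC(0, in_hit);
            STAT_INC(3, in_hit);
            STAT_INC(3, in_hit);

            for (cpu = 0; cpu < NR_CPUS; cpu++)
                    total += stats[cpu].in_hit;

            printf("in_hit=%lu\n", total);
            return 0;
    }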
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 72d014442185..a34e60ea48a1 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c | |||
| @@ -169,8 +169,6 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie) | |||
| 169 | return mssind < NUM_MSS ? msstab[mssind] + 1 : 0; | 169 | return mssind < NUM_MSS ? msstab[mssind] + 1 : 0; |
| 170 | } | 170 | } |
| 171 | 171 | ||
| 172 | extern struct request_sock_ops tcp_request_sock_ops; | ||
| 173 | |||
| 174 | static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, | 172 | static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, |
| 175 | struct request_sock *req, | 173 | struct request_sock *req, |
| 176 | struct dst_entry *dst) | 174 | struct dst_entry *dst) |
| @@ -180,7 +178,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, | |||
| 180 | 178 | ||
| 181 | child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); | 179 | child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); |
| 182 | if (child) | 180 | if (child) |
| 183 | tcp_acceptq_queue(sk, req, child); | 181 | inet_csk_reqsk_queue_add(sk, req, child); |
| 184 | else | 182 | else |
| 185 | reqsk_free(req); | 183 | reqsk_free(req); |
| 186 | 184 | ||
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e32894532416..652685623519 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
| @@ -11,7 +11,9 @@ | |||
| 11 | #include <linux/module.h> | 11 | #include <linux/module.h> |
| 12 | #include <linux/sysctl.h> | 12 | #include <linux/sysctl.h> |
| 13 | #include <linux/config.h> | 13 | #include <linux/config.h> |
| 14 | #include <linux/igmp.h> | ||
| 14 | #include <net/snmp.h> | 15 | #include <net/snmp.h> |
| 16 | #include <net/icmp.h> | ||
| 15 | #include <net/ip.h> | 17 | #include <net/ip.h> |
| 16 | #include <net/route.h> | 18 | #include <net/route.h> |
| 17 | #include <net/tcp.h> | 19 | #include <net/tcp.h> |
| @@ -19,36 +21,6 @@ | |||
| 19 | /* From af_inet.c */ | 21 | /* From af_inet.c */ |
| 20 | extern int sysctl_ip_nonlocal_bind; | 22 | extern int sysctl_ip_nonlocal_bind; |
| 21 | 23 | ||
| 22 | /* From icmp.c */ | ||
| 23 | extern int sysctl_icmp_echo_ignore_all; | ||
| 24 | extern int sysctl_icmp_echo_ignore_broadcasts; | ||
| 25 | extern int sysctl_icmp_ignore_bogus_error_responses; | ||
| 26 | extern int sysctl_icmp_errors_use_inbound_ifaddr; | ||
| 27 | |||
| 28 | /* From ip_fragment.c */ | ||
| 29 | extern int sysctl_ipfrag_low_thresh; | ||
| 30 | extern int sysctl_ipfrag_high_thresh; | ||
| 31 | extern int sysctl_ipfrag_time; | ||
| 32 | extern int sysctl_ipfrag_secret_interval; | ||
| 33 | |||
| 34 | /* From ip_output.c */ | ||
| 35 | extern int sysctl_ip_dynaddr; | ||
| 36 | |||
| 37 | /* From icmp.c */ | ||
| 38 | extern int sysctl_icmp_ratelimit; | ||
| 39 | extern int sysctl_icmp_ratemask; | ||
| 40 | |||
| 41 | /* From igmp.c */ | ||
| 42 | extern int sysctl_igmp_max_memberships; | ||
| 43 | extern int sysctl_igmp_max_msf; | ||
| 44 | |||
| 45 | /* From inetpeer.c */ | ||
| 46 | extern int inet_peer_threshold; | ||
| 47 | extern int inet_peer_minttl; | ||
| 48 | extern int inet_peer_maxttl; | ||
| 49 | extern int inet_peer_gc_mintime; | ||
| 50 | extern int inet_peer_gc_maxtime; | ||
| 51 | |||
| 52 | #ifdef CONFIG_SYSCTL | 24 | #ifdef CONFIG_SYSCTL |
| 53 | static int tcp_retr1_max = 255; | 25 | static int tcp_retr1_max = 255; |
| 54 | static int ip_local_port_range_min[] = { 1, 1 }; | 26 | static int ip_local_port_range_min[] = { 1, 1 }; |
| @@ -57,8 +29,6 @@ static int ip_local_port_range_max[] = { 65535, 65535 }; | |||
| 57 | 29 | ||
| 58 | struct ipv4_config ipv4_config; | 30 | struct ipv4_config ipv4_config; |
| 59 | 31 | ||
| 60 | extern ctl_table ipv4_route_table[]; | ||
| 61 | |||
| 62 | #ifdef CONFIG_SYSCTL | 32 | #ifdef CONFIG_SYSCTL |
| 63 | 33 | ||
| 64 | static | 34 | static |
| @@ -136,10 +106,11 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * | |||
| 136 | return ret; | 106 | return ret; |
| 137 | } | 107 | } |
| 138 | 108 | ||
| 139 | int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen, | 109 | static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, |
| 140 | void __user *oldval, size_t __user *oldlenp, | 110 | int nlen, void __user *oldval, |
| 141 | void __user *newval, size_t newlen, | 111 | size_t __user *oldlenp, |
| 142 | void **context) | 112 | void __user *newval, size_t newlen, |
| 113 | void **context) | ||
| 143 | { | 114 | { |
| 144 | char val[TCP_CA_NAME_MAX]; | 115 | char val[TCP_CA_NAME_MAX]; |
| 145 | ctl_table tbl = { | 116 | ctl_table tbl = { |
| @@ -259,7 +230,7 @@ ctl_table ipv4_table[] = { | |||
| 259 | { | 230 | { |
| 260 | .ctl_name = NET_TCP_MAX_TW_BUCKETS, | 231 | .ctl_name = NET_TCP_MAX_TW_BUCKETS, |
| 261 | .procname = "tcp_max_tw_buckets", | 232 | .procname = "tcp_max_tw_buckets", |
| 262 | .data = &sysctl_tcp_max_tw_buckets, | 233 | .data = &tcp_death_row.sysctl_max_tw_buckets, |
| 263 | .maxlen = sizeof(int), | 234 | .maxlen = sizeof(int), |
| 264 | .mode = 0644, | 235 | .mode = 0644, |
| 265 | .proc_handler = &proc_dointvec | 236 | .proc_handler = &proc_dointvec |
| @@ -363,7 +334,7 @@ ctl_table ipv4_table[] = { | |||
| 363 | { | 334 | { |
| 364 | .ctl_name = NET_TCP_TW_RECYCLE, | 335 | .ctl_name = NET_TCP_TW_RECYCLE, |
| 365 | .procname = "tcp_tw_recycle", | 336 | .procname = "tcp_tw_recycle", |
| 366 | .data = &sysctl_tcp_tw_recycle, | 337 | .data = &tcp_death_row.sysctl_tw_recycle, |
| 367 | .maxlen = sizeof(int), | 338 | .maxlen = sizeof(int), |
| 368 | .mode = 0644, | 339 | .mode = 0644, |
| 369 | .proc_handler = &proc_dointvec | 340 | .proc_handler = &proc_dointvec |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 69b1fcf70077..02fdda68718d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
| @@ -269,13 +269,12 @@ | |||
| 269 | 269 | ||
| 270 | int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; | 270 | int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; |
| 271 | 271 | ||
| 272 | DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); | 272 | DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly; |
| 273 | |||
| 274 | kmem_cache_t *tcp_bucket_cachep; | ||
| 275 | kmem_cache_t *tcp_timewait_cachep; | ||
| 276 | 273 | ||
| 277 | atomic_t tcp_orphan_count = ATOMIC_INIT(0); | 274 | atomic_t tcp_orphan_count = ATOMIC_INIT(0); |
| 278 | 275 | ||
| 276 | EXPORT_SYMBOL_GPL(tcp_orphan_count); | ||
| 277 | |||
| 279 | int sysctl_tcp_mem[3]; | 278 | int sysctl_tcp_mem[3]; |
| 280 | int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; | 279 | int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; |
| 281 | int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; | 280 | int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; |
| @@ -311,15 +310,6 @@ void tcp_enter_memory_pressure(void) | |||
| 311 | EXPORT_SYMBOL(tcp_enter_memory_pressure); | 310 | EXPORT_SYMBOL(tcp_enter_memory_pressure); |
| 312 | 311 | ||
| 313 | /* | 312 | /* |
| 314 | * LISTEN is a special case for poll.. | ||
| 315 | */ | ||
| 316 | static __inline__ unsigned int tcp_listen_poll(struct sock *sk, | ||
| 317 | poll_table *wait) | ||
| 318 | { | ||
| 319 | return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0; | ||
| 320 | } | ||
| 321 | |||
| 322 | /* | ||
| 323 | * Wait for a TCP event. | 313 | * Wait for a TCP event. |
| 324 | * | 314 | * |
| 325 | * Note that we don't need to lock the socket, as the upper poll layers | 315 | * Note that we don't need to lock the socket, as the upper poll layers |
| @@ -334,7 +324,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) | |||
| 334 | 324 | ||
| 335 | poll_wait(file, sk->sk_sleep, wait); | 325 | poll_wait(file, sk->sk_sleep, wait); |
| 336 | if (sk->sk_state == TCP_LISTEN) | 326 | if (sk->sk_state == TCP_LISTEN) |
| 337 | return tcp_listen_poll(sk, wait); | 327 | return inet_csk_listen_poll(sk); |
| 338 | 328 | ||
| 339 | /* Socket is not locked. We are protected from async events | 329 | /* Socket is not locked. We are protected from async events |
| 340 | by poll logic and correct handling of state changes | 330 | by poll logic and correct handling of state changes |
| @@ -457,109 +447,6 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) | |||
| 457 | return put_user(answ, (int __user *)arg); | 447 | return put_user(answ, (int __user *)arg); |
| 458 | } | 448 | } |
| 459 | 449 | ||
| 460 | |||
| 461 | int tcp_listen_start(struct sock *sk) | ||
| 462 | { | ||
| 463 | struct inet_sock *inet = inet_sk(sk); | ||
| 464 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 465 | int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE); | ||
| 466 | |||
| 467 | if (rc != 0) | ||
| 468 | return rc; | ||
| 469 | |||
| 470 | sk->sk_max_ack_backlog = 0; | ||
| 471 | sk->sk_ack_backlog = 0; | ||
| 472 | tcp_delack_init(tp); | ||
| 473 | |||
| 474 | /* There is race window here: we announce ourselves listening, | ||
| 475 | * but this transition is still not validated by get_port(). | ||
| 476 | * It is OK, because this socket enters to hash table only | ||
| 477 | * after validation is complete. | ||
| 478 | */ | ||
| 479 | sk->sk_state = TCP_LISTEN; | ||
| 480 | if (!sk->sk_prot->get_port(sk, inet->num)) { | ||
| 481 | inet->sport = htons(inet->num); | ||
| 482 | |||
| 483 | sk_dst_reset(sk); | ||
| 484 | sk->sk_prot->hash(sk); | ||
| 485 | |||
| 486 | return 0; | ||
| 487 | } | ||
| 488 | |||
| 489 | sk->sk_state = TCP_CLOSE; | ||
| 490 | reqsk_queue_destroy(&tp->accept_queue); | ||
| 491 | return -EADDRINUSE; | ||
| 492 | } | ||
| 493 | |||
| 494 | /* | ||
| 495 | * This routine closes sockets which have been at least partially | ||
| 496 | * opened, but not yet accepted. | ||
| 497 | */ | ||
| 498 | |||
| 499 | static void tcp_listen_stop (struct sock *sk) | ||
| 500 | { | ||
| 501 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 502 | struct listen_sock *lopt; | ||
| 503 | struct request_sock *acc_req; | ||
| 504 | struct request_sock *req; | ||
| 505 | int i; | ||
| 506 | |||
| 507 | tcp_delete_keepalive_timer(sk); | ||
| 508 | |||
| 509 | /* make all the listen_opt local to us */ | ||
| 510 | lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue); | ||
| 511 | acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue); | ||
| 512 | |||
| 513 | if (lopt->qlen) { | ||
| 514 | for (i = 0; i < TCP_SYNQ_HSIZE; i++) { | ||
| 515 | while ((req = lopt->syn_table[i]) != NULL) { | ||
| 516 | lopt->syn_table[i] = req->dl_next; | ||
| 517 | lopt->qlen--; | ||
| 518 | reqsk_free(req); | ||
| 519 | |||
| 520 | /* Following specs, it would be better either to send FIN | ||
| 521 | * (and enter FIN-WAIT-1, it is normal close) | ||
| 522 | * or to send active reset (abort). | ||
| 523 | * Certainly, it is pretty dangerous while synflood, but it is | ||
| 524 | * bad justification for our negligence 8) | ||
| 525 | * To be honest, we are not able to make either | ||
| 526 | * of the variants now. --ANK | ||
| 527 | */ | ||
| 528 | } | ||
| 529 | } | ||
| 530 | } | ||
| 531 | BUG_TRAP(!lopt->qlen); | ||
| 532 | |||
| 533 | kfree(lopt); | ||
| 534 | |||
| 535 | while ((req = acc_req) != NULL) { | ||
| 536 | struct sock *child = req->sk; | ||
| 537 | |||
| 538 | acc_req = req->dl_next; | ||
| 539 | |||
| 540 | local_bh_disable(); | ||
| 541 | bh_lock_sock(child); | ||
| 542 | BUG_TRAP(!sock_owned_by_user(child)); | ||
| 543 | sock_hold(child); | ||
| 544 | |||
| 545 | tcp_disconnect(child, O_NONBLOCK); | ||
| 546 | |||
| 547 | sock_orphan(child); | ||
| 548 | |||
| 549 | atomic_inc(&tcp_orphan_count); | ||
| 550 | |||
| 551 | tcp_destroy_sock(child); | ||
| 552 | |||
| 553 | bh_unlock_sock(child); | ||
| 554 | local_bh_enable(); | ||
| 555 | sock_put(child); | ||
| 556 | |||
| 557 | sk_acceptq_removed(sk); | ||
| 558 | __reqsk_free(req); | ||
| 559 | } | ||
| 560 | BUG_TRAP(!sk->sk_ack_backlog); | ||
| 561 | } | ||
| 562 | |||
| 563 | static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) | 450 | static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) |
| 564 | { | 451 | { |
| 565 | TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; | 452 | TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; |
| @@ -975,7 +862,7 @@ do_fault: | |||
| 975 | if (!skb->len) { | 862 | if (!skb->len) { |
| 976 | if (sk->sk_send_head == skb) | 863 | if (sk->sk_send_head == skb) |
| 977 | sk->sk_send_head = NULL; | 864 | sk->sk_send_head = NULL; |
| 978 | __skb_unlink(skb, skb->list); | 865 | __skb_unlink(skb, &sk->sk_write_queue); |
| 979 | sk_stream_free_skb(sk, skb); | 866 | sk_stream_free_skb(sk, skb); |
| 980 | } | 867 | } |
| 981 | 868 | ||
| @@ -1057,20 +944,21 @@ static void cleanup_rbuf(struct sock *sk, int copied) | |||
| 1057 | BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); | 944 | BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); |
| 1058 | #endif | 945 | #endif |
| 1059 | 946 | ||
| 1060 | if (tcp_ack_scheduled(tp)) { | 947 | if (inet_csk_ack_scheduled(sk)) { |
| 948 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 1061 | /* Delayed ACKs frequently hit locked sockets during bulk | 949 | /* Delayed ACKs frequently hit locked sockets during bulk |
| 1062 | * receive. */ | 950 | * receive. */ |
| 1063 | if (tp->ack.blocked || | 951 | if (icsk->icsk_ack.blocked || |
| 1064 | /* Once-per-two-segments ACK was not sent by tcp_input.c */ | 952 | /* Once-per-two-segments ACK was not sent by tcp_input.c */ |
| 1065 | tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss || | 953 | tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || |
| 1066 | /* | 954 | /* |
| 1067 | * If this read emptied read buffer, we send ACK, if | 955 | * If this read emptied read buffer, we send ACK, if |
| 1068 | * connection is not bidirectional, user drained | 956 | * connection is not bidirectional, user drained |
| 1069 | * receive buffer and there was a small segment | 957 | * receive buffer and there was a small segment |
| 1070 | * in queue. | 958 | * in queue. |
| 1071 | */ | 959 | */ |
| 1072 | (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) && | 960 | (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && |
| 1073 | !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc))) | 961 | !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc))) |
| 1074 | time_to_ack = 1; | 962 | time_to_ack = 1; |
| 1075 | } | 963 | } |
| 1076 | 964 | ||
| @@ -1572,40 +1460,6 @@ void tcp_shutdown(struct sock *sk, int how) | |||
| 1572 | } | 1460 | } |
| 1573 | } | 1461 | } |
| 1574 | 1462 | ||
| 1575 | /* | ||
| 1576 | * At this point, there should be no process reference to this | ||
| 1577 | * socket, and thus no user references at all. Therefore we | ||
| 1578 | * can assume the socket waitqueue is inactive and nobody will | ||
| 1579 | * try to jump onto it. | ||
| 1580 | */ | ||
| 1581 | void tcp_destroy_sock(struct sock *sk) | ||
| 1582 | { | ||
| 1583 | BUG_TRAP(sk->sk_state == TCP_CLOSE); | ||
| 1584 | BUG_TRAP(sock_flag(sk, SOCK_DEAD)); | ||
| 1585 | |||
| 1586 | /* It cannot be in hash table! */ | ||
| 1587 | BUG_TRAP(sk_unhashed(sk)); | ||
| 1588 | |||
| 1589 | /* If inet_sk(sk)->num is non-zero, it must be bound */ | ||
| 1590 | BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash); | ||
| 1591 | |||
| 1592 | sk->sk_prot->destroy(sk); | ||
| 1593 | |||
| 1594 | sk_stream_kill_queues(sk); | ||
| 1595 | |||
| 1596 | xfrm_sk_free_policy(sk); | ||
| 1597 | |||
| 1598 | #ifdef INET_REFCNT_DEBUG | ||
| 1599 | if (atomic_read(&sk->sk_refcnt) != 1) { | ||
| 1600 | printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", | ||
| 1601 | sk, atomic_read(&sk->sk_refcnt)); | ||
| 1602 | } | ||
| 1603 | #endif | ||
| 1604 | |||
| 1605 | atomic_dec(&tcp_orphan_count); | ||
| 1606 | sock_put(sk); | ||
| 1607 | } | ||
| 1608 | |||
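tcp_destroy_sock() is deleted here; callers switch to the generic inet_csk_destroy_sock() added elsewhere in this series. Its approximate shape, reconstructed from the removed body with the TCP-specific globals replaced by `struct proto` indirections:

```c
/* Approximate shape of the replacement (added in another file of this
 * series); the indirection through struct proto lets DCCP share it. */
void inet_csk_destroy_sock(struct sock *sk)
{
	BUG_TRAP(sk->sk_state == TCP_CLOSE);
	BUG_TRAP(sock_flag(sk, SOCK_DEAD));
	BUG_TRAP(sk_unhashed(sk));	/* cannot still be hashed */
	BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash);

	sk->sk_prot->destroy(sk);
	sk_stream_kill_queues(sk);
	xfrm_sk_free_policy(sk);

	atomic_dec(sk->sk_prot->orphan_count);	/* was &tcp_orphan_count */
	sock_put(sk);
}
```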
| 1609 | void tcp_close(struct sock *sk, long timeout) | 1463 | void tcp_close(struct sock *sk, long timeout) |
| 1610 | { | 1464 | { |
| 1611 | struct sk_buff *skb; | 1465 | struct sk_buff *skb; |
| @@ -1618,7 +1472,7 @@ void tcp_close(struct sock *sk, long timeout) | |||
| 1618 | tcp_set_state(sk, TCP_CLOSE); | 1472 | tcp_set_state(sk, TCP_CLOSE); |
| 1619 | 1473 | ||
| 1620 | /* Special case. */ | 1474 | /* Special case. */ |
| 1621 | tcp_listen_stop(sk); | 1475 | inet_csk_listen_stop(sk); |
| 1622 | 1476 | ||
| 1623 | goto adjudge_to_death; | 1477 | goto adjudge_to_death; |
| 1624 | } | 1478 | } |
| @@ -1721,12 +1575,12 @@ adjudge_to_death: | |||
| 1721 | tcp_send_active_reset(sk, GFP_ATOMIC); | 1575 | tcp_send_active_reset(sk, GFP_ATOMIC); |
| 1722 | NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER); | 1576 | NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER); |
| 1723 | } else { | 1577 | } else { |
| 1724 | int tmo = tcp_fin_time(tp); | 1578 | const int tmo = tcp_fin_time(sk); |
| 1725 | 1579 | ||
| 1726 | if (tmo > TCP_TIMEWAIT_LEN) { | 1580 | if (tmo > TCP_TIMEWAIT_LEN) { |
| 1727 | tcp_reset_keepalive_timer(sk, tcp_fin_time(tp)); | 1581 | inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk)); |
| 1728 | } else { | 1582 | } else { |
| 1729 | atomic_inc(&tcp_orphan_count); | 1583 | atomic_inc(sk->sk_prot->orphan_count); |
| 1730 | tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); | 1584 | tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); |
| 1731 | goto out; | 1585 | goto out; |
| 1732 | } | 1586 | } |
| @@ -1734,7 +1588,7 @@ adjudge_to_death: | |||
| 1734 | } | 1588 | } |
| 1735 | if (sk->sk_state != TCP_CLOSE) { | 1589 | if (sk->sk_state != TCP_CLOSE) { |
| 1736 | sk_stream_mem_reclaim(sk); | 1590 | sk_stream_mem_reclaim(sk); |
| 1737 | if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans || | 1591 | if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans || |
| 1738 | (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && | 1592 | (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && |
| 1739 | atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { | 1593 | atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { |
| 1740 | if (net_ratelimit()) | 1594 | if (net_ratelimit()) |
| @@ -1745,10 +1599,10 @@ adjudge_to_death: | |||
| 1745 | NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); | 1599 | NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); |
| 1746 | } | 1600 | } |
| 1747 | } | 1601 | } |
| 1748 | atomic_inc(&tcp_orphan_count); | 1602 | atomic_inc(sk->sk_prot->orphan_count); |
| 1749 | 1603 | ||
| 1750 | if (sk->sk_state == TCP_CLOSE) | 1604 | if (sk->sk_state == TCP_CLOSE) |
| 1751 | tcp_destroy_sock(sk); | 1605 | inet_csk_destroy_sock(sk); |
| 1752 | /* Otherwise, socket is reprieved until protocol close. */ | 1606 | /* Otherwise, socket is reprieved until protocol close. */ |
| 1753 | 1607 | ||
| 1754 | out: | 1608 | out: |
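The orphan counter is no longer reached as the `tcp_orphan_count` global but through a pointer in `struct proto`, so shared connection-sock code can bump whichever protocol's counter applies; TCP is expected to wire up `.orphan_count = &tcp_orphan_count` in `tcp_prot`. Sketch of the indirection:

```c
/* Sketch: generic code increments the per-protocol orphan counter
 * via struct proto instead of naming a TCP-specific global. */
static inline void sock_orphan_count_inc(struct sock *sk)
{
	atomic_inc(sk->sk_prot->orphan_count);
}
```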
| @@ -1769,6 +1623,7 @@ static inline int tcp_need_reset(int state) | |||
| 1769 | int tcp_disconnect(struct sock *sk, int flags) | 1623 | int tcp_disconnect(struct sock *sk, int flags) |
| 1770 | { | 1624 | { |
| 1771 | struct inet_sock *inet = inet_sk(sk); | 1625 | struct inet_sock *inet = inet_sk(sk); |
| 1626 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 1772 | struct tcp_sock *tp = tcp_sk(sk); | 1627 | struct tcp_sock *tp = tcp_sk(sk); |
| 1773 | int err = 0; | 1628 | int err = 0; |
| 1774 | int old_state = sk->sk_state; | 1629 | int old_state = sk->sk_state; |
| @@ -1778,7 +1633,7 @@ int tcp_disconnect(struct sock *sk, int flags) | |||
| 1778 | 1633 | ||
| 1779 | /* ABORT function of RFC793 */ | 1634 | /* ABORT function of RFC793 */ |
| 1780 | if (old_state == TCP_LISTEN) { | 1635 | if (old_state == TCP_LISTEN) { |
| 1781 | tcp_listen_stop(sk); | 1636 | inet_csk_listen_stop(sk); |
| 1782 | } else if (tcp_need_reset(old_state) || | 1637 | } else if (tcp_need_reset(old_state) || |
| 1783 | (tp->snd_nxt != tp->write_seq && | 1638 | (tp->snd_nxt != tp->write_seq && |
| 1784 | (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { | 1639 | (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { |
| @@ -1805,125 +1660,34 @@ int tcp_disconnect(struct sock *sk, int flags) | |||
| 1805 | tp->srtt = 0; | 1660 | tp->srtt = 0; |
| 1806 | if ((tp->write_seq += tp->max_window + 2) == 0) | 1661 | if ((tp->write_seq += tp->max_window + 2) == 0) |
| 1807 | tp->write_seq = 1; | 1662 | tp->write_seq = 1; |
| 1808 | tp->backoff = 0; | 1663 | icsk->icsk_backoff = 0; |
| 1809 | tp->snd_cwnd = 2; | 1664 | tp->snd_cwnd = 2; |
| 1810 | tp->probes_out = 0; | 1665 | icsk->icsk_probes_out = 0; |
| 1811 | tp->packets_out = 0; | 1666 | tp->packets_out = 0; |
| 1812 | tp->snd_ssthresh = 0x7fffffff; | 1667 | tp->snd_ssthresh = 0x7fffffff; |
| 1813 | tp->snd_cwnd_cnt = 0; | 1668 | tp->snd_cwnd_cnt = 0; |
| 1814 | tcp_set_ca_state(tp, TCP_CA_Open); | 1669 | tcp_set_ca_state(sk, TCP_CA_Open); |
| 1815 | tcp_clear_retrans(tp); | 1670 | tcp_clear_retrans(tp); |
| 1816 | tcp_delack_init(tp); | 1671 | inet_csk_delack_init(sk); |
| 1817 | sk->sk_send_head = NULL; | 1672 | sk->sk_send_head = NULL; |
| 1818 | tp->rx_opt.saw_tstamp = 0; | 1673 | tp->rx_opt.saw_tstamp = 0; |
| 1819 | tcp_sack_reset(&tp->rx_opt); | 1674 | tcp_sack_reset(&tp->rx_opt); |
| 1820 | __sk_dst_reset(sk); | 1675 | __sk_dst_reset(sk); |
| 1821 | 1676 | ||
| 1822 | BUG_TRAP(!inet->num || tp->bind_hash); | 1677 | BUG_TRAP(!inet->num || icsk->icsk_bind_hash); |
| 1823 | 1678 | ||
| 1824 | sk->sk_error_report(sk); | 1679 | sk->sk_error_report(sk); |
| 1825 | return err; | 1680 | return err; |
| 1826 | } | 1681 | } |
| 1827 | 1682 | ||
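The reset sequence in tcp_disconnect() shows the field migration this whole patch performs: backoff, probe, delayed-ACK, and congestion state move from `struct tcp_sock` into `struct inet_connection_sock`. The mapping, condensed into an illustrative helper (not a function the patch itself adds):

```c
/* Old tcp_sock member -> new inet_connection_sock member, shown as
 * the reset steps tcp_disconnect() now performs. */
static void disconnect_reset_icsk(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_backoff = 0;		   /* was tp->backoff */
	icsk->icsk_probes_out = 0;	   /* was tp->probes_out */
	inet_csk_delack_init(sk);	   /* was tcp_delack_init(tp) */
	tcp_set_ca_state(sk, TCP_CA_Open); /* now keyed by sock, not tcp_sock */
}
```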
| 1828 | /* | 1683 | /* |
| 1829 | * Wait for an incoming connection, avoid race | ||
| 1830 | * conditions. This must be called with the socket locked. | ||
| 1831 | */ | ||
| 1832 | static int wait_for_connect(struct sock *sk, long timeo) | ||
| 1833 | { | ||
| 1834 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 1835 | DEFINE_WAIT(wait); | ||
| 1836 | int err; | ||
| 1837 | |||
| 1838 | /* | ||
| 1839 | * True wake-one mechanism for incoming connections: only | ||
| 1840 | * one process gets woken up, not the 'whole herd'. | ||
| 1841 | * Since we do not 'race & poll' for established sockets | ||
| 1842 | * anymore, the common case will execute the loop only once. | ||
| 1843 | * | ||
| 1844 | * Subtle issue: "add_wait_queue_exclusive()" will be added | ||
| 1845 | * after any current non-exclusive waiters, and we know that | ||
| 1846 | * it will always _stay_ after any new non-exclusive waiters | ||
| 1847 | * because all non-exclusive waiters are added at the | ||
| 1848 | * beginning of the wait-queue. As such, it's ok to "drop" | ||
| 1849 | * our exclusiveness temporarily when we get woken up without | ||
| 1850 | * having to remove and re-insert us on the wait queue. | ||
| 1851 | */ | ||
| 1852 | for (;;) { | ||
| 1853 | prepare_to_wait_exclusive(sk->sk_sleep, &wait, | ||
| 1854 | TASK_INTERRUPTIBLE); | ||
| 1855 | release_sock(sk); | ||
| 1856 | if (reqsk_queue_empty(&tp->accept_queue)) | ||
| 1857 | timeo = schedule_timeout(timeo); | ||
| 1858 | lock_sock(sk); | ||
| 1859 | err = 0; | ||
| 1860 | if (!reqsk_queue_empty(&tp->accept_queue)) | ||
| 1861 | break; | ||
| 1862 | err = -EINVAL; | ||
| 1863 | if (sk->sk_state != TCP_LISTEN) | ||
| 1864 | break; | ||
| 1865 | err = sock_intr_errno(timeo); | ||
| 1866 | if (signal_pending(current)) | ||
| 1867 | break; | ||
| 1868 | err = -EAGAIN; | ||
| 1869 | if (!timeo) | ||
| 1870 | break; | ||
| 1871 | } | ||
| 1872 | finish_wait(sk->sk_sleep, &wait); | ||
| 1873 | return err; | ||
| 1874 | } | ||
| 1875 | |||
| 1876 | /* | ||
| 1877 | * This will accept the next outstanding connection. | ||
| 1878 | */ | ||
| 1879 | |||
| 1880 | struct sock *tcp_accept(struct sock *sk, int flags, int *err) | ||
| 1881 | { | ||
| 1882 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 1883 | struct sock *newsk; | ||
| 1884 | int error; | ||
| 1885 | |||
| 1886 | lock_sock(sk); | ||
| 1887 | |||
| 1888 | /* We need to make sure that this socket is listening, | ||
| 1889 | * and that it has something pending. | ||
| 1890 | */ | ||
| 1891 | error = -EINVAL; | ||
| 1892 | if (sk->sk_state != TCP_LISTEN) | ||
| 1893 | goto out_err; | ||
| 1894 | |||
| 1895 | /* Find already established connection */ | ||
| 1896 | if (reqsk_queue_empty(&tp->accept_queue)) { | ||
| 1897 | long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); | ||
| 1898 | |||
| 1899 | /* If this is a non-blocking socket, don't sleep */ | ||
| 1900 | error = -EAGAIN; | ||
| 1901 | if (!timeo) | ||
| 1902 | goto out_err; | ||
| 1903 | |||
| 1904 | error = wait_for_connect(sk, timeo); | ||
| 1905 | if (error) | ||
| 1906 | goto out_err; | ||
| 1907 | } | ||
| 1908 | |||
| 1909 | newsk = reqsk_queue_get_child(&tp->accept_queue, sk); | ||
| 1910 | BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); | ||
| 1911 | out: | ||
| 1912 | release_sock(sk); | ||
| 1913 | return newsk; | ||
| 1914 | out_err: | ||
| 1915 | newsk = NULL; | ||
| 1916 | *err = error; | ||
| 1917 | goto out; | ||
| 1918 | } | ||
| 1919 | |||
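wait_for_connect() and tcp_accept(), removed above, reappear generalized as inet_csk_wait_for_connect() and inet_csk_accept() in net/ipv4/inet_connection_sock.c. The wake-one wait loop, condensed and lightly paraphrased against the new `icsk_accept_queue` field:

```c
/* Condensed sketch of the generalized accept wait: an exclusive
 * waiter means only one accept()ing process is woken per connection. */
static int wait_for_connect_sketch(struct sock *sk, long timeo)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	DEFINE_WAIT(wait);
	int err;

	for (;;) {
		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
					  TASK_INTERRUPTIBLE);
		release_sock(sk);
		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
			timeo = schedule_timeout(timeo);
		lock_sock(sk);
		err = 0;
		if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
			break;		/* a connection arrived */
		err = -EINVAL;
		if (sk->sk_state != TCP_LISTEN)
			break;		/* listener went away under us */
		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			break;
		err = -EAGAIN;
		if (!timeo)
			break;		/* timed out */
	}
	finish_wait(sk->sk_sleep, &wait);
	return err;
}
```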
| 1920 | /* | ||
| 1921 | * Socket option code for TCP. | 1684 | * Socket option code for TCP. |
| 1922 | */ | 1685 | */ |
| 1923 | int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | 1686 | int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, |
| 1924 | int optlen) | 1687 | int optlen) |
| 1925 | { | 1688 | { |
| 1926 | struct tcp_sock *tp = tcp_sk(sk); | 1689 | struct tcp_sock *tp = tcp_sk(sk); |
| 1690 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 1927 | int val; | 1691 | int val; |
| 1928 | int err = 0; | 1692 | int err = 0; |
| 1929 | 1693 | ||
| @@ -1945,7 +1709,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
| 1945 | name[val] = 0; | 1709 | name[val] = 0; |
| 1946 | 1710 | ||
| 1947 | lock_sock(sk); | 1711 | lock_sock(sk); |
| 1948 | err = tcp_set_congestion_control(tp, name); | 1712 | err = tcp_set_congestion_control(sk, name); |
| 1949 | release_sock(sk); | 1713 | release_sock(sk); |
| 1950 | return err; | 1714 | return err; |
| 1951 | } | 1715 | } |
| @@ -2022,7 +1786,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
| 2022 | elapsed = tp->keepalive_time - elapsed; | 1786 | elapsed = tp->keepalive_time - elapsed; |
| 2023 | else | 1787 | else |
| 2024 | elapsed = 0; | 1788 | elapsed = 0; |
| 2025 | tcp_reset_keepalive_timer(sk, elapsed); | 1789 | inet_csk_reset_keepalive_timer(sk, elapsed); |
| 2026 | } | 1790 | } |
| 2027 | } | 1791 | } |
| 2028 | break; | 1792 | break; |
| @@ -2042,7 +1806,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
| 2042 | if (val < 1 || val > MAX_TCP_SYNCNT) | 1806 | if (val < 1 || val > MAX_TCP_SYNCNT) |
| 2043 | err = -EINVAL; | 1807 | err = -EINVAL; |
| 2044 | else | 1808 | else |
| 2045 | tp->syn_retries = val; | 1809 | icsk->icsk_syn_retries = val; |
| 2046 | break; | 1810 | break; |
| 2047 | 1811 | ||
| 2048 | case TCP_LINGER2: | 1812 | case TCP_LINGER2: |
| @@ -2055,15 +1819,15 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
| 2055 | break; | 1819 | break; |
| 2056 | 1820 | ||
| 2057 | case TCP_DEFER_ACCEPT: | 1821 | case TCP_DEFER_ACCEPT: |
| 2058 | tp->defer_accept = 0; | 1822 | icsk->icsk_accept_queue.rskq_defer_accept = 0; |
| 2059 | if (val > 0) { | 1823 | if (val > 0) { |
| 2060 | /* Translate value in seconds to number of | 1824 | /* Translate value in seconds to number of |
| 2061 | * retransmits */ | 1825 | * retransmits */ |
| 2062 | while (tp->defer_accept < 32 && | 1826 | while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && |
| 2063 | val > ((TCP_TIMEOUT_INIT / HZ) << | 1827 | val > ((TCP_TIMEOUT_INIT / HZ) << |
| 2064 | tp->defer_accept)) | 1828 | icsk->icsk_accept_queue.rskq_defer_accept)) |
| 2065 | tp->defer_accept++; | 1829 | icsk->icsk_accept_queue.rskq_defer_accept++; |
| 2066 | tp->defer_accept++; | 1830 | icsk->icsk_accept_queue.rskq_defer_accept++; |
| 2067 | } | 1831 | } |
| 2068 | break; | 1832 | break; |
| 2069 | 1833 | ||
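The TCP_DEFER_ACCEPT case converts a user-supplied timeout in seconds into a count of SYN-ACK retransmissions, assuming exponential backoff starting at TCP_TIMEOUT_INIT/HZ (3 seconds on a stock 2.6 kernel). A worked restatement:

```c
/* Seconds -> retransmission-count translation from the hunk above,
 * assuming TCP_TIMEOUT_INIT / HZ == 3. */
static int defer_accept_retrans(int seconds)
{
	int r = 0;

	if (seconds <= 0)
		return 0;			/* option disabled */
	while (r < 32 && seconds > (3 << r))	/* 3s, 6s, 12s, ... */
		r++;
	return r + 1;				/* kernel adds one more */
}
/* defer_accept_retrans(10) == 3; the TCP_DEFER_ACCEPT case in
 * tcp_getsockopt() below reports this back as (3 << (3 - 1)) = 12s. */
```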
| @@ -2081,16 +1845,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
| 2081 | 1845 | ||
| 2082 | case TCP_QUICKACK: | 1846 | case TCP_QUICKACK: |
| 2083 | if (!val) { | 1847 | if (!val) { |
| 2084 | tp->ack.pingpong = 1; | 1848 | icsk->icsk_ack.pingpong = 1; |
| 2085 | } else { | 1849 | } else { |
| 2086 | tp->ack.pingpong = 0; | 1850 | icsk->icsk_ack.pingpong = 0; |
| 2087 | if ((1 << sk->sk_state) & | 1851 | if ((1 << sk->sk_state) & |
| 2088 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && | 1852 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && |
| 2089 | tcp_ack_scheduled(tp)) { | 1853 | inet_csk_ack_scheduled(sk)) { |
| 2090 | tp->ack.pending |= TCP_ACK_PUSHED; | 1854 | icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; |
| 2091 | cleanup_rbuf(sk, 1); | 1855 | cleanup_rbuf(sk, 1); |
| 2092 | if (!(val & 1)) | 1856 | if (!(val & 1)) |
| 2093 | tp->ack.pingpong = 1; | 1857 | icsk->icsk_ack.pingpong = 1; |
| 2094 | } | 1858 | } |
| 2095 | } | 1859 | } |
| 2096 | break; | 1860 | break; |
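From user space, TCP_QUICKACK is the knob that flips the `icsk_ack.pingpong` bit handled above. A minimal Linux-specific illustration, with error handling trimmed:

```c
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static void set_quickack(int fd, int on)
{
	/* on != 0 clears icsk_ack.pingpong (ACK immediately);
	 * on == 0 sets it (delayed-ACK "pingpong" mode). */
	setsockopt(fd, IPPROTO_TCP, TCP_QUICKACK, &on, sizeof(on));
}
```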
| @@ -2107,15 +1871,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
| 2107 | void tcp_get_info(struct sock *sk, struct tcp_info *info) | 1871 | void tcp_get_info(struct sock *sk, struct tcp_info *info) |
| 2108 | { | 1872 | { |
| 2109 | struct tcp_sock *tp = tcp_sk(sk); | 1873 | struct tcp_sock *tp = tcp_sk(sk); |
| 1874 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 2110 | u32 now = tcp_time_stamp; | 1875 | u32 now = tcp_time_stamp; |
| 2111 | 1876 | ||
| 2112 | memset(info, 0, sizeof(*info)); | 1877 | memset(info, 0, sizeof(*info)); |
| 2113 | 1878 | ||
| 2114 | info->tcpi_state = sk->sk_state; | 1879 | info->tcpi_state = sk->sk_state; |
| 2115 | info->tcpi_ca_state = tp->ca_state; | 1880 | info->tcpi_ca_state = icsk->icsk_ca_state; |
| 2116 | info->tcpi_retransmits = tp->retransmits; | 1881 | info->tcpi_retransmits = icsk->icsk_retransmits; |
| 2117 | info->tcpi_probes = tp->probes_out; | 1882 | info->tcpi_probes = icsk->icsk_probes_out; |
| 2118 | info->tcpi_backoff = tp->backoff; | 1883 | info->tcpi_backoff = icsk->icsk_backoff; |
| 2119 | 1884 | ||
| 2120 | if (tp->rx_opt.tstamp_ok) | 1885 | if (tp->rx_opt.tstamp_ok) |
| 2121 | info->tcpi_options |= TCPI_OPT_TIMESTAMPS; | 1886 | info->tcpi_options |= TCPI_OPT_TIMESTAMPS; |
| @@ -2130,10 +1895,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) | |||
| 2130 | if (tp->ecn_flags&TCP_ECN_OK) | 1895 | if (tp->ecn_flags&TCP_ECN_OK) |
| 2131 | info->tcpi_options |= TCPI_OPT_ECN; | 1896 | info->tcpi_options |= TCPI_OPT_ECN; |
| 2132 | 1897 | ||
| 2133 | info->tcpi_rto = jiffies_to_usecs(tp->rto); | 1898 | info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); |
| 2134 | info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); | 1899 | info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); |
| 2135 | info->tcpi_snd_mss = tp->mss_cache; | 1900 | info->tcpi_snd_mss = tp->mss_cache; |
| 2136 | info->tcpi_rcv_mss = tp->ack.rcv_mss; | 1901 | info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; |
| 2137 | 1902 | ||
| 2138 | info->tcpi_unacked = tp->packets_out; | 1903 | info->tcpi_unacked = tp->packets_out; |
| 2139 | info->tcpi_sacked = tp->sacked_out; | 1904 | info->tcpi_sacked = tp->sacked_out; |
| @@ -2142,7 +1907,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) | |||
| 2142 | info->tcpi_fackets = tp->fackets_out; | 1907 | info->tcpi_fackets = tp->fackets_out; |
| 2143 | 1908 | ||
| 2144 | info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); | 1909 | info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); |
| 2145 | info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime); | 1910 | info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); |
| 2146 | info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); | 1911 | info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); |
| 2147 | 1912 | ||
| 2148 | info->tcpi_pmtu = tp->pmtu_cookie; | 1913 | info->tcpi_pmtu = tp->pmtu_cookie; |
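tcp_get_info() now sources its RTO, ACK, and probe statistics from the inet_connection_sock; user space sees the same `struct tcp_info` through getsockopt(TCP_INFO). A minimal reader, for illustration:

```c
#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Print a few of the fields filled in by tcp_get_info() above. */
static void print_tcp_info(int fd)
{
	struct tcp_info ti;
	socklen_t len = sizeof(ti);

	memset(&ti, 0, sizeof(ti));
	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
		printf("rto=%uus ato=%uus rcv_mss=%u\n",
		       ti.tcpi_rto, ti.tcpi_ato, ti.tcpi_rcv_mss);
}
```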
| @@ -2165,6 +1930,7 @@ EXPORT_SYMBOL_GPL(tcp_get_info); | |||
| 2165 | int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | 1930 | int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, |
| 2166 | int __user *optlen) | 1931 | int __user *optlen) |
| 2167 | { | 1932 | { |
| 1933 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 2168 | struct tcp_sock *tp = tcp_sk(sk); | 1934 | struct tcp_sock *tp = tcp_sk(sk); |
| 2169 | int val, len; | 1935 | int val, len; |
| 2170 | 1936 | ||
| @@ -2202,7 +1968,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
| 2202 | val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; | 1968 | val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; |
| 2203 | break; | 1969 | break; |
| 2204 | case TCP_SYNCNT: | 1970 | case TCP_SYNCNT: |
| 2205 | val = tp->syn_retries ? : sysctl_tcp_syn_retries; | 1971 | val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; |
| 2206 | break; | 1972 | break; |
| 2207 | case TCP_LINGER2: | 1973 | case TCP_LINGER2: |
| 2208 | val = tp->linger2; | 1974 | val = tp->linger2; |
| @@ -2210,8 +1976,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
| 2210 | val = (val ? : sysctl_tcp_fin_timeout) / HZ; | 1976 | val = (val ? : sysctl_tcp_fin_timeout) / HZ; |
| 2211 | break; | 1977 | break; |
| 2212 | case TCP_DEFER_ACCEPT: | 1978 | case TCP_DEFER_ACCEPT: |
| 2213 | val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) << | 1979 | val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 : |
| 2214 | (tp->defer_accept - 1)); | 1980 | ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); |
| 2215 | break; | 1981 | break; |
| 2216 | case TCP_WINDOW_CLAMP: | 1982 | case TCP_WINDOW_CLAMP: |
| 2217 | val = tp->window_clamp; | 1983 | val = tp->window_clamp; |
| @@ -2232,7 +1998,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
| 2232 | return 0; | 1998 | return 0; |
| 2233 | } | 1999 | } |
| 2234 | case TCP_QUICKACK: | 2000 | case TCP_QUICKACK: |
| 2235 | val = !tp->ack.pingpong; | 2001 | val = !icsk->icsk_ack.pingpong; |
| 2236 | break; | 2002 | break; |
| 2237 | 2003 | ||
| 2238 | case TCP_CONGESTION: | 2004 | case TCP_CONGESTION: |
| @@ -2241,7 +2007,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
| 2241 | len = min_t(unsigned int, len, TCP_CA_NAME_MAX); | 2007 | len = min_t(unsigned int, len, TCP_CA_NAME_MAX); |
| 2242 | if (put_user(len, optlen)) | 2008 | if (put_user(len, optlen)) |
| 2243 | return -EFAULT; | 2009 | return -EFAULT; |
| 2244 | if (copy_to_user(optval, tp->ca_ops->name, len)) | 2010 | if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) |
| 2245 | return -EFAULT; | 2011 | return -EFAULT; |
| 2246 | return 0; | 2012 | return 0; |
| 2247 | default: | 2013 | default: |
| @@ -2278,79 +2044,72 @@ void __init tcp_init(void) | |||
| 2278 | __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), | 2044 | __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), |
| 2279 | sizeof(skb->cb)); | 2045 | sizeof(skb->cb)); |
| 2280 | 2046 | ||
| 2281 | tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket", | 2047 | tcp_hashinfo.bind_bucket_cachep = |
| 2282 | sizeof(struct tcp_bind_bucket), | 2048 | kmem_cache_create("tcp_bind_bucket", |
| 2283 | 0, SLAB_HWCACHE_ALIGN, | 2049 | sizeof(struct inet_bind_bucket), 0, |
| 2284 | NULL, NULL); | 2050 | SLAB_HWCACHE_ALIGN, NULL, NULL); |
| 2285 | if (!tcp_bucket_cachep) | 2051 | if (!tcp_hashinfo.bind_bucket_cachep) |
| 2286 | panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); | 2052 | panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); |
| 2287 | 2053 | ||
| 2288 | tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket", | ||
| 2289 | sizeof(struct tcp_tw_bucket), | ||
| 2290 | 0, SLAB_HWCACHE_ALIGN, | ||
| 2291 | NULL, NULL); | ||
| 2292 | if (!tcp_timewait_cachep) | ||
| 2293 | panic("tcp_init: Cannot alloc tcp_tw_bucket cache."); | ||
| 2294 | |||
| 2295 | /* Size and allocate the main established and bind bucket | 2054 | /* Size and allocate the main established and bind bucket |
| 2296 | * hash tables. | 2055 | * hash tables. |
| 2297 | * | 2056 | * |
| 2298 | * The methodology is similar to that of the buffer cache. | 2057 | * The methodology is similar to that of the buffer cache. |
| 2299 | */ | 2058 | */ |
| 2300 | tcp_ehash = (struct tcp_ehash_bucket *) | 2059 | tcp_hashinfo.ehash = |
| 2301 | alloc_large_system_hash("TCP established", | 2060 | alloc_large_system_hash("TCP established", |
| 2302 | sizeof(struct tcp_ehash_bucket), | 2061 | sizeof(struct inet_ehash_bucket), |
| 2303 | thash_entries, | 2062 | thash_entries, |
| 2304 | (num_physpages >= 128 * 1024) ? | 2063 | (num_physpages >= 128 * 1024) ? |
| 2305 | (25 - PAGE_SHIFT) : | 2064 | (25 - PAGE_SHIFT) : |
| 2306 | (27 - PAGE_SHIFT), | 2065 | (27 - PAGE_SHIFT), |
| 2307 | HASH_HIGHMEM, | 2066 | HASH_HIGHMEM, |
| 2308 | &tcp_ehash_size, | 2067 | &tcp_hashinfo.ehash_size, |
| 2309 | NULL, | 2068 | NULL, |
| 2310 | 0); | 2069 | 0); |
| 2311 | tcp_ehash_size = (1 << tcp_ehash_size) >> 1; | 2070 | tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1; |
| 2312 | for (i = 0; i < (tcp_ehash_size << 1); i++) { | 2071 | for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) { |
| 2313 | rwlock_init(&tcp_ehash[i].lock); | 2072 | rwlock_init(&tcp_hashinfo.ehash[i].lock); |
| 2314 | INIT_HLIST_HEAD(&tcp_ehash[i].chain); | 2073 | INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); |
| 2315 | } | 2074 | } |
| 2316 | 2075 | ||
| 2317 | tcp_bhash = (struct tcp_bind_hashbucket *) | 2076 | tcp_hashinfo.bhash = |
| 2318 | alloc_large_system_hash("TCP bind", | 2077 | alloc_large_system_hash("TCP bind", |
| 2319 | sizeof(struct tcp_bind_hashbucket), | 2078 | sizeof(struct inet_bind_hashbucket), |
| 2320 | tcp_ehash_size, | 2079 | tcp_hashinfo.ehash_size, |
| 2321 | (num_physpages >= 128 * 1024) ? | 2080 | (num_physpages >= 128 * 1024) ? |
| 2322 | (25 - PAGE_SHIFT) : | 2081 | (25 - PAGE_SHIFT) : |
| 2323 | (27 - PAGE_SHIFT), | 2082 | (27 - PAGE_SHIFT), |
| 2324 | HASH_HIGHMEM, | 2083 | HASH_HIGHMEM, |
| 2325 | &tcp_bhash_size, | 2084 | &tcp_hashinfo.bhash_size, |
| 2326 | NULL, | 2085 | NULL, |
| 2327 | 64 * 1024); | 2086 | 64 * 1024); |
| 2328 | tcp_bhash_size = 1 << tcp_bhash_size; | 2087 | tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; |
| 2329 | for (i = 0; i < tcp_bhash_size; i++) { | 2088 | for (i = 0; i < tcp_hashinfo.bhash_size; i++) { |
| 2330 | spin_lock_init(&tcp_bhash[i].lock); | 2089 | spin_lock_init(&tcp_hashinfo.bhash[i].lock); |
| 2331 | INIT_HLIST_HEAD(&tcp_bhash[i].chain); | 2090 | INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); |
| 2332 | } | 2091 | } |
| 2333 | 2092 | ||
| 2334 | /* Try to be a bit smarter and adjust defaults depending | 2093 | /* Try to be a bit smarter and adjust defaults depending |
| 2335 | * on available memory. | 2094 | * on available memory. |
| 2336 | */ | 2095 | */ |
| 2337 | for (order = 0; ((1 << order) << PAGE_SHIFT) < | 2096 | for (order = 0; ((1 << order) << PAGE_SHIFT) < |
| 2338 | (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket)); | 2097 | (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); |
| 2339 | order++) | 2098 | order++) |
| 2340 | ; | 2099 | ; |
| 2341 | if (order >= 4) { | 2100 | if (order >= 4) { |
| 2342 | sysctl_local_port_range[0] = 32768; | 2101 | sysctl_local_port_range[0] = 32768; |
| 2343 | sysctl_local_port_range[1] = 61000; | 2102 | sysctl_local_port_range[1] = 61000; |
| 2344 | sysctl_tcp_max_tw_buckets = 180000; | 2103 | tcp_death_row.sysctl_max_tw_buckets = 180000; |
| 2345 | sysctl_tcp_max_orphans = 4096 << (order - 4); | 2104 | sysctl_tcp_max_orphans = 4096 << (order - 4); |
| 2346 | sysctl_max_syn_backlog = 1024; | 2105 | sysctl_max_syn_backlog = 1024; |
| 2347 | } else if (order < 3) { | 2106 | } else if (order < 3) { |
| 2348 | sysctl_local_port_range[0] = 1024 * (3 - order); | 2107 | sysctl_local_port_range[0] = 1024 * (3 - order); |
| 2349 | sysctl_tcp_max_tw_buckets >>= (3 - order); | 2108 | tcp_death_row.sysctl_max_tw_buckets >>= (3 - order); |
| 2350 | sysctl_tcp_max_orphans >>= (3 - order); | 2109 | sysctl_tcp_max_orphans >>= (3 - order); |
| 2351 | sysctl_max_syn_backlog = 128; | 2110 | sysctl_max_syn_backlog = 128; |
| 2352 | } | 2111 | } |
| 2353 | tcp_port_rover = sysctl_local_port_range[0] - 1; | 2112 | tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1; |
| 2354 | 2113 | ||
| 2355 | sysctl_tcp_mem[0] = 768 << order; | 2114 | sysctl_tcp_mem[0] = 768 << order; |
| 2356 | sysctl_tcp_mem[1] = 1024 << order; | 2115 | sysctl_tcp_mem[1] = 1024 << order; |
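The heuristic above picks `order` as the smallest page order whose span holds the whole bind hash, then scales the port range, time-wait, and orphan defaults from it. A stand-alone arithmetic check, assuming 4 KiB pages and a 16-byte bucket (both illustrative assumptions, not values from the diff):

```c
#include <stdio.h>

int main(void)
{
	/* assumed: 65536 buckets of 16 bytes => 1 MiB of bhash */
	unsigned long bhash_bytes = 65536UL * 16;
	int order;

	/* same loop shape as tcp_init() above, with PAGE_SHIFT == 12 */
	for (order = 0; (1UL << order) << 12 < bhash_bytes; order++)
		;
	printf("order=%d -> %s defaults\n", order,
	       order >= 4 ? "large-memory" : "small-memory");
	return 0;	/* here: order == 8, large-memory branch */
}
```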
| @@ -2365,14 +2124,12 @@ void __init tcp_init(void) | |||
| 2365 | 2124 | ||
| 2366 | printk(KERN_INFO "TCP: Hash tables configured " | 2125 | printk(KERN_INFO "TCP: Hash tables configured " |
| 2367 | "(established %d bind %d)\n", | 2126 | "(established %d bind %d)\n", |
| 2368 | tcp_ehash_size << 1, tcp_bhash_size); | 2127 | tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size); |
| 2369 | 2128 | ||
| 2370 | tcp_register_congestion_control(&tcp_reno); | 2129 | tcp_register_congestion_control(&tcp_reno); |
| 2371 | } | 2130 | } |
| 2372 | 2131 | ||
| 2373 | EXPORT_SYMBOL(tcp_accept); | ||
| 2374 | EXPORT_SYMBOL(tcp_close); | 2132 | EXPORT_SYMBOL(tcp_close); |
| 2375 | EXPORT_SYMBOL(tcp_destroy_sock); | ||
| 2376 | EXPORT_SYMBOL(tcp_disconnect); | 2133 | EXPORT_SYMBOL(tcp_disconnect); |
| 2377 | EXPORT_SYMBOL(tcp_getsockopt); | 2134 | EXPORT_SYMBOL(tcp_getsockopt); |
| 2378 | EXPORT_SYMBOL(tcp_ioctl); | 2135 | EXPORT_SYMBOL(tcp_ioctl); |
| @@ -2384,4 +2141,3 @@ EXPORT_SYMBOL(tcp_sendpage); | |||
| 2384 | EXPORT_SYMBOL(tcp_setsockopt); | 2141 | EXPORT_SYMBOL(tcp_setsockopt); |
| 2385 | EXPORT_SYMBOL(tcp_shutdown); | 2142 | EXPORT_SYMBOL(tcp_shutdown); |
| 2386 | EXPORT_SYMBOL(tcp_statistics); | 2143 | EXPORT_SYMBOL(tcp_statistics); |
| 2387 | EXPORT_SYMBOL(tcp_timewait_cachep); | ||
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index ec38d45d6649..b940346de4e7 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c | |||
| @@ -86,11 +86,11 @@ static inline void bictcp_reset(struct bictcp *ca) | |||
| 86 | ca->delayed_ack = 2 << ACK_RATIO_SHIFT; | 86 | ca->delayed_ack = 2 << ACK_RATIO_SHIFT; |
| 87 | } | 87 | } |
| 88 | 88 | ||
| 89 | static void bictcp_init(struct tcp_sock *tp) | 89 | static void bictcp_init(struct sock *sk) |
| 90 | { | 90 | { |
| 91 | bictcp_reset(tcp_ca(tp)); | 91 | bictcp_reset(inet_csk_ca(sk)); |
| 92 | if (initial_ssthresh) | 92 | if (initial_ssthresh) |
| 93 | tp->snd_ssthresh = initial_ssthresh; | 93 | tcp_sk(sk)->snd_ssthresh = initial_ssthresh; |
| 94 | } | 94 | } |
| 95 | 95 | ||
| 96 | /* | 96 | /* |
| @@ -156,9 +156,10 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) | |||
| 156 | 156 | ||
| 157 | 157 | ||
| 158 | /* Detect low utilization in congestion avoidance */ | 158 | /* Detect low utilization in congestion avoidance */ |
| 159 | static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag) | 159 | static inline void bictcp_low_utilization(struct sock *sk, int flag) |
| 160 | { | 160 | { |
| 161 | struct bictcp *ca = tcp_ca(tp); | 161 | const struct tcp_sock *tp = tcp_sk(sk); |
| 162 | struct bictcp *ca = inet_csk_ca(sk); | ||
| 162 | u32 dist, delay; | 163 | u32 dist, delay; |
| 163 | 164 | ||
| 164 | /* No time stamp */ | 165 | /* No time stamp */ |
| @@ -208,12 +209,13 @@ static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag) | |||
| 208 | 209 | ||
| 209 | } | 210 | } |
| 210 | 211 | ||
| 211 | static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack, | 212 | static void bictcp_cong_avoid(struct sock *sk, u32 ack, |
| 212 | u32 seq_rtt, u32 in_flight, int data_acked) | 213 | u32 seq_rtt, u32 in_flight, int data_acked) |
| 213 | { | 214 | { |
| 214 | struct bictcp *ca = tcp_ca(tp); | 215 | struct tcp_sock *tp = tcp_sk(sk); |
| 216 | struct bictcp *ca = inet_csk_ca(sk); | ||
| 215 | 217 | ||
| 216 | bictcp_low_utilization(tp, data_acked); | 218 | bictcp_low_utilization(sk, data_acked); |
| 217 | 219 | ||
| 218 | if (in_flight < tp->snd_cwnd) | 220 | if (in_flight < tp->snd_cwnd) |
| 219 | return; | 221 | return; |
| @@ -242,9 +244,10 @@ static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack, | |||
| 242 | * behave like Reno until low_window is reached, | 244 | * behave like Reno until low_window is reached, |
| 243 | * then increase congestion window slowly | 245 | * then increase congestion window slowly |
| 244 | */ | 246 | */ |
| 245 | static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp) | 247 | static u32 bictcp_recalc_ssthresh(struct sock *sk) |
| 246 | { | 248 | { |
| 247 | struct bictcp *ca = tcp_ca(tp); | 249 | const struct tcp_sock *tp = tcp_sk(sk); |
| 250 | struct bictcp *ca = inet_csk_ca(sk); | ||
| 248 | 251 | ||
| 249 | ca->epoch_start = 0; /* end of epoch */ | 252 | ca->epoch_start = 0; /* end of epoch */ |
| 250 | 253 | ||
| @@ -269,31 +272,34 @@ static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp) | |||
| 269 | return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); | 272 | return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); |
| 270 | } | 273 | } |
| 271 | 274 | ||
| 272 | static u32 bictcp_undo_cwnd(struct tcp_sock *tp) | 275 | static u32 bictcp_undo_cwnd(struct sock *sk) |
| 273 | { | 276 | { |
| 274 | struct bictcp *ca = tcp_ca(tp); | 277 | const struct tcp_sock *tp = tcp_sk(sk); |
| 275 | 278 | const struct bictcp *ca = inet_csk_ca(sk); | |
| 276 | return max(tp->snd_cwnd, ca->last_max_cwnd); | 279 | return max(tp->snd_cwnd, ca->last_max_cwnd); |
| 277 | } | 280 | } |
| 278 | 281 | ||
| 279 | static u32 bictcp_min_cwnd(struct tcp_sock *tp) | 282 | static u32 bictcp_min_cwnd(struct sock *sk) |
| 280 | { | 283 | { |
| 284 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 281 | return tp->snd_ssthresh; | 285 | return tp->snd_ssthresh; |
| 282 | } | 286 | } |
| 283 | 287 | ||
| 284 | static void bictcp_state(struct tcp_sock *tp, u8 new_state) | 288 | static void bictcp_state(struct sock *sk, u8 new_state) |
| 285 | { | 289 | { |
| 286 | if (new_state == TCP_CA_Loss) | 290 | if (new_state == TCP_CA_Loss) |
| 287 | bictcp_reset(tcp_ca(tp)); | 291 | bictcp_reset(inet_csk_ca(sk)); |
| 288 | } | 292 | } |
| 289 | 293 | ||
| 290 | /* Track delayed acknowledgement ratio using sliding window | 294 | /* Track delayed acknowledgement ratio using sliding window |
| 291 | * ratio = (15*ratio + sample) / 16 | 295 | * ratio = (15*ratio + sample) / 16 |
| 292 | */ | 296 | */ |
| 293 | static void bictcp_acked(struct tcp_sock *tp, u32 cnt) | 297 | static void bictcp_acked(struct sock *sk, u32 cnt) |
| 294 | { | 298 | { |
| 295 | if (cnt > 0 && tp->ca_state == TCP_CA_Open) { | 299 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 296 | struct bictcp *ca = tcp_ca(tp); | 300 | |
| 301 | if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { | ||
| 302 | struct bictcp *ca = inet_csk_ca(sk); | ||
| 297 | cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; | 303 | cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; |
| 298 | ca->delayed_ack += cnt; | 304 | ca->delayed_ack += cnt; |
| 299 | } | 305 | } |
| @@ -314,7 +320,7 @@ static struct tcp_congestion_ops bictcp = { | |||
| 314 | 320 | ||
| 315 | static int __init bictcp_register(void) | 321 | static int __init bictcp_register(void) |
| 316 | { | 322 | { |
| 317 | BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE); | 323 | BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); |
| 318 | return tcp_register_congestion_control(&bictcp); | 324 | return tcp_register_congestion_control(&bictcp); |
| 319 | } | 325 | } |
| 320 | 326 | ||
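The tcp_bic.c conversion shows the pattern every congestion module now follows: hooks take a `struct sock *`, and private state lives in the connection sock's scratch area via inet_csk_ca(), guarded at registration by ICSK_CA_PRIV_SIZE. A skeletal module-side sketch with a hypothetical state struct:

```c
/* Hypothetical module state; real registration via
 * tcp_register_congestion_control() is omitted here. */
struct my_ca_state {
	u32 last_max_cwnd;
	u32 epoch_start;
};

static void my_ca_init(struct sock *sk)
{
	/* private state is carved out of icsk_ca_priv */
	struct my_ca_state *ca = inet_csk_ca(sk);

	ca->last_max_cwnd = 0;
	ca->epoch_start = 0;
}

static int __init my_ca_register(void)
{
	/* compile-time guard mirrored from bictcp_register() above */
	BUG_ON(sizeof(struct my_ca_state) > ICSK_CA_PRIV_SIZE);
	return 0;
}
```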
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 4970d10a7785..bbf2d6624e89 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c | |||
| @@ -73,33 +73,36 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) | |||
| 73 | EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); | 73 | EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); |
| 74 | 74 | ||
| 75 | /* Assign choice of congestion control. */ | 75 | /* Assign choice of congestion control. */ |
| 76 | void tcp_init_congestion_control(struct tcp_sock *tp) | 76 | void tcp_init_congestion_control(struct sock *sk) |
| 77 | { | 77 | { |
| 78 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 78 | struct tcp_congestion_ops *ca; | 79 | struct tcp_congestion_ops *ca; |
| 79 | 80 | ||
| 80 | if (tp->ca_ops != &tcp_init_congestion_ops) | 81 | if (icsk->icsk_ca_ops != &tcp_init_congestion_ops) |
| 81 | return; | 82 | return; |
| 82 | 83 | ||
| 83 | rcu_read_lock(); | 84 | rcu_read_lock(); |
| 84 | list_for_each_entry_rcu(ca, &tcp_cong_list, list) { | 85 | list_for_each_entry_rcu(ca, &tcp_cong_list, list) { |
| 85 | if (try_module_get(ca->owner)) { | 86 | if (try_module_get(ca->owner)) { |
| 86 | tp->ca_ops = ca; | 87 | icsk->icsk_ca_ops = ca; |
| 87 | break; | 88 | break; |
| 88 | } | 89 | } |
| 89 | 90 | ||
| 90 | } | 91 | } |
| 91 | rcu_read_unlock(); | 92 | rcu_read_unlock(); |
| 92 | 93 | ||
| 93 | if (tp->ca_ops->init) | 94 | if (icsk->icsk_ca_ops->init) |
| 94 | tp->ca_ops->init(tp); | 95 | icsk->icsk_ca_ops->init(sk); |
| 95 | } | 96 | } |
| 96 | 97 | ||
| 97 | /* Manage refcounts on socket close. */ | 98 | /* Manage refcounts on socket close. */ |
| 98 | void tcp_cleanup_congestion_control(struct tcp_sock *tp) | 99 | void tcp_cleanup_congestion_control(struct sock *sk) |
| 99 | { | 100 | { |
| 100 | if (tp->ca_ops->release) | 101 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 101 | tp->ca_ops->release(tp); | 102 | |
| 102 | module_put(tp->ca_ops->owner); | 103 | if (icsk->icsk_ca_ops->release) |
| 104 | icsk->icsk_ca_ops->release(sk); | ||
| 105 | module_put(icsk->icsk_ca_ops->owner); | ||
| 103 | } | 106 | } |
| 104 | 107 | ||
| 105 | /* Used by sysctl to change default congestion control */ | 108 | /* Used by sysctl to change default congestion control */ |
| @@ -143,14 +146,15 @@ void tcp_get_default_congestion_control(char *name) | |||
| 143 | } | 146 | } |
| 144 | 147 | ||
| 145 | /* Change congestion control for socket */ | 148 | /* Change congestion control for socket */ |
| 146 | int tcp_set_congestion_control(struct tcp_sock *tp, const char *name) | 149 | int tcp_set_congestion_control(struct sock *sk, const char *name) |
| 147 | { | 150 | { |
| 151 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 148 | struct tcp_congestion_ops *ca; | 152 | struct tcp_congestion_ops *ca; |
| 149 | int err = 0; | 153 | int err = 0; |
| 150 | 154 | ||
| 151 | rcu_read_lock(); | 155 | rcu_read_lock(); |
| 152 | ca = tcp_ca_find(name); | 156 | ca = tcp_ca_find(name); |
| 153 | if (ca == tp->ca_ops) | 157 | if (ca == icsk->icsk_ca_ops) |
| 154 | goto out; | 158 | goto out; |
| 155 | 159 | ||
| 156 | if (!ca) | 160 | if (!ca) |
| @@ -160,10 +164,10 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name) | |||
| 160 | err = -EBUSY; | 164 | err = -EBUSY; |
| 161 | 165 | ||
| 162 | else { | 166 | else { |
| 163 | tcp_cleanup_congestion_control(tp); | 167 | tcp_cleanup_congestion_control(sk); |
| 164 | tp->ca_ops = ca; | 168 | icsk->icsk_ca_ops = ca; |
| 165 | if (tp->ca_ops->init) | 169 | if (icsk->icsk_ca_ops->init) |
| 166 | tp->ca_ops->init(tp); | 170 | icsk->icsk_ca_ops->init(sk); |
| 167 | } | 171 | } |
| 168 | out: | 172 | out: |
| 169 | rcu_read_unlock(); | 173 | rcu_read_unlock(); |
| @@ -177,9 +181,11 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name) | |||
| 177 | /* This is Jacobson's slow start and congestion avoidance. | 181 | /* This is Jacobson's slow start and congestion avoidance. |
| 178 | * SIGCOMM '88, p. 328. | 182 | * SIGCOMM '88, p. 328. |
| 179 | */ | 183 | */ |
| 180 | void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, | 184 | void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, |
| 181 | int flag) | 185 | int flag) |
| 182 | { | 186 | { |
| 187 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 188 | |||
| 183 | if (in_flight < tp->snd_cwnd) | 189 | if (in_flight < tp->snd_cwnd) |
| 184 | return; | 190 | return; |
| 185 | 191 | ||
| @@ -202,15 +208,17 @@ void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, | |||
| 202 | EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); | 208 | EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); |
| 203 | 209 | ||
| 204 | /* Slow start threshold is half the congestion window (min 2) */ | 210 | /* Slow start threshold is half the congestion window (min 2) */ |
| 205 | u32 tcp_reno_ssthresh(struct tcp_sock *tp) | 211 | u32 tcp_reno_ssthresh(struct sock *sk) |
| 206 | { | 212 | { |
| 213 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 207 | return max(tp->snd_cwnd >> 1U, 2U); | 214 | return max(tp->snd_cwnd >> 1U, 2U); |
| 208 | } | 215 | } |
| 209 | EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); | 216 | EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); |
| 210 | 217 | ||
| 211 | /* Lower bound on congestion window. */ | 218 | /* Lower bound on congestion window. */ |
| 212 | u32 tcp_reno_min_cwnd(struct tcp_sock *tp) | 219 | u32 tcp_reno_min_cwnd(struct sock *sk) |
| 213 | { | 220 | { |
| 221 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 214 | return tp->snd_ssthresh/2; | 222 | return tp->snd_ssthresh/2; |
| 215 | } | 223 | } |
| 216 | EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); | 224 | EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); |
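After this conversion a congestion-control module declares its hooks against `struct sock *` and can still reuse the exported Reno helpers. An illustrative ops table (module name and ssthresh body are hypothetical; the fields match those this series uses):

```c
static u32 my_ssthresh(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	return max(tp->snd_cwnd >> 1U, 2U);	/* Reno-style halving */
}

static struct tcp_congestion_ops my_cong_ops = {
	.name		= "example",
	.owner		= THIS_MODULE,
	.ssthresh	= my_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,	/* exported helper above */
	.min_cwnd	= tcp_reno_min_cwnd,
};
```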
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index f66945cb158f..c148c1081880 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * tcp_diag.c Module for monitoring TCP sockets. | 2 | * tcp_diag.c Module for monitoring TCP transport protocol sockets. |
| 3 | * | 3 | * |
| 4 | * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ | 4 | * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ |
| 5 | * | 5 | * |
| @@ -12,779 +12,43 @@ | |||
| 12 | */ | 12 | */ |
| 13 | 13 | ||
| 14 | #include <linux/config.h> | 14 | #include <linux/config.h> |
| 15 | #include <linux/module.h> | ||
| 16 | #include <linux/types.h> | ||
| 17 | #include <linux/fcntl.h> | ||
| 18 | #include <linux/random.h> | ||
| 19 | #include <linux/cache.h> | ||
| 20 | #include <linux/init.h> | ||
| 21 | #include <linux/time.h> | ||
| 22 | |||
| 23 | #include <net/icmp.h> | ||
| 24 | #include <net/tcp.h> | ||
| 25 | #include <net/ipv6.h> | ||
| 26 | #include <net/inet_common.h> | ||
| 27 | |||
| 28 | #include <linux/inet.h> | ||
| 29 | #include <linux/stddef.h> | ||
| 30 | |||
| 31 | #include <linux/tcp_diag.h> | ||
| 32 | 15 | ||
| 33 | struct tcpdiag_entry | 16 | #include <linux/module.h> |
| 34 | { | 17 | #include <linux/inet_diag.h> |
| 35 | u32 *saddr; | ||
| 36 | u32 *daddr; | ||
| 37 | u16 sport; | ||
| 38 | u16 dport; | ||
| 39 | u16 family; | ||
| 40 | u16 userlocks; | ||
| 41 | }; | ||
| 42 | 18 | ||
| 43 | static struct sock *tcpnl; | 19 | #include <linux/tcp.h> |
| 44 | 20 | ||
| 45 | #define TCPDIAG_PUT(skb, attrtype, attrlen) \ | 21 | #include <net/tcp.h> |
| 46 | RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) | ||
| 47 | 22 | ||
| 48 | static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, | 23 | static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, |
| 49 | int ext, u32 pid, u32 seq, u16 nlmsg_flags) | 24 | void *_info) |
| 50 | { | 25 | { |
| 51 | struct inet_sock *inet = inet_sk(sk); | 26 | const struct tcp_sock *tp = tcp_sk(sk); |
| 52 | struct tcp_sock *tp = tcp_sk(sk); | 27 | struct tcp_info *info = _info; |
| 53 | struct tcpdiagmsg *r; | ||
| 54 | struct nlmsghdr *nlh; | ||
| 55 | struct tcp_info *info = NULL; | ||
| 56 | struct tcpdiag_meminfo *minfo = NULL; | ||
| 57 | unsigned char *b = skb->tail; | ||
| 58 | |||
| 59 | nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); | ||
| 60 | nlh->nlmsg_flags = nlmsg_flags; | ||
| 61 | r = NLMSG_DATA(nlh); | ||
| 62 | if (sk->sk_state != TCP_TIME_WAIT) { | ||
| 63 | if (ext & (1<<(TCPDIAG_MEMINFO-1))) | ||
| 64 | minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo)); | ||
| 65 | if (ext & (1<<(TCPDIAG_INFO-1))) | ||
| 66 | info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); | ||
| 67 | |||
| 68 | if (ext & (1<<(TCPDIAG_CONG-1))) { | ||
| 69 | size_t len = strlen(tp->ca_ops->name); | ||
| 70 | strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1), | ||
| 71 | tp->ca_ops->name); | ||
| 72 | } | ||
| 73 | } | ||
| 74 | r->tcpdiag_family = sk->sk_family; | ||
| 75 | r->tcpdiag_state = sk->sk_state; | ||
| 76 | r->tcpdiag_timer = 0; | ||
| 77 | r->tcpdiag_retrans = 0; | ||
| 78 | |||
| 79 | r->id.tcpdiag_if = sk->sk_bound_dev_if; | ||
| 80 | r->id.tcpdiag_cookie[0] = (u32)(unsigned long)sk; | ||
| 81 | r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); | ||
| 82 | |||
| 83 | if (r->tcpdiag_state == TCP_TIME_WAIT) { | ||
| 84 | struct tcp_tw_bucket *tw = (struct tcp_tw_bucket*)sk; | ||
| 85 | long tmo = tw->tw_ttd - jiffies; | ||
| 86 | if (tmo < 0) | ||
| 87 | tmo = 0; | ||
| 88 | |||
| 89 | r->id.tcpdiag_sport = tw->tw_sport; | ||
| 90 | r->id.tcpdiag_dport = tw->tw_dport; | ||
| 91 | r->id.tcpdiag_src[0] = tw->tw_rcv_saddr; | ||
| 92 | r->id.tcpdiag_dst[0] = tw->tw_daddr; | ||
| 93 | r->tcpdiag_state = tw->tw_substate; | ||
| 94 | r->tcpdiag_timer = 3; | ||
| 95 | r->tcpdiag_expires = (tmo*1000+HZ-1)/HZ; | ||
| 96 | r->tcpdiag_rqueue = 0; | ||
| 97 | r->tcpdiag_wqueue = 0; | ||
| 98 | r->tcpdiag_uid = 0; | ||
| 99 | r->tcpdiag_inode = 0; | ||
| 100 | #ifdef CONFIG_IP_TCPDIAG_IPV6 | ||
| 101 | if (r->tcpdiag_family == AF_INET6) { | ||
| 102 | ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, | ||
| 103 | &tw->tw_v6_rcv_saddr); | ||
| 104 | ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, | ||
| 105 | &tw->tw_v6_daddr); | ||
| 106 | } | ||
| 107 | #endif | ||
| 108 | nlh->nlmsg_len = skb->tail - b; | ||
| 109 | return skb->len; | ||
| 110 | } | ||
| 111 | |||
| 112 | r->id.tcpdiag_sport = inet->sport; | ||
| 113 | r->id.tcpdiag_dport = inet->dport; | ||
| 114 | r->id.tcpdiag_src[0] = inet->rcv_saddr; | ||
| 115 | r->id.tcpdiag_dst[0] = inet->daddr; | ||
| 116 | |||
| 117 | #ifdef CONFIG_IP_TCPDIAG_IPV6 | ||
| 118 | if (r->tcpdiag_family == AF_INET6) { | ||
| 119 | struct ipv6_pinfo *np = inet6_sk(sk); | ||
| 120 | |||
| 121 | ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, | ||
| 122 | &np->rcv_saddr); | ||
| 123 | ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, | ||
| 124 | &np->daddr); | ||
| 125 | } | ||
| 126 | #endif | ||
| 127 | |||
| 128 | #define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ | ||
| 129 | |||
| 130 | if (tp->pending == TCP_TIME_RETRANS) { | ||
| 131 | r->tcpdiag_timer = 1; | ||
| 132 | r->tcpdiag_retrans = tp->retransmits; | ||
| 133 | r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout); | ||
| 134 | } else if (tp->pending == TCP_TIME_PROBE0) { | ||
| 135 | r->tcpdiag_timer = 4; | ||
| 136 | r->tcpdiag_retrans = tp->probes_out; | ||
| 137 | r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout); | ||
| 138 | } else if (timer_pending(&sk->sk_timer)) { | ||
| 139 | r->tcpdiag_timer = 2; | ||
| 140 | r->tcpdiag_retrans = tp->probes_out; | ||
| 141 | r->tcpdiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires); | ||
| 142 | } else { | ||
| 143 | r->tcpdiag_timer = 0; | ||
| 144 | r->tcpdiag_expires = 0; | ||
| 145 | } | ||
| 146 | #undef EXPIRES_IN_MS | ||
| 147 | 28 | ||
| 148 | r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq; | 29 | r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq; |
| 149 | r->tcpdiag_wqueue = tp->write_seq - tp->snd_una; | 30 | r->idiag_wqueue = tp->write_seq - tp->snd_una; |
| 150 | r->tcpdiag_uid = sock_i_uid(sk); | 31 | if (info != NULL) |
| 151 | r->tcpdiag_inode = sock_i_ino(sk); | ||
| 152 | |||
| 153 | if (minfo) { | ||
| 154 | minfo->tcpdiag_rmem = atomic_read(&sk->sk_rmem_alloc); | ||
| 155 | minfo->tcpdiag_wmem = sk->sk_wmem_queued; | ||
| 156 | minfo->tcpdiag_fmem = sk->sk_forward_alloc; | ||
| 157 | minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc); | ||
| 158 | } | ||
| 159 | |||
| 160 | if (info) | ||
| 161 | tcp_get_info(sk, info); | 32 | tcp_get_info(sk, info); |
| 162 | |||
| 163 | if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info) | ||
| 164 | tp->ca_ops->get_info(tp, ext, skb); | ||
| 165 | |||
| 166 | nlh->nlmsg_len = skb->tail - b; | ||
| 167 | return skb->len; | ||
| 168 | |||
| 169 | rtattr_failure: | ||
| 170 | nlmsg_failure: | ||
| 171 | skb_trim(skb, b - skb->data); | ||
| 172 | return -1; | ||
| 173 | } | ||
| 174 | |||
| 175 | extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, | ||
| 176 | int dif); | ||
| 177 | #ifdef CONFIG_IP_TCPDIAG_IPV6 | ||
| 178 | extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, | ||
| 179 | struct in6_addr *daddr, u16 dport, | ||
| 180 | int dif); | ||
| 181 | #else | ||
| 182 | static inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, | ||
| 183 | struct in6_addr *daddr, u16 dport, | ||
| 184 | int dif) | ||
| 185 | { | ||
| 186 | return NULL; | ||
| 187 | } | ||
| 188 | #endif | ||
| 189 | |||
| 190 | static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) | ||
| 191 | { | ||
| 192 | int err; | ||
| 193 | struct sock *sk; | ||
| 194 | struct tcpdiagreq *req = NLMSG_DATA(nlh); | ||
| 195 | struct sk_buff *rep; | ||
| 196 | |||
| 197 | if (req->tcpdiag_family == AF_INET) { | ||
| 198 | sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport, | ||
| 199 | req->id.tcpdiag_src[0], req->id.tcpdiag_sport, | ||
| 200 | req->id.tcpdiag_if); | ||
| 201 | } | ||
| 202 | #ifdef CONFIG_IP_TCPDIAG_IPV6 | ||
| 203 | else if (req->tcpdiag_family == AF_INET6) { | ||
| 204 | sk = tcp_v6_lookup((struct in6_addr*)req->id.tcpdiag_dst, req->id.tcpdiag_dport, | ||
| 205 | (struct in6_addr*)req->id.tcpdiag_src, req->id.tcpdiag_sport, | ||
| 206 | req->id.tcpdiag_if); | ||
| 207 | } | ||
| 208 | #endif | ||
| 209 | else { | ||
| 210 | return -EINVAL; | ||
| 211 | } | ||
| 212 | |||
| 213 | if (sk == NULL) | ||
| 214 | return -ENOENT; | ||
| 215 | |||
| 216 | err = -ESTALE; | ||
| 217 | if ((req->id.tcpdiag_cookie[0] != TCPDIAG_NOCOOKIE || | ||
| 218 | req->id.tcpdiag_cookie[1] != TCPDIAG_NOCOOKIE) && | ||
| 219 | ((u32)(unsigned long)sk != req->id.tcpdiag_cookie[0] || | ||
| 220 | (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.tcpdiag_cookie[1])) | ||
| 221 | goto out; | ||
| 222 | |||
| 223 | err = -ENOMEM; | ||
| 224 | rep = alloc_skb(NLMSG_SPACE(sizeof(struct tcpdiagmsg)+ | ||
| 225 | sizeof(struct tcpdiag_meminfo)+ | ||
| 226 | sizeof(struct tcp_info)+64), GFP_KERNEL); | ||
| 227 | if (!rep) | ||
| 228 | goto out; | ||
| 229 | |||
| 230 | if (tcpdiag_fill(rep, sk, req->tcpdiag_ext, | ||
| 231 | NETLINK_CB(in_skb).pid, | ||
| 232 | nlh->nlmsg_seq, 0) <= 0) | ||
| 233 | BUG(); | ||
| 234 | |||
| 235 | err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); | ||
| 236 | if (err > 0) | ||
| 237 | err = 0; | ||
| 238 | |||
| 239 | out: | ||
| 240 | if (sk) { | ||
| 241 | if (sk->sk_state == TCP_TIME_WAIT) | ||
| 242 | tcp_tw_put((struct tcp_tw_bucket*)sk); | ||
| 243 | else | ||
| 244 | sock_put(sk); | ||
| 245 | } | ||
| 246 | return err; | ||
| 247 | } | ||
| 248 | |||
| 249 | static int bitstring_match(const u32 *a1, const u32 *a2, int bits) | ||
| 250 | { | ||
| 251 | int words = bits >> 5; | ||
| 252 | |||
| 253 | bits &= 0x1f; | ||
| 254 | |||
| 255 | if (words) { | ||
| 256 | if (memcmp(a1, a2, words << 2)) | ||
| 257 | return 0; | ||
| 258 | } | ||
| 259 | if (bits) { | ||
| 260 | __u32 w1, w2; | ||
| 261 | __u32 mask; | ||
| 262 | |||
| 263 | w1 = a1[words]; | ||
| 264 | w2 = a2[words]; | ||
| 265 | |||
| 266 | mask = htonl((0xffffffff) << (32 - bits)); | ||
| 267 | |||
| 268 | if ((w1 ^ w2) & mask) | ||
| 269 | return 0; | ||
| 270 | } | ||
| 271 | |||
| 272 | return 1; | ||
| 273 | } | ||
| 274 | |||
| 275 | |||
| 276 | static int tcpdiag_bc_run(const void *bc, int len, | ||
| 277 | const struct tcpdiag_entry *entry) | ||
| 278 | { | ||
| 279 | while (len > 0) { | ||
| 280 | int yes = 1; | ||
| 281 | const struct tcpdiag_bc_op *op = bc; | ||
| 282 | |||
| 283 | switch (op->code) { | ||
| 284 | case TCPDIAG_BC_NOP: | ||
| 285 | break; | ||
| 286 | case TCPDIAG_BC_JMP: | ||
| 287 | yes = 0; | ||
| 288 | break; | ||
| 289 | case TCPDIAG_BC_S_GE: | ||
| 290 | yes = entry->sport >= op[1].no; | ||
| 291 | break; | ||
| 292 | case TCPDIAG_BC_S_LE: | ||
| 293 | yes = entry->dport <= op[1].no; | ||
| 294 | break; | ||
| 295 | case TCPDIAG_BC_D_GE: | ||
| 296 | yes = entry->dport >= op[1].no; | ||
| 297 | break; | ||
| 298 | case TCPDIAG_BC_D_LE: | ||
| 299 | yes = entry->dport <= op[1].no; | ||
| 300 | break; | ||
| 301 | case TCPDIAG_BC_AUTO: | ||
| 302 | yes = !(entry->userlocks & SOCK_BINDPORT_LOCK); | ||
| 303 | break; | ||
| 304 | case TCPDIAG_BC_S_COND: | ||
| 305 | case TCPDIAG_BC_D_COND: | ||
| 306 | { | ||
| 307 | struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(op+1); | ||
| 308 | u32 *addr; | ||
| 309 | |||
| 310 | if (cond->port != -1 && | ||
| 311 | cond->port != (op->code == TCPDIAG_BC_S_COND ? | ||
| 312 | entry->sport : entry->dport)) { | ||
| 313 | yes = 0; | ||
| 314 | break; | ||
| 315 | } | ||
| 316 | |||
| 317 | if (cond->prefix_len == 0) | ||
| 318 | break; | ||
| 319 | |||
| 320 | if (op->code == TCPDIAG_BC_S_COND) | ||
| 321 | addr = entry->saddr; | ||
| 322 | else | ||
| 323 | addr = entry->daddr; | ||
| 324 | |||
| 325 | if (bitstring_match(addr, cond->addr, cond->prefix_len)) | ||
| 326 | break; | ||
| 327 | if (entry->family == AF_INET6 && | ||
| 328 | cond->family == AF_INET) { | ||
| 329 | if (addr[0] == 0 && addr[1] == 0 && | ||
| 330 | addr[2] == htonl(0xffff) && | ||
| 331 | bitstring_match(addr+3, cond->addr, cond->prefix_len)) | ||
| 332 | break; | ||
| 333 | } | ||
| 334 | yes = 0; | ||
| 335 | break; | ||
| 336 | } | ||
| 337 | } | ||
| 338 | |||
| 339 | if (yes) { | ||
| 340 | len -= op->yes; | ||
| 341 | bc += op->yes; | ||
| 342 | } else { | ||
| 343 | len -= op->no; | ||
| 344 | bc += op->no; | ||
| 345 | } | ||
| 346 | } | ||
| 347 | return (len == 0); | ||
| 348 | } | ||
| 349 | |||
| 350 | static int valid_cc(const void *bc, int len, int cc) | ||
| 351 | { | ||
| 352 | while (len >= 0) { | ||
| 353 | const struct tcpdiag_bc_op *op = bc; | ||
| 354 | |||
| 355 | if (cc > len) | ||
| 356 | return 0; | ||
| 357 | if (cc == len) | ||
| 358 | return 1; | ||
| 359 | if (op->yes < 4) | ||
| 360 | return 0; | ||
| 361 | len -= op->yes; | ||
| 362 | bc += op->yes; | ||
| 363 | } | ||
| 364 | return 0; | ||
| 365 | } | ||
| 366 | |||
| 367 | static int tcpdiag_bc_audit(const void *bytecode, int bytecode_len) | ||
| 368 | { | ||
| 369 | const unsigned char *bc = bytecode; | ||
| 370 | int len = bytecode_len; | ||
| 371 | |||
| 372 | while (len > 0) { | ||
| 373 | struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc; | ||
| 374 | |||
| 375 | //printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); | ||
| 376 | switch (op->code) { | ||
| 377 | case TCPDIAG_BC_AUTO: | ||
| 378 | case TCPDIAG_BC_S_COND: | ||
| 379 | case TCPDIAG_BC_D_COND: | ||
| 380 | case TCPDIAG_BC_S_GE: | ||
| 381 | case TCPDIAG_BC_S_LE: | ||
| 382 | case TCPDIAG_BC_D_GE: | ||
| 383 | case TCPDIAG_BC_D_LE: | ||
| 384 | if (op->yes < 4 || op->yes > len+4) | ||
| 385 | return -EINVAL; | ||
| 386 | case TCPDIAG_BC_JMP: | ||
| 387 | if (op->no < 4 || op->no > len+4) | ||
| 388 | return -EINVAL; | ||
| 389 | if (op->no < len && | ||
| 390 | !valid_cc(bytecode, bytecode_len, len-op->no)) | ||
| 391 | return -EINVAL; | ||
| 392 | break; | ||
| 393 | case TCPDIAG_BC_NOP: | ||
| 394 | if (op->yes < 4 || op->yes > len+4) | ||
| 395 | return -EINVAL; | ||
| 396 | break; | ||
| 397 | default: | ||
| 398 | return -EINVAL; | ||
| 399 | } | ||
| 400 | bc += op->yes; | ||
| 401 | len -= op->yes; | ||
| 402 | } | ||
| 403 | return len == 0 ? 0 : -EINVAL; | ||
| 404 | } | ||
| 405 | |||
| 406 | static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk, | ||
| 407 | struct netlink_callback *cb) | ||
| 408 | { | ||
| 409 | struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); | ||
| 410 | |||
| 411 | if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { | ||
| 412 | struct tcpdiag_entry entry; | ||
| 413 | struct rtattr *bc = (struct rtattr *)(r + 1); | ||
| 414 | struct inet_sock *inet = inet_sk(sk); | ||
| 415 | |||
| 416 | entry.family = sk->sk_family; | ||
| 417 | #ifdef CONFIG_IP_TCPDIAG_IPV6 | ||
| 418 | if (entry.family == AF_INET6) { | ||
| 419 | struct ipv6_pinfo *np = inet6_sk(sk); | ||
| 420 | |||
| 421 | entry.saddr = np->rcv_saddr.s6_addr32; | ||
| 422 | entry.daddr = np->daddr.s6_addr32; | ||
| 423 | } else | ||
| 424 | #endif | ||
| 425 | { | ||
| 426 | entry.saddr = &inet->rcv_saddr; | ||
| 427 | entry.daddr = &inet->daddr; | ||
| 428 | } | ||
| 429 | entry.sport = inet->num; | ||
| 430 | entry.dport = ntohs(inet->dport); | ||
| 431 | entry.userlocks = sk->sk_userlocks; | ||
| 432 | |||
| 433 | if (!tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) | ||
| 434 | return 0; | ||
| 435 | } | ||
| 436 | |||
| 437 | return tcpdiag_fill(skb, sk, r->tcpdiag_ext, NETLINK_CB(cb->skb).pid, | ||
| 438 | cb->nlh->nlmsg_seq, NLM_F_MULTI); | ||
| 439 | } | 33 | } |
| 440 | 34 | ||
| 441 | static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk, | 35 | static struct inet_diag_handler tcp_diag_handler = { |
| 442 | struct request_sock *req, | 36 | .idiag_hashinfo = &tcp_hashinfo, |
| 443 | u32 pid, u32 seq) | 37 | .idiag_get_info = tcp_diag_get_info, |
| 444 | { | 38 | .idiag_type = TCPDIAG_GETSOCK, |
| 445 | const struct inet_request_sock *ireq = inet_rsk(req); | 39 | .idiag_info_size = sizeof(struct tcp_info), |
| 446 | struct inet_sock *inet = inet_sk(sk); | 40 | }; |
| 447 | unsigned char *b = skb->tail; | ||
| 448 | struct tcpdiagmsg *r; | ||
| 449 | struct nlmsghdr *nlh; | ||
| 450 | long tmo; | ||
| 451 | |||
| 452 | nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); | ||
| 453 | nlh->nlmsg_flags = NLM_F_MULTI; | ||
| 454 | r = NLMSG_DATA(nlh); | ||
| 455 | |||
| 456 | r->tcpdiag_family = sk->sk_family; | ||
| 457 | r->tcpdiag_state = TCP_SYN_RECV; | ||
| 458 | r->tcpdiag_timer = 1; | ||
| 459 | r->tcpdiag_retrans = req->retrans; | ||
| 460 | |||
| 461 | r->id.tcpdiag_if = sk->sk_bound_dev_if; | ||
| 462 | r->id.tcpdiag_cookie[0] = (u32)(unsigned long)req; | ||
| 463 | r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1); | ||
| 464 | |||
| 465 | tmo = req->expires - jiffies; | ||
| 466 | if (tmo < 0) | ||
| 467 | tmo = 0; | ||
| 468 | |||
| 469 | r->id.tcpdiag_sport = inet->sport; | ||
| 470 | r->id.tcpdiag_dport = ireq->rmt_port; | ||
| 471 | r->id.tcpdiag_src[0] = ireq->loc_addr; | ||
| 472 | r->id.tcpdiag_dst[0] = ireq->rmt_addr; | ||
| 473 | r->tcpdiag_expires = jiffies_to_msecs(tmo), | ||
| 474 | r->tcpdiag_rqueue = 0; | ||
| 475 | r->tcpdiag_wqueue = 0; | ||
| 476 | r->tcpdiag_uid = sock_i_uid(sk); | ||
| 477 | r->tcpdiag_inode = 0; | ||
| 478 | #ifdef CONFIG_IP_TCPDIAG_IPV6 | ||
| 479 | if (r->tcpdiag_family == AF_INET6) { | ||
| 480 | ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, | ||
| 481 | &tcp6_rsk(req)->loc_addr); | ||
| 482 | ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, | ||
| 483 | &tcp6_rsk(req)->rmt_addr); | ||
| 484 | } | ||
| 485 | #endif | ||
| 486 | nlh->nlmsg_len = skb->tail - b; | ||
| 487 | |||
| 488 | return skb->len; | ||
| 489 | |||
| 490 | nlmsg_failure: | ||
| 491 | skb_trim(skb, b - skb->data); | ||
| 492 | return -1; | ||
| 493 | } | ||
| 494 | |||
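With the generic machinery moving into inet_diag, tcp_diag.c shrinks to the handler defined earlier in this file plus its registration. The expected module boilerplate under the new inet_diag_register()/inet_diag_unregister() interface this series introduces:

```c
static int __init tcp_diag_init(void)
{
	return inet_diag_register(&tcp_diag_handler);
}

static void __exit tcp_diag_exit(void)
{
	inet_diag_unregister(&tcp_diag_handler);
}

module_init(tcp_diag_init);
module_exit(tcp_diag_exit);
MODULE_LICENSE("GPL");
```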
| 495 | static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, | ||
| 496 | struct netlink_callback *cb) | ||
| 497 | { | ||
| 498 | struct tcpdiag_entry entry; | ||
| 499 | struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); | ||
| 500 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 501 | struct listen_sock *lopt; | ||
| 502 | struct rtattr *bc = NULL; | ||
| 503 | struct inet_sock *inet = inet_sk(sk); | ||
| 504 | int j, s_j; | ||
| 505 | int reqnum, s_reqnum; | ||
| 506 | int err = 0; | ||
| 507 | |||
| 508 | s_j = cb->args[3]; | ||
| 509 | s_reqnum = cb->args[4]; | ||
| 510 | |||
| 511 | if (s_j > 0) | ||
| 512 | s_j--; | ||
| 513 | |||
| 514 | entry.family = sk->sk_family; | ||
| 515 | |||
| 516 | read_lock_bh(&tp->accept_queue.syn_wait_lock); | ||
| 517 | |||
| 518 | lopt = tp->accept_queue.listen_opt; | ||
| 519 | if (!lopt || !lopt->qlen) | ||
| 520 | goto out; | ||
| 521 | |||
| 522 | if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { | ||
| 523 | bc = (struct rtattr *)(r + 1); | ||
| 524 | entry.sport = inet->num; | ||
| 525 | entry.userlocks = sk->sk_userlocks; | ||
| 526 | } | ||
| 527 | |||
| 528 | for (j = s_j; j < TCP_SYNQ_HSIZE; j++) { | ||
| 529 | struct request_sock *req, *head = lopt->syn_table[j]; | ||
| 530 | |||
| 531 | reqnum = 0; | ||
| 532 | for (req = head; req; reqnum++, req = req->dl_next) { | ||
| 533 | struct inet_request_sock *ireq = inet_rsk(req); | ||
| 534 | |||
| 535 | if (reqnum < s_reqnum) | ||
| 536 | continue; | ||
| 537 | if (r->id.tcpdiag_dport != ireq->rmt_port && | ||
| 538 | r->id.tcpdiag_dport) | ||
| 539 | continue; | ||
| 540 | |||
| 541 | if (bc) { | ||
| 542 | entry.saddr = | ||
| 543 | #ifdef CONFIG_IP_TCPDIAG_IPV6 | ||
| 544 | (entry.family == AF_INET6) ? | ||
| 545 | tcp6_rsk(req)->loc_addr.s6_addr32 : | ||
| 546 | #endif | ||
| 547 | &ireq->loc_addr; | ||
| 548 | entry.daddr = | ||
| 549 | #ifdef CONFIG_IP_TCPDIAG_IPV6 | ||
| 550 | (entry.family == AF_INET6) ? | ||
| 551 | tcp6_rsk(req)->rmt_addr.s6_addr32 : | ||
| 552 | #endif | ||
| 553 | &ireq->rmt_addr; | ||
| 554 | entry.dport = ntohs(ireq->rmt_port); | ||
| 555 | |||
| 556 | if (!tcpdiag_bc_run(RTA_DATA(bc), | ||
| 557 | RTA_PAYLOAD(bc), &entry)) | ||
| 558 | continue; | ||
| 559 | } | ||
| 560 | |||
| 561 | err = tcpdiag_fill_req(skb, sk, req, | ||
| 562 | NETLINK_CB(cb->skb).pid, | ||
| 563 | cb->nlh->nlmsg_seq); | ||
| 564 | if (err < 0) { | ||
| 565 | cb->args[3] = j + 1; | ||
| 566 | cb->args[4] = reqnum; | ||
| 567 | goto out; | ||
| 568 | } | ||
| 569 | } | ||
| 570 | |||
| 571 | s_reqnum = 0; | ||
| 572 | } | ||
| 573 | |||
| 574 | out: | ||
| 575 | read_unlock_bh(&tp->accept_queue.syn_wait_lock); | ||
| 576 | |||
| 577 | return err; | ||
| 578 | } | ||
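
tcpdiag_dump_reqs() may run out of skb space mid-walk, so it parks its position in cb->args[3] (hash slot, stored plus one) and cb->args[4] (entry index) and returns; the next netlink dump call resumes from there. A rough user-space analogue of that cursor pattern (all names invented):

    #include <stddef.h>

    struct cursor { size_t slot, idx; };

    /* emit() returns 0 when the output buffer is full. We then return 1
     * with the cursor parked on the unemitted entry, so a later call
     * resumes exactly where this one stopped. */
    static int walk(struct cursor *c, size_t nslots, size_t nentries,
                    int (*emit)(size_t slot, size_t idx))
    {
        for (; c->slot < nslots; c->slot++, c->idx = 0)
            for (; c->idx < nentries; c->idx++)
                if (!emit(c->slot, c->idx))
                    return 1;   /* buffer full, resume later */
        return 0;               /* walk complete */
    }
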
| 579 | |||
| 580 | static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) | ||
| 581 | { | ||
| 582 | int i, num; | ||
| 583 | int s_i, s_num; | ||
| 584 | struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); | ||
| 585 | |||
| 586 | s_i = cb->args[1]; | ||
| 587 | s_num = num = cb->args[2]; | ||
| 588 | |||
| 589 | if (cb->args[0] == 0) { | ||
| 590 | if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV))) | ||
| 591 | goto skip_listen_ht; | ||
| 592 | tcp_listen_lock(); | ||
| 593 | for (i = s_i; i < TCP_LHTABLE_SIZE; i++) { | ||
| 594 | struct sock *sk; | ||
| 595 | struct hlist_node *node; | ||
| 596 | |||
| 597 | num = 0; | ||
| 598 | sk_for_each(sk, node, &tcp_listening_hash[i]) { | ||
| 599 | struct inet_sock *inet = inet_sk(sk); | ||
| 600 | |||
| 601 | if (num < s_num) { | ||
| 602 | num++; | ||
| 603 | continue; | ||
| 604 | } | ||
| 605 | |||
| 606 | if (r->id.tcpdiag_sport != inet->sport && | ||
| 607 | r->id.tcpdiag_sport) | ||
| 608 | goto next_listen; | ||
| 609 | |||
| 610 | if (!(r->tcpdiag_states&TCPF_LISTEN) || | ||
| 611 | r->id.tcpdiag_dport || | ||
| 612 | cb->args[3] > 0) | ||
| 613 | goto syn_recv; | ||
| 614 | |||
| 615 | if (tcpdiag_dump_sock(skb, sk, cb) < 0) { | ||
| 616 | tcp_listen_unlock(); | ||
| 617 | goto done; | ||
| 618 | } | ||
| 619 | |||
| 620 | syn_recv: | ||
| 621 | if (!(r->tcpdiag_states&TCPF_SYN_RECV)) | ||
| 622 | goto next_listen; | ||
| 623 | |||
| 624 | if (tcpdiag_dump_reqs(skb, sk, cb) < 0) { | ||
| 625 | tcp_listen_unlock(); | ||
| 626 | goto done; | ||
| 627 | } | ||
| 628 | |||
| 629 | next_listen: | ||
| 630 | cb->args[3] = 0; | ||
| 631 | cb->args[4] = 0; | ||
| 632 | ++num; | ||
| 633 | } | ||
| 634 | |||
| 635 | s_num = 0; | ||
| 636 | cb->args[3] = 0; | ||
| 637 | cb->args[4] = 0; | ||
| 638 | } | ||
| 639 | tcp_listen_unlock(); | ||
| 640 | skip_listen_ht: | ||
| 641 | cb->args[0] = 1; | ||
| 642 | s_i = num = s_num = 0; | ||
| 643 | } | ||
| 644 | |||
| 645 | if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV))) | ||
| 646 | return skb->len; | ||
| 647 | |||
| 648 | for (i = s_i; i < tcp_ehash_size; i++) { | ||
| 649 | struct tcp_ehash_bucket *head = &tcp_ehash[i]; | ||
| 650 | struct sock *sk; | ||
| 651 | struct hlist_node *node; | ||
| 652 | |||
| 653 | if (i > s_i) | ||
| 654 | s_num = 0; | ||
| 655 | |||
| 656 | read_lock_bh(&head->lock); | ||
| 657 | |||
| 658 | num = 0; | ||
| 659 | sk_for_each(sk, node, &head->chain) { | ||
| 660 | struct inet_sock *inet = inet_sk(sk); | ||
| 661 | |||
| 662 | if (num < s_num) | ||
| 663 | goto next_normal; | ||
| 664 | if (!(r->tcpdiag_states & (1 << sk->sk_state))) | ||
| 665 | goto next_normal; | ||
| 666 | if (r->id.tcpdiag_sport != inet->sport && | ||
| 667 | r->id.tcpdiag_sport) | ||
| 668 | goto next_normal; | ||
| 669 | if (r->id.tcpdiag_dport != inet->dport && r->id.tcpdiag_dport) | ||
| 670 | goto next_normal; | ||
| 671 | if (tcpdiag_dump_sock(skb, sk, cb) < 0) { | ||
| 672 | read_unlock_bh(&head->lock); | ||
| 673 | goto done; | ||
| 674 | } | ||
| 675 | next_normal: | ||
| 676 | ++num; | ||
| 677 | } | ||
| 678 | |||
| 679 | if (r->tcpdiag_states&TCPF_TIME_WAIT) { | ||
| 680 | sk_for_each(sk, node, | ||
| 681 | &tcp_ehash[i + tcp_ehash_size].chain) { | ||
| 682 | struct inet_sock *inet = inet_sk(sk); | ||
| 683 | |||
| 684 | if (num < s_num) | ||
| 685 | goto next_dying; | ||
| 686 | if (r->id.tcpdiag_sport != inet->sport && | ||
| 687 | r->id.tcpdiag_sport) | ||
| 688 | goto next_dying; | ||
| 689 | if (r->id.tcpdiag_dport != inet->dport && | ||
| 690 | r->id.tcpdiag_dport) | ||
| 691 | goto next_dying; | ||
| 692 | if (tcpdiag_dump_sock(skb, sk, cb) < 0) { | ||
| 693 | read_unlock_bh(&head->lock); | ||
| 694 | goto done; | ||
| 695 | } | ||
| 696 | next_dying: | ||
| 697 | ++num; | ||
| 698 | } | ||
| 699 | } | ||
| 700 | read_unlock_bh(&head->lock); | ||
| 701 | } | ||
| 702 | |||
| 703 | done: | ||
| 704 | cb->args[1] = i; | ||
| 705 | cb->args[2] = num; | ||
| 706 | return skb->len; | ||
| 707 | } | ||
| 708 | |||
| 709 | static int tcpdiag_dump_done(struct netlink_callback *cb) | ||
| 710 | { | ||
| 711 | return 0; | ||
| 712 | } | ||
| 713 | |||
| 714 | |||
| 715 | static __inline__ int | ||
| 716 | tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | ||
| 717 | { | ||
| 718 | if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) | ||
| 719 | return 0; | ||
| 720 | |||
| 721 | if (nlh->nlmsg_type != TCPDIAG_GETSOCK) | ||
| 722 | goto err_inval; | ||
| 723 | |||
| 724 | if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len) | ||
| 725 | goto err_inval; | ||
| 726 | |||
| 727 | if (nlh->nlmsg_flags&NLM_F_DUMP) { | ||
| 728 | if (nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(struct tcpdiagreq))) { | ||
| 729 | struct rtattr *rta = (struct rtattr*)(NLMSG_DATA(nlh) + sizeof(struct tcpdiagreq)); | ||
| 730 | if (rta->rta_type != TCPDIAG_REQ_BYTECODE || | ||
| 731 | rta->rta_len < 8 || | ||
| 732 | rta->rta_len > nlh->nlmsg_len - NLMSG_SPACE(sizeof(struct tcpdiagreq))) | ||
| 733 | goto err_inval; | ||
| 734 | if (tcpdiag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta))) | ||
| 735 | goto err_inval; | ||
| 736 | } | ||
| 737 | return netlink_dump_start(tcpnl, skb, nlh, | ||
| 738 | tcpdiag_dump, | ||
| 739 | tcpdiag_dump_done); | ||
| 740 | } else { | ||
| 741 | return tcpdiag_get_exact(skb, nlh); | ||
| 742 | } | ||
| 743 | |||
| 744 | err_inval: | ||
| 745 | return -EINVAL; | ||
| 746 | } | ||
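
Note the defensive ordering in tcpdiag_rcv_msg(): before netlink_dump_start() ever runs, the optional filter attribute is bounds-checked (right type, at least 8 bytes so the header plus one 4-byte bytecode op fit, and no longer than what remains of the message), and only then is the bytecode semantically audited. The shape of that length check as a standalone sketch (struct and names invented; the real ones are struct rtattr and the NLMSG_*/RTA_* macros):

    #include <stdint.h>
    #include <stddef.h>

    struct attr_hdr { uint16_t len, type; };  /* payload follows the header */

    static int filter_attr_ok(const struct attr_hdr *a, size_t room,
                              uint16_t want_type)
    {
        return a->type == want_type &&
               a->len >= 8 &&     /* 4-byte header + at least one opcode */
               a->len <= room;    /* must fit in the rest of the message */
    }
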
| 747 | |||
| 748 | |||
| 749 | static inline void tcpdiag_rcv_skb(struct sk_buff *skb) | ||
| 750 | { | ||
| 751 | int err; | ||
| 752 | struct nlmsghdr * nlh; | ||
| 753 | |||
| 754 | if (skb->len >= NLMSG_SPACE(0)) { | ||
| 755 | nlh = (struct nlmsghdr *)skb->data; | ||
| 756 | if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) | ||
| 757 | return; | ||
| 758 | err = tcpdiag_rcv_msg(skb, nlh); | ||
| 759 | if (err || nlh->nlmsg_flags & NLM_F_ACK) | ||
| 760 | netlink_ack(skb, nlh, err); | ||
| 761 | } | ||
| 762 | } | ||
| 763 | |||
| 764 | static void tcpdiag_rcv(struct sock *sk, int len) | ||
| 765 | { | ||
| 766 | struct sk_buff *skb; | ||
| 767 | unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); | ||
| 768 | |||
| 769 | while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) { | ||
| 770 | tcpdiag_rcv_skb(skb); | ||
| 771 | kfree_skb(skb); | ||
| 772 | } | ||
| 773 | } | ||
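
One detail worth keeping when reading tcpdiag_rcv(): the queue length is snapshotted before the drain loop, so messages queued while we are processing are left for the next wakeup instead of letting a fast sender pin the receive path in the loop. The pattern in isolation (all types and helpers invented):

    /* Sketch only: drain at most the messages that were already queued
     * when we were woken, mirroring the qlen-- loop above. */
    static void drain_bounded(struct queue *q)
    {
        unsigned int qlen = queue_len(q);   /* snapshot, not re-read */
        struct item *it;

        while (qlen-- && (it = dequeue(q))) {
            handle(it);
            free_item(it);
        }
    }
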
| 774 | 41 | ||
| 775 | static int __init tcpdiag_init(void) | 42 | static int __init tcp_diag_init(void) |
| 776 | { | 43 | { |
| 777 | tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv); | 44 | return inet_diag_register(&tcp_diag_handler); |
| 778 | if (tcpnl == NULL) | ||
| 779 | return -ENOMEM; | ||
| 780 | return 0; | ||
| 781 | } | 45 | } |
| 782 | 46 | ||
| 783 | static void __exit tcpdiag_exit(void) | 47 | static void __exit tcp_diag_exit(void) |
| 784 | { | 48 | { |
| 785 | sock_release(tcpnl->sk_socket); | 49 | inet_diag_unregister(&tcp_diag_handler); |
| 786 | } | 50 | } |
| 787 | 51 | ||
| 788 | module_init(tcpdiag_init); | 52 | module_init(tcp_diag_init); |
| 789 | module_exit(tcpdiag_exit); | 53 | module_exit(tcp_diag_exit); |
| 790 | MODULE_LICENSE("GPL"); | 54 | MODULE_LICENSE("GPL"); |
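
The right-hand column is the payoff of this file's diff: the self-contained ~790-line tcpdiag netlink module collapses into a thin tcp_diag that just registers a handler, with the generic dump and query machinery moving into the new inet_diag core so that other INET transports can reuse it. As a hedged sketch, a second protocol's diag module under this API would look like the following (the register calls and handler fields match the diff; everything named "foo" is an invented placeholder):

    static struct inet_diag_handler foo_diag_handler = {
        .idiag_hashinfo  = &foo_hashinfo,      /* the protocol's socket tables */
        .idiag_get_info  = foo_diag_get_info,  /* fills the per-socket info attr */
        .idiag_type      = FOODIAG_GETSOCK,    /* netlink request type served */
        .idiag_info_size = sizeof(struct foo_info),
    };

    static int __init foo_diag_init(void)
    {
        return inet_diag_register(&foo_diag_handler);
    }

    static void __exit foo_diag_exit(void)
    {
        inet_diag_unregister(&foo_diag_handler);
    }

    module_init(foo_diag_init);
    module_exit(foo_diag_exit);
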
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 36c51f8136bf..6acc04bde080 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c | |||
| @@ -98,9 +98,10 @@ struct hstcp { | |||
| 98 | u32 ai; | 98 | u32 ai; |
| 99 | }; | 99 | }; |
| 100 | 100 | ||
| 101 | static void hstcp_init(struct tcp_sock *tp) | 101 | static void hstcp_init(struct sock *sk) |
| 102 | { | 102 | { |
| 103 | struct hstcp *ca = tcp_ca(tp); | 103 | struct tcp_sock *tp = tcp_sk(sk); |
| 104 | struct hstcp *ca = inet_csk_ca(sk); | ||
| 104 | 105 | ||
| 105 | ca->ai = 0; | 106 | ca->ai = 0; |
| 106 | 107 | ||
| @@ -109,10 +110,11 @@ static void hstcp_init(struct tcp_sock *tp) | |||
| 109 | tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); | 110 | tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); |
| 110 | } | 111 | } |
| 111 | 112 | ||
| 112 | static void hstcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | 113 | static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
| 113 | u32 in_flight, int good) | 114 | u32 in_flight, int good) |
| 114 | { | 115 | { |
| 115 | struct hstcp *ca = tcp_ca(tp); | 116 | struct tcp_sock *tp = tcp_sk(sk); |
| 117 | struct hstcp *ca = inet_csk_ca(sk); | ||
| 116 | 118 | ||
| 117 | if (in_flight < tp->snd_cwnd) | 119 | if (in_flight < tp->snd_cwnd) |
| 118 | return; | 120 | return; |
| @@ -143,9 +145,10 @@ static void hstcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | |||
| 143 | } | 145 | } |
| 144 | } | 146 | } |
| 145 | 147 | ||
| 146 | static u32 hstcp_ssthresh(struct tcp_sock *tp) | 148 | static u32 hstcp_ssthresh(struct sock *sk) |
| 147 | { | 149 | { |
| 148 | struct hstcp *ca = tcp_ca(tp); | 150 | const struct tcp_sock *tp = tcp_sk(sk); |
| 151 | const struct hstcp *ca = inet_csk_ca(sk); | ||
| 149 | 152 | ||
| 150 | /* Do multiplicative decrease */ | 153 | /* Do multiplicative decrease */ |
| 151 | return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); | 154 | return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); |
| @@ -164,7 +167,7 @@ static struct tcp_congestion_ops tcp_highspeed = { | |||
| 164 | 167 | ||
| 165 | static int __init hstcp_register(void) | 168 | static int __init hstcp_register(void) |
| 166 | { | 169 | { |
| 167 | BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE); | 170 | BUG_ON(sizeof(struct hstcp) > ICSK_CA_PRIV_SIZE); |
| 168 | return tcp_register_congestion_control(&tcp_highspeed); | 171 | return tcp_register_congestion_control(&tcp_highspeed); |
| 169 | } | 172 | } |
| 170 | 173 | ||
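
tcp_highspeed.c establishes the conversion pattern that every congestion module below repeats: ops now take a struct sock * instead of struct tcp_sock *, private per-connection state moves from tcp_ca(tp) into the inet_connection_sock scratch area reached via inet_csk_ca(sk), and the register-time size assertion switches from TCP_CA_PRIV_SIZE to ICSK_CA_PRIV_SIZE. A minimal module skeleton under the new convention (struct mycc and the ops table contents are invented for illustration; the accessors are the ones used in the diff):

    struct mycc {
        u32 cnt;                 /* whatever per-connection state is needed */
    };

    static void mycc_init(struct sock *sk)
    {
        struct tcp_sock *tp = tcp_sk(sk);   /* TCP state, derived from sk */
        struct mycc *ca = inet_csk_ca(sk);  /* private area inside the icsk */

        ca->cnt = 0;
        tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 65535);
    }

    static struct tcp_congestion_ops mycc_ops = {
        .init  = mycc_init,
        .owner = THIS_MODULE,
        .name  = "mycc",
        /* .ssthresh, .cong_avoid, .min_cwnd etc. elided in this sketch */
    };

    static int __init mycc_register(void)
    {
        /* private state must fit the fixed per-socket scratch area */
        BUG_ON(sizeof(struct mycc) > ICSK_CA_PRIV_SIZE);
        return tcp_register_congestion_control(&mycc_ops);
    }
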
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 40168275acf9..e47b37984e95 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c | |||
| @@ -55,18 +55,21 @@ static inline void htcp_reset(struct htcp *ca) | |||
| 55 | ca->snd_cwnd_cnt2 = 0; | 55 | ca->snd_cwnd_cnt2 = 0; |
| 56 | } | 56 | } |
| 57 | 57 | ||
| 58 | static u32 htcp_cwnd_undo(struct tcp_sock *tp) | 58 | static u32 htcp_cwnd_undo(struct sock *sk) |
| 59 | { | 59 | { |
| 60 | struct htcp *ca = tcp_ca(tp); | 60 | const struct tcp_sock *tp = tcp_sk(sk); |
| 61 | struct htcp *ca = inet_csk_ca(sk); | ||
| 61 | ca->ccount = ca->undo_ccount; | 62 | ca->ccount = ca->undo_ccount; |
| 62 | ca->maxRTT = ca->undo_maxRTT; | 63 | ca->maxRTT = ca->undo_maxRTT; |
| 63 | ca->old_maxB = ca->undo_old_maxB; | 64 | ca->old_maxB = ca->undo_old_maxB; |
| 64 | return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta); | 65 | return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta); |
| 65 | } | 66 | } |
| 66 | 67 | ||
| 67 | static inline void measure_rtt(struct tcp_sock *tp) | 68 | static inline void measure_rtt(struct sock *sk) |
| 68 | { | 69 | { |
| 69 | struct htcp *ca = tcp_ca(tp); | 70 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 71 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 72 | struct htcp *ca = inet_csk_ca(sk); | ||
| 70 | u32 srtt = tp->srtt>>3; | 73 | u32 srtt = tp->srtt>>3; |
| 71 | 74 | ||
| 72 | /* keep track of minimum RTT seen so far, minRTT is zero at first */ | 75 | /* keep track of minimum RTT seen so far, minRTT is zero at first */ |
| @@ -74,7 +77,7 @@ static inline void measure_rtt(struct tcp_sock *tp) | |||
| 74 | ca->minRTT = srtt; | 77 | ca->minRTT = srtt; |
| 75 | 78 | ||
| 76 | /* max RTT */ | 79 | /* max RTT */ |
| 77 | if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) { | 80 | if (icsk->icsk_ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) { |
| 78 | if (ca->maxRTT < ca->minRTT) | 81 | if (ca->maxRTT < ca->minRTT) |
| 79 | ca->maxRTT = ca->minRTT; | 82 | ca->maxRTT = ca->minRTT; |
| 80 | if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50) | 83 | if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50) |
| @@ -82,13 +85,16 @@ static inline void measure_rtt(struct tcp_sock *tp) | |||
| 82 | } | 85 | } |
| 83 | } | 86 | } |
| 84 | 87 | ||
| 85 | static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked) | 88 | static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked) |
| 86 | { | 89 | { |
| 87 | struct htcp *ca = tcp_ca(tp); | 90 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 91 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 92 | struct htcp *ca = inet_csk_ca(sk); | ||
| 88 | u32 now = tcp_time_stamp; | 93 | u32 now = tcp_time_stamp; |
| 89 | 94 | ||
| 90 | /* achieved throughput calculations */ | 95 | /* achieved throughput calculations */ |
| 91 | if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) { | 96 | if (icsk->icsk_ca_state != TCP_CA_Open && |
| 97 | icsk->icsk_ca_state != TCP_CA_Disorder) { | ||
| 92 | ca->packetcount = 0; | 98 | ca->packetcount = 0; |
| 93 | ca->lasttime = now; | 99 | ca->lasttime = now; |
| 94 | return; | 100 | return; |
| @@ -173,9 +179,9 @@ static inline void htcp_alpha_update(struct htcp *ca) | |||
| 173 | * that point do we really have a real sense of maxRTT (the queues en route | 179 | * that point do we really have a real sense of maxRTT (the queues en route |
| 174 | * were getting just too full now). | 180 | * were getting just too full now). |
| 175 | */ | 181 | */ |
| 176 | static void htcp_param_update(struct tcp_sock *tp) | 182 | static void htcp_param_update(struct sock *sk) |
| 177 | { | 183 | { |
| 178 | struct htcp *ca = tcp_ca(tp); | 184 | struct htcp *ca = inet_csk_ca(sk); |
| 179 | u32 minRTT = ca->minRTT; | 185 | u32 minRTT = ca->minRTT; |
| 180 | u32 maxRTT = ca->maxRTT; | 186 | u32 maxRTT = ca->maxRTT; |
| 181 | 187 | ||
| @@ -187,17 +193,19 @@ static void htcp_param_update(struct tcp_sock *tp) | |||
| 187 | ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100; | 193 | ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100; |
| 188 | } | 194 | } |
| 189 | 195 | ||
| 190 | static u32 htcp_recalc_ssthresh(struct tcp_sock *tp) | 196 | static u32 htcp_recalc_ssthresh(struct sock *sk) |
| 191 | { | 197 | { |
| 192 | struct htcp *ca = tcp_ca(tp); | 198 | const struct tcp_sock *tp = tcp_sk(sk); |
| 193 | htcp_param_update(tp); | 199 | const struct htcp *ca = inet_csk_ca(sk); |
| 200 | htcp_param_update(sk); | ||
| 194 | return max((tp->snd_cwnd * ca->beta) >> 7, 2U); | 201 | return max((tp->snd_cwnd * ca->beta) >> 7, 2U); |
| 195 | } | 202 | } |
| 196 | 203 | ||
| 197 | static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | 204 | static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, |
| 198 | u32 in_flight, int data_acked) | 205 | u32 in_flight, int data_acked) |
| 199 | { | 206 | { |
| 200 | struct htcp *ca = tcp_ca(tp); | 207 | struct tcp_sock *tp = tcp_sk(sk); |
| 208 | struct htcp *ca = inet_csk_ca(sk); | ||
| 201 | 209 | ||
| 202 | if (in_flight < tp->snd_cwnd) | 210 | if (in_flight < tp->snd_cwnd) |
| 203 | return; | 211 | return; |
| @@ -207,7 +215,7 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | |||
| 207 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | 215 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) |
| 208 | tp->snd_cwnd++; | 216 | tp->snd_cwnd++; |
| 209 | } else { | 217 | } else { |
| 210 | measure_rtt(tp); | 218 | measure_rtt(sk); |
| 211 | 219 | ||
| 212 | /* keep track of number of round-trip times since last backoff event */ | 220 | /* keep track of number of round-trip times since last backoff event */ |
| 213 | if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) { | 221 | if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) { |
| @@ -229,28 +237,29 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | |||
| 229 | } | 237 | } |
| 230 | 238 | ||
| 231 | /* Lower bound on congestion window. */ | 239 | /* Lower bound on congestion window. */ |
| 232 | static u32 htcp_min_cwnd(struct tcp_sock *tp) | 240 | static u32 htcp_min_cwnd(struct sock *sk) |
| 233 | { | 241 | { |
| 242 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 234 | return tp->snd_ssthresh; | 243 | return tp->snd_ssthresh; |
| 235 | } | 244 | } |
| 236 | 245 | ||
| 237 | 246 | ||
| 238 | static void htcp_init(struct tcp_sock *tp) | 247 | static void htcp_init(struct sock *sk) |
| 239 | { | 248 | { |
| 240 | struct htcp *ca = tcp_ca(tp); | 249 | struct htcp *ca = inet_csk_ca(sk); |
| 241 | 250 | ||
| 242 | memset(ca, 0, sizeof(struct htcp)); | 251 | memset(ca, 0, sizeof(struct htcp)); |
| 243 | ca->alpha = ALPHA_BASE; | 252 | ca->alpha = ALPHA_BASE; |
| 244 | ca->beta = BETA_MIN; | 253 | ca->beta = BETA_MIN; |
| 245 | } | 254 | } |
| 246 | 255 | ||
| 247 | static void htcp_state(struct tcp_sock *tp, u8 new_state) | 256 | static void htcp_state(struct sock *sk, u8 new_state) |
| 248 | { | 257 | { |
| 249 | switch (new_state) { | 258 | switch (new_state) { |
| 250 | case TCP_CA_CWR: | 259 | case TCP_CA_CWR: |
| 251 | case TCP_CA_Recovery: | 260 | case TCP_CA_Recovery: |
| 252 | case TCP_CA_Loss: | 261 | case TCP_CA_Loss: |
| 253 | htcp_reset(tcp_ca(tp)); | 262 | htcp_reset(inet_csk_ca(sk)); |
| 254 | break; | 263 | break; |
| 255 | } | 264 | } |
| 256 | } | 265 | } |
| @@ -269,7 +278,7 @@ static struct tcp_congestion_ops htcp = { | |||
| 269 | 278 | ||
| 270 | static int __init htcp_register(void) | 279 | static int __init htcp_register(void) |
| 271 | { | 280 | { |
| 272 | BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE); | 281 | BUG_ON(sizeof(struct htcp) > ICSK_CA_PRIV_SIZE); |
| 273 | BUILD_BUG_ON(BETA_MIN >= BETA_MAX); | 282 | BUILD_BUG_ON(BETA_MIN >= BETA_MAX); |
| 274 | if (!use_bandwidth_switch) | 283 | if (!use_bandwidth_switch) |
| 275 | htcp.pkts_acked = NULL; | 284 | htcp.pkts_acked = NULL; |
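
Both htcp_cwnd_undo() and htcp_recalc_ssthresh() above depend on beta being a fixed-point fraction in which 128 represents 1.0: (x * beta) >> 7 scales x down by beta/128 on backoff, and (x << 7) / beta inverts that scaling on undo. The arithmetic isolated in plain C (values are examples only):

    #include <stdint.h>

    /* beta is Q7 fixed point: 128 == 1.0, so e.g. beta = 102 is ~0.8 */
    static uint32_t backoff_cwnd(uint32_t cwnd, uint32_t beta)
    {
        uint32_t v = (cwnd * beta) >> 7;   /* cwnd * beta/128 */
        return v > 2 ? v : 2;              /* htcp_recalc_ssthresh()'s floor */
    }

    static uint32_t undo_cwnd(uint32_t cwnd, uint32_t ssthresh, uint32_t beta)
    {
        uint32_t v = (ssthresh << 7) / beta;  /* invert the Q7 scaling */
        return v > cwnd ? v : cwnd;           /* htcp_cwnd_undo()'s max() */
    }
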
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 13a66342c304..77add63623df 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c | |||
| @@ -33,19 +33,20 @@ MODULE_PARM_DESC(rtt0, "reference round trip time (ms)"); | |||
| 33 | 33 | ||
| 34 | 34 | ||
| 35 | /* This is called to refresh values for hybla parameters */ | 35 | /* This is called to refresh values for hybla parameters */ |
| 36 | static inline void hybla_recalc_param (struct tcp_sock *tp) | 36 | static inline void hybla_recalc_param (struct sock *sk) |
| 37 | { | 37 | { |
| 38 | struct hybla *ca = tcp_ca(tp); | 38 | struct hybla *ca = inet_csk_ca(sk); |
| 39 | 39 | ||
| 40 | ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8); | 40 | ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); |
| 41 | ca->rho = ca->rho_3ls >> 3; | 41 | ca->rho = ca->rho_3ls >> 3; |
| 42 | ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; | 42 | ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; |
| 43 | ca->rho2 = ca->rho2_7ls >>7; | 43 | ca->rho2 = ca->rho2_7ls >>7; |
| 44 | } | 44 | } |
| 45 | 45 | ||
| 46 | static void hybla_init(struct tcp_sock *tp) | 46 | static void hybla_init(struct sock *sk) |
| 47 | { | 47 | { |
| 48 | struct hybla *ca = tcp_ca(tp); | 48 | struct tcp_sock *tp = tcp_sk(sk); |
| 49 | struct hybla *ca = inet_csk_ca(sk); | ||
| 49 | 50 | ||
| 50 | ca->rho = 0; | 51 | ca->rho = 0; |
| 51 | ca->rho2 = 0; | 52 | ca->rho2 = 0; |
| @@ -57,17 +58,16 @@ static void hybla_init(struct tcp_sock *tp) | |||
| 57 | tp->snd_cwnd_clamp = 65535; | 58 | tp->snd_cwnd_clamp = 65535; |
| 58 | 59 | ||
| 59 | /* 1st Rho measurement based on initial srtt */ | 60 | /* 1st Rho measurement based on initial srtt */ |
| 60 | hybla_recalc_param(tp); | 61 | hybla_recalc_param(sk); |
| 61 | 62 | ||
| 62 | /* set minimum rtt as this is the 1st ever seen */ | 63 | /* set minimum rtt as this is the 1st ever seen */ |
| 63 | ca->minrtt = tp->srtt; | 64 | ca->minrtt = tp->srtt; |
| 64 | tp->snd_cwnd = ca->rho; | 65 | tp->snd_cwnd = ca->rho; |
| 65 | } | 66 | } |
| 66 | 67 | ||
| 67 | static void hybla_state(struct tcp_sock *tp, u8 ca_state) | 68 | static void hybla_state(struct sock *sk, u8 ca_state) |
| 68 | { | 69 | { |
| 69 | struct hybla *ca = tcp_ca(tp); | 70 | struct hybla *ca = inet_csk_ca(sk); |
| 70 | |||
| 71 | ca->hybla_en = (ca_state == TCP_CA_Open); | 71 | ca->hybla_en = (ca_state == TCP_CA_Open); |
| 72 | } | 72 | } |
| 73 | 73 | ||
| @@ -86,27 +86,28 @@ static inline u32 hybla_fraction(u32 odds) | |||
| 86 | * o Give cwnd a new value based on the model proposed | 86 | * o Give cwnd a new value based on the model proposed |
| 87 | * o remember increments <1 | 87 | * o remember increments <1 |
| 88 | */ | 88 | */ |
| 89 | static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | 89 | static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt, |
| 90 | u32 in_flight, int flag) | 90 | u32 in_flight, int flag) |
| 91 | { | 91 | { |
| 92 | struct hybla *ca = tcp_ca(tp); | 92 | struct tcp_sock *tp = tcp_sk(sk); |
| 93 | struct hybla *ca = inet_csk_ca(sk); | ||
| 93 | u32 increment, odd, rho_fractions; | 94 | u32 increment, odd, rho_fractions; |
| 94 | int is_slowstart = 0; | 95 | int is_slowstart = 0; |
| 95 | 96 | ||
| 96 | /* Recalculate rho only if this srtt is the lowest */ | 97 | /* Recalculate rho only if this srtt is the lowest */ |
| 97 | if (tp->srtt < ca->minrtt){ | 98 | if (tp->srtt < ca->minrtt){ |
| 98 | hybla_recalc_param(tp); | 99 | hybla_recalc_param(sk); |
| 99 | ca->minrtt = tp->srtt; | 100 | ca->minrtt = tp->srtt; |
| 100 | } | 101 | } |
| 101 | 102 | ||
| 102 | if (!ca->hybla_en) | 103 | if (!ca->hybla_en) |
| 103 | return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag); | 104 | return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); |
| 104 | 105 | ||
| 105 | if (in_flight < tp->snd_cwnd) | 106 | if (in_flight < tp->snd_cwnd) |
| 106 | return; | 107 | return; |
| 107 | 108 | ||
| 108 | if (ca->rho == 0) | 109 | if (ca->rho == 0) |
| 109 | hybla_recalc_param(tp); | 110 | hybla_recalc_param(sk); |
| 110 | 111 | ||
| 111 | rho_fractions = ca->rho_3ls - (ca->rho << 3); | 112 | rho_fractions = ca->rho_3ls - (ca->rho << 3); |
| 112 | 113 | ||
| @@ -170,7 +171,7 @@ static struct tcp_congestion_ops tcp_hybla = { | |||
| 170 | 171 | ||
| 171 | static int __init hybla_register(void) | 172 | static int __init hybla_register(void) |
| 172 | { | 173 | { |
| 173 | BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE); | 174 | BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE); |
| 174 | return tcp_register_congestion_control(&tcp_hybla); | 175 | return tcp_register_congestion_control(&tcp_hybla); |
| 175 | } | 176 | } |
| 176 | 177 | ||
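
The rho bookkeeping in hybla_recalc_param() is all fixed point: tp->srtt is stored left-shifted by 3, so dividing it by rtt0 in jiffies yields rho with 3 fractional bits directly; the clamp at 8 keeps rho >= 1; and since squaring a Q3 value gives Q6, the extra << 1 lands rho^2 in Q7. As a standalone sketch:

    #include <stdint.h>

    static void hybla_rho(uint32_t srtt_3ls,  /* tp->srtt, i.e. srtt << 3 */
                          uint32_t rtt0_jiffies,
                          uint32_t *rho_3ls, uint32_t *rho2_7ls)
    {
        uint32_t r = srtt_3ls / rtt0_jiffies;

        if (r < 8)                  /* clamp: rho >= 1.0 (8 == 1.0 in Q3) */
            r = 8;
        *rho_3ls  = r;
        *rho2_7ls = (r * r) << 1;   /* Q3 * Q3 = Q6; << 1 makes it Q7 */
    }
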
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 53a8a5399f1e..1afb080bdf0c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
| @@ -114,20 +114,21 @@ int sysctl_tcp_moderate_rcvbuf = 1; | |||
| 114 | /* Adapt the MSS value used to make delayed ack decision to the | 114 | /* Adapt the MSS value used to make delayed ack decision to the |
| 115 | * real world. | 115 | * real world. |
| 116 | */ | 116 | */ |
| 117 | static inline void tcp_measure_rcv_mss(struct tcp_sock *tp, | 117 | static inline void tcp_measure_rcv_mss(struct sock *sk, |
| 118 | struct sk_buff *skb) | 118 | const struct sk_buff *skb) |
| 119 | { | 119 | { |
| 120 | unsigned int len, lss; | 120 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 121 | const unsigned int lss = icsk->icsk_ack.last_seg_size; | ||
| 122 | unsigned int len; | ||
| 121 | 123 | ||
| 122 | lss = tp->ack.last_seg_size; | 124 | icsk->icsk_ack.last_seg_size = 0; |
| 123 | tp->ack.last_seg_size = 0; | ||
| 124 | 125 | ||
| 125 | /* skb->len may jitter because of SACKs, even if peer | 126 | /* skb->len may jitter because of SACKs, even if peer |
| 126 | * sends good full-sized frames. | 127 | * sends good full-sized frames. |
| 127 | */ | 128 | */ |
| 128 | len = skb->len; | 129 | len = skb->len; |
| 129 | if (len >= tp->ack.rcv_mss) { | 130 | if (len >= icsk->icsk_ack.rcv_mss) { |
| 130 | tp->ack.rcv_mss = len; | 131 | icsk->icsk_ack.rcv_mss = len; |
| 131 | } else { | 132 | } else { |
| 132 | /* Otherwise, we make more careful check taking into account, | 133 | /* Otherwise, we make more careful check taking into account, |
| 133 | * that SACKs block is variable. | 134 | * that SACKs block is variable. |
| @@ -147,41 +148,44 @@ static inline void tcp_measure_rcv_mss(struct tcp_sock *tp, | |||
| 147 | * tcp header plus fixed timestamp option length. | 148 | * tcp header plus fixed timestamp option length. |
| 148 | * Resulting "len" is MSS free of SACK jitter. | 149 | * Resulting "len" is MSS free of SACK jitter. |
| 149 | */ | 150 | */ |
| 150 | len -= tp->tcp_header_len; | 151 | len -= tcp_sk(sk)->tcp_header_len; |
| 151 | tp->ack.last_seg_size = len; | 152 | icsk->icsk_ack.last_seg_size = len; |
| 152 | if (len == lss) { | 153 | if (len == lss) { |
| 153 | tp->ack.rcv_mss = len; | 154 | icsk->icsk_ack.rcv_mss = len; |
| 154 | return; | 155 | return; |
| 155 | } | 156 | } |
| 156 | } | 157 | } |
| 157 | tp->ack.pending |= TCP_ACK_PUSHED; | 158 | icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; |
| 158 | } | 159 | } |
| 159 | } | 160 | } |
| 160 | 161 | ||
| 161 | static void tcp_incr_quickack(struct tcp_sock *tp) | 162 | static void tcp_incr_quickack(struct sock *sk) |
| 162 | { | 163 | { |
| 163 | unsigned quickacks = tp->rcv_wnd/(2*tp->ack.rcv_mss); | 164 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 165 | unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); | ||
| 164 | 166 | ||
| 165 | if (quickacks==0) | 167 | if (quickacks==0) |
| 166 | quickacks=2; | 168 | quickacks=2; |
| 167 | if (quickacks > tp->ack.quick) | 169 | if (quickacks > icsk->icsk_ack.quick) |
| 168 | tp->ack.quick = min(quickacks, TCP_MAX_QUICKACKS); | 170 | icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); |
| 169 | } | 171 | } |
| 170 | 172 | ||
| 171 | void tcp_enter_quickack_mode(struct tcp_sock *tp) | 173 | void tcp_enter_quickack_mode(struct sock *sk) |
| 172 | { | 174 | { |
| 173 | tcp_incr_quickack(tp); | 175 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 174 | tp->ack.pingpong = 0; | 176 | tcp_incr_quickack(sk); |
| 175 | tp->ack.ato = TCP_ATO_MIN; | 177 | icsk->icsk_ack.pingpong = 0; |
| 178 | icsk->icsk_ack.ato = TCP_ATO_MIN; | ||
| 176 | } | 179 | } |
| 177 | 180 | ||
| 178 | /* Send ACKs quickly, if "quick" count is not exhausted | 181 | /* Send ACKs quickly, if "quick" count is not exhausted |
| 179 | * and the session is not interactive. | 182 | * and the session is not interactive. |
| 180 | */ | 183 | */ |
| 181 | 184 | ||
| 182 | static __inline__ int tcp_in_quickack_mode(struct tcp_sock *tp) | 185 | static inline int tcp_in_quickack_mode(const struct sock *sk) |
| 183 | { | 186 | { |
| 184 | return (tp->ack.quick && !tp->ack.pingpong); | 187 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 188 | return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; | ||
| 185 | } | 189 | } |
| 186 | 190 | ||
| 187 | /* Buffer size and advertised window tuning. | 191 | /* Buffer size and advertised window tuning. |
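
tcp_incr_quickack() above sizes the quick-ACK budget at roughly one ACK per two receiver-MSS of window, floored at 2 and capped at TCP_MAX_QUICKACKS, and tcp_enter_quickack_mode() arms it by clearing pingpong and resetting ato. The budget arithmetic in isolation (the cap of 16 is an assumption about TCP_MAX_QUICKACKS in kernels of this vintage):

    #include <stdint.h>

    #define MAX_QUICKACKS 16U   /* assumed value of TCP_MAX_QUICKACKS */

    static unsigned int quickack_budget(uint32_t rcv_wnd, uint32_t rcv_mss)
    {
        unsigned int quickacks = rcv_wnd / (2 * rcv_mss);

        if (quickacks == 0)
            quickacks = 2;
        return quickacks < MAX_QUICKACKS ? quickacks : MAX_QUICKACKS;
    }
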
| @@ -224,8 +228,8 @@ static void tcp_fixup_sndbuf(struct sock *sk) | |||
| 224 | */ | 228 | */ |
| 225 | 229 | ||
| 226 | /* Slow part of check#2. */ | 230 | /* Slow part of check#2. */ |
| 227 | static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp, | 231 | static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, |
| 228 | struct sk_buff *skb) | 232 | const struct sk_buff *skb) |
| 229 | { | 233 | { |
| 230 | /* Optimize this! */ | 234 | /* Optimize this! */ |
| 231 | int truesize = tcp_win_from_space(skb->truesize)/2; | 235 | int truesize = tcp_win_from_space(skb->truesize)/2; |
| @@ -233,7 +237,7 @@ static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp, | |||
| 233 | 237 | ||
| 234 | while (tp->rcv_ssthresh <= window) { | 238 | while (tp->rcv_ssthresh <= window) { |
| 235 | if (truesize <= skb->len) | 239 | if (truesize <= skb->len) |
| 236 | return 2*tp->ack.rcv_mss; | 240 | return 2 * inet_csk(sk)->icsk_ack.rcv_mss; |
| 237 | 241 | ||
| 238 | truesize >>= 1; | 242 | truesize >>= 1; |
| 239 | window >>= 1; | 243 | window >>= 1; |
| @@ -260,7 +264,7 @@ static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, | |||
| 260 | 264 | ||
| 261 | if (incr) { | 265 | if (incr) { |
| 262 | tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); | 266 | tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); |
| 263 | tp->ack.quick |= 1; | 267 | inet_csk(sk)->icsk_ack.quick |= 1; |
| 264 | } | 268 | } |
| 265 | } | 269 | } |
| 266 | } | 270 | } |
| @@ -321,11 +325,12 @@ static void tcp_init_buffer_space(struct sock *sk) | |||
| 321 | /* 5. Recalculate window clamp after socket hit its memory bounds. */ | 325 | /* 5. Recalculate window clamp after socket hit its memory bounds. */ |
| 322 | static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) | 326 | static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) |
| 323 | { | 327 | { |
| 328 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 324 | struct sk_buff *skb; | 329 | struct sk_buff *skb; |
| 325 | unsigned int app_win = tp->rcv_nxt - tp->copied_seq; | 330 | unsigned int app_win = tp->rcv_nxt - tp->copied_seq; |
| 326 | int ofo_win = 0; | 331 | int ofo_win = 0; |
| 327 | 332 | ||
| 328 | tp->ack.quick = 0; | 333 | icsk->icsk_ack.quick = 0; |
| 329 | 334 | ||
| 330 | skb_queue_walk(&tp->out_of_order_queue, skb) { | 335 | skb_queue_walk(&tp->out_of_order_queue, skb) { |
| 331 | ofo_win += skb->len; | 336 | ofo_win += skb->len; |
| @@ -346,8 +351,8 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) | |||
| 346 | app_win += ofo_win; | 351 | app_win += ofo_win; |
| 347 | if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) | 352 | if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) |
| 348 | app_win >>= 1; | 353 | app_win >>= 1; |
| 349 | if (app_win > tp->ack.rcv_mss) | 354 | if (app_win > icsk->icsk_ack.rcv_mss) |
| 350 | app_win -= tp->ack.rcv_mss; | 355 | app_win -= icsk->icsk_ack.rcv_mss; |
| 351 | app_win = max(app_win, 2U*tp->advmss); | 356 | app_win = max(app_win, 2U*tp->advmss); |
| 352 | 357 | ||
| 353 | if (!ofo_win) | 358 | if (!ofo_win) |
| @@ -415,11 +420,12 @@ new_measure: | |||
| 415 | tp->rcv_rtt_est.time = tcp_time_stamp; | 420 | tp->rcv_rtt_est.time = tcp_time_stamp; |
| 416 | } | 421 | } |
| 417 | 422 | ||
| 418 | static inline void tcp_rcv_rtt_measure_ts(struct tcp_sock *tp, struct sk_buff *skb) | 423 | static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb) |
| 419 | { | 424 | { |
| 425 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 420 | if (tp->rx_opt.rcv_tsecr && | 426 | if (tp->rx_opt.rcv_tsecr && |
| 421 | (TCP_SKB_CB(skb)->end_seq - | 427 | (TCP_SKB_CB(skb)->end_seq - |
| 422 | TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss)) | 428 | TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) |
| 423 | tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0); | 429 | tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0); |
| 424 | } | 430 | } |
| 425 | 431 | ||
| @@ -492,41 +498,42 @@ new_measure: | |||
| 492 | */ | 498 | */ |
| 493 | static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) | 499 | static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) |
| 494 | { | 500 | { |
| 501 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 495 | u32 now; | 502 | u32 now; |
| 496 | 503 | ||
| 497 | tcp_schedule_ack(tp); | 504 | inet_csk_schedule_ack(sk); |
| 498 | 505 | ||
| 499 | tcp_measure_rcv_mss(tp, skb); | 506 | tcp_measure_rcv_mss(sk, skb); |
| 500 | 507 | ||
| 501 | tcp_rcv_rtt_measure(tp); | 508 | tcp_rcv_rtt_measure(tp); |
| 502 | 509 | ||
| 503 | now = tcp_time_stamp; | 510 | now = tcp_time_stamp; |
| 504 | 511 | ||
| 505 | if (!tp->ack.ato) { | 512 | if (!icsk->icsk_ack.ato) { |
| 506 | /* The _first_ data packet received, initialize | 513 | /* The _first_ data packet received, initialize |
| 507 | * delayed ACK engine. | 514 | * delayed ACK engine. |
| 508 | */ | 515 | */ |
| 509 | tcp_incr_quickack(tp); | 516 | tcp_incr_quickack(sk); |
| 510 | tp->ack.ato = TCP_ATO_MIN; | 517 | icsk->icsk_ack.ato = TCP_ATO_MIN; |
| 511 | } else { | 518 | } else { |
| 512 | int m = now - tp->ack.lrcvtime; | 519 | int m = now - icsk->icsk_ack.lrcvtime; |
| 513 | 520 | ||
| 514 | if (m <= TCP_ATO_MIN/2) { | 521 | if (m <= TCP_ATO_MIN/2) { |
| 515 | /* The fastest case is the first. */ | 522 | /* The fastest case is the first. */ |
| 516 | tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2; | 523 | icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2; |
| 517 | } else if (m < tp->ack.ato) { | 524 | } else if (m < icsk->icsk_ack.ato) { |
| 518 | tp->ack.ato = (tp->ack.ato>>1) + m; | 525 | icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m; |
| 519 | if (tp->ack.ato > tp->rto) | 526 | if (icsk->icsk_ack.ato > icsk->icsk_rto) |
| 520 | tp->ack.ato = tp->rto; | 527 | icsk->icsk_ack.ato = icsk->icsk_rto; |
| 521 | } else if (m > tp->rto) { | 528 | } else if (m > icsk->icsk_rto) { |
| 522 | /* Too long gap. Apparently sender failed to | 529 |
| 523 | * restart window, so that we send ACKs quickly. | 530 | * restart window, so that we send ACKs quickly. |
| 524 | */ | 531 | */ |
| 525 | tcp_incr_quickack(tp); | 532 | tcp_incr_quickack(sk); |
| 526 | sk_stream_mem_reclaim(sk); | 533 | sk_stream_mem_reclaim(sk); |
| 527 | } | 534 | } |
| 528 | } | 535 | } |
| 529 | tp->ack.lrcvtime = now; | 536 | icsk->icsk_ack.lrcvtime = now; |
| 530 | 537 | ||
| 531 | TCP_ECN_check_ce(tp, skb); | 538 | TCP_ECN_check_ce(tp, skb); |
| 532 | 539 | ||
| @@ -543,8 +550,10 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ | |||
| 543 | * To save cycles in the RFC 1323 implementation it was better to break | 550 | * To save cycles in the RFC 1323 implementation it was better to break |
| 544 | * it up into three procedures. -- erics | 551 | * it up into three procedures. -- erics |
| 545 | */ | 552 | */ |
| 546 | static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) | 553 | static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) |
| 547 | { | 554 | { |
| 555 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 556 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 548 | long m = mrtt; /* RTT */ | 557 | long m = mrtt; /* RTT */ |
| 549 | 558 | ||
| 550 | /* The following amusing code comes from Jacobson's | 559 | /* The following amusing code comes from Jacobson's |
| @@ -604,15 +613,16 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) | |||
| 604 | tp->rtt_seq = tp->snd_nxt; | 613 | tp->rtt_seq = tp->snd_nxt; |
| 605 | } | 614 | } |
| 606 | 615 | ||
| 607 | if (tp->ca_ops->rtt_sample) | 616 | if (icsk->icsk_ca_ops->rtt_sample) |
| 608 | tp->ca_ops->rtt_sample(tp, *usrtt); | 617 | icsk->icsk_ca_ops->rtt_sample(sk, *usrtt); |
| 609 | } | 618 | } |
| 610 | 619 | ||
| 611 | /* Calculate rto without backoff. This is the second half of Van Jacobson's | 620 | /* Calculate rto without backoff. This is the second half of Van Jacobson's |
| 612 | * routine referred to above. | 621 | * routine referred to above. |
| 613 | */ | 622 | */ |
| 614 | static inline void tcp_set_rto(struct tcp_sock *tp) | 623 | static inline void tcp_set_rto(struct sock *sk) |
| 615 | { | 624 | { |
| 625 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 616 | /* Old crap is replaced with new one. 8) | 626 | /* Old crap is replaced with new one. 8) |
| 617 | * | 627 | * |
| 618 | * More seriously: | 628 | * More seriously: |
| @@ -623,7 +633,7 @@ static inline void tcp_set_rto(struct tcp_sock *tp) | |||
| 623 | * is invisible. Actually, Linux-2.4 also generates erratic | 633 | * is invisible. Actually, Linux-2.4 also generates erratic |
| 624 | * ACKs in some circumstances. | 634 |
| 625 | */ | 635 | */ |
| 626 | tp->rto = (tp->srtt >> 3) + tp->rttvar; | 636 | inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; |
| 627 | 637 | ||
| 628 | /* 2. Fixups made earlier cannot be right. | 638 | /* 2. Fixups made earlier cannot be right. |
| 629 | * If we do not estimate RTO correctly without them, | 639 | * If we do not estimate RTO correctly without them, |
| @@ -635,10 +645,10 @@ static inline void tcp_set_rto(struct tcp_sock *tp) | |||
| 635 | /* NOTE: clamping at TCP_RTO_MIN is not required, current algo | 645 | /* NOTE: clamping at TCP_RTO_MIN is not required, current algo |
| 636 | * guarantees that rto is higher. | 646 | * guarantees that rto is higher. |
| 637 | */ | 647 | */ |
| 638 | static inline void tcp_bound_rto(struct tcp_sock *tp) | 648 | static inline void tcp_bound_rto(struct sock *sk) |
| 639 | { | 649 | { |
| 640 | if (tp->rto > TCP_RTO_MAX) | 650 | if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) |
| 641 | tp->rto = TCP_RTO_MAX; | 651 | inet_csk(sk)->icsk_rto = TCP_RTO_MAX; |
| 642 | } | 652 | } |
| 643 | 653 | ||
| 644 | /* Save metrics learned by this TCP session. | 654 | /* Save metrics learned by this TCP session. |
| @@ -656,9 +666,10 @@ void tcp_update_metrics(struct sock *sk) | |||
| 656 | dst_confirm(dst); | 666 | dst_confirm(dst); |
| 657 | 667 | ||
| 658 | if (dst && (dst->flags&DST_HOST)) { | 668 | if (dst && (dst->flags&DST_HOST)) { |
| 669 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 659 | int m; | 670 | int m; |
| 660 | 671 | ||
| 661 | if (tp->backoff || !tp->srtt) { | 672 | if (icsk->icsk_backoff || !tp->srtt) { |
| 662 | /* This session failed to estimate rtt. Why? | 673 | /* This session failed to estimate rtt. Why? |
| 663 | * Probably, no packets returned in time. | 674 | * Probably, no packets returned in time. |
| 664 | * Reset our results. | 675 | * Reset our results. |
| @@ -707,7 +718,7 @@ void tcp_update_metrics(struct sock *sk) | |||
| 707 | tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) | 718 | tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) |
| 708 | dst->metrics[RTAX_CWND-1] = tp->snd_cwnd; | 719 | dst->metrics[RTAX_CWND-1] = tp->snd_cwnd; |
| 709 | } else if (tp->snd_cwnd > tp->snd_ssthresh && | 720 | } else if (tp->snd_cwnd > tp->snd_ssthresh && |
| 710 | tp->ca_state == TCP_CA_Open) { | 721 | icsk->icsk_ca_state == TCP_CA_Open) { |
| 711 | /* Cong. avoidance phase, cwnd is reliable. */ | 722 | /* Cong. avoidance phase, cwnd is reliable. */ |
| 712 | if (!dst_metric_locked(dst, RTAX_SSTHRESH)) | 723 | if (!dst_metric_locked(dst, RTAX_SSTHRESH)) |
| 713 | dst->metrics[RTAX_SSTHRESH-1] = | 724 | dst->metrics[RTAX_SSTHRESH-1] = |
| @@ -801,9 +812,9 @@ static void tcp_init_metrics(struct sock *sk) | |||
| 801 | tp->mdev = dst_metric(dst, RTAX_RTTVAR); | 812 | tp->mdev = dst_metric(dst, RTAX_RTTVAR); |
| 802 | tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); | 813 | tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); |
| 803 | } | 814 | } |
| 804 | tcp_set_rto(tp); | 815 | tcp_set_rto(sk); |
| 805 | tcp_bound_rto(tp); | 816 | tcp_bound_rto(sk); |
| 806 | if (tp->rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) | 817 | if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) |
| 807 | goto reset; | 818 | goto reset; |
| 808 | tp->snd_cwnd = tcp_init_cwnd(tp, dst); | 819 | tp->snd_cwnd = tcp_init_cwnd(tp, dst); |
| 809 | tp->snd_cwnd_stamp = tcp_time_stamp; | 820 | tp->snd_cwnd_stamp = tcp_time_stamp; |
| @@ -817,12 +828,14 @@ reset: | |||
| 817 | if (!tp->rx_opt.saw_tstamp && tp->srtt) { | 828 | if (!tp->rx_opt.saw_tstamp && tp->srtt) { |
| 818 | tp->srtt = 0; | 829 | tp->srtt = 0; |
| 819 | tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; | 830 | tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; |
| 820 | tp->rto = TCP_TIMEOUT_INIT; | 831 | inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; |
| 821 | } | 832 | } |
| 822 | } | 833 | } |
| 823 | 834 | ||
| 824 | static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts) | 835 | static void tcp_update_reordering(struct sock *sk, const int metric, |
| 836 | const int ts) | ||
| 825 | { | 837 | { |
| 838 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 826 | if (metric > tp->reordering) { | 839 | if (metric > tp->reordering) { |
| 827 | tp->reordering = min(TCP_MAX_REORDERING, metric); | 840 | tp->reordering = min(TCP_MAX_REORDERING, metric); |
| 828 | 841 | ||
| @@ -837,7 +850,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts) | |||
| 837 | NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER); | 850 | NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER); |
| 838 | #if FASTRETRANS_DEBUG > 1 | 851 | #if FASTRETRANS_DEBUG > 1 |
| 839 | printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", | 852 | printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", |
| 840 | tp->rx_opt.sack_ok, tp->ca_state, | 853 | tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, |
| 841 | tp->reordering, | 854 | tp->reordering, |
| 842 | tp->fackets_out, | 855 | tp->fackets_out, |
| 843 | tp->sacked_out, | 856 | tp->sacked_out, |
| @@ -899,6 +912,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts) | |||
| 899 | static int | 912 | static int |
| 900 | tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) | 913 | tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) |
| 901 | { | 914 | { |
| 915 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 902 | struct tcp_sock *tp = tcp_sk(sk); | 916 | struct tcp_sock *tp = tcp_sk(sk); |
| 903 | unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked; | 917 | unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked; |
| 904 | struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2); | 918 | struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2); |
| @@ -1064,7 +1078,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ | |||
| 1064 | * we have to account for reordering! Ugly, | 1078 | * we have to account for reordering! Ugly, |
| 1065 | * but should help. | 1079 | * but should help. |
| 1066 | */ | 1080 | */ |
| 1067 | if (lost_retrans && tp->ca_state == TCP_CA_Recovery) { | 1081 | if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) { |
| 1068 | struct sk_buff *skb; | 1082 | struct sk_buff *skb; |
| 1069 | 1083 | ||
| 1070 | sk_stream_for_retrans_queue(skb, sk) { | 1084 | sk_stream_for_retrans_queue(skb, sk) { |
| @@ -1093,8 +1107,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ | |||
| 1093 | 1107 | ||
| 1094 | tp->left_out = tp->sacked_out + tp->lost_out; | 1108 | tp->left_out = tp->sacked_out + tp->lost_out; |
| 1095 | 1109 | ||
| 1096 | if ((reord < tp->fackets_out) && tp->ca_state != TCP_CA_Loss) | 1110 | if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss) |
| 1097 | tcp_update_reordering(tp, ((tp->fackets_out + 1) - reord), 0); | 1111 | tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0); |
| 1098 | 1112 | ||
| 1099 | #if FASTRETRANS_DEBUG > 0 | 1113 | #if FASTRETRANS_DEBUG > 0 |
| 1100 | BUG_TRAP((int)tp->sacked_out >= 0); | 1114 | BUG_TRAP((int)tp->sacked_out >= 0); |
| @@ -1111,17 +1125,18 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ | |||
| 1111 | */ | 1125 | */ |
| 1112 | void tcp_enter_frto(struct sock *sk) | 1126 | void tcp_enter_frto(struct sock *sk) |
| 1113 | { | 1127 | { |
| 1128 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 1114 | struct tcp_sock *tp = tcp_sk(sk); | 1129 | struct tcp_sock *tp = tcp_sk(sk); |
| 1115 | struct sk_buff *skb; | 1130 | struct sk_buff *skb; |
| 1116 | 1131 | ||
| 1117 | tp->frto_counter = 1; | 1132 | tp->frto_counter = 1; |
| 1118 | 1133 | ||
| 1119 | if (tp->ca_state <= TCP_CA_Disorder || | 1134 | if (icsk->icsk_ca_state <= TCP_CA_Disorder || |
| 1120 | tp->snd_una == tp->high_seq || | 1135 | tp->snd_una == tp->high_seq || |
| 1121 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { | 1136 | (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { |
| 1122 | tp->prior_ssthresh = tcp_current_ssthresh(tp); | 1137 | tp->prior_ssthresh = tcp_current_ssthresh(sk); |
| 1123 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); | 1138 | tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); |
| 1124 | tcp_ca_event(tp, CA_EVENT_FRTO); | 1139 | tcp_ca_event(sk, CA_EVENT_FRTO); |
| 1125 | } | 1140 | } |
| 1126 | 1141 | ||
| 1127 | /* Have to clear retransmission markers here to keep the bookkeeping | 1142 | /* Have to clear retransmission markers here to keep the bookkeeping |
| @@ -1138,7 +1153,7 @@ void tcp_enter_frto(struct sock *sk) | |||
| 1138 | } | 1153 | } |
| 1139 | tcp_sync_left_out(tp); | 1154 | tcp_sync_left_out(tp); |
| 1140 | 1155 | ||
| 1141 | tcp_set_ca_state(tp, TCP_CA_Open); | 1156 | tcp_set_ca_state(sk, TCP_CA_Open); |
| 1142 | tp->frto_highmark = tp->snd_nxt; | 1157 | tp->frto_highmark = tp->snd_nxt; |
| 1143 | } | 1158 | } |
| 1144 | 1159 | ||
| @@ -1184,7 +1199,7 @@ static void tcp_enter_frto_loss(struct sock *sk) | |||
| 1184 | 1199 | ||
| 1185 | tp->reordering = min_t(unsigned int, tp->reordering, | 1200 | tp->reordering = min_t(unsigned int, tp->reordering, |
| 1186 | sysctl_tcp_reordering); | 1201 | sysctl_tcp_reordering); |
| 1187 | tcp_set_ca_state(tp, TCP_CA_Loss); | 1202 | tcp_set_ca_state(sk, TCP_CA_Loss); |
| 1188 | tp->high_seq = tp->frto_highmark; | 1203 | tp->high_seq = tp->frto_highmark; |
| 1189 | TCP_ECN_queue_cwr(tp); | 1204 | TCP_ECN_queue_cwr(tp); |
| 1190 | } | 1205 | } |
| @@ -1208,16 +1223,17 @@ void tcp_clear_retrans(struct tcp_sock *tp) | |||
| 1208 | */ | 1223 | */ |
| 1209 | void tcp_enter_loss(struct sock *sk, int how) | 1224 | void tcp_enter_loss(struct sock *sk, int how) |
| 1210 | { | 1225 | { |
| 1226 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 1211 | struct tcp_sock *tp = tcp_sk(sk); | 1227 | struct tcp_sock *tp = tcp_sk(sk); |
| 1212 | struct sk_buff *skb; | 1228 | struct sk_buff *skb; |
| 1213 | int cnt = 0; | 1229 | int cnt = 0; |
| 1214 | 1230 | ||
| 1215 | /* Reduce ssthresh if it has not yet been made inside this window. */ | 1231 | /* Reduce ssthresh if it has not yet been made inside this window. */ |
| 1216 | if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || | 1232 | if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || |
| 1217 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { | 1233 | (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { |
| 1218 | tp->prior_ssthresh = tcp_current_ssthresh(tp); | 1234 | tp->prior_ssthresh = tcp_current_ssthresh(sk); |
| 1219 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); | 1235 | tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); |
| 1220 | tcp_ca_event(tp, CA_EVENT_LOSS); | 1236 | tcp_ca_event(sk, CA_EVENT_LOSS); |
| 1221 | } | 1237 | } |
| 1222 | tp->snd_cwnd = 1; | 1238 | tp->snd_cwnd = 1; |
| 1223 | tp->snd_cwnd_cnt = 0; | 1239 | tp->snd_cwnd_cnt = 0; |
| @@ -1248,12 +1264,12 @@ void tcp_enter_loss(struct sock *sk, int how) | |||
| 1248 | 1264 | ||
| 1249 | tp->reordering = min_t(unsigned int, tp->reordering, | 1265 | tp->reordering = min_t(unsigned int, tp->reordering, |
| 1250 | sysctl_tcp_reordering); | 1266 | sysctl_tcp_reordering); |
| 1251 | tcp_set_ca_state(tp, TCP_CA_Loss); | 1267 | tcp_set_ca_state(sk, TCP_CA_Loss); |
| 1252 | tp->high_seq = tp->snd_nxt; | 1268 | tp->high_seq = tp->snd_nxt; |
| 1253 | TCP_ECN_queue_cwr(tp); | 1269 | TCP_ECN_queue_cwr(tp); |
| 1254 | } | 1270 | } |
| 1255 | 1271 | ||
| 1256 | static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp) | 1272 | static int tcp_check_sack_reneging(struct sock *sk) |
| 1257 | { | 1273 | { |
| 1258 | struct sk_buff *skb; | 1274 | struct sk_buff *skb; |
| 1259 | 1275 | ||
| @@ -1265,12 +1281,14 @@ static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp) | |||
| 1265 | */ | 1281 | */ |
| 1266 | if ((skb = skb_peek(&sk->sk_write_queue)) != NULL && | 1282 | if ((skb = skb_peek(&sk->sk_write_queue)) != NULL && |
| 1267 | (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { | 1283 | (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { |
| 1284 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 1268 | NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); | 1285 | NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); |
| 1269 | 1286 | ||
| 1270 | tcp_enter_loss(sk, 1); | 1287 | tcp_enter_loss(sk, 1); |
| 1271 | tp->retransmits++; | 1288 | icsk->icsk_retransmits++; |
| 1272 | tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); | 1289 | tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); |
| 1273 | tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); | 1290 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
| 1291 | icsk->icsk_rto, TCP_RTO_MAX); | ||
| 1274 | return 1; | 1292 | return 1; |
| 1275 | } | 1293 | } |
| 1276 | return 0; | 1294 | return 0; |
| @@ -1281,15 +1299,15 @@ static inline int tcp_fackets_out(struct tcp_sock *tp) | |||
| 1281 | return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out; | 1299 | return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out; |
| 1282 | } | 1300 | } |
| 1283 | 1301 | ||
| 1284 | static inline int tcp_skb_timedout(struct tcp_sock *tp, struct sk_buff *skb) | 1302 | static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) |
| 1285 | { | 1303 | { |
| 1286 | return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto); | 1304 | return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); |
| 1287 | } | 1305 | } |
| 1288 | 1306 | ||
| 1289 | static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) | 1307 | static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) |
| 1290 | { | 1308 | { |
| 1291 | return tp->packets_out && | 1309 | return tp->packets_out && |
| 1292 | tcp_skb_timedout(tp, skb_peek(&sk->sk_write_queue)); | 1310 | tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue)); |
| 1293 | } | 1311 | } |
| 1294 | 1312 | ||
| 1295 | /* Linux NewReno/SACK/FACK/ECN state machine. | 1313 | /* Linux NewReno/SACK/FACK/ECN state machine. |
| @@ -1423,8 +1441,9 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) | |||
| 1423 | * on the assumption of absent reordering, interpret this as reordering. | 1441 |
| 1424 | * The only other reason could be a bug in the receiver's TCP. | 1442 |
| 1425 | */ | 1443 | */ |
| 1426 | static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend) | 1444 | static void tcp_check_reno_reordering(struct sock *sk, const int addend) |
| 1427 | { | 1445 | { |
| 1446 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 1428 | u32 holes; | 1447 | u32 holes; |
| 1429 | 1448 | ||
| 1430 | holes = max(tp->lost_out, 1U); | 1449 | holes = max(tp->lost_out, 1U); |
| @@ -1432,16 +1451,17 @@ static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend) | |||
| 1432 | 1451 | ||
| 1433 | if ((tp->sacked_out + holes) > tp->packets_out) { | 1452 | if ((tp->sacked_out + holes) > tp->packets_out) { |
| 1434 | tp->sacked_out = tp->packets_out - holes; | 1453 | tp->sacked_out = tp->packets_out - holes; |
| 1435 | tcp_update_reordering(tp, tp->packets_out+addend, 0); | 1454 | tcp_update_reordering(sk, tp->packets_out + addend, 0); |
| 1436 | } | 1455 | } |
| 1437 | } | 1456 | } |
| 1438 | 1457 | ||
| 1439 | /* Emulate SACKs for SACKless connection: account for a new dupack. */ | 1458 | /* Emulate SACKs for SACKless connection: account for a new dupack. */ |
| 1440 | 1459 | ||
| 1441 | static void tcp_add_reno_sack(struct tcp_sock *tp) | 1460 | static void tcp_add_reno_sack(struct sock *sk) |
| 1442 | { | 1461 | { |
| 1462 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 1443 | tp->sacked_out++; | 1463 | tp->sacked_out++; |
| 1444 | tcp_check_reno_reordering(tp, 0); | 1464 | tcp_check_reno_reordering(sk, 0); |
| 1445 | tcp_sync_left_out(tp); | 1465 | tcp_sync_left_out(tp); |
| 1446 | } | 1466 | } |
| 1447 | 1467 | ||
| @@ -1456,7 +1476,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acke | |||
| 1456 | else | 1476 | else |
| 1457 | tp->sacked_out -= acked-1; | 1477 | tp->sacked_out -= acked-1; |
| 1458 | } | 1478 | } |
| 1459 | tcp_check_reno_reordering(tp, acked); | 1479 | tcp_check_reno_reordering(sk, acked); |
| 1460 | tcp_sync_left_out(tp); | 1480 | tcp_sync_left_out(tp); |
| 1461 | } | 1481 | } |
| 1462 | 1482 | ||
| @@ -1509,7 +1529,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) | |||
| 1509 | struct sk_buff *skb; | 1529 | struct sk_buff *skb; |
| 1510 | 1530 | ||
| 1511 | sk_stream_for_retrans_queue(skb, sk) { | 1531 | sk_stream_for_retrans_queue(skb, sk) { |
| 1512 | if (tcp_skb_timedout(tp, skb) && | 1532 | if (tcp_skb_timedout(sk, skb) && |
| 1513 | !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { | 1533 | !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { |
| 1514 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; | 1534 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; |
| 1515 | tp->lost_out += tcp_skb_pcount(skb); | 1535 | tp->lost_out += tcp_skb_pcount(skb); |
| @@ -1530,14 +1550,16 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) | |||
| 1530 | } | 1550 | } |
| 1531 | 1551 | ||
| 1532 | /* Decrease cwnd each second ack. */ | 1552 | /* Decrease cwnd each second ack. */ |
| 1533 | static void tcp_cwnd_down(struct tcp_sock *tp) | 1553 | static void tcp_cwnd_down(struct sock *sk) |
| 1534 | { | 1554 | { |
| 1555 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 1556 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 1535 | int decr = tp->snd_cwnd_cnt + 1; | 1557 | int decr = tp->snd_cwnd_cnt + 1; |
| 1536 | 1558 | ||
| 1537 | tp->snd_cwnd_cnt = decr&1; | 1559 | tp->snd_cwnd_cnt = decr&1; |
| 1538 | decr >>= 1; | 1560 | decr >>= 1; |
| 1539 | 1561 | ||
| 1540 | if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp)) | 1562 | if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk)) |
| 1541 | tp->snd_cwnd -= decr; | 1563 | tp->snd_cwnd -= decr; |
| 1542 | 1564 | ||
| 1543 | tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); | 1565 | tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); |
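
tcp_cwnd_down() shrinks cwnd by one for every second invocation: snd_cwnd_cnt carries the odd bit as a remainder between calls, and the result is floored by the module's min_cwnd and by in-flight plus one. The same arithmetic isolated:

    #include <stdint.h>

    static void cwnd_down_step(uint32_t *cwnd, uint32_t *cnt,
                               uint32_t min_cwnd, uint32_t in_flight)
    {
        uint32_t decr = *cnt + 1;

        *cnt = decr & 1;               /* keep the odd bit for the next call */
        decr >>= 1;                    /* decr is 1 only every second call */

        if (decr && *cwnd > min_cwnd)
            *cwnd -= decr;
        if (*cwnd > in_flight + 1)     /* never more than in-flight + 1 */
            *cwnd = in_flight + 1;
    }
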
| @@ -1571,11 +1593,15 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) | |||
| 1571 | #define DBGUNDO(x...) do { } while (0) | 1593 | #define DBGUNDO(x...) do { } while (0) |
| 1572 | #endif | 1594 | #endif |
| 1573 | 1595 | ||
| 1574 | static void tcp_undo_cwr(struct tcp_sock *tp, int undo) | 1596 | static void tcp_undo_cwr(struct sock *sk, const int undo) |
| 1575 | { | 1597 | { |
| 1598 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 1599 | |||
| 1576 | if (tp->prior_ssthresh) { | 1600 | if (tp->prior_ssthresh) { |
| 1577 | if (tp->ca_ops->undo_cwnd) | 1601 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 1578 | tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp); | 1602 | |
| 1603 | if (icsk->icsk_ca_ops->undo_cwnd) | ||
| 1604 | tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk); | ||
| 1579 | else | 1605 | else |
| 1580 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); | 1606 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); |
| 1581 | 1607 | ||
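The undo path also shows the ops-table indirection after the move: the hook now lives in `icsk->icsk_ca_ops` and takes `sk`, with a generic `max(cwnd, ssthresh << 1)` fallback when a congestion module leaves `undo_cwnd` unset. A sketch of that optional-hook dispatch, with invented miniature types:

```c
#include <stdio.h>

/* Invented miniature of the congestion-ops vtable: undo_cwnd is
 * optional, and the caller falls back to a generic undo when the
 * module leaves it NULL. */
struct mini_sock;

struct ca_ops {
        unsigned int (*undo_cwnd)(struct mini_sock *sk);
};

struct mini_sock {
        const struct ca_ops *ca_ops;
        unsigned int snd_cwnd, snd_ssthresh;
};

static unsigned int max_u(unsigned int a, unsigned int b)
{
        return a > b ? a : b;
}

static void undo_cwr(struct mini_sock *sk)
{
        if (sk->ca_ops->undo_cwnd)
                sk->snd_cwnd = sk->ca_ops->undo_cwnd(sk);
        else
                sk->snd_cwnd = max_u(sk->snd_cwnd, sk->snd_ssthresh << 1);
}

int main(void)
{
        static const struct ca_ops plain = { .undo_cwnd = NULL };
        struct mini_sock sk = { .ca_ops = &plain,
                                .snd_cwnd = 5, .snd_ssthresh = 8 };

        undo_cwr(&sk);
        printf("cwnd after undo: %u\n", sk.snd_cwnd);   /* 16 */
        return 0;
}
```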
| @@ -1603,9 +1629,9 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) | |||
| 1603 | /* Happy end! We did not retransmit anything | 1629 | /* Happy end! We did not retransmit anything |
| 1604 | * or our original transmission succeeded. | 1630 | * or our original transmission succeeded. |
| 1605 | */ | 1631 | */ |
| 1606 | DBGUNDO(sk, tp, tp->ca_state == TCP_CA_Loss ? "loss" : "retrans"); | 1632 | DBGUNDO(sk, tp, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); |
| 1607 | tcp_undo_cwr(tp, 1); | 1633 | tcp_undo_cwr(sk, 1); |
| 1608 | if (tp->ca_state == TCP_CA_Loss) | 1634 | if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) |
| 1609 | NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); | 1635 | NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); |
| 1610 | else | 1636 | else |
| 1611 | NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO); | 1637 | NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO); |
| @@ -1618,7 +1644,7 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) | |||
| 1618 | tcp_moderate_cwnd(tp); | 1644 | tcp_moderate_cwnd(tp); |
| 1619 | return 1; | 1645 | return 1; |
| 1620 | } | 1646 | } |
| 1621 | tcp_set_ca_state(tp, TCP_CA_Open); | 1647 | tcp_set_ca_state(sk, TCP_CA_Open); |
| 1622 | return 0; | 1648 | return 0; |
| 1623 | } | 1649 | } |
| 1624 | 1650 | ||
| @@ -1627,7 +1653,7 @@ static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp) | |||
| 1627 | { | 1653 | { |
| 1628 | if (tp->undo_marker && !tp->undo_retrans) { | 1654 | if (tp->undo_marker && !tp->undo_retrans) { |
| 1629 | DBGUNDO(sk, tp, "D-SACK"); | 1655 | DBGUNDO(sk, tp, "D-SACK"); |
| 1630 | tcp_undo_cwr(tp, 1); | 1656 | tcp_undo_cwr(sk, 1); |
| 1631 | tp->undo_marker = 0; | 1657 | tp->undo_marker = 0; |
| 1632 | NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO); | 1658 | NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO); |
| 1633 | } | 1659 | } |
| @@ -1648,10 +1674,10 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, | |||
| 1648 | if (tp->retrans_out == 0) | 1674 | if (tp->retrans_out == 0) |
| 1649 | tp->retrans_stamp = 0; | 1675 | tp->retrans_stamp = 0; |
| 1650 | 1676 | ||
| 1651 | tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1); | 1677 | tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); |
| 1652 | 1678 | ||
| 1653 | DBGUNDO(sk, tp, "Hoe"); | 1679 | DBGUNDO(sk, tp, "Hoe"); |
| 1654 | tcp_undo_cwr(tp, 0); | 1680 | tcp_undo_cwr(sk, 0); |
| 1655 | NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO); | 1681 | NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO); |
| 1656 | 1682 | ||
| 1657 | /* So... Do not make Hoe's retransmit yet. | 1683 | /* So... Do not make Hoe's retransmit yet. |
| @@ -1674,22 +1700,23 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) | |||
| 1674 | DBGUNDO(sk, tp, "partial loss"); | 1700 | DBGUNDO(sk, tp, "partial loss"); |
| 1675 | tp->lost_out = 0; | 1701 | tp->lost_out = 0; |
| 1676 | tp->left_out = tp->sacked_out; | 1702 | tp->left_out = tp->sacked_out; |
| 1677 | tcp_undo_cwr(tp, 1); | 1703 | tcp_undo_cwr(sk, 1); |
| 1678 | NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); | 1704 | NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); |
| 1679 | tp->retransmits = 0; | 1705 | inet_csk(sk)->icsk_retransmits = 0; |
| 1680 | tp->undo_marker = 0; | 1706 | tp->undo_marker = 0; |
| 1681 | if (!IsReno(tp)) | 1707 | if (!IsReno(tp)) |
| 1682 | tcp_set_ca_state(tp, TCP_CA_Open); | 1708 | tcp_set_ca_state(sk, TCP_CA_Open); |
| 1683 | return 1; | 1709 | return 1; |
| 1684 | } | 1710 | } |
| 1685 | return 0; | 1711 | return 0; |
| 1686 | } | 1712 | } |
| 1687 | 1713 | ||
| 1688 | static inline void tcp_complete_cwr(struct tcp_sock *tp) | 1714 | static inline void tcp_complete_cwr(struct sock *sk) |
| 1689 | { | 1715 | { |
| 1716 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 1690 | tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); | 1717 | tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); |
| 1691 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1718 | tp->snd_cwnd_stamp = tcp_time_stamp; |
| 1692 | tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR); | 1719 | tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); |
| 1693 | } | 1720 | } |
| 1694 | 1721 | ||
| 1695 | static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) | 1722 | static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) |
| @@ -1700,21 +1727,21 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) | |||
| 1700 | tp->retrans_stamp = 0; | 1727 | tp->retrans_stamp = 0; |
| 1701 | 1728 | ||
| 1702 | if (flag&FLAG_ECE) | 1729 | if (flag&FLAG_ECE) |
| 1703 | tcp_enter_cwr(tp); | 1730 | tcp_enter_cwr(sk); |
| 1704 | 1731 | ||
| 1705 | if (tp->ca_state != TCP_CA_CWR) { | 1732 | if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { |
| 1706 | int state = TCP_CA_Open; | 1733 | int state = TCP_CA_Open; |
| 1707 | 1734 | ||
| 1708 | if (tp->left_out || tp->retrans_out || tp->undo_marker) | 1735 | if (tp->left_out || tp->retrans_out || tp->undo_marker) |
| 1709 | state = TCP_CA_Disorder; | 1736 | state = TCP_CA_Disorder; |
| 1710 | 1737 | ||
| 1711 | if (tp->ca_state != state) { | 1738 | if (inet_csk(sk)->icsk_ca_state != state) { |
| 1712 | tcp_set_ca_state(tp, state); | 1739 | tcp_set_ca_state(sk, state); |
| 1713 | tp->high_seq = tp->snd_nxt; | 1740 | tp->high_seq = tp->snd_nxt; |
| 1714 | } | 1741 | } |
| 1715 | tcp_moderate_cwnd(tp); | 1742 | tcp_moderate_cwnd(tp); |
| 1716 | } else { | 1743 | } else { |
| 1717 | tcp_cwnd_down(tp); | 1744 | tcp_cwnd_down(sk); |
| 1718 | } | 1745 | } |
| 1719 | } | 1746 | } |
| 1720 | 1747 | ||
| @@ -1733,6 +1760,7 @@ static void | |||
| 1733 | tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | 1760 | tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, |
| 1734 | int prior_packets, int flag) | 1761 | int prior_packets, int flag) |
| 1735 | { | 1762 | { |
| 1763 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 1736 | struct tcp_sock *tp = tcp_sk(sk); | 1764 | struct tcp_sock *tp = tcp_sk(sk); |
| 1737 | int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP)); | 1765 | int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP)); |
| 1738 | 1766 | ||
| @@ -1750,13 +1778,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
| 1750 | tp->prior_ssthresh = 0; | 1778 | tp->prior_ssthresh = 0; |
| 1751 | 1779 | ||
| 1752 | /* B. In all the states check for reneging SACKs. */ | 1780 | /* B. In all the states check for reneging SACKs. */ |
| 1753 | if (tp->sacked_out && tcp_check_sack_reneging(sk, tp)) | 1781 | if (tp->sacked_out && tcp_check_sack_reneging(sk)) |
| 1754 | return; | 1782 | return; |
| 1755 | 1783 | ||
| 1756 | /* C. Process data loss notification, provided it is valid. */ | 1784 | /* C. Process data loss notification, provided it is valid. */ |
| 1757 | if ((flag&FLAG_DATA_LOST) && | 1785 | if ((flag&FLAG_DATA_LOST) && |
| 1758 | before(tp->snd_una, tp->high_seq) && | 1786 | before(tp->snd_una, tp->high_seq) && |
| 1759 | tp->ca_state != TCP_CA_Open && | 1787 | icsk->icsk_ca_state != TCP_CA_Open && |
| 1760 | tp->fackets_out > tp->reordering) { | 1788 | tp->fackets_out > tp->reordering) { |
| 1761 | tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq); | 1789 | tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq); |
| 1762 | NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); | 1790 | NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); |
| @@ -1767,14 +1795,14 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
| 1767 | 1795 | ||
| 1768 | /* E. Check state exit conditions. State can be terminated | 1796 | /* E. Check state exit conditions. State can be terminated |
| 1769 | * when high_seq is ACKed. */ | 1797 | * when high_seq is ACKed. */ |
| 1770 | if (tp->ca_state == TCP_CA_Open) { | 1798 | if (icsk->icsk_ca_state == TCP_CA_Open) { |
| 1771 | if (!sysctl_tcp_frto) | 1799 | if (!sysctl_tcp_frto) |
| 1772 | BUG_TRAP(tp->retrans_out == 0); | 1800 | BUG_TRAP(tp->retrans_out == 0); |
| 1773 | tp->retrans_stamp = 0; | 1801 | tp->retrans_stamp = 0; |
| 1774 | } else if (!before(tp->snd_una, tp->high_seq)) { | 1802 | } else if (!before(tp->snd_una, tp->high_seq)) { |
| 1775 | switch (tp->ca_state) { | 1803 | switch (icsk->icsk_ca_state) { |
| 1776 | case TCP_CA_Loss: | 1804 | case TCP_CA_Loss: |
| 1777 | tp->retransmits = 0; | 1805 | icsk->icsk_retransmits = 0; |
| 1778 | if (tcp_try_undo_recovery(sk, tp)) | 1806 | if (tcp_try_undo_recovery(sk, tp)) |
| 1779 | return; | 1807 | return; |
| 1780 | break; | 1808 | break; |
| @@ -1783,8 +1811,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
| 1783 | /* CWR is to be held until something *above* high_seq | 1811 | /* CWR is to be held until something *above* high_seq |
| 1784 | * is ACKed for the CWR bit to reach the receiver. */ | 1812 | * is ACKed for the CWR bit to reach the receiver. */ |
| 1785 | if (tp->snd_una != tp->high_seq) { | 1813 | if (tp->snd_una != tp->high_seq) { |
| 1786 | tcp_complete_cwr(tp); | 1814 | tcp_complete_cwr(sk); |
| 1787 | tcp_set_ca_state(tp, TCP_CA_Open); | 1815 | tcp_set_ca_state(sk, TCP_CA_Open); |
| 1788 | } | 1816 | } |
| 1789 | break; | 1817 | break; |
| 1790 | 1818 | ||
| @@ -1795,7 +1823,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
| 1795 | * catching for all duplicate ACKs. */ | 1823 | * catching for all duplicate ACKs. */ |
| 1796 | IsReno(tp) || tp->snd_una != tp->high_seq) { | 1824 | IsReno(tp) || tp->snd_una != tp->high_seq) { |
| 1797 | tp->undo_marker = 0; | 1825 | tp->undo_marker = 0; |
| 1798 | tcp_set_ca_state(tp, TCP_CA_Open); | 1826 | tcp_set_ca_state(sk, TCP_CA_Open); |
| 1799 | } | 1827 | } |
| 1800 | break; | 1828 | break; |
| 1801 | 1829 | ||
| @@ -1804,17 +1832,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
| 1804 | tcp_reset_reno_sack(tp); | 1832 | tcp_reset_reno_sack(tp); |
| 1805 | if (tcp_try_undo_recovery(sk, tp)) | 1833 | if (tcp_try_undo_recovery(sk, tp)) |
| 1806 | return; | 1834 | return; |
| 1807 | tcp_complete_cwr(tp); | 1835 | tcp_complete_cwr(sk); |
| 1808 | break; | 1836 | break; |
| 1809 | } | 1837 | } |
| 1810 | } | 1838 | } |
| 1811 | 1839 | ||
| 1812 | /* F. Process state. */ | 1840 | /* F. Process state. */ |
| 1813 | switch (tp->ca_state) { | 1841 | switch (icsk->icsk_ca_state) { |
| 1814 | case TCP_CA_Recovery: | 1842 | case TCP_CA_Recovery: |
| 1815 | if (prior_snd_una == tp->snd_una) { | 1843 | if (prior_snd_una == tp->snd_una) { |
| 1816 | if (IsReno(tp) && is_dupack) | 1844 | if (IsReno(tp) && is_dupack) |
| 1817 | tcp_add_reno_sack(tp); | 1845 | tcp_add_reno_sack(sk); |
| 1818 | } else { | 1846 | } else { |
| 1819 | int acked = prior_packets - tp->packets_out; | 1847 | int acked = prior_packets - tp->packets_out; |
| 1820 | if (IsReno(tp)) | 1848 | if (IsReno(tp)) |
| @@ -1824,13 +1852,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
| 1824 | break; | 1852 | break; |
| 1825 | case TCP_CA_Loss: | 1853 | case TCP_CA_Loss: |
| 1826 | if (flag&FLAG_DATA_ACKED) | 1854 | if (flag&FLAG_DATA_ACKED) |
| 1827 | tp->retransmits = 0; | 1855 | icsk->icsk_retransmits = 0; |
| 1828 | if (!tcp_try_undo_loss(sk, tp)) { | 1856 | if (!tcp_try_undo_loss(sk, tp)) { |
| 1829 | tcp_moderate_cwnd(tp); | 1857 | tcp_moderate_cwnd(tp); |
| 1830 | tcp_xmit_retransmit_queue(sk); | 1858 | tcp_xmit_retransmit_queue(sk); |
| 1831 | return; | 1859 | return; |
| 1832 | } | 1860 | } |
| 1833 | if (tp->ca_state != TCP_CA_Open) | 1861 | if (icsk->icsk_ca_state != TCP_CA_Open) |
| 1834 | return; | 1862 | return; |
| 1835 | /* Loss is undone; fall through to processing in Open state. */ | 1863 | /* Loss is undone; fall through to processing in Open state. */ |
| 1836 | default: | 1864 | default: |
| @@ -1838,10 +1866,10 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
| 1838 | if (tp->snd_una != prior_snd_una) | 1866 | if (tp->snd_una != prior_snd_una) |
| 1839 | tcp_reset_reno_sack(tp); | 1867 | tcp_reset_reno_sack(tp); |
| 1840 | if (is_dupack) | 1868 | if (is_dupack) |
| 1841 | tcp_add_reno_sack(tp); | 1869 | tcp_add_reno_sack(sk); |
| 1842 | } | 1870 | } |
| 1843 | 1871 | ||
| 1844 | if (tp->ca_state == TCP_CA_Disorder) | 1872 | if (icsk->icsk_ca_state == TCP_CA_Disorder) |
| 1845 | tcp_try_undo_dsack(sk, tp); | 1873 | tcp_try_undo_dsack(sk, tp); |
| 1846 | 1874 | ||
| 1847 | if (!tcp_time_to_recover(sk, tp)) { | 1875 | if (!tcp_time_to_recover(sk, tp)) { |
| @@ -1861,30 +1889,28 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
| 1861 | tp->undo_marker = tp->snd_una; | 1889 | tp->undo_marker = tp->snd_una; |
| 1862 | tp->undo_retrans = tp->retrans_out; | 1890 | tp->undo_retrans = tp->retrans_out; |
| 1863 | 1891 | ||
| 1864 | if (tp->ca_state < TCP_CA_CWR) { | 1892 | if (icsk->icsk_ca_state < TCP_CA_CWR) { |
| 1865 | if (!(flag&FLAG_ECE)) | 1893 | if (!(flag&FLAG_ECE)) |
| 1866 | tp->prior_ssthresh = tcp_current_ssthresh(tp); | 1894 | tp->prior_ssthresh = tcp_current_ssthresh(sk); |
| 1867 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); | 1895 | tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); |
| 1868 | TCP_ECN_queue_cwr(tp); | 1896 | TCP_ECN_queue_cwr(tp); |
| 1869 | } | 1897 | } |
| 1870 | 1898 | ||
| 1871 | tp->snd_cwnd_cnt = 0; | 1899 | tp->snd_cwnd_cnt = 0; |
| 1872 | tcp_set_ca_state(tp, TCP_CA_Recovery); | 1900 | tcp_set_ca_state(sk, TCP_CA_Recovery); |
| 1873 | } | 1901 | } |
| 1874 | 1902 | ||
| 1875 | if (is_dupack || tcp_head_timedout(sk, tp)) | 1903 | if (is_dupack || tcp_head_timedout(sk, tp)) |
| 1876 | tcp_update_scoreboard(sk, tp); | 1904 | tcp_update_scoreboard(sk, tp); |
| 1877 | tcp_cwnd_down(tp); | 1905 | tcp_cwnd_down(sk); |
| 1878 | tcp_xmit_retransmit_queue(sk); | 1906 | tcp_xmit_retransmit_queue(sk); |
| 1879 | } | 1907 | } |
| 1880 | 1908 | ||
| 1881 | /* Read draft-ietf-tcplw-high-performance before mucking | 1909 | /* Read draft-ietf-tcplw-high-performance before mucking |
| 1882 | * with this code. (Supersedes RFC1323) | 1910 | * with this code. (Supersedes RFC1323) |
| 1883 | */ | 1911 | */ |
| 1884 | static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) | 1912 | static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) |
| 1885 | { | 1913 | { |
| 1886 | __u32 seq_rtt; | ||
| 1887 | |||
| 1888 | /* RTTM Rule: A TSecr value received in a segment is used to | 1914 | /* RTTM Rule: A TSecr value received in a segment is used to |
| 1889 | * update the averaged RTT measurement only if the segment | 1915 | * update the averaged RTT measurement only if the segment |
| 1890 | * acknowledges some new data, i.e., only if it advances the | 1916 | * acknowledges some new data, i.e., only if it advances the |
| @@ -1900,14 +1926,15 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) | |||
| 1900 | * answer arrives rto becomes 120 seconds! If at least one of the segments | 1926 | * answer arrives rto becomes 120 seconds! If at least one of the segments |
| 1901 | * in the window is lost... Voila. --ANK (010210) | 1927 | * in the window is lost... Voila. --ANK (010210) |
| 1902 | */ | 1928 | */ |
| 1903 | seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; | 1929 | struct tcp_sock *tp = tcp_sk(sk); |
| 1904 | tcp_rtt_estimator(tp, seq_rtt, usrtt); | 1930 | const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; |
| 1905 | tcp_set_rto(tp); | 1931 | tcp_rtt_estimator(sk, seq_rtt, usrtt); |
| 1906 | tp->backoff = 0; | 1932 | tcp_set_rto(sk); |
| 1907 | tcp_bound_rto(tp); | 1933 | inet_csk(sk)->icsk_backoff = 0; |
| 1934 | tcp_bound_rto(sk); | ||
| 1908 | } | 1935 | } |
| 1909 | 1936 | ||
| 1910 | static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag) | 1937 | static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag) |
| 1911 | { | 1938 | { |
| 1912 | /* We don't have a timestamp. Can only use | 1939 | /* We don't have a timestamp. Can only use |
| 1913 | * packets that are not retransmitted to determine | 1940 | * packets that are not retransmitted to determine |
| @@ -1921,27 +1948,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int | |||
| 1921 | if (flag & FLAG_RETRANS_DATA_ACKED) | 1948 | if (flag & FLAG_RETRANS_DATA_ACKED) |
| 1922 | return; | 1949 | return; |
| 1923 | 1950 | ||
| 1924 | tcp_rtt_estimator(tp, seq_rtt, usrtt); | 1951 | tcp_rtt_estimator(sk, seq_rtt, usrtt); |
| 1925 | tcp_set_rto(tp); | 1952 | tcp_set_rto(sk); |
| 1926 | tp->backoff = 0; | 1953 | inet_csk(sk)->icsk_backoff = 0; |
| 1927 | tcp_bound_rto(tp); | 1954 | tcp_bound_rto(sk); |
| 1928 | } | 1955 | } |
| 1929 | 1956 | ||
| 1930 | static inline void tcp_ack_update_rtt(struct tcp_sock *tp, | 1957 | static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, |
| 1931 | int flag, s32 seq_rtt, u32 *usrtt) | 1958 | const s32 seq_rtt, u32 *usrtt) |
| 1932 | { | 1959 | { |
| 1960 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 1933 | /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ | 1961 | /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ |
| 1934 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) | 1962 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) |
| 1935 | tcp_ack_saw_tstamp(tp, usrtt, flag); | 1963 | tcp_ack_saw_tstamp(sk, usrtt, flag); |
| 1936 | else if (seq_rtt >= 0) | 1964 | else if (seq_rtt >= 0) |
| 1937 | tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag); | 1965 | tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag); |
| 1938 | } | 1966 | } |
| 1939 | 1967 | ||
| 1940 | static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | 1968 | static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, |
| 1941 | u32 in_flight, int good) | 1969 | u32 in_flight, int good) |
| 1942 | { | 1970 | { |
| 1943 | tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good); | 1971 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 1944 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1972 | icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good); |
| 1973 | tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; | ||
| 1945 | } | 1974 | } |
| 1946 | 1975 | ||
| 1947 | /* Restart timer after forward progress on connection. | 1976 | /* Restart timer after forward progress on connection. |
| @@ -1951,9 +1980,9 @@ static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | |||
| 1951 | static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) | 1980 | static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) |
| 1952 | { | 1981 | { |
| 1953 | if (!tp->packets_out) { | 1982 | if (!tp->packets_out) { |
| 1954 | tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS); | 1983 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); |
| 1955 | } else { | 1984 | } else { |
| 1956 | tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); | 1985 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); |
| 1957 | } | 1986 | } |
| 1958 | } | 1987 | } |
| 1959 | 1988 | ||
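The replacement timer calls carry an explicit extra argument: `inet_csk_reset_xmit_timer()` is protocol-neutral, so the TCP-specific ceiling `TCP_RTO_MAX` must now be passed in rather than assumed. A hedged model of that clamp, with invented names and tick values:

```c
#include <stdio.h>

/* Hypothetical model of the generalized timer helper: the upper
 * clamp is an argument, because the helper no longer belongs to
 * TCP alone and cannot assume TCP's maximum RTO. */
static unsigned long pending_timeout;

static void csk_reset_xmit_timer(unsigned long when, unsigned long max_when)
{
        if (when > max_when)
                when = max_when;        /* clamp runaway backoff values */
        pending_timeout = when;
}

int main(void)
{
        unsigned long rto = 3000, rto_max = 120000;

        csk_reset_xmit_timer(rto, rto_max);
        printf("armed for %lu\n", pending_timeout);     /* 3000 */

        csk_reset_xmit_timer(rto << 7, rto_max);        /* 384000 clamps */
        printf("armed for %lu\n", pending_timeout);     /* 120000 */
        return 0;
}
```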
| @@ -2068,9 +2097,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt | |||
| 2068 | seq_rtt = -1; | 2097 | seq_rtt = -1; |
| 2069 | } else if (seq_rtt < 0) | 2098 | } else if (seq_rtt < 0) |
| 2070 | seq_rtt = now - scb->when; | 2099 | seq_rtt = now - scb->when; |
| 2071 | if (seq_usrtt) | 2100 | if (seq_usrtt) { |
| 2072 | *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000 | 2101 | struct timeval tv; |
| 2073 | + (usnow.tv_usec - skb->stamp.tv_usec); | 2102 | |
| 2103 | skb_get_timestamp(skb, &tv); | ||
| 2104 | *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000 | ||
| 2105 | + (usnow.tv_usec - tv.tv_usec); | ||
| 2106 | } | ||
| 2074 | 2107 | ||
| 2075 | if (sacked & TCPCB_SACKED_ACKED) | 2108 | if (sacked & TCPCB_SACKED_ACKED) |
| 2076 | tp->sacked_out -= tcp_skb_pcount(skb); | 2109 | tp->sacked_out -= tcp_skb_pcount(skb); |
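The new `skb_get_timestamp()` accessor above replaces direct pokes at `skb->stamp`, but the microsecond RTT sample itself is still plain timeval subtraction. A standalone sketch of that arithmetic:

```c
#include <stdio.h>
#include <sys/time.h>

/* The usec RTT sample is seconds scaled to microseconds plus the
 * microsecond remainder; the remainder may be negative and is
 * absorbed by the sum. */
static long usec_rtt(const struct timeval *now, const struct timeval *sent)
{
        return (now->tv_sec - sent->tv_sec) * 1000000L
             + (now->tv_usec - sent->tv_usec);
}

int main(void)
{
        struct timeval sent = { .tv_sec = 100, .tv_usec = 900000 };
        struct timeval now  = { .tv_sec = 101, .tv_usec = 100000 };

        printf("rtt = %ld usec\n", usec_rtt(&now, &sent));      /* 200000 */
        return 0;
}
```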
| @@ -2085,16 +2118,17 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt | |||
| 2085 | seq_rtt = now - scb->when; | 2118 | seq_rtt = now - scb->when; |
| 2086 | tcp_dec_pcount_approx(&tp->fackets_out, skb); | 2119 | tcp_dec_pcount_approx(&tp->fackets_out, skb); |
| 2087 | tcp_packets_out_dec(tp, skb); | 2120 | tcp_packets_out_dec(tp, skb); |
| 2088 | __skb_unlink(skb, skb->list); | 2121 | __skb_unlink(skb, &sk->sk_write_queue); |
| 2089 | sk_stream_free_skb(sk, skb); | 2122 | sk_stream_free_skb(sk, skb); |
| 2090 | } | 2123 | } |
| 2091 | 2124 | ||
| 2092 | if (acked&FLAG_ACKED) { | 2125 | if (acked&FLAG_ACKED) { |
| 2093 | tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt); | 2126 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 2127 | tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt); | ||
| 2094 | tcp_ack_packets_out(sk, tp); | 2128 | tcp_ack_packets_out(sk, tp); |
| 2095 | 2129 | ||
| 2096 | if (tp->ca_ops->pkts_acked) | 2130 | if (icsk->icsk_ca_ops->pkts_acked) |
| 2097 | tp->ca_ops->pkts_acked(tp, pkts_acked); | 2131 | icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked); |
| 2098 | } | 2132 | } |
| 2099 | 2133 | ||
| 2100 | #if FASTRETRANS_DEBUG > 0 | 2134 | #if FASTRETRANS_DEBUG > 0 |
| @@ -2102,19 +2136,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt | |||
| 2102 | BUG_TRAP((int)tp->lost_out >= 0); | 2136 | BUG_TRAP((int)tp->lost_out >= 0); |
| 2103 | BUG_TRAP((int)tp->retrans_out >= 0); | 2137 | BUG_TRAP((int)tp->retrans_out >= 0); |
| 2104 | if (!tp->packets_out && tp->rx_opt.sack_ok) { | 2138 | if (!tp->packets_out && tp->rx_opt.sack_ok) { |
| 2139 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 2105 | if (tp->lost_out) { | 2140 | if (tp->lost_out) { |
| 2106 | printk(KERN_DEBUG "Leak l=%u %d\n", | 2141 | printk(KERN_DEBUG "Leak l=%u %d\n", |
| 2107 | tp->lost_out, tp->ca_state); | 2142 | tp->lost_out, icsk->icsk_ca_state); |
| 2108 | tp->lost_out = 0; | 2143 | tp->lost_out = 0; |
| 2109 | } | 2144 | } |
| 2110 | if (tp->sacked_out) { | 2145 | if (tp->sacked_out) { |
| 2111 | printk(KERN_DEBUG "Leak s=%u %d\n", | 2146 | printk(KERN_DEBUG "Leak s=%u %d\n", |
| 2112 | tp->sacked_out, tp->ca_state); | 2147 | tp->sacked_out, icsk->icsk_ca_state); |
| 2113 | tp->sacked_out = 0; | 2148 | tp->sacked_out = 0; |
| 2114 | } | 2149 | } |
| 2115 | if (tp->retrans_out) { | 2150 | if (tp->retrans_out) { |
| 2116 | printk(KERN_DEBUG "Leak r=%u %d\n", | 2151 | printk(KERN_DEBUG "Leak r=%u %d\n", |
| 2117 | tp->retrans_out, tp->ca_state); | 2152 | tp->retrans_out, icsk->icsk_ca_state); |
| 2118 | tp->retrans_out = 0; | 2153 | tp->retrans_out = 0; |
| 2119 | } | 2154 | } |
| 2120 | } | 2155 | } |
| @@ -2125,40 +2160,43 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt | |||
| 2125 | 2160 | ||
| 2126 | static void tcp_ack_probe(struct sock *sk) | 2161 | static void tcp_ack_probe(struct sock *sk) |
| 2127 | { | 2162 | { |
| 2128 | struct tcp_sock *tp = tcp_sk(sk); | 2163 | const struct tcp_sock *tp = tcp_sk(sk); |
| 2164 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 2129 | 2165 | ||
| 2130 | /* Was it a usable window open? */ | 2166 | /* Was it a usable window open? */ |
| 2131 | 2167 | ||
| 2132 | if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq, | 2168 | if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq, |
| 2133 | tp->snd_una + tp->snd_wnd)) { | 2169 | tp->snd_una + tp->snd_wnd)) { |
| 2134 | tp->backoff = 0; | 2170 | icsk->icsk_backoff = 0; |
| 2135 | tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0); | 2171 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); |
| 2136 | /* Socket must be woken up by a subsequent tcp_data_snd_check(). | 2172 | /* Socket must be woken up by a subsequent tcp_data_snd_check(). |
| 2137 | * This function is not for random use! | 2173 | * This function is not for random use! |
| 2138 | */ | 2174 | */ |
| 2139 | } else { | 2175 | } else { |
| 2140 | tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, | 2176 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, |
| 2141 | min(tp->rto << tp->backoff, TCP_RTO_MAX)); | 2177 | min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), |
| 2178 | TCP_RTO_MAX); | ||
| 2142 | } | 2179 | } |
| 2143 | } | 2180 | } |
| 2144 | 2181 | ||
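The probe0 arm above doubles the timeout per unanswered probe via `icsk_rto << icsk_backoff`, clamped at `TCP_RTO_MAX`; a usable window open resets the backoff to zero. A worked model of the schedule, with illustrative tick values:

```c
#include <stdio.h>

/* Worked model of the zero-window probe backoff: the timeout doubles
 * per unanswered probe (rto << backoff) until it hits the clamp.
 * Values are illustrative, in jiffies-like ticks. */
int main(void)
{
        unsigned long rto = 250, rto_max = 120000;

        for (unsigned int backoff = 0; backoff < 12; backoff++) {
                unsigned long when = rto << backoff;

                if (when > rto_max)
                        when = rto_max;
                printf("backoff=%2u -> probe timer %lu\n", backoff, when);
        }
        return 0;
}
```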
| 2145 | static inline int tcp_ack_is_dubious(struct tcp_sock *tp, int flag) | 2182 | static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) |
| 2146 | { | 2183 | { |
| 2147 | return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || | 2184 | return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || |
| 2148 | tp->ca_state != TCP_CA_Open); | 2185 | inet_csk(sk)->icsk_ca_state != TCP_CA_Open); |
| 2149 | } | 2186 | } |
| 2150 | 2187 | ||
| 2151 | static inline int tcp_may_raise_cwnd(struct tcp_sock *tp, int flag) | 2188 | static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) |
| 2152 | { | 2189 | { |
| 2190 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 2153 | return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && | 2191 | return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && |
| 2154 | !((1<<tp->ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR)); | 2192 | !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR)); |
| 2155 | } | 2193 | } |
| 2156 | 2194 | ||
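`tcp_may_raise_cwnd()` tests set membership with a one-hot shift: `(1 << ca_state)` masked against the OR of the disallowed state flags. A compact sketch using the same CA state numbering:

```c
#include <stdio.h>

/* The membership test above: shift the current state into a one-hot
 * bit and mask against the set of disallowed states. The enum order
 * follows the kernel's CA state numbering. */
enum { CA_Open, CA_Disorder, CA_CWR, CA_Recovery, CA_Loss };
#define CAF(s)  (1 << (s))

static int may_raise_cwnd(int ca_state)
{
        return !((1 << ca_state) & (CAF(CA_Recovery) | CAF(CA_CWR)));
}

int main(void)
{
        for (int s = CA_Open; s <= CA_Loss; s++)
                printf("state %d: may raise cwnd = %d\n", s, may_raise_cwnd(s));
        return 0;
}
```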
| 2157 | /* Check that window update is acceptable. | 2195 | /* Check that window update is acceptable. |
| 2158 | * The function assumes that snd_una<=ack<=snd_next. | 2196 | * The function assumes that snd_una<=ack<=snd_next. |
| 2159 | */ | 2197 | */ |
| 2160 | static inline int tcp_may_update_window(struct tcp_sock *tp, u32 ack, | 2198 | static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, |
| 2161 | u32 ack_seq, u32 nwin) | 2199 | const u32 ack_seq, const u32 nwin) |
| 2162 | { | 2200 | { |
| 2163 | return (after(ack, tp->snd_una) || | 2201 | return (after(ack, tp->snd_una) || |
| 2164 | after(ack_seq, tp->snd_wl1) || | 2202 | after(ack_seq, tp->snd_wl1) || |
| @@ -2241,6 +2279,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) | |||
| 2241 | /* This routine deals with incoming acks, but not outgoing ones. */ | 2279 | /* This routine deals with incoming acks, but not outgoing ones. */ |
| 2242 | static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | 2280 | static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) |
| 2243 | { | 2281 | { |
| 2282 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 2244 | struct tcp_sock *tp = tcp_sk(sk); | 2283 | struct tcp_sock *tp = tcp_sk(sk); |
| 2245 | u32 prior_snd_una = tp->snd_una; | 2284 | u32 prior_snd_una = tp->snd_una; |
| 2246 | u32 ack_seq = TCP_SKB_CB(skb)->seq; | 2285 | u32 ack_seq = TCP_SKB_CB(skb)->seq; |
| @@ -2268,7 +2307,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
| 2268 | tp->snd_una = ack; | 2307 | tp->snd_una = ack; |
| 2269 | flag |= FLAG_WIN_UPDATE; | 2308 | flag |= FLAG_WIN_UPDATE; |
| 2270 | 2309 | ||
| 2271 | tcp_ca_event(tp, CA_EVENT_FAST_ACK); | 2310 | tcp_ca_event(sk, CA_EVENT_FAST_ACK); |
| 2272 | 2311 | ||
| 2273 | NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); | 2312 | NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); |
| 2274 | } else { | 2313 | } else { |
| @@ -2285,7 +2324,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
| 2285 | if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) | 2324 | if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) |
| 2286 | flag |= FLAG_ECE; | 2325 | flag |= FLAG_ECE; |
| 2287 | 2326 | ||
| 2288 | tcp_ca_event(tp, CA_EVENT_SLOW_ACK); | 2327 | tcp_ca_event(sk, CA_EVENT_SLOW_ACK); |
| 2289 | } | 2328 | } |
| 2290 | 2329 | ||
| 2291 | /* We passed data and got it acked, remove any soft error | 2330 | /* We passed data and got it acked, remove any soft error |
| @@ -2301,19 +2340,19 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
| 2301 | 2340 | ||
| 2302 | /* See if we can take anything off of the retransmit queue. */ | 2341 | /* See if we can take anything off of the retransmit queue. */ |
| 2303 | flag |= tcp_clean_rtx_queue(sk, &seq_rtt, | 2342 | flag |= tcp_clean_rtx_queue(sk, &seq_rtt, |
| 2304 | tp->ca_ops->rtt_sample ? &seq_usrtt : NULL); | 2343 | icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL); |
| 2305 | 2344 | ||
| 2306 | if (tp->frto_counter) | 2345 | if (tp->frto_counter) |
| 2307 | tcp_process_frto(sk, prior_snd_una); | 2346 | tcp_process_frto(sk, prior_snd_una); |
| 2308 | 2347 | ||
| 2309 | if (tcp_ack_is_dubious(tp, flag)) { | 2348 | if (tcp_ack_is_dubious(sk, flag)) { |
| 2310 | /* Advance CWND, if state allows this. */ | 2349 | /* Advance CWND, if state allows this. */ |
| 2311 | if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag)) | 2350 | if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) |
| 2312 | tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0); | 2351 | tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); |
| 2313 | tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); | 2352 | tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); |
| 2314 | } else { | 2353 | } else { |
| 2315 | if ((flag & FLAG_DATA_ACKED)) | 2354 | if ((flag & FLAG_DATA_ACKED)) |
| 2316 | tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1); | 2355 | tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1); |
| 2317 | } | 2356 | } |
| 2318 | 2357 | ||
| 2319 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) | 2358 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) |
| @@ -2322,7 +2361,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
| 2322 | return 1; | 2361 | return 1; |
| 2323 | 2362 | ||
| 2324 | no_queue: | 2363 | no_queue: |
| 2325 | tp->probes_out = 0; | 2364 | icsk->icsk_probes_out = 0; |
| 2326 | 2365 | ||
| 2327 | /* If this ack opens up a zero window, clear backoff. It was | 2366 | /* If this ack opens up a zero window, clear backoff. It was |
| 2328 | * being used to time the probes, and is probably far higher than | 2367 | * being used to time the probes, and is probably far higher than |
| @@ -2500,8 +2539,9 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) | |||
| 2500 | * up to bandwidth of 18Gigabit/sec. 8) ] | 2539 | * up to bandwidth of 18Gigabit/sec. 8) ] |
| 2501 | */ | 2540 | */ |
| 2502 | 2541 | ||
| 2503 | static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb) | 2542 | static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) |
| 2504 | { | 2543 | { |
| 2544 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2505 | struct tcphdr *th = skb->h.th; | 2545 | struct tcphdr *th = skb->h.th; |
| 2506 | u32 seq = TCP_SKB_CB(skb)->seq; | 2546 | u32 seq = TCP_SKB_CB(skb)->seq; |
| 2507 | u32 ack = TCP_SKB_CB(skb)->ack_seq; | 2547 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
| @@ -2516,14 +2556,15 @@ static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb) | |||
| 2516 | !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) && | 2556 | !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) && |
| 2517 | 2557 | ||
| 2518 | /* 4. ... and sits in replay window. */ | 2558 | /* 4. ... and sits in replay window. */ |
| 2519 | (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (tp->rto*1024)/HZ); | 2559 | (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); |
| 2520 | } | 2560 | } |
| 2521 | 2561 | ||
| 2522 | static inline int tcp_paws_discard(struct tcp_sock *tp, struct sk_buff *skb) | 2562 | static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb) |
| 2523 | { | 2563 | { |
| 2564 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 2524 | return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && | 2565 | return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && |
| 2525 | xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && | 2566 | xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && |
| 2526 | !tcp_disordered_ack(tp, skb)); | 2567 | !tcp_disordered_ack(sk, skb)); |
| 2527 | } | 2568 | } |
| 2528 | 2569 | ||
| 2529 | /* Check segment sequence number for validity. | 2570 | /* Check segment sequence number for validity. |
| @@ -2586,7 +2627,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) | |||
| 2586 | { | 2627 | { |
| 2587 | struct tcp_sock *tp = tcp_sk(sk); | 2628 | struct tcp_sock *tp = tcp_sk(sk); |
| 2588 | 2629 | ||
| 2589 | tcp_schedule_ack(tp); | 2630 | inet_csk_schedule_ack(sk); |
| 2590 | 2631 | ||
| 2591 | sk->sk_shutdown |= RCV_SHUTDOWN; | 2632 | sk->sk_shutdown |= RCV_SHUTDOWN; |
| 2592 | sock_set_flag(sk, SOCK_DONE); | 2633 | sock_set_flag(sk, SOCK_DONE); |
| @@ -2596,7 +2637,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) | |||
| 2596 | case TCP_ESTABLISHED: | 2637 | case TCP_ESTABLISHED: |
| 2597 | /* Move to CLOSE_WAIT */ | 2638 | /* Move to CLOSE_WAIT */ |
| 2598 | tcp_set_state(sk, TCP_CLOSE_WAIT); | 2639 | tcp_set_state(sk, TCP_CLOSE_WAIT); |
| 2599 | tp->ack.pingpong = 1; | 2640 | inet_csk(sk)->icsk_ack.pingpong = 1; |
| 2600 | break; | 2641 | break; |
| 2601 | 2642 | ||
| 2602 | case TCP_CLOSE_WAIT: | 2643 | case TCP_CLOSE_WAIT: |
| @@ -2694,7 +2735,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) | |||
| 2694 | if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && | 2735 | if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && |
| 2695 | before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { | 2736 | before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { |
| 2696 | NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); | 2737 | NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); |
| 2697 | tcp_enter_quickack_mode(tp); | 2738 | tcp_enter_quickack_mode(sk); |
| 2698 | 2739 | ||
| 2699 | if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { | 2740 | if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { |
| 2700 | u32 end_seq = TCP_SKB_CB(skb)->end_seq; | 2741 | u32 end_seq = TCP_SKB_CB(skb)->end_seq; |
| @@ -2853,7 +2894,7 @@ static void tcp_ofo_queue(struct sock *sk) | |||
| 2853 | 2894 | ||
| 2854 | if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { | 2895 | if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { |
| 2855 | SOCK_DEBUG(sk, "ofo packet was already received\n"); | 2896 | SOCK_DEBUG(sk, "ofo packet was already received\n"); |
| 2856 | __skb_unlink(skb, skb->list); | 2897 | __skb_unlink(skb, &tp->out_of_order_queue); |
| 2857 | __kfree_skb(skb); | 2898 | __kfree_skb(skb); |
| 2858 | continue; | 2899 | continue; |
| 2859 | } | 2900 | } |
| @@ -2861,7 +2902,7 @@ static void tcp_ofo_queue(struct sock *sk) | |||
| 2861 | tp->rcv_nxt, TCP_SKB_CB(skb)->seq, | 2902 | tp->rcv_nxt, TCP_SKB_CB(skb)->seq, |
| 2862 | TCP_SKB_CB(skb)->end_seq); | 2903 | TCP_SKB_CB(skb)->end_seq); |
| 2863 | 2904 | ||
| 2864 | __skb_unlink(skb, skb->list); | 2905 | __skb_unlink(skb, &tp->out_of_order_queue); |
| 2865 | __skb_queue_tail(&sk->sk_receive_queue, skb); | 2906 | __skb_queue_tail(&sk->sk_receive_queue, skb); |
| 2866 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 2907 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
| 2867 | if(skb->h.th->fin) | 2908 | if(skb->h.th->fin) |
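The `__skb_unlink()` call sites in this file all gain an explicit queue argument: with the `skb->list` back-pointer going away, the caller must name the owning queue so unlink can fix up the head and its accounting. An invented miniature of the two-argument shape:

```c
#include <stdio.h>

/* Invented miniature queue illustrating the API shift: without a
 * per-buffer back-pointer to its list, unlink needs the queue head
 * passed in so it can repair the head pointer and length count. */
struct buf { struct buf *next, *prev; int id; };
struct buf_head { struct buf *first; unsigned int qlen; };

static void unlink_buf(struct buf *b, struct buf_head *q)
{
        if (b->prev)
                b->prev->next = b->next;
        else
                q->first = b->next;     /* the head is needed exactly here */
        if (b->next)
                b->next->prev = b->prev;
        q->qlen--;
}

int main(void)
{
        struct buf a = { .id = 1 }, c = { .id = 2 };
        struct buf_head q = { .first = &a, .qlen = 2 };

        a.next = &c;
        c.prev = &a;
        unlink_buf(&a, &q);
        printf("head now id=%d, qlen=%u\n", q.first->id, q.qlen);
        return 0;
}
```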
| @@ -2942,7 +2983,7 @@ queue_and_out: | |||
| 2942 | * gap in queue is filled. | 2983 | * gap in queue is filled. |
| 2943 | */ | 2984 | */ |
| 2944 | if (skb_queue_empty(&tp->out_of_order_queue)) | 2985 | if (skb_queue_empty(&tp->out_of_order_queue)) |
| 2945 | tp->ack.pingpong = 0; | 2986 | inet_csk(sk)->icsk_ack.pingpong = 0; |
| 2946 | } | 2987 | } |
| 2947 | 2988 | ||
| 2948 | if (tp->rx_opt.num_sacks) | 2989 | if (tp->rx_opt.num_sacks) |
| @@ -2963,8 +3004,8 @@ queue_and_out: | |||
| 2963 | tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); | 3004 | tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); |
| 2964 | 3005 | ||
| 2965 | out_of_window: | 3006 | out_of_window: |
| 2966 | tcp_enter_quickack_mode(tp); | 3007 | tcp_enter_quickack_mode(sk); |
| 2967 | tcp_schedule_ack(tp); | 3008 | inet_csk_schedule_ack(sk); |
| 2968 | drop: | 3009 | drop: |
| 2969 | __kfree_skb(skb); | 3010 | __kfree_skb(skb); |
| 2970 | return; | 3011 | return; |
| @@ -2974,7 +3015,7 @@ drop: | |||
| 2974 | if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) | 3015 | if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) |
| 2975 | goto out_of_window; | 3016 | goto out_of_window; |
| 2976 | 3017 | ||
| 2977 | tcp_enter_quickack_mode(tp); | 3018 | tcp_enter_quickack_mode(sk); |
| 2978 | 3019 | ||
| 2979 | if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { | 3020 | if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { |
| 2980 | /* Partial packet, seq < rcv_next < end_seq */ | 3021 | /* Partial packet, seq < rcv_next < end_seq */ |
| @@ -3003,7 +3044,7 @@ drop: | |||
| 3003 | 3044 | ||
| 3004 | /* Disable header prediction. */ | 3045 | /* Disable header prediction. */ |
| 3005 | tp->pred_flags = 0; | 3046 | tp->pred_flags = 0; |
| 3006 | tcp_schedule_ack(tp); | 3047 | inet_csk_schedule_ack(sk); |
| 3007 | 3048 | ||
| 3008 | SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", | 3049 | SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", |
| 3009 | tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); | 3050 | tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); |
| @@ -3027,7 +3068,7 @@ drop: | |||
| 3027 | u32 end_seq = TCP_SKB_CB(skb)->end_seq; | 3068 | u32 end_seq = TCP_SKB_CB(skb)->end_seq; |
| 3028 | 3069 | ||
| 3029 | if (seq == TCP_SKB_CB(skb1)->end_seq) { | 3070 | if (seq == TCP_SKB_CB(skb1)->end_seq) { |
| 3030 | __skb_append(skb1, skb); | 3071 | __skb_append(skb1, skb, &tp->out_of_order_queue); |
| 3031 | 3072 | ||
| 3032 | if (!tp->rx_opt.num_sacks || | 3073 | if (!tp->rx_opt.num_sacks || |
| 3033 | tp->selective_acks[0].end_seq != seq) | 3074 | tp->selective_acks[0].end_seq != seq) |
| @@ -3071,7 +3112,7 @@ drop: | |||
| 3071 | tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq); | 3112 | tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq); |
| 3072 | break; | 3113 | break; |
| 3073 | } | 3114 | } |
| 3074 | __skb_unlink(skb1, skb1->list); | 3115 | __skb_unlink(skb1, &tp->out_of_order_queue); |
| 3075 | tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); | 3116 | tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); |
| 3076 | __kfree_skb(skb1); | 3117 | __kfree_skb(skb1); |
| 3077 | } | 3118 | } |
| @@ -3088,8 +3129,9 @@ add_sack: | |||
| 3088 | * simplifies code) | 3129 | * simplifies code) |
| 3089 | */ | 3130 | */ |
| 3090 | static void | 3131 | static void |
| 3091 | tcp_collapse(struct sock *sk, struct sk_buff *head, | 3132 | tcp_collapse(struct sock *sk, struct sk_buff_head *list, |
| 3092 | struct sk_buff *tail, u32 start, u32 end) | 3133 | struct sk_buff *head, struct sk_buff *tail, |
| 3134 | u32 start, u32 end) | ||
| 3093 | { | 3135 | { |
| 3094 | struct sk_buff *skb; | 3136 | struct sk_buff *skb; |
| 3095 | 3137 | ||
| @@ -3099,7 +3141,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head, | |||
| 3099 | /* No new bits? It is possible on ofo queue. */ | 3141 | /* No new bits? It is possible on ofo queue. */ |
| 3100 | if (!before(start, TCP_SKB_CB(skb)->end_seq)) { | 3142 | if (!before(start, TCP_SKB_CB(skb)->end_seq)) { |
| 3101 | struct sk_buff *next = skb->next; | 3143 | struct sk_buff *next = skb->next; |
| 3102 | __skb_unlink(skb, skb->list); | 3144 | __skb_unlink(skb, list); |
| 3103 | __kfree_skb(skb); | 3145 | __kfree_skb(skb); |
| 3104 | NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); | 3146 | NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); |
| 3105 | skb = next; | 3147 | skb = next; |
| @@ -3145,7 +3187,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head, | |||
| 3145 | nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head); | 3187 | nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head); |
| 3146 | memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); | 3188 | memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); |
| 3147 | TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; | 3189 | TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; |
| 3148 | __skb_insert(nskb, skb->prev, skb, skb->list); | 3190 | __skb_insert(nskb, skb->prev, skb, list); |
| 3149 | sk_stream_set_owner_r(nskb, sk); | 3191 | sk_stream_set_owner_r(nskb, sk); |
| 3150 | 3192 | ||
| 3151 | /* Copy data, releasing collapsed skbs. */ | 3193 | /* Copy data, releasing collapsed skbs. */ |
| @@ -3164,7 +3206,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head, | |||
| 3164 | } | 3206 | } |
| 3165 | if (!before(start, TCP_SKB_CB(skb)->end_seq)) { | 3207 | if (!before(start, TCP_SKB_CB(skb)->end_seq)) { |
| 3166 | struct sk_buff *next = skb->next; | 3208 | struct sk_buff *next = skb->next; |
| 3167 | __skb_unlink(skb, skb->list); | 3209 | __skb_unlink(skb, list); |
| 3168 | __kfree_skb(skb); | 3210 | __kfree_skb(skb); |
| 3169 | NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); | 3211 | NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); |
| 3170 | skb = next; | 3212 | skb = next; |
| @@ -3200,7 +3242,8 @@ static void tcp_collapse_ofo_queue(struct sock *sk) | |||
| 3200 | if (skb == (struct sk_buff *)&tp->out_of_order_queue || | 3242 | if (skb == (struct sk_buff *)&tp->out_of_order_queue || |
| 3201 | after(TCP_SKB_CB(skb)->seq, end) || | 3243 | after(TCP_SKB_CB(skb)->seq, end) || |
| 3202 | before(TCP_SKB_CB(skb)->end_seq, start)) { | 3244 | before(TCP_SKB_CB(skb)->end_seq, start)) { |
| 3203 | tcp_collapse(sk, head, skb, start, end); | 3245 | tcp_collapse(sk, &tp->out_of_order_queue, |
| 3246 | head, skb, start, end); | ||
| 3204 | head = skb; | 3247 | head = skb; |
| 3205 | if (skb == (struct sk_buff *)&tp->out_of_order_queue) | 3248 | if (skb == (struct sk_buff *)&tp->out_of_order_queue) |
| 3206 | break; | 3249 | break; |
| @@ -3237,7 +3280,8 @@ static int tcp_prune_queue(struct sock *sk) | |||
| 3237 | tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); | 3280 | tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); |
| 3238 | 3281 | ||
| 3239 | tcp_collapse_ofo_queue(sk); | 3282 | tcp_collapse_ofo_queue(sk); |
| 3240 | tcp_collapse(sk, sk->sk_receive_queue.next, | 3283 | tcp_collapse(sk, &sk->sk_receive_queue, |
| 3284 | sk->sk_receive_queue.next, | ||
| 3241 | (struct sk_buff*)&sk->sk_receive_queue, | 3285 | (struct sk_buff*)&sk->sk_receive_queue, |
| 3242 | tp->copied_seq, tp->rcv_nxt); | 3286 | tp->copied_seq, tp->rcv_nxt); |
| 3243 | sk_stream_mem_reclaim(sk); | 3287 | sk_stream_mem_reclaim(sk); |
| @@ -3286,12 +3330,12 @@ void tcp_cwnd_application_limited(struct sock *sk) | |||
| 3286 | { | 3330 | { |
| 3287 | struct tcp_sock *tp = tcp_sk(sk); | 3331 | struct tcp_sock *tp = tcp_sk(sk); |
| 3288 | 3332 | ||
| 3289 | if (tp->ca_state == TCP_CA_Open && | 3333 | if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && |
| 3290 | sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { | 3334 | sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { |
| 3291 | /* Limited by application or receiver window. */ | 3335 | /* Limited by application or receiver window. */ |
| 3292 | u32 win_used = max(tp->snd_cwnd_used, 2U); | 3336 | u32 win_used = max(tp->snd_cwnd_used, 2U); |
| 3293 | if (win_used < tp->snd_cwnd) { | 3337 | if (win_used < tp->snd_cwnd) { |
| 3294 | tp->snd_ssthresh = tcp_current_ssthresh(tp); | 3338 | tp->snd_ssthresh = tcp_current_ssthresh(sk); |
| 3295 | tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; | 3339 | tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; |
| 3296 | } | 3340 | } |
| 3297 | tp->snd_cwnd_used = 0; | 3341 | tp->snd_cwnd_used = 0; |
| @@ -3370,13 +3414,13 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) | |||
| 3370 | struct tcp_sock *tp = tcp_sk(sk); | 3414 | struct tcp_sock *tp = tcp_sk(sk); |
| 3371 | 3415 | ||
| 3372 | /* More than one full frame received... */ | 3416 | /* More than one full frame received... */ |
| 3373 | if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss | 3417 | if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss |
| 3374 | /* ... and right edge of window advances far enough. | 3418 | /* ... and right edge of window advances far enough. |
| 3375 | * (tcp_recvmsg() will send ACK otherwise). Or... | 3419 | * (tcp_recvmsg() will send ACK otherwise). Or... |
| 3376 | */ | 3420 | */ |
| 3377 | && __tcp_select_window(sk) >= tp->rcv_wnd) || | 3421 | && __tcp_select_window(sk) >= tp->rcv_wnd) || |
| 3378 | /* We ACK each frame or... */ | 3422 | /* We ACK each frame or... */ |
| 3379 | tcp_in_quickack_mode(tp) || | 3423 | tcp_in_quickack_mode(sk) || |
| 3380 | /* We have out of order data. */ | 3424 | /* We have out of order data. */ |
| 3381 | (ofo_possible && | 3425 | (ofo_possible && |
| 3382 | skb_peek(&tp->out_of_order_queue))) { | 3426 | skb_peek(&tp->out_of_order_queue))) { |
| @@ -3390,8 +3434,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) | |||
| 3390 | 3434 | ||
| 3391 | static __inline__ void tcp_ack_snd_check(struct sock *sk) | 3435 | static __inline__ void tcp_ack_snd_check(struct sock *sk) |
| 3392 | { | 3436 | { |
| 3393 | struct tcp_sock *tp = tcp_sk(sk); | 3437 | if (!inet_csk_ack_scheduled(sk)) { |
| 3394 | if (!tcp_ack_scheduled(tp)) { | ||
| 3395 | /* We sent a data segment already. */ | 3438 | /* We sent a data segment already. */ |
| 3396 | return; | 3439 | return; |
| 3397 | } | 3440 | } |
| @@ -3462,7 +3505,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) | |||
| 3462 | struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); | 3505 | struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); |
| 3463 | tp->copied_seq++; | 3506 | tp->copied_seq++; |
| 3464 | if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) { | 3507 | if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) { |
| 3465 | __skb_unlink(skb, skb->list); | 3508 | __skb_unlink(skb, &sk->sk_receive_queue); |
| 3466 | __kfree_skb(skb); | 3509 | __kfree_skb(skb); |
| 3467 | } | 3510 | } |
| 3468 | } | 3511 | } |
| @@ -3645,7 +3688,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
| 3645 | tp->rcv_nxt == tp->rcv_wup) | 3688 | tp->rcv_nxt == tp->rcv_wup) |
| 3646 | tcp_store_ts_recent(tp); | 3689 | tcp_store_ts_recent(tp); |
| 3647 | 3690 | ||
| 3648 | tcp_rcv_rtt_measure_ts(tp, skb); | 3691 | tcp_rcv_rtt_measure_ts(sk, skb); |
| 3649 | 3692 | ||
| 3650 | /* We know that such packets are checksummed | 3693 | /* We know that such packets are checksummed |
| 3651 | * on entry. | 3694 | * on entry. |
| @@ -3678,7 +3721,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
| 3678 | tp->rcv_nxt == tp->rcv_wup) | 3721 | tp->rcv_nxt == tp->rcv_wup) |
| 3679 | tcp_store_ts_recent(tp); | 3722 | tcp_store_ts_recent(tp); |
| 3680 | 3723 | ||
| 3681 | tcp_rcv_rtt_measure_ts(tp, skb); | 3724 | tcp_rcv_rtt_measure_ts(sk, skb); |
| 3682 | 3725 | ||
| 3683 | __skb_pull(skb, tcp_header_len); | 3726 | __skb_pull(skb, tcp_header_len); |
| 3684 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 3727 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
| @@ -3699,7 +3742,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
| 3699 | tp->rcv_nxt == tp->rcv_wup) | 3742 | tp->rcv_nxt == tp->rcv_wup) |
| 3700 | tcp_store_ts_recent(tp); | 3743 | tcp_store_ts_recent(tp); |
| 3701 | 3744 | ||
| 3702 | tcp_rcv_rtt_measure_ts(tp, skb); | 3745 | tcp_rcv_rtt_measure_ts(sk, skb); |
| 3703 | 3746 | ||
| 3704 | if ((int)skb->truesize > sk->sk_forward_alloc) | 3747 | if ((int)skb->truesize > sk->sk_forward_alloc) |
| 3705 | goto step5; | 3748 | goto step5; |
| @@ -3719,7 +3762,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
| 3719 | /* Well, only one small jumplet in fast path... */ | 3762 | /* Well, only one small jumplet in fast path... */ |
| 3720 | tcp_ack(sk, skb, FLAG_DATA); | 3763 | tcp_ack(sk, skb, FLAG_DATA); |
| 3721 | tcp_data_snd_check(sk, tp); | 3764 | tcp_data_snd_check(sk, tp); |
| 3722 | if (!tcp_ack_scheduled(tp)) | 3765 | if (!inet_csk_ack_scheduled(sk)) |
| 3723 | goto no_ack; | 3766 | goto no_ack; |
| 3724 | } | 3767 | } |
| 3725 | 3768 | ||
| @@ -3741,7 +3784,7 @@ slow_path: | |||
| 3741 | * RFC1323: H1. Apply PAWS check first. | 3784 | * RFC1323: H1. Apply PAWS check first. |
| 3742 | */ | 3785 | */ |
| 3743 | if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && | 3786 | if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && |
| 3744 | tcp_paws_discard(tp, skb)) { | 3787 | tcp_paws_discard(sk, skb)) { |
| 3745 | if (!th->rst) { | 3788 | if (!th->rst) { |
| 3746 | NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); | 3789 | NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); |
| 3747 | tcp_send_dupack(sk, skb); | 3790 | tcp_send_dupack(sk, skb); |
| @@ -3788,7 +3831,7 @@ step5: | |||
| 3788 | if(th->ack) | 3831 | if(th->ack) |
| 3789 | tcp_ack(sk, skb, FLAG_SLOWPATH); | 3832 | tcp_ack(sk, skb, FLAG_SLOWPATH); |
| 3790 | 3833 | ||
| 3791 | tcp_rcv_rtt_measure_ts(tp, skb); | 3834 | tcp_rcv_rtt_measure_ts(sk, skb); |
| 3792 | 3835 | ||
| 3793 | /* Process urgent data. */ | 3836 | /* Process urgent data. */ |
| 3794 | tcp_urg(sk, skb, th); | 3837 | tcp_urg(sk, skb, th); |
| @@ -3817,6 +3860,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 3817 | tcp_parse_options(skb, &tp->rx_opt, 0); | 3860 | tcp_parse_options(skb, &tp->rx_opt, 0); |
| 3818 | 3861 | ||
| 3819 | if (th->ack) { | 3862 | if (th->ack) { |
| 3863 | struct inet_connection_sock *icsk; | ||
| 3820 | /* rfc793: | 3864 | /* rfc793: |
| 3821 | * "If the state is SYN-SENT then | 3865 | * "If the state is SYN-SENT then |
| 3822 | * first check the ACK bit | 3866 | * first check the ACK bit |
| @@ -3920,7 +3964,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 3920 | 3964 | ||
| 3921 | tcp_init_metrics(sk); | 3965 | tcp_init_metrics(sk); |
| 3922 | 3966 | ||
| 3923 | tcp_init_congestion_control(tp); | 3967 | tcp_init_congestion_control(sk); |
| 3924 | 3968 | ||
| 3925 | /* Prevent spurious tcp_cwnd_restart() on first data | 3969 | /* Prevent spurious tcp_cwnd_restart() on first data |
| 3926 | * packet. | 3970 | * packet. |
| @@ -3930,7 +3974,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 3930 | tcp_init_buffer_space(sk); | 3974 | tcp_init_buffer_space(sk); |
| 3931 | 3975 | ||
| 3932 | if (sock_flag(sk, SOCK_KEEPOPEN)) | 3976 | if (sock_flag(sk, SOCK_KEEPOPEN)) |
| 3933 | tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); | 3977 | inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); |
| 3934 | 3978 | ||
| 3935 | if (!tp->rx_opt.snd_wscale) | 3979 | if (!tp->rx_opt.snd_wscale) |
| 3936 | __tcp_fast_path_on(tp, tp->snd_wnd); | 3980 | __tcp_fast_path_on(tp, tp->snd_wnd); |
| @@ -3942,7 +3986,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 3942 | sk_wake_async(sk, 0, POLL_OUT); | 3986 | sk_wake_async(sk, 0, POLL_OUT); |
| 3943 | } | 3987 | } |
| 3944 | 3988 | ||
| 3945 | if (sk->sk_write_pending || tp->defer_accept || tp->ack.pingpong) { | 3989 | icsk = inet_csk(sk); |
| 3990 | |||
| 3991 | if (sk->sk_write_pending || | ||
| 3992 | icsk->icsk_accept_queue.rskq_defer_accept || | ||
| 3993 | icsk->icsk_ack.pingpong) { | ||
| 3946 | /* Save one ACK. Data will be ready after | 3994 | /* Save one ACK. Data will be ready after |
| 3947 | * several ticks, if write_pending is set. | 3995 | * several ticks, if write_pending is set. |
| 3948 | * | 3996 | * |
| @@ -3950,12 +3998,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 3950 | * look so _wonderfully_ clever, that I was not able | 3998 | * look so _wonderfully_ clever, that I was not able |
| 3951 | * to stand against the temptation 8) --ANK | 3999 | * to stand against the temptation 8) --ANK |
| 3952 | */ | 4000 | */ |
| 3953 | tcp_schedule_ack(tp); | 4001 | inet_csk_schedule_ack(sk); |
| 3954 | tp->ack.lrcvtime = tcp_time_stamp; | 4002 | icsk->icsk_ack.lrcvtime = tcp_time_stamp; |
| 3955 | tp->ack.ato = TCP_ATO_MIN; | 4003 | icsk->icsk_ack.ato = TCP_ATO_MIN; |
| 3956 | tcp_incr_quickack(tp); | 4004 | tcp_incr_quickack(sk); |
| 3957 | tcp_enter_quickack_mode(tp); | 4005 | tcp_enter_quickack_mode(sk); |
| 3958 | tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); | 4006 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, |
| 4007 | TCP_DELACK_MAX, TCP_RTO_MAX); | ||
| 3959 | 4008 | ||
| 3960 | discard: | 4009 | discard: |
| 3961 | __kfree_skb(skb); | 4010 | __kfree_skb(skb); |
| @@ -4111,7 +4160,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 4111 | } | 4160 | } |
| 4112 | 4161 | ||
| 4113 | if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && | 4162 | if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && |
| 4114 | tcp_paws_discard(tp, skb)) { | 4163 | tcp_paws_discard(sk, skb)) { |
| 4115 | if (!th->rst) { | 4164 | if (!th->rst) { |
| 4116 | NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); | 4165 | NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); |
| 4117 | tcp_send_dupack(sk, skb); | 4166 | tcp_send_dupack(sk, skb); |
| @@ -4180,7 +4229,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 4180 | */ | 4229 | */ |
| 4181 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && | 4230 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
| 4182 | !tp->srtt) | 4231 | !tp->srtt) |
| 4183 | tcp_ack_saw_tstamp(tp, 0, 0); | 4232 | tcp_ack_saw_tstamp(sk, NULL, 0); |
| 4184 | 4233 | ||
| 4185 | if (tp->rx_opt.tstamp_ok) | 4234 | if (tp->rx_opt.tstamp_ok) |
| 4186 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; | 4235 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; |
| @@ -4192,7 +4241,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 4192 | 4241 | ||
| 4193 | tcp_init_metrics(sk); | 4242 | tcp_init_metrics(sk); |
| 4194 | 4243 | ||
| 4195 | tcp_init_congestion_control(tp); | 4244 | tcp_init_congestion_control(sk); |
| 4196 | 4245 | ||
| 4197 | /* Prevent spurious tcp_cwnd_restart() on | 4246 | /* Prevent spurious tcp_cwnd_restart() on |
| 4198 | * first data packet. | 4247 | * first data packet. |
| @@ -4227,9 +4276,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 4227 | return 1; | 4276 | return 1; |
| 4228 | } | 4277 | } |
| 4229 | 4278 | ||
| 4230 | tmo = tcp_fin_time(tp); | 4279 | tmo = tcp_fin_time(sk); |
| 4231 | if (tmo > TCP_TIMEWAIT_LEN) { | 4280 | if (tmo > TCP_TIMEWAIT_LEN) { |
| 4232 | tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); | 4281 | inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); |
| 4233 | } else if (th->fin || sock_owned_by_user(sk)) { | 4282 | } else if (th->fin || sock_owned_by_user(sk)) { |
| 4234 | /* Bad case. We could lose such FIN otherwise. | 4283 | /* Bad case. We could lose such FIN otherwise. |
| 4235 | * It is not a big problem, but it looks confusing | 4284 | * It is not a big problem, but it looks confusing |
| @@ -4237,7 +4286,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 4237 | * if it spins in bh_lock_sock(), but it is really | 4286 | * if it spins in bh_lock_sock(), but it is really |
| 4238 | * a marginal case. | 4287 | * a marginal case. |
| 4239 | */ | 4288 | */ |
| 4240 | tcp_reset_keepalive_timer(sk, tmo); | 4289 | inet_csk_reset_keepalive_timer(sk, tmo); |
| 4241 | } else { | 4290 | } else { |
| 4242 | tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); | 4291 | tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); |
| 4243 | goto discard; | 4292 | goto discard; |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 67c670886c1f..13dfb391cdf1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
| @@ -64,7 +64,9 @@ | |||
| 64 | #include <linux/times.h> | 64 | #include <linux/times.h> |
| 65 | 65 | ||
| 66 | #include <net/icmp.h> | 66 | #include <net/icmp.h> |
| 67 | #include <net/inet_hashtables.h> | ||
| 67 | #include <net/tcp.h> | 68 | #include <net/tcp.h> |
| 69 | #include <net/transp_v6.h> | ||
| 68 | #include <net/ipv6.h> | 70 | #include <net/ipv6.h> |
| 69 | #include <net/inet_common.h> | 71 | #include <net/inet_common.h> |
| 70 | #include <net/xfrm.h> | 72 | #include <net/xfrm.h> |
| @@ -75,7 +77,6 @@ | |||
| 75 | #include <linux/proc_fs.h> | 77 | #include <linux/proc_fs.h> |
| 76 | #include <linux/seq_file.h> | 78 | #include <linux/seq_file.h> |
| 77 | 79 | ||
| 78 | extern int sysctl_ip_dynaddr; | ||
| 79 | int sysctl_tcp_tw_reuse; | 80 | int sysctl_tcp_tw_reuse; |
| 80 | int sysctl_tcp_low_latency; | 81 | int sysctl_tcp_low_latency; |
| 81 | 82 | ||
| @@ -88,463 +89,29 @@ static struct socket *tcp_socket; | |||
| 88 | void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, | 89 | void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, |
| 89 | struct sk_buff *skb); | 90 | struct sk_buff *skb); |
| 90 | 91 | ||
| 91 | struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = { | 92 | struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { |
| 92 | .__tcp_lhash_lock = RW_LOCK_UNLOCKED, | 93 | .lhash_lock = RW_LOCK_UNLOCKED, |
| 93 | .__tcp_lhash_users = ATOMIC_INIT(0), | 94 | .lhash_users = ATOMIC_INIT(0), |
| 94 | .__tcp_lhash_wait | 95 | .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), |
| 95 | = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait), | 96 | .portalloc_lock = SPIN_LOCK_UNLOCKED, |
| 96 | .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED | 97 | .port_rover = 1024 - 1, |
| 97 | }; | 98 | }; |
| 98 | 99 | ||
| 99 | /* | ||
| 100 | * This array holds the first and last local port number. | ||
| 101 | * For high-usage systems, use sysctl to change this to | ||
| 102 | * 32768-61000 | ||
| 103 | */ | ||
| 104 | int sysctl_local_port_range[2] = { 1024, 4999 }; | ||
| 105 | int tcp_port_rover = 1024 - 1; | ||
| 106 | |||
| 107 | static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport, | ||
| 108 | __u32 faddr, __u16 fport) | ||
| 109 | { | ||
| 110 | int h = (laddr ^ lport) ^ (faddr ^ fport); | ||
| 111 | h ^= h >> 16; | ||
| 112 | h ^= h >> 8; | ||
| 113 | return h & (tcp_ehash_size - 1); | ||
| 114 | } | ||
| 115 | |||
| 116 | static __inline__ int tcp_sk_hashfn(struct sock *sk) | ||
| 117 | { | ||
| 118 | struct inet_sock *inet = inet_sk(sk); | ||
| 119 | __u32 laddr = inet->rcv_saddr; | ||
| 120 | __u16 lport = inet->num; | ||
| 121 | __u32 faddr = inet->daddr; | ||
| 122 | __u16 fport = inet->dport; | ||
| 123 | |||
| 124 | return tcp_hashfn(laddr, lport, faddr, fport); | ||
| 125 | } | ||
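The XOR-fold above is deleted from TCP but not from the tree: judging by the call in __tcp_v4_check_established() further down, it survives as inet_ehashfn() with the table size passed in instead of read from a TCP-private global. A standalone rendering:

```c
#include <stdint.h>
#include <stdio.h>

/* The deleted tcp_hashfn() as plain C.  The extra ehash_size parameter
 * matches the new inet_ehashfn() call visible in
 * __tcp_v4_check_established() below. */
static unsigned int inet_ehashfn(uint32_t laddr, uint16_t lport,
				 uint32_t faddr, uint16_t fport,
				 unsigned int ehash_size)
{
	unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);

	h ^= h >> 16;			/* fold in the high halfword */
	h ^= h >> 8;			/* then the next byte */
	return h & (ehash_size - 1);	/* table size is a power of two */
}

int main(void)
{
	/* 10.0.0.1:33000 -> 10.0.0.2:80 into a 512-bucket table */
	printf("%u\n", inet_ehashfn(0x0a000001, 33000, 0x0a000002, 80, 512));
	return 0;
}
```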
| 126 | |||
| 127 | /* Allocate and initialize a new TCP local port bind bucket. | ||
| 128 | * The bindhash mutex for snum's hash chain must be held here. | ||
| 129 | */ | ||
| 130 | struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head, | ||
| 131 | unsigned short snum) | ||
| 132 | { | ||
| 133 | struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep, | ||
| 134 | SLAB_ATOMIC); | ||
| 135 | if (tb) { | ||
| 136 | tb->port = snum; | ||
| 137 | tb->fastreuse = 0; | ||
| 138 | INIT_HLIST_HEAD(&tb->owners); | ||
| 139 | hlist_add_head(&tb->node, &head->chain); | ||
| 140 | } | ||
| 141 | return tb; | ||
| 142 | } | ||
| 143 | |||
| 144 | /* Caller must hold hashbucket lock for this tb with local BH disabled */ | ||
| 145 | void tcp_bucket_destroy(struct tcp_bind_bucket *tb) | ||
| 146 | { | ||
| 147 | if (hlist_empty(&tb->owners)) { | ||
| 148 | __hlist_del(&tb->node); | ||
| 149 | kmem_cache_free(tcp_bucket_cachep, tb); | ||
| 150 | } | ||
| 151 | } | ||
| 152 | |||
| 153 | /* Caller must disable local BH processing. */ | ||
| 154 | static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child) | ||
| 155 | { | ||
| 156 | struct tcp_bind_hashbucket *head = | ||
| 157 | &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)]; | ||
| 158 | struct tcp_bind_bucket *tb; | ||
| 159 | |||
| 160 | spin_lock(&head->lock); | ||
| 161 | tb = tcp_sk(sk)->bind_hash; | ||
| 162 | sk_add_bind_node(child, &tb->owners); | ||
| 163 | tcp_sk(child)->bind_hash = tb; | ||
| 164 | spin_unlock(&head->lock); | ||
| 165 | } | ||
| 166 | |||
| 167 | inline void tcp_inherit_port(struct sock *sk, struct sock *child) | ||
| 168 | { | ||
| 169 | local_bh_disable(); | ||
| 170 | __tcp_inherit_port(sk, child); | ||
| 171 | local_bh_enable(); | ||
| 172 | } | ||
| 173 | |||
| 174 | void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, | ||
| 175 | unsigned short snum) | ||
| 176 | { | ||
| 177 | inet_sk(sk)->num = snum; | ||
| 178 | sk_add_bind_node(sk, &tb->owners); | ||
| 179 | tcp_sk(sk)->bind_hash = tb; | ||
| 180 | } | ||
| 181 | |||
| 182 | static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb) | ||
| 183 | { | ||
| 184 | const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk); | ||
| 185 | struct sock *sk2; | ||
| 186 | struct hlist_node *node; | ||
| 187 | int reuse = sk->sk_reuse; | ||
| 188 | |||
| 189 | sk_for_each_bound(sk2, node, &tb->owners) { | ||
| 190 | if (sk != sk2 && | ||
| 191 | !tcp_v6_ipv6only(sk2) && | ||
| 192 | (!sk->sk_bound_dev_if || | ||
| 193 | !sk2->sk_bound_dev_if || | ||
| 194 | sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { | ||
| 195 | if (!reuse || !sk2->sk_reuse || | ||
| 196 | sk2->sk_state == TCP_LISTEN) { | ||
| 197 | const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2); | ||
| 198 | if (!sk2_rcv_saddr || !sk_rcv_saddr || | ||
| 199 | sk2_rcv_saddr == sk_rcv_saddr) | ||
| 200 | break; | ||
| 201 | } | ||
| 202 | } | ||
| 203 | } | ||
| 204 | return node != NULL; | ||
| 205 | } | ||
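tcp_bind_conflict's rules are easy to lose in the hlist walk, so here they are reduced to a single pair of sockets in plain C (a hedged sketch; the struct fields are stand-ins for the socket members used above):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* The deleted conflict test for one pair of sockets: two binds to the
 * same port collide only if they could see the same traffic (same or
 * unbound device, overlapping address) and reuse is not granted by both
 * sides - and a listener never grants it. */
struct bsock {
	uint32_t rcv_saddr;	/* 0 = INADDR_ANY */
	int bound_dev_if;	/* 0 = any device */
	bool reuse, listening;
};

static bool bind_conflict(const struct bsock *a, const struct bsock *b)
{
	if (a->bound_dev_if && b->bound_dev_if &&
	    a->bound_dev_if != b->bound_dev_if)
		return false;	/* different devices never conflict */
	if (a->reuse && b->reuse && !b->listening)
		return false;	/* both asked for SO_REUSEADDR */
	return !a->rcv_saddr || !b->rcv_saddr ||
	       a->rcv_saddr == b->rcv_saddr;
}

int main(void)
{
	struct bsock any  = { 0,          0, false, true  };
	struct bsock addr = { 0x0a000001, 0, false, false };

	printf("%d\n", bind_conflict(&addr, &any));	/* 1: wildcard overlaps */
	return 0;
}
```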
| 206 | |||
| 207 | /* Obtain a reference to a local port for the given sock, | ||
| 208 | * if snum is zero it means select any available local port. | ||
| 209 | */ | ||
| 210 | static int tcp_v4_get_port(struct sock *sk, unsigned short snum) | 100 | static int tcp_v4_get_port(struct sock *sk, unsigned short snum) |
| 211 | { | 101 | { |
| 212 | struct tcp_bind_hashbucket *head; | 102 | return inet_csk_get_port(&tcp_hashinfo, sk, snum); |
| 213 | struct hlist_node *node; | ||
| 214 | struct tcp_bind_bucket *tb; | ||
| 215 | int ret; | ||
| 216 | |||
| 217 | local_bh_disable(); | ||
| 218 | if (!snum) { | ||
| 219 | int low = sysctl_local_port_range[0]; | ||
| 220 | int high = sysctl_local_port_range[1]; | ||
| 221 | int remaining = (high - low) + 1; | ||
| 222 | int rover; | ||
| 223 | |||
| 224 | spin_lock(&tcp_portalloc_lock); | ||
| 225 | if (tcp_port_rover < low) | ||
| 226 | rover = low; | ||
| 227 | else | ||
| 228 | rover = tcp_port_rover; | ||
| 229 | do { | ||
| 230 | rover++; | ||
| 231 | if (rover > high) | ||
| 232 | rover = low; | ||
| 233 | head = &tcp_bhash[tcp_bhashfn(rover)]; | ||
| 234 | spin_lock(&head->lock); | ||
| 235 | tb_for_each(tb, node, &head->chain) | ||
| 236 | if (tb->port == rover) | ||
| 237 | goto next; | ||
| 238 | break; | ||
| 239 | next: | ||
| 240 | spin_unlock(&head->lock); | ||
| 241 | } while (--remaining > 0); | ||
| 242 | tcp_port_rover = rover; | ||
| 243 | spin_unlock(&tcp_portalloc_lock); | ||
| 244 | |||
| 245 | /* Exhausted local port range during search? It is not | ||
| 246 | * possible for us to be holding one of the bind hash | ||
| 247 | * locks if this test triggers, because if 'remaining' | ||
| 248 | * drops to zero, we broke out of the do/while loop at | ||
| 249 | * the top level, not from the 'break;' statement. | ||
| 250 | */ | ||
| 251 | ret = 1; | ||
| 252 | if (unlikely(remaining <= 0)) | ||
| 253 | goto fail; | ||
| 254 | |||
| 255 | /* OK, here is the one we will use. HEAD is | ||
| 256 | * non-NULL and we hold it's mutex. | ||
| 257 | */ | ||
| 258 | snum = rover; | ||
| 259 | } else { | ||
| 260 | head = &tcp_bhash[tcp_bhashfn(snum)]; | ||
| 261 | spin_lock(&head->lock); | ||
| 262 | tb_for_each(tb, node, &head->chain) | ||
| 263 | if (tb->port == snum) | ||
| 264 | goto tb_found; | ||
| 265 | } | ||
| 266 | tb = NULL; | ||
| 267 | goto tb_not_found; | ||
| 268 | tb_found: | ||
| 269 | if (!hlist_empty(&tb->owners)) { | ||
| 270 | if (sk->sk_reuse > 1) | ||
| 271 | goto success; | ||
| 272 | if (tb->fastreuse > 0 && | ||
| 273 | sk->sk_reuse && sk->sk_state != TCP_LISTEN) { | ||
| 274 | goto success; | ||
| 275 | } else { | ||
| 276 | ret = 1; | ||
| 277 | if (tcp_bind_conflict(sk, tb)) | ||
| 278 | goto fail_unlock; | ||
| 279 | } | ||
| 280 | } | ||
| 281 | tb_not_found: | ||
| 282 | ret = 1; | ||
| 283 | if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL) | ||
| 284 | goto fail_unlock; | ||
| 285 | if (hlist_empty(&tb->owners)) { | ||
| 286 | if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) | ||
| 287 | tb->fastreuse = 1; | ||
| 288 | else | ||
| 289 | tb->fastreuse = 0; | ||
| 290 | } else if (tb->fastreuse && | ||
| 291 | (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) | ||
| 292 | tb->fastreuse = 0; | ||
| 293 | success: | ||
| 294 | if (!tcp_sk(sk)->bind_hash) | ||
| 295 | tcp_bind_hash(sk, tb, snum); | ||
| 296 | BUG_TRAP(tcp_sk(sk)->bind_hash == tb); | ||
| 297 | ret = 0; | ||
| 298 | |||
| 299 | fail_unlock: | ||
| 300 | spin_unlock(&head->lock); | ||
| 301 | fail: | ||
| 302 | local_bh_enable(); | ||
| 303 | return ret; | ||
| 304 | } | ||
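The ephemeral-port search that inet_csk_get_port() now performs on TCP's behalf is the rover walk deleted above. A toy model, with is_bound() standing in for the bind-hash probe taken under the per-bucket lock:

```c
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for finding a tb with tb->port == port in the bind hash. */
static bool is_bound(int port)
{
	return port < 1030;		/* pretend 1024..1029 are taken */
}

/* Start one past the last port handed out, scan the range once,
 * wrapping at the top. */
static int pick_port(int low, int high, int *rover)
{
	int remaining = high - low + 1;
	int port = (*rover < low) ? low : *rover;

	do {
		if (++port > high)
			port = low;
		if (!is_bound(port)) {
			*rover = port;	/* remember where we stopped */
			return port;
		}
	} while (--remaining > 0);
	return -1;			/* local port range exhausted */
}

int main(void)
{
	int rover = 1024 - 1;
	printf("%d\n", pick_port(1024, 4999, &rover));	/* 1030 */
	return 0;
}
```

Persisting the rover (now tcp_hashinfo.port_rover, per the new initializer above) spreads successive allocations across the range instead of rescanning from `low` every time.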
| 305 | |||
| 306 | /* Get rid of any references to a local port held by the | ||
| 307 | * given sock. | ||
| 308 | */ | ||
| 309 | static void __tcp_put_port(struct sock *sk) | ||
| 310 | { | ||
| 311 | struct inet_sock *inet = inet_sk(sk); | ||
| 312 | struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)]; | ||
| 313 | struct tcp_bind_bucket *tb; | ||
| 314 | |||
| 315 | spin_lock(&head->lock); | ||
| 316 | tb = tcp_sk(sk)->bind_hash; | ||
| 317 | __sk_del_bind_node(sk); | ||
| 318 | tcp_sk(sk)->bind_hash = NULL; | ||
| 319 | inet->num = 0; | ||
| 320 | tcp_bucket_destroy(tb); | ||
| 321 | spin_unlock(&head->lock); | ||
| 322 | } | ||
| 323 | |||
| 324 | void tcp_put_port(struct sock *sk) | ||
| 325 | { | ||
| 326 | local_bh_disable(); | ||
| 327 | __tcp_put_port(sk); | ||
| 328 | local_bh_enable(); | ||
| 329 | } | ||
| 330 | |||
| 331 | /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. | ||
| 332 | * Look, when several writers sleep and reader wakes them up, all but one | ||
| 333 | * immediately hit write lock and grab all the cpus. Exclusive sleep solves | ||
| 334 | * this, _but_ remember, it adds useless work on UP machines (wake up each | ||
| 335 | * exclusive lock release). It should be ifdefed really. | ||
| 336 | */ | ||
| 337 | |||
| 338 | void tcp_listen_wlock(void) | ||
| 339 | { | ||
| 340 | write_lock(&tcp_lhash_lock); | ||
| 341 | |||
| 342 | if (atomic_read(&tcp_lhash_users)) { | ||
| 343 | DEFINE_WAIT(wait); | ||
| 344 | |||
| 345 | for (;;) { | ||
| 346 | prepare_to_wait_exclusive(&tcp_lhash_wait, | ||
| 347 | &wait, TASK_UNINTERRUPTIBLE); | ||
| 348 | if (!atomic_read(&tcp_lhash_users)) | ||
| 349 | break; | ||
| 350 | write_unlock_bh(&tcp_lhash_lock); | ||
| 351 | schedule(); | ||
| 352 | write_lock_bh(&tcp_lhash_lock); | ||
| 353 | } | ||
| 354 | |||
| 355 | finish_wait(&tcp_lhash_wait, &wait); | ||
| 356 | } | ||
| 357 | } | ||
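tcp_listen_wlock() is writer-preference locking by hand: take the write lock, then wait for lockless readers (counted in lhash_users) to drain, using an exclusive wait so only one writer wakes per release. A userspace analogue under stated assumptions — a pthread condvar stands in for the kernel wait queue, and it hides the drop-and-retake of the lock that the kernel code does around schedule():

```c
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  drained = PTHREAD_COND_INITIALIZER;
static int users;	/* readers currently walking the listening hash */

static void listen_wlock(void)
{
	pthread_mutex_lock(&lock);
	while (users > 0)		/* sleep until the last user leaves */
		pthread_cond_wait(&drained, &lock);
}

static void listen_wunlock(void)
{
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	listen_wlock();			/* users == 0, acquires immediately */
	puts("write side entered");
	listen_wunlock();
	return 0;
}
```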
| 358 | |||
| 359 | static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible) | ||
| 360 | { | ||
| 361 | struct hlist_head *list; | ||
| 362 | rwlock_t *lock; | ||
| 363 | |||
| 364 | BUG_TRAP(sk_unhashed(sk)); | ||
| 365 | if (listen_possible && sk->sk_state == TCP_LISTEN) { | ||
| 366 | list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; | ||
| 367 | lock = &tcp_lhash_lock; | ||
| 368 | tcp_listen_wlock(); | ||
| 369 | } else { | ||
| 370 | list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain; | ||
| 371 | lock = &tcp_ehash[sk->sk_hashent].lock; | ||
| 372 | write_lock(lock); | ||
| 373 | } | ||
| 374 | __sk_add_node(sk, list); | ||
| 375 | sock_prot_inc_use(sk->sk_prot); | ||
| 376 | write_unlock(lock); | ||
| 377 | if (listen_possible && sk->sk_state == TCP_LISTEN) | ||
| 378 | wake_up(&tcp_lhash_wait); | ||
| 379 | } | 103 | } |
| 380 | 104 | ||
| 381 | static void tcp_v4_hash(struct sock *sk) | 105 | static void tcp_v4_hash(struct sock *sk) |
| 382 | { | 106 | { |
| 383 | if (sk->sk_state != TCP_CLOSE) { | 107 | inet_hash(&tcp_hashinfo, sk); |
| 384 | local_bh_disable(); | ||
| 385 | __tcp_v4_hash(sk, 1); | ||
| 386 | local_bh_enable(); | ||
| 387 | } | ||
| 388 | } | 108 | } |
| 389 | 109 | ||
| 390 | void tcp_unhash(struct sock *sk) | 110 | void tcp_unhash(struct sock *sk) |
| 391 | { | 111 | { |
| 392 | rwlock_t *lock; | 112 | inet_unhash(&tcp_hashinfo, sk); |
| 393 | |||
| 394 | if (sk_unhashed(sk)) | ||
| 395 | goto ende; | ||
| 396 | |||
| 397 | if (sk->sk_state == TCP_LISTEN) { | ||
| 398 | local_bh_disable(); | ||
| 399 | tcp_listen_wlock(); | ||
| 400 | lock = &tcp_lhash_lock; | ||
| 401 | } else { | ||
| 402 | struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent]; | ||
| 403 | lock = &head->lock; | ||
| 404 | write_lock_bh(&head->lock); | ||
| 405 | } | ||
| 406 | |||
| 407 | if (__sk_del_node_init(sk)) | ||
| 408 | sock_prot_dec_use(sk->sk_prot); | ||
| 409 | write_unlock_bh(lock); | ||
| 410 | |||
| 411 | ende: | ||
| 412 | if (sk->sk_state == TCP_LISTEN) | ||
| 413 | wake_up(&tcp_lhash_wait); | ||
| 414 | } | ||
| 415 | |||
| 416 | /* Don't inline this cruft. Here are some nice properties to | ||
| 417 | * exploit here. The BSD API does not allow a listening TCP | ||
| 418 | * to specify the remote port nor the remote address for the | ||
| 419 | * connection. So always assume those are both wildcarded | ||
| 420 | * during the search since they can never be otherwise. | ||
| 421 | */ | ||
| 422 | static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr, | ||
| 423 | unsigned short hnum, int dif) | ||
| 424 | { | ||
| 425 | struct sock *result = NULL, *sk; | ||
| 426 | struct hlist_node *node; | ||
| 427 | int score, hiscore; | ||
| 428 | |||
| 429 | hiscore=-1; | ||
| 430 | sk_for_each(sk, node, head) { | ||
| 431 | struct inet_sock *inet = inet_sk(sk); | ||
| 432 | |||
| 433 | if (inet->num == hnum && !ipv6_only_sock(sk)) { | ||
| 434 | __u32 rcv_saddr = inet->rcv_saddr; | ||
| 435 | |||
| 436 | score = (sk->sk_family == PF_INET ? 1 : 0); | ||
| 437 | if (rcv_saddr) { | ||
| 438 | if (rcv_saddr != daddr) | ||
| 439 | continue; | ||
| 440 | score+=2; | ||
| 441 | } | ||
| 442 | if (sk->sk_bound_dev_if) { | ||
| 443 | if (sk->sk_bound_dev_if != dif) | ||
| 444 | continue; | ||
| 445 | score+=2; | ||
| 446 | } | ||
| 447 | if (score == 5) | ||
| 448 | return sk; | ||
| 449 | if (score > hiscore) { | ||
| 450 | hiscore = score; | ||
| 451 | result = sk; | ||
| 452 | } | ||
| 453 | } | ||
| 454 | } | ||
| 455 | return result; | ||
| 456 | } | ||
| 457 | |||
| 458 | /* Optimize the common listener case. */ | ||
| 459 | static inline struct sock *tcp_v4_lookup_listener(u32 daddr, | ||
| 460 | unsigned short hnum, int dif) | ||
| 461 | { | ||
| 462 | struct sock *sk = NULL; | ||
| 463 | struct hlist_head *head; | ||
| 464 | |||
| 465 | read_lock(&tcp_lhash_lock); | ||
| 466 | head = &tcp_listening_hash[tcp_lhashfn(hnum)]; | ||
| 467 | if (!hlist_empty(head)) { | ||
| 468 | struct inet_sock *inet = inet_sk((sk = __sk_head(head))); | ||
| 469 | |||
| 470 | if (inet->num == hnum && !sk->sk_node.next && | ||
| 471 | (!inet->rcv_saddr || inet->rcv_saddr == daddr) && | ||
| 472 | (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && | ||
| 473 | !sk->sk_bound_dev_if) | ||
| 474 | goto sherry_cache; | ||
| 475 | sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif); | ||
| 476 | } | ||
| 477 | if (sk) { | ||
| 478 | sherry_cache: | ||
| 479 | sock_hold(sk); | ||
| 480 | } | ||
| 481 | read_unlock(&tcp_lhash_lock); | ||
| 482 | return sk; | ||
| 483 | } | 113 | } |
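The deleted listener lookup is a small scoring match: an exact bound address and an exact bound device are each worth 2, plain AF_INET is worth 1, and a perfect 5 short-circuits the walk. The same walk presumably moves into inet_lookup_listener() in the new inet_hashtables code; a runnable rendering:

```c
#include <stdint.h>
#include <stdio.h>

/* Exact matches outrank wildcards, so "0.0.0.0:80" loses to
 * "10.0.0.1:80" for traffic addressed to 10.0.0.1. */
struct listener {
	uint32_t rcv_saddr;	/* 0 means wildcard       */
	int      bound_dev_if;	/* 0 means any device     */
	int      is_inet;	/* 1 for a pure IPv4 sock */
};

static const struct listener *best_listener(const struct listener *tbl,
					    int n, uint32_t daddr, int dif)
{
	const struct listener *result = NULL;
	int hiscore = -1;

	for (int i = 0; i < n; i++) {
		const struct listener *l = &tbl[i];
		int score = l->is_inet ? 1 : 0;

		if (l->rcv_saddr) {
			if (l->rcv_saddr != daddr)
				continue;
			score += 2;
		}
		if (l->bound_dev_if) {
			if (l->bound_dev_if != dif)
				continue;
			score += 2;
		}
		if (score == 5)
			return l;	/* cannot do better */
		if (score > hiscore) {
			hiscore = score;
			result = l;
		}
	}
	return result;
}

int main(void)
{
	struct listener tbl[] = {
		{ 0,          0, 1 },	/* wildcard listener  */
		{ 0x0a000001, 0, 1 },	/* bound to 10.0.0.1  */
	};
	printf("%d\n", (int)(best_listener(tbl, 2, 0x0a000001, 3) - tbl)); /* 1 */
	return 0;
}
```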
| 484 | 114 | ||
| 485 | /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so | ||
| 486 | * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM | ||
| 487 | * | ||
| 488 | * Local BH must be disabled here. | ||
| 489 | */ | ||
| 490 | |||
| 491 | static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport, | ||
| 492 | u32 daddr, u16 hnum, | ||
| 493 | int dif) | ||
| 494 | { | ||
| 495 | struct tcp_ehash_bucket *head; | ||
| 496 | TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) | ||
| 497 | __u32 ports = TCP_COMBINED_PORTS(sport, hnum); | ||
| 498 | struct sock *sk; | ||
| 499 | struct hlist_node *node; | ||
| 500 | /* Optimize here for direct hit, only listening connections can | ||
| 501 | * have wildcards anyways. | ||
| 502 | */ | ||
| 503 | int hash = tcp_hashfn(daddr, hnum, saddr, sport); | ||
| 504 | head = &tcp_ehash[hash]; | ||
| 505 | read_lock(&head->lock); | ||
| 506 | sk_for_each(sk, node, &head->chain) { | ||
| 507 | if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) | ||
| 508 | goto hit; /* You sunk my battleship! */ | ||
| 509 | } | ||
| 510 | |||
| 511 | /* Must check for a TIME_WAIT'er before going to listener hash. */ | ||
| 512 | sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) { | ||
| 513 | if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif)) | ||
| 514 | goto hit; | ||
| 515 | } | ||
| 516 | sk = NULL; | ||
| 517 | out: | ||
| 518 | read_unlock(&head->lock); | ||
| 519 | return sk; | ||
| 520 | hit: | ||
| 521 | sock_hold(sk); | ||
| 522 | goto out; | ||
| 523 | } | ||
| 524 | |||
| 525 | static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, | ||
| 526 | u32 daddr, u16 hnum, int dif) | ||
| 527 | { | ||
| 528 | struct sock *sk = __tcp_v4_lookup_established(saddr, sport, | ||
| 529 | daddr, hnum, dif); | ||
| 530 | |||
| 531 | return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif); | ||
| 532 | } | ||
| 533 | |||
| 534 | inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, | ||
| 535 | u16 dport, int dif) | ||
| 536 | { | ||
| 537 | struct sock *sk; | ||
| 538 | |||
| 539 | local_bh_disable(); | ||
| 540 | sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif); | ||
| 541 | local_bh_enable(); | ||
| 542 | |||
| 543 | return sk; | ||
| 544 | } | ||
| 545 | |||
| 546 | EXPORT_SYMBOL_GPL(tcp_v4_lookup); | ||
| 547 | |||
| 548 | static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) | 115 | static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) |
| 549 | { | 116 | { |
| 550 | return secure_tcp_sequence_number(skb->nh.iph->daddr, | 117 | return secure_tcp_sequence_number(skb->nh.iph->daddr, |
| @@ -555,27 +122,28 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) | |||
| 555 | 122 | ||
| 556 | /* called with local bh disabled */ | 123 | /* called with local bh disabled */ |
| 557 | static int __tcp_v4_check_established(struct sock *sk, __u16 lport, | 124 | static int __tcp_v4_check_established(struct sock *sk, __u16 lport, |
| 558 | struct tcp_tw_bucket **twp) | 125 | struct inet_timewait_sock **twp) |
| 559 | { | 126 | { |
| 560 | struct inet_sock *inet = inet_sk(sk); | 127 | struct inet_sock *inet = inet_sk(sk); |
| 561 | u32 daddr = inet->rcv_saddr; | 128 | u32 daddr = inet->rcv_saddr; |
| 562 | u32 saddr = inet->daddr; | 129 | u32 saddr = inet->daddr; |
| 563 | int dif = sk->sk_bound_dev_if; | 130 | int dif = sk->sk_bound_dev_if; |
| 564 | TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) | 131 | INET_ADDR_COOKIE(acookie, saddr, daddr) |
| 565 | __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); | 132 | const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); |
| 566 | int hash = tcp_hashfn(daddr, lport, saddr, inet->dport); | 133 | const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size); |
| 567 | struct tcp_ehash_bucket *head = &tcp_ehash[hash]; | 134 | struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash]; |
| 568 | struct sock *sk2; | 135 | struct sock *sk2; |
| 569 | struct hlist_node *node; | 136 | const struct hlist_node *node; |
| 570 | struct tcp_tw_bucket *tw; | 137 | struct inet_timewait_sock *tw; |
| 571 | 138 | ||
| 572 | write_lock(&head->lock); | 139 | write_lock(&head->lock); |
| 573 | 140 | ||
| 574 | /* Check TIME-WAIT sockets first. */ | 141 | /* Check TIME-WAIT sockets first. */ |
| 575 | sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) { | 142 | sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { |
| 576 | tw = (struct tcp_tw_bucket *)sk2; | 143 | tw = inet_twsk(sk2); |
| 577 | 144 | ||
| 578 | if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { | 145 | if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { |
| 146 | const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); | ||
| 579 | struct tcp_sock *tp = tcp_sk(sk); | 147 | struct tcp_sock *tp = tcp_sk(sk); |
| 580 | 148 | ||
| 581 | /* With PAWS, it is safe from the viewpoint | 149 | /* With PAWS, it is safe from the viewpoint |
| @@ -592,15 +160,15 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, | |||
| 592 | fall back to VJ's scheme and use initial | 160 | fall back to VJ's scheme and use initial |
| 593 | timestamp retrieved from peer table. | 161 | timestamp retrieved from peer table. |
| 594 | */ | 162 | */ |
| 595 | if (tw->tw_ts_recent_stamp && | 163 | if (tcptw->tw_ts_recent_stamp && |
| 596 | (!twp || (sysctl_tcp_tw_reuse && | 164 | (!twp || (sysctl_tcp_tw_reuse && |
| 597 | xtime.tv_sec - | 165 | xtime.tv_sec - |
| 598 | tw->tw_ts_recent_stamp > 1))) { | 166 | tcptw->tw_ts_recent_stamp > 1))) { |
| 599 | if ((tp->write_seq = | 167 | tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; |
| 600 | tw->tw_snd_nxt + 65535 + 2) == 0) | 168 | if (tp->write_seq == 0) |
| 601 | tp->write_seq = 1; | 169 | tp->write_seq = 1; |
| 602 | tp->rx_opt.ts_recent = tw->tw_ts_recent; | 170 | tp->rx_opt.ts_recent = tcptw->tw_ts_recent; |
| 603 | tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; | 171 | tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; |
| 604 | sock_hold(sk2); | 172 | sock_hold(sk2); |
| 605 | goto unique; | 173 | goto unique; |
| 606 | } else | 174 | } else |
| @@ -611,7 +179,7 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, | |||
| 611 | 179 | ||
| 612 | /* And established part... */ | 180 | /* And established part... */ |
| 613 | sk_for_each(sk2, node, &head->chain) { | 181 | sk_for_each(sk2, node, &head->chain) { |
| 614 | if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) | 182 | if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif)) |
| 615 | goto not_unique; | 183 | goto not_unique; |
| 616 | } | 184 | } |
| 617 | 185 | ||
| @@ -631,10 +199,10 @@ unique: | |||
| 631 | NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); | 199 | NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); |
| 632 | } else if (tw) { | 200 | } else if (tw) { |
| 633 | /* Silly. Should hash-dance instead... */ | 201 | /* Silly. Should hash-dance instead... */ |
| 634 | tcp_tw_deschedule(tw); | 202 | inet_twsk_deschedule(tw, &tcp_death_row); |
| 635 | NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); | 203 | NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); |
| 636 | 204 | ||
| 637 | tcp_tw_put(tw); | 205 | inet_twsk_put(tw); |
| 638 | } | 206 | } |
| 639 | 207 | ||
| 640 | return 0; | 208 | return 0; |
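This hunk keeps the TIME-WAIT recycling rule intact while renaming its home from tcp_tw_bucket to tcp_timewait_sock. The two pieces of the rule, isolated below as a sketch (names follow the patched fields; xtime.tv_sec becomes the now_sec parameter):

```c
#include <stdint.h>
#include <stdio.h>

/* A TIME-WAIT bucket may be recycled for a fresh connection if it
 * recorded a timestamp, and either the caller only wants a uniqueness
 * check (no twp) or tcp_tw_reuse is on and at least one second has
 * passed - PAWS then rejects old duplicates on the new connection. */
static int tw_reusable(long tw_ts_recent_stamp, long now_sec,
		       int have_twp, int sysctl_tcp_tw_reuse)
{
	return tw_ts_recent_stamp &&
	       (!have_twp ||
		(sysctl_tcp_tw_reuse && now_sec - tw_ts_recent_stamp > 1));
}

/* The new connection continues the old sequence space with headroom so
 * the peer never sees the new SYN fall inside the old window. */
static uint32_t continue_write_seq(uint32_t tw_snd_nxt)
{
	uint32_t seq = tw_snd_nxt + 65535 + 2;

	return seq ? seq : 1;	/* 0 is reserved, bump to 1 */
}

int main(void)
{
	printf("%d %u\n", tw_reusable(1000, 1002, 1, 1),
	       continue_write_seq(4294901759u));	/* 1 1 */
	return 0;
}
```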
| @@ -657,9 +225,9 @@ static inline u32 connect_port_offset(const struct sock *sk) | |||
| 657 | */ | 225 | */ |
| 658 | static inline int tcp_v4_hash_connect(struct sock *sk) | 226 | static inline int tcp_v4_hash_connect(struct sock *sk) |
| 659 | { | 227 | { |
| 660 | unsigned short snum = inet_sk(sk)->num; | 228 | const unsigned short snum = inet_sk(sk)->num; |
| 661 | struct tcp_bind_hashbucket *head; | 229 | struct inet_bind_hashbucket *head; |
| 662 | struct tcp_bind_bucket *tb; | 230 | struct inet_bind_bucket *tb; |
| 663 | int ret; | 231 | int ret; |
| 664 | 232 | ||
| 665 | if (!snum) { | 233 | if (!snum) { |
| @@ -671,19 +239,19 @@ static inline int tcp_v4_hash_connect(struct sock *sk) | |||
| 671 | static u32 hint; | 239 | static u32 hint; |
| 672 | u32 offset = hint + connect_port_offset(sk); | 240 | u32 offset = hint + connect_port_offset(sk); |
| 673 | struct hlist_node *node; | 241 | struct hlist_node *node; |
| 674 | struct tcp_tw_bucket *tw = NULL; | 242 | struct inet_timewait_sock *tw = NULL; |
| 675 | 243 | ||
| 676 | local_bh_disable(); | 244 | local_bh_disable(); |
| 677 | for (i = 1; i <= range; i++) { | 245 | for (i = 1; i <= range; i++) { |
| 678 | port = low + (i + offset) % range; | 246 | port = low + (i + offset) % range; |
| 679 | head = &tcp_bhash[tcp_bhashfn(port)]; | 247 | head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; |
| 680 | spin_lock(&head->lock); | 248 | spin_lock(&head->lock); |
| 681 | 249 | ||
| 682 | /* Does not bother with rcv_saddr checks, | 250 | /* Does not bother with rcv_saddr checks, |
| 683 | * because the established check is already | 251 | * because the established check is already |
| 684 | * unique enough. | 252 | * unique enough. |
| 685 | */ | 253 | */ |
| 686 | tb_for_each(tb, node, &head->chain) { | 254 | inet_bind_bucket_for_each(tb, node, &head->chain) { |
| 687 | if (tb->port == port) { | 255 | if (tb->port == port) { |
| 688 | BUG_TRAP(!hlist_empty(&tb->owners)); | 256 | BUG_TRAP(!hlist_empty(&tb->owners)); |
| 689 | if (tb->fastreuse >= 0) | 257 | if (tb->fastreuse >= 0) |
| @@ -696,7 +264,7 @@ static inline int tcp_v4_hash_connect(struct sock *sk) | |||
| 696 | } | 264 | } |
| 697 | } | 265 | } |
| 698 | 266 | ||
| 699 | tb = tcp_bucket_create(head, port); | 267 | tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); |
| 700 | if (!tb) { | 268 | if (!tb) { |
| 701 | spin_unlock(&head->lock); | 269 | spin_unlock(&head->lock); |
| 702 | break; | 270 | break; |
| @@ -715,27 +283,27 @@ ok: | |||
| 715 | hint += i; | 283 | hint += i; |
| 716 | 284 | ||
| 717 | /* Head lock still held and bh's disabled */ | 285 | /* Head lock still held and bh's disabled */ |
| 718 | tcp_bind_hash(sk, tb, port); | 286 | inet_bind_hash(sk, tb, port); |
| 719 | if (sk_unhashed(sk)) { | 287 | if (sk_unhashed(sk)) { |
| 720 | inet_sk(sk)->sport = htons(port); | 288 | inet_sk(sk)->sport = htons(port); |
| 721 | __tcp_v4_hash(sk, 0); | 289 | __inet_hash(&tcp_hashinfo, sk, 0); |
| 722 | } | 290 | } |
| 723 | spin_unlock(&head->lock); | 291 | spin_unlock(&head->lock); |
| 724 | 292 | ||
| 725 | if (tw) { | 293 | if (tw) { |
| 726 | tcp_tw_deschedule(tw); | 294 | inet_twsk_deschedule(tw, &tcp_death_row); |
| 727 | tcp_tw_put(tw); | 295 | inet_twsk_put(tw); |
| 728 | } | 296 | } |
| 729 | 297 | ||
| 730 | ret = 0; | 298 | ret = 0; |
| 731 | goto out; | 299 | goto out; |
| 732 | } | 300 | } |
| 733 | 301 | ||
| 734 | head = &tcp_bhash[tcp_bhashfn(snum)]; | 302 | head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; |
| 735 | tb = tcp_sk(sk)->bind_hash; | 303 | tb = inet_csk(sk)->icsk_bind_hash; |
| 736 | spin_lock_bh(&head->lock); | 304 | spin_lock_bh(&head->lock); |
| 737 | if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { | 305 | if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { |
| 738 | __tcp_v4_hash(sk, 0); | 306 | __inet_hash(&tcp_hashinfo, sk, 0); |
| 739 | spin_unlock_bh(&head->lock); | 307 | spin_unlock_bh(&head->lock); |
| 740 | return 0; | 308 | return 0; |
| 741 | } else { | 309 | } else { |
| @@ -798,7 +366,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |||
| 798 | tp->write_seq = 0; | 366 | tp->write_seq = 0; |
| 799 | } | 367 | } |
| 800 | 368 | ||
| 801 | if (sysctl_tcp_tw_recycle && | 369 | if (tcp_death_row.sysctl_tw_recycle && |
| 802 | !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) { | 370 | !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) { |
| 803 | struct inet_peer *peer = rt_get_peer(rt); | 371 | struct inet_peer *peer = rt_get_peer(rt); |
| 804 | 372 | ||
| @@ -837,8 +405,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |||
| 837 | goto failure; | 405 | goto failure; |
| 838 | 406 | ||
| 839 | /* OK, now commit destination to socket. */ | 407 | /* OK, now commit destination to socket. */ |
| 840 | __sk_dst_set(sk, &rt->u.dst); | 408 | sk_setup_caps(sk, &rt->u.dst); |
| 841 | tcp_v4_setup_caps(sk, &rt->u.dst); | ||
| 842 | 409 | ||
| 843 | if (!tp->write_seq) | 410 | if (!tp->write_seq) |
| 844 | tp->write_seq = secure_tcp_sequence_number(inet->saddr, | 411 | tp->write_seq = secure_tcp_sequence_number(inet->saddr, |
| @@ -864,53 +431,6 @@ failure: | |||
| 864 | return err; | 431 | return err; |
| 865 | } | 432 | } |
| 866 | 433 | ||
| 867 | static __inline__ int tcp_v4_iif(struct sk_buff *skb) | ||
| 868 | { | ||
| 869 | return ((struct rtable *)skb->dst)->rt_iif; | ||
| 870 | } | ||
| 871 | |||
| 872 | static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd) | ||
| 873 | { | ||
| 874 | return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1)); | ||
| 875 | } | ||
| 876 | |||
| 877 | static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp, | ||
| 878 | struct request_sock ***prevp, | ||
| 879 | __u16 rport, | ||
| 880 | __u32 raddr, __u32 laddr) | ||
| 881 | { | ||
| 882 | struct listen_sock *lopt = tp->accept_queue.listen_opt; | ||
| 883 | struct request_sock *req, **prev; | ||
| 884 | |||
| 885 | for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)]; | ||
| 886 | (req = *prev) != NULL; | ||
| 887 | prev = &req->dl_next) { | ||
| 888 | const struct inet_request_sock *ireq = inet_rsk(req); | ||
| 889 | |||
| 890 | if (ireq->rmt_port == rport && | ||
| 891 | ireq->rmt_addr == raddr && | ||
| 892 | ireq->loc_addr == laddr && | ||
| 893 | TCP_INET_FAMILY(req->rsk_ops->family)) { | ||
| 894 | BUG_TRAP(!req->sk); | ||
| 895 | *prevp = prev; | ||
| 896 | break; | ||
| 897 | } | ||
| 898 | } | ||
| 899 | |||
| 900 | return req; | ||
| 901 | } | ||
| 902 | |||
| 903 | static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req) | ||
| 904 | { | ||
| 905 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 906 | struct listen_sock *lopt = tp->accept_queue.listen_opt; | ||
| 907 | u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd); | ||
| 908 | |||
| 909 | reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT); | ||
| 910 | tcp_synq_added(sk); | ||
| 911 | } | ||
| 912 | |||
| 913 | |||
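The removed synq helpers hashed a request by remote address and port, seeded with a per-listener random value so an attacker cannot aim traffic at one chain; inet_csk_reqsk_queue_hash_add() now does the equivalent internally. A sketch of the shape of the hash — the mixer below is an illustrative stand-in, not the kernel's jhash_2words():

```c
#include <stdint.h>
#include <stdio.h>

#define SYNQ_HSIZE 512	/* like TCP_SYNQ_HSIZE, a power of two */

/* Illustrative avalanche over both words plus the random seed; the real
 * kernel uses jhash_2words() from <linux/jhash.h>. */
static uint32_t mix2(uint32_t a, uint32_t b, uint32_t seed)
{
	uint32_t h = a * 0x9e3779b1u ^ b * 0x85ebca6bu ^ seed;

	h ^= h >> 16;
	return h;
}

static unsigned int synq_hash(uint32_t raddr, uint16_t rport, uint32_t rnd)
{
	return mix2(raddr, rport, rnd) & (SYNQ_HSIZE - 1);
}

int main(void)
{
	printf("%u\n", synq_hash(0x0a000001, 54321, 0xdeadbeef));
	return 0;
}
```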
| 914 | /* | 434 | /* |
| 915 | * This routine does path mtu discovery as defined in RFC1191. | 435 | * This routine does path mtu discovery as defined in RFC1191. |
| 916 | */ | 436 | */ |
| @@ -993,14 +513,14 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) | |||
| 993 | return; | 513 | return; |
| 994 | } | 514 | } |
| 995 | 515 | ||
| 996 | sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, | 516 | sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr, |
| 997 | th->source, tcp_v4_iif(skb)); | 517 | th->source, inet_iif(skb)); |
| 998 | if (!sk) { | 518 | if (!sk) { |
| 999 | ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); | 519 | ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); |
| 1000 | return; | 520 | return; |
| 1001 | } | 521 | } |
| 1002 | if (sk->sk_state == TCP_TIME_WAIT) { | 522 | if (sk->sk_state == TCP_TIME_WAIT) { |
| 1003 | tcp_tw_put((struct tcp_tw_bucket *)sk); | 523 | inet_twsk_put((struct inet_timewait_sock *)sk); |
| 1004 | return; | 524 | return; |
| 1005 | } | 525 | } |
| 1006 | 526 | ||
| @@ -1054,8 +574,8 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) | |||
| 1054 | if (sock_owned_by_user(sk)) | 574 | if (sock_owned_by_user(sk)) |
| 1055 | goto out; | 575 | goto out; |
| 1056 | 576 | ||
| 1057 | req = tcp_v4_search_req(tp, &prev, th->dest, | 577 | req = inet_csk_search_req(sk, &prev, th->dest, |
| 1058 | iph->daddr, iph->saddr); | 578 | iph->daddr, iph->saddr); |
| 1059 | if (!req) | 579 | if (!req) |
| 1060 | goto out; | 580 | goto out; |
| 1061 | 581 | ||
| @@ -1075,7 +595,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) | |||
| 1075 | * created socket, and POSIX does not want network | 595 | * created socket, and POSIX does not want network |
| 1076 | * errors returned from accept(). | 596 | * errors returned from accept(). |
| 1077 | */ | 597 | */ |
| 1078 | tcp_synq_drop(sk, req, prev); | 598 | inet_csk_reqsk_queue_drop(sk, req, prev); |
| 1079 | goto out; | 599 | goto out; |
| 1080 | 600 | ||
| 1081 | case TCP_SYN_SENT: | 601 | case TCP_SYN_SENT: |
| @@ -1245,12 +765,13 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, | |||
| 1245 | 765 | ||
| 1246 | static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) | 766 | static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) |
| 1247 | { | 767 | { |
| 1248 | struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; | 768 | struct inet_timewait_sock *tw = inet_twsk(sk); |
| 769 | const struct tcp_timewait_sock *tcptw = tcp_twsk(sk); | ||
| 1249 | 770 | ||
| 1250 | tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt, | 771 | tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, |
| 1251 | tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent); | 772 | tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent); |
| 1252 | 773 | ||
| 1253 | tcp_tw_put(tw); | 774 | inet_twsk_put(tw); |
| 1254 | } | 775 | } |
| 1255 | 776 | ||
| 1256 | static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) | 777 | static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) |
| @@ -1259,36 +780,6 @@ static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) | |||
| 1259 | req->ts_recent); | 780 | req->ts_recent); |
| 1260 | } | 781 | } |
| 1261 | 782 | ||
| 1262 | static struct dst_entry* tcp_v4_route_req(struct sock *sk, | ||
| 1263 | struct request_sock *req) | ||
| 1264 | { | ||
| 1265 | struct rtable *rt; | ||
| 1266 | const struct inet_request_sock *ireq = inet_rsk(req); | ||
| 1267 | struct ip_options *opt = inet_rsk(req)->opt; | ||
| 1268 | struct flowi fl = { .oif = sk->sk_bound_dev_if, | ||
| 1269 | .nl_u = { .ip4_u = | ||
| 1270 | { .daddr = ((opt && opt->srr) ? | ||
| 1271 | opt->faddr : | ||
| 1272 | ireq->rmt_addr), | ||
| 1273 | .saddr = ireq->loc_addr, | ||
| 1274 | .tos = RT_CONN_FLAGS(sk) } }, | ||
| 1275 | .proto = IPPROTO_TCP, | ||
| 1276 | .uli_u = { .ports = | ||
| 1277 | { .sport = inet_sk(sk)->sport, | ||
| 1278 | .dport = ireq->rmt_port } } }; | ||
| 1279 | |||
| 1280 | if (ip_route_output_flow(&rt, &fl, sk, 0)) { | ||
| 1281 | IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); | ||
| 1282 | return NULL; | ||
| 1283 | } | ||
| 1284 | if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { | ||
| 1285 | ip_rt_put(rt); | ||
| 1286 | IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); | ||
| 1287 | return NULL; | ||
| 1288 | } | ||
| 1289 | return &rt->u.dst; | ||
| 1290 | } | ||
| 1291 | |||
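tcp_v4_route_req() is gone because its logic was never TCP-specific; the call sites in this file now use inet_csk_route_req(). Its routing policy, distilled to plain C for reference (struct fields are stand-ins for the kernel's option and rtable members):

```c
#include <stdint.h>
#include <stdio.h>

/* With a source route, the first hop replaces the destination in the
 * lookup; a strict source route must be directly reachable, otherwise
 * the route is refused. */
struct opts { int srr, is_strict; uint32_t faddr; };
struct route { uint32_t rt_dst, rt_gateway; };

static uint32_t lookup_daddr(const struct opts *opt, uint32_t rmt_addr)
{
	return (opt && opt->srr) ? opt->faddr : rmt_addr;
}

static int route_acceptable(const struct opts *opt, const struct route *rt)
{
	if (opt && opt->is_strict && rt->rt_dst != rt->rt_gateway)
		return 0;	/* next hop is not the destination itself */
	return 1;
}

int main(void)
{
	struct opts o = { 1, 1, 0x0a000009 };
	struct route r = { 0x0a000009, 0x0a000001 };

	printf("%x %d\n", (unsigned)lookup_daddr(&o, 0x0a0000ff),
	       route_acceptable(&o, &r));	/* a000009 0 */
	return 0;
}
```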
| 1292 | /* | 783 | /* |
| 1293 | * Send a SYN-ACK after having received an ACK. | 784 | * Send a SYN-ACK after having received an ACK. |
| 1294 | * This still operates on a request_sock only, not on a big | 785 | * This still operates on a request_sock only, not on a big |
| @@ -1302,7 +793,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req, | |||
| 1302 | struct sk_buff * skb; | 793 | struct sk_buff * skb; |
| 1303 | 794 | ||
| 1304 | /* First, grab a route. */ | 795 | /* First, grab a route. */ |
| 1305 | if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL) | 796 | if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) |
| 1306 | goto out; | 797 | goto out; |
| 1307 | 798 | ||
| 1308 | skb = tcp_make_synack(sk, dst, req); | 799 | skb = tcp_make_synack(sk, dst, req); |
| @@ -1404,7 +895,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
| 1404 | * limitations, they conserve resources and peer is | 895 | * limitations, they conserve resources and peer is |
| 1405 | * evidently real one. | 896 | * evidently real one. |
| 1406 | */ | 897 | */ |
| 1407 | if (tcp_synq_is_full(sk) && !isn) { | 898 | if (inet_csk_reqsk_queue_is_full(sk) && !isn) { |
| 1408 | #ifdef CONFIG_SYN_COOKIES | 899 | #ifdef CONFIG_SYN_COOKIES |
| 1409 | if (sysctl_tcp_syncookies) { | 900 | if (sysctl_tcp_syncookies) { |
| 1410 | want_cookie = 1; | 901 | want_cookie = 1; |
| @@ -1418,7 +909,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
| 1418 | * clogging syn queue with openreqs with exponentially increasing | 909 | * clogging syn queue with openreqs with exponentially increasing |
| 1419 | * timeout. | 910 | * timeout. |
| 1420 | */ | 911 | */ |
| 1421 | if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) | 912 | if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) |
| 1422 | goto drop; | 913 | goto drop; |
| 1423 | 914 | ||
| 1424 | req = reqsk_alloc(&tcp_request_sock_ops); | 915 | req = reqsk_alloc(&tcp_request_sock_ops); |
| @@ -1474,8 +965,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
| 1474 | * are made in the function processing timewait state. | 965 | * are made in the function processing timewait state. |
| 1475 | */ | 966 | */ |
| 1476 | if (tmp_opt.saw_tstamp && | 967 | if (tmp_opt.saw_tstamp && |
| 1477 | sysctl_tcp_tw_recycle && | 968 | tcp_death_row.sysctl_tw_recycle && |
| 1478 | (dst = tcp_v4_route_req(sk, req)) != NULL && | 969 | (dst = inet_csk_route_req(sk, req)) != NULL && |
| 1479 | (peer = rt_get_peer((struct rtable *)dst)) != NULL && | 970 | (peer = rt_get_peer((struct rtable *)dst)) != NULL && |
| 1480 | peer->v4daddr == saddr) { | 971 | peer->v4daddr == saddr) { |
| 1481 | if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL && | 972 | if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL && |
| @@ -1488,7 +979,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
| 1488 | } | 979 | } |
| 1489 | /* Kill the following clause, if you dislike this way. */ | 980 | /* Kill the following clause, if you dislike this way. */ |
| 1490 | else if (!sysctl_tcp_syncookies && | 981 | else if (!sysctl_tcp_syncookies && |
| 1491 | (sysctl_max_syn_backlog - tcp_synq_len(sk) < | 982 | (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < |
| 1492 | (sysctl_max_syn_backlog >> 2)) && | 983 | (sysctl_max_syn_backlog >> 2)) && |
| 1493 | (!peer || !peer->tcp_ts_stamp) && | 984 | (!peer || !peer->tcp_ts_stamp) && |
| 1494 | (!dst || !dst_metric(dst, RTAX_RTT))) { | 985 | (!dst || !dst_metric(dst, RTAX_RTT))) { |
| @@ -1499,11 +990,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
| 1499 | * to destinations, already remembered | 990 | * to destinations, already remembered |
| 1500 | * to the moment of synflood. | 991 | * to the moment of synflood. |
| 1501 | */ | 992 | */ |
| 1502 | LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open " | 993 | LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open " |
| 1503 | "request from %u.%u." | 994 | "request from %u.%u.%u.%u/%u\n", |
| 1504 | "%u.%u/%u\n", | 995 | NIPQUAD(saddr), |
| 1505 | NIPQUAD(saddr), | 996 | ntohs(skb->h.th->source)); |
| 1506 | ntohs(skb->h.th->source))); | ||
| 1507 | dst_release(dst); | 997 | dst_release(dst); |
| 1508 | goto drop_and_free; | 998 | goto drop_and_free; |
| 1509 | } | 999 | } |
| @@ -1518,7 +1008,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
| 1518 | if (want_cookie) { | 1008 | if (want_cookie) { |
| 1519 | reqsk_free(req); | 1009 | reqsk_free(req); |
| 1520 | } else { | 1010 | } else { |
| 1521 | tcp_v4_synq_add(sk, req); | 1011 | inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); |
| 1522 | } | 1012 | } |
| 1523 | return 0; | 1013 | return 0; |
| 1524 | 1014 | ||
| @@ -1546,15 +1036,14 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
| 1546 | if (sk_acceptq_is_full(sk)) | 1036 | if (sk_acceptq_is_full(sk)) |
| 1547 | goto exit_overflow; | 1037 | goto exit_overflow; |
| 1548 | 1038 | ||
| 1549 | if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL) | 1039 | if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) |
| 1550 | goto exit; | 1040 | goto exit; |
| 1551 | 1041 | ||
| 1552 | newsk = tcp_create_openreq_child(sk, req, skb); | 1042 | newsk = tcp_create_openreq_child(sk, req, skb); |
| 1553 | if (!newsk) | 1043 | if (!newsk) |
| 1554 | goto exit; | 1044 | goto exit; |
| 1555 | 1045 | ||
| 1556 | newsk->sk_dst_cache = dst; | 1046 | sk_setup_caps(newsk, dst); |
| 1557 | tcp_v4_setup_caps(newsk, dst); | ||
| 1558 | 1047 | ||
| 1559 | newtp = tcp_sk(newsk); | 1048 | newtp = tcp_sk(newsk); |
| 1560 | newinet = inet_sk(newsk); | 1049 | newinet = inet_sk(newsk); |
| @@ -1564,7 +1053,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
| 1564 | newinet->saddr = ireq->loc_addr; | 1053 | newinet->saddr = ireq->loc_addr; |
| 1565 | newinet->opt = ireq->opt; | 1054 | newinet->opt = ireq->opt; |
| 1566 | ireq->opt = NULL; | 1055 | ireq->opt = NULL; |
| 1567 | newinet->mc_index = tcp_v4_iif(skb); | 1056 | newinet->mc_index = inet_iif(skb); |
| 1568 | newinet->mc_ttl = skb->nh.iph->ttl; | 1057 | newinet->mc_ttl = skb->nh.iph->ttl; |
| 1569 | newtp->ext_header_len = 0; | 1058 | newtp->ext_header_len = 0; |
| 1570 | if (newinet->opt) | 1059 | if (newinet->opt) |
| @@ -1575,8 +1064,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
| 1575 | newtp->advmss = dst_metric(dst, RTAX_ADVMSS); | 1064 | newtp->advmss = dst_metric(dst, RTAX_ADVMSS); |
| 1576 | tcp_initialize_rcv_mss(newsk); | 1065 | tcp_initialize_rcv_mss(newsk); |
| 1577 | 1066 | ||
| 1578 | __tcp_v4_hash(newsk, 0); | 1067 | __inet_hash(&tcp_hashinfo, newsk, 0); |
| 1579 | __tcp_inherit_port(sk, newsk); | 1068 | __inet_inherit_port(&tcp_hashinfo, sk, newsk); |
| 1580 | 1069 | ||
| 1581 | return newsk; | 1070 | return newsk; |
| 1582 | 1071 | ||
| @@ -1592,27 +1081,24 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) | |||
| 1592 | { | 1081 | { |
| 1593 | struct tcphdr *th = skb->h.th; | 1082 | struct tcphdr *th = skb->h.th; |
| 1594 | struct iphdr *iph = skb->nh.iph; | 1083 | struct iphdr *iph = skb->nh.iph; |
| 1595 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 1596 | struct sock *nsk; | 1084 | struct sock *nsk; |
| 1597 | struct request_sock **prev; | 1085 | struct request_sock **prev; |
| 1598 | /* Find possible connection requests. */ | 1086 | /* Find possible connection requests. */ |
| 1599 | struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source, | 1087 | struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, |
| 1600 | iph->saddr, iph->daddr); | 1088 | iph->saddr, iph->daddr); |
| 1601 | if (req) | 1089 | if (req) |
| 1602 | return tcp_check_req(sk, skb, req, prev); | 1090 | return tcp_check_req(sk, skb, req, prev); |
| 1603 | 1091 | ||
| 1604 | nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr, | 1092 | nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr, |
| 1605 | th->source, | 1093 | th->source, skb->nh.iph->daddr, |
| 1606 | skb->nh.iph->daddr, | 1094 | ntohs(th->dest), inet_iif(skb)); |
| 1607 | ntohs(th->dest), | ||
| 1608 | tcp_v4_iif(skb)); | ||
| 1609 | 1095 | ||
| 1610 | if (nsk) { | 1096 | if (nsk) { |
| 1611 | if (nsk->sk_state != TCP_TIME_WAIT) { | 1097 | if (nsk->sk_state != TCP_TIME_WAIT) { |
| 1612 | bh_lock_sock(nsk); | 1098 | bh_lock_sock(nsk); |
| 1613 | return nsk; | 1099 | return nsk; |
| 1614 | } | 1100 | } |
| 1615 | tcp_tw_put((struct tcp_tw_bucket *)nsk); | 1101 | inet_twsk_put((struct inet_timewait_sock *)nsk); |
| 1616 | return NULL; | 1102 | return NULL; |
| 1617 | } | 1103 | } |
| 1618 | 1104 | ||
| @@ -1631,7 +1117,7 @@ static int tcp_v4_checksum_init(struct sk_buff *skb) | |||
| 1631 | skb->nh.iph->daddr, skb->csum)) | 1117 | skb->nh.iph->daddr, skb->csum)) |
| 1632 | return 0; | 1118 | return 0; |
| 1633 | 1119 | ||
| 1634 | LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n")); | 1120 | LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n"); |
| 1635 | skb->ip_summed = CHECKSUM_NONE; | 1121 | skb->ip_summed = CHECKSUM_NONE; |
| 1636 | } | 1122 | } |
| 1637 | if (skb->len <= 76) { | 1123 | if (skb->len <= 76) { |
| @@ -1747,9 +1233,9 @@ int tcp_v4_rcv(struct sk_buff *skb) | |||
| 1747 | TCP_SKB_CB(skb)->flags = skb->nh.iph->tos; | 1233 | TCP_SKB_CB(skb)->flags = skb->nh.iph->tos; |
| 1748 | TCP_SKB_CB(skb)->sacked = 0; | 1234 | TCP_SKB_CB(skb)->sacked = 0; |
| 1749 | 1235 | ||
| 1750 | sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source, | 1236 | sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source, |
| 1751 | skb->nh.iph->daddr, ntohs(th->dest), | 1237 | skb->nh.iph->daddr, ntohs(th->dest), |
| 1752 | tcp_v4_iif(skb)); | 1238 | inet_iif(skb)); |
| 1753 | 1239 | ||
| 1754 | if (!sk) | 1240 | if (!sk) |
| 1755 | goto no_tcp_socket; | 1241 | goto no_tcp_socket; |
| @@ -1801,24 +1287,26 @@ discard_and_relse: | |||
| 1801 | 1287 | ||
| 1802 | do_time_wait: | 1288 | do_time_wait: |
| 1803 | if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { | 1289 | if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { |
| 1804 | tcp_tw_put((struct tcp_tw_bucket *) sk); | 1290 | inet_twsk_put((struct inet_timewait_sock *) sk); |
| 1805 | goto discard_it; | 1291 | goto discard_it; |
| 1806 | } | 1292 | } |
| 1807 | 1293 | ||
| 1808 | if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { | 1294 | if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { |
| 1809 | TCP_INC_STATS_BH(TCP_MIB_INERRS); | 1295 | TCP_INC_STATS_BH(TCP_MIB_INERRS); |
| 1810 | tcp_tw_put((struct tcp_tw_bucket *) sk); | 1296 | inet_twsk_put((struct inet_timewait_sock *) sk); |
| 1811 | goto discard_it; | 1297 | goto discard_it; |
| 1812 | } | 1298 | } |
| 1813 | switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk, | 1299 | switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk, |
| 1814 | skb, th, skb->len)) { | 1300 | skb, th)) { |
| 1815 | case TCP_TW_SYN: { | 1301 | case TCP_TW_SYN: { |
| 1816 | struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, | 1302 | struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo, |
| 1817 | ntohs(th->dest), | 1303 | skb->nh.iph->daddr, |
| 1818 | tcp_v4_iif(skb)); | 1304 | ntohs(th->dest), |
| 1305 | inet_iif(skb)); | ||
| 1819 | if (sk2) { | 1306 | if (sk2) { |
| 1820 | tcp_tw_deschedule((struct tcp_tw_bucket *)sk); | 1307 | inet_twsk_deschedule((struct inet_timewait_sock *)sk, |
| 1821 | tcp_tw_put((struct tcp_tw_bucket *)sk); | 1308 | &tcp_death_row); |
| 1309 | inet_twsk_put((struct inet_timewait_sock *)sk); | ||
| 1822 | sk = sk2; | 1310 | sk = sk2; |
| 1823 | goto process; | 1311 | goto process; |
| 1824 | } | 1312 | } |
| @@ -1834,112 +1322,6 @@ do_time_wait: | |||
| 1834 | goto discard_it; | 1322 | goto discard_it; |
| 1835 | } | 1323 | } |
| 1836 | 1324 | ||
| 1837 | /* With per-bucket locks this operation is not-atomic, so that | ||
| 1838 | * this version is not worse. | ||
| 1839 | */ | ||
| 1840 | static void __tcp_v4_rehash(struct sock *sk) | ||
| 1841 | { | ||
| 1842 | sk->sk_prot->unhash(sk); | ||
| 1843 | sk->sk_prot->hash(sk); | ||
| 1844 | } | ||
| 1845 | |||
| 1846 | static int tcp_v4_reselect_saddr(struct sock *sk) | ||
| 1847 | { | ||
| 1848 | struct inet_sock *inet = inet_sk(sk); | ||
| 1849 | int err; | ||
| 1850 | struct rtable *rt; | ||
| 1851 | __u32 old_saddr = inet->saddr; | ||
| 1852 | __u32 new_saddr; | ||
| 1853 | __u32 daddr = inet->daddr; | ||
| 1854 | |||
| 1855 | if (inet->opt && inet->opt->srr) | ||
| 1856 | daddr = inet->opt->faddr; | ||
| 1857 | |||
| 1858 | /* Query new route. */ | ||
| 1859 | err = ip_route_connect(&rt, daddr, 0, | ||
| 1860 | RT_CONN_FLAGS(sk), | ||
| 1861 | sk->sk_bound_dev_if, | ||
| 1862 | IPPROTO_TCP, | ||
| 1863 | inet->sport, inet->dport, sk); | ||
| 1864 | if (err) | ||
| 1865 | return err; | ||
| 1866 | |||
| 1867 | __sk_dst_set(sk, &rt->u.dst); | ||
| 1868 | tcp_v4_setup_caps(sk, &rt->u.dst); | ||
| 1869 | |||
| 1870 | new_saddr = rt->rt_src; | ||
| 1871 | |||
| 1872 | if (new_saddr == old_saddr) | ||
| 1873 | return 0; | ||
| 1874 | |||
| 1875 | if (sysctl_ip_dynaddr > 1) { | ||
| 1876 | printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->" | ||
| 1877 | "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n", | ||
| 1878 | NIPQUAD(old_saddr), | ||
| 1879 | NIPQUAD(new_saddr)); | ||
| 1880 | } | ||
| 1881 | |||
| 1882 | inet->saddr = new_saddr; | ||
| 1883 | inet->rcv_saddr = new_saddr; | ||
| 1884 | |||
| 1885 | /* XXX The only one ugly spot where we need to | ||
| 1886 | * XXX really change the sockets identity after | ||
| 1887 | * XXX it has entered the hashes. -DaveM | ||
| 1888 | * | ||
| 1889 | * Besides that, it does not check for connection | ||
| 1890 | * uniqueness. Wait for troubles. | ||
| 1891 | */ | ||
| 1892 | __tcp_v4_rehash(sk); | ||
| 1893 | return 0; | ||
| 1894 | } | ||
| 1895 | |||
| 1896 | int tcp_v4_rebuild_header(struct sock *sk) | ||
| 1897 | { | ||
| 1898 | struct inet_sock *inet = inet_sk(sk); | ||
| 1899 | struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); | ||
| 1900 | u32 daddr; | ||
| 1901 | int err; | ||
| 1902 | |||
| 1903 | /* Route is OK, nothing to do. */ | ||
| 1904 | if (rt) | ||
| 1905 | return 0; | ||
| 1906 | |||
| 1907 | /* Reroute. */ | ||
| 1908 | daddr = inet->daddr; | ||
| 1909 | if (inet->opt && inet->opt->srr) | ||
| 1910 | daddr = inet->opt->faddr; | ||
| 1911 | |||
| 1912 | { | ||
| 1913 | struct flowi fl = { .oif = sk->sk_bound_dev_if, | ||
| 1914 | .nl_u = { .ip4_u = | ||
| 1915 | { .daddr = daddr, | ||
| 1916 | .saddr = inet->saddr, | ||
| 1917 | .tos = RT_CONN_FLAGS(sk) } }, | ||
| 1918 | .proto = IPPROTO_TCP, | ||
| 1919 | .uli_u = { .ports = | ||
| 1920 | { .sport = inet->sport, | ||
| 1921 | .dport = inet->dport } } }; | ||
| 1922 | |||
| 1923 | err = ip_route_output_flow(&rt, &fl, sk, 0); | ||
| 1924 | } | ||
| 1925 | if (!err) { | ||
| 1926 | __sk_dst_set(sk, &rt->u.dst); | ||
| 1927 | tcp_v4_setup_caps(sk, &rt->u.dst); | ||
| 1928 | return 0; | ||
| 1929 | } | ||
| 1930 | |||
| 1931 | /* Routing failed... */ | ||
| 1932 | sk->sk_route_caps = 0; | ||
| 1933 | |||
| 1934 | if (!sysctl_ip_dynaddr || | ||
| 1935 | sk->sk_state != TCP_SYN_SENT || | ||
| 1936 | (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || | ||
| 1937 | (err = tcp_v4_reselect_saddr(sk)) != 0) | ||
| 1938 | sk->sk_err_soft = -err; | ||
| 1939 | |||
| 1940 | return err; | ||
| 1941 | } | ||
| 1942 | |||
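The deleted rebuild/reselect pair becomes the generic inet_sk_rebuild_header() (see the ipv4_specific table below). The control flow it must preserve, as a skeleton with simplified types — a hedged sketch, not the generic implementation itself:

```c
#include <stdint.h>
#include <stdio.h>

enum { TCP_SYN_SENT = 2 };

struct msock {
	int has_valid_dst, state, bindaddr_locked;
	uint32_t saddr;
};

/* Keep a valid cached route; otherwise re-route, and only re-select the
 * source address (which forces an unhash + rehash in the kernel) when
 * ip_dynaddr permits it and the socket is still in SYN-SENT with an
 * unlocked local address. */
static int rebuild_header(struct msock *sk, uint32_t new_saddr,
			  int sysctl_ip_dynaddr)
{
	if (sk->has_valid_dst)
		return 0;			/* route is OK, nothing to do */
	if (new_saddr == sk->saddr) {
		sk->has_valid_dst = 1;		/* same identity, just refresh */
		return 0;
	}
	if (!sysctl_ip_dynaddr || sk->state != TCP_SYN_SENT ||
	    sk->bindaddr_locked)
		return -1;			/* would change a pinned identity */
	sk->saddr = new_saddr;
	sk->has_valid_dst = 1;
	return 0;
}

int main(void)
{
	struct msock sk = { 0, TCP_SYN_SENT, 0, 0x0a000001 };

	printf("%d %x\n", rebuild_header(&sk, 0x0a000002, 1),
	       (unsigned)sk.saddr);		/* 0 a000002 */
	return 0;
}
```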
| 1943 | static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) | 1325 | static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) |
| 1944 | { | 1326 | { |
| 1945 | struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; | 1327 | struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; |
| @@ -1988,18 +1370,18 @@ int tcp_v4_remember_stamp(struct sock *sk) | |||
| 1988 | return 0; | 1370 | return 0; |
| 1989 | } | 1371 | } |
| 1990 | 1372 | ||
| 1991 | int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw) | 1373 | int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) |
| 1992 | { | 1374 | { |
| 1993 | struct inet_peer *peer = NULL; | 1375 | struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1); |
| 1994 | |||
| 1995 | peer = inet_getpeer(tw->tw_daddr, 1); | ||
| 1996 | 1376 | ||
| 1997 | if (peer) { | 1377 | if (peer) { |
| 1998 | if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 || | 1378 | const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); |
| 1379 | |||
| 1380 | if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || | ||
| 1999 | (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && | 1381 | (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && |
| 2000 | peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) { | 1382 | peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) { |
| 2001 | peer->tcp_ts_stamp = tw->tw_ts_recent_stamp; | 1383 | peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp; |
| 2002 | peer->tcp_ts = tw->tw_ts_recent; | 1384 | peer->tcp_ts = tcptw->tw_ts_recent; |
| 2003 | } | 1385 | } |
| 2004 | inet_putpeer(peer); | 1386 | inet_putpeer(peer); |
| 2005 | return 1; | 1387 | return 1; |
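tcp_v4_tw_remember_stamp() pushes a dying connection's last timestamp into the shared inet_peer record so PAWS can police the next connection to the same host. The update rule from the hunk above, runnable in isolation:

```c
#include <stdint.h>
#include <stdio.h>

#define TCP_PAWS_MSL 60

/* Advance the per-peer PAWS record when the TIME-WAIT socket saw a newer
 * timestamp (signed 32-bit wraparound compare), or when the stored record
 * is stale enough that it no longer protects anything. */
static void remember_stamp(int32_t *peer_ts, long *peer_stamp,
			   int32_t tw_ts_recent, long tw_ts_recent_stamp,
			   long now_sec)
{
	if ((int32_t)(*peer_ts - tw_ts_recent) <= 0 ||
	    (*peer_stamp + TCP_PAWS_MSL < now_sec &&
	     *peer_stamp <= tw_ts_recent_stamp)) {
		*peer_stamp = tw_ts_recent_stamp;
		*peer_ts = tw_ts_recent;
	}
}

int main(void)
{
	int32_t ts = 100;
	long stamp = 1000;

	remember_stamp(&ts, &stamp, 200, 1010, 1020);
	printf("%d %ld\n", (int)ts, stamp);	/* 200 1010 */
	return 0;
}
```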
| @@ -2011,7 +1393,7 @@ int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw) | |||
| 2011 | struct tcp_func ipv4_specific = { | 1393 | struct tcp_func ipv4_specific = { |
| 2012 | .queue_xmit = ip_queue_xmit, | 1394 | .queue_xmit = ip_queue_xmit, |
| 2013 | .send_check = tcp_v4_send_check, | 1395 | .send_check = tcp_v4_send_check, |
| 2014 | .rebuild_header = tcp_v4_rebuild_header, | 1396 | .rebuild_header = inet_sk_rebuild_header, |
| 2015 | .conn_request = tcp_v4_conn_request, | 1397 | .conn_request = tcp_v4_conn_request, |
| 2016 | .syn_recv_sock = tcp_v4_syn_recv_sock, | 1398 | .syn_recv_sock = tcp_v4_syn_recv_sock, |
| 2017 | .remember_stamp = tcp_v4_remember_stamp, | 1399 | .remember_stamp = tcp_v4_remember_stamp, |
| @@ -2027,13 +1409,14 @@ struct tcp_func ipv4_specific = { | |||
| 2027 | */ | 1409 | */ |
| 2028 | static int tcp_v4_init_sock(struct sock *sk) | 1410 | static int tcp_v4_init_sock(struct sock *sk) |
| 2029 | { | 1411 | { |
| 1412 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 2030 | struct tcp_sock *tp = tcp_sk(sk); | 1413 | struct tcp_sock *tp = tcp_sk(sk); |
| 2031 | 1414 | ||
| 2032 | skb_queue_head_init(&tp->out_of_order_queue); | 1415 | skb_queue_head_init(&tp->out_of_order_queue); |
| 2033 | tcp_init_xmit_timers(sk); | 1416 | tcp_init_xmit_timers(sk); |
| 2034 | tcp_prequeue_init(tp); | 1417 | tcp_prequeue_init(tp); |
| 2035 | 1418 | ||
| 2036 | tp->rto = TCP_TIMEOUT_INIT; | 1419 | icsk->icsk_rto = TCP_TIMEOUT_INIT; |
| 2037 | tp->mdev = TCP_TIMEOUT_INIT; | 1420 | tp->mdev = TCP_TIMEOUT_INIT; |
| 2038 | 1421 | ||
| 2039 | /* So many TCP implementations out there (incorrectly) count the | 1422 | /* So many TCP implementations out there (incorrectly) count the |
| @@ -2051,7 +1434,7 @@ static int tcp_v4_init_sock(struct sock *sk) | |||
| 2051 | tp->mss_cache = 536; | 1434 | tp->mss_cache = 536; |
| 2052 | 1435 | ||
| 2053 | tp->reordering = sysctl_tcp_reordering; | 1436 | tp->reordering = sysctl_tcp_reordering; |
| 2054 | tp->ca_ops = &tcp_init_congestion_ops; | 1437 | icsk->icsk_ca_ops = &tcp_init_congestion_ops; |
| 2055 | 1438 | ||
| 2056 | sk->sk_state = TCP_CLOSE; | 1439 | sk->sk_state = TCP_CLOSE; |
| 2057 | 1440 | ||
| @@ -2074,7 +1457,7 @@ int tcp_v4_destroy_sock(struct sock *sk) | |||
| 2074 | 1457 | ||
| 2075 | tcp_clear_xmit_timers(sk); | 1458 | tcp_clear_xmit_timers(sk); |
| 2076 | 1459 | ||
| 2077 | tcp_cleanup_congestion_control(tp); | 1460 | tcp_cleanup_congestion_control(sk); |
| 2078 | 1461 | ||
| 2079 | /* Cleanup up the write buffer. */ | 1462 | /* Cleanup up the write buffer. */ |
| 2080 | sk_stream_writequeue_purge(sk); | 1463 | sk_stream_writequeue_purge(sk); |
| @@ -2086,8 +1469,8 @@ int tcp_v4_destroy_sock(struct sock *sk) | |||
| 2086 | __skb_queue_purge(&tp->ucopy.prequeue); | 1469 | __skb_queue_purge(&tp->ucopy.prequeue); |
| 2087 | 1470 | ||
| 2088 | /* Clean up a referenced TCP bind bucket. */ | 1471 | /* Clean up a referenced TCP bind bucket. */ |
| 2089 | if (tp->bind_hash) | 1472 | if (inet_csk(sk)->icsk_bind_hash) |
| 2090 | tcp_put_port(sk); | 1473 | inet_put_port(&tcp_hashinfo, sk); |
| 2091 | 1474 | ||
| 2092 | /* | 1475 | /* |
| 2093 | * If sendmsg cached page exists, toss it. | 1476 | * If sendmsg cached page exists, toss it. |
| @@ -2107,13 +1490,13 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); | |||
| 2107 | #ifdef CONFIG_PROC_FS | 1490 | #ifdef CONFIG_PROC_FS |
| 2108 | /* Proc filesystem TCP sock list dumping. */ | 1491 | /* Proc filesystem TCP sock list dumping. */ |
| 2109 | 1492 | ||
| 2110 | static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head) | 1493 | static inline struct inet_timewait_sock *tw_head(struct hlist_head *head) |
| 2111 | { | 1494 | { |
| 2112 | return hlist_empty(head) ? NULL : | 1495 | return hlist_empty(head) ? NULL : |
| 2113 | list_entry(head->first, struct tcp_tw_bucket, tw_node); | 1496 | list_entry(head->first, struct inet_timewait_sock, tw_node); |
| 2114 | } | 1497 | } |
| 2115 | 1498 | ||
| 2116 | static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw) | 1499 | static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) |
| 2117 | { | 1500 | { |
| 2118 | return tw->tw_node.next ? | 1501 | return tw->tw_node.next ? |
| 2119 | hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; | 1502 | hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; |
| @@ -2121,14 +1504,14 @@ static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw) | |||
| 2121 | 1504 | ||
| 2122 | static void *listening_get_next(struct seq_file *seq, void *cur) | 1505 | static void *listening_get_next(struct seq_file *seq, void *cur) |
| 2123 | { | 1506 | { |
| 2124 | struct tcp_sock *tp; | 1507 | struct inet_connection_sock *icsk; |
| 2125 | struct hlist_node *node; | 1508 | struct hlist_node *node; |
| 2126 | struct sock *sk = cur; | 1509 | struct sock *sk = cur; |
| 2127 | struct tcp_iter_state* st = seq->private; | 1510 | struct tcp_iter_state* st = seq->private; |
| 2128 | 1511 | ||
| 2129 | if (!sk) { | 1512 | if (!sk) { |
| 2130 | st->bucket = 0; | 1513 | st->bucket = 0; |
| 2131 | sk = sk_head(&tcp_listening_hash[0]); | 1514 | sk = sk_head(&tcp_hashinfo.listening_hash[0]); |
| 2132 | goto get_sk; | 1515 | goto get_sk; |
| 2133 | } | 1516 | } |
| 2134 | 1517 | ||
| @@ -2137,7 +1520,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) | |||
| 2137 | if (st->state == TCP_SEQ_STATE_OPENREQ) { | 1520 | if (st->state == TCP_SEQ_STATE_OPENREQ) { |
| 2138 | struct request_sock *req = cur; | 1521 | struct request_sock *req = cur; |
| 2139 | 1522 | ||
| 2140 | tp = tcp_sk(st->syn_wait_sk); | 1523 | icsk = inet_csk(st->syn_wait_sk); |
| 2141 | req = req->dl_next; | 1524 | req = req->dl_next; |
| 2142 | while (1) { | 1525 | while (1) { |
| 2143 | while (req) { | 1526 | while (req) { |
| @@ -2150,17 +1533,17 @@ static void *listening_get_next(struct seq_file *seq, void *cur) | |||
| 2150 | if (++st->sbucket >= TCP_SYNQ_HSIZE) | 1533 | if (++st->sbucket >= TCP_SYNQ_HSIZE) |
| 2151 | break; | 1534 | break; |
| 2152 | get_req: | 1535 | get_req: |
| 2153 | req = tp->accept_queue.listen_opt->syn_table[st->sbucket]; | 1536 | req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; |
| 2154 | } | 1537 | } |
| 2155 | sk = sk_next(st->syn_wait_sk); | 1538 | sk = sk_next(st->syn_wait_sk); |
| 2156 | st->state = TCP_SEQ_STATE_LISTENING; | 1539 | st->state = TCP_SEQ_STATE_LISTENING; |
| 2157 | read_unlock_bh(&tp->accept_queue.syn_wait_lock); | 1540 | read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); |
| 2158 | } else { | 1541 | } else { |
| 2159 | tp = tcp_sk(sk); | 1542 | icsk = inet_csk(sk); |
| 2160 | read_lock_bh(&tp->accept_queue.syn_wait_lock); | 1543 | read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); |
| 2161 | if (reqsk_queue_len(&tp->accept_queue)) | 1544 | if (reqsk_queue_len(&icsk->icsk_accept_queue)) |
| 2162 | goto start_req; | 1545 | goto start_req; |
| 2163 | read_unlock_bh(&tp->accept_queue.syn_wait_lock); | 1546 | read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); |
| 2164 | sk = sk_next(sk); | 1547 | sk = sk_next(sk); |
| 2165 | } | 1548 | } |
| 2166 | get_sk: | 1549 | get_sk: |
| @@ -2169,9 +1552,9 @@ get_sk: | |||
| 2169 | cur = sk; | 1552 | cur = sk; |
| 2170 | goto out; | 1553 | goto out; |
| 2171 | } | 1554 | } |
| 2172 | tp = tcp_sk(sk); | 1555 | icsk = inet_csk(sk); |
| 2173 | read_lock_bh(&tp->accept_queue.syn_wait_lock); | 1556 | read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); |
| 2174 | if (reqsk_queue_len(&tp->accept_queue)) { | 1557 | if (reqsk_queue_len(&icsk->icsk_accept_queue)) { |
| 2175 | start_req: | 1558 | start_req: |
| 2176 | st->uid = sock_i_uid(sk); | 1559 | st->uid = sock_i_uid(sk); |
| 2177 | st->syn_wait_sk = sk; | 1560 | st->syn_wait_sk = sk; |
| @@ -2179,10 +1562,10 @@ start_req: | |||
| 2179 | st->sbucket = 0; | 1562 | st->sbucket = 0; |
| 2180 | goto get_req; | 1563 | goto get_req; |
| 2181 | } | 1564 | } |
| 2182 | read_unlock_bh(&tp->accept_queue.syn_wait_lock); | 1565 | read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); |
| 2183 | } | 1566 | } |
| 2184 | if (++st->bucket < TCP_LHTABLE_SIZE) { | 1567 | if (++st->bucket < INET_LHTABLE_SIZE) { |
| 2185 | sk = sk_head(&tcp_listening_hash[st->bucket]); | 1568 | sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]); |
| 2186 | goto get_sk; | 1569 | goto get_sk; |
| 2187 | } | 1570 | } |
| 2188 | cur = NULL; | 1571 | cur = NULL; |
| @@ -2206,16 +1589,16 @@ static void *established_get_first(struct seq_file *seq) | |||
| 2206 | struct tcp_iter_state* st = seq->private; | 1589 | struct tcp_iter_state* st = seq->private; |
| 2207 | void *rc = NULL; | 1590 | void *rc = NULL; |
| 2208 | 1591 | ||
| 2209 | for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) { | 1592 | for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { |
| 2210 | struct sock *sk; | 1593 | struct sock *sk; |
| 2211 | struct hlist_node *node; | 1594 | struct hlist_node *node; |
| 2212 | struct tcp_tw_bucket *tw; | 1595 | struct inet_timewait_sock *tw; |
| 2213 | 1596 | ||
| 2214 | /* We can reschedule _before_ having picked the target: */ | 1597 | /* We can reschedule _before_ having picked the target: */ |
| 2215 | cond_resched_softirq(); | 1598 | cond_resched_softirq(); |
| 2216 | 1599 | ||
| 2217 | read_lock(&tcp_ehash[st->bucket].lock); | 1600 | read_lock(&tcp_hashinfo.ehash[st->bucket].lock); |
| 2218 | sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) { | 1601 | sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { |
| 2219 | if (sk->sk_family != st->family) { | 1602 | if (sk->sk_family != st->family) { |
| 2220 | continue; | 1603 | continue; |
| 2221 | } | 1604 | } |
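established_get_first() depends on the ehash layout of this era: one array of 2*ehash_size buckets, with established sockets in the first half and the TIME_WAIT sockets that hashed to the same slot sitting at the same index plus ehash_size. A sketch of that index math (the size is illustrative):

```c
#include <stdio.h>

#define EHASH_SIZE 8	/* illustrative; the kernel sizes this at boot */

static unsigned established_bucket(unsigned hash)
{
	return hash % EHASH_SIZE;
}

static unsigned timewait_bucket(unsigned hash)
{
	/* Same slot, second half of the table. */
	return hash % EHASH_SIZE + EHASH_SIZE;
}

int main(void)
{
	unsigned hash = 13;	/* stand-in for a connection's ehash value */

	printf("established chain %u, timewait chain %u\n",
	       established_bucket(hash), timewait_bucket(hash));
	return 0;
}
```

That is why the iterator can switch from TCP_SEQ_STATE_ESTABLISHED to TCP_SEQ_STATE_TIME_WAIT without re-hashing: it simply re-reads the chain at st->bucket + ehash_size under the same bucket lock.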
| @@ -2223,15 +1606,15 @@ static void *established_get_first(struct seq_file *seq) | |||
| 2223 | goto out; | 1606 | goto out; |
| 2224 | } | 1607 | } |
| 2225 | st->state = TCP_SEQ_STATE_TIME_WAIT; | 1608 | st->state = TCP_SEQ_STATE_TIME_WAIT; |
| 2226 | tw_for_each(tw, node, | 1609 | inet_twsk_for_each(tw, node, |
| 2227 | &tcp_ehash[st->bucket + tcp_ehash_size].chain) { | 1610 | &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) { |
| 2228 | if (tw->tw_family != st->family) { | 1611 | if (tw->tw_family != st->family) { |
| 2229 | continue; | 1612 | continue; |
| 2230 | } | 1613 | } |
| 2231 | rc = tw; | 1614 | rc = tw; |
| 2232 | goto out; | 1615 | goto out; |
| 2233 | } | 1616 | } |
| 2234 | read_unlock(&tcp_ehash[st->bucket].lock); | 1617 | read_unlock(&tcp_hashinfo.ehash[st->bucket].lock); |
| 2235 | st->state = TCP_SEQ_STATE_ESTABLISHED; | 1618 | st->state = TCP_SEQ_STATE_ESTABLISHED; |
| 2236 | } | 1619 | } |
| 2237 | out: | 1620 | out: |
| @@ -2241,7 +1624,7 @@ out: | |||
| 2241 | static void *established_get_next(struct seq_file *seq, void *cur) | 1624 | static void *established_get_next(struct seq_file *seq, void *cur) |
| 2242 | { | 1625 | { |
| 2243 | struct sock *sk = cur; | 1626 | struct sock *sk = cur; |
| 2244 | struct tcp_tw_bucket *tw; | 1627 | struct inet_timewait_sock *tw; |
| 2245 | struct hlist_node *node; | 1628 | struct hlist_node *node; |
| 2246 | struct tcp_iter_state* st = seq->private; | 1629 | struct tcp_iter_state* st = seq->private; |
| 2247 | 1630 | ||
| @@ -2258,15 +1641,15 @@ get_tw: | |||
| 2258 | cur = tw; | 1641 | cur = tw; |
| 2259 | goto out; | 1642 | goto out; |
| 2260 | } | 1643 | } |
| 2261 | read_unlock(&tcp_ehash[st->bucket].lock); | 1644 | read_unlock(&tcp_hashinfo.ehash[st->bucket].lock); |
| 2262 | st->state = TCP_SEQ_STATE_ESTABLISHED; | 1645 | st->state = TCP_SEQ_STATE_ESTABLISHED; |
| 2263 | 1646 | ||
| 2264 | /* We can reschedule between buckets: */ | 1647 | /* We can reschedule between buckets: */ |
| 2265 | cond_resched_softirq(); | 1648 | cond_resched_softirq(); |
| 2266 | 1649 | ||
| 2267 | if (++st->bucket < tcp_ehash_size) { | 1650 | if (++st->bucket < tcp_hashinfo.ehash_size) { |
| 2268 | read_lock(&tcp_ehash[st->bucket].lock); | 1651 | read_lock(&tcp_hashinfo.ehash[st->bucket].lock); |
| 2269 | sk = sk_head(&tcp_ehash[st->bucket].chain); | 1652 | sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); |
| 2270 | } else { | 1653 | } else { |
| 2271 | cur = NULL; | 1654 | cur = NULL; |
| 2272 | goto out; | 1655 | goto out; |
| @@ -2280,7 +1663,7 @@ get_tw: | |||
| 2280 | } | 1663 | } |
| 2281 | 1664 | ||
| 2282 | st->state = TCP_SEQ_STATE_TIME_WAIT; | 1665 | st->state = TCP_SEQ_STATE_TIME_WAIT; |
| 2283 | tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain); | 1666 | tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain); |
| 2284 | goto get_tw; | 1667 | goto get_tw; |
| 2285 | found: | 1668 | found: |
| 2286 | cur = sk; | 1669 | cur = sk; |
| @@ -2304,12 +1687,12 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos) | |||
| 2304 | void *rc; | 1687 | void *rc; |
| 2305 | struct tcp_iter_state* st = seq->private; | 1688 | struct tcp_iter_state* st = seq->private; |
| 2306 | 1689 | ||
| 2307 | tcp_listen_lock(); | 1690 | inet_listen_lock(&tcp_hashinfo); |
| 2308 | st->state = TCP_SEQ_STATE_LISTENING; | 1691 | st->state = TCP_SEQ_STATE_LISTENING; |
| 2309 | rc = listening_get_idx(seq, &pos); | 1692 | rc = listening_get_idx(seq, &pos); |
| 2310 | 1693 | ||
| 2311 | if (!rc) { | 1694 | if (!rc) { |
| 2312 | tcp_listen_unlock(); | 1695 | inet_listen_unlock(&tcp_hashinfo); |
| 2313 | local_bh_disable(); | 1696 | local_bh_disable(); |
| 2314 | st->state = TCP_SEQ_STATE_ESTABLISHED; | 1697 | st->state = TCP_SEQ_STATE_ESTABLISHED; |
| 2315 | rc = established_get_idx(seq, pos); | 1698 | rc = established_get_idx(seq, pos); |
| @@ -2342,7 +1725,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) | |||
| 2342 | case TCP_SEQ_STATE_LISTENING: | 1725 | case TCP_SEQ_STATE_LISTENING: |
| 2343 | rc = listening_get_next(seq, v); | 1726 | rc = listening_get_next(seq, v); |
| 2344 | if (!rc) { | 1727 | if (!rc) { |
| 2345 | tcp_listen_unlock(); | 1728 | inet_listen_unlock(&tcp_hashinfo); |
| 2346 | local_bh_disable(); | 1729 | local_bh_disable(); |
| 2347 | st->state = TCP_SEQ_STATE_ESTABLISHED; | 1730 | st->state = TCP_SEQ_STATE_ESTABLISHED; |
| 2348 | rc = established_get_first(seq); | 1731 | rc = established_get_first(seq); |
| @@ -2365,17 +1748,17 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) | |||
| 2365 | switch (st->state) { | 1748 | switch (st->state) { |
| 2366 | case TCP_SEQ_STATE_OPENREQ: | 1749 | case TCP_SEQ_STATE_OPENREQ: |
| 2367 | if (v) { | 1750 | if (v) { |
| 2368 | struct tcp_sock *tp = tcp_sk(st->syn_wait_sk); | 1751 | struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk); |
| 2369 | read_unlock_bh(&tp->accept_queue.syn_wait_lock); | 1752 | read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); |
| 2370 | } | 1753 | } |
| 2371 | case TCP_SEQ_STATE_LISTENING: | 1754 | case TCP_SEQ_STATE_LISTENING: |
| 2372 | if (v != SEQ_START_TOKEN) | 1755 | if (v != SEQ_START_TOKEN) |
| 2373 | tcp_listen_unlock(); | 1756 | inet_listen_unlock(&tcp_hashinfo); |
| 2374 | break; | 1757 | break; |
| 2375 | case TCP_SEQ_STATE_TIME_WAIT: | 1758 | case TCP_SEQ_STATE_TIME_WAIT: |
| 2376 | case TCP_SEQ_STATE_ESTABLISHED: | 1759 | case TCP_SEQ_STATE_ESTABLISHED: |
| 2377 | if (v) | 1760 | if (v) |
| 2378 | read_unlock(&tcp_ehash[st->bucket].lock); | 1761 | read_unlock(&tcp_hashinfo.ehash[st->bucket].lock); |
| 2379 | local_bh_enable(); | 1762 | local_bh_enable(); |
| 2380 | break; | 1763 | break; |
| 2381 | } | 1764 | } |
| @@ -2472,18 +1855,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) | |||
| 2472 | int timer_active; | 1855 | int timer_active; |
| 2473 | unsigned long timer_expires; | 1856 | unsigned long timer_expires; |
| 2474 | struct tcp_sock *tp = tcp_sk(sp); | 1857 | struct tcp_sock *tp = tcp_sk(sp); |
| 1858 | const struct inet_connection_sock *icsk = inet_csk(sp); | ||
| 2475 | struct inet_sock *inet = inet_sk(sp); | 1859 | struct inet_sock *inet = inet_sk(sp); |
| 2476 | unsigned int dest = inet->daddr; | 1860 | unsigned int dest = inet->daddr; |
| 2477 | unsigned int src = inet->rcv_saddr; | 1861 | unsigned int src = inet->rcv_saddr; |
| 2478 | __u16 destp = ntohs(inet->dport); | 1862 | __u16 destp = ntohs(inet->dport); |
| 2479 | __u16 srcp = ntohs(inet->sport); | 1863 | __u16 srcp = ntohs(inet->sport); |
| 2480 | 1864 | ||
| 2481 | if (tp->pending == TCP_TIME_RETRANS) { | 1865 | if (icsk->icsk_pending == ICSK_TIME_RETRANS) { |
| 2482 | timer_active = 1; | 1866 | timer_active = 1; |
| 2483 | timer_expires = tp->timeout; | 1867 | timer_expires = icsk->icsk_timeout; |
| 2484 | } else if (tp->pending == TCP_TIME_PROBE0) { | 1868 | } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { |
| 2485 | timer_active = 4; | 1869 | timer_active = 4; |
| 2486 | timer_expires = tp->timeout; | 1870 | timer_expires = icsk->icsk_timeout; |
| 2487 | } else if (timer_pending(&sp->sk_timer)) { | 1871 | } else if (timer_pending(&sp->sk_timer)) { |
| 2488 | timer_active = 2; | 1872 | timer_active = 2; |
| 2489 | timer_expires = sp->sk_timer.expires; | 1873 | timer_expires = sp->sk_timer.expires; |
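With the timers generalised, the /proc dump reads icsk_pending and icsk_timeout rather than the old tp->pending and tp->timeout. A user-space model of that decision chain (the constants are stand-ins for the ICSK_TIME_* values; the keepalive case keys off sk_timer instead):

```c
#include <stdio.h>

enum { ICSK_TIME_RETRANS = 1, ICSK_TIME_PROBE0 = 3 };	/* stand-ins */

struct icsk_model {
	int           icsk_pending;	/* which timer is armed */
	unsigned long icsk_timeout;	/* when it fires, in jiffies */
};

/* Map pending-timer state to the code printed in /proc/net/tcp. */
static int timer_code(const struct icsk_model *icsk, unsigned long *expires)
{
	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
		*expires = icsk->icsk_timeout;
		return 1;			/* retransmit timer */
	}
	if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		*expires = icsk->icsk_timeout;
		return 4;			/* zero-window probe timer */
	}
	return 0;	/* keepalive (2) is checked via sk_timer, omitted here */
}

int main(void)
{
	struct icsk_model icsk = { ICSK_TIME_PROBE0, 12345 };
	unsigned long when = 0;

	printf("timer_active=%d expires=%lu\n", timer_code(&icsk, &when), when);
	return 0;
}
```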
| @@ -2498,17 +1882,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) | |||
| 2498 | tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq, | 1882 | tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq, |
| 2499 | timer_active, | 1883 | timer_active, |
| 2500 | jiffies_to_clock_t(timer_expires - jiffies), | 1884 | jiffies_to_clock_t(timer_expires - jiffies), |
| 2501 | tp->retransmits, | 1885 | icsk->icsk_retransmits, |
| 2502 | sock_i_uid(sp), | 1886 | sock_i_uid(sp), |
| 2503 | tp->probes_out, | 1887 | icsk->icsk_probes_out, |
| 2504 | sock_i_ino(sp), | 1888 | sock_i_ino(sp), |
| 2505 | atomic_read(&sp->sk_refcnt), sp, | 1889 | atomic_read(&sp->sk_refcnt), sp, |
| 2506 | tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong, | 1890 | icsk->icsk_rto, |
| 1891 | icsk->icsk_ack.ato, | ||
| 1892 | (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, | ||
| 2507 | tp->snd_cwnd, | 1893 | tp->snd_cwnd, |
| 2508 | tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh); | 1894 | tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh); |
| 2509 | } | 1895 | } |
| 2510 | 1896 | ||
| 2511 | static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i) | 1897 | static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i) |
| 2512 | { | 1898 | { |
| 2513 | unsigned int dest, src; | 1899 | unsigned int dest, src; |
| 2514 | __u16 destp, srcp; | 1900 | __u16 destp, srcp; |
| @@ -2588,7 +1974,7 @@ struct proto tcp_prot = { | |||
| 2588 | .close = tcp_close, | 1974 | .close = tcp_close, |
| 2589 | .connect = tcp_v4_connect, | 1975 | .connect = tcp_v4_connect, |
| 2590 | .disconnect = tcp_disconnect, | 1976 | .disconnect = tcp_disconnect, |
| 2591 | .accept = tcp_accept, | 1977 | .accept = inet_csk_accept, |
| 2592 | .ioctl = tcp_ioctl, | 1978 | .ioctl = tcp_ioctl, |
| 2593 | .init = tcp_v4_init_sock, | 1979 | .init = tcp_v4_init_sock, |
| 2594 | .destroy = tcp_v4_destroy_sock, | 1980 | .destroy = tcp_v4_destroy_sock, |
| @@ -2603,6 +1989,7 @@ struct proto tcp_prot = { | |||
| 2603 | .get_port = tcp_v4_get_port, | 1989 | .get_port = tcp_v4_get_port, |
| 2604 | .enter_memory_pressure = tcp_enter_memory_pressure, | 1990 | .enter_memory_pressure = tcp_enter_memory_pressure, |
| 2605 | .sockets_allocated = &tcp_sockets_allocated, | 1991 | .sockets_allocated = &tcp_sockets_allocated, |
| 1992 | .orphan_count = &tcp_orphan_count, | ||
| 2606 | .memory_allocated = &tcp_memory_allocated, | 1993 | .memory_allocated = &tcp_memory_allocated, |
| 2607 | .memory_pressure = &tcp_memory_pressure, | 1994 | .memory_pressure = &tcp_memory_pressure, |
| 2608 | .sysctl_mem = sysctl_tcp_mem, | 1995 | .sysctl_mem = sysctl_tcp_mem, |
| @@ -2610,6 +1997,7 @@ struct proto tcp_prot = { | |||
| 2610 | .sysctl_rmem = sysctl_tcp_rmem, | 1997 | .sysctl_rmem = sysctl_tcp_rmem, |
| 2611 | .max_header = MAX_TCP_HEADER, | 1998 | .max_header = MAX_TCP_HEADER, |
| 2612 | .obj_size = sizeof(struct tcp_sock), | 1999 | .obj_size = sizeof(struct tcp_sock), |
| 2000 | .twsk_obj_size = sizeof(struct tcp_timewait_sock), | ||
| 2613 | .rsk_prot = &tcp_request_sock_ops, | 2001 | .rsk_prot = &tcp_request_sock_ops, |
| 2614 | }; | 2002 | }; |
| 2615 | 2003 | ||
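The new .twsk_obj_size slot is what lets generic code allocate a protocol-specific timewait socket without knowing its concrete type: the proto table carries the size, and the allocator only ever sees the common header. Roughly this shape, as a simplified user-space model (not the kernel structures):

```c
#include <stdlib.h>

/* Common head every timewait "mini socket" starts with. */
struct tw_common { int family; int substate; };

/* Protocol-specific tail, e.g. struct tcp_timewait_sock. */
struct tcp_tw_model { struct tw_common common; unsigned rcv_nxt, snd_nxt; };

struct proto_model {
	const char *name;
	size_t      twsk_obj_size;	/* the role of .twsk_obj_size above */
};

static const struct proto_model tcp_proto_model = {
	.name          = "TCP",
	.twsk_obj_size = sizeof(struct tcp_tw_model),
};

/* Generic allocator: reserves the full object, returns the common head. */
static struct tw_common *twsk_alloc(const struct proto_model *p)
{
	return calloc(1, p->twsk_obj_size);
}

int main(void)
{
	struct tw_common *tw = twsk_alloc(&tcp_proto_model);

	free(tw);
	return 0;
}
```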
| @@ -2631,19 +2019,13 @@ void __init tcp_v4_init(struct net_proto_family *ops) | |||
| 2631 | } | 2019 | } |
| 2632 | 2020 | ||
| 2633 | EXPORT_SYMBOL(ipv4_specific); | 2021 | EXPORT_SYMBOL(ipv4_specific); |
| 2634 | EXPORT_SYMBOL(tcp_bind_hash); | 2022 | EXPORT_SYMBOL(inet_bind_bucket_create); |
| 2635 | EXPORT_SYMBOL(tcp_bucket_create); | ||
| 2636 | EXPORT_SYMBOL(tcp_hashinfo); | 2023 | EXPORT_SYMBOL(tcp_hashinfo); |
| 2637 | EXPORT_SYMBOL(tcp_inherit_port); | ||
| 2638 | EXPORT_SYMBOL(tcp_listen_wlock); | ||
| 2639 | EXPORT_SYMBOL(tcp_port_rover); | ||
| 2640 | EXPORT_SYMBOL(tcp_prot); | 2024 | EXPORT_SYMBOL(tcp_prot); |
| 2641 | EXPORT_SYMBOL(tcp_put_port); | ||
| 2642 | EXPORT_SYMBOL(tcp_unhash); | 2025 | EXPORT_SYMBOL(tcp_unhash); |
| 2643 | EXPORT_SYMBOL(tcp_v4_conn_request); | 2026 | EXPORT_SYMBOL(tcp_v4_conn_request); |
| 2644 | EXPORT_SYMBOL(tcp_v4_connect); | 2027 | EXPORT_SYMBOL(tcp_v4_connect); |
| 2645 | EXPORT_SYMBOL(tcp_v4_do_rcv); | 2028 | EXPORT_SYMBOL(tcp_v4_do_rcv); |
| 2646 | EXPORT_SYMBOL(tcp_v4_rebuild_header); | ||
| 2647 | EXPORT_SYMBOL(tcp_v4_remember_stamp); | 2029 | EXPORT_SYMBOL(tcp_v4_remember_stamp); |
| 2648 | EXPORT_SYMBOL(tcp_v4_send_check); | 2030 | EXPORT_SYMBOL(tcp_v4_send_check); |
| 2649 | EXPORT_SYMBOL(tcp_v4_syn_recv_sock); | 2031 | EXPORT_SYMBOL(tcp_v4_syn_recv_sock); |
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f42a284164b7..a88db28b0af7 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
| @@ -35,13 +35,27 @@ | |||
| 35 | #define SYNC_INIT 1 | 35 | #define SYNC_INIT 1 |
| 36 | #endif | 36 | #endif |
| 37 | 37 | ||
| 38 | int sysctl_tcp_tw_recycle; | ||
| 39 | int sysctl_tcp_max_tw_buckets = NR_FILE*2; | ||
| 40 | |||
| 41 | int sysctl_tcp_syncookies = SYNC_INIT; | 38 | int sysctl_tcp_syncookies = SYNC_INIT; |
| 42 | int sysctl_tcp_abort_on_overflow; | 39 | int sysctl_tcp_abort_on_overflow; |
| 43 | 40 | ||
| 44 | static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo); | 41 | struct inet_timewait_death_row tcp_death_row = { |
| 42 | .sysctl_max_tw_buckets = NR_FILE * 2, | ||
| 43 | .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, | ||
| 44 | .death_lock = SPIN_LOCK_UNLOCKED, | ||
| 45 | .hashinfo = &tcp_hashinfo, | ||
| 46 | .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, | ||
| 47 | (unsigned long)&tcp_death_row), | ||
| 48 | .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work, | ||
| 49 | inet_twdr_twkill_work, | ||
| 50 | &tcp_death_row), | ||
| 51 | /* Short-time timewait calendar */ | ||
| 52 | |||
| 53 | .twcal_hand = -1, | ||
| 54 | .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, | ||
| 55 | (unsigned long)&tcp_death_row), | ||
| 56 | }; | ||
| 57 | |||
| 58 | EXPORT_SYMBOL_GPL(tcp_death_row); | ||
| 45 | 59 | ||
| 46 | static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) | 60 | static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) |
| 47 | { | 61 | { |
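The hunk above folds the old per-protocol globals — sysctl_tcp_max_tw_buckets, the twkill timer and work queue, the recycle-calendar hand — into a single inet_timewait_death_row that generic code can drive for any INET transport. Compressed to its data shape (types simplified; NR_FILE, which is configuration-dependent, is stood in by a constant):

```c
#include <stdio.h>

#define TCP_TIMEWAIT_LEN (60 * 100)	/* illustrative: 60 s of 10 ms ticks */
#define TWKILL_SLOTS     8		/* must remain a power of 2 */
#define NR_FILE_STANDIN  1024		/* assumption, for the sketch only */

struct death_row_model {
	int sysctl_max_tw_buckets;
	int period;	/* slow-wheel tick: TIMEWAIT_LEN / slots */
	int tw_count;
	int twcal_hand;	/* recycle-calendar hand, -1 = idle */
};

static struct death_row_model tcp_death_row_model = {
	.sysctl_max_tw_buckets = NR_FILE_STANDIN * 2,
	.period                = TCP_TIMEWAIT_LEN / TWKILL_SLOTS,
	.twcal_hand            = -1,
};

int main(void)
{
	printf("reap period: %d ticks per slot\n", tcp_death_row_model.period);
	return 0;
}
```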
| @@ -52,47 +66,6 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) | |||
| 52 | return (seq == e_win && seq == end_seq); | 66 | return (seq == e_win && seq == end_seq); |
| 53 | } | 67 | } |
| 54 | 68 | ||
| 55 | /* New-style handling of TIME_WAIT sockets. */ | ||
| 56 | |||
| 57 | int tcp_tw_count; | ||
| 58 | |||
| 59 | |||
| 60 | /* Must be called with locally disabled BHs. */ | ||
| 61 | static void tcp_timewait_kill(struct tcp_tw_bucket *tw) | ||
| 62 | { | ||
| 63 | struct tcp_ehash_bucket *ehead; | ||
| 64 | struct tcp_bind_hashbucket *bhead; | ||
| 65 | struct tcp_bind_bucket *tb; | ||
| 66 | |||
| 67 | /* Unlink from established hashes. */ | ||
| 68 | ehead = &tcp_ehash[tw->tw_hashent]; | ||
| 69 | write_lock(&ehead->lock); | ||
| 70 | if (hlist_unhashed(&tw->tw_node)) { | ||
| 71 | write_unlock(&ehead->lock); | ||
| 72 | return; | ||
| 73 | } | ||
| 74 | __hlist_del(&tw->tw_node); | ||
| 75 | sk_node_init(&tw->tw_node); | ||
| 76 | write_unlock(&ehead->lock); | ||
| 77 | |||
| 78 | /* Disassociate with bind bucket. */ | ||
| 79 | bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)]; | ||
| 80 | spin_lock(&bhead->lock); | ||
| 81 | tb = tw->tw_tb; | ||
| 82 | __hlist_del(&tw->tw_bind_node); | ||
| 83 | tw->tw_tb = NULL; | ||
| 84 | tcp_bucket_destroy(tb); | ||
| 85 | spin_unlock(&bhead->lock); | ||
| 86 | |||
| 87 | #ifdef INET_REFCNT_DEBUG | ||
| 88 | if (atomic_read(&tw->tw_refcnt) != 1) { | ||
| 89 | printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, | ||
| 90 | atomic_read(&tw->tw_refcnt)); | ||
| 91 | } | ||
| 92 | #endif | ||
| 93 | tcp_tw_put(tw); | ||
| 94 | } | ||
| 95 | |||
| 96 | /* | 69 | /* |
| 97 | * * Main purpose of TIME-WAIT state is to close connection gracefully, | 70 | * * Main purpose of TIME-WAIT state is to close connection gracefully, |
| 98 | * when one of the ends sits in LAST-ACK or CLOSING retransmitting FIN | 71 | * when one of the ends sits in LAST-ACK or CLOSING retransmitting FIN |
| @@ -122,19 +95,20 @@ static void tcp_timewait_kill(struct tcp_tw_bucket *tw) | |||
| 122 | * to avoid misread sequence numbers, states etc. --ANK | 95 | * to avoid misread sequence numbers, states etc. --ANK |
| 123 | */ | 96 | */ |
| 124 | enum tcp_tw_status | 97 | enum tcp_tw_status |
| 125 | tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, | 98 | tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, |
| 126 | struct tcphdr *th, unsigned len) | 99 | const struct tcphdr *th) |
| 127 | { | 100 | { |
| 101 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); | ||
| 128 | struct tcp_options_received tmp_opt; | 102 | struct tcp_options_received tmp_opt; |
| 129 | int paws_reject = 0; | 103 | int paws_reject = 0; |
| 130 | 104 | ||
| 131 | tmp_opt.saw_tstamp = 0; | 105 | tmp_opt.saw_tstamp = 0; |
| 132 | if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) { | 106 | if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { |
| 133 | tcp_parse_options(skb, &tmp_opt, 0); | 107 | tcp_parse_options(skb, &tmp_opt, 0); |
| 134 | 108 | ||
| 135 | if (tmp_opt.saw_tstamp) { | 109 | if (tmp_opt.saw_tstamp) { |
| 136 | tmp_opt.ts_recent = tw->tw_ts_recent; | 110 | tmp_opt.ts_recent = tcptw->tw_ts_recent; |
| 137 | tmp_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; | 111 | tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; |
| 138 | paws_reject = tcp_paws_check(&tmp_opt, th->rst); | 112 | paws_reject = tcp_paws_check(&tmp_opt, th->rst); |
| 139 | } | 113 | } |
| 140 | } | 114 | } |
| @@ -145,20 +119,20 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, | |||
| 145 | /* Out of window, send ACK */ | 119 | /* Out of window, send ACK */ |
| 146 | if (paws_reject || | 120 | if (paws_reject || |
| 147 | !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, | 121 | !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, |
| 148 | tw->tw_rcv_nxt, | 122 | tcptw->tw_rcv_nxt, |
| 149 | tw->tw_rcv_nxt + tw->tw_rcv_wnd)) | 123 | tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) |
| 150 | return TCP_TW_ACK; | 124 | return TCP_TW_ACK; |
| 151 | 125 | ||
| 152 | if (th->rst) | 126 | if (th->rst) |
| 153 | goto kill; | 127 | goto kill; |
| 154 | 128 | ||
| 155 | if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt)) | 129 | if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt)) |
| 156 | goto kill_with_rst; | 130 | goto kill_with_rst; |
| 157 | 131 | ||
| 158 | /* Dup ACK? */ | 132 | /* Dup ACK? */ |
| 159 | if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) || | 133 | if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || |
| 160 | TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { | 134 | TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { |
| 161 | tcp_tw_put(tw); | 135 | inet_twsk_put(tw); |
| 162 | return TCP_TW_SUCCESS; | 136 | return TCP_TW_SUCCESS; |
| 163 | } | 137 | } |
| 164 | 138 | ||
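The out-of-window test above leans on tcp_in_window() from the top of this file, which must use wrap-safe serial arithmetic because sequence numbers live in a 32-bit circular space. An equivalent stand-alone version:

```c
#include <stdint.h>
#include <stdio.h>

/* Serial-number comparisons over the 32-bit TCP sequence space. */
static int before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static int after(uint32_t a, uint32_t b)  { return before(b, a); }

/* Same shape as tcp_in_window(): does [seq, end_seq] overlap the
 * window [s_win, e_win]?  Zero-length segments are accepted at the
 * left edge and, if empty, exactly at the right edge. */
static int tcp_in_window_model(uint32_t seq, uint32_t end_seq,
			       uint32_t s_win, uint32_t e_win)
{
	if (seq == s_win)
		return 1;
	if (after(end_seq, s_win) && before(seq, e_win))
		return 1;
	return seq == e_win && seq == end_seq;
}

int main(void)
{
	/* A bare ACK at rcv_nxt is in window; stale data is not. */
	printf("%d %d\n",
	       tcp_in_window_model(1000, 1000, 1000, 1500),
	       tcp_in_window_model(900, 950, 1000, 1500));
	return 0;
}
```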
| @@ -166,19 +140,19 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, | |||
| 166 | * reset. | 140 | * reset. |
| 167 | */ | 141 | */ |
| 168 | if (!th->fin || | 142 | if (!th->fin || |
| 169 | TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) { | 143 | TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { |
| 170 | kill_with_rst: | 144 | kill_with_rst: |
| 171 | tcp_tw_deschedule(tw); | 145 | inet_twsk_deschedule(tw, &tcp_death_row); |
| 172 | tcp_tw_put(tw); | 146 | inet_twsk_put(tw); |
| 173 | return TCP_TW_RST; | 147 | return TCP_TW_RST; |
| 174 | } | 148 | } |
| 175 | 149 | ||
| 176 | /* FIN arrived, enter true time-wait state. */ | 150 | /* FIN arrived, enter true time-wait state. */ |
| 177 | tw->tw_substate = TCP_TIME_WAIT; | 151 | tw->tw_substate = TCP_TIME_WAIT; |
| 178 | tw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 152 | tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
| 179 | if (tmp_opt.saw_tstamp) { | 153 | if (tmp_opt.saw_tstamp) { |
| 180 | tw->tw_ts_recent_stamp = xtime.tv_sec; | 154 | tcptw->tw_ts_recent_stamp = xtime.tv_sec; |
| 181 | tw->tw_ts_recent = tmp_opt.rcv_tsval; | 155 | tcptw->tw_ts_recent = tmp_opt.rcv_tsval; |
| 182 | } | 156 | } |
| 183 | 157 | ||
| 184 | /* I am shamed, but failed to make it more elegant. | 158 | /* I am shamed, but failed to make it more elegant. |
| @@ -187,11 +161,13 @@ kill_with_rst: | |||
| 187 | * do not understand recycling in any case, it is not | 161 | * do not understand recycling in any case, it is not |
| 188 | * a big problem in practice. --ANK */ | 162 | * a big problem in practice. --ANK */ |
| 189 | if (tw->tw_family == AF_INET && | 163 | if (tw->tw_family == AF_INET && |
| 190 | sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp && | 164 | tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && |
| 191 | tcp_v4_tw_remember_stamp(tw)) | 165 | tcp_v4_tw_remember_stamp(tw)) |
| 192 | tcp_tw_schedule(tw, tw->tw_timeout); | 166 | inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, |
| 167 | TCP_TIMEWAIT_LEN); | ||
| 193 | else | 168 | else |
| 194 | tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); | 169 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, |
| 170 | TCP_TIMEWAIT_LEN); | ||
| 195 | return TCP_TW_ACK; | 171 | return TCP_TW_ACK; |
| 196 | } | 172 | } |
| 197 | 173 | ||
| @@ -213,7 +189,7 @@ kill_with_rst: | |||
| 213 | */ | 189 | */ |
| 214 | 190 | ||
| 215 | if (!paws_reject && | 191 | if (!paws_reject && |
| 216 | (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt && | 192 | (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && |
| 217 | (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { | 193 | (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { |
| 218 | /* In window segment, it may be only reset or bare ack. */ | 194 | /* In window segment, it may be only reset or bare ack. */ |
| 219 | 195 | ||
| @@ -224,19 +200,20 @@ kill_with_rst: | |||
| 224 | */ | 200 | */ |
| 225 | if (sysctl_tcp_rfc1337 == 0) { | 201 | if (sysctl_tcp_rfc1337 == 0) { |
| 226 | kill: | 202 | kill: |
| 227 | tcp_tw_deschedule(tw); | 203 | inet_twsk_deschedule(tw, &tcp_death_row); |
| 228 | tcp_tw_put(tw); | 204 | inet_twsk_put(tw); |
| 229 | return TCP_TW_SUCCESS; | 205 | return TCP_TW_SUCCESS; |
| 230 | } | 206 | } |
| 231 | } | 207 | } |
| 232 | tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); | 208 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, |
| 209 | TCP_TIMEWAIT_LEN); | ||
| 233 | 210 | ||
| 234 | if (tmp_opt.saw_tstamp) { | 211 | if (tmp_opt.saw_tstamp) { |
| 235 | tw->tw_ts_recent = tmp_opt.rcv_tsval; | 212 | tcptw->tw_ts_recent = tmp_opt.rcv_tsval; |
| 236 | tw->tw_ts_recent_stamp = xtime.tv_sec; | 213 | tcptw->tw_ts_recent_stamp = xtime.tv_sec; |
| 237 | } | 214 | } |
| 238 | 215 | ||
| 239 | tcp_tw_put(tw); | 216 | inet_twsk_put(tw); |
| 240 | return TCP_TW_SUCCESS; | 217 | return TCP_TW_SUCCESS; |
| 241 | } | 218 | } |
| 242 | 219 | ||
| @@ -258,9 +235,10 @@ kill: | |||
| 258 | */ | 235 | */ |
| 259 | 236 | ||
| 260 | if (th->syn && !th->rst && !th->ack && !paws_reject && | 237 | if (th->syn && !th->rst && !th->ack && !paws_reject && |
| 261 | (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) || | 238 | (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) || |
| 262 | (tmp_opt.saw_tstamp && (s32)(tw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { | 239 | (tmp_opt.saw_tstamp && |
| 263 | u32 isn = tw->tw_snd_nxt + 65535 + 2; | 240 | (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { |
| 241 | u32 isn = tcptw->tw_snd_nxt + 65535 + 2; | ||
| 264 | if (isn == 0) | 242 | if (isn == 0) |
| 265 | isn++; | 243 | isn++; |
| 266 | TCP_SKB_CB(skb)->when = isn; | 244 | TCP_SKB_CB(skb)->when = isn; |
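Reusing a TIME_WAIT pair for a fresh SYN is only safe if the new ISN lands strictly above anything the old incarnation may still have in flight, so the code jumps 64K+2 past the old snd_nxt and steers clear of 0, which would read as "no override". As a tiny model:

```c
#include <stdint.h>
#include <stdio.h>

/* ISN for a SYN that recycles a TIME_WAIT four-tuple, as above. */
static uint32_t reuse_isn(uint32_t tw_snd_nxt)
{
	uint32_t isn = tw_snd_nxt + 65535 + 2;

	if (isn == 0)		/* 0 means "pick one", so skip it */
		isn++;
	return isn;
}

int main(void)
{
	/* Wraps harmlessly modulo 2^32 near the top of the space. */
	printf("isn=%u\n", reuse_isn(4000000000u));
	return 0;
}
```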
| @@ -278,107 +256,57 @@ kill: | |||
| 278 | * Do not reschedule in the last case. | 256 | * Do not reschedule in the last case. |
| 279 | */ | 257 | */ |
| 280 | if (paws_reject || th->ack) | 258 | if (paws_reject || th->ack) |
| 281 | tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); | 259 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, |
| 260 | TCP_TIMEWAIT_LEN); | ||
| 282 | 261 | ||
| 283 | /* Send ACK. Note, we do not put the bucket, | 262 | /* Send ACK. Note, we do not put the bucket, |
| 284 | * it will be released by caller. | 263 | * it will be released by caller. |
| 285 | */ | 264 | */ |
| 286 | return TCP_TW_ACK; | 265 | return TCP_TW_ACK; |
| 287 | } | 266 | } |
| 288 | tcp_tw_put(tw); | 267 | inet_twsk_put(tw); |
| 289 | return TCP_TW_SUCCESS; | 268 | return TCP_TW_SUCCESS; |
| 290 | } | 269 | } |
| 291 | 270 | ||
| 292 | /* Enter the time wait state. This is called with locally disabled BH. | ||
| 293 | * Essentially we whip up a timewait bucket, copy the | ||
| 294 | * relevant info into it from the SK, and mess with hash chains | ||
| 295 | * and list linkage. | ||
| 296 | */ | ||
| 297 | static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) | ||
| 298 | { | ||
| 299 | struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent]; | ||
| 300 | struct tcp_bind_hashbucket *bhead; | ||
| 301 | |||
| 302 | /* Step 1: Put TW into bind hash. Original socket stays there too. | ||
| 303 | Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in | ||
| 304 | binding cache, even if it is closed. | ||
| 305 | */ | ||
| 306 | bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)]; | ||
| 307 | spin_lock(&bhead->lock); | ||
| 308 | tw->tw_tb = tcp_sk(sk)->bind_hash; | ||
| 309 | BUG_TRAP(tcp_sk(sk)->bind_hash); | ||
| 310 | tw_add_bind_node(tw, &tw->tw_tb->owners); | ||
| 311 | spin_unlock(&bhead->lock); | ||
| 312 | |||
| 313 | write_lock(&ehead->lock); | ||
| 314 | |||
| 315 | /* Step 2: Remove SK from established hash. */ | ||
| 316 | if (__sk_del_node_init(sk)) | ||
| 317 | sock_prot_dec_use(sk->sk_prot); | ||
| 318 | |||
| 319 | /* Step 3: Hash TW into TIMEWAIT half of established hash table. */ | ||
| 320 | tw_add_node(tw, &(ehead + tcp_ehash_size)->chain); | ||
| 321 | atomic_inc(&tw->tw_refcnt); | ||
| 322 | |||
| 323 | write_unlock(&ehead->lock); | ||
| 324 | } | ||
| 325 | |||
| 326 | /* | 271 | /* |
| 327 | * Move a socket to time-wait or dead fin-wait-2 state. | 272 | * Move a socket to time-wait or dead fin-wait-2 state. |
| 328 | */ | 273 | */ |
| 329 | void tcp_time_wait(struct sock *sk, int state, int timeo) | 274 | void tcp_time_wait(struct sock *sk, int state, int timeo) |
| 330 | { | 275 | { |
| 331 | struct tcp_tw_bucket *tw = NULL; | 276 | struct inet_timewait_sock *tw = NULL; |
| 332 | struct tcp_sock *tp = tcp_sk(sk); | 277 | const struct tcp_sock *tp = tcp_sk(sk); |
| 333 | int recycle_ok = 0; | 278 | int recycle_ok = 0; |
| 334 | 279 | ||
| 335 | if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp) | 280 | if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) |
| 336 | recycle_ok = tp->af_specific->remember_stamp(sk); | 281 | recycle_ok = tp->af_specific->remember_stamp(sk); |
| 337 | 282 | ||
| 338 | if (tcp_tw_count < sysctl_tcp_max_tw_buckets) | 283 | if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) |
| 339 | tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); | 284 | tw = inet_twsk_alloc(sk, state); |
| 340 | |||
| 341 | if(tw != NULL) { | ||
| 342 | struct inet_sock *inet = inet_sk(sk); | ||
| 343 | int rto = (tp->rto<<2) - (tp->rto>>1); | ||
| 344 | |||
| 345 | /* Give us an identity. */ | ||
| 346 | tw->tw_daddr = inet->daddr; | ||
| 347 | tw->tw_rcv_saddr = inet->rcv_saddr; | ||
| 348 | tw->tw_bound_dev_if = sk->sk_bound_dev_if; | ||
| 349 | tw->tw_num = inet->num; | ||
| 350 | tw->tw_state = TCP_TIME_WAIT; | ||
| 351 | tw->tw_substate = state; | ||
| 352 | tw->tw_sport = inet->sport; | ||
| 353 | tw->tw_dport = inet->dport; | ||
| 354 | tw->tw_family = sk->sk_family; | ||
| 355 | tw->tw_reuse = sk->sk_reuse; | ||
| 356 | tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; | ||
| 357 | atomic_set(&tw->tw_refcnt, 1); | ||
| 358 | 285 | ||
| 359 | tw->tw_hashent = sk->sk_hashent; | 286 | if (tw != NULL) { |
| 360 | tw->tw_rcv_nxt = tp->rcv_nxt; | 287 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); |
| 361 | tw->tw_snd_nxt = tp->snd_nxt; | 288 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 362 | tw->tw_rcv_wnd = tcp_receive_window(tp); | 289 | const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); |
| 363 | tw->tw_ts_recent = tp->rx_opt.ts_recent; | 290 | |
| 364 | tw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; | 291 | tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; |
| 365 | tw_dead_node_init(tw); | 292 | tcptw->tw_rcv_nxt = tp->rcv_nxt; |
| 293 | tcptw->tw_snd_nxt = tp->snd_nxt; | ||
| 294 | tcptw->tw_rcv_wnd = tcp_receive_window(tp); | ||
| 295 | tcptw->tw_ts_recent = tp->rx_opt.ts_recent; | ||
| 296 | tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; | ||
| 366 | 297 | ||
| 367 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 298 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 368 | if (tw->tw_family == PF_INET6) { | 299 | if (tw->tw_family == PF_INET6) { |
| 369 | struct ipv6_pinfo *np = inet6_sk(sk); | 300 | struct ipv6_pinfo *np = inet6_sk(sk); |
| 301 | struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); | ||
| 370 | 302 | ||
| 371 | ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr); | 303 | ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr); |
| 372 | ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr); | 304 | ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr); |
| 373 | tw->tw_v6_ipv6only = np->ipv6only; | 305 | tw->tw_ipv6only = np->ipv6only; |
| 374 | } else { | ||
| 375 | memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr)); | ||
| 376 | memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr)); | ||
| 377 | tw->tw_v6_ipv6only = 0; | ||
| 378 | } | 306 | } |
| 379 | #endif | 307 | #endif |
| 380 | /* Linkage updates. */ | 308 | /* Linkage updates. */ |
| 381 | __tcp_tw_hashdance(sk, tw); | 309 | __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); |
| 382 | 310 | ||
| 383 | /* Get the TIME_WAIT timeout firing. */ | 311 | /* Get the TIME_WAIT timeout firing. */ |
| 384 | if (timeo < rto) | 312 | if (timeo < rto) |
| @@ -392,8 +320,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) | |||
| 392 | timeo = TCP_TIMEWAIT_LEN; | 320 | timeo = TCP_TIMEWAIT_LEN; |
| 393 | } | 321 | } |
| 394 | 322 | ||
| 395 | tcp_tw_schedule(tw, timeo); | 323 | inet_twsk_schedule(tw, &tcp_death_row, timeo, |
| 396 | tcp_tw_put(tw); | 324 | TCP_TIMEWAIT_LEN); |
| 325 | inet_twsk_put(tw); | ||
| 397 | } else { | 326 | } else { |
| 398 | /* Sorry, if we're out of memory, just CLOSE this | 327 | /* Sorry, if we're out of memory, just CLOSE this |
| 399 | * socket up. We've got bigger problems than | 328 | * socket up. We've got bigger problems than |
| @@ -407,277 +336,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) | |||
| 407 | tcp_done(sk); | 336 | tcp_done(sk); |
| 408 | } | 337 | } |
| 409 | 338 | ||
| 410 | /* Kill off TIME_WAIT sockets once their lifetime has expired. */ | ||
| 411 | static int tcp_tw_death_row_slot; | ||
| 412 | |||
| 413 | static void tcp_twkill(unsigned long); | ||
| 414 | |||
| 415 | /* TIME_WAIT reaping mechanism. */ | ||
| 416 | #define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */ | ||
| 417 | #define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS) | ||
| 418 | |||
| 419 | #define TCP_TWKILL_QUOTA 100 | ||
| 420 | |||
| 421 | static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS]; | ||
| 422 | static DEFINE_SPINLOCK(tw_death_lock); | ||
| 423 | static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0); | ||
| 424 | static void twkill_work(void *); | ||
| 425 | static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL); | ||
| 426 | static u32 twkill_thread_slots; | ||
| 427 | |||
| 428 | /* Returns non-zero if quota exceeded. */ | ||
| 429 | static int tcp_do_twkill_work(int slot, unsigned int quota) | ||
| 430 | { | ||
| 431 | struct tcp_tw_bucket *tw; | ||
| 432 | struct hlist_node *node; | ||
| 433 | unsigned int killed; | ||
| 434 | int ret; | ||
| 435 | |||
| 436 | /* NOTE: compare this to previous version where lock | ||
| 437 | * was released after detaching chain. It was racy, | ||
| 438 | * because tw buckets are scheduled in a non-serialized context | ||
| 439 | * in 2.3 (with netfilter), and with softnet it is common, because | ||
| 440 | * soft irqs are not sequenced. | ||
| 441 | */ | ||
| 442 | killed = 0; | ||
| 443 | ret = 0; | ||
| 444 | rescan: | ||
| 445 | tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) { | ||
| 446 | __tw_del_dead_node(tw); | ||
| 447 | spin_unlock(&tw_death_lock); | ||
| 448 | tcp_timewait_kill(tw); | ||
| 449 | tcp_tw_put(tw); | ||
| 450 | killed++; | ||
| 451 | spin_lock(&tw_death_lock); | ||
| 452 | if (killed > quota) { | ||
| 453 | ret = 1; | ||
| 454 | break; | ||
| 455 | } | ||
| 456 | |||
| 457 | /* While we dropped tw_death_lock, another cpu may have | ||
| 458 | * killed off the next TW bucket in the list, therefore | ||
| 459 | * do a fresh re-read of the hlist head node with the | ||
| 460 | * lock reacquired. We still use the hlist traversal | ||
| 461 | * macro in order to get the prefetches. | ||
| 462 | */ | ||
| 463 | goto rescan; | ||
| 464 | } | ||
| 465 | |||
| 466 | tcp_tw_count -= killed; | ||
| 467 | NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed); | ||
| 468 | |||
| 469 | return ret; | ||
| 470 | } | ||
| 471 | |||
| 472 | static void tcp_twkill(unsigned long dummy) | ||
| 473 | { | ||
| 474 | int need_timer, ret; | ||
| 475 | |||
| 476 | spin_lock(&tw_death_lock); | ||
| 477 | |||
| 478 | if (tcp_tw_count == 0) | ||
| 479 | goto out; | ||
| 480 | |||
| 481 | need_timer = 0; | ||
| 482 | ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA); | ||
| 483 | if (ret) { | ||
| 484 | twkill_thread_slots |= (1 << tcp_tw_death_row_slot); | ||
| 485 | mb(); | ||
| 486 | schedule_work(&tcp_twkill_work); | ||
| 487 | need_timer = 1; | ||
| 488 | } else { | ||
| 489 | /* We purged the entire slot, anything left? */ | ||
| 490 | if (tcp_tw_count) | ||
| 491 | need_timer = 1; | ||
| 492 | } | ||
| 493 | tcp_tw_death_row_slot = | ||
| 494 | ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); | ||
| 495 | if (need_timer) | ||
| 496 | mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD); | ||
| 497 | out: | ||
| 498 | spin_unlock(&tw_death_lock); | ||
| 499 | } | ||
| 500 | |||
| 501 | extern void twkill_slots_invalid(void); | ||
| 502 | |||
| 503 | static void twkill_work(void *dummy) | ||
| 504 | { | ||
| 505 | int i; | ||
| 506 | |||
| 507 | if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8)) | ||
| 508 | twkill_slots_invalid(); | ||
| 509 | |||
| 510 | while (twkill_thread_slots) { | ||
| 511 | spin_lock_bh(&tw_death_lock); | ||
| 512 | for (i = 0; i < TCP_TWKILL_SLOTS; i++) { | ||
| 513 | if (!(twkill_thread_slots & (1 << i))) | ||
| 514 | continue; | ||
| 515 | |||
| 516 | while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) { | ||
| 517 | if (need_resched()) { | ||
| 518 | spin_unlock_bh(&tw_death_lock); | ||
| 519 | schedule(); | ||
| 520 | spin_lock_bh(&tw_death_lock); | ||
| 521 | } | ||
| 522 | } | ||
| 523 | |||
| 524 | twkill_thread_slots &= ~(1 << i); | ||
| 525 | } | ||
| 526 | spin_unlock_bh(&tw_death_lock); | ||
| 527 | } | ||
| 528 | } | ||
| 529 | |||
| 530 | /* These are always called from BH context. See callers in | ||
| 531 | * tcp_input.c to verify this. | ||
| 532 | */ | ||
| 533 | |||
| 534 | /* This is for handling early-kills of TIME_WAIT sockets. */ | ||
| 535 | void tcp_tw_deschedule(struct tcp_tw_bucket *tw) | ||
| 536 | { | ||
| 537 | spin_lock(&tw_death_lock); | ||
| 538 | if (tw_del_dead_node(tw)) { | ||
| 539 | tcp_tw_put(tw); | ||
| 540 | if (--tcp_tw_count == 0) | ||
| 541 | del_timer(&tcp_tw_timer); | ||
| 542 | } | ||
| 543 | spin_unlock(&tw_death_lock); | ||
| 544 | tcp_timewait_kill(tw); | ||
| 545 | } | ||
| 546 | |||
| 547 | /* Short-time timewait calendar */ | ||
| 548 | |||
| 549 | static int tcp_twcal_hand = -1; | ||
| 550 | static int tcp_twcal_jiffie; | ||
| 551 | static void tcp_twcal_tick(unsigned long); | ||
| 552 | static struct timer_list tcp_twcal_timer = | ||
| 553 | TIMER_INITIALIZER(tcp_twcal_tick, 0, 0); | ||
| 554 | static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS]; | ||
| 555 | |||
| 556 | static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo) | ||
| 557 | { | ||
| 558 | struct hlist_head *list; | ||
| 559 | int slot; | ||
| 560 | |||
| 561 | /* timeout := RTO * 3.5 | ||
| 562 | * | ||
| 563 | * 3.5 = 1+2+0.5 to wait for two retransmits. | ||
| 564 | * | ||
| 565 | * RATIONALE: if FIN arrived and we entered TIME-WAIT state, | ||
| 566 | * our ACK acking that FIN can be lost. If N subsequent retransmitted | ||
| 567 | * FINs (or previous segments) are lost (probability of such an event | ||
| 568 | * is p^(N+1), where p is the probability of losing a single packet and | ||
| 569 | * time to detect the loss is about RTO*(2^N - 1) with exponential | ||
| 570 | * backoff). Normal timewait length is calculated so that we | ||
| 571 | * waited at least for one retransmitted FIN (maximal RTO is 120sec). | ||
| 572 | * [ BTW Linux, following BSD, violates this requirement by waiting | ||
| 573 | * only for 60sec, we should wait at least for 240 secs. | ||
| 574 | * Well, 240 consumes too many resources 8) | ||
| 575 | * ] | ||
| 576 | * This interval is not reduced to catch old duplicates and | ||
| 577 | * responses to our wandering segments living for two MSLs. | ||
| 578 | * However, if we use PAWS to detect | ||
| 579 | * old duplicates, we can reduce the interval to bounds required | ||
| 580 | * by RTO, rather than MSL. So, if peer understands PAWS, we | ||
| 581 | * kill tw bucket after 3.5*RTO (it is important that this number | ||
| 582 | * is greater than TS tick!) and detect old duplicates with help | ||
| 583 | * of PAWS. | ||
| 584 | */ | ||
| 585 | slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK; | ||
| 586 | |||
| 587 | spin_lock(&tw_death_lock); | ||
| 588 | |||
| 589 | /* Unlink it, if it was scheduled */ | ||
| 590 | if (tw_del_dead_node(tw)) | ||
| 591 | tcp_tw_count--; | ||
| 592 | else | ||
| 593 | atomic_inc(&tw->tw_refcnt); | ||
| 594 | |||
| 595 | if (slot >= TCP_TW_RECYCLE_SLOTS) { | ||
| 596 | /* Schedule to slow timer */ | ||
| 597 | if (timeo >= TCP_TIMEWAIT_LEN) { | ||
| 598 | slot = TCP_TWKILL_SLOTS-1; | ||
| 599 | } else { | ||
| 600 | slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD; | ||
| 601 | if (slot >= TCP_TWKILL_SLOTS) | ||
| 602 | slot = TCP_TWKILL_SLOTS-1; | ||
| 603 | } | ||
| 604 | tw->tw_ttd = jiffies + timeo; | ||
| 605 | slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1); | ||
| 606 | list = &tcp_tw_death_row[slot]; | ||
| 607 | } else { | ||
| 608 | tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK); | ||
| 609 | |||
| 610 | if (tcp_twcal_hand < 0) { | ||
| 611 | tcp_twcal_hand = 0; | ||
| 612 | tcp_twcal_jiffie = jiffies; | ||
| 613 | tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK); | ||
| 614 | add_timer(&tcp_twcal_timer); | ||
| 615 | } else { | ||
| 616 | if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK))) | ||
| 617 | mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK)); | ||
| 618 | slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1); | ||
| 619 | } | ||
| 620 | list = &tcp_twcal_row[slot]; | ||
| 621 | } | ||
| 622 | |||
| 623 | hlist_add_head(&tw->tw_death_node, list); | ||
| 624 | |||
| 625 | if (tcp_tw_count++ == 0) | ||
| 626 | mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); | ||
| 627 | spin_unlock(&tw_death_lock); | ||
| 628 | } | ||
| 629 | |||
| 630 | void tcp_twcal_tick(unsigned long dummy) | ||
| 631 | { | ||
| 632 | int n, slot; | ||
| 633 | unsigned long j; | ||
| 634 | unsigned long now = jiffies; | ||
| 635 | int killed = 0; | ||
| 636 | int adv = 0; | ||
| 637 | |||
| 638 | spin_lock(&tw_death_lock); | ||
| 639 | if (tcp_twcal_hand < 0) | ||
| 640 | goto out; | ||
| 641 | |||
| 642 | slot = tcp_twcal_hand; | ||
| 643 | j = tcp_twcal_jiffie; | ||
| 644 | |||
| 645 | for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) { | ||
| 646 | if (time_before_eq(j, now)) { | ||
| 647 | struct hlist_node *node, *safe; | ||
| 648 | struct tcp_tw_bucket *tw; | ||
| 649 | |||
| 650 | tw_for_each_inmate_safe(tw, node, safe, | ||
| 651 | &tcp_twcal_row[slot]) { | ||
| 652 | __tw_del_dead_node(tw); | ||
| 653 | tcp_timewait_kill(tw); | ||
| 654 | tcp_tw_put(tw); | ||
| 655 | killed++; | ||
| 656 | } | ||
| 657 | } else { | ||
| 658 | if (!adv) { | ||
| 659 | adv = 1; | ||
| 660 | tcp_twcal_jiffie = j; | ||
| 661 | tcp_twcal_hand = slot; | ||
| 662 | } | ||
| 663 | |||
| 664 | if (!hlist_empty(&tcp_twcal_row[slot])) { | ||
| 665 | mod_timer(&tcp_twcal_timer, j); | ||
| 666 | goto out; | ||
| 667 | } | ||
| 668 | } | ||
| 669 | j += (1<<TCP_TW_RECYCLE_TICK); | ||
| 670 | slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1); | ||
| 671 | } | ||
| 672 | tcp_twcal_hand = -1; | ||
| 673 | |||
| 674 | out: | ||
| 675 | if ((tcp_tw_count -= killed) == 0) | ||
| 676 | del_timer(&tcp_tw_timer); | ||
| 677 | NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed); | ||
| 678 | spin_unlock(&tw_death_lock); | ||
| 679 | } | ||
| 680 | |||
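The scheduling logic deleted here (and carried over into inet_twsk_schedule()) files each bucket into one of two structures: a fine-grained recycle calendar for short, RTO-scaled timeouts, and the eight-slot slow wheel for full-length TIME_WAIT. A self-contained model of the slot selection, with illustrative constants standing in for the HZ-dependent ones:

```c
#include <stdio.h>

#define RECYCLE_TICK    7	/* assumption: the real tick depends on HZ */
#define RECYCLE_SLOTS  32
#define TWKILL_SLOTS    8
#define TWKILL_PERIOD 750	/* TIMEWAIT_LEN / TWKILL_SLOTS */
#define TIMEWAIT_LEN  (TWKILL_PERIOD * TWKILL_SLOTS)

/* Returns the slot; *slow says which structure it belongs to. */
static int pick_slot(int timeo, int *slow)
{
	int slot = (timeo + (1 << RECYCLE_TICK) - 1) >> RECYCLE_TICK;

	if (slot >= RECYCLE_SLOTS) {		/* too far out: slow wheel */
		*slow = 1;
		if (timeo >= TIMEWAIT_LEN)
			return TWKILL_SLOTS - 1;
		slot = (timeo + TWKILL_PERIOD - 1) / TWKILL_PERIOD;
		return slot >= TWKILL_SLOTS ? TWKILL_SLOTS - 1 : slot;
	}
	*slow = 0;				/* recycle calendar */
	return slot;
}

int main(void)
{
	int slow, slot;

	slot = pick_slot(300, &slow);
	printf("timeo=300  -> %s slot %d\n", slow ? "slow" : "recycle", slot);
	slot = pick_slot(6000, &slow);
	printf("timeo=6000 -> %s slot %d\n", slow ? "slow" : "recycle", slot);
	return 0;
}
```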
| 681 | /* This is not only more efficient than what we used to do, it eliminates | 339 | /* This is not only more efficient than what we used to do, it eliminates |
| 682 | * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM | 340 | * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM |
| 683 | * | 341 | * |
| @@ -686,75 +344,27 @@ out: | |||
| 686 | */ | 344 | */ |
| 687 | struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) | 345 | struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) |
| 688 | { | 346 | { |
| 689 | /* allocate the newsk from the same slab of the master sock, | 347 | struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); |
| 690 | * if not, at sk_free time we'll try to free it from the wrong | ||
| 691 | * slabcache (i.e. is it TCPv4 or v6?), this is handled thru sk->sk_prot -acme */ | ||
| 692 | struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0); | ||
| 693 | 348 | ||
| 694 | if(newsk != NULL) { | 349 | if (newsk != NULL) { |
| 695 | struct inet_request_sock *ireq = inet_rsk(req); | 350 | const struct inet_request_sock *ireq = inet_rsk(req); |
| 696 | struct tcp_request_sock *treq = tcp_rsk(req); | 351 | struct tcp_request_sock *treq = tcp_rsk(req); |
| 352 | struct inet_connection_sock *newicsk = inet_csk(sk); | ||
| 697 | struct tcp_sock *newtp; | 353 | struct tcp_sock *newtp; |
| 698 | struct sk_filter *filter; | ||
| 699 | |||
| 700 | memcpy(newsk, sk, sizeof(struct tcp_sock)); | ||
| 701 | newsk->sk_state = TCP_SYN_RECV; | ||
| 702 | |||
| 703 | /* SANITY */ | ||
| 704 | sk_node_init(&newsk->sk_node); | ||
| 705 | tcp_sk(newsk)->bind_hash = NULL; | ||
| 706 | |||
| 707 | /* Clone the TCP header template */ | ||
| 708 | inet_sk(newsk)->dport = ireq->rmt_port; | ||
| 709 | |||
| 710 | sock_lock_init(newsk); | ||
| 711 | bh_lock_sock(newsk); | ||
| 712 | |||
| 713 | rwlock_init(&newsk->sk_dst_lock); | ||
| 714 | atomic_set(&newsk->sk_rmem_alloc, 0); | ||
| 715 | skb_queue_head_init(&newsk->sk_receive_queue); | ||
| 716 | atomic_set(&newsk->sk_wmem_alloc, 0); | ||
| 717 | skb_queue_head_init(&newsk->sk_write_queue); | ||
| 718 | atomic_set(&newsk->sk_omem_alloc, 0); | ||
| 719 | newsk->sk_wmem_queued = 0; | ||
| 720 | newsk->sk_forward_alloc = 0; | ||
| 721 | |||
| 722 | sock_reset_flag(newsk, SOCK_DONE); | ||
| 723 | newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; | ||
| 724 | newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; | ||
| 725 | newsk->sk_send_head = NULL; | ||
| 726 | rwlock_init(&newsk->sk_callback_lock); | ||
| 727 | skb_queue_head_init(&newsk->sk_error_queue); | ||
| 728 | newsk->sk_write_space = sk_stream_write_space; | ||
| 729 | |||
| 730 | if ((filter = newsk->sk_filter) != NULL) | ||
| 731 | sk_filter_charge(newsk, filter); | ||
| 732 | |||
| 733 | if (unlikely(xfrm_sk_clone_policy(newsk))) { | ||
| 734 | /* It is still raw copy of parent, so invalidate | ||
| 735 | * destructor and make plain sk_free() */ | ||
| 736 | newsk->sk_destruct = NULL; | ||
| 737 | sk_free(newsk); | ||
| 738 | return NULL; | ||
| 739 | } | ||
| 740 | 354 | ||
| 741 | /* Now setup tcp_sock */ | 355 | /* Now setup tcp_sock */ |
| 742 | newtp = tcp_sk(newsk); | 356 | newtp = tcp_sk(newsk); |
| 743 | newtp->pred_flags = 0; | 357 | newtp->pred_flags = 0; |
| 744 | newtp->rcv_nxt = treq->rcv_isn + 1; | 358 | newtp->rcv_nxt = treq->rcv_isn + 1; |
| 745 | newtp->snd_nxt = treq->snt_isn + 1; | 359 | newtp->snd_nxt = newtp->snd_una = newtp->snd_sml = treq->snt_isn + 1; |
| 746 | newtp->snd_una = treq->snt_isn + 1; | ||
| 747 | newtp->snd_sml = treq->snt_isn + 1; | ||
| 748 | 360 | ||
| 749 | tcp_prequeue_init(newtp); | 361 | tcp_prequeue_init(newtp); |
| 750 | 362 | ||
| 751 | tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); | 363 | tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); |
| 752 | 364 | ||
| 753 | newtp->retransmits = 0; | ||
| 754 | newtp->backoff = 0; | ||
| 755 | newtp->srtt = 0; | 365 | newtp->srtt = 0; |
| 756 | newtp->mdev = TCP_TIMEOUT_INIT; | 366 | newtp->mdev = TCP_TIMEOUT_INIT; |
| 757 | newtp->rto = TCP_TIMEOUT_INIT; | 367 | newicsk->icsk_rto = TCP_TIMEOUT_INIT; |
| 758 | 368 | ||
| 759 | newtp->packets_out = 0; | 369 | newtp->packets_out = 0; |
| 760 | newtp->left_out = 0; | 370 | newtp->left_out = 0; |
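Everything deleted at the top of this function is what inet_csk_clone() now does generically: copy the listener wholesale, then re-initialize only the per-instance fields. Schematically, with purely illustrative names and values:

```c
#include <stdlib.h>

struct sock_model {
	int   state;
	int   refcnt;
	long  rto;
	void *bind_hash;	/* must not be shared with the listener */
};

static struct sock_model *csk_clone_model(const struct sock_model *parent)
{
	struct sock_model *child = malloc(sizeof(*child));

	if (!child)
		return NULL;
	*child = *parent;	/* bulk copy, like the old memcpy of tcp_sock */
	child->state     = 2;		/* stand-in for TCP_SYN_RECV */
	child->refcnt    = 2;		/* one ref for the hash, one for caller */
	child->bind_hash = NULL;	/* sanity: detach from parent's bucket */
	return child;
}

int main(void)
{
	struct sock_model listener = { 1, 1, 200, &listener };
	struct sock_model *child = csk_clone_model(&listener);

	free(child);
	return 0;
}
```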
| @@ -774,9 +384,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
| 774 | newtp->frto_counter = 0; | 384 | newtp->frto_counter = 0; |
| 775 | newtp->frto_highmark = 0; | 385 | newtp->frto_highmark = 0; |
| 776 | 386 | ||
| 777 | newtp->ca_ops = &tcp_reno; | 387 | newicsk->icsk_ca_ops = &tcp_reno; |
| 778 | 388 | ||
| 779 | tcp_set_ca_state(newtp, TCP_CA_Open); | 389 | tcp_set_ca_state(newsk, TCP_CA_Open); |
| 780 | tcp_init_xmit_timers(newsk); | 390 | tcp_init_xmit_timers(newsk); |
| 781 | skb_queue_head_init(&newtp->out_of_order_queue); | 391 | skb_queue_head_init(&newtp->out_of_order_queue); |
| 782 | newtp->rcv_wup = treq->rcv_isn + 1; | 392 | newtp->rcv_wup = treq->rcv_isn + 1; |
| @@ -789,26 +399,12 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
| 789 | newtp->rx_opt.dsack = 0; | 399 | newtp->rx_opt.dsack = 0; |
| 790 | newtp->rx_opt.eff_sacks = 0; | 400 | newtp->rx_opt.eff_sacks = 0; |
| 791 | 401 | ||
| 792 | newtp->probes_out = 0; | ||
| 793 | newtp->rx_opt.num_sacks = 0; | 402 | newtp->rx_opt.num_sacks = 0; |
| 794 | newtp->urg_data = 0; | 403 | newtp->urg_data = 0; |
| 795 | /* Deinitialize accept_queue to trap illegal accesses. */ | ||
| 796 | memset(&newtp->accept_queue, 0, sizeof(newtp->accept_queue)); | ||
| 797 | |||
| 798 | /* Back to base struct sock members. */ | ||
| 799 | newsk->sk_err = 0; | ||
| 800 | newsk->sk_priority = 0; | ||
| 801 | atomic_set(&newsk->sk_refcnt, 2); | ||
| 802 | #ifdef INET_REFCNT_DEBUG | ||
| 803 | atomic_inc(&inet_sock_nr); | ||
| 804 | #endif | ||
| 805 | atomic_inc(&tcp_sockets_allocated); | ||
| 806 | 404 | ||
| 807 | if (sock_flag(newsk, SOCK_KEEPOPEN)) | 405 | if (sock_flag(newsk, SOCK_KEEPOPEN)) |
| 808 | tcp_reset_keepalive_timer(newsk, | 406 | inet_csk_reset_keepalive_timer(newsk, |
| 809 | keepalive_time_when(newtp)); | 407 | keepalive_time_when(newtp)); |
| 810 | newsk->sk_socket = NULL; | ||
| 811 | newsk->sk_sleep = NULL; | ||
| 812 | 408 | ||
| 813 | newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; | 409 | newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; |
| 814 | if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { | 410 | if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { |
| @@ -838,7 +434,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
| 838 | newtp->tcp_header_len = sizeof(struct tcphdr); | 434 | newtp->tcp_header_len = sizeof(struct tcphdr); |
| 839 | } | 435 | } |
| 840 | if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) | 436 | if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) |
| 841 | newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len; | 437 | newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; |
| 842 | newtp->rx_opt.mss_clamp = req->mss; | 438 | newtp->rx_opt.mss_clamp = req->mss; |
| 843 | TCP_ECN_openreq_child(newtp, req); | 439 | TCP_ECN_openreq_child(newtp, req); |
| 844 | if (newtp->ecn_flags&TCP_ECN_OK) | 440 | if (newtp->ecn_flags&TCP_ECN_OK) |
| @@ -934,9 +530,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, | |||
| 934 | does sequence test, SYN is truncated, and thus we consider | 530 | does sequence test, SYN is truncated, and thus we consider |
| 935 | it a bare ACK. | 531 | it a bare ACK. |
| 936 | 532 | ||
| 937 | If tp->defer_accept, we silently drop this bare ACK. Otherwise, | 533 | If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this |
| 938 | we create an established connection. Both ends (listening sockets) | 534 | bare ACK. Otherwise, we create an established connection. Both |
| 939 | accept the new incoming connection and try to talk to each other. 8-) | 535 | ends (listening sockets) accept the new incoming connection and try |
| 536 | to talk to each other. 8-) | ||
| 940 | 537 | ||
| 941 | Note: This case is both harmless and rare. The possibility is about the | 538 | Note: This case is both harmless and rare. The possibility is about the |
| 942 | same as us discovering intelligent life on another planet tomorrow. | 539 | same as us discovering intelligent life on another planet tomorrow. |
| @@ -1003,7 +600,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, | |||
| 1003 | return NULL; | 600 | return NULL; |
| 1004 | 601 | ||
| 1005 | /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ | 602 | /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ |
| 1006 | if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { | 603 | if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && |
| 604 | TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { | ||
| 1007 | inet_rsk(req)->acked = 1; | 605 | inet_rsk(req)->acked = 1; |
| 1008 | return NULL; | 606 | return NULL; |
| 1009 | } | 607 | } |
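The defer-accept knob merely moved from tcp_sock into the icsk accept queue; the bare-ACK test itself is unchanged. The third-handshake ACK carries no payload, so its end_seq is exactly rcv_isn + 1, and with defer-accept armed the request stays in the SYN queue until data shows up. The predicate in isolation:

```c
#include <stdint.h>
#include <stdio.h>

static int drop_bare_ack(int defer_accept, uint32_t end_seq, uint32_t rcv_isn)
{
	return defer_accept && end_seq == rcv_isn + 1;
}

int main(void)
{
	printf("%d %d\n",
	       drop_bare_ack(1, 101, 100),	/* bare ACK: silently dropped */
	       drop_bare_ack(1, 151, 100));	/* ACK with data: proceeds */
	return 0;
}
```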
| @@ -1018,10 +616,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, | |||
| 1018 | if (child == NULL) | 616 | if (child == NULL) |
| 1019 | goto listen_overflow; | 617 | goto listen_overflow; |
| 1020 | 618 | ||
| 1021 | tcp_synq_unlink(tp, req, prev); | 619 | inet_csk_reqsk_queue_unlink(sk, req, prev); |
| 1022 | tcp_synq_removed(sk, req); | 620 | inet_csk_reqsk_queue_removed(sk, req); |
| 1023 | 621 | ||
| 1024 | tcp_acceptq_queue(sk, req, child); | 622 | inet_csk_reqsk_queue_add(sk, req, child); |
| 1025 | return child; | 623 | return child; |
| 1026 | 624 | ||
| 1027 | listen_overflow: | 625 | listen_overflow: |
| @@ -1035,7 +633,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, | |||
| 1035 | if (!(flg & TCP_FLAG_RST)) | 633 | if (!(flg & TCP_FLAG_RST)) |
| 1036 | req->rsk_ops->send_reset(skb); | 634 | req->rsk_ops->send_reset(skb); |
| 1037 | 635 | ||
| 1038 | tcp_synq_drop(sk, req, prev); | 636 | inet_csk_reqsk_queue_drop(sk, req, prev); |
| 1039 | return NULL; | 637 | return NULL; |
| 1040 | } | 638 | } |
| 1041 | 639 | ||
| @@ -1074,4 +672,3 @@ EXPORT_SYMBOL(tcp_check_req); | |||
| 1074 | EXPORT_SYMBOL(tcp_child_process); | 672 | EXPORT_SYMBOL(tcp_child_process); |
| 1075 | EXPORT_SYMBOL(tcp_create_openreq_child); | 673 | EXPORT_SYMBOL(tcp_create_openreq_child); |
| 1076 | EXPORT_SYMBOL(tcp_timewait_state_process); | 674 | EXPORT_SYMBOL(tcp_timewait_state_process); |
| 1077 | EXPORT_SYMBOL(tcp_tw_deschedule); | ||
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index dd30dd137b74..75b68116682a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
| @@ -105,18 +105,19 @@ static __u16 tcp_advertise_mss(struct sock *sk) | |||
| 105 | 105 | ||
| 106 | /* RFC2861. Reset CWND after idle period longer RTO to "restart window". | 106 | /* RFC2861. Reset CWND after idle period longer RTO to "restart window". |
| 107 | * This is the first part of cwnd validation mechanism. */ | 107 | * This is the first part of cwnd validation mechanism. */ |
| 108 | static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) | 108 | static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) |
| 109 | { | 109 | { |
| 110 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 110 | s32 delta = tcp_time_stamp - tp->lsndtime; | 111 | s32 delta = tcp_time_stamp - tp->lsndtime; |
| 111 | u32 restart_cwnd = tcp_init_cwnd(tp, dst); | 112 | u32 restart_cwnd = tcp_init_cwnd(tp, dst); |
| 112 | u32 cwnd = tp->snd_cwnd; | 113 | u32 cwnd = tp->snd_cwnd; |
| 113 | 114 | ||
| 114 | tcp_ca_event(tp, CA_EVENT_CWND_RESTART); | 115 | tcp_ca_event(sk, CA_EVENT_CWND_RESTART); |
| 115 | 116 | ||
| 116 | tp->snd_ssthresh = tcp_current_ssthresh(tp); | 117 | tp->snd_ssthresh = tcp_current_ssthresh(sk); |
| 117 | restart_cwnd = min(restart_cwnd, cwnd); | 118 | restart_cwnd = min(restart_cwnd, cwnd); |
| 118 | 119 | ||
| 119 | while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd) | 120 | while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) |
| 120 | cwnd >>= 1; | 121 | cwnd >>= 1; |
| 121 | tp->snd_cwnd = max(cwnd, restart_cwnd); | 122 | tp->snd_cwnd = max(cwnd, restart_cwnd); |
| 122 | tp->snd_cwnd_stamp = tcp_time_stamp; | 123 | tp->snd_cwnd_stamp = tcp_time_stamp; |
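Apart from taking struct sock and reading inet_csk(sk)->icsk_rto, tcp_cwnd_restart() keeps the RFC 2861 arithmetic intact: halve cwnd once per full RTO of idle time, never going below the restart window. A standalone sketch of the rule in abstract time units (the ssthresh bookkeeping from the real function is omitted):

```c
#include <stdint.h>

/* One halving of cwnd per full RTO of idle time, floored at the
 * restart window, as in tcp_cwnd_restart() above. */
static uint32_t cwnd_after_idle(uint32_t cwnd, uint32_t restart_cwnd,
                                int32_t idle, uint32_t rto)
{
	if (restart_cwnd > cwnd)
		restart_cwnd = cwnd;            /* min(restart_cwnd, cwnd) */

	while ((idle -= (int32_t)rto) > 0 && cwnd > restart_cwnd)
		cwnd >>= 1;                     /* halve once per idle RTO */

	return cwnd > restart_cwnd ? cwnd : restart_cwnd;
}
```

For instance, cwnd 32 with restart window 4 and an idle gap just over three RTOs comes back as 4, matching what the kernel loop computes.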
| @@ -126,26 +127,25 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) | |||
| 126 | static inline void tcp_event_data_sent(struct tcp_sock *tp, | 127 | static inline void tcp_event_data_sent(struct tcp_sock *tp, |
| 127 | struct sk_buff *skb, struct sock *sk) | 128 | struct sk_buff *skb, struct sock *sk) |
| 128 | { | 129 | { |
| 129 | u32 now = tcp_time_stamp; | 130 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 131 | const u32 now = tcp_time_stamp; | ||
| 130 | 132 | ||
| 131 | if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto) | 133 | if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto) |
| 132 | tcp_cwnd_restart(tp, __sk_dst_get(sk)); | 134 | tcp_cwnd_restart(sk, __sk_dst_get(sk)); |
| 133 | 135 | ||
| 134 | tp->lsndtime = now; | 136 | tp->lsndtime = now; |
| 135 | 137 | ||
| 136 | /* If it is a reply sent within ATO of the last received | 138 | /* If it is a reply sent within ATO of the last received |
| 137 | * packet, enter pingpong mode. | 139 | * packet, enter pingpong mode. |
| 138 | */ | 140 | */ |
| 139 | if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato) | 141 | if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) |
| 140 | tp->ack.pingpong = 1; | 142 | icsk->icsk_ack.pingpong = 1; |
| 141 | } | 143 | } |
| 142 | 144 | ||
| 143 | static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) | 145 | static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) |
| 144 | { | 146 | { |
| 145 | struct tcp_sock *tp = tcp_sk(sk); | 147 | tcp_dec_quickack_mode(sk, pkts); |
| 146 | 148 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); | |
| 147 | tcp_dec_quickack_mode(tp, pkts); | ||
| 148 | tcp_clear_xmit_timer(sk, TCP_TIME_DACK); | ||
| 149 | } | 149 | } |
| 150 | 150 | ||
| 151 | /* Determine a window scaling and initial window to offer. | 151 | /* Determine a window scaling and initial window to offer. |
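The two event helpers above now keep delayed-ACK state in icsk_ack, but their heuristics are unchanged and easy to miss in the noise: data sent after an RTO-long idle gap with nothing in flight triggers the cwnd restart, and data sent within one ATO of the last received segment marks the flow interactive ("pingpong"), so ACKs get delayed and piggybacked. A hedged sketch (invented struct; the kernel's jiffies-wraparound arithmetic is only loosely reproduced):

```c
#include <stdbool.h>
#include <stdint.h>

struct send_evt {
	uint32_t lsndtime;   /* last send, ticks    */
	uint32_t lrcvtime;   /* last receive, ticks */
	uint32_t ato;        /* delayed-ACK timeout */
	uint32_t rto;        /* retransmit timeout  */
	bool     pingpong;   /* interactive flow?   */
};

/* Returns true when the caller should run the cwnd restart. */
static bool on_data_sent(struct send_evt *e, uint32_t now, uint32_t packets_out)
{
	bool restart = packets_out == 0 &&
	               (int32_t)(now - e->lsndtime) > (int32_t)e->rto;

	e->lsndtime = now;
	if (now - e->lrcvtime < e->ato)
		e->pingpong = true;   /* data sent soon after receiving:
	                               * delay ACKs, piggyback instead */
	return restart;
}
```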
| @@ -265,6 +265,7 @@ static __inline__ u16 tcp_select_window(struct sock *sk) | |||
| 265 | static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) | 265 | static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) |
| 266 | { | 266 | { |
| 267 | if (skb != NULL) { | 267 | if (skb != NULL) { |
| 268 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 268 | struct inet_sock *inet = inet_sk(sk); | 269 | struct inet_sock *inet = inet_sk(sk); |
| 269 | struct tcp_sock *tp = tcp_sk(sk); | 270 | struct tcp_sock *tp = tcp_sk(sk); |
| 270 | struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); | 271 | struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); |
| @@ -280,8 +281,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) | |||
| 280 | #define SYSCTL_FLAG_SACK 0x4 | 281 | #define SYSCTL_FLAG_SACK 0x4 |
| 281 | 282 | ||
| 282 | /* If congestion control is doing timestamping */ | 283 | /* If congestion control is doing timestamping */ |
| 283 | if (tp->ca_ops->rtt_sample) | 284 | if (icsk->icsk_ca_ops->rtt_sample) |
| 284 | do_gettimeofday(&skb->stamp); | 285 | __net_timestamp(skb); |
| 285 | 286 | ||
| 286 | sysctl_flags = 0; | 287 | sysctl_flags = 0; |
| 287 | if (tcb->flags & TCPCB_FLAG_SYN) { | 288 | if (tcb->flags & TCPCB_FLAG_SYN) { |
| @@ -308,7 +309,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) | |||
| 308 | } | 309 | } |
| 309 | 310 | ||
| 310 | if (tcp_packets_in_flight(tp) == 0) | 311 | if (tcp_packets_in_flight(tp) == 0) |
| 311 | tcp_ca_event(tp, CA_EVENT_TX_START); | 312 | tcp_ca_event(sk, CA_EVENT_TX_START); |
| 312 | 313 | ||
| 313 | th = (struct tcphdr *) skb_push(skb, tcp_header_size); | 314 | th = (struct tcphdr *) skb_push(skb, tcp_header_size); |
| 314 | skb->h.th = th; | 315 | skb->h.th = th; |
| @@ -366,7 +367,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) | |||
| 366 | if (err <= 0) | 367 | if (err <= 0) |
| 367 | return err; | 368 | return err; |
| 368 | 369 | ||
| 369 | tcp_enter_cwr(tp); | 370 | tcp_enter_cwr(sk); |
| 370 | 371 | ||
| 371 | /* NET_XMIT_CN is special. It does not guarantee, | 372 | /* NET_XMIT_CN is special. It does not guarantee, |
| 372 | * that this packet is lost. It tells that device | 373 | * that this packet is lost. It tells that device |
| @@ -482,7 +483,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned | |||
| 482 | * skbs, which it never sent before. --ANK | 483 | * skbs, which it never sent before. --ANK |
| 483 | */ | 484 | */ |
| 484 | TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; | 485 | TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; |
| 485 | buff->stamp = skb->stamp; | 486 | buff->tstamp = skb->tstamp; |
| 486 | 487 | ||
| 487 | if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { | 488 | if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { |
| 488 | tp->lost_out -= tcp_skb_pcount(skb); | 489 | tp->lost_out -= tcp_skb_pcount(skb); |
| @@ -505,7 +506,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned | |||
| 505 | 506 | ||
| 506 | /* Link BUFF into the send queue. */ | 507 | /* Link BUFF into the send queue. */ |
| 507 | skb_header_release(buff); | 508 | skb_header_release(buff); |
| 508 | __skb_append(skb, buff); | 509 | __skb_append(skb, buff, &sk->sk_write_queue); |
| 509 | 510 | ||
| 510 | return 0; | 511 | return 0; |
| 511 | } | 512 | } |
| @@ -696,7 +697,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) | |||
| 696 | if (tp->packets_out > tp->snd_cwnd_used) | 697 | if (tp->packets_out > tp->snd_cwnd_used) |
| 697 | tp->snd_cwnd_used = tp->packets_out; | 698 | tp->snd_cwnd_used = tp->packets_out; |
| 698 | 699 | ||
| 699 | if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto) | 700 | if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto) |
| 700 | tcp_cwnd_application_limited(sk); | 701 | tcp_cwnd_application_limited(sk); |
| 701 | } | 702 | } |
| 702 | } | 703 | } |
| @@ -893,7 +894,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, | |||
| 893 | 894 | ||
| 894 | /* Link BUFF into the send queue. */ | 895 | /* Link BUFF into the send queue. */ |
| 895 | skb_header_release(buff); | 896 | skb_header_release(buff); |
| 896 | __skb_append(skb, buff); | 897 | __skb_append(skb, buff, &sk->sk_write_queue); |
| 897 | 898 | ||
| 898 | return 0; | 899 | return 0; |
| 899 | } | 900 | } |
| @@ -905,12 +906,13 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, | |||
| 905 | */ | 906 | */ |
| 906 | static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) | 907 | static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) |
| 907 | { | 908 | { |
| 909 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 908 | u32 send_win, cong_win, limit, in_flight; | 910 | u32 send_win, cong_win, limit, in_flight; |
| 909 | 911 | ||
| 910 | if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) | 912 | if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) |
| 911 | return 0; | 913 | return 0; |
| 912 | 914 | ||
| 913 | if (tp->ca_state != TCP_CA_Open) | 915 | if (icsk->icsk_ca_state != TCP_CA_Open) |
| 914 | return 0; | 916 | return 0; |
| 915 | 917 | ||
| 916 | in_flight = tcp_packets_in_flight(tp); | 918 | in_flight = tcp_packets_in_flight(tp); |
| @@ -1147,6 +1149,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) | |||
| 1147 | */ | 1149 | */ |
| 1148 | u32 __tcp_select_window(struct sock *sk) | 1150 | u32 __tcp_select_window(struct sock *sk) |
| 1149 | { | 1151 | { |
| 1152 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 1150 | struct tcp_sock *tp = tcp_sk(sk); | 1153 | struct tcp_sock *tp = tcp_sk(sk); |
| 1151 | /* MSS for the peer's data. Previous versions used mss_clamp | 1154 | /* MSS for the peer's data. Previous versions used mss_clamp |
| 1152 | * here. I don't know if the value based on our guesses | 1155 | * here. I don't know if the value based on our guesses |
| @@ -1154,7 +1157,7 @@ u32 __tcp_select_window(struct sock *sk) | |||
| 1154 | * but may be worse for the performance because of rcv_mss | 1157 | * but may be worse for the performance because of rcv_mss |
| 1155 | * fluctuations. --SAW 1998/11/1 | 1158 | * fluctuations. --SAW 1998/11/1 |
| 1156 | */ | 1159 | */ |
| 1157 | int mss = tp->ack.rcv_mss; | 1160 | int mss = icsk->icsk_ack.rcv_mss; |
| 1158 | int free_space = tcp_space(sk); | 1161 | int free_space = tcp_space(sk); |
| 1159 | int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); | 1162 | int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); |
| 1160 | int window; | 1163 | int window; |
| @@ -1163,7 +1166,7 @@ u32 __tcp_select_window(struct sock *sk) | |||
| 1163 | mss = full_space; | 1166 | mss = full_space; |
| 1164 | 1167 | ||
| 1165 | if (free_space < full_space/2) { | 1168 | if (free_space < full_space/2) { |
| 1166 | tp->ack.quick = 0; | 1169 | icsk->icsk_ack.quick = 0; |
| 1167 | 1170 | ||
| 1168 | if (tcp_memory_pressure) | 1171 | if (tcp_memory_pressure) |
| 1169 | tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss); | 1172 | tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss); |
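In __tcp_select_window() the MSS and quick-ACK counter now come from icsk_ack; the policy in this hunk is that once less than half the receive buffer is free, quick ACKs are turned off, and under global TCP memory pressure the window growth target is clamped to four advertised-MSS segments. That guard as a pure function (free-standing signature invented for illustration):

```c
#include <stdint.h>

/* Receive-buffer guard from __tcp_select_window() above. */
static uint32_t clamp_rcv_ssthresh(uint32_t rcv_ssthresh, uint32_t advmss,
                                   int free_space, int full_space,
                                   int memory_pressure, int *quickack)
{
	if (free_space < full_space / 2) {
		*quickack = 0;                    /* icsk_ack.quick = 0 */
		if (memory_pressure && rcv_ssthresh > 4u * advmss)
			rcv_ssthresh = 4u * advmss;
	}
	return rcv_ssthresh;
}
```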
| @@ -1238,7 +1241,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m | |||
| 1238 | tcp_skb_pcount(next_skb) != 1); | 1241 | tcp_skb_pcount(next_skb) != 1); |
| 1239 | 1242 | ||
| 1240 | /* Ok. We will be able to collapse the packet. */ | 1243 | /* Ok. We will be able to collapse the packet. */ |
| 1241 | __skb_unlink(next_skb, next_skb->list); | 1244 | __skb_unlink(next_skb, &sk->sk_write_queue); |
| 1242 | 1245 | ||
| 1243 | memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); | 1246 | memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); |
| 1244 | 1247 | ||
| @@ -1286,6 +1289,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m | |||
| 1286 | */ | 1289 | */ |
| 1287 | void tcp_simple_retransmit(struct sock *sk) | 1290 | void tcp_simple_retransmit(struct sock *sk) |
| 1288 | { | 1291 | { |
| 1292 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 1289 | struct tcp_sock *tp = tcp_sk(sk); | 1293 | struct tcp_sock *tp = tcp_sk(sk); |
| 1290 | struct sk_buff *skb; | 1294 | struct sk_buff *skb; |
| 1291 | unsigned int mss = tcp_current_mss(sk, 0); | 1295 | unsigned int mss = tcp_current_mss(sk, 0); |
| @@ -1316,12 +1320,12 @@ void tcp_simple_retransmit(struct sock *sk) | |||
| 1316 | * in network, but units changed and effective | 1320 | * in network, but units changed and effective |
| 1317 | * cwnd/ssthresh really reduced now. | 1321 | * cwnd/ssthresh really reduced now. |
| 1318 | */ | 1322 | */ |
| 1319 | if (tp->ca_state != TCP_CA_Loss) { | 1323 | if (icsk->icsk_ca_state != TCP_CA_Loss) { |
| 1320 | tp->high_seq = tp->snd_nxt; | 1324 | tp->high_seq = tp->snd_nxt; |
| 1321 | tp->snd_ssthresh = tcp_current_ssthresh(tp); | 1325 | tp->snd_ssthresh = tcp_current_ssthresh(sk); |
| 1322 | tp->prior_ssthresh = 0; | 1326 | tp->prior_ssthresh = 0; |
| 1323 | tp->undo_marker = 0; | 1327 | tp->undo_marker = 0; |
| 1324 | tcp_set_ca_state(tp, TCP_CA_Loss); | 1328 | tcp_set_ca_state(sk, TCP_CA_Loss); |
| 1325 | } | 1329 | } |
| 1326 | tcp_xmit_retransmit_queue(sk); | 1330 | tcp_xmit_retransmit_queue(sk); |
| 1327 | } | 1331 | } |
| @@ -1461,6 +1465,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
| 1461 | */ | 1465 | */ |
| 1462 | void tcp_xmit_retransmit_queue(struct sock *sk) | 1466 | void tcp_xmit_retransmit_queue(struct sock *sk) |
| 1463 | { | 1467 | { |
| 1468 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 1464 | struct tcp_sock *tp = tcp_sk(sk); | 1469 | struct tcp_sock *tp = tcp_sk(sk); |
| 1465 | struct sk_buff *skb; | 1470 | struct sk_buff *skb; |
| 1466 | int packet_cnt = tp->lost_out; | 1471 | int packet_cnt = tp->lost_out; |
| @@ -1484,14 +1489,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
| 1484 | if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { | 1489 | if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { |
| 1485 | if (tcp_retransmit_skb(sk, skb)) | 1490 | if (tcp_retransmit_skb(sk, skb)) |
| 1486 | return; | 1491 | return; |
| 1487 | if (tp->ca_state != TCP_CA_Loss) | 1492 | if (icsk->icsk_ca_state != TCP_CA_Loss) |
| 1488 | NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); | 1493 | NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); |
| 1489 | else | 1494 | else |
| 1490 | NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); | 1495 | NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); |
| 1491 | 1496 | ||
| 1492 | if (skb == | 1497 | if (skb == |
| 1493 | skb_peek(&sk->sk_write_queue)) | 1498 | skb_peek(&sk->sk_write_queue)) |
| 1494 | tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); | 1499 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
| 1500 | inet_csk(sk)->icsk_rto, | ||
| 1501 | TCP_RTO_MAX); | ||
| 1495 | } | 1502 | } |
| 1496 | 1503 | ||
| 1497 | packet_cnt -= tcp_skb_pcount(skb); | 1504 | packet_cnt -= tcp_skb_pcount(skb); |
| @@ -1504,7 +1511,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
| 1504 | /* OK, demanded retransmission is finished. */ | 1511 | /* OK, demanded retransmission is finished. */ |
| 1505 | 1512 | ||
| 1506 | /* Forward retransmissions are possible only during Recovery. */ | 1513 | /* Forward retransmissions are possible only during Recovery. */ |
| 1507 | if (tp->ca_state != TCP_CA_Recovery) | 1514 | if (icsk->icsk_ca_state != TCP_CA_Recovery) |
| 1508 | return; | 1515 | return; |
| 1509 | 1516 | ||
| 1510 | /* No forward retransmissions in Reno are possible. */ | 1517 | /* No forward retransmissions in Reno are possible. */ |
| @@ -1544,7 +1551,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
| 1544 | break; | 1551 | break; |
| 1545 | 1552 | ||
| 1546 | if (skb == skb_peek(&sk->sk_write_queue)) | 1553 | if (skb == skb_peek(&sk->sk_write_queue)) |
| 1547 | tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); | 1554 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
| 1555 | inet_csk(sk)->icsk_rto, | ||
| 1556 | TCP_RTO_MAX); | ||
| 1548 | 1557 | ||
| 1549 | NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); | 1558 | NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); |
| 1550 | } | 1559 | } |
| @@ -1573,7 +1582,7 @@ void tcp_send_fin(struct sock *sk) | |||
| 1573 | } else { | 1582 | } else { |
| 1574 | /* Socket is locked, keep trying until memory is available. */ | 1583 | /* Socket is locked, keep trying until memory is available. */ |
| 1575 | for (;;) { | 1584 | for (;;) { |
| 1576 | skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL); | 1585 | skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL); |
| 1577 | if (skb) | 1586 | if (skb) |
| 1578 | break; | 1587 | break; |
| 1579 | yield(); | 1588 | yield(); |
| @@ -1780,8 +1789,8 @@ static inline void tcp_connect_init(struct sock *sk) | |||
| 1780 | tp->rcv_wup = 0; | 1789 | tp->rcv_wup = 0; |
| 1781 | tp->copied_seq = 0; | 1790 | tp->copied_seq = 0; |
| 1782 | 1791 | ||
| 1783 | tp->rto = TCP_TIMEOUT_INIT; | 1792 | inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; |
| 1784 | tp->retransmits = 0; | 1793 | inet_csk(sk)->icsk_retransmits = 0; |
| 1785 | tcp_clear_retrans(tp); | 1794 | tcp_clear_retrans(tp); |
| 1786 | } | 1795 | } |
| 1787 | 1796 | ||
| @@ -1795,7 +1804,7 @@ int tcp_connect(struct sock *sk) | |||
| 1795 | 1804 | ||
| 1796 | tcp_connect_init(sk); | 1805 | tcp_connect_init(sk); |
| 1797 | 1806 | ||
| 1798 | buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation); | 1807 | buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); |
| 1799 | if (unlikely(buff == NULL)) | 1808 | if (unlikely(buff == NULL)) |
| 1800 | return -ENOBUFS; | 1809 | return -ENOBUFS; |
| 1801 | 1810 | ||
| @@ -1824,7 +1833,8 @@ int tcp_connect(struct sock *sk) | |||
| 1824 | TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); | 1833 | TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); |
| 1825 | 1834 | ||
| 1826 | /* Timer for repeating the SYN until an answer. */ | 1835 | /* Timer for repeating the SYN until an answer. */ |
| 1827 | tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); | 1836 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
| 1837 | inet_csk(sk)->icsk_rto, TCP_RTO_MAX); | ||
| 1828 | return 0; | 1838 | return 0; |
| 1829 | } | 1839 | } |
| 1830 | 1840 | ||
| @@ -1834,20 +1844,21 @@ int tcp_connect(struct sock *sk) | |||
| 1834 | */ | 1844 | */ |
| 1835 | void tcp_send_delayed_ack(struct sock *sk) | 1845 | void tcp_send_delayed_ack(struct sock *sk) |
| 1836 | { | 1846 | { |
| 1837 | struct tcp_sock *tp = tcp_sk(sk); | 1847 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 1838 | int ato = tp->ack.ato; | 1848 | int ato = icsk->icsk_ack.ato; |
| 1839 | unsigned long timeout; | 1849 | unsigned long timeout; |
| 1840 | 1850 | ||
| 1841 | if (ato > TCP_DELACK_MIN) { | 1851 | if (ato > TCP_DELACK_MIN) { |
| 1852 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 1842 | int max_ato = HZ/2; | 1853 | int max_ato = HZ/2; |
| 1843 | 1854 | ||
| 1844 | if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED)) | 1855 | if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) |
| 1845 | max_ato = TCP_DELACK_MAX; | 1856 | max_ato = TCP_DELACK_MAX; |
| 1846 | 1857 | ||
| 1847 | /* Slow path, intersegment interval is "high". */ | 1858 | /* Slow path, intersegment interval is "high". */ |
| 1848 | 1859 | ||
| 1849 | /* If some rtt estimate is known, use it to bound delayed ack. | 1860 | /* If some rtt estimate is known, use it to bound delayed ack. |
| 1850 | * Do not use tp->rto here, use results of rtt measurements | 1861 | * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements |
| 1851 | * directly. | 1862 | * directly. |
| 1852 | */ | 1863 | */ |
| 1853 | if (tp->srtt) { | 1864 | if (tp->srtt) { |
| @@ -1864,21 +1875,22 @@ void tcp_send_delayed_ack(struct sock *sk) | |||
| 1864 | timeout = jiffies + ato; | 1875 | timeout = jiffies + ato; |
| 1865 | 1876 | ||
| 1866 | /* Use new timeout only if there wasn't an older one earlier. */ | 1877 | /* Use new timeout only if there wasn't an older one earlier. */ |
| 1867 | if (tp->ack.pending&TCP_ACK_TIMER) { | 1878 | if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { |
| 1868 | /* If delack timer was blocked or is about to expire, | 1879 | /* If delack timer was blocked or is about to expire, |
| 1869 | * send ACK now. | 1880 | * send ACK now. |
| 1870 | */ | 1881 | */ |
| 1871 | if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) { | 1882 | if (icsk->icsk_ack.blocked || |
| 1883 | time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { | ||
| 1872 | tcp_send_ack(sk); | 1884 | tcp_send_ack(sk); |
| 1873 | return; | 1885 | return; |
| 1874 | } | 1886 | } |
| 1875 | 1887 | ||
| 1876 | if (!time_before(timeout, tp->ack.timeout)) | 1888 | if (!time_before(timeout, icsk->icsk_ack.timeout)) |
| 1877 | timeout = tp->ack.timeout; | 1889 | timeout = icsk->icsk_ack.timeout; |
| 1878 | } | 1890 | } |
| 1879 | tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER; | 1891 | icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; |
| 1880 | tp->ack.timeout = timeout; | 1892 | icsk->icsk_ack.timeout = timeout; |
| 1881 | sk_reset_timer(sk, &tp->delack_timer, timeout); | 1893 | sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); |
| 1882 | } | 1894 | } |
| 1883 | 1895 | ||
| 1884 | /* This routine sends an ack and also updates the window. */ | 1896 | /* This routine sends an ack and also updates the window. */ |
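tcp_send_delayed_ack() moves to icsk_ack and icsk_delack_timer but keeps its scheduling policy: bound the timeout by the RTT estimate, and when a delack timer is already armed, either ACK immediately (the timer was blocked, or is due within ato/4) or keep the earlier deadline. A simplified sketch with absolute deadlines (no jiffies wraparound handling, and the ATO-vs-RTT clamping assumed done by the caller):

```c
#include <stdbool.h>

struct delack {
	bool          pending;  /* ICSK_ACK_TIMER armed    */
	bool          blocked;  /* timer ran under lock    */
	unsigned long timeout;  /* current deadline, ticks */
};

/* Returns true when the ACK should go out now rather than be armed. */
static bool delack_arm(struct delack *da, unsigned long now, unsigned long ato)
{
	unsigned long deadline = now + ato;

	if (da->pending) {
		/* Blocked earlier, or due within ato/4: just ACK now. */
		if (da->blocked || da->timeout <= now + (ato >> 2))
			return true;
		if (da->timeout < deadline)
			deadline = da->timeout;  /* keep the earlier one */
	}
	da->pending = true;
	da->timeout = deadline;
	return false;
}
```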
| @@ -1895,9 +1907,10 @@ void tcp_send_ack(struct sock *sk) | |||
| 1895 | */ | 1907 | */ |
| 1896 | buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); | 1908 | buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); |
| 1897 | if (buff == NULL) { | 1909 | if (buff == NULL) { |
| 1898 | tcp_schedule_ack(tp); | 1910 | inet_csk_schedule_ack(sk); |
| 1899 | tp->ack.ato = TCP_ATO_MIN; | 1911 | inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; |
| 1900 | tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); | 1912 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, |
| 1913 | TCP_DELACK_MAX, TCP_RTO_MAX); | ||
| 1901 | return; | 1914 | return; |
| 1902 | } | 1915 | } |
| 1903 | 1916 | ||
| @@ -2011,6 +2024,7 @@ int tcp_write_wakeup(struct sock *sk) | |||
| 2011 | */ | 2024 | */ |
| 2012 | void tcp_send_probe0(struct sock *sk) | 2025 | void tcp_send_probe0(struct sock *sk) |
| 2013 | { | 2026 | { |
| 2027 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 2014 | struct tcp_sock *tp = tcp_sk(sk); | 2028 | struct tcp_sock *tp = tcp_sk(sk); |
| 2015 | int err; | 2029 | int err; |
| 2016 | 2030 | ||
| @@ -2018,28 +2032,31 @@ void tcp_send_probe0(struct sock *sk) | |||
| 2018 | 2032 | ||
| 2019 | if (tp->packets_out || !sk->sk_send_head) { | 2033 | if (tp->packets_out || !sk->sk_send_head) { |
| 2020 | /* Cancel probe timer, if it is not required. */ | 2034 | /* Cancel probe timer, if it is not required. */ |
| 2021 | tp->probes_out = 0; | 2035 | icsk->icsk_probes_out = 0; |
| 2022 | tp->backoff = 0; | 2036 | icsk->icsk_backoff = 0; |
| 2023 | return; | 2037 | return; |
| 2024 | } | 2038 | } |
| 2025 | 2039 | ||
| 2026 | if (err <= 0) { | 2040 | if (err <= 0) { |
| 2027 | if (tp->backoff < sysctl_tcp_retries2) | 2041 | if (icsk->icsk_backoff < sysctl_tcp_retries2) |
| 2028 | tp->backoff++; | 2042 | icsk->icsk_backoff++; |
| 2029 | tp->probes_out++; | 2043 | icsk->icsk_probes_out++; |
| 2030 | tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, | 2044 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, |
| 2031 | min(tp->rto << tp->backoff, TCP_RTO_MAX)); | 2045 | min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), |
| 2046 | TCP_RTO_MAX); | ||
| 2032 | } else { | 2047 | } else { |
| 2033 | /* If packet was not sent due to local congestion, | 2048 | /* If packet was not sent due to local congestion, |
| 2034 | * do not backoff and do not remember probes_out. | 2049 | * do not backoff and do not remember icsk_probes_out. |
| 2035 | * Let local senders fight for local resources. | 2050 | * Let local senders fight for local resources. |
| 2036 | * | 2051 | * |
| 2037 | * Use accumulated backoff yet. | 2052 | * Use accumulated backoff yet. |
| 2038 | */ | 2053 | */ |
| 2039 | if (!tp->probes_out) | 2054 | if (!icsk->icsk_probes_out) |
| 2040 | tp->probes_out=1; | 2055 | icsk->icsk_probes_out = 1; |
| 2041 | tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, | 2056 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, |
| 2042 | min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL)); | 2057 | min(icsk->icsk_rto << icsk->icsk_backoff, |
| 2058 | TCP_RESOURCE_PROBE_INTERVAL), | ||
| 2059 | TCP_RTO_MAX); | ||
| 2043 | } | 2060 | } |
| 2044 | } | 2061 | } |
| 2045 | 2062 | ||
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 70e108e15c71..327770bf5522 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c | |||
| @@ -16,9 +16,10 @@ | |||
| 16 | #define TCP_SCALABLE_AI_CNT 50U | 16 | #define TCP_SCALABLE_AI_CNT 50U |
| 17 | #define TCP_SCALABLE_MD_SCALE 3 | 17 | #define TCP_SCALABLE_MD_SCALE 3 |
| 18 | 18 | ||
| 19 | static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | 19 | static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, |
| 20 | u32 in_flight, int flag) | 20 | u32 in_flight, int flag) |
| 21 | { | 21 | { |
| 22 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 22 | if (in_flight < tp->snd_cwnd) | 23 | if (in_flight < tp->snd_cwnd) |
| 23 | return; | 24 | return; |
| 24 | 25 | ||
| @@ -35,8 +36,9 @@ static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | |||
| 35 | tp->snd_cwnd_stamp = tcp_time_stamp; | 36 | tp->snd_cwnd_stamp = tcp_time_stamp; |
| 36 | } | 37 | } |
| 37 | 38 | ||
| 38 | static u32 tcp_scalable_ssthresh(struct tcp_sock *tp) | 39 | static u32 tcp_scalable_ssthresh(struct sock *sk) |
| 39 | { | 40 | { |
| 41 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 40 | return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); | 42 | return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); |
| 41 | } | 43 | } |
| 42 | 44 | ||
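The tcp_scalable.c conversion is purely mechanical, which makes it a good place to see how small these congestion modules are: additive increase of at least one segment per 50 ACKs above ssthresh, multiplicative decrease of cwnd/8 on loss. Both rules as plain functions (the in_flight and clamp checks from the hunk are left out of this sketch):

```c
#include <stdint.h>

#define SC_AI_CNT   50u /* TCP_SCALABLE_AI_CNT: cwnd +1 per <=50 ACKs */
#define SC_MD_SCALE 3   /* TCP_SCALABLE_MD_SCALE: cut cwnd/8 on loss  */

struct sc_state { uint32_t cwnd, ssthresh, cnt; };

/* Per-ACK increase rule from tcp_scalable_cong_avoid(). */
static void scalable_on_ack(struct sc_state *s)
{
	if (s->cwnd <= s->ssthresh) {
		s->cwnd++;                               /* slow start */
	} else if (++s->cnt > (s->cwnd < SC_AI_CNT ? s->cwnd : SC_AI_CNT)) {
		s->cwnd++;                               /* scalable AI */
		s->cnt = 0;
	}
}

/* Loss response from tcp_scalable_ssthresh(): cut by 1/8, floor at 2. */
static uint32_t scalable_ssthresh(uint32_t cwnd)
{
	uint32_t s = cwnd - (cwnd >> SC_MD_SCALE);
	return s > 2u ? s : 2u;
}
```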
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 0084227438c2..415ee47ac1c5 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c | |||
| @@ -36,49 +36,13 @@ static void tcp_write_timer(unsigned long); | |||
| 36 | static void tcp_delack_timer(unsigned long); | 36 | static void tcp_delack_timer(unsigned long); |
| 37 | static void tcp_keepalive_timer (unsigned long data); | 37 | static void tcp_keepalive_timer (unsigned long data); |
| 38 | 38 | ||
| 39 | #ifdef TCP_DEBUG | ||
| 40 | const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n"; | ||
| 41 | EXPORT_SYMBOL(tcp_timer_bug_msg); | ||
| 42 | #endif | ||
| 43 | |||
| 44 | /* | ||
| 45 | * Using different timers for retransmit, delayed acks and probes | ||
| 46 | * We may wish use just one timer maintaining a list of expire jiffies | ||
| 47 | * to optimize. | ||
| 48 | */ | ||
| 49 | |||
| 50 | void tcp_init_xmit_timers(struct sock *sk) | 39 | void tcp_init_xmit_timers(struct sock *sk) |
| 51 | { | 40 | { |
| 52 | struct tcp_sock *tp = tcp_sk(sk); | 41 | inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, |
| 53 | 42 | &tcp_keepalive_timer); | |
| 54 | init_timer(&tp->retransmit_timer); | ||
| 55 | tp->retransmit_timer.function=&tcp_write_timer; | ||
| 56 | tp->retransmit_timer.data = (unsigned long) sk; | ||
| 57 | tp->pending = 0; | ||
| 58 | |||
| 59 | init_timer(&tp->delack_timer); | ||
| 60 | tp->delack_timer.function=&tcp_delack_timer; | ||
| 61 | tp->delack_timer.data = (unsigned long) sk; | ||
| 62 | tp->ack.pending = 0; | ||
| 63 | |||
| 64 | init_timer(&sk->sk_timer); | ||
| 65 | sk->sk_timer.function = &tcp_keepalive_timer; | ||
| 66 | sk->sk_timer.data = (unsigned long)sk; | ||
| 67 | } | 43 | } |
| 68 | 44 | ||
| 69 | void tcp_clear_xmit_timers(struct sock *sk) | 45 | EXPORT_SYMBOL(tcp_init_xmit_timers); |
| 70 | { | ||
| 71 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 72 | |||
| 73 | tp->pending = 0; | ||
| 74 | sk_stop_timer(sk, &tp->retransmit_timer); | ||
| 75 | |||
| 76 | tp->ack.pending = 0; | ||
| 77 | tp->ack.blocked = 0; | ||
| 78 | sk_stop_timer(sk, &tp->delack_timer); | ||
| 79 | |||
| 80 | sk_stop_timer(sk, &sk->sk_timer); | ||
| 81 | } | ||
| 82 | 46 | ||
| 83 | static void tcp_write_err(struct sock *sk) | 47 | static void tcp_write_err(struct sock *sk) |
| 84 | { | 48 | { |
| @@ -155,15 +119,15 @@ static int tcp_orphan_retries(struct sock *sk, int alive) | |||
| 155 | /* A write timeout has occurred. Process the after effects. */ | 119 | /* A write timeout has occurred. Process the after effects. */ |
| 156 | static int tcp_write_timeout(struct sock *sk) | 120 | static int tcp_write_timeout(struct sock *sk) |
| 157 | { | 121 | { |
| 158 | struct tcp_sock *tp = tcp_sk(sk); | 122 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 159 | int retry_until; | 123 | int retry_until; |
| 160 | 124 | ||
| 161 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { | 125 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { |
| 162 | if (tp->retransmits) | 126 | if (icsk->icsk_retransmits) |
| 163 | dst_negative_advice(&sk->sk_dst_cache); | 127 | dst_negative_advice(&sk->sk_dst_cache); |
| 164 | retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries; | 128 | retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; |
| 165 | } else { | 129 | } else { |
| 166 | if (tp->retransmits >= sysctl_tcp_retries1) { | 130 | if (icsk->icsk_retransmits >= sysctl_tcp_retries1) { |
| 167 | /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black | 131 | /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black |
| 168 | hole detection. :-( | 132 | hole detection. :-( |
| 169 | 133 | ||
| @@ -189,16 +153,16 @@ static int tcp_write_timeout(struct sock *sk) | |||
| 189 | 153 | ||
| 190 | retry_until = sysctl_tcp_retries2; | 154 | retry_until = sysctl_tcp_retries2; |
| 191 | if (sock_flag(sk, SOCK_DEAD)) { | 155 | if (sock_flag(sk, SOCK_DEAD)) { |
| 192 | int alive = (tp->rto < TCP_RTO_MAX); | 156 | const int alive = (icsk->icsk_rto < TCP_RTO_MAX); |
| 193 | 157 | ||
| 194 | retry_until = tcp_orphan_retries(sk, alive); | 158 | retry_until = tcp_orphan_retries(sk, alive); |
| 195 | 159 | ||
| 196 | if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until)) | 160 | if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until)) |
| 197 | return 1; | 161 | return 1; |
| 198 | } | 162 | } |
| 199 | } | 163 | } |
| 200 | 164 | ||
| 201 | if (tp->retransmits >= retry_until) { | 165 | if (icsk->icsk_retransmits >= retry_until) { |
| 202 | /* Has it gone just too far? */ | 166 | /* Has it gone just too far? */ |
| 203 | tcp_write_err(sk); | 167 | tcp_write_err(sk); |
| 204 | return 1; | 168 | return 1; |
| @@ -210,26 +174,27 @@ static void tcp_delack_timer(unsigned long data) | |||
| 210 | { | 174 | { |
| 211 | struct sock *sk = (struct sock*)data; | 175 | struct sock *sk = (struct sock*)data; |
| 212 | struct tcp_sock *tp = tcp_sk(sk); | 176 | struct tcp_sock *tp = tcp_sk(sk); |
| 177 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 213 | 178 | ||
| 214 | bh_lock_sock(sk); | 179 | bh_lock_sock(sk); |
| 215 | if (sock_owned_by_user(sk)) { | 180 | if (sock_owned_by_user(sk)) { |
| 216 | /* Try again later. */ | 181 | /* Try again later. */ |
| 217 | tp->ack.blocked = 1; | 182 | icsk->icsk_ack.blocked = 1; |
| 218 | NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED); | 183 | NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED); |
| 219 | sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN); | 184 | sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); |
| 220 | goto out_unlock; | 185 | goto out_unlock; |
| 221 | } | 186 | } |
| 222 | 187 | ||
| 223 | sk_stream_mem_reclaim(sk); | 188 | sk_stream_mem_reclaim(sk); |
| 224 | 189 | ||
| 225 | if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER)) | 190 | if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) |
| 226 | goto out; | 191 | goto out; |
| 227 | 192 | ||
| 228 | if (time_after(tp->ack.timeout, jiffies)) { | 193 | if (time_after(icsk->icsk_ack.timeout, jiffies)) { |
| 229 | sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout); | 194 | sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); |
| 230 | goto out; | 195 | goto out; |
| 231 | } | 196 | } |
| 232 | tp->ack.pending &= ~TCP_ACK_TIMER; | 197 | icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; |
| 233 | 198 | ||
| 234 | if (!skb_queue_empty(&tp->ucopy.prequeue)) { | 199 | if (!skb_queue_empty(&tp->ucopy.prequeue)) { |
| 235 | struct sk_buff *skb; | 200 | struct sk_buff *skb; |
| @@ -242,16 +207,16 @@ static void tcp_delack_timer(unsigned long data) | |||
| 242 | tp->ucopy.memory = 0; | 207 | tp->ucopy.memory = 0; |
| 243 | } | 208 | } |
| 244 | 209 | ||
| 245 | if (tcp_ack_scheduled(tp)) { | 210 | if (inet_csk_ack_scheduled(sk)) { |
| 246 | if (!tp->ack.pingpong) { | 211 | if (!icsk->icsk_ack.pingpong) { |
| 247 | /* Delayed ACK missed: inflate ATO. */ | 212 | /* Delayed ACK missed: inflate ATO. */ |
| 248 | tp->ack.ato = min(tp->ack.ato << 1, tp->rto); | 213 | icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto); |
| 249 | } else { | 214 | } else { |
| 250 | /* Delayed ACK missed: leave pingpong mode and | 215 | /* Delayed ACK missed: leave pingpong mode and |
| 251 | * deflate ATO. | 216 | * deflate ATO. |
| 252 | */ | 217 | */ |
| 253 | tp->ack.pingpong = 0; | 218 | icsk->icsk_ack.pingpong = 0; |
| 254 | tp->ack.ato = TCP_ATO_MIN; | 219 | icsk->icsk_ack.ato = TCP_ATO_MIN; |
| 255 | } | 220 | } |
| 256 | tcp_send_ack(sk); | 221 | tcp_send_ack(sk); |
| 257 | NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS); | 222 | NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS); |
| @@ -268,11 +233,12 @@ out_unlock: | |||
| 268 | 233 | ||
| 269 | static void tcp_probe_timer(struct sock *sk) | 234 | static void tcp_probe_timer(struct sock *sk) |
| 270 | { | 235 | { |
| 236 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 271 | struct tcp_sock *tp = tcp_sk(sk); | 237 | struct tcp_sock *tp = tcp_sk(sk); |
| 272 | int max_probes; | 238 | int max_probes; |
| 273 | 239 | ||
| 274 | if (tp->packets_out || !sk->sk_send_head) { | 240 | if (tp->packets_out || !sk->sk_send_head) { |
| 275 | tp->probes_out = 0; | 241 | icsk->icsk_probes_out = 0; |
| 276 | return; | 242 | return; |
| 277 | } | 243 | } |
| 278 | 244 | ||
| @@ -283,7 +249,7 @@ static void tcp_probe_timer(struct sock *sk) | |||
| 283 | * FIXME: We ought not to do it, Solaris 2.5 actually has fixing | 249 | * FIXME: We ought not to do it, Solaris 2.5 actually has fixing |
| 284 | * this behaviour in Solaris down as a bug fix. [AC] | 250 | * this behaviour in Solaris down as a bug fix. [AC] |
| 285 | * | 251 | * |
| 286 | * Let me explain. probes_out is zeroed by incoming ACKs | 252 | * Let me explain. icsk_probes_out is zeroed by incoming ACKs |
| 287 | * even if they advertise zero window. Hence, connection is killed only | 253 | * even if they advertise zero window. Hence, connection is killed only |
| 288 | * if we received no ACKs for normal connection timeout. It is not killed | 254 | * if we received no ACKs for normal connection timeout. It is not killed |
| 289 | * only because window stays zero for some time, window may be zero | 255 | * only because window stays zero for some time, window may be zero |
| @@ -294,15 +260,15 @@ static void tcp_probe_timer(struct sock *sk) | |||
| 294 | max_probes = sysctl_tcp_retries2; | 260 | max_probes = sysctl_tcp_retries2; |
| 295 | 261 | ||
| 296 | if (sock_flag(sk, SOCK_DEAD)) { | 262 | if (sock_flag(sk, SOCK_DEAD)) { |
| 297 | int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX); | 263 | const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX); |
| 298 | 264 | ||
| 299 | max_probes = tcp_orphan_retries(sk, alive); | 265 | max_probes = tcp_orphan_retries(sk, alive); |
| 300 | 266 | ||
| 301 | if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes)) | 267 | if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes)) |
| 302 | return; | 268 | return; |
| 303 | } | 269 | } |
| 304 | 270 | ||
| 305 | if (tp->probes_out > max_probes) { | 271 | if (icsk->icsk_probes_out > max_probes) { |
| 306 | tcp_write_err(sk); | 272 | tcp_write_err(sk); |
| 307 | } else { | 273 | } else { |
| 308 | /* Only send another probe if we didn't close things up. */ | 274 | /* Only send another probe if we didn't close things up. */ |
| @@ -317,6 +283,7 @@ static void tcp_probe_timer(struct sock *sk) | |||
| 317 | static void tcp_retransmit_timer(struct sock *sk) | 283 | static void tcp_retransmit_timer(struct sock *sk) |
| 318 | { | 284 | { |
| 319 | struct tcp_sock *tp = tcp_sk(sk); | 285 | struct tcp_sock *tp = tcp_sk(sk); |
| 286 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 320 | 287 | ||
| 321 | if (!tp->packets_out) | 288 | if (!tp->packets_out) |
| 322 | goto out; | 289 | goto out; |
| @@ -351,20 +318,21 @@ static void tcp_retransmit_timer(struct sock *sk) | |||
| 351 | if (tcp_write_timeout(sk)) | 318 | if (tcp_write_timeout(sk)) |
| 352 | goto out; | 319 | goto out; |
| 353 | 320 | ||
| 354 | if (tp->retransmits == 0) { | 321 | if (icsk->icsk_retransmits == 0) { |
| 355 | if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) { | 322 | if (icsk->icsk_ca_state == TCP_CA_Disorder || |
| 323 | icsk->icsk_ca_state == TCP_CA_Recovery) { | ||
| 356 | if (tp->rx_opt.sack_ok) { | 324 | if (tp->rx_opt.sack_ok) { |
| 357 | if (tp->ca_state == TCP_CA_Recovery) | 325 | if (icsk->icsk_ca_state == TCP_CA_Recovery) |
| 358 | NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL); | 326 | NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL); |
| 359 | else | 327 | else |
| 360 | NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES); | 328 | NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES); |
| 361 | } else { | 329 | } else { |
| 362 | if (tp->ca_state == TCP_CA_Recovery) | 330 | if (icsk->icsk_ca_state == TCP_CA_Recovery) |
| 363 | NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL); | 331 | NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL); |
| 364 | else | 332 | else |
| 365 | NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES); | 333 | NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES); |
| 366 | } | 334 | } |
| 367 | } else if (tp->ca_state == TCP_CA_Loss) { | 335 | } else if (icsk->icsk_ca_state == TCP_CA_Loss) { |
| 368 | NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES); | 336 | NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES); |
| 369 | } else { | 337 | } else { |
| 370 | NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS); | 338 | NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS); |
| @@ -381,10 +349,11 @@ static void tcp_retransmit_timer(struct sock *sk) | |||
| 381 | /* Retransmission failed because of local congestion, | 349 | /* Retransmission failed because of local congestion, |
| 382 | * do not backoff. | 350 | * do not backoff. |
| 383 | */ | 351 | */ |
| 384 | if (!tp->retransmits) | 352 | if (!icsk->icsk_retransmits) |
| 385 | tp->retransmits=1; | 353 | icsk->icsk_retransmits = 1; |
| 386 | tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, | 354 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
| 387 | min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL)); | 355 | min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), |
| 356 | TCP_RTO_MAX); | ||
| 388 | goto out; | 357 | goto out; |
| 389 | } | 358 | } |
| 390 | 359 | ||
| @@ -403,13 +372,13 @@ static void tcp_retransmit_timer(struct sock *sk) | |||
| 403 | * implemented ftp to mars will work nicely. We will have to fix | 372 | * implemented ftp to mars will work nicely. We will have to fix |
| 404 | * the 120 second clamps though! | 373 | * the 120 second clamps though! |
| 405 | */ | 374 | */ |
| 406 | tp->backoff++; | 375 | icsk->icsk_backoff++; |
| 407 | tp->retransmits++; | 376 | icsk->icsk_retransmits++; |
| 408 | 377 | ||
| 409 | out_reset_timer: | 378 | out_reset_timer: |
| 410 | tp->rto = min(tp->rto << 1, TCP_RTO_MAX); | 379 | icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); |
| 411 | tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); | 380 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); |
| 412 | if (tp->retransmits > sysctl_tcp_retries1) | 381 | if (icsk->icsk_retransmits > sysctl_tcp_retries1) |
| 413 | __sk_dst_reset(sk); | 382 | __sk_dst_reset(sk); |
| 414 | 383 | ||
| 415 | out:; | 384 | out:; |
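The out_reset_timer path above is where TCP's exponential backoff lives, now entirely on icsk fields: each timer expiry doubles icsk_rto up to TCP_RTO_MAX, and tcp_write_timeout() aborts the connection once icsk_retransmits crosses the applicable retries limit. Condensed into one function (millisecond stand-in for the cap):

```c
#include <stdint.h>

#define SK_RTO_MAX (120u * 1000u) /* ms stand-in for TCP_RTO_MAX */

struct rtx_state { uint32_t rto; unsigned int retransmits, backoff; };

/* One retransmit-timer expiry, as in the out_reset_timer path:
 * double the RTO up to the cap, bump the counters, and report
 * whether the write-timeout limit has been crossed. */
static int rtx_timer_fired(struct rtx_state *r, unsigned int retry_until)
{
	r->backoff++;
	r->retransmits++;
	r->rto = (r->rto << 1) < SK_RTO_MAX ? (r->rto << 1) : SK_RTO_MAX;

	return r->retransmits >= retry_until; /* nonzero: kill connection */
}
```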
| @@ -418,32 +387,32 @@ out:; | |||
| 418 | static void tcp_write_timer(unsigned long data) | 387 | static void tcp_write_timer(unsigned long data) |
| 419 | { | 388 | { |
| 420 | struct sock *sk = (struct sock*)data; | 389 | struct sock *sk = (struct sock*)data; |
| 421 | struct tcp_sock *tp = tcp_sk(sk); | 390 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 422 | int event; | 391 | int event; |
| 423 | 392 | ||
| 424 | bh_lock_sock(sk); | 393 | bh_lock_sock(sk); |
| 425 | if (sock_owned_by_user(sk)) { | 394 | if (sock_owned_by_user(sk)) { |
| 426 | /* Try again later */ | 395 | /* Try again later */ |
| 427 | sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20)); | 396 | sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20)); |
| 428 | goto out_unlock; | 397 | goto out_unlock; |
| 429 | } | 398 | } |
| 430 | 399 | ||
| 431 | if (sk->sk_state == TCP_CLOSE || !tp->pending) | 400 | if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) |
| 432 | goto out; | 401 | goto out; |
| 433 | 402 | ||
| 434 | if (time_after(tp->timeout, jiffies)) { | 403 | if (time_after(icsk->icsk_timeout, jiffies)) { |
| 435 | sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout); | 404 | sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); |
| 436 | goto out; | 405 | goto out; |
| 437 | } | 406 | } |
| 438 | 407 | ||
| 439 | event = tp->pending; | 408 | event = icsk->icsk_pending; |
| 440 | tp->pending = 0; | 409 | icsk->icsk_pending = 0; |
| 441 | 410 | ||
| 442 | switch (event) { | 411 | switch (event) { |
| 443 | case TCP_TIME_RETRANS: | 412 | case ICSK_TIME_RETRANS: |
| 444 | tcp_retransmit_timer(sk); | 413 | tcp_retransmit_timer(sk); |
| 445 | break; | 414 | break; |
| 446 | case TCP_TIME_PROBE0: | 415 | case ICSK_TIME_PROBE0: |
| 447 | tcp_probe_timer(sk); | 416 | tcp_probe_timer(sk); |
| 448 | break; | 417 | break; |
| 449 | } | 418 | } |
| @@ -462,96 +431,8 @@ out_unlock: | |||
| 462 | 431 | ||
| 463 | static void tcp_synack_timer(struct sock *sk) | 432 | static void tcp_synack_timer(struct sock *sk) |
| 464 | { | 433 | { |
| 465 | struct tcp_sock *tp = tcp_sk(sk); | 434 | inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, |
| 466 | struct listen_sock *lopt = tp->accept_queue.listen_opt; | 435 | TCP_TIMEOUT_INIT, TCP_RTO_MAX); |
| 467 | int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries; | ||
| 468 | int thresh = max_retries; | ||
| 469 | unsigned long now = jiffies; | ||
| 470 | struct request_sock **reqp, *req; | ||
| 471 | int i, budget; | ||
| 472 | |||
| 473 | if (lopt == NULL || lopt->qlen == 0) | ||
| 474 | return; | ||
| 475 | |||
| 476 | /* Normally all the openreqs are young and become mature | ||
| 477 | * (i.e. converted to an established socket) within the first timeout. | ||
| 478 | * If synack was not acknowledged for 3 seconds, it means | ||
| 479 | * one of the following things: synack was lost, ack was lost, | ||
| 480 | * rtt is high or nobody planned to ack (i.e. synflood). | ||
| 481 | * When server is a bit loaded, queue is populated with old | ||
| 482 | * open requests, reducing effective size of queue. | ||
| 483 | * When server is well loaded, queue size reduces to zero | ||
| 484 | * after several minutes of work. It is not synflood, | ||
| 485 | * it is normal operation. The solution is pruning | ||
| 486 | * too old entries overriding normal timeout, when | ||
| 487 | * situation becomes dangerous. | ||
| 488 | * | ||
| 489 | * Essentially, we reserve half of the room for young | ||
| 490 | * embryos; and abort old ones without pity, if old | ||
| 491 | * ones are about to clog our table. | ||
| 492 | */ | ||
| 493 | if (lopt->qlen>>(lopt->max_qlen_log-1)) { | ||
| 494 | int young = (lopt->qlen_young<<1); | ||
| 495 | |||
| 496 | while (thresh > 2) { | ||
| 497 | if (lopt->qlen < young) | ||
| 498 | break; | ||
| 499 | thresh--; | ||
| 500 | young <<= 1; | ||
| 501 | } | ||
| 502 | } | ||
| 503 | |||
| 504 | if (tp->defer_accept) | ||
| 505 | max_retries = tp->defer_accept; | ||
| 506 | |||
| 507 | budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL)); | ||
| 508 | i = lopt->clock_hand; | ||
| 509 | |||
| 510 | do { | ||
| 511 | reqp=&lopt->syn_table[i]; | ||
| 512 | while ((req = *reqp) != NULL) { | ||
| 513 | if (time_after_eq(now, req->expires)) { | ||
| 514 | if ((req->retrans < thresh || | ||
| 515 | (inet_rsk(req)->acked && req->retrans < max_retries)) | ||
| 516 | && !req->rsk_ops->rtx_syn_ack(sk, req, NULL)) { | ||
| 517 | unsigned long timeo; | ||
| 518 | |||
| 519 | if (req->retrans++ == 0) | ||
| 520 | lopt->qlen_young--; | ||
| 521 | timeo = min((TCP_TIMEOUT_INIT << req->retrans), | ||
| 522 | TCP_RTO_MAX); | ||
| 523 | req->expires = now + timeo; | ||
| 524 | reqp = &req->dl_next; | ||
| 525 | continue; | ||
| 526 | } | ||
| 527 | |||
| 528 | /* Drop this request */ | ||
| 529 | tcp_synq_unlink(tp, req, reqp); | ||
| 530 | reqsk_queue_removed(&tp->accept_queue, req); | ||
| 531 | reqsk_free(req); | ||
| 532 | continue; | ||
| 533 | } | ||
| 534 | reqp = &req->dl_next; | ||
| 535 | } | ||
| 536 | |||
| 537 | i = (i+1)&(TCP_SYNQ_HSIZE-1); | ||
| 538 | |||
| 539 | } while (--budget > 0); | ||
| 540 | |||
| 541 | lopt->clock_hand = i; | ||
| 542 | |||
| 543 | if (lopt->qlen) | ||
| 544 | tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL); | ||
| 545 | } | ||
| 546 | |||
| 547 | void tcp_delete_keepalive_timer (struct sock *sk) | ||
| 548 | { | ||
| 549 | sk_stop_timer(sk, &sk->sk_timer); | ||
| 550 | } | ||
| 551 | |||
| 552 | void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len) | ||
| 553 | { | ||
| 554 | sk_reset_timer(sk, &sk->sk_timer, jiffies + len); | ||
| 555 | } | 436 | } |
| 556 | 437 | ||
| 557 | void tcp_set_keepalive(struct sock *sk, int val) | 438 | void tcp_set_keepalive(struct sock *sk, int val) |
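The large removal above is the old tcp_synack_timer() body; the same pruning heuristic now lives in inet_csk_reqsk_queue_prune(), where other connection-oriented protocols can share it. Its core is the retry threshold: once the SYN queue is more than half full, the allowed SYN-ACK retransmit count drops one step for every doubling by which old entries outnumber young ones, so stale embryonic connections are evicted first. Extracted as a pure function (argument names mirror the removed code):

```c
/* qlen is the whole SYN queue, qlen_young the entries that have not
 * yet had a SYN-ACK retransmitted, max_qlen_log the log2 of the queue
 * limit; logic lifted verbatim from the removed tcp_synack_timer(). */
static int synq_retry_thresh(unsigned int qlen, unsigned int qlen_young,
                             unsigned int max_qlen_log, int max_retries)
{
	int thresh = max_retries;

	if (qlen >> (max_qlen_log - 1)) {        /* queue over half full */
		unsigned int young = qlen_young << 1;

		while (thresh > 2) {
			if (qlen < young)        /* enough fresh entries */
				break;
			thresh--;                /* old ones clog: prune */
			young <<= 1;             /* next doubling        */
		}
	}
	return thresh;
}
```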
| @@ -560,15 +441,16 @@ void tcp_set_keepalive(struct sock *sk, int val) | |||
| 560 | return; | 441 | return; |
| 561 | 442 | ||
| 562 | if (val && !sock_flag(sk, SOCK_KEEPOPEN)) | 443 | if (val && !sock_flag(sk, SOCK_KEEPOPEN)) |
| 563 | tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); | 444 | inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); |
| 564 | else if (!val) | 445 | else if (!val) |
| 565 | tcp_delete_keepalive_timer(sk); | 446 | inet_csk_delete_keepalive_timer(sk); |
| 566 | } | 447 | } |
| 567 | 448 | ||
| 568 | 449 | ||
| 569 | static void tcp_keepalive_timer (unsigned long data) | 450 | static void tcp_keepalive_timer (unsigned long data) |
| 570 | { | 451 | { |
| 571 | struct sock *sk = (struct sock *) data; | 452 | struct sock *sk = (struct sock *) data; |
| 453 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 572 | struct tcp_sock *tp = tcp_sk(sk); | 454 | struct tcp_sock *tp = tcp_sk(sk); |
| 573 | __u32 elapsed; | 455 | __u32 elapsed; |
| 574 | 456 | ||
| @@ -576,7 +458,7 @@ static void tcp_keepalive_timer (unsigned long data) | |||
| 576 | bh_lock_sock(sk); | 458 | bh_lock_sock(sk); |
| 577 | if (sock_owned_by_user(sk)) { | 459 | if (sock_owned_by_user(sk)) { |
| 578 | /* Try again later. */ | 460 | /* Try again later. */ |
| 579 | tcp_reset_keepalive_timer (sk, HZ/20); | 461 | inet_csk_reset_keepalive_timer (sk, HZ/20); |
| 580 | goto out; | 462 | goto out; |
| 581 | } | 463 | } |
| 582 | 464 | ||
| @@ -587,7 +469,7 @@ static void tcp_keepalive_timer (unsigned long data) | |||
| 587 | 469 | ||
| 588 | if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { | 470 | if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { |
| 589 | if (tp->linger2 >= 0) { | 471 | if (tp->linger2 >= 0) { |
| 590 | int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN; | 472 | const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; |
| 591 | 473 | ||
| 592 | if (tmo > 0) { | 474 | if (tmo > 0) { |
| 593 | tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); | 475 | tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); |
| @@ -610,14 +492,14 @@ static void tcp_keepalive_timer (unsigned long data) | |||
| 610 | elapsed = tcp_time_stamp - tp->rcv_tstamp; | 492 | elapsed = tcp_time_stamp - tp->rcv_tstamp; |
| 611 | 493 | ||
| 612 | if (elapsed >= keepalive_time_when(tp)) { | 494 | if (elapsed >= keepalive_time_when(tp)) { |
| 613 | if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) || | 495 | if ((!tp->keepalive_probes && icsk->icsk_probes_out >= sysctl_tcp_keepalive_probes) || |
| 614 | (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) { | 496 | (tp->keepalive_probes && icsk->icsk_probes_out >= tp->keepalive_probes)) { |
| 615 | tcp_send_active_reset(sk, GFP_ATOMIC); | 497 | tcp_send_active_reset(sk, GFP_ATOMIC); |
| 616 | tcp_write_err(sk); | 498 | tcp_write_err(sk); |
| 617 | goto out; | 499 | goto out; |
| 618 | } | 500 | } |
| 619 | if (tcp_write_wakeup(sk) <= 0) { | 501 | if (tcp_write_wakeup(sk) <= 0) { |
| 620 | tp->probes_out++; | 502 | icsk->icsk_probes_out++; |
| 621 | elapsed = keepalive_intvl_when(tp); | 503 | elapsed = keepalive_intvl_when(tp); |
| 622 | } else { | 504 | } else { |
| 623 | /* If keepalive was lost due to local congestion, | 505 | /* If keepalive was lost due to local congestion, |
| @@ -634,7 +516,7 @@ static void tcp_keepalive_timer (unsigned long data) | |||
| 634 | sk_stream_mem_reclaim(sk); | 516 | sk_stream_mem_reclaim(sk); |
| 635 | 517 | ||
| 636 | resched: | 518 | resched: |
| 637 | tcp_reset_keepalive_timer (sk, elapsed); | 519 | inet_csk_reset_keepalive_timer (sk, elapsed); |
| 638 | goto out; | 520 | goto out; |
| 639 | 521 | ||
| 640 | death: | 522 | death: |
| @@ -644,8 +526,3 @@ out: | |||
| 644 | bh_unlock_sock(sk); | 526 | bh_unlock_sock(sk); |
| 645 | sock_put(sk); | 527 | sock_put(sk); |
| 646 | } | 528 | } |
| 647 | |||
| 648 | EXPORT_SYMBOL(tcp_clear_xmit_timers); | ||
| 649 | EXPORT_SYMBOL(tcp_delete_keepalive_timer); | ||
| 650 | EXPORT_SYMBOL(tcp_init_xmit_timers); | ||
| 651 | EXPORT_SYMBOL(tcp_reset_keepalive_timer); | ||
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 9bd443db5193..93c5f92070f9 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c | |||
| @@ -35,7 +35,7 @@ | |||
| 35 | #include <linux/mm.h> | 35 | #include <linux/mm.h> |
| 36 | #include <linux/module.h> | 36 | #include <linux/module.h> |
| 37 | #include <linux/skbuff.h> | 37 | #include <linux/skbuff.h> |
| 38 | #include <linux/tcp_diag.h> | 38 | #include <linux/inet_diag.h> |
| 39 | 39 | ||
| 40 | #include <net/tcp.h> | 40 | #include <net/tcp.h> |
| 41 | 41 | ||
| @@ -82,9 +82,10 @@ struct vegas { | |||
| 82 | * Instead we must wait until the completion of an RTT during | 82 | * Instead we must wait until the completion of an RTT during |
| 83 | * which we actually receive ACKs. | 83 | * which we actually receive ACKs. |
| 84 | */ | 84 | */ |
| 85 | static inline void vegas_enable(struct tcp_sock *tp) | 85 | static inline void vegas_enable(struct sock *sk) |
| 86 | { | 86 | { |
| 87 | struct vegas *vegas = tcp_ca(tp); | 87 | const struct tcp_sock *tp = tcp_sk(sk); |
| 88 | struct vegas *vegas = inet_csk_ca(sk); | ||
| 88 | 89 | ||
| 89 | /* Begin taking Vegas samples next time we send something. */ | 90 | /* Begin taking Vegas samples next time we send something. */ |
| 90 | vegas->doing_vegas_now = 1; | 91 | vegas->doing_vegas_now = 1; |
| @@ -97,19 +98,19 @@ static inline void vegas_enable(struct tcp_sock *tp) | |||
| 97 | } | 98 | } |
| 98 | 99 | ||
| 99 | /* Stop taking Vegas samples for now. */ | 100 | /* Stop taking Vegas samples for now. */ |
| 100 | static inline void vegas_disable(struct tcp_sock *tp) | 101 | static inline void vegas_disable(struct sock *sk) |
| 101 | { | 102 | { |
| 102 | struct vegas *vegas = tcp_ca(tp); | 103 | struct vegas *vegas = inet_csk_ca(sk); |
| 103 | 104 | ||
| 104 | vegas->doing_vegas_now = 0; | 105 | vegas->doing_vegas_now = 0; |
| 105 | } | 106 | } |
| 106 | 107 | ||
| 107 | static void tcp_vegas_init(struct tcp_sock *tp) | 108 | static void tcp_vegas_init(struct sock *sk) |
| 108 | { | 109 | { |
| 109 | struct vegas *vegas = tcp_ca(tp); | 110 | struct vegas *vegas = inet_csk_ca(sk); |
| 110 | 111 | ||
| 111 | vegas->baseRTT = 0x7fffffff; | 112 | vegas->baseRTT = 0x7fffffff; |
| 112 | vegas_enable(tp); | 113 | vegas_enable(sk); |
| 113 | } | 114 | } |
| 114 | 115 | ||
| 115 | /* Do RTT sampling needed for Vegas. | 116 | /* Do RTT sampling needed for Vegas. |
| @@ -120,9 +121,9 @@ static void tcp_vegas_init(struct tcp_sock *tp) | |||
| 120 | * o min-filter RTT samples from a much longer window (forever for now) | 121 | * o min-filter RTT samples from a much longer window (forever for now) |
| 121 | * to find the propagation delay (baseRTT) | 122 | * to find the propagation delay (baseRTT) |
| 122 | */ | 123 | */ |
| 123 | static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt) | 124 | static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt) |
| 124 | { | 125 | { |
| 125 | struct vegas *vegas = tcp_ca(tp); | 126 | struct vegas *vegas = inet_csk_ca(sk); |
| 126 | u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ | 127 | u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ |
| 127 | 128 | ||
| 128 | /* Filter to find propagation delay: */ | 129 | /* Filter to find propagation delay: */ |
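tcp_vegas_rtt_calc() keeps Vegas' two min-filters while the private state moves behind inet_csk_ca(): baseRTT is the minimum over the whole connection (the propagation delay) and minRTT the minimum within the current RTT (propagation plus queueing). The sampling step, reduced to the fields involved (baseRTT starts at 0x7fffffff per tcp_vegas_init, and minRTT is re-armed to the same value each RTT by the cong_avoid code):

```c
#include <stdint.h>

struct vegas_rtt {
	uint32_t baseRTT; /* lifetime min: propagation delay     */
	uint32_t minRTT;  /* per-RTT min: propagation + queueing */
	uint32_t cntRTT;  /* samples seen this RTT               */
};

static void vegas_rtt_sample(struct vegas_rtt *v, uint32_t usrtt)
{
	uint32_t vrtt = usrtt + 1;  /* never allow a zero RTT */

	if (vrtt < v->baseRTT)      /* min-filter over the connection */
		v->baseRTT = vrtt;
	if (vrtt < v->minRTT)       /* min-filter over this RTT       */
		v->minRTT = vrtt;
	v->cntRTT++;
}
```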
| @@ -136,13 +137,13 @@ static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt) | |||
| 136 | vegas->cntRTT++; | 137 | vegas->cntRTT++; |
| 137 | } | 138 | } |
| 138 | 139 | ||
| 139 | static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state) | 140 | static void tcp_vegas_state(struct sock *sk, u8 ca_state) |
| 140 | { | 141 | { |
| 141 | 142 | ||
| 142 | if (ca_state == TCP_CA_Open) | 143 | if (ca_state == TCP_CA_Open) |
| 143 | vegas_enable(tp); | 144 | vegas_enable(sk); |
| 144 | else | 145 | else |
| 145 | vegas_disable(tp); | 146 | vegas_disable(sk); |
| 146 | } | 147 | } |
| 147 | 148 | ||
| 148 | /* | 149 | /* |
| @@ -154,20 +155,21 @@ static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state) | |||
| 154 | * packets, _then_ we can make Vegas calculations | 155 | * packets, _then_ we can make Vegas calculations |
| 155 | * again. | 156 | * again. |
| 156 | */ | 157 | */ |
| 157 | static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event) | 158 | static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) |
| 158 | { | 159 | { |
| 159 | if (event == CA_EVENT_CWND_RESTART || | 160 | if (event == CA_EVENT_CWND_RESTART || |
| 160 | event == CA_EVENT_TX_START) | 161 | event == CA_EVENT_TX_START) |
| 161 | tcp_vegas_init(tp); | 162 | tcp_vegas_init(sk); |
| 162 | } | 163 | } |
| 163 | 164 | ||
| 164 | static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, | 165 | static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, |
| 165 | u32 seq_rtt, u32 in_flight, int flag) | 166 | u32 seq_rtt, u32 in_flight, int flag) |
| 166 | { | 167 | { |
| 167 | struct vegas *vegas = tcp_ca(tp); | 168 | struct tcp_sock *tp = tcp_sk(sk); |
| 169 | struct vegas *vegas = inet_csk_ca(sk); | ||
| 168 | 170 | ||
| 169 | if (!vegas->doing_vegas_now) | 171 | if (!vegas->doing_vegas_now) |
| 170 | return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag); | 172 | return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); |
| 171 | 173 | ||
| 172 | /* The key players are v_beg_snd_una and v_beg_snd_nxt. | 174 | /* The key players are v_beg_snd_una and v_beg_snd_nxt. |
| 173 | * | 175 | * |
| @@ -219,7 +221,7 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, | |||
| 219 | * but that's not too awful, since we're taking the min, | 221 | * but that's not too awful, since we're taking the min, |
| 220 | * rather than averaging. | 222 | * rather than averaging. |
| 221 | */ | 223 | */ |
| 222 | tcp_vegas_rtt_calc(tp, seq_rtt*1000); | 224 | tcp_vegas_rtt_calc(sk, seq_rtt * 1000); |
| 223 | 225 | ||
| 224 | /* We do the Vegas calculations only if we got enough RTT | 226 | /* We do the Vegas calculations only if we got enough RTT |
| 225 | * samples that we can be reasonably sure that we got | 227 | * samples that we can be reasonably sure that we got |
| @@ -359,14 +361,14 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, | |||
| 359 | } | 361 | } |
| 360 | 362 | ||
| 361 | /* Extract info for Tcp socket info provided via netlink. */ | 363 | /* Extract info for Tcp socket info provided via netlink. */ |
| 362 | static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext, | 364 | static void tcp_vegas_get_info(struct sock *sk, u32 ext, |
| 363 | struct sk_buff *skb) | 365 | struct sk_buff *skb) |
| 364 | { | 366 | { |
| 365 | const struct vegas *ca = tcp_ca(tp); | 367 | const struct vegas *ca = inet_csk_ca(sk); |
| 366 | if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { | 368 | if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { |
| 367 | struct tcpvegas_info *info; | 369 | struct tcpvegas_info *info; |
| 368 | 370 | ||
| 369 | info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO, | 371 | info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO, |
| 370 | sizeof(*info))); | 372 | sizeof(*info))); |
| 371 | 373 | ||
| 372 | info->tcpv_enabled = ca->doing_vegas_now; | 374 | info->tcpv_enabled = ca->doing_vegas_now; |
| @@ -393,7 +395,7 @@ static struct tcp_congestion_ops tcp_vegas = { | |||
| 393 | 395 | ||
| 394 | static int __init tcp_vegas_register(void) | 396 | static int __init tcp_vegas_register(void) |
| 395 | { | 397 | { |
| 396 | BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE); | 398 | BUG_ON(sizeof(struct vegas) > ICSK_CA_PRIV_SIZE); |
| 397 | tcp_register_congestion_control(&tcp_vegas); | 399 | tcp_register_congestion_control(&tcp_vegas); |
| 398 | return 0; | 400 | return 0; |
| 399 | } | 401 | } |
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index ef827242c940..0c340c3756c2 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c | |||
| @@ -8,7 +8,7 @@ | |||
| 8 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
| 9 | #include <linux/module.h> | 9 | #include <linux/module.h> |
| 10 | #include <linux/skbuff.h> | 10 | #include <linux/skbuff.h> |
| 11 | #include <linux/tcp_diag.h> | 11 | #include <linux/inet_diag.h> |
| 12 | #include <net/tcp.h> | 12 | #include <net/tcp.h> |
| 13 | 13 | ||
| 14 | /* TCP Westwood structure */ | 14 | /* TCP Westwood structure */ |
| @@ -40,9 +40,9 @@ struct westwood { | |||
| 40 | * way as soon as possible. It will reasonably happen within the first | 40 | * way as soon as possible. It will reasonably happen within the first |
| 41 | * RTT period of the connection lifetime. | 41 | * RTT period of the connection lifetime. |
| 42 | */ | 42 | */ |
| 43 | static void tcp_westwood_init(struct tcp_sock *tp) | 43 | static void tcp_westwood_init(struct sock *sk) |
| 44 | { | 44 | { |
| 45 | struct westwood *w = tcp_ca(tp); | 45 | struct westwood *w = inet_csk_ca(sk); |
| 46 | 46 | ||
| 47 | w->bk = 0; | 47 | w->bk = 0; |
| 48 | w->bw_ns_est = 0; | 48 | w->bw_ns_est = 0; |
| @@ -51,7 +51,7 @@ static void tcp_westwood_init(struct tcp_sock *tp) | |||
| 51 | w->cumul_ack = 0; | 51 | w->cumul_ack = 0; |
| 52 | w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; | 52 | w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; |
| 53 | w->rtt_win_sx = tcp_time_stamp; | 53 | w->rtt_win_sx = tcp_time_stamp; |
| 54 | w->snd_una = tp->snd_una; | 54 | w->snd_una = tcp_sk(sk)->snd_una; |
| 55 | } | 55 | } |
| 56 | 56 | ||
| 57 | /* | 57 | /* |
| @@ -74,11 +74,11 @@ static inline void westwood_filter(struct westwood *w, u32 delta) | |||
| 74 | * Called after processing group of packets. | 74 | * Called after processing group of packets. |
| 75 | * but all westwood needs is the last sample of srtt. | 75 | * but all westwood needs is the last sample of srtt. |
| 76 | */ | 76 | */ |
| 77 | static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt) | 77 | static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt) |
| 78 | { | 78 | { |
| 79 | struct westwood *w = tcp_ca(tp); | 79 | struct westwood *w = inet_csk_ca(sk); |
| 80 | if (cnt > 0) | 80 | if (cnt > 0) |
| 81 | w->rtt = tp->srtt >> 3; | 81 | w->rtt = tcp_sk(sk)->srtt >> 3; |
| 82 | } | 82 | } |
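The >> 3 in pkts_acked reflects a long-standing kernel convention: tp->srtt holds the smoothed RTT scaled by 8 (units of 1/8 jiffy), so the 1/8-gain EWMA can be computed in pure integer arithmetic; shifting right by 3 recovers whole jiffies, which is all Westwood needs here. A runnable illustration of the convention, with made-up samples:

    #include <stdio.h>

    /* tp->srtt keeps the smoothed RTT scaled by 8 so the kernel's
     * EWMA (gain 1/8) needs no floating point; ">> 3" unscales it. */
    int main(void)
    {
        unsigned srtt = 0;                           /* scaled by 8 */
        unsigned samples[] = { 100, 120, 80, 110 };  /* jiffies */

        for (int i = 0; i < 4; i++) {
            unsigned m = samples[i];
            if (!srtt)
                srtt = m << 3;            /* first sample seeds it */
            else
                srtt += m - (srtt >> 3);  /* srtt = 7/8 srtt + 1/8 m */
            printf("sample %3u -> srtt %u jiffies\n", m, srtt >> 3);
        }
        return 0;
    }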
| 83 | 83 | ||
| 84 | /* | 84 | /* |
| @@ -86,9 +86,9 @@ static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt) | |||
| 86 | * It updates RTT evaluation window if it is the right moment to do | 86 | * It updates RTT evaluation window if it is the right moment to do |
| 87 | * it. If so it calls filter for evaluating bandwidth. | 87 | * it. If so it calls filter for evaluating bandwidth. |
| 88 | */ | 88 | */ |
| 89 | static void westwood_update_window(struct tcp_sock *tp) | 89 | static void westwood_update_window(struct sock *sk) |
| 90 | { | 90 | { |
| 91 | struct westwood *w = tcp_ca(tp); | 91 | struct westwood *w = inet_csk_ca(sk); |
| 92 | s32 delta = tcp_time_stamp - w->rtt_win_sx; | 92 | s32 delta = tcp_time_stamp - w->rtt_win_sx; |
| 93 | 93 | ||
| 94 | /* | 94 | /* |
| @@ -114,11 +114,12 @@ static void westwood_update_window(struct tcp_sock *tp) | |||
| 114 | * header prediction is successful. In such case in fact update is | 114 | * header prediction is successful. In such case in fact update is |
| 115 | * straight forward and doesn't need any particular care. | 115 | * straight forward and doesn't need any particular care. |
| 116 | */ | 116 | */ |
| 117 | static inline void westwood_fast_bw(struct tcp_sock *tp) | 117 | static inline void westwood_fast_bw(struct sock *sk) |
| 118 | { | 118 | { |
| 119 | struct westwood *w = tcp_ca(tp); | 119 | const struct tcp_sock *tp = tcp_sk(sk); |
| 120 | struct westwood *w = inet_csk_ca(sk); | ||
| 120 | 121 | ||
| 121 | westwood_update_window(tp); | 122 | westwood_update_window(sk); |
| 122 | 123 | ||
| 123 | w->bk += tp->snd_una - w->snd_una; | 124 | w->bk += tp->snd_una - w->snd_una; |
| 124 | w->snd_una = tp->snd_una; | 125 | w->snd_una = tp->snd_una; |
| @@ -130,9 +131,10 @@ static inline void westwood_fast_bw(struct tcp_sock *tp) | |||
| 130 | * This function evaluates cumul_ack for evaluating bk in case of | 131 | * This function evaluates cumul_ack for evaluating bk in case of |
| 131 | * delayed or partial acks. | 132 | * delayed or partial acks. |
| 132 | */ | 133 | */ |
| 133 | static inline u32 westwood_acked_count(struct tcp_sock *tp) | 134 | static inline u32 westwood_acked_count(struct sock *sk) |
| 134 | { | 135 | { |
| 135 | struct westwood *w = tcp_ca(tp); | 136 | const struct tcp_sock *tp = tcp_sk(sk); |
| 137 | struct westwood *w = inet_csk_ca(sk); | ||
| 136 | 138 | ||
| 137 | w->cumul_ack = tp->snd_una - w->snd_una; | 139 | w->cumul_ack = tp->snd_una - w->snd_una; |
| 138 | 140 | ||
| @@ -160,9 +162,10 @@ static inline u32 westwood_acked_count(struct tcp_sock *tp) | |||
| 160 | return w->cumul_ack; | 162 | return w->cumul_ack; |
| 161 | } | 163 | } |
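westwood_fast_bw() and westwood_acked_count() exist to keep one number honest: w->bk, the bytes acknowledged since the current RTT window opened (hence the bk += tp->snd_una - w->snd_una bookkeeping in both paths). Once per RTT, westwood_update_window() turns bk/delta into a raw bandwidth sample and runs it through westwood_filter(). The filter body is not shown in these hunks; the sketch below uses the 7/8 low-pass gain, cascaded twice, that I believe the kernel applies, but treat the constants as an assumption:

    #include <stdio.h>

    /* Model of Westwood's bandwidth estimator: a raw sample
     * (bytes acked per RTT window) feeds two cascaded low-pass
     * filters, new = (7*old + sample) / 8.  Values are made up. */
    static unsigned lowpass(unsigned old, unsigned sample)
    {
        return (7 * old + sample) >> 3;
    }

    int main(void)
    {
        unsigned bw_ns_est = 0, bw_est = 0;
        unsigned acked[] = { 30000, 45000, 60000, 60000 };  /* bytes/RTT */
        unsigned delta = 100;                 /* window length, jiffies */

        for (int i = 0; i < 4; i++) {
            unsigned sample = acked[i] / delta;        /* bytes/jiffy */
            bw_ns_est = lowpass(bw_ns_est, sample);    /* noisy stage */
            bw_est    = lowpass(bw_est, bw_ns_est);    /* smooth stage */
            printf("rtt %d: sample %3u -> bw_est %3u bytes/jiffy\n",
                   i, sample, bw_est);
        }
        return 0;
    }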
| 162 | 164 | ||
| 163 | static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp) | 165 | static inline u32 westwood_bw_rttmin(const struct sock *sk) |
| 164 | { | 166 | { |
| 165 | struct westwood *w = tcp_ca(tp); | 167 | const struct tcp_sock *tp = tcp_sk(sk); |
| 168 | const struct westwood *w = inet_csk_ca(sk); | ||
| 166 | return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); | 169 | return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); |
| 167 | } | 170 | } |
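westwood_bw_rttmin() is the payoff of the estimator: bandwidth times the minimum RTT is the pipe's bandwidth-delay product in bytes, and dividing by tp->mss_cache converts it to a congestion window in packets, floored at 2 so the connection can always keep an ACK clock running. Worked through with illustrative numbers:

    #include <stdio.h>

    /* Worked instance of the westwood_bw_rttmin() arithmetic shown
     * above; the inputs are made up. */
    int main(void)
    {
        unsigned bw_est  = 600;    /* estimated bytes per jiffy */
        unsigned rtt_min = 20;     /* smallest RTT seen, jiffies */
        unsigned mss     = 1460;   /* bytes, tp->mss_cache */

        unsigned cwnd = (bw_est * rtt_min) / mss;   /* 12000/1460 = 8 */
        if (cwnd < 2)
            cwnd = 2;              /* the max_t(..., 2) floor */

        printf("cwnd after loss: %u packets\n", cwnd);
        return 0;
    }

This is the value the event handler below installs as snd_cwnd and snd_ssthresh on CA_EVENT_COMPLETE_CWR, and as snd_ssthresh alone on CA_EVENT_FRTO.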
| 168 | 171 | ||
| @@ -172,31 +175,32 @@ static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp) | |||
| 172 | * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 | 175 | * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 |
| 173 | * so avoids ever returning 0. | 176 | * so avoids ever returning 0. |
| 174 | */ | 177 | */ |
| 175 | static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp) | 178 | static u32 tcp_westwood_cwnd_min(struct sock *sk) |
| 176 | { | 179 | { |
| 177 | return westwood_bw_rttmin(tp); | 180 | return westwood_bw_rttmin(sk); |
| 178 | } | 181 | } |
| 179 | 182 | ||
| 180 | static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event) | 183 | static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) |
| 181 | { | 184 | { |
| 182 | struct westwood *w = tcp_ca(tp); | 185 | struct tcp_sock *tp = tcp_sk(sk); |
| 186 | struct westwood *w = inet_csk_ca(sk); | ||
| 183 | 187 | ||
| 184 | switch(event) { | 188 | switch(event) { |
| 185 | case CA_EVENT_FAST_ACK: | 189 | case CA_EVENT_FAST_ACK: |
| 186 | westwood_fast_bw(tp); | 190 | westwood_fast_bw(sk); |
| 187 | break; | 191 | break; |
| 188 | 192 | ||
| 189 | case CA_EVENT_COMPLETE_CWR: | 193 | case CA_EVENT_COMPLETE_CWR: |
| 190 | tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp); | 194 | tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk); |
| 191 | break; | 195 | break; |
| 192 | 196 | ||
| 193 | case CA_EVENT_FRTO: | 197 | case CA_EVENT_FRTO: |
| 194 | tp->snd_ssthresh = westwood_bw_rttmin(tp); | 198 | tp->snd_ssthresh = westwood_bw_rttmin(sk); |
| 195 | break; | 199 | break; |
| 196 | 200 | ||
| 197 | case CA_EVENT_SLOW_ACK: | 201 | case CA_EVENT_SLOW_ACK: |
| 198 | westwood_update_window(tp); | 202 | westwood_update_window(sk); |
| 199 | w->bk += westwood_acked_count(tp); | 203 | w->bk += westwood_acked_count(sk); |
| 200 | w->rtt_min = min(w->rtt, w->rtt_min); | 204 | w->rtt_min = min(w->rtt, w->rtt_min); |
| 201 | break; | 205 | break; |
| 202 | 206 | ||
| @@ -208,15 +212,15 @@ static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event) | |||
| 208 | 212 | ||
| 209 | 213 | ||
| 210 | /* Extract info for Tcp socket info provided via netlink. */ | 214 | /* Extract info for Tcp socket info provided via netlink. */ |
| 211 | static void tcp_westwood_info(struct tcp_sock *tp, u32 ext, | 215 | static void tcp_westwood_info(struct sock *sk, u32 ext, |
| 212 | struct sk_buff *skb) | 216 | struct sk_buff *skb) |
| 213 | { | 217 | { |
| 214 | const struct westwood *ca = tcp_ca(tp); | 218 | const struct westwood *ca = inet_csk_ca(sk); |
| 215 | if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { | 219 | if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { |
| 216 | struct rtattr *rta; | 220 | struct rtattr *rta; |
| 217 | struct tcpvegas_info *info; | 221 | struct tcpvegas_info *info; |
| 218 | 222 | ||
| 219 | rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info)); | 223 | rta = __RTA_PUT(skb, INET_DIAG_VEGASINFO, sizeof(*info)); |
| 220 | info = RTA_DATA(rta); | 224 | info = RTA_DATA(rta); |
| 221 | info->tcpv_enabled = 1; | 225 | info->tcpv_enabled = 1; |
| 222 | info->tcpv_rttcnt = 0; | 226 | info->tcpv_rttcnt = 0; |
| @@ -242,7 +246,7 @@ static struct tcp_congestion_ops tcp_westwood = { | |||
| 242 | 246 | ||
| 243 | static int __init tcp_westwood_register(void) | 247 | static int __init tcp_westwood_register(void) |
| 244 | { | 248 | { |
| 245 | BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE); | 249 | BUG_ON(sizeof(struct westwood) > ICSK_CA_PRIV_SIZE); |
| 246 | return tcp_register_congestion_control(&tcp_westwood); | 250 | return tcp_register_congestion_control(&tcp_westwood); |
| 247 | } | 251 | } |
| 248 | 252 | ||
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index dc4d07357e3a..e5beca7de86c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
| @@ -95,7 +95,8 @@ | |||
| 95 | #include <linux/ipv6.h> | 95 | #include <linux/ipv6.h> |
| 96 | #include <linux/netdevice.h> | 96 | #include <linux/netdevice.h> |
| 97 | #include <net/snmp.h> | 97 | #include <net/snmp.h> |
| 98 | #include <net/tcp.h> | 98 | #include <net/ip.h> |
| 99 | #include <net/tcp_states.h> | ||
| 99 | #include <net/protocol.h> | 100 | #include <net/protocol.h> |
| 100 | #include <linux/skbuff.h> | 101 | #include <linux/skbuff.h> |
| 101 | #include <linux/proc_fs.h> | 102 | #include <linux/proc_fs.h> |
| @@ -112,7 +113,7 @@ | |||
| 112 | * Snmp MIB for the UDP layer | 113 | * Snmp MIB for the UDP layer |
| 113 | */ | 114 | */ |
| 114 | 115 | ||
| 115 | DEFINE_SNMP_STAT(struct udp_mib, udp_statistics); | 116 | DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly; |
| 116 | 117 | ||
| 117 | struct hlist_head udp_hash[UDP_HTABLE_SIZE]; | 118 | struct hlist_head udp_hash[UDP_HTABLE_SIZE]; |
| 118 | DEFINE_RWLOCK(udp_hash_lock); | 119 | DEFINE_RWLOCK(udp_hash_lock); |
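The new __read_mostly annotation on the UDP statistics is a cache-layout hint: it moves a symbol that is read on every packet but written rarely into a dedicated linker section, so it does not share cache lines with hot, frequently written globals and bounce between CPUs. Roughly what the macro amounts to (the exact section name has varied across kernel versions, so treat this as a sketch with illustrative names):

    #include <stdio.h>

    /* Sketch of the idea behind __read_mostly: a section attribute
     * that groups rarely-written data together. */
    #define demo_read_mostly __attribute__((__section__(".data.read_mostly")))

    static unsigned hash_size demo_read_mostly = 128;  /* read hot, written once */

    int main(void)
    {
        printf("hash_size = %u\n", hash_size);
        return 0;
    }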
| @@ -628,7 +629,7 @@ back_from_confirm: | |||
| 628 | /* ... which is an evident application bug. --ANK */ | 629 | /* ... which is an evident application bug. --ANK */ |
| 629 | release_sock(sk); | 630 | release_sock(sk); |
| 630 | 631 | ||
| 631 | LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 2\n")); | 632 | LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n"); |
| 632 | err = -EINVAL; | 633 | err = -EINVAL; |
| 633 | goto out; | 634 | goto out; |
| 634 | } | 635 | } |
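This and the following udp.c hunks all track one interface change: LIMIT_NETDEBUG used to take a complete statement, so callers wrote LIMIT_NETDEBUG(printk(KERN_DEBUG ...)); it now takes printk-style arguments directly and performs the rate-limited printk itself, which is why each call site sheds the inner printk() and one level of parentheses. A userspace sketch of a macro with the new calling convention (the real one rate-limits via net_ratelimit(); this model simply allows one line per second, and LIMIT_DEBUG is a hypothetical name):

    #include <stdio.h>
    #include <time.h>

    /* Rate-limited debug macro taking printf-style arguments, in
     * the gcc-extension varargs style the kernel uses. */
    #define LIMIT_DEBUG(fmt, ...)                          \
        do {                                               \
            static time_t last;                            \
            time_t now = time(NULL);                       \
            if (now != last) {                             \
                last = now;                                \
                fprintf(stderr, fmt, ##__VA_ARGS__);       \
            }                                              \
        } while (0)

    int main(void)
    {
        for (int i = 0; i < 1000; i++)
            LIMIT_DEBUG("udp cork app bug %d\n", 2);  /* prints once */
        return 0;
    }

The change is cosmetic at the call sites but keeps the rate-limit policy in one place instead of repeated at every caller.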
| @@ -693,7 +694,7 @@ static int udp_sendpage(struct sock *sk, struct page *page, int offset, | |||
| 693 | if (unlikely(!up->pending)) { | 694 | if (unlikely(!up->pending)) { |
| 694 | release_sock(sk); | 695 | release_sock(sk); |
| 695 | 696 | ||
| 696 | LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 3\n")); | 697 | LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n"); |
| 697 | return -EINVAL; | 698 | return -EINVAL; |
| 698 | } | 699 | } |
| 699 | 700 | ||
| @@ -1102,7 +1103,7 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, | |||
| 1102 | skb->ip_summed = CHECKSUM_UNNECESSARY; | 1103 | skb->ip_summed = CHECKSUM_UNNECESSARY; |
| 1103 | if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) | 1104 | if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) |
| 1104 | return 0; | 1105 | return 0; |
| 1105 | LIMIT_NETDEBUG(printk(KERN_DEBUG "udp v4 hw csum failure.\n")); | 1106 | LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n"); |
| 1106 | skb->ip_summed = CHECKSUM_NONE; | 1107 | skb->ip_summed = CHECKSUM_NONE; |
| 1107 | } | 1108 | } |
| 1108 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) | 1109 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) |
| @@ -1181,13 +1182,13 @@ int udp_rcv(struct sk_buff *skb) | |||
| 1181 | return(0); | 1182 | return(0); |
| 1182 | 1183 | ||
| 1183 | short_packet: | 1184 | short_packet: |
| 1184 | LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", | 1185 | LIMIT_NETDEBUG(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", |
| 1185 | NIPQUAD(saddr), | 1186 | NIPQUAD(saddr), |
| 1186 | ntohs(uh->source), | 1187 | ntohs(uh->source), |
| 1187 | ulen, | 1188 | ulen, |
| 1188 | len, | 1189 | len, |
| 1189 | NIPQUAD(daddr), | 1190 | NIPQUAD(daddr), |
| 1190 | ntohs(uh->dest))); | 1191 | ntohs(uh->dest)); |
| 1191 | no_header: | 1192 | no_header: |
| 1192 | UDP_INC_STATS_BH(UDP_MIB_INERRORS); | 1193 | UDP_INC_STATS_BH(UDP_MIB_INERRORS); |
| 1193 | kfree_skb(skb); | 1194 | kfree_skb(skb); |
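The %u.%u.%u.%u format strings above pair with NIPQUAD(), which splats a network-byte-order IPv4 address into four byte-valued printf arguments. The kernel of this era defined it essentially as below; the surrounding demo is userspace scaffolding:

    #include <stdio.h>
    #include <arpa/inet.h>

    /* NIPQUAD(): four byte-sized arguments for one "%u.%u.%u.%u"
     * format, read straight out of the address in memory order. */
    #define NIPQUAD(addr)                \
        ((unsigned char *)&(addr))[0],   \
        ((unsigned char *)&(addr))[1],   \
        ((unsigned char *)&(addr))[2],   \
        ((unsigned char *)&(addr))[3]

    int main(void)
    {
        unsigned int saddr = htonl(0xc0a80001);   /* 192.168.0.1 */

        printf("From %u.%u.%u.%u\n", NIPQUAD(saddr));
        return 0;
    }

Because the macro reads the address byte by byte, the printed dotted quad is correct regardless of host endianness.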
| @@ -1198,12 +1199,12 @@ csum_error: | |||
| 1198 | * RFC1122: OK. Discards the bad packet silently (as far as | 1199 | * RFC1122: OK. Discards the bad packet silently (as far as |
| 1199 | * the network is concerned, anyway) as per 4.1.3.4 (MUST). | 1200 | * the network is concerned, anyway) as per 4.1.3.4 (MUST). |
| 1200 | */ | 1201 | */ |
| 1201 | LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", | 1202 | LIMIT_NETDEBUG(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", |
| 1202 | NIPQUAD(saddr), | 1203 | NIPQUAD(saddr), |
| 1203 | ntohs(uh->source), | 1204 | ntohs(uh->source), |
| 1204 | NIPQUAD(daddr), | 1205 | NIPQUAD(daddr), |
| 1205 | ntohs(uh->dest), | 1206 | ntohs(uh->dest), |
| 1206 | ulen)); | 1207 | ulen); |
| 1207 | drop: | 1208 | drop: |
| 1208 | UDP_INC_STATS_BH(UDP_MIB_INERRORS); | 1209 | UDP_INC_STATS_BH(UDP_MIB_INERRORS); |
| 1209 | kfree_skb(skb); | 1210 | kfree_skb(skb); |
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 050611d7a967..d23e07fc81fa 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c | |||
| @@ -128,8 +128,10 @@ void __init xfrm4_state_init(void) | |||
| 128 | xfrm_state_register_afinfo(&xfrm4_state_afinfo); | 128 | xfrm_state_register_afinfo(&xfrm4_state_afinfo); |
| 129 | } | 129 | } |
| 130 | 130 | ||
| 131 | #if 0 | ||
| 131 | void __exit xfrm4_state_fini(void) | 132 | void __exit xfrm4_state_fini(void) |
| 132 | { | 133 | { |
| 133 | xfrm_state_unregister_afinfo(&xfrm4_state_afinfo); | 134 | xfrm_state_unregister_afinfo(&xfrm4_state_afinfo); |
| 134 | } | 135 | } |
| 136 | #endif /* 0 */ | ||
| 135 | 137 | ||
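Rather than deleting the now-unreferenced teardown path, this last hunk compiles it out: everything between #if 0 and #endif is dropped by the preprocessor in every configuration but stays in the tree, and the /* 0 */ comment labels the matching #endif. The pattern in miniature, with a hypothetical function name:

    #include <stdio.h>

    /* Code between #if 0 / #endif never reaches the compiler but
     * remains visible to readers and grep. */
    #if 0
    void unused_teardown(void)
    {
    }
    #endif /* 0 */

    int main(void)
    {
        puts("built without the teardown path");
        return 0;
    }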
