diff options
author | Glenn Elliott <gelliott@cs.unc.edu> | 2012-03-04 19:47:13 -0500 |
---|---|---|
committer | Glenn Elliott <gelliott@cs.unc.edu> | 2012-03-04 19:47:13 -0500 |
commit | c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch) | |
tree | ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /net/ipv4 | |
parent | ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff) | |
parent | 6a00f206debf8a5c8899055726ad127dbeeed098 (diff) |
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts:
litmus/sched_cedf.c
Diffstat (limited to 'net/ipv4')
96 files changed, 6506 insertions, 5512 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 7cd7760144f7..cbb505ba9324 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig | |||
@@ -55,45 +55,9 @@ config IP_ADVANCED_ROUTER | |||
55 | 55 | ||
56 | If unsure, say N here. | 56 | If unsure, say N here. |
57 | 57 | ||
58 | choice | ||
59 | prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)" | ||
60 | depends on IP_ADVANCED_ROUTER | ||
61 | default ASK_IP_FIB_HASH | ||
62 | |||
63 | config ASK_IP_FIB_HASH | ||
64 | bool "FIB_HASH" | ||
65 | ---help--- | ||
66 | Current FIB is very proven and good enough for most users. | ||
67 | |||
68 | config IP_FIB_TRIE | ||
69 | bool "FIB_TRIE" | ||
70 | ---help--- | ||
71 | Use new experimental LC-trie as FIB lookup algorithm. | ||
72 | This improves lookup performance if you have a large | ||
73 | number of routes. | ||
74 | |||
75 | LC-trie is a longest matching prefix lookup algorithm which | ||
76 | performs better than FIB_HASH for large routing tables. | ||
77 | But, it consumes more memory and is more complex. | ||
78 | |||
79 | LC-trie is described in: | ||
80 | |||
81 | IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson | ||
82 | IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, | ||
83 | June 1999 | ||
84 | |||
85 | An experimental study of compression methods for dynamic tries | ||
86 | Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. | ||
87 | http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ | ||
88 | |||
89 | endchoice | ||
90 | |||
91 | config IP_FIB_HASH | ||
92 | def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER | ||
93 | |||
94 | config IP_FIB_TRIE_STATS | 58 | config IP_FIB_TRIE_STATS |
95 | bool "FIB TRIE statistics" | 59 | bool "FIB TRIE statistics" |
96 | depends on IP_FIB_TRIE | 60 | depends on IP_ADVANCED_ROUTER |
97 | ---help--- | 61 | ---help--- |
98 | Keep track of statistics on structure of FIB TRIE table. | 62 | Keep track of statistics on structure of FIB TRIE table. |
99 | Useful for testing and measuring TRIE performance. | 63 | Useful for testing and measuring TRIE performance. |
@@ -140,6 +104,9 @@ config IP_ROUTE_VERBOSE | |||
140 | handled by the klogd daemon which is responsible for kernel messages | 104 | handled by the klogd daemon which is responsible for kernel messages |
141 | ("man klogd"). | 105 | ("man klogd"). |
142 | 106 | ||
107 | config IP_ROUTE_CLASSID | ||
108 | bool | ||
109 | |||
143 | config IP_PNP | 110 | config IP_PNP |
144 | bool "IP: kernel level autoconfiguration" | 111 | bool "IP: kernel level autoconfiguration" |
145 | help | 112 | help |
@@ -215,9 +182,15 @@ config NET_IPIP | |||
215 | be inserted in and removed from the running kernel whenever you | 182 | be inserted in and removed from the running kernel whenever you |
216 | want). Most people won't need this and can say N. | 183 | want). Most people won't need this and can say N. |
217 | 184 | ||
185 | config NET_IPGRE_DEMUX | ||
186 | tristate "IP: GRE demultiplexer" | ||
187 | help | ||
188 | This is helper module to demultiplex GRE packets on GRE version field criteria. | ||
189 | Required by ip_gre and pptp modules. | ||
190 | |||
218 | config NET_IPGRE | 191 | config NET_IPGRE |
219 | tristate "IP: GRE tunnels over IP" | 192 | tristate "IP: GRE tunnels over IP" |
220 | depends on IPV6 || IPV6=n | 193 | depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX |
221 | help | 194 | help |
222 | Tunneling means encapsulating data of one protocol type within | 195 | Tunneling means encapsulating data of one protocol type within |
223 | another protocol and sending it over a channel that understands the | 196 | another protocol and sending it over a channel that understands the |
@@ -426,7 +399,9 @@ config INET_DIAG | |||
426 | ---help--- | 399 | ---help--- |
427 | Support for INET (TCP, DCCP, etc) socket monitoring interface used by | 400 | Support for INET (TCP, DCCP, etc) socket monitoring interface used by |
428 | native Linux tools such as ss. ss is included in iproute2, currently | 401 | native Linux tools such as ss. ss is included in iproute2, currently |
429 | downloadable at <http://linux-net.osdl.org/index.php/Iproute2>. | 402 | downloadable at: |
403 | |||
404 | http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2 | ||
430 | 405 | ||
431 | If unsure, say Y. | 406 | If unsure, say Y. |
432 | 407 | ||
@@ -556,7 +531,7 @@ config TCP_CONG_VENO | |||
556 | distinguishing to circumvent the difficult judgment of the packet loss | 531 | distinguishing to circumvent the difficult judgment of the packet loss |
557 | type. TCP Veno cuts down less congestion window in response to random | 532 | type. TCP Veno cuts down less congestion window in response to random |
558 | loss packets. | 533 | loss packets. |
559 | See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf | 534 | See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> |
560 | 535 | ||
561 | config TCP_CONG_YEAH | 536 | config TCP_CONG_YEAH |
562 | tristate "YeAH TCP" | 537 | tristate "YeAH TCP" |
@@ -649,4 +624,3 @@ config TCP_MD5SIG | |||
649 | on the Internet. | 624 | on the Internet. |
650 | 625 | ||
651 | If unsure, say N. | 626 | If unsure, say N. |
652 | |||
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 80ff87ce43aa..f2dc69cffb57 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile | |||
@@ -10,16 +10,15 @@ obj-y := route.o inetpeer.o protocol.o \ | |||
10 | tcp_minisocks.o tcp_cong.o \ | 10 | tcp_minisocks.o tcp_cong.o \ |
11 | datagram.o raw.o udp.o udplite.o \ | 11 | datagram.o raw.o udp.o udplite.o \ |
12 | arp.o icmp.o devinet.o af_inet.o igmp.o \ | 12 | arp.o icmp.o devinet.o af_inet.o igmp.o \ |
13 | fib_frontend.o fib_semantics.o \ | 13 | fib_frontend.o fib_semantics.o fib_trie.o \ |
14 | inet_fragment.o | 14 | inet_fragment.o ping.o |
15 | 15 | ||
16 | obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o | 16 | obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o |
17 | obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o | ||
18 | obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o | ||
19 | obj-$(CONFIG_PROC_FS) += proc.o | 17 | obj-$(CONFIG_PROC_FS) += proc.o |
20 | obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o | 18 | obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o |
21 | obj-$(CONFIG_IP_MROUTE) += ipmr.o | 19 | obj-$(CONFIG_IP_MROUTE) += ipmr.o |
22 | obj-$(CONFIG_NET_IPIP) += ipip.o | 20 | obj-$(CONFIG_NET_IPIP) += ipip.o |
21 | obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o | ||
23 | obj-$(CONFIG_NET_IPGRE) += ip_gre.o | 22 | obj-$(CONFIG_NET_IPGRE) += ip_gre.o |
24 | obj-$(CONFIG_SYN_COOKIES) += syncookies.o | 23 | obj-$(CONFIG_SYN_COOKIES) += syncookies.o |
25 | obj-$(CONFIG_INET_AH) += ah4.o | 24 | obj-$(CONFIG_INET_AH) += ah4.o |
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6a1100c25a9f..ef1528af7abf 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c | |||
@@ -105,6 +105,7 @@ | |||
105 | #include <net/tcp.h> | 105 | #include <net/tcp.h> |
106 | #include <net/udp.h> | 106 | #include <net/udp.h> |
107 | #include <net/udplite.h> | 107 | #include <net/udplite.h> |
108 | #include <net/ping.h> | ||
108 | #include <linux/skbuff.h> | 109 | #include <linux/skbuff.h> |
109 | #include <net/sock.h> | 110 | #include <net/sock.h> |
110 | #include <net/raw.h> | 111 | #include <net/raw.h> |
@@ -153,7 +154,7 @@ void inet_sock_destruct(struct sock *sk) | |||
153 | WARN_ON(sk->sk_wmem_queued); | 154 | WARN_ON(sk->sk_wmem_queued); |
154 | WARN_ON(sk->sk_forward_alloc); | 155 | WARN_ON(sk->sk_forward_alloc); |
155 | 156 | ||
156 | kfree(inet->opt); | 157 | kfree(rcu_dereference_protected(inet->inet_opt, 1)); |
157 | dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); | 158 | dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); |
158 | sk_refcnt_debug_dec(sk); | 159 | sk_refcnt_debug_dec(sk); |
159 | } | 160 | } |
@@ -227,18 +228,16 @@ EXPORT_SYMBOL(inet_ehash_secret); | |||
227 | 228 | ||
228 | /* | 229 | /* |
229 | * inet_ehash_secret must be set exactly once | 230 | * inet_ehash_secret must be set exactly once |
230 | * Instead of using a dedicated spinlock, we (ab)use inetsw_lock | ||
231 | */ | 231 | */ |
232 | void build_ehash_secret(void) | 232 | void build_ehash_secret(void) |
233 | { | 233 | { |
234 | u32 rnd; | 234 | u32 rnd; |
235 | |||
235 | do { | 236 | do { |
236 | get_random_bytes(&rnd, sizeof(rnd)); | 237 | get_random_bytes(&rnd, sizeof(rnd)); |
237 | } while (rnd == 0); | 238 | } while (rnd == 0); |
238 | spin_lock_bh(&inetsw_lock); | 239 | |
239 | if (!inet_ehash_secret) | 240 | cmpxchg(&inet_ehash_secret, 0, rnd); |
240 | inet_ehash_secret = rnd; | ||
241 | spin_unlock_bh(&inetsw_lock); | ||
242 | } | 241 | } |
243 | EXPORT_SYMBOL(build_ehash_secret); | 242 | EXPORT_SYMBOL(build_ehash_secret); |
244 | 243 | ||
@@ -466,6 +465,11 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) | |||
466 | if (addr_len < sizeof(struct sockaddr_in)) | 465 | if (addr_len < sizeof(struct sockaddr_in)) |
467 | goto out; | 466 | goto out; |
468 | 467 | ||
468 | if (addr->sin_family != AF_INET) { | ||
469 | err = -EAFNOSUPPORT; | ||
470 | goto out; | ||
471 | } | ||
472 | |||
469 | chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); | 473 | chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); |
470 | 474 | ||
471 | /* Not specified by any standard per-se, however it breaks too | 475 | /* Not specified by any standard per-se, however it breaks too |
@@ -674,6 +678,7 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags) | |||
674 | 678 | ||
675 | lock_sock(sk2); | 679 | lock_sock(sk2); |
676 | 680 | ||
681 | sock_rps_record_flow(sk2); | ||
677 | WARN_ON(!((1 << sk2->sk_state) & | 682 | WARN_ON(!((1 << sk2->sk_state) & |
678 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE))); | 683 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE))); |
679 | 684 | ||
@@ -882,6 +887,19 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) | |||
882 | } | 887 | } |
883 | EXPORT_SYMBOL(inet_ioctl); | 888 | EXPORT_SYMBOL(inet_ioctl); |
884 | 889 | ||
890 | #ifdef CONFIG_COMPAT | ||
891 | int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) | ||
892 | { | ||
893 | struct sock *sk = sock->sk; | ||
894 | int err = -ENOIOCTLCMD; | ||
895 | |||
896 | if (sk->sk_prot->compat_ioctl) | ||
897 | err = sk->sk_prot->compat_ioctl(sk, cmd, arg); | ||
898 | |||
899 | return err; | ||
900 | } | ||
901 | #endif | ||
902 | |||
885 | const struct proto_ops inet_stream_ops = { | 903 | const struct proto_ops inet_stream_ops = { |
886 | .family = PF_INET, | 904 | .family = PF_INET, |
887 | .owner = THIS_MODULE, | 905 | .owner = THIS_MODULE, |
@@ -905,6 +923,7 @@ const struct proto_ops inet_stream_ops = { | |||
905 | #ifdef CONFIG_COMPAT | 923 | #ifdef CONFIG_COMPAT |
906 | .compat_setsockopt = compat_sock_common_setsockopt, | 924 | .compat_setsockopt = compat_sock_common_setsockopt, |
907 | .compat_getsockopt = compat_sock_common_getsockopt, | 925 | .compat_getsockopt = compat_sock_common_getsockopt, |
926 | .compat_ioctl = inet_compat_ioctl, | ||
908 | #endif | 927 | #endif |
909 | }; | 928 | }; |
910 | EXPORT_SYMBOL(inet_stream_ops); | 929 | EXPORT_SYMBOL(inet_stream_ops); |
@@ -931,6 +950,7 @@ const struct proto_ops inet_dgram_ops = { | |||
931 | #ifdef CONFIG_COMPAT | 950 | #ifdef CONFIG_COMPAT |
932 | .compat_setsockopt = compat_sock_common_setsockopt, | 951 | .compat_setsockopt = compat_sock_common_setsockopt, |
933 | .compat_getsockopt = compat_sock_common_getsockopt, | 952 | .compat_getsockopt = compat_sock_common_getsockopt, |
953 | .compat_ioctl = inet_compat_ioctl, | ||
934 | #endif | 954 | #endif |
935 | }; | 955 | }; |
936 | EXPORT_SYMBOL(inet_dgram_ops); | 956 | EXPORT_SYMBOL(inet_dgram_ops); |
@@ -961,6 +981,7 @@ static const struct proto_ops inet_sockraw_ops = { | |||
961 | #ifdef CONFIG_COMPAT | 981 | #ifdef CONFIG_COMPAT |
962 | .compat_setsockopt = compat_sock_common_setsockopt, | 982 | .compat_setsockopt = compat_sock_common_setsockopt, |
963 | .compat_getsockopt = compat_sock_common_getsockopt, | 983 | .compat_getsockopt = compat_sock_common_getsockopt, |
984 | .compat_ioctl = inet_compat_ioctl, | ||
964 | #endif | 985 | #endif |
965 | }; | 986 | }; |
966 | 987 | ||
@@ -994,6 +1015,14 @@ static struct inet_protosw inetsw_array[] = | |||
994 | .flags = INET_PROTOSW_PERMANENT, | 1015 | .flags = INET_PROTOSW_PERMANENT, |
995 | }, | 1016 | }, |
996 | 1017 | ||
1018 | { | ||
1019 | .type = SOCK_DGRAM, | ||
1020 | .protocol = IPPROTO_ICMP, | ||
1021 | .prot = &ping_prot, | ||
1022 | .ops = &inet_dgram_ops, | ||
1023 | .no_check = UDP_CSUM_DEFAULT, | ||
1024 | .flags = INET_PROTOSW_REUSE, | ||
1025 | }, | ||
997 | 1026 | ||
998 | { | 1027 | { |
999 | .type = SOCK_RAW, | 1028 | .type = SOCK_RAW, |
@@ -1087,27 +1116,29 @@ int sysctl_ip_dynaddr __read_mostly; | |||
1087 | static int inet_sk_reselect_saddr(struct sock *sk) | 1116 | static int inet_sk_reselect_saddr(struct sock *sk) |
1088 | { | 1117 | { |
1089 | struct inet_sock *inet = inet_sk(sk); | 1118 | struct inet_sock *inet = inet_sk(sk); |
1090 | int err; | ||
1091 | struct rtable *rt; | ||
1092 | __be32 old_saddr = inet->inet_saddr; | 1119 | __be32 old_saddr = inet->inet_saddr; |
1093 | __be32 new_saddr; | ||
1094 | __be32 daddr = inet->inet_daddr; | 1120 | __be32 daddr = inet->inet_daddr; |
1121 | struct flowi4 *fl4; | ||
1122 | struct rtable *rt; | ||
1123 | __be32 new_saddr; | ||
1124 | struct ip_options_rcu *inet_opt; | ||
1095 | 1125 | ||
1096 | if (inet->opt && inet->opt->srr) | 1126 | inet_opt = rcu_dereference_protected(inet->inet_opt, |
1097 | daddr = inet->opt->faddr; | 1127 | sock_owned_by_user(sk)); |
1128 | if (inet_opt && inet_opt->opt.srr) | ||
1129 | daddr = inet_opt->opt.faddr; | ||
1098 | 1130 | ||
1099 | /* Query new route. */ | 1131 | /* Query new route. */ |
1100 | err = ip_route_connect(&rt, daddr, 0, | 1132 | fl4 = &inet->cork.fl.u.ip4; |
1101 | RT_CONN_FLAGS(sk), | 1133 | rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk), |
1102 | sk->sk_bound_dev_if, | 1134 | sk->sk_bound_dev_if, sk->sk_protocol, |
1103 | sk->sk_protocol, | 1135 | inet->inet_sport, inet->inet_dport, sk, false); |
1104 | inet->inet_sport, inet->inet_dport, sk, 0); | 1136 | if (IS_ERR(rt)) |
1105 | if (err) | 1137 | return PTR_ERR(rt); |
1106 | return err; | ||
1107 | 1138 | ||
1108 | sk_setup_caps(sk, &rt->dst); | 1139 | sk_setup_caps(sk, &rt->dst); |
1109 | 1140 | ||
1110 | new_saddr = rt->rt_src; | 1141 | new_saddr = fl4->saddr; |
1111 | 1142 | ||
1112 | if (new_saddr == old_saddr) | 1143 | if (new_saddr == old_saddr) |
1113 | return 0; | 1144 | return 0; |
@@ -1136,6 +1167,8 @@ int inet_sk_rebuild_header(struct sock *sk) | |||
1136 | struct inet_sock *inet = inet_sk(sk); | 1167 | struct inet_sock *inet = inet_sk(sk); |
1137 | struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); | 1168 | struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); |
1138 | __be32 daddr; | 1169 | __be32 daddr; |
1170 | struct ip_options_rcu *inet_opt; | ||
1171 | struct flowi4 *fl4; | ||
1139 | int err; | 1172 | int err; |
1140 | 1173 | ||
1141 | /* Route is OK, nothing to do. */ | 1174 | /* Route is OK, nothing to do. */ |
@@ -1143,36 +1176,23 @@ int inet_sk_rebuild_header(struct sock *sk) | |||
1143 | return 0; | 1176 | return 0; |
1144 | 1177 | ||
1145 | /* Reroute. */ | 1178 | /* Reroute. */ |
1179 | rcu_read_lock(); | ||
1180 | inet_opt = rcu_dereference(inet->inet_opt); | ||
1146 | daddr = inet->inet_daddr; | 1181 | daddr = inet->inet_daddr; |
1147 | if (inet->opt && inet->opt->srr) | 1182 | if (inet_opt && inet_opt->opt.srr) |
1148 | daddr = inet->opt->faddr; | 1183 | daddr = inet_opt->opt.faddr; |
1149 | { | 1184 | rcu_read_unlock(); |
1150 | struct flowi fl = { | 1185 | fl4 = &inet->cork.fl.u.ip4; |
1151 | .oif = sk->sk_bound_dev_if, | 1186 | rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, |
1152 | .mark = sk->sk_mark, | 1187 | inet->inet_dport, inet->inet_sport, |
1153 | .nl_u = { | 1188 | sk->sk_protocol, RT_CONN_FLAGS(sk), |
1154 | .ip4_u = { | 1189 | sk->sk_bound_dev_if); |
1155 | .daddr = daddr, | 1190 | if (!IS_ERR(rt)) { |
1156 | .saddr = inet->inet_saddr, | 1191 | err = 0; |
1157 | .tos = RT_CONN_FLAGS(sk), | ||
1158 | }, | ||
1159 | }, | ||
1160 | .proto = sk->sk_protocol, | ||
1161 | .flags = inet_sk_flowi_flags(sk), | ||
1162 | .uli_u = { | ||
1163 | .ports = { | ||
1164 | .sport = inet->inet_sport, | ||
1165 | .dport = inet->inet_dport, | ||
1166 | }, | ||
1167 | }, | ||
1168 | }; | ||
1169 | |||
1170 | security_sk_classify_flow(sk, &fl); | ||
1171 | err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0); | ||
1172 | } | ||
1173 | if (!err) | ||
1174 | sk_setup_caps(sk, &rt->dst); | 1192 | sk_setup_caps(sk, &rt->dst); |
1175 | else { | 1193 | } else { |
1194 | err = PTR_ERR(rt); | ||
1195 | |||
1176 | /* Routing failed... */ | 1196 | /* Routing failed... */ |
1177 | sk->sk_route_caps = 0; | 1197 | sk->sk_route_caps = 0; |
1178 | /* | 1198 | /* |
@@ -1192,7 +1212,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); | |||
1192 | 1212 | ||
1193 | static int inet_gso_send_check(struct sk_buff *skb) | 1213 | static int inet_gso_send_check(struct sk_buff *skb) |
1194 | { | 1214 | { |
1195 | struct iphdr *iph; | 1215 | const struct iphdr *iph; |
1196 | const struct net_protocol *ops; | 1216 | const struct net_protocol *ops; |
1197 | int proto; | 1217 | int proto; |
1198 | int ihl; | 1218 | int ihl; |
@@ -1225,7 +1245,7 @@ out: | |||
1225 | return err; | 1245 | return err; |
1226 | } | 1246 | } |
1227 | 1247 | ||
1228 | static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) | 1248 | static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features) |
1229 | { | 1249 | { |
1230 | struct sk_buff *segs = ERR_PTR(-EINVAL); | 1250 | struct sk_buff *segs = ERR_PTR(-EINVAL); |
1231 | struct iphdr *iph; | 1251 | struct iphdr *iph; |
@@ -1299,7 +1319,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, | |||
1299 | const struct net_protocol *ops; | 1319 | const struct net_protocol *ops; |
1300 | struct sk_buff **pp = NULL; | 1320 | struct sk_buff **pp = NULL; |
1301 | struct sk_buff *p; | 1321 | struct sk_buff *p; |
1302 | struct iphdr *iph; | 1322 | const struct iphdr *iph; |
1303 | unsigned int hlen; | 1323 | unsigned int hlen; |
1304 | unsigned int off; | 1324 | unsigned int off; |
1305 | unsigned int id; | 1325 | unsigned int id; |
@@ -1522,6 +1542,7 @@ static const struct net_protocol udp_protocol = { | |||
1522 | 1542 | ||
1523 | static const struct net_protocol icmp_protocol = { | 1543 | static const struct net_protocol icmp_protocol = { |
1524 | .handler = icmp_rcv, | 1544 | .handler = icmp_rcv, |
1545 | .err_handler = ping_err, | ||
1525 | .no_policy = 1, | 1546 | .no_policy = 1, |
1526 | .netns_ok = 1, | 1547 | .netns_ok = 1, |
1527 | }; | 1548 | }; |
@@ -1637,6 +1658,10 @@ static int __init inet_init(void) | |||
1637 | if (rc) | 1658 | if (rc) |
1638 | goto out_unregister_udp_proto; | 1659 | goto out_unregister_udp_proto; |
1639 | 1660 | ||
1661 | rc = proto_register(&ping_prot, 1); | ||
1662 | if (rc) | ||
1663 | goto out_unregister_raw_proto; | ||
1664 | |||
1640 | /* | 1665 | /* |
1641 | * Tell SOCKET that we are alive... | 1666 | * Tell SOCKET that we are alive... |
1642 | */ | 1667 | */ |
@@ -1692,6 +1717,8 @@ static int __init inet_init(void) | |||
1692 | /* Add UDP-Lite (RFC 3828) */ | 1717 | /* Add UDP-Lite (RFC 3828) */ |
1693 | udplite4_register(); | 1718 | udplite4_register(); |
1694 | 1719 | ||
1720 | ping_init(); | ||
1721 | |||
1695 | /* | 1722 | /* |
1696 | * Set the ICMP layer up | 1723 | * Set the ICMP layer up |
1697 | */ | 1724 | */ |
@@ -1722,6 +1749,8 @@ static int __init inet_init(void) | |||
1722 | rc = 0; | 1749 | rc = 0; |
1723 | out: | 1750 | out: |
1724 | return rc; | 1751 | return rc; |
1752 | out_unregister_raw_proto: | ||
1753 | proto_unregister(&raw_prot); | ||
1725 | out_unregister_udp_proto: | 1754 | out_unregister_udp_proto: |
1726 | proto_unregister(&udp_prot); | 1755 | proto_unregister(&udp_prot); |
1727 | out_unregister_tcp_proto: | 1756 | out_unregister_tcp_proto: |
@@ -1746,11 +1775,15 @@ static int __init ipv4_proc_init(void) | |||
1746 | goto out_tcp; | 1775 | goto out_tcp; |
1747 | if (udp4_proc_init()) | 1776 | if (udp4_proc_init()) |
1748 | goto out_udp; | 1777 | goto out_udp; |
1778 | if (ping_proc_init()) | ||
1779 | goto out_ping; | ||
1749 | if (ip_misc_proc_init()) | 1780 | if (ip_misc_proc_init()) |
1750 | goto out_misc; | 1781 | goto out_misc; |
1751 | out: | 1782 | out: |
1752 | return rc; | 1783 | return rc; |
1753 | out_misc: | 1784 | out_misc: |
1785 | ping_proc_exit(); | ||
1786 | out_ping: | ||
1754 | udp4_proc_exit(); | 1787 | udp4_proc_exit(); |
1755 | out_udp: | 1788 | out_udp: |
1756 | tcp4_proc_exit(); | 1789 | tcp4_proc_exit(); |
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 880a5ec6dce0..c1f4154552fc 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c | |||
@@ -73,7 +73,7 @@ static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash, | |||
73 | * into IP header for icv calculation. Options are already checked | 73 | * into IP header for icv calculation. Options are already checked |
74 | * for validity, so paranoia is not required. */ | 74 | * for validity, so paranoia is not required. */ |
75 | 75 | ||
76 | static int ip_clear_mutable_options(struct iphdr *iph, __be32 *daddr) | 76 | static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr) |
77 | { | 77 | { |
78 | unsigned char * optptr = (unsigned char*)(iph+1); | 78 | unsigned char * optptr = (unsigned char*)(iph+1); |
79 | int l = iph->ihl*4 - sizeof(struct iphdr); | 79 | int l = iph->ihl*4 - sizeof(struct iphdr); |
@@ -201,11 +201,14 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) | |||
201 | top_iph->ttl = 0; | 201 | top_iph->ttl = 0; |
202 | top_iph->check = 0; | 202 | top_iph->check = 0; |
203 | 203 | ||
204 | ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; | 204 | if (x->props.flags & XFRM_STATE_ALIGN4) |
205 | ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; | ||
206 | else | ||
207 | ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; | ||
205 | 208 | ||
206 | ah->reserved = 0; | 209 | ah->reserved = 0; |
207 | ah->spi = x->id.spi; | 210 | ah->spi = x->id.spi; |
208 | ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output); | 211 | ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); |
209 | 212 | ||
210 | sg_init_table(sg, nfrags); | 213 | sg_init_table(sg, nfrags); |
211 | skb_to_sgvec(skb, sg, 0, skb->len); | 214 | skb_to_sgvec(skb, sg, 0, skb->len); |
@@ -299,9 +302,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) | |||
299 | nexthdr = ah->nexthdr; | 302 | nexthdr = ah->nexthdr; |
300 | ah_hlen = (ah->hdrlen + 2) << 2; | 303 | ah_hlen = (ah->hdrlen + 2) << 2; |
301 | 304 | ||
302 | if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && | 305 | if (x->props.flags & XFRM_STATE_ALIGN4) { |
303 | ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) | 306 | if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) && |
304 | goto out; | 307 | ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len)) |
308 | goto out; | ||
309 | } else { | ||
310 | if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && | ||
311 | ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) | ||
312 | goto out; | ||
313 | } | ||
305 | 314 | ||
306 | if (!pskb_may_pull(skb, ah_hlen)) | 315 | if (!pskb_may_pull(skb, ah_hlen)) |
307 | goto out; | 316 | goto out; |
@@ -314,14 +323,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) | |||
314 | 323 | ||
315 | skb->ip_summed = CHECKSUM_NONE; | 324 | skb->ip_summed = CHECKSUM_NONE; |
316 | 325 | ||
317 | ah = (struct ip_auth_hdr *)skb->data; | ||
318 | iph = ip_hdr(skb); | ||
319 | ihl = ip_hdrlen(skb); | ||
320 | 326 | ||
321 | if ((err = skb_cow_data(skb, 0, &trailer)) < 0) | 327 | if ((err = skb_cow_data(skb, 0, &trailer)) < 0) |
322 | goto out; | 328 | goto out; |
323 | nfrags = err; | 329 | nfrags = err; |
324 | 330 | ||
331 | ah = (struct ip_auth_hdr *)skb->data; | ||
332 | iph = ip_hdr(skb); | ||
333 | ihl = ip_hdrlen(skb); | ||
334 | |||
325 | work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len); | 335 | work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len); |
326 | if (!work_iph) | 336 | if (!work_iph) |
327 | goto out; | 337 | goto out; |
@@ -386,7 +396,7 @@ out: | |||
386 | static void ah4_err(struct sk_buff *skb, u32 info) | 396 | static void ah4_err(struct sk_buff *skb, u32 info) |
387 | { | 397 | { |
388 | struct net *net = dev_net(skb->dev); | 398 | struct net *net = dev_net(skb->dev); |
389 | struct iphdr *iph = (struct iphdr *)skb->data; | 399 | const struct iphdr *iph = (const struct iphdr *)skb->data; |
390 | struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); | 400 | struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); |
391 | struct xfrm_state *x; | 401 | struct xfrm_state *x; |
392 | 402 | ||
@@ -394,7 +404,8 @@ static void ah4_err(struct sk_buff *skb, u32 info) | |||
394 | icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) | 404 | icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) |
395 | return; | 405 | return; |
396 | 406 | ||
397 | x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); | 407 | x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, |
408 | ah->spi, IPPROTO_AH, AF_INET); | ||
398 | if (!x) | 409 | if (!x) |
399 | return; | 410 | return; |
400 | printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n", | 411 | printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n", |
@@ -449,8 +460,12 @@ static int ah_init_state(struct xfrm_state *x) | |||
449 | 460 | ||
450 | BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); | 461 | BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); |
451 | 462 | ||
452 | x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + | 463 | if (x->props.flags & XFRM_STATE_ALIGN4) |
453 | ahp->icv_trunc_len); | 464 | x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) + |
465 | ahp->icv_trunc_len); | ||
466 | else | ||
467 | x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + | ||
468 | ahp->icv_trunc_len); | ||
454 | if (x->props.mode == XFRM_MODE_TUNNEL) | 469 | if (x->props.mode == XFRM_MODE_TUNNEL) |
455 | x->props.header_len += sizeof(struct iphdr); | 470 | x->props.header_len += sizeof(struct iphdr); |
456 | x->data = ahp; | 471 | x->data = ahp; |
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 96c1955b3e2f..1b74d3b64371 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c | |||
@@ -55,7 +55,7 @@ | |||
55 | * Stuart Cheshire : Metricom and grat arp fixes | 55 | * Stuart Cheshire : Metricom and grat arp fixes |
56 | * *** FOR 2.1 clean this up *** | 56 | * *** FOR 2.1 clean this up *** |
57 | * Lawrence V. Stefani: (08/12/96) Added FDDI support. | 57 | * Lawrence V. Stefani: (08/12/96) Added FDDI support. |
58 | * Alan Cox : Took the AP1000 nasty FDDI hack and | 58 | * Alan Cox : Took the AP1000 nasty FDDI hack and |
59 | * folded into the mainstream FDDI code. | 59 | * folded into the mainstream FDDI code. |
60 | * Ack spit, Linus how did you allow that | 60 | * Ack spit, Linus how did you allow that |
61 | * one in... | 61 | * one in... |
@@ -120,14 +120,14 @@ EXPORT_SYMBOL(clip_tbl_hook); | |||
120 | #endif | 120 | #endif |
121 | 121 | ||
122 | #include <asm/system.h> | 122 | #include <asm/system.h> |
123 | #include <asm/uaccess.h> | 123 | #include <linux/uaccess.h> |
124 | 124 | ||
125 | #include <linux/netfilter_arp.h> | 125 | #include <linux/netfilter_arp.h> |
126 | 126 | ||
127 | /* | 127 | /* |
128 | * Interface to generic neighbour cache. | 128 | * Interface to generic neighbour cache. |
129 | */ | 129 | */ |
130 | static u32 arp_hash(const void *pkey, const struct net_device *dev); | 130 | static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd); |
131 | static int arp_constructor(struct neighbour *neigh); | 131 | static int arp_constructor(struct neighbour *neigh); |
132 | static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); | 132 | static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); |
133 | static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); | 133 | static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); |
@@ -161,7 +161,7 @@ static const struct neigh_ops arp_direct_ops = { | |||
161 | .queue_xmit = dev_queue_xmit, | 161 | .queue_xmit = dev_queue_xmit, |
162 | }; | 162 | }; |
163 | 163 | ||
164 | const struct neigh_ops arp_broken_ops = { | 164 | static const struct neigh_ops arp_broken_ops = { |
165 | .family = AF_INET, | 165 | .family = AF_INET, |
166 | .solicit = arp_solicit, | 166 | .solicit = arp_solicit, |
167 | .error_report = arp_error_report, | 167 | .error_report = arp_error_report, |
@@ -170,35 +170,34 @@ const struct neigh_ops arp_broken_ops = { | |||
170 | .hh_output = dev_queue_xmit, | 170 | .hh_output = dev_queue_xmit, |
171 | .queue_xmit = dev_queue_xmit, | 171 | .queue_xmit = dev_queue_xmit, |
172 | }; | 172 | }; |
173 | EXPORT_SYMBOL(arp_broken_ops); | ||
174 | 173 | ||
175 | struct neigh_table arp_tbl = { | 174 | struct neigh_table arp_tbl = { |
176 | .family = AF_INET, | 175 | .family = AF_INET, |
177 | .entry_size = sizeof(struct neighbour) + 4, | 176 | .entry_size = sizeof(struct neighbour) + 4, |
178 | .key_len = 4, | 177 | .key_len = 4, |
179 | .hash = arp_hash, | 178 | .hash = arp_hash, |
180 | .constructor = arp_constructor, | 179 | .constructor = arp_constructor, |
181 | .proxy_redo = parp_redo, | 180 | .proxy_redo = parp_redo, |
182 | .id = "arp_cache", | 181 | .id = "arp_cache", |
183 | .parms = { | 182 | .parms = { |
184 | .tbl = &arp_tbl, | 183 | .tbl = &arp_tbl, |
185 | .base_reachable_time = 30 * HZ, | 184 | .base_reachable_time = 30 * HZ, |
186 | .retrans_time = 1 * HZ, | 185 | .retrans_time = 1 * HZ, |
187 | .gc_staletime = 60 * HZ, | 186 | .gc_staletime = 60 * HZ, |
188 | .reachable_time = 30 * HZ, | 187 | .reachable_time = 30 * HZ, |
189 | .delay_probe_time = 5 * HZ, | 188 | .delay_probe_time = 5 * HZ, |
190 | .queue_len = 3, | 189 | .queue_len = 3, |
191 | .ucast_probes = 3, | 190 | .ucast_probes = 3, |
192 | .mcast_probes = 3, | 191 | .mcast_probes = 3, |
193 | .anycast_delay = 1 * HZ, | 192 | .anycast_delay = 1 * HZ, |
194 | .proxy_delay = (8 * HZ) / 10, | 193 | .proxy_delay = (8 * HZ) / 10, |
195 | .proxy_qlen = 64, | 194 | .proxy_qlen = 64, |
196 | .locktime = 1 * HZ, | 195 | .locktime = 1 * HZ, |
197 | }, | 196 | }, |
198 | .gc_interval = 30 * HZ, | 197 | .gc_interval = 30 * HZ, |
199 | .gc_thresh1 = 128, | 198 | .gc_thresh1 = 128, |
200 | .gc_thresh2 = 512, | 199 | .gc_thresh2 = 512, |
201 | .gc_thresh3 = 1024, | 200 | .gc_thresh3 = 1024, |
202 | }; | 201 | }; |
203 | EXPORT_SYMBOL(arp_tbl); | 202 | EXPORT_SYMBOL(arp_tbl); |
204 | 203 | ||
@@ -216,6 +215,9 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) | |||
216 | case ARPHRD_INFINIBAND: | 215 | case ARPHRD_INFINIBAND: |
217 | ip_ib_mc_map(addr, dev->broadcast, haddr); | 216 | ip_ib_mc_map(addr, dev->broadcast, haddr); |
218 | return 0; | 217 | return 0; |
218 | case ARPHRD_IPGRE: | ||
219 | ip_ipgre_mc_map(addr, dev->broadcast, haddr); | ||
220 | return 0; | ||
219 | default: | 221 | default: |
220 | if (dir) { | 222 | if (dir) { |
221 | memcpy(haddr, dev->broadcast, dev->addr_len); | 223 | memcpy(haddr, dev->broadcast, dev->addr_len); |
@@ -226,14 +228,16 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) | |||
226 | } | 228 | } |
227 | 229 | ||
228 | 230 | ||
229 | static u32 arp_hash(const void *pkey, const struct net_device *dev) | 231 | static u32 arp_hash(const void *pkey, |
232 | const struct net_device *dev, | ||
233 | __u32 hash_rnd) | ||
230 | { | 234 | { |
231 | return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd); | 235 | return jhash_2words(*(u32 *)pkey, dev->ifindex, hash_rnd); |
232 | } | 236 | } |
233 | 237 | ||
234 | static int arp_constructor(struct neighbour *neigh) | 238 | static int arp_constructor(struct neighbour *neigh) |
235 | { | 239 | { |
236 | __be32 addr = *(__be32*)neigh->primary_key; | 240 | __be32 addr = *(__be32 *)neigh->primary_key; |
237 | struct net_device *dev = neigh->dev; | 241 | struct net_device *dev = neigh->dev; |
238 | struct in_device *in_dev; | 242 | struct in_device *in_dev; |
239 | struct neigh_parms *parms; | 243 | struct neigh_parms *parms; |
@@ -296,16 +300,19 @@ static int arp_constructor(struct neighbour *neigh) | |||
296 | neigh->ops = &arp_broken_ops; | 300 | neigh->ops = &arp_broken_ops; |
297 | neigh->output = neigh->ops->output; | 301 | neigh->output = neigh->ops->output; |
298 | return 0; | 302 | return 0; |
303 | #else | ||
304 | break; | ||
299 | #endif | 305 | #endif |
300 | ;} | 306 | } |
301 | #endif | 307 | #endif |
302 | if (neigh->type == RTN_MULTICAST) { | 308 | if (neigh->type == RTN_MULTICAST) { |
303 | neigh->nud_state = NUD_NOARP; | 309 | neigh->nud_state = NUD_NOARP; |
304 | arp_mc_map(addr, neigh->ha, dev, 1); | 310 | arp_mc_map(addr, neigh->ha, dev, 1); |
305 | } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { | 311 | } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { |
306 | neigh->nud_state = NUD_NOARP; | 312 | neigh->nud_state = NUD_NOARP; |
307 | memcpy(neigh->ha, dev->dev_addr, dev->addr_len); | 313 | memcpy(neigh->ha, dev->dev_addr, dev->addr_len); |
308 | } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) { | 314 | } else if (neigh->type == RTN_BROADCAST || |
315 | (dev->flags & IFF_POINTOPOINT)) { | ||
309 | neigh->nud_state = NUD_NOARP; | 316 | neigh->nud_state = NUD_NOARP; |
310 | memcpy(neigh->ha, dev->broadcast, dev->addr_len); | 317 | memcpy(neigh->ha, dev->broadcast, dev->addr_len); |
311 | } | 318 | } |
@@ -315,7 +322,7 @@ static int arp_constructor(struct neighbour *neigh) | |||
315 | else | 322 | else |
316 | neigh->ops = &arp_generic_ops; | 323 | neigh->ops = &arp_generic_ops; |
317 | 324 | ||
318 | if (neigh->nud_state&NUD_VALID) | 325 | if (neigh->nud_state & NUD_VALID) |
319 | neigh->output = neigh->ops->connected_output; | 326 | neigh->output = neigh->ops->connected_output; |
320 | else | 327 | else |
321 | neigh->output = neigh->ops->output; | 328 | neigh->output = neigh->ops->output; |
@@ -334,7 +341,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) | |||
334 | __be32 saddr = 0; | 341 | __be32 saddr = 0; |
335 | u8 *dst_ha = NULL; | 342 | u8 *dst_ha = NULL; |
336 | struct net_device *dev = neigh->dev; | 343 | struct net_device *dev = neigh->dev; |
337 | __be32 target = *(__be32*)neigh->primary_key; | 344 | __be32 target = *(__be32 *)neigh->primary_key; |
338 | int probes = atomic_read(&neigh->probes); | 345 | int probes = atomic_read(&neigh->probes); |
339 | struct in_device *in_dev; | 346 | struct in_device *in_dev; |
340 | 347 | ||
@@ -347,7 +354,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) | |||
347 | switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { | 354 | switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { |
348 | default: | 355 | default: |
349 | case 0: /* By default announce any local IP */ | 356 | case 0: /* By default announce any local IP */ |
350 | if (skb && inet_addr_type(dev_net(dev), ip_hdr(skb)->saddr) == RTN_LOCAL) | 357 | if (skb && inet_addr_type(dev_net(dev), |
358 | ip_hdr(skb)->saddr) == RTN_LOCAL) | ||
351 | saddr = ip_hdr(skb)->saddr; | 359 | saddr = ip_hdr(skb)->saddr; |
352 | break; | 360 | break; |
353 | case 1: /* Restrict announcements of saddr in same subnet */ | 361 | case 1: /* Restrict announcements of saddr in same subnet */ |
@@ -369,16 +377,21 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) | |||
369 | if (!saddr) | 377 | if (!saddr) |
370 | saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); | 378 | saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); |
371 | 379 | ||
372 | if ((probes -= neigh->parms->ucast_probes) < 0) { | 380 | probes -= neigh->parms->ucast_probes; |
373 | if (!(neigh->nud_state&NUD_VALID)) | 381 | if (probes < 0) { |
374 | printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n"); | 382 | if (!(neigh->nud_state & NUD_VALID)) |
383 | printk(KERN_DEBUG | ||
384 | "trying to ucast probe in NUD_INVALID\n"); | ||
375 | dst_ha = neigh->ha; | 385 | dst_ha = neigh->ha; |
376 | read_lock_bh(&neigh->lock); | 386 | read_lock_bh(&neigh->lock); |
377 | } else if ((probes -= neigh->parms->app_probes) < 0) { | 387 | } else { |
388 | probes -= neigh->parms->app_probes; | ||
389 | if (probes < 0) { | ||
378 | #ifdef CONFIG_ARPD | 390 | #ifdef CONFIG_ARPD |
379 | neigh_app_ns(neigh); | 391 | neigh_app_ns(neigh); |
380 | #endif | 392 | #endif |
381 | return; | 393 | return; |
394 | } | ||
382 | } | 395 | } |
383 | 396 | ||
384 | arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, | 397 | arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, |
@@ -423,14 +436,13 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) | |||
423 | 436 | ||
424 | static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) | 437 | static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) |
425 | { | 438 | { |
426 | struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip, | ||
427 | .saddr = tip } } }; | ||
428 | struct rtable *rt; | 439 | struct rtable *rt; |
429 | int flag = 0; | 440 | int flag = 0; |
430 | /*unsigned long now; */ | 441 | /*unsigned long now; */ |
431 | struct net *net = dev_net(dev); | 442 | struct net *net = dev_net(dev); |
432 | 443 | ||
433 | if (ip_route_output_key(net, &rt, &fl) < 0) | 444 | rt = ip_route_output(net, sip, tip, 0, 0); |
445 | if (IS_ERR(rt)) | ||
434 | return 1; | 446 | return 1; |
435 | if (rt->dst.dev != dev) { | 447 | if (rt->dst.dev != dev) { |
436 | NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER); | 448 | NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER); |
@@ -451,7 +463,8 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) | |||
451 | * is allowed to use this function, it is scheduled to be removed. --ANK | 463 | * is allowed to use this function, it is scheduled to be removed. --ANK |
452 | */ | 464 | */ |
453 | 465 | ||
454 | static int arp_set_predefined(int addr_hint, unsigned char * haddr, __be32 paddr, struct net_device * dev) | 466 | static int arp_set_predefined(int addr_hint, unsigned char *haddr, |
467 | __be32 paddr, struct net_device *dev) | ||
455 | { | 468 | { |
456 | switch (addr_hint) { | 469 | switch (addr_hint) { |
457 | case RTN_LOCAL: | 470 | case RTN_LOCAL: |
@@ -483,17 +496,16 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) | |||
483 | 496 | ||
484 | paddr = skb_rtable(skb)->rt_gateway; | 497 | paddr = skb_rtable(skb)->rt_gateway; |
485 | 498 | ||
486 | if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, paddr, dev)) | 499 | if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, |
500 | paddr, dev)) | ||
487 | return 0; | 501 | return 0; |
488 | 502 | ||
489 | n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); | 503 | n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); |
490 | 504 | ||
491 | if (n) { | 505 | if (n) { |
492 | n->used = jiffies; | 506 | n->used = jiffies; |
493 | if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) { | 507 | if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) { |
494 | read_lock_bh(&n->lock); | 508 | neigh_ha_snapshot(haddr, n, dev); |
495 | memcpy(haddr, n->ha, dev->addr_len); | ||
496 | read_unlock_bh(&n->lock); | ||
497 | neigh_release(n); | 509 | neigh_release(n); |
498 | return 0; | 510 | return 0; |
499 | } | 511 | } |
@@ -515,13 +527,14 @@ int arp_bind_neighbour(struct dst_entry *dst) | |||
515 | return -EINVAL; | 527 | return -EINVAL; |
516 | if (n == NULL) { | 528 | if (n == NULL) { |
517 | __be32 nexthop = ((struct rtable *)dst)->rt_gateway; | 529 | __be32 nexthop = ((struct rtable *)dst)->rt_gateway; |
518 | if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT)) | 530 | if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) |
519 | nexthop = 0; | 531 | nexthop = 0; |
520 | n = __neigh_lookup_errno( | 532 | n = __neigh_lookup_errno( |
521 | #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) | 533 | #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) |
522 | dev->type == ARPHRD_ATM ? clip_tbl_hook : | 534 | dev->type == ARPHRD_ATM ? |
535 | clip_tbl_hook : | ||
523 | #endif | 536 | #endif |
524 | &arp_tbl, &nexthop, dev); | 537 | &arp_tbl, &nexthop, dev); |
525 | if (IS_ERR(n)) | 538 | if (IS_ERR(n)) |
526 | return PTR_ERR(n); | 539 | return PTR_ERR(n); |
527 | dst->neighbour = n; | 540 | dst->neighbour = n; |
@@ -543,8 +556,8 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, | |||
543 | 556 | ||
544 | if (!IN_DEV_PROXY_ARP(in_dev)) | 557 | if (!IN_DEV_PROXY_ARP(in_dev)) |
545 | return 0; | 558 | return 0; |
546 | 559 | imi = IN_DEV_MEDIUM_ID(in_dev); | |
547 | if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0) | 560 | if (imi == 0) |
548 | return 1; | 561 | return 1; |
549 | if (imi == -1) | 562 | if (imi == -1) |
550 | return 0; | 563 | return 0; |
@@ -555,7 +568,7 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, | |||
555 | if (out_dev) | 568 | if (out_dev) |
556 | omi = IN_DEV_MEDIUM_ID(out_dev); | 569 | omi = IN_DEV_MEDIUM_ID(out_dev); |
557 | 570 | ||
558 | return (omi != imi && omi != -1); | 571 | return omi != imi && omi != -1; |
559 | } | 572 | } |
560 | 573 | ||
561 | /* | 574 | /* |
@@ -685,7 +698,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, | |||
685 | arp->ar_pln = 4; | 698 | arp->ar_pln = 4; |
686 | arp->ar_op = htons(type); | 699 | arp->ar_op = htons(type); |
687 | 700 | ||
688 | arp_ptr=(unsigned char *)(arp+1); | 701 | arp_ptr = (unsigned char *)(arp + 1); |
689 | 702 | ||
690 | memcpy(arp_ptr, src_hw, dev->addr_len); | 703 | memcpy(arp_ptr, src_hw, dev->addr_len); |
691 | arp_ptr += dev->addr_len; | 704 | arp_ptr += dev->addr_len; |
@@ -735,9 +748,8 @@ void arp_send(int type, int ptype, __be32 dest_ip, | |||
735 | 748 | ||
736 | skb = arp_create(type, ptype, dest_ip, dev, src_ip, | 749 | skb = arp_create(type, ptype, dest_ip, dev, src_ip, |
737 | dest_hw, src_hw, target_hw); | 750 | dest_hw, src_hw, target_hw); |
738 | if (skb == NULL) { | 751 | if (skb == NULL) |
739 | return; | 752 | return; |
740 | } | ||
741 | 753 | ||
742 | arp_xmit(skb); | 754 | arp_xmit(skb); |
743 | } | 755 | } |
@@ -815,7 +827,7 @@ static int arp_process(struct sk_buff *skb) | |||
815 | /* | 827 | /* |
816 | * Extract fields | 828 | * Extract fields |
817 | */ | 829 | */ |
818 | arp_ptr= (unsigned char *)(arp+1); | 830 | arp_ptr = (unsigned char *)(arp + 1); |
819 | sha = arp_ptr; | 831 | sha = arp_ptr; |
820 | arp_ptr += dev->addr_len; | 832 | arp_ptr += dev->addr_len; |
821 | memcpy(&sip, arp_ptr, 4); | 833 | memcpy(&sip, arp_ptr, 4); |
@@ -869,16 +881,17 @@ static int arp_process(struct sk_buff *skb) | |||
869 | addr_type = rt->rt_type; | 881 | addr_type = rt->rt_type; |
870 | 882 | ||
871 | if (addr_type == RTN_LOCAL) { | 883 | if (addr_type == RTN_LOCAL) { |
872 | int dont_send = 0; | 884 | int dont_send; |
873 | 885 | ||
874 | if (!dont_send) | 886 | dont_send = arp_ignore(in_dev, sip, tip); |
875 | dont_send |= arp_ignore(in_dev,sip,tip); | ||
876 | if (!dont_send && IN_DEV_ARPFILTER(in_dev)) | 887 | if (!dont_send && IN_DEV_ARPFILTER(in_dev)) |
877 | dont_send |= arp_filter(sip,tip,dev); | 888 | dont_send = arp_filter(sip, tip, dev); |
878 | if (!dont_send) { | 889 | if (!dont_send) { |
879 | n = neigh_event_ns(&arp_tbl, sha, &sip, dev); | 890 | n = neigh_event_ns(&arp_tbl, sha, &sip, dev); |
880 | if (n) { | 891 | if (n) { |
881 | arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); | 892 | arp_send(ARPOP_REPLY, ETH_P_ARP, sip, |
893 | dev, tip, sha, dev->dev_addr, | ||
894 | sha); | ||
882 | neigh_release(n); | 895 | neigh_release(n); |
883 | } | 896 | } |
884 | } | 897 | } |
@@ -887,8 +900,7 @@ static int arp_process(struct sk_buff *skb) | |||
887 | if (addr_type == RTN_UNICAST && | 900 | if (addr_type == RTN_UNICAST && |
888 | (arp_fwd_proxy(in_dev, dev, rt) || | 901 | (arp_fwd_proxy(in_dev, dev, rt) || |
889 | arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || | 902 | arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || |
890 | pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) | 903 | pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) { |
891 | { | ||
892 | n = neigh_event_ns(&arp_tbl, sha, &sip, dev); | 904 | n = neigh_event_ns(&arp_tbl, sha, &sip, dev); |
893 | if (n) | 905 | if (n) |
894 | neigh_release(n); | 906 | neigh_release(n); |
@@ -896,9 +908,12 @@ static int arp_process(struct sk_buff *skb) | |||
896 | if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || | 908 | if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || |
897 | skb->pkt_type == PACKET_HOST || | 909 | skb->pkt_type == PACKET_HOST || |
898 | in_dev->arp_parms->proxy_delay == 0) { | 910 | in_dev->arp_parms->proxy_delay == 0) { |
899 | arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); | 911 | arp_send(ARPOP_REPLY, ETH_P_ARP, sip, |
912 | dev, tip, sha, dev->dev_addr, | ||
913 | sha); | ||
900 | } else { | 914 | } else { |
901 | pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); | 915 | pneigh_enqueue(&arp_tbl, |
916 | in_dev->arp_parms, skb); | ||
902 | return 0; | 917 | return 0; |
903 | } | 918 | } |
904 | goto out; | 919 | goto out; |
@@ -939,7 +954,8 @@ static int arp_process(struct sk_buff *skb) | |||
939 | if (arp->ar_op != htons(ARPOP_REPLY) || | 954 | if (arp->ar_op != htons(ARPOP_REPLY) || |
940 | skb->pkt_type != PACKET_HOST) | 955 | skb->pkt_type != PACKET_HOST) |
941 | state = NUD_STALE; | 956 | state = NUD_STALE; |
942 | neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0); | 957 | neigh_update(n, sha, state, |
958 | override ? NEIGH_UPDATE_F_OVERRIDE : 0); | ||
943 | neigh_release(n); | 959 | neigh_release(n); |
944 | } | 960 | } |
945 | 961 | ||
@@ -975,7 +991,8 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, | |||
975 | arp->ar_pln != 4) | 991 | arp->ar_pln != 4) |
976 | goto freeskb; | 992 | goto freeskb; |
977 | 993 | ||
978 | if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) | 994 | skb = skb_share_check(skb, GFP_ATOMIC); |
995 | if (skb == NULL) | ||
979 | goto out_of_mem; | 996 | goto out_of_mem; |
980 | 997 | ||
981 | memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); | 998 | memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); |
@@ -1018,8 +1035,8 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, | |||
1018 | if (mask && mask != htonl(0xFFFFFFFF)) | 1035 | if (mask && mask != htonl(0xFFFFFFFF)) |
1019 | return -EINVAL; | 1036 | return -EINVAL; |
1020 | if (!dev && (r->arp_flags & ATF_COM)) { | 1037 | if (!dev && (r->arp_flags & ATF_COM)) { |
1021 | dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, | 1038 | dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family, |
1022 | r->arp_ha.sa_data); | 1039 | r->arp_ha.sa_data); |
1023 | if (!dev) | 1040 | if (!dev) |
1024 | return -ENODEV; | 1041 | return -ENODEV; |
1025 | } | 1042 | } |
@@ -1033,7 +1050,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, | |||
1033 | } | 1050 | } |
1034 | 1051 | ||
1035 | static int arp_req_set(struct net *net, struct arpreq *r, | 1052 | static int arp_req_set(struct net *net, struct arpreq *r, |
1036 | struct net_device * dev) | 1053 | struct net_device *dev) |
1037 | { | 1054 | { |
1038 | __be32 ip; | 1055 | __be32 ip; |
1039 | struct neighbour *neigh; | 1056 | struct neighbour *neigh; |
@@ -1046,11 +1063,10 @@ static int arp_req_set(struct net *net, struct arpreq *r, | |||
1046 | if (r->arp_flags & ATF_PERM) | 1063 | if (r->arp_flags & ATF_PERM) |
1047 | r->arp_flags |= ATF_COM; | 1064 | r->arp_flags |= ATF_COM; |
1048 | if (dev == NULL) { | 1065 | if (dev == NULL) { |
1049 | struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, | 1066 | struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0); |
1050 | .tos = RTO_ONLINK } } }; | 1067 | |
1051 | struct rtable * rt; | 1068 | if (IS_ERR(rt)) |
1052 | if ((err = ip_route_output_key(net, &rt, &fl)) != 0) | 1069 | return PTR_ERR(rt); |
1053 | return err; | ||
1054 | dev = rt->dst.dev; | 1070 | dev = rt->dst.dev; |
1055 | ip_rt_put(rt); | 1071 | ip_rt_put(rt); |
1056 | if (!dev) | 1072 | if (!dev) |
@@ -1083,9 +1099,9 @@ static int arp_req_set(struct net *net, struct arpreq *r, | |||
1083 | unsigned state = NUD_STALE; | 1099 | unsigned state = NUD_STALE; |
1084 | if (r->arp_flags & ATF_PERM) | 1100 | if (r->arp_flags & ATF_PERM) |
1085 | state = NUD_PERMANENT; | 1101 | state = NUD_PERMANENT; |
1086 | err = neigh_update(neigh, (r->arp_flags&ATF_COM) ? | 1102 | err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? |
1087 | r->arp_ha.sa_data : NULL, state, | 1103 | r->arp_ha.sa_data : NULL, state, |
1088 | NEIGH_UPDATE_F_OVERRIDE| | 1104 | NEIGH_UPDATE_F_OVERRIDE | |
1089 | NEIGH_UPDATE_F_ADMIN); | 1105 | NEIGH_UPDATE_F_ADMIN); |
1090 | neigh_release(neigh); | 1106 | neigh_release(neigh); |
1091 | } | 1107 | } |
@@ -1094,12 +1110,12 @@ static int arp_req_set(struct net *net, struct arpreq *r, | |||
1094 | 1110 | ||
1095 | static unsigned arp_state_to_flags(struct neighbour *neigh) | 1111 | static unsigned arp_state_to_flags(struct neighbour *neigh) |
1096 | { | 1112 | { |
1097 | unsigned flags = 0; | ||
1098 | if (neigh->nud_state&NUD_PERMANENT) | 1113 | if (neigh->nud_state&NUD_PERMANENT) |
1099 | flags = ATF_PERM|ATF_COM; | 1114 | return ATF_PERM | ATF_COM; |
1100 | else if (neigh->nud_state&NUD_VALID) | 1115 | else if (neigh->nud_state&NUD_VALID) |
1101 | flags = ATF_COM; | 1116 | return ATF_COM; |
1102 | return flags; | 1117 | else |
1118 | return 0; | ||
1103 | } | 1119 | } |
1104 | 1120 | ||
1105 | /* | 1121 | /* |
@@ -1126,6 +1142,23 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev) | |||
1126 | return err; | 1142 | return err; |
1127 | } | 1143 | } |
1128 | 1144 | ||
1145 | int arp_invalidate(struct net_device *dev, __be32 ip) | ||
1146 | { | ||
1147 | struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev); | ||
1148 | int err = -ENXIO; | ||
1149 | |||
1150 | if (neigh) { | ||
1151 | if (neigh->nud_state & ~NUD_NOARP) | ||
1152 | err = neigh_update(neigh, NULL, NUD_FAILED, | ||
1153 | NEIGH_UPDATE_F_OVERRIDE| | ||
1154 | NEIGH_UPDATE_F_ADMIN); | ||
1155 | neigh_release(neigh); | ||
1156 | } | ||
1157 | |||
1158 | return err; | ||
1159 | } | ||
1160 | EXPORT_SYMBOL(arp_invalidate); | ||
1161 | |||
1129 | static int arp_req_delete_public(struct net *net, struct arpreq *r, | 1162 | static int arp_req_delete_public(struct net *net, struct arpreq *r, |
1130 | struct net_device *dev) | 1163 | struct net_device *dev) |
1131 | { | 1164 | { |
@@ -1142,37 +1175,24 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r, | |||
1142 | } | 1175 | } |
1143 | 1176 | ||
1144 | static int arp_req_delete(struct net *net, struct arpreq *r, | 1177 | static int arp_req_delete(struct net *net, struct arpreq *r, |
1145 | struct net_device * dev) | 1178 | struct net_device *dev) |
1146 | { | 1179 | { |
1147 | int err; | ||
1148 | __be32 ip; | 1180 | __be32 ip; |
1149 | struct neighbour *neigh; | ||
1150 | 1181 | ||
1151 | if (r->arp_flags & ATF_PUBL) | 1182 | if (r->arp_flags & ATF_PUBL) |
1152 | return arp_req_delete_public(net, r, dev); | 1183 | return arp_req_delete_public(net, r, dev); |
1153 | 1184 | ||
1154 | ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; | 1185 | ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; |
1155 | if (dev == NULL) { | 1186 | if (dev == NULL) { |
1156 | struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, | 1187 | struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0); |
1157 | .tos = RTO_ONLINK } } }; | 1188 | if (IS_ERR(rt)) |
1158 | struct rtable * rt; | 1189 | return PTR_ERR(rt); |
1159 | if ((err = ip_route_output_key(net, &rt, &fl)) != 0) | ||
1160 | return err; | ||
1161 | dev = rt->dst.dev; | 1190 | dev = rt->dst.dev; |
1162 | ip_rt_put(rt); | 1191 | ip_rt_put(rt); |
1163 | if (!dev) | 1192 | if (!dev) |
1164 | return -EINVAL; | 1193 | return -EINVAL; |
1165 | } | 1194 | } |
1166 | err = -ENXIO; | 1195 | return arp_invalidate(dev, ip); |
1167 | neigh = neigh_lookup(&arp_tbl, &ip, dev); | ||
1168 | if (neigh) { | ||
1169 | if (neigh->nud_state&~NUD_NOARP) | ||
1170 | err = neigh_update(neigh, NULL, NUD_FAILED, | ||
1171 | NEIGH_UPDATE_F_OVERRIDE| | ||
1172 | NEIGH_UPDATE_F_ADMIN); | ||
1173 | neigh_release(neigh); | ||
1174 | } | ||
1175 | return err; | ||
1176 | } | 1196 | } |
1177 | 1197 | ||
1178 | /* | 1198 | /* |
@@ -1186,24 +1206,24 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) | |||
1186 | struct net_device *dev = NULL; | 1206 | struct net_device *dev = NULL; |
1187 | 1207 | ||
1188 | switch (cmd) { | 1208 | switch (cmd) { |
1189 | case SIOCDARP: | 1209 | case SIOCDARP: |
1190 | case SIOCSARP: | 1210 | case SIOCSARP: |
1191 | if (!capable(CAP_NET_ADMIN)) | 1211 | if (!capable(CAP_NET_ADMIN)) |
1192 | return -EPERM; | 1212 | return -EPERM; |
1193 | case SIOCGARP: | 1213 | case SIOCGARP: |
1194 | err = copy_from_user(&r, arg, sizeof(struct arpreq)); | 1214 | err = copy_from_user(&r, arg, sizeof(struct arpreq)); |
1195 | if (err) | 1215 | if (err) |
1196 | return -EFAULT; | 1216 | return -EFAULT; |
1197 | break; | 1217 | break; |
1198 | default: | 1218 | default: |
1199 | return -EINVAL; | 1219 | return -EINVAL; |
1200 | } | 1220 | } |
1201 | 1221 | ||
1202 | if (r.arp_pa.sa_family != AF_INET) | 1222 | if (r.arp_pa.sa_family != AF_INET) |
1203 | return -EPFNOSUPPORT; | 1223 | return -EPFNOSUPPORT; |
1204 | 1224 | ||
1205 | if (!(r.arp_flags & ATF_PUBL) && | 1225 | if (!(r.arp_flags & ATF_PUBL) && |
1206 | (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB))) | 1226 | (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB))) |
1207 | return -EINVAL; | 1227 | return -EINVAL; |
1208 | if (!(r.arp_flags & ATF_NETMASK)) | 1228 | if (!(r.arp_flags & ATF_NETMASK)) |
1209 | ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr = | 1229 | ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr = |
@@ -1211,7 +1231,8 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) | |||
1211 | rtnl_lock(); | 1231 | rtnl_lock(); |
1212 | if (r.arp_dev[0]) { | 1232 | if (r.arp_dev[0]) { |
1213 | err = -ENODEV; | 1233 | err = -ENODEV; |
1214 | if ((dev = __dev_get_by_name(net, r.arp_dev)) == NULL) | 1234 | dev = __dev_get_by_name(net, r.arp_dev); |
1235 | if (dev == NULL) | ||
1215 | goto out; | 1236 | goto out; |
1216 | 1237 | ||
1217 | /* Mmmm... It is wrong... ARPHRD_NETROM==0 */ | 1238 | /* Mmmm... It is wrong... ARPHRD_NETROM==0 */ |
@@ -1234,16 +1255,17 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) | |||
1234 | break; | 1255 | break; |
1235 | case SIOCGARP: | 1256 | case SIOCGARP: |
1236 | err = arp_req_get(&r, dev); | 1257 | err = arp_req_get(&r, dev); |
1237 | if (!err && copy_to_user(arg, &r, sizeof(r))) | ||
1238 | err = -EFAULT; | ||
1239 | break; | 1258 | break; |
1240 | } | 1259 | } |
1241 | out: | 1260 | out: |
1242 | rtnl_unlock(); | 1261 | rtnl_unlock(); |
1262 | if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r))) | ||
1263 | err = -EFAULT; | ||
1243 | return err; | 1264 | return err; |
1244 | } | 1265 | } |
1245 | 1266 | ||
1246 | static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) | 1267 | static int arp_netdev_event(struct notifier_block *this, unsigned long event, |
1268 | void *ptr) | ||
1247 | { | 1269 | { |
1248 | struct net_device *dev = ptr; | 1270 | struct net_device *dev = ptr; |
1249 | 1271 | ||
@@ -1311,12 +1333,13 @@ static char *ax2asc2(ax25_address *a, char *buf) | |||
1311 | for (n = 0, s = buf; n < 6; n++) { | 1333 | for (n = 0, s = buf; n < 6; n++) { |
1312 | c = (a->ax25_call[n] >> 1) & 0x7F; | 1334 | c = (a->ax25_call[n] >> 1) & 0x7F; |
1313 | 1335 | ||
1314 | if (c != ' ') *s++ = c; | 1336 | if (c != ' ') |
1337 | *s++ = c; | ||
1315 | } | 1338 | } |
1316 | 1339 | ||
1317 | *s++ = '-'; | 1340 | *s++ = '-'; |
1318 | 1341 | n = (a->ax25_call[6] >> 1) & 0x0F; | |
1319 | if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) { | 1342 | if (n > 9) { |
1320 | *s++ = '1'; | 1343 | *s++ = '1'; |
1321 | n -= 10; | 1344 | n -= 10; |
1322 | } | 1345 | } |
@@ -1325,10 +1348,9 @@ static char *ax2asc2(ax25_address *a, char *buf) | |||
1325 | *s++ = '\0'; | 1348 | *s++ = '\0'; |
1326 | 1349 | ||
1327 | if (*buf == '\0' || *buf == '-') | 1350 | if (*buf == '\0' || *buf == '-') |
1328 | return "*"; | 1351 | return "*"; |
1329 | 1352 | ||
1330 | return buf; | 1353 | return buf; |
1331 | |||
1332 | } | 1354 | } |
1333 | #endif /* CONFIG_AX25 */ | 1355 | #endif /* CONFIG_AX25 */ |
1334 | 1356 | ||
@@ -1408,10 +1430,10 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos) | |||
1408 | /* ------------------------------------------------------------------------ */ | 1430 | /* ------------------------------------------------------------------------ */ |
1409 | 1431 | ||
1410 | static const struct seq_operations arp_seq_ops = { | 1432 | static const struct seq_operations arp_seq_ops = { |
1411 | .start = arp_seq_start, | 1433 | .start = arp_seq_start, |
1412 | .next = neigh_seq_next, | 1434 | .next = neigh_seq_next, |
1413 | .stop = neigh_seq_stop, | 1435 | .stop = neigh_seq_stop, |
1414 | .show = arp_seq_show, | 1436 | .show = arp_seq_show, |
1415 | }; | 1437 | }; |
1416 | 1438 | ||
1417 | static int arp_seq_open(struct inode *inode, struct file *file) | 1439 | static int arp_seq_open(struct inode *inode, struct file *file) |
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 3a92a76ae41d..2b3c23c287cd 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c | |||
@@ -9,7 +9,7 @@ | |||
9 | * | 9 | * |
10 | * The CIPSO draft specification can be found in the kernel's Documentation | 10 | * The CIPSO draft specification can be found in the kernel's Documentation |
11 | * directory as well as the following URL: | 11 | * directory as well as the following URL: |
12 | * http://netlabel.sourceforge.net/files/draft-ietf-cipso-ipsecurity-01.txt | 12 | * http://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt |
13 | * The FIPS-188 specification can be found at the following URL: | 13 | * The FIPS-188 specification can be found at the following URL: |
14 | * http://www.itl.nist.gov/fipspubs/fip188.htm | 14 | * http://www.itl.nist.gov/fipspubs/fip188.htm |
15 | * | 15 | * |
@@ -112,7 +112,7 @@ int cipso_v4_rbm_strictvalid = 1; | |||
112 | /* The maximum number of category ranges permitted in the ranged category tag | 112 | /* The maximum number of category ranges permitted in the ranged category tag |
113 | * (tag #5). You may note that the IETF draft states that the maximum number | 113 | * (tag #5). You may note that the IETF draft states that the maximum number |
114 | * of category ranges is 7, but if the low end of the last category range is | 114 | * of category ranges is 7, but if the low end of the last category range is |
115 | * zero then it is possibile to fit 8 category ranges because the zero should | 115 | * zero then it is possible to fit 8 category ranges because the zero should |
116 | * be omitted. */ | 116 | * be omitted. */ |
117 | #define CIPSO_V4_TAG_RNG_CAT_MAX 8 | 117 | #define CIPSO_V4_TAG_RNG_CAT_MAX 8 |
118 | 118 | ||
@@ -438,7 +438,7 @@ cache_add_failure: | |||
438 | * | 438 | * |
439 | * Description: | 439 | * Description: |
440 | * Search the DOI definition list for a DOI definition with a DOI value that | 440 | * Search the DOI definition list for a DOI definition with a DOI value that |
441 | * matches @doi. The caller is responsibile for calling rcu_read_[un]lock(). | 441 | * matches @doi. The caller is responsible for calling rcu_read_[un]lock(). |
442 | * Returns a pointer to the DOI definition on success and NULL on failure. | 442 | * Returns a pointer to the DOI definition on success and NULL on failure. |
443 | */ | 443 | */ |
444 | static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi) | 444 | static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi) |
@@ -1293,7 +1293,7 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def, | |||
1293 | return ret_val; | 1293 | return ret_val; |
1294 | 1294 | ||
1295 | /* This will send packets using the "optimized" format when | 1295 | /* This will send packets using the "optimized" format when |
1296 | * possibile as specified in section 3.4.2.6 of the | 1296 | * possible as specified in section 3.4.2.6 of the |
1297 | * CIPSO draft. */ | 1297 | * CIPSO draft. */ |
1298 | if (cipso_v4_rbm_optfmt && ret_val > 0 && ret_val <= 10) | 1298 | if (cipso_v4_rbm_optfmt && ret_val > 0 && ret_val <= 10) |
1299 | tag_len = 14; | 1299 | tag_len = 14; |
@@ -1752,7 +1752,7 @@ validate_return: | |||
1752 | } | 1752 | } |
1753 | 1753 | ||
1754 | /** | 1754 | /** |
1755 | * cipso_v4_error - Send the correct reponse for a bad packet | 1755 | * cipso_v4_error - Send the correct response for a bad packet |
1756 | * @skb: the packet | 1756 | * @skb: the packet |
1757 | * @error: the error code | 1757 | * @error: the error code |
1758 | * @gateway: CIPSO gateway flag | 1758 | * @gateway: CIPSO gateway flag |
@@ -1857,6 +1857,11 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len, | |||
1857 | return CIPSO_V4_HDR_LEN + ret_val; | 1857 | return CIPSO_V4_HDR_LEN + ret_val; |
1858 | } | 1858 | } |
1859 | 1859 | ||
1860 | static void opt_kfree_rcu(struct rcu_head *head) | ||
1861 | { | ||
1862 | kfree(container_of(head, struct ip_options_rcu, rcu)); | ||
1863 | } | ||
1864 | |||
1860 | /** | 1865 | /** |
1861 | * cipso_v4_sock_setattr - Add a CIPSO option to a socket | 1866 | * cipso_v4_sock_setattr - Add a CIPSO option to a socket |
1862 | * @sk: the socket | 1867 | * @sk: the socket |
@@ -1879,7 +1884,7 @@ int cipso_v4_sock_setattr(struct sock *sk, | |||
1879 | unsigned char *buf = NULL; | 1884 | unsigned char *buf = NULL; |
1880 | u32 buf_len; | 1885 | u32 buf_len; |
1881 | u32 opt_len; | 1886 | u32 opt_len; |
1882 | struct ip_options *opt = NULL; | 1887 | struct ip_options_rcu *old, *opt = NULL; |
1883 | struct inet_sock *sk_inet; | 1888 | struct inet_sock *sk_inet; |
1884 | struct inet_connection_sock *sk_conn; | 1889 | struct inet_connection_sock *sk_conn; |
1885 | 1890 | ||
@@ -1915,22 +1920,25 @@ int cipso_v4_sock_setattr(struct sock *sk, | |||
1915 | ret_val = -ENOMEM; | 1920 | ret_val = -ENOMEM; |
1916 | goto socket_setattr_failure; | 1921 | goto socket_setattr_failure; |
1917 | } | 1922 | } |
1918 | memcpy(opt->__data, buf, buf_len); | 1923 | memcpy(opt->opt.__data, buf, buf_len); |
1919 | opt->optlen = opt_len; | 1924 | opt->opt.optlen = opt_len; |
1920 | opt->cipso = sizeof(struct iphdr); | 1925 | opt->opt.cipso = sizeof(struct iphdr); |
1921 | kfree(buf); | 1926 | kfree(buf); |
1922 | buf = NULL; | 1927 | buf = NULL; |
1923 | 1928 | ||
1924 | sk_inet = inet_sk(sk); | 1929 | sk_inet = inet_sk(sk); |
1930 | |||
1931 | old = rcu_dereference_protected(sk_inet->inet_opt, sock_owned_by_user(sk)); | ||
1925 | if (sk_inet->is_icsk) { | 1932 | if (sk_inet->is_icsk) { |
1926 | sk_conn = inet_csk(sk); | 1933 | sk_conn = inet_csk(sk); |
1927 | if (sk_inet->opt) | 1934 | if (old) |
1928 | sk_conn->icsk_ext_hdr_len -= sk_inet->opt->optlen; | 1935 | sk_conn->icsk_ext_hdr_len -= old->opt.optlen; |
1929 | sk_conn->icsk_ext_hdr_len += opt->optlen; | 1936 | sk_conn->icsk_ext_hdr_len += opt->opt.optlen; |
1930 | sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie); | 1937 | sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie); |
1931 | } | 1938 | } |
1932 | opt = xchg(&sk_inet->opt, opt); | 1939 | rcu_assign_pointer(sk_inet->inet_opt, opt); |
1933 | kfree(opt); | 1940 | if (old) |
1941 | call_rcu(&old->rcu, opt_kfree_rcu); | ||
1934 | 1942 | ||
1935 | return 0; | 1943 | return 0; |
1936 | 1944 | ||
@@ -1960,7 +1968,7 @@ int cipso_v4_req_setattr(struct request_sock *req, | |||
1960 | unsigned char *buf = NULL; | 1968 | unsigned char *buf = NULL; |
1961 | u32 buf_len; | 1969 | u32 buf_len; |
1962 | u32 opt_len; | 1970 | u32 opt_len; |
1963 | struct ip_options *opt = NULL; | 1971 | struct ip_options_rcu *opt = NULL; |
1964 | struct inet_request_sock *req_inet; | 1972 | struct inet_request_sock *req_inet; |
1965 | 1973 | ||
1966 | /* We allocate the maximum CIPSO option size here so we are probably | 1974 | /* We allocate the maximum CIPSO option size here so we are probably |
@@ -1988,15 +1996,16 @@ int cipso_v4_req_setattr(struct request_sock *req, | |||
1988 | ret_val = -ENOMEM; | 1996 | ret_val = -ENOMEM; |
1989 | goto req_setattr_failure; | 1997 | goto req_setattr_failure; |
1990 | } | 1998 | } |
1991 | memcpy(opt->__data, buf, buf_len); | 1999 | memcpy(opt->opt.__data, buf, buf_len); |
1992 | opt->optlen = opt_len; | 2000 | opt->opt.optlen = opt_len; |
1993 | opt->cipso = sizeof(struct iphdr); | 2001 | opt->opt.cipso = sizeof(struct iphdr); |
1994 | kfree(buf); | 2002 | kfree(buf); |
1995 | buf = NULL; | 2003 | buf = NULL; |
1996 | 2004 | ||
1997 | req_inet = inet_rsk(req); | 2005 | req_inet = inet_rsk(req); |
1998 | opt = xchg(&req_inet->opt, opt); | 2006 | opt = xchg(&req_inet->opt, opt); |
1999 | kfree(opt); | 2007 | if (opt) |
2008 | call_rcu(&opt->rcu, opt_kfree_rcu); | ||
2000 | 2009 | ||
2001 | return 0; | 2010 | return 0; |
2002 | 2011 | ||
@@ -2016,34 +2025,34 @@ req_setattr_failure: | |||
2016 | * values on failure. | 2025 | * values on failure. |
2017 | * | 2026 | * |
2018 | */ | 2027 | */ |
2019 | static int cipso_v4_delopt(struct ip_options **opt_ptr) | 2028 | static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr) |
2020 | { | 2029 | { |
2021 | int hdr_delta = 0; | 2030 | int hdr_delta = 0; |
2022 | struct ip_options *opt = *opt_ptr; | 2031 | struct ip_options_rcu *opt = *opt_ptr; |
2023 | 2032 | ||
2024 | if (opt->srr || opt->rr || opt->ts || opt->router_alert) { | 2033 | if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) { |
2025 | u8 cipso_len; | 2034 | u8 cipso_len; |
2026 | u8 cipso_off; | 2035 | u8 cipso_off; |
2027 | unsigned char *cipso_ptr; | 2036 | unsigned char *cipso_ptr; |
2028 | int iter; | 2037 | int iter; |
2029 | int optlen_new; | 2038 | int optlen_new; |
2030 | 2039 | ||
2031 | cipso_off = opt->cipso - sizeof(struct iphdr); | 2040 | cipso_off = opt->opt.cipso - sizeof(struct iphdr); |
2032 | cipso_ptr = &opt->__data[cipso_off]; | 2041 | cipso_ptr = &opt->opt.__data[cipso_off]; |
2033 | cipso_len = cipso_ptr[1]; | 2042 | cipso_len = cipso_ptr[1]; |
2034 | 2043 | ||
2035 | if (opt->srr > opt->cipso) | 2044 | if (opt->opt.srr > opt->opt.cipso) |
2036 | opt->srr -= cipso_len; | 2045 | opt->opt.srr -= cipso_len; |
2037 | if (opt->rr > opt->cipso) | 2046 | if (opt->opt.rr > opt->opt.cipso) |
2038 | opt->rr -= cipso_len; | 2047 | opt->opt.rr -= cipso_len; |
2039 | if (opt->ts > opt->cipso) | 2048 | if (opt->opt.ts > opt->opt.cipso) |
2040 | opt->ts -= cipso_len; | 2049 | opt->opt.ts -= cipso_len; |
2041 | if (opt->router_alert > opt->cipso) | 2050 | if (opt->opt.router_alert > opt->opt.cipso) |
2042 | opt->router_alert -= cipso_len; | 2051 | opt->opt.router_alert -= cipso_len; |
2043 | opt->cipso = 0; | 2052 | opt->opt.cipso = 0; |
2044 | 2053 | ||
2045 | memmove(cipso_ptr, cipso_ptr + cipso_len, | 2054 | memmove(cipso_ptr, cipso_ptr + cipso_len, |
2046 | opt->optlen - cipso_off - cipso_len); | 2055 | opt->opt.optlen - cipso_off - cipso_len); |
2047 | 2056 | ||
2048 | /* determining the new total option length is tricky because of | 2057 | /* determining the new total option length is tricky because of |
2049 | * the padding necessary, the only thing i can think to do at | 2058 | * the padding necessary, the only thing i can think to do at |
@@ -2052,21 +2061,21 @@ static int cipso_v4_delopt(struct ip_options **opt_ptr) | |||
2052 | * from there we can determine the new total option length */ | 2061 | * from there we can determine the new total option length */ |
2053 | iter = 0; | 2062 | iter = 0; |
2054 | optlen_new = 0; | 2063 | optlen_new = 0; |
2055 | while (iter < opt->optlen) | 2064 | while (iter < opt->opt.optlen) |
2056 | if (opt->__data[iter] != IPOPT_NOP) { | 2065 | if (opt->opt.__data[iter] != IPOPT_NOP) { |
2057 | iter += opt->__data[iter + 1]; | 2066 | iter += opt->opt.__data[iter + 1]; |
2058 | optlen_new = iter; | 2067 | optlen_new = iter; |
2059 | } else | 2068 | } else |
2060 | iter++; | 2069 | iter++; |
2061 | hdr_delta = opt->optlen; | 2070 | hdr_delta = opt->opt.optlen; |
2062 | opt->optlen = (optlen_new + 3) & ~3; | 2071 | opt->opt.optlen = (optlen_new + 3) & ~3; |
2063 | hdr_delta -= opt->optlen; | 2072 | hdr_delta -= opt->opt.optlen; |
2064 | } else { | 2073 | } else { |
2065 | /* only the cipso option was present on the socket so we can | 2074 | /* only the cipso option was present on the socket so we can |
2066 | * remove the entire option struct */ | 2075 | * remove the entire option struct */ |
2067 | *opt_ptr = NULL; | 2076 | *opt_ptr = NULL; |
2068 | hdr_delta = opt->optlen; | 2077 | hdr_delta = opt->opt.optlen; |
2069 | kfree(opt); | 2078 | call_rcu(&opt->rcu, opt_kfree_rcu); |
2070 | } | 2079 | } |
2071 | 2080 | ||
2072 | return hdr_delta; | 2081 | return hdr_delta; |
@@ -2083,15 +2092,15 @@ static int cipso_v4_delopt(struct ip_options **opt_ptr) | |||
2083 | void cipso_v4_sock_delattr(struct sock *sk) | 2092 | void cipso_v4_sock_delattr(struct sock *sk) |
2084 | { | 2093 | { |
2085 | int hdr_delta; | 2094 | int hdr_delta; |
2086 | struct ip_options *opt; | 2095 | struct ip_options_rcu *opt; |
2087 | struct inet_sock *sk_inet; | 2096 | struct inet_sock *sk_inet; |
2088 | 2097 | ||
2089 | sk_inet = inet_sk(sk); | 2098 | sk_inet = inet_sk(sk); |
2090 | opt = sk_inet->opt; | 2099 | opt = rcu_dereference_protected(sk_inet->inet_opt, 1); |
2091 | if (opt == NULL || opt->cipso == 0) | 2100 | if (opt == NULL || opt->opt.cipso == 0) |
2092 | return; | 2101 | return; |
2093 | 2102 | ||
2094 | hdr_delta = cipso_v4_delopt(&sk_inet->opt); | 2103 | hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt); |
2095 | if (sk_inet->is_icsk && hdr_delta > 0) { | 2104 | if (sk_inet->is_icsk && hdr_delta > 0) { |
2096 | struct inet_connection_sock *sk_conn = inet_csk(sk); | 2105 | struct inet_connection_sock *sk_conn = inet_csk(sk); |
2097 | sk_conn->icsk_ext_hdr_len -= hdr_delta; | 2106 | sk_conn->icsk_ext_hdr_len -= hdr_delta; |
@@ -2109,12 +2118,12 @@ void cipso_v4_sock_delattr(struct sock *sk) | |||
2109 | */ | 2118 | */ |
2110 | void cipso_v4_req_delattr(struct request_sock *req) | 2119 | void cipso_v4_req_delattr(struct request_sock *req) |
2111 | { | 2120 | { |
2112 | struct ip_options *opt; | 2121 | struct ip_options_rcu *opt; |
2113 | struct inet_request_sock *req_inet; | 2122 | struct inet_request_sock *req_inet; |
2114 | 2123 | ||
2115 | req_inet = inet_rsk(req); | 2124 | req_inet = inet_rsk(req); |
2116 | opt = req_inet->opt; | 2125 | opt = req_inet->opt; |
2117 | if (opt == NULL || opt->cipso == 0) | 2126 | if (opt == NULL || opt->opt.cipso == 0) |
2118 | return; | 2127 | return; |
2119 | 2128 | ||
2120 | cipso_v4_delopt(&req_inet->opt); | 2129 | cipso_v4_delopt(&req_inet->opt); |
@@ -2184,14 +2193,18 @@ getattr_return: | |||
2184 | */ | 2193 | */ |
2185 | int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) | 2194 | int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) |
2186 | { | 2195 | { |
2187 | struct ip_options *opt; | 2196 | struct ip_options_rcu *opt; |
2197 | int res = -ENOMSG; | ||
2188 | 2198 | ||
2189 | opt = inet_sk(sk)->opt; | 2199 | rcu_read_lock(); |
2190 | if (opt == NULL || opt->cipso == 0) | 2200 | opt = rcu_dereference(inet_sk(sk)->inet_opt); |
2191 | return -ENOMSG; | 2201 | if (opt && opt->opt.cipso) |
2192 | 2202 | res = cipso_v4_getattr(opt->opt.__data + | |
2193 | return cipso_v4_getattr(opt->__data + opt->cipso - sizeof(struct iphdr), | 2203 | opt->opt.cipso - |
2194 | secattr); | 2204 | sizeof(struct iphdr), |
2205 | secattr); | ||
2206 | rcu_read_unlock(); | ||
2207 | return res; | ||
2195 | } | 2208 | } |
2196 | 2209 | ||
2197 | /** | 2210 | /** |
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 721a8a37b45c..424fafbc8cb0 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c | |||
@@ -24,6 +24,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |||
24 | { | 24 | { |
25 | struct inet_sock *inet = inet_sk(sk); | 25 | struct inet_sock *inet = inet_sk(sk); |
26 | struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; | 26 | struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; |
27 | struct flowi4 *fl4; | ||
27 | struct rtable *rt; | 28 | struct rtable *rt; |
28 | __be32 saddr; | 29 | __be32 saddr; |
29 | int oif; | 30 | int oif; |
@@ -38,6 +39,8 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |||
38 | 39 | ||
39 | sk_dst_reset(sk); | 40 | sk_dst_reset(sk); |
40 | 41 | ||
42 | lock_sock(sk); | ||
43 | |||
41 | oif = sk->sk_bound_dev_if; | 44 | oif = sk->sk_bound_dev_if; |
42 | saddr = inet->inet_saddr; | 45 | saddr = inet->inet_saddr; |
43 | if (ipv4_is_multicast(usin->sin_addr.s_addr)) { | 46 | if (ipv4_is_multicast(usin->sin_addr.s_addr)) { |
@@ -46,33 +49,39 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |||
46 | if (!saddr) | 49 | if (!saddr) |
47 | saddr = inet->mc_addr; | 50 | saddr = inet->mc_addr; |
48 | } | 51 | } |
49 | err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr, | 52 | fl4 = &inet->cork.fl.u.ip4; |
50 | RT_CONN_FLAGS(sk), oif, | 53 | rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, |
51 | sk->sk_protocol, | 54 | RT_CONN_FLAGS(sk), oif, |
52 | inet->inet_sport, usin->sin_port, sk, 1); | 55 | sk->sk_protocol, |
53 | if (err) { | 56 | inet->inet_sport, usin->sin_port, sk, true); |
57 | if (IS_ERR(rt)) { | ||
58 | err = PTR_ERR(rt); | ||
54 | if (err == -ENETUNREACH) | 59 | if (err == -ENETUNREACH) |
55 | IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); | 60 | IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); |
56 | return err; | 61 | goto out; |
57 | } | 62 | } |
58 | 63 | ||
59 | if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) { | 64 | if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) { |
60 | ip_rt_put(rt); | 65 | ip_rt_put(rt); |
61 | return -EACCES; | 66 | err = -EACCES; |
67 | goto out; | ||
62 | } | 68 | } |
63 | if (!inet->inet_saddr) | 69 | if (!inet->inet_saddr) |
64 | inet->inet_saddr = rt->rt_src; /* Update source address */ | 70 | inet->inet_saddr = fl4->saddr; /* Update source address */ |
65 | if (!inet->inet_rcv_saddr) { | 71 | if (!inet->inet_rcv_saddr) { |
66 | inet->inet_rcv_saddr = rt->rt_src; | 72 | inet->inet_rcv_saddr = fl4->saddr; |
67 | if (sk->sk_prot->rehash) | 73 | if (sk->sk_prot->rehash) |
68 | sk->sk_prot->rehash(sk); | 74 | sk->sk_prot->rehash(sk); |
69 | } | 75 | } |
70 | inet->inet_daddr = rt->rt_dst; | 76 | inet->inet_daddr = fl4->daddr; |
71 | inet->inet_dport = usin->sin_port; | 77 | inet->inet_dport = usin->sin_port; |
72 | sk->sk_state = TCP_ESTABLISHED; | 78 | sk->sk_state = TCP_ESTABLISHED; |
73 | inet->inet_id = jiffies; | 79 | inet->inet_id = jiffies; |
74 | 80 | ||
75 | sk_dst_set(sk, &rt->dst); | 81 | sk_dst_set(sk, &rt->dst); |
76 | return(0); | 82 | err = 0; |
83 | out: | ||
84 | release_sock(sk); | ||
85 | return err; | ||
77 | } | 86 | } |
78 | EXPORT_SYMBOL(ip4_datagram_connect); | 87 | EXPORT_SYMBOL(ip4_datagram_connect); |
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index da14c49284f4..0d4a184af16f 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <linux/inetdevice.h> | 51 | #include <linux/inetdevice.h> |
52 | #include <linux/igmp.h> | 52 | #include <linux/igmp.h> |
53 | #include <linux/slab.h> | 53 | #include <linux/slab.h> |
54 | #include <linux/hash.h> | ||
54 | #ifdef CONFIG_SYSCTL | 55 | #ifdef CONFIG_SYSCTL |
55 | #include <linux/sysctl.h> | 56 | #include <linux/sysctl.h> |
56 | #endif | 57 | #endif |
@@ -63,6 +64,8 @@ | |||
63 | #include <net/rtnetlink.h> | 64 | #include <net/rtnetlink.h> |
64 | #include <net/net_namespace.h> | 65 | #include <net/net_namespace.h> |
65 | 66 | ||
67 | #include "fib_lookup.h" | ||
68 | |||
66 | static struct ipv4_devconf ipv4_devconf = { | 69 | static struct ipv4_devconf ipv4_devconf = { |
67 | .data = { | 70 | .data = { |
68 | [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, | 71 | [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, |
@@ -92,6 +95,85 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { | |||
92 | [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, | 95 | [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, |
93 | }; | 96 | }; |
94 | 97 | ||
98 | /* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE | ||
99 | * value. So if you change this define, make appropriate changes to | ||
100 | * inet_addr_hash as well. | ||
101 | */ | ||
102 | #define IN4_ADDR_HSIZE 256 | ||
103 | static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; | ||
104 | static DEFINE_SPINLOCK(inet_addr_hash_lock); | ||
105 | |||
106 | static inline unsigned int inet_addr_hash(struct net *net, __be32 addr) | ||
107 | { | ||
108 | u32 val = (__force u32) addr ^ hash_ptr(net, 8); | ||
109 | |||
110 | return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) & | ||
111 | (IN4_ADDR_HSIZE - 1)); | ||
112 | } | ||
113 | |||
114 | static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) | ||
115 | { | ||
116 | unsigned int hash = inet_addr_hash(net, ifa->ifa_local); | ||
117 | |||
118 | spin_lock(&inet_addr_hash_lock); | ||
119 | hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]); | ||
120 | spin_unlock(&inet_addr_hash_lock); | ||
121 | } | ||
122 | |||
123 | static void inet_hash_remove(struct in_ifaddr *ifa) | ||
124 | { | ||
125 | spin_lock(&inet_addr_hash_lock); | ||
126 | hlist_del_init_rcu(&ifa->hash); | ||
127 | spin_unlock(&inet_addr_hash_lock); | ||
128 | } | ||
129 | |||
130 | /** | ||
131 | * __ip_dev_find - find the first device with a given source address. | ||
132 | * @net: the net namespace | ||
133 | * @addr: the source address | ||
134 | * @devref: if true, take a reference on the found device | ||
135 | * | ||
136 | * If a caller uses devref=false, it should be protected by RCU, or RTNL | ||
137 | */ | ||
138 | struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) | ||
139 | { | ||
140 | unsigned int hash = inet_addr_hash(net, addr); | ||
141 | struct net_device *result = NULL; | ||
142 | struct in_ifaddr *ifa; | ||
143 | struct hlist_node *node; | ||
144 | |||
145 | rcu_read_lock(); | ||
146 | hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) { | ||
147 | struct net_device *dev = ifa->ifa_dev->dev; | ||
148 | |||
149 | if (!net_eq(dev_net(dev), net)) | ||
150 | continue; | ||
151 | if (ifa->ifa_local == addr) { | ||
152 | result = dev; | ||
153 | break; | ||
154 | } | ||
155 | } | ||
156 | if (!result) { | ||
157 | struct flowi4 fl4 = { .daddr = addr }; | ||
158 | struct fib_result res = { 0 }; | ||
159 | struct fib_table *local; | ||
160 | |||
161 | /* Fallback to FIB local table so that communication | ||
162 | * over loopback subnets work. | ||
163 | */ | ||
164 | local = fib_get_table(net, RT_TABLE_LOCAL); | ||
165 | if (local && | ||
166 | !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && | ||
167 | res.type == RTN_LOCAL) | ||
168 | result = FIB_RES_DEV(res); | ||
169 | } | ||
170 | if (result && devref) | ||
171 | dev_hold(result); | ||
172 | rcu_read_unlock(); | ||
173 | return result; | ||
174 | } | ||
175 | EXPORT_SYMBOL(__ip_dev_find); | ||
176 | |||
95 | static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); | 177 | static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); |
96 | 178 | ||
97 | static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); | 179 | static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); |
@@ -209,7 +291,7 @@ static void inetdev_destroy(struct in_device *in_dev) | |||
209 | inet_free_ifa(ifa); | 291 | inet_free_ifa(ifa); |
210 | } | 292 | } |
211 | 293 | ||
212 | dev->ip_ptr = NULL; | 294 | rcu_assign_pointer(dev->ip_ptr, NULL); |
213 | 295 | ||
214 | devinet_sysctl_unregister(in_dev); | 296 | devinet_sysctl_unregister(in_dev); |
215 | neigh_parms_release(&arp_tbl, in_dev->arp_parms); | 297 | neigh_parms_release(&arp_tbl, in_dev->arp_parms); |
@@ -265,6 +347,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, | |||
265 | } | 347 | } |
266 | 348 | ||
267 | if (!do_promote) { | 349 | if (!do_promote) { |
350 | inet_hash_remove(ifa); | ||
268 | *ifap1 = ifa->ifa_next; | 351 | *ifap1 = ifa->ifa_next; |
269 | 352 | ||
270 | rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid); | 353 | rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid); |
@@ -278,9 +361,21 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, | |||
278 | } | 361 | } |
279 | } | 362 | } |
280 | 363 | ||
364 | /* On promotion all secondaries from subnet are changing | ||
365 | * the primary IP, we must remove all their routes silently | ||
366 | * and later to add them back with new prefsrc. Do this | ||
367 | * while all addresses are on the device list. | ||
368 | */ | ||
369 | for (ifa = promote; ifa; ifa = ifa->ifa_next) { | ||
370 | if (ifa1->ifa_mask == ifa->ifa_mask && | ||
371 | inet_ifa_match(ifa1->ifa_address, ifa)) | ||
372 | fib_del_ifaddr(ifa, ifa1); | ||
373 | } | ||
374 | |||
281 | /* 2. Unlink it */ | 375 | /* 2. Unlink it */ |
282 | 376 | ||
283 | *ifap = ifa1->ifa_next; | 377 | *ifap = ifa1->ifa_next; |
378 | inet_hash_remove(ifa1); | ||
284 | 379 | ||
285 | /* 3. Announce address deletion */ | 380 | /* 3. Announce address deletion */ |
286 | 381 | ||
@@ -296,6 +391,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, | |||
296 | blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); | 391 | blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); |
297 | 392 | ||
298 | if (promote) { | 393 | if (promote) { |
394 | struct in_ifaddr *next_sec = promote->ifa_next; | ||
299 | 395 | ||
300 | if (prev_prom) { | 396 | if (prev_prom) { |
301 | prev_prom->ifa_next = promote->ifa_next; | 397 | prev_prom->ifa_next = promote->ifa_next; |
@@ -307,7 +403,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, | |||
307 | rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid); | 403 | rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid); |
308 | blocking_notifier_call_chain(&inetaddr_chain, | 404 | blocking_notifier_call_chain(&inetaddr_chain, |
309 | NETDEV_UP, promote); | 405 | NETDEV_UP, promote); |
310 | for (ifa = promote->ifa_next; ifa; ifa = ifa->ifa_next) { | 406 | for (ifa = next_sec; ifa; ifa = ifa->ifa_next) { |
311 | if (ifa1->ifa_mask != ifa->ifa_mask || | 407 | if (ifa1->ifa_mask != ifa->ifa_mask || |
312 | !inet_ifa_match(ifa1->ifa_address, ifa)) | 408 | !inet_ifa_match(ifa1->ifa_address, ifa)) |
313 | continue; | 409 | continue; |
@@ -368,6 +464,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, | |||
368 | ifa->ifa_next = *ifap; | 464 | ifa->ifa_next = *ifap; |
369 | *ifap = ifa; | 465 | *ifap = ifa; |
370 | 466 | ||
467 | inet_hash_insert(dev_net(in_dev->dev), ifa); | ||
468 | |||
371 | /* Send message first, then call notifier. | 469 | /* Send message first, then call notifier. |
372 | Notifier will trigger FIB update, so that | 470 | Notifier will trigger FIB update, so that |
373 | listeners of netlink will know about new ifaddr */ | 471 | listeners of netlink will know about new ifaddr */ |
@@ -403,6 +501,9 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) | |||
403 | return inet_insert_ifa(ifa); | 501 | return inet_insert_ifa(ifa); |
404 | } | 502 | } |
405 | 503 | ||
504 | /* Caller must hold RCU or RTNL : | ||
505 | * We dont take a reference on found in_device | ||
506 | */ | ||
406 | struct in_device *inetdev_by_index(struct net *net, int ifindex) | 507 | struct in_device *inetdev_by_index(struct net *net, int ifindex) |
407 | { | 508 | { |
408 | struct net_device *dev; | 509 | struct net_device *dev; |
@@ -411,7 +512,7 @@ struct in_device *inetdev_by_index(struct net *net, int ifindex) | |||
411 | rcu_read_lock(); | 512 | rcu_read_lock(); |
412 | dev = dev_get_by_index_rcu(net, ifindex); | 513 | dev = dev_get_by_index_rcu(net, ifindex); |
413 | if (dev) | 514 | if (dev) |
414 | in_dev = in_dev_get(dev); | 515 | in_dev = rcu_dereference_rtnl(dev->ip_ptr); |
415 | rcu_read_unlock(); | 516 | rcu_read_unlock(); |
416 | return in_dev; | 517 | return in_dev; |
417 | } | 518 | } |
@@ -453,8 +554,6 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg | |||
453 | goto errout; | 554 | goto errout; |
454 | } | 555 | } |
455 | 556 | ||
456 | __in_dev_put(in_dev); | ||
457 | |||
458 | for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; | 557 | for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; |
459 | ifap = &ifa->ifa_next) { | 558 | ifap = &ifa->ifa_next) { |
460 | if (tb[IFA_LOCAL] && | 559 | if (tb[IFA_LOCAL] && |
@@ -520,6 +619,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh) | |||
520 | if (tb[IFA_ADDRESS] == NULL) | 619 | if (tb[IFA_ADDRESS] == NULL) |
521 | tb[IFA_ADDRESS] = tb[IFA_LOCAL]; | 620 | tb[IFA_ADDRESS] = tb[IFA_LOCAL]; |
522 | 621 | ||
622 | INIT_HLIST_NODE(&ifa->hash); | ||
523 | ifa->ifa_prefixlen = ifm->ifa_prefixlen; | 623 | ifa->ifa_prefixlen = ifm->ifa_prefixlen; |
524 | ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); | 624 | ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); |
525 | ifa->ifa_flags = ifm->ifa_flags; | 625 | ifa->ifa_flags = ifm->ifa_flags; |
@@ -669,7 +769,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) | |||
669 | ifap = &ifa->ifa_next) { | 769 | ifap = &ifa->ifa_next) { |
670 | if (!strcmp(ifr.ifr_name, ifa->ifa_label) && | 770 | if (!strcmp(ifr.ifr_name, ifa->ifa_label) && |
671 | sin_orig.sin_addr.s_addr == | 771 | sin_orig.sin_addr.s_addr == |
672 | ifa->ifa_address) { | 772 | ifa->ifa_local) { |
673 | break; /* found */ | 773 | break; /* found */ |
674 | } | 774 | } |
675 | } | 775 | } |
@@ -727,6 +827,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) | |||
727 | if (!ifa) { | 827 | if (!ifa) { |
728 | ret = -ENOBUFS; | 828 | ret = -ENOBUFS; |
729 | ifa = inet_alloc_ifa(); | 829 | ifa = inet_alloc_ifa(); |
830 | INIT_HLIST_NODE(&ifa->hash); | ||
730 | if (!ifa) | 831 | if (!ifa) |
731 | break; | 832 | break; |
732 | if (colon) | 833 | if (colon) |
@@ -1029,6 +1130,21 @@ static inline bool inetdev_valid_mtu(unsigned mtu) | |||
1029 | return mtu >= 68; | 1130 | return mtu >= 68; |
1030 | } | 1131 | } |
1031 | 1132 | ||
1133 | static void inetdev_send_gratuitous_arp(struct net_device *dev, | ||
1134 | struct in_device *in_dev) | ||
1135 | |||
1136 | { | ||
1137 | struct in_ifaddr *ifa = in_dev->ifa_list; | ||
1138 | |||
1139 | if (!ifa) | ||
1140 | return; | ||
1141 | |||
1142 | arp_send(ARPOP_REQUEST, ETH_P_ARP, | ||
1143 | ifa->ifa_local, dev, | ||
1144 | ifa->ifa_local, NULL, | ||
1145 | dev->dev_addr, NULL); | ||
1146 | } | ||
1147 | |||
1032 | /* Called only under RTNL semaphore */ | 1148 | /* Called only under RTNL semaphore */ |
1033 | 1149 | ||
1034 | static int inetdev_event(struct notifier_block *this, unsigned long event, | 1150 | static int inetdev_event(struct notifier_block *this, unsigned long event, |
@@ -1059,7 +1175,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, | |||
1059 | switch (event) { | 1175 | switch (event) { |
1060 | case NETDEV_REGISTER: | 1176 | case NETDEV_REGISTER: |
1061 | printk(KERN_DEBUG "inetdev_event: bug\n"); | 1177 | printk(KERN_DEBUG "inetdev_event: bug\n"); |
1062 | dev->ip_ptr = NULL; | 1178 | rcu_assign_pointer(dev->ip_ptr, NULL); |
1063 | break; | 1179 | break; |
1064 | case NETDEV_UP: | 1180 | case NETDEV_UP: |
1065 | if (!inetdev_valid_mtu(dev->mtu)) | 1181 | if (!inetdev_valid_mtu(dev->mtu)) |
@@ -1068,6 +1184,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, | |||
1068 | struct in_ifaddr *ifa = inet_alloc_ifa(); | 1184 | struct in_ifaddr *ifa = inet_alloc_ifa(); |
1069 | 1185 | ||
1070 | if (ifa) { | 1186 | if (ifa) { |
1187 | INIT_HLIST_NODE(&ifa->hash); | ||
1071 | ifa->ifa_local = | 1188 | ifa->ifa_local = |
1072 | ifa->ifa_address = htonl(INADDR_LOOPBACK); | 1189 | ifa->ifa_address = htonl(INADDR_LOOPBACK); |
1073 | ifa->ifa_prefixlen = 8; | 1190 | ifa->ifa_prefixlen = 8; |
@@ -1081,18 +1198,13 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, | |||
1081 | } | 1198 | } |
1082 | ip_mc_up(in_dev); | 1199 | ip_mc_up(in_dev); |
1083 | /* fall through */ | 1200 | /* fall through */ |
1084 | case NETDEV_NOTIFY_PEERS: | ||
1085 | case NETDEV_CHANGEADDR: | 1201 | case NETDEV_CHANGEADDR: |
1202 | if (!IN_DEV_ARP_NOTIFY(in_dev)) | ||
1203 | break; | ||
1204 | /* fall through */ | ||
1205 | case NETDEV_NOTIFY_PEERS: | ||
1086 | /* Send gratuitous ARP to notify of link change */ | 1206 | /* Send gratuitous ARP to notify of link change */ |
1087 | if (IN_DEV_ARP_NOTIFY(in_dev)) { | 1207 | inetdev_send_gratuitous_arp(dev, in_dev); |
1088 | struct in_ifaddr *ifa = in_dev->ifa_list; | ||
1089 | |||
1090 | if (ifa) | ||
1091 | arp_send(ARPOP_REQUEST, ETH_P_ARP, | ||
1092 | ifa->ifa_address, dev, | ||
1093 | ifa->ifa_address, NULL, | ||
1094 | dev->dev_addr, NULL); | ||
1095 | } | ||
1096 | break; | 1208 | break; |
1097 | case NETDEV_DOWN: | 1209 | case NETDEV_DOWN: |
1098 | ip_mc_down(in_dev); | 1210 | ip_mc_down(in_dev); |
@@ -1255,6 +1367,87 @@ errout: | |||
1255 | rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); | 1367 | rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); |
1256 | } | 1368 | } |
1257 | 1369 | ||
1370 | static size_t inet_get_link_af_size(const struct net_device *dev) | ||
1371 | { | ||
1372 | struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); | ||
1373 | |||
1374 | if (!in_dev) | ||
1375 | return 0; | ||
1376 | |||
1377 | return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */ | ||
1378 | } | ||
1379 | |||
1380 | static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev) | ||
1381 | { | ||
1382 | struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); | ||
1383 | struct nlattr *nla; | ||
1384 | int i; | ||
1385 | |||
1386 | if (!in_dev) | ||
1387 | return -ENODATA; | ||
1388 | |||
1389 | nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4); | ||
1390 | if (nla == NULL) | ||
1391 | return -EMSGSIZE; | ||
1392 | |||
1393 | for (i = 0; i < IPV4_DEVCONF_MAX; i++) | ||
1394 | ((u32 *) nla_data(nla))[i] = in_dev->cnf.data[i]; | ||
1395 | |||
1396 | return 0; | ||
1397 | } | ||
1398 | |||
1399 | static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = { | ||
1400 | [IFLA_INET_CONF] = { .type = NLA_NESTED }, | ||
1401 | }; | ||
1402 | |||
1403 | static int inet_validate_link_af(const struct net_device *dev, | ||
1404 | const struct nlattr *nla) | ||
1405 | { | ||
1406 | struct nlattr *a, *tb[IFLA_INET_MAX+1]; | ||
1407 | int err, rem; | ||
1408 | |||
1409 | if (dev && !__in_dev_get_rtnl(dev)) | ||
1410 | return -EAFNOSUPPORT; | ||
1411 | |||
1412 | err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy); | ||
1413 | if (err < 0) | ||
1414 | return err; | ||
1415 | |||
1416 | if (tb[IFLA_INET_CONF]) { | ||
1417 | nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) { | ||
1418 | int cfgid = nla_type(a); | ||
1419 | |||
1420 | if (nla_len(a) < 4) | ||
1421 | return -EINVAL; | ||
1422 | |||
1423 | if (cfgid <= 0 || cfgid > IPV4_DEVCONF_MAX) | ||
1424 | return -EINVAL; | ||
1425 | } | ||
1426 | } | ||
1427 | |||
1428 | return 0; | ||
1429 | } | ||
1430 | |||
1431 | static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla) | ||
1432 | { | ||
1433 | struct in_device *in_dev = __in_dev_get_rtnl(dev); | ||
1434 | struct nlattr *a, *tb[IFLA_INET_MAX+1]; | ||
1435 | int rem; | ||
1436 | |||
1437 | if (!in_dev) | ||
1438 | return -EAFNOSUPPORT; | ||
1439 | |||
1440 | if (nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL) < 0) | ||
1441 | BUG(); | ||
1442 | |||
1443 | if (tb[IFLA_INET_CONF]) { | ||
1444 | nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) | ||
1445 | ipv4_devconf_set(in_dev, nla_type(a), nla_get_u32(a)); | ||
1446 | } | ||
1447 | |||
1448 | return 0; | ||
1449 | } | ||
1450 | |||
1258 | #ifdef CONFIG_SYSCTL | 1451 | #ifdef CONFIG_SYSCTL |
1259 | 1452 | ||
1260 | static void devinet_copy_dflt_conf(struct net *net, int i) | 1453 | static void devinet_copy_dflt_conf(struct net *net, int i) |
@@ -1348,9 +1541,9 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write, | |||
1348 | return ret; | 1541 | return ret; |
1349 | } | 1542 | } |
1350 | 1543 | ||
1351 | int ipv4_doint_and_flush(ctl_table *ctl, int write, | 1544 | static int ipv4_doint_and_flush(ctl_table *ctl, int write, |
1352 | void __user *buffer, | 1545 | void __user *buffer, |
1353 | size_t *lenp, loff_t *ppos) | 1546 | size_t *lenp, loff_t *ppos) |
1354 | { | 1547 | { |
1355 | int *valp = ctl->data; | 1548 | int *valp = ctl->data; |
1356 | int val = *valp; | 1549 | int val = *valp; |
@@ -1487,7 +1680,7 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf) | |||
1487 | return; | 1680 | return; |
1488 | 1681 | ||
1489 | cnf->sysctl = NULL; | 1682 | cnf->sysctl = NULL; |
1490 | unregister_sysctl_table(t->sysctl_header); | 1683 | unregister_net_sysctl_table(t->sysctl_header); |
1491 | kfree(t->dev_name); | 1684 | kfree(t->dev_name); |
1492 | kfree(t); | 1685 | kfree(t); |
1493 | } | 1686 | } |
@@ -1618,13 +1811,28 @@ static __net_initdata struct pernet_operations devinet_ops = { | |||
1618 | .exit = devinet_exit_net, | 1811 | .exit = devinet_exit_net, |
1619 | }; | 1812 | }; |
1620 | 1813 | ||
1814 | static struct rtnl_af_ops inet_af_ops = { | ||
1815 | .family = AF_INET, | ||
1816 | .fill_link_af = inet_fill_link_af, | ||
1817 | .get_link_af_size = inet_get_link_af_size, | ||
1818 | .validate_link_af = inet_validate_link_af, | ||
1819 | .set_link_af = inet_set_link_af, | ||
1820 | }; | ||
1821 | |||
1621 | void __init devinet_init(void) | 1822 | void __init devinet_init(void) |
1622 | { | 1823 | { |
1824 | int i; | ||
1825 | |||
1826 | for (i = 0; i < IN4_ADDR_HSIZE; i++) | ||
1827 | INIT_HLIST_HEAD(&inet_addr_lst[i]); | ||
1828 | |||
1623 | register_pernet_subsys(&devinet_ops); | 1829 | register_pernet_subsys(&devinet_ops); |
1624 | 1830 | ||
1625 | register_gifconf(PF_INET, inet_gifconf); | 1831 | register_gifconf(PF_INET, inet_gifconf); |
1626 | register_netdevice_notifier(&ip_netdev_notifier); | 1832 | register_netdevice_notifier(&ip_netdev_notifier); |
1627 | 1833 | ||
1834 | rtnl_af_register(&inet_af_ops); | ||
1835 | |||
1628 | rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL); | 1836 | rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL); |
1629 | rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL); | 1837 | rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL); |
1630 | rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr); | 1838 | rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr); |
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 14ca1f1c3fb0..a5b413416da3 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c | |||
@@ -23,6 +23,8 @@ struct esp_skb_cb { | |||
23 | 23 | ||
24 | #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) | 24 | #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) |
25 | 25 | ||
26 | static u32 esp4_get_mtu(struct xfrm_state *x, int mtu); | ||
27 | |||
26 | /* | 28 | /* |
27 | * Allocate an AEAD request structure with extra space for SG and IV. | 29 | * Allocate an AEAD request structure with extra space for SG and IV. |
28 | * | 30 | * |
@@ -31,11 +33,14 @@ struct esp_skb_cb { | |||
31 | * | 33 | * |
32 | * TODO: Use spare space in skb for this where possible. | 34 | * TODO: Use spare space in skb for this where possible. |
33 | */ | 35 | */ |
34 | static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags) | 36 | static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen) |
35 | { | 37 | { |
36 | unsigned int len; | 38 | unsigned int len; |
37 | 39 | ||
38 | len = crypto_aead_ivsize(aead); | 40 | len = seqhilen; |
41 | |||
42 | len += crypto_aead_ivsize(aead); | ||
43 | |||
39 | if (len) { | 44 | if (len) { |
40 | len += crypto_aead_alignmask(aead) & | 45 | len += crypto_aead_alignmask(aead) & |
41 | ~(crypto_tfm_ctx_alignment() - 1); | 46 | ~(crypto_tfm_ctx_alignment() - 1); |
@@ -50,10 +55,15 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags) | |||
50 | return kmalloc(len, GFP_ATOMIC); | 55 | return kmalloc(len, GFP_ATOMIC); |
51 | } | 56 | } |
52 | 57 | ||
53 | static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp) | 58 | static inline __be32 *esp_tmp_seqhi(void *tmp) |
59 | { | ||
60 | return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32)); | ||
61 | } | ||
62 | static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen) | ||
54 | { | 63 | { |
55 | return crypto_aead_ivsize(aead) ? | 64 | return crypto_aead_ivsize(aead) ? |
56 | PTR_ALIGN((u8 *)tmp, crypto_aead_alignmask(aead) + 1) : tmp; | 65 | PTR_ALIGN((u8 *)tmp + seqhilen, |
66 | crypto_aead_alignmask(aead) + 1) : tmp + seqhilen; | ||
57 | } | 67 | } |
58 | 68 | ||
59 | static inline struct aead_givcrypt_request *esp_tmp_givreq( | 69 | static inline struct aead_givcrypt_request *esp_tmp_givreq( |
@@ -117,46 +127,75 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) | |||
117 | int blksize; | 127 | int blksize; |
118 | int clen; | 128 | int clen; |
119 | int alen; | 129 | int alen; |
130 | int plen; | ||
131 | int tfclen; | ||
120 | int nfrags; | 132 | int nfrags; |
133 | int assoclen; | ||
134 | int sglists; | ||
135 | int seqhilen; | ||
136 | __be32 *seqhi; | ||
121 | 137 | ||
122 | /* skb is pure payload to encrypt */ | 138 | /* skb is pure payload to encrypt */ |
123 | 139 | ||
124 | err = -ENOMEM; | 140 | err = -ENOMEM; |
125 | 141 | ||
126 | /* Round to block size */ | ||
127 | clen = skb->len; | ||
128 | |||
129 | esp = x->data; | 142 | esp = x->data; |
130 | aead = esp->aead; | 143 | aead = esp->aead; |
131 | alen = crypto_aead_authsize(aead); | 144 | alen = crypto_aead_authsize(aead); |
132 | 145 | ||
146 | tfclen = 0; | ||
147 | if (x->tfcpad) { | ||
148 | struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); | ||
149 | u32 padto; | ||
150 | |||
151 | padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached)); | ||
152 | if (skb->len < padto) | ||
153 | tfclen = padto - skb->len; | ||
154 | } | ||
133 | blksize = ALIGN(crypto_aead_blocksize(aead), 4); | 155 | blksize = ALIGN(crypto_aead_blocksize(aead), 4); |
134 | clen = ALIGN(clen + 2, blksize); | 156 | clen = ALIGN(skb->len + 2 + tfclen, blksize); |
135 | if (esp->padlen) | 157 | if (esp->padlen) |
136 | clen = ALIGN(clen, esp->padlen); | 158 | clen = ALIGN(clen, esp->padlen); |
159 | plen = clen - skb->len - tfclen; | ||
137 | 160 | ||
138 | if ((err = skb_cow_data(skb, clen - skb->len + alen, &trailer)) < 0) | 161 | err = skb_cow_data(skb, tfclen + plen + alen, &trailer); |
162 | if (err < 0) | ||
139 | goto error; | 163 | goto error; |
140 | nfrags = err; | 164 | nfrags = err; |
141 | 165 | ||
142 | tmp = esp_alloc_tmp(aead, nfrags + 1); | 166 | assoclen = sizeof(*esph); |
167 | sglists = 1; | ||
168 | seqhilen = 0; | ||
169 | |||
170 | if (x->props.flags & XFRM_STATE_ESN) { | ||
171 | sglists += 2; | ||
172 | seqhilen += sizeof(__be32); | ||
173 | assoclen += seqhilen; | ||
174 | } | ||
175 | |||
176 | tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); | ||
143 | if (!tmp) | 177 | if (!tmp) |
144 | goto error; | 178 | goto error; |
145 | 179 | ||
146 | iv = esp_tmp_iv(aead, tmp); | 180 | seqhi = esp_tmp_seqhi(tmp); |
181 | iv = esp_tmp_iv(aead, tmp, seqhilen); | ||
147 | req = esp_tmp_givreq(aead, iv); | 182 | req = esp_tmp_givreq(aead, iv); |
148 | asg = esp_givreq_sg(aead, req); | 183 | asg = esp_givreq_sg(aead, req); |
149 | sg = asg + 1; | 184 | sg = asg + sglists; |
150 | 185 | ||
151 | /* Fill padding... */ | 186 | /* Fill padding... */ |
152 | tail = skb_tail_pointer(trailer); | 187 | tail = skb_tail_pointer(trailer); |
188 | if (tfclen) { | ||
189 | memset(tail, 0, tfclen); | ||
190 | tail += tfclen; | ||
191 | } | ||
153 | do { | 192 | do { |
154 | int i; | 193 | int i; |
155 | for (i=0; i<clen-skb->len - 2; i++) | 194 | for (i = 0; i < plen - 2; i++) |
156 | tail[i] = i + 1; | 195 | tail[i] = i + 1; |
157 | } while (0); | 196 | } while (0); |
158 | tail[clen - skb->len - 2] = (clen - skb->len) - 2; | 197 | tail[plen - 2] = plen - 2; |
159 | tail[clen - skb->len - 1] = *skb_mac_header(skb); | 198 | tail[plen - 1] = *skb_mac_header(skb); |
160 | pskb_put(skb, trailer, clen - skb->len + alen); | 199 | pskb_put(skb, trailer, clen - skb->len + alen); |
161 | 200 | ||
162 | skb_push(skb, -skb_network_offset(skb)); | 201 | skb_push(skb, -skb_network_offset(skb)); |
@@ -199,19 +238,27 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) | |||
199 | } | 238 | } |
200 | 239 | ||
201 | esph->spi = x->id.spi; | 240 | esph->spi = x->id.spi; |
202 | esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output); | 241 | esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); |
203 | 242 | ||
204 | sg_init_table(sg, nfrags); | 243 | sg_init_table(sg, nfrags); |
205 | skb_to_sgvec(skb, sg, | 244 | skb_to_sgvec(skb, sg, |
206 | esph->enc_data + crypto_aead_ivsize(aead) - skb->data, | 245 | esph->enc_data + crypto_aead_ivsize(aead) - skb->data, |
207 | clen + alen); | 246 | clen + alen); |
208 | sg_init_one(asg, esph, sizeof(*esph)); | 247 | |
248 | if ((x->props.flags & XFRM_STATE_ESN)) { | ||
249 | sg_init_table(asg, 3); | ||
250 | sg_set_buf(asg, &esph->spi, sizeof(__be32)); | ||
251 | *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); | ||
252 | sg_set_buf(asg + 1, seqhi, seqhilen); | ||
253 | sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); | ||
254 | } else | ||
255 | sg_init_one(asg, esph, sizeof(*esph)); | ||
209 | 256 | ||
210 | aead_givcrypt_set_callback(req, 0, esp_output_done, skb); | 257 | aead_givcrypt_set_callback(req, 0, esp_output_done, skb); |
211 | aead_givcrypt_set_crypt(req, sg, sg, clen, iv); | 258 | aead_givcrypt_set_crypt(req, sg, sg, clen, iv); |
212 | aead_givcrypt_set_assoc(req, asg, sizeof(*esph)); | 259 | aead_givcrypt_set_assoc(req, asg, assoclen); |
213 | aead_givcrypt_set_giv(req, esph->enc_data, | 260 | aead_givcrypt_set_giv(req, esph->enc_data, |
214 | XFRM_SKB_CB(skb)->seq.output); | 261 | XFRM_SKB_CB(skb)->seq.output.low); |
215 | 262 | ||
216 | ESP_SKB_CB(skb)->tmp = tmp; | 263 | ESP_SKB_CB(skb)->tmp = tmp; |
217 | err = crypto_aead_givencrypt(req); | 264 | err = crypto_aead_givencrypt(req); |
@@ -229,7 +276,7 @@ error: | |||
229 | 276 | ||
230 | static int esp_input_done2(struct sk_buff *skb, int err) | 277 | static int esp_input_done2(struct sk_buff *skb, int err) |
231 | { | 278 | { |
232 | struct iphdr *iph; | 279 | const struct iphdr *iph; |
233 | struct xfrm_state *x = xfrm_input_state(skb); | 280 | struct xfrm_state *x = xfrm_input_state(skb); |
234 | struct esp_data *esp = x->data; | 281 | struct esp_data *esp = x->data; |
235 | struct crypto_aead *aead = esp->aead; | 282 | struct crypto_aead *aead = esp->aead; |
@@ -330,6 +377,10 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) | |||
330 | struct sk_buff *trailer; | 377 | struct sk_buff *trailer; |
331 | int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead); | 378 | int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead); |
332 | int nfrags; | 379 | int nfrags; |
380 | int assoclen; | ||
381 | int sglists; | ||
382 | int seqhilen; | ||
383 | __be32 *seqhi; | ||
333 | void *tmp; | 384 | void *tmp; |
334 | u8 *iv; | 385 | u8 *iv; |
335 | struct scatterlist *sg; | 386 | struct scatterlist *sg; |
@@ -346,16 +397,27 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) | |||
346 | goto out; | 397 | goto out; |
347 | nfrags = err; | 398 | nfrags = err; |
348 | 399 | ||
400 | assoclen = sizeof(*esph); | ||
401 | sglists = 1; | ||
402 | seqhilen = 0; | ||
403 | |||
404 | if (x->props.flags & XFRM_STATE_ESN) { | ||
405 | sglists += 2; | ||
406 | seqhilen += sizeof(__be32); | ||
407 | assoclen += seqhilen; | ||
408 | } | ||
409 | |||
349 | err = -ENOMEM; | 410 | err = -ENOMEM; |
350 | tmp = esp_alloc_tmp(aead, nfrags + 1); | 411 | tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); |
351 | if (!tmp) | 412 | if (!tmp) |
352 | goto out; | 413 | goto out; |
353 | 414 | ||
354 | ESP_SKB_CB(skb)->tmp = tmp; | 415 | ESP_SKB_CB(skb)->tmp = tmp; |
355 | iv = esp_tmp_iv(aead, tmp); | 416 | seqhi = esp_tmp_seqhi(tmp); |
417 | iv = esp_tmp_iv(aead, tmp, seqhilen); | ||
356 | req = esp_tmp_req(aead, iv); | 418 | req = esp_tmp_req(aead, iv); |
357 | asg = esp_req_sg(aead, req); | 419 | asg = esp_req_sg(aead, req); |
358 | sg = asg + 1; | 420 | sg = asg + sglists; |
359 | 421 | ||
360 | skb->ip_summed = CHECKSUM_NONE; | 422 | skb->ip_summed = CHECKSUM_NONE; |
361 | 423 | ||
@@ -366,11 +428,19 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) | |||
366 | 428 | ||
367 | sg_init_table(sg, nfrags); | 429 | sg_init_table(sg, nfrags); |
368 | skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen); | 430 | skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen); |
369 | sg_init_one(asg, esph, sizeof(*esph)); | 431 | |
432 | if ((x->props.flags & XFRM_STATE_ESN)) { | ||
433 | sg_init_table(asg, 3); | ||
434 | sg_set_buf(asg, &esph->spi, sizeof(__be32)); | ||
435 | *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; | ||
436 | sg_set_buf(asg + 1, seqhi, seqhilen); | ||
437 | sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); | ||
438 | } else | ||
439 | sg_init_one(asg, esph, sizeof(*esph)); | ||
370 | 440 | ||
371 | aead_request_set_callback(req, 0, esp_input_done, skb); | 441 | aead_request_set_callback(req, 0, esp_input_done, skb); |
372 | aead_request_set_crypt(req, sg, sg, elen, iv); | 442 | aead_request_set_crypt(req, sg, sg, elen, iv); |
373 | aead_request_set_assoc(req, asg, sizeof(*esph)); | 443 | aead_request_set_assoc(req, asg, assoclen); |
374 | 444 | ||
375 | err = crypto_aead_decrypt(req); | 445 | err = crypto_aead_decrypt(req); |
376 | if (err == -EINPROGRESS) | 446 | if (err == -EINPROGRESS) |
@@ -414,7 +484,7 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) | |||
414 | static void esp4_err(struct sk_buff *skb, u32 info) | 484 | static void esp4_err(struct sk_buff *skb, u32 info) |
415 | { | 485 | { |
416 | struct net *net = dev_net(skb->dev); | 486 | struct net *net = dev_net(skb->dev); |
417 | struct iphdr *iph = (struct iphdr *)skb->data; | 487 | const struct iphdr *iph = (const struct iphdr *)skb->data; |
418 | struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); | 488 | struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); |
419 | struct xfrm_state *x; | 489 | struct xfrm_state *x; |
420 | 490 | ||
@@ -422,7 +492,8 @@ static void esp4_err(struct sk_buff *skb, u32 info) | |||
422 | icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) | 492 | icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) |
423 | return; | 493 | return; |
424 | 494 | ||
425 | x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); | 495 | x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, |
496 | esph->spi, IPPROTO_ESP, AF_INET); | ||
426 | if (!x) | 497 | if (!x) |
427 | return; | 498 | return; |
428 | NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", | 499 | NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", |
@@ -484,10 +555,20 @@ static int esp_init_authenc(struct xfrm_state *x) | |||
484 | goto error; | 555 | goto error; |
485 | 556 | ||
486 | err = -ENAMETOOLONG; | 557 | err = -ENAMETOOLONG; |
487 | if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, "authenc(%s,%s)", | 558 | |
488 | x->aalg ? x->aalg->alg_name : "digest_null", | 559 | if ((x->props.flags & XFRM_STATE_ESN)) { |
489 | x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) | 560 | if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, |
490 | goto error; | 561 | "authencesn(%s,%s)", |
562 | x->aalg ? x->aalg->alg_name : "digest_null", | ||
563 | x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) | ||
564 | goto error; | ||
565 | } else { | ||
566 | if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, | ||
567 | "authenc(%s,%s)", | ||
568 | x->aalg ? x->aalg->alg_name : "digest_null", | ||
569 | x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) | ||
570 | goto error; | ||
571 | } | ||
491 | 572 | ||
492 | aead = crypto_alloc_aead(authenc_name, 0, 0); | 573 | aead = crypto_alloc_aead(authenc_name, 0, 0); |
493 | err = PTR_ERR(aead); | 574 | err = PTR_ERR(aead); |
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 7d02a9f999fa..22524716fe70 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c | |||
@@ -44,6 +44,7 @@ | |||
44 | #include <net/arp.h> | 44 | #include <net/arp.h> |
45 | #include <net/ip_fib.h> | 45 | #include <net/ip_fib.h> |
46 | #include <net/rtnetlink.h> | 46 | #include <net/rtnetlink.h> |
47 | #include <net/xfrm.h> | ||
47 | 48 | ||
48 | #ifndef CONFIG_IP_MULTIPLE_TABLES | 49 | #ifndef CONFIG_IP_MULTIPLE_TABLES |
49 | 50 | ||
@@ -51,11 +52,11 @@ static int __net_init fib4_rules_init(struct net *net) | |||
51 | { | 52 | { |
52 | struct fib_table *local_table, *main_table; | 53 | struct fib_table *local_table, *main_table; |
53 | 54 | ||
54 | local_table = fib_hash_table(RT_TABLE_LOCAL); | 55 | local_table = fib_trie_table(RT_TABLE_LOCAL); |
55 | if (local_table == NULL) | 56 | if (local_table == NULL) |
56 | return -ENOMEM; | 57 | return -ENOMEM; |
57 | 58 | ||
58 | main_table = fib_hash_table(RT_TABLE_MAIN); | 59 | main_table = fib_trie_table(RT_TABLE_MAIN); |
59 | if (main_table == NULL) | 60 | if (main_table == NULL) |
60 | goto fail; | 61 | goto fail; |
61 | 62 | ||
@@ -82,7 +83,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id) | |||
82 | if (tb) | 83 | if (tb) |
83 | return tb; | 84 | return tb; |
84 | 85 | ||
85 | tb = fib_hash_table(id); | 86 | tb = fib_trie_table(id); |
86 | if (!tb) | 87 | if (!tb) |
87 | return NULL; | 88 | return NULL; |
88 | h = id & (FIB_TABLE_HASHSZ - 1); | 89 | h = id & (FIB_TABLE_HASHSZ - 1); |
@@ -114,21 +115,6 @@ struct fib_table *fib_get_table(struct net *net, u32 id) | |||
114 | } | 115 | } |
115 | #endif /* CONFIG_IP_MULTIPLE_TABLES */ | 116 | #endif /* CONFIG_IP_MULTIPLE_TABLES */ |
116 | 117 | ||
117 | void fib_select_default(struct net *net, | ||
118 | const struct flowi *flp, struct fib_result *res) | ||
119 | { | ||
120 | struct fib_table *tb; | ||
121 | int table = RT_TABLE_MAIN; | ||
122 | #ifdef CONFIG_IP_MULTIPLE_TABLES | ||
123 | if (res->r == NULL || res->r->action != FR_ACT_TO_TBL) | ||
124 | return; | ||
125 | table = res->r->table; | ||
126 | #endif | ||
127 | tb = fib_get_table(net, table); | ||
128 | if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) | ||
129 | fib_table_select_default(tb, flp, res); | ||
130 | } | ||
131 | |||
132 | static void fib_flush(struct net *net) | 118 | static void fib_flush(struct net *net) |
133 | { | 119 | { |
134 | int flushed = 0; | 120 | int flushed = 0; |
@@ -148,36 +134,6 @@ static void fib_flush(struct net *net) | |||
148 | } | 134 | } |
149 | 135 | ||
150 | /* | 136 | /* |
151 | * Find the first device with a given source address. | ||
152 | */ | ||
153 | |||
154 | struct net_device * ip_dev_find(struct net *net, __be32 addr) | ||
155 | { | ||
156 | struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; | ||
157 | struct fib_result res; | ||
158 | struct net_device *dev = NULL; | ||
159 | struct fib_table *local_table; | ||
160 | |||
161 | #ifdef CONFIG_IP_MULTIPLE_TABLES | ||
162 | res.r = NULL; | ||
163 | #endif | ||
164 | |||
165 | local_table = fib_get_table(net, RT_TABLE_LOCAL); | ||
166 | if (!local_table || fib_table_lookup(local_table, &fl, &res)) | ||
167 | return NULL; | ||
168 | if (res.type != RTN_LOCAL) | ||
169 | goto out; | ||
170 | dev = FIB_RES_DEV(res); | ||
171 | |||
172 | if (dev) | ||
173 | dev_hold(dev); | ||
174 | out: | ||
175 | fib_res_put(&res); | ||
176 | return dev; | ||
177 | } | ||
178 | EXPORT_SYMBOL(ip_dev_find); | ||
179 | |||
180 | /* | ||
181 | * Find address type as if only "dev" was present in the system. If | 137 | * Find address type as if only "dev" was present in the system. If |
182 | * on_dev is NULL then all interfaces are taken into consideration. | 138 | * on_dev is NULL then all interfaces are taken into consideration. |
183 | */ | 139 | */ |
@@ -185,7 +141,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net, | |||
185 | const struct net_device *dev, | 141 | const struct net_device *dev, |
186 | __be32 addr) | 142 | __be32 addr) |
187 | { | 143 | { |
188 | struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; | 144 | struct flowi4 fl4 = { .daddr = addr }; |
189 | struct fib_result res; | 145 | struct fib_result res; |
190 | unsigned ret = RTN_BROADCAST; | 146 | unsigned ret = RTN_BROADCAST; |
191 | struct fib_table *local_table; | 147 | struct fib_table *local_table; |
@@ -202,11 +158,12 @@ static inline unsigned __inet_dev_addr_type(struct net *net, | |||
202 | local_table = fib_get_table(net, RT_TABLE_LOCAL); | 158 | local_table = fib_get_table(net, RT_TABLE_LOCAL); |
203 | if (local_table) { | 159 | if (local_table) { |
204 | ret = RTN_UNICAST; | 160 | ret = RTN_UNICAST; |
205 | if (!fib_table_lookup(local_table, &fl, &res)) { | 161 | rcu_read_lock(); |
162 | if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) { | ||
206 | if (!dev || dev == res.fi->fib_dev) | 163 | if (!dev || dev == res.fi->fib_dev) |
207 | ret = res.type; | 164 | ret = res.type; |
208 | fib_res_put(&res); | ||
209 | } | 165 | } |
166 | rcu_read_unlock(); | ||
210 | } | 167 | } |
211 | return ret; | 168 | return ret; |
212 | } | 169 | } |
@@ -220,59 +177,60 @@ EXPORT_SYMBOL(inet_addr_type); | |||
220 | unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, | 177 | unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, |
221 | __be32 addr) | 178 | __be32 addr) |
222 | { | 179 | { |
223 | return __inet_dev_addr_type(net, dev, addr); | 180 | return __inet_dev_addr_type(net, dev, addr); |
224 | } | 181 | } |
225 | EXPORT_SYMBOL(inet_dev_addr_type); | 182 | EXPORT_SYMBOL(inet_dev_addr_type); |
226 | 183 | ||
227 | /* Given (packet source, input interface) and optional (dst, oif, tos): | 184 | /* Given (packet source, input interface) and optional (dst, oif, tos): |
228 | - (main) check, that source is valid i.e. not broadcast or our local | 185 | * - (main) check, that source is valid i.e. not broadcast or our local |
229 | address. | 186 | * address. |
230 | - figure out what "logical" interface this packet arrived | 187 | * - figure out what "logical" interface this packet arrived |
231 | and calculate "specific destination" address. | 188 | * and calculate "specific destination" address. |
232 | - check, that packet arrived from expected physical interface. | 189 | * - check, that packet arrived from expected physical interface. |
190 | * called with rcu_read_lock() | ||
233 | */ | 191 | */ |
234 | 192 | int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, | |
235 | int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, | 193 | int oif, struct net_device *dev, __be32 *spec_dst, |
236 | struct net_device *dev, __be32 *spec_dst, | 194 | u32 *itag) |
237 | u32 *itag, u32 mark) | ||
238 | { | 195 | { |
239 | struct in_device *in_dev; | 196 | struct in_device *in_dev; |
240 | struct flowi fl = { .nl_u = { .ip4_u = | 197 | struct flowi4 fl4; |
241 | { .daddr = src, | ||
242 | .saddr = dst, | ||
243 | .tos = tos } }, | ||
244 | .mark = mark, | ||
245 | .iif = oif }; | ||
246 | |||
247 | struct fib_result res; | 198 | struct fib_result res; |
248 | int no_addr, rpf, accept_local; | 199 | int no_addr, rpf, accept_local; |
249 | bool dev_match; | 200 | bool dev_match; |
250 | int ret; | 201 | int ret; |
251 | struct net *net; | 202 | struct net *net; |
252 | 203 | ||
204 | fl4.flowi4_oif = 0; | ||
205 | fl4.flowi4_iif = oif; | ||
206 | fl4.daddr = src; | ||
207 | fl4.saddr = dst; | ||
208 | fl4.flowi4_tos = tos; | ||
209 | fl4.flowi4_scope = RT_SCOPE_UNIVERSE; | ||
210 | |||
253 | no_addr = rpf = accept_local = 0; | 211 | no_addr = rpf = accept_local = 0; |
254 | rcu_read_lock(); | ||
255 | in_dev = __in_dev_get_rcu(dev); | 212 | in_dev = __in_dev_get_rcu(dev); |
256 | if (in_dev) { | 213 | if (in_dev) { |
257 | no_addr = in_dev->ifa_list == NULL; | 214 | no_addr = in_dev->ifa_list == NULL; |
258 | rpf = IN_DEV_RPFILTER(in_dev); | 215 | |
216 | /* Ignore rp_filter for packets protected by IPsec. */ | ||
217 | rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev); | ||
218 | |||
259 | accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); | 219 | accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); |
260 | if (mark && !IN_DEV_SRC_VMARK(in_dev)) | 220 | fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; |
261 | fl.mark = 0; | ||
262 | } | 221 | } |
263 | rcu_read_unlock(); | ||
264 | 222 | ||
265 | if (in_dev == NULL) | 223 | if (in_dev == NULL) |
266 | goto e_inval; | 224 | goto e_inval; |
267 | 225 | ||
268 | net = dev_net(dev); | 226 | net = dev_net(dev); |
269 | if (fib_lookup(net, &fl, &res)) | 227 | if (fib_lookup(net, &fl4, &res)) |
270 | goto last_resort; | 228 | goto last_resort; |
271 | if (res.type != RTN_UNICAST) { | 229 | if (res.type != RTN_UNICAST) { |
272 | if (res.type != RTN_LOCAL || !accept_local) | 230 | if (res.type != RTN_LOCAL || !accept_local) |
273 | goto e_inval_res; | 231 | goto e_inval; |
274 | } | 232 | } |
275 | *spec_dst = FIB_RES_PREFSRC(res); | 233 | *spec_dst = FIB_RES_PREFSRC(net, res); |
276 | fib_combine_itag(itag, &res); | 234 | fib_combine_itag(itag, &res); |
277 | dev_match = false; | 235 | dev_match = false; |
278 | 236 | ||
@@ -291,23 +249,20 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, | |||
291 | #endif | 249 | #endif |
292 | if (dev_match) { | 250 | if (dev_match) { |
293 | ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; | 251 | ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; |
294 | fib_res_put(&res); | ||
295 | return ret; | 252 | return ret; |
296 | } | 253 | } |
297 | fib_res_put(&res); | ||
298 | if (no_addr) | 254 | if (no_addr) |
299 | goto last_resort; | 255 | goto last_resort; |
300 | if (rpf == 1) | 256 | if (rpf == 1) |
301 | goto e_rpf; | 257 | goto e_rpf; |
302 | fl.oif = dev->ifindex; | 258 | fl4.flowi4_oif = dev->ifindex; |
303 | 259 | ||
304 | ret = 0; | 260 | ret = 0; |
305 | if (fib_lookup(net, &fl, &res) == 0) { | 261 | if (fib_lookup(net, &fl4, &res) == 0) { |
306 | if (res.type == RTN_UNICAST) { | 262 | if (res.type == RTN_UNICAST) { |
307 | *spec_dst = FIB_RES_PREFSRC(res); | 263 | *spec_dst = FIB_RES_PREFSRC(net, res); |
308 | ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; | 264 | ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; |
309 | } | 265 | } |
310 | fib_res_put(&res); | ||
311 | } | 266 | } |
312 | return ret; | 267 | return ret; |
313 | 268 | ||
@@ -318,8 +273,6 @@ last_resort: | |||
318 | *itag = 0; | 273 | *itag = 0; |
319 | return 0; | 274 | return 0; |
320 | 275 | ||
321 | e_inval_res: | ||
322 | fib_res_put(&res); | ||
323 | e_inval: | 276 | e_inval: |
324 | return -EINVAL; | 277 | return -EINVAL; |
325 | e_rpf: | 278 | e_rpf: |
@@ -472,9 +425,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, | |||
472 | } | 425 | } |
473 | 426 | ||
474 | /* | 427 | /* |
475 | * Handle IP routing ioctl calls. These are used to manipulate the routing tables | 428 | * Handle IP routing ioctl calls. |
429 | * These are used to manipulate the routing tables | ||
476 | */ | 430 | */ |
477 | |||
478 | int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) | 431 | int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) |
479 | { | 432 | { |
480 | struct fib_config cfg; | 433 | struct fib_config cfg; |
@@ -518,7 +471,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) | |||
518 | return -EINVAL; | 471 | return -EINVAL; |
519 | } | 472 | } |
520 | 473 | ||
521 | const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = { | 474 | const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { |
522 | [RTA_DST] = { .type = NLA_U32 }, | 475 | [RTA_DST] = { .type = NLA_U32 }, |
523 | [RTA_SRC] = { .type = NLA_U32 }, | 476 | [RTA_SRC] = { .type = NLA_U32 }, |
524 | [RTA_IIF] = { .type = NLA_U32 }, | 477 | [RTA_IIF] = { .type = NLA_U32 }, |
@@ -532,7 +485,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = { | |||
532 | }; | 485 | }; |
533 | 486 | ||
534 | static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, | 487 | static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, |
535 | struct nlmsghdr *nlh, struct fib_config *cfg) | 488 | struct nlmsghdr *nlh, struct fib_config *cfg) |
536 | { | 489 | { |
537 | struct nlattr *attr; | 490 | struct nlattr *attr; |
538 | int err, remaining; | 491 | int err, remaining; |
@@ -687,12 +640,11 @@ out: | |||
687 | } | 640 | } |
688 | 641 | ||
689 | /* Prepare and feed intra-kernel routing request. | 642 | /* Prepare and feed intra-kernel routing request. |
690 | Really, it should be netlink message, but :-( netlink | 643 | * Really, it should be netlink message, but :-( netlink |
691 | can be not configured, so that we feed it directly | 644 | * can be not configured, so that we feed it directly |
692 | to fib engine. It is legal, because all events occur | 645 | * to fib engine. It is legal, because all events occur |
693 | only when netlink is already locked. | 646 | * only when netlink is already locked. |
694 | */ | 647 | */ |
695 | |||
696 | static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) | 648 | static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) |
697 | { | 649 | { |
698 | struct net *net = dev_net(ifa->ifa_dev->dev); | 650 | struct net *net = dev_net(ifa->ifa_dev->dev); |
@@ -738,9 +690,9 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) | |||
738 | struct in_ifaddr *prim = ifa; | 690 | struct in_ifaddr *prim = ifa; |
739 | __be32 mask = ifa->ifa_mask; | 691 | __be32 mask = ifa->ifa_mask; |
740 | __be32 addr = ifa->ifa_local; | 692 | __be32 addr = ifa->ifa_local; |
741 | __be32 prefix = ifa->ifa_address&mask; | 693 | __be32 prefix = ifa->ifa_address & mask; |
742 | 694 | ||
743 | if (ifa->ifa_flags&IFA_F_SECONDARY) { | 695 | if (ifa->ifa_flags & IFA_F_SECONDARY) { |
744 | prim = inet_ifa_byprefix(in_dev, prefix, mask); | 696 | prim = inet_ifa_byprefix(in_dev, prefix, mask); |
745 | if (prim == NULL) { | 697 | if (prim == NULL) { |
746 | printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n"); | 698 | printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n"); |
@@ -750,58 +702,118 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) | |||
750 | 702 | ||
751 | fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); | 703 | fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); |
752 | 704 | ||
753 | if (!(dev->flags&IFF_UP)) | 705 | if (!(dev->flags & IFF_UP)) |
754 | return; | 706 | return; |
755 | 707 | ||
756 | /* Add broadcast address, if it is explicitly assigned. */ | 708 | /* Add broadcast address, if it is explicitly assigned. */ |
757 | if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) | 709 | if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) |
758 | fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); | 710 | fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); |
759 | 711 | ||
760 | if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) && | 712 | if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) && |
761 | (prefix != addr || ifa->ifa_prefixlen < 32)) { | 713 | (prefix != addr || ifa->ifa_prefixlen < 32)) { |
762 | fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : | 714 | fib_magic(RTM_NEWROUTE, |
763 | RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim); | 715 | dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, |
716 | prefix, ifa->ifa_prefixlen, prim); | ||
764 | 717 | ||
765 | /* Add network specific broadcasts, when it takes a sense */ | 718 | /* Add network specific broadcasts, when it takes a sense */ |
766 | if (ifa->ifa_prefixlen < 31) { | 719 | if (ifa->ifa_prefixlen < 31) { |
767 | fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim); | 720 | fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim); |
768 | fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim); | 721 | fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, |
722 | 32, prim); | ||
769 | } | 723 | } |
770 | } | 724 | } |
771 | } | 725 | } |
772 | 726 | ||
773 | static void fib_del_ifaddr(struct in_ifaddr *ifa) | 727 | /* Delete primary or secondary address. |
728 | * Optionally, on secondary address promotion consider the addresses | ||
729 | * from subnet iprim as deleted, even if they are in device list. | ||
730 | * In this case the secondary ifa can be in device list. | ||
731 | */ | ||
732 | void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) | ||
774 | { | 733 | { |
775 | struct in_device *in_dev = ifa->ifa_dev; | 734 | struct in_device *in_dev = ifa->ifa_dev; |
776 | struct net_device *dev = in_dev->dev; | 735 | struct net_device *dev = in_dev->dev; |
777 | struct in_ifaddr *ifa1; | 736 | struct in_ifaddr *ifa1; |
778 | struct in_ifaddr *prim = ifa; | 737 | struct in_ifaddr *prim = ifa, *prim1 = NULL; |
779 | __be32 brd = ifa->ifa_address|~ifa->ifa_mask; | 738 | __be32 brd = ifa->ifa_address | ~ifa->ifa_mask; |
780 | __be32 any = ifa->ifa_address&ifa->ifa_mask; | 739 | __be32 any = ifa->ifa_address & ifa->ifa_mask; |
781 | #define LOCAL_OK 1 | 740 | #define LOCAL_OK 1 |
782 | #define BRD_OK 2 | 741 | #define BRD_OK 2 |
783 | #define BRD0_OK 4 | 742 | #define BRD0_OK 4 |
784 | #define BRD1_OK 8 | 743 | #define BRD1_OK 8 |
785 | unsigned ok = 0; | 744 | unsigned ok = 0; |
745 | int subnet = 0; /* Primary network */ | ||
746 | int gone = 1; /* Address is missing */ | ||
747 | int same_prefsrc = 0; /* Another primary with same IP */ | ||
786 | 748 | ||
787 | if (!(ifa->ifa_flags&IFA_F_SECONDARY)) | 749 | if (ifa->ifa_flags & IFA_F_SECONDARY) { |
788 | fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : | ||
789 | RTN_UNICAST, any, ifa->ifa_prefixlen, prim); | ||
790 | else { | ||
791 | prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); | 750 | prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); |
792 | if (prim == NULL) { | 751 | if (prim == NULL) { |
793 | printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n"); | 752 | printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n"); |
794 | return; | 753 | return; |
795 | } | 754 | } |
755 | if (iprim && iprim != prim) { | ||
756 | printk(KERN_WARNING "fib_del_ifaddr: bug: iprim != prim\n"); | ||
757 | return; | ||
758 | } | ||
759 | } else if (!ipv4_is_zeronet(any) && | ||
760 | (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) { | ||
761 | fib_magic(RTM_DELROUTE, | ||
762 | dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, | ||
763 | any, ifa->ifa_prefixlen, prim); | ||
764 | subnet = 1; | ||
796 | } | 765 | } |
797 | 766 | ||
798 | /* Deletion is more complicated than add. | 767 | /* Deletion is more complicated than add. |
799 | We should take care of not to delete too much :-) | 768 | * We should take care of not to delete too much :-) |
800 | 769 | * | |
801 | Scan address list to be sure that addresses are really gone. | 770 | * Scan address list to be sure that addresses are really gone. |
802 | */ | 771 | */ |
803 | 772 | ||
804 | for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { | 773 | for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { |
774 | if (ifa1 == ifa) { | ||
775 | /* promotion, keep the IP */ | ||
776 | gone = 0; | ||
777 | continue; | ||
778 | } | ||
779 | /* Ignore IFAs from our subnet */ | ||
780 | if (iprim && ifa1->ifa_mask == iprim->ifa_mask && | ||
781 | inet_ifa_match(ifa1->ifa_address, iprim)) | ||
782 | continue; | ||
783 | |||
784 | /* Ignore ifa1 if it uses different primary IP (prefsrc) */ | ||
785 | if (ifa1->ifa_flags & IFA_F_SECONDARY) { | ||
786 | /* Another address from our subnet? */ | ||
787 | if (ifa1->ifa_mask == prim->ifa_mask && | ||
788 | inet_ifa_match(ifa1->ifa_address, prim)) | ||
789 | prim1 = prim; | ||
790 | else { | ||
791 | /* We reached the secondaries, so | ||
792 | * same_prefsrc should be determined. | ||
793 | */ | ||
794 | if (!same_prefsrc) | ||
795 | continue; | ||
796 | /* Search new prim1 if ifa1 is not | ||
797 | * using the current prim1 | ||
798 | */ | ||
799 | if (!prim1 || | ||
800 | ifa1->ifa_mask != prim1->ifa_mask || | ||
801 | !inet_ifa_match(ifa1->ifa_address, prim1)) | ||
802 | prim1 = inet_ifa_byprefix(in_dev, | ||
803 | ifa1->ifa_address, | ||
804 | ifa1->ifa_mask); | ||
805 | if (!prim1) | ||
806 | continue; | ||
807 | if (prim1->ifa_local != prim->ifa_local) | ||
808 | continue; | ||
809 | } | ||
810 | } else { | ||
811 | if (prim->ifa_local != ifa1->ifa_local) | ||
812 | continue; | ||
813 | prim1 = ifa1; | ||
814 | if (prim != prim1) | ||
815 | same_prefsrc = 1; | ||
816 | } | ||
805 | if (ifa->ifa_local == ifa1->ifa_local) | 817 | if (ifa->ifa_local == ifa1->ifa_local) |
806 | ok |= LOCAL_OK; | 818 | ok |= LOCAL_OK; |
807 | if (ifa->ifa_broadcast == ifa1->ifa_broadcast) | 819 | if (ifa->ifa_broadcast == ifa1->ifa_broadcast) |
@@ -810,25 +822,43 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) | |||
810 | ok |= BRD1_OK; | 822 | ok |= BRD1_OK; |
811 | if (any == ifa1->ifa_broadcast) | 823 | if (any == ifa1->ifa_broadcast) |
812 | ok |= BRD0_OK; | 824 | ok |= BRD0_OK; |
825 | /* primary has network specific broadcasts */ | ||
826 | if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) { | ||
827 | __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask; | ||
828 | __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask; | ||
829 | |||
830 | if (!ipv4_is_zeronet(any1)) { | ||
831 | if (ifa->ifa_broadcast == brd1 || | ||
832 | ifa->ifa_broadcast == any1) | ||
833 | ok |= BRD_OK; | ||
834 | if (brd == brd1 || brd == any1) | ||
835 | ok |= BRD1_OK; | ||
836 | if (any == brd1 || any == any1) | ||
837 | ok |= BRD0_OK; | ||
838 | } | ||
839 | } | ||
813 | } | 840 | } |
814 | 841 | ||
815 | if (!(ok&BRD_OK)) | 842 | if (!(ok & BRD_OK)) |
816 | fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); | 843 | fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); |
817 | if (!(ok&BRD1_OK)) | 844 | if (subnet && ifa->ifa_prefixlen < 31) { |
818 | fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); | 845 | if (!(ok & BRD1_OK)) |
819 | if (!(ok&BRD0_OK)) | 846 | fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); |
820 | fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); | 847 | if (!(ok & BRD0_OK)) |
821 | if (!(ok&LOCAL_OK)) { | 848 | fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); |
849 | } | ||
850 | if (!(ok & LOCAL_OK)) { | ||
822 | fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); | 851 | fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); |
823 | 852 | ||
824 | /* Check, that this local address finally disappeared. */ | 853 | /* Check, that this local address finally disappeared. */ |
825 | if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { | 854 | if (gone && |
855 | inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { | ||
826 | /* And the last, but not the least thing. | 856 | /* And the last, but not the least thing. |
827 | We must flush stray FIB entries. | 857 | * We must flush stray FIB entries. |
828 | 858 | * | |
829 | First of all, we scan fib_info list searching | 859 | * First of all, we scan fib_info list searching |
830 | for stray nexthop entries, then ignite fib_flush. | 860 | * for stray nexthop entries, then ignite fib_flush. |
831 | */ | 861 | */ |
832 | if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local)) | 862 | if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local)) |
833 | fib_flush(dev_net(dev)); | 863 | fib_flush(dev_net(dev)); |
834 | } | 864 | } |
@@ -839,14 +869,16 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) | |||
839 | #undef BRD1_OK | 869 | #undef BRD1_OK |
840 | } | 870 | } |
841 | 871 | ||
842 | static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb ) | 872 | static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) |
843 | { | 873 | { |
844 | 874 | ||
845 | struct fib_result res; | 875 | struct fib_result res; |
846 | struct flowi fl = { .mark = frn->fl_mark, | 876 | struct flowi4 fl4 = { |
847 | .nl_u = { .ip4_u = { .daddr = frn->fl_addr, | 877 | .flowi4_mark = frn->fl_mark, |
848 | .tos = frn->fl_tos, | 878 | .daddr = frn->fl_addr, |
849 | .scope = frn->fl_scope } } }; | 879 | .flowi4_tos = frn->fl_tos, |
880 | .flowi4_scope = frn->fl_scope, | ||
881 | }; | ||
850 | 882 | ||
851 | #ifdef CONFIG_IP_MULTIPLE_TABLES | 883 | #ifdef CONFIG_IP_MULTIPLE_TABLES |
852 | res.r = NULL; | 884 | res.r = NULL; |
@@ -857,15 +889,16 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb ) | |||
857 | local_bh_disable(); | 889 | local_bh_disable(); |
858 | 890 | ||
859 | frn->tb_id = tb->tb_id; | 891 | frn->tb_id = tb->tb_id; |
860 | frn->err = fib_table_lookup(tb, &fl, &res); | 892 | rcu_read_lock(); |
893 | frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); | ||
861 | 894 | ||
862 | if (!frn->err) { | 895 | if (!frn->err) { |
863 | frn->prefixlen = res.prefixlen; | 896 | frn->prefixlen = res.prefixlen; |
864 | frn->nh_sel = res.nh_sel; | 897 | frn->nh_sel = res.nh_sel; |
865 | frn->type = res.type; | 898 | frn->type = res.type; |
866 | frn->scope = res.scope; | 899 | frn->scope = res.scope; |
867 | fib_res_put(&res); | ||
868 | } | 900 | } |
901 | rcu_read_unlock(); | ||
869 | local_bh_enable(); | 902 | local_bh_enable(); |
870 | } | 903 | } |
871 | } | 904 | } |
@@ -894,8 +927,8 @@ static void nl_fib_input(struct sk_buff *skb) | |||
894 | 927 | ||
895 | nl_fib_lookup(frn, tb); | 928 | nl_fib_lookup(frn, tb); |
896 | 929 | ||
897 | pid = NETLINK_CB(skb).pid; /* pid of sending process */ | 930 | pid = NETLINK_CB(skb).pid; /* pid of sending process */ |
898 | NETLINK_CB(skb).pid = 0; /* from kernel */ | 931 | NETLINK_CB(skb).pid = 0; /* from kernel */ |
899 | NETLINK_CB(skb).dst_group = 0; /* unicast */ | 932 | NETLINK_CB(skb).dst_group = 0; /* unicast */ |
900 | netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT); | 933 | netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT); |
901 | } | 934 | } |
@@ -929,6 +962,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, | |||
929 | { | 962 | { |
930 | struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; | 963 | struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; |
931 | struct net_device *dev = ifa->ifa_dev->dev; | 964 | struct net_device *dev = ifa->ifa_dev->dev; |
965 | struct net *net = dev_net(dev); | ||
932 | 966 | ||
933 | switch (event) { | 967 | switch (event) { |
934 | case NETDEV_UP: | 968 | case NETDEV_UP: |
@@ -936,13 +970,15 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, | |||
936 | #ifdef CONFIG_IP_ROUTE_MULTIPATH | 970 | #ifdef CONFIG_IP_ROUTE_MULTIPATH |
937 | fib_sync_up(dev); | 971 | fib_sync_up(dev); |
938 | #endif | 972 | #endif |
973 | atomic_inc(&net->ipv4.dev_addr_genid); | ||
939 | rt_cache_flush(dev_net(dev), -1); | 974 | rt_cache_flush(dev_net(dev), -1); |
940 | break; | 975 | break; |
941 | case NETDEV_DOWN: | 976 | case NETDEV_DOWN: |
942 | fib_del_ifaddr(ifa); | 977 | fib_del_ifaddr(ifa, NULL); |
978 | atomic_inc(&net->ipv4.dev_addr_genid); | ||
943 | if (ifa->ifa_dev->ifa_list == NULL) { | 979 | if (ifa->ifa_dev->ifa_list == NULL) { |
944 | /* Last address was deleted from this interface. | 980 | /* Last address was deleted from this interface. |
945 | Disable IP. | 981 | * Disable IP. |
946 | */ | 982 | */ |
947 | fib_disable_ip(dev, 1, 0); | 983 | fib_disable_ip(dev, 1, 0); |
948 | } else { | 984 | } else { |
@@ -957,6 +993,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo | |||
957 | { | 993 | { |
958 | struct net_device *dev = ptr; | 994 | struct net_device *dev = ptr; |
959 | struct in_device *in_dev = __in_dev_get_rtnl(dev); | 995 | struct in_device *in_dev = __in_dev_get_rtnl(dev); |
996 | struct net *net = dev_net(dev); | ||
960 | 997 | ||
961 | if (event == NETDEV_UNREGISTER) { | 998 | if (event == NETDEV_UNREGISTER) { |
962 | fib_disable_ip(dev, 2, -1); | 999 | fib_disable_ip(dev, 2, -1); |
@@ -974,6 +1011,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo | |||
974 | #ifdef CONFIG_IP_ROUTE_MULTIPATH | 1011 | #ifdef CONFIG_IP_ROUTE_MULTIPATH |
975 | fib_sync_up(dev); | 1012 | fib_sync_up(dev); |
976 | #endif | 1013 | #endif |
1014 | atomic_inc(&net->ipv4.dev_addr_genid); | ||
977 | rt_cache_flush(dev_net(dev), -1); | 1015 | rt_cache_flush(dev_net(dev), -1); |
978 | break; | 1016 | break; |
979 | case NETDEV_DOWN: | 1017 | case NETDEV_DOWN: |
@@ -984,7 +1022,11 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo | |||
984 | rt_cache_flush(dev_net(dev), 0); | 1022 | rt_cache_flush(dev_net(dev), 0); |
985 | break; | 1023 | break; |
986 | case NETDEV_UNREGISTER_BATCH: | 1024 | case NETDEV_UNREGISTER_BATCH: |
987 | rt_cache_flush_batch(); | 1025 | /* The batch unregister is only called on the first |
1026 | * device in the list of devices being unregistered. | ||
1027 | * Therefore we should not pass dev_net(dev) in here. | ||
1028 | */ | ||
1029 | rt_cache_flush_batch(NULL); | ||
988 | break; | 1030 | break; |
989 | } | 1031 | } |
990 | return NOTIFY_DONE; | 1032 | return NOTIFY_DONE; |
@@ -1001,16 +1043,15 @@ static struct notifier_block fib_netdev_notifier = { | |||
1001 | static int __net_init ip_fib_net_init(struct net *net) | 1043 | static int __net_init ip_fib_net_init(struct net *net) |
1002 | { | 1044 | { |
1003 | int err; | 1045 | int err; |
1004 | unsigned int i; | 1046 | size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ; |
1047 | |||
1048 | /* Avoid false sharing : Use at least a full cache line */ | ||
1049 | size = max_t(size_t, size, L1_CACHE_BYTES); | ||
1005 | 1050 | ||
1006 | net->ipv4.fib_table_hash = kzalloc( | 1051 | net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL); |
1007 | sizeof(struct hlist_head)*FIB_TABLE_HASHSZ, GFP_KERNEL); | ||
1008 | if (net->ipv4.fib_table_hash == NULL) | 1052 | if (net->ipv4.fib_table_hash == NULL) |
1009 | return -ENOMEM; | 1053 | return -ENOMEM; |
1010 | 1054 | ||
1011 | for (i = 0; i < FIB_TABLE_HASHSZ; i++) | ||
1012 | INIT_HLIST_HEAD(&net->ipv4.fib_table_hash[i]); | ||
1013 | |||
1014 | err = fib4_rules_init(net); | 1055 | err = fib4_rules_init(net); |
1015 | if (err < 0) | 1056 | if (err < 0) |
1016 | goto fail; | 1057 | goto fail; |
@@ -1029,6 +1070,7 @@ static void ip_fib_net_exit(struct net *net) | |||
1029 | fib4_rules_exit(net); | 1070 | fib4_rules_exit(net); |
1030 | #endif | 1071 | #endif |
1031 | 1072 | ||
1073 | rtnl_lock(); | ||
1032 | for (i = 0; i < FIB_TABLE_HASHSZ; i++) { | 1074 | for (i = 0; i < FIB_TABLE_HASHSZ; i++) { |
1033 | struct fib_table *tb; | 1075 | struct fib_table *tb; |
1034 | struct hlist_head *head; | 1076 | struct hlist_head *head; |
@@ -1038,9 +1080,10 @@ static void ip_fib_net_exit(struct net *net) | |||
1038 | hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) { | 1080 | hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) { |
1039 | hlist_del(node); | 1081 | hlist_del(node); |
1040 | fib_table_flush(tb); | 1082 | fib_table_flush(tb); |
1041 | kfree(tb); | 1083 | fib_free_table(tb); |
1042 | } | 1084 | } |
1043 | } | 1085 | } |
1086 | rtnl_unlock(); | ||
1044 | kfree(net->ipv4.fib_table_hash); | 1087 | kfree(net->ipv4.fib_table_hash); |
1045 | } | 1088 | } |
1046 | 1089 | ||
@@ -1089,5 +1132,5 @@ void __init ip_fib_init(void) | |||
1089 | register_netdevice_notifier(&fib_netdev_notifier); | 1132 | register_netdevice_notifier(&fib_netdev_notifier); |
1090 | register_inetaddr_notifier(&fib_inetaddr_notifier); | 1133 | register_inetaddr_notifier(&fib_inetaddr_notifier); |
1091 | 1134 | ||
1092 | fib_hash_init(); | 1135 | fib_trie_init(); |
1093 | } | 1136 | } |
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c deleted file mode 100644 index 4ed7e0dea1bc..000000000000 --- a/net/ipv4/fib_hash.c +++ /dev/null | |||
@@ -1,1070 +0,0 @@ | |||
1 | /* | ||
2 | * INET An implementation of the TCP/IP protocol suite for the LINUX | ||
3 | * operating system. INET is implemented using the BSD Socket | ||
4 | * interface as the means of communication with the user level. | ||
5 | * | ||
6 | * IPv4 FIB: lookup engine and maintenance routines. | ||
7 | * | ||
8 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public License | ||
12 | * as published by the Free Software Foundation; either version | ||
13 | * 2 of the License, or (at your option) any later version. | ||
14 | */ | ||
15 | |||
16 | #include <asm/uaccess.h> | ||
17 | #include <asm/system.h> | ||
18 | #include <linux/bitops.h> | ||
19 | #include <linux/types.h> | ||
20 | #include <linux/kernel.h> | ||
21 | #include <linux/mm.h> | ||
22 | #include <linux/string.h> | ||
23 | #include <linux/socket.h> | ||
24 | #include <linux/sockios.h> | ||
25 | #include <linux/errno.h> | ||
26 | #include <linux/in.h> | ||
27 | #include <linux/inet.h> | ||
28 | #include <linux/inetdevice.h> | ||
29 | #include <linux/netdevice.h> | ||
30 | #include <linux/if_arp.h> | ||
31 | #include <linux/proc_fs.h> | ||
32 | #include <linux/skbuff.h> | ||
33 | #include <linux/netlink.h> | ||
34 | #include <linux/init.h> | ||
35 | #include <linux/slab.h> | ||
36 | |||
37 | #include <net/net_namespace.h> | ||
38 | #include <net/ip.h> | ||
39 | #include <net/protocol.h> | ||
40 | #include <net/route.h> | ||
41 | #include <net/tcp.h> | ||
42 | #include <net/sock.h> | ||
43 | #include <net/ip_fib.h> | ||
44 | |||
45 | #include "fib_lookup.h" | ||
46 | |||
47 | static struct kmem_cache *fn_hash_kmem __read_mostly; | ||
48 | static struct kmem_cache *fn_alias_kmem __read_mostly; | ||
49 | |||
50 | struct fib_node { | ||
51 | struct hlist_node fn_hash; | ||
52 | struct list_head fn_alias; | ||
53 | __be32 fn_key; | ||
54 | struct fib_alias fn_embedded_alias; | ||
55 | }; | ||
56 | |||
57 | struct fn_zone { | ||
58 | struct fn_zone *fz_next; /* Next not empty zone */ | ||
59 | struct hlist_head *fz_hash; /* Hash table pointer */ | ||
60 | int fz_nent; /* Number of entries */ | ||
61 | |||
62 | int fz_divisor; /* Hash divisor */ | ||
63 | u32 fz_hashmask; /* (fz_divisor - 1) */ | ||
64 | #define FZ_HASHMASK(fz) ((fz)->fz_hashmask) | ||
65 | |||
66 | int fz_order; /* Zone order */ | ||
67 | __be32 fz_mask; | ||
68 | #define FZ_MASK(fz) ((fz)->fz_mask) | ||
69 | }; | ||
70 | |||
71 | /* NOTE. On fast computers evaluation of fz_hashmask and fz_mask | ||
72 | * can be cheaper than memory lookup, so that FZ_* macros are used. | ||
73 | */ | ||
74 | |||
75 | struct fn_hash { | ||
76 | struct fn_zone *fn_zones[33]; | ||
77 | struct fn_zone *fn_zone_list; | ||
78 | }; | ||
79 | |||
80 | static inline u32 fn_hash(__be32 key, struct fn_zone *fz) | ||
81 | { | ||
82 | u32 h = ntohl(key)>>(32 - fz->fz_order); | ||
83 | h ^= (h>>20); | ||
84 | h ^= (h>>10); | ||
85 | h ^= (h>>5); | ||
86 | h &= FZ_HASHMASK(fz); | ||
87 | return h; | ||
88 | } | ||
89 | |||
90 | static inline __be32 fz_key(__be32 dst, struct fn_zone *fz) | ||
91 | { | ||
92 | return dst & FZ_MASK(fz); | ||
93 | } | ||
94 | |||
95 | static DEFINE_RWLOCK(fib_hash_lock); | ||
96 | static unsigned int fib_hash_genid; | ||
97 | |||
98 | #define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head)) | ||
99 | |||
100 | static struct hlist_head *fz_hash_alloc(int divisor) | ||
101 | { | ||
102 | unsigned long size = divisor * sizeof(struct hlist_head); | ||
103 | |||
104 | if (size <= PAGE_SIZE) { | ||
105 | return kzalloc(size, GFP_KERNEL); | ||
106 | } else { | ||
107 | return (struct hlist_head *) | ||
108 | __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size)); | ||
109 | } | ||
110 | } | ||
111 | |||
112 | /* The fib hash lock must be held when this is called. */ | ||
113 | static inline void fn_rebuild_zone(struct fn_zone *fz, | ||
114 | struct hlist_head *old_ht, | ||
115 | int old_divisor) | ||
116 | { | ||
117 | int i; | ||
118 | |||
119 | for (i = 0; i < old_divisor; i++) { | ||
120 | struct hlist_node *node, *n; | ||
121 | struct fib_node *f; | ||
122 | |||
123 | hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) { | ||
124 | struct hlist_head *new_head; | ||
125 | |||
126 | hlist_del(&f->fn_hash); | ||
127 | |||
128 | new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; | ||
129 | hlist_add_head(&f->fn_hash, new_head); | ||
130 | } | ||
131 | } | ||
132 | } | ||
133 | |||
134 | static void fz_hash_free(struct hlist_head *hash, int divisor) | ||
135 | { | ||
136 | unsigned long size = divisor * sizeof(struct hlist_head); | ||
137 | |||
138 | if (size <= PAGE_SIZE) | ||
139 | kfree(hash); | ||
140 | else | ||
141 | free_pages((unsigned long)hash, get_order(size)); | ||
142 | } | ||
143 | |||
144 | static void fn_rehash_zone(struct fn_zone *fz) | ||
145 | { | ||
146 | struct hlist_head *ht, *old_ht; | ||
147 | int old_divisor, new_divisor; | ||
148 | u32 new_hashmask; | ||
149 | |||
150 | old_divisor = fz->fz_divisor; | ||
151 | |||
152 | switch (old_divisor) { | ||
153 | case 16: | ||
154 | new_divisor = 256; | ||
155 | break; | ||
156 | case 256: | ||
157 | new_divisor = 1024; | ||
158 | break; | ||
159 | default: | ||
160 | if ((old_divisor << 1) > FZ_MAX_DIVISOR) { | ||
161 | printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor); | ||
162 | return; | ||
163 | } | ||
164 | new_divisor = (old_divisor << 1); | ||
165 | break; | ||
166 | } | ||
167 | |||
168 | new_hashmask = (new_divisor - 1); | ||
169 | |||
170 | #if RT_CACHE_DEBUG >= 2 | ||
171 | printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n", | ||
172 | fz->fz_order, old_divisor); | ||
173 | #endif | ||
174 | |||
175 | ht = fz_hash_alloc(new_divisor); | ||
176 | |||
177 | if (ht) { | ||
178 | write_lock_bh(&fib_hash_lock); | ||
179 | old_ht = fz->fz_hash; | ||
180 | fz->fz_hash = ht; | ||
181 | fz->fz_hashmask = new_hashmask; | ||
182 | fz->fz_divisor = new_divisor; | ||
183 | fn_rebuild_zone(fz, old_ht, old_divisor); | ||
184 | fib_hash_genid++; | ||
185 | write_unlock_bh(&fib_hash_lock); | ||
186 | |||
187 | fz_hash_free(old_ht, old_divisor); | ||
188 | } | ||
189 | } | ||
190 | |||
191 | static inline void fn_free_node(struct fib_node * f) | ||
192 | { | ||
193 | kmem_cache_free(fn_hash_kmem, f); | ||
194 | } | ||
195 | |||
196 | static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f) | ||
197 | { | ||
198 | fib_release_info(fa->fa_info); | ||
199 | if (fa == &f->fn_embedded_alias) | ||
200 | fa->fa_info = NULL; | ||
201 | else | ||
202 | kmem_cache_free(fn_alias_kmem, fa); | ||
203 | } | ||
204 | |||
205 | static struct fn_zone * | ||
206 | fn_new_zone(struct fn_hash *table, int z) | ||
207 | { | ||
208 | int i; | ||
209 | struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL); | ||
210 | if (!fz) | ||
211 | return NULL; | ||
212 | |||
213 | if (z) { | ||
214 | fz->fz_divisor = 16; | ||
215 | } else { | ||
216 | fz->fz_divisor = 1; | ||
217 | } | ||
218 | fz->fz_hashmask = (fz->fz_divisor - 1); | ||
219 | fz->fz_hash = fz_hash_alloc(fz->fz_divisor); | ||
220 | if (!fz->fz_hash) { | ||
221 | kfree(fz); | ||
222 | return NULL; | ||
223 | } | ||
224 | fz->fz_order = z; | ||
225 | fz->fz_mask = inet_make_mask(z); | ||
226 | |||
227 | /* Find the first not empty zone with more specific mask */ | ||
228 | for (i=z+1; i<=32; i++) | ||
229 | if (table->fn_zones[i]) | ||
230 | break; | ||
231 | write_lock_bh(&fib_hash_lock); | ||
232 | if (i>32) { | ||
233 | /* No more specific masks, we are the first. */ | ||
234 | fz->fz_next = table->fn_zone_list; | ||
235 | table->fn_zone_list = fz; | ||
236 | } else { | ||
237 | fz->fz_next = table->fn_zones[i]->fz_next; | ||
238 | table->fn_zones[i]->fz_next = fz; | ||
239 | } | ||
240 | table->fn_zones[z] = fz; | ||
241 | fib_hash_genid++; | ||
242 | write_unlock_bh(&fib_hash_lock); | ||
243 | return fz; | ||
244 | } | ||
245 | |||
246 | int fib_table_lookup(struct fib_table *tb, | ||
247 | const struct flowi *flp, struct fib_result *res) | ||
248 | { | ||
249 | int err; | ||
250 | struct fn_zone *fz; | ||
251 | struct fn_hash *t = (struct fn_hash *)tb->tb_data; | ||
252 | |||
253 | read_lock(&fib_hash_lock); | ||
254 | for (fz = t->fn_zone_list; fz; fz = fz->fz_next) { | ||
255 | struct hlist_head *head; | ||
256 | struct hlist_node *node; | ||
257 | struct fib_node *f; | ||
258 | __be32 k = fz_key(flp->fl4_dst, fz); | ||
259 | |||
260 | head = &fz->fz_hash[fn_hash(k, fz)]; | ||
261 | hlist_for_each_entry(f, node, head, fn_hash) { | ||
262 | if (f->fn_key != k) | ||
263 | continue; | ||
264 | |||
265 | err = fib_semantic_match(&f->fn_alias, | ||
266 | flp, res, | ||
267 | fz->fz_order); | ||
268 | if (err <= 0) | ||
269 | goto out; | ||
270 | } | ||
271 | } | ||
272 | err = 1; | ||
273 | out: | ||
274 | read_unlock(&fib_hash_lock); | ||
275 | return err; | ||
276 | } | ||
277 | |||
278 | void fib_table_select_default(struct fib_table *tb, | ||
279 | const struct flowi *flp, struct fib_result *res) | ||
280 | { | ||
281 | int order, last_idx; | ||
282 | struct hlist_node *node; | ||
283 | struct fib_node *f; | ||
284 | struct fib_info *fi = NULL; | ||
285 | struct fib_info *last_resort; | ||
286 | struct fn_hash *t = (struct fn_hash *)tb->tb_data; | ||
287 | struct fn_zone *fz = t->fn_zones[0]; | ||
288 | |||
289 | if (fz == NULL) | ||
290 | return; | ||
291 | |||
292 | last_idx = -1; | ||
293 | last_resort = NULL; | ||
294 | order = -1; | ||
295 | |||
296 | read_lock(&fib_hash_lock); | ||
297 | hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) { | ||
298 | struct fib_alias *fa; | ||
299 | |||
300 | list_for_each_entry(fa, &f->fn_alias, fa_list) { | ||
301 | struct fib_info *next_fi = fa->fa_info; | ||
302 | |||
303 | if (fa->fa_scope != res->scope || | ||
304 | fa->fa_type != RTN_UNICAST) | ||
305 | continue; | ||
306 | |||
307 | if (next_fi->fib_priority > res->fi->fib_priority) | ||
308 | break; | ||
309 | if (!next_fi->fib_nh[0].nh_gw || | ||
310 | next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) | ||
311 | continue; | ||
312 | fa->fa_state |= FA_S_ACCESSED; | ||
313 | |||
314 | if (fi == NULL) { | ||
315 | if (next_fi != res->fi) | ||
316 | break; | ||
317 | } else if (!fib_detect_death(fi, order, &last_resort, | ||
318 | &last_idx, tb->tb_default)) { | ||
319 | fib_result_assign(res, fi); | ||
320 | tb->tb_default = order; | ||
321 | goto out; | ||
322 | } | ||
323 | fi = next_fi; | ||
324 | order++; | ||
325 | } | ||
326 | } | ||
327 | |||
328 | if (order <= 0 || fi == NULL) { | ||
329 | tb->tb_default = -1; | ||
330 | goto out; | ||
331 | } | ||
332 | |||
333 | if (!fib_detect_death(fi, order, &last_resort, &last_idx, | ||
334 | tb->tb_default)) { | ||
335 | fib_result_assign(res, fi); | ||
336 | tb->tb_default = order; | ||
337 | goto out; | ||
338 | } | ||
339 | |||
340 | if (last_idx >= 0) | ||
341 | fib_result_assign(res, last_resort); | ||
342 | tb->tb_default = last_idx; | ||
343 | out: | ||
344 | read_unlock(&fib_hash_lock); | ||
345 | } | ||
346 | |||
347 | /* Insert node F to FZ. */ | ||
348 | static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f) | ||
349 | { | ||
350 | struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; | ||
351 | |||
352 | hlist_add_head(&f->fn_hash, head); | ||
353 | } | ||
354 | |||
355 | /* Return the node in FZ matching KEY. */ | ||
356 | static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) | ||
357 | { | ||
358 | struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)]; | ||
359 | struct hlist_node *node; | ||
360 | struct fib_node *f; | ||
361 | |||
362 | hlist_for_each_entry(f, node, head, fn_hash) { | ||
363 | if (f->fn_key == key) | ||
364 | return f; | ||
365 | } | ||
366 | |||
367 | return NULL; | ||
368 | } | ||
369 | |||
370 | int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) | ||
371 | { | ||
372 | struct fn_hash *table = (struct fn_hash *) tb->tb_data; | ||
373 | struct fib_node *new_f = NULL; | ||
374 | struct fib_node *f; | ||
375 | struct fib_alias *fa, *new_fa; | ||
376 | struct fn_zone *fz; | ||
377 | struct fib_info *fi; | ||
378 | u8 tos = cfg->fc_tos; | ||
379 | __be32 key; | ||
380 | int err; | ||
381 | |||
382 | if (cfg->fc_dst_len > 32) | ||
383 | return -EINVAL; | ||
384 | |||
385 | fz = table->fn_zones[cfg->fc_dst_len]; | ||
386 | if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len))) | ||
387 | return -ENOBUFS; | ||
388 | |||
389 | key = 0; | ||
390 | if (cfg->fc_dst) { | ||
391 | if (cfg->fc_dst & ~FZ_MASK(fz)) | ||
392 | return -EINVAL; | ||
393 | key = fz_key(cfg->fc_dst, fz); | ||
394 | } | ||
395 | |||
396 | fi = fib_create_info(cfg); | ||
397 | if (IS_ERR(fi)) | ||
398 | return PTR_ERR(fi); | ||
399 | |||
400 | if (fz->fz_nent > (fz->fz_divisor<<1) && | ||
401 | fz->fz_divisor < FZ_MAX_DIVISOR && | ||
402 | (cfg->fc_dst_len == 32 || | ||
403 | (1 << cfg->fc_dst_len) > fz->fz_divisor)) | ||
404 | fn_rehash_zone(fz); | ||
405 | |||
406 | f = fib_find_node(fz, key); | ||
407 | |||
408 | if (!f) | ||
409 | fa = NULL; | ||
410 | else | ||
411 | fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority); | ||
412 | |||
413 | /* Now fa, if non-NULL, points to the first fib alias | ||
414 | * with the same keys [prefix,tos,priority], if such key already | ||
415 | * exists or to the node before which we will insert new one. | ||
416 | * | ||
417 | * If fa is NULL, we will need to allocate a new one and | ||
418 | * insert to the head of f. | ||
419 | * | ||
420 | * If f is NULL, no fib node matched the destination key | ||
421 | * and we need to allocate a new one of those as well. | ||
422 | */ | ||
423 | |||
424 | if (fa && fa->fa_tos == tos && | ||
425 | fa->fa_info->fib_priority == fi->fib_priority) { | ||
426 | struct fib_alias *fa_first, *fa_match; | ||
427 | |||
428 | err = -EEXIST; | ||
429 | if (cfg->fc_nlflags & NLM_F_EXCL) | ||
430 | goto out; | ||
431 | |||
432 | /* We have 2 goals: | ||
433 | * 1. Find exact match for type, scope, fib_info to avoid | ||
434 | * duplicate routes | ||
435 | * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it | ||
436 | */ | ||
437 | fa_match = NULL; | ||
438 | fa_first = fa; | ||
439 | fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); | ||
440 | list_for_each_entry_continue(fa, &f->fn_alias, fa_list) { | ||
441 | if (fa->fa_tos != tos) | ||
442 | break; | ||
443 | if (fa->fa_info->fib_priority != fi->fib_priority) | ||
444 | break; | ||
445 | if (fa->fa_type == cfg->fc_type && | ||
446 | fa->fa_scope == cfg->fc_scope && | ||
447 | fa->fa_info == fi) { | ||
448 | fa_match = fa; | ||
449 | break; | ||
450 | } | ||
451 | } | ||
452 | |||
453 | if (cfg->fc_nlflags & NLM_F_REPLACE) { | ||
454 | struct fib_info *fi_drop; | ||
455 | u8 state; | ||
456 | |||
457 | fa = fa_first; | ||
458 | if (fa_match) { | ||
459 | if (fa == fa_match) | ||
460 | err = 0; | ||
461 | goto out; | ||
462 | } | ||
463 | write_lock_bh(&fib_hash_lock); | ||
464 | fi_drop = fa->fa_info; | ||
465 | fa->fa_info = fi; | ||
466 | fa->fa_type = cfg->fc_type; | ||
467 | fa->fa_scope = cfg->fc_scope; | ||
468 | state = fa->fa_state; | ||
469 | fa->fa_state &= ~FA_S_ACCESSED; | ||
470 | fib_hash_genid++; | ||
471 | write_unlock_bh(&fib_hash_lock); | ||
472 | |||
473 | fib_release_info(fi_drop); | ||
474 | if (state & FA_S_ACCESSED) | ||
475 | rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); | ||
476 | rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id, | ||
477 | &cfg->fc_nlinfo, NLM_F_REPLACE); | ||
478 | return 0; | ||
479 | } | ||
480 | |||
481 | /* Error if we find a perfect match which | ||
482 | * uses the same scope, type, and nexthop | ||
483 | * information. | ||
484 | */ | ||
485 | if (fa_match) | ||
486 | goto out; | ||
487 | |||
488 | if (!(cfg->fc_nlflags & NLM_F_APPEND)) | ||
489 | fa = fa_first; | ||
490 | } | ||
491 | |||
492 | err = -ENOENT; | ||
493 | if (!(cfg->fc_nlflags & NLM_F_CREATE)) | ||
494 | goto out; | ||
495 | |||
496 | err = -ENOBUFS; | ||
497 | |||
498 | if (!f) { | ||
499 | new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL); | ||
500 | if (new_f == NULL) | ||
501 | goto out; | ||
502 | |||
503 | INIT_HLIST_NODE(&new_f->fn_hash); | ||
504 | INIT_LIST_HEAD(&new_f->fn_alias); | ||
505 | new_f->fn_key = key; | ||
506 | f = new_f; | ||
507 | } | ||
508 | |||
509 | new_fa = &f->fn_embedded_alias; | ||
510 | if (new_fa->fa_info != NULL) { | ||
511 | new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); | ||
512 | if (new_fa == NULL) | ||
513 | goto out; | ||
514 | } | ||
515 | new_fa->fa_info = fi; | ||
516 | new_fa->fa_tos = tos; | ||
517 | new_fa->fa_type = cfg->fc_type; | ||
518 | new_fa->fa_scope = cfg->fc_scope; | ||
519 | new_fa->fa_state = 0; | ||
520 | |||
521 | /* | ||
522 | * Insert new entry to the list. | ||
523 | */ | ||
524 | |||
525 | write_lock_bh(&fib_hash_lock); | ||
526 | if (new_f) | ||
527 | fib_insert_node(fz, new_f); | ||
528 | list_add_tail(&new_fa->fa_list, | ||
529 | (fa ? &fa->fa_list : &f->fn_alias)); | ||
530 | fib_hash_genid++; | ||
531 | write_unlock_bh(&fib_hash_lock); | ||
532 | |||
533 | if (new_f) | ||
534 | fz->fz_nent++; | ||
535 | rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); | ||
536 | |||
537 | rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id, | ||
538 | &cfg->fc_nlinfo, 0); | ||
539 | return 0; | ||
540 | |||
541 | out: | ||
542 | if (new_f) | ||
543 | kmem_cache_free(fn_hash_kmem, new_f); | ||
544 | fib_release_info(fi); | ||
545 | return err; | ||
546 | } | ||
547 | |||
548 | int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) | ||
549 | { | ||
550 | struct fn_hash *table = (struct fn_hash *)tb->tb_data; | ||
551 | struct fib_node *f; | ||
552 | struct fib_alias *fa, *fa_to_delete; | ||
553 | struct fn_zone *fz; | ||
554 | __be32 key; | ||
555 | |||
556 | if (cfg->fc_dst_len > 32) | ||
557 | return -EINVAL; | ||
558 | |||
559 | if ((fz = table->fn_zones[cfg->fc_dst_len]) == NULL) | ||
560 | return -ESRCH; | ||
561 | |||
562 | key = 0; | ||
563 | if (cfg->fc_dst) { | ||
564 | if (cfg->fc_dst & ~FZ_MASK(fz)) | ||
565 | return -EINVAL; | ||
566 | key = fz_key(cfg->fc_dst, fz); | ||
567 | } | ||
568 | |||
569 | f = fib_find_node(fz, key); | ||
570 | |||
571 | if (!f) | ||
572 | fa = NULL; | ||
573 | else | ||
574 | fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0); | ||
575 | if (!fa) | ||
576 | return -ESRCH; | ||
577 | |||
578 | fa_to_delete = NULL; | ||
579 | fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); | ||
580 | list_for_each_entry_continue(fa, &f->fn_alias, fa_list) { | ||
581 | struct fib_info *fi = fa->fa_info; | ||
582 | |||
583 | if (fa->fa_tos != cfg->fc_tos) | ||
584 | break; | ||
585 | |||
586 | if ((!cfg->fc_type || | ||
587 | fa->fa_type == cfg->fc_type) && | ||
588 | (cfg->fc_scope == RT_SCOPE_NOWHERE || | ||
589 | fa->fa_scope == cfg->fc_scope) && | ||
590 | (!cfg->fc_protocol || | ||
591 | fi->fib_protocol == cfg->fc_protocol) && | ||
592 | fib_nh_match(cfg, fi) == 0) { | ||
593 | fa_to_delete = fa; | ||
594 | break; | ||
595 | } | ||
596 | } | ||
597 | |||
598 | if (fa_to_delete) { | ||
599 | int kill_fn; | ||
600 | |||
601 | fa = fa_to_delete; | ||
602 | rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len, | ||
603 | tb->tb_id, &cfg->fc_nlinfo, 0); | ||
604 | |||
605 | kill_fn = 0; | ||
606 | write_lock_bh(&fib_hash_lock); | ||
607 | list_del(&fa->fa_list); | ||
608 | if (list_empty(&f->fn_alias)) { | ||
609 | hlist_del(&f->fn_hash); | ||
610 | kill_fn = 1; | ||
611 | } | ||
612 | fib_hash_genid++; | ||
613 | write_unlock_bh(&fib_hash_lock); | ||
614 | |||
615 | if (fa->fa_state & FA_S_ACCESSED) | ||
616 | rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); | ||
617 | fn_free_alias(fa, f); | ||
618 | if (kill_fn) { | ||
619 | fn_free_node(f); | ||
620 | fz->fz_nent--; | ||
621 | } | ||
622 | |||
623 | return 0; | ||
624 | } | ||
625 | return -ESRCH; | ||
626 | } | ||
627 | |||
628 | static int fn_flush_list(struct fn_zone *fz, int idx) | ||
629 | { | ||
630 | struct hlist_head *head = &fz->fz_hash[idx]; | ||
631 | struct hlist_node *node, *n; | ||
632 | struct fib_node *f; | ||
633 | int found = 0; | ||
634 | |||
635 | hlist_for_each_entry_safe(f, node, n, head, fn_hash) { | ||
636 | struct fib_alias *fa, *fa_node; | ||
637 | int kill_f; | ||
638 | |||
639 | kill_f = 0; | ||
640 | list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) { | ||
641 | struct fib_info *fi = fa->fa_info; | ||
642 | |||
643 | if (fi && (fi->fib_flags&RTNH_F_DEAD)) { | ||
644 | write_lock_bh(&fib_hash_lock); | ||
645 | list_del(&fa->fa_list); | ||
646 | if (list_empty(&f->fn_alias)) { | ||
647 | hlist_del(&f->fn_hash); | ||
648 | kill_f = 1; | ||
649 | } | ||
650 | fib_hash_genid++; | ||
651 | write_unlock_bh(&fib_hash_lock); | ||
652 | |||
653 | fn_free_alias(fa, f); | ||
654 | found++; | ||
655 | } | ||
656 | } | ||
657 | if (kill_f) { | ||
658 | fn_free_node(f); | ||
659 | fz->fz_nent--; | ||
660 | } | ||
661 | } | ||
662 | return found; | ||
663 | } | ||
664 | |||
665 | int fib_table_flush(struct fib_table *tb) | ||
666 | { | ||
667 | struct fn_hash *table = (struct fn_hash *) tb->tb_data; | ||
668 | struct fn_zone *fz; | ||
669 | int found = 0; | ||
670 | |||
671 | for (fz = table->fn_zone_list; fz; fz = fz->fz_next) { | ||
672 | int i; | ||
673 | |||
674 | for (i = fz->fz_divisor - 1; i >= 0; i--) | ||
675 | found += fn_flush_list(fz, i); | ||
676 | } | ||
677 | return found; | ||
678 | } | ||
679 | |||
680 | |||
681 | static inline int | ||
682 | fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, | ||
683 | struct fib_table *tb, | ||
684 | struct fn_zone *fz, | ||
685 | struct hlist_head *head) | ||
686 | { | ||
687 | struct hlist_node *node; | ||
688 | struct fib_node *f; | ||
689 | int i, s_i; | ||
690 | |||
691 | s_i = cb->args[4]; | ||
692 | i = 0; | ||
693 | hlist_for_each_entry(f, node, head, fn_hash) { | ||
694 | struct fib_alias *fa; | ||
695 | |||
696 | list_for_each_entry(fa, &f->fn_alias, fa_list) { | ||
697 | if (i < s_i) | ||
698 | goto next; | ||
699 | |||
700 | if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, | ||
701 | cb->nlh->nlmsg_seq, | ||
702 | RTM_NEWROUTE, | ||
703 | tb->tb_id, | ||
704 | fa->fa_type, | ||
705 | fa->fa_scope, | ||
706 | f->fn_key, | ||
707 | fz->fz_order, | ||
708 | fa->fa_tos, | ||
709 | fa->fa_info, | ||
710 | NLM_F_MULTI) < 0) { | ||
711 | cb->args[4] = i; | ||
712 | return -1; | ||
713 | } | ||
714 | next: | ||
715 | i++; | ||
716 | } | ||
717 | } | ||
718 | cb->args[4] = i; | ||
719 | return skb->len; | ||
720 | } | ||
721 | |||
722 | static inline int | ||
723 | fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb, | ||
724 | struct fib_table *tb, | ||
725 | struct fn_zone *fz) | ||
726 | { | ||
727 | int h, s_h; | ||
728 | |||
729 | if (fz->fz_hash == NULL) | ||
730 | return skb->len; | ||
731 | s_h = cb->args[3]; | ||
732 | for (h = s_h; h < fz->fz_divisor; h++) { | ||
733 | if (hlist_empty(&fz->fz_hash[h])) | ||
734 | continue; | ||
735 | if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h]) < 0) { | ||
736 | cb->args[3] = h; | ||
737 | return -1; | ||
738 | } | ||
739 | memset(&cb->args[4], 0, | ||
740 | sizeof(cb->args) - 4*sizeof(cb->args[0])); | ||
741 | } | ||
742 | cb->args[3] = h; | ||
743 | return skb->len; | ||
744 | } | ||
745 | |||
746 | int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, | ||
747 | struct netlink_callback *cb) | ||
748 | { | ||
749 | int m, s_m; | ||
750 | struct fn_zone *fz; | ||
751 | struct fn_hash *table = (struct fn_hash *)tb->tb_data; | ||
752 | |||
753 | s_m = cb->args[2]; | ||
754 | read_lock(&fib_hash_lock); | ||
755 | for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) { | ||
756 | if (m < s_m) continue; | ||
757 | if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { | ||
758 | cb->args[2] = m; | ||
759 | read_unlock(&fib_hash_lock); | ||
760 | return -1; | ||
761 | } | ||
762 | memset(&cb->args[3], 0, | ||
763 | sizeof(cb->args) - 3*sizeof(cb->args[0])); | ||
764 | } | ||
765 | read_unlock(&fib_hash_lock); | ||
766 | cb->args[2] = m; | ||
767 | return skb->len; | ||
768 | } | ||
769 | |||
770 | void __init fib_hash_init(void) | ||
771 | { | ||
772 | fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node), | ||
773 | 0, SLAB_PANIC, NULL); | ||
774 | |||
775 | fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), | ||
776 | 0, SLAB_PANIC, NULL); | ||
777 | |||
778 | } | ||
779 | |||
780 | struct fib_table *fib_hash_table(u32 id) | ||
781 | { | ||
782 | struct fib_table *tb; | ||
783 | |||
784 | tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash), | ||
785 | GFP_KERNEL); | ||
786 | if (tb == NULL) | ||
787 | return NULL; | ||
788 | |||
789 | tb->tb_id = id; | ||
790 | tb->tb_default = -1; | ||
791 | |||
792 | memset(tb->tb_data, 0, sizeof(struct fn_hash)); | ||
793 | return tb; | ||
794 | } | ||
795 | |||
796 | /* ------------------------------------------------------------------------ */ | ||
797 | #ifdef CONFIG_PROC_FS | ||
798 | |||
/*
 * Cursor kept in seq->private while /proc/net/route is being read.
 * It is (re)initialised by fib_get_first() and advanced by fib_get_next().
 */
799 | struct fib_iter_state { | ||
800 | struct seq_net_private p; | ||
/* Zone currently being walked; NULL once the zone list is exhausted. */
801 | struct fn_zone *zone; | ||
/* Index of the current bucket inside zone->fz_hash. */
802 | int bucket; | ||
/* Head of the current bucket; stepped in lock-step with 'bucket'. */
803 | struct hlist_head *hash_head; | ||
/* Node and alias most recently returned (NULL when nothing was found). */
804 | struct fib_node *fn; | ||
805 | struct fib_alias *fa; | ||
/* Incremented on every fib_get_next() call; reset to 0 by fib_get_first(). */
806 | loff_t pos; | ||
/* Value of fib_hash_genid captured by fib_get_first(). */
807 | unsigned int genid; | ||
/* Set to 1 by fib_get_first(); tested by fib_get_idx() before reusing the cursor. */
808 | int valid; | ||
809 | }; | ||
810 | |||
/*
 * Reset the /proc iterator and position it on the first alias of the main
 * routing table.
 *
 * All cursor fields are reinitialised and the current fib_hash_genid is
 * recorded.  Zones are scanned in list order, skipping zones whose fz_nent
 * is zero; within a zone buckets are scanned from index 0.
 *
 * Returns the first alias found, or NULL if none exists.
 *
 * NOTE(review): main_table is dereferenced without a NULL check; the
 * visible fib_seq_start() tests fib_get_table() before this can be reached.
 */
811 | static struct fib_alias *fib_get_first(struct seq_file *seq) | ||
812 | { | ||
813 | struct fib_iter_state *iter = seq->private; | ||
814 | struct fib_table *main_table; | ||
815 | struct fn_hash *table; | ||
816 | |||
817 | main_table = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN); | ||
818 | table = (struct fn_hash *)main_table->tb_data; | ||
819 | |||
/* Start from a clean cursor and remember the table generation. */
820 | iter->bucket = 0; | ||
821 | iter->hash_head = NULL; | ||
822 | iter->fn = NULL; | ||
823 | iter->fa = NULL; | ||
824 | iter->pos = 0; | ||
825 | iter->genid = fib_hash_genid; | ||
826 | iter->valid = 1; | ||
827 | |||
/* The loop variables live in *iter so the position survives the 'goto out'. */
828 | for (iter->zone = table->fn_zone_list; iter->zone; | ||
829 | iter->zone = iter->zone->fz_next) { | ||
830 | int maxslot; | ||
831 | |||
/* Skip zones that hold no entries. */
832 | if (!iter->zone->fz_nent) | ||
833 | continue; | ||
834 | |||
835 | iter->hash_head = iter->zone->fz_hash; | ||
836 | maxslot = iter->zone->fz_divisor; | ||
837 | |||
838 | for (iter->bucket = 0; iter->bucket < maxslot; | ||
839 | ++iter->bucket, ++iter->hash_head) { | ||
840 | struct hlist_node *node; | ||
841 | struct fib_node *fn; | ||
842 | |||
843 | hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { | ||
844 | struct fib_alias *fa; | ||
845 | |||
/* The body exits on the first iteration: only the first alias is wanted. */
846 | list_for_each_entry(fa, &fn->fn_alias, fa_list) { | ||
847 | iter->fn = fn; | ||
848 | iter->fa = fa; | ||
849 | goto out; | ||
850 | } | ||
851 | } | ||
852 | } | ||
853 | } | ||
854 | out: | ||
855 | return iter->fa; | ||
856 | } | ||
857 | |||
/*
 * Advance the /proc iterator by one alias.
 *
 * The search proceeds in three stages, stopping at the first hit:
 *   1. the next alias on the current node's alias list;
 *   2. the first alias of a later node in the current bucket;
 *   3. the first alias in a later bucket of the current zone, or in any
 *      bucket of a following zone.
 *
 * iter->pos is incremented on every call, including calls that return NULL.
 *
 * Returns the next alias, or NULL when the walk is finished.
 */
858 | static struct fib_alias *fib_get_next(struct seq_file *seq) | ||
859 | { | ||
860 | struct fib_iter_state *iter = seq->private; | ||
861 | struct fib_node *fn; | ||
862 | struct fib_alias *fa; | ||
863 | |||
864 | /* Advance FA, if any. */ | ||
865 | fn = iter->fn; | ||
866 | fa = iter->fa; | ||
867 | if (fa) { | ||
868 | BUG_ON(!fn); | ||
/* Body runs at most once: take the alias following the current one. */
869 | list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) { | ||
870 | iter->fa = fa; | ||
871 | goto out; | ||
872 | } | ||
873 | } | ||
874 | |||
875 | fa = iter->fa = NULL; | ||
876 | |||
877 | /* Advance FN. */ | ||
878 | if (fn) { | ||
879 | struct hlist_node *node = &fn->fn_hash; | ||
880 | hlist_for_each_entry_continue(fn, node, fn_hash) { | ||
881 | iter->fn = fn; | ||
882 | |||
883 | list_for_each_entry(fa, &fn->fn_alias, fa_list) { | ||
884 | iter->fa = fa; | ||
885 | goto out; | ||
886 | } | ||
887 | } | ||
888 | } | ||
889 | |||
890 | fn = iter->fn = NULL; | ||
891 | |||
892 | /* Advance hash chain. */ | ||
/* No zone left: the walk already ended, report NULL. */
893 | if (!iter->zone) | ||
894 | goto out; | ||
895 | |||
896 | for (;;) { | ||
897 | struct hlist_node *node; | ||
898 | int maxslot; | ||
899 | |||
900 | maxslot = iter->zone->fz_divisor; | ||
901 | |||
/* Pre-increment: the current bucket is already consumed, start at the next one. */
902 | while (++iter->bucket < maxslot) { | ||
903 | iter->hash_head++; | ||
904 | |||
905 | hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { | ||
906 | list_for_each_entry(fa, &fn->fn_alias, fa_list) { | ||
907 | iter->fn = fn; | ||
908 | iter->fa = fa; | ||
909 | goto out; | ||
910 | } | ||
911 | } | ||
912 | } | ||
913 | |||
914 | iter->zone = iter->zone->fz_next; | ||
915 | |||
916 | if (!iter->zone) | ||
917 | goto out; | ||
918 | |||
919 | iter->bucket = 0; | ||
920 | iter->hash_head = iter->zone->fz_hash; | ||
921 | |||
/*
 * Bucket 0 of the new zone is scanned here because the while loop
 * above pre-increments and would otherwise skip it.
 */
922 | hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { | ||
923 | list_for_each_entry(fa, &fn->fn_alias, fa_list) { | ||
924 | iter->fn = fn; | ||
925 | iter->fa = fa; | ||
926 | goto out; | ||
927 | } | ||
928 | } | ||
929 | } | ||
930 | out: | ||
931 | iter->pos++; | ||
932 | return fa; | ||
933 | } | ||
934 | |||
935 | static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos) | ||
936 | { | ||
937 | struct fib_iter_state *iter = seq->private; | ||
938 | struct fib_alias *fa; | ||
939 | |||
940 | if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) { | ||
941 | fa = iter->fa; | ||
942 | pos -= iter->pos; | ||
943 | } else | ||
944 | fa = fib_get_first(seq); | ||
945 | |||
946 | if (fa) | ||
947 | while (pos && (fa = fib_get_next(seq))) | ||
948 | --pos; | ||
949 | return pos ? NULL : fa; | ||
950 | } | ||
951 | |||
952 | static void *fib_seq_start(struct seq_file *seq, loff_t *pos) | ||
953 | __acquires(fib_hash_lock) | ||
954 | { | ||
955 | void *v = NULL; | ||
956 | |||
957 | read_lock(&fib_hash_lock); | ||
958 | if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN)) | ||
959 | v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; | ||
960 | return v; | ||
961 | } | ||
962 | |||
963 | static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos) | ||
964 | { | ||
965 | ++*pos; | ||
966 | return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq); | ||
967 | } | ||
968 | |||
969 | static void fib_seq_stop(struct seq_file *seq, void *v) | ||
970 | __releases(fib_hash_lock) | ||
971 | { | ||
972 | read_unlock(&fib_hash_lock); | ||
973 | } | ||
974 | |||
975 | static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi) | ||
976 | { | ||
977 | static const unsigned type2flags[RTN_MAX + 1] = { | ||
978 | [7] = RTF_REJECT, [8] = RTF_REJECT, | ||
979 | }; | ||
980 | unsigned flags = type2flags[type]; | ||
981 | |||
982 | if (fi && fi->fib_nh->nh_gw) | ||
983 | flags |= RTF_GATEWAY; | ||
984 | if (mask == htonl(0xFFFFFFFF)) | ||
985 | flags |= RTF_HOST; | ||
986 | flags |= RTF_UP; | ||
987 | return flags; | ||
988 | } | ||
989 | |||
990 | /* | ||
991 | * This outputs /proc/net/route. | ||
992 | * | ||
993 | * It always works in backward compatibility mode. | ||
994 | * The format of the file is not supposed to be changed. | ||
995 | */ | ||
/*
 * seq_file ->show callback.  For SEQ_START_TOKEN the column header is
 * printed; otherwise one line is emitted for the alias the iterator in
 * seq->private currently points at.  Every line is padded with spaces to
 * 127 characters before the newline.  Always returns 0.
 */
996 | static int fib_seq_show(struct seq_file *seq, void *v) | ||
997 | { | ||
998 | struct fib_iter_state *iter; | ||
999 | int len; | ||
1000 | __be32 prefix, mask; | ||
1001 | unsigned flags; | ||
1002 | struct fib_node *f; | ||
1003 | struct fib_alias *fa; | ||
1004 | struct fib_info *fi; | ||
1005 | |||
/* Header line, left-justified in a 127-character field. */
1006 | if (v == SEQ_START_TOKEN) { | ||
1007 | seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " | ||
1008 | "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU" | ||
1009 | "\tWindow\tIRTT"); | ||
1010 | goto out; | ||
1011 | } | ||
1012 | |||
/* The record is taken from the iterator state, not from 'v'. */
1013 | iter = seq->private; | ||
1014 | f = iter->fn; | ||
1015 | fa = iter->fa; | ||
1016 | fi = fa->fa_info; | ||
1017 | prefix = f->fn_key; | ||
1018 | mask = FZ_MASK(iter->zone); | ||
1019 | flags = fib_flag_trans(fa->fa_type, mask, fi); | ||
/*
 * RefCnt and Use are always printed as 0.  The trailing %n stores the
 * number of characters written so far into 'len' for the padding below.
 */
1020 | if (fi) | ||
1021 | seq_printf(seq, | ||
1022 | "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n", | ||
1023 | fi->fib_dev ? fi->fib_dev->name : "*", prefix, | ||
1024 | fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority, | ||
1025 | mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0), | ||
1026 | fi->fib_window, | ||
1027 | fi->fib_rtt >> 3, &len); | ||
1028 | else | ||
/* No fib_info: device shown as "*", all numeric columns except prefix, flags and mask are 0. */
1029 | seq_printf(seq, | ||
1030 | "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n", | ||
1031 | prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0, &len); | ||
1032 | |||
/* Pad with spaces so the record is 127 characters wide, then end the line. */
1033 | seq_printf(seq, "%*s\n", 127 - len, ""); | ||
1034 | out: | ||
1035 | return 0; | ||
1036 | } | ||
1037 | |||
/*
 * seq_file operations for /proc/net/route: start takes fib_hash_lock for
 * reading, stop releases it, next advances the iterator and show prints
 * one line.
 */
1038 | static const struct seq_operations fib_seq_ops = { | ||
1039 | .start = fib_seq_start, | ||
1040 | .next = fib_seq_next, | ||
1041 | .stop = fib_seq_stop, | ||
1042 | .show = fib_seq_show, | ||
1043 | }; | ||
1044 | |||
1045 | static int fib_seq_open(struct inode *inode, struct file *file) | ||
1046 | { | ||
1047 | return seq_open_net(inode, file, &fib_seq_ops, | ||
1048 | sizeof(struct fib_iter_state)); | ||
1049 | } | ||
1050 | |||
/*
 * File operations for /proc/net/route.  Only open is specific to this
 * file; read and llseek are the generic seq_file helpers, and release is
 * seq_release_net, pairing with the seq_open_net() call in fib_seq_open().
 */
1051 | static const struct file_operations fib_seq_fops = { | ||
1052 | .owner = THIS_MODULE, | ||
1053 | .open = fib_seq_open, | ||
1054 | .read = seq_read, | ||
1055 | .llseek = seq_lseek, | ||
1056 | .release = seq_release_net, | ||
1057 | }; | ||
1058 | |||
1059 | int __net_init fib_proc_init(struct net *net) | ||
1060 | { | ||
1061 | if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops)) | ||
1062 | return -ENOMEM; | ||
1063 | return 0; | ||
1064 | } | ||
1065 | |||
1066 | void __net_exit fib_proc_exit(struct net *net) | ||
1067 | { | ||
1068 | proc_net_remove(net, "route"); | ||
1069 | } | ||
1070 | #endif /* CONFIG_PROC_FS */ | ||
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 637b133973bd..af0f14aba169 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h | |||
@@ -10,24 +10,25 @@ struct fib_alias { | |||
10 | struct fib_info *fa_info; | 10 | struct fib_info *fa_info; |
11 | u8 fa_tos; | 11 | u8 fa_tos; |
12 | u8 fa_type; | 12 | u8 fa_type; |
13 | u8 fa_scope; | ||
14 | u8 fa_state; | 13 | u8 fa_state; |
15 | #ifdef CONFIG_IP_FIB_TRIE | ||
16 | struct rcu_head rcu; | 14 | struct rcu_head rcu; |
17 | #endif | ||
18 | }; | 15 | }; |
19 | 16 | ||
20 | #define FA_S_ACCESSED 0x01 | 17 | #define FA_S_ACCESSED 0x01 |
21 | 18 | ||
19 | /* Don't write on fa_state unless needed, to keep it shared on all cpus */ | ||
/*
 * Mark @fa as accessed.  The flag is tested first so that the store only
 * happens the first time; once FA_S_ACCESSED is set the function performs
 * a read and nothing else.
 */
20 | static inline void fib_alias_accessed(struct fib_alias *fa) | ||
21 | { | ||
22 | if (!(fa->fa_state & FA_S_ACCESSED)) | ||
23 | fa->fa_state |= FA_S_ACCESSED; | ||
24 | } | ||
25 | |||
22 | /* Exported by fib_semantics.c */ | 26 | /* Exported by fib_semantics.c */ |
23 | extern int fib_semantic_match(struct list_head *head, | ||
24 | const struct flowi *flp, | ||
25 | struct fib_result *res, int prefixlen); | ||
26 | extern void fib_release_info(struct fib_info *); | 27 | extern void fib_release_info(struct fib_info *); |
27 | extern struct fib_info *fib_create_info(struct fib_config *cfg); | 28 | extern struct fib_info *fib_create_info(struct fib_config *cfg); |
28 | extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); | 29 | extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); |
29 | extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, | 30 | extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, |
30 | u32 tb_id, u8 type, u8 scope, __be32 dst, | 31 | u32 tb_id, u8 type, __be32 dst, |
31 | int dst_len, u8 tos, struct fib_info *fi, | 32 | int dst_len, u8 tos, struct fib_info *fi, |
32 | unsigned int); | 33 | unsigned int); |
33 | extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, | 34 | extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, |
@@ -42,11 +43,15 @@ extern int fib_detect_death(struct fib_info *fi, int order, | |||
42 | static inline void fib_result_assign(struct fib_result *res, | 43 | static inline void fib_result_assign(struct fib_result *res, |
43 | struct fib_info *fi) | 44 | struct fib_info *fi) |
44 | { | 45 | { |
45 | if (res->fi != NULL) | 46 | /* we used to play games with refcounts, but we now use RCU */ |
46 | fib_info_put(res->fi); | ||
47 | res->fi = fi; | 47 | res->fi = fi; |
48 | if (fi != NULL) | ||
49 | atomic_inc(&fi->fib_clntref); | ||
50 | } | 48 | } |
51 | 49 | ||
/*
 * Per-route-type properties; one entry per RTN_* type in the fib_props[]
 * table declared below.
 */
50 | struct fib_prop { | ||
/* 0 or a negative errno value (e.g. -EINVAL, -EHOSTUNREACH) for this route type. */
51 | int error; | ||
/* RT_SCOPE_* value associated with this route type. */
52 | u8 scope; | ||
53 | }; | ||
54 | |||
55 | extern const struct fib_prop fib_props[RTN_MAX + 1]; | ||
56 | |||
52 | #endif /* _FIB_LOOKUP_H */ | 57 | #endif /* _FIB_LOOKUP_H */ |
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 76daeb5ff564..a53bb1b5b118 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c | |||
@@ -6,7 +6,7 @@ | |||
6 | * IPv4 Forwarding Information Base: policy rules. | 6 | * IPv4 Forwarding Information Base: policy rules. |
7 | * | 7 | * |
8 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | 8 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
9 | * Thomas Graf <tgraf@suug.ch> | 9 | * Thomas Graf <tgraf@suug.ch> |
10 | * | 10 | * |
11 | * This program is free software; you can redistribute it and/or | 11 | * This program is free software; you can redistribute it and/or |
12 | * modify it under the terms of the GNU General Public License | 12 | * modify it under the terms of the GNU General Public License |
@@ -14,7 +14,7 @@ | |||
14 | * 2 of the License, or (at your option) any later version. | 14 | * 2 of the License, or (at your option) any later version. |
15 | * | 15 | * |
16 | * Fixes: | 16 | * Fixes: |
17 | * Rani Assaf : local_rule cannot be deleted | 17 | * Rani Assaf : local_rule cannot be deleted |
18 | * Marc Boucher : routing by fwmark | 18 | * Marc Boucher : routing by fwmark |
19 | */ | 19 | */ |
20 | 20 | ||
@@ -32,8 +32,7 @@ | |||
32 | #include <net/ip_fib.h> | 32 | #include <net/ip_fib.h> |
33 | #include <net/fib_rules.h> | 33 | #include <net/fib_rules.h> |
34 | 34 | ||
35 | struct fib4_rule | 35 | struct fib4_rule { |
36 | { | ||
37 | struct fib_rule common; | 36 | struct fib_rule common; |
38 | u8 dst_len; | 37 | u8 dst_len; |
39 | u8 src_len; | 38 | u8 src_len; |
@@ -42,26 +41,27 @@ struct fib4_rule | |||
42 | __be32 srcmask; | 41 | __be32 srcmask; |
43 | __be32 dst; | 42 | __be32 dst; |
44 | __be32 dstmask; | 43 | __be32 dstmask; |
45 | #ifdef CONFIG_NET_CLS_ROUTE | 44 | #ifdef CONFIG_IP_ROUTE_CLASSID |
46 | u32 tclassid; | 45 | u32 tclassid; |
47 | #endif | 46 | #endif |
48 | }; | 47 | }; |
49 | 48 | ||
50 | #ifdef CONFIG_NET_CLS_ROUTE | 49 | #ifdef CONFIG_IP_ROUTE_CLASSID |
51 | u32 fib_rules_tclass(struct fib_result *res) | 50 | u32 fib_rules_tclass(const struct fib_result *res) |
52 | { | 51 | { |
53 | return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0; | 52 | return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0; |
54 | } | 53 | } |
55 | #endif | 54 | #endif |
56 | 55 | ||
57 | int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res) | 56 | int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) |
58 | { | 57 | { |
59 | struct fib_lookup_arg arg = { | 58 | struct fib_lookup_arg arg = { |
60 | .result = res, | 59 | .result = res, |
60 | .flags = FIB_LOOKUP_NOREF, | ||
61 | }; | 61 | }; |
62 | int err; | 62 | int err; |
63 | 63 | ||
64 | err = fib_rules_lookup(net->ipv4.rules_ops, flp, 0, &arg); | 64 | err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); |
65 | res->r = arg.rule; | 65 | res->r = arg.rule; |
66 | 66 | ||
67 | return err; | 67 | return err; |
@@ -91,10 +91,11 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, | |||
91 | goto errout; | 91 | goto errout; |
92 | } | 92 | } |
93 | 93 | ||
94 | if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL) | 94 | tbl = fib_get_table(rule->fr_net, rule->table); |
95 | if (!tbl) | ||
95 | goto errout; | 96 | goto errout; |
96 | 97 | ||
97 | err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result); | 98 | err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags); |
98 | if (err > 0) | 99 | if (err > 0) |
99 | err = -EAGAIN; | 100 | err = -EAGAIN; |
100 | errout: | 101 | errout: |
@@ -105,14 +106,15 @@ errout: | |||
105 | static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) | 106 | static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) |
106 | { | 107 | { |
107 | struct fib4_rule *r = (struct fib4_rule *) rule; | 108 | struct fib4_rule *r = (struct fib4_rule *) rule; |
108 | __be32 daddr = fl->fl4_dst; | 109 | struct flowi4 *fl4 = &fl->u.ip4; |
109 | __be32 saddr = fl->fl4_src; | 110 | __be32 daddr = fl4->daddr; |
111 | __be32 saddr = fl4->saddr; | ||
110 | 112 | ||
111 | if (((saddr ^ r->src) & r->srcmask) || | 113 | if (((saddr ^ r->src) & r->srcmask) || |
112 | ((daddr ^ r->dst) & r->dstmask)) | 114 | ((daddr ^ r->dst) & r->dstmask)) |
113 | return 0; | 115 | return 0; |
114 | 116 | ||
115 | if (r->tos && (r->tos != fl->fl4_tos)) | 117 | if (r->tos && (r->tos != fl4->flowi4_tos)) |
116 | return 0; | 118 | return 0; |
117 | 119 | ||
118 | return 1; | 120 | return 1; |
@@ -164,7 +166,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, | |||
164 | if (frh->dst_len) | 166 | if (frh->dst_len) |
165 | rule4->dst = nla_get_be32(tb[FRA_DST]); | 167 | rule4->dst = nla_get_be32(tb[FRA_DST]); |
166 | 168 | ||
167 | #ifdef CONFIG_NET_CLS_ROUTE | 169 | #ifdef CONFIG_IP_ROUTE_CLASSID |
168 | if (tb[FRA_FLOW]) | 170 | if (tb[FRA_FLOW]) |
169 | rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); | 171 | rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); |
170 | #endif | 172 | #endif |
@@ -194,7 +196,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, | |||
194 | if (frh->tos && (rule4->tos != frh->tos)) | 196 | if (frh->tos && (rule4->tos != frh->tos)) |
195 | return 0; | 197 | return 0; |
196 | 198 | ||
197 | #ifdef CONFIG_NET_CLS_ROUTE | 199 | #ifdef CONFIG_IP_ROUTE_CLASSID |
198 | if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) | 200 | if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) |
199 | return 0; | 201 | return 0; |
200 | #endif | 202 | #endif |
@@ -223,7 +225,7 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, | |||
223 | if (rule4->src_len) | 225 | if (rule4->src_len) |
224 | NLA_PUT_BE32(skb, FRA_SRC, rule4->src); | 226 | NLA_PUT_BE32(skb, FRA_SRC, rule4->src); |
225 | 227 | ||
226 | #ifdef CONFIG_NET_CLS_ROUTE | 228 | #ifdef CONFIG_IP_ROUTE_CLASSID |
227 | if (rule4->tclassid) | 229 | if (rule4->tclassid) |
228 | NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid); | 230 | NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid); |
229 | #endif | 231 | #endif |
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 20f09c5b31e8..33e2c35b74b7 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c | |||
@@ -49,7 +49,7 @@ | |||
49 | static DEFINE_SPINLOCK(fib_info_lock); | 49 | static DEFINE_SPINLOCK(fib_info_lock); |
50 | static struct hlist_head *fib_info_hash; | 50 | static struct hlist_head *fib_info_hash; |
51 | static struct hlist_head *fib_info_laddrhash; | 51 | static struct hlist_head *fib_info_laddrhash; |
52 | static unsigned int fib_hash_size; | 52 | static unsigned int fib_info_hash_size; |
53 | static unsigned int fib_info_cnt; | 53 | static unsigned int fib_info_cnt; |
54 | 54 | ||
55 | #define DEVINDEX_HASHBITS 8 | 55 | #define DEVINDEX_HASHBITS 8 |
@@ -60,89 +60,93 @@ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; | |||
60 | 60 | ||
61 | static DEFINE_SPINLOCK(fib_multipath_lock); | 61 | static DEFINE_SPINLOCK(fib_multipath_lock); |
62 | 62 | ||
63 | #define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \ | 63 | #define for_nexthops(fi) { \ |
64 | for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) | 64 | int nhsel; const struct fib_nh *nh; \ |
65 | for (nhsel = 0, nh = (fi)->fib_nh; \ | ||
66 | nhsel < (fi)->fib_nhs; \ | ||
67 | nh++, nhsel++) | ||
65 | 68 | ||
66 | #define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \ | 69 | #define change_nexthops(fi) { \ |
67 | for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++) | 70 | int nhsel; struct fib_nh *nexthop_nh; \ |
71 | for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ | ||
72 | nhsel < (fi)->fib_nhs; \ | ||
73 | nexthop_nh++, nhsel++) | ||
68 | 74 | ||
69 | #else /* CONFIG_IP_ROUTE_MULTIPATH */ | 75 | #else /* CONFIG_IP_ROUTE_MULTIPATH */ |
70 | 76 | ||
71 | /* Hope, that gcc will optimize it to get rid of dummy loop */ | 77 | /* Hope, that gcc will optimize it to get rid of dummy loop */ |
72 | 78 | ||
73 | #define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \ | 79 | #define for_nexthops(fi) { \ |
74 | for (nhsel=0; nhsel < 1; nhsel++) | 80 | int nhsel; const struct fib_nh *nh = (fi)->fib_nh; \ |
81 | for (nhsel = 0; nhsel < 1; nhsel++) | ||
75 | 82 | ||
76 | #define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ | 83 | #define change_nexthops(fi) { \ |
77 | for (nhsel=0; nhsel < 1; nhsel++) | 84 | int nhsel; \ |
85 | struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ | ||
86 | for (nhsel = 0; nhsel < 1; nhsel++) | ||
78 | 87 | ||
79 | #endif /* CONFIG_IP_ROUTE_MULTIPATH */ | 88 | #endif /* CONFIG_IP_ROUTE_MULTIPATH */ |
80 | 89 | ||
81 | #define endfor_nexthops(fi) } | 90 | #define endfor_nexthops(fi) } |
82 | 91 | ||
83 | 92 | ||
84 | static const struct | 93 | const struct fib_prop fib_props[RTN_MAX + 1] = { |
85 | { | 94 | [RTN_UNSPEC] = { |
86 | int error; | ||
87 | u8 scope; | ||
88 | } fib_props[RTN_MAX + 1] = { | ||
89 | { | ||
90 | .error = 0, | 95 | .error = 0, |
91 | .scope = RT_SCOPE_NOWHERE, | 96 | .scope = RT_SCOPE_NOWHERE, |
92 | }, /* RTN_UNSPEC */ | 97 | }, |
93 | { | 98 | [RTN_UNICAST] = { |
94 | .error = 0, | 99 | .error = 0, |
95 | .scope = RT_SCOPE_UNIVERSE, | 100 | .scope = RT_SCOPE_UNIVERSE, |
96 | }, /* RTN_UNICAST */ | 101 | }, |
97 | { | 102 | [RTN_LOCAL] = { |
98 | .error = 0, | 103 | .error = 0, |
99 | .scope = RT_SCOPE_HOST, | 104 | .scope = RT_SCOPE_HOST, |
100 | }, /* RTN_LOCAL */ | 105 | }, |
101 | { | 106 | [RTN_BROADCAST] = { |
102 | .error = 0, | 107 | .error = 0, |
103 | .scope = RT_SCOPE_LINK, | 108 | .scope = RT_SCOPE_LINK, |
104 | }, /* RTN_BROADCAST */ | 109 | }, |
105 | { | 110 | [RTN_ANYCAST] = { |
106 | .error = 0, | 111 | .error = 0, |
107 | .scope = RT_SCOPE_LINK, | 112 | .scope = RT_SCOPE_LINK, |
108 | }, /* RTN_ANYCAST */ | 113 | }, |
109 | { | 114 | [RTN_MULTICAST] = { |
110 | .error = 0, | 115 | .error = 0, |
111 | .scope = RT_SCOPE_UNIVERSE, | 116 | .scope = RT_SCOPE_UNIVERSE, |
112 | }, /* RTN_MULTICAST */ | 117 | }, |
113 | { | 118 | [RTN_BLACKHOLE] = { |
114 | .error = -EINVAL, | 119 | .error = -EINVAL, |
115 | .scope = RT_SCOPE_UNIVERSE, | 120 | .scope = RT_SCOPE_UNIVERSE, |
116 | }, /* RTN_BLACKHOLE */ | 121 | }, |
117 | { | 122 | [RTN_UNREACHABLE] = { |
118 | .error = -EHOSTUNREACH, | 123 | .error = -EHOSTUNREACH, |
119 | .scope = RT_SCOPE_UNIVERSE, | 124 | .scope = RT_SCOPE_UNIVERSE, |
120 | }, /* RTN_UNREACHABLE */ | 125 | }, |
121 | { | 126 | [RTN_PROHIBIT] = { |
122 | .error = -EACCES, | 127 | .error = -EACCES, |
123 | .scope = RT_SCOPE_UNIVERSE, | 128 | .scope = RT_SCOPE_UNIVERSE, |
124 | }, /* RTN_PROHIBIT */ | 129 | }, |
125 | { | 130 | [RTN_THROW] = { |
126 | .error = -EAGAIN, | 131 | .error = -EAGAIN, |
127 | .scope = RT_SCOPE_UNIVERSE, | 132 | .scope = RT_SCOPE_UNIVERSE, |
128 | }, /* RTN_THROW */ | 133 | }, |
129 | { | 134 | [RTN_NAT] = { |
130 | .error = -EINVAL, | 135 | .error = -EINVAL, |
131 | .scope = RT_SCOPE_NOWHERE, | 136 | .scope = RT_SCOPE_NOWHERE, |
132 | }, /* RTN_NAT */ | 137 | }, |
133 | { | 138 | [RTN_XRESOLVE] = { |
134 | .error = -EINVAL, | 139 | .error = -EINVAL, |
135 | .scope = RT_SCOPE_NOWHERE, | 140 | .scope = RT_SCOPE_NOWHERE, |
136 | }, /* RTN_XRESOLVE */ | 141 | }, |
137 | }; | 142 | }; |
138 | 143 | ||
139 | |||
140 | /* Release a nexthop info record */ | 144 | /* Release a nexthop info record */ |
141 | 145 | ||
142 | void free_fib_info(struct fib_info *fi) | 146 | void free_fib_info(struct fib_info *fi) |
143 | { | 147 | { |
144 | if (fi->fib_dead == 0) { | 148 | if (fi->fib_dead == 0) { |
145 | printk(KERN_WARNING "Freeing alive fib_info %p\n", fi); | 149 | pr_warning("Freeing alive fib_info %p\n", fi); |
146 | return; | 150 | return; |
147 | } | 151 | } |
148 | change_nexthops(fi) { | 152 | change_nexthops(fi) { |
@@ -152,7 +156,7 @@ void free_fib_info(struct fib_info *fi) | |||
152 | } endfor_nexthops(fi); | 156 | } endfor_nexthops(fi); |
153 | fib_info_cnt--; | 157 | fib_info_cnt--; |
154 | release_net(fi->fib_net); | 158 | release_net(fi->fib_net); |
155 | kfree(fi); | 159 | kfree_rcu(fi, rcu); |
156 | } | 160 | } |
157 | 161 | ||
158 | void fib_release_info(struct fib_info *fi) | 162 | void fib_release_info(struct fib_info *fi) |
@@ -173,7 +177,7 @@ void fib_release_info(struct fib_info *fi) | |||
173 | spin_unlock_bh(&fib_info_lock); | 177 | spin_unlock_bh(&fib_info_lock); |
174 | } | 178 | } |
175 | 179 | ||
176 | static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) | 180 | static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) |
177 | { | 181 | { |
178 | const struct fib_nh *onh = ofi->fib_nh; | 182 | const struct fib_nh *onh = ofi->fib_nh; |
179 | 183 | ||
@@ -184,10 +188,10 @@ static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info * | |||
184 | #ifdef CONFIG_IP_ROUTE_MULTIPATH | 188 | #ifdef CONFIG_IP_ROUTE_MULTIPATH |
185 | nh->nh_weight != onh->nh_weight || | 189 | nh->nh_weight != onh->nh_weight || |
186 | #endif | 190 | #endif |
187 | #ifdef CONFIG_NET_CLS_ROUTE | 191 | #ifdef CONFIG_IP_ROUTE_CLASSID |
188 | nh->nh_tclassid != onh->nh_tclassid || | 192 | nh->nh_tclassid != onh->nh_tclassid || |
189 | #endif | 193 | #endif |
190 | ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) | 194 | ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) |
191 | return -1; | 195 | return -1; |
192 | onh++; | 196 | onh++; |
193 | } endfor_nexthops(fi); | 197 | } endfor_nexthops(fi); |
@@ -205,10 +209,10 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val) | |||
205 | 209 | ||
206 | static inline unsigned int fib_info_hashfn(const struct fib_info *fi) | 210 | static inline unsigned int fib_info_hashfn(const struct fib_info *fi) |
207 | { | 211 | { |
208 | unsigned int mask = (fib_hash_size - 1); | 212 | unsigned int mask = (fib_info_hash_size - 1); |
209 | unsigned int val = fi->fib_nhs; | 213 | unsigned int val = fi->fib_nhs; |
210 | 214 | ||
211 | val ^= fi->fib_protocol; | 215 | val ^= (fi->fib_protocol << 8) | fi->fib_scope; |
212 | val ^= (__force u32)fi->fib_prefsrc; | 216 | val ^= (__force u32)fi->fib_prefsrc; |
213 | val ^= fi->fib_priority; | 217 | val ^= fi->fib_priority; |
214 | for_nexthops(fi) { | 218 | for_nexthops(fi) { |
@@ -234,11 +238,12 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) | |||
234 | if (fi->fib_nhs != nfi->fib_nhs) | 238 | if (fi->fib_nhs != nfi->fib_nhs) |
235 | continue; | 239 | continue; |
236 | if (nfi->fib_protocol == fi->fib_protocol && | 240 | if (nfi->fib_protocol == fi->fib_protocol && |
241 | nfi->fib_scope == fi->fib_scope && | ||
237 | nfi->fib_prefsrc == fi->fib_prefsrc && | 242 | nfi->fib_prefsrc == fi->fib_prefsrc && |
238 | nfi->fib_priority == fi->fib_priority && | 243 | nfi->fib_priority == fi->fib_priority && |
239 | memcmp(nfi->fib_metrics, fi->fib_metrics, | 244 | memcmp(nfi->fib_metrics, fi->fib_metrics, |
240 | sizeof(fi->fib_metrics)) == 0 && | 245 | sizeof(u32) * RTAX_MAX) == 0 && |
241 | ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 && | 246 | ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && |
242 | (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) | 247 | (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) |
243 | return fi; | 248 | return fi; |
244 | } | 249 | } |
@@ -247,9 +252,8 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) | |||
247 | } | 252 | } |
248 | 253 | ||
249 | /* Check, that the gateway is already configured. | 254 | /* Check, that the gateway is already configured. |
250 | Used only by redirect accept routine. | 255 | * Used only by redirect accept routine. |
251 | */ | 256 | */ |
252 | |||
253 | int ip_fib_check_default(__be32 gw, struct net_device *dev) | 257 | int ip_fib_check_default(__be32 gw, struct net_device *dev) |
254 | { | 258 | { |
255 | struct hlist_head *head; | 259 | struct hlist_head *head; |
@@ -264,7 +268,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev) | |||
264 | hlist_for_each_entry(nh, node, head, nh_hash) { | 268 | hlist_for_each_entry(nh, node, head, nh_hash) { |
265 | if (nh->nh_dev == dev && | 269 | if (nh->nh_dev == dev && |
266 | nh->nh_gw == gw && | 270 | nh->nh_gw == gw && |
267 | !(nh->nh_flags&RTNH_F_DEAD)) { | 271 | !(nh->nh_flags & RTNH_F_DEAD)) { |
268 | spin_unlock(&fib_info_lock); | 272 | spin_unlock(&fib_info_lock); |
269 | return 0; | 273 | return 0; |
270 | } | 274 | } |
@@ -315,7 +319,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, | |||
315 | goto errout; | 319 | goto errout; |
316 | 320 | ||
317 | err = fib_dump_info(skb, info->pid, seq, event, tb_id, | 321 | err = fib_dump_info(skb, info->pid, seq, event, tb_id, |
318 | fa->fa_type, fa->fa_scope, key, dst_len, | 322 | fa->fa_type, key, dst_len, |
319 | fa->fa_tos, fa->fa_info, nlm_flags); | 323 | fa->fa_tos, fa->fa_info, nlm_flags); |
320 | if (err < 0) { | 324 | if (err < 0) { |
321 | /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ | 325 | /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ |
@@ -362,10 +366,10 @@ int fib_detect_death(struct fib_info *fi, int order, | |||
362 | } | 366 | } |
363 | if (state == NUD_REACHABLE) | 367 | if (state == NUD_REACHABLE) |
364 | return 0; | 368 | return 0; |
365 | if ((state&NUD_VALID) && order != dflt) | 369 | if ((state & NUD_VALID) && order != dflt) |
366 | return 0; | 370 | return 0; |
367 | if ((state&NUD_VALID) || | 371 | if ((state & NUD_VALID) || |
368 | (*last_idx<0 && order > dflt)) { | 372 | (*last_idx < 0 && order > dflt)) { |
369 | *last_resort = fi; | 373 | *last_resort = fi; |
370 | *last_idx = order; | 374 | *last_idx = order; |
371 | } | 375 | } |
@@ -407,7 +411,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, | |||
407 | 411 | ||
408 | nla = nla_find(attrs, attrlen, RTA_GATEWAY); | 412 | nla = nla_find(attrs, attrlen, RTA_GATEWAY); |
409 | nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; | 413 | nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; |
410 | #ifdef CONFIG_NET_CLS_ROUTE | 414 | #ifdef CONFIG_IP_ROUTE_CLASSID |
411 | nla = nla_find(attrs, attrlen, RTA_FLOW); | 415 | nla = nla_find(attrs, attrlen, RTA_FLOW); |
412 | nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; | 416 | nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; |
413 | #endif | 417 | #endif |
@@ -461,7 +465,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) | |||
461 | nla = nla_find(attrs, attrlen, RTA_GATEWAY); | 465 | nla = nla_find(attrs, attrlen, RTA_GATEWAY); |
462 | if (nla && nla_get_be32(nla) != nh->nh_gw) | 466 | if (nla && nla_get_be32(nla) != nh->nh_gw) |
463 | return 1; | 467 | return 1; |
464 | #ifdef CONFIG_NET_CLS_ROUTE | 468 | #ifdef CONFIG_IP_ROUTE_CLASSID |
465 | nla = nla_find(attrs, attrlen, RTA_FLOW); | 469 | nla = nla_find(attrs, attrlen, RTA_FLOW); |
466 | if (nla && nla_get_u32(nla) != nh->nh_tclassid) | 470 | if (nla && nla_get_u32(nla) != nh->nh_tclassid) |
467 | return 1; | 471 | return 1; |
@@ -476,145 +480,146 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) | |||
476 | 480 | ||
477 | 481 | ||
478 | /* | 482 | /* |
479 | Picture | 483 | * Picture |
480 | ------- | 484 | * ------- |
481 | 485 | * | |
482 | Semantics of nexthop is very messy by historical reasons. | 486 | * Semantics of nexthop is very messy by historical reasons. |
483 | We have to take into account, that: | 487 | * We have to take into account, that: |
484 | a) gateway can be actually local interface address, | 488 | * a) gateway can be actually local interface address, |
485 | so that gatewayed route is direct. | 489 | * so that gatewayed route is direct. |
486 | b) gateway must be on-link address, possibly | 490 | * b) gateway must be on-link address, possibly |
487 | described not by an ifaddr, but also by a direct route. | 491 | * described not by an ifaddr, but also by a direct route. |
488 | c) If both gateway and interface are specified, they should not | 492 | * c) If both gateway and interface are specified, they should not |
489 | contradict. | 493 | * contradict. |
490 | d) If we use tunnel routes, gateway could be not on-link. | 494 | * d) If we use tunnel routes, gateway could be not on-link. |
491 | 495 | * | |
492 | Attempt to reconcile all of these (alas, self-contradictory) conditions | 496 | * Attempt to reconcile all of these (alas, self-contradictory) conditions |
493 | results in pretty ugly and hairy code with obscure logic. | 497 | * results in pretty ugly and hairy code with obscure logic. |
494 | 498 | * | |
495 | I chose to generalized it instead, so that the size | 499 | * I chose to generalized it instead, so that the size |
496 | of code does not increase practically, but it becomes | 500 | * of code does not increase practically, but it becomes |
497 | much more general. | 501 | * much more general. |
498 | Every prefix is assigned a "scope" value: "host" is local address, | 502 | * Every prefix is assigned a "scope" value: "host" is local address, |
499 | "link" is direct route, | 503 | * "link" is direct route, |
500 | [ ... "site" ... "interior" ... ] | 504 | * [ ... "site" ... "interior" ... ] |
501 | and "universe" is true gateway route with global meaning. | 505 | * and "universe" is true gateway route with global meaning. |
502 | 506 | * | |
503 | Every prefix refers to a set of "nexthop"s (gw, oif), | 507 | * Every prefix refers to a set of "nexthop"s (gw, oif), |
504 | where gw must have narrower scope. This recursion stops | 508 | * where gw must have narrower scope. This recursion stops |
505 | when gw has LOCAL scope or if "nexthop" is declared ONLINK, | 509 | * when gw has LOCAL scope or if "nexthop" is declared ONLINK, |
506 | which means that gw is forced to be on link. | 510 | * which means that gw is forced to be on link. |
507 | 511 | * | |
508 | Code is still hairy, but now it is apparently logically | 512 | * Code is still hairy, but now it is apparently logically |
509 | consistent and very flexible. F.e. as by-product it allows | 513 | * consistent and very flexible. F.e. as by-product it allows |
510 | to co-exists in peace independent exterior and interior | 514 | * to co-exists in peace independent exterior and interior |
511 | routing processes. | 515 | * routing processes. |
512 | 516 | * | |
513 | Normally it looks as following. | 517 | * Normally it looks as following. |
514 | 518 | * | |
515 | {universe prefix} -> (gw, oif) [scope link] | 519 | * {universe prefix} -> (gw, oif) [scope link] |
516 | | | 520 | * | |
517 | |-> {link prefix} -> (gw, oif) [scope local] | 521 | * |-> {link prefix} -> (gw, oif) [scope local] |
518 | | | 522 | * | |
519 | |-> {local prefix} (terminal node) | 523 | * |-> {local prefix} (terminal node) |
520 | */ | 524 | */ |
521 | |||
522 | static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, | 525 | static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, |
523 | struct fib_nh *nh) | 526 | struct fib_nh *nh) |
524 | { | 527 | { |
525 | int err; | 528 | int err; |
526 | struct net *net; | 529 | struct net *net; |
530 | struct net_device *dev; | ||
527 | 531 | ||
528 | net = cfg->fc_nlinfo.nl_net; | 532 | net = cfg->fc_nlinfo.nl_net; |
529 | if (nh->nh_gw) { | 533 | if (nh->nh_gw) { |
530 | struct fib_result res; | 534 | struct fib_result res; |
531 | 535 | ||
532 | if (nh->nh_flags&RTNH_F_ONLINK) { | 536 | if (nh->nh_flags & RTNH_F_ONLINK) { |
533 | struct net_device *dev; | ||
534 | 537 | ||
535 | if (cfg->fc_scope >= RT_SCOPE_LINK) | 538 | if (cfg->fc_scope >= RT_SCOPE_LINK) |
536 | return -EINVAL; | 539 | return -EINVAL; |
537 | if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST) | 540 | if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST) |
538 | return -EINVAL; | 541 | return -EINVAL; |
539 | if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL) | 542 | dev = __dev_get_by_index(net, nh->nh_oif); |
543 | if (!dev) | ||
540 | return -ENODEV; | 544 | return -ENODEV; |
541 | if (!(dev->flags&IFF_UP)) | 545 | if (!(dev->flags & IFF_UP)) |
542 | return -ENETDOWN; | 546 | return -ENETDOWN; |
543 | nh->nh_dev = dev; | 547 | nh->nh_dev = dev; |
544 | dev_hold(dev); | 548 | dev_hold(dev); |
545 | nh->nh_scope = RT_SCOPE_LINK; | 549 | nh->nh_scope = RT_SCOPE_LINK; |
546 | return 0; | 550 | return 0; |
547 | } | 551 | } |
552 | rcu_read_lock(); | ||
548 | { | 553 | { |
549 | struct flowi fl = { | 554 | struct flowi4 fl4 = { |
550 | .nl_u = { | 555 | .daddr = nh->nh_gw, |
551 | .ip4_u = { | 556 | .flowi4_scope = cfg->fc_scope + 1, |
552 | .daddr = nh->nh_gw, | 557 | .flowi4_oif = nh->nh_oif, |
553 | .scope = cfg->fc_scope + 1, | ||
554 | }, | ||
555 | }, | ||
556 | .oif = nh->nh_oif, | ||
557 | }; | 558 | }; |
558 | 559 | ||
559 | /* It is not necessary, but requires a bit of thinking */ | 560 | /* It is not necessary, but requires a bit of thinking */ |
560 | if (fl.fl4_scope < RT_SCOPE_LINK) | 561 | if (fl4.flowi4_scope < RT_SCOPE_LINK) |
561 | fl.fl4_scope = RT_SCOPE_LINK; | 562 | fl4.flowi4_scope = RT_SCOPE_LINK; |
562 | if ((err = fib_lookup(net, &fl, &res)) != 0) | 563 | err = fib_lookup(net, &fl4, &res); |
564 | if (err) { | ||
565 | rcu_read_unlock(); | ||
563 | return err; | 566 | return err; |
567 | } | ||
564 | } | 568 | } |
565 | err = -EINVAL; | 569 | err = -EINVAL; |
566 | if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) | 570 | if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) |
567 | goto out; | 571 | goto out; |
568 | nh->nh_scope = res.scope; | 572 | nh->nh_scope = res.scope; |
569 | nh->nh_oif = FIB_RES_OIF(res); | 573 | nh->nh_oif = FIB_RES_OIF(res); |
570 | if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL) | 574 | nh->nh_dev = dev = FIB_RES_DEV(res); |
575 | if (!dev) | ||
571 | goto out; | 576 | goto out; |
572 | dev_hold(nh->nh_dev); | 577 | dev_hold(dev); |
573 | err = -ENETDOWN; | 578 | err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN; |
574 | if (!(nh->nh_dev->flags & IFF_UP)) | ||
575 | goto out; | ||
576 | err = 0; | ||
577 | out: | ||
578 | fib_res_put(&res); | ||
579 | return err; | ||
580 | } else { | 579 | } else { |
581 | struct in_device *in_dev; | 580 | struct in_device *in_dev; |
582 | 581 | ||
583 | if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK)) | 582 | if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) |
584 | return -EINVAL; | 583 | return -EINVAL; |
585 | 584 | ||
585 | rcu_read_lock(); | ||
586 | err = -ENODEV; | ||
586 | in_dev = inetdev_by_index(net, nh->nh_oif); | 587 | in_dev = inetdev_by_index(net, nh->nh_oif); |
587 | if (in_dev == NULL) | 588 | if (in_dev == NULL) |
588 | return -ENODEV; | 589 | goto out; |
589 | if (!(in_dev->dev->flags&IFF_UP)) { | 590 | err = -ENETDOWN; |
590 | in_dev_put(in_dev); | 591 | if (!(in_dev->dev->flags & IFF_UP)) |
591 | return -ENETDOWN; | 592 | goto out; |
592 | } | ||
593 | nh->nh_dev = in_dev->dev; | 593 | nh->nh_dev = in_dev->dev; |
594 | dev_hold(nh->nh_dev); | 594 | dev_hold(nh->nh_dev); |
595 | nh->nh_scope = RT_SCOPE_HOST; | 595 | nh->nh_scope = RT_SCOPE_HOST; |
596 | in_dev_put(in_dev); | 596 | err = 0; |
597 | } | 597 | } |
598 | return 0; | 598 | out: |
599 | rcu_read_unlock(); | ||
600 | return err; | ||
599 | } | 601 | } |
600 | 602 | ||
601 | static inline unsigned int fib_laddr_hashfn(__be32 val) | 603 | static inline unsigned int fib_laddr_hashfn(__be32 val) |
602 | { | 604 | { |
603 | unsigned int mask = (fib_hash_size - 1); | 605 | unsigned int mask = (fib_info_hash_size - 1); |
604 | 606 | ||
605 | return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask; | 607 | return ((__force u32)val ^ |
608 | ((__force u32)val >> 7) ^ | ||
609 | ((__force u32)val >> 14)) & mask; | ||
606 | } | 610 | } |
607 | 611 | ||
608 | static struct hlist_head *fib_hash_alloc(int bytes) | 612 | static struct hlist_head *fib_info_hash_alloc(int bytes) |
609 | { | 613 | { |
610 | if (bytes <= PAGE_SIZE) | 614 | if (bytes <= PAGE_SIZE) |
611 | return kzalloc(bytes, GFP_KERNEL); | 615 | return kzalloc(bytes, GFP_KERNEL); |
612 | else | 616 | else |
613 | return (struct hlist_head *) | 617 | return (struct hlist_head *) |
614 | __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes)); | 618 | __get_free_pages(GFP_KERNEL | __GFP_ZERO, |
619 | get_order(bytes)); | ||
615 | } | 620 | } |
616 | 621 | ||
617 | static void fib_hash_free(struct hlist_head *hash, int bytes) | 622 | static void fib_info_hash_free(struct hlist_head *hash, int bytes) |
618 | { | 623 | { |
619 | if (!hash) | 624 | if (!hash) |
620 | return; | 625 | return; |
@@ -625,18 +630,18 @@ static void fib_hash_free(struct hlist_head *hash, int bytes) | |||
625 | free_pages((unsigned long) hash, get_order(bytes)); | 630 | free_pages((unsigned long) hash, get_order(bytes)); |
626 | } | 631 | } |
627 | 632 | ||
628 | static void fib_hash_move(struct hlist_head *new_info_hash, | 633 | static void fib_info_hash_move(struct hlist_head *new_info_hash, |
629 | struct hlist_head *new_laddrhash, | 634 | struct hlist_head *new_laddrhash, |
630 | unsigned int new_size) | 635 | unsigned int new_size) |
631 | { | 636 | { |
632 | struct hlist_head *old_info_hash, *old_laddrhash; | 637 | struct hlist_head *old_info_hash, *old_laddrhash; |
633 | unsigned int old_size = fib_hash_size; | 638 | unsigned int old_size = fib_info_hash_size; |
634 | unsigned int i, bytes; | 639 | unsigned int i, bytes; |
635 | 640 | ||
636 | spin_lock_bh(&fib_info_lock); | 641 | spin_lock_bh(&fib_info_lock); |
637 | old_info_hash = fib_info_hash; | 642 | old_info_hash = fib_info_hash; |
638 | old_laddrhash = fib_info_laddrhash; | 643 | old_laddrhash = fib_info_laddrhash; |
639 | fib_hash_size = new_size; | 644 | fib_info_hash_size = new_size; |
640 | 645 | ||
641 | for (i = 0; i < old_size; i++) { | 646 | for (i = 0; i < old_size; i++) { |
642 | struct hlist_head *head = &fib_info_hash[i]; | 647 | struct hlist_head *head = &fib_info_hash[i]; |
@@ -677,8 +682,18 @@ static void fib_hash_move(struct hlist_head *new_info_hash, | |||
677 | spin_unlock_bh(&fib_info_lock); | 682 | spin_unlock_bh(&fib_info_lock); |
678 | 683 | ||
679 | bytes = old_size * sizeof(struct hlist_head *); | 684 | bytes = old_size * sizeof(struct hlist_head *); |
680 | fib_hash_free(old_info_hash, bytes); | 685 | fib_info_hash_free(old_info_hash, bytes); |
681 | fib_hash_free(old_laddrhash, bytes); | 686 | fib_info_hash_free(old_laddrhash, bytes); |
687 | } | ||
688 | |||
689 | __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) | ||
690 | { | ||
691 | nh->nh_saddr = inet_select_addr(nh->nh_dev, | ||
692 | nh->nh_gw, | ||
693 | nh->nh_parent->fib_scope); | ||
694 | nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid); | ||
695 | |||
696 | return nh->nh_saddr; | ||
682 | } | 697 | } |
683 | 698 | ||
684 | struct fib_info *fib_create_info(struct fib_config *cfg) | 699 | struct fib_info *fib_create_info(struct fib_config *cfg) |
@@ -689,6 +704,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg) | |||
689 | int nhs = 1; | 704 | int nhs = 1; |
690 | struct net *net = cfg->fc_nlinfo.nl_net; | 705 | struct net *net = cfg->fc_nlinfo.nl_net; |
691 | 706 | ||
707 | if (cfg->fc_type > RTN_MAX) | ||
708 | goto err_inval; | ||
709 | |||
692 | /* Fast check to catch the most weird cases */ | 710 | /* Fast check to catch the most weird cases */ |
693 | if (fib_props[cfg->fc_type].scope > cfg->fc_scope) | 711 | if (fib_props[cfg->fc_type].scope > cfg->fc_scope) |
694 | goto err_inval; | 712 | goto err_inval; |
@@ -702,8 +720,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) | |||
702 | #endif | 720 | #endif |
703 | 721 | ||
704 | err = -ENOBUFS; | 722 | err = -ENOBUFS; |
705 | if (fib_info_cnt >= fib_hash_size) { | 723 | if (fib_info_cnt >= fib_info_hash_size) { |
706 | unsigned int new_size = fib_hash_size << 1; | 724 | unsigned int new_size = fib_info_hash_size << 1; |
707 | struct hlist_head *new_info_hash; | 725 | struct hlist_head *new_info_hash; |
708 | struct hlist_head *new_laddrhash; | 726 | struct hlist_head *new_laddrhash; |
709 | unsigned int bytes; | 727 | unsigned int bytes; |
@@ -711,25 +729,32 @@ struct fib_info *fib_create_info(struct fib_config *cfg) | |||
711 | if (!new_size) | 729 | if (!new_size) |
712 | new_size = 1; | 730 | new_size = 1; |
713 | bytes = new_size * sizeof(struct hlist_head *); | 731 | bytes = new_size * sizeof(struct hlist_head *); |
714 | new_info_hash = fib_hash_alloc(bytes); | 732 | new_info_hash = fib_info_hash_alloc(bytes); |
715 | new_laddrhash = fib_hash_alloc(bytes); | 733 | new_laddrhash = fib_info_hash_alloc(bytes); |
716 | if (!new_info_hash || !new_laddrhash) { | 734 | if (!new_info_hash || !new_laddrhash) { |
717 | fib_hash_free(new_info_hash, bytes); | 735 | fib_info_hash_free(new_info_hash, bytes); |
718 | fib_hash_free(new_laddrhash, bytes); | 736 | fib_info_hash_free(new_laddrhash, bytes); |
719 | } else | 737 | } else |
720 | fib_hash_move(new_info_hash, new_laddrhash, new_size); | 738 | fib_info_hash_move(new_info_hash, new_laddrhash, new_size); |
721 | 739 | ||
722 | if (!fib_hash_size) | 740 | if (!fib_info_hash_size) |
723 | goto failure; | 741 | goto failure; |
724 | } | 742 | } |
725 | 743 | ||
726 | fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); | 744 | fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); |
727 | if (fi == NULL) | 745 | if (fi == NULL) |
728 | goto failure; | 746 | goto failure; |
747 | if (cfg->fc_mx) { | ||
748 | fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); | ||
749 | if (!fi->fib_metrics) | ||
750 | goto failure; | ||
751 | } else | ||
752 | fi->fib_metrics = (u32 *) dst_default_metrics; | ||
729 | fib_info_cnt++; | 753 | fib_info_cnt++; |
730 | 754 | ||
731 | fi->fib_net = hold_net(net); | 755 | fi->fib_net = hold_net(net); |
732 | fi->fib_protocol = cfg->fc_protocol; | 756 | fi->fib_protocol = cfg->fc_protocol; |
757 | fi->fib_scope = cfg->fc_scope; | ||
733 | fi->fib_flags = cfg->fc_flags; | 758 | fi->fib_flags = cfg->fc_flags; |
734 | fi->fib_priority = cfg->fc_priority; | 759 | fi->fib_priority = cfg->fc_priority; |
735 | fi->fib_prefsrc = cfg->fc_prefsrc; | 760 | fi->fib_prefsrc = cfg->fc_prefsrc; |
@@ -763,7 +788,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) | |||
763 | goto err_inval; | 788 | goto err_inval; |
764 | if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) | 789 | if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) |
765 | goto err_inval; | 790 | goto err_inval; |
766 | #ifdef CONFIG_NET_CLS_ROUTE | 791 | #ifdef CONFIG_IP_ROUTE_CLASSID |
767 | if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) | 792 | if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) |
768 | goto err_inval; | 793 | goto err_inval; |
769 | #endif | 794 | #endif |
@@ -776,7 +801,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) | |||
776 | nh->nh_oif = cfg->fc_oif; | 801 | nh->nh_oif = cfg->fc_oif; |
777 | nh->nh_gw = cfg->fc_gw; | 802 | nh->nh_gw = cfg->fc_gw; |
778 | nh->nh_flags = cfg->fc_flags; | 803 | nh->nh_flags = cfg->fc_flags; |
779 | #ifdef CONFIG_NET_CLS_ROUTE | 804 | #ifdef CONFIG_IP_ROUTE_CLASSID |
780 | nh->nh_tclassid = cfg->fc_flow; | 805 | nh->nh_tclassid = cfg->fc_flow; |
781 | #endif | 806 | #endif |
782 | #ifdef CONFIG_IP_ROUTE_MULTIPATH | 807 | #ifdef CONFIG_IP_ROUTE_MULTIPATH |
@@ -788,6 +813,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg) | |||
788 | if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) | 813 | if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) |
789 | goto err_inval; | 814 | goto err_inval; |
790 | goto link_it; | 815 | goto link_it; |
816 | } else { | ||
817 | switch (cfg->fc_type) { | ||
818 | case RTN_UNICAST: | ||
819 | case RTN_LOCAL: | ||
820 | case RTN_BROADCAST: | ||
821 | case RTN_ANYCAST: | ||
822 | case RTN_MULTICAST: | ||
823 | break; | ||
824 | default: | ||
825 | goto err_inval; | ||
826 | } | ||
791 | } | 827 | } |
792 | 828 | ||
793 | if (cfg->fc_scope > RT_SCOPE_HOST) | 829 | if (cfg->fc_scope > RT_SCOPE_HOST) |
@@ -806,7 +842,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) | |||
806 | goto failure; | 842 | goto failure; |
807 | } else { | 843 | } else { |
808 | change_nexthops(fi) { | 844 | change_nexthops(fi) { |
809 | if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0) | 845 | err = fib_check_nh(cfg, fi, nexthop_nh); |
846 | if (err != 0) | ||
810 | goto failure; | 847 | goto failure; |
811 | } endfor_nexthops(fi) | 848 | } endfor_nexthops(fi) |
812 | } | 849 | } |
@@ -818,8 +855,13 @@ struct fib_info *fib_create_info(struct fib_config *cfg) | |||
818 | goto err_inval; | 855 | goto err_inval; |
819 | } | 856 | } |
820 | 857 | ||
858 | change_nexthops(fi) { | ||
859 | fib_info_update_nh_saddr(net, nexthop_nh); | ||
860 | } endfor_nexthops(fi) | ||
861 | |||
821 | link_it: | 862 | link_it: |
822 | if ((ofi = fib_find_info(fi)) != NULL) { | 863 | ofi = fib_find_info(fi); |
864 | if (ofi) { | ||
823 | fi->fib_dead = 1; | 865 | fi->fib_dead = 1; |
824 | free_fib_info(fi); | 866 | free_fib_info(fi); |
825 | ofi->fib_treeref++; | 867 | ofi->fib_treeref++; |
@@ -862,86 +904,8 @@ failure: | |||
862 | return ERR_PTR(err); | 904 | return ERR_PTR(err); |
863 | } | 905 | } |
864 | 906 | ||
865 | /* Note! fib_semantic_match intentionally uses RCU list functions. */ | ||
866 | int fib_semantic_match(struct list_head *head, const struct flowi *flp, | ||
867 | struct fib_result *res, int prefixlen) | ||
868 | { | ||
869 | struct fib_alias *fa; | ||
870 | int nh_sel = 0; | ||
871 | |||
872 | list_for_each_entry_rcu(fa, head, fa_list) { | ||
873 | int err; | ||
874 | |||
875 | if (fa->fa_tos && | ||
876 | fa->fa_tos != flp->fl4_tos) | ||
877 | continue; | ||
878 | |||
879 | if (fa->fa_scope < flp->fl4_scope) | ||
880 | continue; | ||
881 | |||
882 | fa->fa_state |= FA_S_ACCESSED; | ||
883 | |||
884 | err = fib_props[fa->fa_type].error; | ||
885 | if (err == 0) { | ||
886 | struct fib_info *fi = fa->fa_info; | ||
887 | |||
888 | if (fi->fib_flags & RTNH_F_DEAD) | ||
889 | continue; | ||
890 | |||
891 | switch (fa->fa_type) { | ||
892 | case RTN_UNICAST: | ||
893 | case RTN_LOCAL: | ||
894 | case RTN_BROADCAST: | ||
895 | case RTN_ANYCAST: | ||
896 | case RTN_MULTICAST: | ||
897 | for_nexthops(fi) { | ||
898 | if (nh->nh_flags&RTNH_F_DEAD) | ||
899 | continue; | ||
900 | if (!flp->oif || flp->oif == nh->nh_oif) | ||
901 | break; | ||
902 | } | ||
903 | #ifdef CONFIG_IP_ROUTE_MULTIPATH | ||
904 | if (nhsel < fi->fib_nhs) { | ||
905 | nh_sel = nhsel; | ||
906 | goto out_fill_res; | ||
907 | } | ||
908 | #else | ||
909 | if (nhsel < 1) { | ||
910 | goto out_fill_res; | ||
911 | } | ||
912 | #endif | ||
913 | endfor_nexthops(fi); | ||
914 | continue; | ||
915 | |||
916 | default: | ||
917 | printk(KERN_WARNING "fib_semantic_match bad type %#x\n", | ||
918 | fa->fa_type); | ||
919 | return -EINVAL; | ||
920 | } | ||
921 | } | ||
922 | return err; | ||
923 | } | ||
924 | return 1; | ||
925 | |||
926 | out_fill_res: | ||
927 | res->prefixlen = prefixlen; | ||
928 | res->nh_sel = nh_sel; | ||
929 | res->type = fa->fa_type; | ||
930 | res->scope = fa->fa_scope; | ||
931 | res->fi = fa->fa_info; | ||
932 | atomic_inc(&res->fi->fib_clntref); | ||
933 | return 0; | ||
934 | } | ||
935 | |||
936 | /* Find appropriate source address to this destination */ | ||
937 | |||
938 | __be32 __fib_res_prefsrc(struct fib_result *res) | ||
939 | { | ||
940 | return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope); | ||
941 | } | ||
942 | |||
943 | int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, | 907 | int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, |
944 | u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos, | 908 | u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, |
945 | struct fib_info *fi, unsigned int flags) | 909 | struct fib_info *fi, unsigned int flags) |
946 | { | 910 | { |
947 | struct nlmsghdr *nlh; | 911 | struct nlmsghdr *nlh; |
@@ -963,7 +927,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, | |||
963 | NLA_PUT_U32(skb, RTA_TABLE, tb_id); | 927 | NLA_PUT_U32(skb, RTA_TABLE, tb_id); |
964 | rtm->rtm_type = type; | 928 | rtm->rtm_type = type; |
965 | rtm->rtm_flags = fi->fib_flags; | 929 | rtm->rtm_flags = fi->fib_flags; |
966 | rtm->rtm_scope = scope; | 930 | rtm->rtm_scope = fi->fib_scope; |
967 | rtm->rtm_protocol = fi->fib_protocol; | 931 | rtm->rtm_protocol = fi->fib_protocol; |
968 | 932 | ||
969 | if (rtm->rtm_dst_len) | 933 | if (rtm->rtm_dst_len) |
@@ -984,7 +948,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, | |||
984 | 948 | ||
985 | if (fi->fib_nh->nh_oif) | 949 | if (fi->fib_nh->nh_oif) |
986 | NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif); | 950 | NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif); |
987 | #ifdef CONFIG_NET_CLS_ROUTE | 951 | #ifdef CONFIG_IP_ROUTE_CLASSID |
988 | if (fi->fib_nh[0].nh_tclassid) | 952 | if (fi->fib_nh[0].nh_tclassid) |
989 | NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid); | 953 | NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid); |
990 | #endif | 954 | #endif |
@@ -1009,7 +973,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, | |||
1009 | 973 | ||
1010 | if (nh->nh_gw) | 974 | if (nh->nh_gw) |
1011 | NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw); | 975 | NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw); |
1012 | #ifdef CONFIG_NET_CLS_ROUTE | 976 | #ifdef CONFIG_IP_ROUTE_CLASSID |
1013 | if (nh->nh_tclassid) | 977 | if (nh->nh_tclassid) |
1014 | NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid); | 978 | NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid); |
1015 | #endif | 979 | #endif |
@@ -1028,10 +992,10 @@ nla_put_failure: | |||
1028 | } | 992 | } |
1029 | 993 | ||
1030 | /* | 994 | /* |
1031 | Update FIB if: | 995 | * Update FIB if: |
1032 | - local address disappeared -> we must delete all the entries | 996 | * - local address disappeared -> we must delete all the entries |
1033 | referring to it. | 997 | * referring to it. |
1034 | - device went down -> we must shutdown all nexthops going via it. | 998 | * - device went down -> we must shutdown all nexthops going via it. |
1035 | */ | 999 | */ |
1036 | int fib_sync_down_addr(struct net *net, __be32 local) | 1000 | int fib_sync_down_addr(struct net *net, __be32 local) |
1037 | { | 1001 | { |
@@ -1078,7 +1042,7 @@ int fib_sync_down_dev(struct net_device *dev, int force) | |||
1078 | prev_fi = fi; | 1042 | prev_fi = fi; |
1079 | dead = 0; | 1043 | dead = 0; |
1080 | change_nexthops(fi) { | 1044 | change_nexthops(fi) { |
1081 | if (nexthop_nh->nh_flags&RTNH_F_DEAD) | 1045 | if (nexthop_nh->nh_flags & RTNH_F_DEAD) |
1082 | dead++; | 1046 | dead++; |
1083 | else if (nexthop_nh->nh_dev == dev && | 1047 | else if (nexthop_nh->nh_dev == dev && |
1084 | nexthop_nh->nh_scope != scope) { | 1048 | nexthop_nh->nh_scope != scope) { |
@@ -1107,13 +1071,68 @@ int fib_sync_down_dev(struct net_device *dev, int force) | |||
1107 | return ret; | 1071 | return ret; |
1108 | } | 1072 | } |
1109 | 1073 | ||
1074 | /* Must be invoked inside of an RCU protected region. */ | ||
1075 | void fib_select_default(struct fib_result *res) | ||
1076 | { | ||
1077 | struct fib_info *fi = NULL, *last_resort = NULL; | ||
1078 | struct list_head *fa_head = res->fa_head; | ||
1079 | struct fib_table *tb = res->table; | ||
1080 | int order = -1, last_idx = -1; | ||
1081 | struct fib_alias *fa; | ||
1082 | |||
1083 | list_for_each_entry_rcu(fa, fa_head, fa_list) { | ||
1084 | struct fib_info *next_fi = fa->fa_info; | ||
1085 | |||
1086 | if (next_fi->fib_scope != res->scope || | ||
1087 | fa->fa_type != RTN_UNICAST) | ||
1088 | continue; | ||
1089 | |||
1090 | if (next_fi->fib_priority > res->fi->fib_priority) | ||
1091 | break; | ||
1092 | if (!next_fi->fib_nh[0].nh_gw || | ||
1093 | next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) | ||
1094 | continue; | ||
1095 | |||
1096 | fib_alias_accessed(fa); | ||
1097 | |||
1098 | if (fi == NULL) { | ||
1099 | if (next_fi != res->fi) | ||
1100 | break; | ||
1101 | } else if (!fib_detect_death(fi, order, &last_resort, | ||
1102 | &last_idx, tb->tb_default)) { | ||
1103 | fib_result_assign(res, fi); | ||
1104 | tb->tb_default = order; | ||
1105 | goto out; | ||
1106 | } | ||
1107 | fi = next_fi; | ||
1108 | order++; | ||
1109 | } | ||
1110 | |||
1111 | if (order <= 0 || fi == NULL) { | ||
1112 | tb->tb_default = -1; | ||
1113 | goto out; | ||
1114 | } | ||
1115 | |||
1116 | if (!fib_detect_death(fi, order, &last_resort, &last_idx, | ||
1117 | tb->tb_default)) { | ||
1118 | fib_result_assign(res, fi); | ||
1119 | tb->tb_default = order; | ||
1120 | goto out; | ||
1121 | } | ||
1122 | |||
1123 | if (last_idx >= 0) | ||
1124 | fib_result_assign(res, last_resort); | ||
1125 | tb->tb_default = last_idx; | ||
1126 | out: | ||
1127 | return; | ||
1128 | } | ||
1129 | |||
1110 | #ifdef CONFIG_IP_ROUTE_MULTIPATH | 1130 | #ifdef CONFIG_IP_ROUTE_MULTIPATH |
1111 | 1131 | ||
1112 | /* | 1132 | /* |
1113 | Dead device goes up. We wake up dead nexthops. | 1133 | * Dead device goes up. We wake up dead nexthops. |
1114 | It takes sense only on multipath routes. | 1134 | * It takes sense only on multipath routes. |
1115 | */ | 1135 | */ |
1116 | |||
1117 | int fib_sync_up(struct net_device *dev) | 1136 | int fib_sync_up(struct net_device *dev) |
1118 | { | 1137 | { |
1119 | struct fib_info *prev_fi; | 1138 | struct fib_info *prev_fi; |
@@ -1123,7 +1142,7 @@ int fib_sync_up(struct net_device *dev) | |||
1123 | struct fib_nh *nh; | 1142 | struct fib_nh *nh; |
1124 | int ret; | 1143 | int ret; |
1125 | 1144 | ||
1126 | if (!(dev->flags&IFF_UP)) | 1145 | if (!(dev->flags & IFF_UP)) |
1127 | return 0; | 1146 | return 0; |
1128 | 1147 | ||
1129 | prev_fi = NULL; | 1148 | prev_fi = NULL; |
@@ -1142,12 +1161,12 @@ int fib_sync_up(struct net_device *dev) | |||
1142 | prev_fi = fi; | 1161 | prev_fi = fi; |
1143 | alive = 0; | 1162 | alive = 0; |
1144 | change_nexthops(fi) { | 1163 | change_nexthops(fi) { |
1145 | if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) { | 1164 | if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { |
1146 | alive++; | 1165 | alive++; |
1147 | continue; | 1166 | continue; |
1148 | } | 1167 | } |
1149 | if (nexthop_nh->nh_dev == NULL || | 1168 | if (nexthop_nh->nh_dev == NULL || |
1150 | !(nexthop_nh->nh_dev->flags&IFF_UP)) | 1169 | !(nexthop_nh->nh_dev->flags & IFF_UP)) |
1151 | continue; | 1170 | continue; |
1152 | if (nexthop_nh->nh_dev != dev || | 1171 | if (nexthop_nh->nh_dev != dev || |
1153 | !__in_dev_get_rtnl(dev)) | 1172 | !__in_dev_get_rtnl(dev)) |
@@ -1169,11 +1188,10 @@ int fib_sync_up(struct net_device *dev) | |||
1169 | } | 1188 | } |
1170 | 1189 | ||
1171 | /* | 1190 | /* |
1172 | The algorithm is suboptimal, but it provides really | 1191 | * The algorithm is suboptimal, but it provides really |
1173 | fair weighted route distribution. | 1192 | * fair weighted route distribution. |
1174 | */ | 1193 | */ |
1175 | 1194 | void fib_select_multipath(struct fib_result *res) | |
1176 | void fib_select_multipath(const struct flowi *flp, struct fib_result *res) | ||
1177 | { | 1195 | { |
1178 | struct fib_info *fi = res->fi; | 1196 | struct fib_info *fi = res->fi; |
1179 | int w; | 1197 | int w; |
@@ -1182,7 +1200,7 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res) | |||
1182 | if (fi->fib_power <= 0) { | 1200 | if (fi->fib_power <= 0) { |
1183 | int power = 0; | 1201 | int power = 0; |
1184 | change_nexthops(fi) { | 1202 | change_nexthops(fi) { |
1185 | if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) { | 1203 | if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { |
1186 | power += nexthop_nh->nh_weight; | 1204 | power += nexthop_nh->nh_weight; |
1187 | nexthop_nh->nh_power = nexthop_nh->nh_weight; | 1205 | nexthop_nh->nh_power = nexthop_nh->nh_weight; |
1188 | } | 1206 | } |
@@ -1198,15 +1216,16 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res) | |||
1198 | 1216 | ||
1199 | 1217 | ||
1200 | /* w should be random number [0..fi->fib_power-1], | 1218 | /* w should be random number [0..fi->fib_power-1], |
1201 | it is pretty bad approximation. | 1219 | * it is pretty bad approximation. |
1202 | */ | 1220 | */ |
1203 | 1221 | ||
1204 | w = jiffies % fi->fib_power; | 1222 | w = jiffies % fi->fib_power; |
1205 | 1223 | ||
1206 | change_nexthops(fi) { | 1224 | change_nexthops(fi) { |
1207 | if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) && | 1225 | if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) && |
1208 | nexthop_nh->nh_power) { | 1226 | nexthop_nh->nh_power) { |
1209 | if ((w -= nexthop_nh->nh_power) <= 0) { | 1227 | w -= nexthop_nh->nh_power; |
1228 | if (w <= 0) { | ||
1210 | nexthop_nh->nh_power--; | 1229 | nexthop_nh->nh_power--; |
1211 | fi->fib_power--; | 1230 | fi->fib_power--; |
1212 | res->nh_sel = nhsel; | 1231 | res->nh_sel = nhsel; |
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 4a8e370862bc..58c25ea5a5c1 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c | |||
@@ -12,11 +12,11 @@ | |||
12 | * | 12 | * |
13 | * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet | 13 | * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet |
14 | * | 14 | * |
15 | * This work is based on the LPC-trie which is originally descibed in: | 15 | * This work is based on the LPC-trie which is originally described in: |
16 | * | 16 | * |
17 | * An experimental study of compression methods for dynamic tries | 17 | * An experimental study of compression methods for dynamic tries |
18 | * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. | 18 | * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. |
19 | * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ | 19 | * http://www.csc.kth.se/~snilsson/software/dyntrie2/ |
20 | * | 20 | * |
21 | * | 21 | * |
22 | * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson | 22 | * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson |
@@ -72,6 +72,7 @@ | |||
72 | #include <linux/init.h> | 72 | #include <linux/init.h> |
73 | #include <linux/list.h> | 73 | #include <linux/list.h> |
74 | #include <linux/slab.h> | 74 | #include <linux/slab.h> |
75 | #include <linux/prefetch.h> | ||
75 | #include <net/net_namespace.h> | 76 | #include <net/net_namespace.h> |
76 | #include <net/ip.h> | 77 | #include <net/ip.h> |
77 | #include <net/protocol.h> | 78 | #include <net/protocol.h> |
@@ -95,7 +96,7 @@ typedef unsigned int t_key; | |||
95 | #define IS_TNODE(n) (!(n->parent & T_LEAF)) | 96 | #define IS_TNODE(n) (!(n->parent & T_LEAF)) |
96 | #define IS_LEAF(n) (n->parent & T_LEAF) | 97 | #define IS_LEAF(n) (n->parent & T_LEAF) |
97 | 98 | ||
98 | struct node { | 99 | struct rt_trie_node { |
99 | unsigned long parent; | 100 | unsigned long parent; |
100 | t_key key; | 101 | t_key key; |
101 | }; | 102 | }; |
@@ -126,7 +127,7 @@ struct tnode { | |||
126 | struct work_struct work; | 127 | struct work_struct work; |
127 | struct tnode *tnode_free; | 128 | struct tnode *tnode_free; |
128 | }; | 129 | }; |
129 | struct node *child[0]; | 130 | struct rt_trie_node __rcu *child[0]; |
130 | }; | 131 | }; |
131 | 132 | ||
132 | #ifdef CONFIG_IP_FIB_TRIE_STATS | 133 | #ifdef CONFIG_IP_FIB_TRIE_STATS |
@@ -151,16 +152,16 @@ struct trie_stat { | |||
151 | }; | 152 | }; |
152 | 153 | ||
153 | struct trie { | 154 | struct trie { |
154 | struct node *trie; | 155 | struct rt_trie_node __rcu *trie; |
155 | #ifdef CONFIG_IP_FIB_TRIE_STATS | 156 | #ifdef CONFIG_IP_FIB_TRIE_STATS |
156 | struct trie_use_stats stats; | 157 | struct trie_use_stats stats; |
157 | #endif | 158 | #endif |
158 | }; | 159 | }; |
159 | 160 | ||
160 | static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); | 161 | static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n); |
161 | static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, | 162 | static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, |
162 | int wasfull); | 163 | int wasfull); |
163 | static struct node *resize(struct trie *t, struct tnode *tn); | 164 | static struct rt_trie_node *resize(struct trie *t, struct tnode *tn); |
164 | static struct tnode *inflate(struct trie *t, struct tnode *tn); | 165 | static struct tnode *inflate(struct trie *t, struct tnode *tn); |
165 | static struct tnode *halve(struct trie *t, struct tnode *tn); | 166 | static struct tnode *halve(struct trie *t, struct tnode *tn); |
166 | /* tnodes to free after resize(); protected by RTNL */ | 167 | /* tnodes to free after resize(); protected by RTNL */ |
@@ -177,43 +178,58 @@ static const int sync_pages = 128; | |||
177 | static struct kmem_cache *fn_alias_kmem __read_mostly; | 178 | static struct kmem_cache *fn_alias_kmem __read_mostly; |
178 | static struct kmem_cache *trie_leaf_kmem __read_mostly; | 179 | static struct kmem_cache *trie_leaf_kmem __read_mostly; |
179 | 180 | ||
180 | static inline struct tnode *node_parent(struct node *node) | 181 | /* |
182 | * caller must hold RTNL | ||
183 | */ | ||
184 | static inline struct tnode *node_parent(const struct rt_trie_node *node) | ||
181 | { | 185 | { |
182 | return (struct tnode *)(node->parent & ~NODE_TYPE_MASK); | 186 | unsigned long parent; |
187 | |||
188 | parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held()); | ||
189 | |||
190 | return (struct tnode *)(parent & ~NODE_TYPE_MASK); | ||
183 | } | 191 | } |
184 | 192 | ||
185 | static inline struct tnode *node_parent_rcu(struct node *node) | 193 | /* |
194 | * caller must hold RCU read lock or RTNL | ||
195 | */ | ||
196 | static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node) | ||
186 | { | 197 | { |
187 | struct tnode *ret = node_parent(node); | 198 | unsigned long parent; |
199 | |||
200 | parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() || | ||
201 | lockdep_rtnl_is_held()); | ||
188 | 202 | ||
189 | return rcu_dereference_check(ret, | 203 | return (struct tnode *)(parent & ~NODE_TYPE_MASK); |
190 | rcu_read_lock_held() || | ||
191 | lockdep_rtnl_is_held()); | ||
192 | } | 204 | } |
193 | 205 | ||
194 | /* Same as rcu_assign_pointer | 206 | /* Same as rcu_assign_pointer |
195 | * but that macro() assumes that value is a pointer. | 207 | * but that macro() assumes that value is a pointer. |
196 | */ | 208 | */ |
197 | static inline void node_set_parent(struct node *node, struct tnode *ptr) | 209 | static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr) |
198 | { | 210 | { |
199 | smp_wmb(); | 211 | smp_wmb(); |
200 | node->parent = (unsigned long)ptr | NODE_TYPE(node); | 212 | node->parent = (unsigned long)ptr | NODE_TYPE(node); |
201 | } | 213 | } |
202 | 214 | ||
203 | static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i) | 215 | /* |
216 | * caller must hold RTNL | ||
217 | */ | ||
218 | static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i) | ||
204 | { | 219 | { |
205 | BUG_ON(i >= 1U << tn->bits); | 220 | BUG_ON(i >= 1U << tn->bits); |
206 | 221 | ||
207 | return tn->child[i]; | 222 | return rtnl_dereference(tn->child[i]); |
208 | } | 223 | } |
209 | 224 | ||
210 | static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) | 225 | /* |
226 | * caller must hold RCU read lock or RTNL | ||
227 | */ | ||
228 | static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i) | ||
211 | { | 229 | { |
212 | struct node *ret = tnode_get_child(tn, i); | 230 | BUG_ON(i >= 1U << tn->bits); |
213 | 231 | ||
214 | return rcu_dereference_check(ret, | 232 | return rcu_dereference_rtnl(tn->child[i]); |
215 | rcu_read_lock_held() || | ||
216 | lockdep_rtnl_is_held()); | ||
217 | } | 233 | } |
218 | 234 | ||
219 | static inline int tnode_child_length(const struct tnode *tn) | 235 | static inline int tnode_child_length(const struct tnode *tn) |
@@ -221,12 +237,12 @@ static inline int tnode_child_length(const struct tnode *tn) | |||
221 | return 1 << tn->bits; | 237 | return 1 << tn->bits; |
222 | } | 238 | } |
223 | 239 | ||
224 | static inline t_key mask_pfx(t_key k, unsigned short l) | 240 | static inline t_key mask_pfx(t_key k, unsigned int l) |
225 | { | 241 | { |
226 | return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l); | 242 | return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l); |
227 | } | 243 | } |
228 | 244 | ||
229 | static inline t_key tkey_extract_bits(t_key a, int offset, int bits) | 245 | static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits) |
230 | { | 246 | { |
231 | if (offset < KEYLENGTH) | 247 | if (offset < KEYLENGTH) |
232 | return ((t_key)(a << offset)) >> (KEYLENGTH - bits); | 248 | return ((t_key)(a << offset)) >> (KEYLENGTH - bits); |
@@ -354,14 +370,9 @@ static inline void free_leaf(struct leaf *l) | |||
354 | call_rcu_bh(&l->rcu, __leaf_free_rcu); | 370 | call_rcu_bh(&l->rcu, __leaf_free_rcu); |
355 | } | 371 | } |
356 | 372 | ||
357 | static void __leaf_info_free_rcu(struct rcu_head *head) | ||
358 | { | ||
359 | kfree(container_of(head, struct leaf_info, rcu)); | ||
360 | } | ||
361 | |||
362 | static inline void free_leaf_info(struct leaf_info *leaf) | 373 | static inline void free_leaf_info(struct leaf_info *leaf) |
363 | { | 374 | { |
364 | call_rcu(&leaf->rcu, __leaf_info_free_rcu); | 375 | kfree_rcu(leaf, rcu); |
365 | } | 376 | } |
366 | 377 | ||
367 | static struct tnode *tnode_alloc(size_t size) | 378 | static struct tnode *tnode_alloc(size_t size) |
@@ -369,7 +380,7 @@ static struct tnode *tnode_alloc(size_t size) | |||
369 | if (size <= PAGE_SIZE) | 380 | if (size <= PAGE_SIZE) |
370 | return kzalloc(size, GFP_KERNEL); | 381 | return kzalloc(size, GFP_KERNEL); |
371 | else | 382 | else |
372 | return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); | 383 | return vzalloc(size); |
373 | } | 384 | } |
374 | 385 | ||
375 | static void __tnode_vfree(struct work_struct *arg) | 386 | static void __tnode_vfree(struct work_struct *arg) |
@@ -382,7 +393,7 @@ static void __tnode_free_rcu(struct rcu_head *head) | |||
382 | { | 393 | { |
383 | struct tnode *tn = container_of(head, struct tnode, rcu); | 394 | struct tnode *tn = container_of(head, struct tnode, rcu); |
384 | size_t size = sizeof(struct tnode) + | 395 | size_t size = sizeof(struct tnode) + |
385 | (sizeof(struct node *) << tn->bits); | 396 | (sizeof(struct rt_trie_node *) << tn->bits); |
386 | 397 | ||
387 | if (size <= PAGE_SIZE) | 398 | if (size <= PAGE_SIZE) |
388 | kfree(tn); | 399 | kfree(tn); |
@@ -406,7 +417,7 @@ static void tnode_free_safe(struct tnode *tn) | |||
406 | tn->tnode_free = tnode_free_head; | 417 | tn->tnode_free = tnode_free_head; |
407 | tnode_free_head = tn; | 418 | tnode_free_head = tn; |
408 | tnode_free_size += sizeof(struct tnode) + | 419 | tnode_free_size += sizeof(struct tnode) + |
409 | (sizeof(struct node *) << tn->bits); | 420 | (sizeof(struct rt_trie_node *) << tn->bits); |
410 | } | 421 | } |
411 | 422 | ||
412 | static void tnode_free_flush(void) | 423 | static void tnode_free_flush(void) |
@@ -447,7 +458,7 @@ static struct leaf_info *leaf_info_new(int plen) | |||
447 | 458 | ||
448 | static struct tnode *tnode_new(t_key key, int pos, int bits) | 459 | static struct tnode *tnode_new(t_key key, int pos, int bits) |
449 | { | 460 | { |
450 | size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits); | 461 | size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits); |
451 | struct tnode *tn = tnode_alloc(sz); | 462 | struct tnode *tn = tnode_alloc(sz); |
452 | 463 | ||
453 | if (tn) { | 464 | if (tn) { |
@@ -459,8 +470,8 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) | |||
459 | tn->empty_children = 1<<bits; | 470 | tn->empty_children = 1<<bits; |
460 | } | 471 | } |
461 | 472 | ||
462 | pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode), | 473 | pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), |
463 | (unsigned long) (sizeof(struct node) << bits)); | 474 | sizeof(struct rt_trie_node) << bits); |
464 | return tn; | 475 | return tn; |
465 | } | 476 | } |
466 | 477 | ||
@@ -469,7 +480,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) | |||
469 | * and no bits are skipped. See discussion in dyntree paper p. 6 | 480 | * and no bits are skipped. See discussion in dyntree paper p. 6 |
470 | */ | 481 | */ |
471 | 482 | ||
472 | static inline int tnode_full(const struct tnode *tn, const struct node *n) | 483 | static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n) |
473 | { | 484 | { |
474 | if (n == NULL || IS_LEAF(n)) | 485 | if (n == NULL || IS_LEAF(n)) |
475 | return 0; | 486 | return 0; |
@@ -478,7 +489,7 @@ static inline int tnode_full(const struct tnode *tn, const struct node *n) | |||
478 | } | 489 | } |
479 | 490 | ||
480 | static inline void put_child(struct trie *t, struct tnode *tn, int i, | 491 | static inline void put_child(struct trie *t, struct tnode *tn, int i, |
481 | struct node *n) | 492 | struct rt_trie_node *n) |
482 | { | 493 | { |
483 | tnode_put_child_reorg(tn, i, n, -1); | 494 | tnode_put_child_reorg(tn, i, n, -1); |
484 | } | 495 | } |
@@ -488,10 +499,10 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, | |||
488 | * Update the value of full_children and empty_children. | 499 | * Update the value of full_children and empty_children. |
489 | */ | 500 | */ |
490 | 501 | ||
491 | static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, | 502 | static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, |
492 | int wasfull) | 503 | int wasfull) |
493 | { | 504 | { |
494 | struct node *chi = tn->child[i]; | 505 | struct rt_trie_node *chi = rtnl_dereference(tn->child[i]); |
495 | int isfull; | 506 | int isfull; |
496 | 507 | ||
497 | BUG_ON(i >= 1<<tn->bits); | 508 | BUG_ON(i >= 1<<tn->bits); |
@@ -519,7 +530,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, | |||
519 | } | 530 | } |
520 | 531 | ||
521 | #define MAX_WORK 10 | 532 | #define MAX_WORK 10 |
522 | static struct node *resize(struct trie *t, struct tnode *tn) | 533 | static struct rt_trie_node *resize(struct trie *t, struct tnode *tn) |
523 | { | 534 | { |
524 | int i; | 535 | int i; |
525 | struct tnode *old_tn; | 536 | struct tnode *old_tn; |
@@ -609,11 +620,10 @@ static struct node *resize(struct trie *t, struct tnode *tn) | |||
609 | 620 | ||
610 | /* Keep root node larger */ | 621 | /* Keep root node larger */ |
611 | 622 | ||
612 | if (!node_parent((struct node*) tn)) { | 623 | if (!node_parent((struct rt_trie_node *)tn)) { |
613 | inflate_threshold_use = inflate_threshold_root; | 624 | inflate_threshold_use = inflate_threshold_root; |
614 | halve_threshold_use = halve_threshold_root; | 625 | halve_threshold_use = halve_threshold_root; |
615 | } | 626 | } else { |
616 | else { | ||
617 | inflate_threshold_use = inflate_threshold; | 627 | inflate_threshold_use = inflate_threshold; |
618 | halve_threshold_use = halve_threshold; | 628 | halve_threshold_use = halve_threshold; |
619 | } | 629 | } |
@@ -639,8 +649,8 @@ static struct node *resize(struct trie *t, struct tnode *tn) | |||
639 | check_tnode(tn); | 649 | check_tnode(tn); |
640 | 650 | ||
641 | /* Return if at least one inflate is run */ | 651 | /* Return if at least one inflate is run */ |
642 | if( max_work != MAX_WORK) | 652 | if (max_work != MAX_WORK) |
643 | return (struct node *) tn; | 653 | return (struct rt_trie_node *) tn; |
644 | 654 | ||
645 | /* | 655 | /* |
646 | * Halve as long as the number of empty children in this | 656 | * Halve as long as the number of empty children in this |
@@ -668,9 +678,9 @@ static struct node *resize(struct trie *t, struct tnode *tn) | |||
668 | if (tn->empty_children == tnode_child_length(tn) - 1) { | 678 | if (tn->empty_children == tnode_child_length(tn) - 1) { |
669 | one_child: | 679 | one_child: |
670 | for (i = 0; i < tnode_child_length(tn); i++) { | 680 | for (i = 0; i < tnode_child_length(tn); i++) { |
671 | struct node *n; | 681 | struct rt_trie_node *n; |
672 | 682 | ||
673 | n = tn->child[i]; | 683 | n = rtnl_dereference(tn->child[i]); |
674 | if (!n) | 684 | if (!n) |
675 | continue; | 685 | continue; |
676 | 686 | ||
@@ -681,7 +691,21 @@ one_child: | |||
681 | return n; | 691 | return n; |
682 | } | 692 | } |
683 | } | 693 | } |
684 | return (struct node *) tn; | 694 | return (struct rt_trie_node *) tn; |
695 | } | ||
696 | |||
697 | |||
698 | static void tnode_clean_free(struct tnode *tn) | ||
699 | { | ||
700 | int i; | ||
701 | struct tnode *tofree; | ||
702 | |||
703 | for (i = 0; i < tnode_child_length(tn); i++) { | ||
704 | tofree = (struct tnode *)rtnl_dereference(tn->child[i]); | ||
705 | if (tofree) | ||
706 | tnode_free(tofree); | ||
707 | } | ||
708 | tnode_free(tn); | ||
685 | } | 709 | } |
686 | 710 | ||
687 | static struct tnode *inflate(struct trie *t, struct tnode *tn) | 711 | static struct tnode *inflate(struct trie *t, struct tnode *tn) |
@@ -728,14 +752,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) | |||
728 | goto nomem; | 752 | goto nomem; |
729 | } | 753 | } |
730 | 754 | ||
731 | put_child(t, tn, 2*i, (struct node *) left); | 755 | put_child(t, tn, 2*i, (struct rt_trie_node *) left); |
732 | put_child(t, tn, 2*i+1, (struct node *) right); | 756 | put_child(t, tn, 2*i+1, (struct rt_trie_node *) right); |
733 | } | 757 | } |
734 | } | 758 | } |
735 | 759 | ||
736 | for (i = 0; i < olen; i++) { | 760 | for (i = 0; i < olen; i++) { |
737 | struct tnode *inode; | 761 | struct tnode *inode; |
738 | struct node *node = tnode_get_child(oldtnode, i); | 762 | struct rt_trie_node *node = tnode_get_child(oldtnode, i); |
739 | struct tnode *left, *right; | 763 | struct tnode *left, *right; |
740 | int size, j; | 764 | int size, j; |
741 | 765 | ||
@@ -760,8 +784,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) | |||
760 | inode = (struct tnode *) node; | 784 | inode = (struct tnode *) node; |
761 | 785 | ||
762 | if (inode->bits == 1) { | 786 | if (inode->bits == 1) { |
763 | put_child(t, tn, 2*i, inode->child[0]); | 787 | put_child(t, tn, 2*i, rtnl_dereference(inode->child[0])); |
764 | put_child(t, tn, 2*i+1, inode->child[1]); | 788 | put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1])); |
765 | 789 | ||
766 | tnode_free_safe(inode); | 790 | tnode_free_safe(inode); |
767 | continue; | 791 | continue; |
@@ -802,8 +826,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) | |||
802 | 826 | ||
803 | size = tnode_child_length(left); | 827 | size = tnode_child_length(left); |
804 | for (j = 0; j < size; j++) { | 828 | for (j = 0; j < size; j++) { |
805 | put_child(t, left, j, inode->child[j]); | 829 | put_child(t, left, j, rtnl_dereference(inode->child[j])); |
806 | put_child(t, right, j, inode->child[j + size]); | 830 | put_child(t, right, j, rtnl_dereference(inode->child[j + size])); |
807 | } | 831 | } |
808 | put_child(t, tn, 2*i, resize(t, left)); | 832 | put_child(t, tn, 2*i, resize(t, left)); |
809 | put_child(t, tn, 2*i+1, resize(t, right)); | 833 | put_child(t, tn, 2*i+1, resize(t, right)); |
@@ -813,24 +837,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) | |||
813 | tnode_free_safe(oldtnode); | 837 | tnode_free_safe(oldtnode); |
814 | return tn; | 838 | return tn; |
815 | nomem: | 839 | nomem: |
816 | { | 840 | tnode_clean_free(tn); |
817 | int size = tnode_child_length(tn); | 841 | return ERR_PTR(-ENOMEM); |
818 | int j; | ||
819 | |||
820 | for (j = 0; j < size; j++) | ||
821 | if (tn->child[j]) | ||
822 | tnode_free((struct tnode *)tn->child[j]); | ||
823 | |||
824 | tnode_free(tn); | ||
825 | |||
826 | return ERR_PTR(-ENOMEM); | ||
827 | } | ||
828 | } | 842 | } |
829 | 843 | ||
830 | static struct tnode *halve(struct trie *t, struct tnode *tn) | 844 | static struct tnode *halve(struct trie *t, struct tnode *tn) |
831 | { | 845 | { |
832 | struct tnode *oldtnode = tn; | 846 | struct tnode *oldtnode = tn; |
833 | struct node *left, *right; | 847 | struct rt_trie_node *left, *right; |
834 | int i; | 848 | int i; |
835 | int olen = tnode_child_length(tn); | 849 | int olen = tnode_child_length(tn); |
836 | 850 | ||
@@ -861,7 +875,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) | |||
861 | if (!newn) | 875 | if (!newn) |
862 | goto nomem; | 876 | goto nomem; |
863 | 877 | ||
864 | put_child(t, tn, i/2, (struct node *)newn); | 878 | put_child(t, tn, i/2, (struct rt_trie_node *)newn); |
865 | } | 879 | } |
866 | 880 | ||
867 | } | 881 | } |
@@ -895,18 +909,8 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) | |||
895 | tnode_free_safe(oldtnode); | 909 | tnode_free_safe(oldtnode); |
896 | return tn; | 910 | return tn; |
897 | nomem: | 911 | nomem: |
898 | { | 912 | tnode_clean_free(tn); |
899 | int size = tnode_child_length(tn); | 913 | return ERR_PTR(-ENOMEM); |
900 | int j; | ||
901 | |||
902 | for (j = 0; j < size; j++) | ||
903 | if (tn->child[j]) | ||
904 | tnode_free((struct tnode *)tn->child[j]); | ||
905 | |||
906 | tnode_free(tn); | ||
907 | |||
908 | return ERR_PTR(-ENOMEM); | ||
909 | } | ||
910 | } | 914 | } |
911 | 915 | ||
912 | /* readside must use rcu_read_lock currently dump routines | 916 | /* readside must use rcu_read_lock currently dump routines |
@@ -963,12 +967,10 @@ fib_find_node(struct trie *t, u32 key) | |||
963 | { | 967 | { |
964 | int pos; | 968 | int pos; |
965 | struct tnode *tn; | 969 | struct tnode *tn; |
966 | struct node *n; | 970 | struct rt_trie_node *n; |
967 | 971 | ||
968 | pos = 0; | 972 | pos = 0; |
969 | n = rcu_dereference_check(t->trie, | 973 | n = rcu_dereference_rtnl(t->trie); |
970 | rcu_read_lock_held() || | ||
971 | lockdep_rtnl_is_held()); | ||
972 | 974 | ||
973 | while (n != NULL && NODE_TYPE(n) == T_TNODE) { | 975 | while (n != NULL && NODE_TYPE(n) == T_TNODE) { |
974 | tn = (struct tnode *) n; | 976 | tn = (struct tnode *) n; |
@@ -1000,17 +1002,17 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) | |||
1000 | 1002 | ||
1001 | key = tn->key; | 1003 | key = tn->key; |
1002 | 1004 | ||
1003 | while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { | 1005 | while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) { |
1004 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); | 1006 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); |
1005 | wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); | 1007 | wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); |
1006 | tn = (struct tnode *) resize(t, (struct tnode *)tn); | 1008 | tn = (struct tnode *) resize(t, (struct tnode *)tn); |
1007 | 1009 | ||
1008 | tnode_put_child_reorg((struct tnode *)tp, cindex, | 1010 | tnode_put_child_reorg((struct tnode *)tp, cindex, |
1009 | (struct node *)tn, wasfull); | 1011 | (struct rt_trie_node *)tn, wasfull); |
1010 | 1012 | ||
1011 | tp = node_parent((struct node *) tn); | 1013 | tp = node_parent((struct rt_trie_node *) tn); |
1012 | if (!tp) | 1014 | if (!tp) |
1013 | rcu_assign_pointer(t->trie, (struct node *)tn); | 1015 | rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); |
1014 | 1016 | ||
1015 | tnode_free_flush(); | 1017 | tnode_free_flush(); |
1016 | if (!tp) | 1018 | if (!tp) |
@@ -1022,7 +1024,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) | |||
1022 | if (IS_TNODE(tn)) | 1024 | if (IS_TNODE(tn)) |
1023 | tn = (struct tnode *)resize(t, (struct tnode *)tn); | 1025 | tn = (struct tnode *)resize(t, (struct tnode *)tn); |
1024 | 1026 | ||
1025 | rcu_assign_pointer(t->trie, (struct node *)tn); | 1027 | rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); |
1026 | tnode_free_flush(); | 1028 | tnode_free_flush(); |
1027 | } | 1029 | } |
1028 | 1030 | ||
@@ -1032,7 +1034,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) | |||
1032 | { | 1034 | { |
1033 | int pos, newpos; | 1035 | int pos, newpos; |
1034 | struct tnode *tp = NULL, *tn = NULL; | 1036 | struct tnode *tp = NULL, *tn = NULL; |
1035 | struct node *n; | 1037 | struct rt_trie_node *n; |
1036 | struct leaf *l; | 1038 | struct leaf *l; |
1037 | int missbit; | 1039 | int missbit; |
1038 | struct list_head *fa_head = NULL; | 1040 | struct list_head *fa_head = NULL; |
@@ -1040,7 +1042,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) | |||
1040 | t_key cindex; | 1042 | t_key cindex; |
1041 | 1043 | ||
1042 | pos = 0; | 1044 | pos = 0; |
1043 | n = t->trie; | 1045 | n = rtnl_dereference(t->trie); |
1044 | 1046 | ||
1045 | /* If we point to NULL, stop. Either the tree is empty and we should | 1047 | /* If we point to NULL, stop. Either the tree is empty and we should |
1046 | * just put a new leaf in if, or we have reached an empty child slot, | 1048 | * just put a new leaf in if, or we have reached an empty child slot, |
@@ -1118,10 +1120,10 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) | |||
1118 | if (t->trie && n == NULL) { | 1120 | if (t->trie && n == NULL) { |
1119 | /* Case 2: n is NULL, and will just insert a new leaf */ | 1121 | /* Case 2: n is NULL, and will just insert a new leaf */ |
1120 | 1122 | ||
1121 | node_set_parent((struct node *)l, tp); | 1123 | node_set_parent((struct rt_trie_node *)l, tp); |
1122 | 1124 | ||
1123 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); | 1125 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); |
1124 | put_child(t, (struct tnode *)tp, cindex, (struct node *)l); | 1126 | put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l); |
1125 | } else { | 1127 | } else { |
1126 | /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ | 1128 | /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ |
1127 | /* | 1129 | /* |
@@ -1148,18 +1150,18 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) | |||
1148 | return NULL; | 1150 | return NULL; |
1149 | } | 1151 | } |
1150 | 1152 | ||
1151 | node_set_parent((struct node *)tn, tp); | 1153 | node_set_parent((struct rt_trie_node *)tn, tp); |
1152 | 1154 | ||
1153 | missbit = tkey_extract_bits(key, newpos, 1); | 1155 | missbit = tkey_extract_bits(key, newpos, 1); |
1154 | put_child(t, tn, missbit, (struct node *)l); | 1156 | put_child(t, tn, missbit, (struct rt_trie_node *)l); |
1155 | put_child(t, tn, 1-missbit, n); | 1157 | put_child(t, tn, 1-missbit, n); |
1156 | 1158 | ||
1157 | if (tp) { | 1159 | if (tp) { |
1158 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); | 1160 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); |
1159 | put_child(t, (struct tnode *)tp, cindex, | 1161 | put_child(t, (struct tnode *)tp, cindex, |
1160 | (struct node *)tn); | 1162 | (struct rt_trie_node *)tn); |
1161 | } else { | 1163 | } else { |
1162 | rcu_assign_pointer(t->trie, (struct node *)tn); | 1164 | rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); |
1163 | tp = tn; | 1165 | tp = tn; |
1164 | } | 1166 | } |
1165 | } | 1167 | } |
@@ -1252,7 +1254,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) | |||
1252 | if (fa->fa_info->fib_priority != fi->fib_priority) | 1254 | if (fa->fa_info->fib_priority != fi->fib_priority) |
1253 | break; | 1255 | break; |
1254 | if (fa->fa_type == cfg->fc_type && | 1256 | if (fa->fa_type == cfg->fc_type && |
1255 | fa->fa_scope == cfg->fc_scope && | ||
1256 | fa->fa_info == fi) { | 1257 | fa->fa_info == fi) { |
1257 | fa_match = fa; | 1258 | fa_match = fa; |
1258 | break; | 1259 | break; |
@@ -1278,7 +1279,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) | |||
1278 | new_fa->fa_tos = fa->fa_tos; | 1279 | new_fa->fa_tos = fa->fa_tos; |
1279 | new_fa->fa_info = fi; | 1280 | new_fa->fa_info = fi; |
1280 | new_fa->fa_type = cfg->fc_type; | 1281 | new_fa->fa_type = cfg->fc_type; |
1281 | new_fa->fa_scope = cfg->fc_scope; | ||
1282 | state = fa->fa_state; | 1282 | state = fa->fa_state; |
1283 | new_fa->fa_state = state & ~FA_S_ACCESSED; | 1283 | new_fa->fa_state = state & ~FA_S_ACCESSED; |
1284 | 1284 | ||
@@ -1315,7 +1315,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) | |||
1315 | new_fa->fa_info = fi; | 1315 | new_fa->fa_info = fi; |
1316 | new_fa->fa_tos = tos; | 1316 | new_fa->fa_tos = tos; |
1317 | new_fa->fa_type = cfg->fc_type; | 1317 | new_fa->fa_type = cfg->fc_type; |
1318 | new_fa->fa_scope = cfg->fc_scope; | ||
1319 | new_fa->fa_state = 0; | 1318 | new_fa->fa_state = 0; |
1320 | /* | 1319 | /* |
1321 | * Insert new entry to the list. | 1320 | * Insert new entry to the list. |
@@ -1329,6 +1328,9 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) | |||
1329 | } | 1328 | } |
1330 | } | 1329 | } |
1331 | 1330 | ||
1331 | if (!plen) | ||
1332 | tb->tb_num_default++; | ||
1333 | |||
1332 | list_add_tail_rcu(&new_fa->fa_list, | 1334 | list_add_tail_rcu(&new_fa->fa_list, |
1333 | (fa ? &fa->fa_list : fa_head)); | 1335 | (fa ? &fa->fa_list : fa_head)); |
1334 | 1336 | ||
@@ -1347,52 +1349,86 @@ err: | |||
1347 | } | 1349 | } |
1348 | 1350 | ||
1349 | /* should be called with rcu_read_lock */ | 1351 | /* should be called with rcu_read_lock */ |
1350 | static int check_leaf(struct trie *t, struct leaf *l, | 1352 | static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, |
1351 | t_key key, const struct flowi *flp, | 1353 | t_key key, const struct flowi4 *flp, |
1352 | struct fib_result *res) | 1354 | struct fib_result *res, int fib_flags) |
1353 | { | 1355 | { |
1354 | struct leaf_info *li; | 1356 | struct leaf_info *li; |
1355 | struct hlist_head *hhead = &l->list; | 1357 | struct hlist_head *hhead = &l->list; |
1356 | struct hlist_node *node; | 1358 | struct hlist_node *node; |
1357 | 1359 | ||
1358 | hlist_for_each_entry_rcu(li, node, hhead, hlist) { | 1360 | hlist_for_each_entry_rcu(li, node, hhead, hlist) { |
1359 | int err; | 1361 | struct fib_alias *fa; |
1360 | int plen = li->plen; | 1362 | int plen = li->plen; |
1361 | __be32 mask = inet_make_mask(plen); | 1363 | __be32 mask = inet_make_mask(plen); |
1362 | 1364 | ||
1363 | if (l->key != (key & ntohl(mask))) | 1365 | if (l->key != (key & ntohl(mask))) |
1364 | continue; | 1366 | continue; |
1365 | 1367 | ||
1366 | err = fib_semantic_match(&li->falh, flp, res, plen); | 1368 | list_for_each_entry_rcu(fa, &li->falh, fa_list) { |
1369 | struct fib_info *fi = fa->fa_info; | ||
1370 | int nhsel, err; | ||
1367 | 1371 | ||
1372 | if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) | ||
1373 | continue; | ||
1374 | if (fa->fa_info->fib_scope < flp->flowi4_scope) | ||
1375 | continue; | ||
1376 | fib_alias_accessed(fa); | ||
1377 | err = fib_props[fa->fa_type].error; | ||
1378 | if (err) { | ||
1368 | #ifdef CONFIG_IP_FIB_TRIE_STATS | 1379 | #ifdef CONFIG_IP_FIB_TRIE_STATS |
1369 | if (err <= 0) | 1380 | t->stats.semantic_match_passed++; |
1370 | t->stats.semantic_match_passed++; | 1381 | #endif |
1371 | else | 1382 | return err; |
1372 | t->stats.semantic_match_miss++; | 1383 | } |
1384 | if (fi->fib_flags & RTNH_F_DEAD) | ||
1385 | continue; | ||
1386 | for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { | ||
1387 | const struct fib_nh *nh = &fi->fib_nh[nhsel]; | ||
1388 | |||
1389 | if (nh->nh_flags & RTNH_F_DEAD) | ||
1390 | continue; | ||
1391 | if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) | ||
1392 | continue; | ||
1393 | |||
1394 | #ifdef CONFIG_IP_FIB_TRIE_STATS | ||
1395 | t->stats.semantic_match_passed++; | ||
1396 | #endif | ||
1397 | res->prefixlen = plen; | ||
1398 | res->nh_sel = nhsel; | ||
1399 | res->type = fa->fa_type; | ||
1400 | res->scope = fa->fa_info->fib_scope; | ||
1401 | res->fi = fi; | ||
1402 | res->table = tb; | ||
1403 | res->fa_head = &li->falh; | ||
1404 | if (!(fib_flags & FIB_LOOKUP_NOREF)) | ||
1405 | atomic_inc(&res->fi->fib_clntref); | ||
1406 | return 0; | ||
1407 | } | ||
1408 | } | ||
1409 | |||
1410 | #ifdef CONFIG_IP_FIB_TRIE_STATS | ||
1411 | t->stats.semantic_match_miss++; | ||
1373 | #endif | 1412 | #endif |
1374 | if (err <= 0) | ||
1375 | return err; | ||
1376 | } | 1413 | } |
1377 | 1414 | ||
1378 | return 1; | 1415 | return 1; |
1379 | } | 1416 | } |
1380 | 1417 | ||
1381 | int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, | 1418 | int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, |
1382 | struct fib_result *res) | 1419 | struct fib_result *res, int fib_flags) |
1383 | { | 1420 | { |
1384 | struct trie *t = (struct trie *) tb->tb_data; | 1421 | struct trie *t = (struct trie *) tb->tb_data; |
1385 | int ret; | 1422 | int ret; |
1386 | struct node *n; | 1423 | struct rt_trie_node *n; |
1387 | struct tnode *pn; | 1424 | struct tnode *pn; |
1388 | int pos, bits; | 1425 | unsigned int pos, bits; |
1389 | t_key key = ntohl(flp->fl4_dst); | 1426 | t_key key = ntohl(flp->daddr); |
1390 | int chopped_off; | 1427 | unsigned int chopped_off; |
1391 | t_key cindex = 0; | 1428 | t_key cindex = 0; |
1392 | int current_prefix_length = KEYLENGTH; | 1429 | unsigned int current_prefix_length = KEYLENGTH; |
1393 | struct tnode *cn; | 1430 | struct tnode *cn; |
1394 | t_key node_prefix, key_prefix, pref_mismatch; | 1431 | t_key pref_mismatch; |
1395 | int mp; | ||
1396 | 1432 | ||
1397 | rcu_read_lock(); | 1433 | rcu_read_lock(); |
1398 | 1434 | ||
@@ -1406,7 +1442,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, | |||
1406 | 1442 | ||
1407 | /* Just a leaf? */ | 1443 | /* Just a leaf? */ |
1408 | if (IS_LEAF(n)) { | 1444 | if (IS_LEAF(n)) { |
1409 | ret = check_leaf(t, (struct leaf *)n, key, flp, res); | 1445 | ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags); |
1410 | goto found; | 1446 | goto found; |
1411 | } | 1447 | } |
1412 | 1448 | ||
@@ -1431,7 +1467,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, | |||
1431 | } | 1467 | } |
1432 | 1468 | ||
1433 | if (IS_LEAF(n)) { | 1469 | if (IS_LEAF(n)) { |
1434 | ret = check_leaf(t, (struct leaf *)n, key, flp, res); | 1470 | ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags); |
1435 | if (ret > 0) | 1471 | if (ret > 0) |
1436 | goto backtrace; | 1472 | goto backtrace; |
1437 | goto found; | 1473 | goto found; |
@@ -1507,10 +1543,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, | |||
1507 | * matching prefix. | 1543 | * matching prefix. |
1508 | */ | 1544 | */ |
1509 | 1545 | ||
1510 | node_prefix = mask_pfx(cn->key, cn->pos); | 1546 | pref_mismatch = mask_pfx(cn->key ^ key, cn->pos); |
1511 | key_prefix = mask_pfx(key, cn->pos); | ||
1512 | pref_mismatch = key_prefix^node_prefix; | ||
1513 | mp = 0; | ||
1514 | 1547 | ||
1515 | /* | 1548 | /* |
1516 | * In short: If skipped bits in this node do not match | 1549 | * In short: If skipped bits in this node do not match |
@@ -1518,13 +1551,9 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, | |||
1518 | * state.directly. | 1551 | * state.directly. |
1519 | */ | 1552 | */ |
1520 | if (pref_mismatch) { | 1553 | if (pref_mismatch) { |
1521 | while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { | 1554 | int mp = KEYLENGTH - fls(pref_mismatch); |
1522 | mp++; | ||
1523 | pref_mismatch = pref_mismatch << 1; | ||
1524 | } | ||
1525 | key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); | ||
1526 | 1555 | ||
1527 | if (key_prefix != 0) | 1556 | if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0) |
1528 | goto backtrace; | 1557 | goto backtrace; |
1529 | 1558 | ||
1530 | if (current_prefix_length >= cn->pos) | 1559 | if (current_prefix_length >= cn->pos) |
@@ -1556,7 +1585,7 @@ backtrace: | |||
1556 | if (chopped_off <= pn->bits) { | 1585 | if (chopped_off <= pn->bits) { |
1557 | cindex &= ~(1 << (chopped_off-1)); | 1586 | cindex &= ~(1 << (chopped_off-1)); |
1558 | } else { | 1587 | } else { |
1559 | struct tnode *parent = node_parent_rcu((struct node *) pn); | 1588 | struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn); |
1560 | if (!parent) | 1589 | if (!parent) |
1561 | goto failed; | 1590 | goto failed; |
1562 | 1591 | ||
@@ -1583,7 +1612,7 @@ found: | |||
1583 | */ | 1612 | */ |
1584 | static void trie_leaf_remove(struct trie *t, struct leaf *l) | 1613 | static void trie_leaf_remove(struct trie *t, struct leaf *l) |
1585 | { | 1614 | { |
1586 | struct tnode *tp = node_parent((struct node *) l); | 1615 | struct tnode *tp = node_parent((struct rt_trie_node *) l); |
1587 | 1616 | ||
1588 | pr_debug("entering trie_leaf_remove(%p)\n", l); | 1617 | pr_debug("entering trie_leaf_remove(%p)\n", l); |
1589 | 1618 | ||
@@ -1644,7 +1673,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) | |||
1644 | 1673 | ||
1645 | if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && | 1674 | if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && |
1646 | (cfg->fc_scope == RT_SCOPE_NOWHERE || | 1675 | (cfg->fc_scope == RT_SCOPE_NOWHERE || |
1647 | fa->fa_scope == cfg->fc_scope) && | 1676 | fa->fa_info->fib_scope == cfg->fc_scope) && |
1677 | (!cfg->fc_prefsrc || | ||
1678 | fi->fib_prefsrc == cfg->fc_prefsrc) && | ||
1648 | (!cfg->fc_protocol || | 1679 | (!cfg->fc_protocol || |
1649 | fi->fib_protocol == cfg->fc_protocol) && | 1680 | fi->fib_protocol == cfg->fc_protocol) && |
1650 | fib_nh_match(cfg, fi) == 0) { | 1681 | fib_nh_match(cfg, fi) == 0) { |
@@ -1665,6 +1696,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) | |||
1665 | 1696 | ||
1666 | list_del_rcu(&fa->fa_list); | 1697 | list_del_rcu(&fa->fa_list); |
1667 | 1698 | ||
1699 | if (!plen) | ||
1700 | tb->tb_num_default--; | ||
1701 | |||
1668 | if (list_empty(fa_head)) { | 1702 | if (list_empty(fa_head)) { |
1669 | hlist_del_rcu(&li->hlist); | 1703 | hlist_del_rcu(&li->hlist); |
1670 | free_leaf_info(li); | 1704 | free_leaf_info(li); |
@@ -1721,7 +1755,7 @@ static int trie_flush_leaf(struct leaf *l) | |||
1721 | * Scan for the next right leaf starting at node p->child[idx] | 1755 | * Scan for the next right leaf starting at node p->child[idx] |
1722 | * Since we have back pointer, no recursion necessary. | 1756 | * Since we have back pointer, no recursion necessary. |
1723 | */ | 1757 | */ |
1724 | static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) | 1758 | static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c) |
1725 | { | 1759 | { |
1726 | do { | 1760 | do { |
1727 | t_key idx; | 1761 | t_key idx; |
@@ -1737,7 +1771,7 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) | |||
1737 | continue; | 1771 | continue; |
1738 | 1772 | ||
1739 | if (IS_LEAF(c)) { | 1773 | if (IS_LEAF(c)) { |
1740 | prefetch(p->child[idx]); | 1774 | prefetch(rcu_dereference_rtnl(p->child[idx])); |
1741 | return (struct leaf *) c; | 1775 | return (struct leaf *) c; |
1742 | } | 1776 | } |
1743 | 1777 | ||
@@ -1747,17 +1781,15 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) | |||
1747 | } | 1781 | } |
1748 | 1782 | ||
1749 | /* Node empty, walk back up to parent */ | 1783 | /* Node empty, walk back up to parent */ |
1750 | c = (struct node *) p; | 1784 | c = (struct rt_trie_node *) p; |
1751 | } while ( (p = node_parent_rcu(c)) != NULL); | 1785 | } while ((p = node_parent_rcu(c)) != NULL); |
1752 | 1786 | ||
1753 | return NULL; /* Root of trie */ | 1787 | return NULL; /* Root of trie */ |
1754 | } | 1788 | } |
1755 | 1789 | ||
1756 | static struct leaf *trie_firstleaf(struct trie *t) | 1790 | static struct leaf *trie_firstleaf(struct trie *t) |
1757 | { | 1791 | { |
1758 | struct tnode *n = (struct tnode *) rcu_dereference_check(t->trie, | 1792 | struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie); |
1759 | rcu_read_lock_held() || | ||
1760 | lockdep_rtnl_is_held()); | ||
1761 | 1793 | ||
1762 | if (!n) | 1794 | if (!n) |
1763 | return NULL; | 1795 | return NULL; |
@@ -1770,7 +1802,7 @@ static struct leaf *trie_firstleaf(struct trie *t) | |||
1770 | 1802 | ||
1771 | static struct leaf *trie_nextleaf(struct leaf *l) | 1803 | static struct leaf *trie_nextleaf(struct leaf *l) |
1772 | { | 1804 | { |
1773 | struct node *c = (struct node *) l; | 1805 | struct rt_trie_node *c = (struct rt_trie_node *) l; |
1774 | struct tnode *p = node_parent_rcu(c); | 1806 | struct tnode *p = node_parent_rcu(c); |
1775 | 1807 | ||
1776 | if (!p) | 1808 | if (!p) |
@@ -1814,77 +1846,9 @@ int fib_table_flush(struct fib_table *tb) | |||
1814 | return found; | 1846 | return found; |
1815 | } | 1847 | } |
1816 | 1848 | ||
1817 | void fib_table_select_default(struct fib_table *tb, | 1849 | void fib_free_table(struct fib_table *tb) |
1818 | const struct flowi *flp, | ||
1819 | struct fib_result *res) | ||
1820 | { | 1850 | { |
1821 | struct trie *t = (struct trie *) tb->tb_data; | 1851 | kfree(tb); |
1822 | int order, last_idx; | ||
1823 | struct fib_info *fi = NULL; | ||
1824 | struct fib_info *last_resort; | ||
1825 | struct fib_alias *fa = NULL; | ||
1826 | struct list_head *fa_head; | ||
1827 | struct leaf *l; | ||
1828 | |||
1829 | last_idx = -1; | ||
1830 | last_resort = NULL; | ||
1831 | order = -1; | ||
1832 | |||
1833 | rcu_read_lock(); | ||
1834 | |||
1835 | l = fib_find_node(t, 0); | ||
1836 | if (!l) | ||
1837 | goto out; | ||
1838 | |||
1839 | fa_head = get_fa_head(l, 0); | ||
1840 | if (!fa_head) | ||
1841 | goto out; | ||
1842 | |||
1843 | if (list_empty(fa_head)) | ||
1844 | goto out; | ||
1845 | |||
1846 | list_for_each_entry_rcu(fa, fa_head, fa_list) { | ||
1847 | struct fib_info *next_fi = fa->fa_info; | ||
1848 | |||
1849 | if (fa->fa_scope != res->scope || | ||
1850 | fa->fa_type != RTN_UNICAST) | ||
1851 | continue; | ||
1852 | |||
1853 | if (next_fi->fib_priority > res->fi->fib_priority) | ||
1854 | break; | ||
1855 | if (!next_fi->fib_nh[0].nh_gw || | ||
1856 | next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) | ||
1857 | continue; | ||
1858 | fa->fa_state |= FA_S_ACCESSED; | ||
1859 | |||
1860 | if (fi == NULL) { | ||
1861 | if (next_fi != res->fi) | ||
1862 | break; | ||
1863 | } else if (!fib_detect_death(fi, order, &last_resort, | ||
1864 | &last_idx, tb->tb_default)) { | ||
1865 | fib_result_assign(res, fi); | ||
1866 | tb->tb_default = order; | ||
1867 | goto out; | ||
1868 | } | ||
1869 | fi = next_fi; | ||
1870 | order++; | ||
1871 | } | ||
1872 | if (order <= 0 || fi == NULL) { | ||
1873 | tb->tb_default = -1; | ||
1874 | goto out; | ||
1875 | } | ||
1876 | |||
1877 | if (!fib_detect_death(fi, order, &last_resort, &last_idx, | ||
1878 | tb->tb_default)) { | ||
1879 | fib_result_assign(res, fi); | ||
1880 | tb->tb_default = order; | ||
1881 | goto out; | ||
1882 | } | ||
1883 | if (last_idx >= 0) | ||
1884 | fib_result_assign(res, last_resort); | ||
1885 | tb->tb_default = last_idx; | ||
1886 | out: | ||
1887 | rcu_read_unlock(); | ||
1888 | } | 1852 | } |
1889 | 1853 | ||
1890 | static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, | 1854 | static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, |
@@ -1911,7 +1875,6 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, | |||
1911 | RTM_NEWROUTE, | 1875 | RTM_NEWROUTE, |
1912 | tb->tb_id, | 1876 | tb->tb_id, |
1913 | fa->fa_type, | 1877 | fa->fa_type, |
1914 | fa->fa_scope, | ||
1915 | xkey, | 1878 | xkey, |
1916 | plen, | 1879 | plen, |
1917 | fa->fa_tos, | 1880 | fa->fa_tos, |
@@ -2001,7 +1964,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, | |||
2001 | return skb->len; | 1964 | return skb->len; |
2002 | } | 1965 | } |
2003 | 1966 | ||
2004 | void __init fib_hash_init(void) | 1967 | void __init fib_trie_init(void) |
2005 | { | 1968 | { |
2006 | fn_alias_kmem = kmem_cache_create("ip_fib_alias", | 1969 | fn_alias_kmem = kmem_cache_create("ip_fib_alias", |
2007 | sizeof(struct fib_alias), | 1970 | sizeof(struct fib_alias), |
@@ -2014,8 +1977,7 @@ void __init fib_hash_init(void) | |||
2014 | } | 1977 | } |
2015 | 1978 | ||
2016 | 1979 | ||
2017 | /* Fix more generic FIB names for init later */ | 1980 | struct fib_table *fib_trie_table(u32 id) |
2018 | struct fib_table *fib_hash_table(u32 id) | ||
2019 | { | 1981 | { |
2020 | struct fib_table *tb; | 1982 | struct fib_table *tb; |
2021 | struct trie *t; | 1983 | struct trie *t; |
@@ -2027,13 +1989,11 @@ struct fib_table *fib_hash_table(u32 id) | |||
2027 | 1989 | ||
2028 | tb->tb_id = id; | 1990 | tb->tb_id = id; |
2029 | tb->tb_default = -1; | 1991 | tb->tb_default = -1; |
1992 | tb->tb_num_default = 0; | ||
2030 | 1993 | ||
2031 | t = (struct trie *) tb->tb_data; | 1994 | t = (struct trie *) tb->tb_data; |
2032 | memset(t, 0, sizeof(*t)); | 1995 | memset(t, 0, sizeof(*t)); |
2033 | 1996 | ||
2034 | if (id == RT_TABLE_LOCAL) | ||
2035 | pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION); | ||
2036 | |||
2037 | return tb; | 1997 | return tb; |
2038 | } | 1998 | } |
2039 | 1999 | ||
@@ -2043,14 +2003,14 @@ struct fib_trie_iter { | |||
2043 | struct seq_net_private p; | 2003 | struct seq_net_private p; |
2044 | struct fib_table *tb; | 2004 | struct fib_table *tb; |
2045 | struct tnode *tnode; | 2005 | struct tnode *tnode; |
2046 | unsigned index; | 2006 | unsigned int index; |
2047 | unsigned depth; | 2007 | unsigned int depth; |
2048 | }; | 2008 | }; |
2049 | 2009 | ||
2050 | static struct node *fib_trie_get_next(struct fib_trie_iter *iter) | 2010 | static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter) |
2051 | { | 2011 | { |
2052 | struct tnode *tn = iter->tnode; | 2012 | struct tnode *tn = iter->tnode; |
2053 | unsigned cindex = iter->index; | 2013 | unsigned int cindex = iter->index; |
2054 | struct tnode *p; | 2014 | struct tnode *p; |
2055 | 2015 | ||
2056 | /* A single entry routing table */ | 2016 | /* A single entry routing table */ |
@@ -2061,7 +2021,7 @@ static struct node *fib_trie_get_next(struct fib_trie_iter *iter) | |||
2061 | iter->tnode, iter->index, iter->depth); | 2021 | iter->tnode, iter->index, iter->depth); |
2062 | rescan: | 2022 | rescan: |
2063 | while (cindex < (1<<tn->bits)) { | 2023 | while (cindex < (1<<tn->bits)) { |
2064 | struct node *n = tnode_get_child_rcu(tn, cindex); | 2024 | struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex); |
2065 | 2025 | ||
2066 | if (n) { | 2026 | if (n) { |
2067 | if (IS_LEAF(n)) { | 2027 | if (IS_LEAF(n)) { |
@@ -2080,7 +2040,7 @@ rescan: | |||
2080 | } | 2040 | } |
2081 | 2041 | ||
2082 | /* Current node exhausted, pop back up */ | 2042 | /* Current node exhausted, pop back up */ |
2083 | p = node_parent_rcu((struct node *)tn); | 2043 | p = node_parent_rcu((struct rt_trie_node *)tn); |
2084 | if (p) { | 2044 | if (p) { |
2085 | cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; | 2045 | cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; |
2086 | tn = p; | 2046 | tn = p; |
@@ -2092,10 +2052,10 @@ rescan: | |||
2092 | return NULL; | 2052 | return NULL; |
2093 | } | 2053 | } |
2094 | 2054 | ||
2095 | static struct node *fib_trie_get_first(struct fib_trie_iter *iter, | 2055 | static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter, |
2096 | struct trie *t) | 2056 | struct trie *t) |
2097 | { | 2057 | { |
2098 | struct node *n; | 2058 | struct rt_trie_node *n; |
2099 | 2059 | ||
2100 | if (!t) | 2060 | if (!t) |
2101 | return NULL; | 2061 | return NULL; |
@@ -2119,7 +2079,7 @@ static struct node *fib_trie_get_first(struct fib_trie_iter *iter, | |||
2119 | 2079 | ||
2120 | static void trie_collect_stats(struct trie *t, struct trie_stat *s) | 2080 | static void trie_collect_stats(struct trie *t, struct trie_stat *s) |
2121 | { | 2081 | { |
2122 | struct node *n; | 2082 | struct rt_trie_node *n; |
2123 | struct fib_trie_iter iter; | 2083 | struct fib_trie_iter iter; |
2124 | 2084 | ||
2125 | memset(s, 0, sizeof(*s)); | 2085 | memset(s, 0, sizeof(*s)); |
@@ -2159,7 +2119,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s) | |||
2159 | */ | 2119 | */ |
2160 | static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) | 2120 | static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) |
2161 | { | 2121 | { |
2162 | unsigned i, max, pointers, bytes, avdepth; | 2122 | unsigned int i, max, pointers, bytes, avdepth; |
2163 | 2123 | ||
2164 | if (stat->leaves) | 2124 | if (stat->leaves) |
2165 | avdepth = stat->totdepth*100 / stat->leaves; | 2125 | avdepth = stat->totdepth*100 / stat->leaves; |
@@ -2192,7 +2152,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) | |||
2192 | seq_putc(seq, '\n'); | 2152 | seq_putc(seq, '\n'); |
2193 | seq_printf(seq, "\tPointers: %u\n", pointers); | 2153 | seq_printf(seq, "\tPointers: %u\n", pointers); |
2194 | 2154 | ||
2195 | bytes += sizeof(struct node *) * pointers; | 2155 | bytes += sizeof(struct rt_trie_node *) * pointers; |
2196 | seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); | 2156 | seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); |
2197 | seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); | 2157 | seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); |
2198 | } | 2158 | } |
@@ -2273,7 +2233,7 @@ static const struct file_operations fib_triestat_fops = { | |||
2273 | .release = single_release_net, | 2233 | .release = single_release_net, |
2274 | }; | 2234 | }; |
2275 | 2235 | ||
2276 | static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) | 2236 | static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) |
2277 | { | 2237 | { |
2278 | struct fib_trie_iter *iter = seq->private; | 2238 | struct fib_trie_iter *iter = seq->private; |
2279 | struct net *net = seq_file_net(seq); | 2239 | struct net *net = seq_file_net(seq); |
@@ -2286,7 +2246,7 @@ static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) | |||
2286 | struct fib_table *tb; | 2246 | struct fib_table *tb; |
2287 | 2247 | ||
2288 | hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { | 2248 | hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { |
2289 | struct node *n; | 2249 | struct rt_trie_node *n; |
2290 | 2250 | ||
2291 | for (n = fib_trie_get_first(iter, | 2251 | for (n = fib_trie_get_first(iter, |
2292 | (struct trie *) tb->tb_data); | 2252 | (struct trie *) tb->tb_data); |
@@ -2315,7 +2275,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) | |||
2315 | struct fib_table *tb = iter->tb; | 2275 | struct fib_table *tb = iter->tb; |
2316 | struct hlist_node *tb_node; | 2276 | struct hlist_node *tb_node; |
2317 | unsigned int h; | 2277 | unsigned int h; |
2318 | struct node *n; | 2278 | struct rt_trie_node *n; |
2319 | 2279 | ||
2320 | ++*pos; | 2280 | ++*pos; |
2321 | /* next node in same table */ | 2281 | /* next node in same table */ |
@@ -2325,7 +2285,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) | |||
2325 | 2285 | ||
2326 | /* walk rest of this hash chain */ | 2286 | /* walk rest of this hash chain */ |
2327 | h = tb->tb_id & (FIB_TABLE_HASHSZ - 1); | 2287 | h = tb->tb_id & (FIB_TABLE_HASHSZ - 1); |
2328 | while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) { | 2288 | while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) { |
2329 | tb = hlist_entry(tb_node, struct fib_table, tb_hlist); | 2289 | tb = hlist_entry(tb_node, struct fib_table, tb_hlist); |
2330 | n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); | 2290 | n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); |
2331 | if (n) | 2291 | if (n) |
@@ -2356,7 +2316,8 @@ static void fib_trie_seq_stop(struct seq_file *seq, void *v) | |||
2356 | 2316 | ||
2357 | static void seq_indent(struct seq_file *seq, int n) | 2317 | static void seq_indent(struct seq_file *seq, int n) |
2358 | { | 2318 | { |
2359 | while (n-- > 0) seq_puts(seq, " "); | 2319 | while (n-- > 0) |
2320 | seq_puts(seq, " "); | ||
2360 | } | 2321 | } |
2361 | 2322 | ||
2362 | static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s) | 2323 | static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s) |
@@ -2388,7 +2349,7 @@ static const char *const rtn_type_names[__RTN_MAX] = { | |||
2388 | [RTN_XRESOLVE] = "XRESOLVE", | 2349 | [RTN_XRESOLVE] = "XRESOLVE", |
2389 | }; | 2350 | }; |
2390 | 2351 | ||
2391 | static inline const char *rtn_type(char *buf, size_t len, unsigned t) | 2352 | static inline const char *rtn_type(char *buf, size_t len, unsigned int t) |
2392 | { | 2353 | { |
2393 | if (t < __RTN_MAX && rtn_type_names[t]) | 2354 | if (t < __RTN_MAX && rtn_type_names[t]) |
2394 | return rtn_type_names[t]; | 2355 | return rtn_type_names[t]; |
@@ -2400,7 +2361,7 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned t) | |||
2400 | static int fib_trie_seq_show(struct seq_file *seq, void *v) | 2361 | static int fib_trie_seq_show(struct seq_file *seq, void *v) |
2401 | { | 2362 | { |
2402 | const struct fib_trie_iter *iter = seq->private; | 2363 | const struct fib_trie_iter *iter = seq->private; |
2403 | struct node *n = v; | 2364 | struct rt_trie_node *n = v; |
2404 | 2365 | ||
2405 | if (!node_parent_rcu(n)) | 2366 | if (!node_parent_rcu(n)) |
2406 | fib_table_print(seq, iter->tb); | 2367 | fib_table_print(seq, iter->tb); |
@@ -2432,7 +2393,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) | |||
2432 | seq_indent(seq, iter->depth+1); | 2393 | seq_indent(seq, iter->depth+1); |
2433 | seq_printf(seq, " /%d %s %s", li->plen, | 2394 | seq_printf(seq, " /%d %s %s", li->plen, |
2434 | rtn_scope(buf1, sizeof(buf1), | 2395 | rtn_scope(buf1, sizeof(buf1), |
2435 | fa->fa_scope), | 2396 | fa->fa_info->fib_scope), |
2436 | rtn_type(buf2, sizeof(buf2), | 2397 | rtn_type(buf2, sizeof(buf2), |
2437 | fa->fa_type)); | 2398 | fa->fa_type)); |
2438 | if (fa->fa_tos) | 2399 | if (fa->fa_tos) |
@@ -2544,13 +2505,12 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v) | |||
2544 | rcu_read_unlock(); | 2505 | rcu_read_unlock(); |
2545 | } | 2506 | } |
2546 | 2507 | ||
2547 | static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) | 2508 | static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) |
2548 | { | 2509 | { |
2549 | static unsigned type2flags[RTN_MAX + 1] = { | 2510 | unsigned int flags = 0; |
2550 | [7] = RTF_REJECT, [8] = RTF_REJECT, | ||
2551 | }; | ||
2552 | unsigned flags = type2flags[type]; | ||
2553 | 2511 | ||
2512 | if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT) | ||
2513 | flags = RTF_REJECT; | ||
2554 | if (fi && fi->fib_nh->nh_gw) | 2514 | if (fi && fi->fib_nh->nh_gw) |
2555 | flags |= RTF_GATEWAY; | 2515 | flags |= RTF_GATEWAY; |
2556 | if (mask == htonl(0xFFFFFFFF)) | 2516 | if (mask == htonl(0xFFFFFFFF)) |
@@ -2562,7 +2522,7 @@ static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) | |||
2562 | /* | 2522 | /* |
2563 | * This outputs /proc/net/route. | 2523 | * This outputs /proc/net/route. |
2564 | * The format of the file is not supposed to be changed | 2524 | * The format of the file is not supposed to be changed |
2565 | * and needs to be same as fib_hash output to avoid breaking | 2525 | * and needs to be same as fib_hash output to avoid breaking |
2566 | * legacy utilities | 2526 | * legacy utilities |
2567 | */ | 2527 | */ |
2568 | static int fib_route_seq_show(struct seq_file *seq, void *v) | 2528 | static int fib_route_seq_show(struct seq_file *seq, void *v) |
@@ -2587,7 +2547,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) | |||
2587 | 2547 | ||
2588 | list_for_each_entry_rcu(fa, &li->falh, fa_list) { | 2548 | list_for_each_entry_rcu(fa, &li->falh, fa_list) { |
2589 | const struct fib_info *fi = fa->fa_info; | 2549 | const struct fib_info *fi = fa->fa_info; |
2590 | unsigned flags = fib_flag_trans(fa->fa_type, mask, fi); | 2550 | unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); |
2591 | int len; | 2551 | int len; |
2592 | 2552 | ||
2593 | if (fa->fa_type == RTN_BROADCAST | 2553 | if (fa->fa_type == RTN_BROADCAST |
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c new file mode 100644 index 000000000000..c6933f2ea310 --- /dev/null +++ b/net/ipv4/gre.c | |||
@@ -0,0 +1,152 @@ | |||
1 | /* | ||
2 | * GRE over IPv4 demultiplexer driver | ||
3 | * | ||
4 | * Authors: Dmitry Kozlov (xeb@mail.ru) | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/module.h> | ||
14 | #include <linux/kernel.h> | ||
15 | #include <linux/kmod.h> | ||
16 | #include <linux/skbuff.h> | ||
17 | #include <linux/in.h> | ||
18 | #include <linux/netdevice.h> | ||
19 | #include <linux/version.h> | ||
20 | #include <linux/spinlock.h> | ||
21 | #include <net/protocol.h> | ||
22 | #include <net/gre.h> | ||
23 | |||
24 | |||
25 | static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly; | ||
26 | static DEFINE_SPINLOCK(gre_proto_lock); | ||
27 | |||
28 | int gre_add_protocol(const struct gre_protocol *proto, u8 version) | ||
29 | { | ||
30 | if (version >= GREPROTO_MAX) | ||
31 | goto err_out; | ||
32 | |||
33 | spin_lock(&gre_proto_lock); | ||
34 | if (gre_proto[version]) | ||
35 | goto err_out_unlock; | ||
36 | |||
37 | rcu_assign_pointer(gre_proto[version], proto); | ||
38 | spin_unlock(&gre_proto_lock); | ||
39 | return 0; | ||
40 | |||
41 | err_out_unlock: | ||
42 | spin_unlock(&gre_proto_lock); | ||
43 | err_out: | ||
44 | return -1; | ||
45 | } | ||
46 | EXPORT_SYMBOL_GPL(gre_add_protocol); | ||
47 | |||
48 | int gre_del_protocol(const struct gre_protocol *proto, u8 version) | ||
49 | { | ||
50 | if (version >= GREPROTO_MAX) | ||
51 | goto err_out; | ||
52 | |||
53 | spin_lock(&gre_proto_lock); | ||
54 | if (rcu_dereference_protected(gre_proto[version], | ||
55 | lockdep_is_held(&gre_proto_lock)) != proto) | ||
56 | goto err_out_unlock; | ||
57 | rcu_assign_pointer(gre_proto[version], NULL); | ||
58 | spin_unlock(&gre_proto_lock); | ||
59 | synchronize_rcu(); | ||
60 | return 0; | ||
61 | |||
62 | err_out_unlock: | ||
63 | spin_unlock(&gre_proto_lock); | ||
64 | err_out: | ||
65 | return -1; | ||
66 | } | ||
67 | EXPORT_SYMBOL_GPL(gre_del_protocol); | ||
68 | |||
69 | static int gre_rcv(struct sk_buff *skb) | ||
70 | { | ||
71 | const struct gre_protocol *proto; | ||
72 | u8 ver; | ||
73 | int ret; | ||
74 | |||
75 | if (!pskb_may_pull(skb, 12)) | ||
76 | goto drop; | ||
77 | |||
78 | ver = skb->data[1]&0x7f; | ||
79 | if (ver >= GREPROTO_MAX) | ||
80 | goto drop; | ||
81 | |||
82 | rcu_read_lock(); | ||
83 | proto = rcu_dereference(gre_proto[ver]); | ||
84 | if (!proto || !proto->handler) | ||
85 | goto drop_unlock; | ||
86 | ret = proto->handler(skb); | ||
87 | rcu_read_unlock(); | ||
88 | return ret; | ||
89 | |||
90 | drop_unlock: | ||
91 | rcu_read_unlock(); | ||
92 | drop: | ||
93 | kfree_skb(skb); | ||
94 | return NET_RX_DROP; | ||
95 | } | ||
96 | |||
97 | static void gre_err(struct sk_buff *skb, u32 info) | ||
98 | { | ||
99 | const struct gre_protocol *proto; | ||
100 | u8 ver; | ||
101 | |||
102 | if (!pskb_may_pull(skb, 12)) | ||
103 | goto drop; | ||
104 | |||
105 | ver = skb->data[1]&0x7f; | ||
106 | if (ver >= GREPROTO_MAX) | ||
107 | goto drop; | ||
108 | |||
109 | rcu_read_lock(); | ||
110 | proto = rcu_dereference(gre_proto[ver]); | ||
111 | if (!proto || !proto->err_handler) | ||
112 | goto drop_unlock; | ||
113 | proto->err_handler(skb, info); | ||
114 | rcu_read_unlock(); | ||
115 | return; | ||
116 | |||
117 | drop_unlock: | ||
118 | rcu_read_unlock(); | ||
119 | drop: | ||
120 | kfree_skb(skb); | ||
121 | } | ||
122 | |||
123 | static const struct net_protocol net_gre_protocol = { | ||
124 | .handler = gre_rcv, | ||
125 | .err_handler = gre_err, | ||
126 | .netns_ok = 1, | ||
127 | }; | ||
128 | |||
129 | static int __init gre_init(void) | ||
130 | { | ||
131 | pr_info("GRE over IPv4 demultiplexor driver"); | ||
132 | |||
133 | if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { | ||
134 | pr_err("gre: can't add protocol\n"); | ||
135 | return -EAGAIN; | ||
136 | } | ||
137 | |||
138 | return 0; | ||
139 | } | ||
140 | |||
141 | static void __exit gre_exit(void) | ||
142 | { | ||
143 | inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); | ||
144 | } | ||
145 | |||
146 | module_init(gre_init); | ||
147 | module_exit(gre_exit); | ||
148 | |||
149 | MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver"); | ||
150 | MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); | ||
151 | MODULE_LICENSE("GPL"); | ||
152 | |||
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index a0d847c7cba5..5395e45dcce6 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c | |||
@@ -83,6 +83,7 @@ | |||
83 | #include <net/tcp.h> | 83 | #include <net/tcp.h> |
84 | #include <net/udp.h> | 84 | #include <net/udp.h> |
85 | #include <net/raw.h> | 85 | #include <net/raw.h> |
86 | #include <net/ping.h> | ||
86 | #include <linux/skbuff.h> | 87 | #include <linux/skbuff.h> |
87 | #include <net/sock.h> | 88 | #include <net/sock.h> |
88 | #include <linux/errno.h> | 89 | #include <linux/errno.h> |
@@ -108,8 +109,7 @@ struct icmp_bxm { | |||
108 | __be32 times[3]; | 109 | __be32 times[3]; |
109 | } data; | 110 | } data; |
110 | int head_len; | 111 | int head_len; |
111 | struct ip_options replyopts; | 112 | struct ip_options_data replyopts; |
112 | unsigned char optbuf[40]; | ||
113 | }; | 113 | }; |
114 | 114 | ||
115 | /* An array of errno for error messages from dest unreach. */ | 115 | /* An array of errno for error messages from dest unreach. */ |
@@ -233,48 +233,11 @@ static inline void icmp_xmit_unlock(struct sock *sk) | |||
233 | * Send an ICMP frame. | 233 | * Send an ICMP frame. |
234 | */ | 234 | */ |
235 | 235 | ||
236 | /* | 236 | static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, |
237 | * Check transmit rate limitation for given message. | 237 | struct flowi4 *fl4, int type, int code) |
238 | * The rate information is held in the destination cache now. | ||
239 | * This function is generic and could be used for other purposes | ||
240 | * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. | ||
241 | * | ||
242 | * Note that the same dst_entry fields are modified by functions in | ||
243 | * route.c too, but these work for packet destinations while xrlim_allow | ||
244 | * works for icmp destinations. This means the rate limiting information | ||
245 | * for one "ip object" is shared - and these ICMPs are twice limited: | ||
246 | * by source and by destination. | ||
247 | * | ||
248 | * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate | ||
249 | * SHOULD allow setting of rate limits | ||
250 | * | ||
251 | * Shared between ICMPv4 and ICMPv6. | ||
252 | */ | ||
253 | #define XRLIM_BURST_FACTOR 6 | ||
254 | int xrlim_allow(struct dst_entry *dst, int timeout) | ||
255 | { | ||
256 | unsigned long now, token = dst->rate_tokens; | ||
257 | int rc = 0; | ||
258 | |||
259 | now = jiffies; | ||
260 | token += now - dst->rate_last; | ||
261 | dst->rate_last = now; | ||
262 | if (token > XRLIM_BURST_FACTOR * timeout) | ||
263 | token = XRLIM_BURST_FACTOR * timeout; | ||
264 | if (token >= timeout) { | ||
265 | token -= timeout; | ||
266 | rc = 1; | ||
267 | } | ||
268 | dst->rate_tokens = token; | ||
269 | return rc; | ||
270 | } | ||
271 | EXPORT_SYMBOL(xrlim_allow); | ||
272 | |||
273 | static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt, | ||
274 | int type, int code) | ||
275 | { | 238 | { |
276 | struct dst_entry *dst = &rt->dst; | 239 | struct dst_entry *dst = &rt->dst; |
277 | int rc = 1; | 240 | bool rc = true; |
278 | 241 | ||
279 | if (type > NR_ICMP_TYPES) | 242 | if (type > NR_ICMP_TYPES) |
280 | goto out; | 243 | goto out; |
@@ -288,8 +251,12 @@ static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt, | |||
288 | goto out; | 251 | goto out; |
289 | 252 | ||
290 | /* Limit if icmp type is enabled in ratemask. */ | 253 | /* Limit if icmp type is enabled in ratemask. */ |
291 | if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) | 254 | if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { |
292 | rc = xrlim_allow(dst, net->ipv4.sysctl_icmp_ratelimit); | 255 | if (!rt->peer) |
256 | rt_bind_peer(rt, fl4->daddr, 1); | ||
257 | rc = inet_peer_xrlim_allow(rt->peer, | ||
258 | net->ipv4.sysctl_icmp_ratelimit); | ||
259 | } | ||
293 | out: | 260 | out: |
294 | return rc; | 261 | return rc; |
295 | } | 262 | } |
@@ -324,13 +291,14 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, | |||
324 | } | 291 | } |
325 | 292 | ||
326 | static void icmp_push_reply(struct icmp_bxm *icmp_param, | 293 | static void icmp_push_reply(struct icmp_bxm *icmp_param, |
294 | struct flowi4 *fl4, | ||
327 | struct ipcm_cookie *ipc, struct rtable **rt) | 295 | struct ipcm_cookie *ipc, struct rtable **rt) |
328 | { | 296 | { |
329 | struct sock *sk; | 297 | struct sock *sk; |
330 | struct sk_buff *skb; | 298 | struct sk_buff *skb; |
331 | 299 | ||
332 | sk = icmp_sk(dev_net((*rt)->dst.dev)); | 300 | sk = icmp_sk(dev_net((*rt)->dst.dev)); |
333 | if (ip_append_data(sk, icmp_glue_bits, icmp_param, | 301 | if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param, |
334 | icmp_param->data_len+icmp_param->head_len, | 302 | icmp_param->data_len+icmp_param->head_len, |
335 | icmp_param->head_len, | 303 | icmp_param->head_len, |
336 | ipc, rt, MSG_DONTWAIT) < 0) { | 304 | ipc, rt, MSG_DONTWAIT) < 0) { |
@@ -349,7 +317,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param, | |||
349 | icmp_param->head_len, csum); | 317 | icmp_param->head_len, csum); |
350 | icmph->checksum = csum_fold(csum); | 318 | icmph->checksum = csum_fold(csum); |
351 | skb->ip_summed = CHECKSUM_NONE; | 319 | skb->ip_summed = CHECKSUM_NONE; |
352 | ip_push_pending_frames(sk); | 320 | ip_push_pending_frames(sk, fl4); |
353 | } | 321 | } |
354 | } | 322 | } |
355 | 323 | ||
@@ -362,11 +330,12 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) | |||
362 | struct ipcm_cookie ipc; | 330 | struct ipcm_cookie ipc; |
363 | struct rtable *rt = skb_rtable(skb); | 331 | struct rtable *rt = skb_rtable(skb); |
364 | struct net *net = dev_net(rt->dst.dev); | 332 | struct net *net = dev_net(rt->dst.dev); |
333 | struct flowi4 fl4; | ||
365 | struct sock *sk; | 334 | struct sock *sk; |
366 | struct inet_sock *inet; | 335 | struct inet_sock *inet; |
367 | __be32 daddr; | 336 | __be32 daddr; |
368 | 337 | ||
369 | if (ip_options_echo(&icmp_param->replyopts, skb)) | 338 | if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) |
370 | return; | 339 | return; |
371 | 340 | ||
372 | sk = icmp_xmit_lock(net); | 341 | sk = icmp_xmit_lock(net); |
@@ -377,32 +346,120 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) | |||
377 | icmp_param->data.icmph.checksum = 0; | 346 | icmp_param->data.icmph.checksum = 0; |
378 | 347 | ||
379 | inet->tos = ip_hdr(skb)->tos; | 348 | inet->tos = ip_hdr(skb)->tos; |
380 | daddr = ipc.addr = rt->rt_src; | 349 | daddr = ipc.addr = ip_hdr(skb)->saddr; |
381 | ipc.opt = NULL; | 350 | ipc.opt = NULL; |
382 | ipc.shtx.flags = 0; | 351 | ipc.tx_flags = 0; |
383 | if (icmp_param->replyopts.optlen) { | 352 | if (icmp_param->replyopts.opt.opt.optlen) { |
384 | ipc.opt = &icmp_param->replyopts; | 353 | ipc.opt = &icmp_param->replyopts.opt; |
385 | if (ipc.opt->srr) | 354 | if (ipc.opt->opt.srr) |
386 | daddr = icmp_param->replyopts.faddr; | 355 | daddr = icmp_param->replyopts.opt.opt.faddr; |
387 | } | 356 | } |
388 | { | 357 | memset(&fl4, 0, sizeof(fl4)); |
389 | struct flowi fl = { .nl_u = { .ip4_u = | 358 | fl4.daddr = daddr; |
390 | { .daddr = daddr, | 359 | fl4.saddr = rt->rt_spec_dst; |
391 | .saddr = rt->rt_spec_dst, | 360 | fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); |
392 | .tos = RT_TOS(ip_hdr(skb)->tos) } }, | 361 | fl4.flowi4_proto = IPPROTO_ICMP; |
393 | .proto = IPPROTO_ICMP }; | 362 | security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); |
394 | security_skb_classify_flow(skb, &fl); | 363 | rt = ip_route_output_key(net, &fl4); |
395 | if (ip_route_output_key(net, &rt, &fl)) | 364 | if (IS_ERR(rt)) |
396 | goto out_unlock; | 365 | goto out_unlock; |
397 | } | 366 | if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type, |
398 | if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type, | ||
399 | icmp_param->data.icmph.code)) | 367 | icmp_param->data.icmph.code)) |
400 | icmp_push_reply(icmp_param, &ipc, &rt); | 368 | icmp_push_reply(icmp_param, &fl4, &ipc, &rt); |
401 | ip_rt_put(rt); | 369 | ip_rt_put(rt); |
402 | out_unlock: | 370 | out_unlock: |
403 | icmp_xmit_unlock(sk); | 371 | icmp_xmit_unlock(sk); |
404 | } | 372 | } |
405 | 373 | ||
374 | static struct rtable *icmp_route_lookup(struct net *net, | ||
375 | struct flowi4 *fl4, | ||
376 | struct sk_buff *skb_in, | ||
377 | const struct iphdr *iph, | ||
378 | __be32 saddr, u8 tos, | ||
379 | int type, int code, | ||
380 | struct icmp_bxm *param) | ||
381 | { | ||
382 | struct rtable *rt, *rt2; | ||
383 | int err; | ||
384 | |||
385 | memset(fl4, 0, sizeof(*fl4)); | ||
386 | fl4->daddr = (param->replyopts.opt.opt.srr ? | ||
387 | param->replyopts.opt.opt.faddr : iph->saddr); | ||
388 | fl4->saddr = saddr; | ||
389 | fl4->flowi4_tos = RT_TOS(tos); | ||
390 | fl4->flowi4_proto = IPPROTO_ICMP; | ||
391 | fl4->fl4_icmp_type = type; | ||
392 | fl4->fl4_icmp_code = code; | ||
393 | security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); | ||
394 | rt = __ip_route_output_key(net, fl4); | ||
395 | if (IS_ERR(rt)) | ||
396 | return rt; | ||
397 | |||
398 | /* No need to clone since we're just using its address. */ | ||
399 | rt2 = rt; | ||
400 | |||
401 | rt = (struct rtable *) xfrm_lookup(net, &rt->dst, | ||
402 | flowi4_to_flowi(fl4), NULL, 0); | ||
403 | if (!IS_ERR(rt)) { | ||
404 | if (rt != rt2) | ||
405 | return rt; | ||
406 | } else if (PTR_ERR(rt) == -EPERM) { | ||
407 | rt = NULL; | ||
408 | } else | ||
409 | return rt; | ||
410 | |||
411 | err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(fl4), AF_INET); | ||
412 | if (err) | ||
413 | goto relookup_failed; | ||
414 | |||
415 | if (inet_addr_type(net, fl4->saddr) == RTN_LOCAL) { | ||
416 | rt2 = __ip_route_output_key(net, fl4); | ||
417 | if (IS_ERR(rt2)) | ||
418 | err = PTR_ERR(rt2); | ||
419 | } else { | ||
420 | struct flowi4 fl4_2 = {}; | ||
421 | unsigned long orefdst; | ||
422 | |||
423 | fl4_2.daddr = fl4->saddr; | ||
424 | rt2 = ip_route_output_key(net, &fl4_2); | ||
425 | if (IS_ERR(rt2)) { | ||
426 | err = PTR_ERR(rt2); | ||
427 | goto relookup_failed; | ||
428 | } | ||
429 | /* Ugh! */ | ||
430 | orefdst = skb_in->_skb_refdst; /* save old refdst */ | ||
431 | err = ip_route_input(skb_in, fl4->daddr, fl4->saddr, | ||
432 | RT_TOS(tos), rt2->dst.dev); | ||
433 | |||
434 | dst_release(&rt2->dst); | ||
435 | rt2 = skb_rtable(skb_in); | ||
436 | skb_in->_skb_refdst = orefdst; /* restore old refdst */ | ||
437 | } | ||
438 | |||
439 | if (err) | ||
440 | goto relookup_failed; | ||
441 | |||
442 | rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst, | ||
443 | flowi4_to_flowi(fl4), NULL, | ||
444 | XFRM_LOOKUP_ICMP); | ||
445 | if (!IS_ERR(rt2)) { | ||
446 | dst_release(&rt->dst); | ||
447 | rt = rt2; | ||
448 | } else if (PTR_ERR(rt2) == -EPERM) { | ||
449 | if (rt) | ||
450 | dst_release(&rt->dst); | ||
451 | return rt2; | ||
452 | } else { | ||
453 | err = PTR_ERR(rt2); | ||
454 | goto relookup_failed; | ||
455 | } | ||
456 | return rt; | ||
457 | |||
458 | relookup_failed: | ||
459 | if (rt) | ||
460 | return rt; | ||
461 | return ERR_PTR(err); | ||
462 | } | ||
406 | 463 | ||
407 | /* | 464 | /* |
408 | * Send an ICMP message in response to a situation | 465 | * Send an ICMP message in response to a situation |
@@ -422,6 +479,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) | |||
422 | struct icmp_bxm icmp_param; | 479 | struct icmp_bxm icmp_param; |
423 | struct rtable *rt = skb_rtable(skb_in); | 480 | struct rtable *rt = skb_rtable(skb_in); |
424 | struct ipcm_cookie ipc; | 481 | struct ipcm_cookie ipc; |
482 | struct flowi4 fl4; | ||
425 | __be32 saddr; | 483 | __be32 saddr; |
426 | u8 tos; | 484 | u8 tos; |
427 | struct net *net; | 485 | struct net *net; |
@@ -506,9 +564,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) | |||
506 | struct net_device *dev = NULL; | 564 | struct net_device *dev = NULL; |
507 | 565 | ||
508 | rcu_read_lock(); | 566 | rcu_read_lock(); |
509 | if (rt->fl.iif && | 567 | if (rt_is_input_route(rt) && |
510 | net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) | 568 | net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) |
511 | dev = dev_get_by_index_rcu(net, rt->fl.iif); | 569 | dev = dev_get_by_index_rcu(net, rt->rt_iif); |
512 | 570 | ||
513 | if (dev) | 571 | if (dev) |
514 | saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); | 572 | saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); |
@@ -521,7 +579,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) | |||
521 | IPTOS_PREC_INTERNETCONTROL) : | 579 | IPTOS_PREC_INTERNETCONTROL) : |
522 | iph->tos; | 580 | iph->tos; |
523 | 581 | ||
524 | if (ip_options_echo(&icmp_param.replyopts, skb_in)) | 582 | if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in)) |
525 | goto out_unlock; | 583 | goto out_unlock; |
526 | 584 | ||
527 | 585 | ||
@@ -537,96 +595,15 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) | |||
537 | icmp_param.offset = skb_network_offset(skb_in); | 595 | icmp_param.offset = skb_network_offset(skb_in); |
538 | inet_sk(sk)->tos = tos; | 596 | inet_sk(sk)->tos = tos; |
539 | ipc.addr = iph->saddr; | 597 | ipc.addr = iph->saddr; |
540 | ipc.opt = &icmp_param.replyopts; | 598 | ipc.opt = &icmp_param.replyopts.opt; |
541 | ipc.shtx.flags = 0; | 599 | ipc.tx_flags = 0; |
542 | |||
543 | { | ||
544 | struct flowi fl = { | ||
545 | .nl_u = { | ||
546 | .ip4_u = { | ||
547 | .daddr = icmp_param.replyopts.srr ? | ||
548 | icmp_param.replyopts.faddr : | ||
549 | iph->saddr, | ||
550 | .saddr = saddr, | ||
551 | .tos = RT_TOS(tos) | ||
552 | } | ||
553 | }, | ||
554 | .proto = IPPROTO_ICMP, | ||
555 | .uli_u = { | ||
556 | .icmpt = { | ||
557 | .type = type, | ||
558 | .code = code | ||
559 | } | ||
560 | } | ||
561 | }; | ||
562 | int err; | ||
563 | struct rtable *rt2; | ||
564 | |||
565 | security_skb_classify_flow(skb_in, &fl); | ||
566 | if (__ip_route_output_key(net, &rt, &fl)) | ||
567 | goto out_unlock; | ||
568 | |||
569 | /* No need to clone since we're just using its address. */ | ||
570 | rt2 = rt; | ||
571 | |||
572 | err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0); | ||
573 | switch (err) { | ||
574 | case 0: | ||
575 | if (rt != rt2) | ||
576 | goto route_done; | ||
577 | break; | ||
578 | case -EPERM: | ||
579 | rt = NULL; | ||
580 | break; | ||
581 | default: | ||
582 | goto out_unlock; | ||
583 | } | ||
584 | |||
585 | if (xfrm_decode_session_reverse(skb_in, &fl, AF_INET)) | ||
586 | goto relookup_failed; | ||
587 | |||
588 | if (inet_addr_type(net, fl.fl4_src) == RTN_LOCAL) | ||
589 | err = __ip_route_output_key(net, &rt2, &fl); | ||
590 | else { | ||
591 | struct flowi fl2 = {}; | ||
592 | unsigned long orefdst; | ||
593 | 600 | ||
594 | fl2.fl4_dst = fl.fl4_src; | 601 | rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, |
595 | if (ip_route_output_key(net, &rt2, &fl2)) | 602 | type, code, &icmp_param); |
596 | goto relookup_failed; | 603 | if (IS_ERR(rt)) |
597 | 604 | goto out_unlock; | |
598 | /* Ugh! */ | ||
599 | orefdst = skb_in->_skb_refdst; /* save old refdst */ | ||
600 | err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src, | ||
601 | RT_TOS(tos), rt2->dst.dev); | ||
602 | |||
603 | dst_release(&rt2->dst); | ||
604 | rt2 = skb_rtable(skb_in); | ||
605 | skb_in->_skb_refdst = orefdst; /* restore old refdst */ | ||
606 | } | ||
607 | |||
608 | if (err) | ||
609 | goto relookup_failed; | ||
610 | |||
611 | err = xfrm_lookup(net, (struct dst_entry **)&rt2, &fl, NULL, | ||
612 | XFRM_LOOKUP_ICMP); | ||
613 | switch (err) { | ||
614 | case 0: | ||
615 | dst_release(&rt->dst); | ||
616 | rt = rt2; | ||
617 | break; | ||
618 | case -EPERM: | ||
619 | goto ende; | ||
620 | default: | ||
621 | relookup_failed: | ||
622 | if (!rt) | ||
623 | goto out_unlock; | ||
624 | break; | ||
625 | } | ||
626 | } | ||
627 | 605 | ||
628 | route_done: | 606 | if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code)) |
629 | if (!icmpv4_xrlim_allow(net, rt, type, code)) | ||
630 | goto ende; | 607 | goto ende; |
631 | 608 | ||
632 | /* RFC says return as much as we can without exceeding 576 bytes. */ | 609 | /* RFC says return as much as we can without exceeding 576 bytes. */ |
@@ -634,7 +611,7 @@ route_done: | |||
634 | room = dst_mtu(&rt->dst); | 611 | room = dst_mtu(&rt->dst); |
635 | if (room > 576) | 612 | if (room > 576) |
636 | room = 576; | 613 | room = 576; |
637 | room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen; | 614 | room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen; |
638 | room -= sizeof(struct icmphdr); | 615 | room -= sizeof(struct icmphdr); |
639 | 616 | ||
640 | icmp_param.data_len = skb_in->len - icmp_param.offset; | 617 | icmp_param.data_len = skb_in->len - icmp_param.offset; |
@@ -642,7 +619,7 @@ route_done: | |||
642 | icmp_param.data_len = room; | 619 | icmp_param.data_len = room; |
643 | icmp_param.head_len = sizeof(struct icmphdr); | 620 | icmp_param.head_len = sizeof(struct icmphdr); |
644 | 621 | ||
645 | icmp_push_reply(&icmp_param, &ipc, &rt); | 622 | icmp_push_reply(&icmp_param, &fl4, &ipc, &rt); |
646 | ende: | 623 | ende: |
647 | ip_rt_put(rt); | 624 | ip_rt_put(rt); |
648 | out_unlock: | 625 | out_unlock: |
@@ -658,7 +635,7 @@ EXPORT_SYMBOL(icmp_send); | |||
658 | 635 | ||
659 | static void icmp_unreach(struct sk_buff *skb) | 636 | static void icmp_unreach(struct sk_buff *skb) |
660 | { | 637 | { |
661 | struct iphdr *iph; | 638 | const struct iphdr *iph; |
662 | struct icmphdr *icmph; | 639 | struct icmphdr *icmph; |
663 | int hash, protocol; | 640 | int hash, protocol; |
664 | const struct net_protocol *ipprot; | 641 | const struct net_protocol *ipprot; |
@@ -677,7 +654,7 @@ static void icmp_unreach(struct sk_buff *skb) | |||
677 | goto out_err; | 654 | goto out_err; |
678 | 655 | ||
679 | icmph = icmp_hdr(skb); | 656 | icmph = icmp_hdr(skb); |
680 | iph = (struct iphdr *)skb->data; | 657 | iph = (const struct iphdr *)skb->data; |
681 | 658 | ||
682 | if (iph->ihl < 5) /* Mangled header, drop. */ | 659 | if (iph->ihl < 5) /* Mangled header, drop. */ |
683 | goto out_err; | 660 | goto out_err; |
@@ -725,7 +702,7 @@ static void icmp_unreach(struct sk_buff *skb) | |||
725 | */ | 702 | */ |
726 | 703 | ||
727 | /* | 704 | /* |
728 | * Check the other end isnt violating RFC 1122. Some routers send | 705 | * Check the other end isn't violating RFC 1122. Some routers send |
729 | * bogus responses to broadcast frames. If you see this message | 706 | * bogus responses to broadcast frames. If you see this message |
730 | * first check your netmask matches at both ends, if it does then | 707 | * first check your netmask matches at both ends, if it does then |
731 | * get the other vendor to fix their kit. | 708 | * get the other vendor to fix their kit. |
@@ -750,7 +727,7 @@ static void icmp_unreach(struct sk_buff *skb) | |||
750 | if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) | 727 | if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) |
751 | goto out; | 728 | goto out; |
752 | 729 | ||
753 | iph = (struct iphdr *)skb->data; | 730 | iph = (const struct iphdr *)skb->data; |
754 | protocol = iph->protocol; | 731 | protocol = iph->protocol; |
755 | 732 | ||
756 | /* | 733 | /* |
@@ -779,7 +756,7 @@ out_err: | |||
779 | 756 | ||
780 | static void icmp_redirect(struct sk_buff *skb) | 757 | static void icmp_redirect(struct sk_buff *skb) |
781 | { | 758 | { |
782 | struct iphdr *iph; | 759 | const struct iphdr *iph; |
783 | 760 | ||
784 | if (skb->len < sizeof(struct iphdr)) | 761 | if (skb->len < sizeof(struct iphdr)) |
785 | goto out_err; | 762 | goto out_err; |
@@ -790,7 +767,7 @@ static void icmp_redirect(struct sk_buff *skb) | |||
790 | if (!pskb_may_pull(skb, sizeof(struct iphdr))) | 767 | if (!pskb_may_pull(skb, sizeof(struct iphdr))) |
791 | goto out; | 768 | goto out; |
792 | 769 | ||
793 | iph = (struct iphdr *)skb->data; | 770 | iph = (const struct iphdr *)skb->data; |
794 | 771 | ||
795 | switch (icmp_hdr(skb)->code & 7) { | 772 | switch (icmp_hdr(skb)->code & 7) { |
796 | case ICMP_REDIR_NET: | 773 | case ICMP_REDIR_NET: |
@@ -805,6 +782,15 @@ static void icmp_redirect(struct sk_buff *skb) | |||
805 | iph->saddr, skb->dev); | 782 | iph->saddr, skb->dev); |
806 | break; | 783 | break; |
807 | } | 784 | } |
785 | |||
786 | /* Ping wants to see redirects. | ||
787 | * Let's pretend they are errors of sorts... */ | ||
788 | if (iph->protocol == IPPROTO_ICMP && | ||
789 | iph->ihl >= 5 && | ||
790 | pskb_may_pull(skb, (iph->ihl<<2)+8)) { | ||
791 | ping_err(skb, icmp_hdr(skb)->un.gateway); | ||
792 | } | ||
793 | |||
808 | out: | 794 | out: |
809 | return; | 795 | return; |
810 | out_err: | 796 | out_err: |
@@ -954,12 +940,12 @@ static void icmp_address_reply(struct sk_buff *skb) | |||
954 | BUG_ON(mp == NULL); | 940 | BUG_ON(mp == NULL); |
955 | for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { | 941 | for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { |
956 | if (*mp == ifa->ifa_mask && | 942 | if (*mp == ifa->ifa_mask && |
957 | inet_ifa_match(rt->rt_src, ifa)) | 943 | inet_ifa_match(ip_hdr(skb)->saddr, ifa)) |
958 | break; | 944 | break; |
959 | } | 945 | } |
960 | if (!ifa && net_ratelimit()) { | 946 | if (!ifa && net_ratelimit()) { |
961 | printk(KERN_INFO "Wrong address mask %pI4 from %s/%pI4\n", | 947 | printk(KERN_INFO "Wrong address mask %pI4 from %s/%pI4\n", |
962 | mp, dev->name, &rt->rt_src); | 948 | mp, dev->name, &ip_hdr(skb)->saddr); |
963 | } | 949 | } |
964 | } | 950 | } |
965 | } | 951 | } |
@@ -1065,7 +1051,7 @@ error: | |||
1065 | */ | 1051 | */ |
1066 | static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { | 1052 | static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { |
1067 | [ICMP_ECHOREPLY] = { | 1053 | [ICMP_ECHOREPLY] = { |
1068 | .handler = icmp_discard, | 1054 | .handler = ping_rcv, |
1069 | }, | 1055 | }, |
1070 | [1] = { | 1056 | [1] = { |
1071 | .handler = icmp_discard, | 1057 | .handler = icmp_discard, |
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 2a4bb76f2132..f1d27f6c9351 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c | |||
@@ -153,17 +153,27 @@ static void ip_ma_put(struct ip_mc_list *im) | |||
153 | { | 153 | { |
154 | if (atomic_dec_and_test(&im->refcnt)) { | 154 | if (atomic_dec_and_test(&im->refcnt)) { |
155 | in_dev_put(im->interface); | 155 | in_dev_put(im->interface); |
156 | kfree(im); | 156 | kfree_rcu(im, rcu); |
157 | } | 157 | } |
158 | } | 158 | } |
159 | 159 | ||
160 | #define for_each_pmc_rcu(in_dev, pmc) \ | ||
161 | for (pmc = rcu_dereference(in_dev->mc_list); \ | ||
162 | pmc != NULL; \ | ||
163 | pmc = rcu_dereference(pmc->next_rcu)) | ||
164 | |||
165 | #define for_each_pmc_rtnl(in_dev, pmc) \ | ||
166 | for (pmc = rtnl_dereference(in_dev->mc_list); \ | ||
167 | pmc != NULL; \ | ||
168 | pmc = rtnl_dereference(pmc->next_rcu)) | ||
169 | |||
160 | #ifdef CONFIG_IP_MULTICAST | 170 | #ifdef CONFIG_IP_MULTICAST |
161 | 171 | ||
162 | /* | 172 | /* |
163 | * Timer management | 173 | * Timer management |
164 | */ | 174 | */ |
165 | 175 | ||
166 | static __inline__ void igmp_stop_timer(struct ip_mc_list *im) | 176 | static void igmp_stop_timer(struct ip_mc_list *im) |
167 | { | 177 | { |
168 | spin_lock_bh(&im->lock); | 178 | spin_lock_bh(&im->lock); |
169 | if (del_timer(&im->timer)) | 179 | if (del_timer(&im->timer)) |
@@ -284,6 +294,8 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) | |||
284 | return scount; | 294 | return scount; |
285 | } | 295 | } |
286 | 296 | ||
297 | #define igmp_skb_size(skb) (*(unsigned int *)((skb)->cb)) | ||
298 | |||
287 | static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) | 299 | static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) |
288 | { | 300 | { |
289 | struct sk_buff *skb; | 301 | struct sk_buff *skb; |
@@ -291,24 +303,24 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) | |||
291 | struct iphdr *pip; | 303 | struct iphdr *pip; |
292 | struct igmpv3_report *pig; | 304 | struct igmpv3_report *pig; |
293 | struct net *net = dev_net(dev); | 305 | struct net *net = dev_net(dev); |
306 | struct flowi4 fl4; | ||
294 | 307 | ||
295 | skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); | 308 | while (1) { |
296 | if (skb == NULL) | 309 | skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev), |
297 | return NULL; | 310 | GFP_ATOMIC | __GFP_NOWARN); |
298 | 311 | if (skb) | |
299 | { | 312 | break; |
300 | struct flowi fl = { .oif = dev->ifindex, | 313 | size >>= 1; |
301 | .nl_u = { .ip4_u = { | 314 | if (size < 256) |
302 | .daddr = IGMPV3_ALL_MCR } }, | ||
303 | .proto = IPPROTO_IGMP }; | ||
304 | if (ip_route_output_key(net, &rt, &fl)) { | ||
305 | kfree_skb(skb); | ||
306 | return NULL; | 315 | return NULL; |
307 | } | ||
308 | } | 316 | } |
309 | if (rt->rt_src == 0) { | 317 | igmp_skb_size(skb) = size; |
318 | |||
319 | rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, | ||
320 | 0, 0, | ||
321 | IPPROTO_IGMP, 0, dev->ifindex); | ||
322 | if (IS_ERR(rt)) { | ||
310 | kfree_skb(skb); | 323 | kfree_skb(skb); |
311 | ip_rt_put(rt); | ||
312 | return NULL; | 324 | return NULL; |
313 | } | 325 | } |
314 | 326 | ||
@@ -326,8 +338,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) | |||
326 | pip->tos = 0xc0; | 338 | pip->tos = 0xc0; |
327 | pip->frag_off = htons(IP_DF); | 339 | pip->frag_off = htons(IP_DF); |
328 | pip->ttl = 1; | 340 | pip->ttl = 1; |
329 | pip->daddr = rt->rt_dst; | 341 | pip->daddr = fl4.daddr; |
330 | pip->saddr = rt->rt_src; | 342 | pip->saddr = fl4.saddr; |
331 | pip->protocol = IPPROTO_IGMP; | 343 | pip->protocol = IPPROTO_IGMP; |
332 | pip->tot_len = 0; /* filled in later */ | 344 | pip->tot_len = 0; /* filled in later */ |
333 | ip_select_ident(pip, &rt->dst, NULL); | 345 | ip_select_ident(pip, &rt->dst, NULL); |
@@ -384,7 +396,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, | |||
384 | return skb; | 396 | return skb; |
385 | } | 397 | } |
386 | 398 | ||
387 | #define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \ | 399 | #define AVAILABLE(skb) ((skb) ? ((skb)->dev ? igmp_skb_size(skb) - (skb)->len : \ |
388 | skb_tailroom(skb)) : 0) | 400 | skb_tailroom(skb)) : 0) |
389 | 401 | ||
390 | static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, | 402 | static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, |
@@ -502,8 +514,8 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) | |||
502 | int type; | 514 | int type; |
503 | 515 | ||
504 | if (!pmc) { | 516 | if (!pmc) { |
505 | read_lock(&in_dev->mc_list_lock); | 517 | rcu_read_lock(); |
506 | for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { | 518 | for_each_pmc_rcu(in_dev, pmc) { |
507 | if (pmc->multiaddr == IGMP_ALL_HOSTS) | 519 | if (pmc->multiaddr == IGMP_ALL_HOSTS) |
508 | continue; | 520 | continue; |
509 | spin_lock_bh(&pmc->lock); | 521 | spin_lock_bh(&pmc->lock); |
@@ -514,7 +526,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) | |||
514 | skb = add_grec(skb, pmc, type, 0, 0); | 526 | skb = add_grec(skb, pmc, type, 0, 0); |
515 | spin_unlock_bh(&pmc->lock); | 527 | spin_unlock_bh(&pmc->lock); |
516 | } | 528 | } |
517 | read_unlock(&in_dev->mc_list_lock); | 529 | rcu_read_unlock(); |
518 | } else { | 530 | } else { |
519 | spin_lock_bh(&pmc->lock); | 531 | spin_lock_bh(&pmc->lock); |
520 | if (pmc->sfcount[MCAST_EXCLUDE]) | 532 | if (pmc->sfcount[MCAST_EXCLUDE]) |
@@ -556,7 +568,7 @@ static void igmpv3_send_cr(struct in_device *in_dev) | |||
556 | struct sk_buff *skb = NULL; | 568 | struct sk_buff *skb = NULL; |
557 | int type, dtype; | 569 | int type, dtype; |
558 | 570 | ||
559 | read_lock(&in_dev->mc_list_lock); | 571 | rcu_read_lock(); |
560 | spin_lock_bh(&in_dev->mc_tomb_lock); | 572 | spin_lock_bh(&in_dev->mc_tomb_lock); |
561 | 573 | ||
562 | /* deleted MCA's */ | 574 | /* deleted MCA's */ |
@@ -593,7 +605,7 @@ static void igmpv3_send_cr(struct in_device *in_dev) | |||
593 | spin_unlock_bh(&in_dev->mc_tomb_lock); | 605 | spin_unlock_bh(&in_dev->mc_tomb_lock); |
594 | 606 | ||
595 | /* change recs */ | 607 | /* change recs */ |
596 | for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { | 608 | for_each_pmc_rcu(in_dev, pmc) { |
597 | spin_lock_bh(&pmc->lock); | 609 | spin_lock_bh(&pmc->lock); |
598 | if (pmc->sfcount[MCAST_EXCLUDE]) { | 610 | if (pmc->sfcount[MCAST_EXCLUDE]) { |
599 | type = IGMPV3_BLOCK_OLD_SOURCES; | 611 | type = IGMPV3_BLOCK_OLD_SOURCES; |
@@ -616,7 +628,7 @@ static void igmpv3_send_cr(struct in_device *in_dev) | |||
616 | } | 628 | } |
617 | spin_unlock_bh(&pmc->lock); | 629 | spin_unlock_bh(&pmc->lock); |
618 | } | 630 | } |
619 | read_unlock(&in_dev->mc_list_lock); | 631 | rcu_read_unlock(); |
620 | 632 | ||
621 | if (!skb) | 633 | if (!skb) |
622 | return; | 634 | return; |
@@ -633,6 +645,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, | |||
633 | struct net_device *dev = in_dev->dev; | 645 | struct net_device *dev = in_dev->dev; |
634 | struct net *net = dev_net(dev); | 646 | struct net *net = dev_net(dev); |
635 | __be32 group = pmc ? pmc->multiaddr : 0; | 647 | __be32 group = pmc ? pmc->multiaddr : 0; |
648 | struct flowi4 fl4; | ||
636 | __be32 dst; | 649 | __be32 dst; |
637 | 650 | ||
638 | if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) | 651 | if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) |
@@ -642,17 +655,11 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, | |||
642 | else | 655 | else |
643 | dst = group; | 656 | dst = group; |
644 | 657 | ||
645 | { | 658 | rt = ip_route_output_ports(net, &fl4, NULL, dst, 0, |
646 | struct flowi fl = { .oif = dev->ifindex, | 659 | 0, 0, |
647 | .nl_u = { .ip4_u = { .daddr = dst } }, | 660 | IPPROTO_IGMP, 0, dev->ifindex); |
648 | .proto = IPPROTO_IGMP }; | 661 | if (IS_ERR(rt)) |
649 | if (ip_route_output_key(net, &rt, &fl)) | ||
650 | return -1; | ||
651 | } | ||
652 | if (rt->rt_src == 0) { | ||
653 | ip_rt_put(rt); | ||
654 | return -1; | 662 | return -1; |
655 | } | ||
656 | 663 | ||
657 | skb = alloc_skb(IGMP_SIZE+LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); | 664 | skb = alloc_skb(IGMP_SIZE+LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); |
658 | if (skb == NULL) { | 665 | if (skb == NULL) { |
@@ -674,7 +681,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, | |||
674 | iph->frag_off = htons(IP_DF); | 681 | iph->frag_off = htons(IP_DF); |
675 | iph->ttl = 1; | 682 | iph->ttl = 1; |
676 | iph->daddr = dst; | 683 | iph->daddr = dst; |
677 | iph->saddr = rt->rt_src; | 684 | iph->saddr = fl4.saddr; |
678 | iph->protocol = IPPROTO_IGMP; | 685 | iph->protocol = IPPROTO_IGMP; |
679 | ip_select_ident(iph, &rt->dst, NULL); | 686 | ip_select_ident(iph, &rt->dst, NULL); |
680 | ((u8*)&iph[1])[0] = IPOPT_RA; | 687 | ((u8*)&iph[1])[0] = IPOPT_RA; |
@@ -813,14 +820,14 @@ static void igmp_heard_report(struct in_device *in_dev, __be32 group) | |||
813 | if (group == IGMP_ALL_HOSTS) | 820 | if (group == IGMP_ALL_HOSTS) |
814 | return; | 821 | return; |
815 | 822 | ||
816 | read_lock(&in_dev->mc_list_lock); | 823 | rcu_read_lock(); |
817 | for (im=in_dev->mc_list; im!=NULL; im=im->next) { | 824 | for_each_pmc_rcu(in_dev, im) { |
818 | if (im->multiaddr == group) { | 825 | if (im->multiaddr == group) { |
819 | igmp_stop_timer(im); | 826 | igmp_stop_timer(im); |
820 | break; | 827 | break; |
821 | } | 828 | } |
822 | } | 829 | } |
823 | read_unlock(&in_dev->mc_list_lock); | 830 | rcu_read_unlock(); |
824 | } | 831 | } |
825 | 832 | ||
826 | static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, | 833 | static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, |
@@ -906,8 +913,8 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, | |||
906 | * - Use the igmp->igmp_code field as the maximum | 913 | * - Use the igmp->igmp_code field as the maximum |
907 | * delay possible | 914 | * delay possible |
908 | */ | 915 | */ |
909 | read_lock(&in_dev->mc_list_lock); | 916 | rcu_read_lock(); |
910 | for (im=in_dev->mc_list; im!=NULL; im=im->next) { | 917 | for_each_pmc_rcu(in_dev, im) { |
911 | int changed; | 918 | int changed; |
912 | 919 | ||
913 | if (group && group != im->multiaddr) | 920 | if (group && group != im->multiaddr) |
@@ -925,7 +932,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, | |||
925 | if (changed) | 932 | if (changed) |
926 | igmp_mod_timer(im, max_delay); | 933 | igmp_mod_timer(im, max_delay); |
927 | } | 934 | } |
928 | read_unlock(&in_dev->mc_list_lock); | 935 | rcu_read_unlock(); |
929 | } | 936 | } |
930 | 937 | ||
931 | /* called in rcu_read_lock() section */ | 938 | /* called in rcu_read_lock() section */ |
@@ -961,7 +968,7 @@ int igmp_rcv(struct sk_buff *skb) | |||
961 | case IGMP_HOST_MEMBERSHIP_REPORT: | 968 | case IGMP_HOST_MEMBERSHIP_REPORT: |
962 | case IGMPV2_HOST_MEMBERSHIP_REPORT: | 969 | case IGMPV2_HOST_MEMBERSHIP_REPORT: |
963 | /* Is it our report looped back? */ | 970 | /* Is it our report looped back? */ |
964 | if (skb_rtable(skb)->fl.iif == 0) | 971 | if (rt_is_output_route(skb_rtable(skb))) |
965 | break; | 972 | break; |
966 | /* don't rely on MC router hearing unicast reports */ | 973 | /* don't rely on MC router hearing unicast reports */ |
967 | if (skb->pkt_type == PACKET_MULTICAST || | 974 | if (skb->pkt_type == PACKET_MULTICAST || |
@@ -1110,8 +1117,8 @@ static void igmpv3_clear_delrec(struct in_device *in_dev) | |||
1110 | kfree(pmc); | 1117 | kfree(pmc); |
1111 | } | 1118 | } |
1112 | /* clear dead sources, too */ | 1119 | /* clear dead sources, too */ |
1113 | read_lock(&in_dev->mc_list_lock); | 1120 | rcu_read_lock(); |
1114 | for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { | 1121 | for_each_pmc_rcu(in_dev, pmc) { |
1115 | struct ip_sf_list *psf, *psf_next; | 1122 | struct ip_sf_list *psf, *psf_next; |
1116 | 1123 | ||
1117 | spin_lock_bh(&pmc->lock); | 1124 | spin_lock_bh(&pmc->lock); |
@@ -1123,7 +1130,7 @@ static void igmpv3_clear_delrec(struct in_device *in_dev) | |||
1123 | kfree(psf); | 1130 | kfree(psf); |
1124 | } | 1131 | } |
1125 | } | 1132 | } |
1126 | read_unlock(&in_dev->mc_list_lock); | 1133 | rcu_read_unlock(); |
1127 | } | 1134 | } |
1128 | #endif | 1135 | #endif |
1129 | 1136 | ||
@@ -1148,20 +1155,18 @@ static void igmp_group_dropped(struct ip_mc_list *im) | |||
1148 | 1155 | ||
1149 | if (!in_dev->dead) { | 1156 | if (!in_dev->dead) { |
1150 | if (IGMP_V1_SEEN(in_dev)) | 1157 | if (IGMP_V1_SEEN(in_dev)) |
1151 | goto done; | 1158 | return; |
1152 | if (IGMP_V2_SEEN(in_dev)) { | 1159 | if (IGMP_V2_SEEN(in_dev)) { |
1153 | if (reporter) | 1160 | if (reporter) |
1154 | igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE); | 1161 | igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE); |
1155 | goto done; | 1162 | return; |
1156 | } | 1163 | } |
1157 | /* IGMPv3 */ | 1164 | /* IGMPv3 */ |
1158 | igmpv3_add_delrec(in_dev, im); | 1165 | igmpv3_add_delrec(in_dev, im); |
1159 | 1166 | ||
1160 | igmp_ifc_event(in_dev); | 1167 | igmp_ifc_event(in_dev); |
1161 | } | 1168 | } |
1162 | done: | ||
1163 | #endif | 1169 | #endif |
1164 | ip_mc_clear_src(im); | ||
1165 | } | 1170 | } |
1166 | 1171 | ||
1167 | static void igmp_group_added(struct ip_mc_list *im) | 1172 | static void igmp_group_added(struct ip_mc_list *im) |
@@ -1209,7 +1214,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) | |||
1209 | 1214 | ||
1210 | ASSERT_RTNL(); | 1215 | ASSERT_RTNL(); |
1211 | 1216 | ||
1212 | for (im=in_dev->mc_list; im; im=im->next) { | 1217 | for_each_pmc_rtnl(in_dev, im) { |
1213 | if (im->multiaddr == addr) { | 1218 | if (im->multiaddr == addr) { |
1214 | im->users++; | 1219 | im->users++; |
1215 | ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0); | 1220 | ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0); |
@@ -1217,7 +1222,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) | |||
1217 | } | 1222 | } |
1218 | } | 1223 | } |
1219 | 1224 | ||
1220 | im = kmalloc(sizeof(*im), GFP_KERNEL); | 1225 | im = kzalloc(sizeof(*im), GFP_KERNEL); |
1221 | if (!im) | 1226 | if (!im) |
1222 | goto out; | 1227 | goto out; |
1223 | 1228 | ||
@@ -1227,26 +1232,18 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) | |||
1227 | im->multiaddr = addr; | 1232 | im->multiaddr = addr; |
1228 | /* initial mode is (EX, empty) */ | 1233 | /* initial mode is (EX, empty) */ |
1229 | im->sfmode = MCAST_EXCLUDE; | 1234 | im->sfmode = MCAST_EXCLUDE; |
1230 | im->sfcount[MCAST_INCLUDE] = 0; | ||
1231 | im->sfcount[MCAST_EXCLUDE] = 1; | 1235 | im->sfcount[MCAST_EXCLUDE] = 1; |
1232 | im->sources = NULL; | ||
1233 | im->tomb = NULL; | ||
1234 | im->crcount = 0; | ||
1235 | atomic_set(&im->refcnt, 1); | 1236 | atomic_set(&im->refcnt, 1); |
1236 | spin_lock_init(&im->lock); | 1237 | spin_lock_init(&im->lock); |
1237 | #ifdef CONFIG_IP_MULTICAST | 1238 | #ifdef CONFIG_IP_MULTICAST |
1238 | im->tm_running = 0; | ||
1239 | setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im); | 1239 | setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im); |
1240 | im->unsolicit_count = IGMP_Unsolicited_Report_Count; | 1240 | im->unsolicit_count = IGMP_Unsolicited_Report_Count; |
1241 | im->reporter = 0; | ||
1242 | im->gsquery = 0; | ||
1243 | #endif | 1241 | #endif |
1244 | im->loaded = 0; | 1242 | |
1245 | write_lock_bh(&in_dev->mc_list_lock); | 1243 | im->next_rcu = in_dev->mc_list; |
1246 | im->next = in_dev->mc_list; | ||
1247 | in_dev->mc_list = im; | ||
1248 | in_dev->mc_count++; | 1244 | in_dev->mc_count++; |
1249 | write_unlock_bh(&in_dev->mc_list_lock); | 1245 | rcu_assign_pointer(in_dev->mc_list, im); |
1246 | |||
1250 | #ifdef CONFIG_IP_MULTICAST | 1247 | #ifdef CONFIG_IP_MULTICAST |
1251 | igmpv3_del_delrec(in_dev, im->multiaddr); | 1248 | igmpv3_del_delrec(in_dev, im->multiaddr); |
1252 | #endif | 1249 | #endif |
@@ -1260,26 +1257,32 @@ EXPORT_SYMBOL(ip_mc_inc_group); | |||
1260 | 1257 | ||
1261 | /* | 1258 | /* |
1262 | * Resend IGMP JOIN report; used for bonding. | 1259 | * Resend IGMP JOIN report; used for bonding. |
1260 | * Called with rcu_read_lock() | ||
1263 | */ | 1261 | */ |
1264 | void ip_mc_rejoin_group(struct ip_mc_list *im) | 1262 | void ip_mc_rejoin_groups(struct in_device *in_dev) |
1265 | { | 1263 | { |
1266 | #ifdef CONFIG_IP_MULTICAST | 1264 | #ifdef CONFIG_IP_MULTICAST |
1267 | struct in_device *in_dev = im->interface; | 1265 | struct ip_mc_list *im; |
1266 | int type; | ||
1268 | 1267 | ||
1269 | if (im->multiaddr == IGMP_ALL_HOSTS) | 1268 | for_each_pmc_rcu(in_dev, im) { |
1270 | return; | 1269 | if (im->multiaddr == IGMP_ALL_HOSTS) |
1270 | continue; | ||
1271 | 1271 | ||
1272 | if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { | 1272 | /* a failover is happening and switches |
1273 | igmp_mod_timer(im, IGMP_Initial_Report_Delay); | 1273 | * must be notified immediately |
1274 | return; | 1274 | */ |
1275 | if (IGMP_V1_SEEN(in_dev)) | ||
1276 | type = IGMP_HOST_MEMBERSHIP_REPORT; | ||
1277 | else if (IGMP_V2_SEEN(in_dev)) | ||
1278 | type = IGMPV2_HOST_MEMBERSHIP_REPORT; | ||
1279 | else | ||
1280 | type = IGMPV3_HOST_MEMBERSHIP_REPORT; | ||
1281 | igmp_send_report(in_dev, im, type); | ||
1275 | } | 1282 | } |
1276 | /* else, v3 */ | ||
1277 | im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : | ||
1278 | IGMP_Unsolicited_Report_Count; | ||
1279 | igmp_ifc_event(in_dev); | ||
1280 | #endif | 1283 | #endif |
1281 | } | 1284 | } |
1282 | EXPORT_SYMBOL(ip_mc_rejoin_group); | 1285 | EXPORT_SYMBOL(ip_mc_rejoin_groups); |
1283 | 1286 | ||
1284 | /* | 1287 | /* |
1285 | * A socket has left a multicast group on device dev | 1288 | * A socket has left a multicast group on device dev |
@@ -1287,18 +1290,20 @@ EXPORT_SYMBOL(ip_mc_rejoin_group); | |||
1287 | 1290 | ||
1288 | void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) | 1291 | void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) |
1289 | { | 1292 | { |
1290 | struct ip_mc_list *i, **ip; | 1293 | struct ip_mc_list *i; |
1294 | struct ip_mc_list __rcu **ip; | ||
1291 | 1295 | ||
1292 | ASSERT_RTNL(); | 1296 | ASSERT_RTNL(); |
1293 | 1297 | ||
1294 | for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) { | 1298 | for (ip = &in_dev->mc_list; |
1299 | (i = rtnl_dereference(*ip)) != NULL; | ||
1300 | ip = &i->next_rcu) { | ||
1295 | if (i->multiaddr == addr) { | 1301 | if (i->multiaddr == addr) { |
1296 | if (--i->users == 0) { | 1302 | if (--i->users == 0) { |
1297 | write_lock_bh(&in_dev->mc_list_lock); | 1303 | *ip = i->next_rcu; |
1298 | *ip = i->next; | ||
1299 | in_dev->mc_count--; | 1304 | in_dev->mc_count--; |
1300 | write_unlock_bh(&in_dev->mc_list_lock); | ||
1301 | igmp_group_dropped(i); | 1305 | igmp_group_dropped(i); |
1306 | ip_mc_clear_src(i); | ||
1302 | 1307 | ||
1303 | if (!in_dev->dead) | 1308 | if (!in_dev->dead) |
1304 | ip_rt_multicast_event(in_dev); | 1309 | ip_rt_multicast_event(in_dev); |
@@ -1316,34 +1321,34 @@ EXPORT_SYMBOL(ip_mc_dec_group); | |||
1316 | 1321 | ||
1317 | void ip_mc_unmap(struct in_device *in_dev) | 1322 | void ip_mc_unmap(struct in_device *in_dev) |
1318 | { | 1323 | { |
1319 | struct ip_mc_list *i; | 1324 | struct ip_mc_list *pmc; |
1320 | 1325 | ||
1321 | ASSERT_RTNL(); | 1326 | ASSERT_RTNL(); |
1322 | 1327 | ||
1323 | for (i = in_dev->mc_list; i; i = i->next) | 1328 | for_each_pmc_rtnl(in_dev, pmc) |
1324 | igmp_group_dropped(i); | 1329 | igmp_group_dropped(pmc); |
1325 | } | 1330 | } |
1326 | 1331 | ||
1327 | void ip_mc_remap(struct in_device *in_dev) | 1332 | void ip_mc_remap(struct in_device *in_dev) |
1328 | { | 1333 | { |
1329 | struct ip_mc_list *i; | 1334 | struct ip_mc_list *pmc; |
1330 | 1335 | ||
1331 | ASSERT_RTNL(); | 1336 | ASSERT_RTNL(); |
1332 | 1337 | ||
1333 | for (i = in_dev->mc_list; i; i = i->next) | 1338 | for_each_pmc_rtnl(in_dev, pmc) |
1334 | igmp_group_added(i); | 1339 | igmp_group_added(pmc); |
1335 | } | 1340 | } |
1336 | 1341 | ||
1337 | /* Device going down */ | 1342 | /* Device going down */ |
1338 | 1343 | ||
1339 | void ip_mc_down(struct in_device *in_dev) | 1344 | void ip_mc_down(struct in_device *in_dev) |
1340 | { | 1345 | { |
1341 | struct ip_mc_list *i; | 1346 | struct ip_mc_list *pmc; |
1342 | 1347 | ||
1343 | ASSERT_RTNL(); | 1348 | ASSERT_RTNL(); |
1344 | 1349 | ||
1345 | for (i=in_dev->mc_list; i; i=i->next) | 1350 | for_each_pmc_rtnl(in_dev, pmc) |
1346 | igmp_group_dropped(i); | 1351 | igmp_group_dropped(pmc); |
1347 | 1352 | ||
1348 | #ifdef CONFIG_IP_MULTICAST | 1353 | #ifdef CONFIG_IP_MULTICAST |
1349 | in_dev->mr_ifc_count = 0; | 1354 | in_dev->mr_ifc_count = 0; |
@@ -1374,7 +1379,6 @@ void ip_mc_init_dev(struct in_device *in_dev) | |||
1374 | in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; | 1379 | in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; |
1375 | #endif | 1380 | #endif |
1376 | 1381 | ||
1377 | rwlock_init(&in_dev->mc_list_lock); | ||
1378 | spin_lock_init(&in_dev->mc_tomb_lock); | 1382 | spin_lock_init(&in_dev->mc_tomb_lock); |
1379 | } | 1383 | } |
1380 | 1384 | ||
@@ -1382,14 +1386,14 @@ void ip_mc_init_dev(struct in_device *in_dev) | |||
1382 | 1386 | ||
1383 | void ip_mc_up(struct in_device *in_dev) | 1387 | void ip_mc_up(struct in_device *in_dev) |
1384 | { | 1388 | { |
1385 | struct ip_mc_list *i; | 1389 | struct ip_mc_list *pmc; |
1386 | 1390 | ||
1387 | ASSERT_RTNL(); | 1391 | ASSERT_RTNL(); |
1388 | 1392 | ||
1389 | ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); | 1393 | ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); |
1390 | 1394 | ||
1391 | for (i=in_dev->mc_list; i; i=i->next) | 1395 | for_each_pmc_rtnl(in_dev, pmc) |
1392 | igmp_group_added(i); | 1396 | igmp_group_added(pmc); |
1393 | } | 1397 | } |
1394 | 1398 | ||
1395 | /* | 1399 | /* |
@@ -1405,43 +1409,40 @@ void ip_mc_destroy_dev(struct in_device *in_dev) | |||
1405 | /* Deactivate timers */ | 1409 | /* Deactivate timers */ |
1406 | ip_mc_down(in_dev); | 1410 | ip_mc_down(in_dev); |
1407 | 1411 | ||
1408 | write_lock_bh(&in_dev->mc_list_lock); | 1412 | while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) { |
1409 | while ((i = in_dev->mc_list) != NULL) { | 1413 | in_dev->mc_list = i->next_rcu; |
1410 | in_dev->mc_list = i->next; | ||
1411 | in_dev->mc_count--; | 1414 | in_dev->mc_count--; |
1412 | write_unlock_bh(&in_dev->mc_list_lock); | ||
1413 | igmp_group_dropped(i); | ||
1414 | ip_ma_put(i); | ||
1415 | 1415 | ||
1416 | write_lock_bh(&in_dev->mc_list_lock); | 1416 | /* We've dropped the groups in ip_mc_down already */ |
1417 | ip_mc_clear_src(i); | ||
1418 | ip_ma_put(i); | ||
1417 | } | 1419 | } |
1418 | write_unlock_bh(&in_dev->mc_list_lock); | ||
1419 | } | 1420 | } |
1420 | 1421 | ||
1422 | /* RTNL is locked */ | ||
1421 | static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) | 1423 | static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) |
1422 | { | 1424 | { |
1423 | struct flowi fl = { .nl_u = { .ip4_u = | ||
1424 | { .daddr = imr->imr_multiaddr.s_addr } } }; | ||
1425 | struct rtable *rt; | ||
1426 | struct net_device *dev = NULL; | 1425 | struct net_device *dev = NULL; |
1427 | struct in_device *idev = NULL; | 1426 | struct in_device *idev = NULL; |
1428 | 1427 | ||
1429 | if (imr->imr_ifindex) { | 1428 | if (imr->imr_ifindex) { |
1430 | idev = inetdev_by_index(net, imr->imr_ifindex); | 1429 | idev = inetdev_by_index(net, imr->imr_ifindex); |
1431 | if (idev) | ||
1432 | __in_dev_put(idev); | ||
1433 | return idev; | 1430 | return idev; |
1434 | } | 1431 | } |
1435 | if (imr->imr_address.s_addr) { | 1432 | if (imr->imr_address.s_addr) { |
1436 | dev = ip_dev_find(net, imr->imr_address.s_addr); | 1433 | dev = __ip_dev_find(net, imr->imr_address.s_addr, false); |
1437 | if (!dev) | 1434 | if (!dev) |
1438 | return NULL; | 1435 | return NULL; |
1439 | dev_put(dev); | ||
1440 | } | 1436 | } |
1441 | 1437 | ||
1442 | if (!dev && !ip_route_output_key(net, &rt, &fl)) { | 1438 | if (!dev) { |
1443 | dev = rt->dst.dev; | 1439 | struct rtable *rt = ip_route_output(net, |
1444 | ip_rt_put(rt); | 1440 | imr->imr_multiaddr.s_addr, |
1441 | 0, 0, 0); | ||
1442 | if (!IS_ERR(rt)) { | ||
1443 | dev = rt->dst.dev; | ||
1444 | ip_rt_put(rt); | ||
1445 | } | ||
1445 | } | 1446 | } |
1446 | if (dev) { | 1447 | if (dev) { |
1447 | imr->imr_ifindex = dev->ifindex; | 1448 | imr->imr_ifindex = dev->ifindex; |
@@ -1515,18 +1516,18 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, | |||
1515 | 1516 | ||
1516 | if (!in_dev) | 1517 | if (!in_dev) |
1517 | return -ENODEV; | 1518 | return -ENODEV; |
1518 | read_lock(&in_dev->mc_list_lock); | 1519 | rcu_read_lock(); |
1519 | for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { | 1520 | for_each_pmc_rcu(in_dev, pmc) { |
1520 | if (*pmca == pmc->multiaddr) | 1521 | if (*pmca == pmc->multiaddr) |
1521 | break; | 1522 | break; |
1522 | } | 1523 | } |
1523 | if (!pmc) { | 1524 | if (!pmc) { |
1524 | /* MCA not found?? bug */ | 1525 | /* MCA not found?? bug */ |
1525 | read_unlock(&in_dev->mc_list_lock); | 1526 | rcu_read_unlock(); |
1526 | return -ESRCH; | 1527 | return -ESRCH; |
1527 | } | 1528 | } |
1528 | spin_lock_bh(&pmc->lock); | 1529 | spin_lock_bh(&pmc->lock); |
1529 | read_unlock(&in_dev->mc_list_lock); | 1530 | rcu_read_unlock(); |
1530 | #ifdef CONFIG_IP_MULTICAST | 1531 | #ifdef CONFIG_IP_MULTICAST |
1531 | sf_markstate(pmc); | 1532 | sf_markstate(pmc); |
1532 | #endif | 1533 | #endif |
@@ -1687,18 +1688,18 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, | |||
1687 | 1688 | ||
1688 | if (!in_dev) | 1689 | if (!in_dev) |
1689 | return -ENODEV; | 1690 | return -ENODEV; |
1690 | read_lock(&in_dev->mc_list_lock); | 1691 | rcu_read_lock(); |
1691 | for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { | 1692 | for_each_pmc_rcu(in_dev, pmc) { |
1692 | if (*pmca == pmc->multiaddr) | 1693 | if (*pmca == pmc->multiaddr) |
1693 | break; | 1694 | break; |
1694 | } | 1695 | } |
1695 | if (!pmc) { | 1696 | if (!pmc) { |
1696 | /* MCA not found?? bug */ | 1697 | /* MCA not found?? bug */ |
1697 | read_unlock(&in_dev->mc_list_lock); | 1698 | rcu_read_unlock(); |
1698 | return -ESRCH; | 1699 | return -ESRCH; |
1699 | } | 1700 | } |
1700 | spin_lock_bh(&pmc->lock); | 1701 | spin_lock_bh(&pmc->lock); |
1701 | read_unlock(&in_dev->mc_list_lock); | 1702 | rcu_read_unlock(); |
1702 | 1703 | ||
1703 | #ifdef CONFIG_IP_MULTICAST | 1704 | #ifdef CONFIG_IP_MULTICAST |
1704 | sf_markstate(pmc); | 1705 | sf_markstate(pmc); |
@@ -1795,7 +1796,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) | |||
1795 | 1796 | ||
1796 | err = -EADDRINUSE; | 1797 | err = -EADDRINUSE; |
1797 | ifindex = imr->imr_ifindex; | 1798 | ifindex = imr->imr_ifindex; |
1798 | for (i = inet->mc_list; i; i = i->next) { | 1799 | for_each_pmc_rtnl(inet, i) { |
1799 | if (i->multi.imr_multiaddr.s_addr == addr && | 1800 | if (i->multi.imr_multiaddr.s_addr == addr && |
1800 | i->multi.imr_ifindex == ifindex) | 1801 | i->multi.imr_ifindex == ifindex) |
1801 | goto done; | 1802 | goto done; |
@@ -1809,7 +1810,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) | |||
1809 | goto done; | 1810 | goto done; |
1810 | 1811 | ||
1811 | memcpy(&iml->multi, imr, sizeof(*imr)); | 1812 | memcpy(&iml->multi, imr, sizeof(*imr)); |
1812 | iml->next = inet->mc_list; | 1813 | iml->next_rcu = inet->mc_list; |
1813 | iml->sflist = NULL; | 1814 | iml->sflist = NULL; |
1814 | iml->sfmode = MCAST_EXCLUDE; | 1815 | iml->sfmode = MCAST_EXCLUDE; |
1815 | rcu_assign_pointer(inet->mc_list, iml); | 1816 | rcu_assign_pointer(inet->mc_list, iml); |
@@ -1821,19 +1822,10 @@ done: | |||
1821 | } | 1822 | } |
1822 | EXPORT_SYMBOL(ip_mc_join_group); | 1823 | EXPORT_SYMBOL(ip_mc_join_group); |
1823 | 1824 | ||
1824 | static void ip_sf_socklist_reclaim(struct rcu_head *rp) | ||
1825 | { | ||
1826 | struct ip_sf_socklist *psf; | ||
1827 | |||
1828 | psf = container_of(rp, struct ip_sf_socklist, rcu); | ||
1829 | /* sk_omem_alloc should have been decreased by the caller*/ | ||
1830 | kfree(psf); | ||
1831 | } | ||
1832 | |||
1833 | static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, | 1825 | static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, |
1834 | struct in_device *in_dev) | 1826 | struct in_device *in_dev) |
1835 | { | 1827 | { |
1836 | struct ip_sf_socklist *psf = iml->sflist; | 1828 | struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist); |
1837 | int err; | 1829 | int err; |
1838 | 1830 | ||
1839 | if (psf == NULL) { | 1831 | if (psf == NULL) { |
@@ -1846,21 +1838,10 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, | |||
1846 | rcu_assign_pointer(iml->sflist, NULL); | 1838 | rcu_assign_pointer(iml->sflist, NULL); |
1847 | /* decrease mem now to avoid the memleak warning */ | 1839 | /* decrease mem now to avoid the memleak warning */ |
1848 | atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc); | 1840 | atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc); |
1849 | call_rcu(&psf->rcu, ip_sf_socklist_reclaim); | 1841 | kfree_rcu(psf, rcu); |
1850 | return err; | 1842 | return err; |
1851 | } | 1843 | } |
1852 | 1844 | ||
1853 | |||
1854 | static void ip_mc_socklist_reclaim(struct rcu_head *rp) | ||
1855 | { | ||
1856 | struct ip_mc_socklist *iml; | ||
1857 | |||
1858 | iml = container_of(rp, struct ip_mc_socklist, rcu); | ||
1859 | /* sk_omem_alloc should have been decreased by the caller*/ | ||
1860 | kfree(iml); | ||
1861 | } | ||
1862 | |||
1863 | |||
1864 | /* | 1845 | /* |
1865 | * Ask a socket to leave a group. | 1846 | * Ask a socket to leave a group. |
1866 | */ | 1847 | */ |
@@ -1868,7 +1849,8 @@ static void ip_mc_socklist_reclaim(struct rcu_head *rp) | |||
1868 | int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) | 1849 | int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) |
1869 | { | 1850 | { |
1870 | struct inet_sock *inet = inet_sk(sk); | 1851 | struct inet_sock *inet = inet_sk(sk); |
1871 | struct ip_mc_socklist *iml, **imlp; | 1852 | struct ip_mc_socklist *iml; |
1853 | struct ip_mc_socklist __rcu **imlp; | ||
1872 | struct in_device *in_dev; | 1854 | struct in_device *in_dev; |
1873 | struct net *net = sock_net(sk); | 1855 | struct net *net = sock_net(sk); |
1874 | __be32 group = imr->imr_multiaddr.s_addr; | 1856 | __be32 group = imr->imr_multiaddr.s_addr; |
@@ -1878,7 +1860,9 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) | |||
1878 | rtnl_lock(); | 1860 | rtnl_lock(); |
1879 | in_dev = ip_mc_find_dev(net, imr); | 1861 | in_dev = ip_mc_find_dev(net, imr); |
1880 | ifindex = imr->imr_ifindex; | 1862 | ifindex = imr->imr_ifindex; |
1881 | for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) { | 1863 | for (imlp = &inet->mc_list; |
1864 | (iml = rtnl_dereference(*imlp)) != NULL; | ||
1865 | imlp = &iml->next_rcu) { | ||
1882 | if (iml->multi.imr_multiaddr.s_addr != group) | 1866 | if (iml->multi.imr_multiaddr.s_addr != group) |
1883 | continue; | 1867 | continue; |
1884 | if (ifindex) { | 1868 | if (ifindex) { |
@@ -1890,14 +1874,14 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) | |||
1890 | 1874 | ||
1891 | (void) ip_mc_leave_src(sk, iml, in_dev); | 1875 | (void) ip_mc_leave_src(sk, iml, in_dev); |
1892 | 1876 | ||
1893 | rcu_assign_pointer(*imlp, iml->next); | 1877 | *imlp = iml->next_rcu; |
1894 | 1878 | ||
1895 | if (in_dev) | 1879 | if (in_dev) |
1896 | ip_mc_dec_group(in_dev, group); | 1880 | ip_mc_dec_group(in_dev, group); |
1897 | rtnl_unlock(); | 1881 | rtnl_unlock(); |
1898 | /* decrease mem now to avoid the memleak warning */ | 1882 | /* decrease mem now to avoid the memleak warning */ |
1899 | atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); | 1883 | atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); |
1900 | call_rcu(&iml->rcu, ip_mc_socklist_reclaim); | 1884 | kfree_rcu(iml, rcu); |
1901 | return 0; | 1885 | return 0; |
1902 | } | 1886 | } |
1903 | if (!in_dev) | 1887 | if (!in_dev) |
@@ -1936,7 +1920,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct | |||
1936 | } | 1920 | } |
1937 | err = -EADDRNOTAVAIL; | 1921 | err = -EADDRNOTAVAIL; |
1938 | 1922 | ||
1939 | for (pmc=inet->mc_list; pmc; pmc=pmc->next) { | 1923 | for_each_pmc_rtnl(inet, pmc) { |
1940 | if ((pmc->multi.imr_multiaddr.s_addr == | 1924 | if ((pmc->multi.imr_multiaddr.s_addr == |
1941 | imr.imr_multiaddr.s_addr) && | 1925 | imr.imr_multiaddr.s_addr) && |
1942 | (pmc->multi.imr_ifindex == imr.imr_ifindex)) | 1926 | (pmc->multi.imr_ifindex == imr.imr_ifindex)) |
@@ -1960,7 +1944,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct | |||
1960 | pmc->sfmode = omode; | 1944 | pmc->sfmode = omode; |
1961 | } | 1945 | } |
1962 | 1946 | ||
1963 | psl = pmc->sflist; | 1947 | psl = rtnl_dereference(pmc->sflist); |
1964 | if (!add) { | 1948 | if (!add) { |
1965 | if (!psl) | 1949 | if (!psl) |
1966 | goto done; /* err = -EADDRNOTAVAIL */ | 1950 | goto done; /* err = -EADDRNOTAVAIL */ |
@@ -2014,7 +1998,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct | |||
2014 | newpsl->sl_addr[i] = psl->sl_addr[i]; | 1998 | newpsl->sl_addr[i] = psl->sl_addr[i]; |
2015 | /* decrease mem now to avoid the memleak warning */ | 1999 | /* decrease mem now to avoid the memleak warning */ |
2016 | atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); | 2000 | atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); |
2017 | call_rcu(&psl->rcu, ip_sf_socklist_reclaim); | 2001 | kfree_rcu(psl, rcu); |
2018 | } | 2002 | } |
2019 | rcu_assign_pointer(pmc->sflist, newpsl); | 2003 | rcu_assign_pointer(pmc->sflist, newpsl); |
2020 | psl = newpsl; | 2004 | psl = newpsl; |
@@ -2079,7 +2063,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) | |||
2079 | goto done; | 2063 | goto done; |
2080 | } | 2064 | } |
2081 | 2065 | ||
2082 | for (pmc=inet->mc_list; pmc; pmc=pmc->next) { | 2066 | for_each_pmc_rtnl(inet, pmc) { |
2083 | if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && | 2067 | if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && |
2084 | pmc->multi.imr_ifindex == imr.imr_ifindex) | 2068 | pmc->multi.imr_ifindex == imr.imr_ifindex) |
2085 | break; | 2069 | break; |
@@ -2109,13 +2093,13 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) | |||
2109 | (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr, | 2093 | (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr, |
2110 | msf->imsf_fmode, 0, NULL, 0); | 2094 | msf->imsf_fmode, 0, NULL, 0); |
2111 | } | 2095 | } |
2112 | psl = pmc->sflist; | 2096 | psl = rtnl_dereference(pmc->sflist); |
2113 | if (psl) { | 2097 | if (psl) { |
2114 | (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, | 2098 | (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, |
2115 | psl->sl_count, psl->sl_addr, 0); | 2099 | psl->sl_count, psl->sl_addr, 0); |
2116 | /* decrease mem now to avoid the memleak warning */ | 2100 | /* decrease mem now to avoid the memleak warning */ |
2117 | atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); | 2101 | atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); |
2118 | call_rcu(&psl->rcu, ip_sf_socklist_reclaim); | 2102 | kfree_rcu(psl, rcu); |
2119 | } else | 2103 | } else |
2120 | (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, | 2104 | (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, |
2121 | 0, NULL, 0); | 2105 | 0, NULL, 0); |
@@ -2157,7 +2141,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, | |||
2157 | } | 2141 | } |
2158 | err = -EADDRNOTAVAIL; | 2142 | err = -EADDRNOTAVAIL; |
2159 | 2143 | ||
2160 | for (pmc=inet->mc_list; pmc; pmc=pmc->next) { | 2144 | for_each_pmc_rtnl(inet, pmc) { |
2161 | if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && | 2145 | if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && |
2162 | pmc->multi.imr_ifindex == imr.imr_ifindex) | 2146 | pmc->multi.imr_ifindex == imr.imr_ifindex) |
2163 | break; | 2147 | break; |
@@ -2165,7 +2149,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, | |||
2165 | if (!pmc) /* must have a prior join */ | 2149 | if (!pmc) /* must have a prior join */ |
2166 | goto done; | 2150 | goto done; |
2167 | msf->imsf_fmode = pmc->sfmode; | 2151 | msf->imsf_fmode = pmc->sfmode; |
2168 | psl = pmc->sflist; | 2152 | psl = rtnl_dereference(pmc->sflist); |
2169 | rtnl_unlock(); | 2153 | rtnl_unlock(); |
2170 | if (!psl) { | 2154 | if (!psl) { |
2171 | len = 0; | 2155 | len = 0; |
@@ -2210,7 +2194,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, | |||
2210 | 2194 | ||
2211 | err = -EADDRNOTAVAIL; | 2195 | err = -EADDRNOTAVAIL; |
2212 | 2196 | ||
2213 | for (pmc=inet->mc_list; pmc; pmc=pmc->next) { | 2197 | for_each_pmc_rtnl(inet, pmc) { |
2214 | if (pmc->multi.imr_multiaddr.s_addr == addr && | 2198 | if (pmc->multi.imr_multiaddr.s_addr == addr && |
2215 | pmc->multi.imr_ifindex == gsf->gf_interface) | 2199 | pmc->multi.imr_ifindex == gsf->gf_interface) |
2216 | break; | 2200 | break; |
@@ -2218,7 +2202,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, | |||
2218 | if (!pmc) /* must have a prior join */ | 2202 | if (!pmc) /* must have a prior join */ |
2219 | goto done; | 2203 | goto done; |
2220 | gsf->gf_fmode = pmc->sfmode; | 2204 | gsf->gf_fmode = pmc->sfmode; |
2221 | psl = pmc->sflist; | 2205 | psl = rtnl_dereference(pmc->sflist); |
2222 | rtnl_unlock(); | 2206 | rtnl_unlock(); |
2223 | count = psl ? psl->sl_count : 0; | 2207 | count = psl ? psl->sl_count : 0; |
2224 | copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; | 2208 | copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; |
@@ -2259,7 +2243,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif) | |||
2259 | goto out; | 2243 | goto out; |
2260 | 2244 | ||
2261 | rcu_read_lock(); | 2245 | rcu_read_lock(); |
2262 | for (pmc=rcu_dereference(inet->mc_list); pmc; pmc=rcu_dereference(pmc->next)) { | 2246 | for_each_pmc_rcu(inet, pmc) { |
2263 | if (pmc->multi.imr_multiaddr.s_addr == loc_addr && | 2247 | if (pmc->multi.imr_multiaddr.s_addr == loc_addr && |
2264 | pmc->multi.imr_ifindex == dif) | 2248 | pmc->multi.imr_ifindex == dif) |
2265 | break; | 2249 | break; |
@@ -2267,7 +2251,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif) | |||
2267 | ret = inet->mc_all; | 2251 | ret = inet->mc_all; |
2268 | if (!pmc) | 2252 | if (!pmc) |
2269 | goto unlock; | 2253 | goto unlock; |
2270 | psl = pmc->sflist; | 2254 | psl = rcu_dereference(pmc->sflist); |
2271 | ret = (pmc->sfmode == MCAST_EXCLUDE); | 2255 | ret = (pmc->sfmode == MCAST_EXCLUDE); |
2272 | if (!psl) | 2256 | if (!psl) |
2273 | goto unlock; | 2257 | goto unlock; |
@@ -2302,31 +2286,29 @@ void ip_mc_drop_socket(struct sock *sk) | |||
2302 | return; | 2286 | return; |
2303 | 2287 | ||
2304 | rtnl_lock(); | 2288 | rtnl_lock(); |
2305 | while ((iml = inet->mc_list) != NULL) { | 2289 | while ((iml = rtnl_dereference(inet->mc_list)) != NULL) { |
2306 | struct in_device *in_dev; | 2290 | struct in_device *in_dev; |
2307 | rcu_assign_pointer(inet->mc_list, iml->next); | ||
2308 | 2291 | ||
2292 | inet->mc_list = iml->next_rcu; | ||
2309 | in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); | 2293 | in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); |
2310 | (void) ip_mc_leave_src(sk, iml, in_dev); | 2294 | (void) ip_mc_leave_src(sk, iml, in_dev); |
2311 | if (in_dev != NULL) { | 2295 | if (in_dev != NULL) |
2312 | ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); | 2296 | ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); |
2313 | in_dev_put(in_dev); | ||
2314 | } | ||
2315 | /* decrease mem now to avoid the memleak warning */ | 2297 | /* decrease mem now to avoid the memleak warning */ |
2316 | atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); | 2298 | atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); |
2317 | call_rcu(&iml->rcu, ip_mc_socklist_reclaim); | 2299 | kfree_rcu(iml, rcu); |
2318 | } | 2300 | } |
2319 | rtnl_unlock(); | 2301 | rtnl_unlock(); |
2320 | } | 2302 | } |
2321 | 2303 | ||
2322 | int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto) | 2304 | /* called with rcu_read_lock() */ |
2305 | int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto) | ||
2323 | { | 2306 | { |
2324 | struct ip_mc_list *im; | 2307 | struct ip_mc_list *im; |
2325 | struct ip_sf_list *psf; | 2308 | struct ip_sf_list *psf; |
2326 | int rv = 0; | 2309 | int rv = 0; |
2327 | 2310 | ||
2328 | read_lock(&in_dev->mc_list_lock); | 2311 | for_each_pmc_rcu(in_dev, im) { |
2329 | for (im=in_dev->mc_list; im; im=im->next) { | ||
2330 | if (im->multiaddr == mc_addr) | 2312 | if (im->multiaddr == mc_addr) |
2331 | break; | 2313 | break; |
2332 | } | 2314 | } |
@@ -2347,7 +2329,6 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p | |||
2347 | } else | 2329 | } else |
2348 | rv = 1; /* unspecified source; tentatively allow */ | 2330 | rv = 1; /* unspecified source; tentatively allow */ |
2349 | } | 2331 | } |
2350 | read_unlock(&in_dev->mc_list_lock); | ||
2351 | return rv; | 2332 | return rv; |
2352 | } | 2333 | } |
2353 | 2334 | ||
@@ -2373,13 +2354,11 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) | |||
2373 | in_dev = __in_dev_get_rcu(state->dev); | 2354 | in_dev = __in_dev_get_rcu(state->dev); |
2374 | if (!in_dev) | 2355 | if (!in_dev) |
2375 | continue; | 2356 | continue; |
2376 | read_lock(&in_dev->mc_list_lock); | 2357 | im = rcu_dereference(in_dev->mc_list); |
2377 | im = in_dev->mc_list; | ||
2378 | if (im) { | 2358 | if (im) { |
2379 | state->in_dev = in_dev; | 2359 | state->in_dev = in_dev; |
2380 | break; | 2360 | break; |
2381 | } | 2361 | } |
2382 | read_unlock(&in_dev->mc_list_lock); | ||
2383 | } | 2362 | } |
2384 | return im; | 2363 | return im; |
2385 | } | 2364 | } |
@@ -2387,11 +2366,9 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) | |||
2387 | static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im) | 2366 | static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im) |
2388 | { | 2367 | { |
2389 | struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); | 2368 | struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); |
2390 | im = im->next; | ||
2391 | while (!im) { | ||
2392 | if (likely(state->in_dev != NULL)) | ||
2393 | read_unlock(&state->in_dev->mc_list_lock); | ||
2394 | 2369 | ||
2370 | im = rcu_dereference(im->next_rcu); | ||
2371 | while (!im) { | ||
2395 | state->dev = next_net_device_rcu(state->dev); | 2372 | state->dev = next_net_device_rcu(state->dev); |
2396 | if (!state->dev) { | 2373 | if (!state->dev) { |
2397 | state->in_dev = NULL; | 2374 | state->in_dev = NULL; |
@@ -2400,8 +2377,7 @@ static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_li | |||
2400 | state->in_dev = __in_dev_get_rcu(state->dev); | 2377 | state->in_dev = __in_dev_get_rcu(state->dev); |
2401 | if (!state->in_dev) | 2378 | if (!state->in_dev) |
2402 | continue; | 2379 | continue; |
2403 | read_lock(&state->in_dev->mc_list_lock); | 2380 | im = rcu_dereference(state->in_dev->mc_list); |
2404 | im = state->in_dev->mc_list; | ||
2405 | } | 2381 | } |
2406 | return im; | 2382 | return im; |
2407 | } | 2383 | } |
@@ -2437,10 +2413,8 @@ static void igmp_mc_seq_stop(struct seq_file *seq, void *v) | |||
2437 | __releases(rcu) | 2413 | __releases(rcu) |
2438 | { | 2414 | { |
2439 | struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); | 2415 | struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); |
2440 | if (likely(state->in_dev != NULL)) { | 2416 | |
2441 | read_unlock(&state->in_dev->mc_list_lock); | 2417 | state->in_dev = NULL; |
2442 | state->in_dev = NULL; | ||
2443 | } | ||
2444 | state->dev = NULL; | 2418 | state->dev = NULL; |
2445 | rcu_read_unlock(); | 2419 | rcu_read_unlock(); |
2446 | } | 2420 | } |
@@ -2462,7 +2436,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v) | |||
2462 | querier = "NONE"; | 2436 | querier = "NONE"; |
2463 | #endif | 2437 | #endif |
2464 | 2438 | ||
2465 | if (state->in_dev->mc_list == im) { | 2439 | if (rcu_dereference(state->in_dev->mc_list) == im) { |
2466 | seq_printf(seq, "%d\t%-10s: %5d %7s\n", | 2440 | seq_printf(seq, "%d\t%-10s: %5d %7s\n", |
2467 | state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); | 2441 | state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); |
2468 | } | 2442 | } |
@@ -2521,8 +2495,7 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) | |||
2521 | idev = __in_dev_get_rcu(state->dev); | 2495 | idev = __in_dev_get_rcu(state->dev); |
2522 | if (unlikely(idev == NULL)) | 2496 | if (unlikely(idev == NULL)) |
2523 | continue; | 2497 | continue; |
2524 | read_lock(&idev->mc_list_lock); | 2498 | im = rcu_dereference(idev->mc_list); |
2525 | im = idev->mc_list; | ||
2526 | if (likely(im != NULL)) { | 2499 | if (likely(im != NULL)) { |
2527 | spin_lock_bh(&im->lock); | 2500 | spin_lock_bh(&im->lock); |
2528 | psf = im->sources; | 2501 | psf = im->sources; |
@@ -2533,7 +2506,6 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) | |||
2533 | } | 2506 | } |
2534 | spin_unlock_bh(&im->lock); | 2507 | spin_unlock_bh(&im->lock); |
2535 | } | 2508 | } |
2536 | read_unlock(&idev->mc_list_lock); | ||
2537 | } | 2509 | } |
2538 | return psf; | 2510 | return psf; |
2539 | } | 2511 | } |
@@ -2547,9 +2519,6 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l | |||
2547 | spin_unlock_bh(&state->im->lock); | 2519 | spin_unlock_bh(&state->im->lock); |
2548 | state->im = state->im->next; | 2520 | state->im = state->im->next; |
2549 | while (!state->im) { | 2521 | while (!state->im) { |
2550 | if (likely(state->idev != NULL)) | ||
2551 | read_unlock(&state->idev->mc_list_lock); | ||
2552 | |||
2553 | state->dev = next_net_device_rcu(state->dev); | 2522 | state->dev = next_net_device_rcu(state->dev); |
2554 | if (!state->dev) { | 2523 | if (!state->dev) { |
2555 | state->idev = NULL; | 2524 | state->idev = NULL; |
@@ -2558,8 +2527,7 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l | |||
2558 | state->idev = __in_dev_get_rcu(state->dev); | 2527 | state->idev = __in_dev_get_rcu(state->dev); |
2559 | if (!state->idev) | 2528 | if (!state->idev) |
2560 | continue; | 2529 | continue; |
2561 | read_lock(&state->idev->mc_list_lock); | 2530 | state->im = rcu_dereference(state->idev->mc_list); |
2562 | state->im = state->idev->mc_list; | ||
2563 | } | 2531 | } |
2564 | if (!state->im) | 2532 | if (!state->im) |
2565 | break; | 2533 | break; |
@@ -2605,10 +2573,7 @@ static void igmp_mcf_seq_stop(struct seq_file *seq, void *v) | |||
2605 | spin_unlock_bh(&state->im->lock); | 2573 | spin_unlock_bh(&state->im->lock); |
2606 | state->im = NULL; | 2574 | state->im = NULL; |
2607 | } | 2575 | } |
2608 | if (likely(state->idev != NULL)) { | 2576 | state->idev = NULL; |
2609 | read_unlock(&state->idev->mc_list_lock); | ||
2610 | state->idev = NULL; | ||
2611 | } | ||
2612 | state->dev = NULL; | 2577 | state->dev = NULL; |
2613 | rcu_read_unlock(); | 2578 | rcu_read_unlock(); |
2614 | } | 2579 | } |
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 7174370b1195..c14d88ad348d 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c | |||
@@ -33,7 +33,7 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); | |||
33 | * This struct holds the first and last local port number. | 33 | * This struct holds the first and last local port number. |
34 | */ | 34 | */ |
35 | struct local_ports sysctl_local_ports __read_mostly = { | 35 | struct local_ports sysctl_local_ports __read_mostly = { |
36 | .lock = SEQLOCK_UNLOCKED, | 36 | .lock = __SEQLOCK_UNLOCKED(sysctl_local_ports.lock), |
37 | .range = { 32768, 61000 }, | 37 | .range = { 32768, 61000 }, |
38 | }; | 38 | }; |
39 | 39 | ||
@@ -55,7 +55,6 @@ EXPORT_SYMBOL(inet_get_local_port_range); | |||
55 | int inet_csk_bind_conflict(const struct sock *sk, | 55 | int inet_csk_bind_conflict(const struct sock *sk, |
56 | const struct inet_bind_bucket *tb) | 56 | const struct inet_bind_bucket *tb) |
57 | { | 57 | { |
58 | const __be32 sk_rcv_saddr = inet_rcv_saddr(sk); | ||
59 | struct sock *sk2; | 58 | struct sock *sk2; |
60 | struct hlist_node *node; | 59 | struct hlist_node *node; |
61 | int reuse = sk->sk_reuse; | 60 | int reuse = sk->sk_reuse; |
@@ -75,9 +74,9 @@ int inet_csk_bind_conflict(const struct sock *sk, | |||
75 | sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { | 74 | sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { |
76 | if (!reuse || !sk2->sk_reuse || | 75 | if (!reuse || !sk2->sk_reuse || |
77 | sk2->sk_state == TCP_LISTEN) { | 76 | sk2->sk_state == TCP_LISTEN) { |
78 | const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2); | 77 | const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); |
79 | if (!sk2_rcv_saddr || !sk_rcv_saddr || | 78 | if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) || |
80 | sk2_rcv_saddr == sk_rcv_saddr) | 79 | sk2_rcv_saddr == sk_rcv_saddr(sk)) |
81 | break; | 80 | break; |
82 | } | 81 | } |
83 | } | 82 | } |
@@ -351,30 +350,24 @@ void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) | |||
351 | EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); | 350 | EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); |
352 | 351 | ||
353 | struct dst_entry *inet_csk_route_req(struct sock *sk, | 352 | struct dst_entry *inet_csk_route_req(struct sock *sk, |
353 | struct flowi4 *fl4, | ||
354 | const struct request_sock *req) | 354 | const struct request_sock *req) |
355 | { | 355 | { |
356 | struct rtable *rt; | 356 | struct rtable *rt; |
357 | const struct inet_request_sock *ireq = inet_rsk(req); | 357 | const struct inet_request_sock *ireq = inet_rsk(req); |
358 | struct ip_options *opt = inet_rsk(req)->opt; | 358 | struct ip_options_rcu *opt = inet_rsk(req)->opt; |
359 | struct flowi fl = { .oif = sk->sk_bound_dev_if, | ||
360 | .mark = sk->sk_mark, | ||
361 | .nl_u = { .ip4_u = | ||
362 | { .daddr = ((opt && opt->srr) ? | ||
363 | opt->faddr : | ||
364 | ireq->rmt_addr), | ||
365 | .saddr = ireq->loc_addr, | ||
366 | .tos = RT_CONN_FLAGS(sk) } }, | ||
367 | .proto = sk->sk_protocol, | ||
368 | .flags = inet_sk_flowi_flags(sk), | ||
369 | .uli_u = { .ports = | ||
370 | { .sport = inet_sk(sk)->inet_sport, | ||
371 | .dport = ireq->rmt_port } } }; | ||
372 | struct net *net = sock_net(sk); | 359 | struct net *net = sock_net(sk); |
373 | 360 | ||
374 | security_req_classify_flow(req, &fl); | 361 | flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, |
375 | if (ip_route_output_flow(net, &rt, &fl, sk, 0)) | 362 | RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, |
363 | sk->sk_protocol, inet_sk_flowi_flags(sk), | ||
364 | (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, | ||
365 | ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); | ||
366 | security_req_classify_flow(req, flowi4_to_flowi(fl4)); | ||
367 | rt = ip_route_output_flow(net, fl4, sk); | ||
368 | if (IS_ERR(rt)) | ||
376 | goto no_route; | 369 | goto no_route; |
377 | if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) | 370 | if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) |
378 | goto route_err; | 371 | goto route_err; |
379 | return &rt->dst; | 372 | return &rt->dst; |
380 | 373 | ||
@@ -386,6 +379,39 @@ no_route: | |||
386 | } | 379 | } |
387 | EXPORT_SYMBOL_GPL(inet_csk_route_req); | 380 | EXPORT_SYMBOL_GPL(inet_csk_route_req); |
388 | 381 | ||
382 | struct dst_entry *inet_csk_route_child_sock(struct sock *sk, | ||
383 | struct sock *newsk, | ||
384 | const struct request_sock *req) | ||
385 | { | ||
386 | const struct inet_request_sock *ireq = inet_rsk(req); | ||
387 | struct inet_sock *newinet = inet_sk(newsk); | ||
388 | struct ip_options_rcu *opt = ireq->opt; | ||
389 | struct net *net = sock_net(sk); | ||
390 | struct flowi4 *fl4; | ||
391 | struct rtable *rt; | ||
392 | |||
393 | fl4 = &newinet->cork.fl.u.ip4; | ||
394 | flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, | ||
395 | RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, | ||
396 | sk->sk_protocol, inet_sk_flowi_flags(sk), | ||
397 | (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, | ||
398 | ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); | ||
399 | security_req_classify_flow(req, flowi4_to_flowi(fl4)); | ||
400 | rt = ip_route_output_flow(net, fl4, sk); | ||
401 | if (IS_ERR(rt)) | ||
402 | goto no_route; | ||
403 | if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) | ||
404 | goto route_err; | ||
405 | return &rt->dst; | ||
406 | |||
407 | route_err: | ||
408 | ip_rt_put(rt); | ||
409 | no_route: | ||
410 | IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); | ||
411 | return NULL; | ||
412 | } | ||
413 | EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); | ||
414 | |||
389 | static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, | 415 | static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, |
390 | const u32 rnd, const u32 synq_hsize) | 416 | const u32 rnd, const u32 synq_hsize) |
391 | { | 417 | { |
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index e5fa2ddce320..3267d3898437 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c | |||
@@ -124,7 +124,7 @@ static int inet_csk_diag_fill(struct sock *sk, | |||
124 | 124 | ||
125 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) | 125 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) |
126 | if (r->idiag_family == AF_INET6) { | 126 | if (r->idiag_family == AF_INET6) { |
127 | struct ipv6_pinfo *np = inet6_sk(sk); | 127 | const struct ipv6_pinfo *np = inet6_sk(sk); |
128 | 128 | ||
129 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, | 129 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, |
130 | &np->rcv_saddr); | 130 | &np->rcv_saddr); |
@@ -425,7 +425,7 @@ static int inet_diag_bc_run(const void *bc, int len, | |||
425 | bc += op->no; | 425 | bc += op->no; |
426 | } | 426 | } |
427 | } | 427 | } |
428 | return (len == 0); | 428 | return len == 0; |
429 | } | 429 | } |
430 | 430 | ||
431 | static int valid_cc(const void *bc, int len, int cc) | 431 | static int valid_cc(const void *bc, int len, int cc) |
@@ -437,7 +437,7 @@ static int valid_cc(const void *bc, int len, int cc) | |||
437 | return 0; | 437 | return 0; |
438 | if (cc == len) | 438 | if (cc == len) |
439 | return 1; | 439 | return 1; |
440 | if (op->yes < 4) | 440 | if (op->yes < 4 || op->yes & 3) |
441 | return 0; | 441 | return 0; |
442 | len -= op->yes; | 442 | len -= op->yes; |
443 | bc += op->yes; | 443 | bc += op->yes; |
@@ -447,11 +447,11 @@ static int valid_cc(const void *bc, int len, int cc) | |||
447 | 447 | ||
448 | static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) | 448 | static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) |
449 | { | 449 | { |
450 | const unsigned char *bc = bytecode; | 450 | const void *bc = bytecode; |
451 | int len = bytecode_len; | 451 | int len = bytecode_len; |
452 | 452 | ||
453 | while (len > 0) { | 453 | while (len > 0) { |
454 | struct inet_diag_bc_op *op = (struct inet_diag_bc_op *)bc; | 454 | const struct inet_diag_bc_op *op = bc; |
455 | 455 | ||
456 | //printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); | 456 | //printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); |
457 | switch (op->code) { | 457 | switch (op->code) { |
@@ -462,22 +462,20 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) | |||
462 | case INET_DIAG_BC_S_LE: | 462 | case INET_DIAG_BC_S_LE: |
463 | case INET_DIAG_BC_D_GE: | 463 | case INET_DIAG_BC_D_GE: |
464 | case INET_DIAG_BC_D_LE: | 464 | case INET_DIAG_BC_D_LE: |
465 | if (op->yes < 4 || op->yes > len + 4) | ||
466 | return -EINVAL; | ||
467 | case INET_DIAG_BC_JMP: | 465 | case INET_DIAG_BC_JMP: |
468 | if (op->no < 4 || op->no > len + 4) | 466 | if (op->no < 4 || op->no > len + 4 || op->no & 3) |
469 | return -EINVAL; | 467 | return -EINVAL; |
470 | if (op->no < len && | 468 | if (op->no < len && |
471 | !valid_cc(bytecode, bytecode_len, len - op->no)) | 469 | !valid_cc(bytecode, bytecode_len, len - op->no)) |
472 | return -EINVAL; | 470 | return -EINVAL; |
473 | break; | 471 | break; |
474 | case INET_DIAG_BC_NOP: | 472 | case INET_DIAG_BC_NOP: |
475 | if (op->yes < 4 || op->yes > len + 4) | ||
476 | return -EINVAL; | ||
477 | break; | 473 | break; |
478 | default: | 474 | default: |
479 | return -EINVAL; | 475 | return -EINVAL; |
480 | } | 476 | } |
477 | if (op->yes < 4 || op->yes > len + 4 || op->yes & 3) | ||
478 | return -EINVAL; | ||
481 | bc += op->yes; | 479 | bc += op->yes; |
482 | len -= op->yes; | 480 | len -= op->yes; |
483 | } | 481 | } |
@@ -490,9 +488,11 @@ static int inet_csk_diag_dump(struct sock *sk, | |||
490 | { | 488 | { |
491 | struct inet_diag_req *r = NLMSG_DATA(cb->nlh); | 489 | struct inet_diag_req *r = NLMSG_DATA(cb->nlh); |
492 | 490 | ||
493 | if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { | 491 | if (nlmsg_attrlen(cb->nlh, sizeof(*r))) { |
494 | struct inet_diag_entry entry; | 492 | struct inet_diag_entry entry; |
495 | struct rtattr *bc = (struct rtattr *)(r + 1); | 493 | const struct nlattr *bc = nlmsg_find_attr(cb->nlh, |
494 | sizeof(*r), | ||
495 | INET_DIAG_REQ_BYTECODE); | ||
496 | struct inet_sock *inet = inet_sk(sk); | 496 | struct inet_sock *inet = inet_sk(sk); |
497 | 497 | ||
498 | entry.family = sk->sk_family; | 498 | entry.family = sk->sk_family; |
@@ -512,7 +512,7 @@ static int inet_csk_diag_dump(struct sock *sk, | |||
512 | entry.dport = ntohs(inet->inet_dport); | 512 | entry.dport = ntohs(inet->inet_dport); |
513 | entry.userlocks = sk->sk_userlocks; | 513 | entry.userlocks = sk->sk_userlocks; |
514 | 514 | ||
515 | if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) | 515 | if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry)) |
516 | return 0; | 516 | return 0; |
517 | } | 517 | } |
518 | 518 | ||
@@ -527,9 +527,11 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, | |||
527 | { | 527 | { |
528 | struct inet_diag_req *r = NLMSG_DATA(cb->nlh); | 528 | struct inet_diag_req *r = NLMSG_DATA(cb->nlh); |
529 | 529 | ||
530 | if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { | 530 | if (nlmsg_attrlen(cb->nlh, sizeof(*r))) { |
531 | struct inet_diag_entry entry; | 531 | struct inet_diag_entry entry; |
532 | struct rtattr *bc = (struct rtattr *)(r + 1); | 532 | const struct nlattr *bc = nlmsg_find_attr(cb->nlh, |
533 | sizeof(*r), | ||
534 | INET_DIAG_REQ_BYTECODE); | ||
533 | 535 | ||
534 | entry.family = tw->tw_family; | 536 | entry.family = tw->tw_family; |
535 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) | 537 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) |
@@ -548,7 +550,7 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, | |||
548 | entry.dport = ntohs(tw->tw_dport); | 550 | entry.dport = ntohs(tw->tw_dport); |
549 | entry.userlocks = 0; | 551 | entry.userlocks = 0; |
550 | 552 | ||
551 | if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) | 553 | if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry)) |
552 | return 0; | 554 | return 0; |
553 | } | 555 | } |
554 | 556 | ||
@@ -618,7 +620,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, | |||
618 | struct inet_diag_req *r = NLMSG_DATA(cb->nlh); | 620 | struct inet_diag_req *r = NLMSG_DATA(cb->nlh); |
619 | struct inet_connection_sock *icsk = inet_csk(sk); | 621 | struct inet_connection_sock *icsk = inet_csk(sk); |
620 | struct listen_sock *lopt; | 622 | struct listen_sock *lopt; |
621 | struct rtattr *bc = NULL; | 623 | const struct nlattr *bc = NULL; |
622 | struct inet_sock *inet = inet_sk(sk); | 624 | struct inet_sock *inet = inet_sk(sk); |
623 | int j, s_j; | 625 | int j, s_j; |
624 | int reqnum, s_reqnum; | 626 | int reqnum, s_reqnum; |
@@ -638,8 +640,9 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, | |||
638 | if (!lopt || !lopt->qlen) | 640 | if (!lopt || !lopt->qlen) |
639 | goto out; | 641 | goto out; |
640 | 642 | ||
641 | if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { | 643 | if (nlmsg_attrlen(cb->nlh, sizeof(*r))) { |
642 | bc = (struct rtattr *)(r + 1); | 644 | bc = nlmsg_find_attr(cb->nlh, sizeof(*r), |
645 | INET_DIAG_REQ_BYTECODE); | ||
643 | entry.sport = inet->inet_num; | 646 | entry.sport = inet->inet_num; |
644 | entry.userlocks = sk->sk_userlocks; | 647 | entry.userlocks = sk->sk_userlocks; |
645 | } | 648 | } |
@@ -672,8 +675,8 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, | |||
672 | &ireq->rmt_addr; | 675 | &ireq->rmt_addr; |
673 | entry.dport = ntohs(ireq->rmt_port); | 676 | entry.dport = ntohs(ireq->rmt_port); |
674 | 677 | ||
675 | if (!inet_diag_bc_run(RTA_DATA(bc), | 678 | if (!inet_diag_bc_run(nla_data(bc), |
676 | RTA_PAYLOAD(bc), &entry)) | 679 | nla_len(bc), &entry)) |
677 | continue; | 680 | continue; |
678 | } | 681 | } |
679 | 682 | ||
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index fb7ad5a21ff3..3c0369a3a663 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c | |||
@@ -101,19 +101,42 @@ void inet_put_port(struct sock *sk) | |||
101 | } | 101 | } |
102 | EXPORT_SYMBOL(inet_put_port); | 102 | EXPORT_SYMBOL(inet_put_port); |
103 | 103 | ||
104 | void __inet_inherit_port(struct sock *sk, struct sock *child) | 104 | int __inet_inherit_port(struct sock *sk, struct sock *child) |
105 | { | 105 | { |
106 | struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; | 106 | struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; |
107 | const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num, | 107 | unsigned short port = inet_sk(child)->inet_num; |
108 | const int bhash = inet_bhashfn(sock_net(sk), port, | ||
108 | table->bhash_size); | 109 | table->bhash_size); |
109 | struct inet_bind_hashbucket *head = &table->bhash[bhash]; | 110 | struct inet_bind_hashbucket *head = &table->bhash[bhash]; |
110 | struct inet_bind_bucket *tb; | 111 | struct inet_bind_bucket *tb; |
111 | 112 | ||
112 | spin_lock(&head->lock); | 113 | spin_lock(&head->lock); |
113 | tb = inet_csk(sk)->icsk_bind_hash; | 114 | tb = inet_csk(sk)->icsk_bind_hash; |
114 | sk_add_bind_node(child, &tb->owners); | 115 | if (tb->port != port) { |
115 | inet_csk(child)->icsk_bind_hash = tb; | 116 | /* NOTE: using tproxy and redirecting skbs to a proxy |
117 | * on a different listener port breaks the assumption | ||
118 | * that the listener socket's icsk_bind_hash is the same | ||
119 | * as that of the child socket. We have to look up or | ||
120 | * create a new bind bucket for the child here. */ | ||
121 | struct hlist_node *node; | ||
122 | inet_bind_bucket_for_each(tb, node, &head->chain) { | ||
123 | if (net_eq(ib_net(tb), sock_net(sk)) && | ||
124 | tb->port == port) | ||
125 | break; | ||
126 | } | ||
127 | if (!node) { | ||
128 | tb = inet_bind_bucket_create(table->bind_bucket_cachep, | ||
129 | sock_net(sk), head, port); | ||
130 | if (!tb) { | ||
131 | spin_unlock(&head->lock); | ||
132 | return -ENOMEM; | ||
133 | } | ||
134 | } | ||
135 | } | ||
136 | inet_bind_hash(child, tb, port); | ||
116 | spin_unlock(&head->lock); | 137 | spin_unlock(&head->lock); |
138 | |||
139 | return 0; | ||
117 | } | 140 | } |
118 | EXPORT_SYMBOL_GPL(__inet_inherit_port); | 141 | EXPORT_SYMBOL_GPL(__inet_inherit_port); |
119 | 142 | ||
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c index 47038cb6c138..85a0f75dae64 100644 --- a/net/ipv4/inet_lro.c +++ b/net/ipv4/inet_lro.c | |||
@@ -51,8 +51,8 @@ MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)"); | |||
51 | * Basic tcp checks whether packet is suitable for LRO | 51 | * Basic tcp checks whether packet is suitable for LRO |
52 | */ | 52 | */ |
53 | 53 | ||
54 | static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph, | 54 | static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph, |
55 | int len, struct net_lro_desc *lro_desc) | 55 | int len, const struct net_lro_desc *lro_desc) |
56 | { | 56 | { |
57 | /* check ip header: don't aggregate padded frames */ | 57 | /* check ip header: don't aggregate padded frames */ |
58 | if (ntohs(iph->tot_len) != len) | 58 | if (ntohs(iph->tot_len) != len) |
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index c5af909cf701..3c8dfa16614d 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c | |||
@@ -505,7 +505,9 @@ restart: | |||
505 | } | 505 | } |
506 | 506 | ||
507 | rcu_read_unlock(); | 507 | rcu_read_unlock(); |
508 | local_bh_disable(); | ||
508 | inet_twsk_deschedule(tw, twdr); | 509 | inet_twsk_deschedule(tw, twdr); |
510 | local_bh_enable(); | ||
509 | inet_twsk_put(tw); | 511 | inet_twsk_put(tw); |
510 | goto restart_rcu; | 512 | goto restart_rcu; |
511 | } | 513 | } |
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 9ffa24b9a804..ce616d92cc54 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c | |||
@@ -63,7 +63,7 @@ | |||
63 | * refcnt: atomically against modifications on other CPU; | 63 | * refcnt: atomically against modifications on other CPU; |
64 | * usually under some other lock to prevent node disappearing | 64 | * usually under some other lock to prevent node disappearing |
65 | * dtime: unused node list lock | 65 | * dtime: unused node list lock |
66 | * v4daddr: unchangeable | 66 | * daddr: unchangeable |
67 | * ip_id_count: atomic value (no lock needed) | 67 | * ip_id_count: atomic value (no lock needed) |
68 | */ | 68 | */ |
69 | 69 | ||
@@ -72,21 +72,31 @@ static struct kmem_cache *peer_cachep __read_mostly; | |||
72 | #define node_height(x) x->avl_height | 72 | #define node_height(x) x->avl_height |
73 | 73 | ||
74 | #define peer_avl_empty ((struct inet_peer *)&peer_fake_node) | 74 | #define peer_avl_empty ((struct inet_peer *)&peer_fake_node) |
75 | #define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node) | ||
75 | static const struct inet_peer peer_fake_node = { | 76 | static const struct inet_peer peer_fake_node = { |
76 | .avl_left = peer_avl_empty, | 77 | .avl_left = peer_avl_empty_rcu, |
77 | .avl_right = peer_avl_empty, | 78 | .avl_right = peer_avl_empty_rcu, |
78 | .avl_height = 0 | 79 | .avl_height = 0 |
79 | }; | 80 | }; |
80 | 81 | ||
81 | static struct { | 82 | struct inet_peer_base { |
82 | struct inet_peer *root; | 83 | struct inet_peer __rcu *root; |
83 | spinlock_t lock; | 84 | seqlock_t lock; |
84 | int total; | 85 | int total; |
85 | } peers = { | 86 | }; |
86 | .root = peer_avl_empty, | 87 | |
87 | .lock = __SPIN_LOCK_UNLOCKED(peers.lock), | 88 | static struct inet_peer_base v4_peers = { |
89 | .root = peer_avl_empty_rcu, | ||
90 | .lock = __SEQLOCK_UNLOCKED(v4_peers.lock), | ||
91 | .total = 0, | ||
92 | }; | ||
93 | |||
94 | static struct inet_peer_base v6_peers = { | ||
95 | .root = peer_avl_empty_rcu, | ||
96 | .lock = __SEQLOCK_UNLOCKED(v6_peers.lock), | ||
88 | .total = 0, | 97 | .total = 0, |
89 | }; | 98 | }; |
99 | |||
90 | #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ | 100 | #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ |
91 | 101 | ||
92 | /* Exported for sysctl_net_ipv4. */ | 102 | /* Exported for sysctl_net_ipv4. */ |
@@ -144,62 +154,99 @@ void __init inet_initpeers(void) | |||
144 | /* Called with or without local BH being disabled. */ | 154 | /* Called with or without local BH being disabled. */ |
145 | static void unlink_from_unused(struct inet_peer *p) | 155 | static void unlink_from_unused(struct inet_peer *p) |
146 | { | 156 | { |
147 | if (!list_empty(&p->unused)) { | 157 | spin_lock_bh(&unused_peers.lock); |
148 | spin_lock_bh(&unused_peers.lock); | 158 | list_del_init(&p->unused); |
149 | list_del_init(&p->unused); | 159 | spin_unlock_bh(&unused_peers.lock); |
150 | spin_unlock_bh(&unused_peers.lock); | 160 | } |
161 | |||
162 | static int addr_compare(const struct inetpeer_addr *a, | ||
163 | const struct inetpeer_addr *b) | ||
164 | { | ||
165 | int i, n = (a->family == AF_INET ? 1 : 4); | ||
166 | |||
167 | for (i = 0; i < n; i++) { | ||
168 | if (a->addr.a6[i] == b->addr.a6[i]) | ||
169 | continue; | ||
170 | if (a->addr.a6[i] < b->addr.a6[i]) | ||
171 | return -1; | ||
172 | return 1; | ||
151 | } | 173 | } |
174 | |||
175 | return 0; | ||
152 | } | 176 | } |
153 | 177 | ||
178 | #define rcu_deref_locked(X, BASE) \ | ||
179 | rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock)) | ||
180 | |||
154 | /* | 181 | /* |
155 | * Called with local BH disabled and the pool lock held. | 182 | * Called with local BH disabled and the pool lock held. |
156 | */ | 183 | */ |
157 | #define lookup(_daddr, _stack) \ | 184 | #define lookup(_daddr, _stack, _base) \ |
158 | ({ \ | 185 | ({ \ |
159 | struct inet_peer *u, **v; \ | 186 | struct inet_peer *u; \ |
187 | struct inet_peer __rcu **v; \ | ||
160 | \ | 188 | \ |
161 | stackptr = _stack; \ | 189 | stackptr = _stack; \ |
162 | *stackptr++ = &peers.root; \ | 190 | *stackptr++ = &_base->root; \ |
163 | for (u = peers.root; u != peer_avl_empty; ) { \ | 191 | for (u = rcu_deref_locked(_base->root, _base); \ |
164 | if (_daddr == u->v4daddr) \ | 192 | u != peer_avl_empty; ) { \ |
193 | int cmp = addr_compare(_daddr, &u->daddr); \ | ||
194 | if (cmp == 0) \ | ||
165 | break; \ | 195 | break; \ |
166 | if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \ | 196 | if (cmp == -1) \ |
167 | v = &u->avl_left; \ | 197 | v = &u->avl_left; \ |
168 | else \ | 198 | else \ |
169 | v = &u->avl_right; \ | 199 | v = &u->avl_right; \ |
170 | *stackptr++ = v; \ | 200 | *stackptr++ = v; \ |
171 | u = *v; \ | 201 | u = rcu_deref_locked(*v, _base); \ |
172 | } \ | 202 | } \ |
173 | u; \ | 203 | u; \ |
174 | }) | 204 | }) |
175 | 205 | ||
206 | static bool atomic_add_unless_return(atomic_t *ptr, int a, int u, int *newv) | ||
207 | { | ||
208 | int cur, old = atomic_read(ptr); | ||
209 | |||
210 | while (old != u) { | ||
211 | *newv = old + a; | ||
212 | cur = atomic_cmpxchg(ptr, old, *newv); | ||
213 | if (cur == old) | ||
214 | return true; | ||
215 | old = cur; | ||
216 | } | ||
217 | return false; | ||
218 | } | ||
219 | |||
176 | /* | 220 | /* |
177 | * Called with rcu_read_lock_bh() | 221 | * Called with rcu_read_lock() |
178 | * Because we hold no lock against a writer, its quite possible we fall | 222 | * Because we hold no lock against a writer, its quite possible we fall |
179 | * in an endless loop. | 223 | * in an endless loop. |
180 | * But every pointer we follow is guaranteed to be valid thanks to RCU. | 224 | * But every pointer we follow is guaranteed to be valid thanks to RCU. |
181 | * We exit from this function if number of links exceeds PEER_MAXDEPTH | 225 | * We exit from this function if number of links exceeds PEER_MAXDEPTH |
182 | */ | 226 | */ |
183 | static struct inet_peer *lookup_rcu_bh(__be32 daddr) | 227 | static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, |
228 | struct inet_peer_base *base, | ||
229 | int *newrefcnt) | ||
184 | { | 230 | { |
185 | struct inet_peer *u = rcu_dereference_bh(peers.root); | 231 | struct inet_peer *u = rcu_dereference(base->root); |
186 | int count = 0; | 232 | int count = 0; |
187 | 233 | ||
188 | while (u != peer_avl_empty) { | 234 | while (u != peer_avl_empty) { |
189 | if (daddr == u->v4daddr) { | 235 | int cmp = addr_compare(daddr, &u->daddr); |
236 | if (cmp == 0) { | ||
190 | /* Before taking a reference, check if this entry was | 237 | /* Before taking a reference, check if this entry was |
191 | * deleted, unlink_from_pool() sets refcnt=-1 to make | 238 | * deleted, unlink_from_pool() sets refcnt=-1 to make |
192 | * distinction between an unused entry (refcnt=0) and | 239 | * distinction between an unused entry (refcnt=0) and |
193 | * a freed one. | 240 | * a freed one. |
194 | */ | 241 | */ |
195 | if (unlikely(!atomic_add_unless(&u->refcnt, 1, -1))) | 242 | if (!atomic_add_unless_return(&u->refcnt, 1, -1, newrefcnt)) |
196 | u = NULL; | 243 | u = NULL; |
197 | return u; | 244 | return u; |
198 | } | 245 | } |
199 | if ((__force __u32)daddr < (__force __u32)u->v4daddr) | 246 | if (cmp == -1) |
200 | u = rcu_dereference_bh(u->avl_left); | 247 | u = rcu_dereference(u->avl_left); |
201 | else | 248 | else |
202 | u = rcu_dereference_bh(u->avl_right); | 249 | u = rcu_dereference(u->avl_right); |
203 | if (unlikely(++count == PEER_MAXDEPTH)) | 250 | if (unlikely(++count == PEER_MAXDEPTH)) |
204 | break; | 251 | break; |
205 | } | 252 | } |
@@ -207,15 +254,17 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr) | |||
207 | } | 254 | } |
208 | 255 | ||
209 | /* Called with local BH disabled and the pool lock held. */ | 256 | /* Called with local BH disabled and the pool lock held. */ |
210 | #define lookup_rightempty(start) \ | 257 | #define lookup_rightempty(start, base) \ |
211 | ({ \ | 258 | ({ \ |
212 | struct inet_peer *u, **v; \ | 259 | struct inet_peer *u; \ |
260 | struct inet_peer __rcu **v; \ | ||
213 | *stackptr++ = &start->avl_left; \ | 261 | *stackptr++ = &start->avl_left; \ |
214 | v = &start->avl_left; \ | 262 | v = &start->avl_left; \ |
215 | for (u = *v; u->avl_right != peer_avl_empty; ) { \ | 263 | for (u = rcu_deref_locked(*v, base); \ |
264 | u->avl_right != peer_avl_empty_rcu; ) { \ | ||
216 | v = &u->avl_right; \ | 265 | v = &u->avl_right; \ |
217 | *stackptr++ = v; \ | 266 | *stackptr++ = v; \ |
218 | u = *v; \ | 267 | u = rcu_deref_locked(*v, base); \ |
219 | } \ | 268 | } \ |
220 | u; \ | 269 | u; \ |
221 | }) | 270 | }) |
@@ -224,74 +273,76 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr) | |||
224 | * Variable names are the proof of operation correctness. | 273 | * Variable names are the proof of operation correctness. |
225 | * Look into mm/map_avl.c for more detail description of the ideas. | 274 | * Look into mm/map_avl.c for more detail description of the ideas. |
226 | */ | 275 | */ |
227 | static void peer_avl_rebalance(struct inet_peer **stack[], | 276 | static void peer_avl_rebalance(struct inet_peer __rcu **stack[], |
228 | struct inet_peer ***stackend) | 277 | struct inet_peer __rcu ***stackend, |
278 | struct inet_peer_base *base) | ||
229 | { | 279 | { |
230 | struct inet_peer **nodep, *node, *l, *r; | 280 | struct inet_peer __rcu **nodep; |
281 | struct inet_peer *node, *l, *r; | ||
231 | int lh, rh; | 282 | int lh, rh; |
232 | 283 | ||
233 | while (stackend > stack) { | 284 | while (stackend > stack) { |
234 | nodep = *--stackend; | 285 | nodep = *--stackend; |
235 | node = *nodep; | 286 | node = rcu_deref_locked(*nodep, base); |
236 | l = node->avl_left; | 287 | l = rcu_deref_locked(node->avl_left, base); |
237 | r = node->avl_right; | 288 | r = rcu_deref_locked(node->avl_right, base); |
238 | lh = node_height(l); | 289 | lh = node_height(l); |
239 | rh = node_height(r); | 290 | rh = node_height(r); |
240 | if (lh > rh + 1) { /* l: RH+2 */ | 291 | if (lh > rh + 1) { /* l: RH+2 */ |
241 | struct inet_peer *ll, *lr, *lrl, *lrr; | 292 | struct inet_peer *ll, *lr, *lrl, *lrr; |
242 | int lrh; | 293 | int lrh; |
243 | ll = l->avl_left; | 294 | ll = rcu_deref_locked(l->avl_left, base); |
244 | lr = l->avl_right; | 295 | lr = rcu_deref_locked(l->avl_right, base); |
245 | lrh = node_height(lr); | 296 | lrh = node_height(lr); |
246 | if (lrh <= node_height(ll)) { /* ll: RH+1 */ | 297 | if (lrh <= node_height(ll)) { /* ll: RH+1 */ |
247 | node->avl_left = lr; /* lr: RH or RH+1 */ | 298 | RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */ |
248 | node->avl_right = r; /* r: RH */ | 299 | RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ |
249 | node->avl_height = lrh + 1; /* RH+1 or RH+2 */ | 300 | node->avl_height = lrh + 1; /* RH+1 or RH+2 */ |
250 | l->avl_left = ll; /* ll: RH+1 */ | 301 | RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH+1 */ |
251 | l->avl_right = node; /* node: RH+1 or RH+2 */ | 302 | RCU_INIT_POINTER(l->avl_right, node); /* node: RH+1 or RH+2 */ |
252 | l->avl_height = node->avl_height + 1; | 303 | l->avl_height = node->avl_height + 1; |
253 | *nodep = l; | 304 | RCU_INIT_POINTER(*nodep, l); |
254 | } else { /* ll: RH, lr: RH+1 */ | 305 | } else { /* ll: RH, lr: RH+1 */ |
255 | lrl = lr->avl_left; /* lrl: RH or RH-1 */ | 306 | lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */ |
256 | lrr = lr->avl_right; /* lrr: RH or RH-1 */ | 307 | lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */ |
257 | node->avl_left = lrr; /* lrr: RH or RH-1 */ | 308 | RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */ |
258 | node->avl_right = r; /* r: RH */ | 309 | RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ |
259 | node->avl_height = rh + 1; /* node: RH+1 */ | 310 | node->avl_height = rh + 1; /* node: RH+1 */ |
260 | l->avl_left = ll; /* ll: RH */ | 311 | RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH */ |
261 | l->avl_right = lrl; /* lrl: RH or RH-1 */ | 312 | RCU_INIT_POINTER(l->avl_right, lrl); /* lrl: RH or RH-1 */ |
262 | l->avl_height = rh + 1; /* l: RH+1 */ | 313 | l->avl_height = rh + 1; /* l: RH+1 */ |
263 | lr->avl_left = l; /* l: RH+1 */ | 314 | RCU_INIT_POINTER(lr->avl_left, l); /* l: RH+1 */ |
264 | lr->avl_right = node; /* node: RH+1 */ | 315 | RCU_INIT_POINTER(lr->avl_right, node); /* node: RH+1 */ |
265 | lr->avl_height = rh + 2; | 316 | lr->avl_height = rh + 2; |
266 | *nodep = lr; | 317 | RCU_INIT_POINTER(*nodep, lr); |
267 | } | 318 | } |
268 | } else if (rh > lh + 1) { /* r: LH+2 */ | 319 | } else if (rh > lh + 1) { /* r: LH+2 */ |
269 | struct inet_peer *rr, *rl, *rlr, *rll; | 320 | struct inet_peer *rr, *rl, *rlr, *rll; |
270 | int rlh; | 321 | int rlh; |
271 | rr = r->avl_right; | 322 | rr = rcu_deref_locked(r->avl_right, base); |
272 | rl = r->avl_left; | 323 | rl = rcu_deref_locked(r->avl_left, base); |
273 | rlh = node_height(rl); | 324 | rlh = node_height(rl); |
274 | if (rlh <= node_height(rr)) { /* rr: LH+1 */ | 325 | if (rlh <= node_height(rr)) { /* rr: LH+1 */ |
275 | node->avl_right = rl; /* rl: LH or LH+1 */ | 326 | RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */ |
276 | node->avl_left = l; /* l: LH */ | 327 | RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ |
277 | node->avl_height = rlh + 1; /* LH+1 or LH+2 */ | 328 | node->avl_height = rlh + 1; /* LH+1 or LH+2 */ |
278 | r->avl_right = rr; /* rr: LH+1 */ | 329 | RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH+1 */ |
279 | r->avl_left = node; /* node: LH+1 or LH+2 */ | 330 | RCU_INIT_POINTER(r->avl_left, node); /* node: LH+1 or LH+2 */ |
280 | r->avl_height = node->avl_height + 1; | 331 | r->avl_height = node->avl_height + 1; |
281 | *nodep = r; | 332 | RCU_INIT_POINTER(*nodep, r); |
282 | } else { /* rr: RH, rl: RH+1 */ | 333 | } else { /* rr: RH, rl: RH+1 */ |
283 | rlr = rl->avl_right; /* rlr: LH or LH-1 */ | 334 | rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */ |
284 | rll = rl->avl_left; /* rll: LH or LH-1 */ | 335 | rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */ |
285 | node->avl_right = rll; /* rll: LH or LH-1 */ | 336 | RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */ |
286 | node->avl_left = l; /* l: LH */ | 337 | RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ |
287 | node->avl_height = lh + 1; /* node: LH+1 */ | 338 | node->avl_height = lh + 1; /* node: LH+1 */ |
288 | r->avl_right = rr; /* rr: LH */ | 339 | RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH */ |
289 | r->avl_left = rlr; /* rlr: LH or LH-1 */ | 340 | RCU_INIT_POINTER(r->avl_left, rlr); /* rlr: LH or LH-1 */ |
290 | r->avl_height = lh + 1; /* r: LH+1 */ | 341 | r->avl_height = lh + 1; /* r: LH+1 */ |
291 | rl->avl_right = r; /* r: LH+1 */ | 342 | RCU_INIT_POINTER(rl->avl_right, r); /* r: LH+1 */ |
292 | rl->avl_left = node; /* node: LH+1 */ | 343 | RCU_INIT_POINTER(rl->avl_left, node); /* node: LH+1 */ |
293 | rl->avl_height = lh + 2; | 344 | rl->avl_height = lh + 2; |
294 | *nodep = rl; | 345 | RCU_INIT_POINTER(*nodep, rl); |
295 | } | 346 | } |
296 | } else { | 347 | } else { |
297 | node->avl_height = (lh > rh ? lh : rh) + 1; | 348 | node->avl_height = (lh > rh ? lh : rh) + 1; |
@@ -300,14 +351,14 @@ static void peer_avl_rebalance(struct inet_peer **stack[], | |||
300 | } | 351 | } |
301 | 352 | ||
302 | /* Called with local BH disabled and the pool lock held. */ | 353 | /* Called with local BH disabled and the pool lock held. */ |
303 | #define link_to_pool(n) \ | 354 | #define link_to_pool(n, base) \ |
304 | do { \ | 355 | do { \ |
305 | n->avl_height = 1; \ | 356 | n->avl_height = 1; \ |
306 | n->avl_left = peer_avl_empty; \ | 357 | n->avl_left = peer_avl_empty_rcu; \ |
307 | n->avl_right = peer_avl_empty; \ | 358 | n->avl_right = peer_avl_empty_rcu; \ |
308 | smp_wmb(); /* lockless readers can catch us now */ \ | 359 | /* lockless readers can catch us now */ \ |
309 | **--stackptr = n; \ | 360 | rcu_assign_pointer(**--stackptr, n); \ |
310 | peer_avl_rebalance(stack, stackptr); \ | 361 | peer_avl_rebalance(stack, stackptr, base); \ |
311 | } while (0) | 362 | } while (0) |
312 | 363 | ||
313 | static void inetpeer_free_rcu(struct rcu_head *head) | 364 | static void inetpeer_free_rcu(struct rcu_head *head) |
@@ -316,13 +367,14 @@ static void inetpeer_free_rcu(struct rcu_head *head) | |||
316 | } | 367 | } |
317 | 368 | ||
318 | /* May be called with local BH enabled. */ | 369 | /* May be called with local BH enabled. */ |
319 | static void unlink_from_pool(struct inet_peer *p) | 370 | static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base, |
371 | struct inet_peer __rcu **stack[PEER_MAXDEPTH]) | ||
320 | { | 372 | { |
321 | int do_free; | 373 | int do_free; |
322 | 374 | ||
323 | do_free = 0; | 375 | do_free = 0; |
324 | 376 | ||
325 | spin_lock_bh(&peers.lock); | 377 | write_seqlock_bh(&base->lock); |
326 | /* Check the reference counter. It was artificially incremented by 1 | 378 | /* Check the reference counter. It was artificially incremented by 1 |
327 | * in cleanup() function to prevent sudden disappearing. If we can | 379 | * in cleanup() function to prevent sudden disappearing. If we can |
328 | * atomically (because of lockless readers) take this last reference, | 380 | * atomically (because of lockless readers) take this last reference, |
@@ -330,38 +382,37 @@ static void unlink_from_pool(struct inet_peer *p) | |||
330 | * We use refcnt=-1 to alert lockless readers this entry is deleted. | 382 | * We use refcnt=-1 to alert lockless readers this entry is deleted. |
331 | */ | 383 | */ |
332 | if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { | 384 | if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { |
333 | struct inet_peer **stack[PEER_MAXDEPTH]; | 385 | struct inet_peer __rcu ***stackptr, ***delp; |
334 | struct inet_peer ***stackptr, ***delp; | 386 | if (lookup(&p->daddr, stack, base) != p) |
335 | if (lookup(p->v4daddr, stack) != p) | ||
336 | BUG(); | 387 | BUG(); |
337 | delp = stackptr - 1; /* *delp[0] == p */ | 388 | delp = stackptr - 1; /* *delp[0] == p */ |
338 | if (p->avl_left == peer_avl_empty) { | 389 | if (p->avl_left == peer_avl_empty_rcu) { |
339 | *delp[0] = p->avl_right; | 390 | *delp[0] = p->avl_right; |
340 | --stackptr; | 391 | --stackptr; |
341 | } else { | 392 | } else { |
342 | /* look for a node to insert instead of p */ | 393 | /* look for a node to insert instead of p */ |
343 | struct inet_peer *t; | 394 | struct inet_peer *t; |
344 | t = lookup_rightempty(p); | 395 | t = lookup_rightempty(p, base); |
345 | BUG_ON(*stackptr[-1] != t); | 396 | BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t); |
346 | **--stackptr = t->avl_left; | 397 | **--stackptr = t->avl_left; |
347 | /* t is removed, t->v4daddr > x->v4daddr for any | 398 | /* t is removed, t->daddr > x->daddr for any |
348 | * x in p->avl_left subtree. | 399 | * x in p->avl_left subtree. |
349 | * Put t in the old place of p. */ | 400 | * Put t in the old place of p. */ |
350 | *delp[0] = t; | 401 | RCU_INIT_POINTER(*delp[0], t); |
351 | t->avl_left = p->avl_left; | 402 | t->avl_left = p->avl_left; |
352 | t->avl_right = p->avl_right; | 403 | t->avl_right = p->avl_right; |
353 | t->avl_height = p->avl_height; | 404 | t->avl_height = p->avl_height; |
354 | BUG_ON(delp[1] != &p->avl_left); | 405 | BUG_ON(delp[1] != &p->avl_left); |
355 | delp[1] = &t->avl_left; /* was &p->avl_left */ | 406 | delp[1] = &t->avl_left; /* was &p->avl_left */ |
356 | } | 407 | } |
357 | peer_avl_rebalance(stack, stackptr); | 408 | peer_avl_rebalance(stack, stackptr, base); |
358 | peers.total--; | 409 | base->total--; |
359 | do_free = 1; | 410 | do_free = 1; |
360 | } | 411 | } |
361 | spin_unlock_bh(&peers.lock); | 412 | write_sequnlock_bh(&base->lock); |
362 | 413 | ||
363 | if (do_free) | 414 | if (do_free) |
364 | call_rcu_bh(&p->rcu, inetpeer_free_rcu); | 415 | call_rcu(&p->rcu, inetpeer_free_rcu); |
365 | else | 416 | else |
366 | /* The node is used again. Decrease the reference counter | 417 | /* The node is used again. Decrease the reference counter |
367 | * back. The loop "cleanup -> unlink_from_unused | 418 | * back. The loop "cleanup -> unlink_from_unused |
@@ -373,8 +424,18 @@ static void unlink_from_pool(struct inet_peer *p) | |||
373 | inet_putpeer(p); | 424 | inet_putpeer(p); |
374 | } | 425 | } |
375 | 426 | ||
427 | static struct inet_peer_base *family_to_base(int family) | ||
428 | { | ||
429 | return (family == AF_INET ? &v4_peers : &v6_peers); | ||
430 | } | ||
431 | |||
432 | static struct inet_peer_base *peer_to_base(struct inet_peer *p) | ||
433 | { | ||
434 | return family_to_base(p->daddr.family); | ||
435 | } | ||
436 | |||
376 | /* May be called with local BH enabled. */ | 437 | /* May be called with local BH enabled. */ |
377 | static int cleanup_once(unsigned long ttl) | 438 | static int cleanup_once(unsigned long ttl, struct inet_peer __rcu **stack[PEER_MAXDEPTH]) |
378 | { | 439 | { |
379 | struct inet_peer *p = NULL; | 440 | struct inet_peer *p = NULL; |
380 | 441 | ||
@@ -406,79 +467,101 @@ static int cleanup_once(unsigned long ttl) | |||
406 | * happen because of entry limits in route cache. */ | 467 | * happen because of entry limits in route cache. */ |
407 | return -1; | 468 | return -1; |
408 | 469 | ||
409 | unlink_from_pool(p); | 470 | unlink_from_pool(p, peer_to_base(p), stack); |
410 | return 0; | 471 | return 0; |
411 | } | 472 | } |
412 | 473 | ||
413 | /* Called with or without local BH being disabled. */ | 474 | /* Called with or without local BH being disabled. */ |
414 | struct inet_peer *inet_getpeer(__be32 daddr, int create) | 475 | struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) |
415 | { | 476 | { |
477 | struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; | ||
478 | struct inet_peer_base *base = family_to_base(daddr->family); | ||
416 | struct inet_peer *p; | 479 | struct inet_peer *p; |
417 | struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr; | 480 | unsigned int sequence; |
481 | int invalidated, newrefcnt = 0; | ||
418 | 482 | ||
419 | /* Look up for the address quickly, lockless. | 483 | /* Look up for the address quickly, lockless. |
420 | * Because of a concurrent writer, we might not find an existing entry. | 484 | * Because of a concurrent writer, we might not find an existing entry. |
421 | */ | 485 | */ |
422 | rcu_read_lock_bh(); | 486 | rcu_read_lock(); |
423 | p = lookup_rcu_bh(daddr); | 487 | sequence = read_seqbegin(&base->lock); |
424 | rcu_read_unlock_bh(); | 488 | p = lookup_rcu(daddr, base, &newrefcnt); |
489 | invalidated = read_seqretry(&base->lock, sequence); | ||
490 | rcu_read_unlock(); | ||
425 | 491 | ||
426 | if (p) { | 492 | if (p) { |
427 | /* The existing node has been found. | 493 | found: /* The existing node has been found. |
428 | * Remove the entry from unused list if it was there. | 494 | * Remove the entry from unused list if it was there. |
429 | */ | 495 | */ |
430 | unlink_from_unused(p); | 496 | if (newrefcnt == 1) |
497 | unlink_from_unused(p); | ||
431 | return p; | 498 | return p; |
432 | } | 499 | } |
433 | 500 | ||
501 | /* If no writer did a change during our lookup, we can return early. */ | ||
502 | if (!create && !invalidated) | ||
503 | return NULL; | ||
504 | |||
434 | /* retry an exact lookup, taking the lock before. | 505 | /* retry an exact lookup, taking the lock before. |
435 | * At least, nodes should be hot in our cache. | 506 | * At least, nodes should be hot in our cache. |
436 | */ | 507 | */ |
437 | spin_lock_bh(&peers.lock); | 508 | write_seqlock_bh(&base->lock); |
438 | p = lookup(daddr, stack); | 509 | p = lookup(daddr, stack, base); |
439 | if (p != peer_avl_empty) { | 510 | if (p != peer_avl_empty) { |
440 | atomic_inc(&p->refcnt); | 511 | newrefcnt = atomic_inc_return(&p->refcnt); |
441 | spin_unlock_bh(&peers.lock); | 512 | write_sequnlock_bh(&base->lock); |
442 | /* Remove the entry from unused list if it was there. */ | 513 | goto found; |
443 | unlink_from_unused(p); | ||
444 | return p; | ||
445 | } | 514 | } |
446 | p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; | 515 | p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; |
447 | if (p) { | 516 | if (p) { |
448 | p->v4daddr = daddr; | 517 | p->daddr = *daddr; |
449 | atomic_set(&p->refcnt, 1); | 518 | atomic_set(&p->refcnt, 1); |
450 | atomic_set(&p->rid, 0); | 519 | atomic_set(&p->rid, 0); |
451 | atomic_set(&p->ip_id_count, secure_ip_id(daddr)); | 520 | atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4)); |
452 | p->tcp_ts_stamp = 0; | 521 | p->tcp_ts_stamp = 0; |
522 | p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; | ||
523 | p->rate_tokens = 0; | ||
524 | p->rate_last = 0; | ||
525 | p->pmtu_expires = 0; | ||
526 | p->pmtu_orig = 0; | ||
527 | memset(&p->redirect_learned, 0, sizeof(p->redirect_learned)); | ||
453 | INIT_LIST_HEAD(&p->unused); | 528 | INIT_LIST_HEAD(&p->unused); |
454 | 529 | ||
455 | 530 | ||
456 | /* Link the node. */ | 531 | /* Link the node. */ |
457 | link_to_pool(p); | 532 | link_to_pool(p, base); |
458 | peers.total++; | 533 | base->total++; |
459 | } | 534 | } |
460 | spin_unlock_bh(&peers.lock); | 535 | write_sequnlock_bh(&base->lock); |
461 | 536 | ||
462 | if (peers.total >= inet_peer_threshold) | 537 | if (base->total >= inet_peer_threshold) |
463 | /* Remove one less-recently-used entry. */ | 538 | /* Remove one less-recently-used entry. */ |
464 | cleanup_once(0); | 539 | cleanup_once(0, stack); |
465 | 540 | ||
466 | return p; | 541 | return p; |
467 | } | 542 | } |
468 | 543 | ||
544 | static int compute_total(void) | ||
545 | { | ||
546 | return v4_peers.total + v6_peers.total; | ||
547 | } | ||
548 | EXPORT_SYMBOL_GPL(inet_getpeer); | ||
549 | |||
469 | /* Called with local BH disabled. */ | 550 | /* Called with local BH disabled. */ |
470 | static void peer_check_expire(unsigned long dummy) | 551 | static void peer_check_expire(unsigned long dummy) |
471 | { | 552 | { |
472 | unsigned long now = jiffies; | 553 | unsigned long now = jiffies; |
473 | int ttl; | 554 | int ttl, total; |
555 | struct inet_peer __rcu **stack[PEER_MAXDEPTH]; | ||
474 | 556 | ||
475 | if (peers.total >= inet_peer_threshold) | 557 | total = compute_total(); |
558 | if (total >= inet_peer_threshold) | ||
476 | ttl = inet_peer_minttl; | 559 | ttl = inet_peer_minttl; |
477 | else | 560 | else |
478 | ttl = inet_peer_maxttl | 561 | ttl = inet_peer_maxttl |
479 | - (inet_peer_maxttl - inet_peer_minttl) / HZ * | 562 | - (inet_peer_maxttl - inet_peer_minttl) / HZ * |
480 | peers.total / inet_peer_threshold * HZ; | 563 | total / inet_peer_threshold * HZ; |
481 | while (!cleanup_once(ttl)) { | 564 | while (!cleanup_once(ttl, stack)) { |
482 | if (jiffies != now) | 565 | if (jiffies != now) |
483 | break; | 566 | break; |
484 | } | 567 | } |
@@ -486,13 +569,14 @@ static void peer_check_expire(unsigned long dummy) | |||
486 | /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime | 569 | /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime |
487 | * interval depending on the total number of entries (more entries, | 570 | * interval depending on the total number of entries (more entries, |
488 | * less interval). */ | 571 | * less interval). */ |
489 | if (peers.total >= inet_peer_threshold) | 572 | total = compute_total(); |
573 | if (total >= inet_peer_threshold) | ||
490 | peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime; | 574 | peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime; |
491 | else | 575 | else |
492 | peer_periodic_timer.expires = jiffies | 576 | peer_periodic_timer.expires = jiffies |
493 | + inet_peer_gc_maxtime | 577 | + inet_peer_gc_maxtime |
494 | - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * | 578 | - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * |
495 | peers.total / inet_peer_threshold * HZ; | 579 | total / inet_peer_threshold * HZ; |
496 | add_timer(&peer_periodic_timer); | 580 | add_timer(&peer_periodic_timer); |
497 | } | 581 | } |
498 | 582 | ||
@@ -508,3 +592,45 @@ void inet_putpeer(struct inet_peer *p) | |||
508 | 592 | ||
509 | local_bh_enable(); | 593 | local_bh_enable(); |
510 | } | 594 | } |
595 | EXPORT_SYMBOL_GPL(inet_putpeer); | ||
596 | |||
597 | /* | ||
598 | * Check transmit rate limitation for given message. | ||
599 | * The rate information is held in the inet_peer entries now. | ||
600 | * This function is generic and could be used for other purposes | ||
601 | * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. | ||
602 | * | ||
603 | * Note that the same inet_peer fields are modified by functions in | ||
604 | * route.c too, but these work for packet destinations while xrlim_allow | ||
605 | * works for icmp destinations. This means the rate limiting information | ||
606 | * for one "ip object" is shared - and these ICMPs are twice limited: | ||
607 | * by source and by destination. | ||
608 | * | ||
609 | * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate | ||
610 | * SHOULD allow setting of rate limits | ||
611 | * | ||
612 | * Shared between ICMPv4 and ICMPv6. | ||
613 | */ | ||
614 | #define XRLIM_BURST_FACTOR 6 | ||
615 | bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) | ||
616 | { | ||
617 | unsigned long now, token; | ||
618 | bool rc = false; | ||
619 | |||
620 | if (!peer) | ||
621 | return true; | ||
622 | |||
623 | token = peer->rate_tokens; | ||
624 | now = jiffies; | ||
625 | token += now - peer->rate_last; | ||
626 | peer->rate_last = now; | ||
627 | if (token > XRLIM_BURST_FACTOR * timeout) | ||
628 | token = XRLIM_BURST_FACTOR * timeout; | ||
629 | if (token >= timeout) { | ||
630 | token -= timeout; | ||
631 | rc = true; | ||
632 | } | ||
633 | peer->rate_tokens = token; | ||
634 | return rc; | ||
635 | } | ||
636 | EXPORT_SYMBOL(inet_peer_xrlim_allow); | ||
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 99461f09320f..3b34d1c86270 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c | |||
@@ -84,7 +84,7 @@ int ip_forward(struct sk_buff *skb) | |||
84 | 84 | ||
85 | rt = skb_rtable(skb); | 85 | rt = skb_rtable(skb); |
86 | 86 | ||
87 | if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) | 87 | if (opt->is_strictroute && ip_hdr(skb)->daddr != rt->rt_gateway) |
88 | goto sr_failed; | 88 | goto sr_failed; |
89 | 89 | ||
90 | if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && | 90 | if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && |
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b7c41654dde5..0ad6035f6366 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c | |||
@@ -45,6 +45,7 @@ | |||
45 | #include <linux/udp.h> | 45 | #include <linux/udp.h> |
46 | #include <linux/inet.h> | 46 | #include <linux/inet.h> |
47 | #include <linux/netfilter_ipv4.h> | 47 | #include <linux/netfilter_ipv4.h> |
48 | #include <net/inet_ecn.h> | ||
48 | 49 | ||
49 | /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 | 50 | /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 |
50 | * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c | 51 | * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c |
@@ -70,11 +71,46 @@ struct ipq { | |||
70 | __be32 daddr; | 71 | __be32 daddr; |
71 | __be16 id; | 72 | __be16 id; |
72 | u8 protocol; | 73 | u8 protocol; |
74 | u8 ecn; /* RFC3168 support */ | ||
73 | int iif; | 75 | int iif; |
74 | unsigned int rid; | 76 | unsigned int rid; |
75 | struct inet_peer *peer; | 77 | struct inet_peer *peer; |
76 | }; | 78 | }; |
77 | 79 | ||
80 | /* RFC 3168 support : | ||
81 | * We want to check ECN values of all fragments, do detect invalid combinations. | ||
82 | * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value. | ||
83 | */ | ||
84 | #define IPFRAG_ECN_NOT_ECT 0x01 /* one frag had ECN_NOT_ECT */ | ||
85 | #define IPFRAG_ECN_ECT_1 0x02 /* one frag had ECN_ECT_1 */ | ||
86 | #define IPFRAG_ECN_ECT_0 0x04 /* one frag had ECN_ECT_0 */ | ||
87 | #define IPFRAG_ECN_CE 0x08 /* one frag had ECN_CE */ | ||
88 | |||
89 | static inline u8 ip4_frag_ecn(u8 tos) | ||
90 | { | ||
91 | return 1 << (tos & INET_ECN_MASK); | ||
92 | } | ||
93 | |||
94 | /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements | ||
95 | * Value : 0xff if frame should be dropped. | ||
96 | * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field | ||
97 | */ | ||
98 | static const u8 ip4_frag_ecn_table[16] = { | ||
99 | /* at least one fragment had CE, and others ECT_0 or ECT_1 */ | ||
100 | [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE, | ||
101 | [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE, | ||
102 | [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE, | ||
103 | |||
104 | /* invalid combinations : drop frame */ | ||
105 | [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff, | ||
106 | [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff, | ||
107 | [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff, | ||
108 | [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, | ||
109 | [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff, | ||
110 | [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff, | ||
111 | [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, | ||
112 | }; | ||
113 | |||
78 | static struct inet_frags ip4_frags; | 114 | static struct inet_frags ip4_frags; |
79 | 115 | ||
80 | int ip_frag_nqueues(struct net *net) | 116 | int ip_frag_nqueues(struct net *net) |
@@ -116,11 +152,11 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a) | |||
116 | struct ip4_create_arg *arg = a; | 152 | struct ip4_create_arg *arg = a; |
117 | 153 | ||
118 | qp = container_of(q, struct ipq, q); | 154 | qp = container_of(q, struct ipq, q); |
119 | return (qp->id == arg->iph->id && | 155 | return qp->id == arg->iph->id && |
120 | qp->saddr == arg->iph->saddr && | 156 | qp->saddr == arg->iph->saddr && |
121 | qp->daddr == arg->iph->daddr && | 157 | qp->daddr == arg->iph->daddr && |
122 | qp->protocol == arg->iph->protocol && | 158 | qp->protocol == arg->iph->protocol && |
123 | qp->user == arg->user); | 159 | qp->user == arg->user; |
124 | } | 160 | } |
125 | 161 | ||
126 | /* Memory Tracking Functions. */ | 162 | /* Memory Tracking Functions. */ |
@@ -137,11 +173,12 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a) | |||
137 | 173 | ||
138 | qp->protocol = arg->iph->protocol; | 174 | qp->protocol = arg->iph->protocol; |
139 | qp->id = arg->iph->id; | 175 | qp->id = arg->iph->id; |
176 | qp->ecn = ip4_frag_ecn(arg->iph->tos); | ||
140 | qp->saddr = arg->iph->saddr; | 177 | qp->saddr = arg->iph->saddr; |
141 | qp->daddr = arg->iph->daddr; | 178 | qp->daddr = arg->iph->daddr; |
142 | qp->user = arg->user; | 179 | qp->user = arg->user; |
143 | qp->peer = sysctl_ipfrag_max_dist ? | 180 | qp->peer = sysctl_ipfrag_max_dist ? |
144 | inet_getpeer(arg->iph->saddr, 1) : NULL; | 181 | inet_getpeer_v4(arg->iph->saddr, 1) : NULL; |
145 | } | 182 | } |
146 | 183 | ||
147 | static __inline__ void ip4_frag_free(struct inet_frag_queue *q) | 184 | static __inline__ void ip4_frag_free(struct inet_frag_queue *q) |
@@ -204,31 +241,30 @@ static void ip_expire(unsigned long arg) | |||
204 | 241 | ||
205 | if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { | 242 | if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { |
206 | struct sk_buff *head = qp->q.fragments; | 243 | struct sk_buff *head = qp->q.fragments; |
244 | const struct iphdr *iph; | ||
245 | int err; | ||
207 | 246 | ||
208 | rcu_read_lock(); | 247 | rcu_read_lock(); |
209 | head->dev = dev_get_by_index_rcu(net, qp->iif); | 248 | head->dev = dev_get_by_index_rcu(net, qp->iif); |
210 | if (!head->dev) | 249 | if (!head->dev) |
211 | goto out_rcu_unlock; | 250 | goto out_rcu_unlock; |
212 | 251 | ||
252 | /* skb dst is stale, drop it, and perform route lookup again */ | ||
253 | skb_dst_drop(head); | ||
254 | iph = ip_hdr(head); | ||
255 | err = ip_route_input_noref(head, iph->daddr, iph->saddr, | ||
256 | iph->tos, head->dev); | ||
257 | if (err) | ||
258 | goto out_rcu_unlock; | ||
259 | |||
213 | /* | 260 | /* |
214 | * Only search router table for the head fragment, | 261 | * Only an end host needs to send an ICMP |
215 | * when defraging timeout at PRE_ROUTING HOOK. | 262 | * "Fragment Reassembly Timeout" message, per RFC792. |
216 | */ | 263 | */ |
217 | if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) { | 264 | if (qp->user == IP_DEFRAG_CONNTRACK_IN && |
218 | const struct iphdr *iph = ip_hdr(head); | 265 | skb_rtable(head)->rt_type != RTN_LOCAL) |
219 | int err = ip_route_input(head, iph->daddr, iph->saddr, | 266 | goto out_rcu_unlock; |
220 | iph->tos, head->dev); | ||
221 | if (unlikely(err)) | ||
222 | goto out_rcu_unlock; | ||
223 | |||
224 | /* | ||
225 | * Only an end host needs to send an ICMP | ||
226 | * "Fragment Reassembly Timeout" message, per RFC792. | ||
227 | */ | ||
228 | if (skb_rtable(head)->rt_type != RTN_LOCAL) | ||
229 | goto out_rcu_unlock; | ||
230 | 267 | ||
231 | } | ||
232 | 268 | ||
233 | /* Send an ICMP "Fragment Reassembly Timeout" message. */ | 269 | /* Send an ICMP "Fragment Reassembly Timeout" message. */ |
234 | icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); | 270 | icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); |
@@ -316,6 +352,7 @@ static int ip_frag_reinit(struct ipq *qp) | |||
316 | qp->q.fragments = NULL; | 352 | qp->q.fragments = NULL; |
317 | qp->q.fragments_tail = NULL; | 353 | qp->q.fragments_tail = NULL; |
318 | qp->iif = 0; | 354 | qp->iif = 0; |
355 | qp->ecn = 0; | ||
319 | 356 | ||
320 | return 0; | 357 | return 0; |
321 | } | 358 | } |
@@ -328,6 +365,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) | |||
328 | int flags, offset; | 365 | int flags, offset; |
329 | int ihl, end; | 366 | int ihl, end; |
330 | int err = -ENOENT; | 367 | int err = -ENOENT; |
368 | u8 ecn; | ||
331 | 369 | ||
332 | if (qp->q.last_in & INET_FRAG_COMPLETE) | 370 | if (qp->q.last_in & INET_FRAG_COMPLETE) |
333 | goto err; | 371 | goto err; |
@@ -339,6 +377,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) | |||
339 | goto err; | 377 | goto err; |
340 | } | 378 | } |
341 | 379 | ||
380 | ecn = ip4_frag_ecn(ip_hdr(skb)->tos); | ||
342 | offset = ntohs(ip_hdr(skb)->frag_off); | 381 | offset = ntohs(ip_hdr(skb)->frag_off); |
343 | flags = offset & ~IP_OFFSET; | 382 | flags = offset & ~IP_OFFSET; |
344 | offset &= IP_OFFSET; | 383 | offset &= IP_OFFSET; |
@@ -472,6 +511,7 @@ found: | |||
472 | } | 511 | } |
473 | qp->q.stamp = skb->tstamp; | 512 | qp->q.stamp = skb->tstamp; |
474 | qp->q.meat += skb->len; | 513 | qp->q.meat += skb->len; |
514 | qp->ecn |= ecn; | ||
475 | atomic_add(skb->truesize, &qp->q.net->mem); | 515 | atomic_add(skb->truesize, &qp->q.net->mem); |
476 | if (offset == 0) | 516 | if (offset == 0) |
477 | qp->q.last_in |= INET_FRAG_FIRST_IN; | 517 | qp->q.last_in |= INET_FRAG_FIRST_IN; |
@@ -502,9 +542,15 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, | |||
502 | int len; | 542 | int len; |
503 | int ihlen; | 543 | int ihlen; |
504 | int err; | 544 | int err; |
545 | u8 ecn; | ||
505 | 546 | ||
506 | ipq_kill(qp); | 547 | ipq_kill(qp); |
507 | 548 | ||
549 | ecn = ip4_frag_ecn_table[qp->ecn]; | ||
550 | if (unlikely(ecn == 0xff)) { | ||
551 | err = -EINVAL; | ||
552 | goto out_fail; | ||
553 | } | ||
508 | /* Make the one we just received the head. */ | 554 | /* Make the one we just received the head. */ |
509 | if (prev) { | 555 | if (prev) { |
510 | head = prev->next; | 556 | head = prev->next; |
@@ -542,7 +588,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, | |||
542 | /* If the first fragment is fragmented itself, we split | 588 | /* If the first fragment is fragmented itself, we split |
543 | * it to two chunks: the first with data and paged part | 589 | * it to two chunks: the first with data and paged part |
544 | * and the second, holding only fragments. */ | 590 | * and the second, holding only fragments. */ |
545 | if (skb_has_frags(head)) { | 591 | if (skb_has_frag_list(head)) { |
546 | struct sk_buff *clone; | 592 | struct sk_buff *clone; |
547 | int i, plen = 0; | 593 | int i, plen = 0; |
548 | 594 | ||
@@ -583,6 +629,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, | |||
583 | iph = ip_hdr(head); | 629 | iph = ip_hdr(head); |
584 | iph->frag_off = 0; | 630 | iph->frag_off = 0; |
585 | iph->tot_len = htons(len); | 631 | iph->tot_len = htons(len); |
632 | iph->tos |= ecn; | ||
586 | IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); | 633 | IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); |
587 | qp->q.fragments = NULL; | 634 | qp->q.fragments = NULL; |
588 | qp->q.fragments_tail = NULL; | 635 | qp->q.fragments_tail = NULL; |
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 35c93e8b6a46..8871067560db 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c | |||
@@ -44,6 +44,7 @@ | |||
44 | #include <net/net_namespace.h> | 44 | #include <net/net_namespace.h> |
45 | #include <net/netns/generic.h> | 45 | #include <net/netns/generic.h> |
46 | #include <net/rtnetlink.h> | 46 | #include <net/rtnetlink.h> |
47 | #include <net/gre.h> | ||
47 | 48 | ||
48 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 49 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
49 | #include <net/ipv6.h> | 50 | #include <net/ipv6.h> |
@@ -63,13 +64,13 @@ | |||
63 | We cannot track such dead loops during route installation, | 64 | We cannot track such dead loops during route installation, |
64 | it is infeasible task. The most general solutions would be | 65 | it is infeasible task. The most general solutions would be |
65 | to keep skb->encapsulation counter (sort of local ttl), | 66 | to keep skb->encapsulation counter (sort of local ttl), |
66 | and silently drop packet when it expires. It is the best | 67 | and silently drop packet when it expires. It is a good |
67 | solution, but it supposes maintaing new variable in ALL | 68 | solution, but it supposes maintaing new variable in ALL |
68 | skb, even if no tunneling is used. | 69 | skb, even if no tunneling is used. |
69 | 70 | ||
70 | Current solution: HARD_TX_LOCK lock breaks dead loops. | 71 | Current solution: xmit_recursion breaks dead loops. This is a percpu |
71 | 72 | counter, since when we enter the first ndo_xmit(), cpu migration is | |
72 | 73 | forbidden. We force an exit if this counter reaches RECURSION_LIMIT | |
73 | 74 | ||
74 | 2. Networking dead loops would not kill routers, but would really | 75 | 2. Networking dead loops would not kill routers, but would really |
75 | kill network. IP hop limit plays role of "t->recursion" in this case, | 76 | kill network. IP hop limit plays role of "t->recursion" in this case, |
@@ -128,7 +129,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev); | |||
128 | 129 | ||
129 | static int ipgre_net_id __read_mostly; | 130 | static int ipgre_net_id __read_mostly; |
130 | struct ipgre_net { | 131 | struct ipgre_net { |
131 | struct ip_tunnel *tunnels[4][HASH_SIZE]; | 132 | struct ip_tunnel __rcu *tunnels[4][HASH_SIZE]; |
132 | 133 | ||
133 | struct net_device *fb_tunnel_dev; | 134 | struct net_device *fb_tunnel_dev; |
134 | }; | 135 | }; |
@@ -158,13 +159,40 @@ struct ipgre_net { | |||
158 | #define tunnels_l tunnels[1] | 159 | #define tunnels_l tunnels[1] |
159 | #define tunnels_wc tunnels[0] | 160 | #define tunnels_wc tunnels[0] |
160 | /* | 161 | /* |
161 | * Locking : hash tables are protected by RCU and a spinlock | 162 | * Locking : hash tables are protected by RCU and RTNL |
162 | */ | 163 | */ |
163 | static DEFINE_SPINLOCK(ipgre_lock); | ||
164 | 164 | ||
165 | #define for_each_ip_tunnel_rcu(start) \ | 165 | #define for_each_ip_tunnel_rcu(start) \ |
166 | for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) | 166 | for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) |
167 | 167 | ||
168 | /* often modified stats are per cpu, other are shared (netdev->stats) */ | ||
169 | struct pcpu_tstats { | ||
170 | unsigned long rx_packets; | ||
171 | unsigned long rx_bytes; | ||
172 | unsigned long tx_packets; | ||
173 | unsigned long tx_bytes; | ||
174 | }; | ||
175 | |||
176 | static struct net_device_stats *ipgre_get_stats(struct net_device *dev) | ||
177 | { | ||
178 | struct pcpu_tstats sum = { 0 }; | ||
179 | int i; | ||
180 | |||
181 | for_each_possible_cpu(i) { | ||
182 | const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); | ||
183 | |||
184 | sum.rx_packets += tstats->rx_packets; | ||
185 | sum.rx_bytes += tstats->rx_bytes; | ||
186 | sum.tx_packets += tstats->tx_packets; | ||
187 | sum.tx_bytes += tstats->tx_bytes; | ||
188 | } | ||
189 | dev->stats.rx_packets = sum.rx_packets; | ||
190 | dev->stats.rx_bytes = sum.rx_bytes; | ||
191 | dev->stats.tx_packets = sum.tx_packets; | ||
192 | dev->stats.tx_bytes = sum.tx_bytes; | ||
193 | return &dev->stats; | ||
194 | } | ||
195 | |||
168 | /* Given src, dst and key, find appropriate for input tunnel. */ | 196 | /* Given src, dst and key, find appropriate for input tunnel. */ |
169 | 197 | ||
170 | static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, | 198 | static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, |
@@ -173,8 +201,8 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, | |||
173 | { | 201 | { |
174 | struct net *net = dev_net(dev); | 202 | struct net *net = dev_net(dev); |
175 | int link = dev->ifindex; | 203 | int link = dev->ifindex; |
176 | unsigned h0 = HASH(remote); | 204 | unsigned int h0 = HASH(remote); |
177 | unsigned h1 = HASH(key); | 205 | unsigned int h1 = HASH(key); |
178 | struct ip_tunnel *t, *cand = NULL; | 206 | struct ip_tunnel *t, *cand = NULL; |
179 | struct ipgre_net *ign = net_generic(net, ipgre_net_id); | 207 | struct ipgre_net *ign = net_generic(net, ipgre_net_id); |
180 | int dev_type = (gre_proto == htons(ETH_P_TEB)) ? | 208 | int dev_type = (gre_proto == htons(ETH_P_TEB)) ? |
@@ -289,13 +317,13 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, | |||
289 | return NULL; | 317 | return NULL; |
290 | } | 318 | } |
291 | 319 | ||
292 | static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign, | 320 | static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign, |
293 | struct ip_tunnel_parm *parms) | 321 | struct ip_tunnel_parm *parms) |
294 | { | 322 | { |
295 | __be32 remote = parms->iph.daddr; | 323 | __be32 remote = parms->iph.daddr; |
296 | __be32 local = parms->iph.saddr; | 324 | __be32 local = parms->iph.saddr; |
297 | __be32 key = parms->i_key; | 325 | __be32 key = parms->i_key; |
298 | unsigned h = HASH(key); | 326 | unsigned int h = HASH(key); |
299 | int prio = 0; | 327 | int prio = 0; |
300 | 328 | ||
301 | if (local) | 329 | if (local) |
@@ -308,7 +336,7 @@ static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign, | |||
308 | return &ign->tunnels[prio][h]; | 336 | return &ign->tunnels[prio][h]; |
309 | } | 337 | } |
310 | 338 | ||
311 | static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign, | 339 | static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign, |
312 | struct ip_tunnel *t) | 340 | struct ip_tunnel *t) |
313 | { | 341 | { |
314 | return __ipgre_bucket(ign, &t->parms); | 342 | return __ipgre_bucket(ign, &t->parms); |
@@ -316,23 +344,22 @@ static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign, | |||
316 | 344 | ||
317 | static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t) | 345 | static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t) |
318 | { | 346 | { |
319 | struct ip_tunnel **tp = ipgre_bucket(ign, t); | 347 | struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t); |
320 | 348 | ||
321 | spin_lock_bh(&ipgre_lock); | 349 | rcu_assign_pointer(t->next, rtnl_dereference(*tp)); |
322 | t->next = *tp; | ||
323 | rcu_assign_pointer(*tp, t); | 350 | rcu_assign_pointer(*tp, t); |
324 | spin_unlock_bh(&ipgre_lock); | ||
325 | } | 351 | } |
326 | 352 | ||
327 | static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) | 353 | static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) |
328 | { | 354 | { |
329 | struct ip_tunnel **tp; | 355 | struct ip_tunnel __rcu **tp; |
330 | 356 | struct ip_tunnel *iter; | |
331 | for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) { | 357 | |
332 | if (t == *tp) { | 358 | for (tp = ipgre_bucket(ign, t); |
333 | spin_lock_bh(&ipgre_lock); | 359 | (iter = rtnl_dereference(*tp)) != NULL; |
334 | *tp = t->next; | 360 | tp = &iter->next) { |
335 | spin_unlock_bh(&ipgre_lock); | 361 | if (t == iter) { |
362 | rcu_assign_pointer(*tp, t->next); | ||
336 | break; | 363 | break; |
337 | } | 364 | } |
338 | } | 365 | } |
@@ -346,10 +373,13 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net, | |||
346 | __be32 local = parms->iph.saddr; | 373 | __be32 local = parms->iph.saddr; |
347 | __be32 key = parms->i_key; | 374 | __be32 key = parms->i_key; |
348 | int link = parms->link; | 375 | int link = parms->link; |
349 | struct ip_tunnel *t, **tp; | 376 | struct ip_tunnel *t; |
377 | struct ip_tunnel __rcu **tp; | ||
350 | struct ipgre_net *ign = net_generic(net, ipgre_net_id); | 378 | struct ipgre_net *ign = net_generic(net, ipgre_net_id); |
351 | 379 | ||
352 | for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) | 380 | for (tp = __ipgre_bucket(ign, parms); |
381 | (t = rtnl_dereference(*tp)) != NULL; | ||
382 | tp = &t->next) | ||
353 | if (local == t->parms.iph.saddr && | 383 | if (local == t->parms.iph.saddr && |
354 | remote == t->parms.iph.daddr && | 384 | remote == t->parms.iph.daddr && |
355 | key == t->parms.i_key && | 385 | key == t->parms.i_key && |
@@ -360,7 +390,7 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net, | |||
360 | return t; | 390 | return t; |
361 | } | 391 | } |
362 | 392 | ||
363 | static struct ip_tunnel * ipgre_tunnel_locate(struct net *net, | 393 | static struct ip_tunnel *ipgre_tunnel_locate(struct net *net, |
364 | struct ip_tunnel_parm *parms, int create) | 394 | struct ip_tunnel_parm *parms, int create) |
365 | { | 395 | { |
366 | struct ip_tunnel *t, *nt; | 396 | struct ip_tunnel *t, *nt; |
@@ -375,19 +405,14 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct net *net, | |||
375 | if (parms->name[0]) | 405 | if (parms->name[0]) |
376 | strlcpy(name, parms->name, IFNAMSIZ); | 406 | strlcpy(name, parms->name, IFNAMSIZ); |
377 | else | 407 | else |
378 | sprintf(name, "gre%%d"); | 408 | strcpy(name, "gre%d"); |
379 | 409 | ||
380 | dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup); | 410 | dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup); |
381 | if (!dev) | 411 | if (!dev) |
382 | return NULL; | 412 | return NULL; |
383 | 413 | ||
384 | dev_net_set(dev, net); | 414 | dev_net_set(dev, net); |
385 | 415 | ||
386 | if (strchr(name, '%')) { | ||
387 | if (dev_alloc_name(dev, name) < 0) | ||
388 | goto failed_free; | ||
389 | } | ||
390 | |||
391 | nt = netdev_priv(dev); | 416 | nt = netdev_priv(dev); |
392 | nt->parms = *parms; | 417 | nt->parms = *parms; |
393 | dev->rtnl_link_ops = &ipgre_link_ops; | 418 | dev->rtnl_link_ops = &ipgre_link_ops; |
@@ -432,7 +457,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info) | |||
432 | by themself??? | 457 | by themself??? |
433 | */ | 458 | */ |
434 | 459 | ||
435 | struct iphdr *iph = (struct iphdr *)skb->data; | 460 | const struct iphdr *iph = (const struct iphdr *)skb->data; |
436 | __be16 *p = (__be16*)(skb->data+(iph->ihl<<2)); | 461 | __be16 *p = (__be16*)(skb->data+(iph->ihl<<2)); |
437 | int grehlen = (iph->ihl<<2) + 4; | 462 | int grehlen = (iph->ihl<<2) + 4; |
438 | const int type = icmp_hdr(skb)->type; | 463 | const int type = icmp_hdr(skb)->type; |
@@ -504,7 +529,7 @@ out: | |||
504 | rcu_read_unlock(); | 529 | rcu_read_unlock(); |
505 | } | 530 | } |
506 | 531 | ||
507 | static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) | 532 | static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb) |
508 | { | 533 | { |
509 | if (INET_ECN_is_ce(iph->tos)) { | 534 | if (INET_ECN_is_ce(iph->tos)) { |
510 | if (skb->protocol == htons(ETH_P_IP)) { | 535 | if (skb->protocol == htons(ETH_P_IP)) { |
@@ -516,19 +541,19 @@ static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) | |||
516 | } | 541 | } |
517 | 542 | ||
518 | static inline u8 | 543 | static inline u8 |
519 | ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb) | 544 | ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb) |
520 | { | 545 | { |
521 | u8 inner = 0; | 546 | u8 inner = 0; |
522 | if (skb->protocol == htons(ETH_P_IP)) | 547 | if (skb->protocol == htons(ETH_P_IP)) |
523 | inner = old_iph->tos; | 548 | inner = old_iph->tos; |
524 | else if (skb->protocol == htons(ETH_P_IPV6)) | 549 | else if (skb->protocol == htons(ETH_P_IPV6)) |
525 | inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph); | 550 | inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph); |
526 | return INET_ECN_encapsulate(tos, inner); | 551 | return INET_ECN_encapsulate(tos, inner); |
527 | } | 552 | } |
528 | 553 | ||
529 | static int ipgre_rcv(struct sk_buff *skb) | 554 | static int ipgre_rcv(struct sk_buff *skb) |
530 | { | 555 | { |
531 | struct iphdr *iph; | 556 | const struct iphdr *iph; |
532 | u8 *h; | 557 | u8 *h; |
533 | __be16 flags; | 558 | __be16 flags; |
534 | __sum16 csum = 0; | 559 | __sum16 csum = 0; |
@@ -582,7 +607,7 @@ static int ipgre_rcv(struct sk_buff *skb) | |||
582 | if ((tunnel = ipgre_tunnel_lookup(skb->dev, | 607 | if ((tunnel = ipgre_tunnel_lookup(skb->dev, |
583 | iph->saddr, iph->daddr, key, | 608 | iph->saddr, iph->daddr, key, |
584 | gre_proto))) { | 609 | gre_proto))) { |
585 | struct net_device_stats *stats = &tunnel->dev->stats; | 610 | struct pcpu_tstats *tstats; |
586 | 611 | ||
587 | secpath_reset(skb); | 612 | secpath_reset(skb); |
588 | 613 | ||
@@ -604,24 +629,24 @@ static int ipgre_rcv(struct sk_buff *skb) | |||
604 | #ifdef CONFIG_NET_IPGRE_BROADCAST | 629 | #ifdef CONFIG_NET_IPGRE_BROADCAST |
605 | if (ipv4_is_multicast(iph->daddr)) { | 630 | if (ipv4_is_multicast(iph->daddr)) { |
606 | /* Looped back packet, drop it! */ | 631 | /* Looped back packet, drop it! */ |
607 | if (skb_rtable(skb)->fl.iif == 0) | 632 | if (rt_is_output_route(skb_rtable(skb))) |
608 | goto drop; | 633 | goto drop; |
609 | stats->multicast++; | 634 | tunnel->dev->stats.multicast++; |
610 | skb->pkt_type = PACKET_BROADCAST; | 635 | skb->pkt_type = PACKET_BROADCAST; |
611 | } | 636 | } |
612 | #endif | 637 | #endif |
613 | 638 | ||
614 | if (((flags&GRE_CSUM) && csum) || | 639 | if (((flags&GRE_CSUM) && csum) || |
615 | (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { | 640 | (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { |
616 | stats->rx_crc_errors++; | 641 | tunnel->dev->stats.rx_crc_errors++; |
617 | stats->rx_errors++; | 642 | tunnel->dev->stats.rx_errors++; |
618 | goto drop; | 643 | goto drop; |
619 | } | 644 | } |
620 | if (tunnel->parms.i_flags&GRE_SEQ) { | 645 | if (tunnel->parms.i_flags&GRE_SEQ) { |
621 | if (!(flags&GRE_SEQ) || | 646 | if (!(flags&GRE_SEQ) || |
622 | (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { | 647 | (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { |
623 | stats->rx_fifo_errors++; | 648 | tunnel->dev->stats.rx_fifo_errors++; |
624 | stats->rx_errors++; | 649 | tunnel->dev->stats.rx_errors++; |
625 | goto drop; | 650 | goto drop; |
626 | } | 651 | } |
627 | tunnel->i_seqno = seqno + 1; | 652 | tunnel->i_seqno = seqno + 1; |
@@ -630,8 +655,8 @@ static int ipgre_rcv(struct sk_buff *skb) | |||
630 | /* Warning: All skb pointers will be invalidated! */ | 655 | /* Warning: All skb pointers will be invalidated! */ |
631 | if (tunnel->dev->type == ARPHRD_ETHER) { | 656 | if (tunnel->dev->type == ARPHRD_ETHER) { |
632 | if (!pskb_may_pull(skb, ETH_HLEN)) { | 657 | if (!pskb_may_pull(skb, ETH_HLEN)) { |
633 | stats->rx_length_errors++; | 658 | tunnel->dev->stats.rx_length_errors++; |
634 | stats->rx_errors++; | 659 | tunnel->dev->stats.rx_errors++; |
635 | goto drop; | 660 | goto drop; |
636 | } | 661 | } |
637 | 662 | ||
@@ -640,14 +665,19 @@ static int ipgre_rcv(struct sk_buff *skb) | |||
640 | skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); | 665 | skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); |
641 | } | 666 | } |
642 | 667 | ||
643 | skb_tunnel_rx(skb, tunnel->dev); | 668 | tstats = this_cpu_ptr(tunnel->dev->tstats); |
669 | tstats->rx_packets++; | ||
670 | tstats->rx_bytes += skb->len; | ||
671 | |||
672 | __skb_tunnel_rx(skb, tunnel->dev); | ||
644 | 673 | ||
645 | skb_reset_network_header(skb); | 674 | skb_reset_network_header(skb); |
646 | ipgre_ecn_decapsulate(iph, skb); | 675 | ipgre_ecn_decapsulate(iph, skb); |
647 | 676 | ||
648 | netif_rx(skb); | 677 | netif_rx(skb); |
678 | |||
649 | rcu_read_unlock(); | 679 | rcu_read_unlock(); |
650 | return(0); | 680 | return 0; |
651 | } | 681 | } |
652 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); | 682 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); |
653 | 683 | ||
@@ -655,20 +685,20 @@ drop: | |||
655 | rcu_read_unlock(); | 685 | rcu_read_unlock(); |
656 | drop_nolock: | 686 | drop_nolock: |
657 | kfree_skb(skb); | 687 | kfree_skb(skb); |
658 | return(0); | 688 | return 0; |
659 | } | 689 | } |
660 | 690 | ||
661 | static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | 691 | static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) |
662 | { | 692 | { |
663 | struct ip_tunnel *tunnel = netdev_priv(dev); | 693 | struct ip_tunnel *tunnel = netdev_priv(dev); |
664 | struct net_device_stats *stats = &dev->stats; | 694 | struct pcpu_tstats *tstats; |
665 | struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); | 695 | const struct iphdr *old_iph = ip_hdr(skb); |
666 | struct iphdr *old_iph = ip_hdr(skb); | 696 | const struct iphdr *tiph; |
667 | struct iphdr *tiph; | 697 | struct flowi4 fl4; |
668 | u8 tos; | 698 | u8 tos; |
669 | __be16 df; | 699 | __be16 df; |
670 | struct rtable *rt; /* Route to the other host */ | 700 | struct rtable *rt; /* Route to the other host */ |
671 | struct net_device *tdev; /* Device to other host */ | 701 | struct net_device *tdev; /* Device to other host */ |
672 | struct iphdr *iph; /* Our new IP header */ | 702 | struct iphdr *iph; /* Our new IP header */ |
673 | unsigned int max_headroom; /* The extra header space needed */ | 703 | unsigned int max_headroom; /* The extra header space needed */ |
674 | int gre_hlen; | 704 | int gre_hlen; |
@@ -680,7 +710,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev | |||
680 | 710 | ||
681 | if (dev->header_ops && dev->type == ARPHRD_IPGRE) { | 711 | if (dev->header_ops && dev->type == ARPHRD_IPGRE) { |
682 | gre_hlen = 0; | 712 | gre_hlen = 0; |
683 | tiph = (struct iphdr *)skb->data; | 713 | tiph = (const struct iphdr *)skb->data; |
684 | } else { | 714 | } else { |
685 | gre_hlen = tunnel->hlen; | 715 | gre_hlen = tunnel->hlen; |
686 | tiph = &tunnel->parms.iph; | 716 | tiph = &tunnel->parms.iph; |
@@ -690,7 +720,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev | |||
690 | /* NBMA tunnel */ | 720 | /* NBMA tunnel */ |
691 | 721 | ||
692 | if (skb_dst(skb) == NULL) { | 722 | if (skb_dst(skb) == NULL) { |
693 | stats->tx_fifo_errors++; | 723 | dev->stats.tx_fifo_errors++; |
694 | goto tx_error; | 724 | goto tx_error; |
695 | } | 725 | } |
696 | 726 | ||
@@ -701,14 +731,14 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev | |||
701 | } | 731 | } |
702 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 732 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
703 | else if (skb->protocol == htons(ETH_P_IPV6)) { | 733 | else if (skb->protocol == htons(ETH_P_IPV6)) { |
704 | struct in6_addr *addr6; | 734 | const struct in6_addr *addr6; |
705 | int addr_type; | 735 | int addr_type; |
706 | struct neighbour *neigh = skb_dst(skb)->neighbour; | 736 | struct neighbour *neigh = skb_dst(skb)->neighbour; |
707 | 737 | ||
708 | if (neigh == NULL) | 738 | if (neigh == NULL) |
709 | goto tx_error; | 739 | goto tx_error; |
710 | 740 | ||
711 | addr6 = (struct in6_addr *)&neigh->primary_key; | 741 | addr6 = (const struct in6_addr *)&neigh->primary_key; |
712 | addr_type = ipv6_addr_type(addr6); | 742 | addr_type = ipv6_addr_type(addr6); |
713 | 743 | ||
714 | if (addr_type == IPV6_ADDR_ANY) { | 744 | if (addr_type == IPV6_ADDR_ANY) { |
@@ -732,26 +762,21 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev | |||
732 | if (skb->protocol == htons(ETH_P_IP)) | 762 | if (skb->protocol == htons(ETH_P_IP)) |
733 | tos = old_iph->tos; | 763 | tos = old_iph->tos; |
734 | else if (skb->protocol == htons(ETH_P_IPV6)) | 764 | else if (skb->protocol == htons(ETH_P_IPV6)) |
735 | tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph); | 765 | tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph); |
736 | } | 766 | } |
737 | 767 | ||
738 | { | 768 | rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr, |
739 | struct flowi fl = { .oif = tunnel->parms.link, | 769 | tunnel->parms.o_key, RT_TOS(tos), |
740 | .nl_u = { .ip4_u = | 770 | tunnel->parms.link); |
741 | { .daddr = dst, | 771 | if (IS_ERR(rt)) { |
742 | .saddr = tiph->saddr, | 772 | dev->stats.tx_carrier_errors++; |
743 | .tos = RT_TOS(tos) } }, | 773 | goto tx_error; |
744 | .proto = IPPROTO_GRE }; | ||
745 | if (ip_route_output_key(dev_net(dev), &rt, &fl)) { | ||
746 | stats->tx_carrier_errors++; | ||
747 | goto tx_error; | ||
748 | } | ||
749 | } | 774 | } |
750 | tdev = rt->dst.dev; | 775 | tdev = rt->dst.dev; |
751 | 776 | ||
752 | if (tdev == dev) { | 777 | if (tdev == dev) { |
753 | ip_rt_put(rt); | 778 | ip_rt_put(rt); |
754 | stats->collisions++; | 779 | dev->stats.collisions++; |
755 | goto tx_error; | 780 | goto tx_error; |
756 | } | 781 | } |
757 | 782 | ||
@@ -783,7 +808,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev | |||
783 | !ipv4_is_multicast(tunnel->parms.iph.daddr)) || | 808 | !ipv4_is_multicast(tunnel->parms.iph.daddr)) || |
784 | rt6->rt6i_dst.plen == 128) { | 809 | rt6->rt6i_dst.plen == 128) { |
785 | rt6->rt6i_flags |= RTF_MODIFIED; | 810 | rt6->rt6i_flags |= RTF_MODIFIED; |
786 | skb_dst(skb)->metrics[RTAX_MTU-1] = mtu; | 811 | dst_metric_set(skb_dst(skb), RTAX_MTU, mtu); |
787 | } | 812 | } |
788 | } | 813 | } |
789 | 814 | ||
@@ -814,7 +839,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev | |||
814 | dev->needed_headroom = max_headroom; | 839 | dev->needed_headroom = max_headroom; |
815 | if (!new_skb) { | 840 | if (!new_skb) { |
816 | ip_rt_put(rt); | 841 | ip_rt_put(rt); |
817 | txq->tx_dropped++; | 842 | dev->stats.tx_dropped++; |
818 | dev_kfree_skb(skb); | 843 | dev_kfree_skb(skb); |
819 | return NETDEV_TX_OK; | 844 | return NETDEV_TX_OK; |
820 | } | 845 | } |
@@ -844,18 +869,18 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev | |||
844 | iph->frag_off = df; | 869 | iph->frag_off = df; |
845 | iph->protocol = IPPROTO_GRE; | 870 | iph->protocol = IPPROTO_GRE; |
846 | iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb); | 871 | iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb); |
847 | iph->daddr = rt->rt_dst; | 872 | iph->daddr = fl4.daddr; |
848 | iph->saddr = rt->rt_src; | 873 | iph->saddr = fl4.saddr; |
849 | 874 | ||
850 | if ((iph->ttl = tiph->ttl) == 0) { | 875 | if ((iph->ttl = tiph->ttl) == 0) { |
851 | if (skb->protocol == htons(ETH_P_IP)) | 876 | if (skb->protocol == htons(ETH_P_IP)) |
852 | iph->ttl = old_iph->ttl; | 877 | iph->ttl = old_iph->ttl; |
853 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 878 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
854 | else if (skb->protocol == htons(ETH_P_IPV6)) | 879 | else if (skb->protocol == htons(ETH_P_IPV6)) |
855 | iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit; | 880 | iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit; |
856 | #endif | 881 | #endif |
857 | else | 882 | else |
858 | iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT); | 883 | iph->ttl = ip4_dst_hoplimit(&rt->dst); |
859 | } | 884 | } |
860 | 885 | ||
861 | ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags; | 886 | ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags; |
@@ -881,15 +906,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev | |||
881 | } | 906 | } |
882 | 907 | ||
883 | nf_reset(skb); | 908 | nf_reset(skb); |
884 | 909 | tstats = this_cpu_ptr(dev->tstats); | |
885 | IPTUNNEL_XMIT(); | 910 | __IPTUNNEL_XMIT(tstats, &dev->stats); |
886 | return NETDEV_TX_OK; | 911 | return NETDEV_TX_OK; |
887 | 912 | ||
888 | tx_error_icmp: | 913 | tx_error_icmp: |
889 | dst_link_failure(skb); | 914 | dst_link_failure(skb); |
890 | 915 | ||
891 | tx_error: | 916 | tx_error: |
892 | stats->tx_errors++; | 917 | dev->stats.tx_errors++; |
893 | dev_kfree_skb(skb); | 918 | dev_kfree_skb(skb); |
894 | return NETDEV_TX_OK; | 919 | return NETDEV_TX_OK; |
895 | } | 920 | } |
@@ -898,7 +923,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev) | |||
898 | { | 923 | { |
899 | struct net_device *tdev = NULL; | 924 | struct net_device *tdev = NULL; |
900 | struct ip_tunnel *tunnel; | 925 | struct ip_tunnel *tunnel; |
901 | struct iphdr *iph; | 926 | const struct iphdr *iph; |
902 | int hlen = LL_MAX_HEADER; | 927 | int hlen = LL_MAX_HEADER; |
903 | int mtu = ETH_DATA_LEN; | 928 | int mtu = ETH_DATA_LEN; |
904 | int addend = sizeof(struct iphdr) + 4; | 929 | int addend = sizeof(struct iphdr) + 4; |
@@ -909,14 +934,15 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev) | |||
909 | /* Guess output device to choose reasonable mtu and needed_headroom */ | 934 | /* Guess output device to choose reasonable mtu and needed_headroom */ |
910 | 935 | ||
911 | if (iph->daddr) { | 936 | if (iph->daddr) { |
912 | struct flowi fl = { .oif = tunnel->parms.link, | 937 | struct flowi4 fl4; |
913 | .nl_u = { .ip4_u = | ||
914 | { .daddr = iph->daddr, | ||
915 | .saddr = iph->saddr, | ||
916 | .tos = RT_TOS(iph->tos) } }, | ||
917 | .proto = IPPROTO_GRE }; | ||
918 | struct rtable *rt; | 938 | struct rtable *rt; |
919 | if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { | 939 | |
940 | rt = ip_route_output_gre(dev_net(dev), &fl4, | ||
941 | iph->daddr, iph->saddr, | ||
942 | tunnel->parms.o_key, | ||
943 | RT_TOS(iph->tos), | ||
944 | tunnel->parms.link); | ||
945 | if (!IS_ERR(rt)) { | ||
920 | tdev = rt->dst.dev; | 946 | tdev = rt->dst.dev; |
921 | ip_rt_put(rt); | 947 | ip_rt_put(rt); |
922 | } | 948 | } |
@@ -1012,7 +1038,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) | |||
1012 | break; | 1038 | break; |
1013 | } | 1039 | } |
1014 | } else { | 1040 | } else { |
1015 | unsigned nflags = 0; | 1041 | unsigned int nflags = 0; |
1016 | 1042 | ||
1017 | t = netdev_priv(dev); | 1043 | t = netdev_priv(dev); |
1018 | 1044 | ||
@@ -1026,6 +1052,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) | |||
1026 | break; | 1052 | break; |
1027 | } | 1053 | } |
1028 | ipgre_tunnel_unlink(ign, t); | 1054 | ipgre_tunnel_unlink(ign, t); |
1055 | synchronize_net(); | ||
1029 | t->parms.iph.saddr = p.iph.saddr; | 1056 | t->parms.iph.saddr = p.iph.saddr; |
1030 | t->parms.iph.daddr = p.iph.daddr; | 1057 | t->parms.iph.daddr = p.iph.daddr; |
1031 | t->parms.i_key = p.i_key; | 1058 | t->parms.i_key = p.i_key; |
@@ -1125,7 +1152,7 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) | |||
1125 | 1152 | ||
1126 | static int ipgre_header(struct sk_buff *skb, struct net_device *dev, | 1153 | static int ipgre_header(struct sk_buff *skb, struct net_device *dev, |
1127 | unsigned short type, | 1154 | unsigned short type, |
1128 | const void *daddr, const void *saddr, unsigned len) | 1155 | const void *daddr, const void *saddr, unsigned int len) |
1129 | { | 1156 | { |
1130 | struct ip_tunnel *t = netdev_priv(dev); | 1157 | struct ip_tunnel *t = netdev_priv(dev); |
1131 | struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); | 1158 | struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); |
@@ -1151,7 +1178,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, | |||
1151 | 1178 | ||
1152 | static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) | 1179 | static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) |
1153 | { | 1180 | { |
1154 | struct iphdr *iph = (struct iphdr *) skb_mac_header(skb); | 1181 | const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb); |
1155 | memcpy(haddr, &iph->saddr, 4); | 1182 | memcpy(haddr, &iph->saddr, 4); |
1156 | return 4; | 1183 | return 4; |
1157 | } | 1184 | } |
@@ -1167,14 +1194,16 @@ static int ipgre_open(struct net_device *dev) | |||
1167 | struct ip_tunnel *t = netdev_priv(dev); | 1194 | struct ip_tunnel *t = netdev_priv(dev); |
1168 | 1195 | ||
1169 | if (ipv4_is_multicast(t->parms.iph.daddr)) { | 1196 | if (ipv4_is_multicast(t->parms.iph.daddr)) { |
1170 | struct flowi fl = { .oif = t->parms.link, | 1197 | struct flowi4 fl4; |
1171 | .nl_u = { .ip4_u = | ||
1172 | { .daddr = t->parms.iph.daddr, | ||
1173 | .saddr = t->parms.iph.saddr, | ||
1174 | .tos = RT_TOS(t->parms.iph.tos) } }, | ||
1175 | .proto = IPPROTO_GRE }; | ||
1176 | struct rtable *rt; | 1198 | struct rtable *rt; |
1177 | if (ip_route_output_key(dev_net(dev), &rt, &fl)) | 1199 | |
1200 | rt = ip_route_output_gre(dev_net(dev), &fl4, | ||
1201 | t->parms.iph.daddr, | ||
1202 | t->parms.iph.saddr, | ||
1203 | t->parms.o_key, | ||
1204 | RT_TOS(t->parms.iph.tos), | ||
1205 | t->parms.link); | ||
1206 | if (IS_ERR(rt)) | ||
1178 | return -EADDRNOTAVAIL; | 1207 | return -EADDRNOTAVAIL; |
1179 | dev = rt->dst.dev; | 1208 | dev = rt->dst.dev; |
1180 | ip_rt_put(rt); | 1209 | ip_rt_put(rt); |
@@ -1193,10 +1222,8 @@ static int ipgre_close(struct net_device *dev) | |||
1193 | if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { | 1222 | if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { |
1194 | struct in_device *in_dev; | 1223 | struct in_device *in_dev; |
1195 | in_dev = inetdev_by_index(dev_net(dev), t->mlink); | 1224 | in_dev = inetdev_by_index(dev_net(dev), t->mlink); |
1196 | if (in_dev) { | 1225 | if (in_dev) |
1197 | ip_mc_dec_group(in_dev, t->parms.iph.daddr); | 1226 | ip_mc_dec_group(in_dev, t->parms.iph.daddr); |
1198 | in_dev_put(in_dev); | ||
1199 | } | ||
1200 | } | 1227 | } |
1201 | return 0; | 1228 | return 0; |
1202 | } | 1229 | } |
@@ -1213,12 +1240,19 @@ static const struct net_device_ops ipgre_netdev_ops = { | |||
1213 | .ndo_start_xmit = ipgre_tunnel_xmit, | 1240 | .ndo_start_xmit = ipgre_tunnel_xmit, |
1214 | .ndo_do_ioctl = ipgre_tunnel_ioctl, | 1241 | .ndo_do_ioctl = ipgre_tunnel_ioctl, |
1215 | .ndo_change_mtu = ipgre_tunnel_change_mtu, | 1242 | .ndo_change_mtu = ipgre_tunnel_change_mtu, |
1243 | .ndo_get_stats = ipgre_get_stats, | ||
1216 | }; | 1244 | }; |
1217 | 1245 | ||
1246 | static void ipgre_dev_free(struct net_device *dev) | ||
1247 | { | ||
1248 | free_percpu(dev->tstats); | ||
1249 | free_netdev(dev); | ||
1250 | } | ||
1251 | |||
1218 | static void ipgre_tunnel_setup(struct net_device *dev) | 1252 | static void ipgre_tunnel_setup(struct net_device *dev) |
1219 | { | 1253 | { |
1220 | dev->netdev_ops = &ipgre_netdev_ops; | 1254 | dev->netdev_ops = &ipgre_netdev_ops; |
1221 | dev->destructor = free_netdev; | 1255 | dev->destructor = ipgre_dev_free; |
1222 | 1256 | ||
1223 | dev->type = ARPHRD_IPGRE; | 1257 | dev->type = ARPHRD_IPGRE; |
1224 | dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; | 1258 | dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; |
@@ -1256,6 +1290,10 @@ static int ipgre_tunnel_init(struct net_device *dev) | |||
1256 | } else | 1290 | } else |
1257 | dev->header_ops = &ipgre_header_ops; | 1291 | dev->header_ops = &ipgre_header_ops; |
1258 | 1292 | ||
1293 | dev->tstats = alloc_percpu(struct pcpu_tstats); | ||
1294 | if (!dev->tstats) | ||
1295 | return -ENOMEM; | ||
1296 | |||
1259 | return 0; | 1297 | return 0; |
1260 | } | 1298 | } |
1261 | 1299 | ||
@@ -1263,7 +1301,6 @@ static void ipgre_fb_tunnel_init(struct net_device *dev) | |||
1263 | { | 1301 | { |
1264 | struct ip_tunnel *tunnel = netdev_priv(dev); | 1302 | struct ip_tunnel *tunnel = netdev_priv(dev); |
1265 | struct iphdr *iph = &tunnel->parms.iph; | 1303 | struct iphdr *iph = &tunnel->parms.iph; |
1266 | struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id); | ||
1267 | 1304 | ||
1268 | tunnel->dev = dev; | 1305 | tunnel->dev = dev; |
1269 | strcpy(tunnel->parms.name, dev->name); | 1306 | strcpy(tunnel->parms.name, dev->name); |
@@ -1274,14 +1311,12 @@ static void ipgre_fb_tunnel_init(struct net_device *dev) | |||
1274 | tunnel->hlen = sizeof(struct iphdr) + 4; | 1311 | tunnel->hlen = sizeof(struct iphdr) + 4; |
1275 | 1312 | ||
1276 | dev_hold(dev); | 1313 | dev_hold(dev); |
1277 | ign->tunnels_wc[0] = tunnel; | ||
1278 | } | 1314 | } |
1279 | 1315 | ||
1280 | 1316 | ||
1281 | static const struct net_protocol ipgre_protocol = { | 1317 | static const struct gre_protocol ipgre_protocol = { |
1282 | .handler = ipgre_rcv, | 1318 | .handler = ipgre_rcv, |
1283 | .err_handler = ipgre_err, | 1319 | .err_handler = ipgre_err, |
1284 | .netns_ok = 1, | ||
1285 | }; | 1320 | }; |
1286 | 1321 | ||
1287 | static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) | 1322 | static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) |
@@ -1291,11 +1326,13 @@ static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) | |||
1291 | for (prio = 0; prio < 4; prio++) { | 1326 | for (prio = 0; prio < 4; prio++) { |
1292 | int h; | 1327 | int h; |
1293 | for (h = 0; h < HASH_SIZE; h++) { | 1328 | for (h = 0; h < HASH_SIZE; h++) { |
1294 | struct ip_tunnel *t = ign->tunnels[prio][h]; | 1329 | struct ip_tunnel *t; |
1330 | |||
1331 | t = rtnl_dereference(ign->tunnels[prio][h]); | ||
1295 | 1332 | ||
1296 | while (t != NULL) { | 1333 | while (t != NULL) { |
1297 | unregister_netdevice_queue(t->dev, head); | 1334 | unregister_netdevice_queue(t->dev, head); |
1298 | t = t->next; | 1335 | t = rtnl_dereference(t->next); |
1299 | } | 1336 | } |
1300 | } | 1337 | } |
1301 | } | 1338 | } |
@@ -1320,10 +1357,12 @@ static int __net_init ipgre_init_net(struct net *net) | |||
1320 | if ((err = register_netdev(ign->fb_tunnel_dev))) | 1357 | if ((err = register_netdev(ign->fb_tunnel_dev))) |
1321 | goto err_reg_dev; | 1358 | goto err_reg_dev; |
1322 | 1359 | ||
1360 | rcu_assign_pointer(ign->tunnels_wc[0], | ||
1361 | netdev_priv(ign->fb_tunnel_dev)); | ||
1323 | return 0; | 1362 | return 0; |
1324 | 1363 | ||
1325 | err_reg_dev: | 1364 | err_reg_dev: |
1326 | free_netdev(ign->fb_tunnel_dev); | 1365 | ipgre_dev_free(ign->fb_tunnel_dev); |
1327 | err_alloc_dev: | 1366 | err_alloc_dev: |
1328 | return err; | 1367 | return err; |
1329 | } | 1368 | } |
@@ -1441,6 +1480,10 @@ static int ipgre_tap_init(struct net_device *dev) | |||
1441 | 1480 | ||
1442 | ipgre_tunnel_bind_dev(dev); | 1481 | ipgre_tunnel_bind_dev(dev); |
1443 | 1482 | ||
1483 | dev->tstats = alloc_percpu(struct pcpu_tstats); | ||
1484 | if (!dev->tstats) | ||
1485 | return -ENOMEM; | ||
1486 | |||
1444 | return 0; | 1487 | return 0; |
1445 | } | 1488 | } |
1446 | 1489 | ||
@@ -1451,6 +1494,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = { | |||
1451 | .ndo_set_mac_address = eth_mac_addr, | 1494 | .ndo_set_mac_address = eth_mac_addr, |
1452 | .ndo_validate_addr = eth_validate_addr, | 1495 | .ndo_validate_addr = eth_validate_addr, |
1453 | .ndo_change_mtu = ipgre_tunnel_change_mtu, | 1496 | .ndo_change_mtu = ipgre_tunnel_change_mtu, |
1497 | .ndo_get_stats = ipgre_get_stats, | ||
1454 | }; | 1498 | }; |
1455 | 1499 | ||
1456 | static void ipgre_tap_setup(struct net_device *dev) | 1500 | static void ipgre_tap_setup(struct net_device *dev) |
@@ -1459,7 +1503,7 @@ static void ipgre_tap_setup(struct net_device *dev) | |||
1459 | ether_setup(dev); | 1503 | ether_setup(dev); |
1460 | 1504 | ||
1461 | dev->netdev_ops = &ipgre_tap_netdev_ops; | 1505 | dev->netdev_ops = &ipgre_tap_netdev_ops; |
1462 | dev->destructor = free_netdev; | 1506 | dev->destructor = ipgre_dev_free; |
1463 | 1507 | ||
1464 | dev->iflink = 0; | 1508 | dev->iflink = 0; |
1465 | dev->features |= NETIF_F_NETNS_LOCAL; | 1509 | dev->features |= NETIF_F_NETNS_LOCAL; |
@@ -1487,6 +1531,10 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nla | |||
1487 | if (!tb[IFLA_MTU]) | 1531 | if (!tb[IFLA_MTU]) |
1488 | dev->mtu = mtu; | 1532 | dev->mtu = mtu; |
1489 | 1533 | ||
1534 | /* Can use a lockless transmit, unless we generate output sequences */ | ||
1535 | if (!(nt->parms.o_flags & GRE_SEQ)) | ||
1536 | dev->features |= NETIF_F_LLTX; | ||
1537 | |||
1490 | err = register_netdevice(dev); | 1538 | err = register_netdevice(dev); |
1491 | if (err) | 1539 | if (err) |
1492 | goto out; | 1540 | goto out; |
@@ -1522,7 +1570,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], | |||
1522 | t = nt; | 1570 | t = nt; |
1523 | 1571 | ||
1524 | if (dev->type != ARPHRD_ETHER) { | 1572 | if (dev->type != ARPHRD_ETHER) { |
1525 | unsigned nflags = 0; | 1573 | unsigned int nflags = 0; |
1526 | 1574 | ||
1527 | if (ipv4_is_multicast(p.iph.daddr)) | 1575 | if (ipv4_is_multicast(p.iph.daddr)) |
1528 | nflags = IFF_BROADCAST; | 1576 | nflags = IFF_BROADCAST; |
@@ -1663,7 +1711,7 @@ static int __init ipgre_init(void) | |||
1663 | if (err < 0) | 1711 | if (err < 0) |
1664 | return err; | 1712 | return err; |
1665 | 1713 | ||
1666 | err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE); | 1714 | err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); |
1667 | if (err < 0) { | 1715 | if (err < 0) { |
1668 | printk(KERN_INFO "ipgre init: can't add protocol\n"); | 1716 | printk(KERN_INFO "ipgre init: can't add protocol\n"); |
1669 | goto add_proto_failed; | 1717 | goto add_proto_failed; |
@@ -1683,7 +1731,7 @@ out: | |||
1683 | tap_ops_failed: | 1731 | tap_ops_failed: |
1684 | rtnl_link_unregister(&ipgre_link_ops); | 1732 | rtnl_link_unregister(&ipgre_link_ops); |
1685 | rtnl_link_failed: | 1733 | rtnl_link_failed: |
1686 | inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); | 1734 | gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); |
1687 | add_proto_failed: | 1735 | add_proto_failed: |
1688 | unregister_pernet_device(&ipgre_net_ops); | 1736 | unregister_pernet_device(&ipgre_net_ops); |
1689 | goto out; | 1737 | goto out; |
@@ -1693,7 +1741,7 @@ static void __exit ipgre_fini(void) | |||
1693 | { | 1741 | { |
1694 | rtnl_link_unregister(&ipgre_tap_ops); | 1742 | rtnl_link_unregister(&ipgre_tap_ops); |
1695 | rtnl_link_unregister(&ipgre_link_ops); | 1743 | rtnl_link_unregister(&ipgre_link_ops); |
1696 | if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) | 1744 | if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) |
1697 | printk(KERN_INFO "ipgre close: can't remove protocol\n"); | 1745 | printk(KERN_INFO "ipgre close: can't remove protocol\n"); |
1698 | unregister_pernet_device(&ipgre_net_ops); | 1746 | unregister_pernet_device(&ipgre_net_ops); |
1699 | } | 1747 | } |
@@ -1703,3 +1751,4 @@ module_exit(ipgre_fini); | |||
1703 | MODULE_LICENSE("GPL"); | 1751 | MODULE_LICENSE("GPL"); |
1704 | MODULE_ALIAS_RTNL_LINK("gre"); | 1752 | MODULE_ALIAS_RTNL_LINK("gre"); |
1705 | MODULE_ALIAS_RTNL_LINK("gretap"); | 1753 | MODULE_ALIAS_RTNL_LINK("gretap"); |
1754 | MODULE_ALIAS_NETDEV("gre0"); | ||
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d859bcc26cb7..c8f48efc5fd3 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c | |||
@@ -268,7 +268,7 @@ int ip_local_deliver(struct sk_buff *skb) | |||
268 | static inline int ip_rcv_options(struct sk_buff *skb) | 268 | static inline int ip_rcv_options(struct sk_buff *skb) |
269 | { | 269 | { |
270 | struct ip_options *opt; | 270 | struct ip_options *opt; |
271 | struct iphdr *iph; | 271 | const struct iphdr *iph; |
272 | struct net_device *dev = skb->dev; | 272 | struct net_device *dev = skb->dev; |
273 | 273 | ||
274 | /* It looks as overkill, because not all | 274 | /* It looks as overkill, because not all |
@@ -340,7 +340,7 @@ static int ip_rcv_finish(struct sk_buff *skb) | |||
340 | } | 340 | } |
341 | } | 341 | } |
342 | 342 | ||
343 | #ifdef CONFIG_NET_CLS_ROUTE | 343 | #ifdef CONFIG_IP_ROUTE_CLASSID |
344 | if (unlikely(skb_dst(skb)->tclassid)) { | 344 | if (unlikely(skb_dst(skb)->tclassid)) { |
345 | struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); | 345 | struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); |
346 | u32 idx = skb_dst(skb)->tclassid; | 346 | u32 idx = skb_dst(skb)->tclassid; |
@@ -374,7 +374,7 @@ drop: | |||
374 | */ | 374 | */ |
375 | int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) | 375 | int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) |
376 | { | 376 | { |
377 | struct iphdr *iph; | 377 | const struct iphdr *iph; |
378 | u32 len; | 378 | u32 len; |
379 | 379 | ||
380 | /* When the interface is in promisc. mode, drop all the crap | 380 | /* When the interface is in promisc. mode, drop all the crap |
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ba9836c488ed..ec93335901dd 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
15 | #include <linux/types.h> | 15 | #include <linux/types.h> |
16 | #include <asm/uaccess.h> | 16 | #include <asm/uaccess.h> |
17 | #include <asm/unaligned.h> | ||
17 | #include <linux/skbuff.h> | 18 | #include <linux/skbuff.h> |
18 | #include <linux/ip.h> | 19 | #include <linux/ip.h> |
19 | #include <linux/icmp.h> | 20 | #include <linux/icmp.h> |
@@ -36,8 +37,8 @@ | |||
36 | * saddr is address of outgoing interface. | 37 | * saddr is address of outgoing interface. |
37 | */ | 38 | */ |
38 | 39 | ||
39 | void ip_options_build(struct sk_buff * skb, struct ip_options * opt, | 40 | void ip_options_build(struct sk_buff *skb, struct ip_options *opt, |
40 | __be32 daddr, struct rtable *rt, int is_frag) | 41 | __be32 daddr, struct rtable *rt, int is_frag) |
41 | { | 42 | { |
42 | unsigned char *iph = skb_network_header(skb); | 43 | unsigned char *iph = skb_network_header(skb); |
43 | 44 | ||
@@ -50,9 +51,9 @@ void ip_options_build(struct sk_buff * skb, struct ip_options * opt, | |||
50 | 51 | ||
51 | if (!is_frag) { | 52 | if (!is_frag) { |
52 | if (opt->rr_needaddr) | 53 | if (opt->rr_needaddr) |
53 | ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, rt); | 54 | ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, skb, rt); |
54 | if (opt->ts_needaddr) | 55 | if (opt->ts_needaddr) |
55 | ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, rt); | 56 | ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt); |
56 | if (opt->ts_needtime) { | 57 | if (opt->ts_needtime) { |
57 | struct timespec tv; | 58 | struct timespec tv; |
58 | __be32 midtime; | 59 | __be32 midtime; |
@@ -83,9 +84,9 @@ void ip_options_build(struct sk_buff * skb, struct ip_options * opt, | |||
83 | * NOTE: dopt cannot point to skb. | 84 | * NOTE: dopt cannot point to skb. |
84 | */ | 85 | */ |
85 | 86 | ||
86 | int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) | 87 | int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) |
87 | { | 88 | { |
88 | struct ip_options *sopt; | 89 | const struct ip_options *sopt; |
89 | unsigned char *sptr, *dptr; | 90 | unsigned char *sptr, *dptr; |
90 | int soffset, doffset; | 91 | int soffset, doffset; |
91 | int optlen; | 92 | int optlen; |
@@ -95,10 +96,8 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) | |||
95 | 96 | ||
96 | sopt = &(IPCB(skb)->opt); | 97 | sopt = &(IPCB(skb)->opt); |
97 | 98 | ||
98 | if (sopt->optlen == 0) { | 99 | if (sopt->optlen == 0) |
99 | dopt->optlen = 0; | ||
100 | return 0; | 100 | return 0; |
101 | } | ||
102 | 101 | ||
103 | sptr = skb_network_header(skb); | 102 | sptr = skb_network_header(skb); |
104 | dptr = dopt->__data; | 103 | dptr = dopt->__data; |
@@ -140,11 +139,11 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) | |||
140 | } else { | 139 | } else { |
141 | dopt->ts_needtime = 0; | 140 | dopt->ts_needtime = 0; |
142 | 141 | ||
143 | if (soffset + 8 <= optlen) { | 142 | if (soffset + 7 <= optlen) { |
144 | __be32 addr; | 143 | __be32 addr; |
145 | 144 | ||
146 | memcpy(&addr, sptr+soffset-1, 4); | 145 | memcpy(&addr, dptr+soffset-1, 4); |
147 | if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_LOCAL) { | 146 | if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) { |
148 | dopt->ts_needtime = 1; | 147 | dopt->ts_needtime = 1; |
149 | soffset += 8; | 148 | soffset += 8; |
150 | } | 149 | } |
@@ -157,7 +156,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) | |||
157 | dopt->optlen += optlen; | 156 | dopt->optlen += optlen; |
158 | } | 157 | } |
159 | if (sopt->srr) { | 158 | if (sopt->srr) { |
160 | unsigned char * start = sptr+sopt->srr; | 159 | unsigned char *start = sptr+sopt->srr; |
161 | __be32 faddr; | 160 | __be32 faddr; |
162 | 161 | ||
163 | optlen = start[1]; | 162 | optlen = start[1]; |
@@ -329,7 +328,7 @@ int ip_options_compile(struct net *net, | |||
329 | pp_ptr = optptr + 2; | 328 | pp_ptr = optptr + 2; |
330 | goto error; | 329 | goto error; |
331 | } | 330 | } |
332 | if (skb) { | 331 | if (rt) { |
333 | memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); | 332 | memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); |
334 | opt->is_changed = 1; | 333 | opt->is_changed = 1; |
335 | } | 334 | } |
@@ -352,7 +351,7 @@ int ip_options_compile(struct net *net, | |||
352 | goto error; | 351 | goto error; |
353 | } | 352 | } |
354 | if (optptr[2] <= optlen) { | 353 | if (optptr[2] <= optlen) { |
355 | __be32 *timeptr = NULL; | 354 | unsigned char *timeptr = NULL; |
356 | if (optptr[2]+3 > optptr[1]) { | 355 | if (optptr[2]+3 > optptr[1]) { |
357 | pp_ptr = optptr + 2; | 356 | pp_ptr = optptr + 2; |
358 | goto error; | 357 | goto error; |
@@ -361,7 +360,7 @@ int ip_options_compile(struct net *net, | |||
361 | case IPOPT_TS_TSONLY: | 360 | case IPOPT_TS_TSONLY: |
362 | opt->ts = optptr - iph; | 361 | opt->ts = optptr - iph; |
363 | if (skb) | 362 | if (skb) |
364 | timeptr = (__be32*)&optptr[optptr[2]-1]; | 363 | timeptr = &optptr[optptr[2]-1]; |
365 | opt->ts_needtime = 1; | 364 | opt->ts_needtime = 1; |
366 | optptr[2] += 4; | 365 | optptr[2] += 4; |
367 | break; | 366 | break; |
@@ -371,9 +370,9 @@ int ip_options_compile(struct net *net, | |||
371 | goto error; | 370 | goto error; |
372 | } | 371 | } |
373 | opt->ts = optptr - iph; | 372 | opt->ts = optptr - iph; |
374 | if (skb) { | 373 | if (rt) { |
375 | memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); | 374 | memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); |
376 | timeptr = (__be32*)&optptr[optptr[2]+3]; | 375 | timeptr = &optptr[optptr[2]+3]; |
377 | } | 376 | } |
378 | opt->ts_needaddr = 1; | 377 | opt->ts_needaddr = 1; |
379 | opt->ts_needtime = 1; | 378 | opt->ts_needtime = 1; |
@@ -391,7 +390,7 @@ int ip_options_compile(struct net *net, | |||
391 | if (inet_addr_type(net, addr) == RTN_UNICAST) | 390 | if (inet_addr_type(net, addr) == RTN_UNICAST) |
392 | break; | 391 | break; |
393 | if (skb) | 392 | if (skb) |
394 | timeptr = (__be32*)&optptr[optptr[2]+3]; | 393 | timeptr = &optptr[optptr[2]+3]; |
395 | } | 394 | } |
396 | opt->ts_needtime = 1; | 395 | opt->ts_needtime = 1; |
397 | optptr[2] += 8; | 396 | optptr[2] += 8; |
@@ -405,10 +404,10 @@ int ip_options_compile(struct net *net, | |||
405 | } | 404 | } |
406 | if (timeptr) { | 405 | if (timeptr) { |
407 | struct timespec tv; | 406 | struct timespec tv; |
408 | __be32 midtime; | 407 | u32 midtime; |
409 | getnstimeofday(&tv); | 408 | getnstimeofday(&tv); |
410 | midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC); | 409 | midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC; |
411 | memcpy(timeptr, &midtime, sizeof(__be32)); | 410 | put_unaligned_be32(midtime, timeptr); |
412 | opt->is_changed = 1; | 411 | opt->is_changed = 1; |
413 | } | 412 | } |
414 | } else { | 413 | } else { |
@@ -466,7 +465,7 @@ error: | |||
466 | } | 465 | } |
467 | return -EINVAL; | 466 | return -EINVAL; |
468 | } | 467 | } |
469 | 468 | EXPORT_SYMBOL(ip_options_compile); | |
470 | 469 | ||
471 | /* | 470 | /* |
472 | * Undo all the changes done by ip_options_compile(). | 471 | * Undo all the changes done by ip_options_compile(). |
@@ -499,19 +498,19 @@ void ip_options_undo(struct ip_options * opt) | |||
499 | } | 498 | } |
500 | } | 499 | } |
501 | 500 | ||
502 | static struct ip_options *ip_options_get_alloc(const int optlen) | 501 | static struct ip_options_rcu *ip_options_get_alloc(const int optlen) |
503 | { | 502 | { |
504 | return kzalloc(sizeof(struct ip_options) + ((optlen + 3) & ~3), | 503 | return kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3), |
505 | GFP_KERNEL); | 504 | GFP_KERNEL); |
506 | } | 505 | } |
507 | 506 | ||
508 | static int ip_options_get_finish(struct net *net, struct ip_options **optp, | 507 | static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp, |
509 | struct ip_options *opt, int optlen) | 508 | struct ip_options_rcu *opt, int optlen) |
510 | { | 509 | { |
511 | while (optlen & 3) | 510 | while (optlen & 3) |
512 | opt->__data[optlen++] = IPOPT_END; | 511 | opt->opt.__data[optlen++] = IPOPT_END; |
513 | opt->optlen = optlen; | 512 | opt->opt.optlen = optlen; |
514 | if (optlen && ip_options_compile(net, opt, NULL)) { | 513 | if (optlen && ip_options_compile(net, &opt->opt, NULL)) { |
515 | kfree(opt); | 514 | kfree(opt); |
516 | return -EINVAL; | 515 | return -EINVAL; |
517 | } | 516 | } |
@@ -520,29 +519,29 @@ static int ip_options_get_finish(struct net *net, struct ip_options **optp, | |||
520 | return 0; | 519 | return 0; |
521 | } | 520 | } |
522 | 521 | ||
523 | int ip_options_get_from_user(struct net *net, struct ip_options **optp, | 522 | int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp, |
524 | unsigned char __user *data, int optlen) | 523 | unsigned char __user *data, int optlen) |
525 | { | 524 | { |
526 | struct ip_options *opt = ip_options_get_alloc(optlen); | 525 | struct ip_options_rcu *opt = ip_options_get_alloc(optlen); |
527 | 526 | ||
528 | if (!opt) | 527 | if (!opt) |
529 | return -ENOMEM; | 528 | return -ENOMEM; |
530 | if (optlen && copy_from_user(opt->__data, data, optlen)) { | 529 | if (optlen && copy_from_user(opt->opt.__data, data, optlen)) { |
531 | kfree(opt); | 530 | kfree(opt); |
532 | return -EFAULT; | 531 | return -EFAULT; |
533 | } | 532 | } |
534 | return ip_options_get_finish(net, optp, opt, optlen); | 533 | return ip_options_get_finish(net, optp, opt, optlen); |
535 | } | 534 | } |
536 | 535 | ||
537 | int ip_options_get(struct net *net, struct ip_options **optp, | 536 | int ip_options_get(struct net *net, struct ip_options_rcu **optp, |
538 | unsigned char *data, int optlen) | 537 | unsigned char *data, int optlen) |
539 | { | 538 | { |
540 | struct ip_options *opt = ip_options_get_alloc(optlen); | 539 | struct ip_options_rcu *opt = ip_options_get_alloc(optlen); |
541 | 540 | ||
542 | if (!opt) | 541 | if (!opt) |
543 | return -ENOMEM; | 542 | return -ENOMEM; |
544 | if (optlen) | 543 | if (optlen) |
545 | memcpy(opt->__data, data, optlen); | 544 | memcpy(opt->opt.__data, data, optlen); |
546 | return ip_options_get_finish(net, optp, opt, optlen); | 545 | return ip_options_get_finish(net, optp, opt, optlen); |
547 | } | 546 | } |
548 | 547 | ||
@@ -555,7 +554,7 @@ void ip_forward_options(struct sk_buff *skb) | |||
555 | 554 | ||
556 | if (opt->rr_needaddr) { | 555 | if (opt->rr_needaddr) { |
557 | optptr = (unsigned char *)raw + opt->rr; | 556 | optptr = (unsigned char *)raw + opt->rr; |
558 | ip_rt_get_source(&optptr[optptr[2]-5], rt); | 557 | ip_rt_get_source(&optptr[optptr[2]-5], skb, rt); |
559 | opt->is_changed = 1; | 558 | opt->is_changed = 1; |
560 | } | 559 | } |
561 | if (opt->srr_is_hit) { | 560 | if (opt->srr_is_hit) { |
@@ -569,19 +568,18 @@ void ip_forward_options(struct sk_buff *skb) | |||
569 | ) { | 568 | ) { |
570 | if (srrptr + 3 > srrspace) | 569 | if (srrptr + 3 > srrspace) |
571 | break; | 570 | break; |
572 | if (memcmp(&rt->rt_dst, &optptr[srrptr-1], 4) == 0) | 571 | if (memcmp(&ip_hdr(skb)->daddr, &optptr[srrptr-1], 4) == 0) |
573 | break; | 572 | break; |
574 | } | 573 | } |
575 | if (srrptr + 3 <= srrspace) { | 574 | if (srrptr + 3 <= srrspace) { |
576 | opt->is_changed = 1; | 575 | opt->is_changed = 1; |
577 | ip_rt_get_source(&optptr[srrptr-1], rt); | 576 | ip_rt_get_source(&optptr[srrptr-1], skb, rt); |
578 | ip_hdr(skb)->daddr = rt->rt_dst; | ||
579 | optptr[2] = srrptr+4; | 577 | optptr[2] = srrptr+4; |
580 | } else if (net_ratelimit()) | 578 | } else if (net_ratelimit()) |
581 | printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); | 579 | printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); |
582 | if (opt->ts_needaddr) { | 580 | if (opt->ts_needaddr) { |
583 | optptr = raw + opt->ts; | 581 | optptr = raw + opt->ts; |
584 | ip_rt_get_source(&optptr[optptr[2]-9], rt); | 582 | ip_rt_get_source(&optptr[optptr[2]-9], skb, rt); |
585 | opt->is_changed = 1; | 583 | opt->is_changed = 1; |
586 | } | 584 | } |
587 | } | 585 | } |
@@ -603,7 +601,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) | |||
603 | unsigned long orefdst; | 601 | unsigned long orefdst; |
604 | int err; | 602 | int err; |
605 | 603 | ||
606 | if (!opt->srr) | 604 | if (!rt) |
607 | return 0; | 605 | return 0; |
608 | 606 | ||
609 | if (skb->pkt_type != PACKET_HOST) | 607 | if (skb->pkt_type != PACKET_HOST) |
@@ -637,7 +635,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) | |||
637 | if (rt2->rt_type != RTN_LOCAL) | 635 | if (rt2->rt_type != RTN_LOCAL) |
638 | break; | 636 | break; |
639 | /* Superfast 8) loopback forward */ | 637 | /* Superfast 8) loopback forward */ |
640 | memcpy(&iph->daddr, &optptr[srrptr-1], 4); | 638 | iph->daddr = nexthop; |
641 | opt->is_changed = 1; | 639 | opt->is_changed = 1; |
642 | } | 640 | } |
643 | if (srrptr <= srrspace) { | 641 | if (srrptr <= srrspace) { |
@@ -646,3 +644,4 @@ int ip_options_rcv_srr(struct sk_buff *skb) | |||
646 | } | 644 | } |
647 | return 0; | 645 | return 0; |
648 | } | 646 | } |
647 | EXPORT_SYMBOL(ip_options_rcv_srr); | ||
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 7649d7750075..84f26e8e6c60 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c | |||
@@ -82,6 +82,7 @@ | |||
82 | #include <linux/tcp.h> | 82 | #include <linux/tcp.h> |
83 | 83 | ||
84 | int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; | 84 | int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; |
85 | EXPORT_SYMBOL(sysctl_ip_default_ttl); | ||
85 | 86 | ||
86 | /* Generate a checksum for an outgoing IP datagram. */ | 87 | /* Generate a checksum for an outgoing IP datagram. */ |
87 | __inline__ void ip_send_check(struct iphdr *iph) | 88 | __inline__ void ip_send_check(struct iphdr *iph) |
@@ -130,7 +131,7 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) | |||
130 | int ttl = inet->uc_ttl; | 131 | int ttl = inet->uc_ttl; |
131 | 132 | ||
132 | if (ttl < 0) | 133 | if (ttl < 0) |
133 | ttl = dst_metric(dst, RTAX_HOPLIMIT); | 134 | ttl = ip4_dst_hoplimit(dst); |
134 | return ttl; | 135 | return ttl; |
135 | } | 136 | } |
136 | 137 | ||
@@ -139,14 +140,14 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) | |||
139 | * | 140 | * |
140 | */ | 141 | */ |
141 | int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, | 142 | int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, |
142 | __be32 saddr, __be32 daddr, struct ip_options *opt) | 143 | __be32 saddr, __be32 daddr, struct ip_options_rcu *opt) |
143 | { | 144 | { |
144 | struct inet_sock *inet = inet_sk(sk); | 145 | struct inet_sock *inet = inet_sk(sk); |
145 | struct rtable *rt = skb_rtable(skb); | 146 | struct rtable *rt = skb_rtable(skb); |
146 | struct iphdr *iph; | 147 | struct iphdr *iph; |
147 | 148 | ||
148 | /* Build the IP header. */ | 149 | /* Build the IP header. */ |
149 | skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); | 150 | skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0)); |
150 | skb_reset_network_header(skb); | 151 | skb_reset_network_header(skb); |
151 | iph = ip_hdr(skb); | 152 | iph = ip_hdr(skb); |
152 | iph->version = 4; | 153 | iph->version = 4; |
@@ -157,14 +158,14 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, | |||
157 | else | 158 | else |
158 | iph->frag_off = 0; | 159 | iph->frag_off = 0; |
159 | iph->ttl = ip_select_ttl(inet, &rt->dst); | 160 | iph->ttl = ip_select_ttl(inet, &rt->dst); |
160 | iph->daddr = rt->rt_dst; | 161 | iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); |
161 | iph->saddr = rt->rt_src; | 162 | iph->saddr = saddr; |
162 | iph->protocol = sk->sk_protocol; | 163 | iph->protocol = sk->sk_protocol; |
163 | ip_select_ident(iph, &rt->dst, sk); | 164 | ip_select_ident(iph, &rt->dst, sk); |
164 | 165 | ||
165 | if (opt && opt->optlen) { | 166 | if (opt && opt->opt.optlen) { |
166 | iph->ihl += opt->optlen>>2; | 167 | iph->ihl += opt->opt.optlen>>2; |
167 | ip_options_build(skb, opt, daddr, rt, 0); | 168 | ip_options_build(skb, &opt->opt, daddr, rt, 0); |
168 | } | 169 | } |
169 | 170 | ||
170 | skb->priority = sk->sk_priority; | 171 | skb->priority = sk->sk_priority; |
@@ -311,11 +312,12 @@ int ip_output(struct sk_buff *skb) | |||
311 | !(IPCB(skb)->flags & IPSKB_REROUTED)); | 312 | !(IPCB(skb)->flags & IPSKB_REROUTED)); |
312 | } | 313 | } |
313 | 314 | ||
314 | int ip_queue_xmit(struct sk_buff *skb) | 315 | int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) |
315 | { | 316 | { |
316 | struct sock *sk = skb->sk; | 317 | struct sock *sk = skb->sk; |
317 | struct inet_sock *inet = inet_sk(sk); | 318 | struct inet_sock *inet = inet_sk(sk); |
318 | struct ip_options *opt = inet->opt; | 319 | struct ip_options_rcu *inet_opt; |
320 | struct flowi4 *fl4; | ||
319 | struct rtable *rt; | 321 | struct rtable *rt; |
320 | struct iphdr *iph; | 322 | struct iphdr *iph; |
321 | int res; | 323 | int res; |
@@ -324,6 +326,8 @@ int ip_queue_xmit(struct sk_buff *skb) | |||
324 | * f.e. by something like SCTP. | 326 | * f.e. by something like SCTP. |
325 | */ | 327 | */ |
326 | rcu_read_lock(); | 328 | rcu_read_lock(); |
329 | inet_opt = rcu_dereference(inet->inet_opt); | ||
330 | fl4 = &fl->u.ip4; | ||
327 | rt = skb_rtable(skb); | 331 | rt = skb_rtable(skb); |
328 | if (rt != NULL) | 332 | if (rt != NULL) |
329 | goto packet_routed; | 333 | goto packet_routed; |
@@ -335,40 +339,32 @@ int ip_queue_xmit(struct sk_buff *skb) | |||
335 | 339 | ||
336 | /* Use correct destination address if we have options. */ | 340 | /* Use correct destination address if we have options. */ |
337 | daddr = inet->inet_daddr; | 341 | daddr = inet->inet_daddr; |
338 | if(opt && opt->srr) | 342 | if (inet_opt && inet_opt->opt.srr) |
339 | daddr = opt->faddr; | 343 | daddr = inet_opt->opt.faddr; |
340 | 344 | ||
341 | { | 345 | /* If this fails, retransmit mechanism of transport layer will |
342 | struct flowi fl = { .oif = sk->sk_bound_dev_if, | 346 | * keep trying until route appears or the connection times |
343 | .mark = sk->sk_mark, | 347 | * itself out. |
344 | .nl_u = { .ip4_u = | 348 | */ |
345 | { .daddr = daddr, | 349 | rt = ip_route_output_ports(sock_net(sk), fl4, sk, |
346 | .saddr = inet->inet_saddr, | 350 | daddr, inet->inet_saddr, |
347 | .tos = RT_CONN_FLAGS(sk) } }, | 351 | inet->inet_dport, |
348 | .proto = sk->sk_protocol, | 352 | inet->inet_sport, |
349 | .flags = inet_sk_flowi_flags(sk), | 353 | sk->sk_protocol, |
350 | .uli_u = { .ports = | 354 | RT_CONN_FLAGS(sk), |
351 | { .sport = inet->inet_sport, | 355 | sk->sk_bound_dev_if); |
352 | .dport = inet->inet_dport } } }; | 356 | if (IS_ERR(rt)) |
353 | 357 | goto no_route; | |
354 | /* If this fails, retransmit mechanism of transport layer will | ||
355 | * keep trying until route appears or the connection times | ||
356 | * itself out. | ||
357 | */ | ||
358 | security_sk_classify_flow(sk, &fl); | ||
359 | if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) | ||
360 | goto no_route; | ||
361 | } | ||
362 | sk_setup_caps(sk, &rt->dst); | 358 | sk_setup_caps(sk, &rt->dst); |
363 | } | 359 | } |
364 | skb_dst_set_noref(skb, &rt->dst); | 360 | skb_dst_set_noref(skb, &rt->dst); |
365 | 361 | ||
366 | packet_routed: | 362 | packet_routed: |
367 | if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) | 363 | if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) |
368 | goto no_route; | 364 | goto no_route; |
369 | 365 | ||
370 | /* OK, we know where to send it, allocate and build IP header. */ | 366 | /* OK, we know where to send it, allocate and build IP header. */ |
371 | skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); | 367 | skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0)); |
372 | skb_reset_network_header(skb); | 368 | skb_reset_network_header(skb); |
373 | iph = ip_hdr(skb); | 369 | iph = ip_hdr(skb); |
374 | *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); | 370 | *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); |
@@ -378,13 +374,13 @@ packet_routed: | |||
378 | iph->frag_off = 0; | 374 | iph->frag_off = 0; |
379 | iph->ttl = ip_select_ttl(inet, &rt->dst); | 375 | iph->ttl = ip_select_ttl(inet, &rt->dst); |
380 | iph->protocol = sk->sk_protocol; | 376 | iph->protocol = sk->sk_protocol; |
381 | iph->saddr = rt->rt_src; | 377 | iph->saddr = fl4->saddr; |
382 | iph->daddr = rt->rt_dst; | 378 | iph->daddr = fl4->daddr; |
383 | /* Transport layer set skb->h.foo itself. */ | 379 | /* Transport layer set skb->h.foo itself. */ |
384 | 380 | ||
385 | if (opt && opt->optlen) { | 381 | if (inet_opt && inet_opt->opt.optlen) { |
386 | iph->ihl += opt->optlen >> 2; | 382 | iph->ihl += inet_opt->opt.optlen >> 2; |
387 | ip_options_build(skb, opt, inet->inet_daddr, rt, 0); | 383 | ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); |
388 | } | 384 | } |
389 | 385 | ||
390 | ip_select_ident_more(iph, &rt->dst, sk, | 386 | ip_select_ident_more(iph, &rt->dst, sk, |
@@ -487,7 +483,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) | |||
487 | * LATER: this step can be merged to real generation of fragments, | 483 | * LATER: this step can be merged to real generation of fragments, |
488 | * we can switch to copy when see the first bad fragment. | 484 | * we can switch to copy when see the first bad fragment. |
489 | */ | 485 | */ |
490 | if (skb_has_frags(skb)) { | 486 | if (skb_has_frag_list(skb)) { |
491 | struct sk_buff *frag, *frag2; | 487 | struct sk_buff *frag, *frag2; |
492 | int first_len = skb_pagelen(skb); | 488 | int first_len = skb_pagelen(skb); |
493 | 489 | ||
@@ -610,7 +606,7 @@ slow_path: | |||
610 | /* IF: it doesn't fit, use 'mtu' - the data space left */ | 606 | /* IF: it doesn't fit, use 'mtu' - the data space left */ |
611 | if (len > mtu) | 607 | if (len > mtu) |
612 | len = mtu; | 608 | len = mtu; |
613 | /* IF: we are not sending upto and including the packet end | 609 | /* IF: we are not sending up to and including the packet end |
614 | then align the next start on an eight byte boundary */ | 610 | then align the next start on an eight byte boundary */ |
615 | if (len < left) { | 611 | if (len < left) { |
616 | len &= ~7; | 612 | len &= ~7; |
@@ -734,6 +730,7 @@ csum_page(struct page *page, int offset, int copy) | |||
734 | } | 730 | } |
735 | 731 | ||
736 | static inline int ip_ufo_append_data(struct sock *sk, | 732 | static inline int ip_ufo_append_data(struct sock *sk, |
733 | struct sk_buff_head *queue, | ||
737 | int getfrag(void *from, char *to, int offset, int len, | 734 | int getfrag(void *from, char *to, int offset, int len, |
738 | int odd, struct sk_buff *skb), | 735 | int odd, struct sk_buff *skb), |
739 | void *from, int length, int hh_len, int fragheaderlen, | 736 | void *from, int length, int hh_len, int fragheaderlen, |
@@ -746,7 +743,7 @@ static inline int ip_ufo_append_data(struct sock *sk, | |||
746 | * device, so create one single skb packet containing complete | 743 | * device, so create one single skb packet containing complete |
747 | * udp datagram | 744 | * udp datagram |
748 | */ | 745 | */ |
749 | if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { | 746 | if ((skb = skb_peek_tail(queue)) == NULL) { |
750 | skb = sock_alloc_send_skb(sk, | 747 | skb = sock_alloc_send_skb(sk, |
751 | hh_len + fragheaderlen + transhdrlen + 20, | 748 | hh_len + fragheaderlen + transhdrlen + 20, |
752 | (flags & MSG_DONTWAIT), &err); | 749 | (flags & MSG_DONTWAIT), &err); |
@@ -768,40 +765,30 @@ static inline int ip_ufo_append_data(struct sock *sk, | |||
768 | 765 | ||
769 | skb->ip_summed = CHECKSUM_PARTIAL; | 766 | skb->ip_summed = CHECKSUM_PARTIAL; |
770 | skb->csum = 0; | 767 | skb->csum = 0; |
771 | sk->sk_sndmsg_off = 0; | ||
772 | 768 | ||
773 | /* specify the length of each IP datagram fragment */ | 769 | /* specify the length of each IP datagram fragment */ |
774 | skb_shinfo(skb)->gso_size = mtu - fragheaderlen; | 770 | skb_shinfo(skb)->gso_size = mtu - fragheaderlen; |
775 | skb_shinfo(skb)->gso_type = SKB_GSO_UDP; | 771 | skb_shinfo(skb)->gso_type = SKB_GSO_UDP; |
776 | __skb_queue_tail(&sk->sk_write_queue, skb); | 772 | __skb_queue_tail(queue, skb); |
777 | } | 773 | } |
778 | 774 | ||
779 | return skb_append_datato_frags(sk, skb, getfrag, from, | 775 | return skb_append_datato_frags(sk, skb, getfrag, from, |
780 | (length - transhdrlen)); | 776 | (length - transhdrlen)); |
781 | } | 777 | } |
782 | 778 | ||
783 | /* | 779 | static int __ip_append_data(struct sock *sk, |
784 | * ip_append_data() and ip_append_page() can make one large IP datagram | 780 | struct flowi4 *fl4, |
785 | * from many pieces of data. Each pieces will be holded on the socket | 781 | struct sk_buff_head *queue, |
786 | * until ip_push_pending_frames() is called. Each piece can be a page | 782 | struct inet_cork *cork, |
787 | * or non-page data. | 783 | int getfrag(void *from, char *to, int offset, |
788 | * | 784 | int len, int odd, struct sk_buff *skb), |
789 | * Not only UDP, other transport protocols - e.g. raw sockets - can use | 785 | void *from, int length, int transhdrlen, |
790 | * this interface potentially. | 786 | unsigned int flags) |
791 | * | ||
792 | * LATER: length must be adjusted by pad at tail, when it is required. | ||
793 | */ | ||
794 | int ip_append_data(struct sock *sk, | ||
795 | int getfrag(void *from, char *to, int offset, int len, | ||
796 | int odd, struct sk_buff *skb), | ||
797 | void *from, int length, int transhdrlen, | ||
798 | struct ipcm_cookie *ipc, struct rtable **rtp, | ||
799 | unsigned int flags) | ||
800 | { | 787 | { |
801 | struct inet_sock *inet = inet_sk(sk); | 788 | struct inet_sock *inet = inet_sk(sk); |
802 | struct sk_buff *skb; | 789 | struct sk_buff *skb; |
803 | 790 | ||
804 | struct ip_options *opt = NULL; | 791 | struct ip_options *opt = cork->opt; |
805 | int hh_len; | 792 | int hh_len; |
806 | int exthdrlen; | 793 | int exthdrlen; |
807 | int mtu; | 794 | int mtu; |
@@ -810,60 +797,20 @@ int ip_append_data(struct sock *sk, | |||
810 | int offset = 0; | 797 | int offset = 0; |
811 | unsigned int maxfraglen, fragheaderlen; | 798 | unsigned int maxfraglen, fragheaderlen; |
812 | int csummode = CHECKSUM_NONE; | 799 | int csummode = CHECKSUM_NONE; |
813 | struct rtable *rt; | 800 | struct rtable *rt = (struct rtable *)cork->dst; |
814 | 801 | ||
815 | if (flags&MSG_PROBE) | 802 | skb = skb_peek_tail(queue); |
816 | return 0; | ||
817 | 803 | ||
818 | if (skb_queue_empty(&sk->sk_write_queue)) { | 804 | exthdrlen = !skb ? rt->dst.header_len : 0; |
819 | /* | 805 | mtu = cork->fragsize; |
820 | * setup for corking. | ||
821 | */ | ||
822 | opt = ipc->opt; | ||
823 | if (opt) { | ||
824 | if (inet->cork.opt == NULL) { | ||
825 | inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation); | ||
826 | if (unlikely(inet->cork.opt == NULL)) | ||
827 | return -ENOBUFS; | ||
828 | } | ||
829 | memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen); | ||
830 | inet->cork.flags |= IPCORK_OPT; | ||
831 | inet->cork.addr = ipc->addr; | ||
832 | } | ||
833 | rt = *rtp; | ||
834 | if (unlikely(!rt)) | ||
835 | return -EFAULT; | ||
836 | /* | ||
837 | * We steal reference to this route, caller should not release it | ||
838 | */ | ||
839 | *rtp = NULL; | ||
840 | inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? | ||
841 | rt->dst.dev->mtu : | ||
842 | dst_mtu(rt->dst.path); | ||
843 | inet->cork.dst = &rt->dst; | ||
844 | inet->cork.length = 0; | ||
845 | sk->sk_sndmsg_page = NULL; | ||
846 | sk->sk_sndmsg_off = 0; | ||
847 | if ((exthdrlen = rt->dst.header_len) != 0) { | ||
848 | length += exthdrlen; | ||
849 | transhdrlen += exthdrlen; | ||
850 | } | ||
851 | } else { | ||
852 | rt = (struct rtable *)inet->cork.dst; | ||
853 | if (inet->cork.flags & IPCORK_OPT) | ||
854 | opt = inet->cork.opt; | ||
855 | 806 | ||
856 | transhdrlen = 0; | ||
857 | exthdrlen = 0; | ||
858 | mtu = inet->cork.fragsize; | ||
859 | } | ||
860 | hh_len = LL_RESERVED_SPACE(rt->dst.dev); | 807 | hh_len = LL_RESERVED_SPACE(rt->dst.dev); |
861 | 808 | ||
862 | fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); | 809 | fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); |
863 | maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; | 810 | maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; |
864 | 811 | ||
865 | if (inet->cork.length + length > 0xFFFF - fragheaderlen) { | 812 | if (cork->length + length > 0xFFFF - fragheaderlen) { |
866 | ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, | 813 | ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, |
867 | mtu-exthdrlen); | 814 | mtu-exthdrlen); |
868 | return -EMSGSIZE; | 815 | return -EMSGSIZE; |
869 | } | 816 | } |
@@ -878,15 +825,13 @@ int ip_append_data(struct sock *sk, | |||
878 | !exthdrlen) | 825 | !exthdrlen) |
879 | csummode = CHECKSUM_PARTIAL; | 826 | csummode = CHECKSUM_PARTIAL; |
880 | 827 | ||
881 | skb = skb_peek_tail(&sk->sk_write_queue); | 828 | cork->length += length; |
882 | |||
883 | inet->cork.length += length; | ||
884 | if (((length > mtu) || (skb && skb_is_gso(skb))) && | 829 | if (((length > mtu) || (skb && skb_is_gso(skb))) && |
885 | (sk->sk_protocol == IPPROTO_UDP) && | 830 | (sk->sk_protocol == IPPROTO_UDP) && |
886 | (rt->dst.dev->features & NETIF_F_UFO)) { | 831 | (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) { |
887 | err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, | 832 | err = ip_ufo_append_data(sk, queue, getfrag, from, length, |
888 | fragheaderlen, transhdrlen, mtu, | 833 | hh_len, fragheaderlen, transhdrlen, |
889 | flags); | 834 | mtu, flags); |
890 | if (err) | 835 | if (err) |
891 | goto error; | 836 | goto error; |
892 | return 0; | 837 | return 0; |
@@ -934,7 +879,9 @@ alloc_new_skb: | |||
934 | !(rt->dst.dev->features&NETIF_F_SG)) | 879 | !(rt->dst.dev->features&NETIF_F_SG)) |
935 | alloclen = mtu; | 880 | alloclen = mtu; |
936 | else | 881 | else |
937 | alloclen = datalen + fragheaderlen; | 882 | alloclen = fraglen; |
883 | |||
884 | alloclen += exthdrlen; | ||
938 | 885 | ||
939 | /* The last fragment gets additional space at tail. | 886 | /* The last fragment gets additional space at tail. |
940 | * Note, with MSG_MORE we overallocate on fragments, | 887 | * Note, with MSG_MORE we overallocate on fragments, |
@@ -960,7 +907,7 @@ alloc_new_skb: | |||
960 | else | 907 | else |
961 | /* only the initial fragment is | 908 | /* only the initial fragment is |
962 | time stamped */ | 909 | time stamped */ |
963 | ipc->shtx.flags = 0; | 910 | cork->tx_flags = 0; |
964 | } | 911 | } |
965 | if (skb == NULL) | 912 | if (skb == NULL) |
966 | goto error; | 913 | goto error; |
@@ -971,16 +918,16 @@ alloc_new_skb: | |||
971 | skb->ip_summed = csummode; | 918 | skb->ip_summed = csummode; |
972 | skb->csum = 0; | 919 | skb->csum = 0; |
973 | skb_reserve(skb, hh_len); | 920 | skb_reserve(skb, hh_len); |
974 | *skb_tx(skb) = ipc->shtx; | 921 | skb_shinfo(skb)->tx_flags = cork->tx_flags; |
975 | 922 | ||
976 | /* | 923 | /* |
977 | * Find where to start putting bytes. | 924 | * Find where to start putting bytes. |
978 | */ | 925 | */ |
979 | data = skb_put(skb, fraglen); | 926 | data = skb_put(skb, fraglen + exthdrlen); |
980 | skb_set_network_header(skb, exthdrlen); | 927 | skb_set_network_header(skb, exthdrlen); |
981 | skb->transport_header = (skb->network_header + | 928 | skb->transport_header = (skb->network_header + |
982 | fragheaderlen); | 929 | fragheaderlen); |
983 | data += fragheaderlen; | 930 | data += fragheaderlen + exthdrlen; |
984 | 931 | ||
985 | if (fraggap) { | 932 | if (fraggap) { |
986 | skb->csum = skb_copy_and_csum_bits( | 933 | skb->csum = skb_copy_and_csum_bits( |
@@ -1008,7 +955,7 @@ alloc_new_skb: | |||
1008 | /* | 955 | /* |
1009 | * Put the packet on the pending queue. | 956 | * Put the packet on the pending queue. |
1010 | */ | 957 | */ |
1011 | __skb_queue_tail(&sk->sk_write_queue, skb); | 958 | __skb_queue_tail(queue, skb); |
1012 | continue; | 959 | continue; |
1013 | } | 960 | } |
1014 | 961 | ||
@@ -1028,8 +975,8 @@ alloc_new_skb: | |||
1028 | } else { | 975 | } else { |
1029 | int i = skb_shinfo(skb)->nr_frags; | 976 | int i = skb_shinfo(skb)->nr_frags; |
1030 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; | 977 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; |
1031 | struct page *page = sk->sk_sndmsg_page; | 978 | struct page *page = cork->page; |
1032 | int off = sk->sk_sndmsg_off; | 979 | int off = cork->off; |
1033 | unsigned int left; | 980 | unsigned int left; |
1034 | 981 | ||
1035 | if (page && (left = PAGE_SIZE - off) > 0) { | 982 | if (page && (left = PAGE_SIZE - off) > 0) { |
@@ -1041,7 +988,7 @@ alloc_new_skb: | |||
1041 | goto error; | 988 | goto error; |
1042 | } | 989 | } |
1043 | get_page(page); | 990 | get_page(page); |
1044 | skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); | 991 | skb_fill_page_desc(skb, i, page, off, 0); |
1045 | frag = &skb_shinfo(skb)->frags[i]; | 992 | frag = &skb_shinfo(skb)->frags[i]; |
1046 | } | 993 | } |
1047 | } else if (i < MAX_SKB_FRAGS) { | 994 | } else if (i < MAX_SKB_FRAGS) { |
@@ -1052,8 +999,8 @@ alloc_new_skb: | |||
1052 | err = -ENOMEM; | 999 | err = -ENOMEM; |
1053 | goto error; | 1000 | goto error; |
1054 | } | 1001 | } |
1055 | sk->sk_sndmsg_page = page; | 1002 | cork->page = page; |
1056 | sk->sk_sndmsg_off = 0; | 1003 | cork->off = 0; |
1057 | 1004 | ||
1058 | skb_fill_page_desc(skb, i, page, 0, 0); | 1005 | skb_fill_page_desc(skb, i, page, 0, 0); |
1059 | frag = &skb_shinfo(skb)->frags[i]; | 1006 | frag = &skb_shinfo(skb)->frags[i]; |
@@ -1065,7 +1012,7 @@ alloc_new_skb: | |||
1065 | err = -EFAULT; | 1012 | err = -EFAULT; |
1066 | goto error; | 1013 | goto error; |
1067 | } | 1014 | } |
1068 | sk->sk_sndmsg_off += copy; | 1015 | cork->off += copy; |
1069 | frag->size += copy; | 1016 | frag->size += copy; |
1070 | skb->len += copy; | 1017 | skb->len += copy; |
1071 | skb->data_len += copy; | 1018 | skb->data_len += copy; |
@@ -1079,18 +1026,95 @@ alloc_new_skb: | |||
1079 | return 0; | 1026 | return 0; |
1080 | 1027 | ||
1081 | error: | 1028 | error: |
1082 | inet->cork.length -= length; | 1029 | cork->length -= length; |
1083 | IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); | 1030 | IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); |
1084 | return err; | 1031 | return err; |
1085 | } | 1032 | } |
1086 | 1033 | ||
1087 | ssize_t ip_append_page(struct sock *sk, struct page *page, | 1034 | static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, |
1035 | struct ipcm_cookie *ipc, struct rtable **rtp) | ||
1036 | { | ||
1037 | struct inet_sock *inet = inet_sk(sk); | ||
1038 | struct ip_options_rcu *opt; | ||
1039 | struct rtable *rt; | ||
1040 | |||
1041 | /* | ||
1042 | * setup for corking. | ||
1043 | */ | ||
1044 | opt = ipc->opt; | ||
1045 | if (opt) { | ||
1046 | if (cork->opt == NULL) { | ||
1047 | cork->opt = kmalloc(sizeof(struct ip_options) + 40, | ||
1048 | sk->sk_allocation); | ||
1049 | if (unlikely(cork->opt == NULL)) | ||
1050 | return -ENOBUFS; | ||
1051 | } | ||
1052 | memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen); | ||
1053 | cork->flags |= IPCORK_OPT; | ||
1054 | cork->addr = ipc->addr; | ||
1055 | } | ||
1056 | rt = *rtp; | ||
1057 | if (unlikely(!rt)) | ||
1058 | return -EFAULT; | ||
1059 | /* | ||
1060 | * We steal reference to this route, caller should not release it | ||
1061 | */ | ||
1062 | *rtp = NULL; | ||
1063 | cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ? | ||
1064 | rt->dst.dev->mtu : dst_mtu(&rt->dst); | ||
1065 | cork->dst = &rt->dst; | ||
1066 | cork->length = 0; | ||
1067 | cork->tx_flags = ipc->tx_flags; | ||
1068 | cork->page = NULL; | ||
1069 | cork->off = 0; | ||
1070 | |||
1071 | return 0; | ||
1072 | } | ||
1073 | |||
1074 | /* | ||
1075 | * ip_append_data() and ip_append_page() can make one large IP datagram | ||
1076 | * from many pieces of data. Each pieces will be holded on the socket | ||
1077 | * until ip_push_pending_frames() is called. Each piece can be a page | ||
1078 | * or non-page data. | ||
1079 | * | ||
1080 | * Not only UDP, other transport protocols - e.g. raw sockets - can use | ||
1081 | * this interface potentially. | ||
1082 | * | ||
1083 | * LATER: length must be adjusted by pad at tail, when it is required. | ||
1084 | */ | ||
1085 | int ip_append_data(struct sock *sk, struct flowi4 *fl4, | ||
1086 | int getfrag(void *from, char *to, int offset, int len, | ||
1087 | int odd, struct sk_buff *skb), | ||
1088 | void *from, int length, int transhdrlen, | ||
1089 | struct ipcm_cookie *ipc, struct rtable **rtp, | ||
1090 | unsigned int flags) | ||
1091 | { | ||
1092 | struct inet_sock *inet = inet_sk(sk); | ||
1093 | int err; | ||
1094 | |||
1095 | if (flags&MSG_PROBE) | ||
1096 | return 0; | ||
1097 | |||
1098 | if (skb_queue_empty(&sk->sk_write_queue)) { | ||
1099 | err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp); | ||
1100 | if (err) | ||
1101 | return err; | ||
1102 | } else { | ||
1103 | transhdrlen = 0; | ||
1104 | } | ||
1105 | |||
1106 | return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag, | ||
1107 | from, length, transhdrlen, flags); | ||
1108 | } | ||
1109 | |||
1110 | ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, | ||
1088 | int offset, size_t size, int flags) | 1111 | int offset, size_t size, int flags) |
1089 | { | 1112 | { |
1090 | struct inet_sock *inet = inet_sk(sk); | 1113 | struct inet_sock *inet = inet_sk(sk); |
1091 | struct sk_buff *skb; | 1114 | struct sk_buff *skb; |
1092 | struct rtable *rt; | 1115 | struct rtable *rt; |
1093 | struct ip_options *opt = NULL; | 1116 | struct ip_options *opt = NULL; |
1117 | struct inet_cork *cork; | ||
1094 | int hh_len; | 1118 | int hh_len; |
1095 | int mtu; | 1119 | int mtu; |
1096 | int len; | 1120 | int len; |
@@ -1106,28 +1130,29 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, | |||
1106 | if (skb_queue_empty(&sk->sk_write_queue)) | 1130 | if (skb_queue_empty(&sk->sk_write_queue)) |
1107 | return -EINVAL; | 1131 | return -EINVAL; |
1108 | 1132 | ||
1109 | rt = (struct rtable *)inet->cork.dst; | 1133 | cork = &inet->cork.base; |
1110 | if (inet->cork.flags & IPCORK_OPT) | 1134 | rt = (struct rtable *)cork->dst; |
1111 | opt = inet->cork.opt; | 1135 | if (cork->flags & IPCORK_OPT) |
1136 | opt = cork->opt; | ||
1112 | 1137 | ||
1113 | if (!(rt->dst.dev->features&NETIF_F_SG)) | 1138 | if (!(rt->dst.dev->features&NETIF_F_SG)) |
1114 | return -EOPNOTSUPP; | 1139 | return -EOPNOTSUPP; |
1115 | 1140 | ||
1116 | hh_len = LL_RESERVED_SPACE(rt->dst.dev); | 1141 | hh_len = LL_RESERVED_SPACE(rt->dst.dev); |
1117 | mtu = inet->cork.fragsize; | 1142 | mtu = cork->fragsize; |
1118 | 1143 | ||
1119 | fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); | 1144 | fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); |
1120 | maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; | 1145 | maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; |
1121 | 1146 | ||
1122 | if (inet->cork.length + size > 0xFFFF - fragheaderlen) { | 1147 | if (cork->length + size > 0xFFFF - fragheaderlen) { |
1123 | ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu); | 1148 | ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu); |
1124 | return -EMSGSIZE; | 1149 | return -EMSGSIZE; |
1125 | } | 1150 | } |
1126 | 1151 | ||
1127 | if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) | 1152 | if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) |
1128 | return -EINVAL; | 1153 | return -EINVAL; |
1129 | 1154 | ||
1130 | inet->cork.length += size; | 1155 | cork->length += size; |
1131 | if ((size + skb->len > mtu) && | 1156 | if ((size + skb->len > mtu) && |
1132 | (sk->sk_protocol == IPPROTO_UDP) && | 1157 | (sk->sk_protocol == IPPROTO_UDP) && |
1133 | (rt->dst.dev->features & NETIF_F_UFO)) { | 1158 | (rt->dst.dev->features & NETIF_F_UFO)) { |
@@ -1222,45 +1247,47 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, | |||
1222 | return 0; | 1247 | return 0; |
1223 | 1248 | ||
1224 | error: | 1249 | error: |
1225 | inet->cork.length -= size; | 1250 | cork->length -= size; |
1226 | IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); | 1251 | IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); |
1227 | return err; | 1252 | return err; |
1228 | } | 1253 | } |
1229 | 1254 | ||
1230 | static void ip_cork_release(struct inet_sock *inet) | 1255 | static void ip_cork_release(struct inet_cork *cork) |
1231 | { | 1256 | { |
1232 | inet->cork.flags &= ~IPCORK_OPT; | 1257 | cork->flags &= ~IPCORK_OPT; |
1233 | kfree(inet->cork.opt); | 1258 | kfree(cork->opt); |
1234 | inet->cork.opt = NULL; | 1259 | cork->opt = NULL; |
1235 | dst_release(inet->cork.dst); | 1260 | dst_release(cork->dst); |
1236 | inet->cork.dst = NULL; | 1261 | cork->dst = NULL; |
1237 | } | 1262 | } |
1238 | 1263 | ||
1239 | /* | 1264 | /* |
1240 | * Combined all pending IP fragments on the socket as one IP datagram | 1265 | * Combined all pending IP fragments on the socket as one IP datagram |
1241 | * and push them out. | 1266 | * and push them out. |
1242 | */ | 1267 | */ |
1243 | int ip_push_pending_frames(struct sock *sk) | 1268 | struct sk_buff *__ip_make_skb(struct sock *sk, |
1269 | struct flowi4 *fl4, | ||
1270 | struct sk_buff_head *queue, | ||
1271 | struct inet_cork *cork) | ||
1244 | { | 1272 | { |
1245 | struct sk_buff *skb, *tmp_skb; | 1273 | struct sk_buff *skb, *tmp_skb; |
1246 | struct sk_buff **tail_skb; | 1274 | struct sk_buff **tail_skb; |
1247 | struct inet_sock *inet = inet_sk(sk); | 1275 | struct inet_sock *inet = inet_sk(sk); |
1248 | struct net *net = sock_net(sk); | 1276 | struct net *net = sock_net(sk); |
1249 | struct ip_options *opt = NULL; | 1277 | struct ip_options *opt = NULL; |
1250 | struct rtable *rt = (struct rtable *)inet->cork.dst; | 1278 | struct rtable *rt = (struct rtable *)cork->dst; |
1251 | struct iphdr *iph; | 1279 | struct iphdr *iph; |
1252 | __be16 df = 0; | 1280 | __be16 df = 0; |
1253 | __u8 ttl; | 1281 | __u8 ttl; |
1254 | int err = 0; | ||
1255 | 1282 | ||
1256 | if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) | 1283 | if ((skb = __skb_dequeue(queue)) == NULL) |
1257 | goto out; | 1284 | goto out; |
1258 | tail_skb = &(skb_shinfo(skb)->frag_list); | 1285 | tail_skb = &(skb_shinfo(skb)->frag_list); |
1259 | 1286 | ||
1260 | /* move skb->data to ip header from ext header */ | 1287 | /* move skb->data to ip header from ext header */ |
1261 | if (skb->data < skb_network_header(skb)) | 1288 | if (skb->data < skb_network_header(skb)) |
1262 | __skb_pull(skb, skb_network_offset(skb)); | 1289 | __skb_pull(skb, skb_network_offset(skb)); |
1263 | while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { | 1290 | while ((tmp_skb = __skb_dequeue(queue)) != NULL) { |
1264 | __skb_pull(tmp_skb, skb_network_header_len(skb)); | 1291 | __skb_pull(tmp_skb, skb_network_header_len(skb)); |
1265 | *tail_skb = tmp_skb; | 1292 | *tail_skb = tmp_skb; |
1266 | tail_skb = &(tmp_skb->next); | 1293 | tail_skb = &(tmp_skb->next); |
@@ -1286,8 +1313,8 @@ int ip_push_pending_frames(struct sock *sk) | |||
1286 | ip_dont_fragment(sk, &rt->dst))) | 1313 | ip_dont_fragment(sk, &rt->dst))) |
1287 | df = htons(IP_DF); | 1314 | df = htons(IP_DF); |
1288 | 1315 | ||
1289 | if (inet->cork.flags & IPCORK_OPT) | 1316 | if (cork->flags & IPCORK_OPT) |
1290 | opt = inet->cork.opt; | 1317 | opt = cork->opt; |
1291 | 1318 | ||
1292 | if (rt->rt_type == RTN_MULTICAST) | 1319 | if (rt->rt_type == RTN_MULTICAST) |
1293 | ttl = inet->mc_ttl; | 1320 | ttl = inet->mc_ttl; |
@@ -1297,17 +1324,18 @@ int ip_push_pending_frames(struct sock *sk) | |||
1297 | iph = (struct iphdr *)skb->data; | 1324 | iph = (struct iphdr *)skb->data; |
1298 | iph->version = 4; | 1325 | iph->version = 4; |
1299 | iph->ihl = 5; | 1326 | iph->ihl = 5; |
1300 | if (opt) { | ||
1301 | iph->ihl += opt->optlen>>2; | ||
1302 | ip_options_build(skb, opt, inet->cork.addr, rt, 0); | ||
1303 | } | ||
1304 | iph->tos = inet->tos; | 1327 | iph->tos = inet->tos; |
1305 | iph->frag_off = df; | 1328 | iph->frag_off = df; |
1306 | ip_select_ident(iph, &rt->dst, sk); | 1329 | ip_select_ident(iph, &rt->dst, sk); |
1307 | iph->ttl = ttl; | 1330 | iph->ttl = ttl; |
1308 | iph->protocol = sk->sk_protocol; | 1331 | iph->protocol = sk->sk_protocol; |
1309 | iph->saddr = rt->rt_src; | 1332 | iph->saddr = fl4->saddr; |
1310 | iph->daddr = rt->rt_dst; | 1333 | iph->daddr = fl4->daddr; |
1334 | |||
1335 | if (opt) { | ||
1336 | iph->ihl += opt->optlen>>2; | ||
1337 | ip_options_build(skb, opt, cork->addr, rt, 0); | ||
1338 | } | ||
1311 | 1339 | ||
1312 | skb->priority = sk->sk_priority; | 1340 | skb->priority = sk->sk_priority; |
1313 | skb->mark = sk->sk_mark; | 1341 | skb->mark = sk->sk_mark; |
@@ -1315,44 +1343,99 @@ int ip_push_pending_frames(struct sock *sk) | |||
1315 | * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec | 1343 | * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec |
1316 | * on dst refcount | 1344 | * on dst refcount |
1317 | */ | 1345 | */ |
1318 | inet->cork.dst = NULL; | 1346 | cork->dst = NULL; |
1319 | skb_dst_set(skb, &rt->dst); | 1347 | skb_dst_set(skb, &rt->dst); |
1320 | 1348 | ||
1321 | if (iph->protocol == IPPROTO_ICMP) | 1349 | if (iph->protocol == IPPROTO_ICMP) |
1322 | icmp_out_count(net, ((struct icmphdr *) | 1350 | icmp_out_count(net, ((struct icmphdr *) |
1323 | skb_transport_header(skb))->type); | 1351 | skb_transport_header(skb))->type); |
1324 | 1352 | ||
1325 | /* Netfilter gets whole the not fragmented skb. */ | 1353 | ip_cork_release(cork); |
1354 | out: | ||
1355 | return skb; | ||
1356 | } | ||
1357 | |||
1358 | int ip_send_skb(struct sk_buff *skb) | ||
1359 | { | ||
1360 | struct net *net = sock_net(skb->sk); | ||
1361 | int err; | ||
1362 | |||
1326 | err = ip_local_out(skb); | 1363 | err = ip_local_out(skb); |
1327 | if (err) { | 1364 | if (err) { |
1328 | if (err > 0) | 1365 | if (err > 0) |
1329 | err = net_xmit_errno(err); | 1366 | err = net_xmit_errno(err); |
1330 | if (err) | 1367 | if (err) |
1331 | goto error; | 1368 | IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); |
1332 | } | 1369 | } |
1333 | 1370 | ||
1334 | out: | ||
1335 | ip_cork_release(inet); | ||
1336 | return err; | 1371 | return err; |
1372 | } | ||
1337 | 1373 | ||
1338 | error: | 1374 | int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4) |
1339 | IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); | 1375 | { |
1340 | goto out; | 1376 | struct sk_buff *skb; |
1377 | |||
1378 | skb = ip_finish_skb(sk, fl4); | ||
1379 | if (!skb) | ||
1380 | return 0; | ||
1381 | |||
1382 | /* Netfilter gets whole the not fragmented skb. */ | ||
1383 | return ip_send_skb(skb); | ||
1341 | } | 1384 | } |
1342 | 1385 | ||
1343 | /* | 1386 | /* |
1344 | * Throw away all pending data on the socket. | 1387 | * Throw away all pending data on the socket. |
1345 | */ | 1388 | */ |
1346 | void ip_flush_pending_frames(struct sock *sk) | 1389 | static void __ip_flush_pending_frames(struct sock *sk, |
1390 | struct sk_buff_head *queue, | ||
1391 | struct inet_cork *cork) | ||
1347 | { | 1392 | { |
1348 | struct sk_buff *skb; | 1393 | struct sk_buff *skb; |
1349 | 1394 | ||
1350 | while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) | 1395 | while ((skb = __skb_dequeue_tail(queue)) != NULL) |
1351 | kfree_skb(skb); | 1396 | kfree_skb(skb); |
1352 | 1397 | ||
1353 | ip_cork_release(inet_sk(sk)); | 1398 | ip_cork_release(cork); |
1354 | } | 1399 | } |
1355 | 1400 | ||
1401 | void ip_flush_pending_frames(struct sock *sk) | ||
1402 | { | ||
1403 | __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base); | ||
1404 | } | ||
1405 | |||
1406 | struct sk_buff *ip_make_skb(struct sock *sk, | ||
1407 | struct flowi4 *fl4, | ||
1408 | int getfrag(void *from, char *to, int offset, | ||
1409 | int len, int odd, struct sk_buff *skb), | ||
1410 | void *from, int length, int transhdrlen, | ||
1411 | struct ipcm_cookie *ipc, struct rtable **rtp, | ||
1412 | unsigned int flags) | ||
1413 | { | ||
1414 | struct inet_cork cork; | ||
1415 | struct sk_buff_head queue; | ||
1416 | int err; | ||
1417 | |||
1418 | if (flags & MSG_PROBE) | ||
1419 | return NULL; | ||
1420 | |||
1421 | __skb_queue_head_init(&queue); | ||
1422 | |||
1423 | cork.flags = 0; | ||
1424 | cork.addr = 0; | ||
1425 | cork.opt = NULL; | ||
1426 | err = ip_setup_cork(sk, &cork, ipc, rtp); | ||
1427 | if (err) | ||
1428 | return ERR_PTR(err); | ||
1429 | |||
1430 | err = __ip_append_data(sk, fl4, &queue, &cork, getfrag, | ||
1431 | from, length, transhdrlen, flags); | ||
1432 | if (err) { | ||
1433 | __ip_flush_pending_frames(sk, &queue, &cork); | ||
1434 | return ERR_PTR(err); | ||
1435 | } | ||
1436 | |||
1437 | return __ip_make_skb(sk, fl4, &queue, &cork); | ||
1438 | } | ||
1356 | 1439 | ||
1357 | /* | 1440 | /* |
1358 | * Fetch data from kernel space and fill in checksum if needed. | 1441 | * Fetch data from kernel space and fill in checksum if needed. |
@@ -1374,48 +1457,39 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, | |||
1374 | * Should run single threaded per socket because it uses the sock | 1457 | * Should run single threaded per socket because it uses the sock |
1375 | * structure to pass arguments. | 1458 | * structure to pass arguments. |
1376 | */ | 1459 | */ |
1377 | void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, | 1460 | void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, |
1378 | unsigned int len) | 1461 | struct ip_reply_arg *arg, unsigned int len) |
1379 | { | 1462 | { |
1380 | struct inet_sock *inet = inet_sk(sk); | 1463 | struct inet_sock *inet = inet_sk(sk); |
1381 | struct { | 1464 | struct ip_options_data replyopts; |
1382 | struct ip_options opt; | ||
1383 | char data[40]; | ||
1384 | } replyopts; | ||
1385 | struct ipcm_cookie ipc; | 1465 | struct ipcm_cookie ipc; |
1386 | __be32 daddr; | 1466 | struct flowi4 fl4; |
1387 | struct rtable *rt = skb_rtable(skb); | 1467 | struct rtable *rt = skb_rtable(skb); |
1388 | 1468 | ||
1389 | if (ip_options_echo(&replyopts.opt, skb)) | 1469 | if (ip_options_echo(&replyopts.opt.opt, skb)) |
1390 | return; | 1470 | return; |
1391 | 1471 | ||
1392 | daddr = ipc.addr = rt->rt_src; | 1472 | ipc.addr = daddr; |
1393 | ipc.opt = NULL; | 1473 | ipc.opt = NULL; |
1394 | ipc.shtx.flags = 0; | 1474 | ipc.tx_flags = 0; |
1395 | 1475 | ||
1396 | if (replyopts.opt.optlen) { | 1476 | if (replyopts.opt.opt.optlen) { |
1397 | ipc.opt = &replyopts.opt; | 1477 | ipc.opt = &replyopts.opt; |
1398 | 1478 | ||
1399 | if (ipc.opt->srr) | 1479 | if (replyopts.opt.opt.srr) |
1400 | daddr = replyopts.opt.faddr; | 1480 | daddr = replyopts.opt.opt.faddr; |
1401 | } | 1481 | } |
1402 | 1482 | ||
1403 | { | 1483 | flowi4_init_output(&fl4, arg->bound_dev_if, 0, |
1404 | struct flowi fl = { .oif = arg->bound_dev_if, | 1484 | RT_TOS(ip_hdr(skb)->tos), |
1405 | .nl_u = { .ip4_u = | 1485 | RT_SCOPE_UNIVERSE, sk->sk_protocol, |
1406 | { .daddr = daddr, | 1486 | ip_reply_arg_flowi_flags(arg), |
1407 | .saddr = rt->rt_spec_dst, | 1487 | daddr, rt->rt_spec_dst, |
1408 | .tos = RT_TOS(ip_hdr(skb)->tos) } }, | 1488 | tcp_hdr(skb)->source, tcp_hdr(skb)->dest); |
1409 | /* Not quite clean, but right. */ | 1489 | security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); |
1410 | .uli_u = { .ports = | 1490 | rt = ip_route_output_key(sock_net(sk), &fl4); |
1411 | { .sport = tcp_hdr(skb)->dest, | 1491 | if (IS_ERR(rt)) |
1412 | .dport = tcp_hdr(skb)->source } }, | 1492 | return; |
1413 | .proto = sk->sk_protocol, | ||
1414 | .flags = ip_reply_arg_flowi_flags(arg) }; | ||
1415 | security_skb_classify_flow(skb, &fl); | ||
1416 | if (ip_route_output_key(sock_net(sk), &rt, &fl)) | ||
1417 | return; | ||
1418 | } | ||
1419 | 1493 | ||
1420 | /* And let IP do all the hard work. | 1494 | /* And let IP do all the hard work. |
1421 | 1495 | ||
@@ -1428,7 +1502,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar | |||
1428 | sk->sk_priority = skb->priority; | 1502 | sk->sk_priority = skb->priority; |
1429 | sk->sk_protocol = ip_hdr(skb)->protocol; | 1503 | sk->sk_protocol = ip_hdr(skb)->protocol; |
1430 | sk->sk_bound_dev_if = arg->bound_dev_if; | 1504 | sk->sk_bound_dev_if = arg->bound_dev_if; |
1431 | ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, | 1505 | ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, |
1432 | &ipc, &rt, MSG_DONTWAIT); | 1506 | &ipc, &rt, MSG_DONTWAIT); |
1433 | if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { | 1507 | if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { |
1434 | if (arg->csumoffset >= 0) | 1508 | if (arg->csumoffset >= 0) |
@@ -1436,7 +1510,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar | |||
1436 | arg->csumoffset) = csum_fold(csum_add(skb->csum, | 1510 | arg->csumoffset) = csum_fold(csum_add(skb->csum, |
1437 | arg->csum)); | 1511 | arg->csum)); |
1438 | skb->ip_summed = CHECKSUM_NONE; | 1512 | skb->ip_summed = CHECKSUM_NONE; |
1439 | ip_push_pending_frames(sk); | 1513 | ip_push_pending_frames(sk, &fl4); |
1440 | } | 1514 | } |
1441 | 1515 | ||
1442 | bh_unlock_sock(sk); | 1516 | bh_unlock_sock(sk); |
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 64b70ad162e3..ab0c9efd1efa 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c | |||
@@ -131,7 +131,7 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) | |||
131 | static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) | 131 | static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) |
132 | { | 132 | { |
133 | struct sockaddr_in sin; | 133 | struct sockaddr_in sin; |
134 | struct iphdr *iph = ip_hdr(skb); | 134 | const struct iphdr *iph = ip_hdr(skb); |
135 | __be16 *ports = (__be16 *)skb_transport_header(skb); | 135 | __be16 *ports = (__be16 *)skb_transport_header(skb); |
136 | 136 | ||
137 | if (skb_transport_offset(skb) + 4 > skb->len) | 137 | if (skb_transport_offset(skb) + 4 > skb->len) |
@@ -238,7 +238,7 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) | |||
238 | but receiver should be enough clever f.e. to forward mtrace requests, | 238 | but receiver should be enough clever f.e. to forward mtrace requests, |
239 | sent to multicast group to reach destination designated router. | 239 | sent to multicast group to reach destination designated router. |
240 | */ | 240 | */ |
241 | struct ip_ra_chain *ip_ra_chain; | 241 | struct ip_ra_chain __rcu *ip_ra_chain; |
242 | static DEFINE_SPINLOCK(ip_ra_lock); | 242 | static DEFINE_SPINLOCK(ip_ra_lock); |
243 | 243 | ||
244 | 244 | ||
@@ -253,7 +253,8 @@ static void ip_ra_destroy_rcu(struct rcu_head *head) | |||
253 | int ip_ra_control(struct sock *sk, unsigned char on, | 253 | int ip_ra_control(struct sock *sk, unsigned char on, |
254 | void (*destructor)(struct sock *)) | 254 | void (*destructor)(struct sock *)) |
255 | { | 255 | { |
256 | struct ip_ra_chain *ra, *new_ra, **rap; | 256 | struct ip_ra_chain *ra, *new_ra; |
257 | struct ip_ra_chain __rcu **rap; | ||
257 | 258 | ||
258 | if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) | 259 | if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) |
259 | return -EINVAL; | 260 | return -EINVAL; |
@@ -261,7 +262,10 @@ int ip_ra_control(struct sock *sk, unsigned char on, | |||
261 | new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; | 262 | new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; |
262 | 263 | ||
263 | spin_lock_bh(&ip_ra_lock); | 264 | spin_lock_bh(&ip_ra_lock); |
264 | for (rap = &ip_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { | 265 | for (rap = &ip_ra_chain; |
266 | (ra = rcu_dereference_protected(*rap, | ||
267 | lockdep_is_held(&ip_ra_lock))) != NULL; | ||
268 | rap = &ra->next) { | ||
265 | if (ra->sk == sk) { | 269 | if (ra->sk == sk) { |
266 | if (on) { | 270 | if (on) { |
267 | spin_unlock_bh(&ip_ra_lock); | 271 | spin_unlock_bh(&ip_ra_lock); |
@@ -447,6 +451,11 @@ out: | |||
447 | } | 451 | } |
448 | 452 | ||
449 | 453 | ||
454 | static void opt_kfree_rcu(struct rcu_head *head) | ||
455 | { | ||
456 | kfree(container_of(head, struct ip_options_rcu, rcu)); | ||
457 | } | ||
458 | |||
450 | /* | 459 | /* |
451 | * Socket option code for IP. This is the end of the line after any | 460 | * Socket option code for IP. This is the end of the line after any |
452 | * TCP,UDP etc options on an IP socket. | 461 | * TCP,UDP etc options on an IP socket. |
@@ -493,13 +502,16 @@ static int do_ip_setsockopt(struct sock *sk, int level, | |||
493 | switch (optname) { | 502 | switch (optname) { |
494 | case IP_OPTIONS: | 503 | case IP_OPTIONS: |
495 | { | 504 | { |
496 | struct ip_options *opt = NULL; | 505 | struct ip_options_rcu *old, *opt = NULL; |
506 | |||
497 | if (optlen > 40) | 507 | if (optlen > 40) |
498 | goto e_inval; | 508 | goto e_inval; |
499 | err = ip_options_get_from_user(sock_net(sk), &opt, | 509 | err = ip_options_get_from_user(sock_net(sk), &opt, |
500 | optval, optlen); | 510 | optval, optlen); |
501 | if (err) | 511 | if (err) |
502 | break; | 512 | break; |
513 | old = rcu_dereference_protected(inet->inet_opt, | ||
514 | sock_owned_by_user(sk)); | ||
503 | if (inet->is_icsk) { | 515 | if (inet->is_icsk) { |
504 | struct inet_connection_sock *icsk = inet_csk(sk); | 516 | struct inet_connection_sock *icsk = inet_csk(sk); |
505 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 517 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
@@ -508,17 +520,18 @@ static int do_ip_setsockopt(struct sock *sk, int level, | |||
508 | (TCPF_LISTEN | TCPF_CLOSE)) && | 520 | (TCPF_LISTEN | TCPF_CLOSE)) && |
509 | inet->inet_daddr != LOOPBACK4_IPV6)) { | 521 | inet->inet_daddr != LOOPBACK4_IPV6)) { |
510 | #endif | 522 | #endif |
511 | if (inet->opt) | 523 | if (old) |
512 | icsk->icsk_ext_hdr_len -= inet->opt->optlen; | 524 | icsk->icsk_ext_hdr_len -= old->opt.optlen; |
513 | if (opt) | 525 | if (opt) |
514 | icsk->icsk_ext_hdr_len += opt->optlen; | 526 | icsk->icsk_ext_hdr_len += opt->opt.optlen; |
515 | icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); | 527 | icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); |
516 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 528 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
517 | } | 529 | } |
518 | #endif | 530 | #endif |
519 | } | 531 | } |
520 | opt = xchg(&inet->opt, opt); | 532 | rcu_assign_pointer(inet->inet_opt, opt); |
521 | kfree(opt); | 533 | if (old) |
534 | call_rcu(&old->rcu, opt_kfree_rcu); | ||
522 | break; | 535 | break; |
523 | } | 536 | } |
524 | case IP_PKTINFO: | 537 | case IP_PKTINFO: |
@@ -1077,12 +1090,16 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, | |||
1077 | case IP_OPTIONS: | 1090 | case IP_OPTIONS: |
1078 | { | 1091 | { |
1079 | unsigned char optbuf[sizeof(struct ip_options)+40]; | 1092 | unsigned char optbuf[sizeof(struct ip_options)+40]; |
1080 | struct ip_options * opt = (struct ip_options *)optbuf; | 1093 | struct ip_options *opt = (struct ip_options *)optbuf; |
1094 | struct ip_options_rcu *inet_opt; | ||
1095 | |||
1096 | inet_opt = rcu_dereference_protected(inet->inet_opt, | ||
1097 | sock_owned_by_user(sk)); | ||
1081 | opt->optlen = 0; | 1098 | opt->optlen = 0; |
1082 | if (inet->opt) | 1099 | if (inet_opt) |
1083 | memcpy(optbuf, inet->opt, | 1100 | memcpy(optbuf, &inet_opt->opt, |
1084 | sizeof(struct ip_options)+ | 1101 | sizeof(struct ip_options) + |
1085 | inet->opt->optlen); | 1102 | inet_opt->opt.optlen); |
1086 | release_sock(sk); | 1103 | release_sock(sk); |
1087 | 1104 | ||
1088 | if (opt->optlen == 0) | 1105 | if (opt->optlen == 0) |
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 629067571f02..c857f6f49b03 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c | |||
@@ -27,7 +27,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) | |||
27 | { | 27 | { |
28 | struct net *net = dev_net(skb->dev); | 28 | struct net *net = dev_net(skb->dev); |
29 | __be32 spi; | 29 | __be32 spi; |
30 | struct iphdr *iph = (struct iphdr *)skb->data; | 30 | const struct iphdr *iph = (const struct iphdr *)skb->data; |
31 | struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); | 31 | struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); |
32 | struct xfrm_state *x; | 32 | struct xfrm_state *x; |
33 | 33 | ||
@@ -36,7 +36,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) | |||
36 | return; | 36 | return; |
37 | 37 | ||
38 | spi = htonl(ntohs(ipch->cpi)); | 38 | spi = htonl(ntohs(ipch->cpi)); |
39 | x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, | 39 | x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, |
40 | spi, IPPROTO_COMP, AF_INET); | 40 | spi, IPPROTO_COMP, AF_INET); |
41 | if (!x) | 41 | if (!x) |
42 | return; | 42 | return; |
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 3a6e1ec5e9ae..ab7e5542c1cf 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c | |||
@@ -87,8 +87,8 @@ | |||
87 | #endif | 87 | #endif |
88 | 88 | ||
89 | /* Define the friendly delay before and after opening net devices */ | 89 | /* Define the friendly delay before and after opening net devices */ |
90 | #define CONF_PRE_OPEN 500 /* Before opening: 1/2 second */ | 90 | #define CONF_POST_OPEN 10 /* After opening: 10 msecs */ |
91 | #define CONF_POST_OPEN 1 /* After opening: 1 second */ | 91 | #define CONF_CARRIER_TIMEOUT 120000 /* Wait for carrier timeout */ |
92 | 92 | ||
93 | /* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */ | 93 | /* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */ |
94 | #define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */ | 94 | #define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */ |
@@ -188,14 +188,14 @@ struct ic_device { | |||
188 | static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ | 188 | static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ |
189 | static struct net_device *ic_dev __initdata = NULL; /* Selected device */ | 189 | static struct net_device *ic_dev __initdata = NULL; /* Selected device */ |
190 | 190 | ||
191 | static bool __init ic_device_match(struct net_device *dev) | 191 | static bool __init ic_is_init_dev(struct net_device *dev) |
192 | { | 192 | { |
193 | if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : | 193 | if (dev->flags & IFF_LOOPBACK) |
194 | return false; | ||
195 | return user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : | ||
194 | (!(dev->flags & IFF_LOOPBACK) && | 196 | (!(dev->flags & IFF_LOOPBACK) && |
195 | (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) && | 197 | (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) && |
196 | strncmp(dev->name, "dummy", 5))) | 198 | strncmp(dev->name, "dummy", 5)); |
197 | return true; | ||
198 | return false; | ||
199 | } | 199 | } |
200 | 200 | ||
201 | static int __init ic_open_devs(void) | 201 | static int __init ic_open_devs(void) |
@@ -203,6 +203,7 @@ static int __init ic_open_devs(void) | |||
203 | struct ic_device *d, **last; | 203 | struct ic_device *d, **last; |
204 | struct net_device *dev; | 204 | struct net_device *dev; |
205 | unsigned short oflags; | 205 | unsigned short oflags; |
206 | unsigned long start; | ||
206 | 207 | ||
207 | last = &ic_first_dev; | 208 | last = &ic_first_dev; |
208 | rtnl_lock(); | 209 | rtnl_lock(); |
@@ -216,9 +217,7 @@ static int __init ic_open_devs(void) | |||
216 | } | 217 | } |
217 | 218 | ||
218 | for_each_netdev(&init_net, dev) { | 219 | for_each_netdev(&init_net, dev) { |
219 | if (dev->flags & IFF_LOOPBACK) | 220 | if (ic_is_init_dev(dev)) { |
220 | continue; | ||
221 | if (ic_device_match(dev)) { | ||
222 | int able = 0; | 221 | int able = 0; |
223 | if (dev->mtu >= 364) | 222 | if (dev->mtu >= 364) |
224 | able |= IC_BOOTP; | 223 | able |= IC_BOOTP; |
@@ -252,6 +251,17 @@ static int __init ic_open_devs(void) | |||
252 | dev->name, able, d->xid)); | 251 | dev->name, able, d->xid)); |
253 | } | 252 | } |
254 | } | 253 | } |
254 | |||
255 | /* wait for a carrier on at least one device */ | ||
256 | start = jiffies; | ||
257 | while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) { | ||
258 | for_each_netdev(&init_net, dev) | ||
259 | if (ic_is_init_dev(dev) && netif_carrier_ok(dev)) | ||
260 | goto have_carrier; | ||
261 | |||
262 | msleep(1); | ||
263 | } | ||
264 | have_carrier: | ||
255 | rtnl_unlock(); | 265 | rtnl_unlock(); |
256 | 266 | ||
257 | *last = NULL; | 267 | *last = NULL; |
@@ -1191,13 +1201,13 @@ static int __init ic_dynamic(void) | |||
1191 | (ic_proto_enabled & IC_USE_DHCP) && | 1201 | (ic_proto_enabled & IC_USE_DHCP) && |
1192 | ic_dhcp_msgtype != DHCPACK) { | 1202 | ic_dhcp_msgtype != DHCPACK) { |
1193 | ic_got_reply = 0; | 1203 | ic_got_reply = 0; |
1194 | printk(","); | 1204 | printk(KERN_CONT ","); |
1195 | continue; | 1205 | continue; |
1196 | } | 1206 | } |
1197 | #endif /* IPCONFIG_DHCP */ | 1207 | #endif /* IPCONFIG_DHCP */ |
1198 | 1208 | ||
1199 | if (ic_got_reply) { | 1209 | if (ic_got_reply) { |
1200 | printk(" OK\n"); | 1210 | printk(KERN_CONT " OK\n"); |
1201 | break; | 1211 | break; |
1202 | } | 1212 | } |
1203 | 1213 | ||
@@ -1205,7 +1215,7 @@ static int __init ic_dynamic(void) | |||
1205 | continue; | 1215 | continue; |
1206 | 1216 | ||
1207 | if (! --retries) { | 1217 | if (! --retries) { |
1208 | printk(" timed out!\n"); | 1218 | printk(KERN_CONT " timed out!\n"); |
1209 | break; | 1219 | break; |
1210 | } | 1220 | } |
1211 | 1221 | ||
@@ -1215,7 +1225,7 @@ static int __init ic_dynamic(void) | |||
1215 | if (timeout > CONF_TIMEOUT_MAX) | 1225 | if (timeout > CONF_TIMEOUT_MAX) |
1216 | timeout = CONF_TIMEOUT_MAX; | 1226 | timeout = CONF_TIMEOUT_MAX; |
1217 | 1227 | ||
1218 | printk("."); | 1228 | printk(KERN_CONT "."); |
1219 | } | 1229 | } |
1220 | 1230 | ||
1221 | #ifdef IPCONFIG_BOOTP | 1231 | #ifdef IPCONFIG_BOOTP |
@@ -1236,7 +1246,7 @@ static int __init ic_dynamic(void) | |||
1236 | ((ic_got_reply & IC_RARP) ? "RARP" | 1246 | ((ic_got_reply & IC_RARP) ? "RARP" |
1237 | : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"), | 1247 | : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"), |
1238 | &ic_servaddr); | 1248 | &ic_servaddr); |
1239 | printk("my address is %pI4\n", &ic_myaddr); | 1249 | printk(KERN_CONT "my address is %pI4\n", &ic_myaddr); |
1240 | 1250 | ||
1241 | return 0; | 1251 | return 0; |
1242 | } | 1252 | } |
@@ -1324,14 +1334,13 @@ static int __init wait_for_devices(void) | |||
1324 | { | 1334 | { |
1325 | int i; | 1335 | int i; |
1326 | 1336 | ||
1327 | msleep(CONF_PRE_OPEN); | ||
1328 | for (i = 0; i < DEVICE_WAIT_MAX; i++) { | 1337 | for (i = 0; i < DEVICE_WAIT_MAX; i++) { |
1329 | struct net_device *dev; | 1338 | struct net_device *dev; |
1330 | int found = 0; | 1339 | int found = 0; |
1331 | 1340 | ||
1332 | rtnl_lock(); | 1341 | rtnl_lock(); |
1333 | for_each_netdev(&init_net, dev) { | 1342 | for_each_netdev(&init_net, dev) { |
1334 | if (ic_device_match(dev)) { | 1343 | if (ic_is_init_dev(dev)) { |
1335 | found = 1; | 1344 | found = 1; |
1336 | break; | 1345 | break; |
1337 | } | 1346 | } |
@@ -1378,7 +1387,7 @@ static int __init ip_auto_config(void) | |||
1378 | return err; | 1387 | return err; |
1379 | 1388 | ||
1380 | /* Give drivers a chance to settle */ | 1389 | /* Give drivers a chance to settle */ |
1381 | ssleep(CONF_POST_OPEN); | 1390 | msleep(CONF_POST_OPEN); |
1382 | 1391 | ||
1383 | /* | 1392 | /* |
1384 | * If the config information is insufficient (e.g., our IP address or | 1393 | * If the config information is insufficient (e.g., our IP address or |
@@ -1444,7 +1453,7 @@ static int __init ip_auto_config(void) | |||
1444 | root_server_addr = addr; | 1453 | root_server_addr = addr; |
1445 | 1454 | ||
1446 | /* | 1455 | /* |
1447 | * Use defaults whereever applicable. | 1456 | * Use defaults wherever applicable. |
1448 | */ | 1457 | */ |
1449 | if (ic_defaults() < 0) | 1458 | if (ic_defaults() < 0) |
1450 | return -1; | 1459 | return -1; |
@@ -1468,19 +1477,19 @@ static int __init ip_auto_config(void) | |||
1468 | /* | 1477 | /* |
1469 | * Clue in the operator. | 1478 | * Clue in the operator. |
1470 | */ | 1479 | */ |
1471 | printk("IP-Config: Complete:"); | 1480 | printk("IP-Config: Complete:\n"); |
1472 | printk("\n device=%s", ic_dev->name); | 1481 | printk(" device=%s", ic_dev->name); |
1473 | printk(", addr=%pI4", &ic_myaddr); | 1482 | printk(KERN_CONT ", addr=%pI4", &ic_myaddr); |
1474 | printk(", mask=%pI4", &ic_netmask); | 1483 | printk(KERN_CONT ", mask=%pI4", &ic_netmask); |
1475 | printk(", gw=%pI4", &ic_gateway); | 1484 | printk(KERN_CONT ", gw=%pI4", &ic_gateway); |
1476 | printk(",\n host=%s, domain=%s, nis-domain=%s", | 1485 | printk(KERN_CONT ",\n host=%s, domain=%s, nis-domain=%s", |
1477 | utsname()->nodename, ic_domain, utsname()->domainname); | 1486 | utsname()->nodename, ic_domain, utsname()->domainname); |
1478 | printk(",\n bootserver=%pI4", &ic_servaddr); | 1487 | printk(KERN_CONT ",\n bootserver=%pI4", &ic_servaddr); |
1479 | printk(", rootserver=%pI4", &root_server_addr); | 1488 | printk(KERN_CONT ", rootserver=%pI4", &root_server_addr); |
1480 | printk(", rootpath=%s", root_server_path); | 1489 | printk(KERN_CONT ", rootpath=%s", root_server_path); |
1481 | if (ic_dev_mtu) | 1490 | if (ic_dev_mtu) |
1482 | printk(", mtu=%d", ic_dev_mtu); | 1491 | printk(KERN_CONT ", mtu=%d", ic_dev_mtu); |
1483 | printk("\n"); | 1492 | printk(KERN_CONT "\n"); |
1484 | #endif /* !SILENT */ | 1493 | #endif /* !SILENT */ |
1485 | 1494 | ||
1486 | return 0; | 1495 | return 0; |
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index ec036731a70b..378b20b7ca6e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c | |||
@@ -122,31 +122,59 @@ | |||
122 | 122 | ||
123 | static int ipip_net_id __read_mostly; | 123 | static int ipip_net_id __read_mostly; |
124 | struct ipip_net { | 124 | struct ipip_net { |
125 | struct ip_tunnel *tunnels_r_l[HASH_SIZE]; | 125 | struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; |
126 | struct ip_tunnel *tunnels_r[HASH_SIZE]; | 126 | struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; |
127 | struct ip_tunnel *tunnels_l[HASH_SIZE]; | 127 | struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; |
128 | struct ip_tunnel *tunnels_wc[1]; | 128 | struct ip_tunnel __rcu *tunnels_wc[1]; |
129 | struct ip_tunnel **tunnels[4]; | 129 | struct ip_tunnel __rcu **tunnels[4]; |
130 | 130 | ||
131 | struct net_device *fb_tunnel_dev; | 131 | struct net_device *fb_tunnel_dev; |
132 | }; | 132 | }; |
133 | 133 | ||
134 | static void ipip_tunnel_init(struct net_device *dev); | 134 | static int ipip_tunnel_init(struct net_device *dev); |
135 | static void ipip_tunnel_setup(struct net_device *dev); | 135 | static void ipip_tunnel_setup(struct net_device *dev); |
136 | static void ipip_dev_free(struct net_device *dev); | ||
136 | 137 | ||
137 | /* | 138 | /* |
138 | * Locking : hash tables are protected by RCU and a spinlock | 139 | * Locking : hash tables are protected by RCU and RTNL |
139 | */ | 140 | */ |
140 | static DEFINE_SPINLOCK(ipip_lock); | ||
141 | 141 | ||
142 | #define for_each_ip_tunnel_rcu(start) \ | 142 | #define for_each_ip_tunnel_rcu(start) \ |
143 | for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) | 143 | for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) |
144 | 144 | ||
145 | /* often modified stats are per cpu, other are shared (netdev->stats) */ | ||
146 | struct pcpu_tstats { | ||
147 | unsigned long rx_packets; | ||
148 | unsigned long rx_bytes; | ||
149 | unsigned long tx_packets; | ||
150 | unsigned long tx_bytes; | ||
151 | }; | ||
152 | |||
153 | static struct net_device_stats *ipip_get_stats(struct net_device *dev) | ||
154 | { | ||
155 | struct pcpu_tstats sum = { 0 }; | ||
156 | int i; | ||
157 | |||
158 | for_each_possible_cpu(i) { | ||
159 | const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); | ||
160 | |||
161 | sum.rx_packets += tstats->rx_packets; | ||
162 | sum.rx_bytes += tstats->rx_bytes; | ||
163 | sum.tx_packets += tstats->tx_packets; | ||
164 | sum.tx_bytes += tstats->tx_bytes; | ||
165 | } | ||
166 | dev->stats.rx_packets = sum.rx_packets; | ||
167 | dev->stats.rx_bytes = sum.rx_bytes; | ||
168 | dev->stats.tx_packets = sum.tx_packets; | ||
169 | dev->stats.tx_bytes = sum.tx_bytes; | ||
170 | return &dev->stats; | ||
171 | } | ||
172 | |||
145 | static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, | 173 | static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, |
146 | __be32 remote, __be32 local) | 174 | __be32 remote, __be32 local) |
147 | { | 175 | { |
148 | unsigned h0 = HASH(remote); | 176 | unsigned int h0 = HASH(remote); |
149 | unsigned h1 = HASH(local); | 177 | unsigned int h1 = HASH(local); |
150 | struct ip_tunnel *t; | 178 | struct ip_tunnel *t; |
151 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | 179 | struct ipip_net *ipn = net_generic(net, ipip_net_id); |
152 | 180 | ||
@@ -169,12 +197,12 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, | |||
169 | return NULL; | 197 | return NULL; |
170 | } | 198 | } |
171 | 199 | ||
172 | static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn, | 200 | static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn, |
173 | struct ip_tunnel_parm *parms) | 201 | struct ip_tunnel_parm *parms) |
174 | { | 202 | { |
175 | __be32 remote = parms->iph.daddr; | 203 | __be32 remote = parms->iph.daddr; |
176 | __be32 local = parms->iph.saddr; | 204 | __be32 local = parms->iph.saddr; |
177 | unsigned h = 0; | 205 | unsigned int h = 0; |
178 | int prio = 0; | 206 | int prio = 0; |
179 | 207 | ||
180 | if (remote) { | 208 | if (remote) { |
@@ -188,7 +216,7 @@ static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn, | |||
188 | return &ipn->tunnels[prio][h]; | 216 | return &ipn->tunnels[prio][h]; |
189 | } | 217 | } |
190 | 218 | ||
191 | static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn, | 219 | static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn, |
192 | struct ip_tunnel *t) | 220 | struct ip_tunnel *t) |
193 | { | 221 | { |
194 | return __ipip_bucket(ipn, &t->parms); | 222 | return __ipip_bucket(ipn, &t->parms); |
@@ -196,13 +224,14 @@ static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn, | |||
196 | 224 | ||
197 | static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) | 225 | static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) |
198 | { | 226 | { |
199 | struct ip_tunnel **tp; | 227 | struct ip_tunnel __rcu **tp; |
200 | 228 | struct ip_tunnel *iter; | |
201 | for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) { | 229 | |
202 | if (t == *tp) { | 230 | for (tp = ipip_bucket(ipn, t); |
203 | spin_lock_bh(&ipip_lock); | 231 | (iter = rtnl_dereference(*tp)) != NULL; |
204 | *tp = t->next; | 232 | tp = &iter->next) { |
205 | spin_unlock_bh(&ipip_lock); | 233 | if (t == iter) { |
234 | rcu_assign_pointer(*tp, t->next); | ||
206 | break; | 235 | break; |
207 | } | 236 | } |
208 | } | 237 | } |
@@ -210,12 +239,10 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) | |||
210 | 239 | ||
211 | static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) | 240 | static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) |
212 | { | 241 | { |
213 | struct ip_tunnel **tp = ipip_bucket(ipn, t); | 242 | struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t); |
214 | 243 | ||
215 | spin_lock_bh(&ipip_lock); | 244 | rcu_assign_pointer(t->next, rtnl_dereference(*tp)); |
216 | t->next = *tp; | ||
217 | rcu_assign_pointer(*tp, t); | 245 | rcu_assign_pointer(*tp, t); |
218 | spin_unlock_bh(&ipip_lock); | ||
219 | } | 246 | } |
220 | 247 | ||
221 | static struct ip_tunnel * ipip_tunnel_locate(struct net *net, | 248 | static struct ip_tunnel * ipip_tunnel_locate(struct net *net, |
@@ -223,12 +250,15 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, | |||
223 | { | 250 | { |
224 | __be32 remote = parms->iph.daddr; | 251 | __be32 remote = parms->iph.daddr; |
225 | __be32 local = parms->iph.saddr; | 252 | __be32 local = parms->iph.saddr; |
226 | struct ip_tunnel *t, **tp, *nt; | 253 | struct ip_tunnel *t, *nt; |
254 | struct ip_tunnel __rcu **tp; | ||
227 | struct net_device *dev; | 255 | struct net_device *dev; |
228 | char name[IFNAMSIZ]; | 256 | char name[IFNAMSIZ]; |
229 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | 257 | struct ipip_net *ipn = net_generic(net, ipip_net_id); |
230 | 258 | ||
231 | for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) { | 259 | for (tp = __ipip_bucket(ipn, parms); |
260 | (t = rtnl_dereference(*tp)) != NULL; | ||
261 | tp = &t->next) { | ||
232 | if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) | 262 | if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) |
233 | return t; | 263 | return t; |
234 | } | 264 | } |
@@ -238,7 +268,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, | |||
238 | if (parms->name[0]) | 268 | if (parms->name[0]) |
239 | strlcpy(name, parms->name, IFNAMSIZ); | 269 | strlcpy(name, parms->name, IFNAMSIZ); |
240 | else | 270 | else |
241 | sprintf(name, "tunl%%d"); | 271 | strcpy(name, "tunl%d"); |
242 | 272 | ||
243 | dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); | 273 | dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); |
244 | if (dev == NULL) | 274 | if (dev == NULL) |
@@ -246,15 +276,11 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, | |||
246 | 276 | ||
247 | dev_net_set(dev, net); | 277 | dev_net_set(dev, net); |
248 | 278 | ||
249 | if (strchr(name, '%')) { | ||
250 | if (dev_alloc_name(dev, name) < 0) | ||
251 | goto failed_free; | ||
252 | } | ||
253 | |||
254 | nt = netdev_priv(dev); | 279 | nt = netdev_priv(dev); |
255 | nt->parms = *parms; | 280 | nt->parms = *parms; |
256 | 281 | ||
257 | ipip_tunnel_init(dev); | 282 | if (ipip_tunnel_init(dev) < 0) |
283 | goto failed_free; | ||
258 | 284 | ||
259 | if (register_netdevice(dev) < 0) | 285 | if (register_netdevice(dev) < 0) |
260 | goto failed_free; | 286 | goto failed_free; |
@@ -264,20 +290,19 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, | |||
264 | return nt; | 290 | return nt; |
265 | 291 | ||
266 | failed_free: | 292 | failed_free: |
267 | free_netdev(dev); | 293 | ipip_dev_free(dev); |
268 | return NULL; | 294 | return NULL; |
269 | } | 295 | } |
270 | 296 | ||
297 | /* called with RTNL */ | ||
271 | static void ipip_tunnel_uninit(struct net_device *dev) | 298 | static void ipip_tunnel_uninit(struct net_device *dev) |
272 | { | 299 | { |
273 | struct net *net = dev_net(dev); | 300 | struct net *net = dev_net(dev); |
274 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | 301 | struct ipip_net *ipn = net_generic(net, ipip_net_id); |
275 | 302 | ||
276 | if (dev == ipn->fb_tunnel_dev) { | 303 | if (dev == ipn->fb_tunnel_dev) |
277 | spin_lock_bh(&ipip_lock); | 304 | rcu_assign_pointer(ipn->tunnels_wc[0], NULL); |
278 | ipn->tunnels_wc[0] = NULL; | 305 | else |
279 | spin_unlock_bh(&ipip_lock); | ||
280 | } else | ||
281 | ipip_tunnel_unlink(ipn, netdev_priv(dev)); | 306 | ipip_tunnel_unlink(ipn, netdev_priv(dev)); |
282 | dev_put(dev); | 307 | dev_put(dev); |
283 | } | 308 | } |
@@ -289,7 +314,7 @@ static int ipip_err(struct sk_buff *skb, u32 info) | |||
289 | 8 bytes of packet payload. It means, that precise relaying of | 314 | 8 bytes of packet payload. It means, that precise relaying of |
290 | ICMP in the real Internet is absolutely infeasible. | 315 | ICMP in the real Internet is absolutely infeasible. |
291 | */ | 316 | */ |
292 | struct iphdr *iph = (struct iphdr *)skb->data; | 317 | const struct iphdr *iph = (const struct iphdr *)skb->data; |
293 | const int type = icmp_hdr(skb)->type; | 318 | const int type = icmp_hdr(skb)->type; |
294 | const int code = icmp_hdr(skb)->code; | 319 | const int code = icmp_hdr(skb)->code; |
295 | struct ip_tunnel *t; | 320 | struct ip_tunnel *t; |
@@ -359,8 +384,10 @@ static int ipip_rcv(struct sk_buff *skb) | |||
359 | const struct iphdr *iph = ip_hdr(skb); | 384 | const struct iphdr *iph = ip_hdr(skb); |
360 | 385 | ||
361 | rcu_read_lock(); | 386 | rcu_read_lock(); |
362 | if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev), | 387 | tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); |
363 | iph->saddr, iph->daddr)) != NULL) { | 388 | if (tunnel != NULL) { |
389 | struct pcpu_tstats *tstats; | ||
390 | |||
364 | if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { | 391 | if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { |
365 | rcu_read_unlock(); | 392 | rcu_read_unlock(); |
366 | kfree_skb(skb); | 393 | kfree_skb(skb); |
@@ -374,10 +401,16 @@ static int ipip_rcv(struct sk_buff *skb) | |||
374 | skb->protocol = htons(ETH_P_IP); | 401 | skb->protocol = htons(ETH_P_IP); |
375 | skb->pkt_type = PACKET_HOST; | 402 | skb->pkt_type = PACKET_HOST; |
376 | 403 | ||
377 | skb_tunnel_rx(skb, tunnel->dev); | 404 | tstats = this_cpu_ptr(tunnel->dev->tstats); |
405 | tstats->rx_packets++; | ||
406 | tstats->rx_bytes += skb->len; | ||
407 | |||
408 | __skb_tunnel_rx(skb, tunnel->dev); | ||
378 | 409 | ||
379 | ipip_ecn_decapsulate(iph, skb); | 410 | ipip_ecn_decapsulate(iph, skb); |
411 | |||
380 | netif_rx(skb); | 412 | netif_rx(skb); |
413 | |||
381 | rcu_read_unlock(); | 414 | rcu_read_unlock(); |
382 | return 0; | 415 | return 0; |
383 | } | 416 | } |
@@ -394,52 +427,49 @@ static int ipip_rcv(struct sk_buff *skb) | |||
394 | static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | 427 | static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) |
395 | { | 428 | { |
396 | struct ip_tunnel *tunnel = netdev_priv(dev); | 429 | struct ip_tunnel *tunnel = netdev_priv(dev); |
397 | struct net_device_stats *stats = &dev->stats; | 430 | struct pcpu_tstats *tstats; |
398 | struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); | 431 | const struct iphdr *tiph = &tunnel->parms.iph; |
399 | struct iphdr *tiph = &tunnel->parms.iph; | ||
400 | u8 tos = tunnel->parms.iph.tos; | 432 | u8 tos = tunnel->parms.iph.tos; |
401 | __be16 df = tiph->frag_off; | 433 | __be16 df = tiph->frag_off; |
402 | struct rtable *rt; /* Route to the other host */ | 434 | struct rtable *rt; /* Route to the other host */ |
403 | struct net_device *tdev; /* Device to other host */ | 435 | struct net_device *tdev; /* Device to other host */ |
404 | struct iphdr *old_iph = ip_hdr(skb); | 436 | const struct iphdr *old_iph = ip_hdr(skb); |
405 | struct iphdr *iph; /* Our new IP header */ | 437 | struct iphdr *iph; /* Our new IP header */ |
406 | unsigned int max_headroom; /* The extra header space needed */ | 438 | unsigned int max_headroom; /* The extra header space needed */ |
407 | __be32 dst = tiph->daddr; | 439 | __be32 dst = tiph->daddr; |
440 | struct flowi4 fl4; | ||
408 | int mtu; | 441 | int mtu; |
409 | 442 | ||
410 | if (skb->protocol != htons(ETH_P_IP)) | 443 | if (skb->protocol != htons(ETH_P_IP)) |
411 | goto tx_error; | 444 | goto tx_error; |
412 | 445 | ||
413 | if (tos&1) | 446 | if (tos & 1) |
414 | tos = old_iph->tos; | 447 | tos = old_iph->tos; |
415 | 448 | ||
416 | if (!dst) { | 449 | if (!dst) { |
417 | /* NBMA tunnel */ | 450 | /* NBMA tunnel */ |
418 | if ((rt = skb_rtable(skb)) == NULL) { | 451 | if ((rt = skb_rtable(skb)) == NULL) { |
419 | stats->tx_fifo_errors++; | 452 | dev->stats.tx_fifo_errors++; |
420 | goto tx_error; | 453 | goto tx_error; |
421 | } | 454 | } |
422 | if ((dst = rt->rt_gateway) == 0) | 455 | if ((dst = rt->rt_gateway) == 0) |
423 | goto tx_error_icmp; | 456 | goto tx_error_icmp; |
424 | } | 457 | } |
425 | 458 | ||
426 | { | 459 | rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, |
427 | struct flowi fl = { .oif = tunnel->parms.link, | 460 | dst, tiph->saddr, |
428 | .nl_u = { .ip4_u = | 461 | 0, 0, |
429 | { .daddr = dst, | 462 | IPPROTO_IPIP, RT_TOS(tos), |
430 | .saddr = tiph->saddr, | 463 | tunnel->parms.link); |
431 | .tos = RT_TOS(tos) } }, | 464 | if (IS_ERR(rt)) { |
432 | .proto = IPPROTO_IPIP }; | 465 | dev->stats.tx_carrier_errors++; |
433 | if (ip_route_output_key(dev_net(dev), &rt, &fl)) { | 466 | goto tx_error_icmp; |
434 | stats->tx_carrier_errors++; | ||
435 | goto tx_error_icmp; | ||
436 | } | ||
437 | } | 467 | } |
438 | tdev = rt->dst.dev; | 468 | tdev = rt->dst.dev; |
439 | 469 | ||
440 | if (tdev == dev) { | 470 | if (tdev == dev) { |
441 | ip_rt_put(rt); | 471 | ip_rt_put(rt); |
442 | stats->collisions++; | 472 | dev->stats.collisions++; |
443 | goto tx_error; | 473 | goto tx_error; |
444 | } | 474 | } |
445 | 475 | ||
@@ -449,7 +479,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | |||
449 | mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); | 479 | mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); |
450 | 480 | ||
451 | if (mtu < 68) { | 481 | if (mtu < 68) { |
452 | stats->collisions++; | 482 | dev->stats.collisions++; |
453 | ip_rt_put(rt); | 483 | ip_rt_put(rt); |
454 | goto tx_error; | 484 | goto tx_error; |
455 | } | 485 | } |
@@ -485,7 +515,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | |||
485 | struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); | 515 | struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); |
486 | if (!new_skb) { | 516 | if (!new_skb) { |
487 | ip_rt_put(rt); | 517 | ip_rt_put(rt); |
488 | txq->tx_dropped++; | 518 | dev->stats.tx_dropped++; |
489 | dev_kfree_skb(skb); | 519 | dev_kfree_skb(skb); |
490 | return NETDEV_TX_OK; | 520 | return NETDEV_TX_OK; |
491 | } | 521 | } |
@@ -515,21 +545,21 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | |||
515 | iph->frag_off = df; | 545 | iph->frag_off = df; |
516 | iph->protocol = IPPROTO_IPIP; | 546 | iph->protocol = IPPROTO_IPIP; |
517 | iph->tos = INET_ECN_encapsulate(tos, old_iph->tos); | 547 | iph->tos = INET_ECN_encapsulate(tos, old_iph->tos); |
518 | iph->daddr = rt->rt_dst; | 548 | iph->daddr = fl4.daddr; |
519 | iph->saddr = rt->rt_src; | 549 | iph->saddr = fl4.saddr; |
520 | 550 | ||
521 | if ((iph->ttl = tiph->ttl) == 0) | 551 | if ((iph->ttl = tiph->ttl) == 0) |
522 | iph->ttl = old_iph->ttl; | 552 | iph->ttl = old_iph->ttl; |
523 | 553 | ||
524 | nf_reset(skb); | 554 | nf_reset(skb); |
525 | 555 | tstats = this_cpu_ptr(dev->tstats); | |
526 | IPTUNNEL_XMIT(); | 556 | __IPTUNNEL_XMIT(tstats, &dev->stats); |
527 | return NETDEV_TX_OK; | 557 | return NETDEV_TX_OK; |
528 | 558 | ||
529 | tx_error_icmp: | 559 | tx_error_icmp: |
530 | dst_link_failure(skb); | 560 | dst_link_failure(skb); |
531 | tx_error: | 561 | tx_error: |
532 | stats->tx_errors++; | 562 | dev->stats.tx_errors++; |
533 | dev_kfree_skb(skb); | 563 | dev_kfree_skb(skb); |
534 | return NETDEV_TX_OK; | 564 | return NETDEV_TX_OK; |
535 | } | 565 | } |
@@ -538,20 +568,22 @@ static void ipip_tunnel_bind_dev(struct net_device *dev) | |||
538 | { | 568 | { |
539 | struct net_device *tdev = NULL; | 569 | struct net_device *tdev = NULL; |
540 | struct ip_tunnel *tunnel; | 570 | struct ip_tunnel *tunnel; |
541 | struct iphdr *iph; | 571 | const struct iphdr *iph; |
542 | 572 | ||
543 | tunnel = netdev_priv(dev); | 573 | tunnel = netdev_priv(dev); |
544 | iph = &tunnel->parms.iph; | 574 | iph = &tunnel->parms.iph; |
545 | 575 | ||
546 | if (iph->daddr) { | 576 | if (iph->daddr) { |
547 | struct flowi fl = { .oif = tunnel->parms.link, | ||
548 | .nl_u = { .ip4_u = | ||
549 | { .daddr = iph->daddr, | ||
550 | .saddr = iph->saddr, | ||
551 | .tos = RT_TOS(iph->tos) } }, | ||
552 | .proto = IPPROTO_IPIP }; | ||
553 | struct rtable *rt; | 577 | struct rtable *rt; |
554 | if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { | 578 | struct flowi4 fl4; |
579 | |||
580 | rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, | ||
581 | iph->daddr, iph->saddr, | ||
582 | 0, 0, | ||
583 | IPPROTO_IPIP, | ||
584 | RT_TOS(iph->tos), | ||
585 | tunnel->parms.link); | ||
586 | if (!IS_ERR(rt)) { | ||
555 | tdev = rt->dst.dev; | 587 | tdev = rt->dst.dev; |
556 | ip_rt_put(rt); | 588 | ip_rt_put(rt); |
557 | } | 589 | } |
@@ -627,6 +659,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) | |||
627 | } | 659 | } |
628 | t = netdev_priv(dev); | 660 | t = netdev_priv(dev); |
629 | ipip_tunnel_unlink(ipn, t); | 661 | ipip_tunnel_unlink(ipn, t); |
662 | synchronize_net(); | ||
630 | t->parms.iph.saddr = p.iph.saddr; | 663 | t->parms.iph.saddr = p.iph.saddr; |
631 | t->parms.iph.daddr = p.iph.daddr; | 664 | t->parms.iph.daddr = p.iph.daddr; |
632 | memcpy(dev->dev_addr, &p.iph.saddr, 4); | 665 | memcpy(dev->dev_addr, &p.iph.saddr, 4); |
@@ -696,13 +729,19 @@ static const struct net_device_ops ipip_netdev_ops = { | |||
696 | .ndo_start_xmit = ipip_tunnel_xmit, | 729 | .ndo_start_xmit = ipip_tunnel_xmit, |
697 | .ndo_do_ioctl = ipip_tunnel_ioctl, | 730 | .ndo_do_ioctl = ipip_tunnel_ioctl, |
698 | .ndo_change_mtu = ipip_tunnel_change_mtu, | 731 | .ndo_change_mtu = ipip_tunnel_change_mtu, |
699 | 732 | .ndo_get_stats = ipip_get_stats, | |
700 | }; | 733 | }; |
701 | 734 | ||
735 | static void ipip_dev_free(struct net_device *dev) | ||
736 | { | ||
737 | free_percpu(dev->tstats); | ||
738 | free_netdev(dev); | ||
739 | } | ||
740 | |||
702 | static void ipip_tunnel_setup(struct net_device *dev) | 741 | static void ipip_tunnel_setup(struct net_device *dev) |
703 | { | 742 | { |
704 | dev->netdev_ops = &ipip_netdev_ops; | 743 | dev->netdev_ops = &ipip_netdev_ops; |
705 | dev->destructor = free_netdev; | 744 | dev->destructor = ipip_dev_free; |
706 | 745 | ||
707 | dev->type = ARPHRD_TUNNEL; | 746 | dev->type = ARPHRD_TUNNEL; |
708 | dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); | 747 | dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); |
@@ -711,10 +750,11 @@ static void ipip_tunnel_setup(struct net_device *dev) | |||
711 | dev->iflink = 0; | 750 | dev->iflink = 0; |
712 | dev->addr_len = 4; | 751 | dev->addr_len = 4; |
713 | dev->features |= NETIF_F_NETNS_LOCAL; | 752 | dev->features |= NETIF_F_NETNS_LOCAL; |
753 | dev->features |= NETIF_F_LLTX; | ||
714 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | 754 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; |
715 | } | 755 | } |
716 | 756 | ||
717 | static void ipip_tunnel_init(struct net_device *dev) | 757 | static int ipip_tunnel_init(struct net_device *dev) |
718 | { | 758 | { |
719 | struct ip_tunnel *tunnel = netdev_priv(dev); | 759 | struct ip_tunnel *tunnel = netdev_priv(dev); |
720 | 760 | ||
@@ -725,9 +765,15 @@ static void ipip_tunnel_init(struct net_device *dev) | |||
725 | memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); | 765 | memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); |
726 | 766 | ||
727 | ipip_tunnel_bind_dev(dev); | 767 | ipip_tunnel_bind_dev(dev); |
768 | |||
769 | dev->tstats = alloc_percpu(struct pcpu_tstats); | ||
770 | if (!dev->tstats) | ||
771 | return -ENOMEM; | ||
772 | |||
773 | return 0; | ||
728 | } | 774 | } |
729 | 775 | ||
730 | static void __net_init ipip_fb_tunnel_init(struct net_device *dev) | 776 | static int __net_init ipip_fb_tunnel_init(struct net_device *dev) |
731 | { | 777 | { |
732 | struct ip_tunnel *tunnel = netdev_priv(dev); | 778 | struct ip_tunnel *tunnel = netdev_priv(dev); |
733 | struct iphdr *iph = &tunnel->parms.iph; | 779 | struct iphdr *iph = &tunnel->parms.iph; |
@@ -740,11 +786,16 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev) | |||
740 | iph->protocol = IPPROTO_IPIP; | 786 | iph->protocol = IPPROTO_IPIP; |
741 | iph->ihl = 5; | 787 | iph->ihl = 5; |
742 | 788 | ||
789 | dev->tstats = alloc_percpu(struct pcpu_tstats); | ||
790 | if (!dev->tstats) | ||
791 | return -ENOMEM; | ||
792 | |||
743 | dev_hold(dev); | 793 | dev_hold(dev); |
744 | ipn->tunnels_wc[0] = tunnel; | 794 | rcu_assign_pointer(ipn->tunnels_wc[0], tunnel); |
795 | return 0; | ||
745 | } | 796 | } |
746 | 797 | ||
747 | static struct xfrm_tunnel ipip_handler = { | 798 | static struct xfrm_tunnel ipip_handler __read_mostly = { |
748 | .handler = ipip_rcv, | 799 | .handler = ipip_rcv, |
749 | .err_handler = ipip_err, | 800 | .err_handler = ipip_err, |
750 | .priority = 1, | 801 | .priority = 1, |
@@ -760,11 +811,12 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head) | |||
760 | for (prio = 1; prio < 4; prio++) { | 811 | for (prio = 1; prio < 4; prio++) { |
761 | int h; | 812 | int h; |
762 | for (h = 0; h < HASH_SIZE; h++) { | 813 | for (h = 0; h < HASH_SIZE; h++) { |
763 | struct ip_tunnel *t = ipn->tunnels[prio][h]; | 814 | struct ip_tunnel *t; |
764 | 815 | ||
816 | t = rtnl_dereference(ipn->tunnels[prio][h]); | ||
765 | while (t != NULL) { | 817 | while (t != NULL) { |
766 | unregister_netdevice_queue(t->dev, head); | 818 | unregister_netdevice_queue(t->dev, head); |
767 | t = t->next; | 819 | t = rtnl_dereference(t->next); |
768 | } | 820 | } |
769 | } | 821 | } |
770 | } | 822 | } |
@@ -789,7 +841,9 @@ static int __net_init ipip_init_net(struct net *net) | |||
789 | } | 841 | } |
790 | dev_net_set(ipn->fb_tunnel_dev, net); | 842 | dev_net_set(ipn->fb_tunnel_dev, net); |
791 | 843 | ||
792 | ipip_fb_tunnel_init(ipn->fb_tunnel_dev); | 844 | err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev); |
845 | if (err) | ||
846 | goto err_reg_dev; | ||
793 | 847 | ||
794 | if ((err = register_netdev(ipn->fb_tunnel_dev))) | 848 | if ((err = register_netdev(ipn->fb_tunnel_dev))) |
795 | goto err_reg_dev; | 849 | goto err_reg_dev; |
@@ -797,7 +851,7 @@ static int __net_init ipip_init_net(struct net *net) | |||
797 | return 0; | 851 | return 0; |
798 | 852 | ||
799 | err_reg_dev: | 853 | err_reg_dev: |
800 | free_netdev(ipn->fb_tunnel_dev); | 854 | ipip_dev_free(ipn->fb_tunnel_dev); |
801 | err_alloc_dev: | 855 | err_alloc_dev: |
802 | /* nothing */ | 856 | /* nothing */ |
803 | return err; | 857 | return err; |
@@ -850,3 +904,4 @@ static void __exit ipip_fini(void) | |||
850 | module_init(ipip_init); | 904 | module_init(ipip_init); |
851 | module_exit(ipip_fini); | 905 | module_exit(ipip_fini); |
852 | MODULE_LICENSE("GPL"); | 906 | MODULE_LICENSE("GPL"); |
907 | MODULE_ALIAS_NETDEV("tunl0"); | ||
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 179fcab866fc..30a7763c400e 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c | |||
@@ -60,6 +60,7 @@ | |||
60 | #include <linux/notifier.h> | 60 | #include <linux/notifier.h> |
61 | #include <linux/if_arp.h> | 61 | #include <linux/if_arp.h> |
62 | #include <linux/netfilter_ipv4.h> | 62 | #include <linux/netfilter_ipv4.h> |
63 | #include <linux/compat.h> | ||
63 | #include <net/ipip.h> | 64 | #include <net/ipip.h> |
64 | #include <net/checksum.h> | 65 | #include <net/checksum.h> |
65 | #include <net/netlink.h> | 66 | #include <net/netlink.h> |
@@ -75,7 +76,7 @@ struct mr_table { | |||
75 | struct net *net; | 76 | struct net *net; |
76 | #endif | 77 | #endif |
77 | u32 id; | 78 | u32 id; |
78 | struct sock *mroute_sk; | 79 | struct sock __rcu *mroute_sk; |
79 | struct timer_list ipmr_expire_timer; | 80 | struct timer_list ipmr_expire_timer; |
80 | struct list_head mfc_unres_queue; | 81 | struct list_head mfc_unres_queue; |
81 | struct list_head mfc_cache_array[MFC_LINES]; | 82 | struct list_head mfc_cache_array[MFC_LINES]; |
@@ -98,7 +99,7 @@ struct ipmr_result { | |||
98 | }; | 99 | }; |
99 | 100 | ||
100 | /* Big lock, protecting vif table, mrt cache and mroute socket state. | 101 | /* Big lock, protecting vif table, mrt cache and mroute socket state. |
101 | Note that the changes are semaphored via rtnl_lock. | 102 | * Note that the changes are semaphored via rtnl_lock. |
102 | */ | 103 | */ |
103 | 104 | ||
104 | static DEFINE_RWLOCK(mrt_lock); | 105 | static DEFINE_RWLOCK(mrt_lock); |
@@ -113,11 +114,11 @@ static DEFINE_RWLOCK(mrt_lock); | |||
113 | static DEFINE_SPINLOCK(mfc_unres_lock); | 114 | static DEFINE_SPINLOCK(mfc_unres_lock); |
114 | 115 | ||
115 | /* We return to original Alan's scheme. Hash table of resolved | 116 | /* We return to original Alan's scheme. Hash table of resolved |
116 | entries is changed only in process context and protected | 117 | * entries is changed only in process context and protected |
117 | with weak lock mrt_lock. Queue of unresolved entries is protected | 118 | * with weak lock mrt_lock. Queue of unresolved entries is protected |
118 | with strong spinlock mfc_unres_lock. | 119 | * with strong spinlock mfc_unres_lock. |
119 | 120 | * | |
120 | In this case data path is free of exclusive locks at all. | 121 | * In this case data path is free of exclusive locks at all. |
121 | */ | 122 | */ |
122 | 123 | ||
123 | static struct kmem_cache *mrt_cachep __read_mostly; | 124 | static struct kmem_cache *mrt_cachep __read_mostly; |
@@ -147,14 +148,15 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id) | |||
147 | return NULL; | 148 | return NULL; |
148 | } | 149 | } |
149 | 150 | ||
150 | static int ipmr_fib_lookup(struct net *net, struct flowi *flp, | 151 | static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, |
151 | struct mr_table **mrt) | 152 | struct mr_table **mrt) |
152 | { | 153 | { |
153 | struct ipmr_result res; | 154 | struct ipmr_result res; |
154 | struct fib_lookup_arg arg = { .result = &res, }; | 155 | struct fib_lookup_arg arg = { .result = &res, }; |
155 | int err; | 156 | int err; |
156 | 157 | ||
157 | err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg); | 158 | err = fib_rules_lookup(net->ipv4.mr_rules_ops, |
159 | flowi4_to_flowi(flp4), 0, &arg); | ||
158 | if (err < 0) | 160 | if (err < 0) |
159 | return err; | 161 | return err; |
160 | *mrt = res.mrt; | 162 | *mrt = res.mrt; |
@@ -282,7 +284,7 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id) | |||
282 | return net->ipv4.mrt; | 284 | return net->ipv4.mrt; |
283 | } | 285 | } |
284 | 286 | ||
285 | static int ipmr_fib_lookup(struct net *net, struct flowi *flp, | 287 | static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, |
286 | struct mr_table **mrt) | 288 | struct mr_table **mrt) |
287 | { | 289 | { |
288 | *mrt = net->ipv4.mrt; | 290 | *mrt = net->ipv4.mrt; |
@@ -396,9 +398,9 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) | |||
396 | set_fs(KERNEL_DS); | 398 | set_fs(KERNEL_DS); |
397 | err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); | 399 | err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); |
398 | set_fs(oldfs); | 400 | set_fs(oldfs); |
399 | } else | 401 | } else { |
400 | err = -EOPNOTSUPP; | 402 | err = -EOPNOTSUPP; |
401 | 403 | } | |
402 | dev = NULL; | 404 | dev = NULL; |
403 | 405 | ||
404 | if (err == 0 && | 406 | if (err == 0 && |
@@ -434,14 +436,14 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) | |||
434 | { | 436 | { |
435 | struct net *net = dev_net(dev); | 437 | struct net *net = dev_net(dev); |
436 | struct mr_table *mrt; | 438 | struct mr_table *mrt; |
437 | struct flowi fl = { | 439 | struct flowi4 fl4 = { |
438 | .oif = dev->ifindex, | 440 | .flowi4_oif = dev->ifindex, |
439 | .iif = skb->skb_iif, | 441 | .flowi4_iif = skb->skb_iif, |
440 | .mark = skb->mark, | 442 | .flowi4_mark = skb->mark, |
441 | }; | 443 | }; |
442 | int err; | 444 | int err; |
443 | 445 | ||
444 | err = ipmr_fib_lookup(net, &fl, &mrt); | 446 | err = ipmr_fib_lookup(net, &fl4, &mrt); |
445 | if (err < 0) { | 447 | if (err < 0) { |
446 | kfree_skb(skb); | 448 | kfree_skb(skb); |
447 | return err; | 449 | return err; |
@@ -495,7 +497,8 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) | |||
495 | dev->iflink = 0; | 497 | dev->iflink = 0; |
496 | 498 | ||
497 | rcu_read_lock(); | 499 | rcu_read_lock(); |
498 | if ((in_dev = __in_dev_get_rcu(dev)) == NULL) { | 500 | in_dev = __in_dev_get_rcu(dev); |
501 | if (!in_dev) { | ||
499 | rcu_read_unlock(); | 502 | rcu_read_unlock(); |
500 | goto failure; | 503 | goto failure; |
501 | } | 504 | } |
@@ -552,9 +555,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, | |||
552 | mrt->mroute_reg_vif_num = -1; | 555 | mrt->mroute_reg_vif_num = -1; |
553 | #endif | 556 | #endif |
554 | 557 | ||
555 | if (vifi+1 == mrt->maxvif) { | 558 | if (vifi + 1 == mrt->maxvif) { |
556 | int tmp; | 559 | int tmp; |
557 | for (tmp=vifi-1; tmp>=0; tmp--) { | 560 | |
561 | for (tmp = vifi - 1; tmp >= 0; tmp--) { | ||
558 | if (VIF_EXISTS(mrt, tmp)) | 562 | if (VIF_EXISTS(mrt, tmp)) |
559 | break; | 563 | break; |
560 | } | 564 | } |
@@ -565,25 +569,33 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, | |||
565 | 569 | ||
566 | dev_set_allmulti(dev, -1); | 570 | dev_set_allmulti(dev, -1); |
567 | 571 | ||
568 | if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) { | 572 | in_dev = __in_dev_get_rtnl(dev); |
573 | if (in_dev) { | ||
569 | IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; | 574 | IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; |
570 | ip_rt_multicast_event(in_dev); | 575 | ip_rt_multicast_event(in_dev); |
571 | } | 576 | } |
572 | 577 | ||
573 | if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify) | 578 | if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify) |
574 | unregister_netdevice_queue(dev, head); | 579 | unregister_netdevice_queue(dev, head); |
575 | 580 | ||
576 | dev_put(dev); | 581 | dev_put(dev); |
577 | return 0; | 582 | return 0; |
578 | } | 583 | } |
579 | 584 | ||
580 | static inline void ipmr_cache_free(struct mfc_cache *c) | 585 | static void ipmr_cache_free_rcu(struct rcu_head *head) |
581 | { | 586 | { |
587 | struct mfc_cache *c = container_of(head, struct mfc_cache, rcu); | ||
588 | |||
582 | kmem_cache_free(mrt_cachep, c); | 589 | kmem_cache_free(mrt_cachep, c); |
583 | } | 590 | } |
584 | 591 | ||
592 | static inline void ipmr_cache_free(struct mfc_cache *c) | ||
593 | { | ||
594 | call_rcu(&c->rcu, ipmr_cache_free_rcu); | ||
595 | } | ||
596 | |||
585 | /* Destroy an unresolved cache entry, killing queued skbs | 597 | /* Destroy an unresolved cache entry, killing queued skbs |
586 | and reporting error to netlink readers. | 598 | * and reporting error to netlink readers. |
587 | */ | 599 | */ |
588 | 600 | ||
589 | static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) | 601 | static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) |
@@ -605,8 +617,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) | |||
605 | memset(&e->msg, 0, sizeof(e->msg)); | 617 | memset(&e->msg, 0, sizeof(e->msg)); |
606 | 618 | ||
607 | rtnl_unicast(skb, net, NETLINK_CB(skb).pid); | 619 | rtnl_unicast(skb, net, NETLINK_CB(skb).pid); |
608 | } else | 620 | } else { |
609 | kfree_skb(skb); | 621 | kfree_skb(skb); |
622 | } | ||
610 | } | 623 | } |
611 | 624 | ||
612 | ipmr_cache_free(c); | 625 | ipmr_cache_free(c); |
@@ -724,13 +737,13 @@ static int vif_add(struct net *net, struct mr_table *mrt, | |||
724 | case 0: | 737 | case 0: |
725 | if (vifc->vifc_flags == VIFF_USE_IFINDEX) { | 738 | if (vifc->vifc_flags == VIFF_USE_IFINDEX) { |
726 | dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex); | 739 | dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex); |
727 | if (dev && dev->ip_ptr == NULL) { | 740 | if (dev && __in_dev_get_rtnl(dev) == NULL) { |
728 | dev_put(dev); | 741 | dev_put(dev); |
729 | return -EADDRNOTAVAIL; | 742 | return -EADDRNOTAVAIL; |
730 | } | 743 | } |
731 | } else | 744 | } else { |
732 | dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); | 745 | dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); |
733 | 746 | } | |
734 | if (!dev) | 747 | if (!dev) |
735 | return -EADDRNOTAVAIL; | 748 | return -EADDRNOTAVAIL; |
736 | err = dev_set_allmulti(dev, 1); | 749 | err = dev_set_allmulti(dev, 1); |
@@ -743,16 +756,16 @@ static int vif_add(struct net *net, struct mr_table *mrt, | |||
743 | return -EINVAL; | 756 | return -EINVAL; |
744 | } | 757 | } |
745 | 758 | ||
746 | if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) { | 759 | in_dev = __in_dev_get_rtnl(dev); |
760 | if (!in_dev) { | ||
747 | dev_put(dev); | 761 | dev_put(dev); |
748 | return -EADDRNOTAVAIL; | 762 | return -EADDRNOTAVAIL; |
749 | } | 763 | } |
750 | IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; | 764 | IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; |
751 | ip_rt_multicast_event(in_dev); | 765 | ip_rt_multicast_event(in_dev); |
752 | 766 | ||
753 | /* | 767 | /* Fill in the VIF structures */ |
754 | * Fill in the VIF structures | 768 | |
755 | */ | ||
756 | v->rate_limit = vifc->vifc_rate_limit; | 769 | v->rate_limit = vifc->vifc_rate_limit; |
757 | v->local = vifc->vifc_lcl_addr.s_addr; | 770 | v->local = vifc->vifc_lcl_addr.s_addr; |
758 | v->remote = vifc->vifc_rmt_addr.s_addr; | 771 | v->remote = vifc->vifc_rmt_addr.s_addr; |
@@ -765,14 +778,14 @@ static int vif_add(struct net *net, struct mr_table *mrt, | |||
765 | v->pkt_in = 0; | 778 | v->pkt_in = 0; |
766 | v->pkt_out = 0; | 779 | v->pkt_out = 0; |
767 | v->link = dev->ifindex; | 780 | v->link = dev->ifindex; |
768 | if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER)) | 781 | if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER)) |
769 | v->link = dev->iflink; | 782 | v->link = dev->iflink; |
770 | 783 | ||
771 | /* And finish update writing critical data */ | 784 | /* And finish update writing critical data */ |
772 | write_lock_bh(&mrt_lock); | 785 | write_lock_bh(&mrt_lock); |
773 | v->dev = dev; | 786 | v->dev = dev; |
774 | #ifdef CONFIG_IP_PIMSM | 787 | #ifdef CONFIG_IP_PIMSM |
775 | if (v->flags&VIFF_REGISTER) | 788 | if (v->flags & VIFF_REGISTER) |
776 | mrt->mroute_reg_vif_num = vifi; | 789 | mrt->mroute_reg_vif_num = vifi; |
777 | #endif | 790 | #endif |
778 | if (vifi+1 > mrt->maxvif) | 791 | if (vifi+1 > mrt->maxvif) |
@@ -781,6 +794,7 @@ static int vif_add(struct net *net, struct mr_table *mrt, | |||
781 | return 0; | 794 | return 0; |
782 | } | 795 | } |
783 | 796 | ||
797 | /* called with rcu_read_lock() */ | ||
784 | static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, | 798 | static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, |
785 | __be32 origin, | 799 | __be32 origin, |
786 | __be32 mcastgrp) | 800 | __be32 mcastgrp) |
@@ -788,7 +802,7 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, | |||
788 | int line = MFC_HASH(mcastgrp, origin); | 802 | int line = MFC_HASH(mcastgrp, origin); |
789 | struct mfc_cache *c; | 803 | struct mfc_cache *c; |
790 | 804 | ||
791 | list_for_each_entry(c, &mrt->mfc_cache_array[line], list) { | 805 | list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) { |
792 | if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp) | 806 | if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp) |
793 | return c; | 807 | return c; |
794 | } | 808 | } |
@@ -801,19 +815,20 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, | |||
801 | static struct mfc_cache *ipmr_cache_alloc(void) | 815 | static struct mfc_cache *ipmr_cache_alloc(void) |
802 | { | 816 | { |
803 | struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); | 817 | struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); |
804 | if (c == NULL) | 818 | |
805 | return NULL; | 819 | if (c) |
806 | c->mfc_un.res.minvif = MAXVIFS; | 820 | c->mfc_un.res.minvif = MAXVIFS; |
807 | return c; | 821 | return c; |
808 | } | 822 | } |
809 | 823 | ||
810 | static struct mfc_cache *ipmr_cache_alloc_unres(void) | 824 | static struct mfc_cache *ipmr_cache_alloc_unres(void) |
811 | { | 825 | { |
812 | struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); | 826 | struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); |
813 | if (c == NULL) | 827 | |
814 | return NULL; | 828 | if (c) { |
815 | skb_queue_head_init(&c->mfc_un.unres.unresolved); | 829 | skb_queue_head_init(&c->mfc_un.unres.unresolved); |
816 | c->mfc_un.unres.expires = jiffies + 10*HZ; | 830 | c->mfc_un.unres.expires = jiffies + 10*HZ; |
831 | } | ||
817 | return c; | 832 | return c; |
818 | } | 833 | } |
819 | 834 | ||
@@ -827,17 +842,15 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, | |||
827 | struct sk_buff *skb; | 842 | struct sk_buff *skb; |
828 | struct nlmsgerr *e; | 843 | struct nlmsgerr *e; |
829 | 844 | ||
830 | /* | 845 | /* Play the pending entries through our router */ |
831 | * Play the pending entries through our router | ||
832 | */ | ||
833 | 846 | ||
834 | while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { | 847 | while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { |
835 | if (ip_hdr(skb)->version == 0) { | 848 | if (ip_hdr(skb)->version == 0) { |
836 | struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); | 849 | struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); |
837 | 850 | ||
838 | if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) { | 851 | if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) { |
839 | nlh->nlmsg_len = (skb_tail_pointer(skb) - | 852 | nlh->nlmsg_len = skb_tail_pointer(skb) - |
840 | (u8 *)nlh); | 853 | (u8 *)nlh; |
841 | } else { | 854 | } else { |
842 | nlh->nlmsg_type = NLMSG_ERROR; | 855 | nlh->nlmsg_type = NLMSG_ERROR; |
843 | nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); | 856 | nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); |
@@ -848,8 +861,9 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, | |||
848 | } | 861 | } |
849 | 862 | ||
850 | rtnl_unicast(skb, net, NETLINK_CB(skb).pid); | 863 | rtnl_unicast(skb, net, NETLINK_CB(skb).pid); |
851 | } else | 864 | } else { |
852 | ip_mr_forward(net, mrt, skb, c, 0); | 865 | ip_mr_forward(net, mrt, skb, c, 0); |
866 | } | ||
853 | } | 867 | } |
854 | } | 868 | } |
855 | 869 | ||
@@ -867,6 +881,7 @@ static int ipmr_cache_report(struct mr_table *mrt, | |||
867 | const int ihl = ip_hdrlen(pkt); | 881 | const int ihl = ip_hdrlen(pkt); |
868 | struct igmphdr *igmp; | 882 | struct igmphdr *igmp; |
869 | struct igmpmsg *msg; | 883 | struct igmpmsg *msg; |
884 | struct sock *mroute_sk; | ||
870 | int ret; | 885 | int ret; |
871 | 886 | ||
872 | #ifdef CONFIG_IP_PIMSM | 887 | #ifdef CONFIG_IP_PIMSM |
@@ -882,9 +897,9 @@ static int ipmr_cache_report(struct mr_table *mrt, | |||
882 | #ifdef CONFIG_IP_PIMSM | 897 | #ifdef CONFIG_IP_PIMSM |
883 | if (assert == IGMPMSG_WHOLEPKT) { | 898 | if (assert == IGMPMSG_WHOLEPKT) { |
884 | /* Ugly, but we have no choice with this interface. | 899 | /* Ugly, but we have no choice with this interface. |
885 | Duplicate old header, fix ihl, length etc. | 900 | * Duplicate old header, fix ihl, length etc. |
886 | And all this only to mangle msg->im_msgtype and | 901 | * And all this only to mangle msg->im_msgtype and |
887 | to set msg->im_mbz to "mbz" :-) | 902 | * to set msg->im_mbz to "mbz" :-) |
888 | */ | 903 | */ |
889 | skb_push(skb, sizeof(struct iphdr)); | 904 | skb_push(skb, sizeof(struct iphdr)); |
890 | skb_reset_network_header(skb); | 905 | skb_reset_network_header(skb); |
@@ -901,39 +916,38 @@ static int ipmr_cache_report(struct mr_table *mrt, | |||
901 | #endif | 916 | #endif |
902 | { | 917 | { |
903 | 918 | ||
904 | /* | 919 | /* Copy the IP header */ |
905 | * Copy the IP header | ||
906 | */ | ||
907 | 920 | ||
908 | skb->network_header = skb->tail; | 921 | skb->network_header = skb->tail; |
909 | skb_put(skb, ihl); | 922 | skb_put(skb, ihl); |
910 | skb_copy_to_linear_data(skb, pkt->data, ihl); | 923 | skb_copy_to_linear_data(skb, pkt->data, ihl); |
911 | ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ | 924 | ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ |
912 | msg = (struct igmpmsg *)skb_network_header(skb); | 925 | msg = (struct igmpmsg *)skb_network_header(skb); |
913 | msg->im_vif = vifi; | 926 | msg->im_vif = vifi; |
914 | skb_dst_set(skb, dst_clone(skb_dst(pkt))); | 927 | skb_dst_set(skb, dst_clone(skb_dst(pkt))); |
915 | 928 | ||
916 | /* | 929 | /* Add our header */ |
917 | * Add our header | ||
918 | */ | ||
919 | 930 | ||
920 | igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); | 931 | igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); |
921 | igmp->type = | 932 | igmp->type = |
922 | msg->im_msgtype = assert; | 933 | msg->im_msgtype = assert; |
923 | igmp->code = 0; | 934 | igmp->code = 0; |
924 | ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ | 935 | ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ |
925 | skb->transport_header = skb->network_header; | 936 | skb->transport_header = skb->network_header; |
926 | } | 937 | } |
927 | 938 | ||
928 | if (mrt->mroute_sk == NULL) { | 939 | rcu_read_lock(); |
940 | mroute_sk = rcu_dereference(mrt->mroute_sk); | ||
941 | if (mroute_sk == NULL) { | ||
942 | rcu_read_unlock(); | ||
929 | kfree_skb(skb); | 943 | kfree_skb(skb); |
930 | return -EINVAL; | 944 | return -EINVAL; |
931 | } | 945 | } |
932 | 946 | ||
933 | /* | 947 | /* Deliver to mrouted */ |
934 | * Deliver to mrouted | 948 | |
935 | */ | 949 | ret = sock_queue_rcv_skb(mroute_sk, skb); |
936 | ret = sock_queue_rcv_skb(mrt->mroute_sk, skb); | 950 | rcu_read_unlock(); |
937 | if (ret < 0) { | 951 | if (ret < 0) { |
938 | if (net_ratelimit()) | 952 | if (net_ratelimit()) |
939 | printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); | 953 | printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); |
@@ -965,9 +979,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) | |||
965 | } | 979 | } |
966 | 980 | ||
967 | if (!found) { | 981 | if (!found) { |
968 | /* | 982 | /* Create a new entry if allowable */ |
969 | * Create a new entry if allowable | ||
970 | */ | ||
971 | 983 | ||
972 | if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || | 984 | if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || |
973 | (c = ipmr_cache_alloc_unres()) == NULL) { | 985 | (c = ipmr_cache_alloc_unres()) == NULL) { |
@@ -977,16 +989,14 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) | |||
977 | return -ENOBUFS; | 989 | return -ENOBUFS; |
978 | } | 990 | } |
979 | 991 | ||
980 | /* | 992 | /* Fill in the new cache entry */ |
981 | * Fill in the new cache entry | 993 | |
982 | */ | ||
983 | c->mfc_parent = -1; | 994 | c->mfc_parent = -1; |
984 | c->mfc_origin = iph->saddr; | 995 | c->mfc_origin = iph->saddr; |
985 | c->mfc_mcastgrp = iph->daddr; | 996 | c->mfc_mcastgrp = iph->daddr; |
986 | 997 | ||
987 | /* | 998 | /* Reflect first query at mrouted. */ |
988 | * Reflect first query at mrouted. | 999 | |
989 | */ | ||
990 | err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); | 1000 | err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); |
991 | if (err < 0) { | 1001 | if (err < 0) { |
992 | /* If the report failed throw the cache entry | 1002 | /* If the report failed throw the cache entry |
@@ -1006,10 +1016,9 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) | |||
1006 | mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires); | 1016 | mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires); |
1007 | } | 1017 | } |
1008 | 1018 | ||
1009 | /* | 1019 | /* See if we can append the packet */ |
1010 | * See if we can append the packet | 1020 | |
1011 | */ | 1021 | if (c->mfc_un.unres.unresolved.qlen > 3) { |
1012 | if (c->mfc_un.unres.unresolved.qlen>3) { | ||
1013 | kfree_skb(skb); | 1022 | kfree_skb(skb); |
1014 | err = -ENOBUFS; | 1023 | err = -ENOBUFS; |
1015 | } else { | 1024 | } else { |
@@ -1035,9 +1044,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc) | |||
1035 | list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) { | 1044 | list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) { |
1036 | if (c->mfc_origin == mfc->mfcc_origin.s_addr && | 1045 | if (c->mfc_origin == mfc->mfcc_origin.s_addr && |
1037 | c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { | 1046 | c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { |
1038 | write_lock_bh(&mrt_lock); | 1047 | list_del_rcu(&c->list); |
1039 | list_del(&c->list); | ||
1040 | write_unlock_bh(&mrt_lock); | ||
1041 | 1048 | ||
1042 | ipmr_cache_free(c); | 1049 | ipmr_cache_free(c); |
1043 | return 0; | 1050 | return 0; |
@@ -1090,9 +1097,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, | |||
1090 | if (!mrtsock) | 1097 | if (!mrtsock) |
1091 | c->mfc_flags |= MFC_STATIC; | 1098 | c->mfc_flags |= MFC_STATIC; |
1092 | 1099 | ||
1093 | write_lock_bh(&mrt_lock); | 1100 | list_add_rcu(&c->list, &mrt->mfc_cache_array[line]); |
1094 | list_add(&c->list, &mrt->mfc_cache_array[line]); | ||
1095 | write_unlock_bh(&mrt_lock); | ||
1096 | 1101 | ||
1097 | /* | 1102 | /* |
1098 | * Check to see if we resolved a queued list. If so we | 1103 | * Check to see if we resolved a queued list. If so we |
@@ -1130,26 +1135,21 @@ static void mroute_clean_tables(struct mr_table *mrt) | |||
1130 | LIST_HEAD(list); | 1135 | LIST_HEAD(list); |
1131 | struct mfc_cache *c, *next; | 1136 | struct mfc_cache *c, *next; |
1132 | 1137 | ||
1133 | /* | 1138 | /* Shut down all active vif entries */ |
1134 | * Shut down all active vif entries | 1139 | |
1135 | */ | ||
1136 | for (i = 0; i < mrt->maxvif; i++) { | 1140 | for (i = 0; i < mrt->maxvif; i++) { |
1137 | if (!(mrt->vif_table[i].flags&VIFF_STATIC)) | 1141 | if (!(mrt->vif_table[i].flags & VIFF_STATIC)) |
1138 | vif_delete(mrt, i, 0, &list); | 1142 | vif_delete(mrt, i, 0, &list); |
1139 | } | 1143 | } |
1140 | unregister_netdevice_many(&list); | 1144 | unregister_netdevice_many(&list); |
1141 | 1145 | ||
1142 | /* | 1146 | /* Wipe the cache */ |
1143 | * Wipe the cache | 1147 | |
1144 | */ | ||
1145 | for (i = 0; i < MFC_LINES; i++) { | 1148 | for (i = 0; i < MFC_LINES; i++) { |
1146 | list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { | 1149 | list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { |
1147 | if (c->mfc_flags&MFC_STATIC) | 1150 | if (c->mfc_flags & MFC_STATIC) |
1148 | continue; | 1151 | continue; |
1149 | write_lock_bh(&mrt_lock); | 1152 | list_del_rcu(&c->list); |
1150 | list_del(&c->list); | ||
1151 | write_unlock_bh(&mrt_lock); | ||
1152 | |||
1153 | ipmr_cache_free(c); | 1153 | ipmr_cache_free(c); |
1154 | } | 1154 | } |
1155 | } | 1155 | } |
@@ -1164,6 +1164,9 @@ static void mroute_clean_tables(struct mr_table *mrt) | |||
1164 | } | 1164 | } |
1165 | } | 1165 | } |
1166 | 1166 | ||
1167 | /* called from ip_ra_control(), before an RCU grace period, | ||
1168 | * we dont need to call synchronize_rcu() here | ||
1169 | */ | ||
1167 | static void mrtsock_destruct(struct sock *sk) | 1170 | static void mrtsock_destruct(struct sock *sk) |
1168 | { | 1171 | { |
1169 | struct net *net = sock_net(sk); | 1172 | struct net *net = sock_net(sk); |
@@ -1171,13 +1174,9 @@ static void mrtsock_destruct(struct sock *sk) | |||
1171 | 1174 | ||
1172 | rtnl_lock(); | 1175 | rtnl_lock(); |
1173 | ipmr_for_each_table(mrt, net) { | 1176 | ipmr_for_each_table(mrt, net) { |
1174 | if (sk == mrt->mroute_sk) { | 1177 | if (sk == rtnl_dereference(mrt->mroute_sk)) { |
1175 | IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; | 1178 | IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; |
1176 | 1179 | rcu_assign_pointer(mrt->mroute_sk, NULL); | |
1177 | write_lock_bh(&mrt_lock); | ||
1178 | mrt->mroute_sk = NULL; | ||
1179 | write_unlock_bh(&mrt_lock); | ||
1180 | |||
1181 | mroute_clean_tables(mrt); | 1180 | mroute_clean_tables(mrt); |
1182 | } | 1181 | } |
1183 | } | 1182 | } |
@@ -1204,7 +1203,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi | |||
1204 | return -ENOENT; | 1203 | return -ENOENT; |
1205 | 1204 | ||
1206 | if (optname != MRT_INIT) { | 1205 | if (optname != MRT_INIT) { |
1207 | if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN)) | 1206 | if (sk != rcu_dereference_raw(mrt->mroute_sk) && |
1207 | !capable(CAP_NET_ADMIN)) | ||
1208 | return -EACCES; | 1208 | return -EACCES; |
1209 | } | 1209 | } |
1210 | 1210 | ||
@@ -1217,23 +1217,20 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi | |||
1217 | return -ENOPROTOOPT; | 1217 | return -ENOPROTOOPT; |
1218 | 1218 | ||
1219 | rtnl_lock(); | 1219 | rtnl_lock(); |
1220 | if (mrt->mroute_sk) { | 1220 | if (rtnl_dereference(mrt->mroute_sk)) { |
1221 | rtnl_unlock(); | 1221 | rtnl_unlock(); |
1222 | return -EADDRINUSE; | 1222 | return -EADDRINUSE; |
1223 | } | 1223 | } |
1224 | 1224 | ||
1225 | ret = ip_ra_control(sk, 1, mrtsock_destruct); | 1225 | ret = ip_ra_control(sk, 1, mrtsock_destruct); |
1226 | if (ret == 0) { | 1226 | if (ret == 0) { |
1227 | write_lock_bh(&mrt_lock); | 1227 | rcu_assign_pointer(mrt->mroute_sk, sk); |
1228 | mrt->mroute_sk = sk; | ||
1229 | write_unlock_bh(&mrt_lock); | ||
1230 | |||
1231 | IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; | 1228 | IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; |
1232 | } | 1229 | } |
1233 | rtnl_unlock(); | 1230 | rtnl_unlock(); |
1234 | return ret; | 1231 | return ret; |
1235 | case MRT_DONE: | 1232 | case MRT_DONE: |
1236 | if (sk != mrt->mroute_sk) | 1233 | if (sk != rcu_dereference_raw(mrt->mroute_sk)) |
1237 | return -EACCES; | 1234 | return -EACCES; |
1238 | return ip_ra_control(sk, 0, NULL); | 1235 | return ip_ra_control(sk, 0, NULL); |
1239 | case MRT_ADD_VIF: | 1236 | case MRT_ADD_VIF: |
@@ -1246,7 +1243,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi | |||
1246 | return -ENFILE; | 1243 | return -ENFILE; |
1247 | rtnl_lock(); | 1244 | rtnl_lock(); |
1248 | if (optname == MRT_ADD_VIF) { | 1245 | if (optname == MRT_ADD_VIF) { |
1249 | ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk); | 1246 | ret = vif_add(net, mrt, &vif, |
1247 | sk == rtnl_dereference(mrt->mroute_sk)); | ||
1250 | } else { | 1248 | } else { |
1251 | ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL); | 1249 | ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL); |
1252 | } | 1250 | } |
@@ -1267,7 +1265,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi | |||
1267 | if (optname == MRT_DEL_MFC) | 1265 | if (optname == MRT_DEL_MFC) |
1268 | ret = ipmr_mfc_delete(mrt, &mfc); | 1266 | ret = ipmr_mfc_delete(mrt, &mfc); |
1269 | else | 1267 | else |
1270 | ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk); | 1268 | ret = ipmr_mfc_add(net, mrt, &mfc, |
1269 | sk == rtnl_dereference(mrt->mroute_sk)); | ||
1271 | rtnl_unlock(); | 1270 | rtnl_unlock(); |
1272 | return ret; | 1271 | return ret; |
1273 | /* | 1272 | /* |
@@ -1276,7 +1275,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi | |||
1276 | case MRT_ASSERT: | 1275 | case MRT_ASSERT: |
1277 | { | 1276 | { |
1278 | int v; | 1277 | int v; |
1279 | if (get_user(v,(int __user *)optval)) | 1278 | if (get_user(v, (int __user *)optval)) |
1280 | return -EFAULT; | 1279 | return -EFAULT; |
1281 | mrt->mroute_do_assert = (v) ? 1 : 0; | 1280 | mrt->mroute_do_assert = (v) ? 1 : 0; |
1282 | return 0; | 1281 | return 0; |
@@ -1286,7 +1285,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi | |||
1286 | { | 1285 | { |
1287 | int v; | 1286 | int v; |
1288 | 1287 | ||
1289 | if (get_user(v,(int __user *)optval)) | 1288 | if (get_user(v, (int __user *)optval)) |
1290 | return -EFAULT; | 1289 | return -EFAULT; |
1291 | v = (v) ? 1 : 0; | 1290 | v = (v) ? 1 : 0; |
1292 | 1291 | ||
@@ -1309,14 +1308,16 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi | |||
1309 | return -EINVAL; | 1308 | return -EINVAL; |
1310 | if (get_user(v, (u32 __user *)optval)) | 1309 | if (get_user(v, (u32 __user *)optval)) |
1311 | return -EFAULT; | 1310 | return -EFAULT; |
1312 | if (sk == mrt->mroute_sk) | ||
1313 | return -EBUSY; | ||
1314 | 1311 | ||
1315 | rtnl_lock(); | 1312 | rtnl_lock(); |
1316 | ret = 0; | 1313 | ret = 0; |
1317 | if (!ipmr_new_table(net, v)) | 1314 | if (sk == rtnl_dereference(mrt->mroute_sk)) { |
1318 | ret = -ENOMEM; | 1315 | ret = -EBUSY; |
1319 | raw_sk(sk)->ipmr_table = v; | 1316 | } else { |
1317 | if (!ipmr_new_table(net, v)) | ||
1318 | ret = -ENOMEM; | ||
1319 | raw_sk(sk)->ipmr_table = v; | ||
1320 | } | ||
1320 | rtnl_unlock(); | 1321 | rtnl_unlock(); |
1321 | return ret; | 1322 | return ret; |
1322 | } | 1323 | } |
@@ -1347,9 +1348,9 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int | |||
1347 | 1348 | ||
1348 | if (optname != MRT_VERSION && | 1349 | if (optname != MRT_VERSION && |
1349 | #ifdef CONFIG_IP_PIMSM | 1350 | #ifdef CONFIG_IP_PIMSM |
1350 | optname!=MRT_PIM && | 1351 | optname != MRT_PIM && |
1351 | #endif | 1352 | #endif |
1352 | optname!=MRT_ASSERT) | 1353 | optname != MRT_ASSERT) |
1353 | return -ENOPROTOOPT; | 1354 | return -ENOPROTOOPT; |
1354 | 1355 | ||
1355 | if (get_user(olr, optlen)) | 1356 | if (get_user(olr, optlen)) |
@@ -1416,24 +1417,99 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) | |||
1416 | if (copy_from_user(&sr, arg, sizeof(sr))) | 1417 | if (copy_from_user(&sr, arg, sizeof(sr))) |
1417 | return -EFAULT; | 1418 | return -EFAULT; |
1418 | 1419 | ||
1419 | read_lock(&mrt_lock); | 1420 | rcu_read_lock(); |
1420 | c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); | 1421 | c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); |
1421 | if (c) { | 1422 | if (c) { |
1422 | sr.pktcnt = c->mfc_un.res.pkt; | 1423 | sr.pktcnt = c->mfc_un.res.pkt; |
1423 | sr.bytecnt = c->mfc_un.res.bytes; | 1424 | sr.bytecnt = c->mfc_un.res.bytes; |
1424 | sr.wrong_if = c->mfc_un.res.wrong_if; | 1425 | sr.wrong_if = c->mfc_un.res.wrong_if; |
1425 | read_unlock(&mrt_lock); | 1426 | rcu_read_unlock(); |
1426 | 1427 | ||
1427 | if (copy_to_user(arg, &sr, sizeof(sr))) | 1428 | if (copy_to_user(arg, &sr, sizeof(sr))) |
1428 | return -EFAULT; | 1429 | return -EFAULT; |
1429 | return 0; | 1430 | return 0; |
1430 | } | 1431 | } |
1432 | rcu_read_unlock(); | ||
1433 | return -EADDRNOTAVAIL; | ||
1434 | default: | ||
1435 | return -ENOIOCTLCMD; | ||
1436 | } | ||
1437 | } | ||
1438 | |||
1439 | #ifdef CONFIG_COMPAT | ||
1440 | struct compat_sioc_sg_req { | ||
1441 | struct in_addr src; | ||
1442 | struct in_addr grp; | ||
1443 | compat_ulong_t pktcnt; | ||
1444 | compat_ulong_t bytecnt; | ||
1445 | compat_ulong_t wrong_if; | ||
1446 | }; | ||
1447 | |||
1448 | struct compat_sioc_vif_req { | ||
1449 | vifi_t vifi; /* Which iface */ | ||
1450 | compat_ulong_t icount; | ||
1451 | compat_ulong_t ocount; | ||
1452 | compat_ulong_t ibytes; | ||
1453 | compat_ulong_t obytes; | ||
1454 | }; | ||
1455 | |||
1456 | int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) | ||
1457 | { | ||
1458 | struct compat_sioc_sg_req sr; | ||
1459 | struct compat_sioc_vif_req vr; | ||
1460 | struct vif_device *vif; | ||
1461 | struct mfc_cache *c; | ||
1462 | struct net *net = sock_net(sk); | ||
1463 | struct mr_table *mrt; | ||
1464 | |||
1465 | mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); | ||
1466 | if (mrt == NULL) | ||
1467 | return -ENOENT; | ||
1468 | |||
1469 | switch (cmd) { | ||
1470 | case SIOCGETVIFCNT: | ||
1471 | if (copy_from_user(&vr, arg, sizeof(vr))) | ||
1472 | return -EFAULT; | ||
1473 | if (vr.vifi >= mrt->maxvif) | ||
1474 | return -EINVAL; | ||
1475 | read_lock(&mrt_lock); | ||
1476 | vif = &mrt->vif_table[vr.vifi]; | ||
1477 | if (VIF_EXISTS(mrt, vr.vifi)) { | ||
1478 | vr.icount = vif->pkt_in; | ||
1479 | vr.ocount = vif->pkt_out; | ||
1480 | vr.ibytes = vif->bytes_in; | ||
1481 | vr.obytes = vif->bytes_out; | ||
1482 | read_unlock(&mrt_lock); | ||
1483 | |||
1484 | if (copy_to_user(arg, &vr, sizeof(vr))) | ||
1485 | return -EFAULT; | ||
1486 | return 0; | ||
1487 | } | ||
1431 | read_unlock(&mrt_lock); | 1488 | read_unlock(&mrt_lock); |
1432 | return -EADDRNOTAVAIL; | 1489 | return -EADDRNOTAVAIL; |
1490 | case SIOCGETSGCNT: | ||
1491 | if (copy_from_user(&sr, arg, sizeof(sr))) | ||
1492 | return -EFAULT; | ||
1493 | |||
1494 | rcu_read_lock(); | ||
1495 | c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); | ||
1496 | if (c) { | ||
1497 | sr.pktcnt = c->mfc_un.res.pkt; | ||
1498 | sr.bytecnt = c->mfc_un.res.bytes; | ||
1499 | sr.wrong_if = c->mfc_un.res.wrong_if; | ||
1500 | rcu_read_unlock(); | ||
1501 | |||
1502 | if (copy_to_user(arg, &sr, sizeof(sr))) | ||
1503 | return -EFAULT; | ||
1504 | return 0; | ||
1505 | } | ||
1506 | rcu_read_unlock(); | ||
1507 | return -EADDRNOTAVAIL; | ||
1433 | default: | 1508 | default: |
1434 | return -ENOIOCTLCMD; | 1509 | return -ENOIOCTLCMD; |
1435 | } | 1510 | } |
1436 | } | 1511 | } |
1512 | #endif | ||
1437 | 1513 | ||
1438 | 1514 | ||
1439 | static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) | 1515 | static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) |
@@ -1465,7 +1541,7 @@ static struct notifier_block ip_mr_notifier = { | |||
1465 | }; | 1541 | }; |
1466 | 1542 | ||
1467 | /* | 1543 | /* |
1468 | * Encapsulate a packet by attaching a valid IPIP header to it. | 1544 | * Encapsulate a packet by attaching a valid IPIP header to it. |
1469 | * This avoids tunnel drivers and other mess and gives us the speed so | 1545 | * This avoids tunnel drivers and other mess and gives us the speed so |
1470 | * important for multicast video. | 1546 | * important for multicast video. |
1471 | */ | 1547 | */ |
@@ -1473,14 +1549,14 @@ static struct notifier_block ip_mr_notifier = { | |||
1473 | static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) | 1549 | static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) |
1474 | { | 1550 | { |
1475 | struct iphdr *iph; | 1551 | struct iphdr *iph; |
1476 | struct iphdr *old_iph = ip_hdr(skb); | 1552 | const struct iphdr *old_iph = ip_hdr(skb); |
1477 | 1553 | ||
1478 | skb_push(skb, sizeof(struct iphdr)); | 1554 | skb_push(skb, sizeof(struct iphdr)); |
1479 | skb->transport_header = skb->network_header; | 1555 | skb->transport_header = skb->network_header; |
1480 | skb_reset_network_header(skb); | 1556 | skb_reset_network_header(skb); |
1481 | iph = ip_hdr(skb); | 1557 | iph = ip_hdr(skb); |
1482 | 1558 | ||
1483 | iph->version = 4; | 1559 | iph->version = 4; |
1484 | iph->tos = old_iph->tos; | 1560 | iph->tos = old_iph->tos; |
1485 | iph->ttl = old_iph->ttl; | 1561 | iph->ttl = old_iph->ttl; |
1486 | iph->frag_off = 0; | 1562 | iph->frag_off = 0; |
@@ -1498,7 +1574,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) | |||
1498 | 1574 | ||
1499 | static inline int ipmr_forward_finish(struct sk_buff *skb) | 1575 | static inline int ipmr_forward_finish(struct sk_buff *skb) |
1500 | { | 1576 | { |
1501 | struct ip_options * opt = &(IPCB(skb)->opt); | 1577 | struct ip_options *opt = &(IPCB(skb)->opt); |
1502 | 1578 | ||
1503 | IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); | 1579 | IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); |
1504 | 1580 | ||
@@ -1519,6 +1595,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, | |||
1519 | struct vif_device *vif = &mrt->vif_table[vifi]; | 1595 | struct vif_device *vif = &mrt->vif_table[vifi]; |
1520 | struct net_device *dev; | 1596 | struct net_device *dev; |
1521 | struct rtable *rt; | 1597 | struct rtable *rt; |
1598 | struct flowi4 fl4; | ||
1522 | int encap = 0; | 1599 | int encap = 0; |
1523 | 1600 | ||
1524 | if (vif->dev == NULL) | 1601 | if (vif->dev == NULL) |
@@ -1535,23 +1612,21 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, | |||
1535 | } | 1612 | } |
1536 | #endif | 1613 | #endif |
1537 | 1614 | ||
1538 | if (vif->flags&VIFF_TUNNEL) { | 1615 | if (vif->flags & VIFF_TUNNEL) { |
1539 | struct flowi fl = { .oif = vif->link, | 1616 | rt = ip_route_output_ports(net, &fl4, NULL, |
1540 | .nl_u = { .ip4_u = | 1617 | vif->remote, vif->local, |
1541 | { .daddr = vif->remote, | 1618 | 0, 0, |
1542 | .saddr = vif->local, | 1619 | IPPROTO_IPIP, |
1543 | .tos = RT_TOS(iph->tos) } }, | 1620 | RT_TOS(iph->tos), vif->link); |
1544 | .proto = IPPROTO_IPIP }; | 1621 | if (IS_ERR(rt)) |
1545 | if (ip_route_output_key(net, &rt, &fl)) | ||
1546 | goto out_free; | 1622 | goto out_free; |
1547 | encap = sizeof(struct iphdr); | 1623 | encap = sizeof(struct iphdr); |
1548 | } else { | 1624 | } else { |
1549 | struct flowi fl = { .oif = vif->link, | 1625 | rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0, |
1550 | .nl_u = { .ip4_u = | 1626 | 0, 0, |
1551 | { .daddr = iph->daddr, | 1627 | IPPROTO_IPIP, |
1552 | .tos = RT_TOS(iph->tos) } }, | 1628 | RT_TOS(iph->tos), vif->link); |
1553 | .proto = IPPROTO_IPIP }; | 1629 | if (IS_ERR(rt)) |
1554 | if (ip_route_output_key(net, &rt, &fl)) | ||
1555 | goto out_free; | 1630 | goto out_free; |
1556 | } | 1631 | } |
1557 | 1632 | ||
@@ -1559,8 +1634,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, | |||
1559 | 1634 | ||
1560 | if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { | 1635 | if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { |
1561 | /* Do not fragment multicasts. Alas, IPv4 does not | 1636 | /* Do not fragment multicasts. Alas, IPv4 does not |
1562 | allow to send ICMP, so that packets will disappear | 1637 | * allow to send ICMP, so that packets will disappear |
1563 | to blackhole. | 1638 | * to blackhole. |
1564 | */ | 1639 | */ |
1565 | 1640 | ||
1566 | IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS); | 1641 | IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS); |
@@ -1583,7 +1658,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, | |||
1583 | ip_decrease_ttl(ip_hdr(skb)); | 1658 | ip_decrease_ttl(ip_hdr(skb)); |
1584 | 1659 | ||
1585 | /* FIXME: forward and output firewalls used to be called here. | 1660 | /* FIXME: forward and output firewalls used to be called here. |
1586 | * What do we do with netfilter? -- RR */ | 1661 | * What do we do with netfilter? -- RR |
1662 | */ | ||
1587 | if (vif->flags & VIFF_TUNNEL) { | 1663 | if (vif->flags & VIFF_TUNNEL) { |
1588 | ip_encap(skb, vif->local, vif->remote); | 1664 | ip_encap(skb, vif->local, vif->remote); |
1589 | /* FIXME: extra output firewall step used to be here. --RR */ | 1665 | /* FIXME: extra output firewall step used to be here. --RR */ |
@@ -1642,17 +1718,17 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, | |||
1642 | if (mrt->vif_table[vif].dev != skb->dev) { | 1718 | if (mrt->vif_table[vif].dev != skb->dev) { |
1643 | int true_vifi; | 1719 | int true_vifi; |
1644 | 1720 | ||
1645 | if (skb_rtable(skb)->fl.iif == 0) { | 1721 | if (rt_is_output_route(skb_rtable(skb))) { |
1646 | /* It is our own packet, looped back. | 1722 | /* It is our own packet, looped back. |
1647 | Very complicated situation... | 1723 | * Very complicated situation... |
1648 | 1724 | * | |
1649 | The best workaround until routing daemons will be | 1725 | * The best workaround until routing daemons will be |
1650 | fixed is not to redistribute packet, if it was | 1726 | * fixed is not to redistribute packet, if it was |
1651 | send through wrong interface. It means, that | 1727 | * send through wrong interface. It means, that |
1652 | multicast applications WILL NOT work for | 1728 | * multicast applications WILL NOT work for |
1653 | (S,G), which have default multicast route pointing | 1729 | * (S,G), which have default multicast route pointing |
1654 | to wrong oif. In any case, it is not a good | 1730 | * to wrong oif. In any case, it is not a good |
1655 | idea to use multicasting applications on router. | 1731 | * idea to use multicasting applications on router. |
1656 | */ | 1732 | */ |
1657 | goto dont_forward; | 1733 | goto dont_forward; |
1658 | } | 1734 | } |
@@ -1662,9 +1738,9 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, | |||
1662 | 1738 | ||
1663 | if (true_vifi >= 0 && mrt->mroute_do_assert && | 1739 | if (true_vifi >= 0 && mrt->mroute_do_assert && |
1664 | /* pimsm uses asserts, when switching from RPT to SPT, | 1740 | /* pimsm uses asserts, when switching from RPT to SPT, |
1665 | so that we cannot check that packet arrived on an oif. | 1741 | * so that we cannot check that packet arrived on an oif. |
1666 | It is bad, but otherwise we would need to move pretty | 1742 | * It is bad, but otherwise we would need to move pretty |
1667 | large chunk of pimd to kernel. Ough... --ANK | 1743 | * large chunk of pimd to kernel. Ough... --ANK |
1668 | */ | 1744 | */ |
1669 | (mrt->mroute_do_pim || | 1745 | (mrt->mroute_do_pim || |
1670 | cache->mfc_un.res.ttls[true_vifi] < 255) && | 1746 | cache->mfc_un.res.ttls[true_vifi] < 255) && |
@@ -1682,10 +1758,12 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, | |||
1682 | /* | 1758 | /* |
1683 | * Forward the frame | 1759 | * Forward the frame |
1684 | */ | 1760 | */ |
1685 | for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) { | 1761 | for (ct = cache->mfc_un.res.maxvif - 1; |
1762 | ct >= cache->mfc_un.res.minvif; ct--) { | ||
1686 | if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { | 1763 | if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { |
1687 | if (psend != -1) { | 1764 | if (psend != -1) { |
1688 | struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); | 1765 | struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); |
1766 | |||
1689 | if (skb2) | 1767 | if (skb2) |
1690 | ipmr_queue_xmit(net, mrt, skb2, cache, | 1768 | ipmr_queue_xmit(net, mrt, skb2, cache, |
1691 | psend); | 1769 | psend); |
@@ -1696,6 +1774,7 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, | |||
1696 | if (psend != -1) { | 1774 | if (psend != -1) { |
1697 | if (local) { | 1775 | if (local) { |
1698 | struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); | 1776 | struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); |
1777 | |||
1699 | if (skb2) | 1778 | if (skb2) |
1700 | ipmr_queue_xmit(net, mrt, skb2, cache, psend); | 1779 | ipmr_queue_xmit(net, mrt, skb2, cache, psend); |
1701 | } else { | 1780 | } else { |
@@ -1710,9 +1789,30 @@ dont_forward: | |||
1710 | return 0; | 1789 | return 0; |
1711 | } | 1790 | } |
1712 | 1791 | ||
1792 | static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) | ||
1793 | { | ||
1794 | struct rtable *rt = skb_rtable(skb); | ||
1795 | struct iphdr *iph = ip_hdr(skb); | ||
1796 | struct flowi4 fl4 = { | ||
1797 | .daddr = iph->daddr, | ||
1798 | .saddr = iph->saddr, | ||
1799 | .flowi4_tos = iph->tos, | ||
1800 | .flowi4_oif = rt->rt_oif, | ||
1801 | .flowi4_iif = rt->rt_iif, | ||
1802 | .flowi4_mark = rt->rt_mark, | ||
1803 | }; | ||
1804 | struct mr_table *mrt; | ||
1805 | int err; | ||
1806 | |||
1807 | err = ipmr_fib_lookup(net, &fl4, &mrt); | ||
1808 | if (err) | ||
1809 | return ERR_PTR(err); | ||
1810 | return mrt; | ||
1811 | } | ||
1713 | 1812 | ||
1714 | /* | 1813 | /* |
1715 | * Multicast packets for forwarding arrive here | 1814 | * Multicast packets for forwarding arrive here |
1815 | * Called with rcu_read_lock(); | ||
1716 | */ | 1816 | */ |
1717 | 1817 | ||
1718 | int ip_mr_input(struct sk_buff *skb) | 1818 | int ip_mr_input(struct sk_buff *skb) |
@@ -1721,43 +1821,41 @@ int ip_mr_input(struct sk_buff *skb) | |||
1721 | struct net *net = dev_net(skb->dev); | 1821 | struct net *net = dev_net(skb->dev); |
1722 | int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; | 1822 | int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; |
1723 | struct mr_table *mrt; | 1823 | struct mr_table *mrt; |
1724 | int err; | ||
1725 | 1824 | ||
1726 | /* Packet is looped back after forward, it should not be | 1825 | /* Packet is looped back after forward, it should not be |
1727 | forwarded second time, but still can be delivered locally. | 1826 | * forwarded second time, but still can be delivered locally. |
1728 | */ | 1827 | */ |
1729 | if (IPCB(skb)->flags&IPSKB_FORWARDED) | 1828 | if (IPCB(skb)->flags & IPSKB_FORWARDED) |
1730 | goto dont_forward; | 1829 | goto dont_forward; |
1731 | 1830 | ||
1732 | err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt); | 1831 | mrt = ipmr_rt_fib_lookup(net, skb); |
1733 | if (err < 0) { | 1832 | if (IS_ERR(mrt)) { |
1734 | kfree_skb(skb); | 1833 | kfree_skb(skb); |
1735 | return err; | 1834 | return PTR_ERR(mrt); |
1736 | } | 1835 | } |
1737 | |||
1738 | if (!local) { | 1836 | if (!local) { |
1739 | if (IPCB(skb)->opt.router_alert) { | 1837 | if (IPCB(skb)->opt.router_alert) { |
1740 | if (ip_call_ra_chain(skb)) | 1838 | if (ip_call_ra_chain(skb)) |
1741 | return 0; | 1839 | return 0; |
1742 | } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){ | 1840 | } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) { |
1743 | /* IGMPv1 (and broken IGMPv2 implementations sort of | 1841 | /* IGMPv1 (and broken IGMPv2 implementations sort of |
1744 | Cisco IOS <= 11.2(8)) do not put router alert | 1842 | * Cisco IOS <= 11.2(8)) do not put router alert |
1745 | option to IGMP packets destined to routable | 1843 | * option to IGMP packets destined to routable |
1746 | groups. It is very bad, because it means | 1844 | * groups. It is very bad, because it means |
1747 | that we can forward NO IGMP messages. | 1845 | * that we can forward NO IGMP messages. |
1748 | */ | 1846 | */ |
1749 | read_lock(&mrt_lock); | 1847 | struct sock *mroute_sk; |
1750 | if (mrt->mroute_sk) { | 1848 | |
1751 | nf_reset(skb); | 1849 | mroute_sk = rcu_dereference(mrt->mroute_sk); |
1752 | raw_rcv(mrt->mroute_sk, skb); | 1850 | if (mroute_sk) { |
1753 | read_unlock(&mrt_lock); | 1851 | nf_reset(skb); |
1754 | return 0; | 1852 | raw_rcv(mroute_sk, skb); |
1755 | } | 1853 | return 0; |
1756 | read_unlock(&mrt_lock); | 1854 | } |
1757 | } | 1855 | } |
1758 | } | 1856 | } |
1759 | 1857 | ||
1760 | read_lock(&mrt_lock); | 1858 | /* already under rcu_read_lock() */ |
1761 | cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); | 1859 | cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); |
1762 | 1860 | ||
1763 | /* | 1861 | /* |
@@ -1769,13 +1867,12 @@ int ip_mr_input(struct sk_buff *skb) | |||
1769 | if (local) { | 1867 | if (local) { |
1770 | struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); | 1868 | struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); |
1771 | ip_local_deliver(skb); | 1869 | ip_local_deliver(skb); |
1772 | if (skb2 == NULL) { | 1870 | if (skb2 == NULL) |
1773 | read_unlock(&mrt_lock); | ||
1774 | return -ENOBUFS; | 1871 | return -ENOBUFS; |
1775 | } | ||
1776 | skb = skb2; | 1872 | skb = skb2; |
1777 | } | 1873 | } |
1778 | 1874 | ||
1875 | read_lock(&mrt_lock); | ||
1779 | vif = ipmr_find_vif(mrt, skb->dev); | 1876 | vif = ipmr_find_vif(mrt, skb->dev); |
1780 | if (vif >= 0) { | 1877 | if (vif >= 0) { |
1781 | int err2 = ipmr_cache_unresolved(mrt, vif, skb); | 1878 | int err2 = ipmr_cache_unresolved(mrt, vif, skb); |
@@ -1788,8 +1885,8 @@ int ip_mr_input(struct sk_buff *skb) | |||
1788 | return -ENODEV; | 1885 | return -ENODEV; |
1789 | } | 1886 | } |
1790 | 1887 | ||
1888 | read_lock(&mrt_lock); | ||
1791 | ip_mr_forward(net, mrt, skb, cache, local); | 1889 | ip_mr_forward(net, mrt, skb, cache, local); |
1792 | |||
1793 | read_unlock(&mrt_lock); | 1890 | read_unlock(&mrt_lock); |
1794 | 1891 | ||
1795 | if (local) | 1892 | if (local) |
@@ -1805,6 +1902,7 @@ dont_forward: | |||
1805 | } | 1902 | } |
1806 | 1903 | ||
1807 | #ifdef CONFIG_IP_PIMSM | 1904 | #ifdef CONFIG_IP_PIMSM |
1905 | /* called with rcu_read_lock() */ | ||
1808 | static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, | 1906 | static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, |
1809 | unsigned int pimlen) | 1907 | unsigned int pimlen) |
1810 | { | 1908 | { |
@@ -1813,10 +1911,10 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, | |||
1813 | 1911 | ||
1814 | encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); | 1912 | encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); |
1815 | /* | 1913 | /* |
1816 | Check that: | 1914 | * Check that: |
1817 | a. packet is really destinted to a multicast group | 1915 | * a. packet is really sent to a multicast group |
1818 | b. packet is not a NULL-REGISTER | 1916 | * b. packet is not a NULL-REGISTER |
1819 | c. packet is not truncated | 1917 | * c. packet is not truncated |
1820 | */ | 1918 | */ |
1821 | if (!ipv4_is_multicast(encap->daddr) || | 1919 | if (!ipv4_is_multicast(encap->daddr) || |
1822 | encap->tot_len == 0 || | 1920 | encap->tot_len == 0 || |
@@ -1826,26 +1924,23 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, | |||
1826 | read_lock(&mrt_lock); | 1924 | read_lock(&mrt_lock); |
1827 | if (mrt->mroute_reg_vif_num >= 0) | 1925 | if (mrt->mroute_reg_vif_num >= 0) |
1828 | reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev; | 1926 | reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev; |
1829 | if (reg_dev) | ||
1830 | dev_hold(reg_dev); | ||
1831 | read_unlock(&mrt_lock); | 1927 | read_unlock(&mrt_lock); |
1832 | 1928 | ||
1833 | if (reg_dev == NULL) | 1929 | if (reg_dev == NULL) |
1834 | return 1; | 1930 | return 1; |
1835 | 1931 | ||
1836 | skb->mac_header = skb->network_header; | 1932 | skb->mac_header = skb->network_header; |
1837 | skb_pull(skb, (u8*)encap - skb->data); | 1933 | skb_pull(skb, (u8 *)encap - skb->data); |
1838 | skb_reset_network_header(skb); | 1934 | skb_reset_network_header(skb); |
1839 | skb->protocol = htons(ETH_P_IP); | 1935 | skb->protocol = htons(ETH_P_IP); |
1840 | skb->ip_summed = 0; | 1936 | skb->ip_summed = CHECKSUM_NONE; |
1841 | skb->pkt_type = PACKET_HOST; | 1937 | skb->pkt_type = PACKET_HOST; |
1842 | 1938 | ||
1843 | skb_tunnel_rx(skb, reg_dev); | 1939 | skb_tunnel_rx(skb, reg_dev); |
1844 | 1940 | ||
1845 | netif_rx(skb); | 1941 | netif_rx(skb); |
1846 | dev_put(reg_dev); | ||
1847 | 1942 | ||
1848 | return 0; | 1943 | return NET_RX_SUCCESS; |
1849 | } | 1944 | } |
1850 | #endif | 1945 | #endif |
1851 | 1946 | ||
@@ -1854,7 +1949,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, | |||
1854 | * Handle IGMP messages of PIMv1 | 1949 | * Handle IGMP messages of PIMv1 |
1855 | */ | 1950 | */ |
1856 | 1951 | ||
1857 | int pim_rcv_v1(struct sk_buff * skb) | 1952 | int pim_rcv_v1(struct sk_buff *skb) |
1858 | { | 1953 | { |
1859 | struct igmphdr *pim; | 1954 | struct igmphdr *pim; |
1860 | struct net *net = dev_net(skb->dev); | 1955 | struct net *net = dev_net(skb->dev); |
@@ -1865,9 +1960,9 @@ int pim_rcv_v1(struct sk_buff * skb) | |||
1865 | 1960 | ||
1866 | pim = igmp_hdr(skb); | 1961 | pim = igmp_hdr(skb); |
1867 | 1962 | ||
1868 | if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) | 1963 | mrt = ipmr_rt_fib_lookup(net, skb); |
1964 | if (IS_ERR(mrt)) | ||
1869 | goto drop; | 1965 | goto drop; |
1870 | |||
1871 | if (!mrt->mroute_do_pim || | 1966 | if (!mrt->mroute_do_pim || |
1872 | pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) | 1967 | pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) |
1873 | goto drop; | 1968 | goto drop; |
@@ -1881,7 +1976,7 @@ drop: | |||
1881 | #endif | 1976 | #endif |
1882 | 1977 | ||
1883 | #ifdef CONFIG_IP_PIMSM_V2 | 1978 | #ifdef CONFIG_IP_PIMSM_V2 |
1884 | static int pim_rcv(struct sk_buff * skb) | 1979 | static int pim_rcv(struct sk_buff *skb) |
1885 | { | 1980 | { |
1886 | struct pimreghdr *pim; | 1981 | struct pimreghdr *pim; |
1887 | struct net *net = dev_net(skb->dev); | 1982 | struct net *net = dev_net(skb->dev); |
@@ -1891,15 +1986,15 @@ static int pim_rcv(struct sk_buff * skb) | |||
1891 | goto drop; | 1986 | goto drop; |
1892 | 1987 | ||
1893 | pim = (struct pimreghdr *)skb_transport_header(skb); | 1988 | pim = (struct pimreghdr *)skb_transport_header(skb); |
1894 | if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || | 1989 | if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) || |
1895 | (pim->flags&PIM_NULL_REGISTER) || | 1990 | (pim->flags & PIM_NULL_REGISTER) || |
1896 | (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && | 1991 | (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && |
1897 | csum_fold(skb_checksum(skb, 0, skb->len, 0)))) | 1992 | csum_fold(skb_checksum(skb, 0, skb->len, 0)))) |
1898 | goto drop; | 1993 | goto drop; |
1899 | 1994 | ||
1900 | if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) | 1995 | mrt = ipmr_rt_fib_lookup(net, skb); |
1996 | if (IS_ERR(mrt)) | ||
1901 | goto drop; | 1997 | goto drop; |
1902 | |||
1903 | if (__pim_rcv(mrt, skb, sizeof(*pim))) { | 1998 | if (__pim_rcv(mrt, skb, sizeof(*pim))) { |
1904 | drop: | 1999 | drop: |
1905 | kfree_skb(skb); | 2000 | kfree_skb(skb); |
@@ -1946,40 +2041,45 @@ rtattr_failure: | |||
1946 | return -EMSGSIZE; | 2041 | return -EMSGSIZE; |
1947 | } | 2042 | } |
1948 | 2043 | ||
1949 | int ipmr_get_route(struct net *net, | 2044 | int ipmr_get_route(struct net *net, struct sk_buff *skb, |
1950 | struct sk_buff *skb, struct rtmsg *rtm, int nowait) | 2045 | __be32 saddr, __be32 daddr, |
2046 | struct rtmsg *rtm, int nowait) | ||
1951 | { | 2047 | { |
1952 | int err; | ||
1953 | struct mr_table *mrt; | ||
1954 | struct mfc_cache *cache; | 2048 | struct mfc_cache *cache; |
1955 | struct rtable *rt = skb_rtable(skb); | 2049 | struct mr_table *mrt; |
2050 | int err; | ||
1956 | 2051 | ||
1957 | mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); | 2052 | mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); |
1958 | if (mrt == NULL) | 2053 | if (mrt == NULL) |
1959 | return -ENOENT; | 2054 | return -ENOENT; |
1960 | 2055 | ||
1961 | read_lock(&mrt_lock); | 2056 | rcu_read_lock(); |
1962 | cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst); | 2057 | cache = ipmr_cache_find(mrt, saddr, daddr); |
1963 | 2058 | ||
1964 | if (cache == NULL) { | 2059 | if (cache == NULL) { |
1965 | struct sk_buff *skb2; | 2060 | struct sk_buff *skb2; |
1966 | struct iphdr *iph; | 2061 | struct iphdr *iph; |
1967 | struct net_device *dev; | 2062 | struct net_device *dev; |
1968 | int vif; | 2063 | int vif = -1; |
1969 | 2064 | ||
1970 | if (nowait) { | 2065 | if (nowait) { |
1971 | read_unlock(&mrt_lock); | 2066 | rcu_read_unlock(); |
1972 | return -EAGAIN; | 2067 | return -EAGAIN; |
1973 | } | 2068 | } |
1974 | 2069 | ||
1975 | dev = skb->dev; | 2070 | dev = skb->dev; |
1976 | if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) { | 2071 | read_lock(&mrt_lock); |
2072 | if (dev) | ||
2073 | vif = ipmr_find_vif(mrt, dev); | ||
2074 | if (vif < 0) { | ||
1977 | read_unlock(&mrt_lock); | 2075 | read_unlock(&mrt_lock); |
2076 | rcu_read_unlock(); | ||
1978 | return -ENODEV; | 2077 | return -ENODEV; |
1979 | } | 2078 | } |
1980 | skb2 = skb_clone(skb, GFP_ATOMIC); | 2079 | skb2 = skb_clone(skb, GFP_ATOMIC); |
1981 | if (!skb2) { | 2080 | if (!skb2) { |
1982 | read_unlock(&mrt_lock); | 2081 | read_unlock(&mrt_lock); |
2082 | rcu_read_unlock(); | ||
1983 | return -ENOMEM; | 2083 | return -ENOMEM; |
1984 | } | 2084 | } |
1985 | 2085 | ||
@@ -1987,18 +2087,21 @@ int ipmr_get_route(struct net *net, | |||
1987 | skb_reset_network_header(skb2); | 2087 | skb_reset_network_header(skb2); |
1988 | iph = ip_hdr(skb2); | 2088 | iph = ip_hdr(skb2); |
1989 | iph->ihl = sizeof(struct iphdr) >> 2; | 2089 | iph->ihl = sizeof(struct iphdr) >> 2; |
1990 | iph->saddr = rt->rt_src; | 2090 | iph->saddr = saddr; |
1991 | iph->daddr = rt->rt_dst; | 2091 | iph->daddr = daddr; |
1992 | iph->version = 0; | 2092 | iph->version = 0; |
1993 | err = ipmr_cache_unresolved(mrt, vif, skb2); | 2093 | err = ipmr_cache_unresolved(mrt, vif, skb2); |
1994 | read_unlock(&mrt_lock); | 2094 | read_unlock(&mrt_lock); |
2095 | rcu_read_unlock(); | ||
1995 | return err; | 2096 | return err; |
1996 | } | 2097 | } |
1997 | 2098 | ||
1998 | if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) | 2099 | read_lock(&mrt_lock); |
2100 | if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY)) | ||
1999 | cache->mfc_flags |= MFC_NOTIFY; | 2101 | cache->mfc_flags |= MFC_NOTIFY; |
2000 | err = __ipmr_fill_mroute(mrt, skb, cache, rtm); | 2102 | err = __ipmr_fill_mroute(mrt, skb, cache, rtm); |
2001 | read_unlock(&mrt_lock); | 2103 | read_unlock(&mrt_lock); |
2104 | rcu_read_unlock(); | ||
2002 | return err; | 2105 | return err; |
2003 | } | 2106 | } |
2004 | 2107 | ||
@@ -2050,14 +2153,14 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) | |||
2050 | s_h = cb->args[1]; | 2153 | s_h = cb->args[1]; |
2051 | s_e = cb->args[2]; | 2154 | s_e = cb->args[2]; |
2052 | 2155 | ||
2053 | read_lock(&mrt_lock); | 2156 | rcu_read_lock(); |
2054 | ipmr_for_each_table(mrt, net) { | 2157 | ipmr_for_each_table(mrt, net) { |
2055 | if (t < s_t) | 2158 | if (t < s_t) |
2056 | goto next_table; | 2159 | goto next_table; |
2057 | if (t > s_t) | 2160 | if (t > s_t) |
2058 | s_h = 0; | 2161 | s_h = 0; |
2059 | for (h = s_h; h < MFC_LINES; h++) { | 2162 | for (h = s_h; h < MFC_LINES; h++) { |
2060 | list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) { | 2163 | list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) { |
2061 | if (e < s_e) | 2164 | if (e < s_e) |
2062 | goto next_entry; | 2165 | goto next_entry; |
2063 | if (ipmr_fill_mroute(mrt, skb, | 2166 | if (ipmr_fill_mroute(mrt, skb, |
@@ -2075,7 +2178,7 @@ next_table: | |||
2075 | t++; | 2178 | t++; |
2076 | } | 2179 | } |
2077 | done: | 2180 | done: |
2078 | read_unlock(&mrt_lock); | 2181 | rcu_read_unlock(); |
2079 | 2182 | ||
2080 | cb->args[2] = e; | 2183 | cb->args[2] = e; |
2081 | cb->args[1] = h; | 2184 | cb->args[1] = h; |
@@ -2086,7 +2189,8 @@ done: | |||
2086 | 2189 | ||
2087 | #ifdef CONFIG_PROC_FS | 2190 | #ifdef CONFIG_PROC_FS |
2088 | /* | 2191 | /* |
2089 | * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif | 2192 | * The /proc interfaces to multicast routing : |
2193 | * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif | ||
2090 | */ | 2194 | */ |
2091 | struct ipmr_vif_iter { | 2195 | struct ipmr_vif_iter { |
2092 | struct seq_net_private p; | 2196 | struct seq_net_private p; |
@@ -2208,14 +2312,14 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net, | |||
2208 | struct mr_table *mrt = it->mrt; | 2312 | struct mr_table *mrt = it->mrt; |
2209 | struct mfc_cache *mfc; | 2313 | struct mfc_cache *mfc; |
2210 | 2314 | ||
2211 | read_lock(&mrt_lock); | 2315 | rcu_read_lock(); |
2212 | for (it->ct = 0; it->ct < MFC_LINES; it->ct++) { | 2316 | for (it->ct = 0; it->ct < MFC_LINES; it->ct++) { |
2213 | it->cache = &mrt->mfc_cache_array[it->ct]; | 2317 | it->cache = &mrt->mfc_cache_array[it->ct]; |
2214 | list_for_each_entry(mfc, it->cache, list) | 2318 | list_for_each_entry_rcu(mfc, it->cache, list) |
2215 | if (pos-- == 0) | 2319 | if (pos-- == 0) |
2216 | return mfc; | 2320 | return mfc; |
2217 | } | 2321 | } |
2218 | read_unlock(&mrt_lock); | 2322 | rcu_read_unlock(); |
2219 | 2323 | ||
2220 | spin_lock_bh(&mfc_unres_lock); | 2324 | spin_lock_bh(&mfc_unres_lock); |
2221 | it->cache = &mrt->mfc_unres_queue; | 2325 | it->cache = &mrt->mfc_unres_queue; |
@@ -2274,7 +2378,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) | |||
2274 | } | 2378 | } |
2275 | 2379 | ||
2276 | /* exhausted cache_array, show unresolved */ | 2380 | /* exhausted cache_array, show unresolved */ |
2277 | read_unlock(&mrt_lock); | 2381 | rcu_read_unlock(); |
2278 | it->cache = &mrt->mfc_unres_queue; | 2382 | it->cache = &mrt->mfc_unres_queue; |
2279 | it->ct = 0; | 2383 | it->ct = 0; |
2280 | 2384 | ||
@@ -2282,7 +2386,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) | |||
2282 | if (!list_empty(it->cache)) | 2386 | if (!list_empty(it->cache)) |
2283 | return list_first_entry(it->cache, struct mfc_cache, list); | 2387 | return list_first_entry(it->cache, struct mfc_cache, list); |
2284 | 2388 | ||
2285 | end_of_list: | 2389 | end_of_list: |
2286 | spin_unlock_bh(&mfc_unres_lock); | 2390 | spin_unlock_bh(&mfc_unres_lock); |
2287 | it->cache = NULL; | 2391 | it->cache = NULL; |
2288 | 2392 | ||
@@ -2297,7 +2401,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) | |||
2297 | if (it->cache == &mrt->mfc_unres_queue) | 2401 | if (it->cache == &mrt->mfc_unres_queue) |
2298 | spin_unlock_bh(&mfc_unres_lock); | 2402 | spin_unlock_bh(&mfc_unres_lock); |
2299 | else if (it->cache == &mrt->mfc_cache_array[it->ct]) | 2403 | else if (it->cache == &mrt->mfc_cache_array[it->ct]) |
2300 | read_unlock(&mrt_lock); | 2404 | rcu_read_unlock(); |
2301 | } | 2405 | } |
2302 | 2406 | ||
2303 | static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) | 2407 | static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) |
@@ -2323,7 +2427,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) | |||
2323 | mfc->mfc_un.res.bytes, | 2427 | mfc->mfc_un.res.bytes, |
2324 | mfc->mfc_un.res.wrong_if); | 2428 | mfc->mfc_un.res.wrong_if); |
2325 | for (n = mfc->mfc_un.res.minvif; | 2429 | for (n = mfc->mfc_un.res.minvif; |
2326 | n < mfc->mfc_un.res.maxvif; n++ ) { | 2430 | n < mfc->mfc_un.res.maxvif; n++) { |
2327 | if (VIF_EXISTS(mrt, n) && | 2431 | if (VIF_EXISTS(mrt, n) && |
2328 | mfc->mfc_un.res.ttls[n] < 255) | 2432 | mfc->mfc_un.res.ttls[n] < 255) |
2329 | seq_printf(seq, | 2433 | seq_printf(seq, |
@@ -2421,7 +2525,7 @@ int __init ip_mr_init(void) | |||
2421 | 2525 | ||
2422 | mrt_cachep = kmem_cache_create("ip_mrt_cache", | 2526 | mrt_cachep = kmem_cache_create("ip_mrt_cache", |
2423 | sizeof(struct mfc_cache), | 2527 | sizeof(struct mfc_cache), |
2424 | 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, | 2528 | 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, |
2425 | NULL); | 2529 | NULL); |
2426 | if (!mrt_cachep) | 2530 | if (!mrt_cachep) |
2427 | return -ENOMEM; | 2531 | return -ENOMEM; |
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index d88a46c54fd1..2e97e3ec1eb7 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c | |||
@@ -16,60 +16,47 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) | |||
16 | struct net *net = dev_net(skb_dst(skb)->dev); | 16 | struct net *net = dev_net(skb_dst(skb)->dev); |
17 | const struct iphdr *iph = ip_hdr(skb); | 17 | const struct iphdr *iph = ip_hdr(skb); |
18 | struct rtable *rt; | 18 | struct rtable *rt; |
19 | struct flowi fl = {}; | 19 | struct flowi4 fl4 = {}; |
20 | unsigned long orefdst; | 20 | __be32 saddr = iph->saddr; |
21 | __u8 flags = 0; | ||
21 | unsigned int hh_len; | 22 | unsigned int hh_len; |
22 | unsigned int type; | ||
23 | 23 | ||
24 | type = inet_addr_type(net, iph->saddr); | 24 | if (!skb->sk && addr_type != RTN_LOCAL) { |
25 | if (skb->sk && inet_sk(skb->sk)->transparent) | 25 | if (addr_type == RTN_UNSPEC) |
26 | type = RTN_LOCAL; | 26 | addr_type = inet_addr_type(net, saddr); |
27 | if (addr_type == RTN_UNSPEC) | 27 | if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST) |
28 | addr_type = type; | 28 | flags |= FLOWI_FLAG_ANYSRC; |
29 | else | ||
30 | saddr = 0; | ||
31 | } | ||
29 | 32 | ||
30 | /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause | 33 | /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause |
31 | * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. | 34 | * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. |
32 | */ | 35 | */ |
33 | if (addr_type == RTN_LOCAL) { | 36 | fl4.daddr = iph->daddr; |
34 | fl.nl_u.ip4_u.daddr = iph->daddr; | 37 | fl4.saddr = saddr; |
35 | if (type == RTN_LOCAL) | 38 | fl4.flowi4_tos = RT_TOS(iph->tos); |
36 | fl.nl_u.ip4_u.saddr = iph->saddr; | 39 | fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; |
37 | fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); | 40 | fl4.flowi4_mark = skb->mark; |
38 | fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; | 41 | fl4.flowi4_flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : flags; |
39 | fl.mark = skb->mark; | 42 | rt = ip_route_output_key(net, &fl4); |
40 | fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; | 43 | if (IS_ERR(rt)) |
41 | if (ip_route_output_key(net, &rt, &fl) != 0) | 44 | return -1; |
42 | return -1; | ||
43 | |||
44 | /* Drop old route. */ | ||
45 | skb_dst_drop(skb); | ||
46 | skb_dst_set(skb, &rt->dst); | ||
47 | } else { | ||
48 | /* non-local src, find valid iif to satisfy | ||
49 | * rp-filter when calling ip_route_input. */ | ||
50 | fl.nl_u.ip4_u.daddr = iph->saddr; | ||
51 | if (ip_route_output_key(net, &rt, &fl) != 0) | ||
52 | return -1; | ||
53 | 45 | ||
54 | orefdst = skb->_skb_refdst; | 46 | /* Drop old route. */ |
55 | if (ip_route_input(skb, iph->daddr, iph->saddr, | 47 | skb_dst_drop(skb); |
56 | RT_TOS(iph->tos), rt->dst.dev) != 0) { | 48 | skb_dst_set(skb, &rt->dst); |
57 | dst_release(&rt->dst); | ||
58 | return -1; | ||
59 | } | ||
60 | dst_release(&rt->dst); | ||
61 | refdst_drop(orefdst); | ||
62 | } | ||
63 | 49 | ||
64 | if (skb_dst(skb)->error) | 50 | if (skb_dst(skb)->error) |
65 | return -1; | 51 | return -1; |
66 | 52 | ||
67 | #ifdef CONFIG_XFRM | 53 | #ifdef CONFIG_XFRM |
68 | if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && | 54 | if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && |
69 | xfrm_decode_session(skb, &fl, AF_INET) == 0) { | 55 | xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) { |
70 | struct dst_entry *dst = skb_dst(skb); | 56 | struct dst_entry *dst = skb_dst(skb); |
71 | skb_dst_set(skb, NULL); | 57 | skb_dst_set(skb, NULL); |
72 | if (xfrm_lookup(net, &dst, &fl, skb->sk, 0)) | 58 | dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0); |
59 | if (IS_ERR(dst)) | ||
73 | return -1; | 60 | return -1; |
74 | skb_dst_set(skb, dst); | 61 | skb_dst_set(skb, dst); |
75 | } | 62 | } |
@@ -102,7 +89,8 @@ int ip_xfrm_me_harder(struct sk_buff *skb) | |||
102 | dst = ((struct xfrm_dst *)dst)->route; | 89 | dst = ((struct xfrm_dst *)dst)->route; |
103 | dst_hold(dst); | 90 | dst_hold(dst); |
104 | 91 | ||
105 | if (xfrm_lookup(dev_net(dst->dev), &dst, &fl, skb->sk, 0) < 0) | 92 | dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0); |
93 | if (IS_ERR(dst)) | ||
106 | return -1; | 94 | return -1; |
107 | 95 | ||
108 | skb_dst_drop(skb); | 96 | skb_dst_drop(skb); |
@@ -217,9 +205,14 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, | |||
217 | return csum; | 205 | return csum; |
218 | } | 206 | } |
219 | 207 | ||
220 | static int nf_ip_route(struct dst_entry **dst, struct flowi *fl) | 208 | static int nf_ip_route(struct net *net, struct dst_entry **dst, |
209 | struct flowi *fl, bool strict __always_unused) | ||
221 | { | 210 | { |
222 | return ip_route_output_key(&init_net, (struct rtable **)dst, fl); | 211 | struct rtable *rt = ip_route_output_key(net, &fl->u.ip4); |
212 | if (IS_ERR(rt)) | ||
213 | return PTR_ERR(rt); | ||
214 | *dst = &rt->dst; | ||
215 | return 0; | ||
223 | } | 216 | } |
224 | 217 | ||
225 | static const struct nf_afinfo nf_ip_afinfo = { | 218 | static const struct nf_afinfo nf_ip_afinfo = { |
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 1833bdbf9805..1dfc18a03fd4 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig | |||
@@ -64,16 +64,6 @@ config IP_NF_IPTABLES | |||
64 | if IP_NF_IPTABLES | 64 | if IP_NF_IPTABLES |
65 | 65 | ||
66 | # The matches. | 66 | # The matches. |
67 | config IP_NF_MATCH_ADDRTYPE | ||
68 | tristate '"addrtype" address type match support' | ||
69 | depends on NETFILTER_ADVANCED | ||
70 | help | ||
71 | This option allows you to match what routing thinks of an address, | ||
72 | eg. UNICAST, LOCAL, BROADCAST, ... | ||
73 | |||
74 | If you want to compile it as a module, say M here and read | ||
75 | <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. | ||
76 | |||
77 | config IP_NF_MATCH_AH | 67 | config IP_NF_MATCH_AH |
78 | tristate '"ah" match support' | 68 | tristate '"ah" match support' |
79 | depends on NETFILTER_ADVANCED | 69 | depends on NETFILTER_ADVANCED |
@@ -147,7 +137,7 @@ config IP_NF_TARGET_ULOG | |||
147 | which can only be viewed through syslog. | 137 | which can only be viewed through syslog. |
148 | 138 | ||
149 | The appropriate userspace logging daemon (ulogd) may be obtained from | 139 | The appropriate userspace logging daemon (ulogd) may be obtained from |
150 | <http://www.gnumonks.org/projects/ulogd/> | 140 | <http://www.netfilter.org/projects/ulogd/index.html> |
151 | 141 | ||
152 | To compile it as a module, choose M here. If unsure, say N. | 142 | To compile it as a module, choose M here. If unsure, say N. |
153 | 143 | ||
@@ -206,8 +196,9 @@ config IP_NF_TARGET_REDIRECT | |||
206 | 196 | ||
207 | config NF_NAT_SNMP_BASIC | 197 | config NF_NAT_SNMP_BASIC |
208 | tristate "Basic SNMP-ALG support" | 198 | tristate "Basic SNMP-ALG support" |
209 | depends on NF_NAT | 199 | depends on NF_CONNTRACK_SNMP && NF_NAT |
210 | depends on NETFILTER_ADVANCED | 200 | depends on NETFILTER_ADVANCED |
201 | default NF_NAT && NF_CONNTRACK_SNMP | ||
211 | ---help--- | 202 | ---help--- |
212 | 203 | ||
213 | This module implements an Application Layer Gateway (ALG) for | 204 | This module implements an Application Layer Gateway (ALG) for |
@@ -324,10 +315,10 @@ config IP_NF_TARGET_ECN | |||
324 | 315 | ||
325 | config IP_NF_TARGET_TTL | 316 | config IP_NF_TARGET_TTL |
326 | tristate '"TTL" target support' | 317 | tristate '"TTL" target support' |
327 | depends on NETFILTER_ADVANCED | 318 | depends on NETFILTER_ADVANCED && IP_NF_MANGLE |
328 | select NETFILTER_XT_TARGET_HL | 319 | select NETFILTER_XT_TARGET_HL |
329 | ---help--- | 320 | ---help--- |
330 | This is a backwards-compat option for the user's convenience | 321 | This is a backwards-compatible option for the user's convenience |
331 | (e.g. when running oldconfig). It selects | 322 | (e.g. when running oldconfig). It selects |
332 | CONFIG_NETFILTER_XT_TARGET_HL. | 323 | CONFIG_NETFILTER_XT_TARGET_HL. |
333 | 324 | ||
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 48111594ee9b..dca2082ec683 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile | |||
@@ -3,15 +3,15 @@ | |||
3 | # | 3 | # |
4 | 4 | ||
5 | # objects for l3 independent conntrack | 5 | # objects for l3 independent conntrack |
6 | nf_conntrack_ipv4-objs := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o | 6 | nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o |
7 | ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y) | 7 | ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y) |
8 | ifeq ($(CONFIG_PROC_FS),y) | 8 | ifeq ($(CONFIG_PROC_FS),y) |
9 | nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o | 9 | nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o |
10 | endif | 10 | endif |
11 | endif | 11 | endif |
12 | 12 | ||
13 | nf_nat-objs := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o | 13 | nf_nat-y := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o |
14 | iptable_nat-objs := nf_nat_rule.o nf_nat_standalone.o | 14 | iptable_nat-y := nf_nat_rule.o nf_nat_standalone.o |
15 | 15 | ||
16 | # connection tracking | 16 | # connection tracking |
17 | obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o | 17 | obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o |
@@ -48,7 +48,6 @@ obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o | |||
48 | obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o | 48 | obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o |
49 | 49 | ||
50 | # matches | 50 | # matches |
51 | obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o | ||
52 | obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o | 51 | obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o |
53 | obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o | 52 | obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o |
54 | 53 | ||
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index e8f4f9a57f12..fd7a3f68917f 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c | |||
@@ -72,11 +72,11 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, | |||
72 | for (i = 0; i < len; i++) | 72 | for (i = 0; i < len; i++) |
73 | ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i]; | 73 | ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i]; |
74 | 74 | ||
75 | return (ret != 0); | 75 | return ret != 0; |
76 | } | 76 | } |
77 | 77 | ||
78 | /* | 78 | /* |
79 | * Unfortunatly, _b and _mask are not aligned to an int (or long int) | 79 | * Unfortunately, _b and _mask are not aligned to an int (or long int) |
80 | * Some arches dont care, unrolling the loop is a win on them. | 80 | * Some arches dont care, unrolling the loop is a win on them. |
81 | * For other arches, we only have a 16bit alignement. | 81 | * For other arches, we only have a 16bit alignement. |
82 | */ | 82 | */ |
@@ -228,7 +228,7 @@ arpt_error(struct sk_buff *skb, const struct xt_action_param *par) | |||
228 | return NF_DROP; | 228 | return NF_DROP; |
229 | } | 229 | } |
230 | 230 | ||
231 | static inline const struct arpt_entry_target * | 231 | static inline const struct xt_entry_target * |
232 | arpt_get_target_c(const struct arpt_entry *e) | 232 | arpt_get_target_c(const struct arpt_entry *e) |
233 | { | 233 | { |
234 | return arpt_get_target((struct arpt_entry *)e); | 234 | return arpt_get_target((struct arpt_entry *)e); |
@@ -260,6 +260,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, | |||
260 | void *table_base; | 260 | void *table_base; |
261 | const struct xt_table_info *private; | 261 | const struct xt_table_info *private; |
262 | struct xt_action_param acpar; | 262 | struct xt_action_param acpar; |
263 | unsigned int addend; | ||
263 | 264 | ||
264 | if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) | 265 | if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) |
265 | return NF_DROP; | 266 | return NF_DROP; |
@@ -267,7 +268,8 @@ unsigned int arpt_do_table(struct sk_buff *skb, | |||
267 | indev = in ? in->name : nulldevname; | 268 | indev = in ? in->name : nulldevname; |
268 | outdev = out ? out->name : nulldevname; | 269 | outdev = out ? out->name : nulldevname; |
269 | 270 | ||
270 | xt_info_rdlock_bh(); | 271 | local_bh_disable(); |
272 | addend = xt_write_recseq_begin(); | ||
271 | private = table->private; | 273 | private = table->private; |
272 | table_base = private->entries[smp_processor_id()]; | 274 | table_base = private->entries[smp_processor_id()]; |
273 | 275 | ||
@@ -282,7 +284,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, | |||
282 | 284 | ||
283 | arp = arp_hdr(skb); | 285 | arp = arp_hdr(skb); |
284 | do { | 286 | do { |
285 | const struct arpt_entry_target *t; | 287 | const struct xt_entry_target *t; |
286 | 288 | ||
287 | if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { | 289 | if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { |
288 | e = arpt_next_entry(e); | 290 | e = arpt_next_entry(e); |
@@ -297,10 +299,10 @@ unsigned int arpt_do_table(struct sk_buff *skb, | |||
297 | if (!t->u.kernel.target->target) { | 299 | if (!t->u.kernel.target->target) { |
298 | int v; | 300 | int v; |
299 | 301 | ||
300 | v = ((struct arpt_standard_target *)t)->verdict; | 302 | v = ((struct xt_standard_target *)t)->verdict; |
301 | if (v < 0) { | 303 | if (v < 0) { |
302 | /* Pop from stack? */ | 304 | /* Pop from stack? */ |
303 | if (v != ARPT_RETURN) { | 305 | if (v != XT_RETURN) { |
304 | verdict = (unsigned)(-v) - 1; | 306 | verdict = (unsigned)(-v) - 1; |
305 | break; | 307 | break; |
306 | } | 308 | } |
@@ -332,13 +334,14 @@ unsigned int arpt_do_table(struct sk_buff *skb, | |||
332 | /* Target might have changed stuff. */ | 334 | /* Target might have changed stuff. */ |
333 | arp = arp_hdr(skb); | 335 | arp = arp_hdr(skb); |
334 | 336 | ||
335 | if (verdict == ARPT_CONTINUE) | 337 | if (verdict == XT_CONTINUE) |
336 | e = arpt_next_entry(e); | 338 | e = arpt_next_entry(e); |
337 | else | 339 | else |
338 | /* Verdict */ | 340 | /* Verdict */ |
339 | break; | 341 | break; |
340 | } while (!acpar.hotdrop); | 342 | } while (!acpar.hotdrop); |
341 | xt_info_rdunlock_bh(); | 343 | xt_write_recseq_end(addend); |
344 | local_bh_enable(); | ||
342 | 345 | ||
343 | if (acpar.hotdrop) | 346 | if (acpar.hotdrop) |
344 | return NF_DROP; | 347 | return NF_DROP; |
@@ -377,7 +380,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo, | |||
377 | e->counters.pcnt = pos; | 380 | e->counters.pcnt = pos; |
378 | 381 | ||
379 | for (;;) { | 382 | for (;;) { |
380 | const struct arpt_standard_target *t | 383 | const struct xt_standard_target *t |
381 | = (void *)arpt_get_target_c(e); | 384 | = (void *)arpt_get_target_c(e); |
382 | int visited = e->comefrom & (1 << hook); | 385 | int visited = e->comefrom & (1 << hook); |
383 | 386 | ||
@@ -392,13 +395,13 @@ static int mark_source_chains(const struct xt_table_info *newinfo, | |||
392 | /* Unconditional return/END. */ | 395 | /* Unconditional return/END. */ |
393 | if ((e->target_offset == sizeof(struct arpt_entry) && | 396 | if ((e->target_offset == sizeof(struct arpt_entry) && |
394 | (strcmp(t->target.u.user.name, | 397 | (strcmp(t->target.u.user.name, |
395 | ARPT_STANDARD_TARGET) == 0) && | 398 | XT_STANDARD_TARGET) == 0) && |
396 | t->verdict < 0 && unconditional(&e->arp)) || | 399 | t->verdict < 0 && unconditional(&e->arp)) || |
397 | visited) { | 400 | visited) { |
398 | unsigned int oldpos, size; | 401 | unsigned int oldpos, size; |
399 | 402 | ||
400 | if ((strcmp(t->target.u.user.name, | 403 | if ((strcmp(t->target.u.user.name, |
401 | ARPT_STANDARD_TARGET) == 0) && | 404 | XT_STANDARD_TARGET) == 0) && |
402 | t->verdict < -NF_MAX_VERDICT - 1) { | 405 | t->verdict < -NF_MAX_VERDICT - 1) { |
403 | duprintf("mark_source_chains: bad " | 406 | duprintf("mark_source_chains: bad " |
404 | "negative verdict (%i)\n", | 407 | "negative verdict (%i)\n", |
@@ -433,7 +436,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo, | |||
433 | int newpos = t->verdict; | 436 | int newpos = t->verdict; |
434 | 437 | ||
435 | if (strcmp(t->target.u.user.name, | 438 | if (strcmp(t->target.u.user.name, |
436 | ARPT_STANDARD_TARGET) == 0 && | 439 | XT_STANDARD_TARGET) == 0 && |
437 | newpos >= 0) { | 440 | newpos >= 0) { |
438 | if (newpos > newinfo->size - | 441 | if (newpos > newinfo->size - |
439 | sizeof(struct arpt_entry)) { | 442 | sizeof(struct arpt_entry)) { |
@@ -464,14 +467,14 @@ static int mark_source_chains(const struct xt_table_info *newinfo, | |||
464 | 467 | ||
465 | static inline int check_entry(const struct arpt_entry *e, const char *name) | 468 | static inline int check_entry(const struct arpt_entry *e, const char *name) |
466 | { | 469 | { |
467 | const struct arpt_entry_target *t; | 470 | const struct xt_entry_target *t; |
468 | 471 | ||
469 | if (!arp_checkentry(&e->arp)) { | 472 | if (!arp_checkentry(&e->arp)) { |
470 | duprintf("arp_tables: arp check failed %p %s.\n", e, name); | 473 | duprintf("arp_tables: arp check failed %p %s.\n", e, name); |
471 | return -EINVAL; | 474 | return -EINVAL; |
472 | } | 475 | } |
473 | 476 | ||
474 | if (e->target_offset + sizeof(struct arpt_entry_target) > e->next_offset) | 477 | if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) |
475 | return -EINVAL; | 478 | return -EINVAL; |
476 | 479 | ||
477 | t = arpt_get_target_c(e); | 480 | t = arpt_get_target_c(e); |
@@ -483,7 +486,7 @@ static inline int check_entry(const struct arpt_entry *e, const char *name) | |||
483 | 486 | ||
484 | static inline int check_target(struct arpt_entry *e, const char *name) | 487 | static inline int check_target(struct arpt_entry *e, const char *name) |
485 | { | 488 | { |
486 | struct arpt_entry_target *t = arpt_get_target(e); | 489 | struct xt_entry_target *t = arpt_get_target(e); |
487 | int ret; | 490 | int ret; |
488 | struct xt_tgchk_param par = { | 491 | struct xt_tgchk_param par = { |
489 | .table = name, | 492 | .table = name, |
@@ -506,7 +509,7 @@ static inline int check_target(struct arpt_entry *e, const char *name) | |||
506 | static inline int | 509 | static inline int |
507 | find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) | 510 | find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) |
508 | { | 511 | { |
509 | struct arpt_entry_target *t; | 512 | struct xt_entry_target *t; |
510 | struct xt_target *target; | 513 | struct xt_target *target; |
511 | int ret; | 514 | int ret; |
512 | 515 | ||
@@ -536,7 +539,7 @@ out: | |||
536 | 539 | ||
537 | static bool check_underflow(const struct arpt_entry *e) | 540 | static bool check_underflow(const struct arpt_entry *e) |
538 | { | 541 | { |
539 | const struct arpt_entry_target *t; | 542 | const struct xt_entry_target *t; |
540 | unsigned int verdict; | 543 | unsigned int verdict; |
541 | 544 | ||
542 | if (!unconditional(&e->arp)) | 545 | if (!unconditional(&e->arp)) |
@@ -544,7 +547,7 @@ static bool check_underflow(const struct arpt_entry *e) | |||
544 | t = arpt_get_target_c(e); | 547 | t = arpt_get_target_c(e); |
545 | if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) | 548 | if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) |
546 | return false; | 549 | return false; |
547 | verdict = ((struct arpt_standard_target *)t)->verdict; | 550 | verdict = ((struct xt_standard_target *)t)->verdict; |
548 | verdict = -verdict - 1; | 551 | verdict = -verdict - 1; |
549 | return verdict == NF_DROP || verdict == NF_ACCEPT; | 552 | return verdict == NF_DROP || verdict == NF_ACCEPT; |
550 | } | 553 | } |
@@ -566,7 +569,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, | |||
566 | } | 569 | } |
567 | 570 | ||
568 | if (e->next_offset | 571 | if (e->next_offset |
569 | < sizeof(struct arpt_entry) + sizeof(struct arpt_entry_target)) { | 572 | < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) { |
570 | duprintf("checking: element %p size %u\n", | 573 | duprintf("checking: element %p size %u\n", |
571 | e, e->next_offset); | 574 | e, e->next_offset); |
572 | return -EINVAL; | 575 | return -EINVAL; |
@@ -598,7 +601,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, | |||
598 | static inline void cleanup_entry(struct arpt_entry *e) | 601 | static inline void cleanup_entry(struct arpt_entry *e) |
599 | { | 602 | { |
600 | struct xt_tgdtor_param par; | 603 | struct xt_tgdtor_param par; |
601 | struct arpt_entry_target *t; | 604 | struct xt_entry_target *t; |
602 | 605 | ||
603 | t = arpt_get_target(e); | 606 | t = arpt_get_target(e); |
604 | par.target = t->u.kernel.target; | 607 | par.target = t->u.kernel.target; |
@@ -710,42 +713,25 @@ static void get_counters(const struct xt_table_info *t, | |||
710 | struct arpt_entry *iter; | 713 | struct arpt_entry *iter; |
711 | unsigned int cpu; | 714 | unsigned int cpu; |
712 | unsigned int i; | 715 | unsigned int i; |
713 | unsigned int curcpu = get_cpu(); | ||
714 | |||
715 | /* Instead of clearing (by a previous call to memset()) | ||
716 | * the counters and using adds, we set the counters | ||
717 | * with data used by 'current' CPU | ||
718 | * | ||
719 | * Bottom half has to be disabled to prevent deadlock | ||
720 | * if new softirq were to run and call ipt_do_table | ||
721 | */ | ||
722 | local_bh_disable(); | ||
723 | i = 0; | ||
724 | xt_entry_foreach(iter, t->entries[curcpu], t->size) { | ||
725 | SET_COUNTER(counters[i], iter->counters.bcnt, | ||
726 | iter->counters.pcnt); | ||
727 | ++i; | ||
728 | } | ||
729 | local_bh_enable(); | ||
730 | /* Processing counters from other cpus, we can let bottom half enabled, | ||
731 | * (preemption is disabled) | ||
732 | */ | ||
733 | 716 | ||
734 | for_each_possible_cpu(cpu) { | 717 | for_each_possible_cpu(cpu) { |
735 | if (cpu == curcpu) | 718 | seqcount_t *s = &per_cpu(xt_recseq, cpu); |
736 | continue; | 719 | |
737 | i = 0; | 720 | i = 0; |
738 | local_bh_disable(); | ||
739 | xt_info_wrlock(cpu); | ||
740 | xt_entry_foreach(iter, t->entries[cpu], t->size) { | 721 | xt_entry_foreach(iter, t->entries[cpu], t->size) { |
741 | ADD_COUNTER(counters[i], iter->counters.bcnt, | 722 | u64 bcnt, pcnt; |
742 | iter->counters.pcnt); | 723 | unsigned int start; |
724 | |||
725 | do { | ||
726 | start = read_seqcount_begin(s); | ||
727 | bcnt = iter->counters.bcnt; | ||
728 | pcnt = iter->counters.pcnt; | ||
729 | } while (read_seqcount_retry(s, start)); | ||
730 | |||
731 | ADD_COUNTER(counters[i], bcnt, pcnt); | ||
743 | ++i; | 732 | ++i; |
744 | } | 733 | } |
745 | xt_info_wrunlock(cpu); | ||
746 | local_bh_enable(); | ||
747 | } | 734 | } |
748 | put_cpu(); | ||
749 | } | 735 | } |
750 | 736 | ||
751 | static struct xt_counters *alloc_counters(const struct xt_table *table) | 737 | static struct xt_counters *alloc_counters(const struct xt_table *table) |
@@ -759,7 +745,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) | |||
759 | * about). | 745 | * about). |
760 | */ | 746 | */ |
761 | countersize = sizeof(struct xt_counters) * private->number; | 747 | countersize = sizeof(struct xt_counters) * private->number; |
762 | counters = vmalloc(countersize); | 748 | counters = vzalloc(countersize); |
763 | 749 | ||
764 | if (counters == NULL) | 750 | if (counters == NULL) |
765 | return ERR_PTR(-ENOMEM); | 751 | return ERR_PTR(-ENOMEM); |
@@ -794,7 +780,7 @@ static int copy_entries_to_user(unsigned int total_size, | |||
794 | /* FIXME: use iterator macros --RR */ | 780 | /* FIXME: use iterator macros --RR */ |
795 | /* ... then go back and fix counters and names */ | 781 | /* ... then go back and fix counters and names */ |
796 | for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ | 782 | for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ |
797 | const struct arpt_entry_target *t; | 783 | const struct xt_entry_target *t; |
798 | 784 | ||
799 | e = (struct arpt_entry *)(loc_cpu_entry + off); | 785 | e = (struct arpt_entry *)(loc_cpu_entry + off); |
800 | if (copy_to_user(userptr + off | 786 | if (copy_to_user(userptr + off |
@@ -807,7 +793,7 @@ static int copy_entries_to_user(unsigned int total_size, | |||
807 | 793 | ||
808 | t = arpt_get_target_c(e); | 794 | t = arpt_get_target_c(e); |
809 | if (copy_to_user(userptr + off + e->target_offset | 795 | if (copy_to_user(userptr + off + e->target_offset |
810 | + offsetof(struct arpt_entry_target, | 796 | + offsetof(struct xt_entry_target, |
811 | u.user.name), | 797 | u.user.name), |
812 | t->u.kernel.target->name, | 798 | t->u.kernel.target->name, |
813 | strlen(t->u.kernel.target->name)+1) != 0) { | 799 | strlen(t->u.kernel.target->name)+1) != 0) { |
@@ -844,7 +830,7 @@ static int compat_calc_entry(const struct arpt_entry *e, | |||
844 | const struct xt_table_info *info, | 830 | const struct xt_table_info *info, |
845 | const void *base, struct xt_table_info *newinfo) | 831 | const void *base, struct xt_table_info *newinfo) |
846 | { | 832 | { |
847 | const struct arpt_entry_target *t; | 833 | const struct xt_entry_target *t; |
848 | unsigned int entry_offset; | 834 | unsigned int entry_offset; |
849 | int off, i, ret; | 835 | int off, i, ret; |
850 | 836 | ||
@@ -883,6 +869,7 @@ static int compat_table_info(const struct xt_table_info *info, | |||
883 | memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); | 869 | memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); |
884 | newinfo->initial_entries = 0; | 870 | newinfo->initial_entries = 0; |
885 | loc_cpu_entry = info->entries[raw_smp_processor_id()]; | 871 | loc_cpu_entry = info->entries[raw_smp_processor_id()]; |
872 | xt_compat_init_offsets(NFPROTO_ARP, info->number); | ||
886 | xt_entry_foreach(iter, loc_cpu_entry, info->size) { | 873 | xt_entry_foreach(iter, loc_cpu_entry, info->size) { |
887 | ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); | 874 | ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); |
888 | if (ret != 0) | 875 | if (ret != 0) |
@@ -895,7 +882,7 @@ static int compat_table_info(const struct xt_table_info *info, | |||
895 | static int get_info(struct net *net, void __user *user, | 882 | static int get_info(struct net *net, void __user *user, |
896 | const int *len, int compat) | 883 | const int *len, int compat) |
897 | { | 884 | { |
898 | char name[ARPT_TABLE_MAXNAMELEN]; | 885 | char name[XT_TABLE_MAXNAMELEN]; |
899 | struct xt_table *t; | 886 | struct xt_table *t; |
900 | int ret; | 887 | int ret; |
901 | 888 | ||
@@ -908,7 +895,7 @@ static int get_info(struct net *net, void __user *user, | |||
908 | if (copy_from_user(name, user, sizeof(name)) != 0) | 895 | if (copy_from_user(name, user, sizeof(name)) != 0) |
909 | return -EFAULT; | 896 | return -EFAULT; |
910 | 897 | ||
911 | name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; | 898 | name[XT_TABLE_MAXNAMELEN-1] = '\0'; |
912 | #ifdef CONFIG_COMPAT | 899 | #ifdef CONFIG_COMPAT |
913 | if (compat) | 900 | if (compat) |
914 | xt_compat_lock(NFPROTO_ARP); | 901 | xt_compat_lock(NFPROTO_ARP); |
@@ -927,6 +914,7 @@ static int get_info(struct net *net, void __user *user, | |||
927 | private = &tmp; | 914 | private = &tmp; |
928 | } | 915 | } |
929 | #endif | 916 | #endif |
917 | memset(&info, 0, sizeof(info)); | ||
930 | info.valid_hooks = t->valid_hooks; | 918 | info.valid_hooks = t->valid_hooks; |
931 | memcpy(info.hook_entry, private->hook_entry, | 919 | memcpy(info.hook_entry, private->hook_entry, |
932 | sizeof(info.hook_entry)); | 920 | sizeof(info.hook_entry)); |
@@ -1006,7 +994,7 @@ static int __do_replace(struct net *net, const char *name, | |||
1006 | struct arpt_entry *iter; | 994 | struct arpt_entry *iter; |
1007 | 995 | ||
1008 | ret = 0; | 996 | ret = 0; |
1009 | counters = vmalloc(num_counters * sizeof(struct xt_counters)); | 997 | counters = vzalloc(num_counters * sizeof(struct xt_counters)); |
1010 | if (!counters) { | 998 | if (!counters) { |
1011 | ret = -ENOMEM; | 999 | ret = -ENOMEM; |
1012 | goto out; | 1000 | goto out; |
@@ -1081,6 +1069,7 @@ static int do_replace(struct net *net, const void __user *user, | |||
1081 | /* overflow check */ | 1069 | /* overflow check */ |
1082 | if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) | 1070 | if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) |
1083 | return -ENOMEM; | 1071 | return -ENOMEM; |
1072 | tmp.name[sizeof(tmp.name)-1] = 0; | ||
1084 | 1073 | ||
1085 | newinfo = xt_alloc_table_info(tmp.size); | 1074 | newinfo = xt_alloc_table_info(tmp.size); |
1086 | if (!newinfo) | 1075 | if (!newinfo) |
@@ -1129,6 +1118,7 @@ static int do_add_counters(struct net *net, const void __user *user, | |||
1129 | int ret = 0; | 1118 | int ret = 0; |
1130 | void *loc_cpu_entry; | 1119 | void *loc_cpu_entry; |
1131 | struct arpt_entry *iter; | 1120 | struct arpt_entry *iter; |
1121 | unsigned int addend; | ||
1132 | #ifdef CONFIG_COMPAT | 1122 | #ifdef CONFIG_COMPAT |
1133 | struct compat_xt_counters_info compat_tmp; | 1123 | struct compat_xt_counters_info compat_tmp; |
1134 | 1124 | ||
@@ -1185,12 +1175,12 @@ static int do_add_counters(struct net *net, const void __user *user, | |||
1185 | /* Choose the copy that is on our node */ | 1175 | /* Choose the copy that is on our node */ |
1186 | curcpu = smp_processor_id(); | 1176 | curcpu = smp_processor_id(); |
1187 | loc_cpu_entry = private->entries[curcpu]; | 1177 | loc_cpu_entry = private->entries[curcpu]; |
1188 | xt_info_wrlock(curcpu); | 1178 | addend = xt_write_recseq_begin(); |
1189 | xt_entry_foreach(iter, loc_cpu_entry, private->size) { | 1179 | xt_entry_foreach(iter, loc_cpu_entry, private->size) { |
1190 | ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); | 1180 | ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); |
1191 | ++i; | 1181 | ++i; |
1192 | } | 1182 | } |
1193 | xt_info_wrunlock(curcpu); | 1183 | xt_write_recseq_end(addend); |
1194 | unlock_up_free: | 1184 | unlock_up_free: |
1195 | local_bh_enable(); | 1185 | local_bh_enable(); |
1196 | xt_table_unlock(t); | 1186 | xt_table_unlock(t); |
@@ -1204,7 +1194,7 @@ static int do_add_counters(struct net *net, const void __user *user, | |||
1204 | #ifdef CONFIG_COMPAT | 1194 | #ifdef CONFIG_COMPAT |
1205 | static inline void compat_release_entry(struct compat_arpt_entry *e) | 1195 | static inline void compat_release_entry(struct compat_arpt_entry *e) |
1206 | { | 1196 | { |
1207 | struct arpt_entry_target *t; | 1197 | struct xt_entry_target *t; |
1208 | 1198 | ||
1209 | t = compat_arpt_get_target(e); | 1199 | t = compat_arpt_get_target(e); |
1210 | module_put(t->u.kernel.target->me); | 1200 | module_put(t->u.kernel.target->me); |
@@ -1220,7 +1210,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, | |||
1220 | const unsigned int *underflows, | 1210 | const unsigned int *underflows, |
1221 | const char *name) | 1211 | const char *name) |
1222 | { | 1212 | { |
1223 | struct arpt_entry_target *t; | 1213 | struct xt_entry_target *t; |
1224 | struct xt_target *target; | 1214 | struct xt_target *target; |
1225 | unsigned int entry_offset; | 1215 | unsigned int entry_offset; |
1226 | int ret, off, h; | 1216 | int ret, off, h; |
@@ -1288,7 +1278,7 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr, | |||
1288 | unsigned int *size, const char *name, | 1278 | unsigned int *size, const char *name, |
1289 | struct xt_table_info *newinfo, unsigned char *base) | 1279 | struct xt_table_info *newinfo, unsigned char *base) |
1290 | { | 1280 | { |
1291 | struct arpt_entry_target *t; | 1281 | struct xt_entry_target *t; |
1292 | struct xt_target *target; | 1282 | struct xt_target *target; |
1293 | struct arpt_entry *de; | 1283 | struct arpt_entry *de; |
1294 | unsigned int origsize; | 1284 | unsigned int origsize; |
@@ -1349,6 +1339,7 @@ static int translate_compat_table(const char *name, | |||
1349 | duprintf("translate_compat_table: size %u\n", info->size); | 1339 | duprintf("translate_compat_table: size %u\n", info->size); |
1350 | j = 0; | 1340 | j = 0; |
1351 | xt_compat_lock(NFPROTO_ARP); | 1341 | xt_compat_lock(NFPROTO_ARP); |
1342 | xt_compat_init_offsets(NFPROTO_ARP, number); | ||
1352 | /* Walk through entries, checking offsets. */ | 1343 | /* Walk through entries, checking offsets. */ |
1353 | xt_entry_foreach(iter0, entry0, total_size) { | 1344 | xt_entry_foreach(iter0, entry0, total_size) { |
1354 | ret = check_compat_entry_size_and_hooks(iter0, info, &size, | 1345 | ret = check_compat_entry_size_and_hooks(iter0, info, &size, |
@@ -1474,7 +1465,7 @@ out_unlock: | |||
1474 | } | 1465 | } |
1475 | 1466 | ||
1476 | struct compat_arpt_replace { | 1467 | struct compat_arpt_replace { |
1477 | char name[ARPT_TABLE_MAXNAMELEN]; | 1468 | char name[XT_TABLE_MAXNAMELEN]; |
1478 | u32 valid_hooks; | 1469 | u32 valid_hooks; |
1479 | u32 num_entries; | 1470 | u32 num_entries; |
1480 | u32 size; | 1471 | u32 size; |
@@ -1502,6 +1493,7 @@ static int compat_do_replace(struct net *net, void __user *user, | |||
1502 | return -ENOMEM; | 1493 | return -ENOMEM; |
1503 | if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) | 1494 | if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) |
1504 | return -ENOMEM; | 1495 | return -ENOMEM; |
1496 | tmp.name[sizeof(tmp.name)-1] = 0; | ||
1505 | 1497 | ||
1506 | newinfo = xt_alloc_table_info(tmp.size); | 1498 | newinfo = xt_alloc_table_info(tmp.size); |
1507 | if (!newinfo) | 1499 | if (!newinfo) |
@@ -1567,7 +1559,7 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr, | |||
1567 | struct xt_counters *counters, | 1559 | struct xt_counters *counters, |
1568 | unsigned int i) | 1560 | unsigned int i) |
1569 | { | 1561 | { |
1570 | struct arpt_entry_target *t; | 1562 | struct xt_entry_target *t; |
1571 | struct compat_arpt_entry __user *ce; | 1563 | struct compat_arpt_entry __user *ce; |
1572 | u_int16_t target_offset, next_offset; | 1564 | u_int16_t target_offset, next_offset; |
1573 | compat_uint_t origsize; | 1565 | compat_uint_t origsize; |
@@ -1628,7 +1620,7 @@ static int compat_copy_entries_to_user(unsigned int total_size, | |||
1628 | } | 1620 | } |
1629 | 1621 | ||
1630 | struct compat_arpt_get_entries { | 1622 | struct compat_arpt_get_entries { |
1631 | char name[ARPT_TABLE_MAXNAMELEN]; | 1623 | char name[XT_TABLE_MAXNAMELEN]; |
1632 | compat_uint_t size; | 1624 | compat_uint_t size; |
1633 | struct compat_arpt_entry entrytable[0]; | 1625 | struct compat_arpt_entry entrytable[0]; |
1634 | }; | 1626 | }; |
@@ -1754,6 +1746,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len | |||
1754 | ret = -EFAULT; | 1746 | ret = -EFAULT; |
1755 | break; | 1747 | break; |
1756 | } | 1748 | } |
1749 | rev.name[sizeof(rev.name)-1] = 0; | ||
1757 | 1750 | ||
1758 | try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name, | 1751 | try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name, |
1759 | rev.revision, 1, &ret), | 1752 | rev.revision, 1, &ret), |
@@ -1828,7 +1821,7 @@ void arpt_unregister_table(struct xt_table *table) | |||
1828 | /* The built-in targets: standard (NULL) and error. */ | 1821 | /* The built-in targets: standard (NULL) and error. */ |
1829 | static struct xt_target arpt_builtin_tg[] __read_mostly = { | 1822 | static struct xt_target arpt_builtin_tg[] __read_mostly = { |
1830 | { | 1823 | { |
1831 | .name = ARPT_STANDARD_TARGET, | 1824 | .name = XT_STANDARD_TARGET, |
1832 | .targetsize = sizeof(int), | 1825 | .targetsize = sizeof(int), |
1833 | .family = NFPROTO_ARP, | 1826 | .family = NFPROTO_ARP, |
1834 | #ifdef CONFIG_COMPAT | 1827 | #ifdef CONFIG_COMPAT |
@@ -1838,9 +1831,9 @@ static struct xt_target arpt_builtin_tg[] __read_mostly = { | |||
1838 | #endif | 1831 | #endif |
1839 | }, | 1832 | }, |
1840 | { | 1833 | { |
1841 | .name = ARPT_ERROR_TARGET, | 1834 | .name = XT_ERROR_TARGET, |
1842 | .target = arpt_error, | 1835 | .target = arpt_error, |
1843 | .targetsize = ARPT_FUNCTION_MAXNAMELEN, | 1836 | .targetsize = XT_FUNCTION_MAXNAMELEN, |
1844 | .family = NFPROTO_ARP, | 1837 | .family = NFPROTO_ARP, |
1845 | }, | 1838 | }, |
1846 | }; | 1839 | }; |
@@ -1885,7 +1878,7 @@ static int __init arp_tables_init(void) | |||
1885 | if (ret < 0) | 1878 | if (ret < 0) |
1886 | goto err1; | 1879 | goto err1; |
1887 | 1880 | ||
1888 | /* Noone else will be downing sem now, so we won't sleep */ | 1881 | /* No one else will be downing sem now, so we won't sleep */ |
1889 | ret = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg)); | 1882 | ret = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg)); |
1890 | if (ret < 0) | 1883 | if (ret < 0) |
1891 | goto err2; | 1884 | goto err2; |
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c index e1be7dd1171b..a5e52a9f0a12 100644 --- a/net/ipv4/netfilter/arpt_mangle.c +++ b/net/ipv4/netfilter/arpt_mangle.c | |||
@@ -60,12 +60,12 @@ static int checkentry(const struct xt_tgchk_param *par) | |||
60 | 60 | ||
61 | if (mangle->flags & ~ARPT_MANGLE_MASK || | 61 | if (mangle->flags & ~ARPT_MANGLE_MASK || |
62 | !(mangle->flags & ARPT_MANGLE_MASK)) | 62 | !(mangle->flags & ARPT_MANGLE_MASK)) |
63 | return false; | 63 | return -EINVAL; |
64 | 64 | ||
65 | if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT && | 65 | if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT && |
66 | mangle->target != ARPT_CONTINUE) | 66 | mangle->target != XT_CONTINUE) |
67 | return false; | 67 | return -EINVAL; |
68 | return true; | 68 | return 0; |
69 | } | 69 | } |
70 | 70 | ||
71 | static struct xt_target arpt_mangle_reg __read_mostly = { | 71 | static struct xt_target arpt_mangle_reg __read_mostly = { |
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index d2c1311cb28d..5c9b9d963918 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c | |||
@@ -203,7 +203,8 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) | |||
203 | else | 203 | else |
204 | pmsg->outdev_name[0] = '\0'; | 204 | pmsg->outdev_name[0] = '\0'; |
205 | 205 | ||
206 | if (entry->indev && entry->skb->dev) { | 206 | if (entry->indev && entry->skb->dev && |
207 | entry->skb->mac_header != entry->skb->network_header) { | ||
207 | pmsg->hw_type = entry->skb->dev->type; | 208 | pmsg->hw_type = entry->skb->dev->type; |
208 | pmsg->hw_addrlen = dev_parse_header(entry->skb, | 209 | pmsg->hw_addrlen = dev_parse_header(entry->skb, |
209 | pmsg->hw_addr); | 210 | pmsg->hw_addr); |
@@ -402,7 +403,8 @@ ipq_dev_drop(int ifindex) | |||
402 | static inline void | 403 | static inline void |
403 | __ipq_rcv_skb(struct sk_buff *skb) | 404 | __ipq_rcv_skb(struct sk_buff *skb) |
404 | { | 405 | { |
405 | int status, type, pid, flags, nlmsglen, skblen; | 406 | int status, type, pid, flags; |
407 | unsigned int nlmsglen, skblen; | ||
406 | struct nlmsghdr *nlh; | 408 | struct nlmsghdr *nlh; |
407 | 409 | ||
408 | skblen = skb->len; | 410 | skblen = skb->len; |
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index d163f2e3b2e9..24e556e83a3b 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c | |||
@@ -68,15 +68,6 @@ void *ipt_alloc_initial_table(const struct xt_table *info) | |||
68 | } | 68 | } |
69 | EXPORT_SYMBOL_GPL(ipt_alloc_initial_table); | 69 | EXPORT_SYMBOL_GPL(ipt_alloc_initial_table); |
70 | 70 | ||
71 | /* | ||
72 | We keep a set of rules for each CPU, so we can avoid write-locking | ||
73 | them in the softirq when updating the counters and therefore | ||
74 | only need to read-lock in the softirq; doing a write_lock_bh() in user | ||
75 | context stops packets coming through and allows user context to read | ||
76 | the counters or update the rules. | ||
77 | |||
78 | Hence the start of any table is given by get_table() below. */ | ||
79 | |||
80 | /* Returns whether matches rule or not. */ | 71 | /* Returns whether matches rule or not. */ |
81 | /* Performance critical - called for every packet */ | 72 | /* Performance critical - called for every packet */ |
82 | static inline bool | 73 | static inline bool |
@@ -186,7 +177,7 @@ static inline bool unconditional(const struct ipt_ip *ip) | |||
186 | } | 177 | } |
187 | 178 | ||
188 | /* for const-correctness */ | 179 | /* for const-correctness */ |
189 | static inline const struct ipt_entry_target * | 180 | static inline const struct xt_entry_target * |
190 | ipt_get_target_c(const struct ipt_entry *e) | 181 | ipt_get_target_c(const struct ipt_entry *e) |
191 | { | 182 | { |
192 | return ipt_get_target((struct ipt_entry *)e); | 183 | return ipt_get_target((struct ipt_entry *)e); |
@@ -230,9 +221,9 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, | |||
230 | const char *hookname, const char **chainname, | 221 | const char *hookname, const char **chainname, |
231 | const char **comment, unsigned int *rulenum) | 222 | const char **comment, unsigned int *rulenum) |
232 | { | 223 | { |
233 | const struct ipt_standard_target *t = (void *)ipt_get_target_c(s); | 224 | const struct xt_standard_target *t = (void *)ipt_get_target_c(s); |
234 | 225 | ||
235 | if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) { | 226 | if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) { |
236 | /* Head of user chain: ERROR target with chainname */ | 227 | /* Head of user chain: ERROR target with chainname */ |
237 | *chainname = t->target.data; | 228 | *chainname = t->target.data; |
238 | (*rulenum) = 0; | 229 | (*rulenum) = 0; |
@@ -241,7 +232,7 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, | |||
241 | 232 | ||
242 | if (s->target_offset == sizeof(struct ipt_entry) && | 233 | if (s->target_offset == sizeof(struct ipt_entry) && |
243 | strcmp(t->target.u.kernel.target->name, | 234 | strcmp(t->target.u.kernel.target->name, |
244 | IPT_STANDARD_TARGET) == 0 && | 235 | XT_STANDARD_TARGET) == 0 && |
245 | t->verdict < 0 && | 236 | t->verdict < 0 && |
246 | unconditional(&s->ip)) { | 237 | unconditional(&s->ip)) { |
247 | /* Tail of chains: STANDARD target (return/policy) */ | 238 | /* Tail of chains: STANDARD target (return/policy) */ |
@@ -311,6 +302,7 @@ ipt_do_table(struct sk_buff *skb, | |||
311 | unsigned int *stackptr, origptr, cpu; | 302 | unsigned int *stackptr, origptr, cpu; |
312 | const struct xt_table_info *private; | 303 | const struct xt_table_info *private; |
313 | struct xt_action_param acpar; | 304 | struct xt_action_param acpar; |
305 | unsigned int addend; | ||
314 | 306 | ||
315 | /* Initialization */ | 307 | /* Initialization */ |
316 | ip = ip_hdr(skb); | 308 | ip = ip_hdr(skb); |
@@ -331,7 +323,8 @@ ipt_do_table(struct sk_buff *skb, | |||
331 | acpar.hooknum = hook; | 323 | acpar.hooknum = hook; |
332 | 324 | ||
333 | IP_NF_ASSERT(table->valid_hooks & (1 << hook)); | 325 | IP_NF_ASSERT(table->valid_hooks & (1 << hook)); |
334 | xt_info_rdlock_bh(); | 326 | local_bh_disable(); |
327 | addend = xt_write_recseq_begin(); | ||
335 | private = table->private; | 328 | private = table->private; |
336 | cpu = smp_processor_id(); | 329 | cpu = smp_processor_id(); |
337 | table_base = private->entries[cpu]; | 330 | table_base = private->entries[cpu]; |
@@ -346,7 +339,7 @@ ipt_do_table(struct sk_buff *skb, | |||
346 | get_entry(table_base, private->underflow[hook])); | 339 | get_entry(table_base, private->underflow[hook])); |
347 | 340 | ||
348 | do { | 341 | do { |
349 | const struct ipt_entry_target *t; | 342 | const struct xt_entry_target *t; |
350 | const struct xt_entry_match *ematch; | 343 | const struct xt_entry_match *ematch; |
351 | 344 | ||
352 | IP_NF_ASSERT(e); | 345 | IP_NF_ASSERT(e); |
@@ -380,14 +373,14 @@ ipt_do_table(struct sk_buff *skb, | |||
380 | if (!t->u.kernel.target->target) { | 373 | if (!t->u.kernel.target->target) { |
381 | int v; | 374 | int v; |
382 | 375 | ||
383 | v = ((struct ipt_standard_target *)t)->verdict; | 376 | v = ((struct xt_standard_target *)t)->verdict; |
384 | if (v < 0) { | 377 | if (v < 0) { |
385 | /* Pop from stack? */ | 378 | /* Pop from stack? */ |
386 | if (v != IPT_RETURN) { | 379 | if (v != XT_RETURN) { |
387 | verdict = (unsigned)(-v) - 1; | 380 | verdict = (unsigned)(-v) - 1; |
388 | break; | 381 | break; |
389 | } | 382 | } |
390 | if (*stackptr == 0) { | 383 | if (*stackptr <= origptr) { |
391 | e = get_entry(table_base, | 384 | e = get_entry(table_base, |
392 | private->underflow[hook]); | 385 | private->underflow[hook]); |
393 | pr_debug("Underflow (this is normal) " | 386 | pr_debug("Underflow (this is normal) " |
@@ -421,16 +414,18 @@ ipt_do_table(struct sk_buff *skb, | |||
421 | verdict = t->u.kernel.target->target(skb, &acpar); | 414 | verdict = t->u.kernel.target->target(skb, &acpar); |
422 | /* Target might have changed stuff. */ | 415 | /* Target might have changed stuff. */ |
423 | ip = ip_hdr(skb); | 416 | ip = ip_hdr(skb); |
424 | if (verdict == IPT_CONTINUE) | 417 | if (verdict == XT_CONTINUE) |
425 | e = ipt_next_entry(e); | 418 | e = ipt_next_entry(e); |
426 | else | 419 | else |
427 | /* Verdict */ | 420 | /* Verdict */ |
428 | break; | 421 | break; |
429 | } while (!acpar.hotdrop); | 422 | } while (!acpar.hotdrop); |
430 | xt_info_rdunlock_bh(); | ||
431 | pr_debug("Exiting %s; resetting sp from %u to %u\n", | 423 | pr_debug("Exiting %s; resetting sp from %u to %u\n", |
432 | __func__, *stackptr, origptr); | 424 | __func__, *stackptr, origptr); |
433 | *stackptr = origptr; | 425 | *stackptr = origptr; |
426 | xt_write_recseq_end(addend); | ||
427 | local_bh_enable(); | ||
428 | |||
434 | #ifdef DEBUG_ALLOW_ALL | 429 | #ifdef DEBUG_ALLOW_ALL |
435 | return NF_ACCEPT; | 430 | return NF_ACCEPT; |
436 | #else | 431 | #else |
@@ -461,7 +456,7 @@ mark_source_chains(const struct xt_table_info *newinfo, | |||
461 | e->counters.pcnt = pos; | 456 | e->counters.pcnt = pos; |
462 | 457 | ||
463 | for (;;) { | 458 | for (;;) { |
464 | const struct ipt_standard_target *t | 459 | const struct xt_standard_target *t |
465 | = (void *)ipt_get_target_c(e); | 460 | = (void *)ipt_get_target_c(e); |
466 | int visited = e->comefrom & (1 << hook); | 461 | int visited = e->comefrom & (1 << hook); |
467 | 462 | ||
@@ -475,13 +470,13 @@ mark_source_chains(const struct xt_table_info *newinfo, | |||
475 | /* Unconditional return/END. */ | 470 | /* Unconditional return/END. */ |
476 | if ((e->target_offset == sizeof(struct ipt_entry) && | 471 | if ((e->target_offset == sizeof(struct ipt_entry) && |
477 | (strcmp(t->target.u.user.name, | 472 | (strcmp(t->target.u.user.name, |
478 | IPT_STANDARD_TARGET) == 0) && | 473 | XT_STANDARD_TARGET) == 0) && |
479 | t->verdict < 0 && unconditional(&e->ip)) || | 474 | t->verdict < 0 && unconditional(&e->ip)) || |
480 | visited) { | 475 | visited) { |
481 | unsigned int oldpos, size; | 476 | unsigned int oldpos, size; |
482 | 477 | ||
483 | if ((strcmp(t->target.u.user.name, | 478 | if ((strcmp(t->target.u.user.name, |
484 | IPT_STANDARD_TARGET) == 0) && | 479 | XT_STANDARD_TARGET) == 0) && |
485 | t->verdict < -NF_MAX_VERDICT - 1) { | 480 | t->verdict < -NF_MAX_VERDICT - 1) { |
486 | duprintf("mark_source_chains: bad " | 481 | duprintf("mark_source_chains: bad " |
487 | "negative verdict (%i)\n", | 482 | "negative verdict (%i)\n", |
@@ -524,7 +519,7 @@ mark_source_chains(const struct xt_table_info *newinfo, | |||
524 | int newpos = t->verdict; | 519 | int newpos = t->verdict; |
525 | 520 | ||
526 | if (strcmp(t->target.u.user.name, | 521 | if (strcmp(t->target.u.user.name, |
527 | IPT_STANDARD_TARGET) == 0 && | 522 | XT_STANDARD_TARGET) == 0 && |
528 | newpos >= 0) { | 523 | newpos >= 0) { |
529 | if (newpos > newinfo->size - | 524 | if (newpos > newinfo->size - |
530 | sizeof(struct ipt_entry)) { | 525 | sizeof(struct ipt_entry)) { |
@@ -552,7 +547,7 @@ mark_source_chains(const struct xt_table_info *newinfo, | |||
552 | return 1; | 547 | return 1; |
553 | } | 548 | } |
554 | 549 | ||
555 | static void cleanup_match(struct ipt_entry_match *m, struct net *net) | 550 | static void cleanup_match(struct xt_entry_match *m, struct net *net) |
556 | { | 551 | { |
557 | struct xt_mtdtor_param par; | 552 | struct xt_mtdtor_param par; |
558 | 553 | ||
@@ -568,14 +563,14 @@ static void cleanup_match(struct ipt_entry_match *m, struct net *net) | |||
568 | static int | 563 | static int |
569 | check_entry(const struct ipt_entry *e, const char *name) | 564 | check_entry(const struct ipt_entry *e, const char *name) |
570 | { | 565 | { |
571 | const struct ipt_entry_target *t; | 566 | const struct xt_entry_target *t; |
572 | 567 | ||
573 | if (!ip_checkentry(&e->ip)) { | 568 | if (!ip_checkentry(&e->ip)) { |
574 | duprintf("ip check failed %p %s.\n", e, par->match->name); | 569 | duprintf("ip check failed %p %s.\n", e, name); |
575 | return -EINVAL; | 570 | return -EINVAL; |
576 | } | 571 | } |
577 | 572 | ||
578 | if (e->target_offset + sizeof(struct ipt_entry_target) > | 573 | if (e->target_offset + sizeof(struct xt_entry_target) > |
579 | e->next_offset) | 574 | e->next_offset) |
580 | return -EINVAL; | 575 | return -EINVAL; |
581 | 576 | ||
@@ -587,7 +582,7 @@ check_entry(const struct ipt_entry *e, const char *name) | |||
587 | } | 582 | } |
588 | 583 | ||
589 | static int | 584 | static int |
590 | check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par) | 585 | check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) |
591 | { | 586 | { |
592 | const struct ipt_ip *ip = par->entryinfo; | 587 | const struct ipt_ip *ip = par->entryinfo; |
593 | int ret; | 588 | int ret; |
@@ -605,7 +600,7 @@ check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par) | |||
605 | } | 600 | } |
606 | 601 | ||
607 | static int | 602 | static int |
608 | find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par) | 603 | find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) |
609 | { | 604 | { |
610 | struct xt_match *match; | 605 | struct xt_match *match; |
611 | int ret; | 606 | int ret; |
@@ -630,7 +625,7 @@ err: | |||
630 | 625 | ||
631 | static int check_target(struct ipt_entry *e, struct net *net, const char *name) | 626 | static int check_target(struct ipt_entry *e, struct net *net, const char *name) |
632 | { | 627 | { |
633 | struct ipt_entry_target *t = ipt_get_target(e); | 628 | struct xt_entry_target *t = ipt_get_target(e); |
634 | struct xt_tgchk_param par = { | 629 | struct xt_tgchk_param par = { |
635 | .net = net, | 630 | .net = net, |
636 | .table = name, | 631 | .table = name, |
@@ -656,7 +651,7 @@ static int | |||
656 | find_check_entry(struct ipt_entry *e, struct net *net, const char *name, | 651 | find_check_entry(struct ipt_entry *e, struct net *net, const char *name, |
657 | unsigned int size) | 652 | unsigned int size) |
658 | { | 653 | { |
659 | struct ipt_entry_target *t; | 654 | struct xt_entry_target *t; |
660 | struct xt_target *target; | 655 | struct xt_target *target; |
661 | int ret; | 656 | int ret; |
662 | unsigned int j; | 657 | unsigned int j; |
@@ -707,7 +702,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, | |||
707 | 702 | ||
708 | static bool check_underflow(const struct ipt_entry *e) | 703 | static bool check_underflow(const struct ipt_entry *e) |
709 | { | 704 | { |
710 | const struct ipt_entry_target *t; | 705 | const struct xt_entry_target *t; |
711 | unsigned int verdict; | 706 | unsigned int verdict; |
712 | 707 | ||
713 | if (!unconditional(&e->ip)) | 708 | if (!unconditional(&e->ip)) |
@@ -715,7 +710,7 @@ static bool check_underflow(const struct ipt_entry *e) | |||
715 | t = ipt_get_target_c(e); | 710 | t = ipt_get_target_c(e); |
716 | if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) | 711 | if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) |
717 | return false; | 712 | return false; |
718 | verdict = ((struct ipt_standard_target *)t)->verdict; | 713 | verdict = ((struct xt_standard_target *)t)->verdict; |
719 | verdict = -verdict - 1; | 714 | verdict = -verdict - 1; |
720 | return verdict == NF_DROP || verdict == NF_ACCEPT; | 715 | return verdict == NF_DROP || verdict == NF_ACCEPT; |
721 | } | 716 | } |
@@ -738,7 +733,7 @@ check_entry_size_and_hooks(struct ipt_entry *e, | |||
738 | } | 733 | } |
739 | 734 | ||
740 | if (e->next_offset | 735 | if (e->next_offset |
741 | < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) { | 736 | < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) { |
742 | duprintf("checking: element %p size %u\n", | 737 | duprintf("checking: element %p size %u\n", |
743 | e, e->next_offset); | 738 | e, e->next_offset); |
744 | return -EINVAL; | 739 | return -EINVAL; |
@@ -771,7 +766,7 @@ static void | |||
771 | cleanup_entry(struct ipt_entry *e, struct net *net) | 766 | cleanup_entry(struct ipt_entry *e, struct net *net) |
772 | { | 767 | { |
773 | struct xt_tgdtor_param par; | 768 | struct xt_tgdtor_param par; |
774 | struct ipt_entry_target *t; | 769 | struct xt_entry_target *t; |
775 | struct xt_entry_match *ematch; | 770 | struct xt_entry_match *ematch; |
776 | 771 | ||
777 | /* Cleanup all matches */ | 772 | /* Cleanup all matches */ |
@@ -884,42 +879,25 @@ get_counters(const struct xt_table_info *t, | |||
884 | struct ipt_entry *iter; | 879 | struct ipt_entry *iter; |
885 | unsigned int cpu; | 880 | unsigned int cpu; |
886 | unsigned int i; | 881 | unsigned int i; |
887 | unsigned int curcpu = get_cpu(); | ||
888 | |||
889 | /* Instead of clearing (by a previous call to memset()) | ||
890 | * the counters and using adds, we set the counters | ||
891 | * with data used by 'current' CPU. | ||
892 | * | ||
893 | * Bottom half has to be disabled to prevent deadlock | ||
894 | * if new softirq were to run and call ipt_do_table | ||
895 | */ | ||
896 | local_bh_disable(); | ||
897 | i = 0; | ||
898 | xt_entry_foreach(iter, t->entries[curcpu], t->size) { | ||
899 | SET_COUNTER(counters[i], iter->counters.bcnt, | ||
900 | iter->counters.pcnt); | ||
901 | ++i; | ||
902 | } | ||
903 | local_bh_enable(); | ||
904 | /* Processing counters from other cpus, we can let bottom half enabled, | ||
905 | * (preemption is disabled) | ||
906 | */ | ||
907 | 882 | ||
908 | for_each_possible_cpu(cpu) { | 883 | for_each_possible_cpu(cpu) { |
909 | if (cpu == curcpu) | 884 | seqcount_t *s = &per_cpu(xt_recseq, cpu); |
910 | continue; | 885 | |
911 | i = 0; | 886 | i = 0; |
912 | local_bh_disable(); | ||
913 | xt_info_wrlock(cpu); | ||
914 | xt_entry_foreach(iter, t->entries[cpu], t->size) { | 887 | xt_entry_foreach(iter, t->entries[cpu], t->size) { |
915 | ADD_COUNTER(counters[i], iter->counters.bcnt, | 888 | u64 bcnt, pcnt; |
916 | iter->counters.pcnt); | 889 | unsigned int start; |
890 | |||
891 | do { | ||
892 | start = read_seqcount_begin(s); | ||
893 | bcnt = iter->counters.bcnt; | ||
894 | pcnt = iter->counters.pcnt; | ||
895 | } while (read_seqcount_retry(s, start)); | ||
896 | |||
897 | ADD_COUNTER(counters[i], bcnt, pcnt); | ||
917 | ++i; /* macro does multi eval of i */ | 898 | ++i; /* macro does multi eval of i */ |
918 | } | 899 | } |
919 | xt_info_wrunlock(cpu); | ||
920 | local_bh_enable(); | ||
921 | } | 900 | } |
922 | put_cpu(); | ||
923 | } | 901 | } |
924 | 902 | ||
925 | static struct xt_counters *alloc_counters(const struct xt_table *table) | 903 | static struct xt_counters *alloc_counters(const struct xt_table *table) |
@@ -932,7 +910,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) | |||
932 | (other than comefrom, which userspace doesn't care | 910 | (other than comefrom, which userspace doesn't care |
933 | about). */ | 911 | about). */ |
934 | countersize = sizeof(struct xt_counters) * private->number; | 912 | countersize = sizeof(struct xt_counters) * private->number; |
935 | counters = vmalloc(countersize); | 913 | counters = vzalloc(countersize); |
936 | 914 | ||
937 | if (counters == NULL) | 915 | if (counters == NULL) |
938 | return ERR_PTR(-ENOMEM); | 916 | return ERR_PTR(-ENOMEM); |
@@ -972,8 +950,8 @@ copy_entries_to_user(unsigned int total_size, | |||
972 | /* ... then go back and fix counters and names */ | 950 | /* ... then go back and fix counters and names */ |
973 | for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ | 951 | for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ |
974 | unsigned int i; | 952 | unsigned int i; |
975 | const struct ipt_entry_match *m; | 953 | const struct xt_entry_match *m; |
976 | const struct ipt_entry_target *t; | 954 | const struct xt_entry_target *t; |
977 | 955 | ||
978 | e = (struct ipt_entry *)(loc_cpu_entry + off); | 956 | e = (struct ipt_entry *)(loc_cpu_entry + off); |
979 | if (copy_to_user(userptr + off | 957 | if (copy_to_user(userptr + off |
@@ -990,7 +968,7 @@ copy_entries_to_user(unsigned int total_size, | |||
990 | m = (void *)e + i; | 968 | m = (void *)e + i; |
991 | 969 | ||
992 | if (copy_to_user(userptr + off + i | 970 | if (copy_to_user(userptr + off + i |
993 | + offsetof(struct ipt_entry_match, | 971 | + offsetof(struct xt_entry_match, |
994 | u.user.name), | 972 | u.user.name), |
995 | m->u.kernel.match->name, | 973 | m->u.kernel.match->name, |
996 | strlen(m->u.kernel.match->name)+1) | 974 | strlen(m->u.kernel.match->name)+1) |
@@ -1002,7 +980,7 @@ copy_entries_to_user(unsigned int total_size, | |||
1002 | 980 | ||
1003 | t = ipt_get_target_c(e); | 981 | t = ipt_get_target_c(e); |
1004 | if (copy_to_user(userptr + off + e->target_offset | 982 | if (copy_to_user(userptr + off + e->target_offset |
1005 | + offsetof(struct ipt_entry_target, | 983 | + offsetof(struct xt_entry_target, |
1006 | u.user.name), | 984 | u.user.name), |
1007 | t->u.kernel.target->name, | 985 | t->u.kernel.target->name, |
1008 | strlen(t->u.kernel.target->name)+1) != 0) { | 986 | strlen(t->u.kernel.target->name)+1) != 0) { |
@@ -1040,7 +1018,7 @@ static int compat_calc_entry(const struct ipt_entry *e, | |||
1040 | const void *base, struct xt_table_info *newinfo) | 1018 | const void *base, struct xt_table_info *newinfo) |
1041 | { | 1019 | { |
1042 | const struct xt_entry_match *ematch; | 1020 | const struct xt_entry_match *ematch; |
1043 | const struct ipt_entry_target *t; | 1021 | const struct xt_entry_target *t; |
1044 | unsigned int entry_offset; | 1022 | unsigned int entry_offset; |
1045 | int off, i, ret; | 1023 | int off, i, ret; |
1046 | 1024 | ||
@@ -1080,6 +1058,7 @@ static int compat_table_info(const struct xt_table_info *info, | |||
1080 | memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); | 1058 | memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); |
1081 | newinfo->initial_entries = 0; | 1059 | newinfo->initial_entries = 0; |
1082 | loc_cpu_entry = info->entries[raw_smp_processor_id()]; | 1060 | loc_cpu_entry = info->entries[raw_smp_processor_id()]; |
1061 | xt_compat_init_offsets(AF_INET, info->number); | ||
1083 | xt_entry_foreach(iter, loc_cpu_entry, info->size) { | 1062 | xt_entry_foreach(iter, loc_cpu_entry, info->size) { |
1084 | ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); | 1063 | ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); |
1085 | if (ret != 0) | 1064 | if (ret != 0) |
@@ -1092,7 +1071,7 @@ static int compat_table_info(const struct xt_table_info *info, | |||
1092 | static int get_info(struct net *net, void __user *user, | 1071 | static int get_info(struct net *net, void __user *user, |
1093 | const int *len, int compat) | 1072 | const int *len, int compat) |
1094 | { | 1073 | { |
1095 | char name[IPT_TABLE_MAXNAMELEN]; | 1074 | char name[XT_TABLE_MAXNAMELEN]; |
1096 | struct xt_table *t; | 1075 | struct xt_table *t; |
1097 | int ret; | 1076 | int ret; |
1098 | 1077 | ||
@@ -1105,7 +1084,7 @@ static int get_info(struct net *net, void __user *user, | |||
1105 | if (copy_from_user(name, user, sizeof(name)) != 0) | 1084 | if (copy_from_user(name, user, sizeof(name)) != 0) |
1106 | return -EFAULT; | 1085 | return -EFAULT; |
1107 | 1086 | ||
1108 | name[IPT_TABLE_MAXNAMELEN-1] = '\0'; | 1087 | name[XT_TABLE_MAXNAMELEN-1] = '\0'; |
1109 | #ifdef CONFIG_COMPAT | 1088 | #ifdef CONFIG_COMPAT |
1110 | if (compat) | 1089 | if (compat) |
1111 | xt_compat_lock(AF_INET); | 1090 | xt_compat_lock(AF_INET); |
@@ -1124,6 +1103,7 @@ static int get_info(struct net *net, void __user *user, | |||
1124 | private = &tmp; | 1103 | private = &tmp; |
1125 | } | 1104 | } |
1126 | #endif | 1105 | #endif |
1106 | memset(&info, 0, sizeof(info)); | ||
1127 | info.valid_hooks = t->valid_hooks; | 1107 | info.valid_hooks = t->valid_hooks; |
1128 | memcpy(info.hook_entry, private->hook_entry, | 1108 | memcpy(info.hook_entry, private->hook_entry, |
1129 | sizeof(info.hook_entry)); | 1109 | sizeof(info.hook_entry)); |
@@ -1202,7 +1182,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, | |||
1202 | struct ipt_entry *iter; | 1182 | struct ipt_entry *iter; |
1203 | 1183 | ||
1204 | ret = 0; | 1184 | ret = 0; |
1205 | counters = vmalloc(num_counters * sizeof(struct xt_counters)); | 1185 | counters = vzalloc(num_counters * sizeof(struct xt_counters)); |
1206 | if (!counters) { | 1186 | if (!counters) { |
1207 | ret = -ENOMEM; | 1187 | ret = -ENOMEM; |
1208 | goto out; | 1188 | goto out; |
@@ -1277,6 +1257,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) | |||
1277 | /* overflow check */ | 1257 | /* overflow check */ |
1278 | if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) | 1258 | if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) |
1279 | return -ENOMEM; | 1259 | return -ENOMEM; |
1260 | tmp.name[sizeof(tmp.name)-1] = 0; | ||
1280 | 1261 | ||
1281 | newinfo = xt_alloc_table_info(tmp.size); | 1262 | newinfo = xt_alloc_table_info(tmp.size); |
1282 | if (!newinfo) | 1263 | if (!newinfo) |
@@ -1326,6 +1307,7 @@ do_add_counters(struct net *net, const void __user *user, | |||
1326 | int ret = 0; | 1307 | int ret = 0; |
1327 | void *loc_cpu_entry; | 1308 | void *loc_cpu_entry; |
1328 | struct ipt_entry *iter; | 1309 | struct ipt_entry *iter; |
1310 | unsigned int addend; | ||
1329 | #ifdef CONFIG_COMPAT | 1311 | #ifdef CONFIG_COMPAT |
1330 | struct compat_xt_counters_info compat_tmp; | 1312 | struct compat_xt_counters_info compat_tmp; |
1331 | 1313 | ||
@@ -1382,12 +1364,12 @@ do_add_counters(struct net *net, const void __user *user, | |||
1382 | /* Choose the copy that is on our node */ | 1364 | /* Choose the copy that is on our node */ |
1383 | curcpu = smp_processor_id(); | 1365 | curcpu = smp_processor_id(); |
1384 | loc_cpu_entry = private->entries[curcpu]; | 1366 | loc_cpu_entry = private->entries[curcpu]; |
1385 | xt_info_wrlock(curcpu); | 1367 | addend = xt_write_recseq_begin(); |
1386 | xt_entry_foreach(iter, loc_cpu_entry, private->size) { | 1368 | xt_entry_foreach(iter, loc_cpu_entry, private->size) { |
1387 | ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); | 1369 | ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); |
1388 | ++i; | 1370 | ++i; |
1389 | } | 1371 | } |
1390 | xt_info_wrunlock(curcpu); | 1372 | xt_write_recseq_end(addend); |
1391 | unlock_up_free: | 1373 | unlock_up_free: |
1392 | local_bh_enable(); | 1374 | local_bh_enable(); |
1393 | xt_table_unlock(t); | 1375 | xt_table_unlock(t); |
@@ -1400,14 +1382,14 @@ do_add_counters(struct net *net, const void __user *user, | |||
1400 | 1382 | ||
1401 | #ifdef CONFIG_COMPAT | 1383 | #ifdef CONFIG_COMPAT |
1402 | struct compat_ipt_replace { | 1384 | struct compat_ipt_replace { |
1403 | char name[IPT_TABLE_MAXNAMELEN]; | 1385 | char name[XT_TABLE_MAXNAMELEN]; |
1404 | u32 valid_hooks; | 1386 | u32 valid_hooks; |
1405 | u32 num_entries; | 1387 | u32 num_entries; |
1406 | u32 size; | 1388 | u32 size; |
1407 | u32 hook_entry[NF_INET_NUMHOOKS]; | 1389 | u32 hook_entry[NF_INET_NUMHOOKS]; |
1408 | u32 underflow[NF_INET_NUMHOOKS]; | 1390 | u32 underflow[NF_INET_NUMHOOKS]; |
1409 | u32 num_counters; | 1391 | u32 num_counters; |
1410 | compat_uptr_t counters; /* struct ipt_counters * */ | 1392 | compat_uptr_t counters; /* struct xt_counters * */ |
1411 | struct compat_ipt_entry entries[0]; | 1393 | struct compat_ipt_entry entries[0]; |
1412 | }; | 1394 | }; |
1413 | 1395 | ||
@@ -1416,7 +1398,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr, | |||
1416 | unsigned int *size, struct xt_counters *counters, | 1398 | unsigned int *size, struct xt_counters *counters, |
1417 | unsigned int i) | 1399 | unsigned int i) |
1418 | { | 1400 | { |
1419 | struct ipt_entry_target *t; | 1401 | struct xt_entry_target *t; |
1420 | struct compat_ipt_entry __user *ce; | 1402 | struct compat_ipt_entry __user *ce; |
1421 | u_int16_t target_offset, next_offset; | 1403 | u_int16_t target_offset, next_offset; |
1422 | compat_uint_t origsize; | 1404 | compat_uint_t origsize; |
@@ -1451,7 +1433,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr, | |||
1451 | } | 1433 | } |
1452 | 1434 | ||
1453 | static int | 1435 | static int |
1454 | compat_find_calc_match(struct ipt_entry_match *m, | 1436 | compat_find_calc_match(struct xt_entry_match *m, |
1455 | const char *name, | 1437 | const char *name, |
1456 | const struct ipt_ip *ip, | 1438 | const struct ipt_ip *ip, |
1457 | unsigned int hookmask, | 1439 | unsigned int hookmask, |
@@ -1473,7 +1455,7 @@ compat_find_calc_match(struct ipt_entry_match *m, | |||
1473 | 1455 | ||
1474 | static void compat_release_entry(struct compat_ipt_entry *e) | 1456 | static void compat_release_entry(struct compat_ipt_entry *e) |
1475 | { | 1457 | { |
1476 | struct ipt_entry_target *t; | 1458 | struct xt_entry_target *t; |
1477 | struct xt_entry_match *ematch; | 1459 | struct xt_entry_match *ematch; |
1478 | 1460 | ||
1479 | /* Cleanup all matches */ | 1461 | /* Cleanup all matches */ |
@@ -1494,7 +1476,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, | |||
1494 | const char *name) | 1476 | const char *name) |
1495 | { | 1477 | { |
1496 | struct xt_entry_match *ematch; | 1478 | struct xt_entry_match *ematch; |
1497 | struct ipt_entry_target *t; | 1479 | struct xt_entry_target *t; |
1498 | struct xt_target *target; | 1480 | struct xt_target *target; |
1499 | unsigned int entry_offset; | 1481 | unsigned int entry_offset; |
1500 | unsigned int j; | 1482 | unsigned int j; |
@@ -1576,7 +1558,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr, | |||
1576 | unsigned int *size, const char *name, | 1558 | unsigned int *size, const char *name, |
1577 | struct xt_table_info *newinfo, unsigned char *base) | 1559 | struct xt_table_info *newinfo, unsigned char *base) |
1578 | { | 1560 | { |
1579 | struct ipt_entry_target *t; | 1561 | struct xt_entry_target *t; |
1580 | struct xt_target *target; | 1562 | struct xt_target *target; |
1581 | struct ipt_entry *de; | 1563 | struct ipt_entry *de; |
1582 | unsigned int origsize; | 1564 | unsigned int origsize; |
@@ -1680,6 +1662,7 @@ translate_compat_table(struct net *net, | |||
1680 | duprintf("translate_compat_table: size %u\n", info->size); | 1662 | duprintf("translate_compat_table: size %u\n", info->size); |
1681 | j = 0; | 1663 | j = 0; |
1682 | xt_compat_lock(AF_INET); | 1664 | xt_compat_lock(AF_INET); |
1665 | xt_compat_init_offsets(AF_INET, number); | ||
1683 | /* Walk through entries, checking offsets. */ | 1666 | /* Walk through entries, checking offsets. */ |
1684 | xt_entry_foreach(iter0, entry0, total_size) { | 1667 | xt_entry_foreach(iter0, entry0, total_size) { |
1685 | ret = check_compat_entry_size_and_hooks(iter0, info, &size, | 1668 | ret = check_compat_entry_size_and_hooks(iter0, info, &size, |
@@ -1821,6 +1804,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) | |||
1821 | return -ENOMEM; | 1804 | return -ENOMEM; |
1822 | if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) | 1805 | if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) |
1823 | return -ENOMEM; | 1806 | return -ENOMEM; |
1807 | tmp.name[sizeof(tmp.name)-1] = 0; | ||
1824 | 1808 | ||
1825 | newinfo = xt_alloc_table_info(tmp.size); | 1809 | newinfo = xt_alloc_table_info(tmp.size); |
1826 | if (!newinfo) | 1810 | if (!newinfo) |
@@ -1884,7 +1868,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, | |||
1884 | } | 1868 | } |
1885 | 1869 | ||
1886 | struct compat_ipt_get_entries { | 1870 | struct compat_ipt_get_entries { |
1887 | char name[IPT_TABLE_MAXNAMELEN]; | 1871 | char name[XT_TABLE_MAXNAMELEN]; |
1888 | compat_uint_t size; | 1872 | compat_uint_t size; |
1889 | struct compat_ipt_entry entrytable[0]; | 1873 | struct compat_ipt_entry entrytable[0]; |
1890 | }; | 1874 | }; |
@@ -2039,7 +2023,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) | |||
2039 | 2023 | ||
2040 | case IPT_SO_GET_REVISION_MATCH: | 2024 | case IPT_SO_GET_REVISION_MATCH: |
2041 | case IPT_SO_GET_REVISION_TARGET: { | 2025 | case IPT_SO_GET_REVISION_TARGET: { |
2042 | struct ipt_get_revision rev; | 2026 | struct xt_get_revision rev; |
2043 | int target; | 2027 | int target; |
2044 | 2028 | ||
2045 | if (*len != sizeof(rev)) { | 2029 | if (*len != sizeof(rev)) { |
@@ -2050,6 +2034,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) | |||
2050 | ret = -EFAULT; | 2034 | ret = -EFAULT; |
2051 | break; | 2035 | break; |
2052 | } | 2036 | } |
2037 | rev.name[sizeof(rev.name)-1] = 0; | ||
2053 | 2038 | ||
2054 | if (cmd == IPT_SO_GET_REVISION_TARGET) | 2039 | if (cmd == IPT_SO_GET_REVISION_TARGET) |
2055 | target = 1; | 2040 | target = 1; |
@@ -2176,7 +2161,7 @@ static int icmp_checkentry(const struct xt_mtchk_param *par) | |||
2176 | 2161 | ||
2177 | static struct xt_target ipt_builtin_tg[] __read_mostly = { | 2162 | static struct xt_target ipt_builtin_tg[] __read_mostly = { |
2178 | { | 2163 | { |
2179 | .name = IPT_STANDARD_TARGET, | 2164 | .name = XT_STANDARD_TARGET, |
2180 | .targetsize = sizeof(int), | 2165 | .targetsize = sizeof(int), |
2181 | .family = NFPROTO_IPV4, | 2166 | .family = NFPROTO_IPV4, |
2182 | #ifdef CONFIG_COMPAT | 2167 | #ifdef CONFIG_COMPAT |
@@ -2186,9 +2171,9 @@ static struct xt_target ipt_builtin_tg[] __read_mostly = { | |||
2186 | #endif | 2171 | #endif |
2187 | }, | 2172 | }, |
2188 | { | 2173 | { |
2189 | .name = IPT_ERROR_TARGET, | 2174 | .name = XT_ERROR_TARGET, |
2190 | .target = ipt_error, | 2175 | .target = ipt_error, |
2191 | .targetsize = IPT_FUNCTION_MAXNAMELEN, | 2176 | .targetsize = XT_FUNCTION_MAXNAMELEN, |
2192 | .family = NFPROTO_IPV4, | 2177 | .family = NFPROTO_IPV4, |
2193 | }, | 2178 | }, |
2194 | }; | 2179 | }; |
@@ -2244,7 +2229,7 @@ static int __init ip_tables_init(void) | |||
2244 | if (ret < 0) | 2229 | if (ret < 0) |
2245 | goto err1; | 2230 | goto err1; |
2246 | 2231 | ||
2247 | /* Noone else will be downing sem now, so we won't sleep */ | 2232 | /* No one else will be downing sem now, so we won't sleep */ |
2248 | ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg)); | 2233 | ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg)); |
2249 | if (ret < 0) | 2234 | if (ret < 0) |
2250 | goto err2; | 2235 | goto err2; |
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 3a43cf36db87..5c9e97c79017 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include <net/netfilter/nf_conntrack.h> | 29 | #include <net/netfilter/nf_conntrack.h> |
30 | #include <net/net_namespace.h> | 30 | #include <net/net_namespace.h> |
31 | #include <net/checksum.h> | 31 | #include <net/checksum.h> |
32 | #include <net/ip.h> | ||
32 | 33 | ||
33 | #define CLUSTERIP_VERSION "0.8" | 34 | #define CLUSTERIP_VERSION "0.8" |
34 | 35 | ||
@@ -231,24 +232,22 @@ clusterip_hashfn(const struct sk_buff *skb, | |||
231 | { | 232 | { |
232 | const struct iphdr *iph = ip_hdr(skb); | 233 | const struct iphdr *iph = ip_hdr(skb); |
233 | unsigned long hashval; | 234 | unsigned long hashval; |
234 | u_int16_t sport, dport; | 235 | u_int16_t sport = 0, dport = 0; |
235 | const u_int16_t *ports; | 236 | int poff; |
236 | 237 | ||
237 | switch (iph->protocol) { | 238 | poff = proto_ports_offset(iph->protocol); |
238 | case IPPROTO_TCP: | 239 | if (poff >= 0) { |
239 | case IPPROTO_UDP: | 240 | const u_int16_t *ports; |
240 | case IPPROTO_UDPLITE: | 241 | u16 _ports[2]; |
241 | case IPPROTO_SCTP: | 242 | |
242 | case IPPROTO_DCCP: | 243 | ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports); |
243 | case IPPROTO_ICMP: | 244 | if (ports) { |
244 | ports = (const void *)iph+iph->ihl*4; | 245 | sport = ports[0]; |
245 | sport = ports[0]; | 246 | dport = ports[1]; |
246 | dport = ports[1]; | 247 | } |
247 | break; | 248 | } else { |
248 | default: | ||
249 | if (net_ratelimit()) | 249 | if (net_ratelimit()) |
250 | pr_info("unknown protocol %u\n", iph->protocol); | 250 | pr_info("unknown protocol %u\n", iph->protocol); |
251 | sport = dport = 0; | ||
252 | } | 251 | } |
253 | 252 | ||
254 | switch (config->hash_mode) { | 253 | switch (config->hash_mode) { |
@@ -301,19 +300,14 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) | |||
301 | * that the ->target() function isn't called after ->destroy() */ | 300 | * that the ->target() function isn't called after ->destroy() */ |
302 | 301 | ||
303 | ct = nf_ct_get(skb, &ctinfo); | 302 | ct = nf_ct_get(skb, &ctinfo); |
304 | if (ct == NULL) { | 303 | if (ct == NULL) |
305 | pr_info("no conntrack!\n"); | ||
306 | /* FIXME: need to drop invalid ones, since replies | ||
307 | * to outgoing connections of other nodes will be | ||
308 | * marked as INVALID */ | ||
309 | return NF_DROP; | 304 | return NF_DROP; |
310 | } | ||
311 | 305 | ||
312 | /* special case: ICMP error handling. conntrack distinguishes between | 306 | /* special case: ICMP error handling. conntrack distinguishes between |
313 | * error messages (RELATED) and information requests (see below) */ | 307 | * error messages (RELATED) and information requests (see below) */ |
314 | if (ip_hdr(skb)->protocol == IPPROTO_ICMP && | 308 | if (ip_hdr(skb)->protocol == IPPROTO_ICMP && |
315 | (ctinfo == IP_CT_RELATED || | 309 | (ctinfo == IP_CT_RELATED || |
316 | ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)) | 310 | ctinfo == IP_CT_RELATED_REPLY)) |
317 | return XT_CONTINUE; | 311 | return XT_CONTINUE; |
318 | 312 | ||
319 | /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, | 313 | /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, |
@@ -327,12 +321,12 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) | |||
327 | ct->mark = hash; | 321 | ct->mark = hash; |
328 | break; | 322 | break; |
329 | case IP_CT_RELATED: | 323 | case IP_CT_RELATED: |
330 | case IP_CT_RELATED+IP_CT_IS_REPLY: | 324 | case IP_CT_RELATED_REPLY: |
331 | /* FIXME: we don't handle expectations at the | 325 | /* FIXME: we don't handle expectations at the |
332 | * moment. they can arrive on a different node than | 326 | * moment. they can arrive on a different node than |
333 | * the master connection (e.g. FTP passive mode) */ | 327 | * the master connection (e.g. FTP passive mode) */ |
334 | case IP_CT_ESTABLISHED: | 328 | case IP_CT_ESTABLISHED: |
335 | case IP_CT_ESTABLISHED+IP_CT_IS_REPLY: | 329 | case IP_CT_ESTABLISHED_REPLY: |
336 | break; | 330 | break; |
337 | default: | 331 | default: |
338 | break; | 332 | break; |
@@ -670,8 +664,11 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input, | |||
670 | char buffer[PROC_WRITELEN+1]; | 664 | char buffer[PROC_WRITELEN+1]; |
671 | unsigned long nodenum; | 665 | unsigned long nodenum; |
672 | 666 | ||
673 | if (copy_from_user(buffer, input, PROC_WRITELEN)) | 667 | if (size > PROC_WRITELEN) |
668 | return -EIO; | ||
669 | if (copy_from_user(buffer, input, size)) | ||
674 | return -EFAULT; | 670 | return -EFAULT; |
671 | buffer[size] = 0; | ||
675 | 672 | ||
676 | if (*buffer == '+') { | 673 | if (*buffer == '+') { |
677 | nodenum = simple_strtoul(buffer+1, NULL, 10); | 674 | nodenum = simple_strtoul(buffer+1, NULL, 10); |
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 915fc17d7ce2..d76d6c9ed946 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c | |||
@@ -24,16 +24,15 @@ | |||
24 | #include <linux/netfilter/x_tables.h> | 24 | #include <linux/netfilter/x_tables.h> |
25 | #include <linux/netfilter_ipv4/ipt_LOG.h> | 25 | #include <linux/netfilter_ipv4/ipt_LOG.h> |
26 | #include <net/netfilter/nf_log.h> | 26 | #include <net/netfilter/nf_log.h> |
27 | #include <net/netfilter/xt_log.h> | ||
27 | 28 | ||
28 | MODULE_LICENSE("GPL"); | 29 | MODULE_LICENSE("GPL"); |
29 | MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); | 30 | MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); |
30 | MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog"); | 31 | MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog"); |
31 | 32 | ||
32 | /* Use lock to serialize, so printks don't overlap */ | ||
33 | static DEFINE_SPINLOCK(log_lock); | ||
34 | |||
35 | /* One level of recursion won't kill us */ | 33 | /* One level of recursion won't kill us */ |
36 | static void dump_packet(const struct nf_loginfo *info, | 34 | static void dump_packet(struct sbuff *m, |
35 | const struct nf_loginfo *info, | ||
37 | const struct sk_buff *skb, | 36 | const struct sk_buff *skb, |
38 | unsigned int iphoff) | 37 | unsigned int iphoff) |
39 | { | 38 | { |
@@ -48,32 +47,32 @@ static void dump_packet(const struct nf_loginfo *info, | |||
48 | 47 | ||
49 | ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); | 48 | ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); |
50 | if (ih == NULL) { | 49 | if (ih == NULL) { |
51 | printk("TRUNCATED"); | 50 | sb_add(m, "TRUNCATED"); |
52 | return; | 51 | return; |
53 | } | 52 | } |
54 | 53 | ||
55 | /* Important fields: | 54 | /* Important fields: |
56 | * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ | 55 | * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ |
57 | /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ | 56 | /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ |
58 | printk("SRC=%pI4 DST=%pI4 ", | 57 | sb_add(m, "SRC=%pI4 DST=%pI4 ", |
59 | &ih->saddr, &ih->daddr); | 58 | &ih->saddr, &ih->daddr); |
60 | 59 | ||
61 | /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ | 60 | /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ |
62 | printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", | 61 | sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", |
63 | ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, | 62 | ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, |
64 | ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); | 63 | ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); |
65 | 64 | ||
66 | /* Max length: 6 "CE DF MF " */ | 65 | /* Max length: 6 "CE DF MF " */ |
67 | if (ntohs(ih->frag_off) & IP_CE) | 66 | if (ntohs(ih->frag_off) & IP_CE) |
68 | printk("CE "); | 67 | sb_add(m, "CE "); |
69 | if (ntohs(ih->frag_off) & IP_DF) | 68 | if (ntohs(ih->frag_off) & IP_DF) |
70 | printk("DF "); | 69 | sb_add(m, "DF "); |
71 | if (ntohs(ih->frag_off) & IP_MF) | 70 | if (ntohs(ih->frag_off) & IP_MF) |
72 | printk("MF "); | 71 | sb_add(m, "MF "); |
73 | 72 | ||
74 | /* Max length: 11 "FRAG:65535 " */ | 73 | /* Max length: 11 "FRAG:65535 " */ |
75 | if (ntohs(ih->frag_off) & IP_OFFSET) | 74 | if (ntohs(ih->frag_off) & IP_OFFSET) |
76 | printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); | 75 | sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); |
77 | 76 | ||
78 | if ((logflags & IPT_LOG_IPOPT) && | 77 | if ((logflags & IPT_LOG_IPOPT) && |
79 | ih->ihl * 4 > sizeof(struct iphdr)) { | 78 | ih->ihl * 4 > sizeof(struct iphdr)) { |
@@ -85,15 +84,15 @@ static void dump_packet(const struct nf_loginfo *info, | |||
85 | op = skb_header_pointer(skb, iphoff+sizeof(_iph), | 84 | op = skb_header_pointer(skb, iphoff+sizeof(_iph), |
86 | optsize, _opt); | 85 | optsize, _opt); |
87 | if (op == NULL) { | 86 | if (op == NULL) { |
88 | printk("TRUNCATED"); | 87 | sb_add(m, "TRUNCATED"); |
89 | return; | 88 | return; |
90 | } | 89 | } |
91 | 90 | ||
92 | /* Max length: 127 "OPT (" 15*4*2chars ") " */ | 91 | /* Max length: 127 "OPT (" 15*4*2chars ") " */ |
93 | printk("OPT ("); | 92 | sb_add(m, "OPT ("); |
94 | for (i = 0; i < optsize; i++) | 93 | for (i = 0; i < optsize; i++) |
95 | printk("%02X", op[i]); | 94 | sb_add(m, "%02X", op[i]); |
96 | printk(") "); | 95 | sb_add(m, ") "); |
97 | } | 96 | } |
98 | 97 | ||
99 | switch (ih->protocol) { | 98 | switch (ih->protocol) { |
@@ -102,7 +101,7 @@ static void dump_packet(const struct nf_loginfo *info, | |||
102 | const struct tcphdr *th; | 101 | const struct tcphdr *th; |
103 | 102 | ||
104 | /* Max length: 10 "PROTO=TCP " */ | 103 | /* Max length: 10 "PROTO=TCP " */ |
105 | printk("PROTO=TCP "); | 104 | sb_add(m, "PROTO=TCP "); |
106 | 105 | ||
107 | if (ntohs(ih->frag_off) & IP_OFFSET) | 106 | if (ntohs(ih->frag_off) & IP_OFFSET) |
108 | break; | 107 | break; |
@@ -111,41 +110,41 @@ static void dump_packet(const struct nf_loginfo *info, | |||
111 | th = skb_header_pointer(skb, iphoff + ih->ihl * 4, | 110 | th = skb_header_pointer(skb, iphoff + ih->ihl * 4, |
112 | sizeof(_tcph), &_tcph); | 111 | sizeof(_tcph), &_tcph); |
113 | if (th == NULL) { | 112 | if (th == NULL) { |
114 | printk("INCOMPLETE [%u bytes] ", | 113 | sb_add(m, "INCOMPLETE [%u bytes] ", |
115 | skb->len - iphoff - ih->ihl*4); | 114 | skb->len - iphoff - ih->ihl*4); |
116 | break; | 115 | break; |
117 | } | 116 | } |
118 | 117 | ||
119 | /* Max length: 20 "SPT=65535 DPT=65535 " */ | 118 | /* Max length: 20 "SPT=65535 DPT=65535 " */ |
120 | printk("SPT=%u DPT=%u ", | 119 | sb_add(m, "SPT=%u DPT=%u ", |
121 | ntohs(th->source), ntohs(th->dest)); | 120 | ntohs(th->source), ntohs(th->dest)); |
122 | /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ | 121 | /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ |
123 | if (logflags & IPT_LOG_TCPSEQ) | 122 | if (logflags & IPT_LOG_TCPSEQ) |
124 | printk("SEQ=%u ACK=%u ", | 123 | sb_add(m, "SEQ=%u ACK=%u ", |
125 | ntohl(th->seq), ntohl(th->ack_seq)); | 124 | ntohl(th->seq), ntohl(th->ack_seq)); |
126 | /* Max length: 13 "WINDOW=65535 " */ | 125 | /* Max length: 13 "WINDOW=65535 " */ |
127 | printk("WINDOW=%u ", ntohs(th->window)); | 126 | sb_add(m, "WINDOW=%u ", ntohs(th->window)); |
128 | /* Max length: 9 "RES=0x3F " */ | 127 | /* Max length: 9 "RES=0x3F " */ |
129 | printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); | 128 | sb_add(m, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); |
130 | /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ | 129 | /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ |
131 | if (th->cwr) | 130 | if (th->cwr) |
132 | printk("CWR "); | 131 | sb_add(m, "CWR "); |
133 | if (th->ece) | 132 | if (th->ece) |
134 | printk("ECE "); | 133 | sb_add(m, "ECE "); |
135 | if (th->urg) | 134 | if (th->urg) |
136 | printk("URG "); | 135 | sb_add(m, "URG "); |
137 | if (th->ack) | 136 | if (th->ack) |
138 | printk("ACK "); | 137 | sb_add(m, "ACK "); |
139 | if (th->psh) | 138 | if (th->psh) |
140 | printk("PSH "); | 139 | sb_add(m, "PSH "); |
141 | if (th->rst) | 140 | if (th->rst) |
142 | printk("RST "); | 141 | sb_add(m, "RST "); |
143 | if (th->syn) | 142 | if (th->syn) |
144 | printk("SYN "); | 143 | sb_add(m, "SYN "); |
145 | if (th->fin) | 144 | if (th->fin) |
146 | printk("FIN "); | 145 | sb_add(m, "FIN "); |
147 | /* Max length: 11 "URGP=65535 " */ | 146 | /* Max length: 11 "URGP=65535 " */ |
148 | printk("URGP=%u ", ntohs(th->urg_ptr)); | 147 | sb_add(m, "URGP=%u ", ntohs(th->urg_ptr)); |
149 | 148 | ||
150 | if ((logflags & IPT_LOG_TCPOPT) && | 149 | if ((logflags & IPT_LOG_TCPOPT) && |
151 | th->doff * 4 > sizeof(struct tcphdr)) { | 150 | th->doff * 4 > sizeof(struct tcphdr)) { |
@@ -158,15 +157,15 @@ static void dump_packet(const struct nf_loginfo *info, | |||
158 | iphoff+ih->ihl*4+sizeof(_tcph), | 157 | iphoff+ih->ihl*4+sizeof(_tcph), |
159 | optsize, _opt); | 158 | optsize, _opt); |
160 | if (op == NULL) { | 159 | if (op == NULL) { |
161 | printk("TRUNCATED"); | 160 | sb_add(m, "TRUNCATED"); |
162 | return; | 161 | return; |
163 | } | 162 | } |
164 | 163 | ||
165 | /* Max length: 127 "OPT (" 15*4*2chars ") " */ | 164 | /* Max length: 127 "OPT (" 15*4*2chars ") " */ |
166 | printk("OPT ("); | 165 | sb_add(m, "OPT ("); |
167 | for (i = 0; i < optsize; i++) | 166 | for (i = 0; i < optsize; i++) |
168 | printk("%02X", op[i]); | 167 | sb_add(m, "%02X", op[i]); |
169 | printk(") "); | 168 | sb_add(m, ") "); |
170 | } | 169 | } |
171 | break; | 170 | break; |
172 | } | 171 | } |
@@ -177,9 +176,9 @@ static void dump_packet(const struct nf_loginfo *info, | |||
177 | 176 | ||
178 | if (ih->protocol == IPPROTO_UDP) | 177 | if (ih->protocol == IPPROTO_UDP) |
179 | /* Max length: 10 "PROTO=UDP " */ | 178 | /* Max length: 10 "PROTO=UDP " */ |
180 | printk("PROTO=UDP " ); | 179 | sb_add(m, "PROTO=UDP " ); |
181 | else /* Max length: 14 "PROTO=UDPLITE " */ | 180 | else /* Max length: 14 "PROTO=UDPLITE " */ |
182 | printk("PROTO=UDPLITE "); | 181 | sb_add(m, "PROTO=UDPLITE "); |
183 | 182 | ||
184 | if (ntohs(ih->frag_off) & IP_OFFSET) | 183 | if (ntohs(ih->frag_off) & IP_OFFSET) |
185 | break; | 184 | break; |
@@ -188,13 +187,13 @@ static void dump_packet(const struct nf_loginfo *info, | |||
188 | uh = skb_header_pointer(skb, iphoff+ih->ihl*4, | 187 | uh = skb_header_pointer(skb, iphoff+ih->ihl*4, |
189 | sizeof(_udph), &_udph); | 188 | sizeof(_udph), &_udph); |
190 | if (uh == NULL) { | 189 | if (uh == NULL) { |
191 | printk("INCOMPLETE [%u bytes] ", | 190 | sb_add(m, "INCOMPLETE [%u bytes] ", |
192 | skb->len - iphoff - ih->ihl*4); | 191 | skb->len - iphoff - ih->ihl*4); |
193 | break; | 192 | break; |
194 | } | 193 | } |
195 | 194 | ||
196 | /* Max length: 20 "SPT=65535 DPT=65535 " */ | 195 | /* Max length: 20 "SPT=65535 DPT=65535 " */ |
197 | printk("SPT=%u DPT=%u LEN=%u ", | 196 | sb_add(m, "SPT=%u DPT=%u LEN=%u ", |
198 | ntohs(uh->source), ntohs(uh->dest), | 197 | ntohs(uh->source), ntohs(uh->dest), |
199 | ntohs(uh->len)); | 198 | ntohs(uh->len)); |
200 | break; | 199 | break; |
@@ -221,7 +220,7 @@ static void dump_packet(const struct nf_loginfo *info, | |||
221 | [ICMP_ADDRESSREPLY] = 12 }; | 220 | [ICMP_ADDRESSREPLY] = 12 }; |
222 | 221 | ||
223 | /* Max length: 11 "PROTO=ICMP " */ | 222 | /* Max length: 11 "PROTO=ICMP " */ |
224 | printk("PROTO=ICMP "); | 223 | sb_add(m, "PROTO=ICMP "); |
225 | 224 | ||
226 | if (ntohs(ih->frag_off) & IP_OFFSET) | 225 | if (ntohs(ih->frag_off) & IP_OFFSET) |
227 | break; | 226 | break; |
@@ -230,19 +229,19 @@ static void dump_packet(const struct nf_loginfo *info, | |||
230 | ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, | 229 | ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, |
231 | sizeof(_icmph), &_icmph); | 230 | sizeof(_icmph), &_icmph); |
232 | if (ich == NULL) { | 231 | if (ich == NULL) { |
233 | printk("INCOMPLETE [%u bytes] ", | 232 | sb_add(m, "INCOMPLETE [%u bytes] ", |
234 | skb->len - iphoff - ih->ihl*4); | 233 | skb->len - iphoff - ih->ihl*4); |
235 | break; | 234 | break; |
236 | } | 235 | } |
237 | 236 | ||
238 | /* Max length: 18 "TYPE=255 CODE=255 " */ | 237 | /* Max length: 18 "TYPE=255 CODE=255 " */ |
239 | printk("TYPE=%u CODE=%u ", ich->type, ich->code); | 238 | sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code); |
240 | 239 | ||
241 | /* Max length: 25 "INCOMPLETE [65535 bytes] " */ | 240 | /* Max length: 25 "INCOMPLETE [65535 bytes] " */ |
242 | if (ich->type <= NR_ICMP_TYPES && | 241 | if (ich->type <= NR_ICMP_TYPES && |
243 | required_len[ich->type] && | 242 | required_len[ich->type] && |
244 | skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { | 243 | skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { |
245 | printk("INCOMPLETE [%u bytes] ", | 244 | sb_add(m, "INCOMPLETE [%u bytes] ", |
246 | skb->len - iphoff - ih->ihl*4); | 245 | skb->len - iphoff - ih->ihl*4); |
247 | break; | 246 | break; |
248 | } | 247 | } |
@@ -251,35 +250,35 @@ static void dump_packet(const struct nf_loginfo *info, | |||
251 | case ICMP_ECHOREPLY: | 250 | case ICMP_ECHOREPLY: |
252 | case ICMP_ECHO: | 251 | case ICMP_ECHO: |
253 | /* Max length: 19 "ID=65535 SEQ=65535 " */ | 252 | /* Max length: 19 "ID=65535 SEQ=65535 " */ |
254 | printk("ID=%u SEQ=%u ", | 253 | sb_add(m, "ID=%u SEQ=%u ", |
255 | ntohs(ich->un.echo.id), | 254 | ntohs(ich->un.echo.id), |
256 | ntohs(ich->un.echo.sequence)); | 255 | ntohs(ich->un.echo.sequence)); |
257 | break; | 256 | break; |
258 | 257 | ||
259 | case ICMP_PARAMETERPROB: | 258 | case ICMP_PARAMETERPROB: |
260 | /* Max length: 14 "PARAMETER=255 " */ | 259 | /* Max length: 14 "PARAMETER=255 " */ |
261 | printk("PARAMETER=%u ", | 260 | sb_add(m, "PARAMETER=%u ", |
262 | ntohl(ich->un.gateway) >> 24); | 261 | ntohl(ich->un.gateway) >> 24); |
263 | break; | 262 | break; |
264 | case ICMP_REDIRECT: | 263 | case ICMP_REDIRECT: |
265 | /* Max length: 24 "GATEWAY=255.255.255.255 " */ | 264 | /* Max length: 24 "GATEWAY=255.255.255.255 " */ |
266 | printk("GATEWAY=%pI4 ", &ich->un.gateway); | 265 | sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway); |
267 | /* Fall through */ | 266 | /* Fall through */ |
268 | case ICMP_DEST_UNREACH: | 267 | case ICMP_DEST_UNREACH: |
269 | case ICMP_SOURCE_QUENCH: | 268 | case ICMP_SOURCE_QUENCH: |
270 | case ICMP_TIME_EXCEEDED: | 269 | case ICMP_TIME_EXCEEDED: |
271 | /* Max length: 3+maxlen */ | 270 | /* Max length: 3+maxlen */ |
272 | if (!iphoff) { /* Only recurse once. */ | 271 | if (!iphoff) { /* Only recurse once. */ |
273 | printk("["); | 272 | sb_add(m, "["); |
274 | dump_packet(info, skb, | 273 | dump_packet(m, info, skb, |
275 | iphoff + ih->ihl*4+sizeof(_icmph)); | 274 | iphoff + ih->ihl*4+sizeof(_icmph)); |
276 | printk("] "); | 275 | sb_add(m, "] "); |
277 | } | 276 | } |
278 | 277 | ||
279 | /* Max length: 10 "MTU=65535 " */ | 278 | /* Max length: 10 "MTU=65535 " */ |
280 | if (ich->type == ICMP_DEST_UNREACH && | 279 | if (ich->type == ICMP_DEST_UNREACH && |
281 | ich->code == ICMP_FRAG_NEEDED) | 280 | ich->code == ICMP_FRAG_NEEDED) |
282 | printk("MTU=%u ", ntohs(ich->un.frag.mtu)); | 281 | sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu)); |
283 | } | 282 | } |
284 | break; | 283 | break; |
285 | } | 284 | } |
@@ -292,19 +291,19 @@ static void dump_packet(const struct nf_loginfo *info, | |||
292 | break; | 291 | break; |
293 | 292 | ||
294 | /* Max length: 9 "PROTO=AH " */ | 293 | /* Max length: 9 "PROTO=AH " */ |
295 | printk("PROTO=AH "); | 294 | sb_add(m, "PROTO=AH "); |
296 | 295 | ||
297 | /* Max length: 25 "INCOMPLETE [65535 bytes] " */ | 296 | /* Max length: 25 "INCOMPLETE [65535 bytes] " */ |
298 | ah = skb_header_pointer(skb, iphoff+ih->ihl*4, | 297 | ah = skb_header_pointer(skb, iphoff+ih->ihl*4, |
299 | sizeof(_ahdr), &_ahdr); | 298 | sizeof(_ahdr), &_ahdr); |
300 | if (ah == NULL) { | 299 | if (ah == NULL) { |
301 | printk("INCOMPLETE [%u bytes] ", | 300 | sb_add(m, "INCOMPLETE [%u bytes] ", |
302 | skb->len - iphoff - ih->ihl*4); | 301 | skb->len - iphoff - ih->ihl*4); |
303 | break; | 302 | break; |
304 | } | 303 | } |
305 | 304 | ||
306 | /* Length: 15 "SPI=0xF1234567 " */ | 305 | /* Length: 15 "SPI=0xF1234567 " */ |
307 | printk("SPI=0x%x ", ntohl(ah->spi)); | 306 | sb_add(m, "SPI=0x%x ", ntohl(ah->spi)); |
308 | break; | 307 | break; |
309 | } | 308 | } |
310 | case IPPROTO_ESP: { | 309 | case IPPROTO_ESP: { |
@@ -312,7 +311,7 @@ static void dump_packet(const struct nf_loginfo *info, | |||
312 | const struct ip_esp_hdr *eh; | 311 | const struct ip_esp_hdr *eh; |
313 | 312 | ||
314 | /* Max length: 10 "PROTO=ESP " */ | 313 | /* Max length: 10 "PROTO=ESP " */ |
315 | printk("PROTO=ESP "); | 314 | sb_add(m, "PROTO=ESP "); |
316 | 315 | ||
317 | if (ntohs(ih->frag_off) & IP_OFFSET) | 316 | if (ntohs(ih->frag_off) & IP_OFFSET) |
318 | break; | 317 | break; |
@@ -321,25 +320,25 @@ static void dump_packet(const struct nf_loginfo *info, | |||
321 | eh = skb_header_pointer(skb, iphoff+ih->ihl*4, | 320 | eh = skb_header_pointer(skb, iphoff+ih->ihl*4, |
322 | sizeof(_esph), &_esph); | 321 | sizeof(_esph), &_esph); |
323 | if (eh == NULL) { | 322 | if (eh == NULL) { |
324 | printk("INCOMPLETE [%u bytes] ", | 323 | sb_add(m, "INCOMPLETE [%u bytes] ", |
325 | skb->len - iphoff - ih->ihl*4); | 324 | skb->len - iphoff - ih->ihl*4); |
326 | break; | 325 | break; |
327 | } | 326 | } |
328 | 327 | ||
329 | /* Length: 15 "SPI=0xF1234567 " */ | 328 | /* Length: 15 "SPI=0xF1234567 " */ |
330 | printk("SPI=0x%x ", ntohl(eh->spi)); | 329 | sb_add(m, "SPI=0x%x ", ntohl(eh->spi)); |
331 | break; | 330 | break; |
332 | } | 331 | } |
333 | /* Max length: 10 "PROTO 255 " */ | 332 | /* Max length: 10 "PROTO 255 " */ |
334 | default: | 333 | default: |
335 | printk("PROTO=%u ", ih->protocol); | 334 | sb_add(m, "PROTO=%u ", ih->protocol); |
336 | } | 335 | } |
337 | 336 | ||
338 | /* Max length: 15 "UID=4294967295 " */ | 337 | /* Max length: 15 "UID=4294967295 " */ |
339 | if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { | 338 | if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { |
340 | read_lock_bh(&skb->sk->sk_callback_lock); | 339 | read_lock_bh(&skb->sk->sk_callback_lock); |
341 | if (skb->sk->sk_socket && skb->sk->sk_socket->file) | 340 | if (skb->sk->sk_socket && skb->sk->sk_socket->file) |
342 | printk("UID=%u GID=%u ", | 341 | sb_add(m, "UID=%u GID=%u ", |
343 | skb->sk->sk_socket->file->f_cred->fsuid, | 342 | skb->sk->sk_socket->file->f_cred->fsuid, |
344 | skb->sk->sk_socket->file->f_cred->fsgid); | 343 | skb->sk->sk_socket->file->f_cred->fsgid); |
345 | read_unlock_bh(&skb->sk->sk_callback_lock); | 344 | read_unlock_bh(&skb->sk->sk_callback_lock); |
@@ -347,7 +346,7 @@ static void dump_packet(const struct nf_loginfo *info, | |||
347 | 346 | ||
348 | /* Max length: 16 "MARK=0xFFFFFFFF " */ | 347 | /* Max length: 16 "MARK=0xFFFFFFFF " */ |
349 | if (!iphoff && skb->mark) | 348 | if (!iphoff && skb->mark) |
350 | printk("MARK=0x%x ", skb->mark); | 349 | sb_add(m, "MARK=0x%x ", skb->mark); |
351 | 350 | ||
352 | /* Proto Max log string length */ | 351 | /* Proto Max log string length */ |
353 | /* IP: 40+46+6+11+127 = 230 */ | 352 | /* IP: 40+46+6+11+127 = 230 */ |
@@ -364,7 +363,8 @@ static void dump_packet(const struct nf_loginfo *info, | |||
364 | /* maxlen = 230+ 91 + 230 + 252 = 803 */ | 363 | /* maxlen = 230+ 91 + 230 + 252 = 803 */ |
365 | } | 364 | } |
366 | 365 | ||
367 | static void dump_mac_header(const struct nf_loginfo *info, | 366 | static void dump_mac_header(struct sbuff *m, |
367 | const struct nf_loginfo *info, | ||
368 | const struct sk_buff *skb) | 368 | const struct sk_buff *skb) |
369 | { | 369 | { |
370 | struct net_device *dev = skb->dev; | 370 | struct net_device *dev = skb->dev; |
@@ -378,7 +378,7 @@ static void dump_mac_header(const struct nf_loginfo *info, | |||
378 | 378 | ||
379 | switch (dev->type) { | 379 | switch (dev->type) { |
380 | case ARPHRD_ETHER: | 380 | case ARPHRD_ETHER: |
381 | printk("MACSRC=%pM MACDST=%pM MACPROTO=%04x ", | 381 | sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", |
382 | eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, | 382 | eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, |
383 | ntohs(eth_hdr(skb)->h_proto)); | 383 | ntohs(eth_hdr(skb)->h_proto)); |
384 | return; | 384 | return; |
@@ -387,17 +387,17 @@ static void dump_mac_header(const struct nf_loginfo *info, | |||
387 | } | 387 | } |
388 | 388 | ||
389 | fallback: | 389 | fallback: |
390 | printk("MAC="); | 390 | sb_add(m, "MAC="); |
391 | if (dev->hard_header_len && | 391 | if (dev->hard_header_len && |
392 | skb->mac_header != skb->network_header) { | 392 | skb->mac_header != skb->network_header) { |
393 | const unsigned char *p = skb_mac_header(skb); | 393 | const unsigned char *p = skb_mac_header(skb); |
394 | unsigned int i; | 394 | unsigned int i; |
395 | 395 | ||
396 | printk("%02x", *p++); | 396 | sb_add(m, "%02x", *p++); |
397 | for (i = 1; i < dev->hard_header_len; i++, p++) | 397 | for (i = 1; i < dev->hard_header_len; i++, p++) |
398 | printk(":%02x", *p); | 398 | sb_add(m, ":%02x", *p); |
399 | } | 399 | } |
400 | printk(" "); | 400 | sb_add(m, " "); |
401 | } | 401 | } |
402 | 402 | ||
403 | static struct nf_loginfo default_loginfo = { | 403 | static struct nf_loginfo default_loginfo = { |
@@ -419,11 +419,12 @@ ipt_log_packet(u_int8_t pf, | |||
419 | const struct nf_loginfo *loginfo, | 419 | const struct nf_loginfo *loginfo, |
420 | const char *prefix) | 420 | const char *prefix) |
421 | { | 421 | { |
422 | struct sbuff *m = sb_open(); | ||
423 | |||
422 | if (!loginfo) | 424 | if (!loginfo) |
423 | loginfo = &default_loginfo; | 425 | loginfo = &default_loginfo; |
424 | 426 | ||
425 | spin_lock_bh(&log_lock); | 427 | sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, |
426 | printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, | ||
427 | prefix, | 428 | prefix, |
428 | in ? in->name : "", | 429 | in ? in->name : "", |
429 | out ? out->name : ""); | 430 | out ? out->name : ""); |
@@ -434,20 +435,19 @@ ipt_log_packet(u_int8_t pf, | |||
434 | 435 | ||
435 | physindev = skb->nf_bridge->physindev; | 436 | physindev = skb->nf_bridge->physindev; |
436 | if (physindev && in != physindev) | 437 | if (physindev && in != physindev) |
437 | printk("PHYSIN=%s ", physindev->name); | 438 | sb_add(m, "PHYSIN=%s ", physindev->name); |
438 | physoutdev = skb->nf_bridge->physoutdev; | 439 | physoutdev = skb->nf_bridge->physoutdev; |
439 | if (physoutdev && out != physoutdev) | 440 | if (physoutdev && out != physoutdev) |
440 | printk("PHYSOUT=%s ", physoutdev->name); | 441 | sb_add(m, "PHYSOUT=%s ", physoutdev->name); |
441 | } | 442 | } |
442 | #endif | 443 | #endif |
443 | 444 | ||
444 | /* MAC logging for input path only. */ | 445 | if (in != NULL) |
445 | if (in && !out) | 446 | dump_mac_header(m, loginfo, skb); |
446 | dump_mac_header(loginfo, skb); | 447 | |
448 | dump_packet(m, loginfo, skb, 0); | ||
447 | 449 | ||
448 | dump_packet(loginfo, skb, 0); | 450 | sb_close(m); |
449 | printk("\n"); | ||
450 | spin_unlock_bh(&log_lock); | ||
451 | } | 451 | } |
452 | 452 | ||
453 | static unsigned int | 453 | static unsigned int |
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index d2ed9dc74ebc..9931152a78b5 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c | |||
@@ -60,7 +60,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) | |||
60 | nat = nfct_nat(ct); | 60 | nat = nfct_nat(ct); |
61 | 61 | ||
62 | NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || | 62 | NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || |
63 | ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); | 63 | ctinfo == IP_CT_RELATED_REPLY)); |
64 | 64 | ||
65 | /* Source address is 0.0.0.0 - locally generated packet that is | 65 | /* Source address is 0.0.0.0 - locally generated packet that is |
66 | * probably not supposed to be masqueraded. | 66 | * probably not supposed to be masqueraded. |
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 43eec80c0e7c..51f13f8ec724 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c | |||
@@ -40,7 +40,6 @@ static void send_reset(struct sk_buff *oldskb, int hook) | |||
40 | struct iphdr *niph; | 40 | struct iphdr *niph; |
41 | const struct tcphdr *oth; | 41 | const struct tcphdr *oth; |
42 | struct tcphdr _otcph, *tcph; | 42 | struct tcphdr _otcph, *tcph; |
43 | unsigned int addr_type; | ||
44 | 43 | ||
45 | /* IP header checks: fragment. */ | 44 | /* IP header checks: fragment. */ |
46 | if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) | 45 | if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) |
@@ -55,6 +54,9 @@ static void send_reset(struct sk_buff *oldskb, int hook) | |||
55 | if (oth->rst) | 54 | if (oth->rst) |
56 | return; | 55 | return; |
57 | 56 | ||
57 | if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) | ||
58 | return; | ||
59 | |||
58 | /* Check checksum */ | 60 | /* Check checksum */ |
59 | if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) | 61 | if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) |
60 | return; | 62 | return; |
@@ -101,22 +103,14 @@ static void send_reset(struct sk_buff *oldskb, int hook) | |||
101 | nskb->csum_start = (unsigned char *)tcph - nskb->head; | 103 | nskb->csum_start = (unsigned char *)tcph - nskb->head; |
102 | nskb->csum_offset = offsetof(struct tcphdr, check); | 104 | nskb->csum_offset = offsetof(struct tcphdr, check); |
103 | 105 | ||
104 | addr_type = RTN_UNSPEC; | ||
105 | if (hook != NF_INET_FORWARD | ||
106 | #ifdef CONFIG_BRIDGE_NETFILTER | ||
107 | || (nskb->nf_bridge && nskb->nf_bridge->mask & BRNF_BRIDGED) | ||
108 | #endif | ||
109 | ) | ||
110 | addr_type = RTN_LOCAL; | ||
111 | |||
112 | /* ip_route_me_harder expects skb->dst to be set */ | 106 | /* ip_route_me_harder expects skb->dst to be set */ |
113 | skb_dst_set_noref(nskb, skb_dst(oldskb)); | 107 | skb_dst_set_noref(nskb, skb_dst(oldskb)); |
114 | 108 | ||
115 | nskb->protocol = htons(ETH_P_IP); | 109 | nskb->protocol = htons(ETH_P_IP); |
116 | if (ip_route_me_harder(nskb, addr_type)) | 110 | if (ip_route_me_harder(nskb, RTN_UNSPEC)) |
117 | goto free_nskb; | 111 | goto free_nskb; |
118 | 112 | ||
119 | niph->ttl = dst_metric(skb_dst(nskb), RTAX_HOPLIMIT); | 113 | niph->ttl = ip4_dst_hoplimit(skb_dst(nskb)); |
120 | 114 | ||
121 | /* "Never happens" */ | 115 | /* "Never happens" */ |
122 | if (nskb->len > dst_mtu(skb_dst(nskb))) | 116 | if (nskb->len > dst_mtu(skb_dst(nskb))) |
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c deleted file mode 100644 index db8bff0fb86d..000000000000 --- a/net/ipv4/netfilter/ipt_addrtype.c +++ /dev/null | |||
@@ -1,134 +0,0 @@ | |||
1 | /* | ||
2 | * iptables module to match inet_addr_type() of an ip. | ||
3 | * | ||
4 | * Copyright (c) 2004 Patrick McHardy <kaber@trash.net> | ||
5 | * (C) 2007 Laszlo Attila Toth <panther@balabit.hu> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/skbuff.h> | ||
15 | #include <linux/netdevice.h> | ||
16 | #include <linux/ip.h> | ||
17 | #include <net/route.h> | ||
18 | |||
19 | #include <linux/netfilter_ipv4/ipt_addrtype.h> | ||
20 | #include <linux/netfilter/x_tables.h> | ||
21 | |||
22 | MODULE_LICENSE("GPL"); | ||
23 | MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); | ||
24 | MODULE_DESCRIPTION("Xtables: address type match for IPv4"); | ||
25 | |||
26 | static inline bool match_type(struct net *net, const struct net_device *dev, | ||
27 | __be32 addr, u_int16_t mask) | ||
28 | { | ||
29 | return !!(mask & (1 << inet_dev_addr_type(net, dev, addr))); | ||
30 | } | ||
31 | |||
32 | static bool | ||
33 | addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) | ||
34 | { | ||
35 | struct net *net = dev_net(par->in ? par->in : par->out); | ||
36 | const struct ipt_addrtype_info *info = par->matchinfo; | ||
37 | const struct iphdr *iph = ip_hdr(skb); | ||
38 | bool ret = true; | ||
39 | |||
40 | if (info->source) | ||
41 | ret &= match_type(net, NULL, iph->saddr, info->source) ^ | ||
42 | info->invert_source; | ||
43 | if (info->dest) | ||
44 | ret &= match_type(net, NULL, iph->daddr, info->dest) ^ | ||
45 | info->invert_dest; | ||
46 | |||
47 | return ret; | ||
48 | } | ||
49 | |||
50 | static bool | ||
51 | addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) | ||
52 | { | ||
53 | struct net *net = dev_net(par->in ? par->in : par->out); | ||
54 | const struct ipt_addrtype_info_v1 *info = par->matchinfo; | ||
55 | const struct iphdr *iph = ip_hdr(skb); | ||
56 | const struct net_device *dev = NULL; | ||
57 | bool ret = true; | ||
58 | |||
59 | if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) | ||
60 | dev = par->in; | ||
61 | else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) | ||
62 | dev = par->out; | ||
63 | |||
64 | if (info->source) | ||
65 | ret &= match_type(net, dev, iph->saddr, info->source) ^ | ||
66 | (info->flags & IPT_ADDRTYPE_INVERT_SOURCE); | ||
67 | if (ret && info->dest) | ||
68 | ret &= match_type(net, dev, iph->daddr, info->dest) ^ | ||
69 | !!(info->flags & IPT_ADDRTYPE_INVERT_DEST); | ||
70 | return ret; | ||
71 | } | ||
72 | |||
73 | static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) | ||
74 | { | ||
75 | struct ipt_addrtype_info_v1 *info = par->matchinfo; | ||
76 | |||
77 | if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN && | ||
78 | info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { | ||
79 | pr_info("both incoming and outgoing " | ||
80 | "interface limitation cannot be selected\n"); | ||
81 | return -EINVAL; | ||
82 | } | ||
83 | |||
84 | if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | | ||
85 | (1 << NF_INET_LOCAL_IN)) && | ||
86 | info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { | ||
87 | pr_info("output interface limitation " | ||
88 | "not valid in PREROUTING and INPUT\n"); | ||
89 | return -EINVAL; | ||
90 | } | ||
91 | |||
92 | if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | | ||
93 | (1 << NF_INET_LOCAL_OUT)) && | ||
94 | info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) { | ||
95 | pr_info("input interface limitation " | ||
96 | "not valid in POSTROUTING and OUTPUT\n"); | ||
97 | return -EINVAL; | ||
98 | } | ||
99 | |||
100 | return 0; | ||
101 | } | ||
102 | |||
103 | static struct xt_match addrtype_mt_reg[] __read_mostly = { | ||
104 | { | ||
105 | .name = "addrtype", | ||
106 | .family = NFPROTO_IPV4, | ||
107 | .match = addrtype_mt_v0, | ||
108 | .matchsize = sizeof(struct ipt_addrtype_info), | ||
109 | .me = THIS_MODULE | ||
110 | }, | ||
111 | { | ||
112 | .name = "addrtype", | ||
113 | .family = NFPROTO_IPV4, | ||
114 | .revision = 1, | ||
115 | .match = addrtype_mt_v1, | ||
116 | .checkentry = addrtype_mt_checkentry_v1, | ||
117 | .matchsize = sizeof(struct ipt_addrtype_info_v1), | ||
118 | .me = THIS_MODULE | ||
119 | } | ||
120 | }; | ||
121 | |||
122 | static int __init addrtype_mt_init(void) | ||
123 | { | ||
124 | return xt_register_matches(addrtype_mt_reg, | ||
125 | ARRAY_SIZE(addrtype_mt_reg)); | ||
126 | } | ||
127 | |||
128 | static void __exit addrtype_mt_exit(void) | ||
129 | { | ||
130 | xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg)); | ||
131 | } | ||
132 | |||
133 | module_init(addrtype_mt_init); | ||
134 | module_exit(addrtype_mt_exit); | ||
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c index af6e9c778345..2b57e52c746c 100644 --- a/net/ipv4/netfilter/ipt_ecn.c +++ b/net/ipv4/netfilter/ipt_ecn.c | |||
@@ -25,7 +25,8 @@ MODULE_LICENSE("GPL"); | |||
25 | static inline bool match_ip(const struct sk_buff *skb, | 25 | static inline bool match_ip(const struct sk_buff *skb, |
26 | const struct ipt_ecn_info *einfo) | 26 | const struct ipt_ecn_info *einfo) |
27 | { | 27 | { |
28 | return (ip_hdr(skb)->tos & IPT_ECN_IP_MASK) == einfo->ip_ect; | 28 | return ((ip_hdr(skb)->tos & IPT_ECN_IP_MASK) == einfo->ip_ect) ^ |
29 | !!(einfo->invert & IPT_ECN_OP_MATCH_IP); | ||
29 | } | 30 | } |
30 | 31 | ||
31 | static inline bool match_tcp(const struct sk_buff *skb, | 32 | static inline bool match_tcp(const struct sk_buff *skb, |
@@ -76,8 +77,6 @@ static bool ecn_mt(const struct sk_buff *skb, struct xt_action_param *par) | |||
76 | return false; | 77 | return false; |
77 | 78 | ||
78 | if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) { | 79 | if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) { |
79 | if (ip_hdr(skb)->protocol != IPPROTO_TCP) | ||
80 | return false; | ||
81 | if (!match_tcp(skb, info, &par->hotdrop)) | 80 | if (!match_tcp(skb, info, &par->hotdrop)) |
82 | return false; | 81 | return false; |
83 | } | 82 | } |
@@ -97,7 +96,7 @@ static int ecn_mt_check(const struct xt_mtchk_param *par) | |||
97 | return -EINVAL; | 96 | return -EINVAL; |
98 | 97 | ||
99 | if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) && | 98 | if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) && |
100 | ip->proto != IPPROTO_TCP) { | 99 | (ip->proto != IPPROTO_TCP || ip->invflags & IPT_INV_PROTO)) { |
101 | pr_info("cannot match TCP bits in rule for non-tcp packets\n"); | 100 | pr_info("cannot match TCP bits in rule for non-tcp packets\n"); |
102 | return -EINVAL; | 101 | return -EINVAL; |
103 | } | 102 | } |
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 294a2a32f293..aef5d1fbe77d 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c | |||
@@ -60,7 +60,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out) | |||
60 | ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, | 60 | ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, |
61 | dev_net(out)->ipv4.iptable_mangle); | 61 | dev_net(out)->ipv4.iptable_mangle); |
62 | /* Reroute for ANY change. */ | 62 | /* Reroute for ANY change. */ |
63 | if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { | 63 | if (ret != NF_DROP && ret != NF_STOLEN) { |
64 | iph = ip_hdr(skb); | 64 | iph = ip_hdr(skb); |
65 | 65 | ||
66 | if (iph->saddr != saddr || | 66 | if (iph->saddr != saddr || |
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 5a03c02af999..de9da21113a1 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | |||
@@ -101,7 +101,7 @@ static unsigned int ipv4_confirm(unsigned int hooknum, | |||
101 | 101 | ||
102 | /* This is where we call the helper: as the packet goes out. */ | 102 | /* This is where we call the helper: as the packet goes out. */ |
103 | ct = nf_ct_get(skb, &ctinfo); | 103 | ct = nf_ct_get(skb, &ctinfo); |
104 | if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) | 104 | if (!ct || ctinfo == IP_CT_RELATED_REPLY) |
105 | goto out; | 105 | goto out; |
106 | 106 | ||
107 | help = nfct_help(ct); | 107 | help = nfct_help(ct); |
@@ -121,7 +121,9 @@ static unsigned int ipv4_confirm(unsigned int hooknum, | |||
121 | return ret; | 121 | return ret; |
122 | } | 122 | } |
123 | 123 | ||
124 | if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)) { | 124 | /* adjust seqs for loopback traffic only in outgoing direction */ |
125 | if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && | ||
126 | !nf_is_loopback_packet(skb)) { | ||
125 | typeof(nf_nat_seq_adjust_hook) seq_adjust; | 127 | typeof(nf_nat_seq_adjust_hook) seq_adjust; |
126 | 128 | ||
127 | seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); | 129 | seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); |
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 244f7cb08d68..5585980fce2e 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/proc_fs.h> | 11 | #include <linux/proc_fs.h> |
12 | #include <linux/seq_file.h> | 12 | #include <linux/seq_file.h> |
13 | #include <linux/percpu.h> | 13 | #include <linux/percpu.h> |
14 | #include <linux/security.h> | ||
14 | #include <net/net_namespace.h> | 15 | #include <net/net_namespace.h> |
15 | 16 | ||
16 | #include <linux/netfilter.h> | 17 | #include <linux/netfilter.h> |
@@ -19,6 +20,7 @@ | |||
19 | #include <net/netfilter/nf_conntrack_l4proto.h> | 20 | #include <net/netfilter/nf_conntrack_l4proto.h> |
20 | #include <net/netfilter/nf_conntrack_expect.h> | 21 | #include <net/netfilter/nf_conntrack_expect.h> |
21 | #include <net/netfilter/nf_conntrack_acct.h> | 22 | #include <net/netfilter/nf_conntrack_acct.h> |
23 | #include <linux/rculist_nulls.h> | ||
22 | 24 | ||
23 | struct ct_iter_state { | 25 | struct ct_iter_state { |
24 | struct seq_net_private p; | 26 | struct seq_net_private p; |
@@ -34,7 +36,8 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) | |||
34 | for (st->bucket = 0; | 36 | for (st->bucket = 0; |
35 | st->bucket < net->ct.htable_size; | 37 | st->bucket < net->ct.htable_size; |
36 | st->bucket++) { | 38 | st->bucket++) { |
37 | n = rcu_dereference(net->ct.hash[st->bucket].first); | 39 | n = rcu_dereference( |
40 | hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); | ||
38 | if (!is_a_nulls(n)) | 41 | if (!is_a_nulls(n)) |
39 | return n; | 42 | return n; |
40 | } | 43 | } |
@@ -47,13 +50,14 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, | |||
47 | struct net *net = seq_file_net(seq); | 50 | struct net *net = seq_file_net(seq); |
48 | struct ct_iter_state *st = seq->private; | 51 | struct ct_iter_state *st = seq->private; |
49 | 52 | ||
50 | head = rcu_dereference(head->next); | 53 | head = rcu_dereference(hlist_nulls_next_rcu(head)); |
51 | while (is_a_nulls(head)) { | 54 | while (is_a_nulls(head)) { |
52 | if (likely(get_nulls_value(head) == st->bucket)) { | 55 | if (likely(get_nulls_value(head) == st->bucket)) { |
53 | if (++st->bucket >= net->ct.htable_size) | 56 | if (++st->bucket >= net->ct.htable_size) |
54 | return NULL; | 57 | return NULL; |
55 | } | 58 | } |
56 | head = rcu_dereference(net->ct.hash[st->bucket].first); | 59 | head = rcu_dereference( |
60 | hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); | ||
57 | } | 61 | } |
58 | return head; | 62 | return head; |
59 | } | 63 | } |
@@ -87,6 +91,29 @@ static void ct_seq_stop(struct seq_file *s, void *v) | |||
87 | rcu_read_unlock(); | 91 | rcu_read_unlock(); |
88 | } | 92 | } |
89 | 93 | ||
94 | #ifdef CONFIG_NF_CONNTRACK_SECMARK | ||
95 | static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) | ||
96 | { | ||
97 | int ret; | ||
98 | u32 len; | ||
99 | char *secctx; | ||
100 | |||
101 | ret = security_secid_to_secctx(ct->secmark, &secctx, &len); | ||
102 | if (ret) | ||
103 | return 0; | ||
104 | |||
105 | ret = seq_printf(s, "secctx=%s ", secctx); | ||
106 | |||
107 | security_release_secctx(secctx, len); | ||
108 | return ret; | ||
109 | } | ||
110 | #else | ||
111 | static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) | ||
112 | { | ||
113 | return 0; | ||
114 | } | ||
115 | #endif | ||
116 | |||
90 | static int ct_seq_show(struct seq_file *s, void *v) | 117 | static int ct_seq_show(struct seq_file *s, void *v) |
91 | { | 118 | { |
92 | struct nf_conntrack_tuple_hash *hash = v; | 119 | struct nf_conntrack_tuple_hash *hash = v; |
@@ -148,10 +175,8 @@ static int ct_seq_show(struct seq_file *s, void *v) | |||
148 | goto release; | 175 | goto release; |
149 | #endif | 176 | #endif |
150 | 177 | ||
151 | #ifdef CONFIG_NF_CONNTRACK_SECMARK | 178 | if (ct_show_secctx(s, ct)) |
152 | if (seq_printf(s, "secmark=%u ", ct->secmark)) | ||
153 | goto release; | 179 | goto release; |
154 | #endif | ||
155 | 180 | ||
156 | if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) | 181 | if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) |
157 | goto release; | 182 | goto release; |
@@ -195,7 +220,8 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) | |||
195 | struct hlist_node *n; | 220 | struct hlist_node *n; |
196 | 221 | ||
197 | for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { | 222 | for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { |
198 | n = rcu_dereference(net->ct.expect_hash[st->bucket].first); | 223 | n = rcu_dereference( |
224 | hlist_first_rcu(&net->ct.expect_hash[st->bucket])); | ||
199 | if (n) | 225 | if (n) |
200 | return n; | 226 | return n; |
201 | } | 227 | } |
@@ -208,11 +234,12 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq, | |||
208 | struct net *net = seq_file_net(seq); | 234 | struct net *net = seq_file_net(seq); |
209 | struct ct_expect_iter_state *st = seq->private; | 235 | struct ct_expect_iter_state *st = seq->private; |
210 | 236 | ||
211 | head = rcu_dereference(head->next); | 237 | head = rcu_dereference(hlist_next_rcu(head)); |
212 | while (head == NULL) { | 238 | while (head == NULL) { |
213 | if (++st->bucket >= nf_ct_expect_hsize) | 239 | if (++st->bucket >= nf_ct_expect_hsize) |
214 | return NULL; | 240 | return NULL; |
215 | head = rcu_dereference(net->ct.expect_hash[st->bucket].first); | 241 | head = rcu_dereference( |
242 | hlist_first_rcu(&net->ct.expect_hash[st->bucket])); | ||
216 | } | 243 | } |
217 | return head; | 244 | return head; |
218 | } | 245 | } |
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 7404bde95994..ab5b27a2916f 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c | |||
@@ -160,7 +160,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, | |||
160 | /* Update skb to refer to this connection */ | 160 | /* Update skb to refer to this connection */ |
161 | skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; | 161 | skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; |
162 | skb->nfctinfo = *ctinfo; | 162 | skb->nfctinfo = *ctinfo; |
163 | return -NF_ACCEPT; | 163 | return NF_ACCEPT; |
164 | } | 164 | } |
165 | 165 | ||
166 | /* Small and modified version of icmp_rcv */ | 166 | /* Small and modified version of icmp_rcv */ |
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c index c31b87668250..703f366fd235 100644 --- a/net/ipv4/netfilter/nf_nat_amanda.c +++ b/net/ipv4/netfilter/nf_nat_amanda.c | |||
@@ -44,9 +44,16 @@ static unsigned int help(struct sk_buff *skb, | |||
44 | 44 | ||
45 | /* Try to get same port: if not, try to change it. */ | 45 | /* Try to get same port: if not, try to change it. */ |
46 | for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { | 46 | for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { |
47 | int res; | ||
48 | |||
47 | exp->tuple.dst.u.tcp.port = htons(port); | 49 | exp->tuple.dst.u.tcp.port = htons(port); |
48 | if (nf_ct_expect_related(exp) == 0) | 50 | res = nf_ct_expect_related(exp); |
51 | if (res == 0) | ||
52 | break; | ||
53 | else if (res != -EBUSY) { | ||
54 | port = 0; | ||
49 | break; | 55 | break; |
56 | } | ||
50 | } | 57 | } |
51 | 58 | ||
52 | if (port == 0) | 59 | if (port == 0) |
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 8c8632d9b93c..3346de5d94d0 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c | |||
@@ -38,7 +38,7 @@ static DEFINE_SPINLOCK(nf_nat_lock); | |||
38 | static struct nf_conntrack_l3proto *l3proto __read_mostly; | 38 | static struct nf_conntrack_l3proto *l3proto __read_mostly; |
39 | 39 | ||
40 | #define MAX_IP_NAT_PROTO 256 | 40 | #define MAX_IP_NAT_PROTO 256 |
41 | static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] | 41 | static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO] |
42 | __read_mostly; | 42 | __read_mostly; |
43 | 43 | ||
44 | static inline const struct nf_nat_protocol * | 44 | static inline const struct nf_nat_protocol * |
@@ -47,28 +47,6 @@ __nf_nat_proto_find(u_int8_t protonum) | |||
47 | return rcu_dereference(nf_nat_protos[protonum]); | 47 | return rcu_dereference(nf_nat_protos[protonum]); |
48 | } | 48 | } |
49 | 49 | ||
50 | const struct nf_nat_protocol * | ||
51 | nf_nat_proto_find_get(u_int8_t protonum) | ||
52 | { | ||
53 | const struct nf_nat_protocol *p; | ||
54 | |||
55 | rcu_read_lock(); | ||
56 | p = __nf_nat_proto_find(protonum); | ||
57 | if (!try_module_get(p->me)) | ||
58 | p = &nf_nat_unknown_protocol; | ||
59 | rcu_read_unlock(); | ||
60 | |||
61 | return p; | ||
62 | } | ||
63 | EXPORT_SYMBOL_GPL(nf_nat_proto_find_get); | ||
64 | |||
65 | void | ||
66 | nf_nat_proto_put(const struct nf_nat_protocol *p) | ||
67 | { | ||
68 | module_put(p->me); | ||
69 | } | ||
70 | EXPORT_SYMBOL_GPL(nf_nat_proto_put); | ||
71 | |||
72 | /* We keep an extra hash for each conntrack, for fast searching. */ | 50 | /* We keep an extra hash for each conntrack, for fast searching. */ |
73 | static inline unsigned int | 51 | static inline unsigned int |
74 | hash_by_src(const struct net *net, u16 zone, | 52 | hash_by_src(const struct net *net, u16 zone, |
@@ -243,7 +221,14 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, | |||
243 | manips not an issue. */ | 221 | manips not an issue. */ |
244 | if (maniptype == IP_NAT_MANIP_SRC && | 222 | if (maniptype == IP_NAT_MANIP_SRC && |
245 | !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { | 223 | !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { |
246 | if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { | 224 | /* try the original tuple first */ |
225 | if (in_range(orig_tuple, range)) { | ||
226 | if (!nf_nat_used_tuple(orig_tuple, ct)) { | ||
227 | *tuple = *orig_tuple; | ||
228 | return; | ||
229 | } | ||
230 | } else if (find_appropriate_src(net, zone, orig_tuple, tuple, | ||
231 | range)) { | ||
247 | pr_debug("get_unique_tuple: Found current src map\n"); | 232 | pr_debug("get_unique_tuple: Found current src map\n"); |
248 | if (!nf_nat_used_tuple(tuple, ct)) | 233 | if (!nf_nat_used_tuple(tuple, ct)) |
249 | return; | 234 | return; |
@@ -262,11 +247,17 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, | |||
262 | proto = __nf_nat_proto_find(orig_tuple->dst.protonum); | 247 | proto = __nf_nat_proto_find(orig_tuple->dst.protonum); |
263 | 248 | ||
264 | /* Only bother mapping if it's not already in range and unique */ | 249 | /* Only bother mapping if it's not already in range and unique */ |
265 | if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM) && | 250 | if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { |
266 | (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) || | 251 | if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) { |
267 | proto->in_range(tuple, maniptype, &range->min, &range->max)) && | 252 | if (proto->in_range(tuple, maniptype, &range->min, |
268 | !nf_nat_used_tuple(tuple, ct)) | 253 | &range->max) && |
269 | goto out; | 254 | (range->min.all == range->max.all || |
255 | !nf_nat_used_tuple(tuple, ct))) | ||
256 | goto out; | ||
257 | } else if (!nf_nat_used_tuple(tuple, ct)) { | ||
258 | goto out; | ||
259 | } | ||
260 | } | ||
270 | 261 | ||
271 | /* Last change: get protocol to try to obtain unique tuple. */ | 262 | /* Last change: get protocol to try to obtain unique tuple. */ |
272 | proto->unique_tuple(tuple, range, maniptype, ct); | 263 | proto->unique_tuple(tuple, range, maniptype, ct); |
@@ -282,7 +273,6 @@ nf_nat_setup_info(struct nf_conn *ct, | |||
282 | struct net *net = nf_ct_net(ct); | 273 | struct net *net = nf_ct_net(ct); |
283 | struct nf_conntrack_tuple curr_tuple, new_tuple; | 274 | struct nf_conntrack_tuple curr_tuple, new_tuple; |
284 | struct nf_conn_nat *nat; | 275 | struct nf_conn_nat *nat; |
285 | int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK); | ||
286 | 276 | ||
287 | /* nat helper or nfctnetlink also setup binding */ | 277 | /* nat helper or nfctnetlink also setup binding */ |
288 | nat = nfct_nat(ct); | 278 | nat = nfct_nat(ct); |
@@ -322,8 +312,7 @@ nf_nat_setup_info(struct nf_conn *ct, | |||
322 | ct->status |= IPS_DST_NAT; | 312 | ct->status |= IPS_DST_NAT; |
323 | } | 313 | } |
324 | 314 | ||
325 | /* Place in source hash if this is the first time. */ | 315 | if (maniptype == IP_NAT_MANIP_SRC) { |
326 | if (have_to_hash) { | ||
327 | unsigned int srchash; | 316 | unsigned int srchash; |
328 | 317 | ||
329 | srchash = hash_by_src(net, nf_ct_zone(ct), | 318 | srchash = hash_by_src(net, nf_ct_zone(ct), |
@@ -339,9 +328,9 @@ nf_nat_setup_info(struct nf_conn *ct, | |||
339 | 328 | ||
340 | /* It's done. */ | 329 | /* It's done. */ |
341 | if (maniptype == IP_NAT_MANIP_DST) | 330 | if (maniptype == IP_NAT_MANIP_DST) |
342 | set_bit(IPS_DST_NAT_DONE_BIT, &ct->status); | 331 | ct->status |= IPS_DST_NAT_DONE; |
343 | else | 332 | else |
344 | set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); | 333 | ct->status |= IPS_SRC_NAT_DONE; |
345 | 334 | ||
346 | return NF_ACCEPT; | 335 | return NF_ACCEPT; |
347 | } | 336 | } |
@@ -444,7 +433,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, | |||
444 | 433 | ||
445 | /* Must be RELATED */ | 434 | /* Must be RELATED */ |
446 | NF_CT_ASSERT(skb->nfctinfo == IP_CT_RELATED || | 435 | NF_CT_ASSERT(skb->nfctinfo == IP_CT_RELATED || |
447 | skb->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY); | 436 | skb->nfctinfo == IP_CT_RELATED_REPLY); |
448 | 437 | ||
449 | /* Redirects on non-null nats must be dropped, else they'll | 438 | /* Redirects on non-null nats must be dropped, else they'll |
450 | start talking to each other without our translation, and be | 439 | start talking to each other without our translation, and be |
@@ -458,6 +447,18 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, | |||
458 | return 0; | 447 | return 0; |
459 | } | 448 | } |
460 | 449 | ||
450 | if (manip == IP_NAT_MANIP_SRC) | ||
451 | statusbit = IPS_SRC_NAT; | ||
452 | else | ||
453 | statusbit = IPS_DST_NAT; | ||
454 | |||
455 | /* Invert if this is reply dir. */ | ||
456 | if (dir == IP_CT_DIR_REPLY) | ||
457 | statusbit ^= IPS_NAT_MASK; | ||
458 | |||
459 | if (!(ct->status & statusbit)) | ||
460 | return 1; | ||
461 | |||
461 | pr_debug("icmp_reply_translation: translating error %p manip %u " | 462 | pr_debug("icmp_reply_translation: translating error %p manip %u " |
462 | "dir %s\n", skb, manip, | 463 | "dir %s\n", skb, manip, |
463 | dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); | 464 | dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); |
@@ -492,20 +493,9 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, | |||
492 | 493 | ||
493 | /* Change outer to look the reply to an incoming packet | 494 | /* Change outer to look the reply to an incoming packet |
494 | * (proto 0 means don't invert per-proto part). */ | 495 | * (proto 0 means don't invert per-proto part). */ |
495 | if (manip == IP_NAT_MANIP_SRC) | 496 | nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); |
496 | statusbit = IPS_SRC_NAT; | 497 | if (!manip_pkt(0, skb, 0, &target, manip)) |
497 | else | 498 | return 0; |
498 | statusbit = IPS_DST_NAT; | ||
499 | |||
500 | /* Invert if this is reply dir. */ | ||
501 | if (dir == IP_CT_DIR_REPLY) | ||
502 | statusbit ^= IPS_NAT_MASK; | ||
503 | |||
504 | if (ct->status & statusbit) { | ||
505 | nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); | ||
506 | if (!manip_pkt(0, skb, 0, &target, manip)) | ||
507 | return 0; | ||
508 | } | ||
509 | 499 | ||
510 | return 1; | 500 | return 1; |
511 | } | 501 | } |
@@ -517,7 +507,10 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto) | |||
517 | int ret = 0; | 507 | int ret = 0; |
518 | 508 | ||
519 | spin_lock_bh(&nf_nat_lock); | 509 | spin_lock_bh(&nf_nat_lock); |
520 | if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) { | 510 | if (rcu_dereference_protected( |
511 | nf_nat_protos[proto->protonum], | ||
512 | lockdep_is_held(&nf_nat_lock) | ||
513 | ) != &nf_nat_unknown_protocol) { | ||
521 | ret = -EBUSY; | 514 | ret = -EBUSY; |
522 | goto out; | 515 | goto out; |
523 | } | 516 | } |
@@ -528,7 +521,7 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto) | |||
528 | } | 521 | } |
529 | EXPORT_SYMBOL(nf_nat_protocol_register); | 522 | EXPORT_SYMBOL(nf_nat_protocol_register); |
530 | 523 | ||
531 | /* Noone stores the protocol anywhere; simply delete it. */ | 524 | /* No one stores the protocol anywhere; simply delete it. */ |
532 | void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto) | 525 | void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto) |
533 | { | 526 | { |
534 | spin_lock_bh(&nf_nat_lock); | 527 | spin_lock_bh(&nf_nat_lock); |
@@ -539,7 +532,7 @@ void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto) | |||
539 | } | 532 | } |
540 | EXPORT_SYMBOL(nf_nat_protocol_unregister); | 533 | EXPORT_SYMBOL(nf_nat_protocol_unregister); |
541 | 534 | ||
542 | /* Noone using conntrack by the time this called. */ | 535 | /* No one using conntrack by the time this called. */ |
543 | static void nf_nat_cleanup_conntrack(struct nf_conn *ct) | 536 | static void nf_nat_cleanup_conntrack(struct nf_conn *ct) |
544 | { | 537 | { |
545 | struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT); | 538 | struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT); |
@@ -547,7 +540,7 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) | |||
547 | if (nat == NULL || nat->ct == NULL) | 540 | if (nat == NULL || nat->ct == NULL) |
548 | return; | 541 | return; |
549 | 542 | ||
550 | NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK); | 543 | NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE); |
551 | 544 | ||
552 | spin_lock_bh(&nf_nat_lock); | 545 | spin_lock_bh(&nf_nat_lock); |
553 | hlist_del_rcu(&nat->bysource); | 546 | hlist_del_rcu(&nat->bysource); |
@@ -560,11 +553,10 @@ static void nf_nat_move_storage(void *new, void *old) | |||
560 | struct nf_conn_nat *old_nat = old; | 553 | struct nf_conn_nat *old_nat = old; |
561 | struct nf_conn *ct = old_nat->ct; | 554 | struct nf_conn *ct = old_nat->ct; |
562 | 555 | ||
563 | if (!ct || !(ct->status & IPS_NAT_DONE_MASK)) | 556 | if (!ct || !(ct->status & IPS_SRC_NAT_DONE)) |
564 | return; | 557 | return; |
565 | 558 | ||
566 | spin_lock_bh(&nf_nat_lock); | 559 | spin_lock_bh(&nf_nat_lock); |
567 | new_nat->ct = ct; | ||
568 | hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); | 560 | hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); |
569 | spin_unlock_bh(&nf_nat_lock); | 561 | spin_unlock_bh(&nf_nat_lock); |
570 | } | 562 | } |
@@ -583,6 +575,26 @@ static struct nf_ct_ext_type nat_extend __read_mostly = { | |||
583 | #include <linux/netfilter/nfnetlink.h> | 575 | #include <linux/netfilter/nfnetlink.h> |
584 | #include <linux/netfilter/nfnetlink_conntrack.h> | 576 | #include <linux/netfilter/nfnetlink_conntrack.h> |
585 | 577 | ||
578 | static const struct nf_nat_protocol * | ||
579 | nf_nat_proto_find_get(u_int8_t protonum) | ||
580 | { | ||
581 | const struct nf_nat_protocol *p; | ||
582 | |||
583 | rcu_read_lock(); | ||
584 | p = __nf_nat_proto_find(protonum); | ||
585 | if (!try_module_get(p->me)) | ||
586 | p = &nf_nat_unknown_protocol; | ||
587 | rcu_read_unlock(); | ||
588 | |||
589 | return p; | ||
590 | } | ||
591 | |||
592 | static void | ||
593 | nf_nat_proto_put(const struct nf_nat_protocol *p) | ||
594 | { | ||
595 | module_put(p->me); | ||
596 | } | ||
597 | |||
586 | static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { | 598 | static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { |
587 | [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, | 599 | [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, |
588 | [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, | 600 | [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, |
@@ -674,8 +686,7 @@ static int __net_init nf_nat_net_init(struct net *net) | |||
674 | { | 686 | { |
675 | /* Leave them the same for the moment. */ | 687 | /* Leave them the same for the moment. */ |
676 | net->ipv4.nat_htable_size = net->ct.htable_size; | 688 | net->ipv4.nat_htable_size = net->ct.htable_size; |
677 | net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, | 689 | net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0); |
678 | &net->ipv4.nat_vmalloced, 0); | ||
679 | if (!net->ipv4.nat_bysource) | 690 | if (!net->ipv4.nat_bysource) |
680 | return -ENOMEM; | 691 | return -ENOMEM; |
681 | return 0; | 692 | return 0; |
@@ -697,8 +708,7 @@ static void __net_exit nf_nat_net_exit(struct net *net) | |||
697 | { | 708 | { |
698 | nf_ct_iterate_cleanup(net, &clean_nat, NULL); | 709 | nf_ct_iterate_cleanup(net, &clean_nat, NULL); |
699 | synchronize_rcu(); | 710 | synchronize_rcu(); |
700 | nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, | 711 | nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size); |
701 | net->ipv4.nat_htable_size); | ||
702 | } | 712 | } |
703 | 713 | ||
704 | static struct pernet_operations nf_nat_net_ops = { | 714 | static struct pernet_operations nf_nat_net_ops = { |
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c index 86e0e84ff0a0..dc73abb3fe27 100644 --- a/net/ipv4/netfilter/nf_nat_ftp.c +++ b/net/ipv4/netfilter/nf_nat_ftp.c | |||
@@ -79,9 +79,16 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb, | |||
79 | 79 | ||
80 | /* Try to get same port: if not, try to change it. */ | 80 | /* Try to get same port: if not, try to change it. */ |
81 | for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { | 81 | for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { |
82 | int ret; | ||
83 | |||
82 | exp->tuple.dst.u.tcp.port = htons(port); | 84 | exp->tuple.dst.u.tcp.port = htons(port); |
83 | if (nf_ct_expect_related(exp) == 0) | 85 | ret = nf_ct_expect_related(exp); |
86 | if (ret == 0) | ||
87 | break; | ||
88 | else if (ret != -EBUSY) { | ||
89 | port = 0; | ||
84 | break; | 90 | break; |
91 | } | ||
85 | } | 92 | } |
86 | 93 | ||
87 | if (port == 0) | 94 | if (port == 0) |
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 5045196d853c..790f3160e012 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c | |||
@@ -222,13 +222,24 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, | |||
222 | /* Try to get a pair of ports. */ | 222 | /* Try to get a pair of ports. */ |
223 | for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port); | 223 | for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port); |
224 | nated_port != 0; nated_port += 2) { | 224 | nated_port != 0; nated_port += 2) { |
225 | int ret; | ||
226 | |||
225 | rtp_exp->tuple.dst.u.udp.port = htons(nated_port); | 227 | rtp_exp->tuple.dst.u.udp.port = htons(nated_port); |
226 | if (nf_ct_expect_related(rtp_exp) == 0) { | 228 | ret = nf_ct_expect_related(rtp_exp); |
229 | if (ret == 0) { | ||
227 | rtcp_exp->tuple.dst.u.udp.port = | 230 | rtcp_exp->tuple.dst.u.udp.port = |
228 | htons(nated_port + 1); | 231 | htons(nated_port + 1); |
229 | if (nf_ct_expect_related(rtcp_exp) == 0) | 232 | ret = nf_ct_expect_related(rtcp_exp); |
233 | if (ret == 0) | ||
234 | break; | ||
235 | else if (ret != -EBUSY) { | ||
236 | nf_ct_unexpect_related(rtp_exp); | ||
237 | nated_port = 0; | ||
230 | break; | 238 | break; |
231 | nf_ct_unexpect_related(rtp_exp); | 239 | } |
240 | } else if (ret != -EBUSY) { | ||
241 | nated_port = 0; | ||
242 | break; | ||
232 | } | 243 | } |
233 | } | 244 | } |
234 | 245 | ||
@@ -284,9 +295,16 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct, | |||
284 | 295 | ||
285 | /* Try to get same port: if not, try to change it. */ | 296 | /* Try to get same port: if not, try to change it. */ |
286 | for (; nated_port != 0; nated_port++) { | 297 | for (; nated_port != 0; nated_port++) { |
298 | int ret; | ||
299 | |||
287 | exp->tuple.dst.u.tcp.port = htons(nated_port); | 300 | exp->tuple.dst.u.tcp.port = htons(nated_port); |
288 | if (nf_ct_expect_related(exp) == 0) | 301 | ret = nf_ct_expect_related(exp); |
302 | if (ret == 0) | ||
303 | break; | ||
304 | else if (ret != -EBUSY) { | ||
305 | nated_port = 0; | ||
289 | break; | 306 | break; |
307 | } | ||
290 | } | 308 | } |
291 | 309 | ||
292 | if (nated_port == 0) { /* No port available */ | 310 | if (nated_port == 0) { /* No port available */ |
@@ -334,9 +352,16 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, | |||
334 | 352 | ||
335 | /* Try to get same port: if not, try to change it. */ | 353 | /* Try to get same port: if not, try to change it. */ |
336 | for (; nated_port != 0; nated_port++) { | 354 | for (; nated_port != 0; nated_port++) { |
355 | int ret; | ||
356 | |||
337 | exp->tuple.dst.u.tcp.port = htons(nated_port); | 357 | exp->tuple.dst.u.tcp.port = htons(nated_port); |
338 | if (nf_ct_expect_related(exp) == 0) | 358 | ret = nf_ct_expect_related(exp); |
359 | if (ret == 0) | ||
339 | break; | 360 | break; |
361 | else if (ret != -EBUSY) { | ||
362 | nated_port = 0; | ||
363 | break; | ||
364 | } | ||
340 | } | 365 | } |
341 | 366 | ||
342 | if (nated_port == 0) { /* No port available */ | 367 | if (nated_port == 0) { /* No port available */ |
@@ -418,9 +443,16 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, | |||
418 | 443 | ||
419 | /* Try to get same port: if not, try to change it. */ | 444 | /* Try to get same port: if not, try to change it. */ |
420 | for (; nated_port != 0; nated_port++) { | 445 | for (; nated_port != 0; nated_port++) { |
446 | int ret; | ||
447 | |||
421 | exp->tuple.dst.u.tcp.port = htons(nated_port); | 448 | exp->tuple.dst.u.tcp.port = htons(nated_port); |
422 | if (nf_ct_expect_related(exp) == 0) | 449 | ret = nf_ct_expect_related(exp); |
450 | if (ret == 0) | ||
451 | break; | ||
452 | else if (ret != -EBUSY) { | ||
453 | nated_port = 0; | ||
423 | break; | 454 | break; |
455 | } | ||
424 | } | 456 | } |
425 | 457 | ||
426 | if (nated_port == 0) { /* No port available */ | 458 | if (nated_port == 0) { /* No port available */ |
@@ -500,9 +532,16 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, | |||
500 | 532 | ||
501 | /* Try to get same port: if not, try to change it. */ | 533 | /* Try to get same port: if not, try to change it. */ |
502 | for (nated_port = ntohs(port); nated_port != 0; nated_port++) { | 534 | for (nated_port = ntohs(port); nated_port != 0; nated_port++) { |
535 | int ret; | ||
536 | |||
503 | exp->tuple.dst.u.tcp.port = htons(nated_port); | 537 | exp->tuple.dst.u.tcp.port = htons(nated_port); |
504 | if (nf_ct_expect_related(exp) == 0) | 538 | ret = nf_ct_expect_related(exp); |
539 | if (ret == 0) | ||
505 | break; | 540 | break; |
541 | else if (ret != -EBUSY) { | ||
542 | nated_port = 0; | ||
543 | break; | ||
544 | } | ||
506 | } | 545 | } |
507 | 546 | ||
508 | if (nated_port == 0) { /* No port available */ | 547 | if (nated_port == 0) { /* No port available */ |
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index 4a0c6b548eee..ebc5f8894f99 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c | |||
@@ -153,6 +153,35 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo, | |||
153 | } | 153 | } |
154 | EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); | 154 | EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); |
155 | 155 | ||
156 | static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data, | ||
157 | int datalen, __sum16 *check, int oldlen) | ||
158 | { | ||
159 | struct rtable *rt = skb_rtable(skb); | ||
160 | |||
161 | if (skb->ip_summed != CHECKSUM_PARTIAL) { | ||
162 | if (!(rt->rt_flags & RTCF_LOCAL) && | ||
163 | (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) { | ||
164 | skb->ip_summed = CHECKSUM_PARTIAL; | ||
165 | skb->csum_start = skb_headroom(skb) + | ||
166 | skb_network_offset(skb) + | ||
167 | iph->ihl * 4; | ||
168 | skb->csum_offset = (void *)check - data; | ||
169 | *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, | ||
170 | datalen, iph->protocol, 0); | ||
171 | } else { | ||
172 | *check = 0; | ||
173 | *check = csum_tcpudp_magic(iph->saddr, iph->daddr, | ||
174 | datalen, iph->protocol, | ||
175 | csum_partial(data, datalen, | ||
176 | 0)); | ||
177 | if (iph->protocol == IPPROTO_UDP && !*check) | ||
178 | *check = CSUM_MANGLED_0; | ||
179 | } | ||
180 | } else | ||
181 | inet_proto_csum_replace2(check, skb, | ||
182 | htons(oldlen), htons(datalen), 1); | ||
183 | } | ||
184 | |||
156 | /* Generic function for mangling variable-length address changes inside | 185 | /* Generic function for mangling variable-length address changes inside |
157 | * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX | 186 | * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX |
158 | * command in FTP). | 187 | * command in FTP). |
@@ -169,7 +198,6 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb, | |||
169 | const char *rep_buffer, | 198 | const char *rep_buffer, |
170 | unsigned int rep_len, bool adjust) | 199 | unsigned int rep_len, bool adjust) |
171 | { | 200 | { |
172 | struct rtable *rt = skb_rtable(skb); | ||
173 | struct iphdr *iph; | 201 | struct iphdr *iph; |
174 | struct tcphdr *tcph; | 202 | struct tcphdr *tcph; |
175 | int oldlen, datalen; | 203 | int oldlen, datalen; |
@@ -192,26 +220,7 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb, | |||
192 | match_offset, match_len, rep_buffer, rep_len); | 220 | match_offset, match_len, rep_buffer, rep_len); |
193 | 221 | ||
194 | datalen = skb->len - iph->ihl*4; | 222 | datalen = skb->len - iph->ihl*4; |
195 | if (skb->ip_summed != CHECKSUM_PARTIAL) { | 223 | nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen); |
196 | if (!(rt->rt_flags & RTCF_LOCAL) && | ||
197 | skb->dev->features & NETIF_F_V4_CSUM) { | ||
198 | skb->ip_summed = CHECKSUM_PARTIAL; | ||
199 | skb->csum_start = skb_headroom(skb) + | ||
200 | skb_network_offset(skb) + | ||
201 | iph->ihl * 4; | ||
202 | skb->csum_offset = offsetof(struct tcphdr, check); | ||
203 | tcph->check = ~tcp_v4_check(datalen, | ||
204 | iph->saddr, iph->daddr, 0); | ||
205 | } else { | ||
206 | tcph->check = 0; | ||
207 | tcph->check = tcp_v4_check(datalen, | ||
208 | iph->saddr, iph->daddr, | ||
209 | csum_partial(tcph, | ||
210 | datalen, 0)); | ||
211 | } | ||
212 | } else | ||
213 | inet_proto_csum_replace2(&tcph->check, skb, | ||
214 | htons(oldlen), htons(datalen), 1); | ||
215 | 224 | ||
216 | if (adjust && rep_len != match_len) | 225 | if (adjust && rep_len != match_len) |
217 | nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq, | 226 | nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq, |
@@ -240,7 +249,6 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, | |||
240 | const char *rep_buffer, | 249 | const char *rep_buffer, |
241 | unsigned int rep_len) | 250 | unsigned int rep_len) |
242 | { | 251 | { |
243 | struct rtable *rt = skb_rtable(skb); | ||
244 | struct iphdr *iph; | 252 | struct iphdr *iph; |
245 | struct udphdr *udph; | 253 | struct udphdr *udph; |
246 | int datalen, oldlen; | 254 | int datalen, oldlen; |
@@ -274,29 +282,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, | |||
274 | if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL) | 282 | if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL) |
275 | return 1; | 283 | return 1; |
276 | 284 | ||
277 | if (skb->ip_summed != CHECKSUM_PARTIAL) { | 285 | nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen); |
278 | if (!(rt->rt_flags & RTCF_LOCAL) && | ||
279 | skb->dev->features & NETIF_F_V4_CSUM) { | ||
280 | skb->ip_summed = CHECKSUM_PARTIAL; | ||
281 | skb->csum_start = skb_headroom(skb) + | ||
282 | skb_network_offset(skb) + | ||
283 | iph->ihl * 4; | ||
284 | skb->csum_offset = offsetof(struct udphdr, check); | ||
285 | udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, | ||
286 | datalen, IPPROTO_UDP, | ||
287 | 0); | ||
288 | } else { | ||
289 | udph->check = 0; | ||
290 | udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, | ||
291 | datalen, IPPROTO_UDP, | ||
292 | csum_partial(udph, | ||
293 | datalen, 0)); | ||
294 | if (!udph->check) | ||
295 | udph->check = CSUM_MANGLED_0; | ||
296 | } | ||
297 | } else | ||
298 | inet_proto_csum_replace2(&udph->check, skb, | ||
299 | htons(oldlen), htons(datalen), 1); | ||
300 | 286 | ||
301 | return 1; | 287 | return 1; |
302 | } | 288 | } |
diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c index ea83a886b03e..535e1a802356 100644 --- a/net/ipv4/netfilter/nf_nat_irc.c +++ b/net/ipv4/netfilter/nf_nat_irc.c | |||
@@ -45,9 +45,16 @@ static unsigned int help(struct sk_buff *skb, | |||
45 | 45 | ||
46 | /* Try to get same port: if not, try to change it. */ | 46 | /* Try to get same port: if not, try to change it. */ |
47 | for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { | 47 | for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { |
48 | int ret; | ||
49 | |||
48 | exp->tuple.dst.u.tcp.port = htons(port); | 50 | exp->tuple.dst.u.tcp.port = htons(port); |
49 | if (nf_ct_expect_related(exp) == 0) | 51 | ret = nf_ct_expect_related(exp); |
52 | if (ret == 0) | ||
53 | break; | ||
54 | else if (ret != -EBUSY) { | ||
55 | port = 0; | ||
50 | break; | 56 | break; |
57 | } | ||
51 | } | 58 | } |
52 | 59 | ||
53 | if (port == 0) | 60 | if (port == 0) |
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index ebbd319f62f5..733c9abc1cbd 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c | |||
@@ -53,7 +53,7 @@ ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par) | |||
53 | 53 | ||
54 | /* Connection must be valid and new. */ | 54 | /* Connection must be valid and new. */ |
55 | NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || | 55 | NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || |
56 | ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); | 56 | ctinfo == IP_CT_RELATED_REPLY)); |
57 | NF_CT_ASSERT(par->out != NULL); | 57 | NF_CT_ASSERT(par->out != NULL); |
58 | 58 | ||
59 | return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC); | 59 | return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC); |
@@ -106,16 +106,15 @@ alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) | |||
106 | { | 106 | { |
107 | /* Force range to this IP; let proto decide mapping for | 107 | /* Force range to this IP; let proto decide mapping for |
108 | per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). | 108 | per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). |
109 | Use reply in case it's already been mangled (eg local packet). | ||
110 | */ | 109 | */ |
111 | __be32 ip | 110 | struct nf_nat_range range; |
112 | = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC | 111 | |
113 | ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip | 112 | range.flags = 0; |
114 | : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); | 113 | pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, |
115 | struct nf_nat_range range | 114 | HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ? |
116 | = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } }; | 115 | &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip : |
117 | 116 | &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); | |
118 | pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, &ip); | 117 | |
119 | return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); | 118 | return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); |
120 | } | 119 | } |
121 | 120 | ||
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index 11b538deaaec..e40cf7816fdb 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c | |||
@@ -307,9 +307,16 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff, | |||
307 | exp->expectfn = ip_nat_sip_expected; | 307 | exp->expectfn = ip_nat_sip_expected; |
308 | 308 | ||
309 | for (; port != 0; port++) { | 309 | for (; port != 0; port++) { |
310 | int ret; | ||
311 | |||
310 | exp->tuple.dst.u.udp.port = htons(port); | 312 | exp->tuple.dst.u.udp.port = htons(port); |
311 | if (nf_ct_expect_related(exp) == 0) | 313 | ret = nf_ct_expect_related(exp); |
314 | if (ret == 0) | ||
315 | break; | ||
316 | else if (ret != -EBUSY) { | ||
317 | port = 0; | ||
312 | break; | 318 | break; |
319 | } | ||
313 | } | 320 | } |
314 | 321 | ||
315 | if (port == 0) | 322 | if (port == 0) |
@@ -480,13 +487,25 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff, | |||
480 | /* Try to get same pair of ports: if not, try to change them. */ | 487 | /* Try to get same pair of ports: if not, try to change them. */ |
481 | for (port = ntohs(rtp_exp->tuple.dst.u.udp.port); | 488 | for (port = ntohs(rtp_exp->tuple.dst.u.udp.port); |
482 | port != 0; port += 2) { | 489 | port != 0; port += 2) { |
490 | int ret; | ||
491 | |||
483 | rtp_exp->tuple.dst.u.udp.port = htons(port); | 492 | rtp_exp->tuple.dst.u.udp.port = htons(port); |
484 | if (nf_ct_expect_related(rtp_exp) != 0) | 493 | ret = nf_ct_expect_related(rtp_exp); |
494 | if (ret == -EBUSY) | ||
485 | continue; | 495 | continue; |
496 | else if (ret < 0) { | ||
497 | port = 0; | ||
498 | break; | ||
499 | } | ||
486 | rtcp_exp->tuple.dst.u.udp.port = htons(port + 1); | 500 | rtcp_exp->tuple.dst.u.udp.port = htons(port + 1); |
487 | if (nf_ct_expect_related(rtcp_exp) == 0) | 501 | ret = nf_ct_expect_related(rtcp_exp); |
502 | if (ret == 0) | ||
488 | break; | 503 | break; |
489 | nf_ct_unexpect_related(rtp_exp); | 504 | else if (ret != -EBUSY) { |
505 | nf_ct_unexpect_related(rtp_exp); | ||
506 | port = 0; | ||
507 | break; | ||
508 | } | ||
490 | } | 509 | } |
491 | 510 | ||
492 | if (port == 0) | 511 | if (port == 0) |
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index ee5f419d0a56..8812a02078ab 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c | |||
@@ -54,6 +54,7 @@ | |||
54 | #include <net/netfilter/nf_conntrack_expect.h> | 54 | #include <net/netfilter/nf_conntrack_expect.h> |
55 | #include <net/netfilter/nf_conntrack_helper.h> | 55 | #include <net/netfilter/nf_conntrack_helper.h> |
56 | #include <net/netfilter/nf_nat_helper.h> | 56 | #include <net/netfilter/nf_nat_helper.h> |
57 | #include <linux/netfilter/nf_conntrack_snmp.h> | ||
57 | 58 | ||
58 | MODULE_LICENSE("GPL"); | 59 | MODULE_LICENSE("GPL"); |
59 | MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); | 60 | MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); |
@@ -1310,9 +1311,9 @@ static int __init nf_nat_snmp_basic_init(void) | |||
1310 | { | 1311 | { |
1311 | int ret = 0; | 1312 | int ret = 0; |
1312 | 1313 | ||
1313 | ret = nf_conntrack_helper_register(&snmp_helper); | 1314 | BUG_ON(nf_nat_snmp_hook != NULL); |
1314 | if (ret < 0) | 1315 | rcu_assign_pointer(nf_nat_snmp_hook, help); |
1315 | return ret; | 1316 | |
1316 | ret = nf_conntrack_helper_register(&snmp_trap_helper); | 1317 | ret = nf_conntrack_helper_register(&snmp_trap_helper); |
1317 | if (ret < 0) { | 1318 | if (ret < 0) { |
1318 | nf_conntrack_helper_unregister(&snmp_helper); | 1319 | nf_conntrack_helper_unregister(&snmp_helper); |
@@ -1323,7 +1324,7 @@ static int __init nf_nat_snmp_basic_init(void) | |||
1323 | 1324 | ||
1324 | static void __exit nf_nat_snmp_basic_fini(void) | 1325 | static void __exit nf_nat_snmp_basic_fini(void) |
1325 | { | 1326 | { |
1326 | nf_conntrack_helper_unregister(&snmp_helper); | 1327 | rcu_assign_pointer(nf_nat_snmp_hook, NULL); |
1327 | nf_conntrack_helper_unregister(&snmp_trap_helper); | 1328 | nf_conntrack_helper_unregister(&snmp_trap_helper); |
1328 | } | 1329 | } |
1329 | 1330 | ||
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index 95481fee8bdb..483b76d042da 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #ifdef CONFIG_XFRM | 31 | #ifdef CONFIG_XFRM |
32 | static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) | 32 | static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) |
33 | { | 33 | { |
34 | struct flowi4 *fl4 = &fl->u.ip4; | ||
34 | const struct nf_conn *ct; | 35 | const struct nf_conn *ct; |
35 | const struct nf_conntrack_tuple *t; | 36 | const struct nf_conntrack_tuple *t; |
36 | enum ip_conntrack_info ctinfo; | 37 | enum ip_conntrack_info ctinfo; |
@@ -49,25 +50,25 @@ static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) | |||
49 | statusbit = IPS_SRC_NAT; | 50 | statusbit = IPS_SRC_NAT; |
50 | 51 | ||
51 | if (ct->status & statusbit) { | 52 | if (ct->status & statusbit) { |
52 | fl->fl4_dst = t->dst.u3.ip; | 53 | fl4->daddr = t->dst.u3.ip; |
53 | if (t->dst.protonum == IPPROTO_TCP || | 54 | if (t->dst.protonum == IPPROTO_TCP || |
54 | t->dst.protonum == IPPROTO_UDP || | 55 | t->dst.protonum == IPPROTO_UDP || |
55 | t->dst.protonum == IPPROTO_UDPLITE || | 56 | t->dst.protonum == IPPROTO_UDPLITE || |
56 | t->dst.protonum == IPPROTO_DCCP || | 57 | t->dst.protonum == IPPROTO_DCCP || |
57 | t->dst.protonum == IPPROTO_SCTP) | 58 | t->dst.protonum == IPPROTO_SCTP) |
58 | fl->fl_ip_dport = t->dst.u.tcp.port; | 59 | fl4->fl4_dport = t->dst.u.tcp.port; |
59 | } | 60 | } |
60 | 61 | ||
61 | statusbit ^= IPS_NAT_MASK; | 62 | statusbit ^= IPS_NAT_MASK; |
62 | 63 | ||
63 | if (ct->status & statusbit) { | 64 | if (ct->status & statusbit) { |
64 | fl->fl4_src = t->src.u3.ip; | 65 | fl4->saddr = t->src.u3.ip; |
65 | if (t->dst.protonum == IPPROTO_TCP || | 66 | if (t->dst.protonum == IPPROTO_TCP || |
66 | t->dst.protonum == IPPROTO_UDP || | 67 | t->dst.protonum == IPPROTO_UDP || |
67 | t->dst.protonum == IPPROTO_UDPLITE || | 68 | t->dst.protonum == IPPROTO_UDPLITE || |
68 | t->dst.protonum == IPPROTO_DCCP || | 69 | t->dst.protonum == IPPROTO_DCCP || |
69 | t->dst.protonum == IPPROTO_SCTP) | 70 | t->dst.protonum == IPPROTO_SCTP) |
70 | fl->fl_ip_sport = t->src.u.tcp.port; | 71 | fl4->fl4_sport = t->src.u.tcp.port; |
71 | } | 72 | } |
72 | } | 73 | } |
73 | #endif | 74 | #endif |
@@ -115,7 +116,7 @@ nf_nat_fn(unsigned int hooknum, | |||
115 | 116 | ||
116 | switch (ctinfo) { | 117 | switch (ctinfo) { |
117 | case IP_CT_RELATED: | 118 | case IP_CT_RELATED: |
118 | case IP_CT_RELATED+IP_CT_IS_REPLY: | 119 | case IP_CT_RELATED_REPLY: |
119 | if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { | 120 | if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { |
120 | if (!nf_nat_icmp_reply_translation(ct, ctinfo, | 121 | if (!nf_nat_icmp_reply_translation(ct, ctinfo, |
121 | hooknum, skb)) | 122 | hooknum, skb)) |
@@ -143,7 +144,7 @@ nf_nat_fn(unsigned int hooknum, | |||
143 | default: | 144 | default: |
144 | /* ESTABLISHED */ | 145 | /* ESTABLISHED */ |
145 | NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || | 146 | NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || |
146 | ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY)); | 147 | ctinfo == IP_CT_ESTABLISHED_REPLY); |
147 | } | 148 | } |
148 | 149 | ||
149 | return nf_nat_packet(ct, ctinfo, hooknum, skb); | 150 | return nf_nat_packet(ct, ctinfo, hooknum, skb); |
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c new file mode 100644 index 000000000000..39b403f854c6 --- /dev/null +++ b/net/ipv4/ping.c | |||
@@ -0,0 +1,931 @@ | |||
1 | /* | ||
2 | * INET An implementation of the TCP/IP protocol suite for the LINUX | ||
3 | * operating system. INET is implemented using the BSD Socket | ||
4 | * interface as the means of communication with the user level. | ||
5 | * | ||
6 | * "Ping" sockets | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License | ||
10 | * as published by the Free Software Foundation; either version | ||
11 | * 2 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * Based on ipv4/udp.c code. | ||
14 | * | ||
15 | * Authors: Vasiliy Kulikov / Openwall (for Linux 2.6), | ||
16 | * Pavel Kankovsky (for Linux 2.4.32) | ||
17 | * | ||
18 | * Pavel gave all rights to bugs to Vasiliy, | ||
19 | * none of the bugs are Pavel's now. | ||
20 | * | ||
21 | */ | ||
22 | |||
23 | #include <asm/system.h> | ||
24 | #include <linux/uaccess.h> | ||
25 | #include <linux/types.h> | ||
26 | #include <linux/fcntl.h> | ||
27 | #include <linux/socket.h> | ||
28 | #include <linux/sockios.h> | ||
29 | #include <linux/in.h> | ||
30 | #include <linux/errno.h> | ||
31 | #include <linux/timer.h> | ||
32 | #include <linux/mm.h> | ||
33 | #include <linux/inet.h> | ||
34 | #include <linux/netdevice.h> | ||
35 | #include <net/snmp.h> | ||
36 | #include <net/ip.h> | ||
37 | #include <net/ipv6.h> | ||
38 | #include <net/icmp.h> | ||
39 | #include <net/protocol.h> | ||
40 | #include <linux/skbuff.h> | ||
41 | #include <linux/proc_fs.h> | ||
42 | #include <net/sock.h> | ||
43 | #include <net/ping.h> | ||
44 | #include <net/udp.h> | ||
45 | #include <net/route.h> | ||
46 | #include <net/inet_common.h> | ||
47 | #include <net/checksum.h> | ||
48 | |||
49 | |||
50 | static struct ping_table ping_table; | ||
51 | |||
52 | static u16 ping_port_rover; | ||
53 | |||
54 | static inline int ping_hashfn(struct net *net, unsigned num, unsigned mask) | ||
55 | { | ||
56 | int res = (num + net_hash_mix(net)) & mask; | ||
57 | pr_debug("hash(%d) = %d\n", num, res); | ||
58 | return res; | ||
59 | } | ||
60 | |||
61 | static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table, | ||
62 | struct net *net, unsigned num) | ||
63 | { | ||
64 | return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)]; | ||
65 | } | ||
66 | |||
67 | static int ping_v4_get_port(struct sock *sk, unsigned short ident) | ||
68 | { | ||
69 | struct hlist_nulls_node *node; | ||
70 | struct hlist_nulls_head *hlist; | ||
71 | struct inet_sock *isk, *isk2; | ||
72 | struct sock *sk2 = NULL; | ||
73 | |||
74 | isk = inet_sk(sk); | ||
75 | write_lock_bh(&ping_table.lock); | ||
76 | if (ident == 0) { | ||
77 | u32 i; | ||
78 | u16 result = ping_port_rover + 1; | ||
79 | |||
80 | for (i = 0; i < (1L << 16); i++, result++) { | ||
81 | if (!result) | ||
82 | result++; /* avoid zero */ | ||
83 | hlist = ping_hashslot(&ping_table, sock_net(sk), | ||
84 | result); | ||
85 | ping_portaddr_for_each_entry(sk2, node, hlist) { | ||
86 | isk2 = inet_sk(sk2); | ||
87 | |||
88 | if (isk2->inet_num == result) | ||
89 | goto next_port; | ||
90 | } | ||
91 | |||
92 | /* found */ | ||
93 | ping_port_rover = ident = result; | ||
94 | break; | ||
95 | next_port: | ||
96 | ; | ||
97 | } | ||
98 | if (i >= (1L << 16)) | ||
99 | goto fail; | ||
100 | } else { | ||
101 | hlist = ping_hashslot(&ping_table, sock_net(sk), ident); | ||
102 | ping_portaddr_for_each_entry(sk2, node, hlist) { | ||
103 | isk2 = inet_sk(sk2); | ||
104 | |||
105 | if ((isk2->inet_num == ident) && | ||
106 | (sk2 != sk) && | ||
107 | (!sk2->sk_reuse || !sk->sk_reuse)) | ||
108 | goto fail; | ||
109 | } | ||
110 | } | ||
111 | |||
112 | pr_debug("found port/ident = %d\n", ident); | ||
113 | isk->inet_num = ident; | ||
114 | if (sk_unhashed(sk)) { | ||
115 | pr_debug("was not hashed\n"); | ||
116 | sock_hold(sk); | ||
117 | hlist_nulls_add_head(&sk->sk_nulls_node, hlist); | ||
118 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); | ||
119 | } | ||
120 | write_unlock_bh(&ping_table.lock); | ||
121 | return 0; | ||
122 | |||
123 | fail: | ||
124 | write_unlock_bh(&ping_table.lock); | ||
125 | return 1; | ||
126 | } | ||
127 | |||
128 | static void ping_v4_hash(struct sock *sk) | ||
129 | { | ||
130 | pr_debug("ping_v4_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); | ||
131 | BUG(); /* "Please do not press this button again." */ | ||
132 | } | ||
133 | |||
134 | static void ping_v4_unhash(struct sock *sk) | ||
135 | { | ||
136 | struct inet_sock *isk = inet_sk(sk); | ||
137 | pr_debug("ping_v4_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); | ||
138 | if (sk_hashed(sk)) { | ||
139 | write_lock_bh(&ping_table.lock); | ||
140 | hlist_nulls_del(&sk->sk_nulls_node); | ||
141 | sock_put(sk); | ||
142 | isk->inet_num = isk->inet_sport = 0; | ||
143 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); | ||
144 | write_unlock_bh(&ping_table.lock); | ||
145 | } | ||
146 | } | ||
147 | |||
148 | static struct sock *ping_v4_lookup(struct net *net, u32 saddr, u32 daddr, | ||
149 | u16 ident, int dif) | ||
150 | { | ||
151 | struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident); | ||
152 | struct sock *sk = NULL; | ||
153 | struct inet_sock *isk; | ||
154 | struct hlist_nulls_node *hnode; | ||
155 | |||
156 | pr_debug("try to find: num = %d, daddr = %ld, dif = %d\n", | ||
157 | (int)ident, (unsigned long)daddr, dif); | ||
158 | read_lock_bh(&ping_table.lock); | ||
159 | |||
160 | ping_portaddr_for_each_entry(sk, hnode, hslot) { | ||
161 | isk = inet_sk(sk); | ||
162 | |||
163 | pr_debug("found: %p: num = %d, daddr = %ld, dif = %d\n", sk, | ||
164 | (int)isk->inet_num, (unsigned long)isk->inet_rcv_saddr, | ||
165 | sk->sk_bound_dev_if); | ||
166 | |||
167 | pr_debug("iterate\n"); | ||
168 | if (isk->inet_num != ident) | ||
169 | continue; | ||
170 | if (isk->inet_rcv_saddr && isk->inet_rcv_saddr != daddr) | ||
171 | continue; | ||
172 | if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) | ||
173 | continue; | ||
174 | |||
175 | sock_hold(sk); | ||
176 | goto exit; | ||
177 | } | ||
178 | |||
179 | sk = NULL; | ||
180 | exit: | ||
181 | read_unlock_bh(&ping_table.lock); | ||
182 | |||
183 | return sk; | ||
184 | } | ||
185 | |||
186 | static void inet_get_ping_group_range_net(struct net *net, gid_t *low, | ||
187 | gid_t *high) | ||
188 | { | ||
189 | gid_t *data = net->ipv4.sysctl_ping_group_range; | ||
190 | unsigned seq; | ||
191 | do { | ||
192 | seq = read_seqbegin(&sysctl_local_ports.lock); | ||
193 | |||
194 | *low = data[0]; | ||
195 | *high = data[1]; | ||
196 | } while (read_seqretry(&sysctl_local_ports.lock, seq)); | ||
197 | } | ||
198 | |||
199 | |||
200 | static int ping_init_sock(struct sock *sk) | ||
201 | { | ||
202 | struct net *net = sock_net(sk); | ||
203 | gid_t group = current_egid(); | ||
204 | gid_t range[2]; | ||
205 | struct group_info *group_info = get_current_groups(); | ||
206 | int i, j, count = group_info->ngroups; | ||
207 | |||
208 | inet_get_ping_group_range_net(net, range, range+1); | ||
209 | if (range[0] <= group && group <= range[1]) | ||
210 | return 0; | ||
211 | |||
212 | for (i = 0; i < group_info->nblocks; i++) { | ||
213 | int cp_count = min_t(int, NGROUPS_PER_BLOCK, count); | ||
214 | |||
215 | for (j = 0; j < cp_count; j++) { | ||
216 | group = group_info->blocks[i][j]; | ||
217 | if (range[0] <= group && group <= range[1]) | ||
218 | return 0; | ||
219 | } | ||
220 | |||
221 | count -= cp_count; | ||
222 | } | ||
223 | |||
224 | return -EACCES; | ||
225 | } | ||
226 | |||
227 | static void ping_close(struct sock *sk, long timeout) | ||
228 | { | ||
229 | pr_debug("ping_close(sk=%p,sk->num=%u)\n", | ||
230 | inet_sk(sk), inet_sk(sk)->inet_num); | ||
231 | pr_debug("isk->refcnt = %d\n", sk->sk_refcnt.counter); | ||
232 | |||
233 | sk_common_release(sk); | ||
234 | } | ||
235 | |||
236 | /* | ||
237 | * We need our own bind because there are no privileged id's == local ports. | ||
238 | * Moreover, we don't allow binding to multi- and broadcast addresses. | ||
239 | */ | ||
240 | |||
241 | static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) | ||
242 | { | ||
243 | struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; | ||
244 | struct inet_sock *isk = inet_sk(sk); | ||
245 | unsigned short snum; | ||
246 | int chk_addr_ret; | ||
247 | int err; | ||
248 | |||
249 | if (addr_len < sizeof(struct sockaddr_in)) | ||
250 | return -EINVAL; | ||
251 | |||
252 | pr_debug("ping_v4_bind(sk=%p,sa_addr=%08x,sa_port=%d)\n", | ||
253 | sk, addr->sin_addr.s_addr, ntohs(addr->sin_port)); | ||
254 | |||
255 | chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); | ||
256 | if (addr->sin_addr.s_addr == INADDR_ANY) | ||
257 | chk_addr_ret = RTN_LOCAL; | ||
258 | |||
259 | if ((sysctl_ip_nonlocal_bind == 0 && | ||
260 | isk->freebind == 0 && isk->transparent == 0 && | ||
261 | chk_addr_ret != RTN_LOCAL) || | ||
262 | chk_addr_ret == RTN_MULTICAST || | ||
263 | chk_addr_ret == RTN_BROADCAST) | ||
264 | return -EADDRNOTAVAIL; | ||
265 | |||
266 | lock_sock(sk); | ||
267 | |||
268 | err = -EINVAL; | ||
269 | if (isk->inet_num != 0) | ||
270 | goto out; | ||
271 | |||
272 | err = -EADDRINUSE; | ||
273 | isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr; | ||
274 | snum = ntohs(addr->sin_port); | ||
275 | if (ping_v4_get_port(sk, snum) != 0) { | ||
276 | isk->inet_saddr = isk->inet_rcv_saddr = 0; | ||
277 | goto out; | ||
278 | } | ||
279 | |||
280 | pr_debug("after bind(): num = %d, daddr = %ld, dif = %d\n", | ||
281 | (int)isk->inet_num, | ||
282 | (unsigned long) isk->inet_rcv_saddr, | ||
283 | (int)sk->sk_bound_dev_if); | ||
284 | |||
285 | err = 0; | ||
286 | if (isk->inet_rcv_saddr) | ||
287 | sk->sk_userlocks |= SOCK_BINDADDR_LOCK; | ||
288 | if (snum) | ||
289 | sk->sk_userlocks |= SOCK_BINDPORT_LOCK; | ||
290 | isk->inet_sport = htons(isk->inet_num); | ||
291 | isk->inet_daddr = 0; | ||
292 | isk->inet_dport = 0; | ||
293 | sk_dst_reset(sk); | ||
294 | out: | ||
295 | release_sock(sk); | ||
296 | pr_debug("ping_v4_bind -> %d\n", err); | ||
297 | return err; | ||
298 | } | ||
299 | |||
300 | /* | ||
301 | * Is this a supported type of ICMP message? | ||
302 | */ | ||
303 | |||
304 | static inline int ping_supported(int type, int code) | ||
305 | { | ||
306 | if (type == ICMP_ECHO && code == 0) | ||
307 | return 1; | ||
308 | return 0; | ||
309 | } | ||
310 | |||
311 | /* | ||
312 | * This routine is called by the ICMP module when it gets some | ||
313 | * sort of error condition. | ||
314 | */ | ||
315 | |||
316 | static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); | ||
317 | |||
318 | void ping_err(struct sk_buff *skb, u32 info) | ||
319 | { | ||
320 | struct iphdr *iph = (struct iphdr *)skb->data; | ||
321 | struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2)); | ||
322 | struct inet_sock *inet_sock; | ||
323 | int type = icmph->type; | ||
324 | int code = icmph->code; | ||
325 | struct net *net = dev_net(skb->dev); | ||
326 | struct sock *sk; | ||
327 | int harderr; | ||
328 | int err; | ||
329 | |||
330 | /* We assume the packet has already been checked by icmp_unreach */ | ||
331 | |||
332 | if (!ping_supported(icmph->type, icmph->code)) | ||
333 | return; | ||
334 | |||
335 | pr_debug("ping_err(type=%04x,code=%04x,id=%04x,seq=%04x)\n", type, | ||
336 | code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); | ||
337 | |||
338 | sk = ping_v4_lookup(net, iph->daddr, iph->saddr, | ||
339 | ntohs(icmph->un.echo.id), skb->dev->ifindex); | ||
340 | if (sk == NULL) { | ||
341 | ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); | ||
342 | pr_debug("no socket, dropping\n"); | ||
343 | return; /* No socket for error */ | ||
344 | } | ||
345 | pr_debug("err on socket %p\n", sk); | ||
346 | |||
347 | err = 0; | ||
348 | harderr = 0; | ||
349 | inet_sock = inet_sk(sk); | ||
350 | |||
351 | switch (type) { | ||
352 | default: | ||
353 | case ICMP_TIME_EXCEEDED: | ||
354 | err = EHOSTUNREACH; | ||
355 | break; | ||
356 | case ICMP_SOURCE_QUENCH: | ||
357 | /* This is not a real error but ping wants to see it. | ||
358 | * Report it with some fake errno. */ | ||
359 | err = EREMOTEIO; | ||
360 | break; | ||
361 | case ICMP_PARAMETERPROB: | ||
362 | err = EPROTO; | ||
363 | harderr = 1; | ||
364 | break; | ||
365 | case ICMP_DEST_UNREACH: | ||
366 | if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ | ||
367 | if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { | ||
368 | err = EMSGSIZE; | ||
369 | harderr = 1; | ||
370 | break; | ||
371 | } | ||
372 | goto out; | ||
373 | } | ||
374 | err = EHOSTUNREACH; | ||
375 | if (code <= NR_ICMP_UNREACH) { | ||
376 | harderr = icmp_err_convert[code].fatal; | ||
377 | err = icmp_err_convert[code].errno; | ||
378 | } | ||
379 | break; | ||
380 | case ICMP_REDIRECT: | ||
381 | /* See ICMP_SOURCE_QUENCH */ | ||
382 | err = EREMOTEIO; | ||
383 | break; | ||
384 | } | ||
385 | |||
386 | /* | ||
387 | * RFC1122: OK. Passes ICMP errors back to application, as per | ||
388 | * 4.1.3.3. | ||
389 | */ | ||
390 | if (!inet_sock->recverr) { | ||
391 | if (!harderr || sk->sk_state != TCP_ESTABLISHED) | ||
392 | goto out; | ||
393 | } else { | ||
394 | ip_icmp_error(sk, skb, err, 0 /* no remote port */, | ||
395 | info, (u8 *)icmph); | ||
396 | } | ||
397 | sk->sk_err = err; | ||
398 | sk->sk_error_report(sk); | ||
399 | out: | ||
400 | sock_put(sk); | ||
401 | } | ||
402 | |||
403 | /* | ||
404 | * Copy and checksum an ICMP Echo packet from user space into a buffer. | ||
405 | */ | ||
406 | |||
407 | struct pingfakehdr { | ||
408 | struct icmphdr icmph; | ||
409 | struct iovec *iov; | ||
410 | u32 wcheck; | ||
411 | }; | ||
412 | |||
413 | static int ping_getfrag(void *from, char * to, | ||
414 | int offset, int fraglen, int odd, struct sk_buff *skb) | ||
415 | { | ||
416 | struct pingfakehdr *pfh = (struct pingfakehdr *)from; | ||
417 | |||
418 | if (offset == 0) { | ||
419 | if (fraglen < sizeof(struct icmphdr)) | ||
420 | BUG(); | ||
421 | if (csum_partial_copy_fromiovecend(to + sizeof(struct icmphdr), | ||
422 | pfh->iov, 0, fraglen - sizeof(struct icmphdr), | ||
423 | &pfh->wcheck)) | ||
424 | return -EFAULT; | ||
425 | |||
426 | return 0; | ||
427 | } | ||
428 | if (offset < sizeof(struct icmphdr)) | ||
429 | BUG(); | ||
430 | if (csum_partial_copy_fromiovecend | ||
431 | (to, pfh->iov, offset - sizeof(struct icmphdr), | ||
432 | fraglen, &pfh->wcheck)) | ||
433 | return -EFAULT; | ||
434 | return 0; | ||
435 | } | ||
436 | |||
437 | static int ping_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, | ||
438 | struct flowi4 *fl4) | ||
439 | { | ||
440 | struct sk_buff *skb = skb_peek(&sk->sk_write_queue); | ||
441 | |||
442 | pfh->wcheck = csum_partial((char *)&pfh->icmph, | ||
443 | sizeof(struct icmphdr), pfh->wcheck); | ||
444 | pfh->icmph.checksum = csum_fold(pfh->wcheck); | ||
445 | memcpy(icmp_hdr(skb), &pfh->icmph, sizeof(struct icmphdr)); | ||
446 | skb->ip_summed = CHECKSUM_NONE; | ||
447 | return ip_push_pending_frames(sk, fl4); | ||
448 | } | ||
449 | |||
450 | static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | ||
451 | size_t len) | ||
452 | { | ||
453 | struct net *net = sock_net(sk); | ||
454 | struct flowi4 fl4; | ||
455 | struct inet_sock *inet = inet_sk(sk); | ||
456 | struct ipcm_cookie ipc; | ||
457 | struct icmphdr user_icmph; | ||
458 | struct pingfakehdr pfh; | ||
459 | struct rtable *rt = NULL; | ||
460 | struct ip_options_data opt_copy; | ||
461 | int free = 0; | ||
462 | u32 saddr, daddr, faddr; | ||
463 | u8 tos; | ||
464 | int err; | ||
465 | |||
466 | pr_debug("ping_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); | ||
467 | |||
468 | |||
469 | if (len > 0xFFFF) | ||
470 | return -EMSGSIZE; | ||
471 | |||
472 | /* | ||
473 | * Check the flags. | ||
474 | */ | ||
475 | |||
476 | /* Mirror BSD error message compatibility */ | ||
477 | if (msg->msg_flags & MSG_OOB) | ||
478 | return -EOPNOTSUPP; | ||
479 | |||
480 | /* | ||
481 | * Fetch the ICMP header provided by the userland. | ||
482 | * iovec is modified! | ||
483 | */ | ||
484 | |||
485 | if (memcpy_fromiovec((u8 *)&user_icmph, msg->msg_iov, | ||
486 | sizeof(struct icmphdr))) | ||
487 | return -EFAULT; | ||
488 | if (!ping_supported(user_icmph.type, user_icmph.code)) | ||
489 | return -EINVAL; | ||
490 | |||
491 | /* | ||
492 | * Get and verify the address. | ||
493 | */ | ||
494 | |||
495 | if (msg->msg_name) { | ||
496 | struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; | ||
497 | if (msg->msg_namelen < sizeof(*usin)) | ||
498 | return -EINVAL; | ||
499 | if (usin->sin_family != AF_INET) | ||
500 | return -EINVAL; | ||
501 | daddr = usin->sin_addr.s_addr; | ||
502 | /* no remote port */ | ||
503 | } else { | ||
504 | if (sk->sk_state != TCP_ESTABLISHED) | ||
505 | return -EDESTADDRREQ; | ||
506 | daddr = inet->inet_daddr; | ||
507 | /* no remote port */ | ||
508 | } | ||
509 | |||
510 | ipc.addr = inet->inet_saddr; | ||
511 | ipc.opt = NULL; | ||
512 | ipc.oif = sk->sk_bound_dev_if; | ||
513 | ipc.tx_flags = 0; | ||
514 | err = sock_tx_timestamp(sk, &ipc.tx_flags); | ||
515 | if (err) | ||
516 | return err; | ||
517 | |||
518 | if (msg->msg_controllen) { | ||
519 | err = ip_cmsg_send(sock_net(sk), msg, &ipc); | ||
520 | if (err) | ||
521 | return err; | ||
522 | if (ipc.opt) | ||
523 | free = 1; | ||
524 | } | ||
525 | if (!ipc.opt) { | ||
526 | struct ip_options_rcu *inet_opt; | ||
527 | |||
528 | rcu_read_lock(); | ||
529 | inet_opt = rcu_dereference(inet->inet_opt); | ||
530 | if (inet_opt) { | ||
531 | memcpy(&opt_copy, inet_opt, | ||
532 | sizeof(*inet_opt) + inet_opt->opt.optlen); | ||
533 | ipc.opt = &opt_copy.opt; | ||
534 | } | ||
535 | rcu_read_unlock(); | ||
536 | } | ||
537 | |||
538 | saddr = ipc.addr; | ||
539 | ipc.addr = faddr = daddr; | ||
540 | |||
541 | if (ipc.opt && ipc.opt->opt.srr) { | ||
542 | if (!daddr) | ||
543 | return -EINVAL; | ||
544 | faddr = ipc.opt->opt.faddr; | ||
545 | } | ||
546 | tos = RT_TOS(inet->tos); | ||
547 | if (sock_flag(sk, SOCK_LOCALROUTE) || | ||
548 | (msg->msg_flags & MSG_DONTROUTE) || | ||
549 | (ipc.opt && ipc.opt->opt.is_strictroute)) { | ||
550 | tos |= RTO_ONLINK; | ||
551 | } | ||
552 | |||
553 | if (ipv4_is_multicast(daddr)) { | ||
554 | if (!ipc.oif) | ||
555 | ipc.oif = inet->mc_index; | ||
556 | if (!saddr) | ||
557 | saddr = inet->mc_addr; | ||
558 | } | ||
559 | |||
560 | flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, | ||
561 | RT_SCOPE_UNIVERSE, sk->sk_protocol, | ||
562 | inet_sk_flowi_flags(sk), faddr, saddr, 0, 0); | ||
563 | |||
564 | security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); | ||
565 | rt = ip_route_output_flow(net, &fl4, sk); | ||
566 | if (IS_ERR(rt)) { | ||
567 | err = PTR_ERR(rt); | ||
568 | rt = NULL; | ||
569 | if (err == -ENETUNREACH) | ||
570 | IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); | ||
571 | goto out; | ||
572 | } | ||
573 | |||
574 | err = -EACCES; | ||
575 | if ((rt->rt_flags & RTCF_BROADCAST) && | ||
576 | !sock_flag(sk, SOCK_BROADCAST)) | ||
577 | goto out; | ||
578 | |||
579 | if (msg->msg_flags & MSG_CONFIRM) | ||
580 | goto do_confirm; | ||
581 | back_from_confirm: | ||
582 | |||
583 | if (!ipc.addr) | ||
584 | ipc.addr = fl4.daddr; | ||
585 | |||
586 | lock_sock(sk); | ||
587 | |||
588 | pfh.icmph.type = user_icmph.type; /* already checked */ | ||
589 | pfh.icmph.code = user_icmph.code; /* ditto */ | ||
590 | pfh.icmph.checksum = 0; | ||
591 | pfh.icmph.un.echo.id = inet->inet_sport; | ||
592 | pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence; | ||
593 | pfh.iov = msg->msg_iov; | ||
594 | pfh.wcheck = 0; | ||
595 | |||
596 | err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len, | ||
597 | 0, &ipc, &rt, msg->msg_flags); | ||
598 | if (err) | ||
599 | ip_flush_pending_frames(sk); | ||
600 | else | ||
601 | err = ping_push_pending_frames(sk, &pfh, &fl4); | ||
602 | release_sock(sk); | ||
603 | |||
604 | out: | ||
605 | ip_rt_put(rt); | ||
606 | if (free) | ||
607 | kfree(ipc.opt); | ||
608 | if (!err) { | ||
609 | icmp_out_count(sock_net(sk), user_icmph.type); | ||
610 | return len; | ||
611 | } | ||
612 | return err; | ||
613 | |||
614 | do_confirm: | ||
615 | dst_confirm(&rt->dst); | ||
616 | if (!(msg->msg_flags & MSG_PROBE) || len) | ||
617 | goto back_from_confirm; | ||
618 | err = 0; | ||
619 | goto out; | ||
620 | } | ||
621 | |||
622 | static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | ||
623 | size_t len, int noblock, int flags, int *addr_len) | ||
624 | { | ||
625 | struct inet_sock *isk = inet_sk(sk); | ||
626 | struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; | ||
627 | struct sk_buff *skb; | ||
628 | int copied, err; | ||
629 | |||
630 | pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num); | ||
631 | |||
632 | if (flags & MSG_OOB) | ||
633 | goto out; | ||
634 | |||
635 | if (addr_len) | ||
636 | *addr_len = sizeof(*sin); | ||
637 | |||
638 | if (flags & MSG_ERRQUEUE) | ||
639 | return ip_recv_error(sk, msg, len); | ||
640 | |||
641 | skb = skb_recv_datagram(sk, flags, noblock, &err); | ||
642 | if (!skb) | ||
643 | goto out; | ||
644 | |||
645 | copied = skb->len; | ||
646 | if (copied > len) { | ||
647 | msg->msg_flags |= MSG_TRUNC; | ||
648 | copied = len; | ||
649 | } | ||
650 | |||
651 | /* Don't bother checking the checksum */ | ||
652 | err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); | ||
653 | if (err) | ||
654 | goto done; | ||
655 | |||
656 | sock_recv_timestamp(msg, sk, skb); | ||
657 | |||
658 | /* Copy the address. */ | ||
659 | if (sin) { | ||
660 | sin->sin_family = AF_INET; | ||
661 | sin->sin_port = 0 /* skb->h.uh->source */; | ||
662 | sin->sin_addr.s_addr = ip_hdr(skb)->saddr; | ||
663 | memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); | ||
664 | } | ||
665 | if (isk->cmsg_flags) | ||
666 | ip_cmsg_recv(msg, skb); | ||
667 | err = copied; | ||
668 | |||
669 | done: | ||
670 | skb_free_datagram(sk, skb); | ||
671 | out: | ||
672 | pr_debug("ping_recvmsg -> %d\n", err); | ||
673 | return err; | ||
674 | } | ||
675 | |||
676 | static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) | ||
677 | { | ||
678 | pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n", | ||
679 | inet_sk(sk), inet_sk(sk)->inet_num, skb); | ||
680 | if (sock_queue_rcv_skb(sk, skb) < 0) { | ||
681 | ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_INERRORS); | ||
682 | kfree_skb(skb); | ||
683 | pr_debug("ping_queue_rcv_skb -> failed\n"); | ||
684 | return -1; | ||
685 | } | ||
686 | return 0; | ||
687 | } | ||
688 | |||
689 | |||
690 | /* | ||
691 | * All we need to do is get the socket. | ||
692 | */ | ||
693 | |||
694 | void ping_rcv(struct sk_buff *skb) | ||
695 | { | ||
696 | struct sock *sk; | ||
697 | struct net *net = dev_net(skb->dev); | ||
698 | struct iphdr *iph = ip_hdr(skb); | ||
699 | struct icmphdr *icmph = icmp_hdr(skb); | ||
700 | u32 saddr = iph->saddr; | ||
701 | u32 daddr = iph->daddr; | ||
702 | |||
703 | /* We assume the packet has already been checked by icmp_rcv */ | ||
704 | |||
705 | pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n", | ||
706 | skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); | ||
707 | |||
708 | /* Push ICMP header back */ | ||
709 | skb_push(skb, skb->data - (u8 *)icmph); | ||
710 | |||
711 | sk = ping_v4_lookup(net, saddr, daddr, ntohs(icmph->un.echo.id), | ||
712 | skb->dev->ifindex); | ||
713 | if (sk != NULL) { | ||
714 | pr_debug("rcv on socket %p\n", sk); | ||
715 | ping_queue_rcv_skb(sk, skb_get(skb)); | ||
716 | sock_put(sk); | ||
717 | return; | ||
718 | } | ||
719 | pr_debug("no socket, dropping\n"); | ||
720 | |||
721 | /* We're called from icmp_rcv(). kfree_skb() is done there. */ | ||
722 | } | ||
723 | |||
724 | struct proto ping_prot = { | ||
725 | .name = "PING", | ||
726 | .owner = THIS_MODULE, | ||
727 | .init = ping_init_sock, | ||
728 | .close = ping_close, | ||
729 | .connect = ip4_datagram_connect, | ||
730 | .disconnect = udp_disconnect, | ||
731 | .setsockopt = ip_setsockopt, | ||
732 | .getsockopt = ip_getsockopt, | ||
733 | .sendmsg = ping_sendmsg, | ||
734 | .recvmsg = ping_recvmsg, | ||
735 | .bind = ping_bind, | ||
736 | .backlog_rcv = ping_queue_rcv_skb, | ||
737 | .hash = ping_v4_hash, | ||
738 | .unhash = ping_v4_unhash, | ||
739 | .get_port = ping_v4_get_port, | ||
740 | .obj_size = sizeof(struct inet_sock), | ||
741 | }; | ||
742 | EXPORT_SYMBOL(ping_prot); | ||
743 | |||
744 | #ifdef CONFIG_PROC_FS | ||
745 | |||
746 | static struct sock *ping_get_first(struct seq_file *seq, int start) | ||
747 | { | ||
748 | struct sock *sk; | ||
749 | struct ping_iter_state *state = seq->private; | ||
750 | struct net *net = seq_file_net(seq); | ||
751 | |||
752 | for (state->bucket = start; state->bucket < PING_HTABLE_SIZE; | ||
753 | ++state->bucket) { | ||
754 | struct hlist_nulls_node *node; | ||
755 | struct hlist_nulls_head *hslot; | ||
756 | |||
757 | hslot = &ping_table.hash[state->bucket]; | ||
758 | |||
759 | if (hlist_nulls_empty(hslot)) | ||
760 | continue; | ||
761 | |||
762 | sk_nulls_for_each(sk, node, hslot) { | ||
763 | if (net_eq(sock_net(sk), net)) | ||
764 | goto found; | ||
765 | } | ||
766 | } | ||
767 | sk = NULL; | ||
768 | found: | ||
769 | return sk; | ||
770 | } | ||
771 | |||
772 | static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk) | ||
773 | { | ||
774 | struct ping_iter_state *state = seq->private; | ||
775 | struct net *net = seq_file_net(seq); | ||
776 | |||
777 | do { | ||
778 | sk = sk_nulls_next(sk); | ||
779 | } while (sk && (!net_eq(sock_net(sk), net))); | ||
780 | |||
781 | if (!sk) | ||
782 | return ping_get_first(seq, state->bucket + 1); | ||
783 | return sk; | ||
784 | } | ||
785 | |||
786 | static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos) | ||
787 | { | ||
788 | struct sock *sk = ping_get_first(seq, 0); | ||
789 | |||
790 | if (sk) | ||
791 | while (pos && (sk = ping_get_next(seq, sk)) != NULL) | ||
792 | --pos; | ||
793 | return pos ? NULL : sk; | ||
794 | } | ||
795 | |||
796 | static void *ping_seq_start(struct seq_file *seq, loff_t *pos) | ||
797 | { | ||
798 | struct ping_iter_state *state = seq->private; | ||
799 | state->bucket = 0; | ||
800 | |||
801 | read_lock_bh(&ping_table.lock); | ||
802 | |||
803 | return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN; | ||
804 | } | ||
805 | |||
806 | static void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) | ||
807 | { | ||
808 | struct sock *sk; | ||
809 | |||
810 | if (v == SEQ_START_TOKEN) | ||
811 | sk = ping_get_idx(seq, 0); | ||
812 | else | ||
813 | sk = ping_get_next(seq, v); | ||
814 | |||
815 | ++*pos; | ||
816 | return sk; | ||
817 | } | ||
818 | |||
819 | static void ping_seq_stop(struct seq_file *seq, void *v) | ||
820 | { | ||
821 | read_unlock_bh(&ping_table.lock); | ||
822 | } | ||
823 | |||
824 | static void ping_format_sock(struct sock *sp, struct seq_file *f, | ||
825 | int bucket, int *len) | ||
826 | { | ||
827 | struct inet_sock *inet = inet_sk(sp); | ||
828 | __be32 dest = inet->inet_daddr; | ||
829 | __be32 src = inet->inet_rcv_saddr; | ||
830 | __u16 destp = ntohs(inet->inet_dport); | ||
831 | __u16 srcp = ntohs(inet->inet_sport); | ||
832 | |||
833 | seq_printf(f, "%5d: %08X:%04X %08X:%04X" | ||
834 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", | ||
835 | bucket, src, srcp, dest, destp, sp->sk_state, | ||
836 | sk_wmem_alloc_get(sp), | ||
837 | sk_rmem_alloc_get(sp), | ||
838 | 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), | ||
839 | atomic_read(&sp->sk_refcnt), sp, | ||
840 | atomic_read(&sp->sk_drops), len); | ||
841 | } | ||
842 | |||
843 | static int ping_seq_show(struct seq_file *seq, void *v) | ||
844 | { | ||
845 | if (v == SEQ_START_TOKEN) | ||
846 | seq_printf(seq, "%-127s\n", | ||
847 | " sl local_address rem_address st tx_queue " | ||
848 | "rx_queue tr tm->when retrnsmt uid timeout " | ||
849 | "inode ref pointer drops"); | ||
850 | else { | ||
851 | struct ping_iter_state *state = seq->private; | ||
852 | int len; | ||
853 | |||
854 | ping_format_sock(v, seq, state->bucket, &len); | ||
855 | seq_printf(seq, "%*s\n", 127 - len, ""); | ||
856 | } | ||
857 | return 0; | ||
858 | } | ||
859 | |||
860 | static const struct seq_operations ping_seq_ops = { | ||
861 | .show = ping_seq_show, | ||
862 | .start = ping_seq_start, | ||
863 | .next = ping_seq_next, | ||
864 | .stop = ping_seq_stop, | ||
865 | }; | ||
866 | |||
867 | static int ping_seq_open(struct inode *inode, struct file *file) | ||
868 | { | ||
869 | return seq_open_net(inode, file, &ping_seq_ops, | ||
870 | sizeof(struct ping_iter_state)); | ||
871 | } | ||
872 | |||
873 | static const struct file_operations ping_seq_fops = { | ||
874 | .open = ping_seq_open, | ||
875 | .read = seq_read, | ||
876 | .llseek = seq_lseek, | ||
877 | .release = seq_release_net, | ||
878 | }; | ||
879 | |||
880 | static int ping_proc_register(struct net *net) | ||
881 | { | ||
882 | struct proc_dir_entry *p; | ||
883 | int rc = 0; | ||
884 | |||
885 | p = proc_net_fops_create(net, "icmp", S_IRUGO, &ping_seq_fops); | ||
886 | if (!p) | ||
887 | rc = -ENOMEM; | ||
888 | return rc; | ||
889 | } | ||
890 | |||
891 | static void ping_proc_unregister(struct net *net) | ||
892 | { | ||
893 | proc_net_remove(net, "icmp"); | ||
894 | } | ||
895 | |||
896 | |||
897 | static int __net_init ping_proc_init_net(struct net *net) | ||
898 | { | ||
899 | return ping_proc_register(net); | ||
900 | } | ||
901 | |||
902 | static void __net_exit ping_proc_exit_net(struct net *net) | ||
903 | { | ||
904 | ping_proc_unregister(net); | ||
905 | } | ||
906 | |||
907 | static struct pernet_operations ping_net_ops = { | ||
908 | .init = ping_proc_init_net, | ||
909 | .exit = ping_proc_exit_net, | ||
910 | }; | ||
911 | |||
912 | int __init ping_proc_init(void) | ||
913 | { | ||
914 | return register_pernet_subsys(&ping_net_ops); | ||
915 | } | ||
916 | |||
917 | void ping_proc_exit(void) | ||
918 | { | ||
919 | unregister_pernet_subsys(&ping_net_ops); | ||
920 | } | ||
921 | |||
922 | #endif | ||
923 | |||
924 | void __init ping_init(void) | ||
925 | { | ||
926 | int i; | ||
927 | |||
928 | for (i = 0; i < PING_HTABLE_SIZE; i++) | ||
929 | INIT_HLIST_NULLS_HEAD(&ping_table.hash[i], i); | ||
930 | rwlock_init(&ping_table.lock); | ||
931 | } | ||
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 4ae1f203f7cb..b14ec7d03b6e 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c | |||
@@ -59,13 +59,13 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) | |||
59 | local_bh_enable(); | 59 | local_bh_enable(); |
60 | 60 | ||
61 | socket_seq_show(seq); | 61 | socket_seq_show(seq); |
62 | seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", | 62 | seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", |
63 | sock_prot_inuse_get(net, &tcp_prot), orphans, | 63 | sock_prot_inuse_get(net, &tcp_prot), orphans, |
64 | tcp_death_row.tw_count, sockets, | 64 | tcp_death_row.tw_count, sockets, |
65 | atomic_read(&tcp_memory_allocated)); | 65 | atomic_long_read(&tcp_memory_allocated)); |
66 | seq_printf(seq, "UDP: inuse %d mem %d\n", | 66 | seq_printf(seq, "UDP: inuse %d mem %ld\n", |
67 | sock_prot_inuse_get(net, &udp_prot), | 67 | sock_prot_inuse_get(net, &udp_prot), |
68 | atomic_read(&udp_memory_allocated)); | 68 | atomic_long_read(&udp_memory_allocated)); |
69 | seq_printf(seq, "UDPLITE: inuse %d\n", | 69 | seq_printf(seq, "UDPLITE: inuse %d\n", |
70 | sock_prot_inuse_get(net, &udplite_prot)); | 70 | sock_prot_inuse_get(net, &udplite_prot)); |
71 | seq_printf(seq, "RAW: inuse %d\n", | 71 | seq_printf(seq, "RAW: inuse %d\n", |
@@ -253,6 +253,7 @@ static const struct snmp_mib snmp4_net_list[] = { | |||
253 | SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), | 253 | SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), |
254 | SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), | 254 | SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), |
255 | SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), | 255 | SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), |
256 | SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW), | ||
256 | SNMP_MIB_SENTINEL | 257 | SNMP_MIB_SENTINEL |
257 | }; | 258 | }; |
258 | 259 | ||
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index f2d297351405..9ae5c01cd0b2 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c | |||
@@ -28,8 +28,7 @@ | |||
28 | #include <linux/spinlock.h> | 28 | #include <linux/spinlock.h> |
29 | #include <net/protocol.h> | 29 | #include <net/protocol.h> |
30 | 30 | ||
31 | const struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp; | 31 | const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; |
32 | static DEFINE_SPINLOCK(inet_proto_lock); | ||
33 | 32 | ||
34 | /* | 33 | /* |
35 | * Add a protocol handler to the hash tables | 34 | * Add a protocol handler to the hash tables |
@@ -37,20 +36,10 @@ static DEFINE_SPINLOCK(inet_proto_lock); | |||
37 | 36 | ||
38 | int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) | 37 | int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) |
39 | { | 38 | { |
40 | int hash, ret; | 39 | int hash = protocol & (MAX_INET_PROTOS - 1); |
41 | 40 | ||
42 | hash = protocol & (MAX_INET_PROTOS - 1); | 41 | return !cmpxchg((const struct net_protocol **)&inet_protos[hash], |
43 | 42 | NULL, prot) ? 0 : -1; | |
44 | spin_lock_bh(&inet_proto_lock); | ||
45 | if (inet_protos[hash]) { | ||
46 | ret = -1; | ||
47 | } else { | ||
48 | inet_protos[hash] = prot; | ||
49 | ret = 0; | ||
50 | } | ||
51 | spin_unlock_bh(&inet_proto_lock); | ||
52 | |||
53 | return ret; | ||
54 | } | 43 | } |
55 | EXPORT_SYMBOL(inet_add_protocol); | 44 | EXPORT_SYMBOL(inet_add_protocol); |
56 | 45 | ||
@@ -60,18 +49,10 @@ EXPORT_SYMBOL(inet_add_protocol); | |||
60 | 49 | ||
61 | int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) | 50 | int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) |
62 | { | 51 | { |
63 | int hash, ret; | 52 | int ret, hash = protocol & (MAX_INET_PROTOS - 1); |
64 | |||
65 | hash = protocol & (MAX_INET_PROTOS - 1); | ||
66 | 53 | ||
67 | spin_lock_bh(&inet_proto_lock); | 54 | ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash], |
68 | if (inet_protos[hash] == prot) { | 55 | prot, NULL) == prot) ? 0 : -1; |
69 | inet_protos[hash] = NULL; | ||
70 | ret = 0; | ||
71 | } else { | ||
72 | ret = -1; | ||
73 | } | ||
74 | spin_unlock_bh(&inet_proto_lock); | ||
75 | 56 | ||
76 | synchronize_net(); | 57 | synchronize_net(); |
77 | 58 | ||
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 009a7b2aa1ef..c9893d43242e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c | |||
@@ -76,6 +76,7 @@ | |||
76 | #include <linux/seq_file.h> | 76 | #include <linux/seq_file.h> |
77 | #include <linux/netfilter.h> | 77 | #include <linux/netfilter.h> |
78 | #include <linux/netfilter_ipv4.h> | 78 | #include <linux/netfilter_ipv4.h> |
79 | #include <linux/compat.h> | ||
79 | 80 | ||
80 | static struct raw_hashinfo raw_v4_hashinfo = { | 81 | static struct raw_hashinfo raw_v4_hashinfo = { |
81 | .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), | 82 | .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), |
@@ -153,7 +154,7 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) | |||
153 | * RFC 1122: SHOULD pass TOS value up to the transport layer. | 154 | * RFC 1122: SHOULD pass TOS value up to the transport layer. |
154 | * -> It does. And not only TOS, but all IP header. | 155 | * -> It does. And not only TOS, but all IP header. |
155 | */ | 156 | */ |
156 | static int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) | 157 | static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) |
157 | { | 158 | { |
158 | struct sock *sk; | 159 | struct sock *sk; |
159 | struct hlist_head *head; | 160 | struct hlist_head *head; |
@@ -246,7 +247,7 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) | |||
246 | } | 247 | } |
247 | 248 | ||
248 | if (inet->recverr) { | 249 | if (inet->recverr) { |
249 | struct iphdr *iph = (struct iphdr *)skb->data; | 250 | const struct iphdr *iph = (const struct iphdr *)skb->data; |
250 | u8 *payload = skb->data + (iph->ihl << 2); | 251 | u8 *payload = skb->data + (iph->ihl << 2); |
251 | 252 | ||
252 | if (inet->hdrincl) | 253 | if (inet->hdrincl) |
@@ -264,7 +265,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) | |||
264 | { | 265 | { |
265 | int hash; | 266 | int hash; |
266 | struct sock *raw_sk; | 267 | struct sock *raw_sk; |
267 | struct iphdr *iph; | 268 | const struct iphdr *iph; |
268 | struct net *net; | 269 | struct net *net; |
269 | 270 | ||
270 | hash = protocol & (RAW_HTABLE_SIZE - 1); | 271 | hash = protocol & (RAW_HTABLE_SIZE - 1); |
@@ -272,7 +273,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) | |||
272 | read_lock(&raw_v4_hashinfo.lock); | 273 | read_lock(&raw_v4_hashinfo.lock); |
273 | raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); | 274 | raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); |
274 | if (raw_sk != NULL) { | 275 | if (raw_sk != NULL) { |
275 | iph = (struct iphdr *)skb->data; | 276 | iph = (const struct iphdr *)skb->data; |
276 | net = dev_net(skb->dev); | 277 | net = dev_net(skb->dev); |
277 | 278 | ||
278 | while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol, | 279 | while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol, |
@@ -280,7 +281,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) | |||
280 | skb->dev->ifindex)) != NULL) { | 281 | skb->dev->ifindex)) != NULL) { |
281 | raw_err(raw_sk, skb, info); | 282 | raw_err(raw_sk, skb, info); |
282 | raw_sk = sk_next(raw_sk); | 283 | raw_sk = sk_next(raw_sk); |
283 | iph = (struct iphdr *)skb->data; | 284 | iph = (const struct iphdr *)skb->data; |
284 | } | 285 | } |
285 | } | 286 | } |
286 | read_unlock(&raw_v4_hashinfo.lock); | 287 | read_unlock(&raw_v4_hashinfo.lock); |
@@ -313,9 +314,10 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) | |||
313 | return 0; | 314 | return 0; |
314 | } | 315 | } |
315 | 316 | ||
316 | static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, | 317 | static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, |
317 | struct rtable **rtp, | 318 | void *from, size_t length, |
318 | unsigned int flags) | 319 | struct rtable **rtp, |
320 | unsigned int flags) | ||
319 | { | 321 | { |
320 | struct inet_sock *inet = inet_sk(sk); | 322 | struct inet_sock *inet = inet_sk(sk); |
321 | struct net *net = sock_net(sk); | 323 | struct net *net = sock_net(sk); |
@@ -326,7 +328,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, | |||
326 | struct rtable *rt = *rtp; | 328 | struct rtable *rt = *rtp; |
327 | 329 | ||
328 | if (length > rt->dst.dev->mtu) { | 330 | if (length > rt->dst.dev->mtu) { |
329 | ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, | 331 | ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, |
330 | rt->dst.dev->mtu); | 332 | rt->dst.dev->mtu); |
331 | return -EMSGSIZE; | 333 | return -EMSGSIZE; |
332 | } | 334 | } |
@@ -371,7 +373,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, | |||
371 | 373 | ||
372 | if (iphlen >= sizeof(*iph)) { | 374 | if (iphlen >= sizeof(*iph)) { |
373 | if (!iph->saddr) | 375 | if (!iph->saddr) |
374 | iph->saddr = rt->rt_src; | 376 | iph->saddr = fl4->saddr; |
375 | iph->check = 0; | 377 | iph->check = 0; |
376 | iph->tot_len = htons(length); | 378 | iph->tot_len = htons(length); |
377 | if (!iph->id) | 379 | if (!iph->id) |
@@ -401,7 +403,7 @@ error: | |||
401 | return err; | 403 | return err; |
402 | } | 404 | } |
403 | 405 | ||
404 | static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) | 406 | static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg) |
405 | { | 407 | { |
406 | struct iovec *iov; | 408 | struct iovec *iov; |
407 | u8 __user *type = NULL; | 409 | u8 __user *type = NULL; |
@@ -417,7 +419,7 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) | |||
417 | if (!iov) | 419 | if (!iov) |
418 | continue; | 420 | continue; |
419 | 421 | ||
420 | switch (fl->proto) { | 422 | switch (fl4->flowi4_proto) { |
421 | case IPPROTO_ICMP: | 423 | case IPPROTO_ICMP: |
422 | /* check if one-byte field is readable or not. */ | 424 | /* check if one-byte field is readable or not. */ |
423 | if (iov->iov_base && iov->iov_len < 1) | 425 | if (iov->iov_base && iov->iov_len < 1) |
@@ -432,8 +434,8 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) | |||
432 | code = iov->iov_base; | 434 | code = iov->iov_base; |
433 | 435 | ||
434 | if (type && code) { | 436 | if (type && code) { |
435 | if (get_user(fl->fl_icmp_type, type) || | 437 | if (get_user(fl4->fl4_icmp_type, type) || |
436 | get_user(fl->fl_icmp_code, code)) | 438 | get_user(fl4->fl4_icmp_code, code)) |
437 | return -EFAULT; | 439 | return -EFAULT; |
438 | probed = 1; | 440 | probed = 1; |
439 | } | 441 | } |
@@ -454,11 +456,13 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
454 | struct inet_sock *inet = inet_sk(sk); | 456 | struct inet_sock *inet = inet_sk(sk); |
455 | struct ipcm_cookie ipc; | 457 | struct ipcm_cookie ipc; |
456 | struct rtable *rt = NULL; | 458 | struct rtable *rt = NULL; |
459 | struct flowi4 fl4; | ||
457 | int free = 0; | 460 | int free = 0; |
458 | __be32 daddr; | 461 | __be32 daddr; |
459 | __be32 saddr; | 462 | __be32 saddr; |
460 | u8 tos; | 463 | u8 tos; |
461 | int err; | 464 | int err; |
465 | struct ip_options_data opt_copy; | ||
462 | 466 | ||
463 | err = -EMSGSIZE; | 467 | err = -EMSGSIZE; |
464 | if (len > 0xFFFF) | 468 | if (len > 0xFFFF) |
@@ -505,7 +509,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
505 | 509 | ||
506 | ipc.addr = inet->inet_saddr; | 510 | ipc.addr = inet->inet_saddr; |
507 | ipc.opt = NULL; | 511 | ipc.opt = NULL; |
508 | ipc.shtx.flags = 0; | 512 | ipc.tx_flags = 0; |
509 | ipc.oif = sk->sk_bound_dev_if; | 513 | ipc.oif = sk->sk_bound_dev_if; |
510 | 514 | ||
511 | if (msg->msg_controllen) { | 515 | if (msg->msg_controllen) { |
@@ -519,8 +523,18 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
519 | saddr = ipc.addr; | 523 | saddr = ipc.addr; |
520 | ipc.addr = daddr; | 524 | ipc.addr = daddr; |
521 | 525 | ||
522 | if (!ipc.opt) | 526 | if (!ipc.opt) { |
523 | ipc.opt = inet->opt; | 527 | struct ip_options_rcu *inet_opt; |
528 | |||
529 | rcu_read_lock(); | ||
530 | inet_opt = rcu_dereference(inet->inet_opt); | ||
531 | if (inet_opt) { | ||
532 | memcpy(&opt_copy, inet_opt, | ||
533 | sizeof(*inet_opt) + inet_opt->opt.optlen); | ||
534 | ipc.opt = &opt_copy.opt; | ||
535 | } | ||
536 | rcu_read_unlock(); | ||
537 | } | ||
524 | 538 | ||
525 | if (ipc.opt) { | 539 | if (ipc.opt) { |
526 | err = -EINVAL; | 540 | err = -EINVAL; |
@@ -529,10 +543,10 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
529 | */ | 543 | */ |
530 | if (inet->hdrincl) | 544 | if (inet->hdrincl) |
531 | goto done; | 545 | goto done; |
532 | if (ipc.opt->srr) { | 546 | if (ipc.opt->opt.srr) { |
533 | if (!daddr) | 547 | if (!daddr) |
534 | goto done; | 548 | goto done; |
535 | daddr = ipc.opt->faddr; | 549 | daddr = ipc.opt->opt.faddr; |
536 | } | 550 | } |
537 | } | 551 | } |
538 | tos = RT_CONN_FLAGS(sk); | 552 | tos = RT_CONN_FLAGS(sk); |
@@ -546,27 +560,24 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
546 | saddr = inet->mc_addr; | 560 | saddr = inet->mc_addr; |
547 | } | 561 | } |
548 | 562 | ||
549 | { | 563 | flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, |
550 | struct flowi fl = { .oif = ipc.oif, | 564 | RT_SCOPE_UNIVERSE, |
551 | .mark = sk->sk_mark, | 565 | inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, |
552 | .nl_u = { .ip4_u = | 566 | FLOWI_FLAG_CAN_SLEEP, daddr, saddr, 0, 0); |
553 | { .daddr = daddr, | ||
554 | .saddr = saddr, | ||
555 | .tos = tos } }, | ||
556 | .proto = inet->hdrincl ? IPPROTO_RAW : | ||
557 | sk->sk_protocol, | ||
558 | }; | ||
559 | if (!inet->hdrincl) { | ||
560 | err = raw_probe_proto_opt(&fl, msg); | ||
561 | if (err) | ||
562 | goto done; | ||
563 | } | ||
564 | 567 | ||
565 | security_sk_classify_flow(sk, &fl); | 568 | if (!inet->hdrincl) { |
566 | err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); | 569 | err = raw_probe_proto_opt(&fl4, msg); |
570 | if (err) | ||
571 | goto done; | ||
567 | } | 572 | } |
568 | if (err) | 573 | |
574 | security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); | ||
575 | rt = ip_route_output_flow(sock_net(sk), &fl4, sk); | ||
576 | if (IS_ERR(rt)) { | ||
577 | err = PTR_ERR(rt); | ||
578 | rt = NULL; | ||
569 | goto done; | 579 | goto done; |
580 | } | ||
570 | 581 | ||
571 | err = -EACCES; | 582 | err = -EACCES; |
572 | if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) | 583 | if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) |
@@ -577,19 +588,20 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
577 | back_from_confirm: | 588 | back_from_confirm: |
578 | 589 | ||
579 | if (inet->hdrincl) | 590 | if (inet->hdrincl) |
580 | err = raw_send_hdrinc(sk, msg->msg_iov, len, | 591 | err = raw_send_hdrinc(sk, &fl4, msg->msg_iov, len, |
581 | &rt, msg->msg_flags); | 592 | &rt, msg->msg_flags); |
582 | 593 | ||
583 | else { | 594 | else { |
584 | if (!ipc.addr) | 595 | if (!ipc.addr) |
585 | ipc.addr = rt->rt_dst; | 596 | ipc.addr = fl4.daddr; |
586 | lock_sock(sk); | 597 | lock_sock(sk); |
587 | err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, | 598 | err = ip_append_data(sk, &fl4, ip_generic_getfrag, |
588 | &ipc, &rt, msg->msg_flags); | 599 | msg->msg_iov, len, 0, |
600 | &ipc, &rt, msg->msg_flags); | ||
589 | if (err) | 601 | if (err) |
590 | ip_flush_pending_frames(sk); | 602 | ip_flush_pending_frames(sk); |
591 | else if (!(msg->msg_flags & MSG_MORE)) { | 603 | else if (!(msg->msg_flags & MSG_MORE)) { |
592 | err = ip_push_pending_frames(sk); | 604 | err = ip_push_pending_frames(sk, &fl4); |
593 | if (err == -ENOBUFS && !inet->recverr) | 605 | if (err == -ENOBUFS && !inet->recverr) |
594 | err = 0; | 606 | err = 0; |
595 | } | 607 | } |
@@ -616,7 +628,7 @@ do_confirm: | |||
616 | static void raw_close(struct sock *sk, long timeout) | 628 | static void raw_close(struct sock *sk, long timeout) |
617 | { | 629 | { |
618 | /* | 630 | /* |
619 | * Raw sockets may have direct kernel refereneces. Kill them. | 631 | * Raw sockets may have direct kernel references. Kill them. |
620 | */ | 632 | */ |
621 | ip_ra_control(sk, 0, NULL); | 633 | ip_ra_control(sk, 0, NULL); |
622 | 634 | ||
@@ -839,6 +851,23 @@ static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) | |||
839 | } | 851 | } |
840 | } | 852 | } |
841 | 853 | ||
854 | #ifdef CONFIG_COMPAT | ||
855 | static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) | ||
856 | { | ||
857 | switch (cmd) { | ||
858 | case SIOCOUTQ: | ||
859 | case SIOCINQ: | ||
860 | return -ENOIOCTLCMD; | ||
861 | default: | ||
862 | #ifdef CONFIG_IP_MROUTE | ||
863 | return ipmr_compat_ioctl(sk, cmd, compat_ptr(arg)); | ||
864 | #else | ||
865 | return -ENOIOCTLCMD; | ||
866 | #endif | ||
867 | } | ||
868 | } | ||
869 | #endif | ||
870 | |||
842 | struct proto raw_prot = { | 871 | struct proto raw_prot = { |
843 | .name = "RAW", | 872 | .name = "RAW", |
844 | .owner = THIS_MODULE, | 873 | .owner = THIS_MODULE, |
@@ -861,6 +890,7 @@ struct proto raw_prot = { | |||
861 | #ifdef CONFIG_COMPAT | 890 | #ifdef CONFIG_COMPAT |
862 | .compat_setsockopt = compat_raw_setsockopt, | 891 | .compat_setsockopt = compat_raw_setsockopt, |
863 | .compat_getsockopt = compat_raw_getsockopt, | 892 | .compat_getsockopt = compat_raw_getsockopt, |
893 | .compat_ioctl = compat_raw_ioctl, | ||
864 | #endif | 894 | #endif |
865 | }; | 895 | }; |
866 | 896 | ||
@@ -949,7 +979,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) | |||
949 | srcp = inet->inet_num; | 979 | srcp = inet->inet_num; |
950 | 980 | ||
951 | seq_printf(seq, "%4d: %08X:%04X %08X:%04X" | 981 | seq_printf(seq, "%4d: %08X:%04X %08X:%04X" |
952 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n", | 982 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n", |
953 | i, src, srcp, dest, destp, sp->sk_state, | 983 | i, src, srcp, dest, destp, sp->sk_state, |
954 | sk_wmem_alloc_get(sp), | 984 | sk_wmem_alloc_get(sp), |
955 | sk_rmem_alloc_get(sp), | 985 | sk_rmem_alloc_get(sp), |
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ac6559cb54f9..aa13ef105110 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
@@ -109,8 +109,8 @@ | |||
109 | #include <linux/sysctl.h> | 109 | #include <linux/sysctl.h> |
110 | #endif | 110 | #endif |
111 | 111 | ||
112 | #define RT_FL_TOS(oldflp) \ | 112 | #define RT_FL_TOS(oldflp4) \ |
113 | ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) | 113 | ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) |
114 | 114 | ||
115 | #define IP_MAX_MTU 0xFFF0 | 115 | #define IP_MAX_MTU 0xFFF0 |
116 | 116 | ||
@@ -131,42 +131,80 @@ static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; | |||
131 | static int ip_rt_min_advmss __read_mostly = 256; | 131 | static int ip_rt_min_advmss __read_mostly = 256; |
132 | static int rt_chain_length_max __read_mostly = 20; | 132 | static int rt_chain_length_max __read_mostly = 20; |
133 | 133 | ||
134 | static struct delayed_work expires_work; | ||
135 | static unsigned long expires_ljiffies; | ||
136 | |||
137 | /* | 134 | /* |
138 | * Interface to generic destination cache. | 135 | * Interface to generic destination cache. |
139 | */ | 136 | */ |
140 | 137 | ||
141 | static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); | 138 | static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); |
139 | static unsigned int ipv4_default_advmss(const struct dst_entry *dst); | ||
140 | static unsigned int ipv4_default_mtu(const struct dst_entry *dst); | ||
142 | static void ipv4_dst_destroy(struct dst_entry *dst); | 141 | static void ipv4_dst_destroy(struct dst_entry *dst); |
143 | static void ipv4_dst_ifdown(struct dst_entry *dst, | ||
144 | struct net_device *dev, int how); | ||
145 | static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); | 142 | static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); |
146 | static void ipv4_link_failure(struct sk_buff *skb); | 143 | static void ipv4_link_failure(struct sk_buff *skb); |
147 | static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); | 144 | static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); |
148 | static int rt_garbage_collect(struct dst_ops *ops); | 145 | static int rt_garbage_collect(struct dst_ops *ops); |
149 | 146 | ||
147 | static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, | ||
148 | int how) | ||
149 | { | ||
150 | } | ||
151 | |||
152 | static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) | ||
153 | { | ||
154 | struct rtable *rt = (struct rtable *) dst; | ||
155 | struct inet_peer *peer; | ||
156 | u32 *p = NULL; | ||
157 | |||
158 | if (!rt->peer) | ||
159 | rt_bind_peer(rt, rt->rt_dst, 1); | ||
160 | |||
161 | peer = rt->peer; | ||
162 | if (peer) { | ||
163 | u32 *old_p = __DST_METRICS_PTR(old); | ||
164 | unsigned long prev, new; | ||
165 | |||
166 | p = peer->metrics; | ||
167 | if (inet_metrics_new(peer)) | ||
168 | memcpy(p, old_p, sizeof(u32) * RTAX_MAX); | ||
169 | |||
170 | new = (unsigned long) p; | ||
171 | prev = cmpxchg(&dst->_metrics, old, new); | ||
172 | |||
173 | if (prev != old) { | ||
174 | p = __DST_METRICS_PTR(prev); | ||
175 | if (prev & DST_METRICS_READ_ONLY) | ||
176 | p = NULL; | ||
177 | } else { | ||
178 | if (rt->fi) { | ||
179 | fib_info_put(rt->fi); | ||
180 | rt->fi = NULL; | ||
181 | } | ||
182 | } | ||
183 | } | ||
184 | return p; | ||
185 | } | ||
150 | 186 | ||
151 | static struct dst_ops ipv4_dst_ops = { | 187 | static struct dst_ops ipv4_dst_ops = { |
152 | .family = AF_INET, | 188 | .family = AF_INET, |
153 | .protocol = cpu_to_be16(ETH_P_IP), | 189 | .protocol = cpu_to_be16(ETH_P_IP), |
154 | .gc = rt_garbage_collect, | 190 | .gc = rt_garbage_collect, |
155 | .check = ipv4_dst_check, | 191 | .check = ipv4_dst_check, |
192 | .default_advmss = ipv4_default_advmss, | ||
193 | .default_mtu = ipv4_default_mtu, | ||
194 | .cow_metrics = ipv4_cow_metrics, | ||
156 | .destroy = ipv4_dst_destroy, | 195 | .destroy = ipv4_dst_destroy, |
157 | .ifdown = ipv4_dst_ifdown, | 196 | .ifdown = ipv4_dst_ifdown, |
158 | .negative_advice = ipv4_negative_advice, | 197 | .negative_advice = ipv4_negative_advice, |
159 | .link_failure = ipv4_link_failure, | 198 | .link_failure = ipv4_link_failure, |
160 | .update_pmtu = ip_rt_update_pmtu, | 199 | .update_pmtu = ip_rt_update_pmtu, |
161 | .local_out = __ip_local_out, | 200 | .local_out = __ip_local_out, |
162 | .entries = ATOMIC_INIT(0), | ||
163 | }; | 201 | }; |
164 | 202 | ||
165 | #define ECN_OR_COST(class) TC_PRIO_##class | 203 | #define ECN_OR_COST(class) TC_PRIO_##class |
166 | 204 | ||
167 | const __u8 ip_tos2prio[16] = { | 205 | const __u8 ip_tos2prio[16] = { |
168 | TC_PRIO_BESTEFFORT, | 206 | TC_PRIO_BESTEFFORT, |
169 | ECN_OR_COST(FILLER), | 207 | ECN_OR_COST(BESTEFFORT), |
170 | TC_PRIO_BESTEFFORT, | 208 | TC_PRIO_BESTEFFORT, |
171 | ECN_OR_COST(BESTEFFORT), | 209 | ECN_OR_COST(BESTEFFORT), |
172 | TC_PRIO_BULK, | 210 | TC_PRIO_BULK, |
@@ -199,7 +237,7 @@ const __u8 ip_tos2prio[16] = { | |||
199 | */ | 237 | */ |
200 | 238 | ||
201 | struct rt_hash_bucket { | 239 | struct rt_hash_bucket { |
202 | struct rtable *chain; | 240 | struct rtable __rcu *chain; |
203 | }; | 241 | }; |
204 | 242 | ||
205 | #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ | 243 | #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ |
@@ -281,7 +319,7 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq) | |||
281 | struct rtable *r = NULL; | 319 | struct rtable *r = NULL; |
282 | 320 | ||
283 | for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { | 321 | for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { |
284 | if (!rt_hash_table[st->bucket].chain) | 322 | if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain)) |
285 | continue; | 323 | continue; |
286 | rcu_read_lock_bh(); | 324 | rcu_read_lock_bh(); |
287 | r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); | 325 | r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); |
@@ -301,17 +339,17 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq, | |||
301 | { | 339 | { |
302 | struct rt_cache_iter_state *st = seq->private; | 340 | struct rt_cache_iter_state *st = seq->private; |
303 | 341 | ||
304 | r = r->dst.rt_next; | 342 | r = rcu_dereference_bh(r->dst.rt_next); |
305 | while (!r) { | 343 | while (!r) { |
306 | rcu_read_unlock_bh(); | 344 | rcu_read_unlock_bh(); |
307 | do { | 345 | do { |
308 | if (--st->bucket < 0) | 346 | if (--st->bucket < 0) |
309 | return NULL; | 347 | return NULL; |
310 | } while (!rt_hash_table[st->bucket].chain); | 348 | } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain)); |
311 | rcu_read_lock_bh(); | 349 | rcu_read_lock_bh(); |
312 | r = rt_hash_table[st->bucket].chain; | 350 | r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); |
313 | } | 351 | } |
314 | return rcu_dereference_bh(r); | 352 | return r; |
315 | } | 353 | } |
316 | 354 | ||
317 | static struct rtable *rt_cache_get_next(struct seq_file *seq, | 355 | static struct rtable *rt_cache_get_next(struct seq_file *seq, |
@@ -382,12 +420,11 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) | |||
382 | (__force u32)r->rt_gateway, | 420 | (__force u32)r->rt_gateway, |
383 | r->rt_flags, atomic_read(&r->dst.__refcnt), | 421 | r->rt_flags, atomic_read(&r->dst.__refcnt), |
384 | r->dst.__use, 0, (__force u32)r->rt_src, | 422 | r->dst.__use, 0, (__force u32)r->rt_src, |
385 | (dst_metric(&r->dst, RTAX_ADVMSS) ? | 423 | dst_metric_advmss(&r->dst) + 40, |
386 | (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0), | ||
387 | dst_metric(&r->dst, RTAX_WINDOW), | 424 | dst_metric(&r->dst, RTAX_WINDOW), |
388 | (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + | 425 | (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + |
389 | dst_metric(&r->dst, RTAX_RTTVAR)), | 426 | dst_metric(&r->dst, RTAX_RTTVAR)), |
390 | r->fl.fl4_tos, | 427 | r->rt_key_tos, |
391 | r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1, | 428 | r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1, |
392 | r->dst.hh ? (r->dst.hh->hh_output == | 429 | r->dst.hh ? (r->dst.hh->hh_output == |
393 | dev_queue_xmit) : 0, | 430 | dev_queue_xmit) : 0, |
@@ -466,7 +503,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v) | |||
466 | 503 | ||
467 | seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " | 504 | seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " |
468 | " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", | 505 | " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", |
469 | atomic_read(&ipv4_dst_ops.entries), | 506 | dst_entries_get_slow(&ipv4_dst_ops), |
470 | st->in_hit, | 507 | st->in_hit, |
471 | st->in_slow_tot, | 508 | st->in_slow_tot, |
472 | st->in_slow_mc, | 509 | st->in_slow_mc, |
@@ -510,7 +547,7 @@ static const struct file_operations rt_cpu_seq_fops = { | |||
510 | .release = seq_release, | 547 | .release = seq_release, |
511 | }; | 548 | }; |
512 | 549 | ||
513 | #ifdef CONFIG_NET_CLS_ROUTE | 550 | #ifdef CONFIG_IP_ROUTE_CLASSID |
514 | static int rt_acct_proc_show(struct seq_file *m, void *v) | 551 | static int rt_acct_proc_show(struct seq_file *m, void *v) |
515 | { | 552 | { |
516 | struct ip_rt_acct *dst, *src; | 553 | struct ip_rt_acct *dst, *src; |
@@ -563,14 +600,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net) | |||
563 | if (!pde) | 600 | if (!pde) |
564 | goto err2; | 601 | goto err2; |
565 | 602 | ||
566 | #ifdef CONFIG_NET_CLS_ROUTE | 603 | #ifdef CONFIG_IP_ROUTE_CLASSID |
567 | pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); | 604 | pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); |
568 | if (!pde) | 605 | if (!pde) |
569 | goto err3; | 606 | goto err3; |
570 | #endif | 607 | #endif |
571 | return 0; | 608 | return 0; |
572 | 609 | ||
573 | #ifdef CONFIG_NET_CLS_ROUTE | 610 | #ifdef CONFIG_IP_ROUTE_CLASSID |
574 | err3: | 611 | err3: |
575 | remove_proc_entry("rt_cache", net->proc_net_stat); | 612 | remove_proc_entry("rt_cache", net->proc_net_stat); |
576 | #endif | 613 | #endif |
@@ -584,7 +621,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net) | |||
584 | { | 621 | { |
585 | remove_proc_entry("rt_cache", net->proc_net_stat); | 622 | remove_proc_entry("rt_cache", net->proc_net_stat); |
586 | remove_proc_entry("rt_cache", net->proc_net); | 623 | remove_proc_entry("rt_cache", net->proc_net); |
587 | #ifdef CONFIG_NET_CLS_ROUTE | 624 | #ifdef CONFIG_IP_ROUTE_CLASSID |
588 | remove_proc_entry("rt_acct", net->proc_net); | 625 | remove_proc_entry("rt_acct", net->proc_net); |
589 | #endif | 626 | #endif |
590 | } | 627 | } |
@@ -622,13 +659,13 @@ static inline int rt_fast_clean(struct rtable *rth) | |||
622 | /* Kill broadcast/multicast entries very aggresively, if they | 659 | /* Kill broadcast/multicast entries very aggresively, if they |
623 | collide in hash table with more useful entries */ | 660 | collide in hash table with more useful entries */ |
624 | return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && | 661 | return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && |
625 | rth->fl.iif && rth->dst.rt_next; | 662 | rt_is_input_route(rth) && rth->dst.rt_next; |
626 | } | 663 | } |
627 | 664 | ||
628 | static inline int rt_valuable(struct rtable *rth) | 665 | static inline int rt_valuable(struct rtable *rth) |
629 | { | 666 | { |
630 | return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || | 667 | return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || |
631 | rth->dst.expires; | 668 | (rth->peer && rth->peer->pmtu_expires); |
632 | } | 669 | } |
633 | 670 | ||
634 | static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) | 671 | static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) |
@@ -639,13 +676,7 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t | |||
639 | if (atomic_read(&rth->dst.__refcnt)) | 676 | if (atomic_read(&rth->dst.__refcnt)) |
640 | goto out; | 677 | goto out; |
641 | 678 | ||
642 | ret = 1; | ||
643 | if (rth->dst.expires && | ||
644 | time_after_eq(jiffies, rth->dst.expires)) | ||
645 | goto out; | ||
646 | |||
647 | age = jiffies - rth->dst.lastuse; | 679 | age = jiffies - rth->dst.lastuse; |
648 | ret = 0; | ||
649 | if ((age <= tmo1 && !rt_fast_clean(rth)) || | 680 | if ((age <= tmo1 && !rt_fast_clean(rth)) || |
650 | (age <= tmo2 && rt_valuable(rth))) | 681 | (age <= tmo2 && rt_valuable(rth))) |
651 | goto out; | 682 | goto out; |
@@ -667,7 +698,7 @@ static inline u32 rt_score(struct rtable *rt) | |||
667 | if (rt_valuable(rt)) | 698 | if (rt_valuable(rt)) |
668 | score |= (1<<31); | 699 | score |= (1<<31); |
669 | 700 | ||
670 | if (!rt->fl.iif || | 701 | if (rt_is_output_route(rt) || |
671 | !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) | 702 | !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) |
672 | score |= (1<<30); | 703 | score |= (1<<30); |
673 | 704 | ||
@@ -680,22 +711,22 @@ static inline bool rt_caching(const struct net *net) | |||
680 | net->ipv4.sysctl_rt_cache_rebuild_count; | 711 | net->ipv4.sysctl_rt_cache_rebuild_count; |
681 | } | 712 | } |
682 | 713 | ||
683 | static inline bool compare_hash_inputs(const struct flowi *fl1, | 714 | static inline bool compare_hash_inputs(const struct rtable *rt1, |
684 | const struct flowi *fl2) | 715 | const struct rtable *rt2) |
685 | { | 716 | { |
686 | return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) | | 717 | return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | |
687 | ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) | | 718 | ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | |
688 | (fl1->iif ^ fl2->iif)) == 0); | 719 | (rt1->rt_iif ^ rt2->rt_iif)) == 0); |
689 | } | 720 | } |
690 | 721 | ||
691 | static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) | 722 | static inline int compare_keys(struct rtable *rt1, struct rtable *rt2) |
692 | { | 723 | { |
693 | return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) | | 724 | return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | |
694 | ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) | | 725 | ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | |
695 | (fl1->mark ^ fl2->mark) | | 726 | (rt1->rt_mark ^ rt2->rt_mark) | |
696 | (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) | | 727 | (rt1->rt_key_tos ^ rt2->rt_key_tos) | |
697 | (fl1->oif ^ fl2->oif) | | 728 | (rt1->rt_oif ^ rt2->rt_oif) | |
698 | (fl1->iif ^ fl2->iif)) == 0; | 729 | (rt1->rt_iif ^ rt2->rt_iif)) == 0; |
699 | } | 730 | } |
700 | 731 | ||
701 | static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) | 732 | static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) |
@@ -713,55 +744,48 @@ static inline int rt_is_expired(struct rtable *rth) | |||
713 | * Can be called by a softirq or a process. | 744 | * Can be called by a softirq or a process. |
714 | * In the later case, we want to be reschedule if necessary | 745 | * In the later case, we want to be reschedule if necessary |
715 | */ | 746 | */ |
716 | static void rt_do_flush(int process_context) | 747 | static void rt_do_flush(struct net *net, int process_context) |
717 | { | 748 | { |
718 | unsigned int i; | 749 | unsigned int i; |
719 | struct rtable *rth, *next; | 750 | struct rtable *rth, *next; |
720 | struct rtable * tail; | ||
721 | 751 | ||
722 | for (i = 0; i <= rt_hash_mask; i++) { | 752 | for (i = 0; i <= rt_hash_mask; i++) { |
753 | struct rtable __rcu **pprev; | ||
754 | struct rtable *list; | ||
755 | |||
723 | if (process_context && need_resched()) | 756 | if (process_context && need_resched()) |
724 | cond_resched(); | 757 | cond_resched(); |
725 | rth = rt_hash_table[i].chain; | 758 | rth = rcu_dereference_raw(rt_hash_table[i].chain); |
726 | if (!rth) | 759 | if (!rth) |
727 | continue; | 760 | continue; |
728 | 761 | ||
729 | spin_lock_bh(rt_hash_lock_addr(i)); | 762 | spin_lock_bh(rt_hash_lock_addr(i)); |
730 | #ifdef CONFIG_NET_NS | ||
731 | { | ||
732 | struct rtable ** prev, * p; | ||
733 | 763 | ||
734 | rth = rt_hash_table[i].chain; | 764 | list = NULL; |
765 | pprev = &rt_hash_table[i].chain; | ||
766 | rth = rcu_dereference_protected(*pprev, | ||
767 | lockdep_is_held(rt_hash_lock_addr(i))); | ||
735 | 768 | ||
736 | /* defer releasing the head of the list after spin_unlock */ | 769 | while (rth) { |
737 | for (tail = rth; tail; tail = tail->dst.rt_next) | 770 | next = rcu_dereference_protected(rth->dst.rt_next, |
738 | if (!rt_is_expired(tail)) | 771 | lockdep_is_held(rt_hash_lock_addr(i))); |
739 | break; | 772 | |
740 | if (rth != tail) | 773 | if (!net || |
741 | rt_hash_table[i].chain = tail; | 774 | net_eq(dev_net(rth->dst.dev), net)) { |
742 | 775 | rcu_assign_pointer(*pprev, next); | |
743 | /* call rt_free on entries after the tail requiring flush */ | 776 | rcu_assign_pointer(rth->dst.rt_next, list); |
744 | prev = &rt_hash_table[i].chain; | 777 | list = rth; |
745 | for (p = *prev; p; p = next) { | ||
746 | next = p->dst.rt_next; | ||
747 | if (!rt_is_expired(p)) { | ||
748 | prev = &p->dst.rt_next; | ||
749 | } else { | 778 | } else { |
750 | *prev = next; | 779 | pprev = &rth->dst.rt_next; |
751 | rt_free(p); | ||
752 | } | 780 | } |
781 | rth = next; | ||
753 | } | 782 | } |
754 | } | 783 | |
755 | #else | ||
756 | rth = rt_hash_table[i].chain; | ||
757 | rt_hash_table[i].chain = NULL; | ||
758 | tail = NULL; | ||
759 | #endif | ||
760 | spin_unlock_bh(rt_hash_lock_addr(i)); | 784 | spin_unlock_bh(rt_hash_lock_addr(i)); |
761 | 785 | ||
762 | for (; rth != tail; rth = next) { | 786 | for (; list; list = next) { |
763 | next = rth->dst.rt_next; | 787 | next = rcu_dereference_protected(list->dst.rt_next, 1); |
764 | rt_free(rth); | 788 | rt_free(list); |
765 | } | 789 | } |
766 | } | 790 | } |
767 | } | 791 | } |
@@ -789,104 +813,15 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth) | |||
789 | const struct rtable *aux = head; | 813 | const struct rtable *aux = head; |
790 | 814 | ||
791 | while (aux != rth) { | 815 | while (aux != rth) { |
792 | if (compare_hash_inputs(&aux->fl, &rth->fl)) | 816 | if (compare_hash_inputs(aux, rth)) |
793 | return 0; | 817 | return 0; |
794 | aux = aux->dst.rt_next; | 818 | aux = rcu_dereference_protected(aux->dst.rt_next, 1); |
795 | } | 819 | } |
796 | return ONE; | 820 | return ONE; |
797 | } | 821 | } |
798 | 822 | ||
799 | static void rt_check_expire(void) | ||
800 | { | ||
801 | static unsigned int rover; | ||
802 | unsigned int i = rover, goal; | ||
803 | struct rtable *rth, **rthp; | ||
804 | unsigned long samples = 0; | ||
805 | unsigned long sum = 0, sum2 = 0; | ||
806 | unsigned long delta; | ||
807 | u64 mult; | ||
808 | |||
809 | delta = jiffies - expires_ljiffies; | ||
810 | expires_ljiffies = jiffies; | ||
811 | mult = ((u64)delta) << rt_hash_log; | ||
812 | if (ip_rt_gc_timeout > 1) | ||
813 | do_div(mult, ip_rt_gc_timeout); | ||
814 | goal = (unsigned int)mult; | ||
815 | if (goal > rt_hash_mask) | ||
816 | goal = rt_hash_mask + 1; | ||
817 | for (; goal > 0; goal--) { | ||
818 | unsigned long tmo = ip_rt_gc_timeout; | ||
819 | unsigned long length; | ||
820 | |||
821 | i = (i + 1) & rt_hash_mask; | ||
822 | rthp = &rt_hash_table[i].chain; | ||
823 | |||
824 | if (need_resched()) | ||
825 | cond_resched(); | ||
826 | |||
827 | samples++; | ||
828 | |||
829 | if (*rthp == NULL) | ||
830 | continue; | ||
831 | length = 0; | ||
832 | spin_lock_bh(rt_hash_lock_addr(i)); | ||
833 | while ((rth = *rthp) != NULL) { | ||
834 | prefetch(rth->dst.rt_next); | ||
835 | if (rt_is_expired(rth)) { | ||
836 | *rthp = rth->dst.rt_next; | ||
837 | rt_free(rth); | ||
838 | continue; | ||
839 | } | ||
840 | if (rth->dst.expires) { | ||
841 | /* Entry is expired even if it is in use */ | ||
842 | if (time_before_eq(jiffies, rth->dst.expires)) { | ||
843 | nofree: | ||
844 | tmo >>= 1; | ||
845 | rthp = &rth->dst.rt_next; | ||
846 | /* | ||
847 | * We only count entries on | ||
848 | * a chain with equal hash inputs once | ||
849 | * so that entries for different QOS | ||
850 | * levels, and other non-hash input | ||
851 | * attributes don't unfairly skew | ||
852 | * the length computation | ||
853 | */ | ||
854 | length += has_noalias(rt_hash_table[i].chain, rth); | ||
855 | continue; | ||
856 | } | ||
857 | } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) | ||
858 | goto nofree; | ||
859 | |||
860 | /* Cleanup aged off entries. */ | ||
861 | *rthp = rth->dst.rt_next; | ||
862 | rt_free(rth); | ||
863 | } | ||
864 | spin_unlock_bh(rt_hash_lock_addr(i)); | ||
865 | sum += length; | ||
866 | sum2 += length*length; | ||
867 | } | ||
868 | if (samples) { | ||
869 | unsigned long avg = sum / samples; | ||
870 | unsigned long sd = int_sqrt(sum2 / samples - avg*avg); | ||
871 | rt_chain_length_max = max_t(unsigned long, | ||
872 | ip_rt_gc_elasticity, | ||
873 | (avg + 4*sd) >> FRACT_BITS); | ||
874 | } | ||
875 | rover = i; | ||
876 | } | ||
877 | |||
878 | /* | ||
879 | * rt_worker_func() is run in process context. | ||
880 | * we call rt_check_expire() to scan part of the hash table | ||
881 | */ | ||
882 | static void rt_worker_func(struct work_struct *work) | ||
883 | { | ||
884 | rt_check_expire(); | ||
885 | schedule_delayed_work(&expires_work, ip_rt_gc_interval); | ||
886 | } | ||
887 | |||
888 | /* | 823 | /* |
889 | * Pertubation of rt_genid by a small quantity [1..256] | 824 | * Perturbation of rt_genid by a small quantity [1..256] |
890 | * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() | 825 | * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() |
891 | * many times (2^24) without giving recent rt_genid. | 826 | * many times (2^24) without giving recent rt_genid. |
892 | * Jenkins hash is strong enough that litle changes of rt_genid are OK. | 827 | * Jenkins hash is strong enough that litle changes of rt_genid are OK. |
@@ -907,13 +842,13 @@ void rt_cache_flush(struct net *net, int delay) | |||
907 | { | 842 | { |
908 | rt_cache_invalidate(net); | 843 | rt_cache_invalidate(net); |
909 | if (delay >= 0) | 844 | if (delay >= 0) |
910 | rt_do_flush(!in_softirq()); | 845 | rt_do_flush(net, !in_softirq()); |
911 | } | 846 | } |
912 | 847 | ||
913 | /* Flush previous cache invalidated entries from the cache */ | 848 | /* Flush previous cache invalidated entries from the cache */ |
914 | void rt_cache_flush_batch(void) | 849 | void rt_cache_flush_batch(struct net *net) |
915 | { | 850 | { |
916 | rt_do_flush(!in_softirq()); | 851 | rt_do_flush(net, !in_softirq()); |
917 | } | 852 | } |
918 | 853 | ||
919 | static void rt_emergency_hash_rebuild(struct net *net) | 854 | static void rt_emergency_hash_rebuild(struct net *net) |
@@ -942,9 +877,11 @@ static int rt_garbage_collect(struct dst_ops *ops) | |||
942 | static unsigned long last_gc; | 877 | static unsigned long last_gc; |
943 | static int rover; | 878 | static int rover; |
944 | static int equilibrium; | 879 | static int equilibrium; |
945 | struct rtable *rth, **rthp; | 880 | struct rtable *rth; |
881 | struct rtable __rcu **rthp; | ||
946 | unsigned long now = jiffies; | 882 | unsigned long now = jiffies; |
947 | int goal; | 883 | int goal; |
884 | int entries = dst_entries_get_fast(&ipv4_dst_ops); | ||
948 | 885 | ||
949 | /* | 886 | /* |
950 | * Garbage collection is pretty expensive, | 887 | * Garbage collection is pretty expensive, |
@@ -954,28 +891,28 @@ static int rt_garbage_collect(struct dst_ops *ops) | |||
954 | RT_CACHE_STAT_INC(gc_total); | 891 | RT_CACHE_STAT_INC(gc_total); |
955 | 892 | ||
956 | if (now - last_gc < ip_rt_gc_min_interval && | 893 | if (now - last_gc < ip_rt_gc_min_interval && |
957 | atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) { | 894 | entries < ip_rt_max_size) { |
958 | RT_CACHE_STAT_INC(gc_ignored); | 895 | RT_CACHE_STAT_INC(gc_ignored); |
959 | goto out; | 896 | goto out; |
960 | } | 897 | } |
961 | 898 | ||
899 | entries = dst_entries_get_slow(&ipv4_dst_ops); | ||
962 | /* Calculate number of entries, which we want to expire now. */ | 900 | /* Calculate number of entries, which we want to expire now. */ |
963 | goal = atomic_read(&ipv4_dst_ops.entries) - | 901 | goal = entries - (ip_rt_gc_elasticity << rt_hash_log); |
964 | (ip_rt_gc_elasticity << rt_hash_log); | ||
965 | if (goal <= 0) { | 902 | if (goal <= 0) { |
966 | if (equilibrium < ipv4_dst_ops.gc_thresh) | 903 | if (equilibrium < ipv4_dst_ops.gc_thresh) |
967 | equilibrium = ipv4_dst_ops.gc_thresh; | 904 | equilibrium = ipv4_dst_ops.gc_thresh; |
968 | goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; | 905 | goal = entries - equilibrium; |
969 | if (goal > 0) { | 906 | if (goal > 0) { |
970 | equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); | 907 | equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); |
971 | goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; | 908 | goal = entries - equilibrium; |
972 | } | 909 | } |
973 | } else { | 910 | } else { |
974 | /* We are in dangerous area. Try to reduce cache really | 911 | /* We are in dangerous area. Try to reduce cache really |
975 | * aggressively. | 912 | * aggressively. |
976 | */ | 913 | */ |
977 | goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); | 914 | goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); |
978 | equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal; | 915 | equilibrium = entries - goal; |
979 | } | 916 | } |
980 | 917 | ||
981 | if (now - last_gc >= ip_rt_gc_min_interval) | 918 | if (now - last_gc >= ip_rt_gc_min_interval) |
@@ -995,7 +932,8 @@ static int rt_garbage_collect(struct dst_ops *ops) | |||
995 | k = (k + 1) & rt_hash_mask; | 932 | k = (k + 1) & rt_hash_mask; |
996 | rthp = &rt_hash_table[k].chain; | 933 | rthp = &rt_hash_table[k].chain; |
997 | spin_lock_bh(rt_hash_lock_addr(k)); | 934 | spin_lock_bh(rt_hash_lock_addr(k)); |
998 | while ((rth = *rthp) != NULL) { | 935 | while ((rth = rcu_dereference_protected(*rthp, |
936 | lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) { | ||
999 | if (!rt_is_expired(rth) && | 937 | if (!rt_is_expired(rth) && |
1000 | !rt_may_expire(rth, tmo, expire)) { | 938 | !rt_may_expire(rth, tmo, expire)) { |
1001 | tmo >>= 1; | 939 | tmo >>= 1; |
@@ -1030,16 +968,14 @@ static int rt_garbage_collect(struct dst_ops *ops) | |||
1030 | break; | 968 | break; |
1031 | 969 | ||
1032 | expire >>= 1; | 970 | expire >>= 1; |
1033 | #if RT_CACHE_DEBUG >= 2 | ||
1034 | printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, | ||
1035 | atomic_read(&ipv4_dst_ops.entries), goal, i); | ||
1036 | #endif | ||
1037 | 971 | ||
1038 | if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) | 972 | if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) |
1039 | goto out; | 973 | goto out; |
1040 | } while (!in_softirq() && time_before_eq(jiffies, now)); | 974 | } while (!in_softirq() && time_before_eq(jiffies, now)); |
1041 | 975 | ||
1042 | if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) | 976 | if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) |
977 | goto out; | ||
978 | if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size) | ||
1043 | goto out; | 979 | goto out; |
1044 | if (net_ratelimit()) | 980 | if (net_ratelimit()) |
1045 | printk(KERN_WARNING "dst cache overflow\n"); | 981 | printk(KERN_WARNING "dst cache overflow\n"); |
@@ -1049,12 +985,9 @@ static int rt_garbage_collect(struct dst_ops *ops) | |||
1049 | work_done: | 985 | work_done: |
1050 | expire += ip_rt_gc_min_interval; | 986 | expire += ip_rt_gc_min_interval; |
1051 | if (expire > ip_rt_gc_timeout || | 987 | if (expire > ip_rt_gc_timeout || |
1052 | atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) | 988 | dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh || |
989 | dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh) | ||
1053 | expire = ip_rt_gc_timeout; | 990 | expire = ip_rt_gc_timeout; |
1054 | #if RT_CACHE_DEBUG >= 2 | ||
1055 | printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, | ||
1056 | atomic_read(&ipv4_dst_ops.entries), goal, rover); | ||
1057 | #endif | ||
1058 | out: return 0; | 991 | out: return 0; |
1059 | } | 992 | } |
1060 | 993 | ||
@@ -1068,17 +1001,17 @@ static int slow_chain_length(const struct rtable *head) | |||
1068 | 1001 | ||
1069 | while (rth) { | 1002 | while (rth) { |
1070 | length += has_noalias(head, rth); | 1003 | length += has_noalias(head, rth); |
1071 | rth = rth->dst.rt_next; | 1004 | rth = rcu_dereference_protected(rth->dst.rt_next, 1); |
1072 | } | 1005 | } |
1073 | return length >> FRACT_BITS; | 1006 | return length >> FRACT_BITS; |
1074 | } | 1007 | } |
1075 | 1008 | ||
1076 | static int rt_intern_hash(unsigned hash, struct rtable *rt, | 1009 | static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt, |
1077 | struct rtable **rp, struct sk_buff *skb, int ifindex) | 1010 | struct sk_buff *skb, int ifindex) |
1078 | { | 1011 | { |
1079 | struct rtable *rth, **rthp; | 1012 | struct rtable *rth, *cand; |
1013 | struct rtable __rcu **rthp, **candp; | ||
1080 | unsigned long now; | 1014 | unsigned long now; |
1081 | struct rtable *cand, **candp; | ||
1082 | u32 min_score; | 1015 | u32 min_score; |
1083 | int chain_length; | 1016 | int chain_length; |
1084 | int attempts = !in_softirq(); | 1017 | int attempts = !in_softirq(); |
@@ -1102,36 +1035,37 @@ restart: | |||
1102 | * Note that we do rt_free on this new route entry, so that | 1035 | * Note that we do rt_free on this new route entry, so that |
1103 | * once its refcount hits zero, we are still able to reap it | 1036 | * once its refcount hits zero, we are still able to reap it |
1104 | * (Thanks Alexey) | 1037 | * (Thanks Alexey) |
1105 | * Note also the rt_free uses call_rcu. We don't actually | 1038 | * Note: To avoid expensive rcu stuff for this uncached dst, |
1106 | * need rcu protection here, this is just our path to get | 1039 | * we set DST_NOCACHE so that dst_release() can free dst without |
1107 | * on the route gc list. | 1040 | * waiting a grace period. |
1108 | */ | 1041 | */ |
1109 | 1042 | ||
1110 | if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { | 1043 | rt->dst.flags |= DST_NOCACHE; |
1044 | if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { | ||
1111 | int err = arp_bind_neighbour(&rt->dst); | 1045 | int err = arp_bind_neighbour(&rt->dst); |
1112 | if (err) { | 1046 | if (err) { |
1113 | if (net_ratelimit()) | 1047 | if (net_ratelimit()) |
1114 | printk(KERN_WARNING | 1048 | printk(KERN_WARNING |
1115 | "Neighbour table failure & not caching routes.\n"); | 1049 | "Neighbour table failure & not caching routes.\n"); |
1116 | rt_drop(rt); | 1050 | ip_rt_put(rt); |
1117 | return err; | 1051 | return ERR_PTR(err); |
1118 | } | 1052 | } |
1119 | } | 1053 | } |
1120 | 1054 | ||
1121 | rt_free(rt); | ||
1122 | goto skip_hashing; | 1055 | goto skip_hashing; |
1123 | } | 1056 | } |
1124 | 1057 | ||
1125 | rthp = &rt_hash_table[hash].chain; | 1058 | rthp = &rt_hash_table[hash].chain; |
1126 | 1059 | ||
1127 | spin_lock_bh(rt_hash_lock_addr(hash)); | 1060 | spin_lock_bh(rt_hash_lock_addr(hash)); |
1128 | while ((rth = *rthp) != NULL) { | 1061 | while ((rth = rcu_dereference_protected(*rthp, |
1062 | lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { | ||
1129 | if (rt_is_expired(rth)) { | 1063 | if (rt_is_expired(rth)) { |
1130 | *rthp = rth->dst.rt_next; | 1064 | *rthp = rth->dst.rt_next; |
1131 | rt_free(rth); | 1065 | rt_free(rth); |
1132 | continue; | 1066 | continue; |
1133 | } | 1067 | } |
1134 | if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { | 1068 | if (compare_keys(rth, rt) && compare_netns(rth, rt)) { |
1135 | /* Put it first */ | 1069 | /* Put it first */ |
1136 | *rthp = rth->dst.rt_next; | 1070 | *rthp = rth->dst.rt_next; |
1137 | /* | 1071 | /* |
@@ -1151,11 +1085,9 @@ restart: | |||
1151 | spin_unlock_bh(rt_hash_lock_addr(hash)); | 1085 | spin_unlock_bh(rt_hash_lock_addr(hash)); |
1152 | 1086 | ||
1153 | rt_drop(rt); | 1087 | rt_drop(rt); |
1154 | if (rp) | 1088 | if (skb) |
1155 | *rp = rth; | ||
1156 | else | ||
1157 | skb_dst_set(skb, &rth->dst); | 1089 | skb_dst_set(skb, &rth->dst); |
1158 | return 0; | 1090 | return rth; |
1159 | } | 1091 | } |
1160 | 1092 | ||
1161 | if (!atomic_read(&rth->dst.__refcnt)) { | 1093 | if (!atomic_read(&rth->dst.__refcnt)) { |
@@ -1196,7 +1128,7 @@ restart: | |||
1196 | rt_emergency_hash_rebuild(net); | 1128 | rt_emergency_hash_rebuild(net); |
1197 | spin_unlock_bh(rt_hash_lock_addr(hash)); | 1129 | spin_unlock_bh(rt_hash_lock_addr(hash)); |
1198 | 1130 | ||
1199 | hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, | 1131 | hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, |
1200 | ifindex, rt_genid(net)); | 1132 | ifindex, rt_genid(net)); |
1201 | goto restart; | 1133 | goto restart; |
1202 | } | 1134 | } |
@@ -1205,14 +1137,14 @@ restart: | |||
1205 | /* Try to bind route to arp only if it is output | 1137 | /* Try to bind route to arp only if it is output |
1206 | route or unicast forwarding path. | 1138 | route or unicast forwarding path. |
1207 | */ | 1139 | */ |
1208 | if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { | 1140 | if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { |
1209 | int err = arp_bind_neighbour(&rt->dst); | 1141 | int err = arp_bind_neighbour(&rt->dst); |
1210 | if (err) { | 1142 | if (err) { |
1211 | spin_unlock_bh(rt_hash_lock_addr(hash)); | 1143 | spin_unlock_bh(rt_hash_lock_addr(hash)); |
1212 | 1144 | ||
1213 | if (err != -ENOBUFS) { | 1145 | if (err != -ENOBUFS) { |
1214 | rt_drop(rt); | 1146 | rt_drop(rt); |
1215 | return err; | 1147 | return ERR_PTR(err); |
1216 | } | 1148 | } |
1217 | 1149 | ||
1218 | /* Neighbour tables are full and nothing | 1150 | /* Neighbour tables are full and nothing |
@@ -1233,25 +1165,15 @@ restart: | |||
1233 | if (net_ratelimit()) | 1165 | if (net_ratelimit()) |
1234 | printk(KERN_WARNING "ipv4: Neighbour table overflow.\n"); | 1166 | printk(KERN_WARNING "ipv4: Neighbour table overflow.\n"); |
1235 | rt_drop(rt); | 1167 | rt_drop(rt); |
1236 | return -ENOBUFS; | 1168 | return ERR_PTR(-ENOBUFS); |
1237 | } | 1169 | } |
1238 | } | 1170 | } |
1239 | 1171 | ||
1240 | rt->dst.rt_next = rt_hash_table[hash].chain; | 1172 | rt->dst.rt_next = rt_hash_table[hash].chain; |
1241 | 1173 | ||
1242 | #if RT_CACHE_DEBUG >= 2 | ||
1243 | if (rt->dst.rt_next) { | ||
1244 | struct rtable *trt; | ||
1245 | printk(KERN_DEBUG "rt_cache @%02x: %pI4", | ||
1246 | hash, &rt->rt_dst); | ||
1247 | for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next) | ||
1248 | printk(" . %pI4", &trt->rt_dst); | ||
1249 | printk("\n"); | ||
1250 | } | ||
1251 | #endif | ||
1252 | /* | 1174 | /* |
1253 | * Since lookup is lockfree, we must make sure | 1175 | * Since lookup is lockfree, we must make sure |
1254 | * previous writes to rt are comitted to memory | 1176 | * previous writes to rt are committed to memory |
1255 | * before making rt visible to other CPUS. | 1177 | * before making rt visible to other CPUS. |
1256 | */ | 1178 | */ |
1257 | rcu_assign_pointer(rt_hash_table[hash].chain, rt); | 1179 | rcu_assign_pointer(rt_hash_table[hash].chain, rt); |
@@ -1259,28 +1181,28 @@ restart: | |||
1259 | spin_unlock_bh(rt_hash_lock_addr(hash)); | 1181 | spin_unlock_bh(rt_hash_lock_addr(hash)); |
1260 | 1182 | ||
1261 | skip_hashing: | 1183 | skip_hashing: |
1262 | if (rp) | 1184 | if (skb) |
1263 | *rp = rt; | ||
1264 | else | ||
1265 | skb_dst_set(skb, &rt->dst); | 1185 | skb_dst_set(skb, &rt->dst); |
1266 | return 0; | 1186 | return rt; |
1187 | } | ||
1188 | |||
1189 | static atomic_t __rt_peer_genid = ATOMIC_INIT(0); | ||
1190 | |||
1191 | static u32 rt_peer_genid(void) | ||
1192 | { | ||
1193 | return atomic_read(&__rt_peer_genid); | ||
1267 | } | 1194 | } |
1268 | 1195 | ||
1269 | void rt_bind_peer(struct rtable *rt, int create) | 1196 | void rt_bind_peer(struct rtable *rt, __be32 daddr, int create) |
1270 | { | 1197 | { |
1271 | static DEFINE_SPINLOCK(rt_peer_lock); | ||
1272 | struct inet_peer *peer; | 1198 | struct inet_peer *peer; |
1273 | 1199 | ||
1274 | peer = inet_getpeer(rt->rt_dst, create); | 1200 | peer = inet_getpeer_v4(daddr, create); |
1275 | 1201 | ||
1276 | spin_lock_bh(&rt_peer_lock); | 1202 | if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) |
1277 | if (rt->peer == NULL) { | ||
1278 | rt->peer = peer; | ||
1279 | peer = NULL; | ||
1280 | } | ||
1281 | spin_unlock_bh(&rt_peer_lock); | ||
1282 | if (peer) | ||
1283 | inet_putpeer(peer); | 1203 | inet_putpeer(peer); |
1204 | else | ||
1205 | rt->rt_peer_genid = rt_peer_genid(); | ||
1284 | } | 1206 | } |
1285 | 1207 | ||
1286 | /* | 1208 | /* |
@@ -1309,7 +1231,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) | |||
1309 | 1231 | ||
1310 | if (rt) { | 1232 | if (rt) { |
1311 | if (rt->peer == NULL) | 1233 | if (rt->peer == NULL) |
1312 | rt_bind_peer(rt, 1); | 1234 | rt_bind_peer(rt, rt->rt_dst, 1); |
1313 | 1235 | ||
1314 | /* If peer is attached to destination, it is never detached, | 1236 | /* If peer is attached to destination, it is never detached, |
1315 | so that we need not to grab a lock to dereference it. | 1237 | so that we need not to grab a lock to dereference it. |
@@ -1328,12 +1250,14 @@ EXPORT_SYMBOL(__ip_select_ident); | |||
1328 | 1250 | ||
1329 | static void rt_del(unsigned hash, struct rtable *rt) | 1251 | static void rt_del(unsigned hash, struct rtable *rt) |
1330 | { | 1252 | { |
1331 | struct rtable **rthp, *aux; | 1253 | struct rtable __rcu **rthp; |
1254 | struct rtable *aux; | ||
1332 | 1255 | ||
1333 | rthp = &rt_hash_table[hash].chain; | 1256 | rthp = &rt_hash_table[hash].chain; |
1334 | spin_lock_bh(rt_hash_lock_addr(hash)); | 1257 | spin_lock_bh(rt_hash_lock_addr(hash)); |
1335 | ip_rt_put(rt); | 1258 | ip_rt_put(rt); |
1336 | while ((aux = *rthp) != NULL) { | 1259 | while ((aux = rcu_dereference_protected(*rthp, |
1260 | lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { | ||
1337 | if (aux == rt || rt_is_expired(aux)) { | 1261 | if (aux == rt || rt_is_expired(aux)) { |
1338 | *rthp = aux->dst.rt_next; | 1262 | *rthp = aux->dst.rt_next; |
1339 | rt_free(aux); | 1263 | rt_free(aux); |
@@ -1348,12 +1272,8 @@ static void rt_del(unsigned hash, struct rtable *rt) | |||
1348 | void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, | 1272 | void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, |
1349 | __be32 saddr, struct net_device *dev) | 1273 | __be32 saddr, struct net_device *dev) |
1350 | { | 1274 | { |
1351 | int i, k; | ||
1352 | struct in_device *in_dev = __in_dev_get_rcu(dev); | 1275 | struct in_device *in_dev = __in_dev_get_rcu(dev); |
1353 | struct rtable *rth, **rthp; | 1276 | struct inet_peer *peer; |
1354 | __be32 skeys[2] = { saddr, 0 }; | ||
1355 | int ikeys[2] = { dev->ifindex, 0 }; | ||
1356 | struct netevent_redirect netevent; | ||
1357 | struct net *net; | 1277 | struct net *net; |
1358 | 1278 | ||
1359 | if (!in_dev) | 1279 | if (!in_dev) |
@@ -1365,9 +1285,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, | |||
1365 | ipv4_is_zeronet(new_gw)) | 1285 | ipv4_is_zeronet(new_gw)) |
1366 | goto reject_redirect; | 1286 | goto reject_redirect; |
1367 | 1287 | ||
1368 | if (!rt_caching(net)) | ||
1369 | goto reject_redirect; | ||
1370 | |||
1371 | if (!IN_DEV_SHARED_MEDIA(in_dev)) { | 1288 | if (!IN_DEV_SHARED_MEDIA(in_dev)) { |
1372 | if (!inet_addr_onlink(in_dev, new_gw, old_gw)) | 1289 | if (!inet_addr_onlink(in_dev, new_gw, old_gw)) |
1373 | goto reject_redirect; | 1290 | goto reject_redirect; |
@@ -1378,93 +1295,13 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, | |||
1378 | goto reject_redirect; | 1295 | goto reject_redirect; |
1379 | } | 1296 | } |
1380 | 1297 | ||
1381 | for (i = 0; i < 2; i++) { | 1298 | peer = inet_getpeer_v4(daddr, 1); |
1382 | for (k = 0; k < 2; k++) { | 1299 | if (peer) { |
1383 | unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], | 1300 | peer->redirect_learned.a4 = new_gw; |
1384 | rt_genid(net)); | ||
1385 | |||
1386 | rthp=&rt_hash_table[hash].chain; | ||
1387 | |||
1388 | while ((rth = rcu_dereference(*rthp)) != NULL) { | ||
1389 | struct rtable *rt; | ||
1390 | |||
1391 | if (rth->fl.fl4_dst != daddr || | ||
1392 | rth->fl.fl4_src != skeys[i] || | ||
1393 | rth->fl.oif != ikeys[k] || | ||
1394 | rth->fl.iif != 0 || | ||
1395 | rt_is_expired(rth) || | ||
1396 | !net_eq(dev_net(rth->dst.dev), net)) { | ||
1397 | rthp = &rth->dst.rt_next; | ||
1398 | continue; | ||
1399 | } | ||
1400 | |||
1401 | if (rth->rt_dst != daddr || | ||
1402 | rth->rt_src != saddr || | ||
1403 | rth->dst.error || | ||
1404 | rth->rt_gateway != old_gw || | ||
1405 | rth->dst.dev != dev) | ||
1406 | break; | ||
1407 | |||
1408 | dst_hold(&rth->dst); | ||
1409 | |||
1410 | rt = dst_alloc(&ipv4_dst_ops); | ||
1411 | if (rt == NULL) { | ||
1412 | ip_rt_put(rth); | ||
1413 | return; | ||
1414 | } | ||
1415 | |||
1416 | /* Copy all the information. */ | ||
1417 | *rt = *rth; | ||
1418 | rt->dst.__use = 1; | ||
1419 | atomic_set(&rt->dst.__refcnt, 1); | ||
1420 | rt->dst.child = NULL; | ||
1421 | if (rt->dst.dev) | ||
1422 | dev_hold(rt->dst.dev); | ||
1423 | if (rt->idev) | ||
1424 | in_dev_hold(rt->idev); | ||
1425 | rt->dst.obsolete = -1; | ||
1426 | rt->dst.lastuse = jiffies; | ||
1427 | rt->dst.path = &rt->dst; | ||
1428 | rt->dst.neighbour = NULL; | ||
1429 | rt->dst.hh = NULL; | ||
1430 | #ifdef CONFIG_XFRM | ||
1431 | rt->dst.xfrm = NULL; | ||
1432 | #endif | ||
1433 | rt->rt_genid = rt_genid(net); | ||
1434 | rt->rt_flags |= RTCF_REDIRECTED; | ||
1435 | |||
1436 | /* Gateway is different ... */ | ||
1437 | rt->rt_gateway = new_gw; | ||
1438 | |||
1439 | /* Redirect received -> path was valid */ | ||
1440 | dst_confirm(&rth->dst); | ||
1441 | |||
1442 | if (rt->peer) | ||
1443 | atomic_inc(&rt->peer->refcnt); | ||
1444 | |||
1445 | if (arp_bind_neighbour(&rt->dst) || | ||
1446 | !(rt->dst.neighbour->nud_state & | ||
1447 | NUD_VALID)) { | ||
1448 | if (rt->dst.neighbour) | ||
1449 | neigh_event_send(rt->dst.neighbour, NULL); | ||
1450 | ip_rt_put(rth); | ||
1451 | rt_drop(rt); | ||
1452 | goto do_next; | ||
1453 | } | ||
1454 | 1301 | ||
1455 | netevent.old = &rth->dst; | 1302 | inet_putpeer(peer); |
1456 | netevent.new = &rt->dst; | ||
1457 | call_netevent_notifiers(NETEVENT_REDIRECT, | ||
1458 | &netevent); | ||
1459 | 1303 | ||
1460 | rt_del(hash, rth); | 1304 | atomic_inc(&__rt_peer_genid); |
1461 | if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif)) | ||
1462 | ip_rt_put(rt); | ||
1463 | goto do_next; | ||
1464 | } | ||
1465 | do_next: | ||
1466 | ; | ||
1467 | } | ||
1468 | } | 1305 | } |
1469 | return; | 1306 | return; |
1470 | 1307 | ||
@@ -1479,6 +1316,23 @@ reject_redirect: | |||
1479 | ; | 1316 | ; |
1480 | } | 1317 | } |
1481 | 1318 | ||
1319 | static bool peer_pmtu_expired(struct inet_peer *peer) | ||
1320 | { | ||
1321 | unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); | ||
1322 | |||
1323 | return orig && | ||
1324 | time_after_eq(jiffies, orig) && | ||
1325 | cmpxchg(&peer->pmtu_expires, orig, 0) == orig; | ||
1326 | } | ||
1327 | |||
1328 | static bool peer_pmtu_cleaned(struct inet_peer *peer) | ||
1329 | { | ||
1330 | unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); | ||
1331 | |||
1332 | return orig && | ||
1333 | cmpxchg(&peer->pmtu_expires, orig, 0) == orig; | ||
1334 | } | ||
1335 | |||
1482 | static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) | 1336 | static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) |
1483 | { | 1337 | { |
1484 | struct rtable *rt = (struct rtable *)dst; | 1338 | struct rtable *rt = (struct rtable *)dst; |
@@ -1488,18 +1342,14 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) | |||
1488 | if (dst->obsolete > 0) { | 1342 | if (dst->obsolete > 0) { |
1489 | ip_rt_put(rt); | 1343 | ip_rt_put(rt); |
1490 | ret = NULL; | 1344 | ret = NULL; |
1491 | } else if ((rt->rt_flags & RTCF_REDIRECTED) || | 1345 | } else if (rt->rt_flags & RTCF_REDIRECTED) { |
1492 | (rt->dst.expires && | 1346 | unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, |
1493 | time_after_eq(jiffies, rt->dst.expires))) { | 1347 | rt->rt_oif, |
1494 | unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, | ||
1495 | rt->fl.oif, | ||
1496 | rt_genid(dev_net(dst->dev))); | 1348 | rt_genid(dev_net(dst->dev))); |
1497 | #if RT_CACHE_DEBUG >= 1 | ||
1498 | printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n", | ||
1499 | &rt->rt_dst, rt->fl.fl4_tos); | ||
1500 | #endif | ||
1501 | rt_del(hash, rt); | 1349 | rt_del(hash, rt); |
1502 | ret = NULL; | 1350 | ret = NULL; |
1351 | } else if (rt->peer && peer_pmtu_expired(rt->peer)) { | ||
1352 | dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig); | ||
1503 | } | 1353 | } |
1504 | } | 1354 | } |
1505 | return ret; | 1355 | return ret; |
@@ -1525,6 +1375,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) | |||
1525 | { | 1375 | { |
1526 | struct rtable *rt = skb_rtable(skb); | 1376 | struct rtable *rt = skb_rtable(skb); |
1527 | struct in_device *in_dev; | 1377 | struct in_device *in_dev; |
1378 | struct inet_peer *peer; | ||
1528 | int log_martians; | 1379 | int log_martians; |
1529 | 1380 | ||
1530 | rcu_read_lock(); | 1381 | rcu_read_lock(); |
@@ -1536,36 +1387,44 @@ void ip_rt_send_redirect(struct sk_buff *skb) | |||
1536 | log_martians = IN_DEV_LOG_MARTIANS(in_dev); | 1387 | log_martians = IN_DEV_LOG_MARTIANS(in_dev); |
1537 | rcu_read_unlock(); | 1388 | rcu_read_unlock(); |
1538 | 1389 | ||
1390 | if (!rt->peer) | ||
1391 | rt_bind_peer(rt, rt->rt_dst, 1); | ||
1392 | peer = rt->peer; | ||
1393 | if (!peer) { | ||
1394 | icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); | ||
1395 | return; | ||
1396 | } | ||
1397 | |||
1539 | /* No redirected packets during ip_rt_redirect_silence; | 1398 | /* No redirected packets during ip_rt_redirect_silence; |
1540 | * reset the algorithm. | 1399 | * reset the algorithm. |
1541 | */ | 1400 | */ |
1542 | if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence)) | 1401 | if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) |
1543 | rt->dst.rate_tokens = 0; | 1402 | peer->rate_tokens = 0; |
1544 | 1403 | ||
1545 | /* Too many ignored redirects; do not send anything | 1404 | /* Too many ignored redirects; do not send anything |
1546 | * set dst.rate_last to the last seen redirected packet. | 1405 | * set dst.rate_last to the last seen redirected packet. |
1547 | */ | 1406 | */ |
1548 | if (rt->dst.rate_tokens >= ip_rt_redirect_number) { | 1407 | if (peer->rate_tokens >= ip_rt_redirect_number) { |
1549 | rt->dst.rate_last = jiffies; | 1408 | peer->rate_last = jiffies; |
1550 | return; | 1409 | return; |
1551 | } | 1410 | } |
1552 | 1411 | ||
1553 | /* Check for load limit; set rate_last to the latest sent | 1412 | /* Check for load limit; set rate_last to the latest sent |
1554 | * redirect. | 1413 | * redirect. |
1555 | */ | 1414 | */ |
1556 | if (rt->dst.rate_tokens == 0 || | 1415 | if (peer->rate_tokens == 0 || |
1557 | time_after(jiffies, | 1416 | time_after(jiffies, |
1558 | (rt->dst.rate_last + | 1417 | (peer->rate_last + |
1559 | (ip_rt_redirect_load << rt->dst.rate_tokens)))) { | 1418 | (ip_rt_redirect_load << peer->rate_tokens)))) { |
1560 | icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); | 1419 | icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); |
1561 | rt->dst.rate_last = jiffies; | 1420 | peer->rate_last = jiffies; |
1562 | ++rt->dst.rate_tokens; | 1421 | ++peer->rate_tokens; |
1563 | #ifdef CONFIG_IP_ROUTE_VERBOSE | 1422 | #ifdef CONFIG_IP_ROUTE_VERBOSE |
1564 | if (log_martians && | 1423 | if (log_martians && |
1565 | rt->dst.rate_tokens == ip_rt_redirect_number && | 1424 | peer->rate_tokens == ip_rt_redirect_number && |
1566 | net_ratelimit()) | 1425 | net_ratelimit()) |
1567 | printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", | 1426 | printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", |
1568 | &rt->rt_src, rt->rt_iif, | 1427 | &ip_hdr(skb)->saddr, rt->rt_iif, |
1569 | &rt->rt_dst, &rt->rt_gateway); | 1428 | &rt->rt_dst, &rt->rt_gateway); |
1570 | #endif | 1429 | #endif |
1571 | } | 1430 | } |
@@ -1574,7 +1433,9 @@ void ip_rt_send_redirect(struct sk_buff *skb) | |||
1574 | static int ip_error(struct sk_buff *skb) | 1433 | static int ip_error(struct sk_buff *skb) |
1575 | { | 1434 | { |
1576 | struct rtable *rt = skb_rtable(skb); | 1435 | struct rtable *rt = skb_rtable(skb); |
1436 | struct inet_peer *peer; | ||
1577 | unsigned long now; | 1437 | unsigned long now; |
1438 | bool send; | ||
1578 | int code; | 1439 | int code; |
1579 | 1440 | ||
1580 | switch (rt->dst.error) { | 1441 | switch (rt->dst.error) { |
@@ -1594,15 +1455,24 @@ static int ip_error(struct sk_buff *skb) | |||
1594 | break; | 1455 | break; |
1595 | } | 1456 | } |
1596 | 1457 | ||
1597 | now = jiffies; | 1458 | if (!rt->peer) |
1598 | rt->dst.rate_tokens += now - rt->dst.rate_last; | 1459 | rt_bind_peer(rt, rt->rt_dst, 1); |
1599 | if (rt->dst.rate_tokens > ip_rt_error_burst) | 1460 | peer = rt->peer; |
1600 | rt->dst.rate_tokens = ip_rt_error_burst; | 1461 | |
1601 | rt->dst.rate_last = now; | 1462 | send = true; |
1602 | if (rt->dst.rate_tokens >= ip_rt_error_cost) { | 1463 | if (peer) { |
1603 | rt->dst.rate_tokens -= ip_rt_error_cost; | 1464 | now = jiffies; |
1465 | peer->rate_tokens += now - peer->rate_last; | ||
1466 | if (peer->rate_tokens > ip_rt_error_burst) | ||
1467 | peer->rate_tokens = ip_rt_error_burst; | ||
1468 | peer->rate_last = now; | ||
1469 | if (peer->rate_tokens >= ip_rt_error_cost) | ||
1470 | peer->rate_tokens -= ip_rt_error_cost; | ||
1471 | else | ||
1472 | send = false; | ||
1473 | } | ||
1474 | if (send) | ||
1604 | icmp_send(skb, ICMP_DEST_UNREACH, code, 0); | 1475 | icmp_send(skb, ICMP_DEST_UNREACH, code, 0); |
1605 | } | ||
1606 | 1476 | ||
1607 | out: kfree_skb(skb); | 1477 | out: kfree_skb(skb); |
1608 | return 0; | 1478 | return 0; |
@@ -1626,88 +1496,148 @@ static inline unsigned short guess_mtu(unsigned short old_mtu) | |||
1626 | return 68; | 1496 | return 68; |
1627 | } | 1497 | } |
1628 | 1498 | ||
1629 | unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, | 1499 | unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph, |
1630 | unsigned short new_mtu, | 1500 | unsigned short new_mtu, |
1631 | struct net_device *dev) | 1501 | struct net_device *dev) |
1632 | { | 1502 | { |
1633 | int i, k; | ||
1634 | unsigned short old_mtu = ntohs(iph->tot_len); | 1503 | unsigned short old_mtu = ntohs(iph->tot_len); |
1635 | struct rtable *rth; | ||
1636 | int ikeys[2] = { dev->ifindex, 0 }; | ||
1637 | __be32 skeys[2] = { iph->saddr, 0, }; | ||
1638 | __be32 daddr = iph->daddr; | ||
1639 | unsigned short est_mtu = 0; | 1504 | unsigned short est_mtu = 0; |
1505 | struct inet_peer *peer; | ||
1640 | 1506 | ||
1641 | for (k = 0; k < 2; k++) { | 1507 | peer = inet_getpeer_v4(iph->daddr, 1); |
1642 | for (i = 0; i < 2; i++) { | 1508 | if (peer) { |
1643 | unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], | 1509 | unsigned short mtu = new_mtu; |
1644 | rt_genid(net)); | 1510 | |
1645 | 1511 | if (new_mtu < 68 || new_mtu >= old_mtu) { | |
1646 | rcu_read_lock(); | 1512 | /* BSD 4.2 derived systems incorrectly adjust |
1647 | for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; | 1513 | * tot_len by the IP header length, and report |
1648 | rth = rcu_dereference(rth->dst.rt_next)) { | 1514 | * a zero MTU in the ICMP message. |
1649 | unsigned short mtu = new_mtu; | 1515 | */ |
1650 | 1516 | if (mtu == 0 && | |
1651 | if (rth->fl.fl4_dst != daddr || | 1517 | old_mtu >= 68 + (iph->ihl << 2)) |
1652 | rth->fl.fl4_src != skeys[i] || | 1518 | old_mtu -= iph->ihl << 2; |
1653 | rth->rt_dst != daddr || | 1519 | mtu = guess_mtu(old_mtu); |
1654 | rth->rt_src != iph->saddr || | 1520 | } |
1655 | rth->fl.oif != ikeys[k] || | ||
1656 | rth->fl.iif != 0 || | ||
1657 | dst_metric_locked(&rth->dst, RTAX_MTU) || | ||
1658 | !net_eq(dev_net(rth->dst.dev), net) || | ||
1659 | rt_is_expired(rth)) | ||
1660 | continue; | ||
1661 | 1521 | ||
1662 | if (new_mtu < 68 || new_mtu >= old_mtu) { | 1522 | if (mtu < ip_rt_min_pmtu) |
1523 | mtu = ip_rt_min_pmtu; | ||
1524 | if (!peer->pmtu_expires || mtu < peer->pmtu_learned) { | ||
1525 | unsigned long pmtu_expires; | ||
1663 | 1526 | ||
1664 | /* BSD 4.2 compatibility hack :-( */ | 1527 | pmtu_expires = jiffies + ip_rt_mtu_expires; |
1665 | if (mtu == 0 && | 1528 | if (!pmtu_expires) |
1666 | old_mtu >= dst_mtu(&rth->dst) && | 1529 | pmtu_expires = 1UL; |
1667 | old_mtu >= 68 + (iph->ihl << 2)) | ||
1668 | old_mtu -= iph->ihl << 2; | ||
1669 | 1530 | ||
1670 | mtu = guess_mtu(old_mtu); | 1531 | est_mtu = mtu; |
1671 | } | 1532 | peer->pmtu_learned = mtu; |
1672 | if (mtu <= dst_mtu(&rth->dst)) { | 1533 | peer->pmtu_expires = pmtu_expires; |
1673 | if (mtu < dst_mtu(&rth->dst)) { | ||
1674 | dst_confirm(&rth->dst); | ||
1675 | if (mtu < ip_rt_min_pmtu) { | ||
1676 | mtu = ip_rt_min_pmtu; | ||
1677 | rth->dst.metrics[RTAX_LOCK-1] |= | ||
1678 | (1 << RTAX_MTU); | ||
1679 | } | ||
1680 | rth->dst.metrics[RTAX_MTU-1] = mtu; | ||
1681 | dst_set_expires(&rth->dst, | ||
1682 | ip_rt_mtu_expires); | ||
1683 | } | ||
1684 | est_mtu = mtu; | ||
1685 | } | ||
1686 | } | ||
1687 | rcu_read_unlock(); | ||
1688 | } | 1534 | } |
1535 | |||
1536 | inet_putpeer(peer); | ||
1537 | |||
1538 | atomic_inc(&__rt_peer_genid); | ||
1689 | } | 1539 | } |
1690 | return est_mtu ? : new_mtu; | 1540 | return est_mtu ? : new_mtu; |
1691 | } | 1541 | } |
1692 | 1542 | ||
1543 | static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer) | ||
1544 | { | ||
1545 | unsigned long expires = ACCESS_ONCE(peer->pmtu_expires); | ||
1546 | |||
1547 | if (!expires) | ||
1548 | return; | ||
1549 | if (time_before(jiffies, expires)) { | ||
1550 | u32 orig_dst_mtu = dst_mtu(dst); | ||
1551 | if (peer->pmtu_learned < orig_dst_mtu) { | ||
1552 | if (!peer->pmtu_orig) | ||
1553 | peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU); | ||
1554 | dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned); | ||
1555 | } | ||
1556 | } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires) | ||
1557 | dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig); | ||
1558 | } | ||
1559 | |||
1693 | static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) | 1560 | static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) |
1694 | { | 1561 | { |
1695 | if (dst_mtu(dst) > mtu && mtu >= 68 && | 1562 | struct rtable *rt = (struct rtable *) dst; |
1696 | !(dst_metric_locked(dst, RTAX_MTU))) { | 1563 | struct inet_peer *peer; |
1697 | if (mtu < ip_rt_min_pmtu) { | 1564 | |
1565 | dst_confirm(dst); | ||
1566 | |||
1567 | if (!rt->peer) | ||
1568 | rt_bind_peer(rt, rt->rt_dst, 1); | ||
1569 | peer = rt->peer; | ||
1570 | if (peer) { | ||
1571 | unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires); | ||
1572 | |||
1573 | if (mtu < ip_rt_min_pmtu) | ||
1698 | mtu = ip_rt_min_pmtu; | 1574 | mtu = ip_rt_min_pmtu; |
1699 | dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU); | 1575 | if (!pmtu_expires || mtu < peer->pmtu_learned) { |
1576 | |||
1577 | pmtu_expires = jiffies + ip_rt_mtu_expires; | ||
1578 | if (!pmtu_expires) | ||
1579 | pmtu_expires = 1UL; | ||
1580 | |||
1581 | peer->pmtu_learned = mtu; | ||
1582 | peer->pmtu_expires = pmtu_expires; | ||
1583 | |||
1584 | atomic_inc(&__rt_peer_genid); | ||
1585 | rt->rt_peer_genid = rt_peer_genid(); | ||
1700 | } | 1586 | } |
1701 | dst->metrics[RTAX_MTU-1] = mtu; | 1587 | check_peer_pmtu(dst, peer); |
1702 | dst_set_expires(dst, ip_rt_mtu_expires); | 1588 | } |
1703 | call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst); | 1589 | } |
1590 | |||
1591 | static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) | ||
1592 | { | ||
1593 | struct rtable *rt = (struct rtable *) dst; | ||
1594 | __be32 orig_gw = rt->rt_gateway; | ||
1595 | |||
1596 | dst_confirm(&rt->dst); | ||
1597 | |||
1598 | neigh_release(rt->dst.neighbour); | ||
1599 | rt->dst.neighbour = NULL; | ||
1600 | |||
1601 | rt->rt_gateway = peer->redirect_learned.a4; | ||
1602 | if (arp_bind_neighbour(&rt->dst) || | ||
1603 | !(rt->dst.neighbour->nud_state & NUD_VALID)) { | ||
1604 | if (rt->dst.neighbour) | ||
1605 | neigh_event_send(rt->dst.neighbour, NULL); | ||
1606 | rt->rt_gateway = orig_gw; | ||
1607 | return -EAGAIN; | ||
1608 | } else { | ||
1609 | rt->rt_flags |= RTCF_REDIRECTED; | ||
1610 | call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, | ||
1611 | rt->dst.neighbour); | ||
1704 | } | 1612 | } |
1613 | return 0; | ||
1705 | } | 1614 | } |
1706 | 1615 | ||
1707 | static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) | 1616 | static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) |
1708 | { | 1617 | { |
1709 | if (rt_is_expired((struct rtable *)dst)) | 1618 | struct rtable *rt = (struct rtable *) dst; |
1619 | |||
1620 | if (rt_is_expired(rt)) | ||
1710 | return NULL; | 1621 | return NULL; |
1622 | if (rt->rt_peer_genid != rt_peer_genid()) { | ||
1623 | struct inet_peer *peer; | ||
1624 | |||
1625 | if (!rt->peer) | ||
1626 | rt_bind_peer(rt, rt->rt_dst, 0); | ||
1627 | |||
1628 | peer = rt->peer; | ||
1629 | if (peer) { | ||
1630 | check_peer_pmtu(dst, peer); | ||
1631 | |||
1632 | if (peer->redirect_learned.a4 && | ||
1633 | peer->redirect_learned.a4 != rt->rt_gateway) { | ||
1634 | if (check_peer_redir(dst, peer)) | ||
1635 | return NULL; | ||
1636 | } | ||
1637 | } | ||
1638 | |||
1639 | rt->rt_peer_genid = rt_peer_genid(); | ||
1640 | } | ||
1711 | return dst; | 1641 | return dst; |
1712 | } | 1642 | } |
1713 | 1643 | ||
@@ -1715,33 +1645,17 @@ static void ipv4_dst_destroy(struct dst_entry *dst) | |||
1715 | { | 1645 | { |
1716 | struct rtable *rt = (struct rtable *) dst; | 1646 | struct rtable *rt = (struct rtable *) dst; |
1717 | struct inet_peer *peer = rt->peer; | 1647 | struct inet_peer *peer = rt->peer; |
1718 | struct in_device *idev = rt->idev; | ||
1719 | 1648 | ||
1649 | if (rt->fi) { | ||
1650 | fib_info_put(rt->fi); | ||
1651 | rt->fi = NULL; | ||
1652 | } | ||
1720 | if (peer) { | 1653 | if (peer) { |
1721 | rt->peer = NULL; | 1654 | rt->peer = NULL; |
1722 | inet_putpeer(peer); | 1655 | inet_putpeer(peer); |
1723 | } | 1656 | } |
1724 | |||
1725 | if (idev) { | ||
1726 | rt->idev = NULL; | ||
1727 | in_dev_put(idev); | ||
1728 | } | ||
1729 | } | 1657 | } |
1730 | 1658 | ||
1731 | static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, | ||
1732 | int how) | ||
1733 | { | ||
1734 | struct rtable *rt = (struct rtable *) dst; | ||
1735 | struct in_device *idev = rt->idev; | ||
1736 | if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) { | ||
1737 | struct in_device *loopback_idev = | ||
1738 | in_dev_get(dev_net(dev)->loopback_dev); | ||
1739 | if (loopback_idev) { | ||
1740 | rt->idev = loopback_idev; | ||
1741 | in_dev_put(idev); | ||
1742 | } | ||
1743 | } | ||
1744 | } | ||
1745 | 1659 | ||
1746 | static void ipv4_link_failure(struct sk_buff *skb) | 1660 | static void ipv4_link_failure(struct sk_buff *skb) |
1747 | { | 1661 | { |
@@ -1750,8 +1664,8 @@ static void ipv4_link_failure(struct sk_buff *skb) | |||
1750 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); | 1664 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); |
1751 | 1665 | ||
1752 | rt = skb_rtable(skb); | 1666 | rt = skb_rtable(skb); |
1753 | if (rt) | 1667 | if (rt && rt->peer && peer_pmtu_cleaned(rt->peer)) |
1754 | dst_set_expires(&rt->dst, 0); | 1668 | dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); |
1755 | } | 1669 | } |
1756 | 1670 | ||
1757 | static int ip_rt_bug(struct sk_buff *skb) | 1671 | static int ip_rt_bug(struct sk_buff *skb) |
@@ -1760,6 +1674,7 @@ static int ip_rt_bug(struct sk_buff *skb) | |||
1760 | &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, | 1674 | &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, |
1761 | skb->dev ? skb->dev->name : "?"); | 1675 | skb->dev ? skb->dev->name : "?"); |
1762 | kfree_skb(skb); | 1676 | kfree_skb(skb); |
1677 | WARN_ON(1); | ||
1763 | return 0; | 1678 | return 0; |
1764 | } | 1679 | } |
1765 | 1680 | ||
@@ -1772,23 +1687,39 @@ static int ip_rt_bug(struct sk_buff *skb) | |||
1772 | in IP options! | 1687 | in IP options! |
1773 | */ | 1688 | */ |
1774 | 1689 | ||
1775 | void ip_rt_get_source(u8 *addr, struct rtable *rt) | 1690 | void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) |
1776 | { | 1691 | { |
1777 | __be32 src; | 1692 | __be32 src; |
1778 | struct fib_result res; | ||
1779 | 1693 | ||
1780 | if (rt->fl.iif == 0) | 1694 | if (rt_is_output_route(rt)) |
1781 | src = rt->rt_src; | 1695 | src = ip_hdr(skb)->saddr; |
1782 | else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) { | 1696 | else { |
1783 | src = FIB_RES_PREFSRC(res); | 1697 | struct fib_result res; |
1784 | fib_res_put(&res); | 1698 | struct flowi4 fl4; |
1785 | } else | 1699 | struct iphdr *iph; |
1786 | src = inet_select_addr(rt->dst.dev, rt->rt_gateway, | 1700 | |
1701 | iph = ip_hdr(skb); | ||
1702 | |||
1703 | memset(&fl4, 0, sizeof(fl4)); | ||
1704 | fl4.daddr = iph->daddr; | ||
1705 | fl4.saddr = iph->saddr; | ||
1706 | fl4.flowi4_tos = iph->tos; | ||
1707 | fl4.flowi4_oif = rt->dst.dev->ifindex; | ||
1708 | fl4.flowi4_iif = skb->dev->ifindex; | ||
1709 | fl4.flowi4_mark = skb->mark; | ||
1710 | |||
1711 | rcu_read_lock(); | ||
1712 | if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) | ||
1713 | src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); | ||
1714 | else | ||
1715 | src = inet_select_addr(rt->dst.dev, rt->rt_gateway, | ||
1787 | RT_SCOPE_UNIVERSE); | 1716 | RT_SCOPE_UNIVERSE); |
1717 | rcu_read_unlock(); | ||
1718 | } | ||
1788 | memcpy(addr, &src, 4); | 1719 | memcpy(addr, &src, 4); |
1789 | } | 1720 | } |
1790 | 1721 | ||
1791 | #ifdef CONFIG_NET_CLS_ROUTE | 1722 | #ifdef CONFIG_IP_ROUTE_CLASSID |
1792 | static void set_class_tag(struct rtable *rt, u32 tag) | 1723 | static void set_class_tag(struct rtable *rt, u32 tag) |
1793 | { | 1724 | { |
1794 | if (!(rt->dst.tclassid & 0xFFFF)) | 1725 | if (!(rt->dst.tclassid & 0xFFFF)) |
@@ -1798,46 +1729,107 @@ static void set_class_tag(struct rtable *rt, u32 tag) | |||
1798 | } | 1729 | } |
1799 | #endif | 1730 | #endif |
1800 | 1731 | ||
1801 | static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) | 1732 | static unsigned int ipv4_default_advmss(const struct dst_entry *dst) |
1802 | { | 1733 | { |
1803 | struct fib_info *fi = res->fi; | 1734 | unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS); |
1735 | |||
1736 | if (advmss == 0) { | ||
1737 | advmss = max_t(unsigned int, dst->dev->mtu - 40, | ||
1738 | ip_rt_min_advmss); | ||
1739 | if (advmss > 65535 - 40) | ||
1740 | advmss = 65535 - 40; | ||
1741 | } | ||
1742 | return advmss; | ||
1743 | } | ||
1744 | |||
1745 | static unsigned int ipv4_default_mtu(const struct dst_entry *dst) | ||
1746 | { | ||
1747 | unsigned int mtu = dst->dev->mtu; | ||
1748 | |||
1749 | if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { | ||
1750 | const struct rtable *rt = (const struct rtable *) dst; | ||
1751 | |||
1752 | if (rt->rt_gateway != rt->rt_dst && mtu > 576) | ||
1753 | mtu = 576; | ||
1754 | } | ||
1755 | |||
1756 | if (mtu > IP_MAX_MTU) | ||
1757 | mtu = IP_MAX_MTU; | ||
1758 | |||
1759 | return mtu; | ||
1760 | } | ||
1761 | |||
1762 | static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, | ||
1763 | struct fib_info *fi) | ||
1764 | { | ||
1765 | struct inet_peer *peer; | ||
1766 | int create = 0; | ||
1767 | |||
1768 | /* If a peer entry exists for this destination, we must hook | ||
1769 | * it up in order to get at cached metrics. | ||
1770 | */ | ||
1771 | if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS)) | ||
1772 | create = 1; | ||
1773 | |||
1774 | rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create); | ||
1775 | if (peer) { | ||
1776 | rt->rt_peer_genid = rt_peer_genid(); | ||
1777 | if (inet_metrics_new(peer)) | ||
1778 | memcpy(peer->metrics, fi->fib_metrics, | ||
1779 | sizeof(u32) * RTAX_MAX); | ||
1780 | dst_init_metrics(&rt->dst, peer->metrics, false); | ||
1781 | |||
1782 | check_peer_pmtu(&rt->dst, peer); | ||
1783 | if (peer->redirect_learned.a4 && | ||
1784 | peer->redirect_learned.a4 != rt->rt_gateway) { | ||
1785 | rt->rt_gateway = peer->redirect_learned.a4; | ||
1786 | rt->rt_flags |= RTCF_REDIRECTED; | ||
1787 | } | ||
1788 | } else { | ||
1789 | if (fi->fib_metrics != (u32 *) dst_default_metrics) { | ||
1790 | rt->fi = fi; | ||
1791 | atomic_inc(&fi->fib_clntref); | ||
1792 | } | ||
1793 | dst_init_metrics(&rt->dst, fi->fib_metrics, true); | ||
1794 | } | ||
1795 | } | ||
1796 | |||
1797 | static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, | ||
1798 | const struct fib_result *res, | ||
1799 | struct fib_info *fi, u16 type, u32 itag) | ||
1800 | { | ||
1801 | struct dst_entry *dst = &rt->dst; | ||
1804 | 1802 | ||
1805 | if (fi) { | 1803 | if (fi) { |
1806 | if (FIB_RES_GW(*res) && | 1804 | if (FIB_RES_GW(*res) && |
1807 | FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) | 1805 | FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) |
1808 | rt->rt_gateway = FIB_RES_GW(*res); | 1806 | rt->rt_gateway = FIB_RES_GW(*res); |
1809 | memcpy(rt->dst.metrics, fi->fib_metrics, | 1807 | rt_init_metrics(rt, fl4, fi); |
1810 | sizeof(rt->dst.metrics)); | 1808 | #ifdef CONFIG_IP_ROUTE_CLASSID |
1811 | if (fi->fib_mtu == 0) { | 1809 | dst->tclassid = FIB_RES_NH(*res).nh_tclassid; |
1812 | rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu; | ||
1813 | if (dst_metric_locked(&rt->dst, RTAX_MTU) && | ||
1814 | rt->rt_gateway != rt->rt_dst && | ||
1815 | rt->dst.dev->mtu > 576) | ||
1816 | rt->dst.metrics[RTAX_MTU-1] = 576; | ||
1817 | } | ||
1818 | #ifdef CONFIG_NET_CLS_ROUTE | ||
1819 | rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid; | ||
1820 | #endif | 1810 | #endif |
1821 | } else | 1811 | } |
1822 | rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu; | 1812 | |
1823 | 1813 | if (dst_mtu(dst) > IP_MAX_MTU) | |
1824 | if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0) | 1814 | dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU); |
1825 | rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; | 1815 | if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40) |
1826 | if (dst_mtu(&rt->dst) > IP_MAX_MTU) | 1816 | dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); |
1827 | rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; | 1817 | |
1828 | if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0) | 1818 | #ifdef CONFIG_IP_ROUTE_CLASSID |
1829 | rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40, | ||
1830 | ip_rt_min_advmss); | ||
1831 | if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40) | ||
1832 | rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40; | ||
1833 | |||
1834 | #ifdef CONFIG_NET_CLS_ROUTE | ||
1835 | #ifdef CONFIG_IP_MULTIPLE_TABLES | 1819 | #ifdef CONFIG_IP_MULTIPLE_TABLES |
1836 | set_class_tag(rt, fib_rules_tclass(res)); | 1820 | set_class_tag(rt, fib_rules_tclass(res)); |
1837 | #endif | 1821 | #endif |
1838 | set_class_tag(rt, itag); | 1822 | set_class_tag(rt, itag); |
1839 | #endif | 1823 | #endif |
1840 | rt->rt_type = res->type; | 1824 | } |
1825 | |||
1826 | static struct rtable *rt_dst_alloc(struct net_device *dev, | ||
1827 | bool nopolicy, bool noxfrm) | ||
1828 | { | ||
1829 | return dst_alloc(&ipv4_dst_ops, dev, 1, -1, | ||
1830 | DST_HOST | | ||
1831 | (nopolicy ? DST_NOPOLICY : 0) | | ||
1832 | (noxfrm ? DST_NOXFRM : 0)); | ||
1841 | } | 1833 | } |
1842 | 1834 | ||
1843 | /* called in rcu_read_lock() section */ | 1835 | /* called in rcu_read_lock() section */ |
@@ -1865,42 +1857,38 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
1865 | goto e_inval; | 1857 | goto e_inval; |
1866 | spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); | 1858 | spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); |
1867 | } else { | 1859 | } else { |
1868 | err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, | 1860 | err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, |
1869 | &itag, 0); | 1861 | &itag); |
1870 | if (err < 0) | 1862 | if (err < 0) |
1871 | goto e_err; | 1863 | goto e_err; |
1872 | } | 1864 | } |
1873 | rth = dst_alloc(&ipv4_dst_ops); | 1865 | rth = rt_dst_alloc(init_net.loopback_dev, |
1866 | IN_DEV_CONF_GET(in_dev, NOPOLICY), false); | ||
1874 | if (!rth) | 1867 | if (!rth) |
1875 | goto e_nobufs; | 1868 | goto e_nobufs; |
1876 | 1869 | ||
1870 | #ifdef CONFIG_IP_ROUTE_CLASSID | ||
1871 | rth->dst.tclassid = itag; | ||
1872 | #endif | ||
1877 | rth->dst.output = ip_rt_bug; | 1873 | rth->dst.output = ip_rt_bug; |
1878 | rth->dst.obsolete = -1; | ||
1879 | 1874 | ||
1880 | atomic_set(&rth->dst.__refcnt, 1); | 1875 | rth->rt_key_dst = daddr; |
1881 | rth->dst.flags= DST_HOST; | 1876 | rth->rt_key_src = saddr; |
1882 | if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) | 1877 | rth->rt_genid = rt_genid(dev_net(dev)); |
1883 | rth->dst.flags |= DST_NOPOLICY; | 1878 | rth->rt_flags = RTCF_MULTICAST; |
1884 | rth->fl.fl4_dst = daddr; | 1879 | rth->rt_type = RTN_MULTICAST; |
1880 | rth->rt_key_tos = tos; | ||
1885 | rth->rt_dst = daddr; | 1881 | rth->rt_dst = daddr; |
1886 | rth->fl.fl4_tos = tos; | ||
1887 | rth->fl.mark = skb->mark; | ||
1888 | rth->fl.fl4_src = saddr; | ||
1889 | rth->rt_src = saddr; | 1882 | rth->rt_src = saddr; |
1890 | #ifdef CONFIG_NET_CLS_ROUTE | 1883 | rth->rt_route_iif = dev->ifindex; |
1891 | rth->dst.tclassid = itag; | 1884 | rth->rt_iif = dev->ifindex; |
1892 | #endif | 1885 | rth->rt_oif = 0; |
1893 | rth->rt_iif = | 1886 | rth->rt_mark = skb->mark; |
1894 | rth->fl.iif = dev->ifindex; | ||
1895 | rth->dst.dev = init_net.loopback_dev; | ||
1896 | dev_hold(rth->dst.dev); | ||
1897 | rth->idev = in_dev_get(rth->dst.dev); | ||
1898 | rth->fl.oif = 0; | ||
1899 | rth->rt_gateway = daddr; | 1887 | rth->rt_gateway = daddr; |
1900 | rth->rt_spec_dst= spec_dst; | 1888 | rth->rt_spec_dst= spec_dst; |
1901 | rth->rt_genid = rt_genid(dev_net(dev)); | 1889 | rth->rt_peer_genid = 0; |
1902 | rth->rt_flags = RTCF_MULTICAST; | 1890 | rth->peer = NULL; |
1903 | rth->rt_type = RTN_MULTICAST; | 1891 | rth->fi = NULL; |
1904 | if (our) { | 1892 | if (our) { |
1905 | rth->dst.input= ip_local_deliver; | 1893 | rth->dst.input= ip_local_deliver; |
1906 | rth->rt_flags |= RTCF_LOCAL; | 1894 | rth->rt_flags |= RTCF_LOCAL; |
@@ -1913,7 +1901,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
1913 | RT_CACHE_STAT_INC(in_slow_mc); | 1901 | RT_CACHE_STAT_INC(in_slow_mc); |
1914 | 1902 | ||
1915 | hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); | 1903 | hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); |
1916 | return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex); | 1904 | rth = rt_intern_hash(hash, rth, skb, dev->ifindex); |
1905 | return IS_ERR(rth) ? PTR_ERR(rth) : 0; | ||
1917 | 1906 | ||
1918 | e_nobufs: | 1907 | e_nobufs: |
1919 | return -ENOBUFS; | 1908 | return -ENOBUFS; |
@@ -1956,7 +1945,7 @@ static void ip_handle_martian_source(struct net_device *dev, | |||
1956 | 1945 | ||
1957 | /* called in rcu_read_lock() section */ | 1946 | /* called in rcu_read_lock() section */ |
1958 | static int __mkroute_input(struct sk_buff *skb, | 1947 | static int __mkroute_input(struct sk_buff *skb, |
1959 | struct fib_result *res, | 1948 | const struct fib_result *res, |
1960 | struct in_device *in_dev, | 1949 | struct in_device *in_dev, |
1961 | __be32 daddr, __be32 saddr, u32 tos, | 1950 | __be32 daddr, __be32 saddr, u32 tos, |
1962 | struct rtable **result) | 1951 | struct rtable **result) |
@@ -1978,8 +1967,8 @@ static int __mkroute_input(struct sk_buff *skb, | |||
1978 | } | 1967 | } |
1979 | 1968 | ||
1980 | 1969 | ||
1981 | err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), | 1970 | err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), |
1982 | in_dev->dev, &spec_dst, &itag, skb->mark); | 1971 | in_dev->dev, &spec_dst, &itag); |
1983 | if (err < 0) { | 1972 | if (err < 0) { |
1984 | ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, | 1973 | ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, |
1985 | saddr); | 1974 | saddr); |
@@ -2010,42 +1999,36 @@ static int __mkroute_input(struct sk_buff *skb, | |||
2010 | } | 1999 | } |
2011 | } | 2000 | } |
2012 | 2001 | ||
2013 | 2002 | rth = rt_dst_alloc(out_dev->dev, | |
2014 | rth = dst_alloc(&ipv4_dst_ops); | 2003 | IN_DEV_CONF_GET(in_dev, NOPOLICY), |
2004 | IN_DEV_CONF_GET(out_dev, NOXFRM)); | ||
2015 | if (!rth) { | 2005 | if (!rth) { |
2016 | err = -ENOBUFS; | 2006 | err = -ENOBUFS; |
2017 | goto cleanup; | 2007 | goto cleanup; |
2018 | } | 2008 | } |
2019 | 2009 | ||
2020 | atomic_set(&rth->dst.__refcnt, 1); | 2010 | rth->rt_key_dst = daddr; |
2021 | rth->dst.flags= DST_HOST; | 2011 | rth->rt_key_src = saddr; |
2022 | if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) | 2012 | rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); |
2023 | rth->dst.flags |= DST_NOPOLICY; | 2013 | rth->rt_flags = flags; |
2024 | if (IN_DEV_CONF_GET(out_dev, NOXFRM)) | 2014 | rth->rt_type = res->type; |
2025 | rth->dst.flags |= DST_NOXFRM; | 2015 | rth->rt_key_tos = tos; |
2026 | rth->fl.fl4_dst = daddr; | ||
2027 | rth->rt_dst = daddr; | 2016 | rth->rt_dst = daddr; |
2028 | rth->fl.fl4_tos = tos; | ||
2029 | rth->fl.mark = skb->mark; | ||
2030 | rth->fl.fl4_src = saddr; | ||
2031 | rth->rt_src = saddr; | 2017 | rth->rt_src = saddr; |
2018 | rth->rt_route_iif = in_dev->dev->ifindex; | ||
2019 | rth->rt_iif = in_dev->dev->ifindex; | ||
2020 | rth->rt_oif = 0; | ||
2021 | rth->rt_mark = skb->mark; | ||
2032 | rth->rt_gateway = daddr; | 2022 | rth->rt_gateway = daddr; |
2033 | rth->rt_iif = | ||
2034 | rth->fl.iif = in_dev->dev->ifindex; | ||
2035 | rth->dst.dev = (out_dev)->dev; | ||
2036 | dev_hold(rth->dst.dev); | ||
2037 | rth->idev = in_dev_get(rth->dst.dev); | ||
2038 | rth->fl.oif = 0; | ||
2039 | rth->rt_spec_dst= spec_dst; | 2023 | rth->rt_spec_dst= spec_dst; |
2024 | rth->rt_peer_genid = 0; | ||
2025 | rth->peer = NULL; | ||
2026 | rth->fi = NULL; | ||
2040 | 2027 | ||
2041 | rth->dst.obsolete = -1; | ||
2042 | rth->dst.input = ip_forward; | 2028 | rth->dst.input = ip_forward; |
2043 | rth->dst.output = ip_output; | 2029 | rth->dst.output = ip_output; |
2044 | rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); | ||
2045 | |||
2046 | rt_set_nexthop(rth, res, itag); | ||
2047 | 2030 | ||
2048 | rth->rt_flags = flags; | 2031 | rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag); |
2049 | 2032 | ||
2050 | *result = rth; | 2033 | *result = rth; |
2051 | err = 0; | 2034 | err = 0; |
@@ -2055,7 +2038,7 @@ static int __mkroute_input(struct sk_buff *skb, | |||
2055 | 2038 | ||
2056 | static int ip_mkroute_input(struct sk_buff *skb, | 2039 | static int ip_mkroute_input(struct sk_buff *skb, |
2057 | struct fib_result *res, | 2040 | struct fib_result *res, |
2058 | const struct flowi *fl, | 2041 | const struct flowi4 *fl4, |
2059 | struct in_device *in_dev, | 2042 | struct in_device *in_dev, |
2060 | __be32 daddr, __be32 saddr, u32 tos) | 2043 | __be32 daddr, __be32 saddr, u32 tos) |
2061 | { | 2044 | { |
@@ -2064,8 +2047,8 @@ static int ip_mkroute_input(struct sk_buff *skb, | |||
2064 | unsigned hash; | 2047 | unsigned hash; |
2065 | 2048 | ||
2066 | #ifdef CONFIG_IP_ROUTE_MULTIPATH | 2049 | #ifdef CONFIG_IP_ROUTE_MULTIPATH |
2067 | if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0) | 2050 | if (res->fi && res->fi->fib_nhs > 1) |
2068 | fib_select_multipath(fl, res); | 2051 | fib_select_multipath(res); |
2069 | #endif | 2052 | #endif |
2070 | 2053 | ||
2071 | /* create a routing cache entry */ | 2054 | /* create a routing cache entry */ |
@@ -2074,9 +2057,12 @@ static int ip_mkroute_input(struct sk_buff *skb, | |||
2074 | return err; | 2057 | return err; |
2075 | 2058 | ||
2076 | /* put it into the cache */ | 2059 | /* put it into the cache */ |
2077 | hash = rt_hash(daddr, saddr, fl->iif, | 2060 | hash = rt_hash(daddr, saddr, fl4->flowi4_iif, |
2078 | rt_genid(dev_net(rth->dst.dev))); | 2061 | rt_genid(dev_net(rth->dst.dev))); |
2079 | return rt_intern_hash(hash, rth, NULL, skb, fl->iif); | 2062 | rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif); |
2063 | if (IS_ERR(rth)) | ||
2064 | return PTR_ERR(rth); | ||
2065 | return 0; | ||
2080 | } | 2066 | } |
2081 | 2067 | ||
2082 | /* | 2068 | /* |
@@ -2087,6 +2073,7 @@ static int ip_mkroute_input(struct sk_buff *skb, | |||
2087 | * Such approach solves two big problems: | 2073 | * Such approach solves two big problems: |
2088 | * 1. Not simplex devices are handled properly. | 2074 | * 1. Not simplex devices are handled properly. |
2089 | * 2. IP spoofing attempts are filtered with 100% of guarantee. | 2075 | * 2. IP spoofing attempts are filtered with 100% of guarantee. |
2076 | * called with rcu_read_lock() | ||
2090 | */ | 2077 | */ |
2091 | 2078 | ||
2092 | static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | 2079 | static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, |
@@ -2094,21 +2081,13 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2094 | { | 2081 | { |
2095 | struct fib_result res; | 2082 | struct fib_result res; |
2096 | struct in_device *in_dev = __in_dev_get_rcu(dev); | 2083 | struct in_device *in_dev = __in_dev_get_rcu(dev); |
2097 | struct flowi fl = { .nl_u = { .ip4_u = | 2084 | struct flowi4 fl4; |
2098 | { .daddr = daddr, | ||
2099 | .saddr = saddr, | ||
2100 | .tos = tos, | ||
2101 | .scope = RT_SCOPE_UNIVERSE, | ||
2102 | } }, | ||
2103 | .mark = skb->mark, | ||
2104 | .iif = dev->ifindex }; | ||
2105 | unsigned flags = 0; | 2085 | unsigned flags = 0; |
2106 | u32 itag = 0; | 2086 | u32 itag = 0; |
2107 | struct rtable * rth; | 2087 | struct rtable * rth; |
2108 | unsigned hash; | 2088 | unsigned hash; |
2109 | __be32 spec_dst; | 2089 | __be32 spec_dst; |
2110 | int err = -EINVAL; | 2090 | int err = -EINVAL; |
2111 | int free_res = 0; | ||
2112 | struct net * net = dev_net(dev); | 2091 | struct net * net = dev_net(dev); |
2113 | 2092 | ||
2114 | /* IP on this device is disabled. */ | 2093 | /* IP on this device is disabled. */ |
@@ -2124,7 +2103,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2124 | ipv4_is_loopback(saddr)) | 2103 | ipv4_is_loopback(saddr)) |
2125 | goto martian_source; | 2104 | goto martian_source; |
2126 | 2105 | ||
2127 | if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0)) | 2106 | if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) |
2128 | goto brd_input; | 2107 | goto brd_input; |
2129 | 2108 | ||
2130 | /* Accept zero addresses only to limited broadcast; | 2109 | /* Accept zero addresses only to limited broadcast; |
@@ -2133,19 +2112,25 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2133 | if (ipv4_is_zeronet(saddr)) | 2112 | if (ipv4_is_zeronet(saddr)) |
2134 | goto martian_source; | 2113 | goto martian_source; |
2135 | 2114 | ||
2136 | if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) || | 2115 | if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr)) |
2137 | ipv4_is_loopback(daddr)) | ||
2138 | goto martian_destination; | 2116 | goto martian_destination; |
2139 | 2117 | ||
2140 | /* | 2118 | /* |
2141 | * Now we are ready to route packet. | 2119 | * Now we are ready to route packet. |
2142 | */ | 2120 | */ |
2143 | if ((err = fib_lookup(net, &fl, &res)) != 0) { | 2121 | fl4.flowi4_oif = 0; |
2122 | fl4.flowi4_iif = dev->ifindex; | ||
2123 | fl4.flowi4_mark = skb->mark; | ||
2124 | fl4.flowi4_tos = tos; | ||
2125 | fl4.flowi4_scope = RT_SCOPE_UNIVERSE; | ||
2126 | fl4.daddr = daddr; | ||
2127 | fl4.saddr = saddr; | ||
2128 | err = fib_lookup(net, &fl4, &res); | ||
2129 | if (err != 0) { | ||
2144 | if (!IN_DEV_FORWARD(in_dev)) | 2130 | if (!IN_DEV_FORWARD(in_dev)) |
2145 | goto e_hostunreach; | 2131 | goto e_hostunreach; |
2146 | goto no_route; | 2132 | goto no_route; |
2147 | } | 2133 | } |
2148 | free_res = 1; | ||
2149 | 2134 | ||
2150 | RT_CACHE_STAT_INC(in_slow_tot); | 2135 | RT_CACHE_STAT_INC(in_slow_tot); |
2151 | 2136 | ||
@@ -2153,9 +2138,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2153 | goto brd_input; | 2138 | goto brd_input; |
2154 | 2139 | ||
2155 | if (res.type == RTN_LOCAL) { | 2140 | if (res.type == RTN_LOCAL) { |
2156 | err = fib_validate_source(saddr, daddr, tos, | 2141 | err = fib_validate_source(skb, saddr, daddr, tos, |
2157 | net->loopback_dev->ifindex, | 2142 | net->loopback_dev->ifindex, |
2158 | dev, &spec_dst, &itag, skb->mark); | 2143 | dev, &spec_dst, &itag); |
2159 | if (err < 0) | 2144 | if (err < 0) |
2160 | goto martian_source_keep_err; | 2145 | goto martian_source_keep_err; |
2161 | if (err) | 2146 | if (err) |
@@ -2169,10 +2154,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2169 | if (res.type != RTN_UNICAST) | 2154 | if (res.type != RTN_UNICAST) |
2170 | goto martian_destination; | 2155 | goto martian_destination; |
2171 | 2156 | ||
2172 | err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); | 2157 | err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos); |
2173 | done: | ||
2174 | if (free_res) | ||
2175 | fib_res_put(&res); | ||
2176 | out: return err; | 2158 | out: return err; |
2177 | 2159 | ||
2178 | brd_input: | 2160 | brd_input: |
@@ -2182,8 +2164,8 @@ brd_input: | |||
2182 | if (ipv4_is_zeronet(saddr)) | 2164 | if (ipv4_is_zeronet(saddr)) |
2183 | spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); | 2165 | spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); |
2184 | else { | 2166 | else { |
2185 | err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, | 2167 | err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, |
2186 | &itag, skb->mark); | 2168 | &itag); |
2187 | if (err < 0) | 2169 | if (err < 0) |
2188 | goto martian_source_keep_err; | 2170 | goto martian_source_keep_err; |
2189 | if (err) | 2171 | if (err) |
@@ -2194,45 +2176,48 @@ brd_input: | |||
2194 | RT_CACHE_STAT_INC(in_brd); | 2176 | RT_CACHE_STAT_INC(in_brd); |
2195 | 2177 | ||
2196 | local_input: | 2178 | local_input: |
2197 | rth = dst_alloc(&ipv4_dst_ops); | 2179 | rth = rt_dst_alloc(net->loopback_dev, |
2180 | IN_DEV_CONF_GET(in_dev, NOPOLICY), false); | ||
2198 | if (!rth) | 2181 | if (!rth) |
2199 | goto e_nobufs; | 2182 | goto e_nobufs; |
2200 | 2183 | ||
2184 | rth->dst.input= ip_local_deliver; | ||
2201 | rth->dst.output= ip_rt_bug; | 2185 | rth->dst.output= ip_rt_bug; |
2202 | rth->dst.obsolete = -1; | 2186 | #ifdef CONFIG_IP_ROUTE_CLASSID |
2203 | rth->rt_genid = rt_genid(net); | 2187 | rth->dst.tclassid = itag; |
2188 | #endif | ||
2204 | 2189 | ||
2205 | atomic_set(&rth->dst.__refcnt, 1); | 2190 | rth->rt_key_dst = daddr; |
2206 | rth->dst.flags= DST_HOST; | 2191 | rth->rt_key_src = saddr; |
2207 | if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) | 2192 | rth->rt_genid = rt_genid(net); |
2208 | rth->dst.flags |= DST_NOPOLICY; | 2193 | rth->rt_flags = flags|RTCF_LOCAL; |
2209 | rth->fl.fl4_dst = daddr; | 2194 | rth->rt_type = res.type; |
2195 | rth->rt_key_tos = tos; | ||
2210 | rth->rt_dst = daddr; | 2196 | rth->rt_dst = daddr; |
2211 | rth->fl.fl4_tos = tos; | ||
2212 | rth->fl.mark = skb->mark; | ||
2213 | rth->fl.fl4_src = saddr; | ||
2214 | rth->rt_src = saddr; | 2197 | rth->rt_src = saddr; |
2215 | #ifdef CONFIG_NET_CLS_ROUTE | 2198 | #ifdef CONFIG_IP_ROUTE_CLASSID |
2216 | rth->dst.tclassid = itag; | 2199 | rth->dst.tclassid = itag; |
2217 | #endif | 2200 | #endif |
2218 | rth->rt_iif = | 2201 | rth->rt_route_iif = dev->ifindex; |
2219 | rth->fl.iif = dev->ifindex; | 2202 | rth->rt_iif = dev->ifindex; |
2220 | rth->dst.dev = net->loopback_dev; | 2203 | rth->rt_oif = 0; |
2221 | dev_hold(rth->dst.dev); | 2204 | rth->rt_mark = skb->mark; |
2222 | rth->idev = in_dev_get(rth->dst.dev); | ||
2223 | rth->rt_gateway = daddr; | 2205 | rth->rt_gateway = daddr; |
2224 | rth->rt_spec_dst= spec_dst; | 2206 | rth->rt_spec_dst= spec_dst; |
2225 | rth->dst.input= ip_local_deliver; | 2207 | rth->rt_peer_genid = 0; |
2226 | rth->rt_flags = flags|RTCF_LOCAL; | 2208 | rth->peer = NULL; |
2209 | rth->fi = NULL; | ||
2227 | if (res.type == RTN_UNREACHABLE) { | 2210 | if (res.type == RTN_UNREACHABLE) { |
2228 | rth->dst.input= ip_error; | 2211 | rth->dst.input= ip_error; |
2229 | rth->dst.error= -err; | 2212 | rth->dst.error= -err; |
2230 | rth->rt_flags &= ~RTCF_LOCAL; | 2213 | rth->rt_flags &= ~RTCF_LOCAL; |
2231 | } | 2214 | } |
2232 | rth->rt_type = res.type; | 2215 | hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net)); |
2233 | hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); | 2216 | rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif); |
2234 | err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); | 2217 | err = 0; |
2235 | goto done; | 2218 | if (IS_ERR(rth)) |
2219 | err = PTR_ERR(rth); | ||
2220 | goto out; | ||
2236 | 2221 | ||
2237 | no_route: | 2222 | no_route: |
2238 | RT_CACHE_STAT_INC(in_no_route); | 2223 | RT_CACHE_STAT_INC(in_no_route); |
@@ -2255,21 +2240,21 @@ martian_destination: | |||
2255 | 2240 | ||
2256 | e_hostunreach: | 2241 | e_hostunreach: |
2257 | err = -EHOSTUNREACH; | 2242 | err = -EHOSTUNREACH; |
2258 | goto done; | 2243 | goto out; |
2259 | 2244 | ||
2260 | e_inval: | 2245 | e_inval: |
2261 | err = -EINVAL; | 2246 | err = -EINVAL; |
2262 | goto done; | 2247 | goto out; |
2263 | 2248 | ||
2264 | e_nobufs: | 2249 | e_nobufs: |
2265 | err = -ENOBUFS; | 2250 | err = -ENOBUFS; |
2266 | goto done; | 2251 | goto out; |
2267 | 2252 | ||
2268 | martian_source: | 2253 | martian_source: |
2269 | err = -EINVAL; | 2254 | err = -EINVAL; |
2270 | martian_source_keep_err: | 2255 | martian_source_keep_err: |
2271 | ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); | 2256 | ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); |
2272 | goto done; | 2257 | goto out; |
2273 | } | 2258 | } |
2274 | 2259 | ||
2275 | int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, | 2260 | int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, |
@@ -2293,12 +2278,12 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2293 | 2278 | ||
2294 | for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; | 2279 | for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; |
2295 | rth = rcu_dereference(rth->dst.rt_next)) { | 2280 | rth = rcu_dereference(rth->dst.rt_next)) { |
2296 | if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) | | 2281 | if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) | |
2297 | ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) | | 2282 | ((__force u32)rth->rt_key_src ^ (__force u32)saddr) | |
2298 | (rth->fl.iif ^ iif) | | 2283 | (rth->rt_iif ^ iif) | |
2299 | rth->fl.oif | | 2284 | rth->rt_oif | |
2300 | (rth->fl.fl4_tos ^ tos)) == 0 && | 2285 | (rth->rt_key_tos ^ tos)) == 0 && |
2301 | rth->fl.mark == skb->mark && | 2286 | rth->rt_mark == skb->mark && |
2302 | net_eq(dev_net(rth->dst.dev), net) && | 2287 | net_eq(dev_net(rth->dst.dev), net) && |
2303 | !rt_is_expired(rth)) { | 2288 | !rt_is_expired(rth)) { |
2304 | if (noref) { | 2289 | if (noref) { |
@@ -2331,8 +2316,8 @@ skip_cache: | |||
2331 | struct in_device *in_dev = __in_dev_get_rcu(dev); | 2316 | struct in_device *in_dev = __in_dev_get_rcu(dev); |
2332 | 2317 | ||
2333 | if (in_dev) { | 2318 | if (in_dev) { |
2334 | int our = ip_check_mc(in_dev, daddr, saddr, | 2319 | int our = ip_check_mc_rcu(in_dev, daddr, saddr, |
2335 | ip_hdr(skb)->protocol); | 2320 | ip_hdr(skb)->protocol); |
2336 | if (our | 2321 | if (our |
2337 | #ifdef CONFIG_IP_MROUTE | 2322 | #ifdef CONFIG_IP_MROUTE |
2338 | || | 2323 | || |
@@ -2355,108 +2340,95 @@ skip_cache: | |||
2355 | } | 2340 | } |
2356 | EXPORT_SYMBOL(ip_route_input_common); | 2341 | EXPORT_SYMBOL(ip_route_input_common); |
2357 | 2342 | ||
2358 | static int __mkroute_output(struct rtable **result, | 2343 | /* called with rcu_read_lock() */ |
2359 | struct fib_result *res, | 2344 | static struct rtable *__mkroute_output(const struct fib_result *res, |
2360 | const struct flowi *fl, | 2345 | const struct flowi4 *fl4, |
2361 | const struct flowi *oldflp, | 2346 | __be32 orig_daddr, __be32 orig_saddr, |
2362 | struct net_device *dev_out, | 2347 | int orig_oif, struct net_device *dev_out, |
2363 | unsigned flags) | 2348 | unsigned int flags) |
2364 | { | 2349 | { |
2365 | struct rtable *rth; | 2350 | struct fib_info *fi = res->fi; |
2351 | u32 tos = RT_FL_TOS(fl4); | ||
2366 | struct in_device *in_dev; | 2352 | struct in_device *in_dev; |
2367 | u32 tos = RT_FL_TOS(oldflp); | 2353 | u16 type = res->type; |
2368 | int err = 0; | 2354 | struct rtable *rth; |
2369 | 2355 | ||
2370 | if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) | 2356 | if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) |
2371 | return -EINVAL; | 2357 | return ERR_PTR(-EINVAL); |
2372 | 2358 | ||
2373 | if (fl->fl4_dst == htonl(0xFFFFFFFF)) | 2359 | if (ipv4_is_lbcast(fl4->daddr)) |
2374 | res->type = RTN_BROADCAST; | 2360 | type = RTN_BROADCAST; |
2375 | else if (ipv4_is_multicast(fl->fl4_dst)) | 2361 | else if (ipv4_is_multicast(fl4->daddr)) |
2376 | res->type = RTN_MULTICAST; | 2362 | type = RTN_MULTICAST; |
2377 | else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst)) | 2363 | else if (ipv4_is_zeronet(fl4->daddr)) |
2378 | return -EINVAL; | 2364 | return ERR_PTR(-EINVAL); |
2379 | 2365 | ||
2380 | if (dev_out->flags & IFF_LOOPBACK) | 2366 | if (dev_out->flags & IFF_LOOPBACK) |
2381 | flags |= RTCF_LOCAL; | 2367 | flags |= RTCF_LOCAL; |
2382 | 2368 | ||
2383 | /* get work reference to inet device */ | 2369 | in_dev = __in_dev_get_rcu(dev_out); |
2384 | in_dev = in_dev_get(dev_out); | ||
2385 | if (!in_dev) | 2370 | if (!in_dev) |
2386 | return -EINVAL; | 2371 | return ERR_PTR(-EINVAL); |
2387 | 2372 | ||
2388 | if (res->type == RTN_BROADCAST) { | 2373 | if (type == RTN_BROADCAST) { |
2389 | flags |= RTCF_BROADCAST | RTCF_LOCAL; | 2374 | flags |= RTCF_BROADCAST | RTCF_LOCAL; |
2390 | if (res->fi) { | 2375 | fi = NULL; |
2391 | fib_info_put(res->fi); | 2376 | } else if (type == RTN_MULTICAST) { |
2392 | res->fi = NULL; | 2377 | flags |= RTCF_MULTICAST | RTCF_LOCAL; |
2393 | } | 2378 | if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, |
2394 | } else if (res->type == RTN_MULTICAST) { | 2379 | fl4->flowi4_proto)) |
2395 | flags |= RTCF_MULTICAST|RTCF_LOCAL; | ||
2396 | if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, | ||
2397 | oldflp->proto)) | ||
2398 | flags &= ~RTCF_LOCAL; | 2380 | flags &= ~RTCF_LOCAL; |
2399 | /* If multicast route do not exist use | 2381 | /* If multicast route do not exist use |
2400 | default one, but do not gateway in this case. | 2382 | * default one, but do not gateway in this case. |
2401 | Yes, it is hack. | 2383 | * Yes, it is hack. |
2402 | */ | 2384 | */ |
2403 | if (res->fi && res->prefixlen < 4) { | 2385 | if (fi && res->prefixlen < 4) |
2404 | fib_info_put(res->fi); | 2386 | fi = NULL; |
2405 | res->fi = NULL; | ||
2406 | } | ||
2407 | } | 2387 | } |
2408 | 2388 | ||
2389 | rth = rt_dst_alloc(dev_out, | ||
2390 | IN_DEV_CONF_GET(in_dev, NOPOLICY), | ||
2391 | IN_DEV_CONF_GET(in_dev, NOXFRM)); | ||
2392 | if (!rth) | ||
2393 | return ERR_PTR(-ENOBUFS); | ||
2409 | 2394 | ||
2410 | rth = dst_alloc(&ipv4_dst_ops); | 2395 | rth->dst.output = ip_output; |
2411 | if (!rth) { | ||
2412 | err = -ENOBUFS; | ||
2413 | goto cleanup; | ||
2414 | } | ||
2415 | 2396 | ||
2416 | atomic_set(&rth->dst.__refcnt, 1); | 2397 | rth->rt_key_dst = orig_daddr; |
2417 | rth->dst.flags= DST_HOST; | 2398 | rth->rt_key_src = orig_saddr; |
2418 | if (IN_DEV_CONF_GET(in_dev, NOXFRM)) | ||
2419 | rth->dst.flags |= DST_NOXFRM; | ||
2420 | if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) | ||
2421 | rth->dst.flags |= DST_NOPOLICY; | ||
2422 | |||
2423 | rth->fl.fl4_dst = oldflp->fl4_dst; | ||
2424 | rth->fl.fl4_tos = tos; | ||
2425 | rth->fl.fl4_src = oldflp->fl4_src; | ||
2426 | rth->fl.oif = oldflp->oif; | ||
2427 | rth->fl.mark = oldflp->mark; | ||
2428 | rth->rt_dst = fl->fl4_dst; | ||
2429 | rth->rt_src = fl->fl4_src; | ||
2430 | rth->rt_iif = oldflp->oif ? : dev_out->ifindex; | ||
2431 | /* get references to the devices that are to be hold by the routing | ||
2432 | cache entry */ | ||
2433 | rth->dst.dev = dev_out; | ||
2434 | dev_hold(dev_out); | ||
2435 | rth->idev = in_dev_get(dev_out); | ||
2436 | rth->rt_gateway = fl->fl4_dst; | ||
2437 | rth->rt_spec_dst= fl->fl4_src; | ||
2438 | |||
2439 | rth->dst.output=ip_output; | ||
2440 | rth->dst.obsolete = -1; | ||
2441 | rth->rt_genid = rt_genid(dev_net(dev_out)); | 2399 | rth->rt_genid = rt_genid(dev_net(dev_out)); |
2400 | rth->rt_flags = flags; | ||
2401 | rth->rt_type = type; | ||
2402 | rth->rt_key_tos = tos; | ||
2403 | rth->rt_dst = fl4->daddr; | ||
2404 | rth->rt_src = fl4->saddr; | ||
2405 | rth->rt_route_iif = 0; | ||
2406 | rth->rt_iif = orig_oif ? : dev_out->ifindex; | ||
2407 | rth->rt_oif = orig_oif; | ||
2408 | rth->rt_mark = fl4->flowi4_mark; | ||
2409 | rth->rt_gateway = fl4->daddr; | ||
2410 | rth->rt_spec_dst= fl4->saddr; | ||
2411 | rth->rt_peer_genid = 0; | ||
2412 | rth->peer = NULL; | ||
2413 | rth->fi = NULL; | ||
2442 | 2414 | ||
2443 | RT_CACHE_STAT_INC(out_slow_tot); | 2415 | RT_CACHE_STAT_INC(out_slow_tot); |
2444 | 2416 | ||
2445 | if (flags & RTCF_LOCAL) { | 2417 | if (flags & RTCF_LOCAL) { |
2446 | rth->dst.input = ip_local_deliver; | 2418 | rth->dst.input = ip_local_deliver; |
2447 | rth->rt_spec_dst = fl->fl4_dst; | 2419 | rth->rt_spec_dst = fl4->daddr; |
2448 | } | 2420 | } |
2449 | if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { | 2421 | if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { |
2450 | rth->rt_spec_dst = fl->fl4_src; | 2422 | rth->rt_spec_dst = fl4->saddr; |
2451 | if (flags & RTCF_LOCAL && | 2423 | if (flags & RTCF_LOCAL && |
2452 | !(dev_out->flags & IFF_LOOPBACK)) { | 2424 | !(dev_out->flags & IFF_LOOPBACK)) { |
2453 | rth->dst.output = ip_mc_output; | 2425 | rth->dst.output = ip_mc_output; |
2454 | RT_CACHE_STAT_INC(out_slow_mc); | 2426 | RT_CACHE_STAT_INC(out_slow_mc); |
2455 | } | 2427 | } |
2456 | #ifdef CONFIG_IP_MROUTE | 2428 | #ifdef CONFIG_IP_MROUTE |
2457 | if (res->type == RTN_MULTICAST) { | 2429 | if (type == RTN_MULTICAST) { |
2458 | if (IN_DEV_MFORWARD(in_dev) && | 2430 | if (IN_DEV_MFORWARD(in_dev) && |
2459 | !ipv4_is_local_multicast(oldflp->fl4_dst)) { | 2431 | !ipv4_is_local_multicast(fl4->daddr)) { |
2460 | rth->dst.input = ip_mr_input; | 2432 | rth->dst.input = ip_mr_input; |
2461 | rth->dst.output = ip_mc_output; | 2433 | rth->dst.output = ip_mc_output; |
2462 | } | 2434 | } |
@@ -2464,73 +2436,47 @@ static int __mkroute_output(struct rtable **result, | |||
2464 | #endif | 2436 | #endif |
2465 | } | 2437 | } |
2466 | 2438 | ||
2467 | rt_set_nexthop(rth, res, 0); | 2439 | rt_set_nexthop(rth, fl4, res, fi, type, 0); |
2468 | |||
2469 | rth->rt_flags = flags; | ||
2470 | |||
2471 | *result = rth; | ||
2472 | cleanup: | ||
2473 | /* release work reference to inet device */ | ||
2474 | in_dev_put(in_dev); | ||
2475 | |||
2476 | return err; | ||
2477 | } | ||
2478 | |||
2479 | static int ip_mkroute_output(struct rtable **rp, | ||
2480 | struct fib_result *res, | ||
2481 | const struct flowi *fl, | ||
2482 | const struct flowi *oldflp, | ||
2483 | struct net_device *dev_out, | ||
2484 | unsigned flags) | ||
2485 | { | ||
2486 | struct rtable *rth = NULL; | ||
2487 | int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); | ||
2488 | unsigned hash; | ||
2489 | if (err == 0) { | ||
2490 | hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, | ||
2491 | rt_genid(dev_net(dev_out))); | ||
2492 | err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif); | ||
2493 | } | ||
2494 | 2440 | ||
2495 | return err; | 2441 | return rth; |
2496 | } | 2442 | } |
2497 | 2443 | ||
2498 | /* | 2444 | /* |
2499 | * Major route resolver routine. | 2445 | * Major route resolver routine. |
2446 | * called with rcu_read_lock(); | ||
2500 | */ | 2447 | */ |
2501 | 2448 | ||
2502 | static int ip_route_output_slow(struct net *net, struct rtable **rp, | 2449 | static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) |
2503 | const struct flowi *oldflp) | 2450 | { |
2504 | { | ||
2505 | u32 tos = RT_FL_TOS(oldflp); | ||
2506 | struct flowi fl = { .nl_u = { .ip4_u = | ||
2507 | { .daddr = oldflp->fl4_dst, | ||
2508 | .saddr = oldflp->fl4_src, | ||
2509 | .tos = tos & IPTOS_RT_MASK, | ||
2510 | .scope = ((tos & RTO_ONLINK) ? | ||
2511 | RT_SCOPE_LINK : | ||
2512 | RT_SCOPE_UNIVERSE), | ||
2513 | } }, | ||
2514 | .mark = oldflp->mark, | ||
2515 | .iif = net->loopback_dev->ifindex, | ||
2516 | .oif = oldflp->oif }; | ||
2517 | struct fib_result res; | ||
2518 | unsigned flags = 0; | ||
2519 | struct net_device *dev_out = NULL; | 2451 | struct net_device *dev_out = NULL; |
2520 | int free_res = 0; | 2452 | u32 tos = RT_FL_TOS(fl4); |
2521 | int err; | 2453 | unsigned int flags = 0; |
2522 | 2454 | struct fib_result res; | |
2455 | struct rtable *rth; | ||
2456 | __be32 orig_daddr; | ||
2457 | __be32 orig_saddr; | ||
2458 | int orig_oif; | ||
2523 | 2459 | ||
2524 | res.fi = NULL; | 2460 | res.fi = NULL; |
2525 | #ifdef CONFIG_IP_MULTIPLE_TABLES | 2461 | #ifdef CONFIG_IP_MULTIPLE_TABLES |
2526 | res.r = NULL; | 2462 | res.r = NULL; |
2527 | #endif | 2463 | #endif |
2528 | 2464 | ||
2529 | if (oldflp->fl4_src) { | 2465 | orig_daddr = fl4->daddr; |
2530 | err = -EINVAL; | 2466 | orig_saddr = fl4->saddr; |
2531 | if (ipv4_is_multicast(oldflp->fl4_src) || | 2467 | orig_oif = fl4->flowi4_oif; |
2532 | ipv4_is_lbcast(oldflp->fl4_src) || | 2468 | |
2533 | ipv4_is_zeronet(oldflp->fl4_src)) | 2469 | fl4->flowi4_iif = net->loopback_dev->ifindex; |
2470 | fl4->flowi4_tos = tos & IPTOS_RT_MASK; | ||
2471 | fl4->flowi4_scope = ((tos & RTO_ONLINK) ? | ||
2472 | RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); | ||
2473 | |||
2474 | rcu_read_lock(); | ||
2475 | if (fl4->saddr) { | ||
2476 | rth = ERR_PTR(-EINVAL); | ||
2477 | if (ipv4_is_multicast(fl4->saddr) || | ||
2478 | ipv4_is_lbcast(fl4->saddr) || | ||
2479 | ipv4_is_zeronet(fl4->saddr)) | ||
2534 | goto out; | 2480 | goto out; |
2535 | 2481 | ||
2536 | /* I removed check for oif == dev_out->oif here. | 2482 | /* I removed check for oif == dev_out->oif here. |
@@ -2541,11 +2487,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, | |||
2541 | of another iface. --ANK | 2487 | of another iface. --ANK |
2542 | */ | 2488 | */ |
2543 | 2489 | ||
2544 | if (oldflp->oif == 0 && | 2490 | if (fl4->flowi4_oif == 0 && |
2545 | (ipv4_is_multicast(oldflp->fl4_dst) || | 2491 | (ipv4_is_multicast(fl4->daddr) || |
2546 | oldflp->fl4_dst == htonl(0xFFFFFFFF))) { | 2492 | ipv4_is_lbcast(fl4->daddr))) { |
2547 | /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ | 2493 | /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ |
2548 | dev_out = ip_dev_find(net, oldflp->fl4_src); | 2494 | dev_out = __ip_dev_find(net, fl4->saddr, false); |
2549 | if (dev_out == NULL) | 2495 | if (dev_out == NULL) |
2550 | goto out; | 2496 | goto out; |
2551 | 2497 | ||
@@ -2564,67 +2510,60 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, | |||
2564 | Luckily, this hack is good workaround. | 2510 | Luckily, this hack is good workaround. |
2565 | */ | 2511 | */ |
2566 | 2512 | ||
2567 | fl.oif = dev_out->ifindex; | 2513 | fl4->flowi4_oif = dev_out->ifindex; |
2568 | goto make_route; | 2514 | goto make_route; |
2569 | } | 2515 | } |
2570 | 2516 | ||
2571 | if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { | 2517 | if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { |
2572 | /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ | 2518 | /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ |
2573 | dev_out = ip_dev_find(net, oldflp->fl4_src); | 2519 | if (!__ip_dev_find(net, fl4->saddr, false)) |
2574 | if (dev_out == NULL) | ||
2575 | goto out; | 2520 | goto out; |
2576 | dev_put(dev_out); | ||
2577 | dev_out = NULL; | ||
2578 | } | 2521 | } |
2579 | } | 2522 | } |
2580 | 2523 | ||
2581 | 2524 | ||
2582 | if (oldflp->oif) { | 2525 | if (fl4->flowi4_oif) { |
2583 | dev_out = dev_get_by_index(net, oldflp->oif); | 2526 | dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); |
2584 | err = -ENODEV; | 2527 | rth = ERR_PTR(-ENODEV); |
2585 | if (dev_out == NULL) | 2528 | if (dev_out == NULL) |
2586 | goto out; | 2529 | goto out; |
2587 | 2530 | ||
2588 | /* RACE: Check return value of inet_select_addr instead. */ | 2531 | /* RACE: Check return value of inet_select_addr instead. */ |
2589 | if (__in_dev_get_rtnl(dev_out) == NULL) { | 2532 | if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { |
2590 | dev_put(dev_out); | 2533 | rth = ERR_PTR(-ENETUNREACH); |
2591 | goto out; /* Wrong error code */ | 2534 | goto out; |
2592 | } | 2535 | } |
2593 | 2536 | if (ipv4_is_local_multicast(fl4->daddr) || | |
2594 | if (ipv4_is_local_multicast(oldflp->fl4_dst) || | 2537 | ipv4_is_lbcast(fl4->daddr)) { |
2595 | oldflp->fl4_dst == htonl(0xFFFFFFFF)) { | 2538 | if (!fl4->saddr) |
2596 | if (!fl.fl4_src) | 2539 | fl4->saddr = inet_select_addr(dev_out, 0, |
2597 | fl.fl4_src = inet_select_addr(dev_out, 0, | ||
2598 | RT_SCOPE_LINK); | 2540 | RT_SCOPE_LINK); |
2599 | goto make_route; | 2541 | goto make_route; |
2600 | } | 2542 | } |
2601 | if (!fl.fl4_src) { | 2543 | if (fl4->saddr) { |
2602 | if (ipv4_is_multicast(oldflp->fl4_dst)) | 2544 | if (ipv4_is_multicast(fl4->daddr)) |
2603 | fl.fl4_src = inet_select_addr(dev_out, 0, | 2545 | fl4->saddr = inet_select_addr(dev_out, 0, |
2604 | fl.fl4_scope); | 2546 | fl4->flowi4_scope); |
2605 | else if (!oldflp->fl4_dst) | 2547 | else if (!fl4->daddr) |
2606 | fl.fl4_src = inet_select_addr(dev_out, 0, | 2548 | fl4->saddr = inet_select_addr(dev_out, 0, |
2607 | RT_SCOPE_HOST); | 2549 | RT_SCOPE_HOST); |
2608 | } | 2550 | } |
2609 | } | 2551 | } |
2610 | 2552 | ||
2611 | if (!fl.fl4_dst) { | 2553 | if (!fl4->daddr) { |
2612 | fl.fl4_dst = fl.fl4_src; | 2554 | fl4->daddr = fl4->saddr; |
2613 | if (!fl.fl4_dst) | 2555 | if (!fl4->daddr) |
2614 | fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); | 2556 | fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); |
2615 | if (dev_out) | ||
2616 | dev_put(dev_out); | ||
2617 | dev_out = net->loopback_dev; | 2557 | dev_out = net->loopback_dev; |
2618 | dev_hold(dev_out); | 2558 | fl4->flowi4_oif = net->loopback_dev->ifindex; |
2619 | fl.oif = net->loopback_dev->ifindex; | ||
2620 | res.type = RTN_LOCAL; | 2559 | res.type = RTN_LOCAL; |
2621 | flags |= RTCF_LOCAL; | 2560 | flags |= RTCF_LOCAL; |
2622 | goto make_route; | 2561 | goto make_route; |
2623 | } | 2562 | } |
2624 | 2563 | ||
2625 | if (fib_lookup(net, &fl, &res)) { | 2564 | if (fib_lookup(net, fl4, &res)) { |
2626 | res.fi = NULL; | 2565 | res.fi = NULL; |
2627 | if (oldflp->oif) { | 2566 | if (fl4->flowi4_oif) { |
2628 | /* Apparently, routing tables are wrong. Assume, | 2567 | /* Apparently, routing tables are wrong. Assume, |
2629 | that the destination is on link. | 2568 | that the destination is on link. |
2630 | 2569 | ||
@@ -2643,98 +2582,100 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, | |||
2643 | likely IPv6, but we do not. | 2582 | likely IPv6, but we do not. |
2644 | */ | 2583 | */ |
2645 | 2584 | ||
2646 | if (fl.fl4_src == 0) | 2585 | if (fl4->saddr == 0) |
2647 | fl.fl4_src = inet_select_addr(dev_out, 0, | 2586 | fl4->saddr = inet_select_addr(dev_out, 0, |
2648 | RT_SCOPE_LINK); | 2587 | RT_SCOPE_LINK); |
2649 | res.type = RTN_UNICAST; | 2588 | res.type = RTN_UNICAST; |
2650 | goto make_route; | 2589 | goto make_route; |
2651 | } | 2590 | } |
2652 | if (dev_out) | 2591 | rth = ERR_PTR(-ENETUNREACH); |
2653 | dev_put(dev_out); | ||
2654 | err = -ENETUNREACH; | ||
2655 | goto out; | 2592 | goto out; |
2656 | } | 2593 | } |
2657 | free_res = 1; | ||
2658 | 2594 | ||
2659 | if (res.type == RTN_LOCAL) { | 2595 | if (res.type == RTN_LOCAL) { |
2660 | if (!fl.fl4_src) | 2596 | if (!fl4->saddr) { |
2661 | fl.fl4_src = fl.fl4_dst; | 2597 | if (res.fi->fib_prefsrc) |
2662 | if (dev_out) | 2598 | fl4->saddr = res.fi->fib_prefsrc; |
2663 | dev_put(dev_out); | 2599 | else |
2600 | fl4->saddr = fl4->daddr; | ||
2601 | } | ||
2664 | dev_out = net->loopback_dev; | 2602 | dev_out = net->loopback_dev; |
2665 | dev_hold(dev_out); | 2603 | fl4->flowi4_oif = dev_out->ifindex; |
2666 | fl.oif = dev_out->ifindex; | ||
2667 | if (res.fi) | ||
2668 | fib_info_put(res.fi); | ||
2669 | res.fi = NULL; | 2604 | res.fi = NULL; |
2670 | flags |= RTCF_LOCAL; | 2605 | flags |= RTCF_LOCAL; |
2671 | goto make_route; | 2606 | goto make_route; |
2672 | } | 2607 | } |
2673 | 2608 | ||
2674 | #ifdef CONFIG_IP_ROUTE_MULTIPATH | 2609 | #ifdef CONFIG_IP_ROUTE_MULTIPATH |
2675 | if (res.fi->fib_nhs > 1 && fl.oif == 0) | 2610 | if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) |
2676 | fib_select_multipath(&fl, &res); | 2611 | fib_select_multipath(&res); |
2677 | else | 2612 | else |
2678 | #endif | 2613 | #endif |
2679 | if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) | 2614 | if (!res.prefixlen && |
2680 | fib_select_default(net, &fl, &res); | 2615 | res.table->tb_num_default > 1 && |
2616 | res.type == RTN_UNICAST && !fl4->flowi4_oif) | ||
2617 | fib_select_default(&res); | ||
2681 | 2618 | ||
2682 | if (!fl.fl4_src) | 2619 | if (!fl4->saddr) |
2683 | fl.fl4_src = FIB_RES_PREFSRC(res); | 2620 | fl4->saddr = FIB_RES_PREFSRC(net, res); |
2684 | 2621 | ||
2685 | if (dev_out) | ||
2686 | dev_put(dev_out); | ||
2687 | dev_out = FIB_RES_DEV(res); | 2622 | dev_out = FIB_RES_DEV(res); |
2688 | dev_hold(dev_out); | 2623 | fl4->flowi4_oif = dev_out->ifindex; |
2689 | fl.oif = dev_out->ifindex; | ||
2690 | 2624 | ||
2691 | 2625 | ||
2692 | make_route: | 2626 | make_route: |
2693 | err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); | 2627 | rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif, |
2628 | dev_out, flags); | ||
2629 | if (!IS_ERR(rth)) { | ||
2630 | unsigned int hash; | ||
2694 | 2631 | ||
2632 | hash = rt_hash(orig_daddr, orig_saddr, orig_oif, | ||
2633 | rt_genid(dev_net(dev_out))); | ||
2634 | rth = rt_intern_hash(hash, rth, NULL, orig_oif); | ||
2635 | } | ||
2695 | 2636 | ||
2696 | if (free_res) | 2637 | out: |
2697 | fib_res_put(&res); | 2638 | rcu_read_unlock(); |
2698 | if (dev_out) | 2639 | return rth; |
2699 | dev_put(dev_out); | ||
2700 | out: return err; | ||
2701 | } | 2640 | } |
2702 | 2641 | ||
2703 | int __ip_route_output_key(struct net *net, struct rtable **rp, | 2642 | struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4) |
2704 | const struct flowi *flp) | ||
2705 | { | 2643 | { |
2706 | unsigned hash; | ||
2707 | struct rtable *rth; | 2644 | struct rtable *rth; |
2645 | unsigned int hash; | ||
2708 | 2646 | ||
2709 | if (!rt_caching(net)) | 2647 | if (!rt_caching(net)) |
2710 | goto slow_output; | 2648 | goto slow_output; |
2711 | 2649 | ||
2712 | hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); | 2650 | hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net)); |
2713 | 2651 | ||
2714 | rcu_read_lock_bh(); | 2652 | rcu_read_lock_bh(); |
2715 | for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; | 2653 | for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; |
2716 | rth = rcu_dereference_bh(rth->dst.rt_next)) { | 2654 | rth = rcu_dereference_bh(rth->dst.rt_next)) { |
2717 | if (rth->fl.fl4_dst == flp->fl4_dst && | 2655 | if (rth->rt_key_dst == flp4->daddr && |
2718 | rth->fl.fl4_src == flp->fl4_src && | 2656 | rth->rt_key_src == flp4->saddr && |
2719 | rth->fl.iif == 0 && | 2657 | rt_is_output_route(rth) && |
2720 | rth->fl.oif == flp->oif && | 2658 | rth->rt_oif == flp4->flowi4_oif && |
2721 | rth->fl.mark == flp->mark && | 2659 | rth->rt_mark == flp4->flowi4_mark && |
2722 | !((rth->fl.fl4_tos ^ flp->fl4_tos) & | 2660 | !((rth->rt_key_tos ^ flp4->flowi4_tos) & |
2723 | (IPTOS_RT_MASK | RTO_ONLINK)) && | 2661 | (IPTOS_RT_MASK | RTO_ONLINK)) && |
2724 | net_eq(dev_net(rth->dst.dev), net) && | 2662 | net_eq(dev_net(rth->dst.dev), net) && |
2725 | !rt_is_expired(rth)) { | 2663 | !rt_is_expired(rth)) { |
2726 | dst_use(&rth->dst, jiffies); | 2664 | dst_use(&rth->dst, jiffies); |
2727 | RT_CACHE_STAT_INC(out_hit); | 2665 | RT_CACHE_STAT_INC(out_hit); |
2728 | rcu_read_unlock_bh(); | 2666 | rcu_read_unlock_bh(); |
2729 | *rp = rth; | 2667 | if (!flp4->saddr) |
2730 | return 0; | 2668 | flp4->saddr = rth->rt_src; |
2669 | if (!flp4->daddr) | ||
2670 | flp4->daddr = rth->rt_dst; | ||
2671 | return rth; | ||
2731 | } | 2672 | } |
2732 | RT_CACHE_STAT_INC(out_hlist_search); | 2673 | RT_CACHE_STAT_INC(out_hlist_search); |
2733 | } | 2674 | } |
2734 | rcu_read_unlock_bh(); | 2675 | rcu_read_unlock_bh(); |
2735 | 2676 | ||
2736 | slow_output: | 2677 | slow_output: |
2737 | return ip_route_output_slow(net, rp, flp); | 2678 | return ip_route_output_slow(net, flp4); |
2738 | } | 2679 | } |
2739 | EXPORT_SYMBOL_GPL(__ip_route_output_key); | 2680 | EXPORT_SYMBOL_GPL(__ip_route_output_key); |
2740 | 2681 | ||
@@ -2743,95 +2684,96 @@ static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 coo | |||
2743 | return NULL; | 2684 | return NULL; |
2744 | } | 2685 | } |
2745 | 2686 | ||
2687 | static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst) | ||
2688 | { | ||
2689 | return 0; | ||
2690 | } | ||
2691 | |||
2746 | static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) | 2692 | static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) |
2747 | { | 2693 | { |
2748 | } | 2694 | } |
2749 | 2695 | ||
2696 | static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, | ||
2697 | unsigned long old) | ||
2698 | { | ||
2699 | return NULL; | ||
2700 | } | ||
2701 | |||
2750 | static struct dst_ops ipv4_dst_blackhole_ops = { | 2702 | static struct dst_ops ipv4_dst_blackhole_ops = { |
2751 | .family = AF_INET, | 2703 | .family = AF_INET, |
2752 | .protocol = cpu_to_be16(ETH_P_IP), | 2704 | .protocol = cpu_to_be16(ETH_P_IP), |
2753 | .destroy = ipv4_dst_destroy, | 2705 | .destroy = ipv4_dst_destroy, |
2754 | .check = ipv4_blackhole_dst_check, | 2706 | .check = ipv4_blackhole_dst_check, |
2707 | .default_mtu = ipv4_blackhole_default_mtu, | ||
2708 | .default_advmss = ipv4_default_advmss, | ||
2755 | .update_pmtu = ipv4_rt_blackhole_update_pmtu, | 2709 | .update_pmtu = ipv4_rt_blackhole_update_pmtu, |
2756 | .entries = ATOMIC_INIT(0), | 2710 | .cow_metrics = ipv4_rt_blackhole_cow_metrics, |
2757 | }; | 2711 | }; |
2758 | 2712 | ||
2759 | 2713 | struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) | |
2760 | static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp) | ||
2761 | { | 2714 | { |
2762 | struct rtable *ort = *rp; | 2715 | struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0); |
2763 | struct rtable *rt = (struct rtable *) | 2716 | struct rtable *ort = (struct rtable *) dst_orig; |
2764 | dst_alloc(&ipv4_dst_blackhole_ops); | ||
2765 | 2717 | ||
2766 | if (rt) { | 2718 | if (rt) { |
2767 | struct dst_entry *new = &rt->dst; | 2719 | struct dst_entry *new = &rt->dst; |
2768 | 2720 | ||
2769 | atomic_set(&new->__refcnt, 1); | ||
2770 | new->__use = 1; | 2721 | new->__use = 1; |
2771 | new->input = dst_discard; | 2722 | new->input = dst_discard; |
2772 | new->output = dst_discard; | 2723 | new->output = dst_discard; |
2773 | memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32)); | 2724 | dst_copy_metrics(new, &ort->dst); |
2774 | 2725 | ||
2775 | new->dev = ort->dst.dev; | 2726 | new->dev = ort->dst.dev; |
2776 | if (new->dev) | 2727 | if (new->dev) |
2777 | dev_hold(new->dev); | 2728 | dev_hold(new->dev); |
2778 | 2729 | ||
2779 | rt->fl = ort->fl; | 2730 | rt->rt_key_dst = ort->rt_key_dst; |
2731 | rt->rt_key_src = ort->rt_key_src; | ||
2732 | rt->rt_key_tos = ort->rt_key_tos; | ||
2733 | rt->rt_route_iif = ort->rt_route_iif; | ||
2734 | rt->rt_iif = ort->rt_iif; | ||
2735 | rt->rt_oif = ort->rt_oif; | ||
2736 | rt->rt_mark = ort->rt_mark; | ||
2780 | 2737 | ||
2781 | rt->idev = ort->idev; | ||
2782 | if (rt->idev) | ||
2783 | in_dev_hold(rt->idev); | ||
2784 | rt->rt_genid = rt_genid(net); | 2738 | rt->rt_genid = rt_genid(net); |
2785 | rt->rt_flags = ort->rt_flags; | 2739 | rt->rt_flags = ort->rt_flags; |
2786 | rt->rt_type = ort->rt_type; | 2740 | rt->rt_type = ort->rt_type; |
2787 | rt->rt_dst = ort->rt_dst; | 2741 | rt->rt_dst = ort->rt_dst; |
2788 | rt->rt_src = ort->rt_src; | 2742 | rt->rt_src = ort->rt_src; |
2789 | rt->rt_iif = ort->rt_iif; | ||
2790 | rt->rt_gateway = ort->rt_gateway; | 2743 | rt->rt_gateway = ort->rt_gateway; |
2791 | rt->rt_spec_dst = ort->rt_spec_dst; | 2744 | rt->rt_spec_dst = ort->rt_spec_dst; |
2792 | rt->peer = ort->peer; | 2745 | rt->peer = ort->peer; |
2793 | if (rt->peer) | 2746 | if (rt->peer) |
2794 | atomic_inc(&rt->peer->refcnt); | 2747 | atomic_inc(&rt->peer->refcnt); |
2748 | rt->fi = ort->fi; | ||
2749 | if (rt->fi) | ||
2750 | atomic_inc(&rt->fi->fib_clntref); | ||
2795 | 2751 | ||
2796 | dst_free(new); | 2752 | dst_free(new); |
2797 | } | 2753 | } |
2798 | 2754 | ||
2799 | dst_release(&(*rp)->dst); | 2755 | dst_release(dst_orig); |
2800 | *rp = rt; | 2756 | |
2801 | return (rt ? 0 : -ENOMEM); | 2757 | return rt ? &rt->dst : ERR_PTR(-ENOMEM); |
2802 | } | 2758 | } |
2803 | 2759 | ||
2804 | int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, | 2760 | struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, |
2805 | struct sock *sk, int flags) | 2761 | struct sock *sk) |
2806 | { | 2762 | { |
2807 | int err; | 2763 | struct rtable *rt = __ip_route_output_key(net, flp4); |
2808 | |||
2809 | if ((err = __ip_route_output_key(net, rp, flp)) != 0) | ||
2810 | return err; | ||
2811 | 2764 | ||
2812 | if (flp->proto) { | 2765 | if (IS_ERR(rt)) |
2813 | if (!flp->fl4_src) | 2766 | return rt; |
2814 | flp->fl4_src = (*rp)->rt_src; | ||
2815 | if (!flp->fl4_dst) | ||
2816 | flp->fl4_dst = (*rp)->rt_dst; | ||
2817 | err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk, | ||
2818 | flags ? XFRM_LOOKUP_WAIT : 0); | ||
2819 | if (err == -EREMOTE) | ||
2820 | err = ipv4_dst_blackhole(net, rp, flp); | ||
2821 | 2767 | ||
2822 | return err; | 2768 | if (flp4->flowi4_proto) |
2823 | } | 2769 | rt = (struct rtable *) xfrm_lookup(net, &rt->dst, |
2770 | flowi4_to_flowi(flp4), | ||
2771 | sk, 0); | ||
2824 | 2772 | ||
2825 | return 0; | 2773 | return rt; |
2826 | } | 2774 | } |
2827 | EXPORT_SYMBOL_GPL(ip_route_output_flow); | 2775 | EXPORT_SYMBOL_GPL(ip_route_output_flow); |
2828 | 2776 | ||
2829 | int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp) | ||
2830 | { | ||
2831 | return ip_route_output_flow(net, rp, flp, NULL, 0); | ||
2832 | } | ||
2833 | EXPORT_SYMBOL(ip_route_output_key); | ||
2834 | |||
2835 | static int rt_fill_info(struct net *net, | 2777 | static int rt_fill_info(struct net *net, |
2836 | struct sk_buff *skb, u32 pid, u32 seq, int event, | 2778 | struct sk_buff *skb, u32 pid, u32 seq, int event, |
2837 | int nowait, unsigned int flags) | 2779 | int nowait, unsigned int flags) |
@@ -2839,7 +2781,8 @@ static int rt_fill_info(struct net *net, | |||
2839 | struct rtable *rt = skb_rtable(skb); | 2781 | struct rtable *rt = skb_rtable(skb); |
2840 | struct rtmsg *r; | 2782 | struct rtmsg *r; |
2841 | struct nlmsghdr *nlh; | 2783 | struct nlmsghdr *nlh; |
2842 | long expires; | 2784 | long expires = 0; |
2785 | const struct inet_peer *peer = rt->peer; | ||
2843 | u32 id = 0, ts = 0, tsage = 0, error; | 2786 | u32 id = 0, ts = 0, tsage = 0, error; |
2844 | 2787 | ||
2845 | nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); | 2788 | nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); |
@@ -2850,7 +2793,7 @@ static int rt_fill_info(struct net *net, | |||
2850 | r->rtm_family = AF_INET; | 2793 | r->rtm_family = AF_INET; |
2851 | r->rtm_dst_len = 32; | 2794 | r->rtm_dst_len = 32; |
2852 | r->rtm_src_len = 0; | 2795 | r->rtm_src_len = 0; |
2853 | r->rtm_tos = rt->fl.fl4_tos; | 2796 | r->rtm_tos = rt->rt_key_tos; |
2854 | r->rtm_table = RT_TABLE_MAIN; | 2797 | r->rtm_table = RT_TABLE_MAIN; |
2855 | NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); | 2798 | NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); |
2856 | r->rtm_type = rt->rt_type; | 2799 | r->rtm_type = rt->rt_type; |
@@ -2862,48 +2805,52 @@ static int rt_fill_info(struct net *net, | |||
2862 | 2805 | ||
2863 | NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); | 2806 | NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); |
2864 | 2807 | ||
2865 | if (rt->fl.fl4_src) { | 2808 | if (rt->rt_key_src) { |
2866 | r->rtm_src_len = 32; | 2809 | r->rtm_src_len = 32; |
2867 | NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src); | 2810 | NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src); |
2868 | } | 2811 | } |
2869 | if (rt->dst.dev) | 2812 | if (rt->dst.dev) |
2870 | NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); | 2813 | NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); |
2871 | #ifdef CONFIG_NET_CLS_ROUTE | 2814 | #ifdef CONFIG_IP_ROUTE_CLASSID |
2872 | if (rt->dst.tclassid) | 2815 | if (rt->dst.tclassid) |
2873 | NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); | 2816 | NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); |
2874 | #endif | 2817 | #endif |
2875 | if (rt->fl.iif) | 2818 | if (rt_is_input_route(rt)) |
2876 | NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); | 2819 | NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); |
2877 | else if (rt->rt_src != rt->fl.fl4_src) | 2820 | else if (rt->rt_src != rt->rt_key_src) |
2878 | NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); | 2821 | NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); |
2879 | 2822 | ||
2880 | if (rt->rt_dst != rt->rt_gateway) | 2823 | if (rt->rt_dst != rt->rt_gateway) |
2881 | NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); | 2824 | NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); |
2882 | 2825 | ||
2883 | if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0) | 2826 | if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) |
2884 | goto nla_put_failure; | 2827 | goto nla_put_failure; |
2885 | 2828 | ||
2886 | if (rt->fl.mark) | 2829 | if (rt->rt_mark) |
2887 | NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark); | 2830 | NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark); |
2888 | 2831 | ||
2889 | error = rt->dst.error; | 2832 | error = rt->dst.error; |
2890 | expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; | 2833 | if (peer) { |
2891 | if (rt->peer) { | ||
2892 | inet_peer_refcheck(rt->peer); | 2834 | inet_peer_refcheck(rt->peer); |
2893 | id = atomic_read(&rt->peer->ip_id_count) & 0xffff; | 2835 | id = atomic_read(&peer->ip_id_count) & 0xffff; |
2894 | if (rt->peer->tcp_ts_stamp) { | 2836 | if (peer->tcp_ts_stamp) { |
2895 | ts = rt->peer->tcp_ts; | 2837 | ts = peer->tcp_ts; |
2896 | tsage = get_seconds() - rt->peer->tcp_ts_stamp; | 2838 | tsage = get_seconds() - peer->tcp_ts_stamp; |
2897 | } | 2839 | } |
2840 | expires = ACCESS_ONCE(peer->pmtu_expires); | ||
2841 | if (expires) | ||
2842 | expires -= jiffies; | ||
2898 | } | 2843 | } |
2899 | 2844 | ||
2900 | if (rt->fl.iif) { | 2845 | if (rt_is_input_route(rt)) { |
2901 | #ifdef CONFIG_IP_MROUTE | 2846 | #ifdef CONFIG_IP_MROUTE |
2902 | __be32 dst = rt->rt_dst; | 2847 | __be32 dst = rt->rt_dst; |
2903 | 2848 | ||
2904 | if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && | 2849 | if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && |
2905 | IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { | 2850 | IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { |
2906 | int err = ipmr_get_route(net, skb, r, nowait); | 2851 | int err = ipmr_get_route(net, skb, |
2852 | rt->rt_src, rt->rt_dst, | ||
2853 | r, nowait); | ||
2907 | if (err <= 0) { | 2854 | if (err <= 0) { |
2908 | if (!nowait) { | 2855 | if (!nowait) { |
2909 | if (err == 0) | 2856 | if (err == 0) |
@@ -2917,7 +2864,7 @@ static int rt_fill_info(struct net *net, | |||
2917 | } | 2864 | } |
2918 | } else | 2865 | } else |
2919 | #endif | 2866 | #endif |
2920 | NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif); | 2867 | NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif); |
2921 | } | 2868 | } |
2922 | 2869 | ||
2923 | if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, | 2870 | if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, |
@@ -2991,18 +2938,18 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void | |||
2991 | if (err == 0 && rt->dst.error) | 2938 | if (err == 0 && rt->dst.error) |
2992 | err = -rt->dst.error; | 2939 | err = -rt->dst.error; |
2993 | } else { | 2940 | } else { |
2994 | struct flowi fl = { | 2941 | struct flowi4 fl4 = { |
2995 | .nl_u = { | 2942 | .daddr = dst, |
2996 | .ip4_u = { | 2943 | .saddr = src, |
2997 | .daddr = dst, | 2944 | .flowi4_tos = rtm->rtm_tos, |
2998 | .saddr = src, | 2945 | .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, |
2999 | .tos = rtm->rtm_tos, | 2946 | .flowi4_mark = mark, |
3000 | }, | ||
3001 | }, | ||
3002 | .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, | ||
3003 | .mark = mark, | ||
3004 | }; | 2947 | }; |
3005 | err = ip_route_output_key(net, &rt, &fl); | 2948 | rt = ip_route_output_key(net, &fl4); |
2949 | |||
2950 | err = 0; | ||
2951 | if (IS_ERR(rt)) | ||
2952 | err = PTR_ERR(rt); | ||
3006 | } | 2953 | } |
3007 | 2954 | ||
3008 | if (err) | 2955 | if (err) |
@@ -3285,6 +3232,8 @@ static __net_init int rt_genid_init(struct net *net) | |||
3285 | { | 3232 | { |
3286 | get_random_bytes(&net->ipv4.rt_genid, | 3233 | get_random_bytes(&net->ipv4.rt_genid, |
3287 | sizeof(net->ipv4.rt_genid)); | 3234 | sizeof(net->ipv4.rt_genid)); |
3235 | get_random_bytes(&net->ipv4.dev_addr_genid, | ||
3236 | sizeof(net->ipv4.dev_addr_genid)); | ||
3288 | return 0; | 3237 | return 0; |
3289 | } | 3238 | } |
3290 | 3239 | ||
@@ -3293,9 +3242,9 @@ static __net_initdata struct pernet_operations rt_genid_ops = { | |||
3293 | }; | 3242 | }; |
3294 | 3243 | ||
3295 | 3244 | ||
3296 | #ifdef CONFIG_NET_CLS_ROUTE | 3245 | #ifdef CONFIG_IP_ROUTE_CLASSID |
3297 | struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; | 3246 | struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; |
3298 | #endif /* CONFIG_NET_CLS_ROUTE */ | 3247 | #endif /* CONFIG_IP_ROUTE_CLASSID */ |
3299 | 3248 | ||
3300 | static __initdata unsigned long rhash_entries; | 3249 | static __initdata unsigned long rhash_entries; |
3301 | static int __init set_rhash_entries(char *str) | 3250 | static int __init set_rhash_entries(char *str) |
@@ -3311,7 +3260,7 @@ int __init ip_rt_init(void) | |||
3311 | { | 3260 | { |
3312 | int rc = 0; | 3261 | int rc = 0; |
3313 | 3262 | ||
3314 | #ifdef CONFIG_NET_CLS_ROUTE | 3263 | #ifdef CONFIG_IP_ROUTE_CLASSID |
3315 | ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); | 3264 | ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); |
3316 | if (!ip_rt_acct) | 3265 | if (!ip_rt_acct) |
3317 | panic("IP: failed to allocate ip_rt_acct\n"); | 3266 | panic("IP: failed to allocate ip_rt_acct\n"); |
@@ -3323,6 +3272,12 @@ int __init ip_rt_init(void) | |||
3323 | 3272 | ||
3324 | ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; | 3273 | ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; |
3325 | 3274 | ||
3275 | if (dst_entries_init(&ipv4_dst_ops) < 0) | ||
3276 | panic("IP: failed to allocate ipv4_dst_ops counter\n"); | ||
3277 | |||
3278 | if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) | ||
3279 | panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); | ||
3280 | |||
3326 | rt_hash_table = (struct rt_hash_bucket *) | 3281 | rt_hash_table = (struct rt_hash_bucket *) |
3327 | alloc_large_system_hash("IP route cache", | 3282 | alloc_large_system_hash("IP route cache", |
3328 | sizeof(struct rt_hash_bucket), | 3283 | sizeof(struct rt_hash_bucket), |
@@ -3342,14 +3297,6 @@ int __init ip_rt_init(void) | |||
3342 | devinet_init(); | 3297 | devinet_init(); |
3343 | ip_fib_init(); | 3298 | ip_fib_init(); |
3344 | 3299 | ||
3345 | /* All the timers, started at system startup tend | ||
3346 | to synchronize. Perturb it a bit. | ||
3347 | */ | ||
3348 | INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func); | ||
3349 | expires_ljiffies = jiffies; | ||
3350 | schedule_delayed_work(&expires_work, | ||
3351 | net_random() % ip_rt_gc_interval + ip_rt_gc_interval); | ||
3352 | |||
3353 | if (ip_rt_proc_init()) | 3300 | if (ip_rt_proc_init()) |
3354 | printk(KERN_ERR "Unable to create route proc files\n"); | 3301 | printk(KERN_ERR "Unable to create route proc files\n"); |
3355 | #ifdef CONFIG_XFRM | 3302 | #ifdef CONFIG_XFRM |
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 650cace2180d..26461492a847 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c | |||
@@ -321,10 +321,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, | |||
321 | * the ACK carries the same options again (see RFC1122 4.2.3.8) | 321 | * the ACK carries the same options again (see RFC1122 4.2.3.8) |
322 | */ | 322 | */ |
323 | if (opt && opt->optlen) { | 323 | if (opt && opt->optlen) { |
324 | int opt_size = sizeof(struct ip_options) + opt->optlen; | 324 | int opt_size = sizeof(struct ip_options_rcu) + opt->optlen; |
325 | 325 | ||
326 | ireq->opt = kmalloc(opt_size, GFP_ATOMIC); | 326 | ireq->opt = kmalloc(opt_size, GFP_ATOMIC); |
327 | if (ireq->opt != NULL && ip_options_echo(ireq->opt, skb)) { | 327 | if (ireq->opt != NULL && ip_options_echo(&ireq->opt->opt, skb)) { |
328 | kfree(ireq->opt); | 328 | kfree(ireq->opt); |
329 | ireq->opt = NULL; | 329 | ireq->opt = NULL; |
330 | } | 330 | } |
@@ -345,20 +345,16 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, | |||
345 | * no easy way to do this. | 345 | * no easy way to do this. |
346 | */ | 346 | */ |
347 | { | 347 | { |
348 | struct flowi fl = { .mark = sk->sk_mark, | 348 | struct flowi4 fl4; |
349 | .nl_u = { .ip4_u = | 349 | |
350 | { .daddr = ((opt && opt->srr) ? | 350 | flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk), |
351 | opt->faddr : | 351 | RT_SCOPE_UNIVERSE, IPPROTO_TCP, |
352 | ireq->rmt_addr), | 352 | inet_sk_flowi_flags(sk), |
353 | .saddr = ireq->loc_addr, | 353 | (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, |
354 | .tos = RT_CONN_FLAGS(sk) } }, | 354 | ireq->loc_addr, th->source, th->dest); |
355 | .proto = IPPROTO_TCP, | 355 | security_req_classify_flow(req, flowi4_to_flowi(&fl4)); |
356 | .flags = inet_sk_flowi_flags(sk), | 356 | rt = ip_route_output_key(sock_net(sk), &fl4); |
357 | .uli_u = { .ports = | 357 | if (IS_ERR(rt)) { |
358 | { .sport = th->dest, | ||
359 | .dport = th->source } } }; | ||
360 | security_req_classify_flow(req, &fl); | ||
361 | if (ip_route_output_key(sock_net(sk), &rt, &fl)) { | ||
362 | reqsk_free(req); | 358 | reqsk_free(req); |
363 | goto out; | 359 | goto out; |
364 | } | 360 | } |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d96c1da4b17c..57d0752e239a 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/seqlock.h> | 13 | #include <linux/seqlock.h> |
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
16 | #include <linux/nsproxy.h> | ||
16 | #include <net/snmp.h> | 17 | #include <net/snmp.h> |
17 | #include <net/icmp.h> | 18 | #include <net/icmp.h> |
18 | #include <net/ip.h> | 19 | #include <net/ip.h> |
@@ -21,11 +22,18 @@ | |||
21 | #include <net/udp.h> | 22 | #include <net/udp.h> |
22 | #include <net/cipso_ipv4.h> | 23 | #include <net/cipso_ipv4.h> |
23 | #include <net/inet_frag.h> | 24 | #include <net/inet_frag.h> |
25 | #include <net/ping.h> | ||
24 | 26 | ||
25 | static int zero; | 27 | static int zero; |
26 | static int tcp_retr1_max = 255; | 28 | static int tcp_retr1_max = 255; |
27 | static int ip_local_port_range_min[] = { 1, 1 }; | 29 | static int ip_local_port_range_min[] = { 1, 1 }; |
28 | static int ip_local_port_range_max[] = { 65535, 65535 }; | 30 | static int ip_local_port_range_max[] = { 65535, 65535 }; |
31 | static int tcp_adv_win_scale_min = -31; | ||
32 | static int tcp_adv_win_scale_max = 31; | ||
33 | static int ip_ttl_min = 1; | ||
34 | static int ip_ttl_max = 255; | ||
35 | static int ip_ping_group_range_min[] = { 0, 0 }; | ||
36 | static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; | ||
29 | 37 | ||
30 | /* Update system visible IP port range */ | 38 | /* Update system visible IP port range */ |
31 | static void set_local_port_range(int range[2]) | 39 | static void set_local_port_range(int range[2]) |
@@ -64,6 +72,53 @@ static int ipv4_local_port_range(ctl_table *table, int write, | |||
64 | return ret; | 72 | return ret; |
65 | } | 73 | } |
66 | 74 | ||
75 | |||
76 | void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high) | ||
77 | { | ||
78 | gid_t *data = table->data; | ||
79 | unsigned seq; | ||
80 | do { | ||
81 | seq = read_seqbegin(&sysctl_local_ports.lock); | ||
82 | |||
83 | *low = data[0]; | ||
84 | *high = data[1]; | ||
85 | } while (read_seqretry(&sysctl_local_ports.lock, seq)); | ||
86 | } | ||
87 | |||
88 | /* Update system visible IP port range */ | ||
89 | static void set_ping_group_range(struct ctl_table *table, int range[2]) | ||
90 | { | ||
91 | gid_t *data = table->data; | ||
92 | write_seqlock(&sysctl_local_ports.lock); | ||
93 | data[0] = range[0]; | ||
94 | data[1] = range[1]; | ||
95 | write_sequnlock(&sysctl_local_ports.lock); | ||
96 | } | ||
97 | |||
98 | /* Validate changes from /proc interface. */ | ||
99 | static int ipv4_ping_group_range(ctl_table *table, int write, | ||
100 | void __user *buffer, | ||
101 | size_t *lenp, loff_t *ppos) | ||
102 | { | ||
103 | int ret; | ||
104 | gid_t range[2]; | ||
105 | ctl_table tmp = { | ||
106 | .data = &range, | ||
107 | .maxlen = sizeof(range), | ||
108 | .mode = table->mode, | ||
109 | .extra1 = &ip_ping_group_range_min, | ||
110 | .extra2 = &ip_ping_group_range_max, | ||
111 | }; | ||
112 | |||
113 | inet_get_ping_group_range_table(table, range, range + 1); | ||
114 | ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); | ||
115 | |||
116 | if (write && ret == 0) | ||
117 | set_ping_group_range(table, range); | ||
118 | |||
119 | return ret; | ||
120 | } | ||
121 | |||
67 | static int proc_tcp_congestion_control(ctl_table *ctl, int write, | 122 | static int proc_tcp_congestion_control(ctl_table *ctl, int write, |
68 | void __user *buffer, size_t *lenp, loff_t *ppos) | 123 | void __user *buffer, size_t *lenp, loff_t *ppos) |
69 | { | 124 | { |
@@ -153,8 +208,9 @@ static struct ctl_table ipv4_table[] = { | |||
153 | .data = &sysctl_ip_default_ttl, | 208 | .data = &sysctl_ip_default_ttl, |
154 | .maxlen = sizeof(int), | 209 | .maxlen = sizeof(int), |
155 | .mode = 0644, | 210 | .mode = 0644, |
156 | .proc_handler = ipv4_doint_and_flush, | 211 | .proc_handler = proc_dointvec_minmax, |
157 | .extra2 = &init_net, | 212 | .extra1 = &ip_ttl_min, |
213 | .extra2 = &ip_ttl_max, | ||
158 | }, | 214 | }, |
159 | { | 215 | { |
160 | .procname = "ip_no_pmtu_disc", | 216 | .procname = "ip_no_pmtu_disc", |
@@ -306,7 +362,6 @@ static struct ctl_table ipv4_table[] = { | |||
306 | .mode = 0644, | 362 | .mode = 0644, |
307 | .proc_handler = proc_do_large_bitmap, | 363 | .proc_handler = proc_do_large_bitmap, |
308 | }, | 364 | }, |
309 | #ifdef CONFIG_IP_MULTICAST | ||
310 | { | 365 | { |
311 | .procname = "igmp_max_memberships", | 366 | .procname = "igmp_max_memberships", |
312 | .data = &sysctl_igmp_max_memberships, | 367 | .data = &sysctl_igmp_max_memberships, |
@@ -314,8 +369,6 @@ static struct ctl_table ipv4_table[] = { | |||
314 | .mode = 0644, | 369 | .mode = 0644, |
315 | .proc_handler = proc_dointvec | 370 | .proc_handler = proc_dointvec |
316 | }, | 371 | }, |
317 | |||
318 | #endif | ||
319 | { | 372 | { |
320 | .procname = "igmp_max_msf", | 373 | .procname = "igmp_max_msf", |
321 | .data = &sysctl_igmp_max_msf, | 374 | .data = &sysctl_igmp_max_msf, |
@@ -398,7 +451,7 @@ static struct ctl_table ipv4_table[] = { | |||
398 | .data = &sysctl_tcp_mem, | 451 | .data = &sysctl_tcp_mem, |
399 | .maxlen = sizeof(sysctl_tcp_mem), | 452 | .maxlen = sizeof(sysctl_tcp_mem), |
400 | .mode = 0644, | 453 | .mode = 0644, |
401 | .proc_handler = proc_dointvec | 454 | .proc_handler = proc_doulongvec_minmax |
402 | }, | 455 | }, |
403 | { | 456 | { |
404 | .procname = "tcp_wmem", | 457 | .procname = "tcp_wmem", |
@@ -426,7 +479,9 @@ static struct ctl_table ipv4_table[] = { | |||
426 | .data = &sysctl_tcp_adv_win_scale, | 479 | .data = &sysctl_tcp_adv_win_scale, |
427 | .maxlen = sizeof(int), | 480 | .maxlen = sizeof(int), |
428 | .mode = 0644, | 481 | .mode = 0644, |
429 | .proc_handler = proc_dointvec | 482 | .proc_handler = proc_dointvec_minmax, |
483 | .extra1 = &tcp_adv_win_scale_min, | ||
484 | .extra2 = &tcp_adv_win_scale_max, | ||
430 | }, | 485 | }, |
431 | { | 486 | { |
432 | .procname = "tcp_tw_reuse", | 487 | .procname = "tcp_tw_reuse", |
@@ -602,8 +657,7 @@ static struct ctl_table ipv4_table[] = { | |||
602 | .data = &sysctl_udp_mem, | 657 | .data = &sysctl_udp_mem, |
603 | .maxlen = sizeof(sysctl_udp_mem), | 658 | .maxlen = sizeof(sysctl_udp_mem), |
604 | .mode = 0644, | 659 | .mode = 0644, |
605 | .proc_handler = proc_dointvec_minmax, | 660 | .proc_handler = proc_doulongvec_minmax, |
606 | .extra1 = &zero | ||
607 | }, | 661 | }, |
608 | { | 662 | { |
609 | .procname = "udp_rmem_min", | 663 | .procname = "udp_rmem_min", |
@@ -674,6 +728,13 @@ static struct ctl_table ipv4_net_table[] = { | |||
674 | .mode = 0644, | 728 | .mode = 0644, |
675 | .proc_handler = proc_dointvec | 729 | .proc_handler = proc_dointvec |
676 | }, | 730 | }, |
731 | { | ||
732 | .procname = "ping_group_range", | ||
733 | .data = &init_net.ipv4.sysctl_ping_group_range, | ||
734 | .maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range), | ||
735 | .mode = 0644, | ||
736 | .proc_handler = ipv4_ping_group_range, | ||
737 | }, | ||
677 | { } | 738 | { } |
678 | }; | 739 | }; |
679 | 740 | ||
@@ -708,8 +769,18 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) | |||
708 | &net->ipv4.sysctl_icmp_ratemask; | 769 | &net->ipv4.sysctl_icmp_ratemask; |
709 | table[6].data = | 770 | table[6].data = |
710 | &net->ipv4.sysctl_rt_cache_rebuild_count; | 771 | &net->ipv4.sysctl_rt_cache_rebuild_count; |
772 | table[7].data = | ||
773 | &net->ipv4.sysctl_ping_group_range; | ||
774 | |||
711 | } | 775 | } |
712 | 776 | ||
777 | /* | ||
778 | * Sane defaults - nobody may create ping sockets. | ||
779 | * Boot scripts should set this to distro-specific group. | ||
780 | */ | ||
781 | net->ipv4.sysctl_ping_group_range[0] = 1; | ||
782 | net->ipv4.sysctl_ping_group_range[1] = 0; | ||
783 | |||
713 | net->ipv4.sysctl_rt_cache_rebuild_count = 4; | 784 | net->ipv4.sysctl_rt_cache_rebuild_count = 4; |
714 | 785 | ||
715 | net->ipv4.ipv4_hdr = register_net_sysctl_table(net, | 786 | net->ipv4.ipv4_hdr = register_net_sysctl_table(net, |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f115ea68a4ef..46febcacb729 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -282,7 +282,7 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; | |||
282 | struct percpu_counter tcp_orphan_count; | 282 | struct percpu_counter tcp_orphan_count; |
283 | EXPORT_SYMBOL_GPL(tcp_orphan_count); | 283 | EXPORT_SYMBOL_GPL(tcp_orphan_count); |
284 | 284 | ||
285 | int sysctl_tcp_mem[3] __read_mostly; | 285 | long sysctl_tcp_mem[3] __read_mostly; |
286 | int sysctl_tcp_wmem[3] __read_mostly; | 286 | int sysctl_tcp_wmem[3] __read_mostly; |
287 | int sysctl_tcp_rmem[3] __read_mostly; | 287 | int sysctl_tcp_rmem[3] __read_mostly; |
288 | 288 | ||
@@ -290,7 +290,7 @@ EXPORT_SYMBOL(sysctl_tcp_mem); | |||
290 | EXPORT_SYMBOL(sysctl_tcp_rmem); | 290 | EXPORT_SYMBOL(sysctl_tcp_rmem); |
291 | EXPORT_SYMBOL(sysctl_tcp_wmem); | 291 | EXPORT_SYMBOL(sysctl_tcp_wmem); |
292 | 292 | ||
293 | atomic_t tcp_memory_allocated; /* Current allocated memory. */ | 293 | atomic_long_t tcp_memory_allocated; /* Current allocated memory. */ |
294 | EXPORT_SYMBOL(tcp_memory_allocated); | 294 | EXPORT_SYMBOL(tcp_memory_allocated); |
295 | 295 | ||
296 | /* | 296 | /* |
@@ -505,6 +505,15 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) | |||
505 | else | 505 | else |
506 | answ = tp->write_seq - tp->snd_una; | 506 | answ = tp->write_seq - tp->snd_una; |
507 | break; | 507 | break; |
508 | case SIOCOUTQNSD: | ||
509 | if (sk->sk_state == TCP_LISTEN) | ||
510 | return -EINVAL; | ||
511 | |||
512 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) | ||
513 | answ = 0; | ||
514 | else | ||
515 | answ = tp->write_seq - tp->snd_nxt; | ||
516 | break; | ||
508 | default: | 517 | default: |
509 | return -ENOIOCTLCMD; | 518 | return -ENOIOCTLCMD; |
510 | } | 519 | } |
@@ -873,9 +882,7 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, | |||
873 | flags); | 882 | flags); |
874 | 883 | ||
875 | lock_sock(sk); | 884 | lock_sock(sk); |
876 | TCP_CHECK_TIMER(sk); | ||
877 | res = do_tcp_sendpages(sk, &page, offset, size, flags); | 885 | res = do_tcp_sendpages(sk, &page, offset, size, flags); |
878 | TCP_CHECK_TIMER(sk); | ||
879 | release_sock(sk); | 886 | release_sock(sk); |
880 | return res; | 887 | return res; |
881 | } | 888 | } |
@@ -916,7 +923,6 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
916 | long timeo; | 923 | long timeo; |
917 | 924 | ||
918 | lock_sock(sk); | 925 | lock_sock(sk); |
919 | TCP_CHECK_TIMER(sk); | ||
920 | 926 | ||
921 | flags = msg->msg_flags; | 927 | flags = msg->msg_flags; |
922 | timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); | 928 | timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); |
@@ -993,7 +999,8 @@ new_segment: | |||
993 | /* We have some space in skb head. Superb! */ | 999 | /* We have some space in skb head. Superb! */ |
994 | if (copy > skb_tailroom(skb)) | 1000 | if (copy > skb_tailroom(skb)) |
995 | copy = skb_tailroom(skb); | 1001 | copy = skb_tailroom(skb); |
996 | if ((err = skb_add_data(skb, from, copy)) != 0) | 1002 | err = skb_add_data_nocache(sk, skb, from, copy); |
1003 | if (err) | ||
997 | goto do_fault; | 1004 | goto do_fault; |
998 | } else { | 1005 | } else { |
999 | int merge = 0; | 1006 | int merge = 0; |
@@ -1036,8 +1043,8 @@ new_segment: | |||
1036 | 1043 | ||
1037 | /* Time to copy data. We are close to | 1044 | /* Time to copy data. We are close to |
1038 | * the end! */ | 1045 | * the end! */ |
1039 | err = skb_copy_to_page(sk, from, skb, page, | 1046 | err = skb_copy_to_page_nocache(sk, from, skb, |
1040 | off, copy); | 1047 | page, off, copy); |
1041 | if (err) { | 1048 | if (err) { |
1042 | /* If this page was new, give it to the | 1049 | /* If this page was new, give it to the |
1043 | * socket so it does not get leaked. | 1050 | * socket so it does not get leaked. |
@@ -1104,7 +1111,6 @@ wait_for_memory: | |||
1104 | out: | 1111 | out: |
1105 | if (copied) | 1112 | if (copied) |
1106 | tcp_push(sk, flags, mss_now, tp->nonagle); | 1113 | tcp_push(sk, flags, mss_now, tp->nonagle); |
1107 | TCP_CHECK_TIMER(sk); | ||
1108 | release_sock(sk); | 1114 | release_sock(sk); |
1109 | return copied; | 1115 | return copied; |
1110 | 1116 | ||
@@ -1123,7 +1129,6 @@ do_error: | |||
1123 | goto out; | 1129 | goto out; |
1124 | out_err: | 1130 | out_err: |
1125 | err = sk_stream_error(sk, flags, err); | 1131 | err = sk_stream_error(sk, flags, err); |
1126 | TCP_CHECK_TIMER(sk); | ||
1127 | release_sock(sk); | 1132 | release_sock(sk); |
1128 | return err; | 1133 | return err; |
1129 | } | 1134 | } |
@@ -1193,7 +1198,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) | |||
1193 | struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); | 1198 | struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); |
1194 | 1199 | ||
1195 | WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), | 1200 | WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), |
1196 | KERN_INFO "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", | 1201 | "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", |
1197 | tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); | 1202 | tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); |
1198 | #endif | 1203 | #endif |
1199 | 1204 | ||
@@ -1415,8 +1420,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
1415 | 1420 | ||
1416 | lock_sock(sk); | 1421 | lock_sock(sk); |
1417 | 1422 | ||
1418 | TCP_CHECK_TIMER(sk); | ||
1419 | |||
1420 | err = -ENOTCONN; | 1423 | err = -ENOTCONN; |
1421 | if (sk->sk_state == TCP_LISTEN) | 1424 | if (sk->sk_state == TCP_LISTEN) |
1422 | goto out; | 1425 | goto out; |
@@ -1477,10 +1480,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
1477 | * shouldn't happen. | 1480 | * shouldn't happen. |
1478 | */ | 1481 | */ |
1479 | if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), | 1482 | if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), |
1480 | KERN_INFO "recvmsg bug: copied %X " | 1483 | "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n", |
1481 | "seq %X rcvnxt %X fl %X\n", *seq, | 1484 | *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, |
1482 | TCP_SKB_CB(skb)->seq, tp->rcv_nxt, | 1485 | flags)) |
1483 | flags)) | ||
1484 | break; | 1486 | break; |
1485 | 1487 | ||
1486 | offset = *seq - TCP_SKB_CB(skb)->seq; | 1488 | offset = *seq - TCP_SKB_CB(skb)->seq; |
@@ -1490,10 +1492,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
1490 | goto found_ok_skb; | 1492 | goto found_ok_skb; |
1491 | if (tcp_hdr(skb)->fin) | 1493 | if (tcp_hdr(skb)->fin) |
1492 | goto found_fin_ok; | 1494 | goto found_fin_ok; |
1493 | WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2: " | 1495 | WARN(!(flags & MSG_PEEK), |
1494 | "copied %X seq %X rcvnxt %X fl %X\n", | 1496 | "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n", |
1495 | *seq, TCP_SKB_CB(skb)->seq, | 1497 | *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags); |
1496 | tp->rcv_nxt, flags); | ||
1497 | } | 1498 | } |
1498 | 1499 | ||
1499 | /* Well, if we have backlog, try to process it now yet. */ | 1500 | /* Well, if we have backlog, try to process it now yet. */ |
@@ -1769,12 +1770,10 @@ skip_copy: | |||
1769 | /* Clean up data we have read: This will do ACK frames. */ | 1770 | /* Clean up data we have read: This will do ACK frames. */ |
1770 | tcp_cleanup_rbuf(sk, copied); | 1771 | tcp_cleanup_rbuf(sk, copied); |
1771 | 1772 | ||
1772 | TCP_CHECK_TIMER(sk); | ||
1773 | release_sock(sk); | 1773 | release_sock(sk); |
1774 | return copied; | 1774 | return copied; |
1775 | 1775 | ||
1776 | out: | 1776 | out: |
1777 | TCP_CHECK_TIMER(sk); | ||
1778 | release_sock(sk); | 1777 | release_sock(sk); |
1779 | return err; | 1778 | return err; |
1780 | 1779 | ||
@@ -2246,7 +2245,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |||
2246 | /* Values greater than interface MTU won't take effect. However | 2245 | /* Values greater than interface MTU won't take effect. However |
2247 | * at the point when this call is done we typically don't yet | 2246 | * at the point when this call is done we typically don't yet |
2248 | * know which interface is going to be used */ | 2247 | * know which interface is going to be used */ |
2249 | if (val < 8 || val > MAX_TCP_WINDOW) { | 2248 | if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) { |
2250 | err = -EINVAL; | 2249 | err = -EINVAL; |
2251 | break; | 2250 | break; |
2252 | } | 2251 | } |
@@ -2392,7 +2391,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |||
2392 | err = tp->af_specific->md5_parse(sk, optval, optlen); | 2391 | err = tp->af_specific->md5_parse(sk, optval, optlen); |
2393 | break; | 2392 | break; |
2394 | #endif | 2393 | #endif |
2395 | 2394 | case TCP_USER_TIMEOUT: | |
2395 | /* Cap the max timeout in ms TCP will retry/retrans | ||
2396 | * before giving up and aborting (ETIMEDOUT) a connection. | ||
2397 | */ | ||
2398 | icsk->icsk_user_timeout = msecs_to_jiffies(val); | ||
2399 | break; | ||
2396 | default: | 2400 | default: |
2397 | err = -ENOPROTOOPT; | 2401 | err = -ENOPROTOOPT; |
2398 | break; | 2402 | break; |
@@ -2611,6 +2615,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level, | |||
2611 | case TCP_THIN_DUPACK: | 2615 | case TCP_THIN_DUPACK: |
2612 | val = tp->thin_dupack; | 2616 | val = tp->thin_dupack; |
2613 | break; | 2617 | break; |
2618 | |||
2619 | case TCP_USER_TIMEOUT: | ||
2620 | val = jiffies_to_msecs(icsk->icsk_user_timeout); | ||
2621 | break; | ||
2614 | default: | 2622 | default: |
2615 | return -ENOPROTOOPT; | 2623 | return -ENOPROTOOPT; |
2616 | } | 2624 | } |
@@ -2646,7 +2654,7 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname, | |||
2646 | EXPORT_SYMBOL(compat_tcp_getsockopt); | 2654 | EXPORT_SYMBOL(compat_tcp_getsockopt); |
2647 | #endif | 2655 | #endif |
2648 | 2656 | ||
2649 | struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) | 2657 | struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features) |
2650 | { | 2658 | { |
2651 | struct sk_buff *segs = ERR_PTR(-EINVAL); | 2659 | struct sk_buff *segs = ERR_PTR(-EINVAL); |
2652 | struct tcphdr *th; | 2660 | struct tcphdr *th; |
@@ -3212,7 +3220,7 @@ __setup("thash_entries=", set_thash_entries); | |||
3212 | void __init tcp_init(void) | 3220 | void __init tcp_init(void) |
3213 | { | 3221 | { |
3214 | struct sk_buff *skb = NULL; | 3222 | struct sk_buff *skb = NULL; |
3215 | unsigned long nr_pages, limit; | 3223 | unsigned long limit; |
3216 | int i, max_share, cnt; | 3224 | int i, max_share, cnt; |
3217 | unsigned long jiffy = jiffies; | 3225 | unsigned long jiffy = jiffies; |
3218 | 3226 | ||
@@ -3269,13 +3277,7 @@ void __init tcp_init(void) | |||
3269 | sysctl_tcp_max_orphans = cnt / 2; | 3277 | sysctl_tcp_max_orphans = cnt / 2; |
3270 | sysctl_max_syn_backlog = max(128, cnt / 256); | 3278 | sysctl_max_syn_backlog = max(128, cnt / 256); |
3271 | 3279 | ||
3272 | /* Set the pressure threshold to be a fraction of global memory that | 3280 | limit = nr_free_buffer_pages() / 8; |
3273 | * is up to 1/2 at 256 MB, decreasing toward zero with the amount of | ||
3274 | * memory, with a floor of 128 pages. | ||
3275 | */ | ||
3276 | nr_pages = totalram_pages - totalhigh_pages; | ||
3277 | limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); | ||
3278 | limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); | ||
3279 | limit = max(limit, 128UL); | 3281 | limit = max(limit, 128UL); |
3280 | sysctl_tcp_mem[0] = limit / 4 * 3; | 3282 | sysctl_tcp_mem[0] = limit / 4 * 3; |
3281 | sysctl_tcp_mem[1] = limit; | 3283 | sysctl_tcp_mem[1] = limit; |
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 3b53fd1af23f..6187eb4d1dcf 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c | |||
@@ -209,7 +209,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt) | |||
209 | } | 209 | } |
210 | 210 | ||
211 | 211 | ||
212 | static struct tcp_congestion_ops bictcp = { | 212 | static struct tcp_congestion_ops bictcp __read_mostly = { |
213 | .init = bictcp_init, | 213 | .init = bictcp_init, |
214 | .ssthresh = bictcp_recalc_ssthresh, | 214 | .ssthresh = bictcp_recalc_ssthresh, |
215 | .cong_avoid = bictcp_cong_avoid, | 215 | .cong_avoid = bictcp_cong_avoid, |
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 71d5f2f29fa6..f376b05cca81 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c | |||
@@ -39,7 +39,7 @@ | |||
39 | 39 | ||
40 | /* Number of delay samples for detecting the increase of delay */ | 40 | /* Number of delay samples for detecting the increase of delay */ |
41 | #define HYSTART_MIN_SAMPLES 8 | 41 | #define HYSTART_MIN_SAMPLES 8 |
42 | #define HYSTART_DELAY_MIN (2U<<3) | 42 | #define HYSTART_DELAY_MIN (4U<<3) |
43 | #define HYSTART_DELAY_MAX (16U<<3) | 43 | #define HYSTART_DELAY_MAX (16U<<3) |
44 | #define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) | 44 | #define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) |
45 | 45 | ||
@@ -52,6 +52,7 @@ static int tcp_friendliness __read_mostly = 1; | |||
52 | static int hystart __read_mostly = 1; | 52 | static int hystart __read_mostly = 1; |
53 | static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY; | 53 | static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY; |
54 | static int hystart_low_window __read_mostly = 16; | 54 | static int hystart_low_window __read_mostly = 16; |
55 | static int hystart_ack_delta __read_mostly = 2; | ||
55 | 56 | ||
56 | static u32 cube_rtt_scale __read_mostly; | 57 | static u32 cube_rtt_scale __read_mostly; |
57 | static u32 beta_scale __read_mostly; | 58 | static u32 beta_scale __read_mostly; |
@@ -75,6 +76,8 @@ MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms" | |||
75 | " 1: packet-train 2: delay 3: both packet-train and delay"); | 76 | " 1: packet-train 2: delay 3: both packet-train and delay"); |
76 | module_param(hystart_low_window, int, 0644); | 77 | module_param(hystart_low_window, int, 0644); |
77 | MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); | 78 | MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); |
79 | module_param(hystart_ack_delta, int, 0644); | ||
80 | MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)"); | ||
78 | 81 | ||
79 | /* BIC TCP Parameters */ | 82 | /* BIC TCP Parameters */ |
80 | struct bictcp { | 83 | struct bictcp { |
@@ -85,17 +88,18 @@ struct bictcp { | |||
85 | u32 last_time; /* time when updated last_cwnd */ | 88 | u32 last_time; /* time when updated last_cwnd */ |
86 | u32 bic_origin_point;/* origin point of bic function */ | 89 | u32 bic_origin_point;/* origin point of bic function */ |
87 | u32 bic_K; /* time to origin point from the beginning of the current epoch */ | 90 | u32 bic_K; /* time to origin point from the beginning of the current epoch */ |
88 | u32 delay_min; /* min delay */ | 91 | u32 delay_min; /* min delay (msec << 3) */ |
89 | u32 epoch_start; /* beginning of an epoch */ | 92 | u32 epoch_start; /* beginning of an epoch */ |
90 | u32 ack_cnt; /* number of acks */ | 93 | u32 ack_cnt; /* number of acks */ |
91 | u32 tcp_cwnd; /* estimated tcp cwnd */ | 94 | u32 tcp_cwnd; /* estimated tcp cwnd */ |
92 | #define ACK_RATIO_SHIFT 4 | 95 | #define ACK_RATIO_SHIFT 4 |
96 | #define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT) | ||
93 | u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ | 97 | u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ |
94 | u8 sample_cnt; /* number of samples to decide curr_rtt */ | 98 | u8 sample_cnt; /* number of samples to decide curr_rtt */ |
95 | u8 found; /* the exit point is found? */ | 99 | u8 found; /* the exit point is found? */ |
96 | u32 round_start; /* beginning of each round */ | 100 | u32 round_start; /* beginning of each round */ |
97 | u32 end_seq; /* end_seq of the round */ | 101 | u32 end_seq; /* end_seq of the round */ |
98 | u32 last_jiffies; /* last time when the ACK spacing is close */ | 102 | u32 last_ack; /* last time when the ACK spacing is close */ |
99 | u32 curr_rtt; /* the minimum rtt of current round */ | 103 | u32 curr_rtt; /* the minimum rtt of current round */ |
100 | }; | 104 | }; |
101 | 105 | ||
@@ -116,12 +120,21 @@ static inline void bictcp_reset(struct bictcp *ca) | |||
116 | ca->found = 0; | 120 | ca->found = 0; |
117 | } | 121 | } |
118 | 122 | ||
123 | static inline u32 bictcp_clock(void) | ||
124 | { | ||
125 | #if HZ < 1000 | ||
126 | return ktime_to_ms(ktime_get_real()); | ||
127 | #else | ||
128 | return jiffies_to_msecs(jiffies); | ||
129 | #endif | ||
130 | } | ||
131 | |||
119 | static inline void bictcp_hystart_reset(struct sock *sk) | 132 | static inline void bictcp_hystart_reset(struct sock *sk) |
120 | { | 133 | { |
121 | struct tcp_sock *tp = tcp_sk(sk); | 134 | struct tcp_sock *tp = tcp_sk(sk); |
122 | struct bictcp *ca = inet_csk_ca(sk); | 135 | struct bictcp *ca = inet_csk_ca(sk); |
123 | 136 | ||
124 | ca->round_start = ca->last_jiffies = jiffies; | 137 | ca->round_start = ca->last_ack = bictcp_clock(); |
125 | ca->end_seq = tp->snd_nxt; | 138 | ca->end_seq = tp->snd_nxt; |
126 | ca->curr_rtt = 0; | 139 | ca->curr_rtt = 0; |
127 | ca->sample_cnt = 0; | 140 | ca->sample_cnt = 0; |
@@ -236,8 +249,8 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) | |||
236 | */ | 249 | */ |
237 | 250 | ||
238 | /* change the unit from HZ to bictcp_HZ */ | 251 | /* change the unit from HZ to bictcp_HZ */ |
239 | t = ((tcp_time_stamp + (ca->delay_min>>3) - ca->epoch_start) | 252 | t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3) |
240 | << BICTCP_HZ) / HZ; | 253 | - ca->epoch_start) << BICTCP_HZ) / HZ; |
241 | 254 | ||
242 | if (t < ca->bic_K) /* t - K */ | 255 | if (t < ca->bic_K) /* t - K */ |
243 | offs = ca->bic_K - t; | 256 | offs = ca->bic_K - t; |
@@ -258,6 +271,13 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) | |||
258 | ca->cnt = 100 * cwnd; /* very small increment*/ | 271 | ca->cnt = 100 * cwnd; /* very small increment*/ |
259 | } | 272 | } |
260 | 273 | ||
274 | /* | ||
275 | * The initial growth of cubic function may be too conservative | ||
276 | * when the available bandwidth is still unknown. | ||
277 | */ | ||
278 | if (ca->loss_cwnd == 0 && ca->cnt > 20) | ||
279 | ca->cnt = 20; /* increase cwnd 5% per RTT */ | ||
280 | |||
261 | /* TCP Friendly */ | 281 | /* TCP Friendly */ |
262 | if (tcp_friendliness) { | 282 | if (tcp_friendliness) { |
263 | u32 scale = beta_scale; | 283 | u32 scale = beta_scale; |
@@ -339,12 +359,12 @@ static void hystart_update(struct sock *sk, u32 delay) | |||
339 | struct bictcp *ca = inet_csk_ca(sk); | 359 | struct bictcp *ca = inet_csk_ca(sk); |
340 | 360 | ||
341 | if (!(ca->found & hystart_detect)) { | 361 | if (!(ca->found & hystart_detect)) { |
342 | u32 curr_jiffies = jiffies; | 362 | u32 now = bictcp_clock(); |
343 | 363 | ||
344 | /* first detection parameter - ack-train detection */ | 364 | /* first detection parameter - ack-train detection */ |
345 | if (curr_jiffies - ca->last_jiffies <= msecs_to_jiffies(2)) { | 365 | if ((s32)(now - ca->last_ack) <= hystart_ack_delta) { |
346 | ca->last_jiffies = curr_jiffies; | 366 | ca->last_ack = now; |
347 | if (curr_jiffies - ca->round_start >= ca->delay_min>>4) | 367 | if ((s32)(now - ca->round_start) > ca->delay_min >> 4) |
348 | ca->found |= HYSTART_ACK_TRAIN; | 368 | ca->found |= HYSTART_ACK_TRAIN; |
349 | } | 369 | } |
350 | 370 | ||
@@ -379,8 +399,12 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) | |||
379 | u32 delay; | 399 | u32 delay; |
380 | 400 | ||
381 | if (icsk->icsk_ca_state == TCP_CA_Open) { | 401 | if (icsk->icsk_ca_state == TCP_CA_Open) { |
382 | cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; | 402 | u32 ratio = ca->delayed_ack; |
383 | ca->delayed_ack += cnt; | 403 | |
404 | ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT; | ||
405 | ratio += cnt; | ||
406 | |||
407 | ca->delayed_ack = min(ratio, ACK_RATIO_LIMIT); | ||
384 | } | 408 | } |
385 | 409 | ||
386 | /* Some calls are for duplicates without timestamps */ | 410 | /* Some calls are for duplicates without timestamps */ |
@@ -391,7 +415,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) | |||
391 | if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ) | 415 | if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ) |
392 | return; | 416 | return; |
393 | 417 | ||
394 | delay = usecs_to_jiffies(rtt_us) << 3; | 418 | delay = (rtt_us << 3) / USEC_PER_MSEC; |
395 | if (delay == 0) | 419 | if (delay == 0) |
396 | delay = 1; | 420 | delay = 1; |
397 | 421 | ||
@@ -405,7 +429,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) | |||
405 | hystart_update(sk, delay); | 429 | hystart_update(sk, delay); |
406 | } | 430 | } |
407 | 431 | ||
408 | static struct tcp_congestion_ops cubictcp = { | 432 | static struct tcp_congestion_ops cubictcp __read_mostly = { |
409 | .init = bictcp_init, | 433 | .init = bictcp_init, |
410 | .ssthresh = bictcp_recalc_ssthresh, | 434 | .ssthresh = bictcp_recalc_ssthresh, |
411 | .cong_avoid = bictcp_cong_avoid, | 435 | .cong_avoid = bictcp_cong_avoid, |
@@ -447,6 +471,10 @@ static int __init cubictcp_register(void) | |||
447 | /* divide by bic_scale and by constant Srtt (100ms) */ | 471 | /* divide by bic_scale and by constant Srtt (100ms) */ |
448 | do_div(cube_factor, bic_scale * 10); | 472 | do_div(cube_factor, bic_scale * 10); |
449 | 473 | ||
474 | /* hystart needs ms clock resolution */ | ||
475 | if (hystart && HZ < 1000) | ||
476 | cubictcp.flags |= TCP_CONG_RTT_STAMP; | ||
477 | |||
450 | return tcp_register_congestion_control(&cubictcp); | 478 | return tcp_register_congestion_control(&cubictcp); |
451 | } | 479 | } |
452 | 480 | ||
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 8b6caaf75bb9..30f27f6b3655 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c | |||
@@ -158,7 +158,7 @@ static u32 hstcp_ssthresh(struct sock *sk) | |||
158 | } | 158 | } |
159 | 159 | ||
160 | 160 | ||
161 | static struct tcp_congestion_ops tcp_highspeed = { | 161 | static struct tcp_congestion_ops tcp_highspeed __read_mostly = { |
162 | .init = hstcp_init, | 162 | .init = hstcp_init, |
163 | .ssthresh = hstcp_ssthresh, | 163 | .ssthresh = hstcp_ssthresh, |
164 | .cong_avoid = hstcp_cong_avoid, | 164 | .cong_avoid = hstcp_cong_avoid, |
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 7c94a4955416..c1a8175361e8 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c | |||
@@ -284,7 +284,7 @@ static void htcp_state(struct sock *sk, u8 new_state) | |||
284 | } | 284 | } |
285 | } | 285 | } |
286 | 286 | ||
287 | static struct tcp_congestion_ops htcp = { | 287 | static struct tcp_congestion_ops htcp __read_mostly = { |
288 | .init = htcp_init, | 288 | .init = htcp_init, |
289 | .ssthresh = htcp_recalc_ssthresh, | 289 | .ssthresh = htcp_recalc_ssthresh, |
290 | .cong_avoid = htcp_cong_avoid, | 290 | .cong_avoid = htcp_cong_avoid, |
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 377bc9349371..fe3ecf484b44 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c | |||
@@ -162,7 +162,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) | |||
162 | tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); | 162 | tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); |
163 | } | 163 | } |
164 | 164 | ||
165 | static struct tcp_congestion_ops tcp_hybla = { | 165 | static struct tcp_congestion_ops tcp_hybla __read_mostly = { |
166 | .init = hybla_init, | 166 | .init = hybla_init, |
167 | .ssthresh = tcp_reno_ssthresh, | 167 | .ssthresh = tcp_reno_ssthresh, |
168 | .min_cwnd = tcp_reno_min_cwnd, | 168 | .min_cwnd = tcp_reno_min_cwnd, |
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 1eba160b72dc..813b43a76fec 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c | |||
@@ -6,7 +6,7 @@ | |||
6 | * The algorithm is described in: | 6 | * The algorithm is described in: |
7 | * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm | 7 | * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm |
8 | * for High-Speed Networks" | 8 | * for High-Speed Networks" |
9 | * http://www.ews.uiuc.edu/~shaoliu/papersandslides/liubassri06perf.pdf | 9 | * http://www.ifp.illinois.edu/~srikant/Papers/liubassri06perf.pdf |
10 | * | 10 | * |
11 | * Implemented from description in paper and ns-2 simulation. | 11 | * Implemented from description in paper and ns-2 simulation. |
12 | * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org> | 12 | * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org> |
@@ -322,7 +322,7 @@ static void tcp_illinois_info(struct sock *sk, u32 ext, | |||
322 | } | 322 | } |
323 | } | 323 | } |
324 | 324 | ||
325 | static struct tcp_congestion_ops tcp_illinois = { | 325 | static struct tcp_congestion_ops tcp_illinois __read_mostly = { |
326 | .flags = TCP_CONG_RTT_STAMP, | 326 | .flags = TCP_CONG_RTT_STAMP, |
327 | .init = tcp_illinois_init, | 327 | .init = tcp_illinois_init, |
328 | .ssthresh = tcp_illinois_ssthresh, | 328 | .ssthresh = tcp_illinois_ssthresh, |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b55f60f6fcbe..bef9f04c22ba 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -182,7 +182,7 @@ static void tcp_incr_quickack(struct sock *sk) | |||
182 | icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); | 182 | icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); |
183 | } | 183 | } |
184 | 184 | ||
185 | void tcp_enter_quickack_mode(struct sock *sk) | 185 | static void tcp_enter_quickack_mode(struct sock *sk) |
186 | { | 186 | { |
187 | struct inet_connection_sock *icsk = inet_csk(sk); | 187 | struct inet_connection_sock *icsk = inet_csk(sk); |
188 | tcp_incr_quickack(sk); | 188 | tcp_incr_quickack(sk); |
@@ -259,8 +259,11 @@ static void tcp_fixup_sndbuf(struct sock *sk) | |||
259 | int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 + | 259 | int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 + |
260 | sizeof(struct sk_buff); | 260 | sizeof(struct sk_buff); |
261 | 261 | ||
262 | if (sk->sk_sndbuf < 3 * sndmem) | 262 | if (sk->sk_sndbuf < 3 * sndmem) { |
263 | sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]); | 263 | sk->sk_sndbuf = 3 * sndmem; |
264 | if (sk->sk_sndbuf > sysctl_tcp_wmem[2]) | ||
265 | sk->sk_sndbuf = sysctl_tcp_wmem[2]; | ||
266 | } | ||
264 | } | 267 | } |
265 | 268 | ||
266 | /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) | 269 | /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) |
@@ -396,7 +399,7 @@ static void tcp_clamp_window(struct sock *sk) | |||
396 | if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && | 399 | if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && |
397 | !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && | 400 | !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && |
398 | !tcp_memory_pressure && | 401 | !tcp_memory_pressure && |
399 | atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { | 402 | atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { |
400 | sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), | 403 | sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), |
401 | sysctl_tcp_rmem[2]); | 404 | sysctl_tcp_rmem[2]); |
402 | } | 405 | } |
@@ -428,10 +431,10 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss); | |||
428 | * | 431 | * |
429 | * The algorithm for RTT estimation w/o timestamps is based on | 432 | * The algorithm for RTT estimation w/o timestamps is based on |
430 | * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. | 433 | * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. |
431 | * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps> | 434 | * <http://public.lanl.gov/radiant/pubs.html#DRS> |
432 | * | 435 | * |
433 | * More detail on this code can be found at | 436 | * More detail on this code can be found at |
434 | * <http://www.psc.edu/~jheffner/senior_thesis.ps>, | 437 | * <http://staff.psc.edu/jheffner/>, |
435 | * though this reference is out of date. A new paper | 438 | * though this reference is out of date. A new paper |
436 | * is pending. | 439 | * is pending. |
437 | */ | 440 | */ |
@@ -731,7 +734,7 @@ void tcp_update_metrics(struct sock *sk) | |||
731 | * Reset our results. | 734 | * Reset our results. |
732 | */ | 735 | */ |
733 | if (!(dst_metric_locked(dst, RTAX_RTT))) | 736 | if (!(dst_metric_locked(dst, RTAX_RTT))) |
734 | dst->metrics[RTAX_RTT - 1] = 0; | 737 | dst_metric_set(dst, RTAX_RTT, 0); |
735 | return; | 738 | return; |
736 | } | 739 | } |
737 | 740 | ||
@@ -773,57 +776,48 @@ void tcp_update_metrics(struct sock *sk) | |||
773 | if (dst_metric(dst, RTAX_SSTHRESH) && | 776 | if (dst_metric(dst, RTAX_SSTHRESH) && |
774 | !dst_metric_locked(dst, RTAX_SSTHRESH) && | 777 | !dst_metric_locked(dst, RTAX_SSTHRESH) && |
775 | (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) | 778 | (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) |
776 | dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1; | 779 | dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1); |
777 | if (!dst_metric_locked(dst, RTAX_CWND) && | 780 | if (!dst_metric_locked(dst, RTAX_CWND) && |
778 | tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) | 781 | tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) |
779 | dst->metrics[RTAX_CWND - 1] = tp->snd_cwnd; | 782 | dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd); |
780 | } else if (tp->snd_cwnd > tp->snd_ssthresh && | 783 | } else if (tp->snd_cwnd > tp->snd_ssthresh && |
781 | icsk->icsk_ca_state == TCP_CA_Open) { | 784 | icsk->icsk_ca_state == TCP_CA_Open) { |
782 | /* Cong. avoidance phase, cwnd is reliable. */ | 785 | /* Cong. avoidance phase, cwnd is reliable. */ |
783 | if (!dst_metric_locked(dst, RTAX_SSTHRESH)) | 786 | if (!dst_metric_locked(dst, RTAX_SSTHRESH)) |
784 | dst->metrics[RTAX_SSTHRESH-1] = | 787 | dst_metric_set(dst, RTAX_SSTHRESH, |
785 | max(tp->snd_cwnd >> 1, tp->snd_ssthresh); | 788 | max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); |
786 | if (!dst_metric_locked(dst, RTAX_CWND)) | 789 | if (!dst_metric_locked(dst, RTAX_CWND)) |
787 | dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_cwnd) >> 1; | 790 | dst_metric_set(dst, RTAX_CWND, |
791 | (dst_metric(dst, RTAX_CWND) + | ||
792 | tp->snd_cwnd) >> 1); | ||
788 | } else { | 793 | } else { |
789 | /* Else slow start did not finish, cwnd is non-sense, | 794 | /* Else slow start did not finish, cwnd is non-sense, |
790 | ssthresh may be also invalid. | 795 | ssthresh may be also invalid. |
791 | */ | 796 | */ |
792 | if (!dst_metric_locked(dst, RTAX_CWND)) | 797 | if (!dst_metric_locked(dst, RTAX_CWND)) |
793 | dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_ssthresh) >> 1; | 798 | dst_metric_set(dst, RTAX_CWND, |
799 | (dst_metric(dst, RTAX_CWND) + | ||
800 | tp->snd_ssthresh) >> 1); | ||
794 | if (dst_metric(dst, RTAX_SSTHRESH) && | 801 | if (dst_metric(dst, RTAX_SSTHRESH) && |
795 | !dst_metric_locked(dst, RTAX_SSTHRESH) && | 802 | !dst_metric_locked(dst, RTAX_SSTHRESH) && |
796 | tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) | 803 | tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) |
797 | dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh; | 804 | dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh); |
798 | } | 805 | } |
799 | 806 | ||
800 | if (!dst_metric_locked(dst, RTAX_REORDERING)) { | 807 | if (!dst_metric_locked(dst, RTAX_REORDERING)) { |
801 | if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && | 808 | if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && |
802 | tp->reordering != sysctl_tcp_reordering) | 809 | tp->reordering != sysctl_tcp_reordering) |
803 | dst->metrics[RTAX_REORDERING-1] = tp->reordering; | 810 | dst_metric_set(dst, RTAX_REORDERING, tp->reordering); |
804 | } | 811 | } |
805 | } | 812 | } |
806 | } | 813 | } |
807 | 814 | ||
808 | /* Numbers are taken from RFC3390. | ||
809 | * | ||
810 | * John Heffner states: | ||
811 | * | ||
812 | * The RFC specifies a window of no more than 4380 bytes | ||
813 | * unless 2*MSS > 4380. Reading the pseudocode in the RFC | ||
814 | * is a bit misleading because they use a clamp at 4380 bytes | ||
815 | * rather than use a multiplier in the relevant range. | ||
816 | */ | ||
817 | __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) | 815 | __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) |
818 | { | 816 | { |
819 | __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); | 817 | __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); |
820 | 818 | ||
821 | if (!cwnd) { | 819 | if (!cwnd) |
822 | if (tp->mss_cache > 1460) | 820 | cwnd = TCP_INIT_CWND; |
823 | cwnd = 2; | ||
824 | else | ||
825 | cwnd = (tp->mss_cache > 1095) ? 3 : 4; | ||
826 | } | ||
827 | return min_t(__u32, cwnd, tp->snd_cwnd_clamp); | 821 | return min_t(__u32, cwnd, tp->snd_cwnd_clamp); |
828 | } | 822 | } |
829 | 823 | ||
@@ -922,25 +916,20 @@ static void tcp_init_metrics(struct sock *sk) | |||
922 | tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); | 916 | tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); |
923 | } | 917 | } |
924 | tcp_set_rto(sk); | 918 | tcp_set_rto(sk); |
925 | if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) | 919 | if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) { |
926 | goto reset; | ||
927 | |||
928 | cwnd: | ||
929 | tp->snd_cwnd = tcp_init_cwnd(tp, dst); | ||
930 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
931 | return; | ||
932 | |||
933 | reset: | 920 | reset: |
934 | /* Play conservative. If timestamps are not | 921 | /* Play conservative. If timestamps are not |
935 | * supported, TCP will fail to recalculate correct | 922 | * supported, TCP will fail to recalculate correct |
936 | * rtt, if initial rto is too small. FORGET ALL AND RESET! | 923 | * rtt, if initial rto is too small. FORGET ALL AND RESET! |
937 | */ | 924 | */ |
938 | if (!tp->rx_opt.saw_tstamp && tp->srtt) { | 925 | if (!tp->rx_opt.saw_tstamp && tp->srtt) { |
939 | tp->srtt = 0; | 926 | tp->srtt = 0; |
940 | tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; | 927 | tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; |
941 | inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; | 928 | inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; |
929 | } | ||
942 | } | 930 | } |
943 | goto cwnd; | 931 | tp->snd_cwnd = tcp_init_cwnd(tp, dst); |
932 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
944 | } | 933 | } |
945 | 934 | ||
946 | static void tcp_update_reordering(struct sock *sk, const int metric, | 935 | static void tcp_update_reordering(struct sock *sk, const int metric, |
@@ -1233,7 +1222,7 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb, | |||
1233 | } | 1222 | } |
1234 | 1223 | ||
1235 | /* D-SACK for already forgotten data... Do dumb counting. */ | 1224 | /* D-SACK for already forgotten data... Do dumb counting. */ |
1236 | if (dup_sack && | 1225 | if (dup_sack && tp->undo_marker && tp->undo_retrans && |
1237 | !after(end_seq_0, prior_snd_una) && | 1226 | !after(end_seq_0, prior_snd_una) && |
1238 | after(end_seq_0, tp->undo_marker)) | 1227 | after(end_seq_0, tp->undo_marker)) |
1239 | tp->undo_retrans--; | 1228 | tp->undo_retrans--; |
@@ -1310,7 +1299,8 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, | |||
1310 | 1299 | ||
1311 | /* Account D-SACK for retransmitted packet. */ | 1300 | /* Account D-SACK for retransmitted packet. */ |
1312 | if (dup_sack && (sacked & TCPCB_RETRANS)) { | 1301 | if (dup_sack && (sacked & TCPCB_RETRANS)) { |
1313 | if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) | 1302 | if (tp->undo_marker && tp->undo_retrans && |
1303 | after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) | ||
1314 | tp->undo_retrans--; | 1304 | tp->undo_retrans--; |
1315 | if (sacked & TCPCB_SACKED_ACKED) | 1305 | if (sacked & TCPCB_SACKED_ACKED) |
1316 | state->reord = min(fack_count, state->reord); | 1306 | state->reord = min(fack_count, state->reord); |
@@ -2314,7 +2304,7 @@ static inline int tcp_dupack_heuristics(struct tcp_sock *tp) | |||
2314 | 2304 | ||
2315 | static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) | 2305 | static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) |
2316 | { | 2306 | { |
2317 | return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); | 2307 | return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto; |
2318 | } | 2308 | } |
2319 | 2309 | ||
2320 | static inline int tcp_head_timedout(struct sock *sk) | 2310 | static inline int tcp_head_timedout(struct sock *sk) |
@@ -2508,7 +2498,7 @@ static void tcp_timeout_skbs(struct sock *sk) | |||
2508 | /* Mark head of queue up as lost. With RFC3517 SACK, the packets is | 2498 | /* Mark head of queue up as lost. With RFC3517 SACK, the packets is |
2509 | * against sacked "cnt", otherwise it's against facked "cnt" | 2499 | * against sacked "cnt", otherwise it's against facked "cnt" |
2510 | */ | 2500 | */ |
2511 | static void tcp_mark_head_lost(struct sock *sk, int packets) | 2501 | static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) |
2512 | { | 2502 | { |
2513 | struct tcp_sock *tp = tcp_sk(sk); | 2503 | struct tcp_sock *tp = tcp_sk(sk); |
2514 | struct sk_buff *skb; | 2504 | struct sk_buff *skb; |
@@ -2516,13 +2506,13 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) | |||
2516 | int err; | 2506 | int err; |
2517 | unsigned int mss; | 2507 | unsigned int mss; |
2518 | 2508 | ||
2519 | if (packets == 0) | ||
2520 | return; | ||
2521 | |||
2522 | WARN_ON(packets > tp->packets_out); | 2509 | WARN_ON(packets > tp->packets_out); |
2523 | if (tp->lost_skb_hint) { | 2510 | if (tp->lost_skb_hint) { |
2524 | skb = tp->lost_skb_hint; | 2511 | skb = tp->lost_skb_hint; |
2525 | cnt = tp->lost_cnt_hint; | 2512 | cnt = tp->lost_cnt_hint; |
2513 | /* Head already handled? */ | ||
2514 | if (mark_head && skb != tcp_write_queue_head(sk)) | ||
2515 | return; | ||
2526 | } else { | 2516 | } else { |
2527 | skb = tcp_write_queue_head(sk); | 2517 | skb = tcp_write_queue_head(sk); |
2528 | cnt = 0; | 2518 | cnt = 0; |
@@ -2557,6 +2547,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) | |||
2557 | } | 2547 | } |
2558 | 2548 | ||
2559 | tcp_skb_mark_lost(tp, skb); | 2549 | tcp_skb_mark_lost(tp, skb); |
2550 | |||
2551 | if (mark_head) | ||
2552 | break; | ||
2560 | } | 2553 | } |
2561 | tcp_verify_left_out(tp); | 2554 | tcp_verify_left_out(tp); |
2562 | } | 2555 | } |
@@ -2568,17 +2561,18 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) | |||
2568 | struct tcp_sock *tp = tcp_sk(sk); | 2561 | struct tcp_sock *tp = tcp_sk(sk); |
2569 | 2562 | ||
2570 | if (tcp_is_reno(tp)) { | 2563 | if (tcp_is_reno(tp)) { |
2571 | tcp_mark_head_lost(sk, 1); | 2564 | tcp_mark_head_lost(sk, 1, 1); |
2572 | } else if (tcp_is_fack(tp)) { | 2565 | } else if (tcp_is_fack(tp)) { |
2573 | int lost = tp->fackets_out - tp->reordering; | 2566 | int lost = tp->fackets_out - tp->reordering; |
2574 | if (lost <= 0) | 2567 | if (lost <= 0) |
2575 | lost = 1; | 2568 | lost = 1; |
2576 | tcp_mark_head_lost(sk, lost); | 2569 | tcp_mark_head_lost(sk, lost, 0); |
2577 | } else { | 2570 | } else { |
2578 | int sacked_upto = tp->sacked_out - tp->reordering; | 2571 | int sacked_upto = tp->sacked_out - tp->reordering; |
2579 | if (sacked_upto < fast_rexmit) | 2572 | if (sacked_upto >= 0) |
2580 | sacked_upto = fast_rexmit; | 2573 | tcp_mark_head_lost(sk, sacked_upto, 0); |
2581 | tcp_mark_head_lost(sk, sacked_upto); | 2574 | else if (fast_rexmit) |
2575 | tcp_mark_head_lost(sk, 1, 1); | ||
2582 | } | 2576 | } |
2583 | 2577 | ||
2584 | tcp_timeout_skbs(sk); | 2578 | tcp_timeout_skbs(sk); |
@@ -2665,7 +2659,7 @@ static void DBGUNDO(struct sock *sk, const char *msg) | |||
2665 | #define DBGUNDO(x...) do { } while (0) | 2659 | #define DBGUNDO(x...) do { } while (0) |
2666 | #endif | 2660 | #endif |
2667 | 2661 | ||
2668 | static void tcp_undo_cwr(struct sock *sk, const int undo) | 2662 | static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) |
2669 | { | 2663 | { |
2670 | struct tcp_sock *tp = tcp_sk(sk); | 2664 | struct tcp_sock *tp = tcp_sk(sk); |
2671 | 2665 | ||
@@ -2677,14 +2671,13 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) | |||
2677 | else | 2671 | else |
2678 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); | 2672 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); |
2679 | 2673 | ||
2680 | if (undo && tp->prior_ssthresh > tp->snd_ssthresh) { | 2674 | if (undo_ssthresh && tp->prior_ssthresh > tp->snd_ssthresh) { |
2681 | tp->snd_ssthresh = tp->prior_ssthresh; | 2675 | tp->snd_ssthresh = tp->prior_ssthresh; |
2682 | TCP_ECN_withdraw_cwr(tp); | 2676 | TCP_ECN_withdraw_cwr(tp); |
2683 | } | 2677 | } |
2684 | } else { | 2678 | } else { |
2685 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); | 2679 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); |
2686 | } | 2680 | } |
2687 | tcp_moderate_cwnd(tp); | ||
2688 | tp->snd_cwnd_stamp = tcp_time_stamp; | 2681 | tp->snd_cwnd_stamp = tcp_time_stamp; |
2689 | } | 2682 | } |
2690 | 2683 | ||
@@ -2705,7 +2698,7 @@ static int tcp_try_undo_recovery(struct sock *sk) | |||
2705 | * or our original transmission succeeded. | 2698 | * or our original transmission succeeded. |
2706 | */ | 2699 | */ |
2707 | DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); | 2700 | DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); |
2708 | tcp_undo_cwr(sk, 1); | 2701 | tcp_undo_cwr(sk, true); |
2709 | if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) | 2702 | if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) |
2710 | mib_idx = LINUX_MIB_TCPLOSSUNDO; | 2703 | mib_idx = LINUX_MIB_TCPLOSSUNDO; |
2711 | else | 2704 | else |
@@ -2732,7 +2725,7 @@ static void tcp_try_undo_dsack(struct sock *sk) | |||
2732 | 2725 | ||
2733 | if (tp->undo_marker && !tp->undo_retrans) { | 2726 | if (tp->undo_marker && !tp->undo_retrans) { |
2734 | DBGUNDO(sk, "D-SACK"); | 2727 | DBGUNDO(sk, "D-SACK"); |
2735 | tcp_undo_cwr(sk, 1); | 2728 | tcp_undo_cwr(sk, true); |
2736 | tp->undo_marker = 0; | 2729 | tp->undo_marker = 0; |
2737 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); | 2730 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); |
2738 | } | 2731 | } |
@@ -2785,7 +2778,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) | |||
2785 | tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); | 2778 | tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); |
2786 | 2779 | ||
2787 | DBGUNDO(sk, "Hoe"); | 2780 | DBGUNDO(sk, "Hoe"); |
2788 | tcp_undo_cwr(sk, 0); | 2781 | tcp_undo_cwr(sk, false); |
2789 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); | 2782 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); |
2790 | 2783 | ||
2791 | /* So... Do not make Hoe's retransmit yet. | 2784 | /* So... Do not make Hoe's retransmit yet. |
@@ -2814,7 +2807,7 @@ static int tcp_try_undo_loss(struct sock *sk) | |||
2814 | 2807 | ||
2815 | DBGUNDO(sk, "partial loss"); | 2808 | DBGUNDO(sk, "partial loss"); |
2816 | tp->lost_out = 0; | 2809 | tp->lost_out = 0; |
2817 | tcp_undo_cwr(sk, 1); | 2810 | tcp_undo_cwr(sk, true); |
2818 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); | 2811 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); |
2819 | inet_csk(sk)->icsk_retransmits = 0; | 2812 | inet_csk(sk)->icsk_retransmits = 0; |
2820 | tp->undo_marker = 0; | 2813 | tp->undo_marker = 0; |
@@ -2828,8 +2821,11 @@ static int tcp_try_undo_loss(struct sock *sk) | |||
2828 | static inline void tcp_complete_cwr(struct sock *sk) | 2821 | static inline void tcp_complete_cwr(struct sock *sk) |
2829 | { | 2822 | { |
2830 | struct tcp_sock *tp = tcp_sk(sk); | 2823 | struct tcp_sock *tp = tcp_sk(sk); |
2831 | tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); | 2824 | /* Do not moderate cwnd if it's already undone in cwr or recovery */ |
2832 | tp->snd_cwnd_stamp = tcp_time_stamp; | 2825 | if (tp->undo_marker && tp->snd_cwnd > tp->snd_ssthresh) { |
2826 | tp->snd_cwnd = tp->snd_ssthresh; | ||
2827 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
2828 | } | ||
2833 | tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); | 2829 | tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); |
2834 | } | 2830 | } |
2835 | 2831 | ||
@@ -2887,7 +2883,7 @@ static void tcp_mtup_probe_success(struct sock *sk) | |||
2887 | icsk->icsk_mtup.probe_size; | 2883 | icsk->icsk_mtup.probe_size; |
2888 | tp->snd_cwnd_cnt = 0; | 2884 | tp->snd_cwnd_cnt = 0; |
2889 | tp->snd_cwnd_stamp = tcp_time_stamp; | 2885 | tp->snd_cwnd_stamp = tcp_time_stamp; |
2890 | tp->rcv_ssthresh = tcp_current_ssthresh(sk); | 2886 | tp->snd_ssthresh = tcp_current_ssthresh(sk); |
2891 | 2887 | ||
2892 | icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; | 2888 | icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; |
2893 | icsk->icsk_mtup.probe_size = 0; | 2889 | icsk->icsk_mtup.probe_size = 0; |
@@ -2984,7 +2980,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) | |||
2984 | before(tp->snd_una, tp->high_seq) && | 2980 | before(tp->snd_una, tp->high_seq) && |
2985 | icsk->icsk_ca_state != TCP_CA_Open && | 2981 | icsk->icsk_ca_state != TCP_CA_Open && |
2986 | tp->fackets_out > tp->reordering) { | 2982 | tp->fackets_out > tp->reordering) { |
2987 | tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering); | 2983 | tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0); |
2988 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS); | 2984 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS); |
2989 | } | 2985 | } |
2990 | 2986 | ||
@@ -3356,7 +3352,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3356 | net_invalid_timestamp())) | 3352 | net_invalid_timestamp())) |
3357 | rtt_us = ktime_us_delta(ktime_get_real(), | 3353 | rtt_us = ktime_us_delta(ktime_get_real(), |
3358 | last_ackt); | 3354 | last_ackt); |
3359 | else if (ca_seq_rtt > 0) | 3355 | else if (ca_seq_rtt >= 0) |
3360 | rtt_us = jiffies_to_usecs(ca_seq_rtt); | 3356 | rtt_us = jiffies_to_usecs(ca_seq_rtt); |
3361 | } | 3357 | } |
3362 | 3358 | ||
@@ -3412,8 +3408,8 @@ static void tcp_ack_probe(struct sock *sk) | |||
3412 | 3408 | ||
3413 | static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) | 3409 | static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) |
3414 | { | 3410 | { |
3415 | return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || | 3411 | return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || |
3416 | inet_csk(sk)->icsk_ca_state != TCP_CA_Open); | 3412 | inet_csk(sk)->icsk_ca_state != TCP_CA_Open; |
3417 | } | 3413 | } |
3418 | 3414 | ||
3419 | static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) | 3415 | static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) |
@@ -3430,9 +3426,9 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp, | |||
3430 | const u32 ack, const u32 ack_seq, | 3426 | const u32 ack, const u32 ack_seq, |
3431 | const u32 nwin) | 3427 | const u32 nwin) |
3432 | { | 3428 | { |
3433 | return (after(ack, tp->snd_una) || | 3429 | return after(ack, tp->snd_una) || |
3434 | after(ack_seq, tp->snd_wl1) || | 3430 | after(ack_seq, tp->snd_wl1) || |
3435 | (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd)); | 3431 | (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd); |
3436 | } | 3432 | } |
3437 | 3433 | ||
3438 | /* Update our send window. | 3434 | /* Update our send window. |
@@ -3500,7 +3496,7 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag) | |||
3500 | if (flag & FLAG_ECE) | 3496 | if (flag & FLAG_ECE) |
3501 | tcp_ratehalving_spur_to_response(sk); | 3497 | tcp_ratehalving_spur_to_response(sk); |
3502 | else | 3498 | else |
3503 | tcp_undo_cwr(sk, 1); | 3499 | tcp_undo_cwr(sk, true); |
3504 | } | 3500 | } |
3505 | 3501 | ||
3506 | /* F-RTO spurious RTO detection algorithm (RFC4138) | 3502 | /* F-RTO spurious RTO detection algorithm (RFC4138) |
@@ -4406,7 +4402,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) | |||
4406 | if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) { | 4402 | if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) { |
4407 | tp->ucopy.len -= chunk; | 4403 | tp->ucopy.len -= chunk; |
4408 | tp->copied_seq += chunk; | 4404 | tp->copied_seq += chunk; |
4409 | eaten = (chunk == skb->len && !th->fin); | 4405 | eaten = (chunk == skb->len); |
4410 | tcp_rcv_space_adjust(sk); | 4406 | tcp_rcv_space_adjust(sk); |
4411 | } | 4407 | } |
4412 | local_bh_disable(); | 4408 | local_bh_disable(); |
@@ -4870,7 +4866,7 @@ static int tcp_should_expand_sndbuf(struct sock *sk) | |||
4870 | return 0; | 4866 | return 0; |
4871 | 4867 | ||
4872 | /* If we are under soft global TCP memory pressure, do not expand. */ | 4868 | /* If we are under soft global TCP memory pressure, do not expand. */ |
4873 | if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) | 4869 | if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) |
4874 | return 0; | 4870 | return 0; |
4875 | 4871 | ||
4876 | /* If we filled the congestion window, do not expand. */ | 4872 | /* If we filled the congestion window, do not expand. */ |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 020766292bb0..708dc203b034 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -146,13 +146,15 @@ EXPORT_SYMBOL_GPL(tcp_twsk_unique); | |||
146 | /* This will initiate an outgoing connection. */ | 146 | /* This will initiate an outgoing connection. */ |
147 | int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | 147 | int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) |
148 | { | 148 | { |
149 | struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; | ||
149 | struct inet_sock *inet = inet_sk(sk); | 150 | struct inet_sock *inet = inet_sk(sk); |
150 | struct tcp_sock *tp = tcp_sk(sk); | 151 | struct tcp_sock *tp = tcp_sk(sk); |
151 | struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; | 152 | __be16 orig_sport, orig_dport; |
152 | struct rtable *rt; | ||
153 | __be32 daddr, nexthop; | 153 | __be32 daddr, nexthop; |
154 | int tmp; | 154 | struct flowi4 *fl4; |
155 | struct rtable *rt; | ||
155 | int err; | 156 | int err; |
157 | struct ip_options_rcu *inet_opt; | ||
156 | 158 | ||
157 | if (addr_len < sizeof(struct sockaddr_in)) | 159 | if (addr_len < sizeof(struct sockaddr_in)) |
158 | return -EINVAL; | 160 | return -EINVAL; |
@@ -161,20 +163,26 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |||
161 | return -EAFNOSUPPORT; | 163 | return -EAFNOSUPPORT; |
162 | 164 | ||
163 | nexthop = daddr = usin->sin_addr.s_addr; | 165 | nexthop = daddr = usin->sin_addr.s_addr; |
164 | if (inet->opt && inet->opt->srr) { | 166 | inet_opt = rcu_dereference_protected(inet->inet_opt, |
167 | sock_owned_by_user(sk)); | ||
168 | if (inet_opt && inet_opt->opt.srr) { | ||
165 | if (!daddr) | 169 | if (!daddr) |
166 | return -EINVAL; | 170 | return -EINVAL; |
167 | nexthop = inet->opt->faddr; | 171 | nexthop = inet_opt->opt.faddr; |
168 | } | 172 | } |
169 | 173 | ||
170 | tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr, | 174 | orig_sport = inet->inet_sport; |
171 | RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, | 175 | orig_dport = usin->sin_port; |
172 | IPPROTO_TCP, | 176 | fl4 = &inet->cork.fl.u.ip4; |
173 | inet->inet_sport, usin->sin_port, sk, 1); | 177 | rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, |
174 | if (tmp < 0) { | 178 | RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, |
175 | if (tmp == -ENETUNREACH) | 179 | IPPROTO_TCP, |
180 | orig_sport, orig_dport, sk, true); | ||
181 | if (IS_ERR(rt)) { | ||
182 | err = PTR_ERR(rt); | ||
183 | if (err == -ENETUNREACH) | ||
176 | IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); | 184 | IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); |
177 | return tmp; | 185 | return err; |
178 | } | 186 | } |
179 | 187 | ||
180 | if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { | 188 | if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { |
@@ -182,11 +190,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |||
182 | return -ENETUNREACH; | 190 | return -ENETUNREACH; |
183 | } | 191 | } |
184 | 192 | ||
185 | if (!inet->opt || !inet->opt->srr) | 193 | if (!inet_opt || !inet_opt->opt.srr) |
186 | daddr = rt->rt_dst; | 194 | daddr = fl4->daddr; |
187 | 195 | ||
188 | if (!inet->inet_saddr) | 196 | if (!inet->inet_saddr) |
189 | inet->inet_saddr = rt->rt_src; | 197 | inet->inet_saddr = fl4->saddr; |
190 | inet->inet_rcv_saddr = inet->inet_saddr; | 198 | inet->inet_rcv_saddr = inet->inet_saddr; |
191 | 199 | ||
192 | if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { | 200 | if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { |
@@ -197,8 +205,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |||
197 | } | 205 | } |
198 | 206 | ||
199 | if (tcp_death_row.sysctl_tw_recycle && | 207 | if (tcp_death_row.sysctl_tw_recycle && |
200 | !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) { | 208 | !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) { |
201 | struct inet_peer *peer = rt_get_peer(rt); | 209 | struct inet_peer *peer = rt_get_peer(rt, fl4->daddr); |
202 | /* | 210 | /* |
203 | * VJ's idea. We save last timestamp seen from | 211 | * VJ's idea. We save last timestamp seen from |
204 | * the destination in peer table, when entering state | 212 | * the destination in peer table, when entering state |
@@ -218,8 +226,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |||
218 | inet->inet_daddr = daddr; | 226 | inet->inet_daddr = daddr; |
219 | 227 | ||
220 | inet_csk(sk)->icsk_ext_hdr_len = 0; | 228 | inet_csk(sk)->icsk_ext_hdr_len = 0; |
221 | if (inet->opt) | 229 | if (inet_opt) |
222 | inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; | 230 | inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; |
223 | 231 | ||
224 | tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; | 232 | tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; |
225 | 233 | ||
@@ -233,11 +241,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |||
233 | if (err) | 241 | if (err) |
234 | goto failure; | 242 | goto failure; |
235 | 243 | ||
236 | err = ip_route_newports(&rt, IPPROTO_TCP, | 244 | rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, |
237 | inet->inet_sport, inet->inet_dport, sk); | 245 | inet->inet_sport, inet->inet_dport, sk); |
238 | if (err) | 246 | if (IS_ERR(rt)) { |
247 | err = PTR_ERR(rt); | ||
248 | rt = NULL; | ||
239 | goto failure; | 249 | goto failure; |
240 | 250 | } | |
241 | /* OK, now commit destination to socket. */ | 251 | /* OK, now commit destination to socket. */ |
242 | sk->sk_gso_type = SKB_GSO_TCPV4; | 252 | sk->sk_gso_type = SKB_GSO_TCPV4; |
243 | sk_setup_caps(sk, &rt->dst); | 253 | sk_setup_caps(sk, &rt->dst); |
@@ -273,7 +283,7 @@ EXPORT_SYMBOL(tcp_v4_connect); | |||
273 | /* | 283 | /* |
274 | * This routine does path mtu discovery as defined in RFC1191. | 284 | * This routine does path mtu discovery as defined in RFC1191. |
275 | */ | 285 | */ |
276 | static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu) | 286 | static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) |
277 | { | 287 | { |
278 | struct dst_entry *dst; | 288 | struct dst_entry *dst; |
279 | struct inet_sock *inet = inet_sk(sk); | 289 | struct inet_sock *inet = inet_sk(sk); |
@@ -335,7 +345,7 @@ static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu) | |||
335 | 345 | ||
336 | void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | 346 | void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) |
337 | { | 347 | { |
338 | struct iphdr *iph = (struct iphdr *)icmp_skb->data; | 348 | const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; |
339 | struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); | 349 | struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); |
340 | struct inet_connection_sock *icsk; | 350 | struct inet_connection_sock *icsk; |
341 | struct tcp_sock *tp; | 351 | struct tcp_sock *tp; |
@@ -415,6 +425,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | |||
415 | !icsk->icsk_backoff) | 425 | !icsk->icsk_backoff) |
416 | break; | 426 | break; |
417 | 427 | ||
428 | if (sock_owned_by_user(sk)) | ||
429 | break; | ||
430 | |||
418 | icsk->icsk_backoff--; | 431 | icsk->icsk_backoff--; |
419 | inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) << | 432 | inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) << |
420 | icsk->icsk_backoff; | 433 | icsk->icsk_backoff; |
@@ -429,11 +442,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | |||
429 | if (remaining) { | 442 | if (remaining) { |
430 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | 443 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
431 | remaining, TCP_RTO_MAX); | 444 | remaining, TCP_RTO_MAX); |
432 | } else if (sock_owned_by_user(sk)) { | ||
433 | /* RTO revert clocked out retransmission, | ||
434 | * but socket is locked. Will defer. */ | ||
435 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | ||
436 | HZ/20, TCP_RTO_MAX); | ||
437 | } else { | 445 | } else { |
438 | /* RTO revert clocked out retransmission. | 446 | /* RTO revert clocked out retransmission. |
439 | * Will retransmit now */ | 447 | * Will retransmit now */ |
@@ -643,7 +651,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) | |||
643 | arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; | 651 | arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; |
644 | 652 | ||
645 | net = dev_net(skb_dst(skb)->dev); | 653 | net = dev_net(skb_dst(skb)->dev); |
646 | ip_send_reply(net->ipv4.tcp_sock, skb, | 654 | ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, |
647 | &arg, arg.iov[0].iov_len); | 655 | &arg, arg.iov[0].iov_len); |
648 | 656 | ||
649 | TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); | 657 | TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); |
@@ -718,7 +726,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, | |||
718 | if (oif) | 726 | if (oif) |
719 | arg.bound_dev_if = oif; | 727 | arg.bound_dev_if = oif; |
720 | 728 | ||
721 | ip_send_reply(net->ipv4.tcp_sock, skb, | 729 | ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, |
722 | &arg, arg.iov[0].iov_len); | 730 | &arg, arg.iov[0].iov_len); |
723 | 731 | ||
724 | TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); | 732 | TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); |
@@ -761,11 +769,12 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, | |||
761 | struct request_values *rvp) | 769 | struct request_values *rvp) |
762 | { | 770 | { |
763 | const struct inet_request_sock *ireq = inet_rsk(req); | 771 | const struct inet_request_sock *ireq = inet_rsk(req); |
772 | struct flowi4 fl4; | ||
764 | int err = -1; | 773 | int err = -1; |
765 | struct sk_buff * skb; | 774 | struct sk_buff * skb; |
766 | 775 | ||
767 | /* First, grab a route. */ | 776 | /* First, grab a route. */ |
768 | if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) | 777 | if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) |
769 | return -1; | 778 | return -1; |
770 | 779 | ||
771 | skb = tcp_make_synack(sk, dst, req, rvp); | 780 | skb = tcp_make_synack(sk, dst, req, rvp); |
@@ -816,17 +825,18 @@ static void syn_flood_warning(const struct sk_buff *skb) | |||
816 | /* | 825 | /* |
817 | * Save and compile IPv4 options into the request_sock if needed. | 826 | * Save and compile IPv4 options into the request_sock if needed. |
818 | */ | 827 | */ |
819 | static struct ip_options *tcp_v4_save_options(struct sock *sk, | 828 | static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk, |
820 | struct sk_buff *skb) | 829 | struct sk_buff *skb) |
821 | { | 830 | { |
822 | struct ip_options *opt = &(IPCB(skb)->opt); | 831 | const struct ip_options *opt = &(IPCB(skb)->opt); |
823 | struct ip_options *dopt = NULL; | 832 | struct ip_options_rcu *dopt = NULL; |
824 | 833 | ||
825 | if (opt && opt->optlen) { | 834 | if (opt && opt->optlen) { |
826 | int opt_size = optlength(opt); | 835 | int opt_size = sizeof(*dopt) + opt->optlen; |
836 | |||
827 | dopt = kmalloc(opt_size, GFP_ATOMIC); | 837 | dopt = kmalloc(opt_size, GFP_ATOMIC); |
828 | if (dopt) { | 838 | if (dopt) { |
829 | if (ip_options_echo(dopt, skb)) { | 839 | if (ip_options_echo(&dopt->opt, skb)) { |
830 | kfree(dopt); | 840 | kfree(dopt); |
831 | dopt = NULL; | 841 | dopt = NULL; |
832 | } | 842 | } |
@@ -1212,12 +1222,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { | |||
1212 | }; | 1222 | }; |
1213 | #endif | 1223 | #endif |
1214 | 1224 | ||
1215 | static struct timewait_sock_ops tcp_timewait_sock_ops = { | ||
1216 | .twsk_obj_size = sizeof(struct tcp_timewait_sock), | ||
1217 | .twsk_unique = tcp_twsk_unique, | ||
1218 | .twsk_destructor= tcp_twsk_destructor, | ||
1219 | }; | ||
1220 | |||
1221 | int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | 1225 | int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) |
1222 | { | 1226 | { |
1223 | struct tcp_extend_values tmp_ext; | 1227 | struct tcp_extend_values tmp_ext; |
@@ -1335,6 +1339,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1335 | req->cookie_ts = tmp_opt.tstamp_ok; | 1339 | req->cookie_ts = tmp_opt.tstamp_ok; |
1336 | } else if (!isn) { | 1340 | } else if (!isn) { |
1337 | struct inet_peer *peer = NULL; | 1341 | struct inet_peer *peer = NULL; |
1342 | struct flowi4 fl4; | ||
1338 | 1343 | ||
1339 | /* VJ's idea. We save last timestamp seen | 1344 | /* VJ's idea. We save last timestamp seen |
1340 | * from the destination in peer table, when entering | 1345 | * from the destination in peer table, when entering |
@@ -1347,9 +1352,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1347 | */ | 1352 | */ |
1348 | if (tmp_opt.saw_tstamp && | 1353 | if (tmp_opt.saw_tstamp && |
1349 | tcp_death_row.sysctl_tw_recycle && | 1354 | tcp_death_row.sysctl_tw_recycle && |
1350 | (dst = inet_csk_route_req(sk, req)) != NULL && | 1355 | (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && |
1351 | (peer = rt_get_peer((struct rtable *)dst)) != NULL && | 1356 | fl4.daddr == saddr && |
1352 | peer->v4daddr == saddr) { | 1357 | (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) { |
1353 | inet_peer_refcheck(peer); | 1358 | inet_peer_refcheck(peer); |
1354 | if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && | 1359 | if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && |
1355 | (s32)(peer->tcp_ts - req->ts_recent) > | 1360 | (s32)(peer->tcp_ts - req->ts_recent) > |
@@ -1413,19 +1418,16 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
1413 | #ifdef CONFIG_TCP_MD5SIG | 1418 | #ifdef CONFIG_TCP_MD5SIG |
1414 | struct tcp_md5sig_key *key; | 1419 | struct tcp_md5sig_key *key; |
1415 | #endif | 1420 | #endif |
1421 | struct ip_options_rcu *inet_opt; | ||
1416 | 1422 | ||
1417 | if (sk_acceptq_is_full(sk)) | 1423 | if (sk_acceptq_is_full(sk)) |
1418 | goto exit_overflow; | 1424 | goto exit_overflow; |
1419 | 1425 | ||
1420 | if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) | ||
1421 | goto exit; | ||
1422 | |||
1423 | newsk = tcp_create_openreq_child(sk, req, skb); | 1426 | newsk = tcp_create_openreq_child(sk, req, skb); |
1424 | if (!newsk) | 1427 | if (!newsk) |
1425 | goto exit; | 1428 | goto exit_nonewsk; |
1426 | 1429 | ||
1427 | newsk->sk_gso_type = SKB_GSO_TCPV4; | 1430 | newsk->sk_gso_type = SKB_GSO_TCPV4; |
1428 | sk_setup_caps(newsk, dst); | ||
1429 | 1431 | ||
1430 | newtp = tcp_sk(newsk); | 1432 | newtp = tcp_sk(newsk); |
1431 | newinet = inet_sk(newsk); | 1433 | newinet = inet_sk(newsk); |
@@ -1433,18 +1435,24 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
1433 | newinet->inet_daddr = ireq->rmt_addr; | 1435 | newinet->inet_daddr = ireq->rmt_addr; |
1434 | newinet->inet_rcv_saddr = ireq->loc_addr; | 1436 | newinet->inet_rcv_saddr = ireq->loc_addr; |
1435 | newinet->inet_saddr = ireq->loc_addr; | 1437 | newinet->inet_saddr = ireq->loc_addr; |
1436 | newinet->opt = ireq->opt; | 1438 | inet_opt = ireq->opt; |
1439 | rcu_assign_pointer(newinet->inet_opt, inet_opt); | ||
1437 | ireq->opt = NULL; | 1440 | ireq->opt = NULL; |
1438 | newinet->mc_index = inet_iif(skb); | 1441 | newinet->mc_index = inet_iif(skb); |
1439 | newinet->mc_ttl = ip_hdr(skb)->ttl; | 1442 | newinet->mc_ttl = ip_hdr(skb)->ttl; |
1440 | inet_csk(newsk)->icsk_ext_hdr_len = 0; | 1443 | inet_csk(newsk)->icsk_ext_hdr_len = 0; |
1441 | if (newinet->opt) | 1444 | if (inet_opt) |
1442 | inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; | 1445 | inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; |
1443 | newinet->inet_id = newtp->write_seq ^ jiffies; | 1446 | newinet->inet_id = newtp->write_seq ^ jiffies; |
1444 | 1447 | ||
1448 | if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL) | ||
1449 | goto put_and_exit; | ||
1450 | |||
1451 | sk_setup_caps(newsk, dst); | ||
1452 | |||
1445 | tcp_mtup_init(newsk); | 1453 | tcp_mtup_init(newsk); |
1446 | tcp_sync_mss(newsk, dst_mtu(dst)); | 1454 | tcp_sync_mss(newsk, dst_mtu(dst)); |
1447 | newtp->advmss = dst_metric(dst, RTAX_ADVMSS); | 1455 | newtp->advmss = dst_metric_advmss(dst); |
1448 | if (tcp_sk(sk)->rx_opt.user_mss && | 1456 | if (tcp_sk(sk)->rx_opt.user_mss && |
1449 | tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) | 1457 | tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) |
1450 | newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; | 1458 | newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; |
@@ -1469,17 +1477,22 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
1469 | } | 1477 | } |
1470 | #endif | 1478 | #endif |
1471 | 1479 | ||
1480 | if (__inet_inherit_port(sk, newsk) < 0) | ||
1481 | goto put_and_exit; | ||
1472 | __inet_hash_nolisten(newsk, NULL); | 1482 | __inet_hash_nolisten(newsk, NULL); |
1473 | __inet_inherit_port(sk, newsk); | ||
1474 | 1483 | ||
1475 | return newsk; | 1484 | return newsk; |
1476 | 1485 | ||
1477 | exit_overflow: | 1486 | exit_overflow: |
1478 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); | 1487 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); |
1488 | exit_nonewsk: | ||
1489 | dst_release(dst); | ||
1479 | exit: | 1490 | exit: |
1480 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); | 1491 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); |
1481 | dst_release(dst); | ||
1482 | return NULL; | 1492 | return NULL; |
1493 | put_and_exit: | ||
1494 | sock_put(newsk); | ||
1495 | goto exit; | ||
1483 | } | 1496 | } |
1484 | EXPORT_SYMBOL(tcp_v4_syn_recv_sock); | 1497 | EXPORT_SYMBOL(tcp_v4_syn_recv_sock); |
1485 | 1498 | ||
@@ -1560,12 +1573,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) | |||
1560 | 1573 | ||
1561 | if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ | 1574 | if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ |
1562 | sock_rps_save_rxhash(sk, skb->rxhash); | 1575 | sock_rps_save_rxhash(sk, skb->rxhash); |
1563 | TCP_CHECK_TIMER(sk); | ||
1564 | if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { | 1576 | if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { |
1565 | rsk = sk; | 1577 | rsk = sk; |
1566 | goto reset; | 1578 | goto reset; |
1567 | } | 1579 | } |
1568 | TCP_CHECK_TIMER(sk); | ||
1569 | return 0; | 1580 | return 0; |
1570 | } | 1581 | } |
1571 | 1582 | ||
@@ -1578,6 +1589,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) | |||
1578 | goto discard; | 1589 | goto discard; |
1579 | 1590 | ||
1580 | if (nsk != sk) { | 1591 | if (nsk != sk) { |
1592 | sock_rps_save_rxhash(nsk, skb->rxhash); | ||
1581 | if (tcp_child_process(sk, nsk, skb)) { | 1593 | if (tcp_child_process(sk, nsk, skb)) { |
1582 | rsk = nsk; | 1594 | rsk = nsk; |
1583 | goto reset; | 1595 | goto reset; |
@@ -1587,13 +1599,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) | |||
1587 | } else | 1599 | } else |
1588 | sock_rps_save_rxhash(sk, skb->rxhash); | 1600 | sock_rps_save_rxhash(sk, skb->rxhash); |
1589 | 1601 | ||
1590 | |||
1591 | TCP_CHECK_TIMER(sk); | ||
1592 | if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { | 1602 | if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { |
1593 | rsk = sk; | 1603 | rsk = sk; |
1594 | goto reset; | 1604 | goto reset; |
1595 | } | 1605 | } |
1596 | TCP_CHECK_TIMER(sk); | ||
1597 | return 0; | 1606 | return 0; |
1598 | 1607 | ||
1599 | reset: | 1608 | reset: |
@@ -1761,64 +1770,41 @@ do_time_wait: | |||
1761 | goto discard_it; | 1770 | goto discard_it; |
1762 | } | 1771 | } |
1763 | 1772 | ||
1764 | /* VJ's idea. Save last timestamp seen from this destination | 1773 | struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it) |
1765 | * and hold it at least for normal timewait interval to use for duplicate | ||
1766 | * segment detection in subsequent connections, before they enter synchronized | ||
1767 | * state. | ||
1768 | */ | ||
1769 | |||
1770 | int tcp_v4_remember_stamp(struct sock *sk) | ||
1771 | { | 1774 | { |
1775 | struct rtable *rt = (struct rtable *) __sk_dst_get(sk); | ||
1772 | struct inet_sock *inet = inet_sk(sk); | 1776 | struct inet_sock *inet = inet_sk(sk); |
1773 | struct tcp_sock *tp = tcp_sk(sk); | 1777 | struct inet_peer *peer; |
1774 | struct rtable *rt = (struct rtable *)__sk_dst_get(sk); | ||
1775 | struct inet_peer *peer = NULL; | ||
1776 | int release_it = 0; | ||
1777 | 1778 | ||
1778 | if (!rt || rt->rt_dst != inet->inet_daddr) { | 1779 | if (!rt || |
1779 | peer = inet_getpeer(inet->inet_daddr, 1); | 1780 | inet->cork.fl.u.ip4.daddr != inet->inet_daddr) { |
1780 | release_it = 1; | 1781 | peer = inet_getpeer_v4(inet->inet_daddr, 1); |
1782 | *release_it = true; | ||
1781 | } else { | 1783 | } else { |
1782 | if (!rt->peer) | 1784 | if (!rt->peer) |
1783 | rt_bind_peer(rt, 1); | 1785 | rt_bind_peer(rt, inet->inet_daddr, 1); |
1784 | peer = rt->peer; | 1786 | peer = rt->peer; |
1787 | *release_it = false; | ||
1785 | } | 1788 | } |
1786 | 1789 | ||
1787 | if (peer) { | 1790 | return peer; |
1788 | if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || | ||
1789 | ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && | ||
1790 | peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { | ||
1791 | peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; | ||
1792 | peer->tcp_ts = tp->rx_opt.ts_recent; | ||
1793 | } | ||
1794 | if (release_it) | ||
1795 | inet_putpeer(peer); | ||
1796 | return 1; | ||
1797 | } | ||
1798 | |||
1799 | return 0; | ||
1800 | } | 1791 | } |
1801 | EXPORT_SYMBOL(tcp_v4_remember_stamp); | 1792 | EXPORT_SYMBOL(tcp_v4_get_peer); |
1802 | 1793 | ||
1803 | int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) | 1794 | void *tcp_v4_tw_get_peer(struct sock *sk) |
1804 | { | 1795 | { |
1805 | struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1); | 1796 | struct inet_timewait_sock *tw = inet_twsk(sk); |
1806 | |||
1807 | if (peer) { | ||
1808 | const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); | ||
1809 | |||
1810 | if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || | ||
1811 | ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && | ||
1812 | peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { | ||
1813 | peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; | ||
1814 | peer->tcp_ts = tcptw->tw_ts_recent; | ||
1815 | } | ||
1816 | inet_putpeer(peer); | ||
1817 | return 1; | ||
1818 | } | ||
1819 | 1797 | ||
1820 | return 0; | 1798 | return inet_getpeer_v4(tw->tw_daddr, 1); |
1821 | } | 1799 | } |
1800 | EXPORT_SYMBOL(tcp_v4_tw_get_peer); | ||
1801 | |||
1802 | static struct timewait_sock_ops tcp_timewait_sock_ops = { | ||
1803 | .twsk_obj_size = sizeof(struct tcp_timewait_sock), | ||
1804 | .twsk_unique = tcp_twsk_unique, | ||
1805 | .twsk_destructor= tcp_twsk_destructor, | ||
1806 | .twsk_getpeer = tcp_v4_tw_get_peer, | ||
1807 | }; | ||
1822 | 1808 | ||
1823 | const struct inet_connection_sock_af_ops ipv4_specific = { | 1809 | const struct inet_connection_sock_af_ops ipv4_specific = { |
1824 | .queue_xmit = ip_queue_xmit, | 1810 | .queue_xmit = ip_queue_xmit, |
@@ -1826,7 +1812,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = { | |||
1826 | .rebuild_header = inet_sk_rebuild_header, | 1812 | .rebuild_header = inet_sk_rebuild_header, |
1827 | .conn_request = tcp_v4_conn_request, | 1813 | .conn_request = tcp_v4_conn_request, |
1828 | .syn_recv_sock = tcp_v4_syn_recv_sock, | 1814 | .syn_recv_sock = tcp_v4_syn_recv_sock, |
1829 | .remember_stamp = tcp_v4_remember_stamp, | 1815 | .get_peer = tcp_v4_get_peer, |
1830 | .net_header_len = sizeof(struct iphdr), | 1816 | .net_header_len = sizeof(struct iphdr), |
1831 | .setsockopt = ip_setsockopt, | 1817 | .setsockopt = ip_setsockopt, |
1832 | .getsockopt = ip_getsockopt, | 1818 | .getsockopt = ip_getsockopt, |
@@ -2022,13 +2008,12 @@ static void *listening_get_next(struct seq_file *seq, void *cur) | |||
2022 | } | 2008 | } |
2023 | req = req->dl_next; | 2009 | req = req->dl_next; |
2024 | } | 2010 | } |
2025 | st->offset = 0; | ||
2026 | if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) | 2011 | if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) |
2027 | break; | 2012 | break; |
2028 | get_req: | 2013 | get_req: |
2029 | req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; | 2014 | req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; |
2030 | } | 2015 | } |
2031 | sk = sk_next(st->syn_wait_sk); | 2016 | sk = sk_nulls_next(st->syn_wait_sk); |
2032 | st->state = TCP_SEQ_STATE_LISTENING; | 2017 | st->state = TCP_SEQ_STATE_LISTENING; |
2033 | read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); | 2018 | read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); |
2034 | } else { | 2019 | } else { |
@@ -2037,11 +2022,13 @@ get_req: | |||
2037 | if (reqsk_queue_len(&icsk->icsk_accept_queue)) | 2022 | if (reqsk_queue_len(&icsk->icsk_accept_queue)) |
2038 | goto start_req; | 2023 | goto start_req; |
2039 | read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); | 2024 | read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); |
2040 | sk = sk_next(sk); | 2025 | sk = sk_nulls_next(sk); |
2041 | } | 2026 | } |
2042 | get_sk: | 2027 | get_sk: |
2043 | sk_nulls_for_each_from(sk, node) { | 2028 | sk_nulls_for_each_from(sk, node) { |
2044 | if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { | 2029 | if (!net_eq(sock_net(sk), net)) |
2030 | continue; | ||
2031 | if (sk->sk_family == st->family) { | ||
2045 | cur = sk; | 2032 | cur = sk; |
2046 | goto out; | 2033 | goto out; |
2047 | } | 2034 | } |
@@ -2385,7 +2372,7 @@ static void get_openreq4(struct sock *sk, struct request_sock *req, | |||
2385 | int ttd = req->expires - jiffies; | 2372 | int ttd = req->expires - jiffies; |
2386 | 2373 | ||
2387 | seq_printf(f, "%4d: %08X:%04X %08X:%04X" | 2374 | seq_printf(f, "%4d: %08X:%04X %08X:%04X" |
2388 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n", | 2375 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", |
2389 | i, | 2376 | i, |
2390 | ireq->loc_addr, | 2377 | ireq->loc_addr, |
2391 | ntohs(inet_sk(sk)->inet_sport), | 2378 | ntohs(inet_sk(sk)->inet_sport), |
@@ -2440,7 +2427,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) | |||
2440 | rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); | 2427 | rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); |
2441 | 2428 | ||
2442 | seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " | 2429 | seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " |
2443 | "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n", | 2430 | "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n", |
2444 | i, src, srcp, dest, destp, sk->sk_state, | 2431 | i, src, srcp, dest, destp, sk->sk_state, |
2445 | tp->write_seq - tp->snd_una, | 2432 | tp->write_seq - tp->snd_una, |
2446 | rx_queue, | 2433 | rx_queue, |
@@ -2475,7 +2462,7 @@ static void get_timewait4_sock(struct inet_timewait_sock *tw, | |||
2475 | srcp = ntohs(tw->tw_sport); | 2462 | srcp = ntohs(tw->tw_sport); |
2476 | 2463 | ||
2477 | seq_printf(f, "%4d: %08X:%04X %08X:%04X" | 2464 | seq_printf(f, "%4d: %08X:%04X %08X:%04X" |
2478 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n", | 2465 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n", |
2479 | i, src, srcp, dest, destp, tw->tw_substate, 0, 0, | 2466 | i, src, srcp, dest, destp, tw->tw_substate, 0, 0, |
2480 | 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, | 2467 | 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, |
2481 | atomic_read(&tw->tw_refcnt), tw, len); | 2468 | atomic_read(&tw->tw_refcnt), tw, len); |
@@ -2553,7 +2540,7 @@ void tcp4_proc_exit(void) | |||
2553 | 2540 | ||
2554 | struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) | 2541 | struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) |
2555 | { | 2542 | { |
2556 | struct iphdr *iph = skb_gro_network_header(skb); | 2543 | const struct iphdr *iph = skb_gro_network_header(skb); |
2557 | 2544 | ||
2558 | switch (skb->ip_summed) { | 2545 | switch (skb->ip_summed) { |
2559 | case CHECKSUM_COMPLETE: | 2546 | case CHECKSUM_COMPLETE: |
@@ -2571,11 +2558,10 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) | |||
2571 | 2558 | ||
2572 | return tcp_gro_receive(head, skb); | 2559 | return tcp_gro_receive(head, skb); |
2573 | } | 2560 | } |
2574 | EXPORT_SYMBOL(tcp4_gro_receive); | ||
2575 | 2561 | ||
2576 | int tcp4_gro_complete(struct sk_buff *skb) | 2562 | int tcp4_gro_complete(struct sk_buff *skb) |
2577 | { | 2563 | { |
2578 | struct iphdr *iph = ip_hdr(skb); | 2564 | const struct iphdr *iph = ip_hdr(skb); |
2579 | struct tcphdr *th = tcp_hdr(skb); | 2565 | struct tcphdr *th = tcp_hdr(skb); |
2580 | 2566 | ||
2581 | th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb), | 2567 | th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb), |
@@ -2584,7 +2570,6 @@ int tcp4_gro_complete(struct sk_buff *skb) | |||
2584 | 2570 | ||
2585 | return tcp_gro_complete(skb); | 2571 | return tcp_gro_complete(skb); |
2586 | } | 2572 | } |
2587 | EXPORT_SYMBOL(tcp4_gro_complete); | ||
2588 | 2573 | ||
2589 | struct proto tcp_prot = { | 2574 | struct proto tcp_prot = { |
2590 | .name = "TCP", | 2575 | .name = "TCP", |
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index de870377fbba..72f7218b03f5 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c | |||
@@ -12,7 +12,7 @@ | |||
12 | * within cong_avoid. | 12 | * within cong_avoid. |
13 | * o Error correcting in remote HZ, therefore remote HZ will be keeped | 13 | * o Error correcting in remote HZ, therefore remote HZ will be keeped |
14 | * on checking and updating. | 14 | * on checking and updating. |
15 | * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, sicne | 15 | * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, since |
16 | * OWD have a similar meaning as RTT. Also correct the buggy formular. | 16 | * OWD have a similar meaning as RTT. Also correct the buggy formular. |
17 | * o Handle reaction for Early Congestion Indication (ECI) within | 17 | * o Handle reaction for Early Congestion Indication (ECI) within |
18 | * pkts_acked, as mentioned within pseudo code. | 18 | * pkts_acked, as mentioned within pseudo code. |
@@ -313,7 +313,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us) | |||
313 | lp->last_drop = tcp_time_stamp; | 313 | lp->last_drop = tcp_time_stamp; |
314 | } | 314 | } |
315 | 315 | ||
316 | static struct tcp_congestion_ops tcp_lp = { | 316 | static struct tcp_congestion_ops tcp_lp __read_mostly = { |
317 | .flags = TCP_CONG_RTT_STAMP, | 317 | .flags = TCP_CONG_RTT_STAMP, |
318 | .init = tcp_lp_init, | 318 | .init = tcp_lp_init, |
319 | .ssthresh = tcp_reno_ssthresh, | 319 | .ssthresh = tcp_reno_ssthresh, |
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f25b56cb85cb..80b1f80759ab 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -49,13 +49,63 @@ struct inet_timewait_death_row tcp_death_row = { | |||
49 | }; | 49 | }; |
50 | EXPORT_SYMBOL_GPL(tcp_death_row); | 50 | EXPORT_SYMBOL_GPL(tcp_death_row); |
51 | 51 | ||
52 | /* VJ's idea. Save last timestamp seen from this destination | ||
53 | * and hold it at least for normal timewait interval to use for duplicate | ||
54 | * segment detection in subsequent connections, before they enter synchronized | ||
55 | * state. | ||
56 | */ | ||
57 | |||
58 | static int tcp_remember_stamp(struct sock *sk) | ||
59 | { | ||
60 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
61 | struct tcp_sock *tp = tcp_sk(sk); | ||
62 | struct inet_peer *peer; | ||
63 | bool release_it; | ||
64 | |||
65 | peer = icsk->icsk_af_ops->get_peer(sk, &release_it); | ||
66 | if (peer) { | ||
67 | if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || | ||
68 | ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && | ||
69 | peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { | ||
70 | peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; | ||
71 | peer->tcp_ts = tp->rx_opt.ts_recent; | ||
72 | } | ||
73 | if (release_it) | ||
74 | inet_putpeer(peer); | ||
75 | return 1; | ||
76 | } | ||
77 | |||
78 | return 0; | ||
79 | } | ||
80 | |||
81 | static int tcp_tw_remember_stamp(struct inet_timewait_sock *tw) | ||
82 | { | ||
83 | struct sock *sk = (struct sock *) tw; | ||
84 | struct inet_peer *peer; | ||
85 | |||
86 | peer = twsk_getpeer(sk); | ||
87 | if (peer) { | ||
88 | const struct tcp_timewait_sock *tcptw = tcp_twsk(sk); | ||
89 | |||
90 | if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || | ||
91 | ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && | ||
92 | peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { | ||
93 | peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; | ||
94 | peer->tcp_ts = tcptw->tw_ts_recent; | ||
95 | } | ||
96 | inet_putpeer(peer); | ||
97 | return 1; | ||
98 | } | ||
99 | return 0; | ||
100 | } | ||
101 | |||
52 | static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) | 102 | static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) |
53 | { | 103 | { |
54 | if (seq == s_win) | 104 | if (seq == s_win) |
55 | return 1; | 105 | return 1; |
56 | if (after(end_seq, s_win) && before(seq, e_win)) | 106 | if (after(end_seq, s_win) && before(seq, e_win)) |
57 | return 1; | 107 | return 1; |
58 | return (seq == e_win && seq == end_seq); | 108 | return seq == e_win && seq == end_seq; |
59 | } | 109 | } |
60 | 110 | ||
61 | /* | 111 | /* |
@@ -149,14 +199,9 @@ kill_with_rst: | |||
149 | tcptw->tw_ts_recent = tmp_opt.rcv_tsval; | 199 | tcptw->tw_ts_recent = tmp_opt.rcv_tsval; |
150 | } | 200 | } |
151 | 201 | ||
152 | /* I am shamed, but failed to make it more elegant. | 202 | if (tcp_death_row.sysctl_tw_recycle && |
153 | * Yes, it is direct reference to IP, which is impossible | 203 | tcptw->tw_ts_recent_stamp && |
154 | * to generalize to IPv6. Taking into account that IPv6 | 204 | tcp_tw_remember_stamp(tw)) |
155 | * do not understand recycling in any case, it not | ||
156 | * a big problem in practice. --ANK */ | ||
157 | if (tw->tw_family == AF_INET && | ||
158 | tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && | ||
159 | tcp_v4_tw_remember_stamp(tw)) | ||
160 | inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, | 205 | inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, |
161 | TCP_TIMEWAIT_LEN); | 206 | TCP_TIMEWAIT_LEN); |
162 | else | 207 | else |
@@ -274,7 +319,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) | |||
274 | int recycle_ok = 0; | 319 | int recycle_ok = 0; |
275 | 320 | ||
276 | if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) | 321 | if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) |
277 | recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); | 322 | recycle_ok = tcp_remember_stamp(sk); |
278 | 323 | ||
279 | if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) | 324 | if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) |
280 | tw = inet_twsk_alloc(sk, state); | 325 | tw = inet_twsk_alloc(sk, state); |
@@ -347,7 +392,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) | |||
347 | * socket up. We've got bigger problems than | 392 | * socket up. We've got bigger problems than |
348 | * non-graceful socket closings. | 393 | * non-graceful socket closings. |
349 | */ | 394 | */ |
350 | LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n"); | 395 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW); |
351 | } | 396 | } |
352 | 397 | ||
353 | tcp_update_metrics(sk); | 398 | tcp_update_metrics(sk); |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index de3bd8458588..882e0b0964d0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -55,7 +55,7 @@ int sysctl_tcp_workaround_signed_windows __read_mostly = 0; | |||
55 | int sysctl_tcp_tso_win_divisor __read_mostly = 3; | 55 | int sysctl_tcp_tso_win_divisor __read_mostly = 3; |
56 | 56 | ||
57 | int sysctl_tcp_mtu_probing __read_mostly = 0; | 57 | int sysctl_tcp_mtu_probing __read_mostly = 0; |
58 | int sysctl_tcp_base_mss __read_mostly = 512; | 58 | int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; |
59 | 59 | ||
60 | /* By default, RFC2861 behavior. */ | 60 | /* By default, RFC2861 behavior. */ |
61 | int sysctl_tcp_slow_start_after_idle __read_mostly = 1; | 61 | int sysctl_tcp_slow_start_after_idle __read_mostly = 1; |
@@ -73,7 +73,7 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) | |||
73 | tcp_advance_send_head(sk, skb); | 73 | tcp_advance_send_head(sk, skb); |
74 | tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; | 74 | tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; |
75 | 75 | ||
76 | /* Don't override Nagle indefinately with F-RTO */ | 76 | /* Don't override Nagle indefinitely with F-RTO */ |
77 | if (tp->frto_counter == 2) | 77 | if (tp->frto_counter == 2) |
78 | tp->frto_counter = 3; | 78 | tp->frto_counter = 3; |
79 | 79 | ||
@@ -119,9 +119,13 @@ static __u16 tcp_advertise_mss(struct sock *sk) | |||
119 | struct dst_entry *dst = __sk_dst_get(sk); | 119 | struct dst_entry *dst = __sk_dst_get(sk); |
120 | int mss = tp->advmss; | 120 | int mss = tp->advmss; |
121 | 121 | ||
122 | if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) { | 122 | if (dst) { |
123 | mss = dst_metric(dst, RTAX_ADVMSS); | 123 | unsigned int metric = dst_metric_advmss(dst); |
124 | tp->advmss = mss; | 124 | |
125 | if (metric < mss) { | ||
126 | mss = metric; | ||
127 | tp->advmss = mss; | ||
128 | } | ||
125 | } | 129 | } |
126 | 130 | ||
127 | return (__u16)mss; | 131 | return (__u16)mss; |
@@ -224,24 +228,22 @@ void tcp_select_initial_window(int __space, __u32 mss, | |||
224 | } | 228 | } |
225 | } | 229 | } |
226 | 230 | ||
227 | /* Set initial window to value enough for senders, | 231 | /* Set initial window to a value enough for senders starting with |
228 | * following RFC2414. Senders, not following this RFC, | 232 | * initial congestion window of TCP_DEFAULT_INIT_RCVWND. Place |
229 | * will be satisfied with 2. | 233 | * a limit on the initial window when mss is larger than 1460. |
230 | */ | 234 | */ |
231 | if (mss > (1 << *rcv_wscale)) { | 235 | if (mss > (1 << *rcv_wscale)) { |
232 | int init_cwnd = 4; | 236 | int init_cwnd = TCP_DEFAULT_INIT_RCVWND; |
233 | if (mss > 1460 * 3) | 237 | if (mss > 1460) |
234 | init_cwnd = 2; | 238 | init_cwnd = |
235 | else if (mss > 1460) | 239 | max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2); |
236 | init_cwnd = 3; | ||
237 | /* when initializing use the value from init_rcv_wnd | 240 | /* when initializing use the value from init_rcv_wnd |
238 | * rather than the default from above | 241 | * rather than the default from above |
239 | */ | 242 | */ |
240 | if (init_rcv_wnd && | 243 | if (init_rcv_wnd) |
241 | (*rcv_wnd > init_rcv_wnd * mss)) | 244 | *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); |
242 | *rcv_wnd = init_rcv_wnd * mss; | 245 | else |
243 | else if (*rcv_wnd > init_cwnd * mss) | 246 | *rcv_wnd = min(*rcv_wnd, init_cwnd * mss); |
244 | *rcv_wnd = init_cwnd * mss; | ||
245 | } | 247 | } |
246 | 248 | ||
247 | /* Set the clamp no higher than max representable value */ | 249 | /* Set the clamp no higher than max representable value */ |
@@ -392,27 +394,30 @@ struct tcp_out_options { | |||
392 | */ | 394 | */ |
393 | static u8 tcp_cookie_size_check(u8 desired) | 395 | static u8 tcp_cookie_size_check(u8 desired) |
394 | { | 396 | { |
395 | if (desired > 0) { | 397 | int cookie_size; |
398 | |||
399 | if (desired > 0) | ||
396 | /* previously specified */ | 400 | /* previously specified */ |
397 | return desired; | 401 | return desired; |
398 | } | 402 | |
399 | if (sysctl_tcp_cookie_size <= 0) { | 403 | cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size); |
404 | if (cookie_size <= 0) | ||
400 | /* no default specified */ | 405 | /* no default specified */ |
401 | return 0; | 406 | return 0; |
402 | } | 407 | |
403 | if (sysctl_tcp_cookie_size <= TCP_COOKIE_MIN) { | 408 | if (cookie_size <= TCP_COOKIE_MIN) |
404 | /* value too small, specify minimum */ | 409 | /* value too small, specify minimum */ |
405 | return TCP_COOKIE_MIN; | 410 | return TCP_COOKIE_MIN; |
406 | } | 411 | |
407 | if (sysctl_tcp_cookie_size >= TCP_COOKIE_MAX) { | 412 | if (cookie_size >= TCP_COOKIE_MAX) |
408 | /* value too large, specify maximum */ | 413 | /* value too large, specify maximum */ |
409 | return TCP_COOKIE_MAX; | 414 | return TCP_COOKIE_MAX; |
410 | } | 415 | |
411 | if (0x1 & sysctl_tcp_cookie_size) { | 416 | if (cookie_size & 1) |
412 | /* 8-bit multiple, illegal, fix it */ | 417 | /* 8-bit multiple, illegal, fix it */ |
413 | return (u8)(sysctl_tcp_cookie_size + 0x1); | 418 | cookie_size++; |
414 | } | 419 | |
415 | return (u8)sysctl_tcp_cookie_size; | 420 | return (u8)cookie_size; |
416 | } | 421 | } |
417 | 422 | ||
418 | /* Write previously computed TCP options to the packet. | 423 | /* Write previously computed TCP options to the packet. |
@@ -828,8 +833,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
828 | &md5); | 833 | &md5); |
829 | tcp_header_size = tcp_options_size + sizeof(struct tcphdr); | 834 | tcp_header_size = tcp_options_size + sizeof(struct tcphdr); |
830 | 835 | ||
831 | if (tcp_packets_in_flight(tp) == 0) | 836 | if (tcp_packets_in_flight(tp) == 0) { |
832 | tcp_ca_event(sk, CA_EVENT_TX_START); | 837 | tcp_ca_event(sk, CA_EVENT_TX_START); |
838 | skb->ooo_okay = 1; | ||
839 | } else | ||
840 | skb->ooo_okay = 0; | ||
833 | 841 | ||
834 | skb_push(skb, tcp_header_size); | 842 | skb_push(skb, tcp_header_size); |
835 | skb_reset_transport_header(skb); | 843 | skb_reset_transport_header(skb); |
@@ -891,7 +899,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
891 | TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, | 899 | TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, |
892 | tcp_skb_pcount(skb)); | 900 | tcp_skb_pcount(skb)); |
893 | 901 | ||
894 | err = icsk->icsk_af_ops->queue_xmit(skb); | 902 | err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); |
895 | if (likely(err <= 0)) | 903 | if (likely(err <= 0)) |
896 | return err; | 904 | return err; |
897 | 905 | ||
@@ -995,7 +1003,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, | |||
995 | int nlen; | 1003 | int nlen; |
996 | u8 flags; | 1004 | u8 flags; |
997 | 1005 | ||
998 | BUG_ON(len > skb->len); | 1006 | if (WARN_ON(len > skb->len)) |
1007 | return -EINVAL; | ||
999 | 1008 | ||
1000 | nsize = skb_headlen(skb) - len; | 1009 | nsize = skb_headlen(skb) - len; |
1001 | if (nsize < 0) | 1010 | if (nsize < 0) |
@@ -1342,7 +1351,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, | |||
1342 | return 0; | 1351 | return 0; |
1343 | } | 1352 | } |
1344 | 1353 | ||
1345 | /* Intialize TSO state of a skb. | 1354 | /* Initialize TSO state of a skb. |
1346 | * This must be invoked the first time we consider transmitting | 1355 | * This must be invoked the first time we consider transmitting |
1347 | * SKB onto the wire. | 1356 | * SKB onto the wire. |
1348 | */ | 1357 | */ |
@@ -1376,9 +1385,9 @@ static inline int tcp_nagle_check(const struct tcp_sock *tp, | |||
1376 | const struct sk_buff *skb, | 1385 | const struct sk_buff *skb, |
1377 | unsigned mss_now, int nonagle) | 1386 | unsigned mss_now, int nonagle) |
1378 | { | 1387 | { |
1379 | return (skb->len < mss_now && | 1388 | return skb->len < mss_now && |
1380 | ((nonagle & TCP_NAGLE_CORK) || | 1389 | ((nonagle & TCP_NAGLE_CORK) || |
1381 | (!nonagle && tp->packets_out && tcp_minshall_check(tp)))); | 1390 | (!nonagle && tp->packets_out && tcp_minshall_check(tp))); |
1382 | } | 1391 | } |
1383 | 1392 | ||
1384 | /* Return non-zero if the Nagle test allows this packet to be | 1393 | /* Return non-zero if the Nagle test allows this packet to be |
@@ -1449,10 +1458,10 @@ int tcp_may_send_now(struct sock *sk) | |||
1449 | struct tcp_sock *tp = tcp_sk(sk); | 1458 | struct tcp_sock *tp = tcp_sk(sk); |
1450 | struct sk_buff *skb = tcp_send_head(sk); | 1459 | struct sk_buff *skb = tcp_send_head(sk); |
1451 | 1460 | ||
1452 | return (skb && | 1461 | return skb && |
1453 | tcp_snd_test(sk, skb, tcp_current_mss(sk), | 1462 | tcp_snd_test(sk, skb, tcp_current_mss(sk), |
1454 | (tcp_skb_is_last(sk, skb) ? | 1463 | (tcp_skb_is_last(sk, skb) ? |
1455 | tp->nonagle : TCP_NAGLE_PUSH))); | 1464 | tp->nonagle : TCP_NAGLE_PUSH)); |
1456 | } | 1465 | } |
1457 | 1466 | ||
1458 | /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet | 1467 | /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet |
@@ -1519,6 +1528,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) | |||
1519 | struct tcp_sock *tp = tcp_sk(sk); | 1528 | struct tcp_sock *tp = tcp_sk(sk); |
1520 | const struct inet_connection_sock *icsk = inet_csk(sk); | 1529 | const struct inet_connection_sock *icsk = inet_csk(sk); |
1521 | u32 send_win, cong_win, limit, in_flight; | 1530 | u32 send_win, cong_win, limit, in_flight; |
1531 | int win_divisor; | ||
1522 | 1532 | ||
1523 | if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) | 1533 | if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) |
1524 | goto send_now; | 1534 | goto send_now; |
@@ -1550,13 +1560,14 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) | |||
1550 | if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) | 1560 | if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) |
1551 | goto send_now; | 1561 | goto send_now; |
1552 | 1562 | ||
1553 | if (sysctl_tcp_tso_win_divisor) { | 1563 | win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor); |
1564 | if (win_divisor) { | ||
1554 | u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); | 1565 | u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); |
1555 | 1566 | ||
1556 | /* If at least some fraction of a window is available, | 1567 | /* If at least some fraction of a window is available, |
1557 | * just use it. | 1568 | * just use it. |
1558 | */ | 1569 | */ |
1559 | chunk /= sysctl_tcp_tso_win_divisor; | 1570 | chunk /= win_divisor; |
1560 | if (limit >= chunk) | 1571 | if (limit >= chunk) |
1561 | goto send_now; | 1572 | goto send_now; |
1562 | } else { | 1573 | } else { |
@@ -2152,7 +2163,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
2152 | if (!tp->retrans_stamp) | 2163 | if (!tp->retrans_stamp) |
2153 | tp->retrans_stamp = TCP_SKB_CB(skb)->when; | 2164 | tp->retrans_stamp = TCP_SKB_CB(skb)->when; |
2154 | 2165 | ||
2155 | tp->undo_retrans++; | 2166 | tp->undo_retrans += tcp_skb_pcount(skb); |
2156 | 2167 | ||
2157 | /* snd_nxt is stored to detect loss of retransmitted segment, | 2168 | /* snd_nxt is stored to detect loss of retransmitted segment, |
2158 | * see tcp_input.c tcp_sacktag_write_queue(). | 2169 | * see tcp_input.c tcp_sacktag_write_queue(). |
@@ -2421,7 +2432,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2421 | 2432 | ||
2422 | skb_dst_set(skb, dst_clone(dst)); | 2433 | skb_dst_set(skb, dst_clone(dst)); |
2423 | 2434 | ||
2424 | mss = dst_metric(dst, RTAX_ADVMSS); | 2435 | mss = dst_metric_advmss(dst); |
2425 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) | 2436 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) |
2426 | mss = tp->rx_opt.user_mss; | 2437 | mss = tp->rx_opt.user_mss; |
2427 | 2438 | ||
@@ -2429,6 +2440,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2429 | __u8 rcv_wscale; | 2440 | __u8 rcv_wscale; |
2430 | /* Set this up on the first call only */ | 2441 | /* Set this up on the first call only */ |
2431 | req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); | 2442 | req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); |
2443 | |||
2444 | /* limit the window selection if the user enforce a smaller rx buffer */ | ||
2445 | if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && | ||
2446 | (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) | ||
2447 | req->window_clamp = tcp_full_space(sk); | ||
2448 | |||
2432 | /* tcp_full_space because it is guaranteed to be the first packet */ | 2449 | /* tcp_full_space because it is guaranteed to be the first packet */ |
2433 | tcp_select_initial_window(tcp_full_space(sk), | 2450 | tcp_select_initial_window(tcp_full_space(sk), |
2434 | mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), | 2451 | mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), |
@@ -2549,12 +2566,17 @@ static void tcp_connect_init(struct sock *sk) | |||
2549 | 2566 | ||
2550 | if (!tp->window_clamp) | 2567 | if (!tp->window_clamp) |
2551 | tp->window_clamp = dst_metric(dst, RTAX_WINDOW); | 2568 | tp->window_clamp = dst_metric(dst, RTAX_WINDOW); |
2552 | tp->advmss = dst_metric(dst, RTAX_ADVMSS); | 2569 | tp->advmss = dst_metric_advmss(dst); |
2553 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) | 2570 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) |
2554 | tp->advmss = tp->rx_opt.user_mss; | 2571 | tp->advmss = tp->rx_opt.user_mss; |
2555 | 2572 | ||
2556 | tcp_initialize_rcv_mss(sk); | 2573 | tcp_initialize_rcv_mss(sk); |
2557 | 2574 | ||
2575 | /* limit the window selection if the user enforce a smaller rx buffer */ | ||
2576 | if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && | ||
2577 | (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) | ||
2578 | tp->window_clamp = tcp_full_space(sk); | ||
2579 | |||
2558 | tcp_select_initial_window(tcp_full_space(sk), | 2580 | tcp_select_initial_window(tcp_full_space(sk), |
2559 | tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), | 2581 | tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), |
2560 | &tp->rcv_wnd, | 2582 | &tp->rcv_wnd, |
@@ -2587,6 +2609,7 @@ int tcp_connect(struct sock *sk) | |||
2587 | { | 2609 | { |
2588 | struct tcp_sock *tp = tcp_sk(sk); | 2610 | struct tcp_sock *tp = tcp_sk(sk); |
2589 | struct sk_buff *buff; | 2611 | struct sk_buff *buff; |
2612 | int err; | ||
2590 | 2613 | ||
2591 | tcp_connect_init(sk); | 2614 | tcp_connect_init(sk); |
2592 | 2615 | ||
@@ -2609,7 +2632,9 @@ int tcp_connect(struct sock *sk) | |||
2609 | sk->sk_wmem_queued += buff->truesize; | 2632 | sk->sk_wmem_queued += buff->truesize; |
2610 | sk_mem_charge(sk, buff->truesize); | 2633 | sk_mem_charge(sk, buff->truesize); |
2611 | tp->packets_out += tcp_skb_pcount(buff); | 2634 | tp->packets_out += tcp_skb_pcount(buff); |
2612 | tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); | 2635 | err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); |
2636 | if (err == -ECONNREFUSED) | ||
2637 | return err; | ||
2613 | 2638 | ||
2614 | /* We change tp->snd_nxt after the tcp_transmit_skb() call | 2639 | /* We change tp->snd_nxt after the tcp_transmit_skb() call |
2615 | * in order to make this packet get counted in tcpOutSegs. | 2640 | * in order to make this packet get counted in tcpOutSegs. |
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index f8efada580e8..85ee7eb7e38e 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c | |||
@@ -154,7 +154,7 @@ static int tcpprobe_sprint(char *tbuf, int n) | |||
154 | struct timespec tv | 154 | struct timespec tv |
155 | = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); | 155 | = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); |
156 | 156 | ||
157 | return snprintf(tbuf, n, | 157 | return scnprintf(tbuf, n, |
158 | "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n", | 158 | "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n", |
159 | (unsigned long) tv.tv_sec, | 159 | (unsigned long) tv.tv_sec, |
160 | (unsigned long) tv.tv_nsec, | 160 | (unsigned long) tv.tv_nsec, |
@@ -174,7 +174,7 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf, | |||
174 | return -EINVAL; | 174 | return -EINVAL; |
175 | 175 | ||
176 | while (cnt < len) { | 176 | while (cnt < len) { |
177 | char tbuf[128]; | 177 | char tbuf[164]; |
178 | int width; | 178 | int width; |
179 | 179 | ||
180 | /* Wait for data in buffer */ | 180 | /* Wait for data in buffer */ |
@@ -214,6 +214,7 @@ static const struct file_operations tcpprobe_fops = { | |||
214 | .owner = THIS_MODULE, | 214 | .owner = THIS_MODULE, |
215 | .open = tcpprobe_open, | 215 | .open = tcpprobe_open, |
216 | .read = tcpprobe_read, | 216 | .read = tcpprobe_read, |
217 | .llseek = noop_llseek, | ||
217 | }; | 218 | }; |
218 | 219 | ||
219 | static __init int tcpprobe_init(void) | 220 | static __init int tcpprobe_init(void) |
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index a76513779e2b..8ce55b8aaec8 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c | |||
@@ -35,7 +35,7 @@ static u32 tcp_scalable_ssthresh(struct sock *sk) | |||
35 | } | 35 | } |
36 | 36 | ||
37 | 37 | ||
38 | static struct tcp_congestion_ops tcp_scalable = { | 38 | static struct tcp_congestion_ops tcp_scalable __read_mostly = { |
39 | .ssthresh = tcp_scalable_ssthresh, | 39 | .ssthresh = tcp_scalable_ssthresh, |
40 | .cong_avoid = tcp_scalable_cong_avoid, | 40 | .cong_avoid = tcp_scalable_cong_avoid, |
41 | .min_cwnd = tcp_reno_min_cwnd, | 41 | .min_cwnd = tcp_reno_min_cwnd, |
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 74c54b30600f..ecd44b0c45f1 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c | |||
@@ -140,10 +140,10 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) | |||
140 | */ | 140 | */ |
141 | static bool retransmits_timed_out(struct sock *sk, | 141 | static bool retransmits_timed_out(struct sock *sk, |
142 | unsigned int boundary, | 142 | unsigned int boundary, |
143 | unsigned int timeout, | ||
143 | bool syn_set) | 144 | bool syn_set) |
144 | { | 145 | { |
145 | unsigned int timeout, linear_backoff_thresh; | 146 | unsigned int linear_backoff_thresh, start_ts; |
146 | unsigned int start_ts; | ||
147 | unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; | 147 | unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; |
148 | 148 | ||
149 | if (!inet_csk(sk)->icsk_retransmits) | 149 | if (!inet_csk(sk)->icsk_retransmits) |
@@ -154,14 +154,15 @@ static bool retransmits_timed_out(struct sock *sk, | |||
154 | else | 154 | else |
155 | start_ts = tcp_sk(sk)->retrans_stamp; | 155 | start_ts = tcp_sk(sk)->retrans_stamp; |
156 | 156 | ||
157 | linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); | 157 | if (likely(timeout == 0)) { |
158 | 158 | linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); | |
159 | if (boundary <= linear_backoff_thresh) | ||
160 | timeout = ((2 << boundary) - 1) * rto_base; | ||
161 | else | ||
162 | timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + | ||
163 | (boundary - linear_backoff_thresh) * TCP_RTO_MAX; | ||
164 | 159 | ||
160 | if (boundary <= linear_backoff_thresh) | ||
161 | timeout = ((2 << boundary) - 1) * rto_base; | ||
162 | else | ||
163 | timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + | ||
164 | (boundary - linear_backoff_thresh) * TCP_RTO_MAX; | ||
165 | } | ||
165 | return (tcp_time_stamp - start_ts) >= timeout; | 166 | return (tcp_time_stamp - start_ts) >= timeout; |
166 | } | 167 | } |
167 | 168 | ||
@@ -178,7 +179,7 @@ static int tcp_write_timeout(struct sock *sk) | |||
178 | retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; | 179 | retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; |
179 | syn_set = 1; | 180 | syn_set = 1; |
180 | } else { | 181 | } else { |
181 | if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0)) { | 182 | if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { |
182 | /* Black hole detection */ | 183 | /* Black hole detection */ |
183 | tcp_mtu_probing(icsk, sk); | 184 | tcp_mtu_probing(icsk, sk); |
184 | 185 | ||
@@ -191,14 +192,15 @@ static int tcp_write_timeout(struct sock *sk) | |||
191 | 192 | ||
192 | retry_until = tcp_orphan_retries(sk, alive); | 193 | retry_until = tcp_orphan_retries(sk, alive); |
193 | do_reset = alive || | 194 | do_reset = alive || |
194 | !retransmits_timed_out(sk, retry_until, 0); | 195 | !retransmits_timed_out(sk, retry_until, 0, 0); |
195 | 196 | ||
196 | if (tcp_out_of_resources(sk, do_reset)) | 197 | if (tcp_out_of_resources(sk, do_reset)) |
197 | return 1; | 198 | return 1; |
198 | } | 199 | } |
199 | } | 200 | } |
200 | 201 | ||
201 | if (retransmits_timed_out(sk, retry_until, syn_set)) { | 202 | if (retransmits_timed_out(sk, retry_until, |
203 | syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) { | ||
202 | /* Has it gone just too far? */ | 204 | /* Has it gone just too far? */ |
203 | tcp_write_err(sk); | 205 | tcp_write_err(sk); |
204 | return 1; | 206 | return 1; |
@@ -257,7 +259,6 @@ static void tcp_delack_timer(unsigned long data) | |||
257 | tcp_send_ack(sk); | 259 | tcp_send_ack(sk); |
258 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS); | 260 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS); |
259 | } | 261 | } |
260 | TCP_CHECK_TIMER(sk); | ||
261 | 262 | ||
262 | out: | 263 | out: |
263 | if (tcp_memory_pressure) | 264 | if (tcp_memory_pressure) |
@@ -365,18 +366,19 @@ void tcp_retransmit_timer(struct sock *sk) | |||
365 | if (icsk->icsk_retransmits == 0) { | 366 | if (icsk->icsk_retransmits == 0) { |
366 | int mib_idx; | 367 | int mib_idx; |
367 | 368 | ||
368 | if (icsk->icsk_ca_state == TCP_CA_Disorder) { | 369 | if (icsk->icsk_ca_state == TCP_CA_Recovery) { |
369 | if (tcp_is_sack(tp)) | ||
370 | mib_idx = LINUX_MIB_TCPSACKFAILURES; | ||
371 | else | ||
372 | mib_idx = LINUX_MIB_TCPRENOFAILURES; | ||
373 | } else if (icsk->icsk_ca_state == TCP_CA_Recovery) { | ||
374 | if (tcp_is_sack(tp)) | 370 | if (tcp_is_sack(tp)) |
375 | mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; | 371 | mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; |
376 | else | 372 | else |
377 | mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; | 373 | mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; |
378 | } else if (icsk->icsk_ca_state == TCP_CA_Loss) { | 374 | } else if (icsk->icsk_ca_state == TCP_CA_Loss) { |
379 | mib_idx = LINUX_MIB_TCPLOSSFAILURES; | 375 | mib_idx = LINUX_MIB_TCPLOSSFAILURES; |
376 | } else if ((icsk->icsk_ca_state == TCP_CA_Disorder) || | ||
377 | tp->sacked_out) { | ||
378 | if (tcp_is_sack(tp)) | ||
379 | mib_idx = LINUX_MIB_TCPSACKFAILURES; | ||
380 | else | ||
381 | mib_idx = LINUX_MIB_TCPRENOFAILURES; | ||
380 | } else { | 382 | } else { |
381 | mib_idx = LINUX_MIB_TCPTIMEOUTS; | 383 | mib_idx = LINUX_MIB_TCPTIMEOUTS; |
382 | } | 384 | } |
@@ -440,7 +442,7 @@ out_reset_timer: | |||
440 | icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); | 442 | icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); |
441 | } | 443 | } |
442 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); | 444 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); |
443 | if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0)) | 445 | if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) |
444 | __sk_dst_reset(sk); | 446 | __sk_dst_reset(sk); |
445 | 447 | ||
446 | out:; | 448 | out:; |
@@ -478,7 +480,6 @@ static void tcp_write_timer(unsigned long data) | |||
478 | tcp_probe_timer(sk); | 480 | tcp_probe_timer(sk); |
479 | break; | 481 | break; |
480 | } | 482 | } |
481 | TCP_CHECK_TIMER(sk); | ||
482 | 483 | ||
483 | out: | 484 | out: |
484 | sk_mem_reclaim(sk); | 485 | sk_mem_reclaim(sk); |
@@ -560,7 +561,14 @@ static void tcp_keepalive_timer (unsigned long data) | |||
560 | elapsed = keepalive_time_elapsed(tp); | 561 | elapsed = keepalive_time_elapsed(tp); |
561 | 562 | ||
562 | if (elapsed >= keepalive_time_when(tp)) { | 563 | if (elapsed >= keepalive_time_when(tp)) { |
563 | if (icsk->icsk_probes_out >= keepalive_probes(tp)) { | 564 | /* If the TCP_USER_TIMEOUT option is enabled, use that |
565 | * to determine when to timeout instead. | ||
566 | */ | ||
567 | if ((icsk->icsk_user_timeout != 0 && | ||
568 | elapsed >= icsk->icsk_user_timeout && | ||
569 | icsk->icsk_probes_out > 0) || | ||
570 | (icsk->icsk_user_timeout == 0 && | ||
571 | icsk->icsk_probes_out >= keepalive_probes(tp))) { | ||
564 | tcp_send_active_reset(sk, GFP_ATOMIC); | 572 | tcp_send_active_reset(sk, GFP_ATOMIC); |
565 | tcp_write_err(sk); | 573 | tcp_write_err(sk); |
566 | goto out; | 574 | goto out; |
@@ -579,7 +587,6 @@ static void tcp_keepalive_timer (unsigned long data) | |||
579 | elapsed = keepalive_time_when(tp) - elapsed; | 587 | elapsed = keepalive_time_when(tp) - elapsed; |
580 | } | 588 | } |
581 | 589 | ||
582 | TCP_CHECK_TIMER(sk); | ||
583 | sk_mem_reclaim(sk); | 590 | sk_mem_reclaim(sk); |
584 | 591 | ||
585 | resched: | 592 | resched: |
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index c6743eec9b7d..80fa2bfd7ede 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c | |||
@@ -304,7 +304,7 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) | |||
304 | } | 304 | } |
305 | EXPORT_SYMBOL_GPL(tcp_vegas_get_info); | 305 | EXPORT_SYMBOL_GPL(tcp_vegas_get_info); |
306 | 306 | ||
307 | static struct tcp_congestion_ops tcp_vegas = { | 307 | static struct tcp_congestion_ops tcp_vegas __read_mostly = { |
308 | .flags = TCP_CONG_RTT_STAMP, | 308 | .flags = TCP_CONG_RTT_STAMP, |
309 | .init = tcp_vegas_init, | 309 | .init = tcp_vegas_init, |
310 | .ssthresh = tcp_reno_ssthresh, | 310 | .ssthresh = tcp_reno_ssthresh, |
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index b612acf76183..ac43cd747bce 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c | |||
@@ -6,7 +6,7 @@ | |||
6 | * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." | 6 | * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." |
7 | * IEEE Journal on Selected Areas in Communication, | 7 | * IEEE Journal on Selected Areas in Communication, |
8 | * Feb. 2003. | 8 | * Feb. 2003. |
9 | * See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf | 9 | * See http://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
@@ -201,7 +201,7 @@ static u32 tcp_veno_ssthresh(struct sock *sk) | |||
201 | return max(tp->snd_cwnd >> 1U, 2U); | 201 | return max(tp->snd_cwnd >> 1U, 2U); |
202 | } | 202 | } |
203 | 203 | ||
204 | static struct tcp_congestion_ops tcp_veno = { | 204 | static struct tcp_congestion_ops tcp_veno __read_mostly = { |
205 | .flags = TCP_CONG_RTT_STAMP, | 205 | .flags = TCP_CONG_RTT_STAMP, |
206 | .init = tcp_veno_init, | 206 | .init = tcp_veno_init, |
207 | .ssthresh = tcp_veno_ssthresh, | 207 | .ssthresh = tcp_veno_ssthresh, |
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 20151d6a6241..1b91bf48e277 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c | |||
@@ -80,7 +80,7 @@ static void tcp_westwood_init(struct sock *sk) | |||
80 | */ | 80 | */ |
81 | static inline u32 westwood_do_filter(u32 a, u32 b) | 81 | static inline u32 westwood_do_filter(u32 a, u32 b) |
82 | { | 82 | { |
83 | return (((7 * a) + b) >> 3); | 83 | return ((7 * a) + b) >> 3; |
84 | } | 84 | } |
85 | 85 | ||
86 | static void westwood_filter(struct westwood *w, u32 delta) | 86 | static void westwood_filter(struct westwood *w, u32 delta) |
@@ -272,7 +272,7 @@ static void tcp_westwood_info(struct sock *sk, u32 ext, | |||
272 | } | 272 | } |
273 | 273 | ||
274 | 274 | ||
275 | static struct tcp_congestion_ops tcp_westwood = { | 275 | static struct tcp_congestion_ops tcp_westwood __read_mostly = { |
276 | .init = tcp_westwood_init, | 276 | .init = tcp_westwood_init, |
277 | .ssthresh = tcp_reno_ssthresh, | 277 | .ssthresh = tcp_reno_ssthresh, |
278 | .cong_avoid = tcp_reno_cong_avoid, | 278 | .cong_avoid = tcp_reno_cong_avoid, |
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index a0f240358892..05c3b6f0e8e1 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c | |||
@@ -20,7 +20,7 @@ | |||
20 | #define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss | 20 | #define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss |
21 | #define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion | 21 | #define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion |
22 | #define TCP_YEAH_PHY 8 //lin maximum delta from base | 22 | #define TCP_YEAH_PHY 8 //lin maximum delta from base |
23 | #define TCP_YEAH_RHO 16 //lin minumum number of consecutive rtt to consider competition on loss | 23 | #define TCP_YEAH_RHO 16 //lin minimum number of consecutive rtt to consider competition on loss |
24 | #define TCP_YEAH_ZETA 50 //lin minimum number of state switchs to reset reno_count | 24 | #define TCP_YEAH_ZETA 50 //lin minimum number of state switchs to reset reno_count |
25 | 25 | ||
26 | #define TCP_SCALABLE_AI_CNT 100U | 26 | #define TCP_SCALABLE_AI_CNT 100U |
@@ -225,7 +225,7 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) { | |||
225 | return tp->snd_cwnd - reduction; | 225 | return tp->snd_cwnd - reduction; |
226 | } | 226 | } |
227 | 227 | ||
228 | static struct tcp_congestion_ops tcp_yeah = { | 228 | static struct tcp_congestion_ops tcp_yeah __read_mostly = { |
229 | .flags = TCP_CONG_RTT_STAMP, | 229 | .flags = TCP_CONG_RTT_STAMP, |
230 | .init = tcp_yeah_init, | 230 | .init = tcp_yeah_init, |
231 | .ssthresh = tcp_yeah_ssthresh, | 231 | .ssthresh = tcp_yeah_ssthresh, |
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index 59186ca7808a..ac3b3ee4b07c 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c | |||
@@ -14,32 +14,37 @@ | |||
14 | #include <net/protocol.h> | 14 | #include <net/protocol.h> |
15 | #include <net/xfrm.h> | 15 | #include <net/xfrm.h> |
16 | 16 | ||
17 | static struct xfrm_tunnel *tunnel4_handlers; | 17 | static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly; |
18 | static struct xfrm_tunnel *tunnel64_handlers; | 18 | static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly; |
19 | static DEFINE_MUTEX(tunnel4_mutex); | 19 | static DEFINE_MUTEX(tunnel4_mutex); |
20 | 20 | ||
21 | static inline struct xfrm_tunnel **fam_handlers(unsigned short family) | 21 | static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family) |
22 | { | 22 | { |
23 | return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers; | 23 | return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers; |
24 | } | 24 | } |
25 | 25 | ||
26 | int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) | 26 | int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) |
27 | { | 27 | { |
28 | struct xfrm_tunnel **pprev; | 28 | struct xfrm_tunnel __rcu **pprev; |
29 | struct xfrm_tunnel *t; | ||
30 | |||
29 | int ret = -EEXIST; | 31 | int ret = -EEXIST; |
30 | int priority = handler->priority; | 32 | int priority = handler->priority; |
31 | 33 | ||
32 | mutex_lock(&tunnel4_mutex); | 34 | mutex_lock(&tunnel4_mutex); |
33 | 35 | ||
34 | for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) { | 36 | for (pprev = fam_handlers(family); |
35 | if ((*pprev)->priority > priority) | 37 | (t = rcu_dereference_protected(*pprev, |
38 | lockdep_is_held(&tunnel4_mutex))) != NULL; | ||
39 | pprev = &t->next) { | ||
40 | if (t->priority > priority) | ||
36 | break; | 41 | break; |
37 | if ((*pprev)->priority == priority) | 42 | if (t->priority == priority) |
38 | goto err; | 43 | goto err; |
39 | } | 44 | } |
40 | 45 | ||
41 | handler->next = *pprev; | 46 | handler->next = *pprev; |
42 | *pprev = handler; | 47 | rcu_assign_pointer(*pprev, handler); |
43 | 48 | ||
44 | ret = 0; | 49 | ret = 0; |
45 | 50 | ||
@@ -52,13 +57,17 @@ EXPORT_SYMBOL(xfrm4_tunnel_register); | |||
52 | 57 | ||
53 | int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) | 58 | int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) |
54 | { | 59 | { |
55 | struct xfrm_tunnel **pprev; | 60 | struct xfrm_tunnel __rcu **pprev; |
61 | struct xfrm_tunnel *t; | ||
56 | int ret = -ENOENT; | 62 | int ret = -ENOENT; |
57 | 63 | ||
58 | mutex_lock(&tunnel4_mutex); | 64 | mutex_lock(&tunnel4_mutex); |
59 | 65 | ||
60 | for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) { | 66 | for (pprev = fam_handlers(family); |
61 | if (*pprev == handler) { | 67 | (t = rcu_dereference_protected(*pprev, |
68 | lockdep_is_held(&tunnel4_mutex))) != NULL; | ||
69 | pprev = &t->next) { | ||
70 | if (t == handler) { | ||
62 | *pprev = handler->next; | 71 | *pprev = handler->next; |
63 | ret = 0; | 72 | ret = 0; |
64 | break; | 73 | break; |
@@ -73,6 +82,11 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) | |||
73 | } | 82 | } |
74 | EXPORT_SYMBOL(xfrm4_tunnel_deregister); | 83 | EXPORT_SYMBOL(xfrm4_tunnel_deregister); |
75 | 84 | ||
85 | #define for_each_tunnel_rcu(head, handler) \ | ||
86 | for (handler = rcu_dereference(head); \ | ||
87 | handler != NULL; \ | ||
88 | handler = rcu_dereference(handler->next)) \ | ||
89 | |||
76 | static int tunnel4_rcv(struct sk_buff *skb) | 90 | static int tunnel4_rcv(struct sk_buff *skb) |
77 | { | 91 | { |
78 | struct xfrm_tunnel *handler; | 92 | struct xfrm_tunnel *handler; |
@@ -80,7 +94,7 @@ static int tunnel4_rcv(struct sk_buff *skb) | |||
80 | if (!pskb_may_pull(skb, sizeof(struct iphdr))) | 94 | if (!pskb_may_pull(skb, sizeof(struct iphdr))) |
81 | goto drop; | 95 | goto drop; |
82 | 96 | ||
83 | for (handler = tunnel4_handlers; handler; handler = handler->next) | 97 | for_each_tunnel_rcu(tunnel4_handlers, handler) |
84 | if (!handler->handler(skb)) | 98 | if (!handler->handler(skb)) |
85 | return 0; | 99 | return 0; |
86 | 100 | ||
@@ -99,7 +113,7 @@ static int tunnel64_rcv(struct sk_buff *skb) | |||
99 | if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) | 113 | if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) |
100 | goto drop; | 114 | goto drop; |
101 | 115 | ||
102 | for (handler = tunnel64_handlers; handler; handler = handler->next) | 116 | for_each_tunnel_rcu(tunnel64_handlers, handler) |
103 | if (!handler->handler(skb)) | 117 | if (!handler->handler(skb)) |
104 | return 0; | 118 | return 0; |
105 | 119 | ||
@@ -115,7 +129,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info) | |||
115 | { | 129 | { |
116 | struct xfrm_tunnel *handler; | 130 | struct xfrm_tunnel *handler; |
117 | 131 | ||
118 | for (handler = tunnel4_handlers; handler; handler = handler->next) | 132 | for_each_tunnel_rcu(tunnel4_handlers, handler) |
119 | if (!handler->err_handler(skb, info)) | 133 | if (!handler->err_handler(skb, info)) |
120 | break; | 134 | break; |
121 | } | 135 | } |
@@ -125,7 +139,7 @@ static void tunnel64_err(struct sk_buff *skb, u32 info) | |||
125 | { | 139 | { |
126 | struct xfrm_tunnel *handler; | 140 | struct xfrm_tunnel *handler; |
127 | 141 | ||
128 | for (handler = tunnel64_handlers; handler; handler = handler->next) | 142 | for_each_tunnel_rcu(tunnel64_handlers, handler) |
129 | if (!handler->err_handler(skb, info)) | 143 | if (!handler->err_handler(skb, info)) |
130 | break; | 144 | break; |
131 | } | 145 | } |
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index fb23c2e63b52..198f75b7bdd3 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
@@ -110,7 +110,7 @@ | |||
110 | struct udp_table udp_table __read_mostly; | 110 | struct udp_table udp_table __read_mostly; |
111 | EXPORT_SYMBOL(udp_table); | 111 | EXPORT_SYMBOL(udp_table); |
112 | 112 | ||
113 | int sysctl_udp_mem[3] __read_mostly; | 113 | long sysctl_udp_mem[3] __read_mostly; |
114 | EXPORT_SYMBOL(sysctl_udp_mem); | 114 | EXPORT_SYMBOL(sysctl_udp_mem); |
115 | 115 | ||
116 | int sysctl_udp_rmem_min __read_mostly; | 116 | int sysctl_udp_rmem_min __read_mostly; |
@@ -119,7 +119,7 @@ EXPORT_SYMBOL(sysctl_udp_rmem_min); | |||
119 | int sysctl_udp_wmem_min __read_mostly; | 119 | int sysctl_udp_wmem_min __read_mostly; |
120 | EXPORT_SYMBOL(sysctl_udp_wmem_min); | 120 | EXPORT_SYMBOL(sysctl_udp_wmem_min); |
121 | 121 | ||
122 | atomic_t udp_memory_allocated; | 122 | atomic_long_t udp_memory_allocated; |
123 | EXPORT_SYMBOL(udp_memory_allocated); | 123 | EXPORT_SYMBOL(udp_memory_allocated); |
124 | 124 | ||
125 | #define MAX_UDP_PORTS 65536 | 125 | #define MAX_UDP_PORTS 65536 |
@@ -189,7 +189,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, | |||
189 | * @sk: socket struct in question | 189 | * @sk: socket struct in question |
190 | * @snum: port number to look up | 190 | * @snum: port number to look up |
191 | * @saddr_comp: AF-dependent comparison of bound local IP addresses | 191 | * @saddr_comp: AF-dependent comparison of bound local IP addresses |
192 | * @hash2_nulladdr: AF-dependant hash value in secondary hash chains, | 192 | * @hash2_nulladdr: AF-dependent hash value in secondary hash chains, |
193 | * with NULL address | 193 | * with NULL address |
194 | */ | 194 | */ |
195 | int udp_lib_get_port(struct sock *sk, unsigned short snum, | 195 | int udp_lib_get_port(struct sock *sk, unsigned short snum, |
@@ -430,7 +430,7 @@ begin: | |||
430 | 430 | ||
431 | if (result) { | 431 | if (result) { |
432 | exact_match: | 432 | exact_match: |
433 | if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) | 433 | if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) |
434 | result = NULL; | 434 | result = NULL; |
435 | else if (unlikely(compute_score2(result, net, saddr, sport, | 435 | else if (unlikely(compute_score2(result, net, saddr, sport, |
436 | daddr, hnum, dif) < badness)) { | 436 | daddr, hnum, dif) < badness)) { |
@@ -500,7 +500,7 @@ begin: | |||
500 | goto begin; | 500 | goto begin; |
501 | 501 | ||
502 | if (result) { | 502 | if (result) { |
503 | if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) | 503 | if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) |
504 | result = NULL; | 504 | result = NULL; |
505 | else if (unlikely(compute_score(result, net, saddr, hnum, sport, | 505 | else if (unlikely(compute_score(result, net, saddr, hnum, sport, |
506 | daddr, dport, dif) < badness)) { | 506 | daddr, dport, dif) < badness)) { |
@@ -578,7 +578,7 @@ found: | |||
578 | void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) | 578 | void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) |
579 | { | 579 | { |
580 | struct inet_sock *inet; | 580 | struct inet_sock *inet; |
581 | struct iphdr *iph = (struct iphdr *)skb->data; | 581 | const struct iphdr *iph = (const struct iphdr *)skb->data; |
582 | struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2)); | 582 | struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2)); |
583 | const int type = icmp_hdr(skb)->type; | 583 | const int type = icmp_hdr(skb)->type; |
584 | const int code = icmp_hdr(skb)->code; | 584 | const int code = icmp_hdr(skb)->code; |
@@ -663,75 +663,71 @@ void udp_flush_pending_frames(struct sock *sk) | |||
663 | EXPORT_SYMBOL(udp_flush_pending_frames); | 663 | EXPORT_SYMBOL(udp_flush_pending_frames); |
664 | 664 | ||
665 | /** | 665 | /** |
666 | * udp4_hwcsum_outgoing - handle outgoing HW checksumming | 666 | * udp4_hwcsum - handle outgoing HW checksumming |
667 | * @sk: socket we are sending on | ||
668 | * @skb: sk_buff containing the filled-in UDP header | 667 | * @skb: sk_buff containing the filled-in UDP header |
669 | * (checksum field must be zeroed out) | 668 | * (checksum field must be zeroed out) |
669 | * @src: source IP address | ||
670 | * @dst: destination IP address | ||
670 | */ | 671 | */ |
671 | static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, | 672 | static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) |
672 | __be32 src, __be32 dst, int len) | ||
673 | { | 673 | { |
674 | unsigned int offset; | ||
675 | struct udphdr *uh = udp_hdr(skb); | 674 | struct udphdr *uh = udp_hdr(skb); |
675 | struct sk_buff *frags = skb_shinfo(skb)->frag_list; | ||
676 | int offset = skb_transport_offset(skb); | ||
677 | int len = skb->len - offset; | ||
678 | int hlen = len; | ||
676 | __wsum csum = 0; | 679 | __wsum csum = 0; |
677 | 680 | ||
678 | if (skb_queue_len(&sk->sk_write_queue) == 1) { | 681 | if (!frags) { |
679 | /* | 682 | /* |
680 | * Only one fragment on the socket. | 683 | * Only one fragment on the socket. |
681 | */ | 684 | */ |
682 | skb->csum_start = skb_transport_header(skb) - skb->head; | 685 | skb->csum_start = skb_transport_header(skb) - skb->head; |
683 | skb->csum_offset = offsetof(struct udphdr, check); | 686 | skb->csum_offset = offsetof(struct udphdr, check); |
684 | uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0); | 687 | uh->check = ~csum_tcpudp_magic(src, dst, len, |
688 | IPPROTO_UDP, 0); | ||
685 | } else { | 689 | } else { |
686 | /* | 690 | /* |
687 | * HW-checksum won't work as there are two or more | 691 | * HW-checksum won't work as there are two or more |
688 | * fragments on the socket so that all csums of sk_buffs | 692 | * fragments on the socket so that all csums of sk_buffs |
689 | * should be together | 693 | * should be together |
690 | */ | 694 | */ |
691 | offset = skb_transport_offset(skb); | 695 | do { |
692 | skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); | 696 | csum = csum_add(csum, frags->csum); |
697 | hlen -= frags->len; | ||
698 | } while ((frags = frags->next)); | ||
693 | 699 | ||
700 | csum = skb_checksum(skb, offset, hlen, csum); | ||
694 | skb->ip_summed = CHECKSUM_NONE; | 701 | skb->ip_summed = CHECKSUM_NONE; |
695 | 702 | ||
696 | skb_queue_walk(&sk->sk_write_queue, skb) { | ||
697 | csum = csum_add(csum, skb->csum); | ||
698 | } | ||
699 | |||
700 | uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum); | 703 | uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum); |
701 | if (uh->check == 0) | 704 | if (uh->check == 0) |
702 | uh->check = CSUM_MANGLED_0; | 705 | uh->check = CSUM_MANGLED_0; |
703 | } | 706 | } |
704 | } | 707 | } |
705 | 708 | ||
706 | /* | 709 | static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) |
707 | * Push out all pending data as one UDP datagram. Socket is locked. | ||
708 | */ | ||
709 | static int udp_push_pending_frames(struct sock *sk) | ||
710 | { | 710 | { |
711 | struct udp_sock *up = udp_sk(sk); | 711 | struct sock *sk = skb->sk; |
712 | struct inet_sock *inet = inet_sk(sk); | 712 | struct inet_sock *inet = inet_sk(sk); |
713 | struct flowi *fl = &inet->cork.fl; | ||
714 | struct sk_buff *skb; | ||
715 | struct udphdr *uh; | 713 | struct udphdr *uh; |
716 | int err = 0; | 714 | int err = 0; |
717 | int is_udplite = IS_UDPLITE(sk); | 715 | int is_udplite = IS_UDPLITE(sk); |
716 | int offset = skb_transport_offset(skb); | ||
717 | int len = skb->len - offset; | ||
718 | __wsum csum = 0; | 718 | __wsum csum = 0; |
719 | 719 | ||
720 | /* Grab the skbuff where UDP header space exists. */ | ||
721 | if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) | ||
722 | goto out; | ||
723 | |||
724 | /* | 720 | /* |
725 | * Create a UDP header | 721 | * Create a UDP header |
726 | */ | 722 | */ |
727 | uh = udp_hdr(skb); | 723 | uh = udp_hdr(skb); |
728 | uh->source = fl->fl_ip_sport; | 724 | uh->source = inet->inet_sport; |
729 | uh->dest = fl->fl_ip_dport; | 725 | uh->dest = fl4->fl4_dport; |
730 | uh->len = htons(up->len); | 726 | uh->len = htons(len); |
731 | uh->check = 0; | 727 | uh->check = 0; |
732 | 728 | ||
733 | if (is_udplite) /* UDP-Lite */ | 729 | if (is_udplite) /* UDP-Lite */ |
734 | csum = udplite_csum_outgoing(sk, skb); | 730 | csum = udplite_csum(skb); |
735 | 731 | ||
736 | else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */ | 732 | else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */ |
737 | 733 | ||
@@ -740,20 +736,20 @@ static int udp_push_pending_frames(struct sock *sk) | |||
740 | 736 | ||
741 | } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ | 737 | } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ |
742 | 738 | ||
743 | udp4_hwcsum_outgoing(sk, skb, fl->fl4_src, fl->fl4_dst, up->len); | 739 | udp4_hwcsum(skb, fl4->saddr, fl4->daddr); |
744 | goto send; | 740 | goto send; |
745 | 741 | ||
746 | } else /* `normal' UDP */ | 742 | } else |
747 | csum = udp_csum_outgoing(sk, skb); | 743 | csum = udp_csum(skb); |
748 | 744 | ||
749 | /* add protocol-dependent pseudo-header */ | 745 | /* add protocol-dependent pseudo-header */ |
750 | uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len, | 746 | uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len, |
751 | sk->sk_protocol, csum); | 747 | sk->sk_protocol, csum); |
752 | if (uh->check == 0) | 748 | if (uh->check == 0) |
753 | uh->check = CSUM_MANGLED_0; | 749 | uh->check = CSUM_MANGLED_0; |
754 | 750 | ||
755 | send: | 751 | send: |
756 | err = ip_push_pending_frames(sk); | 752 | err = ip_send_skb(skb); |
757 | if (err) { | 753 | if (err) { |
758 | if (err == -ENOBUFS && !inet->recverr) { | 754 | if (err == -ENOBUFS && !inet->recverr) { |
759 | UDP_INC_STATS_USER(sock_net(sk), | 755 | UDP_INC_STATS_USER(sock_net(sk), |
@@ -763,6 +759,26 @@ send: | |||
763 | } else | 759 | } else |
764 | UDP_INC_STATS_USER(sock_net(sk), | 760 | UDP_INC_STATS_USER(sock_net(sk), |
765 | UDP_MIB_OUTDATAGRAMS, is_udplite); | 761 | UDP_MIB_OUTDATAGRAMS, is_udplite); |
762 | return err; | ||
763 | } | ||
764 | |||
765 | /* | ||
766 | * Push out all pending data as one UDP datagram. Socket is locked. | ||
767 | */ | ||
768 | static int udp_push_pending_frames(struct sock *sk) | ||
769 | { | ||
770 | struct udp_sock *up = udp_sk(sk); | ||
771 | struct inet_sock *inet = inet_sk(sk); | ||
772 | struct flowi4 *fl4 = &inet->cork.fl.u.ip4; | ||
773 | struct sk_buff *skb; | ||
774 | int err = 0; | ||
775 | |||
776 | skb = ip_finish_skb(sk, fl4); | ||
777 | if (!skb) | ||
778 | goto out; | ||
779 | |||
780 | err = udp_send_skb(skb, fl4); | ||
781 | |||
766 | out: | 782 | out: |
767 | up->len = 0; | 783 | up->len = 0; |
768 | up->pending = 0; | 784 | up->pending = 0; |
@@ -774,6 +790,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
774 | { | 790 | { |
775 | struct inet_sock *inet = inet_sk(sk); | 791 | struct inet_sock *inet = inet_sk(sk); |
776 | struct udp_sock *up = udp_sk(sk); | 792 | struct udp_sock *up = udp_sk(sk); |
793 | struct flowi4 fl4_stack; | ||
794 | struct flowi4 *fl4; | ||
777 | int ulen = len; | 795 | int ulen = len; |
778 | struct ipcm_cookie ipc; | 796 | struct ipcm_cookie ipc; |
779 | struct rtable *rt = NULL; | 797 | struct rtable *rt = NULL; |
@@ -785,6 +803,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
785 | int err, is_udplite = IS_UDPLITE(sk); | 803 | int err, is_udplite = IS_UDPLITE(sk); |
786 | int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; | 804 | int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; |
787 | int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); | 805 | int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); |
806 | struct sk_buff *skb; | ||
807 | struct ip_options_data opt_copy; | ||
788 | 808 | ||
789 | if (len > 0xFFFF) | 809 | if (len > 0xFFFF) |
790 | return -EMSGSIZE; | 810 | return -EMSGSIZE; |
@@ -797,8 +817,11 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
797 | return -EOPNOTSUPP; | 817 | return -EOPNOTSUPP; |
798 | 818 | ||
799 | ipc.opt = NULL; | 819 | ipc.opt = NULL; |
800 | ipc.shtx.flags = 0; | 820 | ipc.tx_flags = 0; |
801 | 821 | ||
822 | getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; | ||
823 | |||
824 | fl4 = &inet->cork.fl.u.ip4; | ||
802 | if (up->pending) { | 825 | if (up->pending) { |
803 | /* | 826 | /* |
804 | * There are pending frames. | 827 | * There are pending frames. |
@@ -845,7 +868,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
845 | ipc.addr = inet->inet_saddr; | 868 | ipc.addr = inet->inet_saddr; |
846 | 869 | ||
847 | ipc.oif = sk->sk_bound_dev_if; | 870 | ipc.oif = sk->sk_bound_dev_if; |
848 | err = sock_tx_timestamp(msg, sk, &ipc.shtx); | 871 | err = sock_tx_timestamp(sk, &ipc.tx_flags); |
849 | if (err) | 872 | if (err) |
850 | return err; | 873 | return err; |
851 | if (msg->msg_controllen) { | 874 | if (msg->msg_controllen) { |
@@ -856,22 +879,32 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
856 | free = 1; | 879 | free = 1; |
857 | connected = 0; | 880 | connected = 0; |
858 | } | 881 | } |
859 | if (!ipc.opt) | 882 | if (!ipc.opt) { |
860 | ipc.opt = inet->opt; | 883 | struct ip_options_rcu *inet_opt; |
884 | |||
885 | rcu_read_lock(); | ||
886 | inet_opt = rcu_dereference(inet->inet_opt); | ||
887 | if (inet_opt) { | ||
888 | memcpy(&opt_copy, inet_opt, | ||
889 | sizeof(*inet_opt) + inet_opt->opt.optlen); | ||
890 | ipc.opt = &opt_copy.opt; | ||
891 | } | ||
892 | rcu_read_unlock(); | ||
893 | } | ||
861 | 894 | ||
862 | saddr = ipc.addr; | 895 | saddr = ipc.addr; |
863 | ipc.addr = faddr = daddr; | 896 | ipc.addr = faddr = daddr; |
864 | 897 | ||
865 | if (ipc.opt && ipc.opt->srr) { | 898 | if (ipc.opt && ipc.opt->opt.srr) { |
866 | if (!daddr) | 899 | if (!daddr) |
867 | return -EINVAL; | 900 | return -EINVAL; |
868 | faddr = ipc.opt->faddr; | 901 | faddr = ipc.opt->opt.faddr; |
869 | connected = 0; | 902 | connected = 0; |
870 | } | 903 | } |
871 | tos = RT_TOS(inet->tos); | 904 | tos = RT_TOS(inet->tos); |
872 | if (sock_flag(sk, SOCK_LOCALROUTE) || | 905 | if (sock_flag(sk, SOCK_LOCALROUTE) || |
873 | (msg->msg_flags & MSG_DONTROUTE) || | 906 | (msg->msg_flags & MSG_DONTROUTE) || |
874 | (ipc.opt && ipc.opt->is_strictroute)) { | 907 | (ipc.opt && ipc.opt->opt.is_strictroute)) { |
875 | tos |= RTO_ONLINK; | 908 | tos |= RTO_ONLINK; |
876 | connected = 0; | 909 | connected = 0; |
877 | } | 910 | } |
@@ -888,22 +921,19 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
888 | rt = (struct rtable *)sk_dst_check(sk, 0); | 921 | rt = (struct rtable *)sk_dst_check(sk, 0); |
889 | 922 | ||
890 | if (rt == NULL) { | 923 | if (rt == NULL) { |
891 | struct flowi fl = { .oif = ipc.oif, | ||
892 | .mark = sk->sk_mark, | ||
893 | .nl_u = { .ip4_u = | ||
894 | { .daddr = faddr, | ||
895 | .saddr = saddr, | ||
896 | .tos = tos } }, | ||
897 | .proto = sk->sk_protocol, | ||
898 | .flags = inet_sk_flowi_flags(sk), | ||
899 | .uli_u = { .ports = | ||
900 | { .sport = inet->inet_sport, | ||
901 | .dport = dport } } }; | ||
902 | struct net *net = sock_net(sk); | 924 | struct net *net = sock_net(sk); |
903 | 925 | ||
904 | security_sk_classify_flow(sk, &fl); | 926 | fl4 = &fl4_stack; |
905 | err = ip_route_output_flow(net, &rt, &fl, sk, 1); | 927 | flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, |
906 | if (err) { | 928 | RT_SCOPE_UNIVERSE, sk->sk_protocol, |
929 | inet_sk_flowi_flags(sk)|FLOWI_FLAG_CAN_SLEEP, | ||
930 | faddr, saddr, dport, inet->inet_sport); | ||
931 | |||
932 | security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); | ||
933 | rt = ip_route_output_flow(net, fl4, sk); | ||
934 | if (IS_ERR(rt)) { | ||
935 | err = PTR_ERR(rt); | ||
936 | rt = NULL; | ||
907 | if (err == -ENETUNREACH) | 937 | if (err == -ENETUNREACH) |
908 | IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); | 938 | IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); |
909 | goto out; | 939 | goto out; |
@@ -921,9 +951,20 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
921 | goto do_confirm; | 951 | goto do_confirm; |
922 | back_from_confirm: | 952 | back_from_confirm: |
923 | 953 | ||
924 | saddr = rt->rt_src; | 954 | saddr = fl4->saddr; |
925 | if (!ipc.addr) | 955 | if (!ipc.addr) |
926 | daddr = ipc.addr = rt->rt_dst; | 956 | daddr = ipc.addr = fl4->daddr; |
957 | |||
958 | /* Lockless fast path for the non-corking case. */ | ||
959 | if (!corkreq) { | ||
960 | skb = ip_make_skb(sk, fl4, getfrag, msg->msg_iov, ulen, | ||
961 | sizeof(struct udphdr), &ipc, &rt, | ||
962 | msg->msg_flags); | ||
963 | err = PTR_ERR(skb); | ||
964 | if (skb && !IS_ERR(skb)) | ||
965 | err = udp_send_skb(skb, fl4); | ||
966 | goto out; | ||
967 | } | ||
927 | 968 | ||
928 | lock_sock(sk); | 969 | lock_sock(sk); |
929 | if (unlikely(up->pending)) { | 970 | if (unlikely(up->pending)) { |
@@ -938,18 +979,18 @@ back_from_confirm: | |||
938 | /* | 979 | /* |
939 | * Now cork the socket to pend data. | 980 | * Now cork the socket to pend data. |
940 | */ | 981 | */ |
941 | inet->cork.fl.fl4_dst = daddr; | 982 | fl4 = &inet->cork.fl.u.ip4; |
942 | inet->cork.fl.fl_ip_dport = dport; | 983 | fl4->daddr = daddr; |
943 | inet->cork.fl.fl4_src = saddr; | 984 | fl4->saddr = saddr; |
944 | inet->cork.fl.fl_ip_sport = inet->inet_sport; | 985 | fl4->fl4_dport = dport; |
986 | fl4->fl4_sport = inet->inet_sport; | ||
945 | up->pending = AF_INET; | 987 | up->pending = AF_INET; |
946 | 988 | ||
947 | do_append_data: | 989 | do_append_data: |
948 | up->len += ulen; | 990 | up->len += ulen; |
949 | getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; | 991 | err = ip_append_data(sk, fl4, getfrag, msg->msg_iov, ulen, |
950 | err = ip_append_data(sk, getfrag, msg->msg_iov, ulen, | 992 | sizeof(struct udphdr), &ipc, &rt, |
951 | sizeof(struct udphdr), &ipc, &rt, | 993 | corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); |
952 | corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); | ||
953 | if (err) | 994 | if (err) |
954 | udp_flush_pending_frames(sk); | 995 | udp_flush_pending_frames(sk); |
955 | else if (!corkreq) | 996 | else if (!corkreq) |
@@ -989,6 +1030,7 @@ EXPORT_SYMBOL(udp_sendmsg); | |||
989 | int udp_sendpage(struct sock *sk, struct page *page, int offset, | 1030 | int udp_sendpage(struct sock *sk, struct page *page, int offset, |
990 | size_t size, int flags) | 1031 | size_t size, int flags) |
991 | { | 1032 | { |
1033 | struct inet_sock *inet = inet_sk(sk); | ||
992 | struct udp_sock *up = udp_sk(sk); | 1034 | struct udp_sock *up = udp_sk(sk); |
993 | int ret; | 1035 | int ret; |
994 | 1036 | ||
@@ -1013,7 +1055,8 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset, | |||
1013 | return -EINVAL; | 1055 | return -EINVAL; |
1014 | } | 1056 | } |
1015 | 1057 | ||
1016 | ret = ip_append_page(sk, page, offset, size, flags); | 1058 | ret = ip_append_page(sk, &inet->cork.fl.u.ip4, |
1059 | page, offset, size, flags); | ||
1017 | if (ret == -EOPNOTSUPP) { | 1060 | if (ret == -EOPNOTSUPP) { |
1018 | release_sock(sk); | 1061 | release_sock(sk); |
1019 | return sock_no_sendpage(sk->sk_socket, page, offset, | 1062 | return sock_no_sendpage(sk->sk_socket, page, offset, |
@@ -1206,6 +1249,9 @@ csum_copy_err: | |||
1206 | 1249 | ||
1207 | if (noblock) | 1250 | if (noblock) |
1208 | return -EAGAIN; | 1251 | return -EAGAIN; |
1252 | |||
1253 | /* starting over for a new packet */ | ||
1254 | msg->msg_flags &= ~MSG_TRUNC; | ||
1209 | goto try_again; | 1255 | goto try_again; |
1210 | } | 1256 | } |
1211 | 1257 | ||
@@ -1413,7 +1459,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) | |||
1413 | } | 1459 | } |
1414 | } | 1460 | } |
1415 | 1461 | ||
1416 | if (sk->sk_filter) { | 1462 | if (rcu_dereference_raw(sk->sk_filter)) { |
1417 | if (udp_lib_checksum_complete(skb)) | 1463 | if (udp_lib_checksum_complete(skb)) |
1418 | goto drop; | 1464 | goto drop; |
1419 | } | 1465 | } |
@@ -1899,6 +1945,7 @@ struct proto udp_prot = { | |||
1899 | .compat_setsockopt = compat_udp_setsockopt, | 1945 | .compat_setsockopt = compat_udp_setsockopt, |
1900 | .compat_getsockopt = compat_udp_getsockopt, | 1946 | .compat_getsockopt = compat_udp_getsockopt, |
1901 | #endif | 1947 | #endif |
1948 | .clear_sk = sk_prot_clear_portaddr_nulls, | ||
1902 | }; | 1949 | }; |
1903 | EXPORT_SYMBOL(udp_prot); | 1950 | EXPORT_SYMBOL(udp_prot); |
1904 | 1951 | ||
@@ -2046,7 +2093,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, | |||
2046 | __u16 srcp = ntohs(inet->inet_sport); | 2093 | __u16 srcp = ntohs(inet->inet_sport); |
2047 | 2094 | ||
2048 | seq_printf(f, "%5d: %08X:%04X %08X:%04X" | 2095 | seq_printf(f, "%5d: %08X:%04X %08X:%04X" |
2049 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n", | 2096 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", |
2050 | bucket, src, srcp, dest, destp, sp->sk_state, | 2097 | bucket, src, srcp, dest, destp, sp->sk_state, |
2051 | sk_wmem_alloc_get(sp), | 2098 | sk_wmem_alloc_get(sp), |
2052 | sk_rmem_alloc_get(sp), | 2099 | sk_rmem_alloc_get(sp), |
@@ -2162,16 +2209,10 @@ void __init udp_table_init(struct udp_table *table, const char *name) | |||
2162 | 2209 | ||
2163 | void __init udp_init(void) | 2210 | void __init udp_init(void) |
2164 | { | 2211 | { |
2165 | unsigned long nr_pages, limit; | 2212 | unsigned long limit; |
2166 | 2213 | ||
2167 | udp_table_init(&udp_table, "UDP"); | 2214 | udp_table_init(&udp_table, "UDP"); |
2168 | /* Set the pressure threshold up by the same strategy of TCP. It is a | 2215 | limit = nr_free_buffer_pages() / 8; |
2169 | * fraction of global memory that is up to 1/2 at 256 MB, decreasing | ||
2170 | * toward zero with the amount of memory, with a floor of 128 pages. | ||
2171 | */ | ||
2172 | nr_pages = totalram_pages - totalhigh_pages; | ||
2173 | limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); | ||
2174 | limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); | ||
2175 | limit = max(limit, 128UL); | 2216 | limit = max(limit, 128UL); |
2176 | sysctl_udp_mem[0] = limit / 4 * 3; | 2217 | sysctl_udp_mem[0] = limit / 4 * 3; |
2177 | sysctl_udp_mem[1] = limit; | 2218 | sysctl_udp_mem[1] = limit; |
@@ -2200,7 +2241,7 @@ int udp4_ufo_send_check(struct sk_buff *skb) | |||
2200 | return 0; | 2241 | return 0; |
2201 | } | 2242 | } |
2202 | 2243 | ||
2203 | struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features) | 2244 | struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features) |
2204 | { | 2245 | { |
2205 | struct sk_buff *segs = ERR_PTR(-EINVAL); | 2246 | struct sk_buff *segs = ERR_PTR(-EINVAL); |
2206 | unsigned int mss; | 2247 | unsigned int mss; |
@@ -2228,7 +2269,7 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features) | |||
2228 | /* Do software UFO. Complete and fill in the UDP checksum as HW cannot | 2269 | /* Do software UFO. Complete and fill in the UDP checksum as HW cannot |
2229 | * do checksum of UDP packets sent as multiple IP fragments. | 2270 | * do checksum of UDP packets sent as multiple IP fragments. |
2230 | */ | 2271 | */ |
2231 | offset = skb->csum_start - skb_headroom(skb); | 2272 | offset = skb_checksum_start_offset(skb); |
2232 | csum = skb_checksum(skb, offset, skb->len - offset, 0); | 2273 | csum = skb_checksum(skb, offset, skb->len - offset, 0); |
2233 | offset += skb->csum_offset; | 2274 | offset += skb->csum_offset; |
2234 | *(__sum16 *)(skb->data + offset) = csum_fold(csum); | 2275 | *(__sum16 *)(skb->data + offset) = csum_fold(csum); |
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index ab76aa928fa9..aee9963f7f5a 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c | |||
@@ -57,6 +57,7 @@ struct proto udplite_prot = { | |||
57 | .compat_setsockopt = compat_udp_setsockopt, | 57 | .compat_setsockopt = compat_udp_setsockopt, |
58 | .compat_getsockopt = compat_udp_getsockopt, | 58 | .compat_getsockopt = compat_udp_getsockopt, |
59 | #endif | 59 | #endif |
60 | .clear_sk = sk_prot_clear_portaddr_nulls, | ||
60 | }; | 61 | }; |
61 | EXPORT_SYMBOL(udplite_prot); | 62 | EXPORT_SYMBOL(udplite_prot); |
62 | 63 | ||
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 6f368413eb0e..534972e114ac 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c | |||
@@ -56,7 +56,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) | |||
56 | 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); | 56 | 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); |
57 | ip_select_ident(top_iph, dst->child, NULL); | 57 | ip_select_ident(top_iph, dst->child, NULL); |
58 | 58 | ||
59 | top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); | 59 | top_iph->ttl = ip4_dst_hoplimit(dst->child); |
60 | 60 | ||
61 | top_iph->saddr = x->props.saddr.a4; | 61 | top_iph->saddr = x->props.saddr.a4; |
62 | top_iph->daddr = x->id.daddr.a4; | 62 | top_iph->daddr = x->id.daddr.a4; |
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 571aa96a175c..327a617d594c 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c | |||
@@ -32,7 +32,12 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) | |||
32 | dst = skb_dst(skb); | 32 | dst = skb_dst(skb); |
33 | mtu = dst_mtu(dst); | 33 | mtu = dst_mtu(dst); |
34 | if (skb->len > mtu) { | 34 | if (skb->len > mtu) { |
35 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); | 35 | if (skb->sk) |
36 | ip_local_error(skb->sk, EMSGSIZE, ip_hdr(skb)->daddr, | ||
37 | inet_sk(skb->sk)->inet_dport, mtu); | ||
38 | else | ||
39 | icmp_send(skb, ICMP_DEST_UNREACH, | ||
40 | ICMP_FRAG_NEEDED, htonl(mtu)); | ||
36 | ret = -EMSGSIZE; | 41 | ret = -EMSGSIZE; |
37 | } | 42 | } |
38 | out: | 43 | out: |
@@ -69,7 +74,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) | |||
69 | } | 74 | } |
70 | EXPORT_SYMBOL(xfrm4_prepare_output); | 75 | EXPORT_SYMBOL(xfrm4_prepare_output); |
71 | 76 | ||
72 | static int xfrm4_output_finish(struct sk_buff *skb) | 77 | int xfrm4_output_finish(struct sk_buff *skb) |
73 | { | 78 | { |
74 | #ifdef CONFIG_NETFILTER | 79 | #ifdef CONFIG_NETFILTER |
75 | if (!skb_dst(skb)->xfrm) { | 80 | if (!skb_dst(skb)->xfrm) { |
@@ -86,7 +91,11 @@ static int xfrm4_output_finish(struct sk_buff *skb) | |||
86 | 91 | ||
87 | int xfrm4_output(struct sk_buff *skb) | 92 | int xfrm4_output(struct sk_buff *skb) |
88 | { | 93 | { |
94 | struct dst_entry *dst = skb_dst(skb); | ||
95 | struct xfrm_state *x = dst->xfrm; | ||
96 | |||
89 | return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, | 97 | return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, |
90 | NULL, skb_dst(skb)->dev, xfrm4_output_finish, | 98 | NULL, dst->dev, |
99 | x->outer_mode->afinfo->output_finish, | ||
91 | !(IPCB(skb)->flags & IPSKB_REROUTED)); | 100 | !(IPCB(skb)->flags & IPSKB_REROUTED)); |
92 | } | 101 | } |
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index a580349f0b8a..981e43eaf704 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c | |||
@@ -11,57 +11,60 @@ | |||
11 | #include <linux/err.h> | 11 | #include <linux/err.h> |
12 | #include <linux/kernel.h> | 12 | #include <linux/kernel.h> |
13 | #include <linux/inetdevice.h> | 13 | #include <linux/inetdevice.h> |
14 | #include <linux/if_tunnel.h> | ||
14 | #include <net/dst.h> | 15 | #include <net/dst.h> |
15 | #include <net/xfrm.h> | 16 | #include <net/xfrm.h> |
16 | #include <net/ip.h> | 17 | #include <net/ip.h> |
17 | 18 | ||
18 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo; | 19 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo; |
19 | 20 | ||
20 | static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, | 21 | static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, |
21 | xfrm_address_t *saddr, | 22 | int tos, |
22 | xfrm_address_t *daddr) | 23 | const xfrm_address_t *saddr, |
24 | const xfrm_address_t *daddr) | ||
23 | { | 25 | { |
24 | struct flowi fl = { | ||
25 | .nl_u = { | ||
26 | .ip4_u = { | ||
27 | .tos = tos, | ||
28 | .daddr = daddr->a4, | ||
29 | }, | ||
30 | }, | ||
31 | }; | ||
32 | struct dst_entry *dst; | ||
33 | struct rtable *rt; | 26 | struct rtable *rt; |
34 | int err; | ||
35 | 27 | ||
28 | memset(fl4, 0, sizeof(*fl4)); | ||
29 | fl4->daddr = daddr->a4; | ||
30 | fl4->flowi4_tos = tos; | ||
36 | if (saddr) | 31 | if (saddr) |
37 | fl.fl4_src = saddr->a4; | 32 | fl4->saddr = saddr->a4; |
33 | |||
34 | rt = __ip_route_output_key(net, fl4); | ||
35 | if (!IS_ERR(rt)) | ||
36 | return &rt->dst; | ||
38 | 37 | ||
39 | err = __ip_route_output_key(net, &rt, &fl); | 38 | return ERR_CAST(rt); |
40 | dst = &rt->dst; | 39 | } |
41 | if (err) | 40 | |
42 | dst = ERR_PTR(err); | 41 | static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, |
43 | return dst; | 42 | const xfrm_address_t *saddr, |
43 | const xfrm_address_t *daddr) | ||
44 | { | ||
45 | struct flowi4 fl4; | ||
46 | |||
47 | return __xfrm4_dst_lookup(net, &fl4, tos, saddr, daddr); | ||
44 | } | 48 | } |
45 | 49 | ||
46 | static int xfrm4_get_saddr(struct net *net, | 50 | static int xfrm4_get_saddr(struct net *net, |
47 | xfrm_address_t *saddr, xfrm_address_t *daddr) | 51 | xfrm_address_t *saddr, xfrm_address_t *daddr) |
48 | { | 52 | { |
49 | struct dst_entry *dst; | 53 | struct dst_entry *dst; |
50 | struct rtable *rt; | 54 | struct flowi4 fl4; |
51 | 55 | ||
52 | dst = xfrm4_dst_lookup(net, 0, NULL, daddr); | 56 | dst = __xfrm4_dst_lookup(net, &fl4, 0, NULL, daddr); |
53 | if (IS_ERR(dst)) | 57 | if (IS_ERR(dst)) |
54 | return -EHOSTUNREACH; | 58 | return -EHOSTUNREACH; |
55 | 59 | ||
56 | rt = (struct rtable *)dst; | 60 | saddr->a4 = fl4.saddr; |
57 | saddr->a4 = rt->rt_src; | ||
58 | dst_release(dst); | 61 | dst_release(dst); |
59 | return 0; | 62 | return 0; |
60 | } | 63 | } |
61 | 64 | ||
62 | static int xfrm4_get_tos(struct flowi *fl) | 65 | static int xfrm4_get_tos(const struct flowi *fl) |
63 | { | 66 | { |
64 | return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */ | 67 | return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */ |
65 | } | 68 | } |
66 | 69 | ||
67 | static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, | 70 | static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, |
@@ -71,19 +74,22 @@ static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, | |||
71 | } | 74 | } |
72 | 75 | ||
73 | static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, | 76 | static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, |
74 | struct flowi *fl) | 77 | const struct flowi *fl) |
75 | { | 78 | { |
76 | struct rtable *rt = (struct rtable *)xdst->route; | 79 | struct rtable *rt = (struct rtable *)xdst->route; |
80 | const struct flowi4 *fl4 = &fl->u.ip4; | ||
77 | 81 | ||
78 | xdst->u.rt.fl = *fl; | 82 | rt->rt_key_dst = fl4->daddr; |
83 | rt->rt_key_src = fl4->saddr; | ||
84 | rt->rt_key_tos = fl4->flowi4_tos; | ||
85 | rt->rt_route_iif = fl4->flowi4_iif; | ||
86 | rt->rt_iif = fl4->flowi4_iif; | ||
87 | rt->rt_oif = fl4->flowi4_oif; | ||
88 | rt->rt_mark = fl4->flowi4_mark; | ||
79 | 89 | ||
80 | xdst->u.dst.dev = dev; | 90 | xdst->u.dst.dev = dev; |
81 | dev_hold(dev); | 91 | dev_hold(dev); |
82 | 92 | ||
83 | xdst->u.rt.idev = in_dev_get(dev); | ||
84 | if (!xdst->u.rt.idev) | ||
85 | return -ENODEV; | ||
86 | |||
87 | xdst->u.rt.peer = rt->peer; | 93 | xdst->u.rt.peer = rt->peer; |
88 | if (rt->peer) | 94 | if (rt->peer) |
89 | atomic_inc(&rt->peer->refcnt); | 95 | atomic_inc(&rt->peer->refcnt); |
@@ -104,11 +110,12 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, | |||
104 | static void | 110 | static void |
105 | _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) | 111 | _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) |
106 | { | 112 | { |
107 | struct iphdr *iph = ip_hdr(skb); | 113 | const struct iphdr *iph = ip_hdr(skb); |
108 | u8 *xprth = skb_network_header(skb) + iph->ihl * 4; | 114 | u8 *xprth = skb_network_header(skb) + iph->ihl * 4; |
115 | struct flowi4 *fl4 = &fl->u.ip4; | ||
109 | 116 | ||
110 | memset(fl, 0, sizeof(struct flowi)); | 117 | memset(fl4, 0, sizeof(struct flowi4)); |
111 | fl->mark = skb->mark; | 118 | fl4->flowi4_mark = skb->mark; |
112 | 119 | ||
113 | if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { | 120 | if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { |
114 | switch (iph->protocol) { | 121 | switch (iph->protocol) { |
@@ -121,8 +128,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) | |||
121 | pskb_may_pull(skb, xprth + 4 - skb->data)) { | 128 | pskb_may_pull(skb, xprth + 4 - skb->data)) { |
122 | __be16 *ports = (__be16 *)xprth; | 129 | __be16 *ports = (__be16 *)xprth; |
123 | 130 | ||
124 | fl->fl_ip_sport = ports[!!reverse]; | 131 | fl4->fl4_sport = ports[!!reverse]; |
125 | fl->fl_ip_dport = ports[!reverse]; | 132 | fl4->fl4_dport = ports[!reverse]; |
126 | } | 133 | } |
127 | break; | 134 | break; |
128 | 135 | ||
@@ -130,8 +137,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) | |||
130 | if (pskb_may_pull(skb, xprth + 2 - skb->data)) { | 137 | if (pskb_may_pull(skb, xprth + 2 - skb->data)) { |
131 | u8 *icmp = xprth; | 138 | u8 *icmp = xprth; |
132 | 139 | ||
133 | fl->fl_icmp_type = icmp[0]; | 140 | fl4->fl4_icmp_type = icmp[0]; |
134 | fl->fl_icmp_code = icmp[1]; | 141 | fl4->fl4_icmp_code = icmp[1]; |
135 | } | 142 | } |
136 | break; | 143 | break; |
137 | 144 | ||
@@ -139,7 +146,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) | |||
139 | if (pskb_may_pull(skb, xprth + 4 - skb->data)) { | 146 | if (pskb_may_pull(skb, xprth + 4 - skb->data)) { |
140 | __be32 *ehdr = (__be32 *)xprth; | 147 | __be32 *ehdr = (__be32 *)xprth; |
141 | 148 | ||
142 | fl->fl_ipsec_spi = ehdr[0]; | 149 | fl4->fl4_ipsec_spi = ehdr[0]; |
143 | } | 150 | } |
144 | break; | 151 | break; |
145 | 152 | ||
@@ -147,7 +154,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) | |||
147 | if (pskb_may_pull(skb, xprth + 8 - skb->data)) { | 154 | if (pskb_may_pull(skb, xprth + 8 - skb->data)) { |
148 | __be32 *ah_hdr = (__be32*)xprth; | 155 | __be32 *ah_hdr = (__be32*)xprth; |
149 | 156 | ||
150 | fl->fl_ipsec_spi = ah_hdr[1]; | 157 | fl4->fl4_ipsec_spi = ah_hdr[1]; |
151 | } | 158 | } |
152 | break; | 159 | break; |
153 | 160 | ||
@@ -155,18 +162,32 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) | |||
155 | if (pskb_may_pull(skb, xprth + 4 - skb->data)) { | 162 | if (pskb_may_pull(skb, xprth + 4 - skb->data)) { |
156 | __be16 *ipcomp_hdr = (__be16 *)xprth; | 163 | __be16 *ipcomp_hdr = (__be16 *)xprth; |
157 | 164 | ||
158 | fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); | 165 | fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); |
166 | } | ||
167 | break; | ||
168 | |||
169 | case IPPROTO_GRE: | ||
170 | if (pskb_may_pull(skb, xprth + 12 - skb->data)) { | ||
171 | __be16 *greflags = (__be16 *)xprth; | ||
172 | __be32 *gre_hdr = (__be32 *)xprth; | ||
173 | |||
174 | if (greflags[0] & GRE_KEY) { | ||
175 | if (greflags[0] & GRE_CSUM) | ||
176 | gre_hdr++; | ||
177 | fl4->fl4_gre_key = gre_hdr[1]; | ||
178 | } | ||
159 | } | 179 | } |
160 | break; | 180 | break; |
181 | |||
161 | default: | 182 | default: |
162 | fl->fl_ipsec_spi = 0; | 183 | fl4->fl4_ipsec_spi = 0; |
163 | break; | 184 | break; |
164 | } | 185 | } |
165 | } | 186 | } |
166 | fl->proto = iph->protocol; | 187 | fl4->flowi4_proto = iph->protocol; |
167 | fl->fl4_dst = reverse ? iph->saddr : iph->daddr; | 188 | fl4->daddr = reverse ? iph->saddr : iph->daddr; |
168 | fl->fl4_src = reverse ? iph->daddr : iph->saddr; | 189 | fl4->saddr = reverse ? iph->daddr : iph->saddr; |
169 | fl->fl4_tos = iph->tos; | 190 | fl4->flowi4_tos = iph->tos; |
170 | } | 191 | } |
171 | 192 | ||
172 | static inline int xfrm4_garbage_collect(struct dst_ops *ops) | 193 | static inline int xfrm4_garbage_collect(struct dst_ops *ops) |
@@ -174,7 +195,7 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops) | |||
174 | struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops); | 195 | struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops); |
175 | 196 | ||
176 | xfrm4_policy_afinfo.garbage_collect(net); | 197 | xfrm4_policy_afinfo.garbage_collect(net); |
177 | return (atomic_read(&ops->entries) > ops->gc_thresh * 2); | 198 | return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); |
178 | } | 199 | } |
179 | 200 | ||
180 | static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) | 201 | static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) |
@@ -189,37 +210,20 @@ static void xfrm4_dst_destroy(struct dst_entry *dst) | |||
189 | { | 210 | { |
190 | struct xfrm_dst *xdst = (struct xfrm_dst *)dst; | 211 | struct xfrm_dst *xdst = (struct xfrm_dst *)dst; |
191 | 212 | ||
192 | if (likely(xdst->u.rt.idev)) | 213 | dst_destroy_metrics_generic(dst); |
193 | in_dev_put(xdst->u.rt.idev); | 214 | |
194 | if (likely(xdst->u.rt.peer)) | 215 | if (likely(xdst->u.rt.peer)) |
195 | inet_putpeer(xdst->u.rt.peer); | 216 | inet_putpeer(xdst->u.rt.peer); |
217 | |||
196 | xfrm_dst_destroy(xdst); | 218 | xfrm_dst_destroy(xdst); |
197 | } | 219 | } |
198 | 220 | ||
199 | static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, | 221 | static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, |
200 | int unregister) | 222 | int unregister) |
201 | { | 223 | { |
202 | struct xfrm_dst *xdst; | ||
203 | |||
204 | if (!unregister) | 224 | if (!unregister) |
205 | return; | 225 | return; |
206 | 226 | ||
207 | xdst = (struct xfrm_dst *)dst; | ||
208 | if (xdst->u.rt.idev->dev == dev) { | ||
209 | struct in_device *loopback_idev = | ||
210 | in_dev_get(dev_net(dev)->loopback_dev); | ||
211 | BUG_ON(!loopback_idev); | ||
212 | |||
213 | do { | ||
214 | in_dev_put(xdst->u.rt.idev); | ||
215 | xdst->u.rt.idev = loopback_idev; | ||
216 | in_dev_hold(loopback_idev); | ||
217 | xdst = (struct xfrm_dst *)xdst->u.dst.child; | ||
218 | } while (xdst->u.dst.xfrm); | ||
219 | |||
220 | __in_dev_put(loopback_idev); | ||
221 | } | ||
222 | |||
223 | xfrm_dst_ifdown(dst, dev); | 227 | xfrm_dst_ifdown(dst, dev); |
224 | } | 228 | } |
225 | 229 | ||
@@ -228,11 +232,11 @@ static struct dst_ops xfrm4_dst_ops = { | |||
228 | .protocol = cpu_to_be16(ETH_P_IP), | 232 | .protocol = cpu_to_be16(ETH_P_IP), |
229 | .gc = xfrm4_garbage_collect, | 233 | .gc = xfrm4_garbage_collect, |
230 | .update_pmtu = xfrm4_update_pmtu, | 234 | .update_pmtu = xfrm4_update_pmtu, |
235 | .cow_metrics = dst_cow_metrics_generic, | ||
231 | .destroy = xfrm4_dst_destroy, | 236 | .destroy = xfrm4_dst_destroy, |
232 | .ifdown = xfrm4_dst_ifdown, | 237 | .ifdown = xfrm4_dst_ifdown, |
233 | .local_out = __ip_local_out, | 238 | .local_out = __ip_local_out, |
234 | .gc_thresh = 1024, | 239 | .gc_thresh = 1024, |
235 | .entries = ATOMIC_INIT(0), | ||
236 | }; | 240 | }; |
237 | 241 | ||
238 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { | 242 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { |
@@ -244,6 +248,7 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { | |||
244 | .get_tos = xfrm4_get_tos, | 248 | .get_tos = xfrm4_get_tos, |
245 | .init_path = xfrm4_init_path, | 249 | .init_path = xfrm4_init_path, |
246 | .fill_dst = xfrm4_fill_dst, | 250 | .fill_dst = xfrm4_fill_dst, |
251 | .blackhole_route = ipv4_blackhole_route, | ||
247 | }; | 252 | }; |
248 | 253 | ||
249 | #ifdef CONFIG_SYSCTL | 254 | #ifdef CONFIG_SYSCTL |
@@ -288,6 +293,7 @@ void __init xfrm4_init(int rt_max_size) | |||
288 | * and start cleaning when were 1/2 full | 293 | * and start cleaning when were 1/2 full |
289 | */ | 294 | */ |
290 | xfrm4_dst_ops.gc_thresh = rt_max_size/2; | 295 | xfrm4_dst_ops.gc_thresh = rt_max_size/2; |
296 | dst_entries_init(&xfrm4_dst_ops); | ||
291 | 297 | ||
292 | xfrm4_state_init(); | 298 | xfrm4_state_init(); |
293 | xfrm4_policy_init(); | 299 | xfrm4_policy_init(); |
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 47947624eccc..d9ac0a0058b5 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c | |||
@@ -21,24 +21,26 @@ static int xfrm4_init_flags(struct xfrm_state *x) | |||
21 | } | 21 | } |
22 | 22 | ||
23 | static void | 23 | static void |
24 | __xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl) | 24 | __xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) |
25 | { | 25 | { |
26 | sel->daddr.a4 = fl->fl4_dst; | 26 | const struct flowi4 *fl4 = &fl->u.ip4; |
27 | sel->saddr.a4 = fl->fl4_src; | 27 | |
28 | sel->dport = xfrm_flowi_dport(fl); | 28 | sel->daddr.a4 = fl4->daddr; |
29 | sel->saddr.a4 = fl4->saddr; | ||
30 | sel->dport = xfrm_flowi_dport(fl, &fl4->uli); | ||
29 | sel->dport_mask = htons(0xffff); | 31 | sel->dport_mask = htons(0xffff); |
30 | sel->sport = xfrm_flowi_sport(fl); | 32 | sel->sport = xfrm_flowi_sport(fl, &fl4->uli); |
31 | sel->sport_mask = htons(0xffff); | 33 | sel->sport_mask = htons(0xffff); |
32 | sel->family = AF_INET; | 34 | sel->family = AF_INET; |
33 | sel->prefixlen_d = 32; | 35 | sel->prefixlen_d = 32; |
34 | sel->prefixlen_s = 32; | 36 | sel->prefixlen_s = 32; |
35 | sel->proto = fl->proto; | 37 | sel->proto = fl4->flowi4_proto; |
36 | sel->ifindex = fl->oif; | 38 | sel->ifindex = fl4->flowi4_oif; |
37 | } | 39 | } |
38 | 40 | ||
39 | static void | 41 | static void |
40 | xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl, | 42 | xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl, |
41 | xfrm_address_t *daddr, xfrm_address_t *saddr) | 43 | const xfrm_address_t *daddr, const xfrm_address_t *saddr) |
42 | { | 44 | { |
43 | x->id = tmpl->id; | 45 | x->id = tmpl->id; |
44 | if (x->id.daddr.a4 == 0) | 46 | if (x->id.daddr.a4 == 0) |
@@ -53,7 +55,7 @@ xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl, | |||
53 | 55 | ||
54 | int xfrm4_extract_header(struct sk_buff *skb) | 56 | int xfrm4_extract_header(struct sk_buff *skb) |
55 | { | 57 | { |
56 | struct iphdr *iph = ip_hdr(skb); | 58 | const struct iphdr *iph = ip_hdr(skb); |
57 | 59 | ||
58 | XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); | 60 | XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); |
59 | XFRM_MODE_SKB_CB(skb)->id = iph->id; | 61 | XFRM_MODE_SKB_CB(skb)->id = iph->id; |
@@ -76,6 +78,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = { | |||
76 | .init_tempsel = __xfrm4_init_tempsel, | 78 | .init_tempsel = __xfrm4_init_tempsel, |
77 | .init_temprop = xfrm4_init_temprop, | 79 | .init_temprop = xfrm4_init_temprop, |
78 | .output = xfrm4_output, | 80 | .output = xfrm4_output, |
81 | .output_finish = xfrm4_output_finish, | ||
79 | .extract_input = xfrm4_extract_input, | 82 | .extract_input = xfrm4_extract_input, |
80 | .extract_output = xfrm4_extract_output, | 83 | .extract_output = xfrm4_extract_output, |
81 | .transport_finish = xfrm4_transport_finish, | 84 | .transport_finish = xfrm4_transport_finish, |
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index 41f5982d2087..82806455e859 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c | |||
@@ -58,14 +58,14 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info) | |||
58 | return -ENOENT; | 58 | return -ENOENT; |
59 | } | 59 | } |
60 | 60 | ||
61 | static struct xfrm_tunnel xfrm_tunnel_handler = { | 61 | static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = { |
62 | .handler = xfrm_tunnel_rcv, | 62 | .handler = xfrm_tunnel_rcv, |
63 | .err_handler = xfrm_tunnel_err, | 63 | .err_handler = xfrm_tunnel_err, |
64 | .priority = 2, | 64 | .priority = 2, |
65 | }; | 65 | }; |
66 | 66 | ||
67 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 67 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
68 | static struct xfrm_tunnel xfrm64_tunnel_handler = { | 68 | static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = { |
69 | .handler = xfrm_tunnel_rcv, | 69 | .handler = xfrm_tunnel_rcv, |
70 | .err_handler = xfrm_tunnel_err, | 70 | .err_handler = xfrm_tunnel_err, |
71 | .priority = 2, | 71 | .priority = 2, |