diff options
author | Eric Dumazet <dada1@cosmosbay.com> | 2007-04-30 03:26:00 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2007-04-30 03:26:00 -0400 |
commit | 6aaf47fa48d3c44280810b1b470261d340e4ed87 (patch) | |
tree | ee59c6bd3ee482ef85a7b65f057beff4f4598b9b | |
parent | 65def812ab25d7565756e5748d91e22e302197ee (diff) |
[PATCH] INET : IPV4 UDP lookups converted to a 2 pass algo
Some people want to have many UDP sockets, binded to a single port but
many different addresses. We currently hash all those sockets into a
single chain. Processing of incoming packets is very expensive,
because the whole chain must be examined to find the best match.
I chose in this patch to hash UDP sockets with a hash function that
take into account both their port number and address : This has a
drawback because we need two lookups : one with a given address, one
with a wildcard (null) address.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | net/ipv4/udp.c | 171 |
1 files changed, 114 insertions, 57 deletions
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cec0f2cc49b7..144970704c2c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
@@ -114,14 +114,33 @@ DEFINE_RWLOCK(udp_hash_lock); | |||
114 | 114 | ||
115 | static int udp_port_rover; | 115 | static int udp_port_rover; |
116 | 116 | ||
117 | static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[]) | 117 | /* |
118 | * Note about this hash function : | ||
119 | * Typical use is probably daddr = 0, only dport is going to vary hash | ||
120 | */ | ||
121 | static inline unsigned int hash_port_and_addr(__u16 port, __be32 addr) | ||
122 | { | ||
123 | addr ^= addr >> 16; | ||
124 | addr ^= addr >> 8; | ||
125 | return port ^ addr; | ||
126 | } | ||
127 | |||
128 | static inline int __udp_lib_port_inuse(unsigned int hash, int port, | ||
129 | __be32 daddr, struct hlist_head udptable[]) | ||
118 | { | 130 | { |
119 | struct sock *sk; | 131 | struct sock *sk; |
120 | struct hlist_node *node; | 132 | struct hlist_node *node; |
133 | struct inet_sock *inet; | ||
121 | 134 | ||
122 | sk_for_each(sk, node, &udptable[num & (UDP_HTABLE_SIZE - 1)]) | 135 | sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) { |
123 | if (sk->sk_hash == num) | 136 | if (sk->sk_hash != hash) |
137 | continue; | ||
138 | inet = inet_sk(sk); | ||
139 | if (inet->num != port) | ||
140 | continue; | ||
141 | if (inet->rcv_saddr == daddr) | ||
124 | return 1; | 142 | return 1; |
143 | } | ||
125 | return 0; | 144 | return 0; |
126 | } | 145 | } |
127 | 146 | ||
@@ -142,6 +161,7 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, | |||
142 | struct hlist_node *node; | 161 | struct hlist_node *node; |
143 | struct hlist_head *head; | 162 | struct hlist_head *head; |
144 | struct sock *sk2; | 163 | struct sock *sk2; |
164 | unsigned int hash; | ||
145 | int error = 1; | 165 | int error = 1; |
146 | 166 | ||
147 | write_lock_bh(&udp_hash_lock); | 167 | write_lock_bh(&udp_hash_lock); |
@@ -156,7 +176,9 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, | |||
156 | for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { | 176 | for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { |
157 | int size; | 177 | int size; |
158 | 178 | ||
159 | head = &udptable[result & (UDP_HTABLE_SIZE - 1)]; | 179 | hash = hash_port_and_addr(result, |
180 | inet_sk(sk)->rcv_saddr); | ||
181 | head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; | ||
160 | if (hlist_empty(head)) { | 182 | if (hlist_empty(head)) { |
161 | if (result > sysctl_local_port_range[1]) | 183 | if (result > sysctl_local_port_range[1]) |
162 | result = sysctl_local_port_range[0] + | 184 | result = sysctl_local_port_range[0] + |
@@ -181,7 +203,10 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, | |||
181 | result = sysctl_local_port_range[0] | 203 | result = sysctl_local_port_range[0] |
182 | + ((result - sysctl_local_port_range[0]) & | 204 | + ((result - sysctl_local_port_range[0]) & |
183 | (UDP_HTABLE_SIZE - 1)); | 205 | (UDP_HTABLE_SIZE - 1)); |
184 | if (! __udp_lib_lport_inuse(result, udptable)) | 206 | hash = hash_port_and_addr(result, |
207 | inet_sk(sk)->rcv_saddr); | ||
208 | if (! __udp_lib_port_inuse(hash, result, | ||
209 | inet_sk(sk)->rcv_saddr, udptable)) | ||
185 | break; | 210 | break; |
186 | } | 211 | } |
187 | if (i >= (1 << 16) / UDP_HTABLE_SIZE) | 212 | if (i >= (1 << 16) / UDP_HTABLE_SIZE) |
@@ -189,11 +214,13 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, | |||
189 | gotit: | 214 | gotit: |
190 | *port_rover = snum = result; | 215 | *port_rover = snum = result; |
191 | } else { | 216 | } else { |
192 | head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; | 217 | hash = hash_port_and_addr(snum, inet_sk(sk)->rcv_saddr); |
218 | head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; | ||
193 | 219 | ||
194 | sk_for_each(sk2, node, head) | 220 | sk_for_each(sk2, node, head) |
195 | if (sk2->sk_hash == snum && | 221 | if (sk2->sk_hash == hash && |
196 | sk2 != sk && | 222 | sk2 != sk && |
223 | inet_sk(sk2)->num == snum && | ||
197 | (!sk2->sk_reuse || !sk->sk_reuse) && | 224 | (!sk2->sk_reuse || !sk->sk_reuse) && |
198 | (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if | 225 | (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if |
199 | || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && | 226 | || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && |
@@ -201,9 +228,9 @@ gotit: | |||
201 | goto fail; | 228 | goto fail; |
202 | } | 229 | } |
203 | inet_sk(sk)->num = snum; | 230 | inet_sk(sk)->num = snum; |
204 | sk->sk_hash = snum; | 231 | sk->sk_hash = hash; |
205 | if (sk_unhashed(sk)) { | 232 | if (sk_unhashed(sk)) { |
206 | head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; | 233 | head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; |
207 | sk_add_node(sk, head); | 234 | sk_add_node(sk, head); |
208 | sock_prot_inc_use(sk->sk_prot); | 235 | sock_prot_inc_use(sk->sk_prot); |
209 | } | 236 | } |
@@ -242,63 +269,78 @@ static struct sock *__udp4_lib_lookup(__be32 saddr, __be16 sport, | |||
242 | { | 269 | { |
243 | struct sock *sk, *result = NULL; | 270 | struct sock *sk, *result = NULL; |
244 | struct hlist_node *node; | 271 | struct hlist_node *node; |
245 | unsigned short hnum = ntohs(dport); | 272 | unsigned int hash, hashwild; |
246 | int badness = -1; | 273 | int score, best = -1; |
274 | |||
275 | hash = hash_port_and_addr(ntohs(dport), daddr); | ||
276 | hashwild = hash_port_and_addr(ntohs(dport), 0); | ||
247 | 277 | ||
248 | read_lock(&udp_hash_lock); | 278 | read_lock(&udp_hash_lock); |
249 | sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) { | 279 | |
280 | lookup: | ||
281 | |||
282 | sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) { | ||
250 | struct inet_sock *inet = inet_sk(sk); | 283 | struct inet_sock *inet = inet_sk(sk); |
251 | 284 | ||
252 | if (sk->sk_hash == hnum && !ipv6_only_sock(sk)) { | 285 | if (sk->sk_hash != hash || ipv6_only_sock(sk) || |
253 | int score = (sk->sk_family == PF_INET ? 1 : 0); | 286 | inet->num != dport) |
254 | if (inet->rcv_saddr) { | 287 | continue; |
255 | if (inet->rcv_saddr != daddr) | 288 | |
256 | continue; | 289 | score = (sk->sk_family == PF_INET ? 1 : 0); |
257 | score+=2; | 290 | if (inet->rcv_saddr) { |
258 | } | 291 | if (inet->rcv_saddr != daddr) |
259 | if (inet->daddr) { | 292 | continue; |
260 | if (inet->daddr != saddr) | 293 | score+=2; |
261 | continue; | 294 | } |
262 | score+=2; | 295 | if (inet->daddr) { |
263 | } | 296 | if (inet->daddr != saddr) |
264 | if (inet->dport) { | 297 | continue; |
265 | if (inet->dport != sport) | 298 | score+=2; |
266 | continue; | 299 | } |
267 | score+=2; | 300 | if (inet->dport) { |
268 | } | 301 | if (inet->dport != sport) |
269 | if (sk->sk_bound_dev_if) { | 302 | continue; |
270 | if (sk->sk_bound_dev_if != dif) | 303 | score+=2; |
271 | continue; | 304 | } |
272 | score+=2; | 305 | if (sk->sk_bound_dev_if) { |
273 | } | 306 | if (sk->sk_bound_dev_if != dif) |
274 | if (score == 9) { | 307 | continue; |
275 | result = sk; | 308 | score+=2; |
276 | break; | 309 | } |
277 | } else if (score > badness) { | 310 | if (score == 9) { |
278 | result = sk; | 311 | result = sk; |
279 | badness = score; | 312 | goto found; |
280 | } | 313 | } else if (score > best) { |
314 | result = sk; | ||
315 | best = score; | ||
281 | } | 316 | } |
282 | } | 317 | } |
318 | |||
319 | if (hash != hashwild) { | ||
320 | hash = hashwild; | ||
321 | goto lookup; | ||
322 | } | ||
323 | found: | ||
283 | if (result) | 324 | if (result) |
284 | sock_hold(result); | 325 | sock_hold(result); |
285 | read_unlock(&udp_hash_lock); | 326 | read_unlock(&udp_hash_lock); |
286 | return result; | 327 | return result; |
287 | } | 328 | } |
288 | 329 | ||
289 | static inline struct sock *udp_v4_mcast_next(struct sock *sk, | 330 | static inline struct sock *udp_v4_mcast_next( |
290 | __be16 loc_port, __be32 loc_addr, | 331 | struct sock *sk, |
291 | __be16 rmt_port, __be32 rmt_addr, | 332 | unsigned int hnum, __be16 loc_port, __be32 loc_addr, |
292 | int dif) | 333 | __be16 rmt_port, __be32 rmt_addr, |
334 | int dif) | ||
293 | { | 335 | { |
294 | struct hlist_node *node; | 336 | struct hlist_node *node; |
295 | struct sock *s = sk; | 337 | struct sock *s = sk; |
296 | unsigned short hnum = ntohs(loc_port); | ||
297 | 338 | ||
298 | sk_for_each_from(s, node) { | 339 | sk_for_each_from(s, node) { |
299 | struct inet_sock *inet = inet_sk(s); | 340 | struct inet_sock *inet = inet_sk(s); |
300 | 341 | ||
301 | if (s->sk_hash != hnum || | 342 | if (s->sk_hash != hnum || |
343 | inet->num != loc_port || | ||
302 | (inet->daddr && inet->daddr != rmt_addr) || | 344 | (inet->daddr && inet->daddr != rmt_addr) || |
303 | (inet->dport != rmt_port && inet->dport) || | 345 | (inet->dport != rmt_port && inet->dport) || |
304 | (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || | 346 | (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || |
@@ -1129,29 +1171,44 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb, | |||
1129 | __be32 saddr, __be32 daddr, | 1171 | __be32 saddr, __be32 daddr, |
1130 | struct hlist_head udptable[]) | 1172 | struct hlist_head udptable[]) |
1131 | { | 1173 | { |
1132 | struct sock *sk; | 1174 | struct sock *sk, *skw, *sknext; |
1133 | int dif; | 1175 | int dif; |
1176 | unsigned int hash = hash_port_and_addr(ntohs(uh->dest), daddr); | ||
1177 | unsigned int hashwild = hash_port_and_addr(ntohs(uh->dest), 0); | ||
1134 | 1178 | ||
1135 | read_lock(&udp_hash_lock); | ||
1136 | sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); | ||
1137 | dif = skb->dev->ifindex; | 1179 | dif = skb->dev->ifindex; |
1138 | sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); | ||
1139 | if (sk) { | ||
1140 | struct sock *sknext = NULL; | ||
1141 | 1180 | ||
1181 | read_lock(&udp_hash_lock); | ||
1182 | |||
1183 | sk = sk_head(&udptable[hash & (UDP_HTABLE_SIZE - 1)]); | ||
1184 | skw = sk_head(&udptable[hashwild & (UDP_HTABLE_SIZE - 1)]); | ||
1185 | |||
1186 | sk = udp_v4_mcast_next(sk, hash, uh->dest, daddr, uh->source, saddr, dif); | ||
1187 | if (!sk) { | ||
1188 | hash = hashwild; | ||
1189 | sk = udp_v4_mcast_next(skw, hash, uh->dest, daddr, uh->source, | ||
1190 | saddr, dif); | ||
1191 | } | ||
1192 | if (sk) { | ||
1142 | do { | 1193 | do { |
1143 | struct sk_buff *skb1 = skb; | 1194 | struct sk_buff *skb1 = skb; |
1144 | 1195 | sknext = udp_v4_mcast_next(sk_next(sk), hash, uh->dest, | |
1145 | sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr, | 1196 | daddr, uh->source, saddr, dif); |
1146 | uh->source, saddr, dif); | 1197 | if (!sknext && hash != hashwild) { |
1198 | hash = hashwild; | ||
1199 | sknext = udp_v4_mcast_next(skw, hash, uh->dest, | ||
1200 | daddr, uh->source, saddr, dif); | ||
1201 | } | ||
1147 | if (sknext) | 1202 | if (sknext) |
1148 | skb1 = skb_clone(skb, GFP_ATOMIC); | 1203 | skb1 = skb_clone(skb, GFP_ATOMIC); |
1149 | 1204 | ||
1150 | if (skb1) { | 1205 | if (skb1) { |
1151 | int ret = udp_queue_rcv_skb(sk, skb1); | 1206 | int ret = udp_queue_rcv_skb(sk, skb1); |
1152 | if (ret > 0) | 1207 | if (ret > 0) |
1153 | /* we should probably re-process instead | 1208 | /* |
1154 | * of dropping packets here. */ | 1209 | * we should probably re-process |
1210 | * instead of dropping packets here. | ||
1211 | */ | ||
1155 | kfree_skb(skb1); | 1212 | kfree_skb(skb1); |
1156 | } | 1213 | } |
1157 | sk = sknext; | 1214 | sk = sknext; |