aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEric Dumazet <dada1@cosmosbay.com>2007-04-30 03:26:00 -0400
committerDavid S. Miller <davem@davemloft.net>2007-04-30 03:26:00 -0400
commit6aaf47fa48d3c44280810b1b470261d340e4ed87 (patch)
treeee59c6bd3ee482ef85a7b65f057beff4f4598b9b
parent65def812ab25d7565756e5748d91e22e302197ee (diff)
[PATCH] INET : IPV4 UDP lookups converted to a 2 pass algo
Some people want to have many UDP sockets, binded to a single port but many different addresses. We currently hash all those sockets into a single chain. Processing of incoming packets is very expensive, because the whole chain must be examined to find the best match. I chose in this patch to hash UDP sockets with a hash function that take into account both their port number and address : This has a drawback because we need two lookups : one with a given address, one with a wildcard (null) address. Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--net/ipv4/udp.c171
1 files changed, 114 insertions, 57 deletions
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cec0f2cc49b7..144970704c2c 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -114,14 +114,33 @@ DEFINE_RWLOCK(udp_hash_lock);
114 114
115static int udp_port_rover; 115static int udp_port_rover;
116 116
117static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[]) 117/*
118 * Note about this hash function :
119 * Typical use is probably daddr = 0, only dport is going to vary hash
120 */
121static inline unsigned int hash_port_and_addr(__u16 port, __be32 addr)
122{
123 addr ^= addr >> 16;
124 addr ^= addr >> 8;
125 return port ^ addr;
126}
127
128static inline int __udp_lib_port_inuse(unsigned int hash, int port,
129 __be32 daddr, struct hlist_head udptable[])
118{ 130{
119 struct sock *sk; 131 struct sock *sk;
120 struct hlist_node *node; 132 struct hlist_node *node;
133 struct inet_sock *inet;
121 134
122 sk_for_each(sk, node, &udptable[num & (UDP_HTABLE_SIZE - 1)]) 135 sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) {
123 if (sk->sk_hash == num) 136 if (sk->sk_hash != hash)
137 continue;
138 inet = inet_sk(sk);
139 if (inet->num != port)
140 continue;
141 if (inet->rcv_saddr == daddr)
124 return 1; 142 return 1;
143 }
125 return 0; 144 return 0;
126} 145}
127 146
@@ -142,6 +161,7 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum,
142 struct hlist_node *node; 161 struct hlist_node *node;
143 struct hlist_head *head; 162 struct hlist_head *head;
144 struct sock *sk2; 163 struct sock *sk2;
164 unsigned int hash;
145 int error = 1; 165 int error = 1;
146 166
147 write_lock_bh(&udp_hash_lock); 167 write_lock_bh(&udp_hash_lock);
@@ -156,7 +176,9 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum,
156 for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { 176 for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) {
157 int size; 177 int size;
158 178
159 head = &udptable[result & (UDP_HTABLE_SIZE - 1)]; 179 hash = hash_port_and_addr(result,
180 inet_sk(sk)->rcv_saddr);
181 head = &udptable[hash & (UDP_HTABLE_SIZE - 1)];
160 if (hlist_empty(head)) { 182 if (hlist_empty(head)) {
161 if (result > sysctl_local_port_range[1]) 183 if (result > sysctl_local_port_range[1])
162 result = sysctl_local_port_range[0] + 184 result = sysctl_local_port_range[0] +
@@ -181,7 +203,10 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum,
181 result = sysctl_local_port_range[0] 203 result = sysctl_local_port_range[0]
182 + ((result - sysctl_local_port_range[0]) & 204 + ((result - sysctl_local_port_range[0]) &
183 (UDP_HTABLE_SIZE - 1)); 205 (UDP_HTABLE_SIZE - 1));
184 if (! __udp_lib_lport_inuse(result, udptable)) 206 hash = hash_port_and_addr(result,
207 inet_sk(sk)->rcv_saddr);
208 if (! __udp_lib_port_inuse(hash, result,
209 inet_sk(sk)->rcv_saddr, udptable))
185 break; 210 break;
186 } 211 }
187 if (i >= (1 << 16) / UDP_HTABLE_SIZE) 212 if (i >= (1 << 16) / UDP_HTABLE_SIZE)
@@ -189,11 +214,13 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum,
189gotit: 214gotit:
190 *port_rover = snum = result; 215 *port_rover = snum = result;
191 } else { 216 } else {
192 head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; 217 hash = hash_port_and_addr(snum, inet_sk(sk)->rcv_saddr);
218 head = &udptable[hash & (UDP_HTABLE_SIZE - 1)];
193 219
194 sk_for_each(sk2, node, head) 220 sk_for_each(sk2, node, head)
195 if (sk2->sk_hash == snum && 221 if (sk2->sk_hash == hash &&
196 sk2 != sk && 222 sk2 != sk &&
223 inet_sk(sk2)->num == snum &&
197 (!sk2->sk_reuse || !sk->sk_reuse) && 224 (!sk2->sk_reuse || !sk->sk_reuse) &&
198 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if 225 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if
199 || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && 226 || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
@@ -201,9 +228,9 @@ gotit:
201 goto fail; 228 goto fail;
202 } 229 }
203 inet_sk(sk)->num = snum; 230 inet_sk(sk)->num = snum;
204 sk->sk_hash = snum; 231 sk->sk_hash = hash;
205 if (sk_unhashed(sk)) { 232 if (sk_unhashed(sk)) {
206 head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; 233 head = &udptable[hash & (UDP_HTABLE_SIZE - 1)];
207 sk_add_node(sk, head); 234 sk_add_node(sk, head);
208 sock_prot_inc_use(sk->sk_prot); 235 sock_prot_inc_use(sk->sk_prot);
209 } 236 }
@@ -242,63 +269,78 @@ static struct sock *__udp4_lib_lookup(__be32 saddr, __be16 sport,
242{ 269{
243 struct sock *sk, *result = NULL; 270 struct sock *sk, *result = NULL;
244 struct hlist_node *node; 271 struct hlist_node *node;
245 unsigned short hnum = ntohs(dport); 272 unsigned int hash, hashwild;
246 int badness = -1; 273 int score, best = -1;
274
275 hash = hash_port_and_addr(ntohs(dport), daddr);
276 hashwild = hash_port_and_addr(ntohs(dport), 0);
247 277
248 read_lock(&udp_hash_lock); 278 read_lock(&udp_hash_lock);
249 sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) { 279
280lookup:
281
282 sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) {
250 struct inet_sock *inet = inet_sk(sk); 283 struct inet_sock *inet = inet_sk(sk);
251 284
252 if (sk->sk_hash == hnum && !ipv6_only_sock(sk)) { 285 if (sk->sk_hash != hash || ipv6_only_sock(sk) ||
253 int score = (sk->sk_family == PF_INET ? 1 : 0); 286 inet->num != dport)
254 if (inet->rcv_saddr) { 287 continue;
255 if (inet->rcv_saddr != daddr) 288
256 continue; 289 score = (sk->sk_family == PF_INET ? 1 : 0);
257 score+=2; 290 if (inet->rcv_saddr) {
258 } 291 if (inet->rcv_saddr != daddr)
259 if (inet->daddr) { 292 continue;
260 if (inet->daddr != saddr) 293 score+=2;
261 continue; 294 }
262 score+=2; 295 if (inet->daddr) {
263 } 296 if (inet->daddr != saddr)
264 if (inet->dport) { 297 continue;
265 if (inet->dport != sport) 298 score+=2;
266 continue; 299 }
267 score+=2; 300 if (inet->dport) {
268 } 301 if (inet->dport != sport)
269 if (sk->sk_bound_dev_if) { 302 continue;
270 if (sk->sk_bound_dev_if != dif) 303 score+=2;
271 continue; 304 }
272 score+=2; 305 if (sk->sk_bound_dev_if) {
273 } 306 if (sk->sk_bound_dev_if != dif)
274 if (score == 9) { 307 continue;
275 result = sk; 308 score+=2;
276 break; 309 }
277 } else if (score > badness) { 310 if (score == 9) {
278 result = sk; 311 result = sk;
279 badness = score; 312 goto found;
280 } 313 } else if (score > best) {
314 result = sk;
315 best = score;
281 } 316 }
282 } 317 }
318
319 if (hash != hashwild) {
320 hash = hashwild;
321 goto lookup;
322 }
323found:
283 if (result) 324 if (result)
284 sock_hold(result); 325 sock_hold(result);
285 read_unlock(&udp_hash_lock); 326 read_unlock(&udp_hash_lock);
286 return result; 327 return result;
287} 328}
288 329
289static inline struct sock *udp_v4_mcast_next(struct sock *sk, 330static inline struct sock *udp_v4_mcast_next(
290 __be16 loc_port, __be32 loc_addr, 331 struct sock *sk,
291 __be16 rmt_port, __be32 rmt_addr, 332 unsigned int hnum, __be16 loc_port, __be32 loc_addr,
292 int dif) 333 __be16 rmt_port, __be32 rmt_addr,
334 int dif)
293{ 335{
294 struct hlist_node *node; 336 struct hlist_node *node;
295 struct sock *s = sk; 337 struct sock *s = sk;
296 unsigned short hnum = ntohs(loc_port);
297 338
298 sk_for_each_from(s, node) { 339 sk_for_each_from(s, node) {
299 struct inet_sock *inet = inet_sk(s); 340 struct inet_sock *inet = inet_sk(s);
300 341
301 if (s->sk_hash != hnum || 342 if (s->sk_hash != hnum ||
343 inet->num != loc_port ||
302 (inet->daddr && inet->daddr != rmt_addr) || 344 (inet->daddr && inet->daddr != rmt_addr) ||
303 (inet->dport != rmt_port && inet->dport) || 345 (inet->dport != rmt_port && inet->dport) ||
304 (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || 346 (inet->rcv_saddr && inet->rcv_saddr != loc_addr) ||
@@ -1129,29 +1171,44 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb,
1129 __be32 saddr, __be32 daddr, 1171 __be32 saddr, __be32 daddr,
1130 struct hlist_head udptable[]) 1172 struct hlist_head udptable[])
1131{ 1173{
1132 struct sock *sk; 1174 struct sock *sk, *skw, *sknext;
1133 int dif; 1175 int dif;
1176 unsigned int hash = hash_port_and_addr(ntohs(uh->dest), daddr);
1177 unsigned int hashwild = hash_port_and_addr(ntohs(uh->dest), 0);
1134 1178
1135 read_lock(&udp_hash_lock);
1136 sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]);
1137 dif = skb->dev->ifindex; 1179 dif = skb->dev->ifindex;
1138 sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
1139 if (sk) {
1140 struct sock *sknext = NULL;
1141 1180
1181 read_lock(&udp_hash_lock);
1182
1183 sk = sk_head(&udptable[hash & (UDP_HTABLE_SIZE - 1)]);
1184 skw = sk_head(&udptable[hashwild & (UDP_HTABLE_SIZE - 1)]);
1185
1186 sk = udp_v4_mcast_next(sk, hash, uh->dest, daddr, uh->source, saddr, dif);
1187 if (!sk) {
1188 hash = hashwild;
1189 sk = udp_v4_mcast_next(skw, hash, uh->dest, daddr, uh->source,
1190 saddr, dif);
1191 }
1192 if (sk) {
1142 do { 1193 do {
1143 struct sk_buff *skb1 = skb; 1194 struct sk_buff *skb1 = skb;
1144 1195 sknext = udp_v4_mcast_next(sk_next(sk), hash, uh->dest,
1145 sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr, 1196 daddr, uh->source, saddr, dif);
1146 uh->source, saddr, dif); 1197 if (!sknext && hash != hashwild) {
1198 hash = hashwild;
1199 sknext = udp_v4_mcast_next(skw, hash, uh->dest,
1200 daddr, uh->source, saddr, dif);
1201 }
1147 if (sknext) 1202 if (sknext)
1148 skb1 = skb_clone(skb, GFP_ATOMIC); 1203 skb1 = skb_clone(skb, GFP_ATOMIC);
1149 1204
1150 if (skb1) { 1205 if (skb1) {
1151 int ret = udp_queue_rcv_skb(sk, skb1); 1206 int ret = udp_queue_rcv_skb(sk, skb1);
1152 if (ret > 0) 1207 if (ret > 0)
1153 /* we should probably re-process instead 1208 /*
1154 * of dropping packets here. */ 1209 * we should probably re-process
1210 * instead of dropping packets here.
1211 */
1155 kfree_skb(skb1); 1212 kfree_skb(skb1);
1156 } 1213 }
1157 sk = sknext; 1214 sk = sknext;