diff options
Diffstat (limited to 'net/ipv4/udp.c')
-rw-r--r-- | net/ipv4/udp.c | 484 |
1 files changed, 381 insertions, 103 deletions
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0fa9f70e4b19..1f9534846ca9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
@@ -106,7 +106,7 @@ | |||
106 | #include <net/xfrm.h> | 106 | #include <net/xfrm.h> |
107 | #include "udp_impl.h" | 107 | #include "udp_impl.h" |
108 | 108 | ||
109 | struct udp_table udp_table; | 109 | struct udp_table udp_table __read_mostly; |
110 | EXPORT_SYMBOL(udp_table); | 110 | EXPORT_SYMBOL(udp_table); |
111 | 111 | ||
112 | int sysctl_udp_mem[3] __read_mostly; | 112 | int sysctl_udp_mem[3] __read_mostly; |
@@ -121,28 +121,30 @@ EXPORT_SYMBOL(sysctl_udp_wmem_min); | |||
121 | atomic_t udp_memory_allocated; | 121 | atomic_t udp_memory_allocated; |
122 | EXPORT_SYMBOL(udp_memory_allocated); | 122 | EXPORT_SYMBOL(udp_memory_allocated); |
123 | 123 | ||
124 | #define PORTS_PER_CHAIN (65536 / UDP_HTABLE_SIZE) | 124 | #define MAX_UDP_PORTS 65536 |
125 | #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) | ||
125 | 126 | ||
126 | static int udp_lib_lport_inuse(struct net *net, __u16 num, | 127 | static int udp_lib_lport_inuse(struct net *net, __u16 num, |
127 | const struct udp_hslot *hslot, | 128 | const struct udp_hslot *hslot, |
128 | unsigned long *bitmap, | 129 | unsigned long *bitmap, |
129 | struct sock *sk, | 130 | struct sock *sk, |
130 | int (*saddr_comp)(const struct sock *sk1, | 131 | int (*saddr_comp)(const struct sock *sk1, |
131 | const struct sock *sk2)) | 132 | const struct sock *sk2), |
133 | unsigned int log) | ||
132 | { | 134 | { |
133 | struct sock *sk2; | 135 | struct sock *sk2; |
134 | struct hlist_nulls_node *node; | 136 | struct hlist_nulls_node *node; |
135 | 137 | ||
136 | sk_nulls_for_each(sk2, node, &hslot->head) | 138 | sk_nulls_for_each(sk2, node, &hslot->head) |
137 | if (net_eq(sock_net(sk2), net) && | 139 | if (net_eq(sock_net(sk2), net) && |
138 | sk2 != sk && | 140 | sk2 != sk && |
139 | (bitmap || sk2->sk_hash == num) && | 141 | (bitmap || udp_sk(sk2)->udp_port_hash == num) && |
140 | (!sk2->sk_reuse || !sk->sk_reuse) && | 142 | (!sk2->sk_reuse || !sk->sk_reuse) && |
141 | (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if | 143 | (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || |
142 | || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && | 144 | sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && |
143 | (*saddr_comp)(sk, sk2)) { | 145 | (*saddr_comp)(sk, sk2)) { |
144 | if (bitmap) | 146 | if (bitmap) |
145 | __set_bit(sk2->sk_hash / UDP_HTABLE_SIZE, | 147 | __set_bit(udp_sk(sk2)->udp_port_hash >> log, |
146 | bitmap); | 148 | bitmap); |
147 | else | 149 | else |
148 | return 1; | 150 | return 1; |
@@ -150,18 +152,51 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, | |||
150 | return 0; | 152 | return 0; |
151 | } | 153 | } |
152 | 154 | ||
155 | /* | ||
156 | * Note: we still hold spinlock of primary hash chain, so no other writer | ||
157 | * can insert/delete a socket with local_port == num | ||
158 | */ | ||
159 | static int udp_lib_lport_inuse2(struct net *net, __u16 num, | ||
160 | struct udp_hslot *hslot2, | ||
161 | struct sock *sk, | ||
162 | int (*saddr_comp)(const struct sock *sk1, | ||
163 | const struct sock *sk2)) | ||
164 | { | ||
165 | struct sock *sk2; | ||
166 | struct hlist_nulls_node *node; | ||
167 | int res = 0; | ||
168 | |||
169 | spin_lock(&hslot2->lock); | ||
170 | udp_portaddr_for_each_entry(sk2, node, &hslot2->head) | ||
171 | if (net_eq(sock_net(sk2), net) && | ||
172 | sk2 != sk && | ||
173 | (udp_sk(sk2)->udp_port_hash == num) && | ||
174 | (!sk2->sk_reuse || !sk->sk_reuse) && | ||
175 | (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || | ||
176 | sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && | ||
177 | (*saddr_comp)(sk, sk2)) { | ||
178 | res = 1; | ||
179 | break; | ||
180 | } | ||
181 | spin_unlock(&hslot2->lock); | ||
182 | return res; | ||
183 | } | ||
184 | |||
153 | /** | 185 | /** |
154 | * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 | 186 | * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 |
155 | * | 187 | * |
156 | * @sk: socket struct in question | 188 | * @sk: socket struct in question |
157 | * @snum: port number to look up | 189 | * @snum: port number to look up |
158 | * @saddr_comp: AF-dependent comparison of bound local IP addresses | 190 | * @saddr_comp: AF-dependent comparison of bound local IP addresses |
191 | * @hash2_nulladdr: AF-dependent hash value in secondary hash chains, | ||
192 | * with NULL address | ||
159 | */ | 193 | */ |
160 | int udp_lib_get_port(struct sock *sk, unsigned short snum, | 194 | int udp_lib_get_port(struct sock *sk, unsigned short snum, |
161 | int (*saddr_comp)(const struct sock *sk1, | 195 | int (*saddr_comp)(const struct sock *sk1, |
162 | const struct sock *sk2)) | 196 | const struct sock *sk2), |
197 | unsigned int hash2_nulladdr) | ||
163 | { | 198 | { |
164 | struct udp_hslot *hslot; | 199 | struct udp_hslot *hslot, *hslot2; |
165 | struct udp_table *udptable = sk->sk_prot->h.udp_table; | 200 | struct udp_table *udptable = sk->sk_prot->h.udp_table; |
166 | int error = 1; | 201 | int error = 1; |
167 | struct net *net = sock_net(sk); | 202 | struct net *net = sock_net(sk); |
@@ -180,13 +215,15 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, | |||
180 | /* | 215 | /* |
181 | * force rand to be an odd multiple of UDP_HTABLE_SIZE | 216 | * force rand to be an odd multiple of UDP_HTABLE_SIZE |
182 | */ | 217 | */ |
183 | rand = (rand | 1) * UDP_HTABLE_SIZE; | 218 | rand = (rand | 1) * (udptable->mask + 1); |
184 | for (last = first + UDP_HTABLE_SIZE; first != last; first++) { | 219 | for (last = first + udptable->mask + 1; |
185 | hslot = &udptable->hash[udp_hashfn(net, first)]; | 220 | first != last; |
221 | first++) { | ||
222 | hslot = udp_hashslot(udptable, net, first); | ||
186 | bitmap_zero(bitmap, PORTS_PER_CHAIN); | 223 | bitmap_zero(bitmap, PORTS_PER_CHAIN); |
187 | spin_lock_bh(&hslot->lock); | 224 | spin_lock_bh(&hslot->lock); |
188 | udp_lib_lport_inuse(net, snum, hslot, bitmap, sk, | 225 | udp_lib_lport_inuse(net, snum, hslot, bitmap, sk, |
189 | saddr_comp); | 226 | saddr_comp, udptable->log); |
190 | 227 | ||
191 | snum = first; | 228 | snum = first; |
192 | /* | 229 | /* |
@@ -196,7 +233,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, | |||
196 | */ | 233 | */ |
197 | do { | 234 | do { |
198 | if (low <= snum && snum <= high && | 235 | if (low <= snum && snum <= high && |
199 | !test_bit(snum / UDP_HTABLE_SIZE, bitmap)) | 236 | !test_bit(snum >> udptable->log, bitmap)) |
200 | goto found; | 237 | goto found; |
201 | snum += rand; | 238 | snum += rand; |
202 | } while (snum != first); | 239 | } while (snum != first); |
@@ -204,17 +241,51 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, | |||
204 | } | 241 | } |
205 | goto fail; | 242 | goto fail; |
206 | } else { | 243 | } else { |
207 | hslot = &udptable->hash[udp_hashfn(net, snum)]; | 244 | hslot = udp_hashslot(udptable, net, snum); |
208 | spin_lock_bh(&hslot->lock); | 245 | spin_lock_bh(&hslot->lock); |
209 | if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, saddr_comp)) | 246 | if (hslot->count > 10) { |
247 | int exist; | ||
248 | unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum; | ||
249 | |||
250 | slot2 &= udptable->mask; | ||
251 | hash2_nulladdr &= udptable->mask; | ||
252 | |||
253 | hslot2 = udp_hashslot2(udptable, slot2); | ||
254 | if (hslot->count < hslot2->count) | ||
255 | goto scan_primary_hash; | ||
256 | |||
257 | exist = udp_lib_lport_inuse2(net, snum, hslot2, | ||
258 | sk, saddr_comp); | ||
259 | if (!exist && (hash2_nulladdr != slot2)) { | ||
260 | hslot2 = udp_hashslot2(udptable, hash2_nulladdr); | ||
261 | exist = udp_lib_lport_inuse2(net, snum, hslot2, | ||
262 | sk, saddr_comp); | ||
263 | } | ||
264 | if (exist) | ||
265 | goto fail_unlock; | ||
266 | else | ||
267 | goto found; | ||
268 | } | ||
269 | scan_primary_hash: | ||
270 | if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, | ||
271 | saddr_comp, 0)) | ||
210 | goto fail_unlock; | 272 | goto fail_unlock; |
211 | } | 273 | } |
212 | found: | 274 | found: |
213 | inet_sk(sk)->num = snum; | 275 | inet_sk(sk)->inet_num = snum; |
214 | sk->sk_hash = snum; | 276 | udp_sk(sk)->udp_port_hash = snum; |
277 | udp_sk(sk)->udp_portaddr_hash ^= snum; | ||
215 | if (sk_unhashed(sk)) { | 278 | if (sk_unhashed(sk)) { |
216 | sk_nulls_add_node_rcu(sk, &hslot->head); | 279 | sk_nulls_add_node_rcu(sk, &hslot->head); |
280 | hslot->count++; | ||
217 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); | 281 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); |
282 | |||
283 | hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); | ||
284 | spin_lock(&hslot2->lock); | ||
285 | hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, | ||
286 | &hslot2->head); | ||
287 | hslot2->count++; | ||
288 | spin_unlock(&hslot2->lock); | ||
218 | } | 289 | } |
219 | error = 0; | 290 | error = 0; |
220 | fail_unlock: | 291 | fail_unlock: |
@@ -229,13 +300,26 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) | |||
229 | struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); | 300 | struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); |
230 | 301 | ||
231 | return (!ipv6_only_sock(sk2) && | 302 | return (!ipv6_only_sock(sk2) && |
232 | (!inet1->rcv_saddr || !inet2->rcv_saddr || | 303 | (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr || |
233 | inet1->rcv_saddr == inet2->rcv_saddr)); | 304 | inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)); |
305 | } | ||
306 | |||
307 | static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr, | ||
308 | unsigned int port) | ||
309 | { | ||
310 | return jhash_1word(saddr, net_hash_mix(net)) ^ port; | ||
234 | } | 311 | } |
235 | 312 | ||
236 | int udp_v4_get_port(struct sock *sk, unsigned short snum) | 313 | int udp_v4_get_port(struct sock *sk, unsigned short snum) |
237 | { | 314 | { |
238 | return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal); | 315 | unsigned int hash2_nulladdr = |
316 | udp4_portaddr_hash(sock_net(sk), INADDR_ANY, snum); | ||
317 | unsigned int hash2_partial = | ||
318 | udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); | ||
319 | |||
320 | /* precompute partial secondary hash */ | ||
321 | udp_sk(sk)->udp_portaddr_hash = hash2_partial; | ||
322 | return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr); | ||
239 | } | 323 | } |
240 | 324 | ||
241 | static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, | 325 | static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, |
@@ -244,23 +328,61 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, | |||
244 | { | 328 | { |
245 | int score = -1; | 329 | int score = -1; |
246 | 330 | ||
247 | if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && | 331 | if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum && |
248 | !ipv6_only_sock(sk)) { | 332 | !ipv6_only_sock(sk)) { |
249 | struct inet_sock *inet = inet_sk(sk); | 333 | struct inet_sock *inet = inet_sk(sk); |
250 | 334 | ||
251 | score = (sk->sk_family == PF_INET ? 1 : 0); | 335 | score = (sk->sk_family == PF_INET ? 1 : 0); |
252 | if (inet->rcv_saddr) { | 336 | if (inet->inet_rcv_saddr) { |
253 | if (inet->rcv_saddr != daddr) | 337 | if (inet->inet_rcv_saddr != daddr) |
338 | return -1; | ||
339 | score += 2; | ||
340 | } | ||
341 | if (inet->inet_daddr) { | ||
342 | if (inet->inet_daddr != saddr) | ||
343 | return -1; | ||
344 | score += 2; | ||
345 | } | ||
346 | if (inet->inet_dport) { | ||
347 | if (inet->inet_dport != sport) | ||
348 | return -1; | ||
349 | score += 2; | ||
350 | } | ||
351 | if (sk->sk_bound_dev_if) { | ||
352 | if (sk->sk_bound_dev_if != dif) | ||
254 | return -1; | 353 | return -1; |
255 | score += 2; | 354 | score += 2; |
256 | } | 355 | } |
257 | if (inet->daddr) { | 356 | } |
258 | if (inet->daddr != saddr) | 357 | return score; |
358 | } | ||
359 | |||
360 | /* | ||
361 | * In this second variant, we check (daddr, dport) matches (inet_rcv_saddr, inet_num) | ||
362 | */ | ||
363 | #define SCORE2_MAX (1 + 2 + 2 + 2) | ||
364 | static inline int compute_score2(struct sock *sk, struct net *net, | ||
365 | __be32 saddr, __be16 sport, | ||
366 | __be32 daddr, unsigned int hnum, int dif) | ||
367 | { | ||
368 | int score = -1; | ||
369 | |||
370 | if (net_eq(sock_net(sk), net) && !ipv6_only_sock(sk)) { | ||
371 | struct inet_sock *inet = inet_sk(sk); | ||
372 | |||
373 | if (inet->inet_rcv_saddr != daddr) | ||
374 | return -1; | ||
375 | if (inet->inet_num != hnum) | ||
376 | return -1; | ||
377 | |||
378 | score = (sk->sk_family == PF_INET ? 1 : 0); | ||
379 | if (inet->inet_daddr) { | ||
380 | if (inet->inet_daddr != saddr) | ||
259 | return -1; | 381 | return -1; |
260 | score += 2; | 382 | score += 2; |
261 | } | 383 | } |
262 | if (inet->dport) { | 384 | if (inet->inet_dport) { |
263 | if (inet->dport != sport) | 385 | if (inet->inet_dport != sport) |
264 | return -1; | 386 | return -1; |
265 | score += 2; | 387 | score += 2; |
266 | } | 388 | } |
@@ -273,6 +395,51 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, | |||
273 | return score; | 395 | return score; |
274 | } | 396 | } |
275 | 397 | ||
398 | |||
399 | /* called with rcu_read_lock() */ | ||
400 | static struct sock *udp4_lib_lookup2(struct net *net, | ||
401 | __be32 saddr, __be16 sport, | ||
402 | __be32 daddr, unsigned int hnum, int dif, | ||
403 | struct udp_hslot *hslot2, unsigned int slot2) | ||
404 | { | ||
405 | struct sock *sk, *result; | ||
406 | struct hlist_nulls_node *node; | ||
407 | int score, badness; | ||
408 | |||
409 | begin: | ||
410 | result = NULL; | ||
411 | badness = -1; | ||
412 | udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { | ||
413 | score = compute_score2(sk, net, saddr, sport, | ||
414 | daddr, hnum, dif); | ||
415 | if (score > badness) { | ||
416 | result = sk; | ||
417 | badness = score; | ||
418 | if (score == SCORE2_MAX) | ||
419 | goto exact_match; | ||
420 | } | ||
421 | } | ||
422 | /* | ||
423 | * if the nulls value we got at the end of this lookup is | ||
424 | * not the expected one, we must restart lookup. | ||
425 | * We probably met an item that was moved to another chain. | ||
426 | */ | ||
427 | if (get_nulls_value(node) != slot2) | ||
428 | goto begin; | ||
429 | |||
430 | if (result) { | ||
431 | exact_match: | ||
432 | if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) | ||
433 | result = NULL; | ||
434 | else if (unlikely(compute_score2(result, net, saddr, sport, | ||
435 | daddr, hnum, dif) < badness)) { | ||
436 | sock_put(result); | ||
437 | goto begin; | ||
438 | } | ||
439 | } | ||
440 | return result; | ||
441 | } | ||
442 | |||
276 | /* UDP is nearly always wildcards out the wazoo, it makes no sense to try | 443 | /* UDP is nearly always wildcards out the wazoo, it makes no sense to try |
277 | * harder than this. -DaveM | 444 | * harder than this. -DaveM |
278 | */ | 445 | */ |
@@ -283,11 +450,35 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, | |||
283 | struct sock *sk, *result; | 450 | struct sock *sk, *result; |
284 | struct hlist_nulls_node *node; | 451 | struct hlist_nulls_node *node; |
285 | unsigned short hnum = ntohs(dport); | 452 | unsigned short hnum = ntohs(dport); |
286 | unsigned int hash = udp_hashfn(net, hnum); | 453 | unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); |
287 | struct udp_hslot *hslot = &udptable->hash[hash]; | 454 | struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; |
288 | int score, badness; | 455 | int score, badness; |
289 | 456 | ||
290 | rcu_read_lock(); | 457 | rcu_read_lock(); |
458 | if (hslot->count > 10) { | ||
459 | hash2 = udp4_portaddr_hash(net, daddr, hnum); | ||
460 | slot2 = hash2 & udptable->mask; | ||
461 | hslot2 = &udptable->hash2[slot2]; | ||
462 | if (hslot->count < hslot2->count) | ||
463 | goto begin; | ||
464 | |||
465 | result = udp4_lib_lookup2(net, saddr, sport, | ||
466 | daddr, hnum, dif, | ||
467 | hslot2, slot2); | ||
468 | if (!result) { | ||
469 | hash2 = udp4_portaddr_hash(net, INADDR_ANY, hnum); | ||
470 | slot2 = hash2 & udptable->mask; | ||
471 | hslot2 = &udptable->hash2[slot2]; | ||
472 | if (hslot->count < hslot2->count) | ||
473 | goto begin; | ||
474 | |||
475 | result = udp4_lib_lookup2(net, INADDR_ANY, sport, | ||
476 | daddr, hnum, dif, | ||
477 | hslot2, slot2); | ||
478 | } | ||
479 | rcu_read_unlock(); | ||
480 | return result; | ||
481 | } | ||
291 | begin: | 482 | begin: |
292 | result = NULL; | 483 | result = NULL; |
293 | badness = -1; | 484 | badness = -1; |
@@ -304,7 +495,7 @@ begin: | |||
304 | * not the expected one, we must restart lookup. | 495 | * not the expected one, we must restart lookup. |
305 | * We probably met an item that was moved to another chain. | 496 | * We probably met an item that was moved to another chain. |
306 | */ | 497 | */ |
307 | if (get_nulls_value(node) != hash) | 498 | if (get_nulls_value(node) != slot) |
308 | goto begin; | 499 | goto begin; |
309 | 500 | ||
310 | if (result) { | 501 | if (result) { |
@@ -354,12 +545,13 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk, | |||
354 | sk_nulls_for_each_from(s, node) { | 545 | sk_nulls_for_each_from(s, node) { |
355 | struct inet_sock *inet = inet_sk(s); | 546 | struct inet_sock *inet = inet_sk(s); |
356 | 547 | ||
357 | if (!net_eq(sock_net(s), net) || | 548 | if (!net_eq(sock_net(s), net) || |
358 | s->sk_hash != hnum || | 549 | udp_sk(s)->udp_port_hash != hnum || |
359 | (inet->daddr && inet->daddr != rmt_addr) || | 550 | (inet->inet_daddr && inet->inet_daddr != rmt_addr) || |
360 | (inet->dport != rmt_port && inet->dport) || | 551 | (inet->inet_dport != rmt_port && inet->inet_dport) || |
361 | (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || | 552 | (inet->inet_rcv_saddr && |
362 | ipv6_only_sock(s) || | 553 | inet->inet_rcv_saddr != loc_addr) || |
554 | ipv6_only_sock(s) || | ||
363 | (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) | 555 | (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) |
364 | continue; | 556 | continue; |
365 | if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif)) | 557 | if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif)) |
@@ -642,14 +834,14 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
642 | } else { | 834 | } else { |
643 | if (sk->sk_state != TCP_ESTABLISHED) | 835 | if (sk->sk_state != TCP_ESTABLISHED) |
644 | return -EDESTADDRREQ; | 836 | return -EDESTADDRREQ; |
645 | daddr = inet->daddr; | 837 | daddr = inet->inet_daddr; |
646 | dport = inet->dport; | 838 | dport = inet->inet_dport; |
647 | /* Open fast path for connected socket. | 839 | /* Open fast path for connected socket. |
648 | Route will not be used, if at least one option is set. | 840 | Route will not be used, if at least one option is set. |
649 | */ | 841 | */ |
650 | connected = 1; | 842 | connected = 1; |
651 | } | 843 | } |
652 | ipc.addr = inet->saddr; | 844 | ipc.addr = inet->inet_saddr; |
653 | 845 | ||
654 | ipc.oif = sk->sk_bound_dev_if; | 846 | ipc.oif = sk->sk_bound_dev_if; |
655 | err = sock_tx_timestamp(msg, sk, &ipc.shtx); | 847 | err = sock_tx_timestamp(msg, sk, &ipc.shtx); |
@@ -704,7 +896,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
704 | .proto = sk->sk_protocol, | 896 | .proto = sk->sk_protocol, |
705 | .flags = inet_sk_flowi_flags(sk), | 897 | .flags = inet_sk_flowi_flags(sk), |
706 | .uli_u = { .ports = | 898 | .uli_u = { .ports = |
707 | { .sport = inet->sport, | 899 | { .sport = inet->inet_sport, |
708 | .dport = dport } } }; | 900 | .dport = dport } } }; |
709 | struct net *net = sock_net(sk); | 901 | struct net *net = sock_net(sk); |
710 | 902 | ||
@@ -748,7 +940,7 @@ back_from_confirm: | |||
748 | inet->cork.fl.fl4_dst = daddr; | 940 | inet->cork.fl.fl4_dst = daddr; |
749 | inet->cork.fl.fl_ip_dport = dport; | 941 | inet->cork.fl.fl_ip_dport = dport; |
750 | inet->cork.fl.fl4_src = saddr; | 942 | inet->cork.fl.fl4_src = saddr; |
751 | inet->cork.fl.fl_ip_sport = inet->sport; | 943 | inet->cork.fl.fl_ip_sport = inet->inet_sport; |
752 | up->pending = AF_INET; | 944 | up->pending = AF_INET; |
753 | 945 | ||
754 | do_append_data: | 946 | do_append_data: |
@@ -862,6 +1054,7 @@ static unsigned int first_packet_length(struct sock *sk) | |||
862 | udp_lib_checksum_complete(skb)) { | 1054 | udp_lib_checksum_complete(skb)) { |
863 | UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, | 1055 | UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, |
864 | IS_UDPLITE(sk)); | 1056 | IS_UDPLITE(sk)); |
1057 | atomic_inc(&sk->sk_drops); | ||
865 | __skb_unlink(skb, rcvq); | 1058 | __skb_unlink(skb, rcvq); |
866 | __skb_queue_tail(&list_kill, skb); | 1059 | __skb_queue_tail(&list_kill, skb); |
867 | } | 1060 | } |
@@ -982,7 +1175,7 @@ try_again: | |||
982 | UDP_INC_STATS_USER(sock_net(sk), | 1175 | UDP_INC_STATS_USER(sock_net(sk), |
983 | UDP_MIB_INDATAGRAMS, is_udplite); | 1176 | UDP_MIB_INDATAGRAMS, is_udplite); |
984 | 1177 | ||
985 | sock_recv_timestamp(msg, sk, skb); | 1178 | sock_recv_ts_and_drops(msg, sk, skb); |
986 | 1179 | ||
987 | /* Copy the address. */ | 1180 | /* Copy the address. */ |
988 | if (sin) { | 1181 | if (sin) { |
@@ -1023,15 +1216,15 @@ int udp_disconnect(struct sock *sk, int flags) | |||
1023 | */ | 1216 | */ |
1024 | 1217 | ||
1025 | sk->sk_state = TCP_CLOSE; | 1218 | sk->sk_state = TCP_CLOSE; |
1026 | inet->daddr = 0; | 1219 | inet->inet_daddr = 0; |
1027 | inet->dport = 0; | 1220 | inet->inet_dport = 0; |
1028 | sk->sk_bound_dev_if = 0; | 1221 | sk->sk_bound_dev_if = 0; |
1029 | if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) | 1222 | if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) |
1030 | inet_reset_saddr(sk); | 1223 | inet_reset_saddr(sk); |
1031 | 1224 | ||
1032 | if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) { | 1225 | if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) { |
1033 | sk->sk_prot->unhash(sk); | 1226 | sk->sk_prot->unhash(sk); |
1034 | inet->sport = 0; | 1227 | inet->inet_sport = 0; |
1035 | } | 1228 | } |
1036 | sk_dst_reset(sk); | 1229 | sk_dst_reset(sk); |
1037 | return 0; | 1230 | return 0; |
@@ -1042,13 +1235,22 @@ void udp_lib_unhash(struct sock *sk) | |||
1042 | { | 1235 | { |
1043 | if (sk_hashed(sk)) { | 1236 | if (sk_hashed(sk)) { |
1044 | struct udp_table *udptable = sk->sk_prot->h.udp_table; | 1237 | struct udp_table *udptable = sk->sk_prot->h.udp_table; |
1045 | unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash); | 1238 | struct udp_hslot *hslot, *hslot2; |
1046 | struct udp_hslot *hslot = &udptable->hash[hash]; | 1239 | |
1240 | hslot = udp_hashslot(udptable, sock_net(sk), | ||
1241 | udp_sk(sk)->udp_port_hash); | ||
1242 | hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); | ||
1047 | 1243 | ||
1048 | spin_lock_bh(&hslot->lock); | 1244 | spin_lock_bh(&hslot->lock); |
1049 | if (sk_nulls_del_node_init_rcu(sk)) { | 1245 | if (sk_nulls_del_node_init_rcu(sk)) { |
1050 | inet_sk(sk)->num = 0; | 1246 | hslot->count--; |
1247 | inet_sk(sk)->inet_num = 0; | ||
1051 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); | 1248 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); |
1249 | |||
1250 | spin_lock(&hslot2->lock); | ||
1251 | hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); | ||
1252 | hslot2->count--; | ||
1253 | spin_unlock(&hslot2->lock); | ||
1052 | } | 1254 | } |
1053 | spin_unlock_bh(&hslot->lock); | 1255 | spin_unlock_bh(&hslot->lock); |
1054 | } | 1256 | } |
@@ -1057,25 +1259,22 @@ EXPORT_SYMBOL(udp_lib_unhash); | |||
1057 | 1259 | ||
1058 | static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) | 1260 | static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) |
1059 | { | 1261 | { |
1060 | int is_udplite = IS_UDPLITE(sk); | 1262 | int rc = sock_queue_rcv_skb(sk, skb); |
1061 | int rc; | 1263 | |
1264 | if (rc < 0) { | ||
1265 | int is_udplite = IS_UDPLITE(sk); | ||
1062 | 1266 | ||
1063 | if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) { | ||
1064 | /* Note that an ENOMEM error is charged twice */ | 1267 | /* Note that an ENOMEM error is charged twice */ |
1065 | if (rc == -ENOMEM) { | 1268 | if (rc == -ENOMEM) |
1066 | UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, | 1269 | UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, |
1067 | is_udplite); | 1270 | is_udplite); |
1068 | atomic_inc(&sk->sk_drops); | 1271 | UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); |
1069 | } | 1272 | kfree_skb(skb); |
1070 | goto drop; | 1273 | return -1; |
1071 | } | 1274 | } |
1072 | 1275 | ||
1073 | return 0; | 1276 | return 0; |
1074 | 1277 | ||
1075 | drop: | ||
1076 | UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); | ||
1077 | kfree_skb(skb); | ||
1078 | return -1; | ||
1079 | } | 1278 | } |
1080 | 1279 | ||
1081 | /* returns: | 1280 | /* returns: |
@@ -1182,53 +1381,88 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) | |||
1182 | 1381 | ||
1183 | drop: | 1382 | drop: |
1184 | UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); | 1383 | UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); |
1384 | atomic_inc(&sk->sk_drops); | ||
1185 | kfree_skb(skb); | 1385 | kfree_skb(skb); |
1186 | return -1; | 1386 | return -1; |
1187 | } | 1387 | } |
1188 | 1388 | ||
1389 | |||
1390 | static void flush_stack(struct sock **stack, unsigned int count, | ||
1391 | struct sk_buff *skb, unsigned int final) | ||
1392 | { | ||
1393 | unsigned int i; | ||
1394 | struct sk_buff *skb1 = NULL; | ||
1395 | struct sock *sk; | ||
1396 | |||
1397 | for (i = 0; i < count; i++) { | ||
1398 | sk = stack[i]; | ||
1399 | if (likely(skb1 == NULL)) | ||
1400 | skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC); | ||
1401 | |||
1402 | if (!skb1) { | ||
1403 | atomic_inc(&sk->sk_drops); | ||
1404 | UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, | ||
1405 | IS_UDPLITE(sk)); | ||
1406 | UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, | ||
1407 | IS_UDPLITE(sk)); | ||
1408 | } | ||
1409 | |||
1410 | if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0) | ||
1411 | skb1 = NULL; | ||
1412 | } | ||
1413 | if (unlikely(skb1)) | ||
1414 | kfree_skb(skb1); | ||
1415 | } | ||
1416 | |||
1189 | /* | 1417 | /* |
1190 | * Multicasts and broadcasts go to each listener. | 1418 | * Multicasts and broadcasts go to each listener. |
1191 | * | 1419 | * |
1192 | * Note: called only from the BH handler context, | 1420 | * Note: called only from the BH handler context. |
1193 | * so we don't need to lock the hashes. | ||
1194 | */ | 1421 | */ |
1195 | static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, | 1422 | static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, |
1196 | struct udphdr *uh, | 1423 | struct udphdr *uh, |
1197 | __be32 saddr, __be32 daddr, | 1424 | __be32 saddr, __be32 daddr, |
1198 | struct udp_table *udptable) | 1425 | struct udp_table *udptable) |
1199 | { | 1426 | { |
1200 | struct sock *sk; | 1427 | struct sock *sk, *stack[256 / sizeof(struct sock *)]; |
1201 | struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))]; | 1428 | struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest)); |
1202 | int dif; | 1429 | int dif; |
1430 | unsigned int i, count = 0; | ||
1203 | 1431 | ||
1204 | spin_lock(&hslot->lock); | 1432 | spin_lock(&hslot->lock); |
1205 | sk = sk_nulls_head(&hslot->head); | 1433 | sk = sk_nulls_head(&hslot->head); |
1206 | dif = skb->dev->ifindex; | 1434 | dif = skb->dev->ifindex; |
1207 | sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); | 1435 | sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); |
1208 | if (sk) { | 1436 | while (sk) { |
1209 | struct sock *sknext = NULL; | 1437 | stack[count++] = sk; |
1210 | 1438 | sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest, | |
1211 | do { | 1439 | daddr, uh->source, saddr, dif); |
1212 | struct sk_buff *skb1 = skb; | 1440 | if (unlikely(count == ARRAY_SIZE(stack))) { |
1213 | 1441 | if (!sk) | |
1214 | sknext = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest, | 1442 | break; |
1215 | daddr, uh->source, saddr, | 1443 | flush_stack(stack, count, skb, ~0); |
1216 | dif); | 1444 | count = 0; |
1217 | if (sknext) | 1445 | } |
1218 | skb1 = skb_clone(skb, GFP_ATOMIC); | 1446 | } |
1219 | 1447 | /* | |
1220 | if (skb1) { | 1448 | * before releasing chain lock, we must take a reference on sockets |
1221 | int ret = udp_queue_rcv_skb(sk, skb1); | 1449 | */ |
1222 | if (ret > 0) | 1450 | for (i = 0; i < count; i++) |
1223 | /* we should probably re-process instead | 1451 | sock_hold(stack[i]); |
1224 | * of dropping packets here. */ | 1452 | |
1225 | kfree_skb(skb1); | ||
1226 | } | ||
1227 | sk = sknext; | ||
1228 | } while (sknext); | ||
1229 | } else | ||
1230 | consume_skb(skb); | ||
1231 | spin_unlock(&hslot->lock); | 1453 | spin_unlock(&hslot->lock); |
1454 | |||
1455 | /* | ||
1456 | * do the slow work with no lock held | ||
1457 | */ | ||
1458 | if (count) { | ||
1459 | flush_stack(stack, count, skb, count - 1); | ||
1460 | |||
1461 | for (i = 0; i < count; i++) | ||
1462 | sock_put(stack[i]); | ||
1463 | } else { | ||
1464 | kfree_skb(skb); | ||
1465 | } | ||
1232 | return 0; | 1466 | return 0; |
1233 | } | 1467 | } |
1234 | 1468 | ||
@@ -1620,9 +1854,14 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) | |||
1620 | struct udp_iter_state *state = seq->private; | 1854 | struct udp_iter_state *state = seq->private; |
1621 | struct net *net = seq_file_net(seq); | 1855 | struct net *net = seq_file_net(seq); |
1622 | 1856 | ||
1623 | for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { | 1857 | for (state->bucket = start; state->bucket <= state->udp_table->mask; |
1858 | ++state->bucket) { | ||
1624 | struct hlist_nulls_node *node; | 1859 | struct hlist_nulls_node *node; |
1625 | struct udp_hslot *hslot = &state->udp_table->hash[state->bucket]; | 1860 | struct udp_hslot *hslot = &state->udp_table->hash[state->bucket]; |
1861 | |||
1862 | if (hlist_nulls_empty(&hslot->head)) | ||
1863 | continue; | ||
1864 | |||
1626 | spin_lock_bh(&hslot->lock); | 1865 | spin_lock_bh(&hslot->lock); |
1627 | sk_nulls_for_each(sk, node, &hslot->head) { | 1866 | sk_nulls_for_each(sk, node, &hslot->head) { |
1628 | if (!net_eq(sock_net(sk), net)) | 1867 | if (!net_eq(sock_net(sk), net)) |
@@ -1647,7 +1886,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) | |||
1647 | } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); | 1886 | } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); |
1648 | 1887 | ||
1649 | if (!sk) { | 1888 | if (!sk) { |
1650 | if (state->bucket < UDP_HTABLE_SIZE) | 1889 | if (state->bucket <= state->udp_table->mask) |
1651 | spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); | 1890 | spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); |
1652 | return udp_get_first(seq, state->bucket + 1); | 1891 | return udp_get_first(seq, state->bucket + 1); |
1653 | } | 1892 | } |
@@ -1667,7 +1906,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos) | |||
1667 | static void *udp_seq_start(struct seq_file *seq, loff_t *pos) | 1906 | static void *udp_seq_start(struct seq_file *seq, loff_t *pos) |
1668 | { | 1907 | { |
1669 | struct udp_iter_state *state = seq->private; | 1908 | struct udp_iter_state *state = seq->private; |
1670 | state->bucket = UDP_HTABLE_SIZE; | 1909 | state->bucket = MAX_UDP_PORTS; |
1671 | 1910 | ||
1672 | return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN; | 1911 | return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN; |
1673 | } | 1912 | } |
@@ -1689,7 +1928,7 @@ static void udp_seq_stop(struct seq_file *seq, void *v) | |||
1689 | { | 1928 | { |
1690 | struct udp_iter_state *state = seq->private; | 1929 | struct udp_iter_state *state = seq->private; |
1691 | 1930 | ||
1692 | if (state->bucket < UDP_HTABLE_SIZE) | 1931 | if (state->bucket <= state->udp_table->mask) |
1693 | spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); | 1932 | spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); |
1694 | } | 1933 | } |
1695 | 1934 | ||
@@ -1744,12 +1983,12 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, | |||
1744 | int bucket, int *len) | 1983 | int bucket, int *len) |
1745 | { | 1984 | { |
1746 | struct inet_sock *inet = inet_sk(sp); | 1985 | struct inet_sock *inet = inet_sk(sp); |
1747 | __be32 dest = inet->daddr; | 1986 | __be32 dest = inet->inet_daddr; |
1748 | __be32 src = inet->rcv_saddr; | 1987 | __be32 src = inet->inet_rcv_saddr; |
1749 | __u16 destp = ntohs(inet->dport); | 1988 | __u16 destp = ntohs(inet->inet_dport); |
1750 | __u16 srcp = ntohs(inet->sport); | 1989 | __u16 srcp = ntohs(inet->inet_sport); |
1751 | 1990 | ||
1752 | seq_printf(f, "%4d: %08X:%04X %08X:%04X" | 1991 | seq_printf(f, "%5d: %08X:%04X %08X:%04X" |
1753 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n", | 1992 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n", |
1754 | bucket, src, srcp, dest, destp, sp->sk_state, | 1993 | bucket, src, srcp, dest, destp, sp->sk_state, |
1755 | sk_wmem_alloc_get(sp), | 1994 | sk_wmem_alloc_get(sp), |
@@ -1815,21 +2054,60 @@ void udp4_proc_exit(void) | |||
1815 | } | 2054 | } |
1816 | #endif /* CONFIG_PROC_FS */ | 2055 | #endif /* CONFIG_PROC_FS */ |
1817 | 2056 | ||
1818 | void __init udp_table_init(struct udp_table *table) | 2057 | static __initdata unsigned long uhash_entries; |
2058 | static int __init set_uhash_entries(char *str) | ||
1819 | { | 2059 | { |
1820 | int i; | 2060 | if (!str) |
2061 | return 0; | ||
2062 | uhash_entries = simple_strtoul(str, &str, 0); | ||
2063 | if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN) | ||
2064 | uhash_entries = UDP_HTABLE_SIZE_MIN; | ||
2065 | return 1; | ||
2066 | } | ||
2067 | __setup("uhash_entries=", set_uhash_entries); | ||
1821 | 2068 | ||
1822 | for (i = 0; i < UDP_HTABLE_SIZE; i++) { | 2069 | void __init udp_table_init(struct udp_table *table, const char *name) |
2070 | { | ||
2071 | unsigned int i; | ||
2072 | |||
2073 | if (!CONFIG_BASE_SMALL) | ||
2074 | table->hash = alloc_large_system_hash(name, | ||
2075 | 2 * sizeof(struct udp_hslot), | ||
2076 | uhash_entries, | ||
2077 | 21, /* one slot per 2 MB */ | ||
2078 | 0, | ||
2079 | &table->log, | ||
2080 | &table->mask, | ||
2081 | 64 * 1024); | ||
2082 | /* | ||
2083 | * Make sure hash table has the minimum size | ||
2084 | */ | ||
2085 | if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) { | ||
2086 | table->hash = kmalloc(UDP_HTABLE_SIZE_MIN * | ||
2087 | 2 * sizeof(struct udp_hslot), GFP_KERNEL); | ||
2088 | if (!table->hash) | ||
2089 | panic(name); | ||
2090 | table->log = ilog2(UDP_HTABLE_SIZE_MIN); | ||
2091 | table->mask = UDP_HTABLE_SIZE_MIN - 1; | ||
2092 | } | ||
2093 | table->hash2 = table->hash + (table->mask + 1); | ||
2094 | for (i = 0; i <= table->mask; i++) { | ||
1823 | INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); | 2095 | INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); |
2096 | table->hash[i].count = 0; | ||
1824 | spin_lock_init(&table->hash[i].lock); | 2097 | spin_lock_init(&table->hash[i].lock); |
1825 | } | 2098 | } |
2099 | for (i = 0; i <= table->mask; i++) { | ||
2100 | INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i); | ||
2101 | table->hash2[i].count = 0; | ||
2102 | spin_lock_init(&table->hash2[i].lock); | ||
2103 | } | ||
1826 | } | 2104 | } |
1827 | 2105 | ||
1828 | void __init udp_init(void) | 2106 | void __init udp_init(void) |
1829 | { | 2107 | { |
1830 | unsigned long nr_pages, limit; | 2108 | unsigned long nr_pages, limit; |
1831 | 2109 | ||
1832 | udp_table_init(&udp_table); | 2110 | udp_table_init(&udp_table, "UDP"); |
1833 | /* Set the pressure threshold up by the same strategy of TCP. It is a | 2111 | /* Set the pressure threshold up by the same strategy of TCP. It is a |
1834 | * fraction of global memory that is up to 1/2 at 256 MB, decreasing | 2112 | * fraction of global memory that is up to 1/2 at 256 MB, decreasing |
1835 | * toward zero with the amount of memory, with a floor of 128 pages. | 2113 | * toward zero with the amount of memory, with a floor of 128 pages. |