aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/udp.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/udp.c')
-rw-r--r--net/ipv4/udp.c484
1 files changed, 381 insertions, 103 deletions
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0fa9f70e4b19..1f9534846ca9 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -106,7 +106,7 @@
106#include <net/xfrm.h> 106#include <net/xfrm.h>
107#include "udp_impl.h" 107#include "udp_impl.h"
108 108
109struct udp_table udp_table; 109struct udp_table udp_table __read_mostly;
110EXPORT_SYMBOL(udp_table); 110EXPORT_SYMBOL(udp_table);
111 111
112int sysctl_udp_mem[3] __read_mostly; 112int sysctl_udp_mem[3] __read_mostly;
@@ -121,28 +121,30 @@ EXPORT_SYMBOL(sysctl_udp_wmem_min);
121atomic_t udp_memory_allocated; 121atomic_t udp_memory_allocated;
122EXPORT_SYMBOL(udp_memory_allocated); 122EXPORT_SYMBOL(udp_memory_allocated);
123 123
124#define PORTS_PER_CHAIN (65536 / UDP_HTABLE_SIZE) 124#define MAX_UDP_PORTS 65536
125#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
125 126
126static int udp_lib_lport_inuse(struct net *net, __u16 num, 127static int udp_lib_lport_inuse(struct net *net, __u16 num,
127 const struct udp_hslot *hslot, 128 const struct udp_hslot *hslot,
128 unsigned long *bitmap, 129 unsigned long *bitmap,
129 struct sock *sk, 130 struct sock *sk,
130 int (*saddr_comp)(const struct sock *sk1, 131 int (*saddr_comp)(const struct sock *sk1,
131 const struct sock *sk2)) 132 const struct sock *sk2),
133 unsigned int log)
132{ 134{
133 struct sock *sk2; 135 struct sock *sk2;
134 struct hlist_nulls_node *node; 136 struct hlist_nulls_node *node;
135 137
136 sk_nulls_for_each(sk2, node, &hslot->head) 138 sk_nulls_for_each(sk2, node, &hslot->head)
137 if (net_eq(sock_net(sk2), net) && 139 if (net_eq(sock_net(sk2), net) &&
138 sk2 != sk && 140 sk2 != sk &&
139 (bitmap || sk2->sk_hash == num) && 141 (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
140 (!sk2->sk_reuse || !sk->sk_reuse) && 142 (!sk2->sk_reuse || !sk->sk_reuse) &&
141 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if 143 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
142 || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && 144 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
143 (*saddr_comp)(sk, sk2)) { 145 (*saddr_comp)(sk, sk2)) {
144 if (bitmap) 146 if (bitmap)
145 __set_bit(sk2->sk_hash / UDP_HTABLE_SIZE, 147 __set_bit(udp_sk(sk2)->udp_port_hash >> log,
146 bitmap); 148 bitmap);
147 else 149 else
148 return 1; 150 return 1;
@@ -150,18 +152,51 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
150 return 0; 152 return 0;
151} 153}
152 154
155/*
156 * Note: we still hold spinlock of primary hash chain, so no other writer
157 * can insert/delete a socket with local_port == num
158 */
159static int udp_lib_lport_inuse2(struct net *net, __u16 num,
160 struct udp_hslot *hslot2,
161 struct sock *sk,
162 int (*saddr_comp)(const struct sock *sk1,
163 const struct sock *sk2))
164{
165 struct sock *sk2;
166 struct hlist_nulls_node *node;
167 int res = 0;
168
169 spin_lock(&hslot2->lock);
170 udp_portaddr_for_each_entry(sk2, node, &hslot2->head)
171 if (net_eq(sock_net(sk2), net) &&
172 sk2 != sk &&
173 (udp_sk(sk2)->udp_port_hash == num) &&
174 (!sk2->sk_reuse || !sk->sk_reuse) &&
175 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
176 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
177 (*saddr_comp)(sk, sk2)) {
178 res = 1;
179 break;
180 }
181 spin_unlock(&hslot2->lock);
182 return res;
183}
184
153/** 185/**
154 * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 186 * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6
155 * 187 *
156 * @sk: socket struct in question 188 * @sk: socket struct in question
157 * @snum: port number to look up 189 * @snum: port number to look up
158 * @saddr_comp: AF-dependent comparison of bound local IP addresses 190 * @saddr_comp: AF-dependent comparison of bound local IP addresses
191 * @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
192 * with NULL address
159 */ 193 */
160int udp_lib_get_port(struct sock *sk, unsigned short snum, 194int udp_lib_get_port(struct sock *sk, unsigned short snum,
161 int (*saddr_comp)(const struct sock *sk1, 195 int (*saddr_comp)(const struct sock *sk1,
162 const struct sock *sk2)) 196 const struct sock *sk2),
197 unsigned int hash2_nulladdr)
163{ 198{
164 struct udp_hslot *hslot; 199 struct udp_hslot *hslot, *hslot2;
165 struct udp_table *udptable = sk->sk_prot->h.udp_table; 200 struct udp_table *udptable = sk->sk_prot->h.udp_table;
166 int error = 1; 201 int error = 1;
167 struct net *net = sock_net(sk); 202 struct net *net = sock_net(sk);
@@ -180,13 +215,15 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
180 /* 215 /*
181 * force rand to be an odd multiple of UDP_HTABLE_SIZE 216 * force rand to be an odd multiple of UDP_HTABLE_SIZE
182 */ 217 */
183 rand = (rand | 1) * UDP_HTABLE_SIZE; 218 rand = (rand | 1) * (udptable->mask + 1);
184 for (last = first + UDP_HTABLE_SIZE; first != last; first++) { 219 for (last = first + udptable->mask + 1;
185 hslot = &udptable->hash[udp_hashfn(net, first)]; 220 first != last;
221 first++) {
222 hslot = udp_hashslot(udptable, net, first);
186 bitmap_zero(bitmap, PORTS_PER_CHAIN); 223 bitmap_zero(bitmap, PORTS_PER_CHAIN);
187 spin_lock_bh(&hslot->lock); 224 spin_lock_bh(&hslot->lock);
188 udp_lib_lport_inuse(net, snum, hslot, bitmap, sk, 225 udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
189 saddr_comp); 226 saddr_comp, udptable->log);
190 227
191 snum = first; 228 snum = first;
192 /* 229 /*
@@ -196,7 +233,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
196 */ 233 */
197 do { 234 do {
198 if (low <= snum && snum <= high && 235 if (low <= snum && snum <= high &&
199 !test_bit(snum / UDP_HTABLE_SIZE, bitmap)) 236 !test_bit(snum >> udptable->log, bitmap))
200 goto found; 237 goto found;
201 snum += rand; 238 snum += rand;
202 } while (snum != first); 239 } while (snum != first);
@@ -204,17 +241,51 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
204 } 241 }
205 goto fail; 242 goto fail;
206 } else { 243 } else {
207 hslot = &udptable->hash[udp_hashfn(net, snum)]; 244 hslot = udp_hashslot(udptable, net, snum);
208 spin_lock_bh(&hslot->lock); 245 spin_lock_bh(&hslot->lock);
209 if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, saddr_comp)) 246 if (hslot->count > 10) {
247 int exist;
248 unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;
249
250 slot2 &= udptable->mask;
251 hash2_nulladdr &= udptable->mask;
252
253 hslot2 = udp_hashslot2(udptable, slot2);
254 if (hslot->count < hslot2->count)
255 goto scan_primary_hash;
256
257 exist = udp_lib_lport_inuse2(net, snum, hslot2,
258 sk, saddr_comp);
259 if (!exist && (hash2_nulladdr != slot2)) {
260 hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
261 exist = udp_lib_lport_inuse2(net, snum, hslot2,
262 sk, saddr_comp);
263 }
264 if (exist)
265 goto fail_unlock;
266 else
267 goto found;
268 }
269scan_primary_hash:
270 if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
271 saddr_comp, 0))
210 goto fail_unlock; 272 goto fail_unlock;
211 } 273 }
212found: 274found:
213 inet_sk(sk)->num = snum; 275 inet_sk(sk)->inet_num = snum;
214 sk->sk_hash = snum; 276 udp_sk(sk)->udp_port_hash = snum;
277 udp_sk(sk)->udp_portaddr_hash ^= snum;
215 if (sk_unhashed(sk)) { 278 if (sk_unhashed(sk)) {
216 sk_nulls_add_node_rcu(sk, &hslot->head); 279 sk_nulls_add_node_rcu(sk, &hslot->head);
280 hslot->count++;
217 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 281 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
282
283 hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
284 spin_lock(&hslot2->lock);
285 hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
286 &hslot2->head);
287 hslot2->count++;
288 spin_unlock(&hslot2->lock);
218 } 289 }
219 error = 0; 290 error = 0;
220fail_unlock: 291fail_unlock:
@@ -229,13 +300,26 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
229 struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); 300 struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
230 301
231 return (!ipv6_only_sock(sk2) && 302 return (!ipv6_only_sock(sk2) &&
232 (!inet1->rcv_saddr || !inet2->rcv_saddr || 303 (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr ||
233 inet1->rcv_saddr == inet2->rcv_saddr)); 304 inet1->inet_rcv_saddr == inet2->inet_rcv_saddr));
305}
306
307static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr,
308 unsigned int port)
309{
310 return jhash_1word(saddr, net_hash_mix(net)) ^ port;
234} 311}
235 312
236int udp_v4_get_port(struct sock *sk, unsigned short snum) 313int udp_v4_get_port(struct sock *sk, unsigned short snum)
237{ 314{
238 return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal); 315 unsigned int hash2_nulladdr =
316 udp4_portaddr_hash(sock_net(sk), INADDR_ANY, snum);
317 unsigned int hash2_partial =
318 udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
319
320 /* precompute partial secondary hash */
321 udp_sk(sk)->udp_portaddr_hash = hash2_partial;
322 return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
239} 323}
240 324
241static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, 325static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
@@ -244,23 +328,61 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
244{ 328{
245 int score = -1; 329 int score = -1;
246 330
247 if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && 331 if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum &&
248 !ipv6_only_sock(sk)) { 332 !ipv6_only_sock(sk)) {
249 struct inet_sock *inet = inet_sk(sk); 333 struct inet_sock *inet = inet_sk(sk);
250 334
251 score = (sk->sk_family == PF_INET ? 1 : 0); 335 score = (sk->sk_family == PF_INET ? 1 : 0);
252 if (inet->rcv_saddr) { 336 if (inet->inet_rcv_saddr) {
253 if (inet->rcv_saddr != daddr) 337 if (inet->inet_rcv_saddr != daddr)
338 return -1;
339 score += 2;
340 }
341 if (inet->inet_daddr) {
342 if (inet->inet_daddr != saddr)
343 return -1;
344 score += 2;
345 }
346 if (inet->inet_dport) {
347 if (inet->inet_dport != sport)
348 return -1;
349 score += 2;
350 }
351 if (sk->sk_bound_dev_if) {
352 if (sk->sk_bound_dev_if != dif)
254 return -1; 353 return -1;
255 score += 2; 354 score += 2;
256 } 355 }
257 if (inet->daddr) { 356 }
258 if (inet->daddr != saddr) 357 return score;
358}
359
360/*
361 * In this second variant, we check (daddr, dport) matches (inet_rcv_saddr, inet_num)
362 */
363#define SCORE2_MAX (1 + 2 + 2 + 2)
364static inline int compute_score2(struct sock *sk, struct net *net,
365 __be32 saddr, __be16 sport,
366 __be32 daddr, unsigned int hnum, int dif)
367{
368 int score = -1;
369
370 if (net_eq(sock_net(sk), net) && !ipv6_only_sock(sk)) {
371 struct inet_sock *inet = inet_sk(sk);
372
373 if (inet->inet_rcv_saddr != daddr)
374 return -1;
375 if (inet->inet_num != hnum)
376 return -1;
377
378 score = (sk->sk_family == PF_INET ? 1 : 0);
379 if (inet->inet_daddr) {
380 if (inet->inet_daddr != saddr)
259 return -1; 381 return -1;
260 score += 2; 382 score += 2;
261 } 383 }
262 if (inet->dport) { 384 if (inet->inet_dport) {
263 if (inet->dport != sport) 385 if (inet->inet_dport != sport)
264 return -1; 386 return -1;
265 score += 2; 387 score += 2;
266 } 388 }
@@ -273,6 +395,51 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
273 return score; 395 return score;
274} 396}
275 397
398
399/* called with rcu_read_lock() held */
400static struct sock *udp4_lib_lookup2(struct net *net,
401 __be32 saddr, __be16 sport,
402 __be32 daddr, unsigned int hnum, int dif,
403 struct udp_hslot *hslot2, unsigned int slot2)
404{
405 struct sock *sk, *result;
406 struct hlist_nulls_node *node;
407 int score, badness;
408
409begin:
410 result = NULL;
411 badness = -1;
412 udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
413 score = compute_score2(sk, net, saddr, sport,
414 daddr, hnum, dif);
415 if (score > badness) {
416 result = sk;
417 badness = score;
418 if (score == SCORE2_MAX)
419 goto exact_match;
420 }
421 }
422 /*
423 * if the nulls value we got at the end of this lookup is
424 * not the expected one, we must restart lookup.
425 * We probably met an item that was moved to another chain.
426 */
427 if (get_nulls_value(node) != slot2)
428 goto begin;
429
430 if (result) {
431exact_match:
432 if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
433 result = NULL;
434 else if (unlikely(compute_score2(result, net, saddr, sport,
435 daddr, hnum, dif) < badness)) {
436 sock_put(result);
437 goto begin;
438 }
439 }
440 return result;
441}
442
276/* UDP is nearly always wildcards out the wazoo, it makes no sense to try 443/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
277 * harder than this. -DaveM 444 * harder than this. -DaveM
278 */ 445 */
@@ -283,11 +450,35 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
283 struct sock *sk, *result; 450 struct sock *sk, *result;
284 struct hlist_nulls_node *node; 451 struct hlist_nulls_node *node;
285 unsigned short hnum = ntohs(dport); 452 unsigned short hnum = ntohs(dport);
286 unsigned int hash = udp_hashfn(net, hnum); 453 unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
287 struct udp_hslot *hslot = &udptable->hash[hash]; 454 struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
288 int score, badness; 455 int score, badness;
289 456
290 rcu_read_lock(); 457 rcu_read_lock();
458 if (hslot->count > 10) {
459 hash2 = udp4_portaddr_hash(net, daddr, hnum);
460 slot2 = hash2 & udptable->mask;
461 hslot2 = &udptable->hash2[slot2];
462 if (hslot->count < hslot2->count)
463 goto begin;
464
465 result = udp4_lib_lookup2(net, saddr, sport,
466 daddr, hnum, dif,
467 hslot2, slot2);
468 if (!result) {
469 hash2 = udp4_portaddr_hash(net, INADDR_ANY, hnum);
470 slot2 = hash2 & udptable->mask;
471 hslot2 = &udptable->hash2[slot2];
472 if (hslot->count < hslot2->count)
473 goto begin;
474
475 result = udp4_lib_lookup2(net, INADDR_ANY, sport,
476 daddr, hnum, dif,
477 hslot2, slot2);
478 }
479 rcu_read_unlock();
480 return result;
481 }
291begin: 482begin:
292 result = NULL; 483 result = NULL;
293 badness = -1; 484 badness = -1;
@@ -304,7 +495,7 @@ begin:
304 * not the expected one, we must restart lookup. 495 * not the expected one, we must restart lookup.
305 * We probably met an item that was moved to another chain. 496 * We probably met an item that was moved to another chain.
306 */ 497 */
307 if (get_nulls_value(node) != hash) 498 if (get_nulls_value(node) != slot)
308 goto begin; 499 goto begin;
309 500
310 if (result) { 501 if (result) {
@@ -354,12 +545,13 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
354 sk_nulls_for_each_from(s, node) { 545 sk_nulls_for_each_from(s, node) {
355 struct inet_sock *inet = inet_sk(s); 546 struct inet_sock *inet = inet_sk(s);
356 547
357 if (!net_eq(sock_net(s), net) || 548 if (!net_eq(sock_net(s), net) ||
358 s->sk_hash != hnum || 549 udp_sk(s)->udp_port_hash != hnum ||
359 (inet->daddr && inet->daddr != rmt_addr) || 550 (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
360 (inet->dport != rmt_port && inet->dport) || 551 (inet->inet_dport != rmt_port && inet->inet_dport) ||
361 (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || 552 (inet->inet_rcv_saddr &&
362 ipv6_only_sock(s) || 553 inet->inet_rcv_saddr != loc_addr) ||
554 ipv6_only_sock(s) ||
363 (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) 555 (s->sk_bound_dev_if && s->sk_bound_dev_if != dif))
364 continue; 556 continue;
365 if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif)) 557 if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif))
@@ -642,14 +834,14 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
642 } else { 834 } else {
643 if (sk->sk_state != TCP_ESTABLISHED) 835 if (sk->sk_state != TCP_ESTABLISHED)
644 return -EDESTADDRREQ; 836 return -EDESTADDRREQ;
645 daddr = inet->daddr; 837 daddr = inet->inet_daddr;
646 dport = inet->dport; 838 dport = inet->inet_dport;
647 /* Open fast path for connected socket. 839 /* Open fast path for connected socket.
648 Route will not be used, if at least one option is set. 840 Route will not be used, if at least one option is set.
649 */ 841 */
650 connected = 1; 842 connected = 1;
651 } 843 }
652 ipc.addr = inet->saddr; 844 ipc.addr = inet->inet_saddr;
653 845
654 ipc.oif = sk->sk_bound_dev_if; 846 ipc.oif = sk->sk_bound_dev_if;
655 err = sock_tx_timestamp(msg, sk, &ipc.shtx); 847 err = sock_tx_timestamp(msg, sk, &ipc.shtx);
@@ -704,7 +896,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
704 .proto = sk->sk_protocol, 896 .proto = sk->sk_protocol,
705 .flags = inet_sk_flowi_flags(sk), 897 .flags = inet_sk_flowi_flags(sk),
706 .uli_u = { .ports = 898 .uli_u = { .ports =
707 { .sport = inet->sport, 899 { .sport = inet->inet_sport,
708 .dport = dport } } }; 900 .dport = dport } } };
709 struct net *net = sock_net(sk); 901 struct net *net = sock_net(sk);
710 902
@@ -748,7 +940,7 @@ back_from_confirm:
748 inet->cork.fl.fl4_dst = daddr; 940 inet->cork.fl.fl4_dst = daddr;
749 inet->cork.fl.fl_ip_dport = dport; 941 inet->cork.fl.fl_ip_dport = dport;
750 inet->cork.fl.fl4_src = saddr; 942 inet->cork.fl.fl4_src = saddr;
751 inet->cork.fl.fl_ip_sport = inet->sport; 943 inet->cork.fl.fl_ip_sport = inet->inet_sport;
752 up->pending = AF_INET; 944 up->pending = AF_INET;
753 945
754do_append_data: 946do_append_data:
@@ -862,6 +1054,7 @@ static unsigned int first_packet_length(struct sock *sk)
862 udp_lib_checksum_complete(skb)) { 1054 udp_lib_checksum_complete(skb)) {
863 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, 1055 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
864 IS_UDPLITE(sk)); 1056 IS_UDPLITE(sk));
1057 atomic_inc(&sk->sk_drops);
865 __skb_unlink(skb, rcvq); 1058 __skb_unlink(skb, rcvq);
866 __skb_queue_tail(&list_kill, skb); 1059 __skb_queue_tail(&list_kill, skb);
867 } 1060 }
@@ -982,7 +1175,7 @@ try_again:
982 UDP_INC_STATS_USER(sock_net(sk), 1175 UDP_INC_STATS_USER(sock_net(sk),
983 UDP_MIB_INDATAGRAMS, is_udplite); 1176 UDP_MIB_INDATAGRAMS, is_udplite);
984 1177
985 sock_recv_timestamp(msg, sk, skb); 1178 sock_recv_ts_and_drops(msg, sk, skb);
986 1179
987 /* Copy the address. */ 1180 /* Copy the address. */
988 if (sin) { 1181 if (sin) {
@@ -1023,15 +1216,15 @@ int udp_disconnect(struct sock *sk, int flags)
1023 */ 1216 */
1024 1217
1025 sk->sk_state = TCP_CLOSE; 1218 sk->sk_state = TCP_CLOSE;
1026 inet->daddr = 0; 1219 inet->inet_daddr = 0;
1027 inet->dport = 0; 1220 inet->inet_dport = 0;
1028 sk->sk_bound_dev_if = 0; 1221 sk->sk_bound_dev_if = 0;
1029 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) 1222 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1030 inet_reset_saddr(sk); 1223 inet_reset_saddr(sk);
1031 1224
1032 if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) { 1225 if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
1033 sk->sk_prot->unhash(sk); 1226 sk->sk_prot->unhash(sk);
1034 inet->sport = 0; 1227 inet->inet_sport = 0;
1035 } 1228 }
1036 sk_dst_reset(sk); 1229 sk_dst_reset(sk);
1037 return 0; 1230 return 0;
@@ -1042,13 +1235,22 @@ void udp_lib_unhash(struct sock *sk)
1042{ 1235{
1043 if (sk_hashed(sk)) { 1236 if (sk_hashed(sk)) {
1044 struct udp_table *udptable = sk->sk_prot->h.udp_table; 1237 struct udp_table *udptable = sk->sk_prot->h.udp_table;
1045 unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash); 1238 struct udp_hslot *hslot, *hslot2;
1046 struct udp_hslot *hslot = &udptable->hash[hash]; 1239
1240 hslot = udp_hashslot(udptable, sock_net(sk),
1241 udp_sk(sk)->udp_port_hash);
1242 hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
1047 1243
1048 spin_lock_bh(&hslot->lock); 1244 spin_lock_bh(&hslot->lock);
1049 if (sk_nulls_del_node_init_rcu(sk)) { 1245 if (sk_nulls_del_node_init_rcu(sk)) {
1050 inet_sk(sk)->num = 0; 1246 hslot->count--;
1247 inet_sk(sk)->inet_num = 0;
1051 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 1248 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
1249
1250 spin_lock(&hslot2->lock);
1251 hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
1252 hslot2->count--;
1253 spin_unlock(&hslot2->lock);
1052 } 1254 }
1053 spin_unlock_bh(&hslot->lock); 1255 spin_unlock_bh(&hslot->lock);
1054 } 1256 }
@@ -1057,25 +1259,22 @@ EXPORT_SYMBOL(udp_lib_unhash);
1057 1259
1058static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 1260static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1059{ 1261{
1060 int is_udplite = IS_UDPLITE(sk); 1262 int rc = sock_queue_rcv_skb(sk, skb);
1061 int rc; 1263
1264 if (rc < 0) {
1265 int is_udplite = IS_UDPLITE(sk);
1062 1266
1063 if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) {
1064 /* Note that an ENOMEM error is charged twice */ 1267 /* Note that an ENOMEM error is charged twice */
1065 if (rc == -ENOMEM) { 1268 if (rc == -ENOMEM)
1066 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, 1269 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
1067 is_udplite); 1270 is_udplite);
1068 atomic_inc(&sk->sk_drops); 1271 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
1069 } 1272 kfree_skb(skb);
1070 goto drop; 1273 return -1;
1071 } 1274 }
1072 1275
1073 return 0; 1276 return 0;
1074 1277
1075drop:
1076 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
1077 kfree_skb(skb);
1078 return -1;
1079} 1278}
1080 1279
1081/* returns: 1280/* returns:
@@ -1182,53 +1381,88 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1182 1381
1183drop: 1382drop:
1184 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); 1383 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
1384 atomic_inc(&sk->sk_drops);
1185 kfree_skb(skb); 1385 kfree_skb(skb);
1186 return -1; 1386 return -1;
1187} 1387}
1188 1388
1389
1390static void flush_stack(struct sock **stack, unsigned int count,
1391 struct sk_buff *skb, unsigned int final)
1392{
1393 unsigned int i;
1394 struct sk_buff *skb1 = NULL;
1395 struct sock *sk;
1396
1397 for (i = 0; i < count; i++) {
1398 sk = stack[i];
1399 if (likely(skb1 == NULL))
1400 skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
1401
1402 if (!skb1) {
1403 atomic_inc(&sk->sk_drops);
1404 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
1405 IS_UDPLITE(sk));
1406 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
1407 IS_UDPLITE(sk));
1408 }
1409
1410 if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
1411 skb1 = NULL;
1412 }
1413 if (unlikely(skb1))
1414 kfree_skb(skb1);
1415}
1416
1189/* 1417/*
1190 * Multicasts and broadcasts go to each listener. 1418 * Multicasts and broadcasts go to each listener.
1191 * 1419 *
1192 * Note: called only from the BH handler context, 1420 * Note: called only from the BH handler context.
1193 * so we don't need to lock the hashes.
1194 */ 1421 */
1195static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, 1422static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
1196 struct udphdr *uh, 1423 struct udphdr *uh,
1197 __be32 saddr, __be32 daddr, 1424 __be32 saddr, __be32 daddr,
1198 struct udp_table *udptable) 1425 struct udp_table *udptable)
1199{ 1426{
1200 struct sock *sk; 1427 struct sock *sk, *stack[256 / sizeof(struct sock *)];
1201 struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))]; 1428 struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
1202 int dif; 1429 int dif;
1430 unsigned int i, count = 0;
1203 1431
1204 spin_lock(&hslot->lock); 1432 spin_lock(&hslot->lock);
1205 sk = sk_nulls_head(&hslot->head); 1433 sk = sk_nulls_head(&hslot->head);
1206 dif = skb->dev->ifindex; 1434 dif = skb->dev->ifindex;
1207 sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); 1435 sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
1208 if (sk) { 1436 while (sk) {
1209 struct sock *sknext = NULL; 1437 stack[count++] = sk;
1210 1438 sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
1211 do { 1439 daddr, uh->source, saddr, dif);
1212 struct sk_buff *skb1 = skb; 1440 if (unlikely(count == ARRAY_SIZE(stack))) {
1213 1441 if (!sk)
1214 sknext = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest, 1442 break;
1215 daddr, uh->source, saddr, 1443 flush_stack(stack, count, skb, ~0);
1216 dif); 1444 count = 0;
1217 if (sknext) 1445 }
1218 skb1 = skb_clone(skb, GFP_ATOMIC); 1446 }
1219 1447 /*
1220 if (skb1) { 1448 * before releasing chain lock, we must take a reference on sockets
1221 int ret = udp_queue_rcv_skb(sk, skb1); 1449 */
1222 if (ret > 0) 1450 for (i = 0; i < count; i++)
1223 /* we should probably re-process instead 1451 sock_hold(stack[i]);
1224 * of dropping packets here. */ 1452
1225 kfree_skb(skb1);
1226 }
1227 sk = sknext;
1228 } while (sknext);
1229 } else
1230 consume_skb(skb);
1231 spin_unlock(&hslot->lock); 1453 spin_unlock(&hslot->lock);
1454
1455 /*
1456 * do the slow work with no lock held
1457 */
1458 if (count) {
1459 flush_stack(stack, count, skb, count - 1);
1460
1461 for (i = 0; i < count; i++)
1462 sock_put(stack[i]);
1463 } else {
1464 kfree_skb(skb);
1465 }
1232 return 0; 1466 return 0;
1233} 1467}
1234 1468
@@ -1620,9 +1854,14 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
1620 struct udp_iter_state *state = seq->private; 1854 struct udp_iter_state *state = seq->private;
1621 struct net *net = seq_file_net(seq); 1855 struct net *net = seq_file_net(seq);
1622 1856
1623 for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { 1857 for (state->bucket = start; state->bucket <= state->udp_table->mask;
1858 ++state->bucket) {
1624 struct hlist_nulls_node *node; 1859 struct hlist_nulls_node *node;
1625 struct udp_hslot *hslot = &state->udp_table->hash[state->bucket]; 1860 struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
1861
1862 if (hlist_nulls_empty(&hslot->head))
1863 continue;
1864
1626 spin_lock_bh(&hslot->lock); 1865 spin_lock_bh(&hslot->lock);
1627 sk_nulls_for_each(sk, node, &hslot->head) { 1866 sk_nulls_for_each(sk, node, &hslot->head) {
1628 if (!net_eq(sock_net(sk), net)) 1867 if (!net_eq(sock_net(sk), net))
@@ -1647,7 +1886,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
1647 } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); 1886 } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
1648 1887
1649 if (!sk) { 1888 if (!sk) {
1650 if (state->bucket < UDP_HTABLE_SIZE) 1889 if (state->bucket <= state->udp_table->mask)
1651 spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); 1890 spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
1652 return udp_get_first(seq, state->bucket + 1); 1891 return udp_get_first(seq, state->bucket + 1);
1653 } 1892 }
@@ -1667,7 +1906,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
1667static void *udp_seq_start(struct seq_file *seq, loff_t *pos) 1906static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
1668{ 1907{
1669 struct udp_iter_state *state = seq->private; 1908 struct udp_iter_state *state = seq->private;
1670 state->bucket = UDP_HTABLE_SIZE; 1909 state->bucket = MAX_UDP_PORTS;
1671 1910
1672 return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN; 1911 return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
1673} 1912}
@@ -1689,7 +1928,7 @@ static void udp_seq_stop(struct seq_file *seq, void *v)
1689{ 1928{
1690 struct udp_iter_state *state = seq->private; 1929 struct udp_iter_state *state = seq->private;
1691 1930
1692 if (state->bucket < UDP_HTABLE_SIZE) 1931 if (state->bucket <= state->udp_table->mask)
1693 spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); 1932 spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
1694} 1933}
1695 1934
@@ -1744,12 +1983,12 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
1744 int bucket, int *len) 1983 int bucket, int *len)
1745{ 1984{
1746 struct inet_sock *inet = inet_sk(sp); 1985 struct inet_sock *inet = inet_sk(sp);
1747 __be32 dest = inet->daddr; 1986 __be32 dest = inet->inet_daddr;
1748 __be32 src = inet->rcv_saddr; 1987 __be32 src = inet->inet_rcv_saddr;
1749 __u16 destp = ntohs(inet->dport); 1988 __u16 destp = ntohs(inet->inet_dport);
1750 __u16 srcp = ntohs(inet->sport); 1989 __u16 srcp = ntohs(inet->inet_sport);
1751 1990
1752 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 1991 seq_printf(f, "%5d: %08X:%04X %08X:%04X"
1753 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n", 1992 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n",
1754 bucket, src, srcp, dest, destp, sp->sk_state, 1993 bucket, src, srcp, dest, destp, sp->sk_state,
1755 sk_wmem_alloc_get(sp), 1994 sk_wmem_alloc_get(sp),
@@ -1815,21 +2054,60 @@ void udp4_proc_exit(void)
1815} 2054}
1816#endif /* CONFIG_PROC_FS */ 2055#endif /* CONFIG_PROC_FS */
1817 2056
1818void __init udp_table_init(struct udp_table *table) 2057static __initdata unsigned long uhash_entries;
2058static int __init set_uhash_entries(char *str)
1819{ 2059{
1820 int i; 2060 if (!str)
2061 return 0;
2062 uhash_entries = simple_strtoul(str, &str, 0);
2063 if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
2064 uhash_entries = UDP_HTABLE_SIZE_MIN;
2065 return 1;
2066}
2067__setup("uhash_entries=", set_uhash_entries);
1821 2068
1822 for (i = 0; i < UDP_HTABLE_SIZE; i++) { 2069void __init udp_table_init(struct udp_table *table, const char *name)
2070{
2071 unsigned int i;
2072
2073 if (!CONFIG_BASE_SMALL)
2074 table->hash = alloc_large_system_hash(name,
2075 2 * sizeof(struct udp_hslot),
2076 uhash_entries,
2077 21, /* one slot per 2 MB */
2078 0,
2079 &table->log,
2080 &table->mask,
2081 64 * 1024);
2082 /*
2083 * Make sure hash table has the minimum size
2084 */
2085 if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) {
2086 table->hash = kmalloc(UDP_HTABLE_SIZE_MIN *
2087 2 * sizeof(struct udp_hslot), GFP_KERNEL);
2088 if (!table->hash)
2089 panic(name);
2090 table->log = ilog2(UDP_HTABLE_SIZE_MIN);
2091 table->mask = UDP_HTABLE_SIZE_MIN - 1;
2092 }
2093 table->hash2 = table->hash + (table->mask + 1);
2094 for (i = 0; i <= table->mask; i++) {
1823 INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); 2095 INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
2096 table->hash[i].count = 0;
1824 spin_lock_init(&table->hash[i].lock); 2097 spin_lock_init(&table->hash[i].lock);
1825 } 2098 }
2099 for (i = 0; i <= table->mask; i++) {
2100 INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i);
2101 table->hash2[i].count = 0;
2102 spin_lock_init(&table->hash2[i].lock);
2103 }
1826} 2104}
1827 2105
1828void __init udp_init(void) 2106void __init udp_init(void)
1829{ 2107{
1830 unsigned long nr_pages, limit; 2108 unsigned long nr_pages, limit;
1831 2109
1832 udp_table_init(&udp_table); 2110 udp_table_init(&udp_table, "UDP");
1833 /* Set the pressure threshold up by the same strategy of TCP. It is a 2111 /* Set the pressure threshold up by the same strategy of TCP. It is a
1834 * fraction of global memory that is up to 1/2 at 256 MB, decreasing 2112 * fraction of global memory that is up to 1/2 at 256 MB, decreasing
1835 * toward zero with the amount of memory, with a floor of 128 pages. 2113 * toward zero with the amount of memory, with a floor of 128 pages.