author		Andrea Bastoni <bastoni@cs.unc.edu>	2010-05-30 19:16:45 -0400
committer	Andrea Bastoni <bastoni@cs.unc.edu>	2010-05-30 19:16:45 -0400
commit		ada47b5fe13d89735805b566185f4885f5a3f750 (patch)
tree		644b88f8a71896307d71438e9b3af49126ffb22b /net/ipv4/udp.c
parent		43e98717ad40a4ae64545b5ba047c7b86aa44f4f (diff)
parent		3280f21d43ee541f97f8cda5792150d2dbec20d5 (diff)
Merge branch 'wip-2.6.34' into old-private-master
Diffstat (limited to 'net/ipv4/udp.c')
-rw-r--r--	net/ipv4/udp.c	| 515
1 file changed, 397 insertions(+), 118 deletions(-)
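Taken together, the changes this merge pulls in replace the fixed-size UDP hash (indexed by local port only) with a dynamically sized two-level table: a primary hash keyed on the local port, and a secondary hash, computed by the new udp4_portaddr_hash(), keyed on (local address, local port). Primary chains longer than 10 sockets (typically many sockets bound to the same port) are escaped by switching to the shorter secondary chain. Below is a minimal userspace sketch of the slot arithmetic, not the kernel code: mix32() is an illustrative stand-in for the kernel's jhash_1word()/net_hash_mix() combination, and the table size is assumed to be UDP_HTABLE_SIZE_MIN (256 on !CONFIG_BASE_SMALL builds) for the example.

	#include <stdint.h>
	#include <stdio.h>

	#define UDP_HTABLE_SIZE_MIN 256	/* assumed minimum table size */

	/* stand-in for the kernel's jhash_1word(); any decent 32-bit mixer works */
	static uint32_t mix32(uint32_t a, uint32_t seed)
	{
		a ^= seed;
		a *= 0x9e3779b1u;	/* golden-ratio multiplier */
		return a ^ (a >> 16);
	}

	/* mirrors udp4_portaddr_hash(): hash the bound address, xor in the port */
	static uint32_t portaddr_hash(uint32_t saddr, uint32_t net_mix, uint16_t port)
	{
		return mix32(saddr, net_mix) ^ port;
	}

	int main(void)
	{
		uint32_t mask  = UDP_HTABLE_SIZE_MIN - 1;	/* table->mask */
		uint32_t saddr = 0xc0a80101;			/* 192.168.1.1 */
		uint16_t port  = 53;

		/* primary slot: port only (the kernel also mixes a per-netns value) */
		uint32_t slot  = port & mask;
		/* secondary slot: (address, port) pair */
		uint32_t slot2 = portaddr_hash(saddr, 0, port) & mask;

		printf("primary slot %u, secondary slot %u\n", slot, slot2);
		return 0;
	}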
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0fa9f70e4b19..c36522a0f113 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -95,6 +95,7 @@
 #include <linux/mm.h>
 #include <linux/inet.h>
 #include <linux/netdevice.h>
+#include <linux/slab.h>
 #include <net/tcp_states.h>
 #include <linux/skbuff.h>
 #include <linux/proc_fs.h>
@@ -106,7 +107,7 @@
 #include <net/xfrm.h>
 #include "udp_impl.h"
 
-struct udp_table udp_table;
+struct udp_table udp_table __read_mostly;
 EXPORT_SYMBOL(udp_table);
 
 int sysctl_udp_mem[3] __read_mostly;
@@ -121,28 +122,30 @@ EXPORT_SYMBOL(sysctl_udp_wmem_min);
 atomic_t udp_memory_allocated;
 EXPORT_SYMBOL(udp_memory_allocated);
 
-#define PORTS_PER_CHAIN (65536 / UDP_HTABLE_SIZE)
+#define MAX_UDP_PORTS 65536
+#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
 
 static int udp_lib_lport_inuse(struct net *net, __u16 num,
 			       const struct udp_hslot *hslot,
 			       unsigned long *bitmap,
 			       struct sock *sk,
 			       int (*saddr_comp)(const struct sock *sk1,
-						 const struct sock *sk2))
+						 const struct sock *sk2),
+			       unsigned int log)
 {
 	struct sock *sk2;
 	struct hlist_nulls_node *node;
 
 	sk_nulls_for_each(sk2, node, &hslot->head)
 		if (net_eq(sock_net(sk2), net) &&
 		    sk2 != sk &&
-		    (bitmap || sk2->sk_hash == num) &&
+		    (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
 		    (!sk2->sk_reuse || !sk->sk_reuse) &&
-		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if
-		     || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
+		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
 		    (*saddr_comp)(sk, sk2)) {
 			if (bitmap)
-				__set_bit(sk2->sk_hash / UDP_HTABLE_SIZE,
+				__set_bit(udp_sk(sk2)->udp_port_hash >> log,
 					  bitmap);
 			else
 				return 1;
@@ -150,18 +153,51 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
 	return 0;
 }
 
+/*
+ * Note: we still hold spinlock of primary hash chain, so no other writer
+ * can insert/delete a socket with local_port == num
+ */
+static int udp_lib_lport_inuse2(struct net *net, __u16 num,
+				struct udp_hslot *hslot2,
+				struct sock *sk,
+				int (*saddr_comp)(const struct sock *sk1,
+						  const struct sock *sk2))
+{
+	struct sock *sk2;
+	struct hlist_nulls_node *node;
+	int res = 0;
+
+	spin_lock(&hslot2->lock);
+	udp_portaddr_for_each_entry(sk2, node, &hslot2->head)
+		if (net_eq(sock_net(sk2), net) &&
+		    sk2 != sk &&
+		    (udp_sk(sk2)->udp_port_hash == num) &&
+		    (!sk2->sk_reuse || !sk->sk_reuse) &&
+		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
+		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+		    (*saddr_comp)(sk, sk2)) {
+			res = 1;
+			break;
+		}
+	spin_unlock(&hslot2->lock);
+	return res;
+}
+
 /**
  *	udp_lib_get_port  -  UDP/-Lite port lookup for IPv4 and IPv6
  *
  *	@sk:          socket struct in question
  *	@snum:        port number to look up
  *	@saddr_comp:  AF-dependent comparison of bound local IP addresses
+ *	@hash2_nulladdr: AF-dependant hash value in secondary hash chains,
+ *	                 with NULL address
  */
 int udp_lib_get_port(struct sock *sk, unsigned short snum,
 		       int (*saddr_comp)(const struct sock *sk1,
-					 const struct sock *sk2))
+					 const struct sock *sk2),
+		       unsigned int hash2_nulladdr)
 {
-	struct udp_hslot *hslot;
+	struct udp_hslot *hslot, *hslot2;
 	struct udp_table *udptable = sk->sk_prot->h.udp_table;
 	int    error = 1;
 	struct net *net = sock_net(sk);
@@ -180,13 +216,14 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 		/*
 		 * force rand to be an odd multiple of UDP_HTABLE_SIZE
 		 */
-		rand = (rand | 1) * UDP_HTABLE_SIZE;
-		for (last = first + UDP_HTABLE_SIZE; first != last; first++) {
-			hslot = &udptable->hash[udp_hashfn(net, first)];
+		rand = (rand | 1) * (udptable->mask + 1);
+		last = first + udptable->mask + 1;
+		do {
+			hslot = udp_hashslot(udptable, net, first);
 			bitmap_zero(bitmap, PORTS_PER_CHAIN);
 			spin_lock_bh(&hslot->lock);
 			udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
-					    saddr_comp);
+					    saddr_comp, udptable->log);
 
 			snum = first;
 			/*
@@ -196,25 +233,59 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 			 */
 			do {
 				if (low <= snum && snum <= high &&
-				    !test_bit(snum / UDP_HTABLE_SIZE, bitmap))
+				    !test_bit(snum >> udptable->log, bitmap))
 					goto found;
 				snum += rand;
 			} while (snum != first);
 			spin_unlock_bh(&hslot->lock);
-		}
+		} while (++first != last);
 		goto fail;
 	} else {
-		hslot = &udptable->hash[udp_hashfn(net, snum)];
+		hslot = udp_hashslot(udptable, net, snum);
 		spin_lock_bh(&hslot->lock);
-		if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, saddr_comp))
+		if (hslot->count > 10) {
+			int exist;
+			unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;
+
+			slot2          &= udptable->mask;
+			hash2_nulladdr &= udptable->mask;
+
+			hslot2 = udp_hashslot2(udptable, slot2);
+			if (hslot->count < hslot2->count)
+				goto scan_primary_hash;
+
+			exist = udp_lib_lport_inuse2(net, snum, hslot2,
+						     sk, saddr_comp);
+			if (!exist && (hash2_nulladdr != slot2)) {
+				hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
+				exist = udp_lib_lport_inuse2(net, snum, hslot2,
+							     sk, saddr_comp);
+			}
+			if (exist)
+				goto fail_unlock;
+			else
+				goto found;
+		}
+scan_primary_hash:
+		if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
+					saddr_comp, 0))
 			goto fail_unlock;
 	}
 found:
-	inet_sk(sk)->num = snum;
-	sk->sk_hash = snum;
+	inet_sk(sk)->inet_num = snum;
+	udp_sk(sk)->udp_port_hash = snum;
+	udp_sk(sk)->udp_portaddr_hash ^= snum;
 	if (sk_unhashed(sk)) {
 		sk_nulls_add_node_rcu(sk, &hslot->head);
+		hslot->count++;
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+
+		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+		spin_lock(&hslot2->lock);
+		hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+					 &hslot2->head);
+		hslot2->count++;
+		spin_unlock(&hslot2->lock);
 	}
 	error = 0;
 fail_unlock:
@@ -229,13 +300,26 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
 	struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
 
 	return	(!ipv6_only_sock(sk2) &&
-		 (!inet1->rcv_saddr || !inet2->rcv_saddr ||
-		  inet1->rcv_saddr == inet2->rcv_saddr));
+		 (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr ||
+		  inet1->inet_rcv_saddr == inet2->inet_rcv_saddr));
+}
+
+static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr,
+				       unsigned int port)
+{
+	return jhash_1word(saddr, net_hash_mix(net)) ^ port;
 }
 
 int udp_v4_get_port(struct sock *sk, unsigned short snum)
 {
-	return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal);
+	unsigned int hash2_nulladdr =
+		udp4_portaddr_hash(sock_net(sk), INADDR_ANY, snum);
+	unsigned int hash2_partial =
+		udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
+
+	/* precompute partial secondary hash */
+	udp_sk(sk)->udp_portaddr_hash = hash2_partial;
+	return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
 }
 
 static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
@@ -244,23 +328,23 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
 {
 	int score = -1;
 
-	if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
+	if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum &&
 			!ipv6_only_sock(sk)) {
 		struct inet_sock *inet = inet_sk(sk);
 
 		score = (sk->sk_family == PF_INET ? 1 : 0);
-		if (inet->rcv_saddr) {
-			if (inet->rcv_saddr != daddr)
+		if (inet->inet_rcv_saddr) {
+			if (inet->inet_rcv_saddr != daddr)
 				return -1;
 			score += 2;
 		}
-		if (inet->daddr) {
-			if (inet->daddr != saddr)
+		if (inet->inet_daddr) {
+			if (inet->inet_daddr != saddr)
 				return -1;
 			score += 2;
 		}
-		if (inet->dport) {
-			if (inet->dport != sport)
+		if (inet->inet_dport) {
+			if (inet->inet_dport != sport)
 				return -1;
 			score += 2;
 		}
@@ -273,6 +357,89 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
 	return score;
 }
 
+/*
+ * In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num)
+ */
+#define SCORE2_MAX (1 + 2 + 2 + 2)
+static inline int compute_score2(struct sock *sk, struct net *net,
+				 __be32 saddr, __be16 sport,
+				 __be32 daddr, unsigned int hnum, int dif)
+{
+	int score = -1;
+
+	if (net_eq(sock_net(sk), net) && !ipv6_only_sock(sk)) {
+		struct inet_sock *inet = inet_sk(sk);
+
+		if (inet->inet_rcv_saddr != daddr)
+			return -1;
+		if (inet->inet_num != hnum)
+			return -1;
+
+		score = (sk->sk_family == PF_INET ? 1 : 0);
+		if (inet->inet_daddr) {
+			if (inet->inet_daddr != saddr)
+				return -1;
+			score += 2;
+		}
+		if (inet->inet_dport) {
+			if (inet->inet_dport != sport)
+				return -1;
+			score += 2;
+		}
+		if (sk->sk_bound_dev_if) {
+			if (sk->sk_bound_dev_if != dif)
+				return -1;
+			score += 2;
+		}
+	}
+	return score;
+}
+
+
+/* called with read_rcu_lock() */
+static struct sock *udp4_lib_lookup2(struct net *net,
+		__be32 saddr, __be16 sport,
+		__be32 daddr, unsigned int hnum, int dif,
+		struct udp_hslot *hslot2, unsigned int slot2)
+{
+	struct sock *sk, *result;
+	struct hlist_nulls_node *node;
+	int score, badness;
+
+begin:
+	result = NULL;
+	badness = -1;
+	udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
+		score = compute_score2(sk, net, saddr, sport,
+				      daddr, hnum, dif);
+		if (score > badness) {
+			result = sk;
+			badness = score;
+			if (score == SCORE2_MAX)
+				goto exact_match;
+		}
+	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != slot2)
+		goto begin;
+
+	if (result) {
+exact_match:
+		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+			result = NULL;
+		else if (unlikely(compute_score2(result, net, saddr, sport,
+				  daddr, hnum, dif) < badness)) {
+			sock_put(result);
+			goto begin;
+		}
+	}
+	return result;
+}
+
 /* UDP is nearly always wildcards out the wazoo, it makes no sense to try
  * harder than this. -DaveM
  */
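The new udp4_lib_lookup2() above walks its chain under RCU without taking the slot lock. The hlist_nulls sentinel terminating each chain encodes the slot number, so a walker that drifted onto another chain (because a socket it was standing on was rehashed concurrently) sees a mismatched nulls value at the end and restarts. A toy single-threaded model of that retry rule follows; the pointer tagging is illustrative only, not the kernel's encoding:

	#include <stdint.h>
	#include <stdio.h>

	#define IS_NULLS(p)      ((uintptr_t)(p) & 1u)
	#define NULLS_VALUE(p)   ((uintptr_t)(p) >> 1)
	#define MAKE_NULLS(slot) ((void *)(((uintptr_t)(slot) << 1) | 1u))

	struct node { struct node *next; int key; };

	/* returns the node with 'key', or NULL; restarts if the walk ended
	 * on a sentinel belonging to a different slot */
	static struct node *lookup(struct node *head, unsigned slot, int key)
	{
		struct node *n;
	begin:
		for (n = head; !IS_NULLS(n); n = n->next)
			if (n->key == key)
				return n;
		if (NULLS_VALUE(n) != slot)	/* crossed into another chain: retry */
			goto begin;
		return NULL;
	}

	int main(void)
	{
		struct node b = { MAKE_NULLS(7), 2 };	/* chain ends in nulls(7) */
		struct node a = { &b, 1 };

		printf("found key 2: %s\n", lookup(&a, 7, 2) ? "yes" : "no");
		return 0;
	}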
@@ -283,11 +450,35 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 	struct sock *sk, *result;
 	struct hlist_nulls_node *node;
 	unsigned short hnum = ntohs(dport);
-	unsigned int hash = udp_hashfn(net, hnum);
-	struct udp_hslot *hslot = &udptable->hash[hash];
+	unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
+	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
 	int score, badness;
 
 	rcu_read_lock();
+	if (hslot->count > 10) {
+		hash2 = udp4_portaddr_hash(net, daddr, hnum);
+		slot2 = hash2 & udptable->mask;
+		hslot2 = &udptable->hash2[slot2];
+		if (hslot->count < hslot2->count)
+			goto begin;
+
+		result = udp4_lib_lookup2(net, saddr, sport,
+					  daddr, hnum, dif,
+					  hslot2, slot2);
+		if (!result) {
+			hash2 = udp4_portaddr_hash(net, INADDR_ANY, hnum);
+			slot2 = hash2 & udptable->mask;
+			hslot2 = &udptable->hash2[slot2];
+			if (hslot->count < hslot2->count)
+				goto begin;
+
+			result = udp4_lib_lookup2(net, saddr, sport,
+						  INADDR_ANY, hnum, dif,
+						  hslot2, slot2);
+		}
+		rcu_read_unlock();
+		return result;
+	}
 begin:
 	result = NULL;
 	badness = -1;
@@ -304,7 +495,7 @@ begin:
 	 * not the expected one, we must restart lookup.
 	 * We probably met an item that was moved to another chain.
 	 */
-	if (get_nulls_value(node) != hash)
+	if (get_nulls_value(node) != slot)
 		goto begin;
 
 	if (result) {
@@ -354,12 +545,13 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
 	sk_nulls_for_each_from(s, node) {
 		struct inet_sock *inet = inet_sk(s);
 
 		if (!net_eq(sock_net(s), net) ||
-		    s->sk_hash != hnum ||
-		    (inet->daddr && inet->daddr != rmt_addr) ||
-		    (inet->dport != rmt_port && inet->dport) ||
-		    (inet->rcv_saddr && inet->rcv_saddr != loc_addr) ||
-		    ipv6_only_sock(s) ||
+		    udp_sk(s)->udp_port_hash != hnum ||
+		    (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
+		    (inet->inet_dport != rmt_port && inet->inet_dport) ||
+		    (inet->inet_rcv_saddr &&
+		     inet->inet_rcv_saddr != loc_addr) ||
+		    ipv6_only_sock(s) ||
 		    (s->sk_bound_dev_if && s->sk_bound_dev_if != dif))
 			continue;
 		if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif))
@@ -642,14 +834,14 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	} else {
 		if (sk->sk_state != TCP_ESTABLISHED)
 			return -EDESTADDRREQ;
-		daddr = inet->daddr;
-		dport = inet->dport;
+		daddr = inet->inet_daddr;
+		dport = inet->inet_dport;
 		/* Open fast path for connected socket.
 		   Route will not be used, if at least one option is set.
 		 */
 		connected = 1;
 	}
-	ipc.addr = inet->saddr;
+	ipc.addr = inet->inet_saddr;
 
 	ipc.oif = sk->sk_bound_dev_if;
 	err = sock_tx_timestamp(msg, sk, &ipc.shtx);
@@ -704,7 +896,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 					    .proto = sk->sk_protocol,
 					    .flags = inet_sk_flowi_flags(sk),
 					    .uli_u = { .ports =
-						       { .sport = inet->sport,
+						       { .sport = inet->inet_sport,
 							 .dport = dport } } };
 		struct net *net = sock_net(sk);
 
@@ -748,7 +940,7 @@ back_from_confirm:
 		inet->cork.fl.fl4_dst = daddr;
 		inet->cork.fl.fl_ip_dport = dport;
 		inet->cork.fl.fl4_src = saddr;
-		inet->cork.fl.fl_ip_sport = inet->sport;
+		inet->cork.fl.fl_ip_sport = inet->inet_sport;
 		up->pending = AF_INET;
 
 do_append_data:
@@ -862,6 +1054,7 @@ static unsigned int first_packet_length(struct sock *sk)
 		    udp_lib_checksum_complete(skb)) {
 			UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
 					 IS_UDPLITE(sk));
+			atomic_inc(&sk->sk_drops);
 			__skb_unlink(skb, rcvq);
 			__skb_queue_tail(&list_kill, skb);
 		}
@@ -925,7 +1118,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	struct inet_sock *inet = inet_sk(sk);
 	struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
 	struct sk_buff *skb;
-	unsigned int ulen, copied;
+	unsigned int ulen;
 	int peeked;
 	int err;
 	int is_udplite = IS_UDPLITE(sk);
@@ -946,10 +1139,9 @@ try_again:
 		goto out;
 
 	ulen = skb->len - sizeof(struct udphdr);
-	copied = len;
-	if (copied > ulen)
-		copied = ulen;
-	else if (copied < ulen)
+	if (len > ulen)
+		len = ulen;
+	else if (len < ulen)
 		msg->msg_flags |= MSG_TRUNC;
 
 	/*
@@ -958,14 +1150,14 @@ try_again:
 	 * coverage checksum (UDP-Lite), do it before the copy.
 	 */
 
-	if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
+	if (len < ulen || UDP_SKB_CB(skb)->partial_cov) {
 		if (udp_lib_checksum_complete(skb))
 			goto csum_copy_err;
 	}
 
 	if (skb_csum_unnecessary(skb))
 		err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
-					      msg->msg_iov, copied);
+					      msg->msg_iov, len);
 	else {
 		err = skb_copy_and_csum_datagram_iovec(skb,
 						       sizeof(struct udphdr),
@@ -982,7 +1174,7 @@ try_again:
 		UDP_INC_STATS_USER(sock_net(sk),
 				UDP_MIB_INDATAGRAMS, is_udplite);
 
-	sock_recv_timestamp(msg, sk, skb);
+	sock_recv_ts_and_drops(msg, sk, skb);
 
 	/* Copy the address. */
 	if (sin) {
@@ -994,7 +1186,7 @@ try_again:
 	if (inet->cmsg_flags)
 		ip_cmsg_recv(msg, skb);
 
-	err = copied;
+	err = len;
 	if (flags & MSG_TRUNC)
 		err = ulen;
 
@@ -1023,15 +1215,15 @@ int udp_disconnect(struct sock *sk, int flags)
 	 */
 
 	sk->sk_state = TCP_CLOSE;
-	inet->daddr = 0;
-	inet->dport = 0;
+	inet->inet_daddr = 0;
+	inet->inet_dport = 0;
 	sk->sk_bound_dev_if = 0;
 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
 		inet_reset_saddr(sk);
 
 	if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
 		sk->sk_prot->unhash(sk);
-		inet->sport = 0;
+		inet->inet_sport = 0;
 	}
 	sk_dst_reset(sk);
 	return 0;
@@ -1042,13 +1234,22 @@ void udp_lib_unhash(struct sock *sk)
 {
 	if (sk_hashed(sk)) {
 		struct udp_table *udptable = sk->sk_prot->h.udp_table;
-		unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash);
-		struct udp_hslot *hslot = &udptable->hash[hash];
+		struct udp_hslot *hslot, *hslot2;
+
+		hslot  = udp_hashslot(udptable, sock_net(sk),
+				      udp_sk(sk)->udp_port_hash);
+		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
 
 		spin_lock_bh(&hslot->lock);
 		if (sk_nulls_del_node_init_rcu(sk)) {
-			inet_sk(sk)->num = 0;
+			hslot->count--;
+			inet_sk(sk)->inet_num = 0;
 			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+
+			spin_lock(&hslot2->lock);
+			hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+			hslot2->count--;
+			spin_unlock(&hslot2->lock);
 		}
 		spin_unlock_bh(&hslot->lock);
 	}
@@ -1057,25 +1258,22 @@ EXPORT_SYMBOL(udp_lib_unhash);
 
 static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
-	int is_udplite = IS_UDPLITE(sk);
-	int rc;
+	int rc = sock_queue_rcv_skb(sk, skb);
+
+	if (rc < 0) {
+		int is_udplite = IS_UDPLITE(sk);
 
-	if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) {
 		/* Note that an ENOMEM error is charged twice */
-		if (rc == -ENOMEM) {
+		if (rc == -ENOMEM)
 			UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
 					 is_udplite);
-			atomic_inc(&sk->sk_drops);
-		}
-		goto drop;
+		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+		kfree_skb(skb);
+		return -1;
 	}
 
 	return 0;
 
-drop:
-	UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
-	kfree_skb(skb);
-	return -1;
 }
 
 /* returns:
@@ -1174,61 +1372,98 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	bh_lock_sock(sk);
 	if (!sock_owned_by_user(sk))
 		rc = __udp_queue_rcv_skb(sk, skb);
-	else
-		sk_add_backlog(sk, skb);
+	else if (sk_add_backlog(sk, skb)) {
+		bh_unlock_sock(sk);
+		goto drop;
+	}
 	bh_unlock_sock(sk);
 
 	return rc;
 
drop:
 	UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+	atomic_inc(&sk->sk_drops);
 	kfree_skb(skb);
 	return -1;
 }
 
+
+static void flush_stack(struct sock **stack, unsigned int count,
+			struct sk_buff *skb, unsigned int final)
+{
+	unsigned int i;
+	struct sk_buff *skb1 = NULL;
+	struct sock *sk;
+
+	for (i = 0; i < count; i++) {
+		sk = stack[i];
+		if (likely(skb1 == NULL))
+			skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
+
+		if (!skb1) {
+			atomic_inc(&sk->sk_drops);
+			UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+					 IS_UDPLITE(sk));
+			UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
+					 IS_UDPLITE(sk));
+		}
+
+		if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
+			skb1 = NULL;
+	}
+	if (unlikely(skb1))
+		kfree_skb(skb1);
+}
+
 /*
  *	Multicasts and broadcasts go to each listener.
  *
- *	Note: called only from the BH handler context,
- *	so we don't need to lock the hashes.
+ *	Note: called only from the BH handler context.
  */
 static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 				    struct udphdr  *uh,
 				    __be32 saddr, __be32 daddr,
 				    struct udp_table *udptable)
 {
-	struct sock *sk;
-	struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))];
+	struct sock *sk, *stack[256 / sizeof(struct sock *)];
+	struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
 	int dif;
+	unsigned int i, count = 0;
 
 	spin_lock(&hslot->lock);
 	sk = sk_nulls_head(&hslot->head);
 	dif = skb->dev->ifindex;
 	sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
-	if (sk) {
-		struct sock *sknext = NULL;
+	while (sk) {
+		stack[count++] = sk;
+		sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
+				       daddr, uh->source, saddr, dif);
+		if (unlikely(count == ARRAY_SIZE(stack))) {
+			if (!sk)
+				break;
+			flush_stack(stack, count, skb, ~0);
+			count = 0;
+		}
+	}
+	/*
+	 * before releasing chain lock, we must take a reference on sockets
+	 */
+	for (i = 0; i < count; i++)
+		sock_hold(stack[i]);
 
-		do {
-			struct sk_buff *skb1 = skb;
-
-			sknext = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
-						   daddr, uh->source, saddr,
-						   dif);
-			if (sknext)
-				skb1 = skb_clone(skb, GFP_ATOMIC);
-
-			if (skb1) {
-				int ret = udp_queue_rcv_skb(sk, skb1);
-				if (ret > 0)
-					/* we should probably re-process instead
-					 * of dropping packets here. */
-					kfree_skb(skb1);
-			}
-			sk = sknext;
-		} while (sknext);
-	} else
-		consume_skb(skb);
 	spin_unlock(&hslot->lock);
+
+	/*
+	 * do the slow work with no lock held
+	 */
+	if (count) {
+		flush_stack(stack, count, skb, count - 1);
+
+		for (i = 0; i < count; i++)
+			sock_put(stack[i]);
+	} else {
+		kfree_skb(skb);
+	}
 	return 0;
 }
 
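The multicast path above now collects matching sockets into an on-stack array under the slot lock, takes a reference on each, drops the lock, and lets flush_stack() do the cloning and queueing with no lock held; the receiver at index count - 1 gets the original skb, so N receivers cost only N - 1 clones. Below is a simplified stand-alone sketch of that pattern: deliver() and clone_buf() stand in for udp_queue_rcv_skb() and skb_clone(), and the real flush_stack()'s reuse of a rejected clone for the next receiver is omitted.

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	static char *clone_buf(const char *b)
	{
		char *c = malloc(strlen(b) + 1);
		return c ? strcpy(c, b) : NULL;
	}

	static void deliver(int rcvr, char *b)
	{
		printf("receiver %d got '%s'\n", rcvr, b);
		free(b);	/* each receiver consumes its buffer */
	}

	static void flush(int count, char *buf, int final)
	{
		for (int i = 0; i < count; i++) {
			char *copy = (i == final) ? buf : clone_buf(buf);
			if (copy)
				deliver(i, copy);	/* clone failure = drop, as in flush_stack() */
		}
	}

	int main(void)
	{
		char *buf = clone_buf("datagram");
		flush(3, buf, 3 - 1);	/* 2 clones plus the original */
		return 0;
	}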
@@ -1292,6 +1527,9 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 
 	uh   = udp_hdr(skb);
 	ulen = ntohs(uh->len);
+	saddr = ip_hdr(skb)->saddr;
+	daddr = ip_hdr(skb)->daddr;
+
 	if (ulen > skb->len)
 		goto short_packet;
 
@@ -1305,9 +1543,6 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 	if (udp4_csum_init(skb, uh, proto))
 		goto csum_error;
 
-	saddr = ip_hdr(skb)->saddr;
-	daddr = ip_hdr(skb)->daddr;
-
 	if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
 		return __udp4_lib_mcast_deliver(net, skb, uh,
 						saddr, daddr, udptable);
@@ -1620,9 +1855,14 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
 	struct udp_iter_state *state = seq->private;
 	struct net *net = seq_file_net(seq);
 
-	for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
+	for (state->bucket = start; state->bucket <= state->udp_table->mask;
+	     ++state->bucket) {
 		struct hlist_nulls_node *node;
 		struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
+
+		if (hlist_nulls_empty(&hslot->head))
+			continue;
+
 		spin_lock_bh(&hslot->lock);
 		sk_nulls_for_each(sk, node, &hslot->head) {
 			if (!net_eq(sock_net(sk), net))
@@ -1647,7 +1887,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
 	} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
 
 	if (!sk) {
-		if (state->bucket < UDP_HTABLE_SIZE)
+		if (state->bucket <= state->udp_table->mask)
 			spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
 		return udp_get_first(seq, state->bucket + 1);
 	}
@@ -1667,7 +1907,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
 static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct udp_iter_state *state = seq->private;
-	state->bucket = UDP_HTABLE_SIZE;
+	state->bucket = MAX_UDP_PORTS;
 
 	return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
 }
@@ -1689,7 +1929,7 @@ static void udp_seq_stop(struct seq_file *seq, void *v)
 {
 	struct udp_iter_state *state = seq->private;
 
-	if (state->bucket < UDP_HTABLE_SIZE)
+	if (state->bucket <= state->udp_table->mask)
 		spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
 }
 
@@ -1744,12 +1984,12 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
 		int bucket, int *len)
 {
 	struct inet_sock *inet = inet_sk(sp);
-	__be32 dest = inet->daddr;
-	__be32 src  = inet->rcv_saddr;
-	__u16 destp = ntohs(inet->dport);
-	__u16 srcp  = ntohs(inet->sport);
+	__be32 dest = inet->inet_daddr;
+	__be32 src  = inet->inet_rcv_saddr;
+	__u16 destp = ntohs(inet->inet_dport);
+	__u16 srcp  = ntohs(inet->inet_sport);
 
-	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
+	seq_printf(f, "%5d: %08X:%04X %08X:%04X"
 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n",
 		bucket, src, srcp, dest, destp, sp->sk_state,
 		sk_wmem_alloc_get(sp),
@@ -1789,12 +2029,12 @@ static struct udp_seq_afinfo udp4_seq_afinfo = {
 	},
 };
 
-static int udp4_proc_init_net(struct net *net)
+static int __net_init udp4_proc_init_net(struct net *net)
{
 	return udp_proc_register(net, &udp4_seq_afinfo);
 }
 
-static void udp4_proc_exit_net(struct net *net)
+static void __net_exit udp4_proc_exit_net(struct net *net)
 {
 	udp_proc_unregister(net, &udp4_seq_afinfo);
 }
@@ -1815,21 +2055,60 @@ void udp4_proc_exit(void)
 }
 #endif /* CONFIG_PROC_FS */
 
-void __init udp_table_init(struct udp_table *table)
+static __initdata unsigned long uhash_entries;
+static int __init set_uhash_entries(char *str)
 {
-	int i;
+	if (!str)
+		return 0;
+	uhash_entries = simple_strtoul(str, &str, 0);
+	if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
+		uhash_entries = UDP_HTABLE_SIZE_MIN;
+	return 1;
+}
+__setup("uhash_entries=", set_uhash_entries);
 
-	for (i = 0; i < UDP_HTABLE_SIZE; i++) {
+void __init udp_table_init(struct udp_table *table, const char *name)
+{
+	unsigned int i;
+
+	if (!CONFIG_BASE_SMALL)
+		table->hash = alloc_large_system_hash(name,
+			2 * sizeof(struct udp_hslot),
+			uhash_entries,
+			21, /* one slot per 2 MB */
+			0,
+			&table->log,
+			&table->mask,
+			64 * 1024);
+	/*
+	 * Make sure hash table has the minimum size
+	 */
+	if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) {
+		table->hash = kmalloc(UDP_HTABLE_SIZE_MIN *
+				      2 * sizeof(struct udp_hslot), GFP_KERNEL);
+		if (!table->hash)
+			panic(name);
+		table->log = ilog2(UDP_HTABLE_SIZE_MIN);
+		table->mask = UDP_HTABLE_SIZE_MIN - 1;
+	}
+	table->hash2 = table->hash + (table->mask + 1);
+	for (i = 0; i <= table->mask; i++) {
 		INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
+		table->hash[i].count = 0;
 		spin_lock_init(&table->hash[i].lock);
 	}
+	for (i = 0; i <= table->mask; i++) {
+		INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i);
+		table->hash2[i].count = 0;
+		spin_lock_init(&table->hash2[i].lock);
+	}
 }
 
 void __init udp_init(void)
 {
 	unsigned long nr_pages, limit;
 
-	udp_table_init(&udp_table);
+	udp_table_init(&udp_table, "UDP");
 	/* Set the pressure threshold up by the same strategy of TCP. It is a
 	 * fraction of global memory that is up to 1/2 at 256 MB, decreasing
 	 * toward zero with the amount of memory, with a floor of 128 pages.