aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEric Dumazet <dada1@cosmosbay.com>2008-10-29 04:41:45 -0400
committerDavid S. Miller <davem@davemloft.net>2008-10-29 04:41:45 -0400
commit645ca708f936b2fbeb79e52d7823e3eb2c0905f8 (patch)
treeb384696994ee3cb04759a7bfffc29a48e4bf40f6
parentb189db5d299c6824780af5590564ff608adb3dea (diff)
udp: introduce struct udp_table and multiple spinlocks
UDP sockets are hashed in a 128 slots hash table. This hash table is protected by *one* rwlock. This rwlock is readlocked each time an incoming UDP message is handled. This rwlock is writelocked each time a socket must be inserted in hash table (bind time), or deleted from this table (close time) This is not scalable on SMP machines : 1) Even in read mode, lock() and unlock() are atomic operations and must dirty a contended cache line, shared by all cpus. 2) A writer might be starved if many readers are 'in flight'. This can happen on a machine with some NIC receiving many UDP messages. User process can be delayed a long time at socket creation/dismantle time. This patch prepares RCU migration, by introducing 'struct udp_table and struct udp_hslot', and using one spinlock per chain, to reduce contention on central rwlock. Introducing one spinlock per chain reduces latencies, for port randomization on heavily loaded UDP servers. This also speedup bindings to specific ports. udp_lib_unhash() was uninlined, becoming to big. Some cleanups were done to ease review of following patch (RCUification of UDP Unicast lookups) Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/net/sock.h2
-rw-r--r--include/net/udp.h25
-rw-r--r--include/net/udplite.h2
-rw-r--r--net/ipv4/udp.c207
-rw-r--r--net/ipv4/udp_impl.h4
-rw-r--r--net/ipv4/udplite.c13
-rw-r--r--net/ipv6/udp.c112
-rw-r--r--net/ipv6/udp_impl.h4
-rw-r--r--net/ipv6/udplite.c8
9 files changed, 214 insertions, 163 deletions
diff --git a/include/net/sock.h b/include/net/sock.h
index d6b750a25078..d200dfbe1ef6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -599,7 +599,7 @@ struct proto {
599 599
600 union { 600 union {
601 struct inet_hashinfo *hashinfo; 601 struct inet_hashinfo *hashinfo;
602 struct hlist_head *udp_hash; 602 struct udp_table *udp_table;
603 struct raw_hashinfo *raw_hash; 603 struct raw_hashinfo *raw_hash;
604 } h; 604 } h;
605 605
diff --git a/include/net/udp.h b/include/net/udp.h
index 1e205095ea68..df2bfe545374 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -50,8 +50,15 @@ struct udp_skb_cb {
50}; 50};
51#define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb)) 51#define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb))
52 52
53extern struct hlist_head udp_hash[UDP_HTABLE_SIZE]; 53struct udp_hslot {
54extern rwlock_t udp_hash_lock; 54 struct hlist_head head;
55 spinlock_t lock;
56} __attribute__((aligned(2 * sizeof(long))));
57struct udp_table {
58 struct udp_hslot hash[UDP_HTABLE_SIZE];
59};
60extern struct udp_table udp_table;
61extern void udp_table_init(struct udp_table *);
55 62
56 63
57/* Note: this must match 'valbool' in sock_setsockopt */ 64/* Note: this must match 'valbool' in sock_setsockopt */
@@ -110,15 +117,7 @@ static inline void udp_lib_hash(struct sock *sk)
110 BUG(); 117 BUG();
111} 118}
112 119
113static inline void udp_lib_unhash(struct sock *sk) 120extern void udp_lib_unhash(struct sock *sk);
114{
115 write_lock_bh(&udp_hash_lock);
116 if (sk_del_node_init(sk)) {
117 inet_sk(sk)->num = 0;
118 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
119 }
120 write_unlock_bh(&udp_hash_lock);
121}
122 121
123static inline void udp_lib_close(struct sock *sk, long timeout) 122static inline void udp_lib_close(struct sock *sk, long timeout)
124{ 123{
@@ -187,7 +186,7 @@ extern struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
187struct udp_seq_afinfo { 186struct udp_seq_afinfo {
188 char *name; 187 char *name;
189 sa_family_t family; 188 sa_family_t family;
190 struct hlist_head *hashtable; 189 struct udp_table *udp_table;
191 struct file_operations seq_fops; 190 struct file_operations seq_fops;
192 struct seq_operations seq_ops; 191 struct seq_operations seq_ops;
193}; 192};
@@ -196,7 +195,7 @@ struct udp_iter_state {
196 struct seq_net_private p; 195 struct seq_net_private p;
197 sa_family_t family; 196 sa_family_t family;
198 int bucket; 197 int bucket;
199 struct hlist_head *hashtable; 198 struct udp_table *udp_table;
200}; 199};
201 200
202#ifdef CONFIG_PROC_FS 201#ifdef CONFIG_PROC_FS
diff --git a/include/net/udplite.h b/include/net/udplite.h
index b76b2e377af4..afdffe607b24 100644
--- a/include/net/udplite.h
+++ b/include/net/udplite.h
@@ -11,7 +11,7 @@
11#define UDPLITE_RECV_CSCOV 11 /* receiver partial coverage (threshold ) */ 11#define UDPLITE_RECV_CSCOV 11 /* receiver partial coverage (threshold ) */
12 12
13extern struct proto udplite_prot; 13extern struct proto udplite_prot;
14extern struct hlist_head udplite_hash[UDP_HTABLE_SIZE]; 14extern struct udp_table udplite_table;
15 15
16/* 16/*
17 * Checksum computation is all in software, hence simpler getfrag. 17 * Checksum computation is all in software, hence simpler getfrag.
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2095abc3caba..2a6c491f97d7 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -104,12 +104,8 @@
104#include <net/xfrm.h> 104#include <net/xfrm.h>
105#include "udp_impl.h" 105#include "udp_impl.h"
106 106
107/* 107struct udp_table udp_table;
108 * Snmp MIB for the UDP layer 108EXPORT_SYMBOL(udp_table);
109 */
110
111struct hlist_head udp_hash[UDP_HTABLE_SIZE];
112DEFINE_RWLOCK(udp_hash_lock);
113 109
114int sysctl_udp_mem[3] __read_mostly; 110int sysctl_udp_mem[3] __read_mostly;
115int sysctl_udp_rmem_min __read_mostly; 111int sysctl_udp_rmem_min __read_mostly;
@@ -123,7 +119,7 @@ atomic_t udp_memory_allocated;
123EXPORT_SYMBOL(udp_memory_allocated); 119EXPORT_SYMBOL(udp_memory_allocated);
124 120
125static int udp_lib_lport_inuse(struct net *net, __u16 num, 121static int udp_lib_lport_inuse(struct net *net, __u16 num,
126 const struct hlist_head udptable[], 122 const struct udp_hslot *hslot,
127 struct sock *sk, 123 struct sock *sk,
128 int (*saddr_comp)(const struct sock *sk1, 124 int (*saddr_comp)(const struct sock *sk1,
129 const struct sock *sk2)) 125 const struct sock *sk2))
@@ -131,7 +127,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
131 struct sock *sk2; 127 struct sock *sk2;
132 struct hlist_node *node; 128 struct hlist_node *node;
133 129
134 sk_for_each(sk2, node, &udptable[udp_hashfn(net, num)]) 130 sk_for_each(sk2, node, &hslot->head)
135 if (net_eq(sock_net(sk2), net) && 131 if (net_eq(sock_net(sk2), net) &&
136 sk2 != sk && 132 sk2 != sk &&
137 sk2->sk_hash == num && 133 sk2->sk_hash == num &&
@@ -154,12 +150,11 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
154 int (*saddr_comp)(const struct sock *sk1, 150 int (*saddr_comp)(const struct sock *sk1,
155 const struct sock *sk2 ) ) 151 const struct sock *sk2 ) )
156{ 152{
157 struct hlist_head *udptable = sk->sk_prot->h.udp_hash; 153 struct udp_hslot *hslot;
154 struct udp_table *udptable = sk->sk_prot->h.udp_table;
158 int error = 1; 155 int error = 1;
159 struct net *net = sock_net(sk); 156 struct net *net = sock_net(sk);
160 157
161 write_lock_bh(&udp_hash_lock);
162
163 if (!snum) { 158 if (!snum) {
164 int low, high, remaining; 159 int low, high, remaining;
165 unsigned rand; 160 unsigned rand;
@@ -171,26 +166,34 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
171 rand = net_random(); 166 rand = net_random();
172 snum = first = rand % remaining + low; 167 snum = first = rand % remaining + low;
173 rand |= 1; 168 rand |= 1;
174 while (udp_lib_lport_inuse(net, snum, udptable, sk, 169 for (;;) {
175 saddr_comp)) { 170 hslot = &udptable->hash[udp_hashfn(net, snum)];
171 spin_lock_bh(&hslot->lock);
172 if (!udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp))
173 break;
174 spin_unlock_bh(&hslot->lock);
176 do { 175 do {
177 snum = snum + rand; 176 snum = snum + rand;
178 } while (snum < low || snum > high); 177 } while (snum < low || snum > high);
179 if (snum == first) 178 if (snum == first)
180 goto fail; 179 goto fail;
181 } 180 }
182 } else if (udp_lib_lport_inuse(net, snum, udptable, sk, saddr_comp)) 181 } else {
183 goto fail; 182 hslot = &udptable->hash[udp_hashfn(net, snum)];
184 183 spin_lock_bh(&hslot->lock);
184 if (udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp))
185 goto fail_unlock;
186 }
185 inet_sk(sk)->num = snum; 187 inet_sk(sk)->num = snum;
186 sk->sk_hash = snum; 188 sk->sk_hash = snum;
187 if (sk_unhashed(sk)) { 189 if (sk_unhashed(sk)) {
188 sk_add_node(sk, &udptable[udp_hashfn(net, snum)]); 190 sk_add_node(sk, &hslot->head);
189 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 191 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
190 } 192 }
191 error = 0; 193 error = 0;
194fail_unlock:
195 spin_unlock_bh(&hslot->lock);
192fail: 196fail:
193 write_unlock_bh(&udp_hash_lock);
194 return error; 197 return error;
195} 198}
196 199
@@ -208,63 +211,73 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
208 return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal); 211 return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal);
209} 212}
210 213
214static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
215 unsigned short hnum,
216 __be16 sport, __be32 daddr, __be16 dport, int dif)
217{
218 int score = -1;
219
220 if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
221 !ipv6_only_sock(sk)) {
222 struct inet_sock *inet = inet_sk(sk);
223
224 score = (sk->sk_family == PF_INET ? 1 : 0);
225 if (inet->rcv_saddr) {
226 if (inet->rcv_saddr != daddr)
227 return -1;
228 score += 2;
229 }
230 if (inet->daddr) {
231 if (inet->daddr != saddr)
232 return -1;
233 score += 2;
234 }
235 if (inet->dport) {
236 if (inet->dport != sport)
237 return -1;
238 score += 2;
239 }
240 if (sk->sk_bound_dev_if) {
241 if (sk->sk_bound_dev_if != dif)
242 return -1;
243 score += 2;
244 }
245 }
246 return score;
247}
248
211/* UDP is nearly always wildcards out the wazoo, it makes no sense to try 249/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
212 * harder than this. -DaveM 250 * harder than this. -DaveM
213 */ 251 */
214static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, 252static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
215 __be16 sport, __be32 daddr, __be16 dport, 253 __be16 sport, __be32 daddr, __be16 dport,
216 int dif, struct hlist_head udptable[]) 254 int dif, struct udp_table *udptable)
217{ 255{
218 struct sock *sk, *result = NULL; 256 struct sock *sk, *result = NULL;
219 struct hlist_node *node; 257 struct hlist_node *node;
220 unsigned short hnum = ntohs(dport); 258 unsigned short hnum = ntohs(dport);
221 int badness = -1; 259 unsigned int hash = udp_hashfn(net, hnum);
222 260 struct udp_hslot *hslot = &udptable->hash[hash];
223 read_lock(&udp_hash_lock); 261 int score, badness = -1;
224 sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) { 262
225 struct inet_sock *inet = inet_sk(sk); 263 spin_lock(&hslot->lock);
226 264 sk_for_each(sk, node, &hslot->head) {
227 if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && 265 score = compute_score(sk, net, saddr, hnum, sport,
228 !ipv6_only_sock(sk)) { 266 daddr, dport, dif);
229 int score = (sk->sk_family == PF_INET ? 1 : 0); 267 if (score > badness) {
230 if (inet->rcv_saddr) { 268 result = sk;
231 if (inet->rcv_saddr != daddr) 269 badness = score;
232 continue;
233 score+=2;
234 }
235 if (inet->daddr) {
236 if (inet->daddr != saddr)
237 continue;
238 score+=2;
239 }
240 if (inet->dport) {
241 if (inet->dport != sport)
242 continue;
243 score+=2;
244 }
245 if (sk->sk_bound_dev_if) {
246 if (sk->sk_bound_dev_if != dif)
247 continue;
248 score+=2;
249 }
250 if (score == 9) {
251 result = sk;
252 break;
253 } else if (score > badness) {
254 result = sk;
255 badness = score;
256 }
257 } 270 }
258 } 271 }
259 if (result) 272 if (result)
260 sock_hold(result); 273 sock_hold(result);
261 read_unlock(&udp_hash_lock); 274 spin_unlock(&hslot->lock);
262 return result; 275 return result;
263} 276}
264 277
265static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, 278static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
266 __be16 sport, __be16 dport, 279 __be16 sport, __be16 dport,
267 struct hlist_head udptable[]) 280 struct udp_table *udptable)
268{ 281{
269 struct sock *sk; 282 struct sock *sk;
270 const struct iphdr *iph = ip_hdr(skb); 283 const struct iphdr *iph = ip_hdr(skb);
@@ -280,7 +293,7 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
280struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, 293struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
281 __be32 daddr, __be16 dport, int dif) 294 __be32 daddr, __be16 dport, int dif)
282{ 295{
283 return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, udp_hash); 296 return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
284} 297}
285EXPORT_SYMBOL_GPL(udp4_lib_lookup); 298EXPORT_SYMBOL_GPL(udp4_lib_lookup);
286 299
@@ -323,7 +336,7 @@ found:
323 * to find the appropriate port. 336 * to find the appropriate port.
324 */ 337 */
325 338
326void __udp4_lib_err(struct sk_buff *skb, u32 info, struct hlist_head udptable[]) 339void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
327{ 340{
328 struct inet_sock *inet; 341 struct inet_sock *inet;
329 struct iphdr *iph = (struct iphdr*)skb->data; 342 struct iphdr *iph = (struct iphdr*)skb->data;
@@ -392,7 +405,7 @@ out:
392 405
393void udp_err(struct sk_buff *skb, u32 info) 406void udp_err(struct sk_buff *skb, u32 info)
394{ 407{
395 __udp4_lib_err(skb, info, udp_hash); 408 __udp4_lib_err(skb, info, &udp_table);
396} 409}
397 410
398/* 411/*
@@ -933,6 +946,21 @@ int udp_disconnect(struct sock *sk, int flags)
933 return 0; 946 return 0;
934} 947}
935 948
949void udp_lib_unhash(struct sock *sk)
950{
951 struct udp_table *udptable = sk->sk_prot->h.udp_table;
952 unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash);
953 struct udp_hslot *hslot = &udptable->hash[hash];
954
955 spin_lock(&hslot->lock);
956 if (sk_del_node_init(sk)) {
957 inet_sk(sk)->num = 0;
958 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
959 }
960 spin_unlock(&hslot->lock);
961}
962EXPORT_SYMBOL(udp_lib_unhash);
963
936static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 964static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
937{ 965{
938 int is_udplite = IS_UDPLITE(sk); 966 int is_udplite = IS_UDPLITE(sk);
@@ -1071,13 +1099,14 @@ drop:
1071static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, 1099static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
1072 struct udphdr *uh, 1100 struct udphdr *uh,
1073 __be32 saddr, __be32 daddr, 1101 __be32 saddr, __be32 daddr,
1074 struct hlist_head udptable[]) 1102 struct udp_table *udptable)
1075{ 1103{
1076 struct sock *sk; 1104 struct sock *sk;
1105 struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))];
1077 int dif; 1106 int dif;
1078 1107
1079 read_lock(&udp_hash_lock); 1108 spin_lock(&hslot->lock);
1080 sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]); 1109 sk = sk_head(&hslot->head);
1081 dif = skb->dev->ifindex; 1110 dif = skb->dev->ifindex;
1082 sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); 1111 sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
1083 if (sk) { 1112 if (sk) {
@@ -1102,7 +1131,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
1102 } while (sknext); 1131 } while (sknext);
1103 } else 1132 } else
1104 kfree_skb(skb); 1133 kfree_skb(skb);
1105 read_unlock(&udp_hash_lock); 1134 spin_unlock(&hslot->lock);
1106 return 0; 1135 return 0;
1107} 1136}
1108 1137
@@ -1148,7 +1177,7 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
1148 * All we need to do is get the socket, and then do a checksum. 1177 * All we need to do is get the socket, and then do a checksum.
1149 */ 1178 */
1150 1179
1151int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], 1180int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
1152 int proto) 1181 int proto)
1153{ 1182{
1154 struct sock *sk; 1183 struct sock *sk;
@@ -1246,7 +1275,7 @@ drop:
1246 1275
1247int udp_rcv(struct sk_buff *skb) 1276int udp_rcv(struct sk_buff *skb)
1248{ 1277{
1249 return __udp4_lib_rcv(skb, udp_hash, IPPROTO_UDP); 1278 return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
1250} 1279}
1251 1280
1252void udp_destroy_sock(struct sock *sk) 1281void udp_destroy_sock(struct sock *sk)
@@ -1488,7 +1517,7 @@ struct proto udp_prot = {
1488 .sysctl_wmem = &sysctl_udp_wmem_min, 1517 .sysctl_wmem = &sysctl_udp_wmem_min,
1489 .sysctl_rmem = &sysctl_udp_rmem_min, 1518 .sysctl_rmem = &sysctl_udp_rmem_min,
1490 .obj_size = sizeof(struct udp_sock), 1519 .obj_size = sizeof(struct udp_sock),
1491 .h.udp_hash = udp_hash, 1520 .h.udp_table = &udp_table,
1492#ifdef CONFIG_COMPAT 1521#ifdef CONFIG_COMPAT
1493 .compat_setsockopt = compat_udp_setsockopt, 1522 .compat_setsockopt = compat_udp_setsockopt,
1494 .compat_getsockopt = compat_udp_getsockopt, 1523 .compat_getsockopt = compat_udp_getsockopt,
@@ -1498,20 +1527,23 @@ struct proto udp_prot = {
1498/* ------------------------------------------------------------------------ */ 1527/* ------------------------------------------------------------------------ */
1499#ifdef CONFIG_PROC_FS 1528#ifdef CONFIG_PROC_FS
1500 1529
1501static struct sock *udp_get_first(struct seq_file *seq) 1530static struct sock *udp_get_first(struct seq_file *seq, int start)
1502{ 1531{
1503 struct sock *sk; 1532 struct sock *sk;
1504 struct udp_iter_state *state = seq->private; 1533 struct udp_iter_state *state = seq->private;
1505 struct net *net = seq_file_net(seq); 1534 struct net *net = seq_file_net(seq);
1506 1535
1507 for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { 1536 for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
1508 struct hlist_node *node; 1537 struct hlist_node *node;
1509 sk_for_each(sk, node, state->hashtable + state->bucket) { 1538 struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
1539 spin_lock_bh(&hslot->lock);
1540 sk_for_each(sk, node, &hslot->head) {
1510 if (!net_eq(sock_net(sk), net)) 1541 if (!net_eq(sock_net(sk), net))
1511 continue; 1542 continue;
1512 if (sk->sk_family == state->family) 1543 if (sk->sk_family == state->family)
1513 goto found; 1544 goto found;
1514 } 1545 }
1546 spin_unlock_bh(&hslot->lock);
1515 } 1547 }
1516 sk = NULL; 1548 sk = NULL;
1517found: 1549found:
@@ -1525,20 +1557,18 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
1525 1557
1526 do { 1558 do {
1527 sk = sk_next(sk); 1559 sk = sk_next(sk);
1528try_again:
1529 ;
1530 } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); 1560 } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
1531 1561
1532 if (!sk && ++state->bucket < UDP_HTABLE_SIZE) { 1562 if (!sk) {
1533 sk = sk_head(state->hashtable + state->bucket); 1563 spin_unlock(&state->udp_table->hash[state->bucket].lock);
1534 goto try_again; 1564 return udp_get_first(seq, state->bucket + 1);
1535 } 1565 }
1536 return sk; 1566 return sk;
1537} 1567}
1538 1568
1539static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos) 1569static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
1540{ 1570{
1541 struct sock *sk = udp_get_first(seq); 1571 struct sock *sk = udp_get_first(seq, 0);
1542 1572
1543 if (sk) 1573 if (sk)
1544 while (pos && (sk = udp_get_next(seq, sk)) != NULL) 1574 while (pos && (sk = udp_get_next(seq, sk)) != NULL)
@@ -1547,9 +1577,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
1547} 1577}
1548 1578
1549static void *udp_seq_start(struct seq_file *seq, loff_t *pos) 1579static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
1550 __acquires(udp_hash_lock)
1551{ 1580{
1552 read_lock(&udp_hash_lock);
1553 return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN; 1581 return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
1554} 1582}
1555 1583
@@ -1567,9 +1595,11 @@ static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1567} 1595}
1568 1596
1569static void udp_seq_stop(struct seq_file *seq, void *v) 1597static void udp_seq_stop(struct seq_file *seq, void *v)
1570 __releases(udp_hash_lock)
1571{ 1598{
1572 read_unlock(&udp_hash_lock); 1599 struct udp_iter_state *state = seq->private;
1600
1601 if (state->bucket < UDP_HTABLE_SIZE)
1602 spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
1573} 1603}
1574 1604
1575static int udp_seq_open(struct inode *inode, struct file *file) 1605static int udp_seq_open(struct inode *inode, struct file *file)
@@ -1585,7 +1615,7 @@ static int udp_seq_open(struct inode *inode, struct file *file)
1585 1615
1586 s = ((struct seq_file *)file->private_data)->private; 1616 s = ((struct seq_file *)file->private_data)->private;
1587 s->family = afinfo->family; 1617 s->family = afinfo->family;
1588 s->hashtable = afinfo->hashtable; 1618 s->udp_table = afinfo->udp_table;
1589 return err; 1619 return err;
1590} 1620}
1591 1621
@@ -1657,7 +1687,7 @@ int udp4_seq_show(struct seq_file *seq, void *v)
1657static struct udp_seq_afinfo udp4_seq_afinfo = { 1687static struct udp_seq_afinfo udp4_seq_afinfo = {
1658 .name = "udp", 1688 .name = "udp",
1659 .family = AF_INET, 1689 .family = AF_INET,
1660 .hashtable = udp_hash, 1690 .udp_table = &udp_table,
1661 .seq_fops = { 1691 .seq_fops = {
1662 .owner = THIS_MODULE, 1692 .owner = THIS_MODULE,
1663 }, 1693 },
@@ -1692,10 +1722,21 @@ void udp4_proc_exit(void)
1692} 1722}
1693#endif /* CONFIG_PROC_FS */ 1723#endif /* CONFIG_PROC_FS */
1694 1724
1725void __init udp_table_init(struct udp_table *table)
1726{
1727 int i;
1728
1729 for (i = 0; i < UDP_HTABLE_SIZE; i++) {
1730 INIT_HLIST_HEAD(&table->hash[i].head);
1731 spin_lock_init(&table->hash[i].lock);
1732 }
1733}
1734
1695void __init udp_init(void) 1735void __init udp_init(void)
1696{ 1736{
1697 unsigned long limit; 1737 unsigned long limit;
1698 1738
1739 udp_table_init(&udp_table);
1699 /* Set the pressure threshold up by the same strategy of TCP. It is a 1740 /* Set the pressure threshold up by the same strategy of TCP. It is a
1700 * fraction of global memory that is up to 1/2 at 256 MB, decreasing 1741 * fraction of global memory that is up to 1/2 at 256 MB, decreasing
1701 * toward zero with the amount of memory, with a floor of 128 pages. 1742 * toward zero with the amount of memory, with a floor of 128 pages.
@@ -1712,8 +1753,6 @@ void __init udp_init(void)
1712} 1753}
1713 1754
1714EXPORT_SYMBOL(udp_disconnect); 1755EXPORT_SYMBOL(udp_disconnect);
1715EXPORT_SYMBOL(udp_hash);
1716EXPORT_SYMBOL(udp_hash_lock);
1717EXPORT_SYMBOL(udp_ioctl); 1756EXPORT_SYMBOL(udp_ioctl);
1718EXPORT_SYMBOL(udp_prot); 1757EXPORT_SYMBOL(udp_prot);
1719EXPORT_SYMBOL(udp_sendmsg); 1758EXPORT_SYMBOL(udp_sendmsg);
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index 2e9bad2fa1bc..9f4a6165f722 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -5,8 +5,8 @@
5#include <net/protocol.h> 5#include <net/protocol.h>
6#include <net/inet_common.h> 6#include <net/inet_common.h>
7 7
8extern int __udp4_lib_rcv(struct sk_buff *, struct hlist_head [], int ); 8extern int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int );
9extern void __udp4_lib_err(struct sk_buff *, u32, struct hlist_head []); 9extern void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
10 10
11extern int udp_v4_get_port(struct sock *sk, unsigned short snum); 11extern int udp_v4_get_port(struct sock *sk, unsigned short snum);
12 12
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 3c807964da96..d8ea8e5f5ea3 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -12,16 +12,17 @@
12 */ 12 */
13#include "udp_impl.h" 13#include "udp_impl.h"
14 14
15struct hlist_head udplite_hash[UDP_HTABLE_SIZE]; 15struct udp_table udplite_table;
16EXPORT_SYMBOL(udplite_table);
16 17
17static int udplite_rcv(struct sk_buff *skb) 18static int udplite_rcv(struct sk_buff *skb)
18{ 19{
19 return __udp4_lib_rcv(skb, udplite_hash, IPPROTO_UDPLITE); 20 return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
20} 21}
21 22
22static void udplite_err(struct sk_buff *skb, u32 info) 23static void udplite_err(struct sk_buff *skb, u32 info)
23{ 24{
24 __udp4_lib_err(skb, info, udplite_hash); 25 __udp4_lib_err(skb, info, &udplite_table);
25} 26}
26 27
27static struct net_protocol udplite_protocol = { 28static struct net_protocol udplite_protocol = {
@@ -50,7 +51,7 @@ struct proto udplite_prot = {
50 .unhash = udp_lib_unhash, 51 .unhash = udp_lib_unhash,
51 .get_port = udp_v4_get_port, 52 .get_port = udp_v4_get_port,
52 .obj_size = sizeof(struct udp_sock), 53 .obj_size = sizeof(struct udp_sock),
53 .h.udp_hash = udplite_hash, 54 .h.udp_table = &udplite_table,
54#ifdef CONFIG_COMPAT 55#ifdef CONFIG_COMPAT
55 .compat_setsockopt = compat_udp_setsockopt, 56 .compat_setsockopt = compat_udp_setsockopt,
56 .compat_getsockopt = compat_udp_getsockopt, 57 .compat_getsockopt = compat_udp_getsockopt,
@@ -71,7 +72,7 @@ static struct inet_protosw udplite4_protosw = {
71static struct udp_seq_afinfo udplite4_seq_afinfo = { 72static struct udp_seq_afinfo udplite4_seq_afinfo = {
72 .name = "udplite", 73 .name = "udplite",
73 .family = AF_INET, 74 .family = AF_INET,
74 .hashtable = udplite_hash, 75 .udp_table = &udplite_table,
75 .seq_fops = { 76 .seq_fops = {
76 .owner = THIS_MODULE, 77 .owner = THIS_MODULE,
77 }, 78 },
@@ -108,6 +109,7 @@ static inline int udplite4_proc_init(void)
108 109
109void __init udplite4_register(void) 110void __init udplite4_register(void)
110{ 111{
112 udp_table_init(&udplite_table);
111 if (proto_register(&udplite_prot, 1)) 113 if (proto_register(&udplite_prot, 1))
112 goto out_register_err; 114 goto out_register_err;
113 115
@@ -126,5 +128,4 @@ out_register_err:
126 printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__); 128 printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__);
127} 129}
128 130
129EXPORT_SYMBOL(udplite_hash);
130EXPORT_SYMBOL(udplite_prot); 131EXPORT_SYMBOL(udplite_prot);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e51da8c092fa..ccee7244ca0f 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -54,62 +54,73 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
54 return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal); 54 return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal);
55} 55}
56 56
57static inline int compute_score(struct sock *sk, struct net *net,
58 unsigned short hnum,
59 struct in6_addr *saddr, __be16 sport,
60 struct in6_addr *daddr, __be16 dport,
61 int dif)
62{
63 int score = -1;
64
65 if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
66 sk->sk_family == PF_INET6) {
67 struct ipv6_pinfo *np = inet6_sk(sk);
68 struct inet_sock *inet = inet_sk(sk);
69
70 score = 0;
71 if (inet->dport) {
72 if (inet->dport != sport)
73 return -1;
74 score++;
75 }
76 if (!ipv6_addr_any(&np->rcv_saddr)) {
77 if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
78 return -1;
79 score++;
80 }
81 if (!ipv6_addr_any(&np->daddr)) {
82 if (!ipv6_addr_equal(&np->daddr, saddr))
83 return -1;
84 score++;
85 }
86 if (sk->sk_bound_dev_if) {
87 if (sk->sk_bound_dev_if != dif)
88 return -1;
89 score++;
90 }
91 }
92 return score;
93}
94
57static struct sock *__udp6_lib_lookup(struct net *net, 95static struct sock *__udp6_lib_lookup(struct net *net,
58 struct in6_addr *saddr, __be16 sport, 96 struct in6_addr *saddr, __be16 sport,
59 struct in6_addr *daddr, __be16 dport, 97 struct in6_addr *daddr, __be16 dport,
60 int dif, struct hlist_head udptable[]) 98 int dif, struct udp_table *udptable)
61{ 99{
62 struct sock *sk, *result = NULL; 100 struct sock *sk, *result = NULL;
63 struct hlist_node *node; 101 struct hlist_node *node;
64 unsigned short hnum = ntohs(dport); 102 unsigned short hnum = ntohs(dport);
65 int badness = -1; 103 unsigned int hash = udp_hashfn(net, hnum);
66 104 struct udp_hslot *hslot = &udptable->hash[hash];
67 read_lock(&udp_hash_lock); 105 int score, badness = -1;
68 sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) { 106
69 struct inet_sock *inet = inet_sk(sk); 107 spin_lock(&hslot->lock);
70 108 sk_for_each(sk, node, &hslot->head) {
71 if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && 109 score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
72 sk->sk_family == PF_INET6) { 110 if (score > badness) {
73 struct ipv6_pinfo *np = inet6_sk(sk); 111 result = sk;
74 int score = 0; 112 badness = score;
75 if (inet->dport) {
76 if (inet->dport != sport)
77 continue;
78 score++;
79 }
80 if (!ipv6_addr_any(&np->rcv_saddr)) {
81 if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
82 continue;
83 score++;
84 }
85 if (!ipv6_addr_any(&np->daddr)) {
86 if (!ipv6_addr_equal(&np->daddr, saddr))
87 continue;
88 score++;
89 }
90 if (sk->sk_bound_dev_if) {
91 if (sk->sk_bound_dev_if != dif)
92 continue;
93 score++;
94 }
95 if (score == 4) {
96 result = sk;
97 break;
98 } else if (score > badness) {
99 result = sk;
100 badness = score;
101 }
102 } 113 }
103 } 114 }
104 if (result) 115 if (result)
105 sock_hold(result); 116 sock_hold(result);
106 read_unlock(&udp_hash_lock); 117 spin_unlock(&hslot->lock);
107 return result; 118 return result;
108} 119}
109 120
110static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, 121static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
111 __be16 sport, __be16 dport, 122 __be16 sport, __be16 dport,
112 struct hlist_head udptable[]) 123 struct udp_table *udptable)
113{ 124{
114 struct sock *sk; 125 struct sock *sk;
115 struct ipv6hdr *iph = ipv6_hdr(skb); 126 struct ipv6hdr *iph = ipv6_hdr(skb);
@@ -239,7 +250,7 @@ csum_copy_err:
239 250
240void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, 251void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
241 int type, int code, int offset, __be32 info, 252 int type, int code, int offset, __be32 info,
242 struct hlist_head udptable[] ) 253 struct udp_table *udptable)
243{ 254{
244 struct ipv6_pinfo *np; 255 struct ipv6_pinfo *np;
245 struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data; 256 struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data;
@@ -275,7 +286,7 @@ static __inline__ void udpv6_err(struct sk_buff *skb,
275 struct inet6_skb_parm *opt, int type, 286 struct inet6_skb_parm *opt, int type,
276 int code, int offset, __be32 info ) 287 int code, int offset, __be32 info )
277{ 288{
278 __udp6_lib_err(skb, opt, type, code, offset, info, udp_hash); 289 __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
279} 290}
280 291
281int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) 292int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
@@ -374,14 +385,15 @@ static struct sock *udp_v6_mcast_next(struct sock *sk,
374 */ 385 */
375static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, 386static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
376 struct in6_addr *saddr, struct in6_addr *daddr, 387 struct in6_addr *saddr, struct in6_addr *daddr,
377 struct hlist_head udptable[]) 388 struct udp_table *udptable)
378{ 389{
379 struct sock *sk, *sk2; 390 struct sock *sk, *sk2;
380 const struct udphdr *uh = udp_hdr(skb); 391 const struct udphdr *uh = udp_hdr(skb);
392 struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))];
381 int dif; 393 int dif;
382 394
383 read_lock(&udp_hash_lock); 395 spin_lock(&hslot->lock);
384 sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]); 396 sk = sk_head(&hslot->head);
385 dif = inet6_iif(skb); 397 dif = inet6_iif(skb);
386 sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); 398 sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
387 if (!sk) { 399 if (!sk) {
@@ -409,7 +421,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
409 sk_add_backlog(sk, skb); 421 sk_add_backlog(sk, skb);
410 bh_unlock_sock(sk); 422 bh_unlock_sock(sk);
411out: 423out:
412 read_unlock(&udp_hash_lock); 424 spin_unlock(&hslot->lock);
413 return 0; 425 return 0;
414} 426}
415 427
@@ -447,7 +459,7 @@ static inline int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh,
447 return 0; 459 return 0;
448} 460}
449 461
450int __udp6_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], 462int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
451 int proto) 463 int proto)
452{ 464{
453 struct sock *sk; 465 struct sock *sk;
@@ -544,7 +556,7 @@ discard:
544 556
545static __inline__ int udpv6_rcv(struct sk_buff *skb) 557static __inline__ int udpv6_rcv(struct sk_buff *skb)
546{ 558{
547 return __udp6_lib_rcv(skb, udp_hash, IPPROTO_UDP); 559 return __udp6_lib_rcv(skb, &udp_table, IPPROTO_UDP);
548} 560}
549 561
550/* 562/*
@@ -1008,7 +1020,7 @@ int udp6_seq_show(struct seq_file *seq, void *v)
1008static struct udp_seq_afinfo udp6_seq_afinfo = { 1020static struct udp_seq_afinfo udp6_seq_afinfo = {
1009 .name = "udp6", 1021 .name = "udp6",
1010 .family = AF_INET6, 1022 .family = AF_INET6,
1011 .hashtable = udp_hash, 1023 .udp_table = &udp_table,
1012 .seq_fops = { 1024 .seq_fops = {
1013 .owner = THIS_MODULE, 1025 .owner = THIS_MODULE,
1014 }, 1026 },
@@ -1050,7 +1062,7 @@ struct proto udpv6_prot = {
1050 .sysctl_wmem = &sysctl_udp_wmem_min, 1062 .sysctl_wmem = &sysctl_udp_wmem_min,
1051 .sysctl_rmem = &sysctl_udp_rmem_min, 1063 .sysctl_rmem = &sysctl_udp_rmem_min,
1052 .obj_size = sizeof(struct udp6_sock), 1064 .obj_size = sizeof(struct udp6_sock),
1053 .h.udp_hash = udp_hash, 1065 .h.udp_table = &udp_table,
1054#ifdef CONFIG_COMPAT 1066#ifdef CONFIG_COMPAT
1055 .compat_setsockopt = compat_udpv6_setsockopt, 1067 .compat_setsockopt = compat_udpv6_setsockopt,
1056 .compat_getsockopt = compat_udpv6_getsockopt, 1068 .compat_getsockopt = compat_udpv6_getsockopt,
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index 92dd7da766d8..23779208c334 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -7,9 +7,9 @@
7#include <net/inet_common.h> 7#include <net/inet_common.h>
8#include <net/transp_v6.h> 8#include <net/transp_v6.h>
9 9
10extern int __udp6_lib_rcv(struct sk_buff *, struct hlist_head [], int ); 10extern int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int );
11extern void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, 11extern void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *,
12 int , int , int , __be32 , struct hlist_head []); 12 int , int , int , __be32 , struct udp_table *);
13 13
14extern int udp_v6_get_port(struct sock *sk, unsigned short snum); 14extern int udp_v6_get_port(struct sock *sk, unsigned short snum);
15 15
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 3cd1a1ac3d6c..f1e892a99e05 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -15,14 +15,14 @@
15 15
16static int udplitev6_rcv(struct sk_buff *skb) 16static int udplitev6_rcv(struct sk_buff *skb)
17{ 17{
18 return __udp6_lib_rcv(skb, udplite_hash, IPPROTO_UDPLITE); 18 return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
19} 19}
20 20
21static void udplitev6_err(struct sk_buff *skb, 21static void udplitev6_err(struct sk_buff *skb,
22 struct inet6_skb_parm *opt, 22 struct inet6_skb_parm *opt,
23 int type, int code, int offset, __be32 info) 23 int type, int code, int offset, __be32 info)
24{ 24{
25 __udp6_lib_err(skb, opt, type, code, offset, info, udplite_hash); 25 __udp6_lib_err(skb, opt, type, code, offset, info, &udplite_table);
26} 26}
27 27
28static struct inet6_protocol udplitev6_protocol = { 28static struct inet6_protocol udplitev6_protocol = {
@@ -49,7 +49,7 @@ struct proto udplitev6_prot = {
49 .unhash = udp_lib_unhash, 49 .unhash = udp_lib_unhash,
50 .get_port = udp_v6_get_port, 50 .get_port = udp_v6_get_port,
51 .obj_size = sizeof(struct udp6_sock), 51 .obj_size = sizeof(struct udp6_sock),
52 .h.udp_hash = udplite_hash, 52 .h.udp_table = &udplite_table,
53#ifdef CONFIG_COMPAT 53#ifdef CONFIG_COMPAT
54 .compat_setsockopt = compat_udpv6_setsockopt, 54 .compat_setsockopt = compat_udpv6_setsockopt,
55 .compat_getsockopt = compat_udpv6_getsockopt, 55 .compat_getsockopt = compat_udpv6_getsockopt,
@@ -95,7 +95,7 @@ void udplitev6_exit(void)
95static struct udp_seq_afinfo udplite6_seq_afinfo = { 95static struct udp_seq_afinfo udplite6_seq_afinfo = {
96 .name = "udplite6", 96 .name = "udplite6",
97 .family = AF_INET6, 97 .family = AF_INET6,
98 .hashtable = udplite_hash, 98 .udp_table = &udplite_table,
99 .seq_fops = { 99 .seq_fops = {
100 .owner = THIS_MODULE, 100 .owner = THIS_MODULE,
101 }, 101 },