diff options
author | Sven Wegener <sven.wegener@stealer.net> | 2008-08-17 18:52:08 -0400 |
---|---|---|
committer | Simon Horman <horms@verge.net.au> | 2008-08-19 03:37:04 -0400 |
commit | 39ac50d0c79747b186c1268d9a488f8c1d256be7 (patch) | |
tree | fe2b213b41cef4c3eb3242836a4b7144f05bd826 /net | |
parent | 3f087668c4e7c97289f0a67f9278ae6e0a765a80 (diff) |
ipvs: Fix race conditions in lblc scheduler
We can't access the cache entry outside of our critical read-locked region,
because someone may free that entry. And we also need to check in the critical
region wether the destination is still available, i.e. it's not in the trash.
If we drop our reference counter, the destination can be purged from the trash
at any time. Our caller only guarantees that no destination is moved to the
trash, while we are scheduling. Also there is no need for our own rwlock,
there is already one in the service structure for use in the schedulers.
Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
Signed-off-by: Simon Horman <horms@verge.net.au>
Diffstat (limited to 'net')
-rw-r--r-- | net/ipv4/ipvs/ip_vs_lblc.c | 204 |
1 files changed, 96 insertions, 108 deletions
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index b9b334cccf37..d2a43aa3fe4c 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c | |||
@@ -96,7 +96,6 @@ struct ip_vs_lblc_entry { | |||
96 | * IPVS lblc hash table | 96 | * IPVS lblc hash table |
97 | */ | 97 | */ |
98 | struct ip_vs_lblc_table { | 98 | struct ip_vs_lblc_table { |
99 | rwlock_t lock; /* lock for this table */ | ||
100 | struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ | 99 | struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ |
101 | atomic_t entries; /* number of entries */ | 100 | atomic_t entries; /* number of entries */ |
102 | int max_size; /* maximum size of entries */ | 101 | int max_size; /* maximum size of entries */ |
@@ -123,31 +122,6 @@ static ctl_table vs_vars_table[] = { | |||
123 | 122 | ||
124 | static struct ctl_table_header * sysctl_header; | 123 | static struct ctl_table_header * sysctl_header; |
125 | 124 | ||
126 | /* | ||
127 | * new/free a ip_vs_lblc_entry, which is a mapping of a destionation | ||
128 | * IP address to a server. | ||
129 | */ | ||
130 | static inline struct ip_vs_lblc_entry * | ||
131 | ip_vs_lblc_new(__be32 daddr, struct ip_vs_dest *dest) | ||
132 | { | ||
133 | struct ip_vs_lblc_entry *en; | ||
134 | |||
135 | en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC); | ||
136 | if (en == NULL) { | ||
137 | IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); | ||
138 | return NULL; | ||
139 | } | ||
140 | |||
141 | INIT_LIST_HEAD(&en->list); | ||
142 | en->addr = daddr; | ||
143 | |||
144 | atomic_inc(&dest->refcnt); | ||
145 | en->dest = dest; | ||
146 | |||
147 | return en; | ||
148 | } | ||
149 | |||
150 | |||
151 | static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) | 125 | static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) |
152 | { | 126 | { |
153 | list_del(&en->list); | 127 | list_del(&en->list); |
@@ -173,55 +147,66 @@ static inline unsigned ip_vs_lblc_hashkey(__be32 addr) | |||
173 | * Hash an entry in the ip_vs_lblc_table. | 147 | * Hash an entry in the ip_vs_lblc_table. |
174 | * returns bool success. | 148 | * returns bool success. |
175 | */ | 149 | */ |
176 | static int | 150 | static void |
177 | ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) | 151 | ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) |
178 | { | 152 | { |
179 | unsigned hash; | 153 | unsigned hash = ip_vs_lblc_hashkey(en->addr); |
180 | |||
181 | if (!list_empty(&en->list)) { | ||
182 | IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, " | ||
183 | "called from %p\n", __builtin_return_address(0)); | ||
184 | return 0; | ||
185 | } | ||
186 | 154 | ||
187 | /* | ||
188 | * Hash by destination IP address | ||
189 | */ | ||
190 | hash = ip_vs_lblc_hashkey(en->addr); | ||
191 | |||
192 | write_lock(&tbl->lock); | ||
193 | list_add(&en->list, &tbl->bucket[hash]); | 155 | list_add(&en->list, &tbl->bucket[hash]); |
194 | atomic_inc(&tbl->entries); | 156 | atomic_inc(&tbl->entries); |
195 | write_unlock(&tbl->lock); | ||
196 | |||
197 | return 1; | ||
198 | } | 157 | } |
199 | 158 | ||
200 | 159 | ||
201 | /* | 160 | /* |
202 | * Get ip_vs_lblc_entry associated with supplied parameters. | 161 | * Get ip_vs_lblc_entry associated with supplied parameters. Called under read |
162 | * lock | ||
203 | */ | 163 | */ |
204 | static inline struct ip_vs_lblc_entry * | 164 | static inline struct ip_vs_lblc_entry * |
205 | ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) | 165 | ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) |
206 | { | 166 | { |
207 | unsigned hash; | 167 | unsigned hash = ip_vs_lblc_hashkey(addr); |
208 | struct ip_vs_lblc_entry *en; | 168 | struct ip_vs_lblc_entry *en; |
209 | 169 | ||
210 | hash = ip_vs_lblc_hashkey(addr); | 170 | list_for_each_entry(en, &tbl->bucket[hash], list) |
171 | if (en->addr == addr) | ||
172 | return en; | ||
211 | 173 | ||
212 | read_lock(&tbl->lock); | 174 | return NULL; |
175 | } | ||
213 | 176 | ||
214 | list_for_each_entry(en, &tbl->bucket[hash], list) { | 177 | |
215 | if (en->addr == addr) { | 178 | /* |
216 | /* HIT */ | 179 | * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP |
217 | read_unlock(&tbl->lock); | 180 | * address to a server. Called under write lock. |
218 | return en; | 181 | */ |
182 | static inline struct ip_vs_lblc_entry * | ||
183 | ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr, | ||
184 | struct ip_vs_dest *dest) | ||
185 | { | ||
186 | struct ip_vs_lblc_entry *en; | ||
187 | |||
188 | en = ip_vs_lblc_get(tbl, daddr); | ||
189 | if (!en) { | ||
190 | en = kmalloc(sizeof(*en), GFP_ATOMIC); | ||
191 | if (!en) { | ||
192 | IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); | ||
193 | return NULL; | ||
219 | } | 194 | } |
220 | } | ||
221 | 195 | ||
222 | read_unlock(&tbl->lock); | 196 | en->addr = daddr; |
197 | en->lastuse = jiffies; | ||
223 | 198 | ||
224 | return NULL; | 199 | atomic_inc(&dest->refcnt); |
200 | en->dest = dest; | ||
201 | |||
202 | ip_vs_lblc_hash(tbl, en); | ||
203 | } else if (en->dest != dest) { | ||
204 | atomic_dec(&en->dest->refcnt); | ||
205 | atomic_inc(&dest->refcnt); | ||
206 | en->dest = dest; | ||
207 | } | ||
208 | |||
209 | return en; | ||
225 | } | 210 | } |
226 | 211 | ||
227 | 212 | ||
@@ -230,30 +215,29 @@ ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) | |||
230 | */ | 215 | */ |
231 | static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) | 216 | static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) |
232 | { | 217 | { |
233 | int i; | ||
234 | struct ip_vs_lblc_entry *en, *nxt; | 218 | struct ip_vs_lblc_entry *en, *nxt; |
219 | int i; | ||
235 | 220 | ||
236 | for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { | 221 | for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { |
237 | write_lock(&tbl->lock); | ||
238 | list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { | 222 | list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { |
239 | ip_vs_lblc_free(en); | 223 | ip_vs_lblc_free(en); |
240 | atomic_dec(&tbl->entries); | 224 | atomic_dec(&tbl->entries); |
241 | } | 225 | } |
242 | write_unlock(&tbl->lock); | ||
243 | } | 226 | } |
244 | } | 227 | } |
245 | 228 | ||
246 | 229 | ||
247 | static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) | 230 | static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) |
248 | { | 231 | { |
232 | struct ip_vs_lblc_table *tbl = svc->sched_data; | ||
233 | struct ip_vs_lblc_entry *en, *nxt; | ||
249 | unsigned long now = jiffies; | 234 | unsigned long now = jiffies; |
250 | int i, j; | 235 | int i, j; |
251 | struct ip_vs_lblc_entry *en, *nxt; | ||
252 | 236 | ||
253 | for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { | 237 | for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { |
254 | j = (j + 1) & IP_VS_LBLC_TAB_MASK; | 238 | j = (j + 1) & IP_VS_LBLC_TAB_MASK; |
255 | 239 | ||
256 | write_lock(&tbl->lock); | 240 | write_lock(&svc->sched_lock); |
257 | list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { | 241 | list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { |
258 | if (time_before(now, | 242 | if (time_before(now, |
259 | en->lastuse + sysctl_ip_vs_lblc_expiration)) | 243 | en->lastuse + sysctl_ip_vs_lblc_expiration)) |
@@ -262,7 +246,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) | |||
262 | ip_vs_lblc_free(en); | 246 | ip_vs_lblc_free(en); |
263 | atomic_dec(&tbl->entries); | 247 | atomic_dec(&tbl->entries); |
264 | } | 248 | } |
265 | write_unlock(&tbl->lock); | 249 | write_unlock(&svc->sched_lock); |
266 | } | 250 | } |
267 | tbl->rover = j; | 251 | tbl->rover = j; |
268 | } | 252 | } |
@@ -281,17 +265,16 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) | |||
281 | */ | 265 | */ |
282 | static void ip_vs_lblc_check_expire(unsigned long data) | 266 | static void ip_vs_lblc_check_expire(unsigned long data) |
283 | { | 267 | { |
284 | struct ip_vs_lblc_table *tbl; | 268 | struct ip_vs_service *svc = (struct ip_vs_service *) data; |
269 | struct ip_vs_lblc_table *tbl = svc->sched_data; | ||
285 | unsigned long now = jiffies; | 270 | unsigned long now = jiffies; |
286 | int goal; | 271 | int goal; |
287 | int i, j; | 272 | int i, j; |
288 | struct ip_vs_lblc_entry *en, *nxt; | 273 | struct ip_vs_lblc_entry *en, *nxt; |
289 | 274 | ||
290 | tbl = (struct ip_vs_lblc_table *)data; | ||
291 | |||
292 | if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { | 275 | if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { |
293 | /* do full expiration check */ | 276 | /* do full expiration check */ |
294 | ip_vs_lblc_full_check(tbl); | 277 | ip_vs_lblc_full_check(svc); |
295 | tbl->counter = 1; | 278 | tbl->counter = 1; |
296 | goto out; | 279 | goto out; |
297 | } | 280 | } |
@@ -308,7 +291,7 @@ static void ip_vs_lblc_check_expire(unsigned long data) | |||
308 | for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { | 291 | for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { |
309 | j = (j + 1) & IP_VS_LBLC_TAB_MASK; | 292 | j = (j + 1) & IP_VS_LBLC_TAB_MASK; |
310 | 293 | ||
311 | write_lock(&tbl->lock); | 294 | write_lock(&svc->sched_lock); |
312 | list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { | 295 | list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { |
313 | if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) | 296 | if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) |
314 | continue; | 297 | continue; |
@@ -317,7 +300,7 @@ static void ip_vs_lblc_check_expire(unsigned long data) | |||
317 | atomic_dec(&tbl->entries); | 300 | atomic_dec(&tbl->entries); |
318 | goal--; | 301 | goal--; |
319 | } | 302 | } |
320 | write_unlock(&tbl->lock); | 303 | write_unlock(&svc->sched_lock); |
321 | if (goal <= 0) | 304 | if (goal <= 0) |
322 | break; | 305 | break; |
323 | } | 306 | } |
@@ -336,15 +319,14 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) | |||
336 | /* | 319 | /* |
337 | * Allocate the ip_vs_lblc_table for this service | 320 | * Allocate the ip_vs_lblc_table for this service |
338 | */ | 321 | */ |
339 | tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC); | 322 | tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); |
340 | if (tbl == NULL) { | 323 | if (tbl == NULL) { |
341 | IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); | 324 | IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); |
342 | return -ENOMEM; | 325 | return -ENOMEM; |
343 | } | 326 | } |
344 | svc->sched_data = tbl; | 327 | svc->sched_data = tbl; |
345 | IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " | 328 | IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " |
346 | "current service\n", | 329 | "current service\n", sizeof(*tbl)); |
347 | sizeof(struct ip_vs_lblc_table)); | ||
348 | 330 | ||
349 | /* | 331 | /* |
350 | * Initialize the hash buckets | 332 | * Initialize the hash buckets |
@@ -352,7 +334,6 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) | |||
352 | for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { | 334 | for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { |
353 | INIT_LIST_HEAD(&tbl->bucket[i]); | 335 | INIT_LIST_HEAD(&tbl->bucket[i]); |
354 | } | 336 | } |
355 | rwlock_init(&tbl->lock); | ||
356 | tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; | 337 | tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; |
357 | tbl->rover = 0; | 338 | tbl->rover = 0; |
358 | tbl->counter = 1; | 339 | tbl->counter = 1; |
@@ -361,9 +342,8 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) | |||
361 | * Hook periodic timer for garbage collection | 342 | * Hook periodic timer for garbage collection |
362 | */ | 343 | */ |
363 | setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, | 344 | setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, |
364 | (unsigned long)tbl); | 345 | (unsigned long)svc); |
365 | tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; | 346 | mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); |
366 | add_timer(&tbl->periodic_timer); | ||
367 | 347 | ||
368 | return 0; | 348 | return 0; |
369 | } | 349 | } |
@@ -380,9 +360,9 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) | |||
380 | ip_vs_lblc_flush(tbl); | 360 | ip_vs_lblc_flush(tbl); |
381 | 361 | ||
382 | /* release the table itself */ | 362 | /* release the table itself */ |
383 | kfree(svc->sched_data); | 363 | kfree(tbl); |
384 | IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", | 364 | IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", |
385 | sizeof(struct ip_vs_lblc_table)); | 365 | sizeof(*tbl)); |
386 | 366 | ||
387 | return 0; | 367 | return 0; |
388 | } | 368 | } |
@@ -478,46 +458,54 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) | |||
478 | static struct ip_vs_dest * | 458 | static struct ip_vs_dest * |
479 | ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | 459 | ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) |
480 | { | 460 | { |
481 | struct ip_vs_dest *dest; | 461 | struct ip_vs_lblc_table *tbl = svc->sched_data; |
482 | struct ip_vs_lblc_table *tbl; | ||
483 | struct ip_vs_lblc_entry *en; | ||
484 | struct iphdr *iph = ip_hdr(skb); | 462 | struct iphdr *iph = ip_hdr(skb); |
463 | struct ip_vs_dest *dest = NULL; | ||
464 | struct ip_vs_lblc_entry *en; | ||
485 | 465 | ||
486 | IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); | 466 | IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); |
487 | 467 | ||
488 | tbl = (struct ip_vs_lblc_table *)svc->sched_data; | 468 | /* First look in our cache */ |
469 | read_lock(&svc->sched_lock); | ||
489 | en = ip_vs_lblc_get(tbl, iph->daddr); | 470 | en = ip_vs_lblc_get(tbl, iph->daddr); |
490 | if (en == NULL) { | 471 | if (en) { |
491 | dest = __ip_vs_lblc_schedule(svc, iph); | 472 | /* We only hold a read lock, but this is atomic */ |
492 | if (dest == NULL) { | 473 | en->lastuse = jiffies; |
493 | IP_VS_DBG(1, "no destination available\n"); | 474 | |
494 | return NULL; | 475 | /* |
495 | } | 476 | * If the destination is not available, i.e. it's in the trash, |
496 | en = ip_vs_lblc_new(iph->daddr, dest); | 477 | * we must ignore it, as it may be removed from under our feet, |
497 | if (en == NULL) { | 478 | * if someone drops our reference count. Our caller only makes |
498 | return NULL; | 479 | * sure that destinations, that are not in the trash, are not |
499 | } | 480 | * moved to the trash, while we are scheduling. But anyone can |
500 | ip_vs_lblc_hash(tbl, en); | 481 | * free up entries from the trash at any time. |
501 | } else { | 482 | */ |
502 | dest = en->dest; | 483 | |
503 | if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) | 484 | if (en->dest->flags & IP_VS_DEST_F_AVAILABLE) |
504 | || atomic_read(&dest->weight) <= 0 | 485 | dest = en->dest; |
505 | || is_overloaded(dest, svc)) { | 486 | } |
506 | dest = __ip_vs_lblc_schedule(svc, iph); | 487 | read_unlock(&svc->sched_lock); |
507 | if (dest == NULL) { | 488 | |
508 | IP_VS_DBG(1, "no destination available\n"); | 489 | /* If the destination has a weight and is not overloaded, use it */ |
509 | return NULL; | 490 | if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) |
510 | } | 491 | goto out; |
511 | atomic_dec(&en->dest->refcnt); | 492 | |
512 | atomic_inc(&dest->refcnt); | 493 | /* No cache entry or it is invalid, time to schedule */ |
513 | en->dest = dest; | 494 | dest = __ip_vs_lblc_schedule(svc, iph); |
514 | } | 495 | if (!dest) { |
496 | IP_VS_DBG(1, "no destination available\n"); | ||
497 | return NULL; | ||
515 | } | 498 | } |
516 | en->lastuse = jiffies; | ||
517 | 499 | ||
500 | /* If we fail to create a cache entry, we'll just use the valid dest */ | ||
501 | write_lock(&svc->sched_lock); | ||
502 | ip_vs_lblc_new(tbl, iph->daddr, dest); | ||
503 | write_unlock(&svc->sched_lock); | ||
504 | |||
505 | out: | ||
518 | IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " | 506 | IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " |
519 | "--> server %u.%u.%u.%u:%d\n", | 507 | "--> server %u.%u.%u.%u:%d\n", |
520 | NIPQUAD(en->addr), | 508 | NIPQUAD(iph->daddr), |
521 | NIPQUAD(dest->addr), | 509 | NIPQUAD(dest->addr), |
522 | ntohs(dest->port)); | 510 | ntohs(dest->port)); |
523 | 511 | ||