aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSven Wegener <sven.wegener@stealer.net>2008-08-17 18:52:08 -0400
committerSimon Horman <horms@verge.net.au>2008-08-19 03:37:04 -0400
commit39ac50d0c79747b186c1268d9a488f8c1d256be7 (patch)
treefe2b213b41cef4c3eb3242836a4b7144f05bd826
parent3f087668c4e7c97289f0a67f9278ae6e0a765a80 (diff)
ipvs: Fix race conditions in lblc scheduler
We can't access the cache entry outside of our critical read-locked region, because someone may free that entry. And we also need to check in the critical region wether the destination is still available, i.e. it's not in the trash. If we drop our reference counter, the destination can be purged from the trash at any time. Our caller only guarantees that no destination is moved to the trash, while we are scheduling. Also there is no need for our own rwlock, there is already one in the service structure for use in the schedulers. Signed-off-by: Sven Wegener <sven.wegener@stealer.net> Signed-off-by: Simon Horman <horms@verge.net.au>
-rw-r--r--net/ipv4/ipvs/ip_vs_lblc.c204
1 files changed, 96 insertions, 108 deletions
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index b9b334cccf37..d2a43aa3fe4c 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -96,7 +96,6 @@ struct ip_vs_lblc_entry {
96 * IPVS lblc hash table 96 * IPVS lblc hash table
97 */ 97 */
98struct ip_vs_lblc_table { 98struct ip_vs_lblc_table {
99 rwlock_t lock; /* lock for this table */
100 struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ 99 struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
101 atomic_t entries; /* number of entries */ 100 atomic_t entries; /* number of entries */
102 int max_size; /* maximum size of entries */ 101 int max_size; /* maximum size of entries */
@@ -123,31 +122,6 @@ static ctl_table vs_vars_table[] = {
123 122
124static struct ctl_table_header * sysctl_header; 123static struct ctl_table_header * sysctl_header;
125 124
126/*
127 * new/free a ip_vs_lblc_entry, which is a mapping of a destionation
128 * IP address to a server.
129 */
130static inline struct ip_vs_lblc_entry *
131ip_vs_lblc_new(__be32 daddr, struct ip_vs_dest *dest)
132{
133 struct ip_vs_lblc_entry *en;
134
135 en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
136 if (en == NULL) {
137 IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
138 return NULL;
139 }
140
141 INIT_LIST_HEAD(&en->list);
142 en->addr = daddr;
143
144 atomic_inc(&dest->refcnt);
145 en->dest = dest;
146
147 return en;
148}
149
150
151static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) 125static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
152{ 126{
153 list_del(&en->list); 127 list_del(&en->list);
@@ -173,55 +147,66 @@ static inline unsigned ip_vs_lblc_hashkey(__be32 addr)
173 * Hash an entry in the ip_vs_lblc_table. 147 * Hash an entry in the ip_vs_lblc_table.
174 * returns bool success. 148 * returns bool success.
175 */ 149 */
176static int 150static void
177ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) 151ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
178{ 152{
179 unsigned hash; 153 unsigned hash = ip_vs_lblc_hashkey(en->addr);
180
181 if (!list_empty(&en->list)) {
182 IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
183 "called from %p\n", __builtin_return_address(0));
184 return 0;
185 }
186 154
187 /*
188 * Hash by destination IP address
189 */
190 hash = ip_vs_lblc_hashkey(en->addr);
191
192 write_lock(&tbl->lock);
193 list_add(&en->list, &tbl->bucket[hash]); 155 list_add(&en->list, &tbl->bucket[hash]);
194 atomic_inc(&tbl->entries); 156 atomic_inc(&tbl->entries);
195 write_unlock(&tbl->lock);
196
197 return 1;
198} 157}
199 158
200 159
201/* 160/*
202 * Get ip_vs_lblc_entry associated with supplied parameters. 161 * Get ip_vs_lblc_entry associated with supplied parameters. Called under read
162 * lock
203 */ 163 */
204static inline struct ip_vs_lblc_entry * 164static inline struct ip_vs_lblc_entry *
205ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) 165ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
206{ 166{
207 unsigned hash; 167 unsigned hash = ip_vs_lblc_hashkey(addr);
208 struct ip_vs_lblc_entry *en; 168 struct ip_vs_lblc_entry *en;
209 169
210 hash = ip_vs_lblc_hashkey(addr); 170 list_for_each_entry(en, &tbl->bucket[hash], list)
171 if (en->addr == addr)
172 return en;
211 173
212 read_lock(&tbl->lock); 174 return NULL;
175}
213 176
214 list_for_each_entry(en, &tbl->bucket[hash], list) { 177
215 if (en->addr == addr) { 178/*
216 /* HIT */ 179 * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
217 read_unlock(&tbl->lock); 180 * address to a server. Called under write lock.
218 return en; 181 */
182static inline struct ip_vs_lblc_entry *
183ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr,
184 struct ip_vs_dest *dest)
185{
186 struct ip_vs_lblc_entry *en;
187
188 en = ip_vs_lblc_get(tbl, daddr);
189 if (!en) {
190 en = kmalloc(sizeof(*en), GFP_ATOMIC);
191 if (!en) {
192 IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
193 return NULL;
219 } 194 }
220 }
221 195
222 read_unlock(&tbl->lock); 196 en->addr = daddr;
197 en->lastuse = jiffies;
223 198
224 return NULL; 199 atomic_inc(&dest->refcnt);
200 en->dest = dest;
201
202 ip_vs_lblc_hash(tbl, en);
203 } else if (en->dest != dest) {
204 atomic_dec(&en->dest->refcnt);
205 atomic_inc(&dest->refcnt);
206 en->dest = dest;
207 }
208
209 return en;
225} 210}
226 211
227 212
@@ -230,30 +215,29 @@ ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
230 */ 215 */
231static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) 216static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
232{ 217{
233 int i;
234 struct ip_vs_lblc_entry *en, *nxt; 218 struct ip_vs_lblc_entry *en, *nxt;
219 int i;
235 220
236 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { 221 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
237 write_lock(&tbl->lock);
238 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { 222 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
239 ip_vs_lblc_free(en); 223 ip_vs_lblc_free(en);
240 atomic_dec(&tbl->entries); 224 atomic_dec(&tbl->entries);
241 } 225 }
242 write_unlock(&tbl->lock);
243 } 226 }
244} 227}
245 228
246 229
247static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) 230static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
248{ 231{
232 struct ip_vs_lblc_table *tbl = svc->sched_data;
233 struct ip_vs_lblc_entry *en, *nxt;
249 unsigned long now = jiffies; 234 unsigned long now = jiffies;
250 int i, j; 235 int i, j;
251 struct ip_vs_lblc_entry *en, *nxt;
252 236
253 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { 237 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
254 j = (j + 1) & IP_VS_LBLC_TAB_MASK; 238 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
255 239
256 write_lock(&tbl->lock); 240 write_lock(&svc->sched_lock);
257 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { 241 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
258 if (time_before(now, 242 if (time_before(now,
259 en->lastuse + sysctl_ip_vs_lblc_expiration)) 243 en->lastuse + sysctl_ip_vs_lblc_expiration))
@@ -262,7 +246,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
262 ip_vs_lblc_free(en); 246 ip_vs_lblc_free(en);
263 atomic_dec(&tbl->entries); 247 atomic_dec(&tbl->entries);
264 } 248 }
265 write_unlock(&tbl->lock); 249 write_unlock(&svc->sched_lock);
266 } 250 }
267 tbl->rover = j; 251 tbl->rover = j;
268} 252}
@@ -281,17 +265,16 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
281 */ 265 */
282static void ip_vs_lblc_check_expire(unsigned long data) 266static void ip_vs_lblc_check_expire(unsigned long data)
283{ 267{
284 struct ip_vs_lblc_table *tbl; 268 struct ip_vs_service *svc = (struct ip_vs_service *) data;
269 struct ip_vs_lblc_table *tbl = svc->sched_data;
285 unsigned long now = jiffies; 270 unsigned long now = jiffies;
286 int goal; 271 int goal;
287 int i, j; 272 int i, j;
288 struct ip_vs_lblc_entry *en, *nxt; 273 struct ip_vs_lblc_entry *en, *nxt;
289 274
290 tbl = (struct ip_vs_lblc_table *)data;
291
292 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { 275 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
293 /* do full expiration check */ 276 /* do full expiration check */
294 ip_vs_lblc_full_check(tbl); 277 ip_vs_lblc_full_check(svc);
295 tbl->counter = 1; 278 tbl->counter = 1;
296 goto out; 279 goto out;
297 } 280 }
@@ -308,7 +291,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)
308 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { 291 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
309 j = (j + 1) & IP_VS_LBLC_TAB_MASK; 292 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
310 293
311 write_lock(&tbl->lock); 294 write_lock(&svc->sched_lock);
312 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { 295 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
313 if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) 296 if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
314 continue; 297 continue;
@@ -317,7 +300,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)
317 atomic_dec(&tbl->entries); 300 atomic_dec(&tbl->entries);
318 goal--; 301 goal--;
319 } 302 }
320 write_unlock(&tbl->lock); 303 write_unlock(&svc->sched_lock);
321 if (goal <= 0) 304 if (goal <= 0)
322 break; 305 break;
323 } 306 }
@@ -336,15 +319,14 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
336 /* 319 /*
337 * Allocate the ip_vs_lblc_table for this service 320 * Allocate the ip_vs_lblc_table for this service
338 */ 321 */
339 tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC); 322 tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
340 if (tbl == NULL) { 323 if (tbl == NULL) {
341 IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); 324 IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
342 return -ENOMEM; 325 return -ENOMEM;
343 } 326 }
344 svc->sched_data = tbl; 327 svc->sched_data = tbl;
345 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " 328 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
346 "current service\n", 329 "current service\n", sizeof(*tbl));
347 sizeof(struct ip_vs_lblc_table));
348 330
349 /* 331 /*
350 * Initialize the hash buckets 332 * Initialize the hash buckets
@@ -352,7 +334,6 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
352 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { 334 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
353 INIT_LIST_HEAD(&tbl->bucket[i]); 335 INIT_LIST_HEAD(&tbl->bucket[i]);
354 } 336 }
355 rwlock_init(&tbl->lock);
356 tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; 337 tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
357 tbl->rover = 0; 338 tbl->rover = 0;
358 tbl->counter = 1; 339 tbl->counter = 1;
@@ -361,9 +342,8 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
361 * Hook periodic timer for garbage collection 342 * Hook periodic timer for garbage collection
362 */ 343 */
363 setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, 344 setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
364 (unsigned long)tbl); 345 (unsigned long)svc);
365 tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; 346 mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
366 add_timer(&tbl->periodic_timer);
367 347
368 return 0; 348 return 0;
369} 349}
@@ -380,9 +360,9 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
380 ip_vs_lblc_flush(tbl); 360 ip_vs_lblc_flush(tbl);
381 361
382 /* release the table itself */ 362 /* release the table itself */
383 kfree(svc->sched_data); 363 kfree(tbl);
384 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", 364 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
385 sizeof(struct ip_vs_lblc_table)); 365 sizeof(*tbl));
386 366
387 return 0; 367 return 0;
388} 368}
@@ -478,46 +458,54 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
478static struct ip_vs_dest * 458static struct ip_vs_dest *
479ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) 459ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
480{ 460{
481 struct ip_vs_dest *dest; 461 struct ip_vs_lblc_table *tbl = svc->sched_data;
482 struct ip_vs_lblc_table *tbl;
483 struct ip_vs_lblc_entry *en;
484 struct iphdr *iph = ip_hdr(skb); 462 struct iphdr *iph = ip_hdr(skb);
463 struct ip_vs_dest *dest = NULL;
464 struct ip_vs_lblc_entry *en;
485 465
486 IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); 466 IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
487 467
488 tbl = (struct ip_vs_lblc_table *)svc->sched_data; 468 /* First look in our cache */
469 read_lock(&svc->sched_lock);
489 en = ip_vs_lblc_get(tbl, iph->daddr); 470 en = ip_vs_lblc_get(tbl, iph->daddr);
490 if (en == NULL) { 471 if (en) {
491 dest = __ip_vs_lblc_schedule(svc, iph); 472 /* We only hold a read lock, but this is atomic */
492 if (dest == NULL) { 473 en->lastuse = jiffies;
493 IP_VS_DBG(1, "no destination available\n"); 474
494 return NULL; 475 /*
495 } 476 * If the destination is not available, i.e. it's in the trash,
496 en = ip_vs_lblc_new(iph->daddr, dest); 477 * we must ignore it, as it may be removed from under our feet,
497 if (en == NULL) { 478 * if someone drops our reference count. Our caller only makes
498 return NULL; 479 * sure that destinations, that are not in the trash, are not
499 } 480 * moved to the trash, while we are scheduling. But anyone can
500 ip_vs_lblc_hash(tbl, en); 481 * free up entries from the trash at any time.
501 } else { 482 */
502 dest = en->dest; 483
503 if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) 484 if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
504 || atomic_read(&dest->weight) <= 0 485 dest = en->dest;
505 || is_overloaded(dest, svc)) { 486 }
506 dest = __ip_vs_lblc_schedule(svc, iph); 487 read_unlock(&svc->sched_lock);
507 if (dest == NULL) { 488
508 IP_VS_DBG(1, "no destination available\n"); 489 /* If the destination has a weight and is not overloaded, use it */
509 return NULL; 490 if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
510 } 491 goto out;
511 atomic_dec(&en->dest->refcnt); 492
512 atomic_inc(&dest->refcnt); 493 /* No cache entry or it is invalid, time to schedule */
513 en->dest = dest; 494 dest = __ip_vs_lblc_schedule(svc, iph);
514 } 495 if (!dest) {
496 IP_VS_DBG(1, "no destination available\n");
497 return NULL;
515 } 498 }
516 en->lastuse = jiffies;
517 499
500 /* If we fail to create a cache entry, we'll just use the valid dest */
501 write_lock(&svc->sched_lock);
502 ip_vs_lblc_new(tbl, iph->daddr, dest);
503 write_unlock(&svc->sched_lock);
504
505out:
518 IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " 506 IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
519 "--> server %u.%u.%u.%u:%d\n", 507 "--> server %u.%u.%u.%u:%d\n",
520 NIPQUAD(en->addr), 508 NIPQUAD(iph->daddr),
521 NIPQUAD(dest->addr), 509 NIPQUAD(dest->addr),
522 ntohs(dest->port)); 510 ntohs(dest->port));
523 511