diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/ipvs/ip_vs_lblc.c | 204 |
1 files changed, 96 insertions, 108 deletions
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index b9b334cccf37..d2a43aa3fe4c 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -96,7 +96,6 @@ struct ip_vs_lblc_entry { | |||
96 | * IPVS lblc hash table | 96 | * IPVS lblc hash table |
97 | */ | 97 | */ |
98 | struct ip_vs_lblc_table { | 98 | struct ip_vs_lblc_table { |
99 | rwlock_t lock; /* lock for this table */ | ||
100 | struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ | 99 | struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ |
101 | atomic_t entries; /* number of entries */ | 100 | atomic_t entries; /* number of entries */ |
102 | int max_size; /* maximum size of entries */ | 101 | int max_size; /* maximum size of entries */ |
@@ -123,31 +122,6 @@ static ctl_table vs_vars_table[] = { | |||
123 | 122 | ||
124 | static struct ctl_table_header * sysctl_header; | 123 | static struct ctl_table_header * sysctl_header; |
125 | 124 | ||
126 | /* | ||
127 | * new/free a ip_vs_lblc_entry, which is a mapping of a destionation | ||
128 | * IP address to a server. | ||
129 | */ | ||
130 | static inline struct ip_vs_lblc_entry * | ||
131 | ip_vs_lblc_new(__be32 daddr, struct ip_vs_dest *dest) | ||
132 | { | ||
133 | struct ip_vs_lblc_entry *en; | ||
134 | |||
135 | en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC); | ||
136 | if (en == NULL) { | ||
137 | IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); | ||
138 | return NULL; | ||
139 | } | ||
140 | |||
141 | INIT_LIST_HEAD(&en->list); | ||
142 | en->addr = daddr; | ||
143 | |||
144 | atomic_inc(&dest->refcnt); | ||
145 | en->dest = dest; | ||
146 | |||
147 | return en; | ||
148 | } | ||
149 | |||
150 | |||
151 | static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) | 125 | static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) |
152 | { | 126 | { |
153 | list_del(&en->list); | 127 | list_del(&en->list); |
@@ -173,55 +147,66 @@ static inline unsigned ip_vs_lblc_hashkey(__be32 addr) | |||
173 | * Hash an entry in the ip_vs_lblc_table. | 147 | * Hash an entry in the ip_vs_lblc_table. |
174 | * returns bool success. | 148 | * returns bool success. |
175 | */ | 149 | */ |
176 | static int | 150 | static void |
177 | ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) | 151 | ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) |
178 | { | 152 | { |
179 | unsigned hash; | 153 | unsigned hash = ip_vs_lblc_hashkey(en->addr); |
180 | |||
181 | if (!list_empty(&en->list)) { | ||
182 | IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, " | ||
183 | "called from %p\n", __builtin_return_address(0)); | ||
184 | return 0; | ||
185 | } | ||
186 | 154 | ||
187 | /* | ||
188 | * Hash by destination IP address | ||
189 | */ | ||
190 | hash = ip_vs_lblc_hashkey(en->addr); | ||
191 | |||
192 | write_lock(&tbl->lock); | ||
193 | list_add(&en->list, &tbl->bucket[hash]); | 155 | list_add(&en->list, &tbl->bucket[hash]); |
194 | atomic_inc(&tbl->entries); | 156 | atomic_inc(&tbl->entries); |
195 | write_unlock(&tbl->lock); | ||
196 | |||
197 | return 1; | ||
198 | } | 157 | } |
199 | 158 | ||
200 | 159 | ||
201 | /* | 160 | /* |
202 | * Get ip_vs_lblc_entry associated with supplied parameters. | 161 | * Get ip_vs_lblc_entry associated with supplied parameters. Called under read |
162 | * lock | ||
203 | */ | 163 | */ |
204 | static inline struct ip_vs_lblc_entry * | 164 | static inline struct ip_vs_lblc_entry * |
205 | ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) | 165 | ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) |
206 | { | 166 | { |
207 | unsigned hash; | 167 | unsigned hash = ip_vs_lblc_hashkey(addr); |
208 | struct ip_vs_lblc_entry *en; | 168 | struct ip_vs_lblc_entry *en; |
209 | 169 | ||
210 | hash = ip_vs_lblc_hashkey(addr); | 170 | list_for_each_entry(en, &tbl->bucket[hash], list) |
171 | if (en->addr == addr) | ||
172 | return en; | ||
211 | 173 | ||
212 | read_lock(&tbl->lock); | 174 | return NULL; |
175 | } | ||
213 | 176 | ||
214 | list_for_each_entry(en, &tbl->bucket[hash], list) { | 177 | |
215 | if (en->addr == addr) { | 178 | /* |
216 | /* HIT */ | 179 | * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP |
217 | read_unlock(&tbl->lock); | 180 | * address to a server. Called under write lock. |
218 | return en; | 181 | */ |
182 | static inline struct ip_vs_lblc_entry * | ||
183 | ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr, | ||
184 | struct ip_vs_dest *dest) | ||
185 | { | ||
186 | struct ip_vs_lblc_entry *en; | ||
187 | |||
188 | en = ip_vs_lblc_get(tbl, daddr); | ||
189 | if (!en) { | ||
190 | en = kmalloc(sizeof(*en), GFP_ATOMIC); | ||
191 | if (!en) { | ||
192 | IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); | ||
193 | return NULL; | ||
219 | } | 194 | } |
220 | } | ||
221 | 195 | ||
222 | read_unlock(&tbl->lock); | 196 | en->addr = daddr; |
197 | en->lastuse = jiffies; | ||
223 | 198 | ||
224 | return NULL; | 199 | atomic_inc(&dest->refcnt); |
200 | en->dest = dest; | ||
201 | |||
202 | ip_vs_lblc_hash(tbl, en); | ||
203 | } else if (en->dest != dest) { | ||
204 | atomic_dec(&en->dest->refcnt); | ||
205 | atomic_inc(&dest->refcnt); | ||
206 | en->dest = dest; | ||
207 | } | ||
208 | |||
209 | return en; | ||
225 | } | 210 | } |
226 | 211 | ||
227 | 212 | ||
@@ -230,30 +215,29 @@ ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) | |||
230 | */ | 215 | */ |
231 | static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) | 216 | static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) |
232 | { | 217 | { |
233 | int i; | ||
234 | struct ip_vs_lblc_entry *en, *nxt; | 218 | struct ip_vs_lblc_entry *en, *nxt; |
219 | int i; | ||
235 | 220 | ||
236 | for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { | 221 | for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { |
237 | write_lock(&tbl->lock); | ||
238 | list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { | 222 | list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { |
239 | ip_vs_lblc_free(en); | 223 | ip_vs_lblc_free(en); |
240 | atomic_dec(&tbl->entries); | 224 | atomic_dec(&tbl->entries); |
241 | } | 225 | } |
242 | write_unlock(&tbl->lock); | ||
243 | } | 226 | } |
244 | } | 227 | } |
245 | 228 | ||
246 | 229 | ||
247 | static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) | 230 | static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) |
248 | { | 231 | { |
232 | struct ip_vs_lblc_table *tbl = svc->sched_data; | ||
233 | struct ip_vs_lblc_entry *en, *nxt; | ||
249 | unsigned long now = jiffies; | 234 | unsigned long now = jiffies; |
250 | int i, j; | 235 | int i, j; |
251 | struct ip_vs_lblc_entry *en, *nxt; | ||
252 | 236 | ||
253 | for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { | 237 | for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { |
254 | j = (j + 1) & IP_VS_LBLC_TAB_MASK; | 238 | j = (j + 1) & IP_VS_LBLC_TAB_MASK; |
255 | 239 | ||
256 | write_lock(&tbl->lock); | 240 | write_lock(&svc->sched_lock); |
257 | list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { | 241 | list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { |
258 | if (time_before(now, | 242 | if (time_before(now, |
259 | en->lastuse + sysctl_ip_vs_lblc_expiration)) | 243 | en->lastuse + sysctl_ip_vs_lblc_expiration)) |
@@ -262,7 +246,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) | |||
262 | ip_vs_lblc_free(en); | 246 | ip_vs_lblc_free(en); |
263 | atomic_dec(&tbl->entries); | 247 | atomic_dec(&tbl->entries); |
264 | } | 248 | } |
265 | write_unlock(&tbl->lock); | 249 | write_unlock(&svc->sched_lock); |
266 | } | 250 | } |
267 | tbl->rover = j; | 251 | tbl->rover = j; |
268 | } | 252 | } |
@@ -281,17 +265,16 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) | |||
281 | */ | 265 | */ |
282 | static void ip_vs_lblc_check_expire(unsigned long data) | 266 | static void ip_vs_lblc_check_expire(unsigned long data) |
283 | { | 267 | { |
284 | struct ip_vs_lblc_table *tbl; | 268 | struct ip_vs_service *svc = (struct ip_vs_service *) data; |
269 | struct ip_vs_lblc_table *tbl = svc->sched_data; | ||
285 | unsigned long now = jiffies; | 270 | unsigned long now = jiffies; |
286 | int goal; | 271 | int goal; |
287 | int i, j; | 272 | int i, j; |
288 | struct ip_vs_lblc_entry *en, *nxt; | 273 | struct ip_vs_lblc_entry *en, *nxt; |
289 | 274 | ||
290 | tbl = (struct ip_vs_lblc_table *)data; | ||
291 | |||
292 | if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { | 275 | if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { |
293 | /* do full expiration check */ | 276 | /* do full expiration check */ |
294 | ip_vs_lblc_full_check(tbl); | 277 | ip_vs_lblc_full_check(svc); |
295 | tbl->counter = 1; | 278 | tbl->counter = 1; |
296 | goto out; | 279 | goto out; |
297 | } | 280 | } |
@@ -308,7 +291,7 @@ static void ip_vs_lblc_check_expire(unsigned long data) | |||
308 | for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { | 291 | for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { |
309 | j = (j + 1) & IP_VS_LBLC_TAB_MASK; | 292 | j = (j + 1) & IP_VS_LBLC_TAB_MASK; |
310 | 293 | ||
311 | write_lock(&tbl->lock); | 294 | write_lock(&svc->sched_lock); |
312 | list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { | 295 | list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { |
313 | if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) | 296 | if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) |
314 | continue; | 297 | continue; |
@@ -317,7 +300,7 @@ static void ip_vs_lblc_check_expire(unsigned long data) | |||
317 | atomic_dec(&tbl->entries); | 300 | atomic_dec(&tbl->entries); |
318 | goal--; | 301 | goal--; |
319 | } | 302 | } |
320 | write_unlock(&tbl->lock); | 303 | write_unlock(&svc->sched_lock); |
321 | if (goal <= 0) | 304 | if (goal <= 0) |
322 | break; | 305 | break; |
323 | } | 306 | } |
@@ -336,15 +319,14 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) | |||
336 | /* | 319 | /* |
337 | * Allocate the ip_vs_lblc_table for this service | 320 | * Allocate the ip_vs_lblc_table for this service |
338 | */ | 321 | */ |
339 | tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC); | 322 | tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); |
340 | if (tbl == NULL) { | 323 | if (tbl == NULL) { |
341 | IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); | 324 | IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); |
342 | return -ENOMEM; | 325 | return -ENOMEM; |
343 | } | 326 | } |
344 | svc->sched_data = tbl; | 327 | svc->sched_data = tbl; |
345 | IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " | 328 | IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " |
346 | "current service\n", | 329 | "current service\n", sizeof(*tbl)); |
347 | sizeof(struct ip_vs_lblc_table)); | ||
348 | 330 | ||
349 | /* | 331 | /* |
350 | * Initialize the hash buckets | 332 | * Initialize the hash buckets |
@@ -352,7 +334,6 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) | |||
352 | for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { | 334 | for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { |
353 | INIT_LIST_HEAD(&tbl->bucket[i]); | 335 | INIT_LIST_HEAD(&tbl->bucket[i]); |
354 | } | 336 | } |
355 | rwlock_init(&tbl->lock); | ||
356 | tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; | 337 | tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; |
357 | tbl->rover = 0; | 338 | tbl->rover = 0; |
358 | tbl->counter = 1; | 339 | tbl->counter = 1; |
@@ -361,9 +342,8 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) | |||
361 | * Hook periodic timer for garbage collection | 342 | * Hook periodic timer for garbage collection |
362 | */ | 343 | */ |
363 | setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, | 344 | setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, |
364 | (unsigned long)tbl); | 345 | (unsigned long)svc); |
365 | tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; | 346 | mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); |
366 | add_timer(&tbl->periodic_timer); | ||
367 | 347 | ||
368 | return 0; | 348 | return 0; |
369 | } | 349 | } |
@@ -380,9 +360,9 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) | |||
380 | ip_vs_lblc_flush(tbl); | 360 | ip_vs_lblc_flush(tbl); |
381 | 361 | ||
382 | /* release the table itself */ | 362 | /* release the table itself */ |
383 | kfree(svc->sched_data); | 363 | kfree(tbl); |
384 | IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", | 364 | IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", |
385 | sizeof(struct ip_vs_lblc_table)); | 365 | sizeof(*tbl)); |
386 | 366 | ||
387 | return 0; | 367 | return 0; |
388 | } | 368 | } |
@@ -478,46 +458,54 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) | |||
478 | static struct ip_vs_dest * | 458 | static struct ip_vs_dest * |
479 | ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | 459 | ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) |
480 | { | 460 | { |
481 | struct ip_vs_dest *dest; | 461 | struct ip_vs_lblc_table *tbl = svc->sched_data; |
482 | struct ip_vs_lblc_table *tbl; | ||
483 | struct ip_vs_lblc_entry *en; | ||
484 | struct iphdr *iph = ip_hdr(skb); | 462 | struct iphdr *iph = ip_hdr(skb); |
463 | struct ip_vs_dest *dest = NULL; | ||
464 | struct ip_vs_lblc_entry *en; | ||
485 | 465 | ||
486 | IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); | 466 | IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); |
487 | 467 | ||
488 | tbl = (struct ip_vs_lblc_table *)svc->sched_data; | 468 | /* First look in our cache */ |
469 | read_lock(&svc->sched_lock); | ||
489 | en = ip_vs_lblc_get(tbl, iph->daddr); | 470 | en = ip_vs_lblc_get(tbl, iph->daddr); |
490 | if (en == NULL) { | 471 | if (en) { |
491 | dest = __ip_vs_lblc_schedule(svc, iph); | 472 | /* We only hold a read lock, but this is atomic */ |
492 | if (dest == NULL) { | 473 | en->lastuse = jiffies; |
493 | IP_VS_DBG(1, "no destination available\n"); | 474 | |
494 | return NULL; | 475 | /* |
495 | } | 476 | * If the destination is not available, i.e. it's in the trash, |
496 | en = ip_vs_lblc_new(iph->daddr, dest); | 477 | * we must ignore it, as it may be removed from under our feet, |
497 | if (en == NULL) { | 478 | * if someone drops our reference count. Our caller only makes |
498 | return NULL; | 479 | * sure that destinations, that are not in the trash, are not |
499 | } | 480 | * moved to the trash, while we are scheduling. But anyone can |
500 | ip_vs_lblc_hash(tbl, en); | 481 | * free up entries from the trash at any time. |
501 | } else { | 482 | */ |
502 | dest = en->dest; | 483 | |
503 | if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) | 484 | if (en->dest->flags & IP_VS_DEST_F_AVAILABLE) |
504 | || atomic_read(&dest->weight) <= 0 | 485 | dest = en->dest; |
505 | || is_overloaded(dest, svc)) { | 486 | } |
506 | dest = __ip_vs_lblc_schedule(svc, iph); | 487 | read_unlock(&svc->sched_lock); |
507 | if (dest == NULL) { | 488 | |
508 | IP_VS_DBG(1, "no destination available\n"); | 489 | /* If the destination has a weight and is not overloaded, use it */ |
509 | return NULL; | 490 | if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) |
510 | } | 491 | goto out; |
511 | atomic_dec(&en->dest->refcnt); | 492 | |
512 | atomic_inc(&dest->refcnt); | 493 | /* No cache entry or it is invalid, time to schedule */ |
513 | en->dest = dest; | 494 | dest = __ip_vs_lblc_schedule(svc, iph); |
514 | } | 495 | if (!dest) { |
496 | IP_VS_DBG(1, "no destination available\n"); | ||
497 | return NULL; | ||
515 | } | 498 | } |
516 | en->lastuse = jiffies; | ||
517 | 499 | ||
500 | /* If we fail to create a cache entry, we'll just use the valid dest */ | ||
501 | write_lock(&svc->sched_lock); | ||
502 | ip_vs_lblc_new(tbl, iph->daddr, dest); | ||
503 | write_unlock(&svc->sched_lock); | ||
504 | |||
505 | out: | ||
518 | IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " | 506 | IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " |
519 | "--> server %u.%u.%u.%u:%d\n", | 507 | "--> server %u.%u.%u.%u:%d\n", |
520 | NIPQUAD(en->addr), | 508 | NIPQUAD(iph->daddr), |
521 | NIPQUAD(dest->addr), | 509 | NIPQUAD(dest->addr), |
522 | ntohs(dest->port)); | 510 | ntohs(dest->port)); |
523 | 511 | ||