aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/ipvs/ip_vs_lblc.c204
1 files changed, 96 insertions, 108 deletions
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index b9b334cccf37..d2a43aa3fe4c 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -96,7 +96,6 @@ struct ip_vs_lblc_entry {
96 * IPVS lblc hash table 96 * IPVS lblc hash table
97 */ 97 */
98struct ip_vs_lblc_table { 98struct ip_vs_lblc_table {
99 rwlock_t lock; /* lock for this table */
100 struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ 99 struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
101 atomic_t entries; /* number of entries */ 100 atomic_t entries; /* number of entries */
102 int max_size; /* maximum size of entries */ 101 int max_size; /* maximum size of entries */
@@ -123,31 +122,6 @@ static ctl_table vs_vars_table[] = {
123 122
124static struct ctl_table_header * sysctl_header; 123static struct ctl_table_header * sysctl_header;
125 124
126/*
127 * new/free a ip_vs_lblc_entry, which is a mapping of a destionation
128 * IP address to a server.
129 */
130static inline struct ip_vs_lblc_entry *
131ip_vs_lblc_new(__be32 daddr, struct ip_vs_dest *dest)
132{
133 struct ip_vs_lblc_entry *en;
134
135 en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
136 if (en == NULL) {
137 IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
138 return NULL;
139 }
140
141 INIT_LIST_HEAD(&en->list);
142 en->addr = daddr;
143
144 atomic_inc(&dest->refcnt);
145 en->dest = dest;
146
147 return en;
148}
149
150
151static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) 125static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
152{ 126{
153 list_del(&en->list); 127 list_del(&en->list);
@@ -173,55 +147,66 @@ static inline unsigned ip_vs_lblc_hashkey(__be32 addr)
173 * Hash an entry in the ip_vs_lblc_table. 147 * Hash an entry in the ip_vs_lblc_table.
174 * returns bool success. 148 * returns bool success.
175 */ 149 */
176static int 150static void
177ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) 151ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
178{ 152{
179 unsigned hash; 153 unsigned hash = ip_vs_lblc_hashkey(en->addr);
180
181 if (!list_empty(&en->list)) {
182 IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
183 "called from %p\n", __builtin_return_address(0));
184 return 0;
185 }
186 154
187 /*
188 * Hash by destination IP address
189 */
190 hash = ip_vs_lblc_hashkey(en->addr);
191
192 write_lock(&tbl->lock);
193 list_add(&en->list, &tbl->bucket[hash]); 155 list_add(&en->list, &tbl->bucket[hash]);
194 atomic_inc(&tbl->entries); 156 atomic_inc(&tbl->entries);
195 write_unlock(&tbl->lock);
196
197 return 1;
198} 157}
199 158
200 159
201/* 160/*
202 * Get ip_vs_lblc_entry associated with supplied parameters. 161 * Get ip_vs_lblc_entry associated with supplied parameters. Called under read
162 * lock
203 */ 163 */
204static inline struct ip_vs_lblc_entry * 164static inline struct ip_vs_lblc_entry *
205ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) 165ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
206{ 166{
207 unsigned hash; 167 unsigned hash = ip_vs_lblc_hashkey(addr);
208 struct ip_vs_lblc_entry *en; 168 struct ip_vs_lblc_entry *en;
209 169
210 hash = ip_vs_lblc_hashkey(addr); 170 list_for_each_entry(en, &tbl->bucket[hash], list)
171 if (en->addr == addr)
172 return en;
211 173
212 read_lock(&tbl->lock); 174 return NULL;
175}
213 176
214 list_for_each_entry(en, &tbl->bucket[hash], list) { 177
215 if (en->addr == addr) { 178/*
216 /* HIT */ 179 * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
217 read_unlock(&tbl->lock); 180 * address to a server. Called under write lock.
218 return en; 181 */
182static inline struct ip_vs_lblc_entry *
183ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr,
184 struct ip_vs_dest *dest)
185{
186 struct ip_vs_lblc_entry *en;
187
188 en = ip_vs_lblc_get(tbl, daddr);
189 if (!en) {
190 en = kmalloc(sizeof(*en), GFP_ATOMIC);
191 if (!en) {
192 IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
193 return NULL;
219 } 194 }
220 }
221 195
222 read_unlock(&tbl->lock); 196 en->addr = daddr;
197 en->lastuse = jiffies;
223 198
224 return NULL; 199 atomic_inc(&dest->refcnt);
200 en->dest = dest;
201
202 ip_vs_lblc_hash(tbl, en);
203 } else if (en->dest != dest) {
204 atomic_dec(&en->dest->refcnt);
205 atomic_inc(&dest->refcnt);
206 en->dest = dest;
207 }
208
209 return en;
225} 210}
226 211
227 212
@@ -230,30 +215,29 @@ ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
230 */ 215 */
231static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) 216static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
232{ 217{
233 int i;
234 struct ip_vs_lblc_entry *en, *nxt; 218 struct ip_vs_lblc_entry *en, *nxt;
219 int i;
235 220
236 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { 221 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
237 write_lock(&tbl->lock);
238 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { 222 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
239 ip_vs_lblc_free(en); 223 ip_vs_lblc_free(en);
240 atomic_dec(&tbl->entries); 224 atomic_dec(&tbl->entries);
241 } 225 }
242 write_unlock(&tbl->lock);
243 } 226 }
244} 227}
245 228
246 229
247static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) 230static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
248{ 231{
232 struct ip_vs_lblc_table *tbl = svc->sched_data;
233 struct ip_vs_lblc_entry *en, *nxt;
249 unsigned long now = jiffies; 234 unsigned long now = jiffies;
250 int i, j; 235 int i, j;
251 struct ip_vs_lblc_entry *en, *nxt;
252 236
253 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { 237 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
254 j = (j + 1) & IP_VS_LBLC_TAB_MASK; 238 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
255 239
256 write_lock(&tbl->lock); 240 write_lock(&svc->sched_lock);
257 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { 241 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
258 if (time_before(now, 242 if (time_before(now,
259 en->lastuse + sysctl_ip_vs_lblc_expiration)) 243 en->lastuse + sysctl_ip_vs_lblc_expiration))
@@ -262,7 +246,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
262 ip_vs_lblc_free(en); 246 ip_vs_lblc_free(en);
263 atomic_dec(&tbl->entries); 247 atomic_dec(&tbl->entries);
264 } 248 }
265 write_unlock(&tbl->lock); 249 write_unlock(&svc->sched_lock);
266 } 250 }
267 tbl->rover = j; 251 tbl->rover = j;
268} 252}
@@ -281,17 +265,16 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
281 */ 265 */
282static void ip_vs_lblc_check_expire(unsigned long data) 266static void ip_vs_lblc_check_expire(unsigned long data)
283{ 267{
284 struct ip_vs_lblc_table *tbl; 268 struct ip_vs_service *svc = (struct ip_vs_service *) data;
269 struct ip_vs_lblc_table *tbl = svc->sched_data;
285 unsigned long now = jiffies; 270 unsigned long now = jiffies;
286 int goal; 271 int goal;
287 int i, j; 272 int i, j;
288 struct ip_vs_lblc_entry *en, *nxt; 273 struct ip_vs_lblc_entry *en, *nxt;
289 274
290 tbl = (struct ip_vs_lblc_table *)data;
291
292 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { 275 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
293 /* do full expiration check */ 276 /* do full expiration check */
294 ip_vs_lblc_full_check(tbl); 277 ip_vs_lblc_full_check(svc);
295 tbl->counter = 1; 278 tbl->counter = 1;
296 goto out; 279 goto out;
297 } 280 }
@@ -308,7 +291,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)
308 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { 291 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
309 j = (j + 1) & IP_VS_LBLC_TAB_MASK; 292 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
310 293
311 write_lock(&tbl->lock); 294 write_lock(&svc->sched_lock);
312 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { 295 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
313 if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) 296 if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
314 continue; 297 continue;
@@ -317,7 +300,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)
317 atomic_dec(&tbl->entries); 300 atomic_dec(&tbl->entries);
318 goal--; 301 goal--;
319 } 302 }
320 write_unlock(&tbl->lock); 303 write_unlock(&svc->sched_lock);
321 if (goal <= 0) 304 if (goal <= 0)
322 break; 305 break;
323 } 306 }
@@ -336,15 +319,14 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
336 /* 319 /*
337 * Allocate the ip_vs_lblc_table for this service 320 * Allocate the ip_vs_lblc_table for this service
338 */ 321 */
339 tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC); 322 tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
340 if (tbl == NULL) { 323 if (tbl == NULL) {
341 IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); 324 IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
342 return -ENOMEM; 325 return -ENOMEM;
343 } 326 }
344 svc->sched_data = tbl; 327 svc->sched_data = tbl;
345 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " 328 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
346 "current service\n", 329 "current service\n", sizeof(*tbl));
347 sizeof(struct ip_vs_lblc_table));
348 330
349 /* 331 /*
350 * Initialize the hash buckets 332 * Initialize the hash buckets
@@ -352,7 +334,6 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
352 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { 334 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
353 INIT_LIST_HEAD(&tbl->bucket[i]); 335 INIT_LIST_HEAD(&tbl->bucket[i]);
354 } 336 }
355 rwlock_init(&tbl->lock);
356 tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; 337 tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
357 tbl->rover = 0; 338 tbl->rover = 0;
358 tbl->counter = 1; 339 tbl->counter = 1;
@@ -361,9 +342,8 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
361 * Hook periodic timer for garbage collection 342 * Hook periodic timer for garbage collection
362 */ 343 */
363 setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, 344 setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
364 (unsigned long)tbl); 345 (unsigned long)svc);
365 tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; 346 mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
366 add_timer(&tbl->periodic_timer);
367 347
368 return 0; 348 return 0;
369} 349}
@@ -380,9 +360,9 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
380 ip_vs_lblc_flush(tbl); 360 ip_vs_lblc_flush(tbl);
381 361
382 /* release the table itself */ 362 /* release the table itself */
383 kfree(svc->sched_data); 363 kfree(tbl);
384 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", 364 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
385 sizeof(struct ip_vs_lblc_table)); 365 sizeof(*tbl));
386 366
387 return 0; 367 return 0;
388} 368}
@@ -478,46 +458,54 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
478static struct ip_vs_dest * 458static struct ip_vs_dest *
479ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) 459ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
480{ 460{
481 struct ip_vs_dest *dest; 461 struct ip_vs_lblc_table *tbl = svc->sched_data;
482 struct ip_vs_lblc_table *tbl;
483 struct ip_vs_lblc_entry *en;
484 struct iphdr *iph = ip_hdr(skb); 462 struct iphdr *iph = ip_hdr(skb);
463 struct ip_vs_dest *dest = NULL;
464 struct ip_vs_lblc_entry *en;
485 465
486 IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); 466 IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
487 467
488 tbl = (struct ip_vs_lblc_table *)svc->sched_data; 468 /* First look in our cache */
469 read_lock(&svc->sched_lock);
489 en = ip_vs_lblc_get(tbl, iph->daddr); 470 en = ip_vs_lblc_get(tbl, iph->daddr);
490 if (en == NULL) { 471 if (en) {
491 dest = __ip_vs_lblc_schedule(svc, iph); 472 /* We only hold a read lock, but this is atomic */
492 if (dest == NULL) { 473 en->lastuse = jiffies;
493 IP_VS_DBG(1, "no destination available\n"); 474
494 return NULL; 475 /*
495 } 476 * If the destination is not available, i.e. it's in the trash,
496 en = ip_vs_lblc_new(iph->daddr, dest); 477 * we must ignore it, as it may be removed from under our feet,
497 if (en == NULL) { 478 * if someone drops our reference count. Our caller only makes
498 return NULL; 479 * sure that destinations, that are not in the trash, are not
499 } 480 * moved to the trash, while we are scheduling. But anyone can
500 ip_vs_lblc_hash(tbl, en); 481 * free up entries from the trash at any time.
501 } else { 482 */
502 dest = en->dest; 483
503 if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) 484 if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
504 || atomic_read(&dest->weight) <= 0 485 dest = en->dest;
505 || is_overloaded(dest, svc)) { 486 }
506 dest = __ip_vs_lblc_schedule(svc, iph); 487 read_unlock(&svc->sched_lock);
507 if (dest == NULL) { 488
508 IP_VS_DBG(1, "no destination available\n"); 489 /* If the destination has a weight and is not overloaded, use it */
509 return NULL; 490 if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
510 } 491 goto out;
511 atomic_dec(&en->dest->refcnt); 492
512 atomic_inc(&dest->refcnt); 493 /* No cache entry or it is invalid, time to schedule */
513 en->dest = dest; 494 dest = __ip_vs_lblc_schedule(svc, iph);
514 } 495 if (!dest) {
496 IP_VS_DBG(1, "no destination available\n");
497 return NULL;
515 } 498 }
516 en->lastuse = jiffies;
517 499
500 /* If we fail to create a cache entry, we'll just use the valid dest */
501 write_lock(&svc->sched_lock);
502 ip_vs_lblc_new(tbl, iph->daddr, dest);
503 write_unlock(&svc->sched_lock);
504
505out:
518 IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " 506 IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
519 "--> server %u.%u.%u.%u:%d\n", 507 "--> server %u.%u.%u.%u:%d\n",
520 NIPQUAD(en->addr), 508 NIPQUAD(iph->daddr),
521 NIPQUAD(dest->addr), 509 NIPQUAD(dest->addr),
522 ntohs(dest->port)); 510 ntohs(dest->port));
523 511