author    Eric Dumazet <dada1@cosmosbay.com>           2007-09-15 13:55:54 -0400
committer David S. Miller <davem@sunset.davemloft.net> 2007-10-10 19:49:25 -0400
commit    39c90ece7565f5c47110c2fa77409d7a9478bd5b
tree      220bf734ed470024901226675550501d45192f0e
parent    dac24ab396fc92985060d5cb3c467d2d0ffc0c20
[IPV4]: Convert rt_check_expire() from softirq processing to workqueue.
On loaded/big hosts, rt_check_expire() is of little use, because it
generally breaks out of its main loop as soon as jiffies changes. It can
take a long time (read: many timer invocations) to actually scan the
whole hash table, freeing unused entries.

Converting it from a softirq timer to a workqueue is a nice move because
we can let rt_check_expire() do the full scan it is supposed to do,
without hogging the CPU.

This has an impact on the average number of entries in the cache,
reducing RAM usage. The cache is also more responsive to parameter
changes (/proc/sys/net/ipv4/route/gc_timeout and
/proc/sys/net/ipv4/route/gc_interval).

Note: maybe the default value of gc_interval (60 seconds) is too high,
since it means we actually need 5 (300/60) invocations to scan the whole
table.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
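For readers unfamiliar with the conversion, here is a minimal sketch of
the timer-to-delayed-work pattern this patch applies. The names my_scan
and my_scan_work, and the 60-second period, are hypothetical examples,
not part of the patch:

	#include <linux/workqueue.h>

	static void my_scan(struct work_struct *work);
	static DECLARE_DELAYED_WORK(my_scan_work, my_scan);

	static void my_scan(struct work_struct *work)
	{
		/*
		 * Runs in process context, not softirq: a long scan no
		 * longer needs a jiffies-based loop breaker, but BH must
		 * now be disabled explicitly (spin_lock_bh) around data
		 * shared with softirq paths.
		 */

		/* ... scan the table, free expired entries ... */

		/* Re-arm: the work item reschedules itself. */
		schedule_delayed_work(&my_scan_work, 60 * HZ);
	}

The first run is typically scheduled at init time with a random
perturbation of the delay, as ip_rt_init() does below for expires_work.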
Diffstat (limited to 'net/ipv4/route.c')
 net/ipv4/route.c | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 396c631166a4..006d6058a806 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -81,6 +81,7 @@
 #include <linux/netdevice.h>
 #include <linux/proc_fs.h>
 #include <linux/init.h>
+#include <linux/workqueue.h>
 #include <linux/skbuff.h>
 #include <linux/inetdevice.h>
 #include <linux/igmp.h>
@@ -136,7 +137,8 @@ static unsigned long rt_deadline;
 #define RTprint(a...)	printk(KERN_DEBUG a)
 
 static struct timer_list rt_flush_timer;
-static struct timer_list rt_periodic_timer;
+static void rt_check_expire(struct work_struct *work);
+static DECLARE_DELAYED_WORK(expires_work, rt_check_expire);
 static struct timer_list rt_secret_timer;
 
 /*
@@ -572,20 +574,19 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 		(fl1->iif ^ fl2->iif)) == 0;
 }
 
-/* This runs via a timer and thus is always in BH context. */
-static void rt_check_expire(unsigned long dummy)
+static void rt_check_expire(struct work_struct *work)
 {
 	static unsigned int rover;
 	unsigned int i = rover, goal;
 	struct rtable *rth, **rthp;
-	unsigned long now = jiffies;
 	u64 mult;
 
 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
 	if (ip_rt_gc_timeout > 1)
 		do_div(mult, ip_rt_gc_timeout);
 	goal = (unsigned int)mult;
-	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
+	if (goal > rt_hash_mask)
+		goal = rt_hash_mask + 1;
 	for (; goal > 0; goal--) {
 		unsigned long tmo = ip_rt_gc_timeout;
 
@@ -594,11 +595,11 @@ static void rt_check_expire(unsigned long dummy)
 
 		if (*rthp == 0)
 			continue;
-		spin_lock(rt_hash_lock_addr(i));
+		spin_lock_bh(rt_hash_lock_addr(i));
 		while ((rth = *rthp) != NULL) {
 			if (rth->u.dst.expires) {
 				/* Entry is expired even if it is in use */
-				if (time_before_eq(now, rth->u.dst.expires)) {
+				if (time_before_eq(jiffies, rth->u.dst.expires)) {
 					tmo >>= 1;
 					rthp = &rth->u.dst.rt_next;
 					continue;
@@ -613,14 +614,10 @@ static void rt_check_expire(unsigned long dummy)
 			*rthp = rth->u.dst.rt_next;
 			rt_free(rth);
 		}
-		spin_unlock(rt_hash_lock_addr(i));
-
-		/* Fallback loop breaker. */
-		if (time_after(jiffies, now))
-			break;
+		spin_unlock_bh(rt_hash_lock_addr(i));
 	}
 	rover = i;
-	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
+	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 }
 
 /* This can run from both BH and non-BH contexts, the latter
@@ -2993,17 +2990,14 @@ int __init ip_rt_init(void)
 
 	init_timer(&rt_flush_timer);
 	rt_flush_timer.function = rt_run_flush;
-	init_timer(&rt_periodic_timer);
-	rt_periodic_timer.function = rt_check_expire;
 	init_timer(&rt_secret_timer);
 	rt_secret_timer.function = rt_secret_rebuild;
 
 	/* All the timers, started at system startup tend
 	   to synchronize. Perturb it a bit.
 	 */
-	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
-					ip_rt_gc_interval;
-	add_timer(&rt_periodic_timer);
+	schedule_delayed_work(&expires_work,
+			net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
 
 	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
 		ip_rt_secret_interval;