diff options
author | Eric Dumazet <dada1@cosmosbay.com> | 2007-09-12 08:29:01 -0400 |
---|---|---|
committer | David S. Miller <davem@sunset.davemloft.net> | 2007-10-10 19:49:15 -0400 |
commit | 86bba269d08f0c545ae76c90b56727f65d62d57f (patch) | |
tree | 05b5edc0df7f0d0e4e04822cac41deba28f91e0a /net | |
parent | 3c12afe75f61d9402797d63941367962ca36fcc9 (diff) |
[PATCH] NET : convert IP route cache garbage collection from softirq processing to a workqueue
When the periodic IP route cache flush is done (every 600 seconds in the
default configuration), some hosts suffer a lot and eventually trigger
the "soft lockup" message.
dst_run_gc() is doing a scan of a possibly huge list of dst_entries,
eventually freeing some (less than 1%) of them, while holding the
dst_lock spinlock for the whole scan.
Then it rearms a timer to redo the full thing 1/10 s later...
The slowdown can last one minute or so, depending on how active
the TCP sessions are.
This second version of the patch converts the processing from a softirq
based one to a workqueue.
Even if the list of entries in garbage_list is huge, the host remains
responsive to softirqs and can make progress.
Instead of resetting gc timer to 0.1 second if one entry was freed in a
gc run, we do this if more than 10% of entries were freed.
Before patch :
Aug 16 06:21:37 SRV1 kernel: BUG: soft lockup detected on CPU#0!
Aug 16 06:21:37 SRV1 kernel:
Aug 16 06:21:37 SRV1 kernel: Call Trace:
Aug 16 06:21:37 SRV1 kernel: <IRQ> [<ffffffff802286f0>] wake_up_process+0x10/0x20
Aug 16 06:21:37 SRV1 kernel: [<ffffffff80251e09>] softlockup_tick+0xe9/0x110
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803cd380>] dst_run_gc+0x0/0x140
Aug 16 06:21:37 SRV1 kernel: [<ffffffff802376f3>] run_local_timers+0x13/0x20
Aug 16 06:21:37 SRV1 kernel: [<ffffffff802379c7>] update_process_times+0x57/0x90
Aug 16 06:21:37 SRV1 kernel: [<ffffffff80216034>] smp_local_timer_interrupt+0x34/0x60
Aug 16 06:21:37 SRV1 kernel: [<ffffffff802165cc>] smp_apic_timer_interrupt+0x5c/0x80
Aug 16 06:21:37 SRV1 kernel: [<ffffffff8020a816>] apic_timer_interrupt+0x66/0x70
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803cd3d3>] dst_run_gc+0x53/0x140
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803cd3c6>] dst_run_gc+0x46/0x140
Aug 16 06:21:37 SRV1 kernel: [<ffffffff80237148>] run_timer_softirq+0x148/0x1c0
Aug 16 06:21:37 SRV1 kernel: [<ffffffff8023340c>] __do_softirq+0x6c/0xe0
Aug 16 06:21:37 SRV1 kernel: [<ffffffff8020ad6c>] call_softirq+0x1c/0x30
Aug 16 06:21:37 SRV1 kernel: <EOI> [<ffffffff8020cb34>] do_softirq+0x34/0x90
Aug 16 06:21:37 SRV1 kernel: [<ffffffff802331cf>] local_bh_enable_ip+0x3f/0x60
Aug 16 06:21:37 SRV1 kernel: [<ffffffff80422913>] _spin_unlock_bh+0x13/0x20
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803dfde8>] rt_garbage_collect+0x1d8/0x320
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803cd4dd>] dst_alloc+0x1d/0xa0
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803e1433>] __ip_route_output_key+0x573/0x800
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803c02e2>] sock_common_recvmsg+0x32/0x50
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803e16dc>] ip_route_output_flow+0x1c/0x60
Aug 16 06:21:37 SRV1 kernel: [<ffffffff80400160>] tcp_v4_connect+0x150/0x610
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803ebf07>] inet_bind_bucket_create+0x17/0x60
Aug 16 06:21:37 SRV1 kernel: [<ffffffff8040cd16>] inet_stream_connect+0xa6/0x2c0
Aug 16 06:21:37 SRV1 kernel: [<ffffffff80422981>] _spin_lock_bh+0x11/0x30
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803c0bdf>] lock_sock_nested+0xcf/0xe0
Aug 16 06:21:37 SRV1 kernel: [<ffffffff80422981>] _spin_lock_bh+0x11/0x30
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803be551>] sys_connect+0x71/0xa0
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803eee3f>] tcp_setsockopt+0x1f/0x30
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803c030f>] sock_common_setsockopt+0xf/0x20
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803be4bd>] sys_setsockopt+0x9d/0xc0
Aug 16 06:21:37 SRV1 kernel: [<ffffffff8028881e>] sys_ioctl+0x5e/0x80
Aug 16 06:21:37 SRV1 kernel: [<ffffffff80209c4e>] system_call+0x7e/0x83
After patch : (RT_CACHE_DEBUG set to 2 to get following traces)
dst_total: 75469 delayed: 74109 work_perf: 141 expires: 150 elapsed: 8092 us
dst_total: 78725 delayed: 73366 work_perf: 743 expires: 400 elapsed: 8542 us
dst_total: 86126 delayed: 71844 work_perf: 1522 expires: 775 elapsed: 8849 us
dst_total: 100173 delayed: 68791 work_perf: 3053 expires: 1256 elapsed: 9748 us
dst_total: 121798 delayed: 64711 work_perf: 4080 expires: 1997 elapsed: 10146 us
dst_total: 154522 delayed: 58316 work_perf: 6395 expires: 25 elapsed: 11402 us
dst_total: 154957 delayed: 58252 work_perf: 64 expires: 150 elapsed: 6148 us
dst_total: 157377 delayed: 57843 work_perf: 409 expires: 400 elapsed: 6350 us
dst_total: 163745 delayed: 56679 work_perf: 1164 expires: 775 elapsed: 7051 us
dst_total: 176577 delayed: 53965 work_perf: 2714 expires: 1389 elapsed: 8120 us
dst_total: 198993 delayed: 49627 work_perf: 4338 expires: 1997 elapsed: 8909 us
dst_total: 226638 delayed: 46865 work_perf: 2762 expires: 2748 elapsed: 7351 us
I successfully reduced the IP route cache of many hosts by a factor of four
thanks to this patch. Previously, I had to disable "ip route flush cache"
to avoid crashes.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r-- | net/core/dst.c | 189 |
1 files changed, 122 insertions, 67 deletions
diff --git a/net/core/dst.c b/net/core/dst.c index 32267a16e01e..38c741ac5d08 100644 --- a/net/core/dst.c +++ b/net/core/dst.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/errno.h> | 9 | #include <linux/errno.h> |
10 | #include <linux/init.h> | 10 | #include <linux/init.h> |
11 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
12 | #include <linux/workqueue.h> | ||
12 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
13 | #include <linux/module.h> | 14 | #include <linux/module.h> |
14 | #include <linux/netdevice.h> | 15 | #include <linux/netdevice.h> |
@@ -19,50 +20,72 @@ | |||
19 | 20 | ||
20 | #include <net/dst.h> | 21 | #include <net/dst.h> |
21 | 22 | ||
22 | /* Locking strategy: | 23 | /* |
23 | * 1) Garbage collection state of dead destination cache | 24 | * Theory of operations: |
24 | * entries is protected by dst_lock. | 25 | * 1) We use a list, protected by a spinlock, to add |
25 | * 2) GC is run only from BH context, and is the only remover | 26 | * new entries from both BH and non-BH context. |
26 | * of entries. | 27 | * 2) In order to keep spinlock held for a small delay, |
27 | * 3) Entries are added to the garbage list from both BH | 28 | * we use a second list where are stored long lived |
28 | * and non-BH context, so local BH disabling is needed. | 29 | * entries, that are handled by the garbage collect thread |
29 | * 4) All operations modify state, so a spinlock is used. | 30 | * fired by a workqueue. |
31 | * 3) This list is guarded by a mutex, | ||
32 | * so that the gc_task and dst_dev_event() can be synchronized. | ||
30 | */ | 33 | */ |
31 | static struct dst_entry *dst_garbage_list; | ||
32 | #if RT_CACHE_DEBUG >= 2 | 34 | #if RT_CACHE_DEBUG >= 2 |
33 | static atomic_t dst_total = ATOMIC_INIT(0); | 35 | static atomic_t dst_total = ATOMIC_INIT(0); |
34 | #endif | 36 | #endif |
35 | static DEFINE_SPINLOCK(dst_lock); | ||
36 | 37 | ||
37 | static unsigned long dst_gc_timer_expires; | 38 | /* |
38 | static unsigned long dst_gc_timer_inc = DST_GC_MAX; | 39 | * We want to keep lock & list close together |
39 | static void dst_run_gc(unsigned long); | 40 | * to dirty as few cache lines as possible in __dst_free(). |
41 | * As this is not a very strong hint, we dont force an alignment on SMP. | ||
42 | */ | ||
43 | static struct { | ||
44 | spinlock_t lock; | ||
45 | struct dst_entry *list; | ||
46 | unsigned long timer_inc; | ||
47 | unsigned long timer_expires; | ||
48 | } dst_garbage = { | ||
49 | .lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock), | ||
50 | .timer_inc = DST_GC_MAX, | ||
51 | }; | ||
52 | static void dst_gc_task(struct work_struct *work); | ||
40 | static void ___dst_free(struct dst_entry * dst); | 53 | static void ___dst_free(struct dst_entry * dst); |
41 | 54 | ||
42 | static DEFINE_TIMER(dst_gc_timer, dst_run_gc, DST_GC_MIN, 0); | 55 | static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task); |
43 | 56 | ||
44 | static void dst_run_gc(unsigned long dummy) | 57 | static DEFINE_MUTEX(dst_gc_mutex); |
58 | /* | ||
59 | * long lived entries are maintained in this list, guarded by dst_gc_mutex | ||
60 | */ | ||
61 | static struct dst_entry *dst_busy_list; | ||
62 | |||
63 | static void dst_gc_task(struct work_struct *work) | ||
45 | { | 64 | { |
46 | int delayed = 0; | 65 | int delayed = 0; |
47 | int work_performed; | 66 | int work_performed = 0; |
48 | struct dst_entry * dst, **dstp; | 67 | unsigned long expires = ~0L; |
68 | struct dst_entry *dst, *next, head; | ||
69 | struct dst_entry *last = &head; | ||
70 | #if RT_CACHE_DEBUG >= 2 | ||
71 | ktime_t time_start = ktime_get(); | ||
72 | struct timespec elapsed; | ||
73 | #endif | ||
49 | 74 | ||
50 | if (!spin_trylock(&dst_lock)) { | 75 | mutex_lock(&dst_gc_mutex); |
51 | mod_timer(&dst_gc_timer, jiffies + HZ/10); | 76 | next = dst_busy_list; |
52 | return; | ||
53 | } | ||
54 | 77 | ||
55 | del_timer(&dst_gc_timer); | 78 | loop: |
56 | dstp = &dst_garbage_list; | 79 | while ((dst = next) != NULL) { |
57 | work_performed = 0; | 80 | next = dst->next; |
58 | while ((dst = *dstp) != NULL) { | 81 | prefetch(&next->next); |
59 | if (atomic_read(&dst->__refcnt)) { | 82 | if (likely(atomic_read(&dst->__refcnt))) { |
60 | dstp = &dst->next; | 83 | last->next = dst; |
84 | last = dst; | ||
61 | delayed++; | 85 | delayed++; |
62 | continue; | 86 | continue; |
63 | } | 87 | } |
64 | *dstp = dst->next; | 88 | work_performed++; |
65 | work_performed = 1; | ||
66 | 89 | ||
67 | dst = dst_destroy(dst); | 90 | dst = dst_destroy(dst); |
68 | if (dst) { | 91 | if (dst) { |
@@ -78,38 +101,56 @@ static void dst_run_gc(unsigned long dummy) | |||
78 | continue; | 101 | continue; |
79 | 102 | ||
80 | ___dst_free(dst); | 103 | ___dst_free(dst); |
81 | dst->next = *dstp; | 104 | dst->next = next; |
82 | *dstp = dst; | 105 | next = dst; |
83 | dstp = &dst->next; | ||
84 | } | 106 | } |
85 | } | 107 | } |
86 | if (!dst_garbage_list) { | 108 | |
87 | dst_gc_timer_inc = DST_GC_MAX; | 109 | spin_lock_bh(&dst_garbage.lock); |
88 | goto out; | 110 | next = dst_garbage.list; |
111 | if (next) { | ||
112 | dst_garbage.list = NULL; | ||
113 | spin_unlock_bh(&dst_garbage.lock); | ||
114 | goto loop; | ||
89 | } | 115 | } |
90 | if (!work_performed) { | 116 | last->next = NULL; |
91 | if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX) | 117 | dst_busy_list = head.next; |
92 | dst_gc_timer_expires = DST_GC_MAX; | 118 | if (!dst_busy_list) |
93 | dst_gc_timer_inc += DST_GC_INC; | 119 | dst_garbage.timer_inc = DST_GC_MAX; |
94 | } else { | 120 | else { |
95 | dst_gc_timer_inc = DST_GC_INC; | 121 | /* |
96 | dst_gc_timer_expires = DST_GC_MIN; | 122 | * if we freed less than 1/10 of delayed entries, |
123 | * we can sleep longer. | ||
124 | */ | ||
125 | if (work_performed <= delayed/10) { | ||
126 | dst_garbage.timer_expires += dst_garbage.timer_inc; | ||
127 | if (dst_garbage.timer_expires > DST_GC_MAX) | ||
128 | dst_garbage.timer_expires = DST_GC_MAX; | ||
129 | dst_garbage.timer_inc += DST_GC_INC; | ||
130 | } else { | ||
131 | dst_garbage.timer_inc = DST_GC_INC; | ||
132 | dst_garbage.timer_expires = DST_GC_MIN; | ||
133 | } | ||
134 | expires = dst_garbage.timer_expires; | ||
135 | /* | ||
136 | * if the next desired timer is more than 4 seconds in the future | ||
137 | * then round the timer to whole seconds | ||
138 | */ | ||
139 | if (expires > 4*HZ) | ||
140 | expires = round_jiffies_relative(expires); | ||
141 | schedule_delayed_work(&dst_gc_work, expires); | ||
97 | } | 142 | } |
143 | |||
144 | spin_unlock_bh(&dst_garbage.lock); | ||
145 | mutex_unlock(&dst_gc_mutex); | ||
98 | #if RT_CACHE_DEBUG >= 2 | 146 | #if RT_CACHE_DEBUG >= 2 |
99 | printk("dst_total: %d/%d %ld\n", | 147 | elapsed = ktime_to_timespec(ktime_sub(ktime_get(), time_start)); |
100 | atomic_read(&dst_total), delayed, dst_gc_timer_expires); | 148 | printk(KERN_DEBUG "dst_total: %d delayed: %d work_perf: %d" |
149 | " expires: %lu elapsed: %lu us\n", | ||
150 | atomic_read(&dst_total), delayed, work_performed, | ||
151 | expires, | ||
152 | elapsed.tv_sec * USEC_PER_SEC + elapsed.tv_nsec / NSEC_PER_USEC); | ||
101 | #endif | 153 | #endif |
102 | /* if the next desired timer is more than 4 seconds in the future | ||
103 | * then round the timer to whole seconds | ||
104 | */ | ||
105 | if (dst_gc_timer_expires > 4*HZ) | ||
106 | mod_timer(&dst_gc_timer, | ||
107 | round_jiffies(jiffies + dst_gc_timer_expires)); | ||
108 | else | ||
109 | mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires); | ||
110 | |||
111 | out: | ||
112 | spin_unlock(&dst_lock); | ||
113 | } | 154 | } |
114 | 155 | ||
115 | static int dst_discard(struct sk_buff *skb) | 156 | static int dst_discard(struct sk_buff *skb) |
@@ -154,16 +195,16 @@ static void ___dst_free(struct dst_entry * dst) | |||
154 | 195 | ||
155 | void __dst_free(struct dst_entry * dst) | 196 | void __dst_free(struct dst_entry * dst) |
156 | { | 197 | { |
157 | spin_lock_bh(&dst_lock); | 198 | spin_lock_bh(&dst_garbage.lock); |
158 | ___dst_free(dst); | 199 | ___dst_free(dst); |
159 | dst->next = dst_garbage_list; | 200 | dst->next = dst_garbage.list; |
160 | dst_garbage_list = dst; | 201 | dst_garbage.list = dst; |
161 | if (dst_gc_timer_inc > DST_GC_INC) { | 202 | if (dst_garbage.timer_inc > DST_GC_INC) { |
162 | dst_gc_timer_inc = DST_GC_INC; | 203 | dst_garbage.timer_inc = DST_GC_INC; |
163 | dst_gc_timer_expires = DST_GC_MIN; | 204 | dst_garbage.timer_expires = DST_GC_MIN; |
164 | mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires); | 205 | schedule_delayed_work(&dst_gc_work, dst_garbage.timer_expires); |
165 | } | 206 | } |
166 | spin_unlock_bh(&dst_lock); | 207 | spin_unlock_bh(&dst_garbage.lock); |
167 | } | 208 | } |
168 | 209 | ||
169 | struct dst_entry *dst_destroy(struct dst_entry * dst) | 210 | struct dst_entry *dst_destroy(struct dst_entry * dst) |
@@ -251,7 +292,7 @@ static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev, | |||
251 | static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr) | 292 | static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr) |
252 | { | 293 | { |
253 | struct net_device *dev = ptr; | 294 | struct net_device *dev = ptr; |
254 | struct dst_entry *dst; | 295 | struct dst_entry *dst, *last = NULL; |
255 | 296 | ||
256 | if (dev->nd_net != &init_net) | 297 | if (dev->nd_net != &init_net) |
257 | return NOTIFY_DONE; | 298 | return NOTIFY_DONE; |
@@ -259,11 +300,25 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event, void | |||
259 | switch (event) { | 300 | switch (event) { |
260 | case NETDEV_UNREGISTER: | 301 | case NETDEV_UNREGISTER: |
261 | case NETDEV_DOWN: | 302 | case NETDEV_DOWN: |
262 | spin_lock_bh(&dst_lock); | 303 | mutex_lock(&dst_gc_mutex); |
263 | for (dst = dst_garbage_list; dst; dst = dst->next) { | 304 | for (dst = dst_busy_list; dst; dst = dst->next) { |
305 | last = dst; | ||
306 | dst_ifdown(dst, dev, event != NETDEV_DOWN); | ||
307 | } | ||
308 | |||
309 | spin_lock_bh(&dst_garbage.lock); | ||
310 | dst = dst_garbage.list; | ||
311 | dst_garbage.list = NULL; | ||
312 | spin_unlock_bh(&dst_garbage.lock); | ||
313 | |||
314 | if (last) | ||
315 | last->next = dst; | ||
316 | else | ||
317 | dst_busy_list = dst; | ||
318 | for (; dst; dst = dst->next) { | ||
264 | dst_ifdown(dst, dev, event != NETDEV_DOWN); | 319 | dst_ifdown(dst, dev, event != NETDEV_DOWN); |
265 | } | 320 | } |
266 | spin_unlock_bh(&dst_lock); | 321 | mutex_unlock(&dst_gc_mutex); |
267 | break; | 322 | break; |
268 | } | 323 | } |
269 | return NOTIFY_DONE; | 324 | return NOTIFY_DONE; |