diff options
-rw-r--r-- | net/core/dst.c | 189 |
1 files changed, 122 insertions, 67 deletions
diff --git a/net/core/dst.c b/net/core/dst.c index 32267a16e01..38c741ac5d0 100644 --- a/net/core/dst.c +++ b/net/core/dst.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/errno.h> | 9 | #include <linux/errno.h> |
10 | #include <linux/init.h> | 10 | #include <linux/init.h> |
11 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
12 | #include <linux/workqueue.h> | ||
12 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
13 | #include <linux/module.h> | 14 | #include <linux/module.h> |
14 | #include <linux/netdevice.h> | 15 | #include <linux/netdevice.h> |
@@ -19,50 +20,72 @@ | |||
19 | 20 | ||
20 | #include <net/dst.h> | 21 | #include <net/dst.h> |
21 | 22 | ||
22 | /* Locking strategy: | 23 | /* |
23 | * 1) Garbage collection state of dead destination cache | 24 | * Theory of operations: |
24 | * entries is protected by dst_lock. | 25 | * 1) We use a list, protected by a spinlock, to add |
25 | * 2) GC is run only from BH context, and is the only remover | 26 | * new entries from both BH and non-BH context. |
26 | * of entries. | 27 | * 2) In order to keep spinlock held for a small delay, |
27 | * 3) Entries are added to the garbage list from both BH | 28 | * we use a second list where are stored long lived |
28 | * and non-BH context, so local BH disabling is needed. | 29 | * entries, that are handled by the garbage collect thread |
29 | * 4) All operations modify state, so a spinlock is used. | 30 | * fired by a workqueue. |
31 | * 3) This list is guarded by a mutex, | ||
32 | * so that the gc_task and dst_dev_event() can be synchronized. | ||
30 | */ | 33 | */ |
31 | static struct dst_entry *dst_garbage_list; | ||
32 | #if RT_CACHE_DEBUG >= 2 | 34 | #if RT_CACHE_DEBUG >= 2 |
33 | static atomic_t dst_total = ATOMIC_INIT(0); | 35 | static atomic_t dst_total = ATOMIC_INIT(0); |
34 | #endif | 36 | #endif |
35 | static DEFINE_SPINLOCK(dst_lock); | ||
36 | 37 | ||
37 | static unsigned long dst_gc_timer_expires; | 38 | /* |
38 | static unsigned long dst_gc_timer_inc = DST_GC_MAX; | 39 | * We want to keep lock & list close together |
39 | static void dst_run_gc(unsigned long); | 40 | * to dirty as few cache lines as possible in __dst_free(). |
41 | * As this is not a very strong hint, we dont force an alignment on SMP. | ||
42 | */ | ||
43 | static struct { | ||
44 | spinlock_t lock; | ||
45 | struct dst_entry *list; | ||
46 | unsigned long timer_inc; | ||
47 | unsigned long timer_expires; | ||
48 | } dst_garbage = { | ||
49 | .lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock), | ||
50 | .timer_inc = DST_GC_MAX, | ||
51 | }; | ||
52 | static void dst_gc_task(struct work_struct *work); | ||
40 | static void ___dst_free(struct dst_entry * dst); | 53 | static void ___dst_free(struct dst_entry * dst); |
41 | 54 | ||
42 | static DEFINE_TIMER(dst_gc_timer, dst_run_gc, DST_GC_MIN, 0); | 55 | static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task); |
43 | 56 | ||
44 | static void dst_run_gc(unsigned long dummy) | 57 | static DEFINE_MUTEX(dst_gc_mutex); |
58 | /* | ||
59 | * long lived entries are maintained in this list, guarded by dst_gc_mutex | ||
60 | */ | ||
61 | static struct dst_entry *dst_busy_list; | ||
62 | |||
63 | static void dst_gc_task(struct work_struct *work) | ||
45 | { | 64 | { |
46 | int delayed = 0; | 65 | int delayed = 0; |
47 | int work_performed; | 66 | int work_performed = 0; |
48 | struct dst_entry * dst, **dstp; | 67 | unsigned long expires = ~0L; |
68 | struct dst_entry *dst, *next, head; | ||
69 | struct dst_entry *last = &head; | ||
70 | #if RT_CACHE_DEBUG >= 2 | ||
71 | ktime_t time_start = ktime_get(); | ||
72 | struct timespec elapsed; | ||
73 | #endif | ||
49 | 74 | ||
50 | if (!spin_trylock(&dst_lock)) { | 75 | mutex_lock(&dst_gc_mutex); |
51 | mod_timer(&dst_gc_timer, jiffies + HZ/10); | 76 | next = dst_busy_list; |
52 | return; | ||
53 | } | ||
54 | 77 | ||
55 | del_timer(&dst_gc_timer); | 78 | loop: |
56 | dstp = &dst_garbage_list; | 79 | while ((dst = next) != NULL) { |
57 | work_performed = 0; | 80 | next = dst->next; |
58 | while ((dst = *dstp) != NULL) { | 81 | prefetch(&next->next); |
59 | if (atomic_read(&dst->__refcnt)) { | 82 | if (likely(atomic_read(&dst->__refcnt))) { |
60 | dstp = &dst->next; | 83 | last->next = dst; |
84 | last = dst; | ||
61 | delayed++; | 85 | delayed++; |
62 | continue; | 86 | continue; |
63 | } | 87 | } |
64 | *dstp = dst->next; | 88 | work_performed++; |
65 | work_performed = 1; | ||
66 | 89 | ||
67 | dst = dst_destroy(dst); | 90 | dst = dst_destroy(dst); |
68 | if (dst) { | 91 | if (dst) { |
@@ -78,38 +101,56 @@ static void dst_run_gc(unsigned long dummy) | |||
78 | continue; | 101 | continue; |
79 | 102 | ||
80 | ___dst_free(dst); | 103 | ___dst_free(dst); |
81 | dst->next = *dstp; | 104 | dst->next = next; |
82 | *dstp = dst; | 105 | next = dst; |
83 | dstp = &dst->next; | ||
84 | } | 106 | } |
85 | } | 107 | } |
86 | if (!dst_garbage_list) { | 108 | |
87 | dst_gc_timer_inc = DST_GC_MAX; | 109 | spin_lock_bh(&dst_garbage.lock); |
88 | goto out; | 110 | next = dst_garbage.list; |
111 | if (next) { | ||
112 | dst_garbage.list = NULL; | ||
113 | spin_unlock_bh(&dst_garbage.lock); | ||
114 | goto loop; | ||
89 | } | 115 | } |
90 | if (!work_performed) { | 116 | last->next = NULL; |
91 | if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX) | 117 | dst_busy_list = head.next; |
92 | dst_gc_timer_expires = DST_GC_MAX; | 118 | if (!dst_busy_list) |
93 | dst_gc_timer_inc += DST_GC_INC; | 119 | dst_garbage.timer_inc = DST_GC_MAX; |
94 | } else { | 120 | else { |
95 | dst_gc_timer_inc = DST_GC_INC; | 121 | /* |
96 | dst_gc_timer_expires = DST_GC_MIN; | 122 | * if we freed less than 1/10 of delayed entries, |
123 | * we can sleep longer. | ||
124 | */ | ||
125 | if (work_performed <= delayed/10) { | ||
126 | dst_garbage.timer_expires += dst_garbage.timer_inc; | ||
127 | if (dst_garbage.timer_expires > DST_GC_MAX) | ||
128 | dst_garbage.timer_expires = DST_GC_MAX; | ||
129 | dst_garbage.timer_inc += DST_GC_INC; | ||
130 | } else { | ||
131 | dst_garbage.timer_inc = DST_GC_INC; | ||
132 | dst_garbage.timer_expires = DST_GC_MIN; | ||
133 | } | ||
134 | expires = dst_garbage.timer_expires; | ||
135 | /* | ||
136 | * if the next desired timer is more than 4 seconds in the future | ||
137 | * then round the timer to whole seconds | ||
138 | */ | ||
139 | if (expires > 4*HZ) | ||
140 | expires = round_jiffies_relative(expires); | ||
141 | schedule_delayed_work(&dst_gc_work, expires); | ||
97 | } | 142 | } |
143 | |||
144 | spin_unlock_bh(&dst_garbage.lock); | ||
145 | mutex_unlock(&dst_gc_mutex); | ||
98 | #if RT_CACHE_DEBUG >= 2 | 146 | #if RT_CACHE_DEBUG >= 2 |
99 | printk("dst_total: %d/%d %ld\n", | 147 | elapsed = ktime_to_timespec(ktime_sub(ktime_get(), time_start)); |
100 | atomic_read(&dst_total), delayed, dst_gc_timer_expires); | 148 | printk(KERN_DEBUG "dst_total: %d delayed: %d work_perf: %d" |
149 | " expires: %lu elapsed: %lu us\n", | ||
150 | atomic_read(&dst_total), delayed, work_performed, | ||
151 | expires, | ||
152 | elapsed.tv_sec * USEC_PER_SEC + elapsed.tv_nsec / NSEC_PER_USEC); | ||
101 | #endif | 153 | #endif |
102 | /* if the next desired timer is more than 4 seconds in the future | ||
103 | * then round the timer to whole seconds | ||
104 | */ | ||
105 | if (dst_gc_timer_expires > 4*HZ) | ||
106 | mod_timer(&dst_gc_timer, | ||
107 | round_jiffies(jiffies + dst_gc_timer_expires)); | ||
108 | else | ||
109 | mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires); | ||
110 | |||
111 | out: | ||
112 | spin_unlock(&dst_lock); | ||
113 | } | 154 | } |
114 | 155 | ||
115 | static int dst_discard(struct sk_buff *skb) | 156 | static int dst_discard(struct sk_buff *skb) |
@@ -154,16 +195,16 @@ static void ___dst_free(struct dst_entry * dst) | |||
154 | 195 | ||
155 | void __dst_free(struct dst_entry * dst) | 196 | void __dst_free(struct dst_entry * dst) |
156 | { | 197 | { |
157 | spin_lock_bh(&dst_lock); | 198 | spin_lock_bh(&dst_garbage.lock); |
158 | ___dst_free(dst); | 199 | ___dst_free(dst); |
159 | dst->next = dst_garbage_list; | 200 | dst->next = dst_garbage.list; |
160 | dst_garbage_list = dst; | 201 | dst_garbage.list = dst; |
161 | if (dst_gc_timer_inc > DST_GC_INC) { | 202 | if (dst_garbage.timer_inc > DST_GC_INC) { |
162 | dst_gc_timer_inc = DST_GC_INC; | 203 | dst_garbage.timer_inc = DST_GC_INC; |
163 | dst_gc_timer_expires = DST_GC_MIN; | 204 | dst_garbage.timer_expires = DST_GC_MIN; |
164 | mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires); | 205 | schedule_delayed_work(&dst_gc_work, dst_garbage.timer_expires); |
165 | } | 206 | } |
166 | spin_unlock_bh(&dst_lock); | 207 | spin_unlock_bh(&dst_garbage.lock); |
167 | } | 208 | } |
168 | 209 | ||
169 | struct dst_entry *dst_destroy(struct dst_entry * dst) | 210 | struct dst_entry *dst_destroy(struct dst_entry * dst) |
@@ -251,7 +292,7 @@ static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev, | |||
251 | static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr) | 292 | static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr) |
252 | { | 293 | { |
253 | struct net_device *dev = ptr; | 294 | struct net_device *dev = ptr; |
254 | struct dst_entry *dst; | 295 | struct dst_entry *dst, *last = NULL; |
255 | 296 | ||
256 | if (dev->nd_net != &init_net) | 297 | if (dev->nd_net != &init_net) |
257 | return NOTIFY_DONE; | 298 | return NOTIFY_DONE; |
@@ -259,11 +300,25 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event, void | |||
259 | switch (event) { | 300 | switch (event) { |
260 | case NETDEV_UNREGISTER: | 301 | case NETDEV_UNREGISTER: |
261 | case NETDEV_DOWN: | 302 | case NETDEV_DOWN: |
262 | spin_lock_bh(&dst_lock); | 303 | mutex_lock(&dst_gc_mutex); |
263 | for (dst = dst_garbage_list; dst; dst = dst->next) { | 304 | for (dst = dst_busy_list; dst; dst = dst->next) { |
305 | last = dst; | ||
306 | dst_ifdown(dst, dev, event != NETDEV_DOWN); | ||
307 | } | ||
308 | |||
309 | spin_lock_bh(&dst_garbage.lock); | ||
310 | dst = dst_garbage.list; | ||
311 | dst_garbage.list = NULL; | ||
312 | spin_unlock_bh(&dst_garbage.lock); | ||
313 | |||
314 | if (last) | ||
315 | last->next = dst; | ||
316 | else | ||
317 | dst_busy_list = dst; | ||
318 | for (; dst; dst = dst->next) { | ||
264 | dst_ifdown(dst, dev, event != NETDEV_DOWN); | 319 | dst_ifdown(dst, dev, event != NETDEV_DOWN); |
265 | } | 320 | } |
266 | spin_unlock_bh(&dst_lock); | 321 | mutex_unlock(&dst_gc_mutex); |
267 | break; | 322 | break; |
268 | } | 323 | } |
269 | return NOTIFY_DONE; | 324 | return NOTIFY_DONE; |