diff options
author | Eric Dumazet <eric.dumazet@gmail.com> | 2012-07-30 21:08:23 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2012-07-31 17:41:38 -0400 |
commit | 54764bb647b2e847c512acf8d443df965da35000 (patch) | |
tree | 3918f6f42679d0ebdcef230f28285f99d178be62 /net | |
parent | 5a0d513b622ee41e117fc37e26e27e8ef42e8dae (diff) |
ipv4: Restore old dst_free() behavior.
commit 404e0a8b6a55 (net: ipv4: fix RCU races on dst refcounts) tried
to solve a race but added a problem at device/fib dismantle time :
We really want to call dst_free() as soon as possible, even if sockets
still have dst in their cache.
dst_release() calls in free_fib_info_rcu() are not welcomed.
Root of the problem was that now we also cache output routes (in
nh_rth_output), we must use call_rcu() instead of call_rcu_bh() in
rt_free(), because output route lookups are done in process context.
Based on feedback and initial patch from David Miller (adding another
call_rcu_bh() call in fib, but it appears it was not the right fix)
I left the inet_sk_rx_dst_set() helper and added __rcu attributes
to nh_rth_output and nh_rth_input to better document what is going on in
this code.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r-- | net/core/dst.c | 26 | ||||
-rw-r--r-- | net/decnet/dn_route.c | 6 | ||||
-rw-r--r-- | net/ipv4/fib_semantics.c | 21 | ||||
-rw-r--r-- | net/ipv4/route.c | 26 |
4 files changed, 39 insertions, 40 deletions
diff --git a/net/core/dst.c b/net/core/dst.c index d9e33ebe170f..069d51d29414 100644 --- a/net/core/dst.c +++ b/net/core/dst.c | |||
@@ -258,15 +258,6 @@ again: | |||
258 | } | 258 | } |
259 | EXPORT_SYMBOL(dst_destroy); | 259 | EXPORT_SYMBOL(dst_destroy); |
260 | 260 | ||
261 | static void dst_rcu_destroy(struct rcu_head *head) | ||
262 | { | ||
263 | struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); | ||
264 | |||
265 | dst = dst_destroy(dst); | ||
266 | if (dst) | ||
267 | __dst_free(dst); | ||
268 | } | ||
269 | |||
270 | void dst_release(struct dst_entry *dst) | 261 | void dst_release(struct dst_entry *dst) |
271 | { | 262 | { |
272 | if (dst) { | 263 | if (dst) { |
@@ -274,14 +265,10 @@ void dst_release(struct dst_entry *dst) | |||
274 | 265 | ||
275 | newrefcnt = atomic_dec_return(&dst->__refcnt); | 266 | newrefcnt = atomic_dec_return(&dst->__refcnt); |
276 | WARN_ON(newrefcnt < 0); | 267 | WARN_ON(newrefcnt < 0); |
277 | if (unlikely(dst->flags & (DST_NOCACHE | DST_RCU_FREE)) && !newrefcnt) { | 268 | if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) { |
278 | if (dst->flags & DST_RCU_FREE) { | 269 | dst = dst_destroy(dst); |
279 | call_rcu_bh(&dst->rcu_head, dst_rcu_destroy); | 270 | if (dst) |
280 | } else { | 271 | __dst_free(dst); |
281 | dst = dst_destroy(dst); | ||
282 | if (dst) | ||
283 | __dst_free(dst); | ||
284 | } | ||
285 | } | 272 | } |
286 | } | 273 | } |
287 | } | 274 | } |
@@ -333,14 +320,11 @@ EXPORT_SYMBOL(__dst_destroy_metrics_generic); | |||
333 | */ | 320 | */ |
334 | void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) | 321 | void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) |
335 | { | 322 | { |
336 | bool hold; | ||
337 | |||
338 | WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); | 323 | WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); |
339 | /* If dst not in cache, we must take a reference, because | 324 | /* If dst not in cache, we must take a reference, because |
340 | * dst_release() will destroy dst as soon as its refcount becomes zero | 325 | * dst_release() will destroy dst as soon as its refcount becomes zero |
341 | */ | 326 | */ |
342 | hold = (dst->flags & (DST_NOCACHE | DST_RCU_FREE)) == DST_NOCACHE; | 327 | if (unlikely(dst->flags & DST_NOCACHE)) { |
343 | if (unlikely(hold)) { | ||
344 | dst_hold(dst); | 328 | dst_hold(dst); |
345 | skb_dst_set(skb, dst); | 329 | skb_dst_set(skb, dst); |
346 | } else { | 330 | } else { |
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 26719779ad8e..85a3604c87c8 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c | |||
@@ -184,12 +184,6 @@ static __inline__ unsigned int dn_hash(__le16 src, __le16 dst) | |||
184 | return dn_rt_hash_mask & (unsigned int)tmp; | 184 | return dn_rt_hash_mask & (unsigned int)tmp; |
185 | } | 185 | } |
186 | 186 | ||
187 | static inline void dst_rcu_free(struct rcu_head *head) | ||
188 | { | ||
189 | struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); | ||
190 | dst_free(dst); | ||
191 | } | ||
192 | |||
193 | static inline void dnrt_free(struct dn_route *rt) | 187 | static inline void dnrt_free(struct dn_route *rt) |
194 | { | 188 | { |
195 | call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); | 189 | call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); |
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e55171f184f9..625cf185c489 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c | |||
@@ -161,6 +161,21 @@ static void free_nh_exceptions(struct fib_nh *nh) | |||
161 | kfree(hash); | 161 | kfree(hash); |
162 | } | 162 | } |
163 | 163 | ||
164 | static void rt_nexthop_free(struct rtable __rcu **rtp) | ||
165 | { | ||
166 | struct rtable *rt = rcu_dereference_protected(*rtp, 1); | ||
167 | |||
168 | if (!rt) | ||
169 | return; | ||
170 | |||
171 | /* Not even needed : RCU_INIT_POINTER(*rtp, NULL); | ||
172 | * because we waited an RCU grace period before calling | ||
173 | * free_fib_info_rcu() | ||
174 | */ | ||
175 | |||
176 | dst_free(&rt->dst); | ||
177 | } | ||
178 | |||
164 | /* Release a nexthop info record */ | 179 | /* Release a nexthop info record */ |
165 | static void free_fib_info_rcu(struct rcu_head *head) | 180 | static void free_fib_info_rcu(struct rcu_head *head) |
166 | { | 181 | { |
@@ -171,10 +186,8 @@ static void free_fib_info_rcu(struct rcu_head *head) | |||
171 | dev_put(nexthop_nh->nh_dev); | 186 | dev_put(nexthop_nh->nh_dev); |
172 | if (nexthop_nh->nh_exceptions) | 187 | if (nexthop_nh->nh_exceptions) |
173 | free_nh_exceptions(nexthop_nh); | 188 | free_nh_exceptions(nexthop_nh); |
174 | if (nexthop_nh->nh_rth_output) | 189 | rt_nexthop_free(&nexthop_nh->nh_rth_output); |
175 | dst_release(&nexthop_nh->nh_rth_output->dst); | 190 | rt_nexthop_free(&nexthop_nh->nh_rth_input); |
176 | if (nexthop_nh->nh_rth_input) | ||
177 | dst_release(&nexthop_nh->nh_rth_input->dst); | ||
178 | } endfor_nexthops(fi); | 191 | } endfor_nexthops(fi); |
179 | 192 | ||
180 | release_net(fi->fib_net); | 193 | release_net(fi->fib_net); |
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d6eabcfe8a90..2bd107477469 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
@@ -1199,23 +1199,31 @@ restart: | |||
1199 | fnhe->fnhe_stamp = jiffies; | 1199 | fnhe->fnhe_stamp = jiffies; |
1200 | } | 1200 | } |
1201 | 1201 | ||
1202 | static inline void rt_free(struct rtable *rt) | ||
1203 | { | ||
1204 | call_rcu(&rt->dst.rcu_head, dst_rcu_free); | ||
1205 | } | ||
1206 | |||
1202 | static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) | 1207 | static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) |
1203 | { | 1208 | { |
1204 | struct rtable *orig, *prev, **p = &nh->nh_rth_output; | 1209 | struct rtable *orig, *prev, **p = (struct rtable **)&nh->nh_rth_output; |
1205 | 1210 | ||
1206 | if (rt_is_input_route(rt)) | 1211 | if (rt_is_input_route(rt)) |
1207 | p = &nh->nh_rth_input; | 1212 | p = (struct rtable **)&nh->nh_rth_input; |
1208 | 1213 | ||
1209 | orig = *p; | 1214 | orig = *p; |
1210 | 1215 | ||
1211 | rt->dst.flags |= DST_RCU_FREE; | ||
1212 | dst_hold(&rt->dst); | ||
1213 | prev = cmpxchg(p, orig, rt); | 1216 | prev = cmpxchg(p, orig, rt); |
1214 | if (prev == orig) { | 1217 | if (prev == orig) { |
1215 | if (orig) | 1218 | if (orig) |
1216 | dst_release(&orig->dst); | 1219 | rt_free(orig); |
1217 | } else { | 1220 | } else { |
1218 | dst_release(&rt->dst); | 1221 | /* Routes we intend to cache in the FIB nexthop have |
1222 | * the DST_NOCACHE bit clear. However, if we are | ||
1223 | * unsuccessful at storing this route into the cache | ||
1224 | * we really need to set it. | ||
1225 | */ | ||
1226 | rt->dst.flags |= DST_NOCACHE; | ||
1219 | } | 1227 | } |
1220 | } | 1228 | } |
1221 | 1229 | ||
@@ -1412,7 +1420,7 @@ static int __mkroute_input(struct sk_buff *skb, | |||
1412 | do_cache = false; | 1420 | do_cache = false; |
1413 | if (res->fi) { | 1421 | if (res->fi) { |
1414 | if (!itag) { | 1422 | if (!itag) { |
1415 | rth = FIB_RES_NH(*res).nh_rth_input; | 1423 | rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); |
1416 | if (rt_cache_valid(rth)) { | 1424 | if (rt_cache_valid(rth)) { |
1417 | skb_dst_set_noref(skb, &rth->dst); | 1425 | skb_dst_set_noref(skb, &rth->dst); |
1418 | goto out; | 1426 | goto out; |
@@ -1574,7 +1582,7 @@ local_input: | |||
1574 | do_cache = false; | 1582 | do_cache = false; |
1575 | if (res.fi) { | 1583 | if (res.fi) { |
1576 | if (!itag) { | 1584 | if (!itag) { |
1577 | rth = FIB_RES_NH(res).nh_rth_input; | 1585 | rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input); |
1578 | if (rt_cache_valid(rth)) { | 1586 | if (rt_cache_valid(rth)) { |
1579 | skb_dst_set_noref(skb, &rth->dst); | 1587 | skb_dst_set_noref(skb, &rth->dst); |
1580 | err = 0; | 1588 | err = 0; |
@@ -1742,7 +1750,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, | |||
1742 | if (fi) { | 1750 | if (fi) { |
1743 | fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); | 1751 | fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); |
1744 | if (!fnhe) { | 1752 | if (!fnhe) { |
1745 | rth = FIB_RES_NH(*res).nh_rth_output; | 1753 | rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_output); |
1746 | if (rt_cache_valid(rth)) { | 1754 | if (rt_cache_valid(rth)) { |
1747 | dst_hold(&rth->dst); | 1755 | dst_hold(&rth->dst); |
1748 | return rth; | 1756 | return rth; |