diff options
author | Patrick McHardy <kaber@trash.net> | 2006-09-27 19:45:45 -0400 |
---|---|---|
committer | David S. Miller <davem@sunset.davemloft.net> | 2006-09-28 21:01:50 -0400 |
commit | 85670cc1faa2e1472e4a423cbf0b5e3d55c5ba88 (patch) | |
tree | a5da67836995f1b04c844071db97608bc2c37b85 | |
parent | 787e0617e5176176c494a787f1b0a5248a3db568 (diff) |
[NET_SCHED]: Fix fallout from dev->qdisc RCU change
The move of qdisc destruction to a rcu callback broke locking in the
entire qdisc layer by invalidating previously valid assumptions about
the context in which changes to the qdisc tree occur.
The two assumptions were:
- since changes only happen in process context, read_lock doesn't need
bottem half protection. Now invalid since destruction of inner qdiscs,
classifiers, actions and estimators happens in the RCU callback unless
they're manually deleted, resulting in dead-locks when read_lock in
process context is interrupted by write_lock_bh in bottem half context.
- since changes only happen under the RTNL, no additional locking is
necessary for data not used during packet processing (f.e. u32_list).
Again, since destruction now happens in the RCU callback, this assumption
is not valid anymore, causing races while using this data, which can
result in corruption or use-after-free.
Instead of "fixing" this by disabling bottem halfs everywhere and adding
new locks/refcounting, this patch makes these assumptions valid again by
moving destruction back to process context. Since only the dev->qdisc
pointer is protected by RCU, but ->enqueue and the qdisc tree are still
protected by dev->qdisc_lock, destruction of the tree can be performed
immediately and only the final free needs to happen in the rcu callback
to make sure dev_queue_xmit doesn't access already freed memory.
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | net/core/dev.c | 14 | ||||
-rw-r--r-- | net/sched/cls_api.c | 4 | ||||
-rw-r--r-- | net/sched/sch_api.c | 16 | ||||
-rw-r--r-- | net/sched/sch_generic.c | 66 |
4 files changed, 39 insertions, 61 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index 14de297d024d..4d891beab138 100644 --- a/net/core/dev.c +++ b/net/core/dev.c | |||
@@ -1480,14 +1480,16 @@ gso: | |||
1480 | if (q->enqueue) { | 1480 | if (q->enqueue) { |
1481 | /* Grab device queue */ | 1481 | /* Grab device queue */ |
1482 | spin_lock(&dev->queue_lock); | 1482 | spin_lock(&dev->queue_lock); |
1483 | q = dev->qdisc; | ||
1484 | if (q->enqueue) { | ||
1485 | rc = q->enqueue(skb, q); | ||
1486 | qdisc_run(dev); | ||
1487 | spin_unlock(&dev->queue_lock); | ||
1483 | 1488 | ||
1484 | rc = q->enqueue(skb, q); | 1489 | rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; |
1485 | 1490 | goto out; | |
1486 | qdisc_run(dev); | 1491 | } |
1487 | |||
1488 | spin_unlock(&dev->queue_lock); | 1492 | spin_unlock(&dev->queue_lock); |
1489 | rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; | ||
1490 | goto out; | ||
1491 | } | 1493 | } |
1492 | 1494 | ||
1493 | /* The device has no queue. Common case for software devices: | 1495 | /* The device has no queue. Common case for software devices: |
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 7e14f14058e9..37a184021647 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c | |||
@@ -401,7 +401,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) | |||
401 | if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) | 401 | if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) |
402 | return skb->len; | 402 | return skb->len; |
403 | 403 | ||
404 | read_lock_bh(&qdisc_tree_lock); | 404 | read_lock(&qdisc_tree_lock); |
405 | if (!tcm->tcm_parent) | 405 | if (!tcm->tcm_parent) |
406 | q = dev->qdisc_sleeping; | 406 | q = dev->qdisc_sleeping; |
407 | else | 407 | else |
@@ -458,7 +458,7 @@ errout: | |||
458 | if (cl) | 458 | if (cl) |
459 | cops->put(q, cl); | 459 | cops->put(q, cl); |
460 | out: | 460 | out: |
461 | read_unlock_bh(&qdisc_tree_lock); | 461 | read_unlock(&qdisc_tree_lock); |
462 | dev_put(dev); | 462 | dev_put(dev); |
463 | return skb->len; | 463 | return skb->len; |
464 | } | 464 | } |
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index a19eff12cf78..0b6489291140 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c | |||
@@ -195,14 +195,14 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) | |||
195 | { | 195 | { |
196 | struct Qdisc *q; | 196 | struct Qdisc *q; |
197 | 197 | ||
198 | read_lock_bh(&qdisc_tree_lock); | 198 | read_lock(&qdisc_tree_lock); |
199 | list_for_each_entry(q, &dev->qdisc_list, list) { | 199 | list_for_each_entry(q, &dev->qdisc_list, list) { |
200 | if (q->handle == handle) { | 200 | if (q->handle == handle) { |
201 | read_unlock_bh(&qdisc_tree_lock); | 201 | read_unlock(&qdisc_tree_lock); |
202 | return q; | 202 | return q; |
203 | } | 203 | } |
204 | } | 204 | } |
205 | read_unlock_bh(&qdisc_tree_lock); | 205 | read_unlock(&qdisc_tree_lock); |
206 | return NULL; | 206 | return NULL; |
207 | } | 207 | } |
208 | 208 | ||
@@ -837,7 +837,7 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) | |||
837 | continue; | 837 | continue; |
838 | if (idx > s_idx) | 838 | if (idx > s_idx) |
839 | s_q_idx = 0; | 839 | s_q_idx = 0; |
840 | read_lock_bh(&qdisc_tree_lock); | 840 | read_lock(&qdisc_tree_lock); |
841 | q_idx = 0; | 841 | q_idx = 0; |
842 | list_for_each_entry(q, &dev->qdisc_list, list) { | 842 | list_for_each_entry(q, &dev->qdisc_list, list) { |
843 | if (q_idx < s_q_idx) { | 843 | if (q_idx < s_q_idx) { |
@@ -846,12 +846,12 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) | |||
846 | } | 846 | } |
847 | if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, | 847 | if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, |
848 | cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) { | 848 | cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) { |
849 | read_unlock_bh(&qdisc_tree_lock); | 849 | read_unlock(&qdisc_tree_lock); |
850 | goto done; | 850 | goto done; |
851 | } | 851 | } |
852 | q_idx++; | 852 | q_idx++; |
853 | } | 853 | } |
854 | read_unlock_bh(&qdisc_tree_lock); | 854 | read_unlock(&qdisc_tree_lock); |
855 | } | 855 | } |
856 | 856 | ||
857 | done: | 857 | done: |
@@ -1074,7 +1074,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) | |||
1074 | s_t = cb->args[0]; | 1074 | s_t = cb->args[0]; |
1075 | t = 0; | 1075 | t = 0; |
1076 | 1076 | ||
1077 | read_lock_bh(&qdisc_tree_lock); | 1077 | read_lock(&qdisc_tree_lock); |
1078 | list_for_each_entry(q, &dev->qdisc_list, list) { | 1078 | list_for_each_entry(q, &dev->qdisc_list, list) { |
1079 | if (t < s_t || !q->ops->cl_ops || | 1079 | if (t < s_t || !q->ops->cl_ops || |
1080 | (tcm->tcm_parent && | 1080 | (tcm->tcm_parent && |
@@ -1096,7 +1096,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) | |||
1096 | break; | 1096 | break; |
1097 | t++; | 1097 | t++; |
1098 | } | 1098 | } |
1099 | read_unlock_bh(&qdisc_tree_lock); | 1099 | read_unlock(&qdisc_tree_lock); |
1100 | 1100 | ||
1101 | cb->args[0] = t; | 1101 | cb->args[0] = t; |
1102 | 1102 | ||
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 6f9151899795..88c6a99ce53c 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c | |||
@@ -45,11 +45,10 @@ | |||
45 | The idea is the following: | 45 | The idea is the following: |
46 | - enqueue, dequeue are serialized via top level device | 46 | - enqueue, dequeue are serialized via top level device |
47 | spinlock dev->queue_lock. | 47 | spinlock dev->queue_lock. |
48 | - tree walking is protected by read_lock_bh(qdisc_tree_lock) | 48 | - tree walking is protected by read_lock(qdisc_tree_lock) |
49 | and this lock is used only in process context. | 49 | and this lock is used only in process context. |
50 | - updates to tree are made under rtnl semaphore or | 50 | - updates to tree are made only under rtnl semaphore, |
51 | from softirq context (__qdisc_destroy rcu-callback) | 51 | hence this lock may be made without local bh disabling. |
52 | hence this lock needs local bh disabling. | ||
53 | 52 | ||
54 | qdisc_tree_lock must be grabbed BEFORE dev->queue_lock! | 53 | qdisc_tree_lock must be grabbed BEFORE dev->queue_lock! |
55 | */ | 54 | */ |
@@ -57,14 +56,14 @@ DEFINE_RWLOCK(qdisc_tree_lock); | |||
57 | 56 | ||
58 | void qdisc_lock_tree(struct net_device *dev) | 57 | void qdisc_lock_tree(struct net_device *dev) |
59 | { | 58 | { |
60 | write_lock_bh(&qdisc_tree_lock); | 59 | write_lock(&qdisc_tree_lock); |
61 | spin_lock_bh(&dev->queue_lock); | 60 | spin_lock_bh(&dev->queue_lock); |
62 | } | 61 | } |
63 | 62 | ||
64 | void qdisc_unlock_tree(struct net_device *dev) | 63 | void qdisc_unlock_tree(struct net_device *dev) |
65 | { | 64 | { |
66 | spin_unlock_bh(&dev->queue_lock); | 65 | spin_unlock_bh(&dev->queue_lock); |
67 | write_unlock_bh(&qdisc_tree_lock); | 66 | write_unlock(&qdisc_tree_lock); |
68 | } | 67 | } |
69 | 68 | ||
70 | /* | 69 | /* |
@@ -483,20 +482,6 @@ void qdisc_reset(struct Qdisc *qdisc) | |||
483 | static void __qdisc_destroy(struct rcu_head *head) | 482 | static void __qdisc_destroy(struct rcu_head *head) |
484 | { | 483 | { |
485 | struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu); | 484 | struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu); |
486 | struct Qdisc_ops *ops = qdisc->ops; | ||
487 | |||
488 | #ifdef CONFIG_NET_ESTIMATOR | ||
489 | gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); | ||
490 | #endif | ||
491 | write_lock(&qdisc_tree_lock); | ||
492 | if (ops->reset) | ||
493 | ops->reset(qdisc); | ||
494 | if (ops->destroy) | ||
495 | ops->destroy(qdisc); | ||
496 | write_unlock(&qdisc_tree_lock); | ||
497 | module_put(ops->owner); | ||
498 | |||
499 | dev_put(qdisc->dev); | ||
500 | kfree((char *) qdisc - qdisc->padded); | 485 | kfree((char *) qdisc - qdisc->padded); |
501 | } | 486 | } |
502 | 487 | ||
@@ -504,32 +489,23 @@ static void __qdisc_destroy(struct rcu_head *head) | |||
504 | 489 | ||
505 | void qdisc_destroy(struct Qdisc *qdisc) | 490 | void qdisc_destroy(struct Qdisc *qdisc) |
506 | { | 491 | { |
507 | struct list_head cql = LIST_HEAD_INIT(cql); | 492 | struct Qdisc_ops *ops = qdisc->ops; |
508 | struct Qdisc *cq, *q, *n; | ||
509 | 493 | ||
510 | if (qdisc->flags & TCQ_F_BUILTIN || | 494 | if (qdisc->flags & TCQ_F_BUILTIN || |
511 | !atomic_dec_and_test(&qdisc->refcnt)) | 495 | !atomic_dec_and_test(&qdisc->refcnt)) |
512 | return; | 496 | return; |
513 | 497 | ||
514 | if (!list_empty(&qdisc->list)) { | 498 | list_del(&qdisc->list); |
515 | if (qdisc->ops->cl_ops == NULL) | 499 | #ifdef CONFIG_NET_ESTIMATOR |
516 | list_del(&qdisc->list); | 500 | gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); |
517 | else | 501 | #endif |
518 | list_move(&qdisc->list, &cql); | 502 | if (ops->reset) |
519 | } | 503 | ops->reset(qdisc); |
520 | 504 | if (ops->destroy) | |
521 | /* unlink inner qdiscs from dev->qdisc_list immediately */ | 505 | ops->destroy(qdisc); |
522 | list_for_each_entry(cq, &cql, list) | ||
523 | list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list) | ||
524 | if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) { | ||
525 | if (q->ops->cl_ops == NULL) | ||
526 | list_del_init(&q->list); | ||
527 | else | ||
528 | list_move_tail(&q->list, &cql); | ||
529 | } | ||
530 | list_for_each_entry_safe(cq, n, &cql, list) | ||
531 | list_del_init(&cq->list); | ||
532 | 506 | ||
507 | module_put(ops->owner); | ||
508 | dev_put(qdisc->dev); | ||
533 | call_rcu(&qdisc->q_rcu, __qdisc_destroy); | 509 | call_rcu(&qdisc->q_rcu, __qdisc_destroy); |
534 | } | 510 | } |
535 | 511 | ||
@@ -549,15 +525,15 @@ void dev_activate(struct net_device *dev) | |||
549 | printk(KERN_INFO "%s: activation failed\n", dev->name); | 525 | printk(KERN_INFO "%s: activation failed\n", dev->name); |
550 | return; | 526 | return; |
551 | } | 527 | } |
552 | write_lock_bh(&qdisc_tree_lock); | 528 | write_lock(&qdisc_tree_lock); |
553 | list_add_tail(&qdisc->list, &dev->qdisc_list); | 529 | list_add_tail(&qdisc->list, &dev->qdisc_list); |
554 | write_unlock_bh(&qdisc_tree_lock); | 530 | write_unlock(&qdisc_tree_lock); |
555 | } else { | 531 | } else { |
556 | qdisc = &noqueue_qdisc; | 532 | qdisc = &noqueue_qdisc; |
557 | } | 533 | } |
558 | write_lock_bh(&qdisc_tree_lock); | 534 | write_lock(&qdisc_tree_lock); |
559 | dev->qdisc_sleeping = qdisc; | 535 | dev->qdisc_sleeping = qdisc; |
560 | write_unlock_bh(&qdisc_tree_lock); | 536 | write_unlock(&qdisc_tree_lock); |
561 | } | 537 | } |
562 | 538 | ||
563 | if (!netif_carrier_ok(dev)) | 539 | if (!netif_carrier_ok(dev)) |