diff options
author | Patrick McHardy <kaber@trash.net> | 2006-09-27 19:45:45 -0400 |
---|---|---|
committer | David S. Miller <davem@sunset.davemloft.net> | 2006-09-28 21:01:50 -0400 |
commit | 85670cc1faa2e1472e4a423cbf0b5e3d55c5ba88 (patch) | |
tree | a5da67836995f1b04c844071db97608bc2c37b85 /net/sched/sch_generic.c | |
parent | 787e0617e5176176c494a787f1b0a5248a3db568 (diff) |
[NET_SCHED]: Fix fallout from dev->qdisc RCU change
The move of qdisc destruction to a rcu callback broke locking in the
entire qdisc layer by invalidating previously valid assumptions about
the context in which changes to the qdisc tree occur.
The two assumptions were:
- since changes only happen in process context, read_lock doesn't need
bottem half protection. Now invalid since destruction of inner qdiscs,
classifiers, actions and estimators happens in the RCU callback unless
they're manually deleted, resulting in dead-locks when read_lock in
process context is interrupted by write_lock_bh in bottem half context.
- since changes only happen under the RTNL, no additional locking is
necessary for data not used during packet processing (f.e. u32_list).
Again, since destruction now happens in the RCU callback, this assumption
is not valid anymore, causing races while using this data, which can
result in corruption or use-after-free.
Instead of "fixing" this by disabling bottem halfs everywhere and adding
new locks/refcounting, this patch makes these assumptions valid again by
moving destruction back to process context. Since only the dev->qdisc
pointer is protected by RCU, but ->enqueue and the qdisc tree are still
protected by dev->qdisc_lock, destruction of the tree can be performed
immediately and only the final free needs to happen in the rcu callback
to make sure dev_queue_xmit doesn't access already freed memory.
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/sched/sch_generic.c')
-rw-r--r-- | net/sched/sch_generic.c | 66 |
1 files changed, 21 insertions, 45 deletions
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 6f9151899795..88c6a99ce53c 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c | |||
@@ -45,11 +45,10 @@ | |||
45 | The idea is the following: | 45 | The idea is the following: |
46 | - enqueue, dequeue are serialized via top level device | 46 | - enqueue, dequeue are serialized via top level device |
47 | spinlock dev->queue_lock. | 47 | spinlock dev->queue_lock. |
48 | - tree walking is protected by read_lock_bh(qdisc_tree_lock) | 48 | - tree walking is protected by read_lock(qdisc_tree_lock) |
49 | and this lock is used only in process context. | 49 | and this lock is used only in process context. |
50 | - updates to tree are made under rtnl semaphore or | 50 | - updates to tree are made only under rtnl semaphore, |
51 | from softirq context (__qdisc_destroy rcu-callback) | 51 | hence this lock may be made without local bh disabling. |
52 | hence this lock needs local bh disabling. | ||
53 | 52 | ||
54 | qdisc_tree_lock must be grabbed BEFORE dev->queue_lock! | 53 | qdisc_tree_lock must be grabbed BEFORE dev->queue_lock! |
55 | */ | 54 | */ |
@@ -57,14 +56,14 @@ DEFINE_RWLOCK(qdisc_tree_lock); | |||
57 | 56 | ||
58 | void qdisc_lock_tree(struct net_device *dev) | 57 | void qdisc_lock_tree(struct net_device *dev) |
59 | { | 58 | { |
60 | write_lock_bh(&qdisc_tree_lock); | 59 | write_lock(&qdisc_tree_lock); |
61 | spin_lock_bh(&dev->queue_lock); | 60 | spin_lock_bh(&dev->queue_lock); |
62 | } | 61 | } |
63 | 62 | ||
64 | void qdisc_unlock_tree(struct net_device *dev) | 63 | void qdisc_unlock_tree(struct net_device *dev) |
65 | { | 64 | { |
66 | spin_unlock_bh(&dev->queue_lock); | 65 | spin_unlock_bh(&dev->queue_lock); |
67 | write_unlock_bh(&qdisc_tree_lock); | 66 | write_unlock(&qdisc_tree_lock); |
68 | } | 67 | } |
69 | 68 | ||
70 | /* | 69 | /* |
@@ -483,20 +482,6 @@ void qdisc_reset(struct Qdisc *qdisc) | |||
483 | static void __qdisc_destroy(struct rcu_head *head) | 482 | static void __qdisc_destroy(struct rcu_head *head) |
484 | { | 483 | { |
485 | struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu); | 484 | struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu); |
486 | struct Qdisc_ops *ops = qdisc->ops; | ||
487 | |||
488 | #ifdef CONFIG_NET_ESTIMATOR | ||
489 | gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); | ||
490 | #endif | ||
491 | write_lock(&qdisc_tree_lock); | ||
492 | if (ops->reset) | ||
493 | ops->reset(qdisc); | ||
494 | if (ops->destroy) | ||
495 | ops->destroy(qdisc); | ||
496 | write_unlock(&qdisc_tree_lock); | ||
497 | module_put(ops->owner); | ||
498 | |||
499 | dev_put(qdisc->dev); | ||
500 | kfree((char *) qdisc - qdisc->padded); | 485 | kfree((char *) qdisc - qdisc->padded); |
501 | } | 486 | } |
502 | 487 | ||
@@ -504,32 +489,23 @@ static void __qdisc_destroy(struct rcu_head *head) | |||
504 | 489 | ||
505 | void qdisc_destroy(struct Qdisc *qdisc) | 490 | void qdisc_destroy(struct Qdisc *qdisc) |
506 | { | 491 | { |
507 | struct list_head cql = LIST_HEAD_INIT(cql); | 492 | struct Qdisc_ops *ops = qdisc->ops; |
508 | struct Qdisc *cq, *q, *n; | ||
509 | 493 | ||
510 | if (qdisc->flags & TCQ_F_BUILTIN || | 494 | if (qdisc->flags & TCQ_F_BUILTIN || |
511 | !atomic_dec_and_test(&qdisc->refcnt)) | 495 | !atomic_dec_and_test(&qdisc->refcnt)) |
512 | return; | 496 | return; |
513 | 497 | ||
514 | if (!list_empty(&qdisc->list)) { | 498 | list_del(&qdisc->list); |
515 | if (qdisc->ops->cl_ops == NULL) | 499 | #ifdef CONFIG_NET_ESTIMATOR |
516 | list_del(&qdisc->list); | 500 | gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); |
517 | else | 501 | #endif |
518 | list_move(&qdisc->list, &cql); | 502 | if (ops->reset) |
519 | } | 503 | ops->reset(qdisc); |
520 | 504 | if (ops->destroy) | |
521 | /* unlink inner qdiscs from dev->qdisc_list immediately */ | 505 | ops->destroy(qdisc); |
522 | list_for_each_entry(cq, &cql, list) | ||
523 | list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list) | ||
524 | if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) { | ||
525 | if (q->ops->cl_ops == NULL) | ||
526 | list_del_init(&q->list); | ||
527 | else | ||
528 | list_move_tail(&q->list, &cql); | ||
529 | } | ||
530 | list_for_each_entry_safe(cq, n, &cql, list) | ||
531 | list_del_init(&cq->list); | ||
532 | 506 | ||
507 | module_put(ops->owner); | ||
508 | dev_put(qdisc->dev); | ||
533 | call_rcu(&qdisc->q_rcu, __qdisc_destroy); | 509 | call_rcu(&qdisc->q_rcu, __qdisc_destroy); |
534 | } | 510 | } |
535 | 511 | ||
@@ -549,15 +525,15 @@ void dev_activate(struct net_device *dev) | |||
549 | printk(KERN_INFO "%s: activation failed\n", dev->name); | 525 | printk(KERN_INFO "%s: activation failed\n", dev->name); |
550 | return; | 526 | return; |
551 | } | 527 | } |
552 | write_lock_bh(&qdisc_tree_lock); | 528 | write_lock(&qdisc_tree_lock); |
553 | list_add_tail(&qdisc->list, &dev->qdisc_list); | 529 | list_add_tail(&qdisc->list, &dev->qdisc_list); |
554 | write_unlock_bh(&qdisc_tree_lock); | 530 | write_unlock(&qdisc_tree_lock); |
555 | } else { | 531 | } else { |
556 | qdisc = &noqueue_qdisc; | 532 | qdisc = &noqueue_qdisc; |
557 | } | 533 | } |
558 | write_lock_bh(&qdisc_tree_lock); | 534 | write_lock(&qdisc_tree_lock); |
559 | dev->qdisc_sleeping = qdisc; | 535 | dev->qdisc_sleeping = qdisc; |
560 | write_unlock_bh(&qdisc_tree_lock); | 536 | write_unlock(&qdisc_tree_lock); |
561 | } | 537 | } |
562 | 538 | ||
563 | if (!netif_carrier_ok(dev)) | 539 | if (!netif_carrier_ok(dev)) |