author    Patrick McHardy <kaber@trash.net>             2006-09-27 19:45:45 -0400
committer David S. Miller <davem@sunset.davemloft.net>  2006-09-28 21:01:50 -0400
commit    85670cc1faa2e1472e4a423cbf0b5e3d55c5ba88
tree      a5da67836995f1b04c844071db97608bc2c37b85
parent    787e0617e5176176c494a787f1b0a5248a3db568
[NET_SCHED]: Fix fallout from dev->qdisc RCU change
The move of qdisc destruction to an RCU callback broke locking in the
entire qdisc layer by invalidating previously valid assumptions about
the context in which changes to the qdisc tree occur.

The two assumptions were:

- since changes only happen in process context, read_lock doesn't need
  bottom half protection. Now invalid since destruction of inner qdiscs,
  classifiers, actions and estimators happens in the RCU callback unless
  they're manually deleted, resulting in deadlocks when read_lock in
  process context is interrupted by write_lock_bh in bottom half context.

- since changes only happen under the RTNL, no additional locking is
  necessary for data not used during packet processing (e.g. u32_list).
  Again, since destruction now happens in the RCU callback, this
  assumption is no longer valid, causing races while using this data,
  which can result in corruption or use-after-free.

Instead of "fixing" this by disabling bottom halves everywhere and
adding new locks/refcounting, this patch makes these assumptions valid
again by moving destruction back to process context. Since only the
dev->qdisc pointer is protected by RCU, but ->enqueue and the qdisc
tree are still protected by dev->qdisc_lock, destruction of the tree
can be performed immediately and only the final free needs to happen
in the RCU callback to make sure dev_queue_xmit doesn't access already
freed memory.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
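The core of the fix is a common RCU pattern: perform all teardown that
needs process context immediately, and defer only the final memory
release past a grace period. Below is a minimal sketch of that pattern
under hypothetical names (my_qdisc, my_qdisc_ops, my_qdisc_destroy are
illustrative only, not the actual net_sched code):

/*
 * Sketch of the destruction scheme described above: everything that
 * needs locks or process context runs immediately in the destroy
 * path, and only the final kfree() is deferred past an RCU grace
 * period so lockless readers under rcu_read_lock() can never touch
 * freed memory.
 */
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_qdisc;

struct my_qdisc_ops {
        void (*destroy)(struct my_qdisc *q);    /* tear down inner state */
};

struct my_qdisc {
        const struct my_qdisc_ops *ops;
        struct rcu_head q_rcu;                  /* for call_rcu() */
};

/* RCU callback: by now no reader can still hold a pointer to q,
 * so releasing the memory is all that is left to do. */
static void __my_qdisc_free(struct rcu_head *head)
{
        struct my_qdisc *q = container_of(head, struct my_qdisc, q_rcu);

        kfree(q);
}

/* Process context (e.g. under RTNL): destroy the object's innards
 * right away, then let RCU delay only the final free. */
static void my_qdisc_destroy(struct my_qdisc *q)
{
        if (q->ops->destroy)
                q->ops->destroy(q);
        call_rcu(&q->q_rcu, __my_qdisc_free);
}

In the patch, dev_queue_xmit's lockless dereference of dev->qdisc is
the read side that the deferred kfree() in __qdisc_destroy protects.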
-rw-r--r--  net/core/dev.c           14
-rw-r--r--  net/sched/cls_api.c       4
-rw-r--r--  net/sched/sch_api.c      16
-rw-r--r--  net/sched/sch_generic.c  66
4 files changed, 39 insertions, 61 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 14de297d024d..4d891beab138 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1480,14 +1480,16 @@ gso:
 	if (q->enqueue) {
 		/* Grab device queue */
 		spin_lock(&dev->queue_lock);
+		q = dev->qdisc;
+		if (q->enqueue) {
+			rc = q->enqueue(skb, q);
+			qdisc_run(dev);
+			spin_unlock(&dev->queue_lock);
 
-		rc = q->enqueue(skb, q);
-
-		qdisc_run(dev);
-
+			rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
+			goto out;
+		}
 		spin_unlock(&dev->queue_lock);
-		rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
-		goto out;
 	}
 
 	/* The device has no queue. Common case for software devices:
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 7e14f14058e9..37a184021647 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -401,7 +401,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
 		return skb->len;
 
-	read_lock_bh(&qdisc_tree_lock);
+	read_lock(&qdisc_tree_lock);
 	if (!tcm->tcm_parent)
 		q = dev->qdisc_sleeping;
 	else
@@ -458,7 +458,7 @@ errout:
 	if (cl)
 		cops->put(q, cl);
 out:
-	read_unlock_bh(&qdisc_tree_lock);
+	read_unlock(&qdisc_tree_lock);
 	dev_put(dev);
 	return skb->len;
 }
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index a19eff12cf78..0b6489291140 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -195,14 +195,14 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 {
 	struct Qdisc *q;
 
-	read_lock_bh(&qdisc_tree_lock);
+	read_lock(&qdisc_tree_lock);
 	list_for_each_entry(q, &dev->qdisc_list, list) {
 		if (q->handle == handle) {
-			read_unlock_bh(&qdisc_tree_lock);
+			read_unlock(&qdisc_tree_lock);
 			return q;
 		}
 	}
-	read_unlock_bh(&qdisc_tree_lock);
+	read_unlock(&qdisc_tree_lock);
 	return NULL;
 }
 
@@ -837,7 +837,7 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
 			continue;
 		if (idx > s_idx)
 			s_q_idx = 0;
-		read_lock_bh(&qdisc_tree_lock);
+		read_lock(&qdisc_tree_lock);
 		q_idx = 0;
 		list_for_each_entry(q, &dev->qdisc_list, list) {
 			if (q_idx < s_q_idx) {
@@ -846,12 +846,12 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
 			}
 			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
 					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
-				read_unlock_bh(&qdisc_tree_lock);
+				read_unlock(&qdisc_tree_lock);
 				goto done;
 			}
 			q_idx++;
 		}
-		read_unlock_bh(&qdisc_tree_lock);
+		read_unlock(&qdisc_tree_lock);
 	}
 
 done:
@@ -1074,7 +1074,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
 	s_t = cb->args[0];
 	t = 0;
 
-	read_lock_bh(&qdisc_tree_lock);
+	read_lock(&qdisc_tree_lock);
 	list_for_each_entry(q, &dev->qdisc_list, list) {
 		if (t < s_t || !q->ops->cl_ops ||
 		    (tcm->tcm_parent &&
@@ -1096,7 +1096,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
 			break;
 		t++;
 	}
-	read_unlock_bh(&qdisc_tree_lock);
+	read_unlock(&qdisc_tree_lock);
 
 	cb->args[0] = t;
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6f9151899795..88c6a99ce53c 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -45,11 +45,10 @@
    The idea is the following:
    - enqueue, dequeue are serialized via top level device
      spinlock dev->queue_lock.
-   - tree walking is protected by read_lock_bh(qdisc_tree_lock)
+   - tree walking is protected by read_lock(qdisc_tree_lock)
      and this lock is used only in process context.
-   - updates to tree are made under rtnl semaphore or
-     from softirq context (__qdisc_destroy rcu-callback)
-     hence this lock needs local bh disabling.
+   - updates to tree are made only under rtnl semaphore,
+     hence this lock may be made without local bh disabling.
 
    qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
  */
@@ -57,14 +56,14 @@ DEFINE_RWLOCK(qdisc_tree_lock);
 
 void qdisc_lock_tree(struct net_device *dev)
 {
-	write_lock_bh(&qdisc_tree_lock);
+	write_lock(&qdisc_tree_lock);
 	spin_lock_bh(&dev->queue_lock);
 }
 
 void qdisc_unlock_tree(struct net_device *dev)
 {
 	spin_unlock_bh(&dev->queue_lock);
-	write_unlock_bh(&qdisc_tree_lock);
+	write_unlock(&qdisc_tree_lock);
 }
 
 /*
@@ -483,20 +482,6 @@ void qdisc_reset(struct Qdisc *qdisc)
 static void __qdisc_destroy(struct rcu_head *head)
 {
 	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
-	struct Qdisc_ops *ops = qdisc->ops;
-
-#ifdef CONFIG_NET_ESTIMATOR
-	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
-#endif
-	write_lock(&qdisc_tree_lock);
-	if (ops->reset)
-		ops->reset(qdisc);
-	if (ops->destroy)
-		ops->destroy(qdisc);
-	write_unlock(&qdisc_tree_lock);
-	module_put(ops->owner);
-
-	dev_put(qdisc->dev);
 	kfree((char *) qdisc - qdisc->padded);
 }
 
@@ -504,32 +489,23 @@ static void __qdisc_destroy(struct rcu_head *head)
 
 void qdisc_destroy(struct Qdisc *qdisc)
 {
-	struct list_head cql = LIST_HEAD_INIT(cql);
-	struct Qdisc *cq, *q, *n;
+	struct Qdisc_ops *ops = qdisc->ops;
 
 	if (qdisc->flags & TCQ_F_BUILTIN ||
 	    !atomic_dec_and_test(&qdisc->refcnt))
 		return;
 
-	if (!list_empty(&qdisc->list)) {
-		if (qdisc->ops->cl_ops == NULL)
-			list_del(&qdisc->list);
-		else
-			list_move(&qdisc->list, &cql);
-	}
-
-	/* unlink inner qdiscs from dev->qdisc_list immediately */
-	list_for_each_entry(cq, &cql, list)
-		list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list)
-			if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) {
-				if (q->ops->cl_ops == NULL)
-					list_del_init(&q->list);
-				else
-					list_move_tail(&q->list, &cql);
-			}
-	list_for_each_entry_safe(cq, n, &cql, list)
-		list_del_init(&cq->list);
+	list_del(&qdisc->list);
+#ifdef CONFIG_NET_ESTIMATOR
+	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
+#endif
+	if (ops->reset)
+		ops->reset(qdisc);
+	if (ops->destroy)
+		ops->destroy(qdisc);
 
+	module_put(ops->owner);
+	dev_put(qdisc->dev);
 	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
 }
 
@@ -549,15 +525,15 @@ void dev_activate(struct net_device *dev)
 				printk(KERN_INFO "%s: activation failed\n", dev->name);
 				return;
 			}
-			write_lock_bh(&qdisc_tree_lock);
+			write_lock(&qdisc_tree_lock);
 			list_add_tail(&qdisc->list, &dev->qdisc_list);
-			write_unlock_bh(&qdisc_tree_lock);
+			write_unlock(&qdisc_tree_lock);
 		} else {
 			qdisc = &noqueue_qdisc;
 		}
-		write_lock_bh(&qdisc_tree_lock);
+		write_lock(&qdisc_tree_lock);
 		dev->qdisc_sleeping = qdisc;
-		write_unlock_bh(&qdisc_tree_lock);
+		write_unlock(&qdisc_tree_lock);
 	}
 
 	if (!netif_carrier_ok(dev))