diff options
author | John Fastabend <john.fastabend@gmail.com> | 2014-09-20 00:50:34 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2014-09-22 15:59:21 -0400 |
commit | de5df63228fcfbd5bb7fd883774c18fec9e61f12 (patch) | |
tree | 4eb60e8f52cdedc39237888272e5ed39a914512e | |
parent | a1ddcfee2d9ae172d0095f3f8227f7fa53288c65 (diff) |
net: sched: cls_u32 changes to knode must appear atomic to readers
Changes to the cls_u32 classifier must appear atomic to the
readers. Before this patch if a change is requested for both
the exts and ifindex, first the ifindex is updated then the
exts with tcf_exts_change(). This opens a small window where
a reader can have an exts chain with an incorrect ifindex. This
violates the RCU semantics.
Here we resolve this by always passing u32_set_parms() a copy
of the tc_u_knode to work on and then inserting it into the hash
table after the updates have been successfully applied.
Tested with the following short script:
#tc filter add dev p3p2 parent 8001:0 protocol ip prio 99 handle 1: \
u32 divisor 256
#tc filter add dev p3p2 parent 8001:0 protocol ip prio 99 \
u32 link 1: hashkey mask ffffff00 at 12 \
match ip src 192.168.8.0/2
#tc filter add dev p3p2 parent 8001:0 protocol ip prio 102 \
handle 1::10 u32 classid 1:2 ht 1: \
match ip src 192.168.8.0/8 match ip tos 0x0a 1e
#tc filter change dev p3p2 parent 8001:0 protocol ip prio 102 \
handle 1::10 u32 classid 1:2 ht 1: \
match ip src 1.1.0.0/8 match ip tos 0x0b 1e
CC: Eric Dumazet <edumazet@google.com>
CC: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | net/sched/cls_u32.c | 135 |
1 files changed, 126 insertions, 9 deletions
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 8d90e50a8ce4..e3fb5308d44a 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c | |||
@@ -354,27 +354,53 @@ static int u32_init(struct tcf_proto *tp) | |||
354 | return 0; | 354 | return 0; |
355 | } | 355 | } |
356 | 356 | ||
357 | static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) | 357 | static int u32_destroy_key(struct tcf_proto *tp, |
358 | struct tc_u_knode *n, | ||
359 | bool free_pf) | ||
358 | { | 360 | { |
359 | tcf_unbind_filter(tp, &n->res); | 361 | tcf_unbind_filter(tp, &n->res); |
360 | tcf_exts_destroy(tp, &n->exts); | 362 | tcf_exts_destroy(tp, &n->exts); |
361 | if (n->ht_down) | 363 | if (n->ht_down) |
362 | n->ht_down->refcnt--; | 364 | n->ht_down->refcnt--; |
363 | #ifdef CONFIG_CLS_U32_PERF | 365 | #ifdef CONFIG_CLS_U32_PERF |
364 | free_percpu(n->pf); | 366 | if (free_pf) |
367 | free_percpu(n->pf); | ||
365 | #endif | 368 | #endif |
366 | #ifdef CONFIG_CLS_U32_MARK | 369 | #ifdef CONFIG_CLS_U32_MARK |
367 | free_percpu(n->pcpu_success); | 370 | if (free_pf) |
371 | free_percpu(n->pcpu_success); | ||
368 | #endif | 372 | #endif |
369 | kfree(n); | 373 | kfree(n); |
370 | return 0; | 374 | return 0; |
371 | } | 375 | } |
372 | 376 | ||
377 | /* u32_delete_key_rcu should be called when free'ing a copied | ||
378 | * version of a tc_u_knode obtained from u32_init_knode(). When | ||
379 | * copies are obtained from u32_init_knode() the statistics are | ||
380 | * shared between the old and new copies to allow readers to | ||
381 | * continue to update the statistics during the copy. To support | ||
382 | * this the u32_delete_key_rcu variant does not free the percpu | ||
383 | * statistics. | ||
384 | */ | ||
373 | static void u32_delete_key_rcu(struct rcu_head *rcu) | 385 | static void u32_delete_key_rcu(struct rcu_head *rcu) |
374 | { | 386 | { |
375 | struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu); | 387 | struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu); |
376 | 388 | ||
377 | u32_destroy_key(key->tp, key); | 389 | u32_destroy_key(key->tp, key, false); |
390 | } | ||
391 | |||
392 | /* u32_delete_key_freepf_rcu is the rcu callback variant | ||
393 | * that free's the entire structure including the statistics | ||
394 | * percpu variables. Only use this if the key is not a copy | ||
395 | * returned by u32_init_knode(). See u32_delete_key_rcu() | ||
396 | * for the variant that should be used with keys return from | ||
397 | * u32_init_knode() | ||
398 | */ | ||
399 | static void u32_delete_key_freepf_rcu(struct rcu_head *rcu) | ||
400 | { | ||
401 | struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu); | ||
402 | |||
403 | u32_destroy_key(key->tp, key, true); | ||
378 | } | 404 | } |
379 | 405 | ||
380 | static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) | 406 | static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) |
@@ -390,7 +416,7 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) | |||
390 | if (pkp == key) { | 416 | if (pkp == key) { |
391 | RCU_INIT_POINTER(*kp, key->next); | 417 | RCU_INIT_POINTER(*kp, key->next); |
392 | 418 | ||
393 | call_rcu(&key->rcu, u32_delete_key_rcu); | 419 | call_rcu(&key->rcu, u32_delete_key_freepf_rcu); |
394 | return 0; | 420 | return 0; |
395 | } | 421 | } |
396 | } | 422 | } |
@@ -408,7 +434,7 @@ static void u32_clear_hnode(struct tc_u_hnode *ht) | |||
408 | while ((n = rtnl_dereference(ht->ht[h])) != NULL) { | 434 | while ((n = rtnl_dereference(ht->ht[h])) != NULL) { |
409 | RCU_INIT_POINTER(ht->ht[h], | 435 | RCU_INIT_POINTER(ht->ht[h], |
410 | rtnl_dereference(n->next)); | 436 | rtnl_dereference(n->next)); |
411 | call_rcu(&n->rcu, u32_delete_key_rcu); | 437 | call_rcu(&n->rcu, u32_delete_key_freepf_rcu); |
412 | } | 438 | } |
413 | } | 439 | } |
414 | } | 440 | } |
@@ -584,6 +610,82 @@ errout: | |||
584 | return err; | 610 | return err; |
585 | } | 611 | } |
586 | 612 | ||
613 | static void u32_replace_knode(struct tcf_proto *tp, | ||
614 | struct tc_u_common *tp_c, | ||
615 | struct tc_u_knode *n) | ||
616 | { | ||
617 | struct tc_u_knode __rcu **ins; | ||
618 | struct tc_u_knode *pins; | ||
619 | struct tc_u_hnode *ht; | ||
620 | |||
621 | if (TC_U32_HTID(n->handle) == TC_U32_ROOT) | ||
622 | ht = rtnl_dereference(tp->root); | ||
623 | else | ||
624 | ht = u32_lookup_ht(tp_c, TC_U32_HTID(n->handle)); | ||
625 | |||
626 | ins = &ht->ht[TC_U32_HASH(n->handle)]; | ||
627 | |||
628 | /* The node must always exist for it to be replaced if this is not the | ||
629 | * case then something went very wrong elsewhere. | ||
630 | */ | ||
631 | for (pins = rtnl_dereference(*ins); ; | ||
632 | ins = &pins->next, pins = rtnl_dereference(*ins)) | ||
633 | if (pins->handle == n->handle) | ||
634 | break; | ||
635 | |||
636 | RCU_INIT_POINTER(n->next, pins->next); | ||
637 | rcu_assign_pointer(*ins, n); | ||
638 | } | ||
639 | |||
640 | static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, | ||
641 | struct tc_u_knode *n) | ||
642 | { | ||
643 | struct tc_u_knode *new; | ||
644 | struct tc_u32_sel *s = &n->sel; | ||
645 | |||
646 | new = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), | ||
647 | GFP_KERNEL); | ||
648 | |||
649 | if (!new) | ||
650 | return NULL; | ||
651 | |||
652 | RCU_INIT_POINTER(new->next, n->next); | ||
653 | new->handle = n->handle; | ||
654 | RCU_INIT_POINTER(new->ht_up, n->ht_up); | ||
655 | |||
656 | #ifdef CONFIG_NET_CLS_IND | ||
657 | new->ifindex = n->ifindex; | ||
658 | #endif | ||
659 | new->fshift = n->fshift; | ||
660 | new->res = n->res; | ||
661 | RCU_INIT_POINTER(new->ht_down, n->ht_down); | ||
662 | |||
663 | /* bump reference count as long as we hold pointer to structure */ | ||
664 | if (new->ht_down) | ||
665 | new->ht_down->refcnt++; | ||
666 | |||
667 | #ifdef CONFIG_CLS_U32_PERF | ||
668 | /* Statistics may be incremented by readers during update | ||
669 | * so we must keep them in tact. When the node is later destroyed | ||
670 | * a special destroy call must be made to not free the pf memory. | ||
671 | */ | ||
672 | new->pf = n->pf; | ||
673 | #endif | ||
674 | |||
675 | #ifdef CONFIG_CLS_U32_MARK | ||
676 | new->val = n->val; | ||
677 | new->mask = n->mask; | ||
678 | /* Similarly success statistics must be moved as pointers */ | ||
679 | new->pcpu_success = n->pcpu_success; | ||
680 | #endif | ||
681 | new->tp = tp; | ||
682 | memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); | ||
683 | |||
684 | tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE); | ||
685 | |||
686 | return new; | ||
687 | } | ||
688 | |||
587 | static int u32_change(struct net *net, struct sk_buff *in_skb, | 689 | static int u32_change(struct net *net, struct sk_buff *in_skb, |
588 | struct tcf_proto *tp, unsigned long base, u32 handle, | 690 | struct tcf_proto *tp, unsigned long base, u32 handle, |
589 | struct nlattr **tca, | 691 | struct nlattr **tca, |
@@ -610,12 +712,27 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, | |||
610 | 712 | ||
611 | n = (struct tc_u_knode *)*arg; | 713 | n = (struct tc_u_knode *)*arg; |
612 | if (n) { | 714 | if (n) { |
715 | struct tc_u_knode *new; | ||
716 | |||
613 | if (TC_U32_KEY(n->handle) == 0) | 717 | if (TC_U32_KEY(n->handle) == 0) |
614 | return -EINVAL; | 718 | return -EINVAL; |
615 | 719 | ||
616 | return u32_set_parms(net, tp, base, | 720 | new = u32_init_knode(tp, n); |
617 | rtnl_dereference(n->ht_up), n, tb, | 721 | if (!new) |
618 | tca[TCA_RATE], ovr); | 722 | return -ENOMEM; |
723 | |||
724 | err = u32_set_parms(net, tp, base, | ||
725 | rtnl_dereference(n->ht_up), new, tb, | ||
726 | tca[TCA_RATE], ovr); | ||
727 | |||
728 | if (err) { | ||
729 | u32_destroy_key(tp, new, false); | ||
730 | return err; | ||
731 | } | ||
732 | |||
733 | u32_replace_knode(tp, tp_c, new); | ||
734 | call_rcu(&n->rcu, u32_delete_key_rcu); | ||
735 | return 0; | ||
619 | } | 736 | } |
620 | 737 | ||
621 | if (tb[TCA_U32_DIVISOR]) { | 738 | if (tb[TCA_U32_DIVISOR]) { |