Diffstat (limited to 'net/sched/sch_fq.c')

 -rw-r--r--  net/sched/sch_fq.c | 137
 1 file changed, 83 insertions(+), 54 deletions(-)
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 32ad015ee8ce..95d843961907 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -88,7 +88,7 @@ struct fq_sched_data {
 	struct fq_flow	internal;	/* for non classified or high prio packets */
 	u32		quantum;
 	u32		initial_quantum;
-	u32		flow_default_rate;/* rate per flow : bytes per second */
+	u32		flow_refill_delay;
 	u32		flow_max_rate;	/* optional max rate per flow */
 	u32		flow_plimit;	/* max packets per flow */
 	struct rb_root	*fq_root;
@@ -115,6 +115,7 @@ static struct fq_flow detached, throttled;
 static void fq_flow_set_detached(struct fq_flow *f)
 {
 	f->next = &detached;
+	f->age = jiffies;
 }
 
 static bool fq_flow_is_detached(const struct fq_flow *f)
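
Note: fq_flow_set_detached() now stamps f->age at the moment a flow is detached, so the enqueue path can later tell how long the flow sat idle (the stamp previously lived at the lone call site in fq_dequeue(); see the hunk at -457 below). A minimal userspace sketch of the sentinel-plus-timestamp idiom, assuming fq_flow_is_detached() simply compares against the address of the static dummy node (the harness and names here are illustrative, not kernel code):

#include <assert.h>

struct flow { struct flow *next; unsigned long age; };

static struct flow detached;	/* address doubles as the "detached" marker */

static void flow_set_detached(struct flow *f, unsigned long now_jiffies)
{
	f->next = &detached;
	f->age = now_jiffies;	/* remember when the flow went idle */
}

static int flow_is_detached(const struct flow *f)
{
	return f->next == &detached;
}

int main(void)
{
	struct flow f = { 0 };

	assert(!flow_is_detached(&f));
	flow_set_detached(&f, 1234);
	assert(flow_is_detached(&f) && f.age == 1234);
	return 0;
}
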
@@ -209,21 +210,15 @@ static void fq_gc(struct fq_sched_data *q,
 	}
 }
 
-static const u8 prio2band[TC_PRIO_MAX + 1] = {
-	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
-};
-
 static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 {
 	struct rb_node **p, *parent;
 	struct sock *sk = skb->sk;
 	struct rb_root *root;
 	struct fq_flow *f;
-	int band;
 
 	/* warning: no starvation prevention... */
-	band = prio2band[skb->priority & TC_PRIO_MAX];
-	if (unlikely(band == 0))
+	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
 		return &q->internal;
 
 	if (unlikely(!sk)) {
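
Note: the deleted prio2band[] table mapped priorities 6 and 7 (its two zero entries) to the internal, unpaced flow; the replacement tests TC_PRIO_CONTROL directly, so only priority 7 bypasses pacing now. A quick userspace check of the new predicate, with the constants copied from include/uapi/linux/pkt_sched.h (goes_to_internal() is a hypothetical helper for illustration):

#include <assert.h>

#define TC_PRIO_MAX	15	/* from include/uapi/linux/pkt_sched.h */
#define TC_PRIO_CONTROL	7

/* hypothetical helper mirroring the new fq_classify() test */
static int goes_to_internal(unsigned int priority)
{
	return (priority & TC_PRIO_MAX) == TC_PRIO_CONTROL;
}

int main(void)
{
	assert(goes_to_internal(TC_PRIO_CONTROL));	/* control frames */
	assert(!goes_to_internal(0));			/* best effort */
	assert(goes_to_internal(16 + TC_PRIO_CONTROL));	/* only low bits count */
	return 0;
}
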
@@ -255,6 +250,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 			     f->socket_hash != sk->sk_hash)) {
 			f->credit = q->initial_quantum;
 			f->socket_hash = sk->sk_hash;
+			f->time_next_packet = 0ULL;
 		}
 		return f;
 	}
@@ -285,7 +281,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 
 
 /* remove one skb from head of flow queue */
-static struct sk_buff *fq_dequeue_head(struct fq_flow *flow)
+static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
 {
 	struct sk_buff *skb = flow->head;
 
@@ -293,6 +289,8 @@ static struct sk_buff *fq_dequeue_head(struct fq_flow *flow)
 		flow->head = skb->next;
 		skb->next = NULL;
 		flow->qlen--;
+		sch->qstats.backlog -= qdisc_pkt_len(skb);
+		sch->q.qlen--;
 	}
 	return skb;
 }
@@ -370,17 +368,20 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	}
 
 	f->qlen++;
-	flow_queue_add(f, skb);
 	if (skb_is_retransmit(skb))
 		q->stat_tcp_retrans++;
 	sch->qstats.backlog += qdisc_pkt_len(skb);
 	if (fq_flow_is_detached(f)) {
 		fq_flow_add_tail(&q->new_flows, f);
-		if (q->quantum > f->credit)
-			f->credit = q->quantum;
+		if (time_after(jiffies, f->age + q->flow_refill_delay))
+			f->credit = max_t(u32, f->credit, q->quantum);
 		q->inactive_flows--;
 		qdisc_unthrottled(sch);
 	}
+
+	/* Note: this overwrites f->age */
+	flow_queue_add(f, skb);
+
 	if (unlikely(f == &q->internal)) {
 		q->stat_internal_packets++;
 		qdisc_unthrottled(sch);
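
Note: on re-attach, the credit refill is now gated on the flow having been idle for at least flow_refill_delay, and max_t() tops the credit up instead of overwriting any leftover. A standalone sketch of that gate, with the jiffies helpers modeled as plain macros (userspace stand-ins, not the kernel definitions):

#include <stdio.h>

/* userspace stand-ins for kernel helpers */
#define time_after(a, b)  ((long)((b) - (a)) < 0)
#define max_t(type, a, b) ((type)(a) > (type)(b) ? (type)(a) : (type)(b))

struct flow { unsigned long age; unsigned int credit; };

static void maybe_refill(struct flow *f, unsigned long now_jiffies,
			 unsigned long refill_delay, unsigned int quantum)
{
	/* top up credit only when the flow has been idle long enough;
	 * max_t() keeps any leftover credit instead of clobbering it
	 */
	if (time_after(now_jiffies, f->age + refill_delay))
		f->credit = max_t(unsigned int, f->credit, quantum);
}

int main(void)
{
	struct flow f = { .age = 1000, .credit = 100 };

	maybe_refill(&f, 1010, 40, 3028);	/* idle 10 < 40: no refill */
	printf("credit=%u\n", f.credit);	/* prints 100 */
	maybe_refill(&f, 1050, 40, 3028);	/* idle 50 > 40: refill */
	printf("credit=%u\n", f.credit);	/* prints 3028 */
	return 0;
}
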
@@ -418,8 +419,9 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 	struct fq_flow_head *head;
 	struct sk_buff *skb;
 	struct fq_flow *f;
+	u32 rate;
 
-	skb = fq_dequeue_head(&q->internal);
+	skb = fq_dequeue_head(sch, &q->internal);
 	if (skb)
 		goto out;
 	fq_check_throttled(q, now);
@@ -449,7 +451,7 @@ begin:
 		goto begin;
 	}
 
-	skb = fq_dequeue_head(f);
+	skb = fq_dequeue_head(sch, f);
 	if (!skb) {
 		head->first = f->next;
 		/* force a pass through old_flows to prevent starvation */
@@ -457,7 +459,6 @@ begin:
 			fq_flow_add_tail(&q->old_flows, f);
 		} else {
 			fq_flow_set_detached(f);
-			f->age = jiffies;
 			q->inactive_flows++;
 		}
 		goto begin;
@@ -466,43 +467,70 @@ begin:
 	f->time_next_packet = now;
 	f->credit -= qdisc_pkt_len(skb);
 
-	if (f->credit <= 0 &&
-	    q->rate_enable &&
-	    skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) {
-		u32 rate = skb->sk->sk_pacing_rate ?: q->flow_default_rate;
+	if (f->credit > 0 || !q->rate_enable)
+		goto out;
 
-		rate = min(rate, q->flow_max_rate);
-		if (rate) {
-			u64 len = (u64)qdisc_pkt_len(skb) * NSEC_PER_SEC;
+	rate = q->flow_max_rate;
+	if (skb->sk && skb->sk->sk_state != TCP_TIME_WAIT)
+		rate = min(skb->sk->sk_pacing_rate, rate);
 
-			do_div(len, rate);
-			/* Since socket rate can change later,
-			 * clamp the delay to 125 ms.
-			 * TODO: maybe segment the too big skb, as in commit
-			 * e43ac79a4bc ("sch_tbf: segment too big GSO packets")
-			 */
-			if (unlikely(len > 125 * NSEC_PER_MSEC)) {
-				len = 125 * NSEC_PER_MSEC;
-				q->stat_pkts_too_long++;
-			}
+	if (rate != ~0U) {
+		u32 plen = max(qdisc_pkt_len(skb), q->quantum);
+		u64 len = (u64)plen * NSEC_PER_SEC;
 
-			f->time_next_packet = now + len;
+		if (likely(rate))
+			do_div(len, rate);
+		/* Since socket rate can change later,
+		 * clamp the delay to 125 ms.
+		 * TODO: maybe segment the too big skb, as in commit
+		 * e43ac79a4bc ("sch_tbf: segment too big GSO packets")
+		 */
+		if (unlikely(len > 125 * NSEC_PER_MSEC)) {
+			len = 125 * NSEC_PER_MSEC;
+			q->stat_pkts_too_long++;
 		}
+
+		f->time_next_packet = now + len;
 	}
 out:
-	sch->qstats.backlog -= qdisc_pkt_len(skb);
 	qdisc_bstats_update(sch, skb);
-	sch->q.qlen--;
 	qdisc_unthrottled(sch);
 	return skb;
 }
 
 static void fq_reset(struct Qdisc *sch)
 {
+	struct fq_sched_data *q = qdisc_priv(sch);
+	struct rb_root *root;
 	struct sk_buff *skb;
+	struct rb_node *p;
+	struct fq_flow *f;
+	unsigned int idx;
 
-	while ((skb = fq_dequeue(sch)) != NULL)
+	while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL)
 		kfree_skb(skb);
+
+	if (!q->fq_root)
+		return;
+
+	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
+		root = &q->fq_root[idx];
+		while ((p = rb_first(root)) != NULL) {
+			f = container_of(p, struct fq_flow, fq_node);
+			rb_erase(p, root);
+
+			while ((skb = fq_dequeue_head(sch, f)) != NULL)
+				kfree_skb(skb);
+
+			kmem_cache_free(fq_flow_cachep, f);
+		}
+	}
+	q->new_flows.first = NULL;
+	q->old_flows.first = NULL;
+	q->delayed = RB_ROOT;
+	q->flows = 0;
+	q->inactive_flows = 0;
+	q->throttled_flows = 0;
 }
 
 static void fq_rehash(struct fq_sched_data *q,
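
Note: the pacing delay is now computed from max(packet length, quantum), so small frames (pure ACKs, for instance) no longer earn near-zero inter-packet gaps, and ~0U cleanly means "no limit" since flow_max_rate defaults to ~0U. A userspace sketch of the arithmetic, with do_div() replaced by plain 64-bit division (the sample values are illustrative):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC  1000000000ULL
#define NSEC_PER_MSEC 1000000ULL

/* pacing gap in ns, mirroring the patched fq_dequeue() logic */
static uint64_t pacing_delay_ns(uint32_t pkt_len, uint32_t quantum,
				uint32_t rate_Bps)
{
	uint32_t plen = pkt_len > quantum ? pkt_len : quantum;
	uint64_t len = (uint64_t)plen * NSEC_PER_SEC;

	if (rate_Bps)
		len /= rate_Bps;
	if (len > 125 * NSEC_PER_MSEC)	/* socket rate can change: clamp */
		len = 125 * NSEC_PER_MSEC;
	return len;
}

int main(void)
{
	/* a 64-byte ACK at 1 MB/s is paced as if quantum-sized: ~3 ms */
	printf("%llu ns\n",
	       (unsigned long long)pacing_delay_ns(64, 3028, 1000000));
	/* a 64 KB GSO blob at 1 MB/s: ~65.5 ms, still under the clamp */
	printf("%llu ns\n",
	       (unsigned long long)pacing_delay_ns(65536, 3028, 1000000));
	return 0;
}
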
@@ -584,6 +612,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
 	[TCA_FQ_FLOW_DEFAULT_RATE]	= { .type = NLA_U32 },
 	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 },
 	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 },
+	[TCA_FQ_FLOW_REFILL_DELAY]	= { .type = NLA_U32 },
 };
 
 static int fq_change(struct Qdisc *sch, struct nlattr *opt)
@@ -622,10 +651,11 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
 		q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
 
 	if (tb[TCA_FQ_INITIAL_QUANTUM])
-		q->quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
+		q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
 
 	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
-		q->flow_default_rate = nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]);
+		pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
+				    nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
 
 	if (tb[TCA_FQ_FLOW_MAX_RATE])
 		q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
@@ -639,12 +669,20 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
 			err = -EINVAL;
 	}
 
+	if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
+		u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]);
+
+		q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
+	}
+
 	if (!err)
 		err = fq_resize(q, fq_log);
 
 	while (sch->q.qlen > sch->limit) {
 		struct sk_buff *skb = fq_dequeue(sch);
 
+		if (!skb)
+			break;
 		kfree_skb(skb);
 		drop_count++;
 	}
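
Note: TCA_FQ_FLOW_REFILL_DELAY travels over netlink in microseconds but is stored in jiffies, so the effective delay is quantized by CONFIG_HZ; fq_dump() converts back with jiffies_to_usecs(). A rough round-trip model at an assumed HZ of 250 (these are simplified userspace approximations of the kernel helpers, which round up on the way in):

#include <stdio.h>

#define HZ 250	/* assumption: a common CONFIG_HZ value */

/* rough userspace models of the kernel conversion helpers */
static unsigned long usecs_to_jiffies(unsigned int us)
{
	return (us * (unsigned long)HZ + 999999) / 1000000;	/* round up */
}

static unsigned int jiffies_to_usecs(unsigned long j)
{
	return j * (1000000 / HZ);
}

int main(void)
{
	unsigned int req = 40000;	/* 40 ms requested via netlink */
	unsigned long j = usecs_to_jiffies(req);

	/* 40 ms at HZ=250 is exactly 10 jiffies, so the dump round-trips;
	 * this matches the 40 ms default set via msecs_to_jiffies(40)
	 */
	printf("stored %lu jiffies, dumped %u us\n", j, jiffies_to_usecs(j));
	return 0;
}
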
@@ -657,21 +695,9 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
 static void fq_destroy(struct Qdisc *sch)
 {
 	struct fq_sched_data *q = qdisc_priv(sch);
-	struct rb_root *root;
-	struct rb_node *p;
-	unsigned int idx;
 
-	if (q->fq_root) {
-		for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
-			root = &q->fq_root[idx];
-			while ((p = rb_first(root)) != NULL) {
-				rb_erase(p, root);
-				kmem_cache_free(fq_flow_cachep,
-						container_of(p, struct fq_flow, fq_node));
-			}
-		}
-		kfree(q->fq_root);
-	}
+	fq_reset(sch);
+	kfree(q->fq_root);
 	qdisc_watchdog_cancel(&q->watchdog);
 }
 
@@ -684,7 +710,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
 	q->flow_plimit		= 100;
 	q->quantum		= 2 * psched_mtu(qdisc_dev(sch));
 	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch));
-	q->flow_default_rate	= 0;
+	q->flow_refill_delay	= msecs_to_jiffies(40);
 	q->flow_max_rate	= ~0U;
 	q->rate_enable		= 1;
 	q->new_flows.first	= NULL;
@@ -711,13 +737,16 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (opts == NULL)
 		goto nla_put_failure;
 
+	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
+
 	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
 	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
 	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
 	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
 	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
-	    nla_put_u32(skb, TCA_FQ_FLOW_DEFAULT_RATE, q->flow_default_rate) ||
 	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
+	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
+			jiffies_to_usecs(q->flow_refill_delay)) ||
 	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
 		goto nla_put_failure;
 