author		Eric Dumazet <eric.dumazet@gmail.com>	2012-01-06 01:31:44 -0500
committer	David S. Miller <davem@davemloft.net>	2012-01-12 23:05:28 -0500
commit		ddecf0f4db44ef94847a62d6ecf74456b4dcc66f (patch)
tree		48c49b70f83d713c7e3f594e067ee726bb70777e
parent		72092cc45378176ba700034c91b7af2db524df26 (diff)
net_sched: sfq: add optional RED on top of SFQ
Adds an optional Random Early Detection on each SFQ flow queue.

Traditional SFQ limits the count of packets, while RED also permits
controlling the number of bytes per flow, and adds ECN capability as
well.

1) We don't handle the idle time management in this RED implementation,
   since each 'new flow' begins with a null qavg. We really want to
   address backlogged flows.

2) If headdrop is selected, we try to ECN-mark the first packet instead
   of the currently enqueued packet. This gives faster feedback for TCP
   flows, compared to traditional RED [ marking the last packet in
   queue ].

Example of use:

tc qdisc add dev $DEV parent 1:1 handle 10: est 1sec 4sec sfq \
	limit 3000 headdrop flows 512 divisor 16384 \
	redflowlimit 100000 min 8000 max 60000 probability 0.20 ecn

qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
flows 512/16384 divisor 16384
 ewma 6 min 8000b max 60000b probability 0.2 ecn
 prob_mark 0 prob_mark_head 4876 prob_drop 6131
 forced_mark 0 forced_mark_head 0 forced_drop 0
 Sent 1175211782 bytes 777537 pkt (dropped 6131, overlimits 11007
 requeues 0)
 rate 99483Kbit 8219pps backlog 689392b 456p requeues 0

In this test, with 64 netperf TCP_STREAM sessions, 50% using ECN-enabled
flows, we can see the number of CE-marked packets is smaller than the
number of drops (for non-ECN flows).

If the same test is run without RED, we can check that the backlog is
much bigger.

qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
flows 512/16384 divisor 16384
 Sent 1148683617 bytes 795006 pkt (dropped 0, overlimits 0 requeues 0)
 rate 98429Kbit 8521pps backlog 1221290b 841p requeues 0

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Stephen Hemminger <shemminger@vyatta.com>
CC: Dave Taht <dave.taht@gmail.com>
Tested-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
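As a companion to the description above, here is a minimal user-space
sketch of the per-flow decision: an EWMA of the flow backlog is compared
against the min/max thresholds to produce a RED verdict. It models the
arithmetic with plain integers rather than the kernel's Wlog fixed-point
scaling, and every name in it (flow_red, flow_red_action, ...) is
illustrative rather than a kernel symbol:

#include <stdio.h>

enum verdict { DONT_MARK, PROB_MARK, HARD_MARK };

struct flow_red {
	int qavg;     /* EWMA of the flow backlog, in bytes */
	int qth_min;  /* min average threshold, bytes */
	int qth_max;  /* max average threshold, bytes */
	int wlog;     /* EWMA weight: qavg moves by 2^-wlog per update */
};

/* Update the average for one enqueue and return the RED verdict. */
static enum verdict flow_red_action(struct flow_red *f, int backlog)
{
	f->qavg += (backlog - f->qavg) / (1 << f->wlog);
	if (f->qavg < f->qth_min)
		return DONT_MARK;
	if (f->qavg >= f->qth_max)
		return HARD_MARK;
	return PROB_MARK;
}

int main(void)
{
	struct flow_red f = { 0, 8000, 60000, 6 }; /* mirrors the example */
	int backlog = 0;

	for (int i = 0; i < 512; i++) {
		backlog += 1514;	/* enqueue with no dequeue: flow builds up */
		if (flow_red_action(&f, backlog) != DONT_MARK) {
			printf("packet %d: backlog=%db qavg=%db -> mark/drop\n",
			       i, backlog, f.qavg);
			break;
		}
	}
	return 0;
}

Between the two thresholds the real implementation marks or drops with a
probability derived from max_P; the sketch collapses that to a single
PROB_MARK verdict.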
-rw-r--r--	include/linux/pkt_sched.h	 20
-rw-r--r--	include/net/red.h		  3
-rw-r--r--	net/sched/sch_sfq.c		146
3 files changed, 152 insertions(+), 17 deletions(-)
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 8f1b928f777c..0d5b79365d03 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -162,10 +162,30 @@ struct tc_sfq_qopt {
 	unsigned	flows;		/* Maximal number of flows */
 };
 
+struct tc_sfqred_stats {
+	__u32           prob_drop;      /* Early drops, below max threshold */
+	__u32           forced_drop;	/* Early drops, after max threshold */
+	__u32           prob_mark;      /* Marked packets, below max threshold */
+	__u32           forced_mark;    /* Marked packets, after max threshold */
+	__u32           prob_mark_head; /* Marked packets, below max threshold */
+	__u32           forced_mark_head;/* Marked packets, after max threshold */
+};
+
 struct tc_sfq_qopt_v1 {
 	struct tc_sfq_qopt v0;
 	unsigned int	depth;		/* max number of packets per flow */
 	unsigned int	headdrop;
+/* SFQRED parameters */
+	__u32		limit;		/* HARD maximal flow queue length (bytes) */
+	__u32		qth_min;	/* Min average length threshold (bytes) */
+	__u32		qth_max;	/* Max average length threshold (bytes) */
+	unsigned char	Wlog;		/* log(W) */
+	unsigned char	Plog;		/* log(P_max/(qth_max-qth_min)) */
+	unsigned char	Scell_log;	/* cell size for idle damping */
+	unsigned char	flags;
+	__u32		max_P;		/* probability, high resolution */
+/* SFQRED stats */
+	struct tc_sfqred_stats stats;
 };
 
 
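The new max_P field carries the marking probability in high resolution.
Assuming the fixed-point convention iproute2 uses (the probability
scaled by 2^32), the "probability 0.20" from the tc example above would
map to max_P as sketched here; this is an illustration, not part of the
patch:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	double prob = 0.20;	/* "probability 0.20" from the tc example */
	uint32_t max_P = (uint32_t)(prob * 4294967296.0);	/* prob * 2^32 */

	printf("max_P = %u (0x%08x) -> %.4f\n",
	       max_P, max_P, max_P / 4294967296.0);
	return 0;
}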
diff --git a/include/net/red.h b/include/net/red.h
index baab385a4736..28068ec614b2 100644
--- a/include/net/red.h
+++ b/include/net/red.h
@@ -199,7 +199,8 @@ static inline void red_set_parms(struct red_parms *p,
 	p->Scell_log	= Scell_log;
 	p->Scell_max	= (255 << Scell_log);
 
-	memcpy(p->Stab, stab, sizeof(p->Stab));
+	if (stab)
+		memcpy(p->Stab, stab, sizeof(p->Stab));
 }
 
 static inline int red_is_idling(const struct red_vars *v)
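A hedged user-space model of the guarded copy this hunk introduces:
callers that have no idle-damping Stab table may now pass NULL, which is
exactly what sch_sfq does further down. The struct and function names
below are stand-ins, not the kernel's:

#include <stdio.h>
#include <string.h>

struct parms { unsigned char stab[256]; };

static void set_parms(struct parms *p, const unsigned char *stab)
{
	if (stab)	/* the fix: copy only when a table is provided */
		memcpy(p->stab, stab, sizeof(p->stab));
}

int main(void)
{
	struct parms p = { { 0 } };

	set_parms(&p, NULL);	/* before the fix, this was a memcpy from NULL */
	printf("stab[0] = %u\n", p.stab[0]);
	return 0;
}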
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 0a7964009e8c..67494aef9acf 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -24,6 +24,7 @@
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 #include <net/flow_keys.h>
+#include <net/red.h>
 
 
 /* Stochastic Fairness Queuing algorithm.
@@ -108,24 +109,30 @@ struct sfq_slot {
 	struct sfq_head dep; /* anchor in dep[] chains */
 	unsigned short	hash; /* hash value (index in ht[]) */
 	short		allot; /* credit for this slot */
+
+	unsigned int	backlog;
+	struct red_vars vars;
 };
 
 struct sfq_sched_data {
 /* frequently used fields */
 	int		limit;		/* limit of total number of packets in this qdisc */
 	unsigned int	divisor;	/* number of slots in hash table */
-	unsigned int	maxflows;	/* number of flows in flows array */
-	int		headdrop;
-	int		maxdepth;	/* limit of packets per flow */
+	u8		headdrop;
+	u8		maxdepth;	/* limit of packets per flow */
 
 	u32		perturbation;
-	struct tcf_proto *filter_list;
-	sfq_index	cur_depth;	/* depth of longest slot */
+	u8		cur_depth;	/* depth of longest slot */
+	u8		flags;
 	unsigned short	scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
-	struct sfq_slot *tail;		/* current slot in round */
+	struct tcf_proto *filter_list;
 	sfq_index	*ht;		/* Hash table ('divisor' slots) */
 	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */
 
+	struct red_parms *red_parms;
+	struct tc_sfqred_stats stats;
+	struct sfq_slot *tail;		/* current slot in round */
+
 	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
 					/* Linked lists of slots, indexed by depth
 					 * dep[0] : list of unused flows
@@ -133,6 +140,7 @@ struct sfq_sched_data {
 					 * dep[X] : list of flows with X packets
 					 */
 
+	unsigned int	maxflows;	/* number of flows in flows array */
 	int		perturb_period;
 	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
 	struct timer_list perturb_timer;
@@ -321,6 +329,7 @@ static unsigned int sfq_drop(struct Qdisc *sch)
 drop:
 		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
 		len = qdisc_pkt_len(skb);
+		slot->backlog -= len;
 		sfq_dec(q, x);
 		kfree_skb(skb);
 		sch->q.qlen--;
@@ -341,6 +350,23 @@ drop:
 	return 0;
 }
 
+/* Is ECN parameter configured */
+static int sfq_prob_mark(const struct sfq_sched_data *q)
+{
+	return q->flags & TC_RED_ECN;
+}
+
+/* Should packets over max threshold just be marked */
+static int sfq_hard_mark(const struct sfq_sched_data *q)
+{
+	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
+}
+
+static int sfq_headdrop(const struct sfq_sched_data *q)
+{
+	return q->headdrop;
+}
+
 static int
 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
@@ -349,6 +375,8 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	sfq_index x, qlen;
 	struct sfq_slot *slot;
 	int uninitialized_var(ret);
+	struct sk_buff *head;
+	int delta;
 
 	hash = sfq_classify(skb, sch, &ret);
 	if (hash == 0) {
@@ -368,24 +396,75 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		q->ht[hash] = x;
 		slot = &q->slots[x];
 		slot->hash = hash;
+		slot->backlog = 0; /* should already be 0 anyway... */
+		red_set_vars(&slot->vars);
+		goto enqueue;
 	}
+	if (q->red_parms) {
+		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
+							&slot->vars,
+							slot->backlog);
+		switch (red_action(q->red_parms,
+				   &slot->vars,
+				   slot->vars.qavg)) {
+		case RED_DONT_MARK:
+			break;
 
-	if (slot->qlen >= q->maxdepth) {
-		struct sk_buff *head;
+		case RED_PROB_MARK:
+			sch->qstats.overlimits++;
+			if (sfq_prob_mark(q)) {
+				/* We know we have at least one packet in queue */
+				if (sfq_headdrop(q) &&
+				    INET_ECN_set_ce(slot->skblist_next)) {
+					q->stats.prob_mark_head++;
+					break;
+				}
+				if (INET_ECN_set_ce(skb)) {
+					q->stats.prob_mark++;
+					break;
+				}
+			}
+			q->stats.prob_drop++;
+			goto congestion_drop;
+
+		case RED_HARD_MARK:
+			sch->qstats.overlimits++;
+			if (sfq_hard_mark(q)) {
+				/* We know we have at least one packet in queue */
+				if (sfq_headdrop(q) &&
+				    INET_ECN_set_ce(slot->skblist_next)) {
+					q->stats.forced_mark_head++;
+					break;
+				}
+				if (INET_ECN_set_ce(skb)) {
+					q->stats.forced_mark++;
+					break;
+				}
+			}
+			q->stats.forced_drop++;
+			goto congestion_drop;
+		}
+	}
 
-		if (!q->headdrop)
+	if (slot->qlen >= q->maxdepth) {
+congestion_drop:
+		if (!sfq_headdrop(q))
 			return qdisc_drop(skb, sch);
 
+		/* We know we have at least one packet in queue */
 		head = slot_dequeue_head(slot);
-		sch->qstats.backlog -= qdisc_pkt_len(head);
+		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
+		sch->qstats.backlog -= delta;
+		slot->backlog -= delta;
 		qdisc_drop(head, sch);
 
-		sch->qstats.backlog += qdisc_pkt_len(skb);
 		slot_queue_add(slot, skb);
 		return NET_XMIT_CN;
 	}
 
+enqueue:
 	sch->qstats.backlog += qdisc_pkt_len(skb);
+	slot->backlog += qdisc_pkt_len(skb);
 	slot_queue_add(slot, skb);
 	sfq_inc(q, x);
 	if (slot->qlen == 1) {		/* The flow is new */
@@ -396,6 +475,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 			slot->next = q->tail->next;
 			q->tail->next = x;
 		}
+		/* We could use a bigger initial quantum for new flows */
 		slot->allot = q->scaled_quantum;
 	}
 	if (++sch->q.qlen <= q->limit)
@@ -439,7 +519,7 @@ next_slot:
 	qdisc_bstats_update(sch, skb);
 	sch->q.qlen--;
 	sch->qstats.backlog -= qdisc_pkt_len(skb);
-
+	slot->backlog -= qdisc_pkt_len(skb);
 	/* Is the slot empty? */
 	if (slot->qlen == 0) {
 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
@@ -490,6 +570,8 @@ static void sfq_rehash(struct Qdisc *sch)
 			sfq_dec(q, i);
 			__skb_queue_tail(&list, skb);
 		}
+		slot->backlog = 0;
+		red_set_vars(&slot->vars);
 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 	}
 	q->tail = NULL;
@@ -514,6 +596,11 @@ drop: sch->qstats.backlog -= qdisc_pkt_len(skb);
 		if (slot->qlen >= q->maxdepth)
 			goto drop;
 		slot_queue_add(slot, skb);
+		if (q->red_parms)
+			slot->vars.qavg = red_calc_qavg(q->red_parms,
+							&slot->vars,
+							slot->backlog);
+		slot->backlog += qdisc_pkt_len(skb);
 		sfq_inc(q, x);
 		if (slot->qlen == 1) {		/* The flow is new */
 			if (q->tail == NULL) {	/* It is the first flow */
@@ -552,6 +639,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 	struct tc_sfq_qopt *ctl = nla_data(opt);
 	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
 	unsigned int qlen;
+	struct red_parms *p = NULL;
 
 	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
 		return -EINVAL;
@@ -560,7 +648,11 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 	if (ctl->divisor &&
 	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
 		return -EINVAL;
-
+	if (ctl_v1 && ctl_v1->qth_min) {
+		p = kmalloc(sizeof(*p), GFP_KERNEL);
+		if (!p)
+			return -ENOMEM;
+	}
 	sch_tree_lock(sch);
 	if (ctl->quantum) {
 		q->quantum = ctl->quantum;
@@ -576,6 +668,16 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 	if (ctl_v1) {
 		if (ctl_v1->depth)
 			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
+		if (p) {
+			swap(q->red_parms, p);
+			red_set_parms(q->red_parms,
+				      ctl_v1->qth_min, ctl_v1->qth_max,
+				      ctl_v1->Wlog,
+				      ctl_v1->Plog, ctl_v1->Scell_log,
+				      NULL,
+				      ctl_v1->max_P);
+		}
+		q->flags = ctl_v1->flags;
 		q->headdrop = ctl_v1->headdrop;
 	}
 	if (ctl->limit) {
@@ -594,6 +696,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 		q->perturbation = net_random();
 	}
 	sch_tree_unlock(sch);
+	kfree(p);
 	return 0;
 }
 
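One design choice above deserves a remark: the new red_parms block is
allocated before sch_tree_lock() (so the sleeping GFP_KERNEL allocation
stays outside the lock), swapped in under the lock, and the previous
block is freed after unlock. A hedged user-space model of the same
pattern, with a mutex standing in for the qdisc tree lock and all names
ours:

#include <pthread.h>
#include <stdlib.h>

struct params { int qth_min, qth_max; };

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static struct params *active;

static int change(const struct params *new_cfg)
{
	struct params *p = NULL;

	if (new_cfg) {
		p = malloc(sizeof(*p));	/* may sleep: do it outside the lock */
		if (!p)
			return -1;
		*p = *new_cfg;
	}
	pthread_mutex_lock(&tree_lock);
	if (p) {
		struct params *tmp = active;	/* swap(), as in the patch */
		active = p;
		p = tmp;
	}
	pthread_mutex_unlock(&tree_lock);
	free(p);	/* the old block, or NULL */
	return 0;
}

int main(void)
{
	struct params cfg = { 8000, 60000 };
	int ret = change(&cfg);

	free(active);	/* single-threaded here, so no lock needed */
	return ret ? 1 : 0;
}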
@@ -625,6 +728,7 @@ static void sfq_destroy(struct Qdisc *sch)
 	del_timer_sync(&q->perturb_timer);
 	sfq_free(q->ht);
 	sfq_free(q->slots);
+	kfree(q->red_parms);
 }
 
 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
@@ -683,6 +787,7 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	struct sfq_sched_data *q = qdisc_priv(sch);
 	unsigned char *b = skb_tail_pointer(skb);
 	struct tc_sfq_qopt_v1 opt;
+	struct red_parms *p = q->red_parms;
 
 	memset(&opt, 0, sizeof(opt));
 	opt.v0.quantum	= q->quantum;
@@ -693,6 +798,17 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	opt.depth	= q->maxdepth;
 	opt.headdrop	= q->headdrop;
 
+	if (p) {
+		opt.qth_min	= p->qth_min >> p->Wlog;
+		opt.qth_max	= p->qth_max >> p->Wlog;
+		opt.Wlog	= p->Wlog;
+		opt.Plog	= p->Plog;
+		opt.Scell_log	= p->Scell_log;
+		opt.max_P	= p->max_P;
+	}
+	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
+	opt.flags	= q->flags;
+
 	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
 
 	return skb->len;
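A subtlety in the dump above: red_set_parms() stores qth_min/qth_max
pre-shifted left by Wlog, so they compare directly against the scaled
average, which is why sfq_dump() shifts them back before reporting to
userspace. A tiny round-trip sketch with illustrative values:

#include <stdio.h>

int main(void)
{
	unsigned int qth_min = 8000, Wlog = 6;	/* values from the tc example */
	unsigned int stored = qth_min << Wlog;	/* as kept inside red_parms */

	printf("stored = %u, dumped = %u\n", stored, stored >> Wlog);
	return 0;
}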
@@ -747,15 +863,13 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 	sfq_index idx = q->ht[cl - 1];
 	struct gnet_stats_queue qs = { 0 };
 	struct tc_sfq_xstats xstats = { 0 };
-	struct sk_buff *skb;
 
 	if (idx != SFQ_EMPTY_SLOT) {
 		const struct sfq_slot *slot = &q->slots[idx];
 
 		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
 		qs.qlen = slot->qlen;
-		slot_queue_walk(slot, skb)
-			qs.backlog += qdisc_pkt_len(skb);
+		qs.backlog = slot->backlog;
 	}
 	if (gnet_stats_copy_queue(d, &qs) < 0)
 		return -1;
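Finally, the class-stats hunk above swaps an O(qlen) queue walk for the
per-slot byte counter the rest of the patch keeps in sync. A hedged
user-space model of that bookkeeping, where the counter is updated on
every enqueue and dequeue so a stats read is O(1); the names are ours,
not the kernel's:

#include <assert.h>
#include <stdio.h>

struct flow {
	unsigned int qlen;	/* packets queued */
	unsigned int backlog;	/* bytes queued, maintained incrementally */
};

static void enqueue(struct flow *f, unsigned int len)
{
	f->qlen++;
	f->backlog += len;
}

static void dequeue(struct flow *f, unsigned int len)
{
	assert(f->qlen > 0 && f->backlog >= len);
	f->qlen--;
	f->backlog -= len;
}

int main(void)
{
	struct flow f = { 0, 0 };

	enqueue(&f, 1514);
	enqueue(&f, 66);
	dequeue(&f, 1514);
	printf("qlen=%u backlog=%ub\n", f.qlen, f.backlog);	/* 1, 66b */
	return 0;
}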