Diffstat (limited to 'net/sched')
-rw-r--r--  net/sched/Kconfig       |   2
-rw-r--r--  net/sched/act_api.c     |   3
-rw-r--r--  net/sched/cls_api.c     |   2
-rw-r--r--  net/sched/cls_cgroup.c  |  24
-rw-r--r--  net/sched/sch_api.c     |  20
-rw-r--r--  net/sched/sch_cbq.c     |   3
-rw-r--r--  net/sched/sch_generic.c |  11
-rw-r--r--  net/sched/sch_htb.c     | 139
-rw-r--r--  net/sched/sch_mq.c      |   4
-rw-r--r--  net/sched/sch_mqprio.c  |   4
-rw-r--r--  net/sched/sch_qfq.c     | 830
11 files changed, 717 insertions, 325 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 62fb51face8a..235e01acac51 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -509,7 +509,7 @@ config NET_EMATCH_TEXT
 
 config NET_EMATCH_CANID
 	tristate "CAN Identifier"
-	depends on NET_EMATCH && CAN
+	depends on NET_EMATCH && (CAN=y || CAN=m)
 	---help---
 	  Say Y here if you want to be able to classify CAN frames based
 	  on CAN Identifier.
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 102761d294cb..65d240cbf74b 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -987,6 +987,9 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 	u32 portid = skb ? NETLINK_CB(skb).portid : 0;
 	int ret = 0, ovr = 0;
 
+	if ((n->nlmsg_type != RTM_GETACTION) && !capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL);
 	if (ret < 0)
 		return ret;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 7ae02892437c..ff55ed6c49b2 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -139,6 +139,8 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 	int err;
 	int tp_created = 0;
 
+	if ((n->nlmsg_type != RTM_GETTFILTER) && !capable(CAP_NET_ADMIN))
+		return -EPERM;
 replay:
 	t = nlmsg_data(n);
 	protocol = TC_H_MIN(t->tcm_info);
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 31f06b633574..6db7855b9029 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -17,6 +17,7 @@
 #include <linux/skbuff.h>
 #include <linux/cgroup.h>
 #include <linux/rcupdate.h>
+#include <linux/fdtable.h>
 #include <net/rtnetlink.h>
 #include <net/pkt_cls.h>
 #include <net/sock.h>
@@ -57,6 +58,28 @@ static void cgrp_css_free(struct cgroup *cgrp)
 	kfree(cgrp_cls_state(cgrp));
 }
 
+static int update_classid(const void *v, struct file *file, unsigned n)
+{
+	int err;
+	struct socket *sock = sock_from_file(file, &err);
+	if (sock)
+		sock->sk->sk_classid = (u32)(unsigned long)v;
+	return 0;
+}
+
+static void cgrp_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+{
+	struct task_struct *p;
+	void *v;
+
+	cgroup_taskset_for_each(p, cgrp, tset) {
+		task_lock(p);
+		v = (void *)(unsigned long)task_cls_classid(p);
+		iterate_fd(p->files, 0, update_classid, v);
+		task_unlock(p);
+	}
+}
+
 static u64 read_classid(struct cgroup *cgrp, struct cftype *cft)
 {
 	return cgrp_cls_state(cgrp)->classid;
@@ -82,6 +105,7 @@ struct cgroup_subsys net_cls_subsys = {
 	.css_alloc	= cgrp_css_alloc,
 	.css_online	= cgrp_css_online,
 	.css_free	= cgrp_css_free,
+	.attach		= cgrp_attach,
 	.subsys_id	= net_cls_subsys_id,
 	.base_cftypes	= ss_files,
 	.module		= THIS_MODULE,
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index a18d975db59c..d84f7e734cd7 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -495,16 +495,15 @@ EXPORT_SYMBOL(qdisc_watchdog_init);
 
 void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
 {
-	ktime_t time;
-
 	if (test_bit(__QDISC_STATE_DEACTIVATED,
 		     &qdisc_root_sleeping(wd->qdisc)->state))
 		return;
 
 	qdisc_throttled(wd->qdisc);
-	time = ktime_set(0, 0);
-	time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
-	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
+
+	hrtimer_start(&wd->timer,
+		      ns_to_ktime(PSCHED_TICKS2NS(expires)),
+		      HRTIMER_MODE_ABS);
 }
 EXPORT_SYMBOL(qdisc_watchdog_schedule);
 
@@ -834,6 +833,8 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 			goto err_out3;
 		}
 		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
+		if (!netif_is_multiqueue(dev))
+			sch->flags |= TCQ_F_ONETXQUEUE;
 	}
 
 	sch->handle = handle;
@@ -981,6 +982,9 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 	struct Qdisc *p = NULL;
 	int err;
 
+	if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
 	if (!dev)
 		return -ENODEV;
@@ -1044,6 +1048,9 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 	struct Qdisc *q, *p;
 	int err;
 
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 replay:
 	/* Reinit, just in case something touches this. */
 	tcm = nlmsg_data(n);
@@ -1380,6 +1387,9 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 	u32 qid = TC_H_MAJ(clid);
 	int err;
 
+	if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
 	if (!dev)
 		return -ENODEV;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 564b9fc8efd3..0e19948470b8 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -509,8 +509,7 @@ static void cbq_ovl_delay(struct cbq_class *cl)
 		cl->cpriority = TC_CBQ_MAXPRIO;
 		q->pmask |= (1<<TC_CBQ_MAXPRIO);
 
-		expires = ktime_set(0, 0);
-		expires = ktime_add_ns(expires, PSCHED_TICKS2NS(sched));
+		expires = ns_to_ktime(PSCHED_TICKS2NS(sched));
 		if (hrtimer_try_to_cancel(&q->delay_timer) &&
 		    ktime_to_ns(ktime_sub(
 				hrtimer_get_expires(&q->delay_timer),
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index aefc1504dc88..5d81a4478514 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -53,20 +53,19 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
 {
 	struct sk_buff *skb = q->gso_skb;
+	const struct netdev_queue *txq = q->dev_queue;
 
 	if (unlikely(skb)) {
-		struct net_device *dev = qdisc_dev(q);
-		struct netdev_queue *txq;
-
 		/* check the reason of requeuing without tx lock first */
-		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+		txq = netdev_get_tx_queue(txq->dev, skb_get_queue_mapping(skb));
 		if (!netif_xmit_frozen_or_stopped(txq)) {
 			q->gso_skb = NULL;
 			q->q.qlen--;
 		} else
 			skb = NULL;
 	} else {
-		skb = q->dequeue(q);
+		if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq))
+			skb = q->dequeue(q);
 	}
 
 	return skb;
@@ -686,6 +685,8 @@ static void attach_one_default_qdisc(struct net_device *dev,
 			netdev_info(dev, "activation failed\n");
 			return;
 		}
+		if (!netif_is_multiqueue(dev))
+			qdisc->flags |= TCQ_F_ONETXQUEUE;
 	}
 	dev_queue->qdisc_sleeping = qdisc;
 }
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 9d75b7761313..d2922c0ef57a 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -71,6 +71,12 @@ enum htb_cmode {
 	HTB_CAN_SEND		/* class can send */
 };
 
+struct htb_rate_cfg {
+	u64 rate_bps;
+	u32 mult;
+	u32 shift;
+};
+
 /* interior & leaf nodes; props specific to leaves are marked L: */
 struct htb_class {
 	struct Qdisc_class_common common;
@@ -118,11 +124,11 @@ struct htb_class {
 	int filter_cnt;
 
 	/* token bucket parameters */
-	struct qdisc_rate_table *rate;	/* rate table of the class itself */
-	struct qdisc_rate_table *ceil;	/* ceiling rate (limits borrows too) */
-	long buffer, cbuffer;		/* token bucket depth/rate */
+	struct htb_rate_cfg rate;
+	struct htb_rate_cfg ceil;
+	s64 buffer, cbuffer;		/* token bucket depth/rate */
 	psched_tdiff_t mbuffer;		/* max wait time */
-	long tokens, ctokens;		/* current number of tokens */
+	s64 tokens, ctokens;		/* current number of tokens */
 	psched_time_t t_c;		/* checkpoint time */
 };
 
@@ -162,6 +168,45 @@ struct htb_sched {
 	struct work_struct work;
 };
 
+static u64 l2t_ns(struct htb_rate_cfg *r, unsigned int len)
+{
+	return ((u64)len * r->mult) >> r->shift;
+}
+
+static void htb_precompute_ratedata(struct htb_rate_cfg *r)
+{
+	u64 factor;
+	u64 mult;
+	int shift;
+
+	r->shift = 0;
+	r->mult = 1;
+	/*
+	 * Calibrate mult, shift so that token counting is accurate
+	 * for smallest packet size (64 bytes). Token (time in ns) is
+	 * computed as (bytes * 8) * NSEC_PER_SEC / rate_bps. It will
+	 * work as long as the smallest packet transfer time can be
+	 * accurately represented in nanosec.
+	 */
+	if (r->rate_bps > 0) {
+		/*
+		 * Higher shift gives better accuracy. Find the largest
+		 * shift such that mult fits in 32 bits.
+		 */
+		for (shift = 0; shift < 16; shift++) {
+			r->shift = shift;
+			factor = 8LLU * NSEC_PER_SEC * (1 << r->shift);
+			mult = div64_u64(factor, r->rate_bps);
+			if (mult > UINT_MAX)
+				break;
+		}
+
+		r->shift = shift - 1;
+		factor = 8LLU * NSEC_PER_SEC * (1 << r->shift);
+		r->mult = div64_u64(factor, r->rate_bps);
+	}
+}
+
 /* find class in global hash table using given handle */
 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
 	struct htb_sched *q = qdisc_priv(sch);
@@ -273,7 +318,7 @@ static void htb_add_to_id_tree(struct rb_root *root,
  * already in the queue.
  */
 static void htb_add_to_wait_tree(struct htb_sched *q,
-				 struct htb_class *cl, long delay)
+				 struct htb_class *cl, s64 delay)
 {
 	struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
 
@@ -441,14 +486,14 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
 	htb_remove_class_from_row(q, cl, mask);
 }
 
-static inline long htb_lowater(const struct htb_class *cl)
+static inline s64 htb_lowater(const struct htb_class *cl)
 {
 	if (htb_hysteresis)
 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
 	else
 		return 0;
 }
-static inline long htb_hiwater(const struct htb_class *cl)
+static inline s64 htb_hiwater(const struct htb_class *cl)
 {
 	if (htb_hysteresis)
 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
@@ -469,9 +514,9 @@ static inline long htb_hiwater(const struct htb_class *cl)
  * mode transitions per time unit. The speed gain is about 1/6.
  */
 static inline enum htb_cmode
-htb_class_mode(struct htb_class *cl, long *diff)
+htb_class_mode(struct htb_class *cl, s64 *diff)
 {
-	long toks;
+	s64 toks;
 
 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
 		*diff = -toks;
@@ -495,7 +540,7 @@ htb_class_mode(struct htb_class *cl, long *diff)
  * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
  */
 static void
-htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
+htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
 {
 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
 
@@ -581,26 +626,26 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	return NET_XMIT_SUCCESS;
 }
 
-static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, long diff)
+static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
 {
-	long toks = diff + cl->tokens;
+	s64 toks = diff + cl->tokens;
 
 	if (toks > cl->buffer)
 		toks = cl->buffer;
-	toks -= (long) qdisc_l2t(cl->rate, bytes);
+	toks -= (s64) l2t_ns(&cl->rate, bytes);
 	if (toks <= -cl->mbuffer)
 		toks = 1 - cl->mbuffer;
 
 	cl->tokens = toks;
 }
 
-static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, long diff)
+static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
 {
-	long toks = diff + cl->ctokens;
+	s64 toks = diff + cl->ctokens;
 
 	if (toks > cl->cbuffer)
 		toks = cl->cbuffer;
-	toks -= (long) qdisc_l2t(cl->ceil, bytes);
+	toks -= (s64) l2t_ns(&cl->ceil, bytes);
 	if (toks <= -cl->mbuffer)
 		toks = 1 - cl->mbuffer;
 
@@ -623,10 +668,10 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
 {
 	int bytes = qdisc_pkt_len(skb);
 	enum htb_cmode old_mode;
-	long diff;
+	s64 diff;
 
 	while (cl) {
-		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
+		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
 		if (cl->level >= level) {
 			if (cl->level == level)
 				cl->xstats.lends++;
@@ -673,7 +718,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
 	unsigned long stop_at = start + 2;
 	while (time_before(jiffies, stop_at)) {
 		struct htb_class *cl;
-		long diff;
+		s64 diff;
 		struct rb_node *p = rb_first(&q->wait_pq[level]);
 
 		if (!p)
@@ -684,7 +729,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
 			return cl->pq_key;
 
 		htb_safe_rb_erase(p, q->wait_pq + level);
-		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
+		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
 		htb_change_class_mode(q, cl, &diff);
 		if (cl->cmode != HTB_CAN_SEND)
 			htb_add_to_wait_tree(q, cl, diff);
@@ -871,10 +916,10 @@ ok:
 
 	if (!sch->q.qlen)
 		goto fin;
-	q->now = psched_get_time();
+	q->now = ktime_to_ns(ktime_get());
 	start_at = jiffies;
 
-	next_event = q->now + 5 * PSCHED_TICKS_PER_SEC;
+	next_event = q->now + 5 * NSEC_PER_SEC;
 
 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
 		/* common case optimization - skip event handler quickly */
@@ -884,7 +929,7 @@ ok:
 		if (q->now >= q->near_ev_cache[level]) {
 			event = htb_do_events(q, level, start_at);
 			if (!event)
-				event = q->now + PSCHED_TICKS_PER_SEC;
+				event = q->now + NSEC_PER_SEC;
 			q->near_ev_cache[level] = event;
 		} else
 			event = q->near_ev_cache[level];
@@ -903,10 +948,17 @@ ok:
 		}
 	}
 	sch->qstats.overlimits++;
-	if (likely(next_event > q->now))
-		qdisc_watchdog_schedule(&q->watchdog, next_event);
-	else
+	if (likely(next_event > q->now)) {
+		if (!test_bit(__QDISC_STATE_DEACTIVATED,
+			      &qdisc_root_sleeping(q->watchdog.qdisc)->state)) {
+			ktime_t time = ns_to_ktime(next_event);
+			qdisc_throttled(q->watchdog.qdisc);
+			hrtimer_start(&q->watchdog.timer, time,
+				      HRTIMER_MODE_ABS);
+		}
+	} else {
 		schedule_work(&q->work);
+	}
 fin:
 	return skb;
 }
@@ -1082,9 +1134,9 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
 
 	memset(&opt, 0, sizeof(opt));
 
-	opt.rate = cl->rate->rate;
+	opt.rate.rate = cl->rate.rate_bps >> 3;
 	opt.buffer = cl->buffer;
-	opt.ceil = cl->ceil->rate;
+	opt.ceil.rate = cl->ceil.rate_bps >> 3;
 	opt.cbuffer = cl->cbuffer;
 	opt.quantum = cl->quantum;
 	opt.prio = cl->prio;
@@ -1203,9 +1255,6 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
 		qdisc_destroy(cl->un.leaf.q);
 	}
 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
-	qdisc_put_rtab(cl->rate);
-	qdisc_put_rtab(cl->ceil);
-
 	tcf_destroy_chain(&cl->filter_list);
 	kfree(cl);
 }
@@ -1307,7 +1356,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 	struct htb_sched *q = qdisc_priv(sch);
 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
 	struct nlattr *opt = tca[TCA_OPTIONS];
-	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
 	struct nlattr *tb[__TCA_HTB_MAX];
 	struct tc_htb_opt *hopt;
 
@@ -1326,10 +1374,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
 
 	hopt = nla_data(tb[TCA_HTB_PARMS]);
-
-	rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]);
-	ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]);
-	if (!rtab || !ctab)
+	if (!hopt->rate.rate || !hopt->ceil.rate)
 		goto failure;
 
 	if (!cl) {		/* new class */
@@ -1439,7 +1484,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 	 * is really leaf before changing cl->un.leaf !
 	 */
 	if (!cl->level) {
-		cl->quantum = rtab->rate.rate / q->rate2quantum;
+		cl->quantum = hopt->rate.rate / q->rate2quantum;
 		if (!hopt->quantum && cl->quantum < 1000) {
 			pr_warning(
 				"HTB: quantum of class %X is small. Consider r2q change.\n",
@@ -1460,12 +1505,16 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 
 	cl->buffer = hopt->buffer;
 	cl->cbuffer = hopt->cbuffer;
-	if (cl->rate)
-		qdisc_put_rtab(cl->rate);
-	cl->rate = rtab;
-	if (cl->ceil)
-		qdisc_put_rtab(cl->ceil);
-	cl->ceil = ctab;
+
+	cl->rate.rate_bps = (u64)hopt->rate.rate << 3;
+	cl->ceil.rate_bps = (u64)hopt->ceil.rate << 3;
+
+	htb_precompute_ratedata(&cl->rate);
+	htb_precompute_ratedata(&cl->ceil);
+
+	cl->buffer = hopt->buffer << PSCHED_SHIFT;
+	cl->cbuffer = hopt->buffer << PSCHED_SHIFT;
+
 	sch_tree_unlock(sch);
 
 	qdisc_class_hash_grow(sch, &q->clhash);
@@ -1474,10 +1523,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 	return 0;
 
 failure:
-	if (rtab)
-		qdisc_put_rtab(rtab);
-	if (ctab)
-		qdisc_put_rtab(ctab);
 	return err;
 }
 
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 0a4b2f9a0094..5da78a19ac9a 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -63,6 +63,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
 		if (qdisc == NULL)
 			goto err;
 		priv->qdiscs[ntx] = qdisc;
+		qdisc->flags |= TCQ_F_ONETXQUEUE;
 	}
 
 	sch->flags |= TCQ_F_MQROOT;
@@ -150,7 +151,8 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
 		dev_deactivate(dev);
 
 	*old = dev_graft_qdisc(dev_queue, new);
-
+	if (new)
+		new->flags |= TCQ_F_ONETXQUEUE;
 	if (dev->flags & IFF_UP)
 		dev_activate(dev);
 	return 0;
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index d1831ca966d4..accec33c454c 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -132,6 +132,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 			goto err;
 		}
 		priv->qdiscs[i] = qdisc;
+		qdisc->flags |= TCQ_F_ONETXQUEUE;
 	}
 
 	/* If the mqprio options indicate that hardware should own
@@ -205,6 +206,9 @@ static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
 
 	*old = dev_graft_qdisc(dev_queue, new);
 
+	if (new)
+		new->flags |= TCQ_F_ONETXQUEUE;
+
 	if (dev->flags & IFF_UP)
 		dev_activate(dev);
 
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 9687fa1c2275..6ed37652a4c3 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -1,7 +1,8 @@
 /*
- * net/sched/sch_qfq.c         Quick Fair Queueing Scheduler.
+ * net/sched/sch_qfq.c         Quick Fair Queueing Plus Scheduler.
  *
  * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
+ * Copyright (c) 2012 Paolo Valente.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -19,12 +20,18 @@
 #include <net/pkt_cls.h>
 
 
-/*  Quick Fair Queueing
-    ===================
+/*  Quick Fair Queueing Plus
+    ========================
 
     Sources:
 
-    Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient
+    [1] Paolo Valente,
+    "Reducing the Execution Time of Fair-Queueing Schedulers."
+    http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf
+
+    Sources for QFQ:
+
+    [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient
     Packet Scheduling with Tight Bandwidth Distribution Guarantees."
 
     See also:
@@ -33,6 +40,20 @@
 
 /*
 
+  QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES
+  classes. Each aggregate is timestamped with a virtual start time S
+  and a virtual finish time F, and scheduled according to its
+  timestamps. S and F are computed as a function of a system virtual
+  time function V. The classes within each aggregate are instead
+  scheduled with DRR.
+
+  To speed up operations, QFQ+ divides also aggregates into a limited
+  number of groups. Which group a class belongs to depends on the
+  ratio between the maximum packet length for the class and the weight
+  of the class. Groups have their own S and F. In the end, QFQ+
+  schedules groups, then aggregates within groups, then classes within
+  aggregates. See [1] and [2] for a full description.
+
   Virtual time computations.
 
   S, F and V are all computed in fixed point arithmetic with
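
A toy illustration of the timestamping described above (this is not part of the patch; the weight, packet size and FRAC_BITS value below are assumed for the example): an aggregate that becomes active is stamped with S = V, and its finish time advances by len * inv_w per packet, in the same fixed-point convention the code uses:

#include <stdint.h>
#include <stdio.h>

#define FRAC_BITS 30
#define ONE_FP (1ULL << FRAC_BITS)

int main(void)
{
	uint64_t V = 0;				/* system virtual time */
	uint64_t inv_w = ONE_FP / 4;		/* aggregate weight 4 */
	unsigned int len = 1500;		/* one full-size packet */
	uint64_t S, F;

	S = V;					/* stamped on activation */
	F = S + (uint64_t)len * inv_w;		/* virtual finish time */
	printf("S=%llu F=%llu, F-S corresponds to len/weight\n",
	       (unsigned long long)S, (unsigned long long)F);
	return 0;
}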
@@ -76,27 +97,28 @@
 #define QFQ_MAX_SLOTS	32
 
 /*
- * Shifts used for class<->group mapping.  We allow class weights that are
- * in the range [1, 2^MAX_WSHIFT], and we try to map each class i to the
+ * Shifts used for aggregate<->group mapping.  We allow class weights that are
+ * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the
  * group with the smallest index that can support the L_i / r_i configured
- * for the class.
+ * for the classes in the aggregate.
  *
  * grp->index is the index of the group; and grp->slot_shift
  * is the shift for the corresponding (scaled) sigma_i.
  */
 #define QFQ_MAX_INDEX		24
-#define QFQ_MAX_WSHIFT		12
+#define QFQ_MAX_WSHIFT		10
 
-#define	QFQ_MAX_WEIGHT		(1<<QFQ_MAX_WSHIFT)
-#define QFQ_MAX_WSUM		(16*QFQ_MAX_WEIGHT)
+#define	QFQ_MAX_WEIGHT		(1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */
+#define QFQ_MAX_WSUM		(64*QFQ_MAX_WEIGHT)
 
 #define FRAC_BITS		30	/* fixed point arithmetic */
 #define ONE_FP			(1UL << FRAC_BITS)
 #define IWSUM			(ONE_FP/QFQ_MAX_WSUM)
 
 #define QFQ_MTU_SHIFT		16	/* to support TSO/GSO */
-#define QFQ_MIN_SLOT_SHIFT	(FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX)
-#define QFQ_MIN_LMAX		256	/* min possible lmax for a class */
+#define QFQ_MIN_LMAX		512	/* see qfq_slot_insert */
+
+#define QFQ_MAX_AGG_CLASSES	8 /* max num classes per aggregate allowed */
 
 /*
  * Possible group states.  These values are used as indexes for the bitmaps
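
To get a feel for how these constants place an aggregate in a group, the index computation of qfq_calc_index() can be replayed in user space (the min_slot_shift value of 22 below is only an assumed example; the patch derives it per qdisc). A weight-1 aggregate with a 1514-byte lmax lands in group 19 here:

#include <stdint.h>
#include <stdio.h>

#define FRAC_BITS 30
#define ONE_FP (1UL << FRAC_BITS)

/* Same log2-style computation as qfq_calc_index(). */
static int calc_index(uint32_t inv_w, unsigned int maxlen,
		      uint32_t min_slot_shift)
{
	uint64_t slot_size = (uint64_t)maxlen * inv_w;
	uint64_t size_map = slot_size >> min_slot_shift;
	int index = 0;

	if (size_map) {
		index = 64 - __builtin_clzll(size_map);	/* __fls() + 1 */
		index -= !(slot_size - (1ULL << (index + min_slot_shift - 1)));
	}
	return index < 0 ? 0 : index;
}

int main(void)
{
	printf("index=%d\n", calc_index(ONE_FP, 1514, 22));
	return 0;
}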
@@ -106,6 +128,8 @@ enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
 
 struct qfq_group;
 
+struct qfq_aggregate;
+
 struct qfq_class {
 	struct Qdisc_class_common common;
 
@@ -116,7 +140,12 @@ struct qfq_class {
 	struct gnet_stats_queue qstats;
 	struct gnet_stats_rate_est rate_est;
 	struct Qdisc *qdisc;
+	struct list_head alist;		/* Link for active-classes list. */
+	struct qfq_aggregate *agg;	/* Parent aggregate. */
+	int deficit;			/* DRR deficit counter. */
+};
 
+struct qfq_aggregate {
 	struct hlist_node next;	/* Link for the slot list. */
 	u64 S, F;		/* flow timestamps (exact) */
 
@@ -127,8 +156,18 @@ struct qfq_class {
 	struct qfq_group *grp;
 
 	/* these are copied from the flowset. */
-	u32	inv_w;		/* ONE_FP/weight */
-	u32	lmax;		/* Max packet size for this flow. */
+	u32	class_weight; /* Weight of each class in this aggregate. */
+	/* Max pkt size for the classes in this aggregate, DRR quantum. */
+	int	lmax;
+
+	u32	inv_w;	    /* ONE_FP/(sum of weights of classes in aggr.). */
+	u32	budgetmax;  /* Max budget for this aggregate. */
+	u32	initial_budget, budget; /* Initial and current budget. */
+
+	int		  num_classes;	/* Number of classes in this aggr. */
+	struct list_head  active;	/* DRR queue of active classes. */
+
+	struct hlist_node nonfull_next;	/* See nonfull_aggs in qfq_sched. */
 };
 
 struct qfq_group {
@@ -138,7 +177,7 @@ struct qfq_group {
 	unsigned int front;			/* Index of the front slot. */
 	unsigned long full_slots;		/* non-empty slots */
 
-	/* Array of RR lists of active classes. */
+	/* Array of RR lists of active aggregates. */
 	struct hlist_head slots[QFQ_MAX_SLOTS];
 };
 
@@ -146,13 +185,28 @@ struct qfq_sched {
 	struct tcf_proto *filter_list;
 	struct Qdisc_class_hash clhash;
 
-	u64		V;		/* Precise virtual time. */
-	u32		wsum;		/* weight sum */
+	u64		oldV, V;	/* Precise virtual times. */
+	struct qfq_aggregate *in_serv_agg;	/* Aggregate being served. */
+	u32		num_active_agg;	/* Num. of active aggregates */
+	u32		wsum;		/* weight sum */
 
 	unsigned long bitmaps[QFQ_MAX_STATE];	    /* Group bitmaps. */
 	struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
+	u32 min_slot_shift;	/* Index of the group-0 bit in the bitmaps. */
+
+	u32 max_agg_classes;	/* Max number of classes per aggr. */
+	struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */
 };
 
+/*
+ * Possible reasons why the timestamps of an aggregate are updated
+ * enqueue: the aggregate switches from idle to active and must be scheduled
+ *	    for service
+ * requeue: the aggregate finishes its budget, so it stops being served and
+ *	    must be rescheduled for service
+ */
+enum update_reason {enqueue, requeue};
+
 static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
 {
 	struct qfq_sched *q = qdisc_priv(sch);
@@ -182,18 +236,18 @@ static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
  * index = log_2(maxlen/weight) but we need to apply the scaling.
  * This is used only once at flow creation.
  */
-static int qfq_calc_index(u32 inv_w, unsigned int maxlen)
+static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift)
 {
 	u64 slot_size = (u64)maxlen * inv_w;
 	unsigned long size_map;
 	int index = 0;
 
-	size_map = slot_size >> QFQ_MIN_SLOT_SHIFT;
+	size_map = slot_size >> min_slot_shift;
 	if (!size_map)
 		goto out;
 
 	index = __fls(size_map) + 1;	/* basically a log_2 */
-	index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1)));
+	index -= !(slot_size - (1ULL << (index + min_slot_shift - 1)));
 
 	if (index < 0)
 		index = 0;
@@ -204,66 +258,150 @@ out:
 	return index;
 }
 
-/* Length of the next packet (0 if the queue is empty). */
-static unsigned int qdisc_peek_len(struct Qdisc *sch)
+static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *);
+static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *,
+			     enum update_reason);
+
+static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
+			 u32 lmax, u32 weight)
 {
-	struct sk_buff *skb;
+	INIT_LIST_HEAD(&agg->active);
+	hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
+
+	agg->lmax = lmax;
+	agg->class_weight = weight;
+}
+
+static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q,
+					  u32 lmax, u32 weight)
+{
+	struct qfq_aggregate *agg;
+	struct hlist_node *n;
+
+	hlist_for_each_entry(agg, n, &q->nonfull_aggs, nonfull_next)
+		if (agg->lmax == lmax && agg->class_weight == weight)
+			return agg;
+
+	return NULL;
+}
+
 
-	skb = sch->ops->peek(sch);
-	return skb ? qdisc_pkt_len(skb) : 0;
+/* Update aggregate as a function of the new number of classes. */
+static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
+			   int new_num_classes)
+{
+	u32 new_agg_weight;
+
+	if (new_num_classes == q->max_agg_classes)
+		hlist_del_init(&agg->nonfull_next);
+
+	if (agg->num_classes > new_num_classes &&
+	    new_num_classes == q->max_agg_classes - 1) /* agg no more full */
+		hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
+
+	agg->budgetmax = new_num_classes * agg->lmax;
+	new_agg_weight = agg->class_weight * new_num_classes;
+	agg->inv_w = ONE_FP/new_agg_weight;
+
+	if (agg->grp == NULL) {
+		int i = qfq_calc_index(agg->inv_w, agg->budgetmax,
+				       q->min_slot_shift);
+		agg->grp = &q->groups[i];
+	}
+
+	q->wsum +=
+		(int) agg->class_weight * (new_num_classes - agg->num_classes);
+
+	agg->num_classes = new_num_classes;
+}
+
+/* Add class to aggregate. */
+static void qfq_add_to_agg(struct qfq_sched *q,
+			   struct qfq_aggregate *agg,
+			   struct qfq_class *cl)
+{
+	cl->agg = agg;
+
+	qfq_update_agg(q, agg, agg->num_classes+1);
+	if (cl->qdisc->q.qlen > 0) { /* adding an active class */
+		list_add_tail(&cl->alist, &agg->active);
+		if (list_first_entry(&agg->active, struct qfq_class, alist) ==
+		    cl && q->in_serv_agg != agg) /* agg was inactive */
+			qfq_activate_agg(q, agg, enqueue); /* schedule agg */
+	}
 }
 
-static void qfq_deactivate_class(struct qfq_sched *, struct qfq_class *);
-static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl,
-			       unsigned int len);
+static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *);
 
-static void qfq_update_class_params(struct qfq_sched *q, struct qfq_class *cl,
-				    u32 lmax, u32 inv_w, int delta_w)
+static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
 {
-	int i;
+	if (!hlist_unhashed(&agg->nonfull_next))
+		hlist_del_init(&agg->nonfull_next);
+	if (q->in_serv_agg == agg)
+		q->in_serv_agg = qfq_choose_next_agg(q);
+	kfree(agg);
+}
 
-	/* update qfq-specific data */
-	cl->lmax = lmax;
-	cl->inv_w = inv_w;
-	i = qfq_calc_index(cl->inv_w, cl->lmax);
+/* Deschedule class from within its parent aggregate. */
+static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
+{
+	struct qfq_aggregate *agg = cl->agg;
 
-	cl->grp = &q->groups[i];
 
-	q->wsum += delta_w;
+	list_del(&cl->alist); /* remove from RR queue of the aggregate */
+	if (list_empty(&agg->active)) /* agg is now inactive */
+		qfq_deactivate_agg(q, agg);
 }
 
-static void qfq_update_reactivate_class(struct qfq_sched *q,
-					struct qfq_class *cl,
-					u32 inv_w, u32 lmax, int delta_w)
+/* Remove class from its parent aggregate. */
+static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
 {
-	bool need_reactivation = false;
-	int i = qfq_calc_index(inv_w, lmax);
+	struct qfq_aggregate *agg = cl->agg;
 
-	if (&q->groups[i] != cl->grp && cl->qdisc->q.qlen > 0) {
-		/*
-		 * shift cl->F back, to not charge the
-		 * class for the not-yet-served head
-		 * packet
-		 */
-		cl->F = cl->S;
-		/* remove class from its slot in the old group */
-		qfq_deactivate_class(q, cl);
-		need_reactivation = true;
+	cl->agg = NULL;
+	if (agg->num_classes == 1) { /* agg being emptied, destroy it */
+		qfq_destroy_agg(q, agg);
+		return;
 	}
+	qfq_update_agg(q, agg, agg->num_classes-1);
+}
 
-	qfq_update_class_params(q, cl, lmax, inv_w, delta_w);
+/* Deschedule class and remove it from its parent aggregate. */
+static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
+{
+	if (cl->qdisc->q.qlen > 0) /* class is active */
+		qfq_deactivate_class(q, cl);
 
-	if (need_reactivation) /* activate in new group */
-		qfq_activate_class(q, cl, qdisc_peek_len(cl->qdisc));
+	qfq_rm_from_agg(q, cl);
 }
 
+/* Move class to a new aggregate, matching the new class weight and/or lmax */
+static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight,
+			  u32 lmax)
+{
+	struct qfq_sched *q = qdisc_priv(sch);
+	struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight);
+
+	if (new_agg == NULL) { /* create new aggregate */
+		new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC);
+		if (new_agg == NULL)
+			return -ENOBUFS;
+		qfq_init_agg(q, new_agg, lmax, weight);
+	}
+	qfq_deact_rm_from_agg(q, cl);
+	qfq_add_to_agg(q, new_agg, cl);
+
+	return 0;
+}
+
 static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 			    struct nlattr **tca, unsigned long *arg)
 {
 	struct qfq_sched *q = qdisc_priv(sch);
 	struct qfq_class *cl = (struct qfq_class *)*arg;
+	bool existing = false;
 	struct nlattr *tb[TCA_QFQ_MAX + 1];
+	struct qfq_aggregate *new_agg = NULL;
 	u32 weight, lmax, inv_w;
 	int err;
 	int delta_w;
@@ -286,15 +424,6 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 	} else
 		weight = 1;
 
-	inv_w = ONE_FP / weight;
-	weight = ONE_FP / inv_w;
-	delta_w = weight - (cl ? ONE_FP / cl->inv_w : 0);
-	if (q->wsum + delta_w > QFQ_MAX_WSUM) {
-		pr_notice("qfq: total weight out of range (%u + %u)\n",
-			  delta_w, q->wsum);
-		return -EINVAL;
-	}
-
 	if (tb[TCA_QFQ_LMAX]) {
 		lmax = nla_get_u32(tb[TCA_QFQ_LMAX]);
 		if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) {
@@ -304,7 +433,23 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 	} else
 		lmax = psched_mtu(qdisc_dev(sch));
 
-	if (cl != NULL) {
+	inv_w = ONE_FP / weight;
+	weight = ONE_FP / inv_w;
+
+	if (cl != NULL &&
+	    lmax == cl->agg->lmax &&
+	    weight == cl->agg->class_weight)
+		return 0; /* nothing to change */
+
+	delta_w = weight - (cl ? cl->agg->class_weight : 0);
+
+	if (q->wsum + delta_w > QFQ_MAX_WSUM) {
+		pr_notice("qfq: total weight out of range (%d + %u)\n",
+			  delta_w, q->wsum);
+		return -EINVAL;
+	}
+
+	if (cl != NULL) {	/* modify existing class */
 		if (tca[TCA_RATE]) {
 			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
 						    qdisc_root_sleeping_lock(sch),
@@ -312,25 +457,18 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 			if (err)
 				return err;
 		}
-
-		if (lmax == cl->lmax && inv_w == cl->inv_w)
-			return 0; /* nothing to update */
-
-		sch_tree_lock(sch);
-		qfq_update_reactivate_class(q, cl, inv_w, lmax, delta_w);
-		sch_tree_unlock(sch);
-
-		return 0;
+		existing = true;
+		goto set_change_agg;
 	}
 
+	/* create and init new class */
 	cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL);
 	if (cl == NULL)
 		return -ENOBUFS;
 
 	cl->refcnt = 1;
 	cl->common.classid = classid;
-
-	qfq_update_class_params(q, cl, lmax, inv_w, delta_w);
+	cl->deficit = lmax;
 
 	cl->qdisc = qdisc_create_dflt(sch->dev_queue,
 				      &pfifo_qdisc_ops, classid);
@@ -341,11 +479,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
 					qdisc_root_sleeping_lock(sch),
 					tca[TCA_RATE]);
-		if (err) {
-			qdisc_destroy(cl->qdisc);
-			kfree(cl);
-			return err;
-		}
+		if (err)
+			goto destroy_class;
 	}
 
 	sch_tree_lock(sch);
@@ -354,19 +489,39 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 
 	qdisc_class_hash_grow(sch, &q->clhash);
 
+set_change_agg:
+	sch_tree_lock(sch);
+	new_agg = qfq_find_agg(q, lmax, weight);
+	if (new_agg == NULL) { /* create new aggregate */
+		sch_tree_unlock(sch);
+		new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
+		if (new_agg == NULL) {
+			err = -ENOBUFS;
+			gen_kill_estimator(&cl->bstats, &cl->rate_est);
+			goto destroy_class;
+		}
+		sch_tree_lock(sch);
+		qfq_init_agg(q, new_agg, lmax, weight);
+	}
+	if (existing)
+		qfq_deact_rm_from_agg(q, cl);
+	qfq_add_to_agg(q, new_agg, cl);
+	sch_tree_unlock(sch);
+
 	*arg = (unsigned long)cl;
 	return 0;
+
+destroy_class:
+	qdisc_destroy(cl->qdisc);
+	kfree(cl);
+	return err;
 }
 
 static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
 {
 	struct qfq_sched *q = qdisc_priv(sch);
 
-	if (cl->inv_w) {
-		q->wsum -= ONE_FP / cl->inv_w;
-		cl->inv_w = 0;
-	}
-
+	qfq_rm_from_agg(q, cl);
 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
 	qdisc_destroy(cl->qdisc);
 	kfree(cl);
@@ -481,8 +636,8 @@ static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
 	nest = nla_nest_start(skb, TCA_OPTIONS);
 	if (nest == NULL)
 		goto nla_put_failure;
-	if (nla_put_u32(skb, TCA_QFQ_WEIGHT, ONE_FP/cl->inv_w) ||
-	    nla_put_u32(skb, TCA_QFQ_LMAX, cl->lmax))
+	if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) ||
+	    nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax))
 		goto nla_put_failure;
 	return nla_nest_end(skb, nest);
 
@@ -500,8 +655,8 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 	memset(&xstats, 0, sizeof(xstats));
 	cl->qdisc->qstats.qlen = cl->qdisc->q.qlen;
 
-	xstats.weight = ONE_FP/cl->inv_w;
-	xstats.lmax = cl->lmax;
+	xstats.weight = cl->agg->class_weight;
+	xstats.lmax = cl->agg->lmax;
 
 	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
 	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
@@ -652,16 +807,16 @@ static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F)
  * perhaps
  *
 	old_V ^= q->V;
-	old_V >>= QFQ_MIN_SLOT_SHIFT;
+	old_V >>= q->min_slot_shift;
 	if (old_V) {
 		...
 	}
  *
  */
-static void qfq_make_eligible(struct qfq_sched *q, u64 old_V)
+static void qfq_make_eligible(struct qfq_sched *q)
 {
-	unsigned long vslot = q->V >> QFQ_MIN_SLOT_SHIFT;
-	unsigned long old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT;
+	unsigned long vslot = q->V >> q->min_slot_shift;
+	unsigned long old_vslot = q->oldV >> q->min_slot_shift;
 
 	if (vslot != old_vslot) {
 		unsigned long mask = (1UL << fls(vslot ^ old_vslot)) - 1;
@@ -672,34 +827,38 @@ static void qfq_make_eligible(struct qfq_sched *q, u64 old_V)
 
 
 /*
- * If the weight and lmax (max_pkt_size) of the classes do not change,
- * then QFQ guarantees that the slot index is never higher than
- * 2 + ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) * (QFQ_MAX_WEIGHT/QFQ_MAX_WSUM).
+ * The index of the slot in which the aggregate is to be inserted must
+ * not be higher than QFQ_MAX_SLOTS-2. There is a '-2' and not a '-1'
+ * because the start time of the group may be moved backward by one
+ * slot after the aggregate has been inserted, and this would cause
+ * non-empty slots to be right-shifted by one position.
  *
- * With the current values of the above constants, the index is
- * then guaranteed to never be higher than 2 + 256 * (1 / 16) = 18.
+ * If the weight and lmax (max_pkt_size) of the classes do not change,
+ * then QFQ+ does meet the above constraint according to the current
+ * values of its parameters. In fact, if the weight and lmax of the
+ * classes do not change, then, from the theory, QFQ+ guarantees that
+ * the slot index is never higher than
+ * 2 + QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
+ * (QFQ_MAX_WEIGHT/QFQ_MAX_WSUM) = 2 + 8 * 128 * (1 / 64) = 18
  *
  * When the weight of a class is increased or the lmax of the class is
- * decreased, a new class with smaller slot size may happen to be
- * activated. The activation of this class should be properly delayed
- * to when the service of the class has finished in the ideal system
- * tracked by QFQ. If the activation of the class is not delayed to
- * this reference time instant, then this class may be unjustly served
- * before other classes waiting for service. This may cause
- * (unfrequently) the above bound to the slot index to be violated for
- * some of these unlucky classes.
+ * decreased, a new aggregate with smaller slot size than the original
+ * parent aggregate of the class may happen to be activated. The
+ * activation of this aggregate should be properly delayed to when the
+ * service of the class has finished in the ideal system tracked by
+ * QFQ+. If the activation of the aggregate is not delayed to this
+ * reference time instant, then this aggregate may be unjustly served
+ * before other aggregates waiting for service. This may cause the
+ * above bound to the slot index to be violated for some of these
+ * unlucky aggregates.
  *
- * Instead of delaying the activation of the new class, which is quite
- * complex, the following inaccurate but simple solution is used: if
- * the slot index is higher than QFQ_MAX_SLOTS-2, then the timestamps
- * of the class are shifted backward so as to let the slot index
- * become equal to QFQ_MAX_SLOTS-2. This threshold is used because, if
- * the slot index is above it, then the data structure implementing
- * the bucket list either gets immediately corrupted or may get
- * corrupted on a possible next packet arrival that causes the start
- * time of the group to be shifted backward.
+ * Instead of delaying the activation of the new aggregate, which is
+ * quite complex, the following inaccurate but simple solution is used:
+ * if the slot index is higher than QFQ_MAX_SLOTS-2, then the
+ * timestamps of the aggregate are shifted backward so as to let the
+ * slot index become equal to QFQ_MAX_SLOTS-2.
  */
-static void qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl,
+static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg,
 			    u64 roundedS)
 {
 	u64 slot = (roundedS - grp->S) >> grp->slot_shift;
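
The arithmetic in the comment can be verified against the constants defined earlier in this file; a stand-alone check (not kernel code) that evaluates the quoted worst-case slot index:

#include <stdio.h>

#define QFQ_MAX_SLOTS		32
#define QFQ_MTU_SHIFT		16
#define QFQ_MIN_LMAX		512
#define QFQ_MAX_WSHIFT		10
#define QFQ_MAX_WEIGHT		(1 << QFQ_MAX_WSHIFT)
#define QFQ_MAX_WSUM		(64 * QFQ_MAX_WEIGHT)
#define QFQ_MAX_AGG_CLASSES	8

int main(void)
{
	/* 2 + 8 * 128 * (1024 / 65536) = 2 + 16 = 18, within
	 * QFQ_MAX_SLOTS - 2 = 30 (multiply before dividing, so the
	 * weight ratio is not truncated to zero) */
	int bound = 2 + QFQ_MAX_AGG_CLASSES *
			((1 << QFQ_MTU_SHIFT) / QFQ_MIN_LMAX) *
			QFQ_MAX_WEIGHT / QFQ_MAX_WSUM;

	printf("bound=%d max=%d\n", bound, QFQ_MAX_SLOTS - 2);
	return 0;
}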
@@ -708,22 +867,22 @@ static void qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl,
 	if (unlikely(slot > QFQ_MAX_SLOTS - 2)) {
 		u64 deltaS = roundedS - grp->S -
 			     ((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift);
-		cl->S -= deltaS;
-		cl->F -= deltaS;
+		agg->S -= deltaS;
+		agg->F -= deltaS;
 		slot = QFQ_MAX_SLOTS - 2;
 	}
 
 	i = (grp->front + slot) % QFQ_MAX_SLOTS;
 
-	hlist_add_head(&cl->next, &grp->slots[i]);
+	hlist_add_head(&agg->next, &grp->slots[i]);
 	__set_bit(slot, &grp->full_slots);
 }
 
 /* Maybe introduce hlist_first_entry?? */
-static struct qfq_class *qfq_slot_head(struct qfq_group *grp)
+static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp)
 {
 	return hlist_entry(grp->slots[grp->front].first,
-			   struct qfq_class, next);
+			   struct qfq_aggregate, next);
 }
 
 /*
@@ -731,20 +890,20 @@ static struct qfq_class *qfq_slot_head(struct qfq_group *grp)
  */
 static void qfq_front_slot_remove(struct qfq_group *grp)
 {
-	struct qfq_class *cl = qfq_slot_head(grp);
+	struct qfq_aggregate *agg = qfq_slot_head(grp);
 
-	BUG_ON(!cl);
-	hlist_del(&cl->next);
+	BUG_ON(!agg);
+	hlist_del(&agg->next);
 	if (hlist_empty(&grp->slots[grp->front]))
 		__clear_bit(0, &grp->full_slots);
 }
 
 /*
- * Returns the first full queue in a group. As a side effect,
- * adjust the bucket list so the first non-empty bucket is at
- * position 0 in full_slots.
+ * Returns the first aggregate in the first non-empty bucket of the
+ * group. As a side effect, adjusts the bucket list so the first
+ * non-empty bucket is at position 0 in full_slots.
  */
-static struct qfq_class *qfq_slot_scan(struct qfq_group *grp)
+static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp)
 {
 	unsigned int i;
 
@@ -780,7 +939,7 @@ static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS)
 	grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
 }
 
-static void qfq_update_eligible(struct qfq_sched *q, u64 old_V)
+static void qfq_update_eligible(struct qfq_sched *q)
 {
 	struct qfq_group *grp;
 	unsigned long ineligible;
@@ -792,137 +951,226 @@ static void qfq_update_eligible(struct qfq_sched *q, u64 old_V)
792 if (qfq_gt(grp->S, q->V)) 951 if (qfq_gt(grp->S, q->V))
793 q->V = grp->S; 952 q->V = grp->S;
794 } 953 }
795 qfq_make_eligible(q, old_V); 954 qfq_make_eligible(q);
796 } 955 }
797} 956}
798 957
799/* 958/* Dequeue head packet of the head class in the DRR queue of the aggregate. */
800 * Updates the class, returns true if also the group needs to be updated. 959static void agg_dequeue(struct qfq_aggregate *agg,
801 */ 960 struct qfq_class *cl, unsigned int len)
802static bool qfq_update_class(struct qfq_group *grp, struct qfq_class *cl)
803{ 961{
804 unsigned int len = qdisc_peek_len(cl->qdisc); 962 qdisc_dequeue_peeked(cl->qdisc);
805 963
806 cl->S = cl->F; 964 cl->deficit -= (int) len;
807 if (!len)
808 qfq_front_slot_remove(grp); /* queue is empty */
809 else {
810 u64 roundedS;
811 965
812 cl->F = cl->S + (u64)len * cl->inv_w; 966 if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */
813 roundedS = qfq_round_down(cl->S, grp->slot_shift); 967 list_del(&cl->alist);
814 if (roundedS == grp->S) 968 else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) {
815 return false; 969 cl->deficit += agg->lmax;
816 970 list_move_tail(&cl->alist, &agg->active);
817 qfq_front_slot_remove(grp);
818 qfq_slot_insert(grp, cl, roundedS);
819 } 971 }
972}
973
974static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg,
975 struct qfq_class **cl,
976 unsigned int *len)
977{
978 struct sk_buff *skb;
820 979
821 return true; 980 *cl = list_first_entry(&agg->active, struct qfq_class, alist);
981 skb = (*cl)->qdisc->ops->peek((*cl)->qdisc);
982 if (skb == NULL)
983 WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n");
984 else
985 *len = qdisc_pkt_len(skb);
986
987 return skb;
988}
989
990/* Update F according to the actual service received by the aggregate. */
991static inline void charge_actual_service(struct qfq_aggregate *agg)
992{
993 /* compute the service received by the aggregate */
994 u32 service_received = agg->initial_budget - agg->budget;
995
996 agg->F = agg->S + (u64)service_received * agg->inv_w;
822} 997}
823 998
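A standalone sketch of the charging rule above (user-space, hypothetical values): since agg->budget is decremented as packets are sent, initial_budget - budget is exactly the service received, so F advances in proportion to the bytes actually served rather than to the whole budget that was granted.

	#include <stdint.h>
	#include <stdio.h>

	struct agg_model {
		uint64_t S, F;             /* virtual start/finish times */
		uint32_t inv_w;            /* scaled inverse weight */
		uint32_t initial_budget, budget;
	};

	/* Same arithmetic as charge_actual_service() above. */
	static void charge_model(struct agg_model *agg)
	{
		uint32_t service_received = agg->initial_budget - agg->budget;

		agg->F = agg->S + (uint64_t)service_received * agg->inv_w;
	}

	int main(void)
	{
		/* budgetmax was 3000 bytes, but only 1800 were actually served */
		struct agg_model a = {
			.S = 0, .inv_w = 1 << 10,
			.initial_budget = 3000, .budget = 1200,
		};

		charge_model(&a);
		/* F = 1800 << 10: charged for 1800 bytes, not the full 3000 */
		printf("F = %llu\n", (unsigned long long)a.F);
		return 0;
	}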
824static struct sk_buff *qfq_dequeue(struct Qdisc *sch) 999static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
825{ 1000{
826 struct qfq_sched *q = qdisc_priv(sch); 1001 struct qfq_sched *q = qdisc_priv(sch);
827 struct qfq_group *grp; 1002 struct qfq_aggregate *in_serv_agg = q->in_serv_agg;
828 struct qfq_class *cl; 1003 struct qfq_class *cl;
829 struct sk_buff *skb; 1004 struct sk_buff *skb = NULL;
830 unsigned int len; 1005 /* next-packet len, 0 means no more active classes in in-service agg */
831 u64 old_V; 1006 unsigned int len = 0;
832 1007
833 if (!q->bitmaps[ER]) 1008 if (in_serv_agg == NULL)
834 return NULL; 1009 return NULL;
835 1010
836 grp = qfq_ffs(q, q->bitmaps[ER]); 1011 if (!list_empty(&in_serv_agg->active))
1012 skb = qfq_peek_skb(in_serv_agg, &cl, &len);
837 1013
838 cl = qfq_slot_head(grp); 1014 /*
839 skb = qdisc_dequeue_peeked(cl->qdisc); 1015 * If there are no active classes in the in-service aggregate,
840 if (!skb) { 1016 * or if the aggregate has not enough budget to serve its next
 841         WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n"); 1016  * or if the aggregate does not have enough budget to serve
 842         return NULL; 1017  * its next class, then choose the next aggregate to serve.
1019 if (len == 0 || in_serv_agg->budget < len) {
1020 charge_actual_service(in_serv_agg);
1021
1022 /* recharge the budget of the aggregate */
1023 in_serv_agg->initial_budget = in_serv_agg->budget =
1024 in_serv_agg->budgetmax;
1025
1026 if (!list_empty(&in_serv_agg->active))
1027 /*
1028 * Still active: reschedule for
1029 * service. Possible optimization: if no other
1030 * aggregate is active, then there is no point
1031 * in rescheduling this aggregate, and we can
1032                  * just keep it as the in-service one. However,
1033                  * this should be a corner case, and handling
1034                  * it would require maintaining an extra
1035                  * num_active_aggs field.
1036 */
1037 qfq_activate_agg(q, in_serv_agg, requeue);
1038 else if (sch->q.qlen == 0) { /* no aggregate to serve */
1039 q->in_serv_agg = NULL;
1040 return NULL;
1041 }
1042
1043 /*
1044 * If we get here, there are other aggregates queued:
1045 * choose the new aggregate to serve.
1046 */
1047 in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q);
1048 skb = qfq_peek_skb(in_serv_agg, &cl, &len);
843 } 1049 }
1050 if (!skb)
1051 return NULL;
844 1052
845 sch->q.qlen--; 1053 sch->q.qlen--;
846 qdisc_bstats_update(sch, skb); 1054 qdisc_bstats_update(sch, skb);
847 1055
848 old_V = q->V; 1056 agg_dequeue(in_serv_agg, cl, len);
849 len = qdisc_pkt_len(skb); 1057 in_serv_agg->budget -= len;
850 q->V += (u64)len * IWSUM; 1058 q->V += (u64)len * IWSUM;
851 pr_debug("qfq dequeue: len %u F %lld now %lld\n", 1059 pr_debug("qfq dequeue: len %u F %lld now %lld\n",
852 len, (unsigned long long) cl->F, (unsigned long long) q->V); 1060 len, (unsigned long long) in_serv_agg->F,
1061 (unsigned long long) q->V);
853 1062
854 if (qfq_update_class(grp, cl)) { 1063 return skb;
855 u64 old_F = grp->F; 1064}
856 1065
857 cl = qfq_slot_scan(grp); 1066static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q)
858 if (!cl) 1067{
859 __clear_bit(grp->index, &q->bitmaps[ER]); 1068 struct qfq_group *grp;
860 else { 1069 struct qfq_aggregate *agg, *new_front_agg;
861 u64 roundedS = qfq_round_down(cl->S, grp->slot_shift); 1070 u64 old_F;
862 unsigned int s;
863 1071
864 if (grp->S == roundedS) 1072 qfq_update_eligible(q);
865 goto skip_unblock; 1073 q->oldV = q->V;
866 grp->S = roundedS; 1074
867 grp->F = roundedS + (2ULL << grp->slot_shift); 1075 if (!q->bitmaps[ER])
868 __clear_bit(grp->index, &q->bitmaps[ER]); 1076 return NULL;
869 s = qfq_calc_state(q, grp); 1077
870 __set_bit(grp->index, &q->bitmaps[s]); 1078 grp = qfq_ffs(q, q->bitmaps[ER]);
871 } 1079 old_F = grp->F;
1080
1081 agg = qfq_slot_head(grp);
872 1082
873 qfq_unblock_groups(q, grp->index, old_F); 1083 /* agg starts to be served, remove it from schedule */
1084 qfq_front_slot_remove(grp);
1085
1086 new_front_agg = qfq_slot_scan(grp);
1087
1088 if (new_front_agg == NULL) /* group is now inactive, remove from ER */
1089 __clear_bit(grp->index, &q->bitmaps[ER]);
1090 else {
1091 u64 roundedS = qfq_round_down(new_front_agg->S,
1092 grp->slot_shift);
1093 unsigned int s;
1094
1095 if (grp->S == roundedS)
1096 return agg;
1097 grp->S = roundedS;
1098 grp->F = roundedS + (2ULL << grp->slot_shift);
1099 __clear_bit(grp->index, &q->bitmaps[ER]);
1100 s = qfq_calc_state(q, grp);
1101 __set_bit(grp->index, &q->bitmaps[s]);
874 } 1102 }
875 1103
876skip_unblock: 1104 qfq_unblock_groups(q, grp->index, old_F);
877 qfq_update_eligible(q, old_V);
878 1105
879 return skb; 1106 return agg;
880} 1107}
881 1108
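To make the budget handling in qfq_dequeue() above concrete, here is a self-contained toy (plain C, not kernel code, made-up packet sizes) that walks a packet trace through the same check: when the next packet no longer fits in the remaining budget, the in-service aggregate is charged and replaced, and the budget is recharged for the next round.

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t budgetmax = 3000, budget = 3000;
		uint32_t pkt[] = { 1500, 1400, 200, 1500 };
		unsigned int served = 0, i;

		for (i = 0; i < 4; i++) {
			if (budget < pkt[i]) {
				/* in qfq_dequeue(): charge_actual_service(),
				 * recharge the budget and pick the next
				 * aggregate via qfq_choose_next_agg() */
				printf("served %u bytes, budget %u < %u: switch\n",
				       served, budget, pkt[i]);
				budget = budgetmax;
				served = 0;
			}
			budget -= pkt[i];
			served += pkt[i];
		}
		printf("served %u bytes in the last round\n", served);
		return 0;
	}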
882/* 1109/*
883 * Assign a reasonable start time for a new flow k in group i. 1110 * Assign a reasonable start time for a new aggregate in group i.
 884  * Admissible values for \hat{F} are multiples of \sigma_i 1111  * Admissible values for \hat{F} are multiples of \sigma_i
 885  * no greater than V+\sigma_i. Larger values mean that 1112  * no greater than V+\sigma_i. Larger values mean that
886 * we had a wraparound so we consider the timestamp to be stale. 1113 * we had a wraparound so we consider the timestamp to be stale.
887 * 1114 *
888 * If F is not stale and F >= V then we set S = F. 1115 * If F is not stale and F >= V then we set S = F.
889 * Otherwise we should assign S = V, but this may violate 1116 * Otherwise we should assign S = V, but this may violate
890 * the ordering in ER. So, if we have groups in ER, set S to 1117 * the ordering in EB (see [2]). So, if we have groups in ER,
891 * the F_j of the first group j which would be blocking us. 1118 * set S to the F_j of the first group j which would be blocking us.
892 * We are guaranteed not to move S backward because 1119 * We are guaranteed not to move S backward because
893 * otherwise our group i would still be blocked. 1120 * otherwise our group i would still be blocked.
894 */ 1121 */
895static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) 1122static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg)
896{ 1123{
897 unsigned long mask; 1124 unsigned long mask;
898 u64 limit, roundedF; 1125 u64 limit, roundedF;
899 int slot_shift = cl->grp->slot_shift; 1126 int slot_shift = agg->grp->slot_shift;
900 1127
901 roundedF = qfq_round_down(cl->F, slot_shift); 1128 roundedF = qfq_round_down(agg->F, slot_shift);
902 limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift); 1129 limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
903 1130
904 if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { 1131 if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) {
905 /* timestamp was stale */ 1132 /* timestamp was stale */
906 mask = mask_from(q->bitmaps[ER], cl->grp->index); 1133 mask = mask_from(q->bitmaps[ER], agg->grp->index);
907 if (mask) { 1134 if (mask) {
908 struct qfq_group *next = qfq_ffs(q, mask); 1135 struct qfq_group *next = qfq_ffs(q, mask);
909 if (qfq_gt(roundedF, next->F)) { 1136 if (qfq_gt(roundedF, next->F)) {
910 if (qfq_gt(limit, next->F)) 1137 if (qfq_gt(limit, next->F))
911 cl->S = next->F; 1138 agg->S = next->F;
912 else /* preserve timestamp correctness */ 1139 else /* preserve timestamp correctness */
913 cl->S = limit; 1140 agg->S = limit;
914 return; 1141 return;
915 } 1142 }
916 } 1143 }
917 cl->S = q->V; 1144 agg->S = q->V;
918 } else /* timestamp is not stale */ 1145 } else /* timestamp is not stale */
919 cl->S = cl->F; 1146 agg->S = agg->F;
920} 1147}
921 1148
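qfq_update_start() leans on two helpers defined earlier in this file; a user-space restatement (a sketch, names suffixed _model) shows why the stale-timestamp test stays correct across virtual-time wraparound: comparisons are done on the signed difference modulo 2^64, and start times are rounded down to the group's slot size.

	#include <stdint.h>

	/* Wraparound-safe "a is later than b" */
	static inline int qfq_gt_model(uint64_t a, uint64_t b)
	{
		return (int64_t)(a - b) > 0;
	}

	/* Round ts down to a multiple of the slot size 2^shift */
	static inline uint64_t qfq_round_down_model(uint64_t ts, unsigned int shift)
	{
		return ts & ~((1ULL << shift) - 1);
	}

	int main(void)
	{
		/* 5 is "later" than 2^64 - 3: the difference wrapped around */
		return !(qfq_gt_model(5, (uint64_t)-3) &&
			 qfq_round_down_model(0x12345, 8) == 0x12300);
	}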
1149/*
1150 * Update the timestamps of agg before scheduling/rescheduling it for
L1349 1151 * service. In particular, assign to agg->F its maximum possible
 1152 * value, i.e., the virtual finish time the aggregate would get
 1153 * if it consumed its whole budget while in service.
1154 */
1155static inline void
1156qfq_update_agg_ts(struct qfq_sched *q,
1157 struct qfq_aggregate *agg, enum update_reason reason)
1158{
1159 if (reason != requeue)
1160 qfq_update_start(q, agg);
1161 else /* just charge agg for the service received */
1162 agg->S = agg->F;
1163
1164 agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w;
1165}
1166
1167static void qfq_schedule_agg(struct qfq_sched *, struct qfq_aggregate *);
1168
922static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) 1169static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
923{ 1170{
924 struct qfq_sched *q = qdisc_priv(sch); 1171 struct qfq_sched *q = qdisc_priv(sch);
925 struct qfq_class *cl; 1172 struct qfq_class *cl;
1173 struct qfq_aggregate *agg;
926 int err = 0; 1174 int err = 0;
927 1175
928 cl = qfq_classify(skb, sch, &err); 1176 cl = qfq_classify(skb, sch, &err);
@@ -934,11 +1182,13 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
934 } 1182 }
935 pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid); 1183 pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid);
936 1184
937 if (unlikely(cl->lmax < qdisc_pkt_len(skb))) { 1185 if (unlikely(cl->agg->lmax < qdisc_pkt_len(skb))) {
938 pr_debug("qfq: increasing maxpkt from %u to %u for class %u", 1186 pr_debug("qfq: increasing maxpkt from %u to %u for class %u",
939 cl->lmax, qdisc_pkt_len(skb), cl->common.classid); 1187 cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid);
940 qfq_update_reactivate_class(q, cl, cl->inv_w, 1188 err = qfq_change_agg(sch, cl, cl->agg->class_weight,
941 qdisc_pkt_len(skb), 0); 1189 qdisc_pkt_len(skb));
1190 if (err)
1191 return err;
942 } 1192 }
943 1193
944 err = qdisc_enqueue(skb, cl->qdisc); 1194 err = qdisc_enqueue(skb, cl->qdisc);
@@ -954,35 +1204,50 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
954 bstats_update(&cl->bstats, skb); 1204 bstats_update(&cl->bstats, skb);
955 ++sch->q.qlen; 1205 ++sch->q.qlen;
956 1206
957 /* If the new skb is not the head of queue, then done here. */ 1207 agg = cl->agg;
958 if (cl->qdisc->q.qlen != 1) 1208 /* if the queue was not empty, then done here */
1209 if (cl->qdisc->q.qlen != 1) {
1210 if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) &&
1211 list_first_entry(&agg->active, struct qfq_class, alist)
1212 == cl && cl->deficit < qdisc_pkt_len(skb))
1213 list_move_tail(&cl->alist, &agg->active);
1214
959 return err; 1215 return err;
1216 }
1217
1218 /* schedule class for service within the aggregate */
1219 cl->deficit = agg->lmax;
1220 list_add_tail(&cl->alist, &agg->active);
960 1221
961 /* If reach this point, queue q was idle */ 1222 if (list_first_entry(&agg->active, struct qfq_class, alist) != cl)
962 qfq_activate_class(q, cl, qdisc_pkt_len(skb)); 1223 return err; /* aggregate was not empty, nothing else to do */
1224
1225 /* recharge budget */
1226 agg->initial_budget = agg->budget = agg->budgetmax;
1227
1228 qfq_update_agg_ts(q, agg, enqueue);
1229 if (q->in_serv_agg == NULL)
1230 q->in_serv_agg = agg;
1231 else if (agg != q->in_serv_agg)
1232 qfq_schedule_agg(q, agg);
963 1233
964 return err; 1234 return err;
965} 1235}
966 1236
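The deficit handling at enqueue mirrors classic DRR inside the aggregate: a newly active class is granted deficit = agg->lmax, and agg_dequeue() recharges and rotates it to the tail once its deficit no longer covers the next packet. A toy trace (plain C, illustrative numbers only):

	#include <stdio.h>

	int main(void)
	{
		int lmax = 1500;        /* max packet size of the aggregate */
		int deficit = lmax;     /* granted when the class turns active */
		int pkt[] = { 1000, 300, 900 };
		int n = 3, i;

		for (i = 0; i < n; i++) {
			deficit -= pkt[i];      /* as in agg_dequeue() above */
			printf("sent %d, deficit %d\n", pkt[i], deficit);
			if (i + 1 < n && deficit < pkt[i + 1]) {
				deficit += lmax; /* new round: tail of active list */
				printf("round over, deficit recharged to %d\n",
				       deficit);
			}
		}
		return 0;
	}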
967/* 1237/*
968 * Handle class switch from idle to backlogged. 1238 * Schedule aggregate according to its timestamps.
969 */ 1239 */
970static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl, 1240static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
971 unsigned int pkt_len)
972{ 1241{
973 struct qfq_group *grp = cl->grp; 1242 struct qfq_group *grp = agg->grp;
974 u64 roundedS; 1243 u64 roundedS;
975 int s; 1244 int s;
976 1245
977 qfq_update_start(q, cl); 1246 roundedS = qfq_round_down(agg->S, grp->slot_shift);
978
979 /* compute new finish time and rounded start. */
980 cl->F = cl->S + (u64)pkt_len * cl->inv_w;
981 roundedS = qfq_round_down(cl->S, grp->slot_shift);
982 1247
983 /* 1248 /*
984 * insert cl in the correct bucket. 1249 * Insert agg in the correct bucket.
985 * If cl->S >= grp->S we don't need to adjust the 1250 * If agg->S >= grp->S we don't need to adjust the
986 * bucket list and simply go to the insertion phase. 1251 * bucket list and simply go to the insertion phase.
 987          * Otherwise grp->S is decreasing; we must make room 1252          * Otherwise grp->S is decreasing; we must make room
988 * in the bucket list, and also recompute the group state. 1253 * in the bucket list, and also recompute the group state.
@@ -990,10 +1255,10 @@ static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl,
 990          * was in ER, make sure to adjust V. 1255          * was in ER, make sure to adjust V.
991 */ 1256 */
992 if (grp->full_slots) { 1257 if (grp->full_slots) {
993 if (!qfq_gt(grp->S, cl->S)) 1258 if (!qfq_gt(grp->S, agg->S))
994 goto skip_update; 1259 goto skip_update;
995 1260
996 /* create a slot for this cl->S */ 1261 /* create a slot for this agg->S */
997 qfq_slot_rotate(grp, roundedS); 1262 qfq_slot_rotate(grp, roundedS);
998 /* group was surely ineligible, remove */ 1263 /* group was surely ineligible, remove */
999 __clear_bit(grp->index, &q->bitmaps[IR]); 1264 __clear_bit(grp->index, &q->bitmaps[IR]);
@@ -1008,46 +1273,61 @@ static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl,
1008 1273
1009 pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n", 1274 pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n",
1010 s, q->bitmaps[s], 1275 s, q->bitmaps[s],
1011 (unsigned long long) cl->S, 1276 (unsigned long long) agg->S,
1012 (unsigned long long) cl->F, 1277 (unsigned long long) agg->F,
1013 (unsigned long long) q->V); 1278 (unsigned long long) q->V);
1014 1279
1015skip_update: 1280skip_update:
1016 qfq_slot_insert(grp, cl, roundedS); 1281 qfq_slot_insert(grp, agg, roundedS);
1017} 1282}
1018 1283
1019 1284
1285/* Update agg ts and schedule agg for service */
1286static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
1287 enum update_reason reason)
1288{
1289 qfq_update_agg_ts(q, agg, reason);
1290 qfq_schedule_agg(q, agg);
1291}
1292
1020static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, 1293static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
1021 struct qfq_class *cl) 1294 struct qfq_aggregate *agg)
1022{ 1295{
1023 unsigned int i, offset; 1296 unsigned int i, offset;
1024 u64 roundedS; 1297 u64 roundedS;
1025 1298
1026 roundedS = qfq_round_down(cl->S, grp->slot_shift); 1299 roundedS = qfq_round_down(agg->S, grp->slot_shift);
1027 offset = (roundedS - grp->S) >> grp->slot_shift; 1300 offset = (roundedS - grp->S) >> grp->slot_shift;
1301
1028 i = (grp->front + offset) % QFQ_MAX_SLOTS; 1302 i = (grp->front + offset) % QFQ_MAX_SLOTS;
1029 1303
1030 hlist_del(&cl->next); 1304 hlist_del(&agg->next);
1031 if (hlist_empty(&grp->slots[i])) 1305 if (hlist_empty(&grp->slots[i]))
1032 __clear_bit(offset, &grp->full_slots); 1306 __clear_bit(offset, &grp->full_slots);
1033} 1307}
1034 1308
1035/* 1309/*
1036 * called to forcibly destroy a queue. 1310 * Called to forcibly deschedule an aggregate. If the aggregate is
1037  * If the queue is not in the front bucket, or if it has 1311  * not in the front bucket, or if the front bucket also holds
1038  * other queues in the front bucket, we can simply remove 1312  * other aggregates, we can simply remove the aggregate with no other
1039 * the queue with no other side effects. 1313 * side effects.
1040 * Otherwise we must propagate the event up. 1314 * Otherwise we must propagate the event up.
1041 */ 1315 */
1042static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) 1316static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
1043{ 1317{
1044 struct qfq_group *grp = cl->grp; 1318 struct qfq_group *grp = agg->grp;
1045 unsigned long mask; 1319 unsigned long mask;
1046 u64 roundedS; 1320 u64 roundedS;
1047 int s; 1321 int s;
1048 1322
1049 cl->F = cl->S; 1323 if (agg == q->in_serv_agg) {
1050 qfq_slot_remove(q, grp, cl); 1324 charge_actual_service(agg);
1325 q->in_serv_agg = qfq_choose_next_agg(q);
1326 return;
1327 }
1328
1329 agg->F = agg->S;
1330 qfq_slot_remove(q, grp, agg);
1051 1331
1052 if (!grp->full_slots) { 1332 if (!grp->full_slots) {
1053 __clear_bit(grp->index, &q->bitmaps[IR]); 1333 __clear_bit(grp->index, &q->bitmaps[IR]);
@@ -1066,8 +1346,8 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
1066 } 1346 }
1067 __clear_bit(grp->index, &q->bitmaps[ER]); 1347 __clear_bit(grp->index, &q->bitmaps[ER]);
1068 } else if (hlist_empty(&grp->slots[grp->front])) { 1348 } else if (hlist_empty(&grp->slots[grp->front])) {
1069 cl = qfq_slot_scan(grp); 1349 agg = qfq_slot_scan(grp);
1070 roundedS = qfq_round_down(cl->S, grp->slot_shift); 1350 roundedS = qfq_round_down(agg->S, grp->slot_shift);
1071 if (grp->S != roundedS) { 1351 if (grp->S != roundedS) {
1072 __clear_bit(grp->index, &q->bitmaps[ER]); 1352 __clear_bit(grp->index, &q->bitmaps[ER]);
1073 __clear_bit(grp->index, &q->bitmaps[IR]); 1353 __clear_bit(grp->index, &q->bitmaps[IR]);
@@ -1080,7 +1360,7 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
1080 } 1360 }
1081 } 1361 }
1082 1362
1083 qfq_update_eligible(q, q->V); 1363 qfq_update_eligible(q);
1084} 1364}
1085 1365
1086static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) 1366static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
@@ -1092,6 +1372,32 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1092 qfq_deactivate_class(q, cl); 1372 qfq_deactivate_class(q, cl);
1093} 1373}
1094 1374
1375static unsigned int qfq_drop_from_slot(struct qfq_sched *q,
1376 struct hlist_head *slot)
1377{
1378 struct qfq_aggregate *agg;
1379 struct hlist_node *n;
1380 struct qfq_class *cl;
1381 unsigned int len;
1382
1383 hlist_for_each_entry(agg, n, slot, next) {
1384 list_for_each_entry(cl, &agg->active, alist) {
1385
1386 if (!cl->qdisc->ops->drop)
1387 continue;
1388
1389 len = cl->qdisc->ops->drop(cl->qdisc);
1390 if (len > 0) {
1391 if (cl->qdisc->q.qlen == 0)
1392 qfq_deactivate_class(q, cl);
1393
1394 return len;
1395 }
1396 }
1397 }
1398 return 0;
1399}
1400
1095static unsigned int qfq_drop(struct Qdisc *sch) 1401static unsigned int qfq_drop(struct Qdisc *sch)
1096{ 1402{
1097 struct qfq_sched *q = qdisc_priv(sch); 1403 struct qfq_sched *q = qdisc_priv(sch);
@@ -1101,24 +1407,13 @@ static unsigned int qfq_drop(struct Qdisc *sch)
1101 for (i = 0; i <= QFQ_MAX_INDEX; i++) { 1407 for (i = 0; i <= QFQ_MAX_INDEX; i++) {
1102 grp = &q->groups[i]; 1408 grp = &q->groups[i];
1103 for (j = 0; j < QFQ_MAX_SLOTS; j++) { 1409 for (j = 0; j < QFQ_MAX_SLOTS; j++) {
1104 struct qfq_class *cl; 1410 len = qfq_drop_from_slot(q, &grp->slots[j]);
1105 struct hlist_node *n; 1411 if (len > 0) {
1106 1412 sch->q.qlen--;
1107 hlist_for_each_entry(cl, n, &grp->slots[j], next) { 1413 return len;
1108
1109 if (!cl->qdisc->ops->drop)
1110 continue;
1111
1112 len = cl->qdisc->ops->drop(cl->qdisc);
1113 if (len > 0) {
1114 sch->q.qlen--;
1115 if (!cl->qdisc->q.qlen)
1116 qfq_deactivate_class(q, cl);
1117
1118 return len;
1119 }
1120 } 1414 }
1121 } 1415 }
1416
1122 } 1417 }
1123 1418
1124 return 0; 1419 return 0;
@@ -1129,44 +1424,51 @@ static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
1129 struct qfq_sched *q = qdisc_priv(sch); 1424 struct qfq_sched *q = qdisc_priv(sch);
1130 struct qfq_group *grp; 1425 struct qfq_group *grp;
1131 int i, j, err; 1426 int i, j, err;
1427 u32 max_cl_shift, maxbudg_shift, max_classes;
1132 1428
1133 err = qdisc_class_hash_init(&q->clhash); 1429 err = qdisc_class_hash_init(&q->clhash);
1134 if (err < 0) 1430 if (err < 0)
1135 return err; 1431 return err;
1136 1432
1433 if (qdisc_dev(sch)->tx_queue_len + 1 > QFQ_MAX_AGG_CLASSES)
1434 max_classes = QFQ_MAX_AGG_CLASSES;
1435 else
1436 max_classes = qdisc_dev(sch)->tx_queue_len + 1;
1437 /* max_cl_shift = floor(log_2(max_classes)) */
1438 max_cl_shift = __fls(max_classes);
1439 q->max_agg_classes = 1<<max_cl_shift;
1440
1441 /* maxbudg_shift = log2(max_len * max_classes_per_agg) */
1442 maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift;
1443 q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX;
1444
1137 for (i = 0; i <= QFQ_MAX_INDEX; i++) { 1445 for (i = 0; i <= QFQ_MAX_INDEX; i++) {
1138 grp = &q->groups[i]; 1446 grp = &q->groups[i];
1139 grp->index = i; 1447 grp->index = i;
1140 grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS 1448 grp->slot_shift = q->min_slot_shift + i;
1141 - (QFQ_MAX_INDEX - i);
1142 for (j = 0; j < QFQ_MAX_SLOTS; j++) 1449 for (j = 0; j < QFQ_MAX_SLOTS; j++)
1143 INIT_HLIST_HEAD(&grp->slots[j]); 1450 INIT_HLIST_HEAD(&grp->slots[j]);
1144 } 1451 }
1145 1452
1453 INIT_HLIST_HEAD(&q->nonfull_aggs);
1454
1146 return 0; 1455 return 0;
1147} 1456}
1148 1457
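The sizing logic in qfq_init_qdisc() can be checked with a quick user-space computation. The constants below (QFQ_MTU_SHIFT = 16, QFQ_MAX_INDEX = 24, FRAC_BITS = 30, QFQ_MAX_AGG_CLASSES = 8) are the ones defined earlier in this patch, hard-coded here for the sketch; tx_queue_len = 1000 is just a typical default.

	#include <stdio.h>

	int main(void)
	{
		unsigned int tx_queue_len = 1000;
		unsigned int max_classes = tx_queue_len + 1;

		if (max_classes > 8)            /* QFQ_MAX_AGG_CLASSES */
			max_classes = 8;

		/* floor(log2(max_classes)), i.e. __fls() in the kernel */
		unsigned int max_cl_shift = 31 - __builtin_clz(max_classes);
		unsigned int maxbudg_shift = 16 + max_cl_shift; /* QFQ_MTU_SHIFT */
		unsigned int min_slot_shift = 30 + maxbudg_shift - 24;

		/* prints: max_agg_classes=8 maxbudg_shift=19 min_slot_shift=25 */
		printf("max_agg_classes=%u maxbudg_shift=%u min_slot_shift=%u\n",
		       1u << max_cl_shift, maxbudg_shift, min_slot_shift);
		return 0;
	}

With these values each group's slot_shift becomes 25 + i, so higher-index groups (larger weight/lmax ratios) use proportionally coarser slots.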
1149static void qfq_reset_qdisc(struct Qdisc *sch) 1458static void qfq_reset_qdisc(struct Qdisc *sch)
1150{ 1459{
1151 struct qfq_sched *q = qdisc_priv(sch); 1460 struct qfq_sched *q = qdisc_priv(sch);
1152 struct qfq_group *grp;
1153 struct qfq_class *cl; 1461 struct qfq_class *cl;
1154 struct hlist_node *n, *tmp; 1462 struct hlist_node *n;
1155 unsigned int i, j; 1463 unsigned int i;
1156 1464
1157 for (i = 0; i <= QFQ_MAX_INDEX; i++) { 1465 for (i = 0; i < q->clhash.hashsize; i++) {
1158 grp = &q->groups[i]; 1466 hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
1159 for (j = 0; j < QFQ_MAX_SLOTS; j++) { 1467 if (cl->qdisc->q.qlen > 0)
1160 hlist_for_each_entry_safe(cl, n, tmp,
1161 &grp->slots[j], next) {
1162 qfq_deactivate_class(q, cl); 1468 qfq_deactivate_class(q, cl);
1163 }
1164 }
1165 }
1166 1469
1167 for (i = 0; i < q->clhash.hashsize; i++) {
1168 hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode)
1169 qdisc_reset(cl->qdisc); 1470 qdisc_reset(cl->qdisc);
1471 }
1170 } 1472 }
1171 sch->q.qlen = 0; 1473 sch->q.qlen = 0;
1172} 1474}