path: root/net/sched
author    Linus Torvalds <torvalds@linux-foundation.org>  2012-12-12 21:07:07 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2012-12-12 21:07:07 -0500
commit    6be35c700f742e911ecedd07fcc43d4439922334 (patch)
tree      ca9f37214d204465fcc2d79c82efd291e357c53c /net/sched
parent    e37aa63e87bd581f9be5555ed0ba83f5295c92fc (diff)
parent    520dfe3a3645257bf83660f672c47f8558f3d4c4 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking changes from David Miller:

 1) Allow to dump, monitor, and change the bridge multicast database using netlink. From Cong Wang.
 2) RFC 5961 TCP blind data injection attack mitigation, from Eric Dumazet.
 3) Networking user namespace support from Eric W. Biederman.
 4) tuntap/virtio-net multiqueue support by Jason Wang.
 5) Support for checksum offload of encapsulated packets (basically, tunneled traffic can still be checksummed by HW). From Joseph Gasparakis.
 6) Allow BPF filter access to VLAN tags, from Eric Dumazet and Daniel Borkmann.
 7) Bridge port parameters over netlink and BPDU blocking support from Stephen Hemminger.
 8) Improve data access patterns during inet socket demux by rearranging socket layout, from Eric Dumazet.
 9) TIPC protocol updates and cleanups from Ying Xue, Paul Gortmaker, and Jon Maloy.
10) Update TCP socket hash sizing to be more in line with current day realities. The existing heuristics were chosen a decade ago. From Eric Dumazet.
11) Fix races, queue bloat, and excessive wakeups in ATM and associated drivers, from Krzysztof Mazur and David Woodhouse.
12) Support DOVE (Distributed Overlay Virtual Ethernet) extensions in VXLAN driver, from David Stevens.
13) Add "oops_only" mode to netconsole, from Amerigo Wang.
14) Support set and query of VEB/VEPA bridge mode via PF_BRIDGE, also allow DCB netlink to work on namespaces other than the initial namespace. From John Fastabend.
15) Support PTP in the Tigon3 driver, from Matt Carlson.
16) tun/vhost zero copy fixes and improvements, plus turn it on by default, from Michael S. Tsirkin.
17) Support per-association statistics in SCTP, from Michele Baldessari.

And many, many driver updates, cleanups, and improvements. Too numerous to mention individually.

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1722 commits)
  net/mlx4_en: Add support for destination MAC in steering rules
  net/mlx4_en: Use generic etherdevice.h functions.
  net: ethtool: Add destination MAC address to flow steering API
  bridge: add support of adding and deleting mdb entries
  bridge: notify mdb changes via netlink
  ndisc: Unexport ndisc_{build,send}_skb().
  uapi: add missing netconf.h to export list
  pkt_sched: avoid requeues if possible
  solos-pci: fix double-free of TX skb in DMA mode
  bnx2: Fix accidental reversions.
  bna: Driver Version Updated to 3.1.2.1
  bna: Firmware update
  bna: Add RX State
  bna: Rx Page Based Allocation
  bna: TX Intr Coalescing Fix
  bna: Tx and Rx Optimizations
  bna: Code Cleanup and Enhancements
  ath9k: check pdata variable before dereferencing it
  ath5k: RX timestamp is reported at end of frame
  ath9k_htc: RX timestamp is reported at end of frame
  ...
Diffstat (limited to 'net/sched')
-rw-r--r--  net/sched/Kconfig        |   2
-rw-r--r--  net/sched/act_api.c      |   3
-rw-r--r--  net/sched/cls_api.c      |   2
-rw-r--r--  net/sched/cls_cgroup.c   |  24
-rw-r--r--  net/sched/sch_api.c      |  20
-rw-r--r--  net/sched/sch_cbq.c      |   3
-rw-r--r--  net/sched/sch_generic.c  |  11
-rw-r--r--  net/sched/sch_htb.c      | 139
-rw-r--r--  net/sched/sch_mq.c       |   4
-rw-r--r--  net/sched/sch_mqprio.c   |   4
-rw-r--r--  net/sched/sch_qfq.c      | 830
11 files changed, 717 insertions, 325 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 62fb51face8a..235e01acac51 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -509,7 +509,7 @@ config NET_EMATCH_TEXT
 
 config NET_EMATCH_CANID
 	tristate "CAN Identifier"
-	depends on NET_EMATCH && CAN
+	depends on NET_EMATCH && (CAN=y || CAN=m)
 	---help---
 	  Say Y here if you want to be able to classify CAN frames based
 	  on CAN Identifier.
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 102761d294cb..65d240cbf74b 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -987,6 +987,9 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 	u32 portid = skb ? NETLINK_CB(skb).portid : 0;
 	int ret = 0, ovr = 0;
 
+	if ((n->nlmsg_type != RTM_GETACTION) && !capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL);
 	if (ret < 0)
 		return ret;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 7ae02892437c..ff55ed6c49b2 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -139,6 +139,8 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 	int err;
 	int tp_created = 0;
 
+	if ((n->nlmsg_type != RTM_GETTFILTER) && !capable(CAP_NET_ADMIN))
+		return -EPERM;
 replay:
 	t = nlmsg_data(n);
 	protocol = TC_H_MIN(t->tcm_info);
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 31f06b633574..6db7855b9029 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -17,6 +17,7 @@
 #include <linux/skbuff.h>
 #include <linux/cgroup.h>
 #include <linux/rcupdate.h>
+#include <linux/fdtable.h>
 #include <net/rtnetlink.h>
 #include <net/pkt_cls.h>
 #include <net/sock.h>
@@ -57,6 +58,28 @@ static void cgrp_css_free(struct cgroup *cgrp)
 	kfree(cgrp_cls_state(cgrp));
 }
 
+static int update_classid(const void *v, struct file *file, unsigned n)
+{
+	int err;
+	struct socket *sock = sock_from_file(file, &err);
+	if (sock)
+		sock->sk->sk_classid = (u32)(unsigned long)v;
+	return 0;
+}
+
+static void cgrp_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+{
+	struct task_struct *p;
+	void *v;
+
+	cgroup_taskset_for_each(p, cgrp, tset) {
+		task_lock(p);
+		v = (void *)(unsigned long)task_cls_classid(p);
+		iterate_fd(p->files, 0, update_classid, v);
+		task_unlock(p);
+	}
+}
+
 static u64 read_classid(struct cgroup *cgrp, struct cftype *cft)
 {
 	return cgrp_cls_state(cgrp)->classid;
@@ -82,6 +105,7 @@ struct cgroup_subsys net_cls_subsys = {
 	.css_alloc	= cgrp_css_alloc,
 	.css_online	= cgrp_css_online,
 	.css_free	= cgrp_css_free,
+	.attach		= cgrp_attach,
 	.subsys_id	= net_cls_subsys_id,
 	.base_cftypes	= ss_files,
 	.module		= THIS_MODULE,
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index a18d975db59c..d84f7e734cd7 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -495,16 +495,15 @@ EXPORT_SYMBOL(qdisc_watchdog_init);
 
 void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
 {
-	ktime_t time;
-
 	if (test_bit(__QDISC_STATE_DEACTIVATED,
 		     &qdisc_root_sleeping(wd->qdisc)->state))
 		return;
 
 	qdisc_throttled(wd->qdisc);
-	time = ktime_set(0, 0);
-	time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
-	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
+
+	hrtimer_start(&wd->timer,
+		      ns_to_ktime(PSCHED_TICKS2NS(expires)),
+		      HRTIMER_MODE_ABS);
 }
 EXPORT_SYMBOL(qdisc_watchdog_schedule);
 
@@ -834,6 +833,8 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 			goto err_out3;
 		}
 		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
+		if (!netif_is_multiqueue(dev))
+			sch->flags |= TCQ_F_ONETXQUEUE;
 	}
 
 	sch->handle = handle;
@@ -981,6 +982,9 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 	struct Qdisc *p = NULL;
 	int err;
 
+	if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
 	if (!dev)
 		return -ENODEV;
@@ -1044,6 +1048,9 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 	struct Qdisc *q, *p;
 	int err;
 
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 replay:
 	/* Reinit, just in case something touches this. */
 	tcm = nlmsg_data(n);
@@ -1380,6 +1387,9 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 	u32 qid = TC_H_MAJ(clid);
 	int err;
 
+	if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
 	if (!dev)
 		return -ENODEV;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 564b9fc8efd3..0e19948470b8 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -509,8 +509,7 @@ static void cbq_ovl_delay(struct cbq_class *cl)
 		cl->cpriority = TC_CBQ_MAXPRIO;
 		q->pmask |= (1<<TC_CBQ_MAXPRIO);
 
-		expires = ktime_set(0, 0);
-		expires = ktime_add_ns(expires, PSCHED_TICKS2NS(sched));
+		expires = ns_to_ktime(PSCHED_TICKS2NS(sched));
 		if (hrtimer_try_to_cancel(&q->delay_timer) &&
 		    ktime_to_ns(ktime_sub(
 				hrtimer_get_expires(&q->delay_timer),
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index aefc1504dc88..5d81a4478514 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -53,20 +53,19 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
 {
 	struct sk_buff *skb = q->gso_skb;
+	const struct netdev_queue *txq = q->dev_queue;
 
 	if (unlikely(skb)) {
-		struct net_device *dev = qdisc_dev(q);
-		struct netdev_queue *txq;
-
 		/* check the reason of requeuing without tx lock first */
-		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+		txq = netdev_get_tx_queue(txq->dev, skb_get_queue_mapping(skb));
 		if (!netif_xmit_frozen_or_stopped(txq)) {
 			q->gso_skb = NULL;
 			q->q.qlen--;
 		} else
 			skb = NULL;
 	} else {
-		skb = q->dequeue(q);
+		if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq))
+			skb = q->dequeue(q);
 	}
 
 	return skb;
@@ -686,6 +685,8 @@ static void attach_one_default_qdisc(struct net_device *dev,
 			netdev_info(dev, "activation failed\n");
 			return;
 		}
+		if (!netif_is_multiqueue(dev))
+			qdisc->flags |= TCQ_F_ONETXQUEUE;
 	}
 	dev_queue->qdisc_sleeping = qdisc;
 }
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 9d75b7761313..d2922c0ef57a 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -71,6 +71,12 @@ enum htb_cmode {
 	HTB_CAN_SEND		/* class can send */
 };
 
+struct htb_rate_cfg {
+	u64 rate_bps;
+	u32 mult;
+	u32 shift;
+};
+
 /* interior & leaf nodes; props specific to leaves are marked L: */
 struct htb_class {
 	struct Qdisc_class_common common;
@@ -118,11 +124,11 @@ struct htb_class {
 	int filter_cnt;
 
 	/* token bucket parameters */
-	struct qdisc_rate_table *rate;	/* rate table of the class itself */
-	struct qdisc_rate_table *ceil;	/* ceiling rate (limits borrows too) */
-	long buffer, cbuffer;		/* token bucket depth/rate */
+	struct htb_rate_cfg rate;
+	struct htb_rate_cfg ceil;
+	s64 buffer, cbuffer;		/* token bucket depth/rate */
 	psched_tdiff_t mbuffer;		/* max wait time */
-	long tokens, ctokens;		/* current number of tokens */
+	s64 tokens, ctokens;		/* current number of tokens */
 	psched_time_t t_c;		/* checkpoint time */
 };
 
@@ -162,6 +168,45 @@ struct htb_sched {
 	struct work_struct work;
 };
 
+static u64 l2t_ns(struct htb_rate_cfg *r, unsigned int len)
+{
+	return ((u64)len * r->mult) >> r->shift;
+}
+
+static void htb_precompute_ratedata(struct htb_rate_cfg *r)
+{
+	u64 factor;
+	u64 mult;
+	int shift;
+
+	r->shift = 0;
+	r->mult = 1;
+	/*
+	 * Calibrate mult, shift so that token counting is accurate
+	 * for smallest packet size (64 bytes). Token (time in ns) is
+	 * computed as (bytes * 8) * NSEC_PER_SEC / rate_bps. It will
+	 * work as long as the smallest packet transfer time can be
+	 * accurately represented in nanosec.
+	 */
+	if (r->rate_bps > 0) {
+		/*
+		 * Higher shift gives better accuracy. Find the largest
+		 * shift such that mult fits in 32 bits.
+		 */
+		for (shift = 0; shift < 16; shift++) {
+			r->shift = shift;
+			factor = 8LLU * NSEC_PER_SEC * (1 << r->shift);
+			mult = div64_u64(factor, r->rate_bps);
+			if (mult > UINT_MAX)
+				break;
+		}
+
+		r->shift = shift - 1;
+		factor = 8LLU * NSEC_PER_SEC * (1 << r->shift);
+		r->mult = div64_u64(factor, r->rate_bps);
+	}
+}
+
 /* find class in global hash table using given handle */
 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
 {
@@ -273,7 +318,7 @@ static void htb_add_to_id_tree(struct rb_root *root,
  * already in the queue.
  */
 static void htb_add_to_wait_tree(struct htb_sched *q,
-				 struct htb_class *cl, long delay)
+				 struct htb_class *cl, s64 delay)
 {
 	struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
 
@@ -441,14 +486,14 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
 	htb_remove_class_from_row(q, cl, mask);
 }
 
-static inline long htb_lowater(const struct htb_class *cl)
+static inline s64 htb_lowater(const struct htb_class *cl)
 {
 	if (htb_hysteresis)
 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
 	else
 		return 0;
 }
-static inline long htb_hiwater(const struct htb_class *cl)
+static inline s64 htb_hiwater(const struct htb_class *cl)
 {
 	if (htb_hysteresis)
 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
@@ -469,9 +514,9 @@ static inline long htb_hiwater(const struct htb_class *cl)
  * mode transitions per time unit. The speed gain is about 1/6.
  */
 static inline enum htb_cmode
-htb_class_mode(struct htb_class *cl, long *diff)
+htb_class_mode(struct htb_class *cl, s64 *diff)
 {
-	long toks;
+	s64 toks;
 
 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
 		*diff = -toks;
@@ -495,7 +540,7 @@ htb_class_mode(struct htb_class *cl, long *diff)
  * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
  */
 static void
-htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
+htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
 {
 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
 
@@ -581,26 +626,26 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	return NET_XMIT_SUCCESS;
 }
 
-static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, long diff)
+static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
 {
-	long toks = diff + cl->tokens;
+	s64 toks = diff + cl->tokens;
 
 	if (toks > cl->buffer)
 		toks = cl->buffer;
-	toks -= (long) qdisc_l2t(cl->rate, bytes);
+	toks -= (s64) l2t_ns(&cl->rate, bytes);
 	if (toks <= -cl->mbuffer)
 		toks = 1 - cl->mbuffer;
 
 	cl->tokens = toks;
 }
 
-static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, long diff)
+static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
 {
-	long toks = diff + cl->ctokens;
+	s64 toks = diff + cl->ctokens;
 
 	if (toks > cl->cbuffer)
 		toks = cl->cbuffer;
-	toks -= (long) qdisc_l2t(cl->ceil, bytes);
+	toks -= (s64) l2t_ns(&cl->ceil, bytes);
 	if (toks <= -cl->mbuffer)
 		toks = 1 - cl->mbuffer;
 
@@ -623,10 +668,10 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
 {
 	int bytes = qdisc_pkt_len(skb);
 	enum htb_cmode old_mode;
-	long diff;
+	s64 diff;
 
 	while (cl) {
-		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
+		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
 		if (cl->level >= level) {
 			if (cl->level == level)
 				cl->xstats.lends++;
@@ -673,7 +718,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
 	unsigned long stop_at = start + 2;
 	while (time_before(jiffies, stop_at)) {
 		struct htb_class *cl;
-		long diff;
+		s64 diff;
 		struct rb_node *p = rb_first(&q->wait_pq[level]);
 
 		if (!p)
@@ -684,7 +729,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
 			return cl->pq_key;
 
 		htb_safe_rb_erase(p, q->wait_pq + level);
-		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
+		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
 		htb_change_class_mode(q, cl, &diff);
 		if (cl->cmode != HTB_CAN_SEND)
 			htb_add_to_wait_tree(q, cl, diff);
@@ -871,10 +916,10 @@ ok:
 
 	if (!sch->q.qlen)
 		goto fin;
-	q->now = psched_get_time();
+	q->now = ktime_to_ns(ktime_get());
 	start_at = jiffies;
 
-	next_event = q->now + 5 * PSCHED_TICKS_PER_SEC;
+	next_event = q->now + 5 * NSEC_PER_SEC;
 
 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
 		/* common case optimization - skip event handler quickly */
@@ -884,7 +929,7 @@ ok:
 		if (q->now >= q->near_ev_cache[level]) {
 			event = htb_do_events(q, level, start_at);
 			if (!event)
-				event = q->now + PSCHED_TICKS_PER_SEC;
+				event = q->now + NSEC_PER_SEC;
 			q->near_ev_cache[level] = event;
 		} else
 			event = q->near_ev_cache[level];
@@ -903,10 +948,17 @@ ok:
 		}
 	}
 	sch->qstats.overlimits++;
-	if (likely(next_event > q->now))
-		qdisc_watchdog_schedule(&q->watchdog, next_event);
-	else
+	if (likely(next_event > q->now)) {
+		if (!test_bit(__QDISC_STATE_DEACTIVATED,
+			      &qdisc_root_sleeping(q->watchdog.qdisc)->state)) {
+			ktime_t time = ns_to_ktime(next_event);
+			qdisc_throttled(q->watchdog.qdisc);
+			hrtimer_start(&q->watchdog.timer, time,
+				      HRTIMER_MODE_ABS);
+		}
+	} else {
 		schedule_work(&q->work);
+	}
 fin:
 	return skb;
 }
@@ -1082,9 +1134,9 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
 
 	memset(&opt, 0, sizeof(opt));
 
-	opt.rate = cl->rate->rate;
+	opt.rate.rate = cl->rate.rate_bps >> 3;
 	opt.buffer = cl->buffer;
-	opt.ceil = cl->ceil->rate;
+	opt.ceil.rate = cl->ceil.rate_bps >> 3;
 	opt.cbuffer = cl->cbuffer;
 	opt.quantum = cl->quantum;
 	opt.prio = cl->prio;
@@ -1203,9 +1255,6 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
 		qdisc_destroy(cl->un.leaf.q);
 	}
 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
-	qdisc_put_rtab(cl->rate);
-	qdisc_put_rtab(cl->ceil);
-
 	tcf_destroy_chain(&cl->filter_list);
 	kfree(cl);
 }
@@ -1307,7 +1356,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 	struct htb_sched *q = qdisc_priv(sch);
 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
 	struct nlattr *opt = tca[TCA_OPTIONS];
-	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
 	struct nlattr *tb[__TCA_HTB_MAX];
 	struct tc_htb_opt *hopt;
 
@@ -1326,10 +1374,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
 
 	hopt = nla_data(tb[TCA_HTB_PARMS]);
-
-	rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]);
-	ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]);
-	if (!rtab || !ctab)
+	if (!hopt->rate.rate || !hopt->ceil.rate)
 		goto failure;
 
 	if (!cl) {		/* new class */
@@ -1439,7 +1484,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 		 * is really leaf before changing cl->un.leaf !
 		 */
 		if (!cl->level) {
-			cl->quantum = rtab->rate.rate / q->rate2quantum;
+			cl->quantum = hopt->rate.rate / q->rate2quantum;
 			if (!hopt->quantum && cl->quantum < 1000) {
 				pr_warning(
 				       "HTB: quantum of class %X is small. Consider r2q change.\n",
@@ -1460,12 +1505,16 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 
 	cl->buffer = hopt->buffer;
 	cl->cbuffer = hopt->cbuffer;
-	if (cl->rate)
-		qdisc_put_rtab(cl->rate);
-	cl->rate = rtab;
-	if (cl->ceil)
-		qdisc_put_rtab(cl->ceil);
-	cl->ceil = ctab;
+
+	cl->rate.rate_bps = (u64)hopt->rate.rate << 3;
+	cl->ceil.rate_bps = (u64)hopt->ceil.rate << 3;
+
+	htb_precompute_ratedata(&cl->rate);
+	htb_precompute_ratedata(&cl->ceil);
+
+	cl->buffer = hopt->buffer << PSCHED_SHIFT;
+	cl->cbuffer = hopt->buffer << PSCHED_SHIFT;
+
 	sch_tree_unlock(sch);
 
 	qdisc_class_hash_grow(sch, &q->clhash);
@@ -1474,10 +1523,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 	return 0;
 
 failure:
-	if (rtab)
-		qdisc_put_rtab(rtab);
-	if (ctab)
-		qdisc_put_rtab(ctab);
 	return err;
 }
 
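[Editor's note, not part of the patch] The sch_htb.c hunks above replace the old rate-table lookups with a precomputed mult/shift pair: htb_precompute_ratedata() picks the largest shift below 16 for which mult = 8 * NSEC_PER_SEC * 2^shift / rate_bps still fits in 32 bits, so that l2t_ns() can turn a packet length into a transmission time in nanoseconds with one multiply and one shift. The following standalone userspace sketch mirrors that calculation; the 1 Gbit/s sample rate is made up for illustration.

/* sketch of the HTB mult/shift precomputation, userspace version */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

struct rate_cfg {
	uint64_t rate_bps;	/* configured rate in bits per second */
	uint32_t mult;
	uint32_t shift;
};

/* pick the largest shift (< 16) for which mult still fits in 32 bits */
static void precompute(struct rate_cfg *r)
{
	uint64_t factor, mult;
	int shift;

	r->shift = 0;
	r->mult = 1;
	if (r->rate_bps == 0)
		return;

	for (shift = 0; shift < 16; shift++) {
		r->shift = shift;
		factor = 8ULL * NSEC_PER_SEC * (1 << r->shift);
		mult = factor / r->rate_bps;
		if (mult > UINT32_MAX)
			break;
	}
	r->shift = shift - 1;
	factor = 8ULL * NSEC_PER_SEC * (1 << r->shift);
	r->mult = factor / r->rate_bps;
}

/* len bytes -> transmission time in ns, as in l2t_ns() */
static uint64_t l2t_ns(const struct rate_cfg *r, unsigned int len)
{
	return ((uint64_t)len * r->mult) >> r->shift;
}

int main(void)
{
	struct rate_cfg r = { .rate_bps = 1000000000ULL };	/* 1 Gbit/s */

	precompute(&r);
	/* exact value for a 64-byte frame is 64 * 8 = 512 ns */
	printf("mult=%u shift=%u 64B -> %llu ns\n",
	       r.mult, r.shift, (unsigned long long)l2t_ns(&r, 64));
	return 0;
}

For 1 Gbit/s the loop ends with shift = 15 and mult = 262144, and l2t_ns() reports 512 ns for a 64-byte frame, matching the exact transmission time.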
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 0a4b2f9a0094..5da78a19ac9a 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -63,6 +63,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
 		if (qdisc == NULL)
 			goto err;
 		priv->qdiscs[ntx] = qdisc;
+		qdisc->flags |= TCQ_F_ONETXQUEUE;
 	}
 
 	sch->flags |= TCQ_F_MQROOT;
@@ -150,7 +151,8 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
 		dev_deactivate(dev);
 
 	*old = dev_graft_qdisc(dev_queue, new);
-
+	if (new)
+		new->flags |= TCQ_F_ONETXQUEUE;
 	if (dev->flags & IFF_UP)
 		dev_activate(dev);
 	return 0;
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index d1831ca966d4..accec33c454c 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -132,6 +132,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 			goto err;
 		}
 		priv->qdiscs[i] = qdisc;
+		qdisc->flags |= TCQ_F_ONETXQUEUE;
 	}
 
 	/* If the mqprio options indicate that hardware should own
@@ -205,6 +206,9 @@ static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
 
 	*old = dev_graft_qdisc(dev_queue, new);
 
+	if (new)
+		new->flags |= TCQ_F_ONETXQUEUE;
+
 	if (dev->flags & IFF_UP)
 		dev_activate(dev);
 
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 9687fa1c2275..6ed37652a4c3 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -1,7 +1,8 @@
 /*
- * net/sched/sch_qfq.c	Quick Fair Queueing Scheduler.
+ * net/sched/sch_qfq.c	Quick Fair Queueing Plus Scheduler.
  *
  * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
+ * Copyright (c) 2012 Paolo Valente.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -19,12 +20,18 @@
 #include <net/pkt_cls.h>
 
 
-/* Quick Fair Queueing
-   ===================
+/* Quick Fair Queueing Plus
+   ========================
 
    Sources:
 
-   Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient
+   [1] Paolo Valente,
+   "Reducing the Execution Time of Fair-Queueing Schedulers."
+   http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf
+
+   Sources for QFQ:
+
+   [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient
    Packet Scheduling with Tight Bandwidth Distribution Guarantees."
 
    See also:
@@ -33,6 +40,20 @@
 
 /*
 
+   QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES
+   classes. Each aggregate is timestamped with a virtual start time S
+   and a virtual finish time F, and scheduled according to its
+   timestamps. S and F are computed as a function of a system virtual
+   time function V. The classes within each aggregate are instead
+   scheduled with DRR.
+
+   To speed up operations, QFQ+ divides also aggregates into a limited
+   number of groups. Which group a class belongs to depends on the
+   ratio between the maximum packet length for the class and the weight
+   of the class. Groups have their own S and F. In the end, QFQ+
+   schedules groups, then aggregates within groups, then classes within
+   aggregates. See [1] and [2] for a full description.
+
    Virtual time computations.
 
    S, F and V are all computed in fixed point arithmetic with
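[Editor's note, not part of the patch] The comment added above says that the group a class (or aggregate) belongs to is determined by the ratio between its maximum packet length and its weight; qfq_calc_index(), visible further down in this diff, computes that group index in the same FRAC_BITS fixed-point arithmetic. The short standalone sketch below reproduces the computation in userspace; the min_slot_shift value follows the QFQ_MIN_SLOT_SHIFT formula from the pre-QFQ+ code, and the sample weight/lmax are invented for illustration.

/* sketch of the QFQ group-index computation, userspace version */
#include <stdio.h>
#include <stdint.h>

#define FRAC_BITS	30			/* fixed point arithmetic */
#define ONE_FP		(1UL << FRAC_BITS)
#define QFQ_MTU_SHIFT	16
#define QFQ_MAX_INDEX	24
#define MIN_SLOT_SHIFT	(FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX)

/* mirrors qfq_calc_index(): essentially log2 of the scaled slot size */
static int calc_index(uint32_t inv_w, unsigned int maxlen, uint32_t min_slot_shift)
{
	uint64_t slot_size = (uint64_t)maxlen * inv_w;
	uint64_t size_map = slot_size >> min_slot_shift;
	int index;

	if (!size_map)
		return 0;

	index = 64 - __builtin_clzll(size_map);	/* basically a log_2 */
	index -= !(slot_size - (1ULL << (index + min_slot_shift - 1)));
	return index < 0 ? 0 : index;
}

int main(void)
{
	uint32_t weight = 10, lmax = 1514;	/* hypothetical class */
	uint32_t inv_w = ONE_FP / weight;	/* inverse weight in fixed point */

	printf("group index = %d\n", calc_index(inv_w, lmax, MIN_SLOT_SHIFT));
	return 0;
}

With weight 10 and lmax 1514 the scaled slot size is about 1.6e11 and the computed group index is 16; heavier classes with the same lmax get a smaller slot size and therefore land in lower-indexed groups.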
@@ -76,27 +97,28 @@
 #define QFQ_MAX_SLOTS	32
 
 /*
- * Shifts used for class<->group mapping. We allow class weights that are
- * in the range [1, 2^MAX_WSHIFT], and we try to map each class i to the
+ * Shifts used for aggregate<->group mapping. We allow class weights that are
+ * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the
  * group with the smallest index that can support the L_i / r_i configured
- * for the class.
+ * for the classes in the aggregate.
  *
  * grp->index is the index of the group; and grp->slot_shift
  * is the shift for the corresponding (scaled) sigma_i.
  */
 #define QFQ_MAX_INDEX		24
-#define QFQ_MAX_WSHIFT		12
+#define QFQ_MAX_WSHIFT		10
 
-#define	QFQ_MAX_WEIGHT		(1<<QFQ_MAX_WSHIFT)
-#define QFQ_MAX_WSUM		(16*QFQ_MAX_WEIGHT)
+#define	QFQ_MAX_WEIGHT		(1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */
+#define QFQ_MAX_WSUM		(64*QFQ_MAX_WEIGHT)
 
 #define FRAC_BITS		30	/* fixed point arithmetic */
 #define ONE_FP			(1UL << FRAC_BITS)
 #define IWSUM			(ONE_FP/QFQ_MAX_WSUM)
 
 #define QFQ_MTU_SHIFT		16	/* to support TSO/GSO */
-#define QFQ_MIN_SLOT_SHIFT	(FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX)
-#define QFQ_MIN_LMAX		256	/* min possible lmax for a class */
+#define QFQ_MIN_LMAX		512	/* see qfq_slot_insert */
+
+#define QFQ_MAX_AGG_CLASSES	8 /* max num classes per aggregate allowed */
 
 /*
  * Possible group states. These values are used as indexes for the bitmaps
@@ -106,6 +128,8 @@ enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
 
 struct qfq_group;
 
+struct qfq_aggregate;
+
 struct qfq_class {
 	struct Qdisc_class_common common;
 
@@ -116,7 +140,12 @@ struct qfq_class {
 	struct gnet_stats_queue qstats;
 	struct gnet_stats_rate_est rate_est;
 	struct Qdisc *qdisc;
+	struct list_head alist;		/* Link for active-classes list. */
+	struct qfq_aggregate *agg;	/* Parent aggregate. */
+	int deficit;			/* DRR deficit counter. */
+};
 
+struct qfq_aggregate {
 	struct hlist_node next;	/* Link for the slot list. */
 	u64 S, F;		/* flow timestamps (exact) */
 
@@ -127,8 +156,18 @@ struct qfq_class {
 	struct qfq_group *grp;
 
 	/* these are copied from the flowset. */
-	u32 inv_w;		/* ONE_FP/weight */
-	u32 lmax;		/* Max packet size for this flow. */
+	u32	class_weight; /* Weight of each class in this aggregate. */
+	/* Max pkt size for the classes in this aggregate, DRR quantum. */
+	int	lmax;
+
+	u32	inv_w;	    /* ONE_FP/(sum of weights of classes in aggr.). */
+	u32	budgetmax;  /* Max budget for this aggregate. */
+	u32	initial_budget, budget; /* Initial and current budget. */
+
+	int	num_classes;		/* Number of classes in this aggr. */
+	struct list_head active;	/* DRR queue of active classes. */
+
+	struct hlist_node nonfull_next;	/* See nonfull_aggs in qfq_sched. */
 };
 
 struct qfq_group {
@@ -138,7 +177,7 @@ struct qfq_group {
 	unsigned int front;			/* Index of the front slot. */
 	unsigned long full_slots;		/* non-empty slots */
 
-	/* Array of RR lists of active classes. */
+	/* Array of RR lists of active aggregates. */
 	struct hlist_head slots[QFQ_MAX_SLOTS];
 };
 
@@ -146,13 +185,28 @@ struct qfq_sched {
 	struct tcf_proto *filter_list;
 	struct Qdisc_class_hash clhash;
 
-	u64 V;			/* Precise virtual time. */
-	u32 wsum;		/* weight sum */
+	u64 oldV, V;		/* Precise virtual times. */
+	struct qfq_aggregate *in_serv_agg;   /* Aggregate being served. */
+	u32 num_active_agg;	/* Num. of active aggregates */
+	u32 wsum;		/* weight sum */
 
 	unsigned long bitmaps[QFQ_MAX_STATE];	    /* Group bitmaps. */
 	struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
+	u32 min_slot_shift;	/* Index of the group-0 bit in the bitmaps. */
+
+	u32 max_agg_classes;	/* Max number of classes per aggr. */
+	struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */
 };
 
+/*
+ * Possible reasons why the timestamps of an aggregate are updated
+ * enqueue: the aggregate switches from idle to active and must scheduled
+ *	    for service
+ * requeue: the aggregate finishes its budget, so it stops being served and
+ *	    must be rescheduled for service
+ */
+enum update_reason {enqueue, requeue};
+
 static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
 {
 	struct qfq_sched *q = qdisc_priv(sch);
@@ -182,18 +236,18 @@ static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
  * index = log_2(maxlen/weight) but we need to apply the scaling.
  * This is used only once at flow creation.
  */
-static int qfq_calc_index(u32 inv_w, unsigned int maxlen)
+static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift)
 {
 	u64 slot_size = (u64)maxlen * inv_w;
 	unsigned long size_map;
 	int index = 0;
 
-	size_map = slot_size >> QFQ_MIN_SLOT_SHIFT;
+	size_map = slot_size >> min_slot_shift;
 	if (!size_map)
 		goto out;
 
 	index = __fls(size_map) + 1;	/* basically a log_2 */
-	index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1)));
+	index -= !(slot_size - (1ULL << (index + min_slot_shift - 1)));
 
 	if (index < 0)
 		index = 0;
@@ -204,66 +258,150 @@ out:
 	return index;
 }
 
-/* Length of the next packet (0 if the queue is empty). */
-static unsigned int qdisc_peek_len(struct Qdisc *sch)
-{
-	struct sk_buff *skb;
-
-	skb = sch->ops->peek(sch);
-	return skb ? qdisc_pkt_len(skb) : 0;
+static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *);
+static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *,
+			     enum update_reason);
+
+static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
+			 u32 lmax, u32 weight)
+{
+	INIT_LIST_HEAD(&agg->active);
+	hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
+
+	agg->lmax = lmax;
+	agg->class_weight = weight;
+}
+
+static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q,
+					  u32 lmax, u32 weight)
+{
+	struct qfq_aggregate *agg;
+	struct hlist_node *n;
+
+	hlist_for_each_entry(agg, n, &q->nonfull_aggs, nonfull_next)
+		if (agg->lmax == lmax && agg->class_weight == weight)
+			return agg;
+
+	return NULL;
+}
+
+
+/* Update aggregate as a function of the new number of classes. */
+static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
+			   int new_num_classes)
+{
+	u32 new_agg_weight;
+
+	if (new_num_classes == q->max_agg_classes)
+		hlist_del_init(&agg->nonfull_next);
+
+	if (agg->num_classes > new_num_classes &&
+	    new_num_classes == q->max_agg_classes - 1) /* agg no more full */
+		hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
+
+	agg->budgetmax = new_num_classes * agg->lmax;
+	new_agg_weight = agg->class_weight * new_num_classes;
+	agg->inv_w = ONE_FP/new_agg_weight;
+
+	if (agg->grp == NULL) {
+		int i = qfq_calc_index(agg->inv_w, agg->budgetmax,
+				       q->min_slot_shift);
+		agg->grp = &q->groups[i];
+	}
+
+	q->wsum +=
+		(int) agg->class_weight * (new_num_classes - agg->num_classes);
+
+	agg->num_classes = new_num_classes;
+}
+
+/* Add class to aggregate. */
+static void qfq_add_to_agg(struct qfq_sched *q,
+			   struct qfq_aggregate *agg,
+			   struct qfq_class *cl)
+{
+	cl->agg = agg;
+
+	qfq_update_agg(q, agg, agg->num_classes+1);
+	if (cl->qdisc->q.qlen > 0) { /* adding an active class */
+		list_add_tail(&cl->alist, &agg->active);
+		if (list_first_entry(&agg->active, struct qfq_class, alist) ==
+		    cl && q->in_serv_agg != agg) /* agg was inactive */
+			qfq_activate_agg(q, agg, enqueue); /* schedule agg */
+	}
 }
 
-static void qfq_deactivate_class(struct qfq_sched *, struct qfq_class *);
-static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl,
-			       unsigned int len);
+static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *);
 
-static void qfq_update_class_params(struct qfq_sched *q, struct qfq_class *cl,
-				    u32 lmax, u32 inv_w, int delta_w)
+static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
 {
-	int i;
+	if (!hlist_unhashed(&agg->nonfull_next))
+		hlist_del_init(&agg->nonfull_next);
+	if (q->in_serv_agg == agg)
+		q->in_serv_agg = qfq_choose_next_agg(q);
+	kfree(agg);
+}
 
-	/* update qfq-specific data */
-	cl->lmax = lmax;
-	cl->inv_w = inv_w;
-	i = qfq_calc_index(cl->inv_w, cl->lmax);
+/* Deschedule class from within its parent aggregate. */
+static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
+{
+	struct qfq_aggregate *agg = cl->agg;
 
-	cl->grp = &q->groups[i];
 
-	q->wsum += delta_w;
+	list_del(&cl->alist); /* remove from RR queue of the aggregate */
+	if (list_empty(&agg->active)) /* agg is now inactive */
+		qfq_deactivate_agg(q, agg);
 }
 
-static void qfq_update_reactivate_class(struct qfq_sched *q,
-					struct qfq_class *cl,
-					u32 inv_w, u32 lmax, int delta_w)
+/* Remove class from its parent aggregate. */
+static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
 {
-	bool need_reactivation = false;
-	int i = qfq_calc_index(inv_w, lmax);
-
-	if (&q->groups[i] != cl->grp && cl->qdisc->q.qlen > 0) {
-		/*
-		 * shift cl->F back, to not charge the
-		 * class for the not-yet-served head
-		 * packet
-		 */
-		cl->F = cl->S;
-		/* remove class from its slot in the old group */
-		qfq_deactivate_class(q, cl);
-		need_reactivation = true;
+	struct qfq_aggregate *agg = cl->agg;
+
+	cl->agg = NULL;
+	if (agg->num_classes == 1) { /* agg being emptied, destroy it */
+		qfq_destroy_agg(q, agg);
+		return;
 	}
+	qfq_update_agg(q, agg, agg->num_classes-1);
+}
 
-	qfq_update_class_params(q, cl, lmax, inv_w, delta_w);
+/* Deschedule class and remove it from its parent aggregate. */
+static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
+{
+	if (cl->qdisc->q.qlen > 0) /* class is active */
+		qfq_deactivate_class(q, cl);
 
-	if (need_reactivation) /* activate in new group */
-		qfq_activate_class(q, cl, qdisc_peek_len(cl->qdisc));
+	qfq_rm_from_agg(q, cl);
 }
 
+/* Move class to a new aggregate, matching the new class weight and/or lmax */
+static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight,
+			   u32 lmax)
+{
+	struct qfq_sched *q = qdisc_priv(sch);
+	struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight);
+
+	if (new_agg == NULL) { /* create new aggregate */
+		new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC);
+		if (new_agg == NULL)
+			return -ENOBUFS;
+		qfq_init_agg(q, new_agg, lmax, weight);
+	}
+	qfq_deact_rm_from_agg(q, cl);
+	qfq_add_to_agg(q, new_agg, cl);
+
+	return 0;
+}
 
 static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 			    struct nlattr **tca, unsigned long *arg)
 {
 	struct qfq_sched *q = qdisc_priv(sch);
 	struct qfq_class *cl = (struct qfq_class *)*arg;
+	bool existing = false;
 	struct nlattr *tb[TCA_QFQ_MAX + 1];
+	struct qfq_aggregate *new_agg = NULL;
 	u32 weight, lmax, inv_w;
 	int err;
 	int delta_w;
@@ -286,15 +424,6 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 	} else
 		weight = 1;
 
-	inv_w = ONE_FP / weight;
-	weight = ONE_FP / inv_w;
-	delta_w = weight - (cl ? ONE_FP / cl->inv_w : 0);
-	if (q->wsum + delta_w > QFQ_MAX_WSUM) {
-		pr_notice("qfq: total weight out of range (%u + %u)\n",
-			  delta_w, q->wsum);
-		return -EINVAL;
-	}
-
 	if (tb[TCA_QFQ_LMAX]) {
 		lmax = nla_get_u32(tb[TCA_QFQ_LMAX]);
 		if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) {
@@ -304,7 +433,23 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 	} else
 		lmax = psched_mtu(qdisc_dev(sch));
 
-	if (cl != NULL) {
+	inv_w = ONE_FP / weight;
+	weight = ONE_FP / inv_w;
+
+	if (cl != NULL &&
+	    lmax == cl->agg->lmax &&
+	    weight == cl->agg->class_weight)
+		return 0; /* nothing to change */
+
+	delta_w = weight - (cl ? cl->agg->class_weight : 0);
+
+	if (q->wsum + delta_w > QFQ_MAX_WSUM) {
+		pr_notice("qfq: total weight out of range (%d + %u)\n",
+			  delta_w, q->wsum);
+		return -EINVAL;
+	}
+
+	if (cl != NULL) { /* modify existing class */
 		if (tca[TCA_RATE]) {
 			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
 						    qdisc_root_sleeping_lock(sch),
@@ -312,25 +457,18 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 			if (err)
 				return err;
 		}
-
-		if (lmax == cl->lmax && inv_w == cl->inv_w)
-			return 0; /* nothing to update */
-
-		sch_tree_lock(sch);
-		qfq_update_reactivate_class(q, cl, inv_w, lmax, delta_w);
-		sch_tree_unlock(sch);
-
-		return 0;
+		existing = true;
+		goto set_change_agg;
 	}
 
+	/* create and init new class */
 	cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL);
 	if (cl == NULL)
 		return -ENOBUFS;
 
 	cl->refcnt = 1;
 	cl->common.classid = classid;
-
-	qfq_update_class_params(q, cl, lmax, inv_w, delta_w);
+	cl->deficit = lmax;
 
 	cl->qdisc = qdisc_create_dflt(sch->dev_queue,
 				      &pfifo_qdisc_ops, classid);
@@ -341,11 +479,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
 					qdisc_root_sleeping_lock(sch),
 					tca[TCA_RATE]);
-		if (err) {
-			qdisc_destroy(cl->qdisc);
-			kfree(cl);
-			return err;
-		}
+		if (err)
+			goto destroy_class;
 	}
 
 	sch_tree_lock(sch);
@@ -354,19 +489,39 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 
 	qdisc_class_hash_grow(sch, &q->clhash);
 
+set_change_agg:
+	sch_tree_lock(sch);
+	new_agg = qfq_find_agg(q, lmax, weight);
+	if (new_agg == NULL) { /* create new aggregate */
+		sch_tree_unlock(sch);
+		new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
+		if (new_agg == NULL) {
+			err = -ENOBUFS;
+			gen_kill_estimator(&cl->bstats, &cl->rate_est);
+			goto destroy_class;
+		}
+		sch_tree_lock(sch);
+		qfq_init_agg(q, new_agg, lmax, weight);
+	}
+	if (existing)
+		qfq_deact_rm_from_agg(q, cl);
+	qfq_add_to_agg(q, new_agg, cl);
+	sch_tree_unlock(sch);
+
 	*arg = (unsigned long)cl;
 	return 0;
+
+destroy_class:
+	qdisc_destroy(cl->qdisc);
+	kfree(cl);
+	return err;
 }
 
 static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
 {
 	struct qfq_sched *q = qdisc_priv(sch);
 
-	if (cl->inv_w) {
-		q->wsum -= ONE_FP / cl->inv_w;
-		cl->inv_w = 0;
-	}
-
+	qfq_rm_from_agg(q, cl);
 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
 	qdisc_destroy(cl->qdisc);
 	kfree(cl);
@@ -481,8 +636,8 @@ static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
 	nest = nla_nest_start(skb, TCA_OPTIONS);
 	if (nest == NULL)
 		goto nla_put_failure;
-	if (nla_put_u32(skb, TCA_QFQ_WEIGHT, ONE_FP/cl->inv_w) ||
-	    nla_put_u32(skb, TCA_QFQ_LMAX, cl->lmax))
+	if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) ||
+	    nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax))
 		goto nla_put_failure;
 	return nla_nest_end(skb, nest);
 
@@ -500,8 +655,8 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 	memset(&xstats, 0, sizeof(xstats));
 	cl->qdisc->qstats.qlen = cl->qdisc->q.qlen;
 
-	xstats.weight = ONE_FP/cl->inv_w;
-	xstats.lmax = cl->lmax;
+	xstats.weight = cl->agg->class_weight;
+	xstats.lmax = cl->agg->lmax;
 
 	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
 	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
@@ -652,16 +807,16 @@ static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F)
  * perhaps
  *
 	old_V ^= q->V;
-	old_V >>= QFQ_MIN_SLOT_SHIFT;
+	old_V >>= q->min_slot_shift;
 	if (old_V) {
 		...
 	}
  *
  */
-static void qfq_make_eligible(struct qfq_sched *q, u64 old_V)
+static void qfq_make_eligible(struct qfq_sched *q)
 {
-	unsigned long vslot = q->V >> QFQ_MIN_SLOT_SHIFT;
-	unsigned long old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT;
+	unsigned long vslot = q->V >> q->min_slot_shift;
+	unsigned long old_vslot = q->oldV >> q->min_slot_shift;
 
 	if (vslot != old_vslot) {
 		unsigned long mask = (1UL << fls(vslot ^ old_vslot)) - 1;
@@ -672,34 +827,38 @@ static void qfq_make_eligible(struct qfq_sched *q, u64 old_V)
 
 
 /*
- * If the weight and lmax (max_pkt_size) of the classes do not change,
- * then QFQ guarantees that the slot index is never higher than
- * 2 + ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) * (QFQ_MAX_WEIGHT/QFQ_MAX_WSUM).
+ * The index of the slot in which the aggregate is to be inserted must
+ * not be higher than QFQ_MAX_SLOTS-2. There is a '-2' and not a '-1'
+ * because the start time of the group may be moved backward by one
+ * slot after the aggregate has been inserted, and this would cause
+ * non-empty slots to be right-shifted by one position.
  *
- * With the current values of the above constants, the index is
- * then guaranteed to never be higher than 2 + 256 * (1 / 16) = 18.
+ * If the weight and lmax (max_pkt_size) of the classes do not change,
+ * then QFQ+ does meet the above contraint according to the current
+ * values of its parameters. In fact, if the weight and lmax of the
+ * classes do not change, then, from the theory, QFQ+ guarantees that
+ * the slot index is never higher than
+ * 2 + QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
+ * (QFQ_MAX_WEIGHT/QFQ_MAX_WSUM) = 2 + 8 * 128 * (1 / 64) = 18
  *
  * When the weight of a class is increased or the lmax of the class is
- * decreased, a new class with smaller slot size may happen to be
- * activated. The activation of this class should be properly delayed
- * to when the service of the class has finished in the ideal system
- * tracked by QFQ. If the activation of the class is not delayed to
- * this reference time instant, then this class may be unjustly served
- * before other classes waiting for service. This may cause
- * (unfrequently) the above bound to the slot index to be violated for
- * some of these unlucky classes.
+ * decreased, a new aggregate with smaller slot size than the original
+ * parent aggregate of the class may happen to be activated. The
+ * activation of this aggregate should be properly delayed to when the
+ * service of the class has finished in the ideal system tracked by
+ * QFQ+. If the activation of the aggregate is not delayed to this
+ * reference time instant, then this aggregate may be unjustly served
+ * before other aggregates waiting for service. This may cause the
+ * above bound to the slot index to be violated for some of these
+ * unlucky aggregates.
  *
- * Instead of delaying the activation of the new class, which is quite
- * complex, the following inaccurate but simple solution is used: if
- * the slot index is higher than QFQ_MAX_SLOTS-2, then the timestamps
- * of the class are shifted backward so as to let the slot index
- * become equal to QFQ_MAX_SLOTS-2. This threshold is used because, if
- * the slot index is above it, then the data structure implementing
- * the bucket list either gets immediately corrupted or may get
- * corrupted on a possible next packet arrival that causes the start
- * time of the group to be shifted backward.
+ * Instead of delaying the activation of the new aggregate, which is
+ * quite complex, the following inaccurate but simple solution is used:
+ * if the slot index is higher than QFQ_MAX_SLOTS-2, then the
+ * timestamps of the aggregate are shifted backward so as to let the
+ * slot index become equal to QFQ_MAX_SLOTS-2.
  */
-static void qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl,
+static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg,
 			    u64 roundedS)
 {
 	u64 slot = (roundedS - grp->S) >> grp->slot_shift;
@@ -708,22 +867,22 @@ static void qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl,
 	if (unlikely(slot > QFQ_MAX_SLOTS - 2)) {
 		u64 deltaS = roundedS - grp->S -
 			((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift);
-		cl->S -= deltaS;
-		cl->F -= deltaS;
+		agg->S -= deltaS;
+		agg->F -= deltaS;
 		slot = QFQ_MAX_SLOTS - 2;
 	}
 
 	i = (grp->front + slot) % QFQ_MAX_SLOTS;
 
-	hlist_add_head(&cl->next, &grp->slots[i]);
+	hlist_add_head(&agg->next, &grp->slots[i]);
 	__set_bit(slot, &grp->full_slots);
 }
 
 /* Maybe introduce hlist_first_entry?? */
-static struct qfq_class *qfq_slot_head(struct qfq_group *grp)
+static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp)
 {
 	return hlist_entry(grp->slots[grp->front].first,
-			   struct qfq_class, next);
+			   struct qfq_aggregate, next);
 }
 
 /*
@@ -731,20 +890,20 @@ static struct qfq_class *qfq_slot_head(struct qfq_group *grp)
  */
 static void qfq_front_slot_remove(struct qfq_group *grp)
 {
-	struct qfq_class *cl = qfq_slot_head(grp);
+	struct qfq_aggregate *agg = qfq_slot_head(grp);
 
-	BUG_ON(!cl);
-	hlist_del(&cl->next);
+	BUG_ON(!agg);
+	hlist_del(&agg->next);
 	if (hlist_empty(&grp->slots[grp->front]))
 		__clear_bit(0, &grp->full_slots);
 }
 
 /*
- * Returns the first full queue in a group. As a side effect,
- * adjust the bucket list so the first non-empty bucket is at
- * position 0 in full_slots.
+ * Returns the first aggregate in the first non-empty bucket of the
+ * group. As a side effect, adjusts the bucket list so the first
+ * non-empty bucket is at position 0 in full_slots.
  */
-static struct qfq_class *qfq_slot_scan(struct qfq_group *grp)
+static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp)
 {
 	unsigned int i;
 
@@ -780,7 +939,7 @@ static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS)
780 grp->front = (grp->front - i) % QFQ_MAX_SLOTS; 939 grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
781} 940}
782 941
783static void qfq_update_eligible(struct qfq_sched *q, u64 old_V) 942static void qfq_update_eligible(struct qfq_sched *q)
784{ 943{
785 struct qfq_group *grp; 944 struct qfq_group *grp;
786 unsigned long ineligible; 945 unsigned long ineligible;
@@ -792,137 +951,226 @@ static void qfq_update_eligible(struct qfq_sched *q, u64 old_V)
792 if (qfq_gt(grp->S, q->V)) 951 if (qfq_gt(grp->S, q->V))
793 q->V = grp->S; 952 q->V = grp->S;
794 } 953 }
795 qfq_make_eligible(q, old_V); 954 qfq_make_eligible(q);
796 } 955 }
797} 956}
798 957
799/* 958/* Dequeue head packet of the head class in the DRR queue of the aggregate. */
800 * Updates the class, returns true if also the group needs to be updated. 959static void agg_dequeue(struct qfq_aggregate *agg,
801 */ 960 struct qfq_class *cl, unsigned int len)
802static bool qfq_update_class(struct qfq_group *grp, struct qfq_class *cl)
803{ 961{
804 unsigned int len = qdisc_peek_len(cl->qdisc); 962 qdisc_dequeue_peeked(cl->qdisc);
805 963
806 cl->S = cl->F; 964 cl->deficit -= (int) len;
807 if (!len)
808 qfq_front_slot_remove(grp); /* queue is empty */
809 else {
810 u64 roundedS;
811 965
812 cl->F = cl->S + (u64)len * cl->inv_w; 966 if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */
813 roundedS = qfq_round_down(cl->S, grp->slot_shift); 967 list_del(&cl->alist);
814 if (roundedS == grp->S) 968 else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) {
815 return false; 969 cl->deficit += agg->lmax;
816 970 list_move_tail(&cl->alist, &agg->active);
817 qfq_front_slot_remove(grp);
818 qfq_slot_insert(grp, cl, roundedS);
819 } 971 }
972}
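
agg_dequeue() above implements deficit round robin among the classes of one aggregate. The sketch below isolates that bookkeeping as a pure function; the names and the figures in main() are invented for illustration and are not kernel code.

#include <stdio.h>

enum drr_action { DRR_REMOVE, DRR_KEEP_HEAD, DRR_MOVE_TO_TAIL };

static enum drr_action drr_after_dequeue(int *deficit, unsigned int served,
                                         unsigned int qlen_left,
                                         unsigned int next_pkt_len,
                                         unsigned int lmax)
{
        *deficit -= (int)served;

        if (qlen_left == 0)
                return DRR_REMOVE;        /* class went idle, leave the list */
        if (*deficit < (int)next_pkt_len) {
                *deficit += lmax;         /* grant a new round to this class */
                return DRR_MOVE_TO_TAIL;
        }
        return DRR_KEEP_HEAD;             /* keep serving this class */
}

int main(void)
{
        int deficit = 1500;

        /* served 1000 bytes; a 900-byte packet is still queued: 500 < 900 */
        printf("action=%d deficit=%d\n",
               drr_after_dequeue(&deficit, 1000, 3, 900, 1500), deficit);
        return 0;
}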
973
974static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg,
975 struct qfq_class **cl,
976 unsigned int *len)
977{
978 struct sk_buff *skb;
820 979
821 return true; 980 *cl = list_first_entry(&agg->active, struct qfq_class, alist);
981 skb = (*cl)->qdisc->ops->peek((*cl)->qdisc);
982 if (skb == NULL)
983 WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n");
984 else
985 *len = qdisc_pkt_len(skb);
986
987 return skb;
988}
989
990/* Update F according to the actual service received by the aggregate. */
991static inline void charge_actual_service(struct qfq_aggregate *agg)
992{
993 /* compute the service received by the aggregate */
994 u32 service_received = agg->initial_budget - agg->budget;
995
996 agg->F = agg->S + (u64)service_received * agg->inv_w;
822} 997}
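
charge_actual_service() pairs with qfq_update_agg_ts() further down: when an aggregate is (re)scheduled it is labelled with the finish time it would reach by consuming its whole budget, and when it leaves service F is pulled back to reflect the bytes actually sent. A self-contained sketch of the two formulas, with a toy struct and invented numbers:

#include <stdint.h>
#include <stdio.h>

struct toy_agg {
        uint64_t S, F;          /* virtual start/finish times        */
        uint32_t inv_w;         /* fixed-point inverse of the weight */
        uint32_t budgetmax;     /* budget granted per service turn   */
        uint32_t initial_budget, budget;
};

/* On (re)scheduling: label with the worst-case finish time. */
static void label_for_service(struct toy_agg *a)
{
        a->F = a->S + (uint64_t)a->budgetmax * a->inv_w;
}

/* When service ends: shrink F to match the service actually received. */
static void charge_actual(struct toy_agg *a)
{
        uint32_t received = a->initial_budget - a->budget;

        a->F = a->S + (uint64_t)received * a->inv_w;
}

int main(void)
{
        struct toy_agg a = { .S = 0, .inv_w = 8, .budgetmax = 3000,
                             .initial_budget = 3000, .budget = 1200 };

        label_for_service(&a);
        printf("optimistic F = %llu\n", (unsigned long long)a.F);
        charge_actual(&a);      /* only 1800 of 3000 bytes were served */
        printf("charged    F = %llu\n", (unsigned long long)a.F);
        return 0;
}

With inv_w = 8 and S = 0 the aggregate is labelled F = 24000 at activation; after serving only 1800 bytes, F is corrected to 14400.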
823 998
824static struct sk_buff *qfq_dequeue(struct Qdisc *sch) 999static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
825{ 1000{
826 struct qfq_sched *q = qdisc_priv(sch); 1001 struct qfq_sched *q = qdisc_priv(sch);
827 struct qfq_group *grp; 1002 struct qfq_aggregate *in_serv_agg = q->in_serv_agg;
828 struct qfq_class *cl; 1003 struct qfq_class *cl;
829 struct sk_buff *skb; 1004 struct sk_buff *skb = NULL;
830 unsigned int len; 1005 /* next-packet len, 0 means no more active classes in in-service agg */
831 u64 old_V; 1006 unsigned int len = 0;
832 1007
833 if (!q->bitmaps[ER]) 1008 if (in_serv_agg == NULL)
834 return NULL; 1009 return NULL;
835 1010
836 grp = qfq_ffs(q, q->bitmaps[ER]); 1011 if (!list_empty(&in_serv_agg->active))
1012 skb = qfq_peek_skb(in_serv_agg, &cl, &len);
837 1013
838 cl = qfq_slot_head(grp); 1014 /*
839 skb = qdisc_dequeue_peeked(cl->qdisc); 1015 * If there are no active classes in the in-service aggregate,
840 if (!skb) { 1016 * or if the aggregate has not enough budget to serve its next
841 WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n"); 1017 * class, then choose the next aggregate to serve.
842 return NULL; 1018 */
1019 if (len == 0 || in_serv_agg->budget < len) {
1020 charge_actual_service(in_serv_agg);
1021
1022 /* recharge the budget of the aggregate */
1023 in_serv_agg->initial_budget = in_serv_agg->budget =
1024 in_serv_agg->budgetmax;
1025
1026 if (!list_empty(&in_serv_agg->active))
1027 /*
1028 * Still active: reschedule for
1029 * service. Possible optimization: if no other
1030 * aggregate is active, then there is no point
1031 * in rescheduling this aggregate, and we can
1032 * just keep it as the in-service one. This
1033 * should be however a corner case, and to
1034 * handle it, we would need to maintain an
1035 * extra num_active_aggs field.
1036 */
1037 qfq_activate_agg(q, in_serv_agg, requeue);
1038 else if (sch->q.qlen == 0) { /* no aggregate to serve */
1039 q->in_serv_agg = NULL;
1040 return NULL;
1041 }
1042
1043 /*
1044 * If we get here, there are other aggregates queued:
1045 * choose the new aggregate to serve.
1046 */
1047 in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q);
1048 skb = qfq_peek_skb(in_serv_agg, &cl, &len);
843 } 1049 }
1050 if (!skb)
1051 return NULL;
844 1052
845 sch->q.qlen--; 1053 sch->q.qlen--;
846 qdisc_bstats_update(sch, skb); 1054 qdisc_bstats_update(sch, skb);
847 1055
848 old_V = q->V; 1056 agg_dequeue(in_serv_agg, cl, len);
849 len = qdisc_pkt_len(skb); 1057 in_serv_agg->budget -= len;
850 q->V += (u64)len * IWSUM; 1058 q->V += (u64)len * IWSUM;
851 pr_debug("qfq dequeue: len %u F %lld now %lld\n", 1059 pr_debug("qfq dequeue: len %u F %lld now %lld\n",
852 len, (unsigned long long) cl->F, (unsigned long long) q->V); 1060 len, (unsigned long long) in_serv_agg->F,
1061 (unsigned long long) q->V);
853 1062
854 if (qfq_update_class(grp, cl)) { 1063 return skb;
855 u64 old_F = grp->F; 1064}
856 1065
857 cl = qfq_slot_scan(grp); 1066static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q)
858 if (!cl) 1067{
859 __clear_bit(grp->index, &q->bitmaps[ER]); 1068 struct qfq_group *grp;
860 else { 1069 struct qfq_aggregate *agg, *new_front_agg;
861 u64 roundedS = qfq_round_down(cl->S, grp->slot_shift); 1070 u64 old_F;
862 unsigned int s;
863 1071
864 if (grp->S == roundedS) 1072 qfq_update_eligible(q);
865 goto skip_unblock; 1073 q->oldV = q->V;
866 grp->S = roundedS; 1074
867 grp->F = roundedS + (2ULL << grp->slot_shift); 1075 if (!q->bitmaps[ER])
868 __clear_bit(grp->index, &q->bitmaps[ER]); 1076 return NULL;
869 s = qfq_calc_state(q, grp); 1077
870 __set_bit(grp->index, &q->bitmaps[s]); 1078 grp = qfq_ffs(q, q->bitmaps[ER]);
871 } 1079 old_F = grp->F;
1080
1081 agg = qfq_slot_head(grp);
872 1082
873 qfq_unblock_groups(q, grp->index, old_F); 1083 /* agg starts to be served, remove it from schedule */
1084 qfq_front_slot_remove(grp);
1085
1086 new_front_agg = qfq_slot_scan(grp);
1087
1088 if (new_front_agg == NULL) /* group is now inactive, remove from ER */
1089 __clear_bit(grp->index, &q->bitmaps[ER]);
1090 else {
1091 u64 roundedS = qfq_round_down(new_front_agg->S,
1092 grp->slot_shift);
1093 unsigned int s;
1094
1095 if (grp->S == roundedS)
1096 return agg;
1097 grp->S = roundedS;
1098 grp->F = roundedS + (2ULL << grp->slot_shift);
1099 __clear_bit(grp->index, &q->bitmaps[ER]);
1100 s = qfq_calc_state(q, grp);
1101 __set_bit(grp->index, &q->bitmaps[s]);
874 } 1102 }
875 1103
876skip_unblock: 1104 qfq_unblock_groups(q, grp->index, old_F);
877 qfq_update_eligible(q, old_V);
878 1105
879 return skb; 1106 return agg;
880} 1107}
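
The top of qfq_dequeue() decides whether the in-service aggregate keeps its turn. Reduced to a pure predicate (names invented, not kernel code): the turn is kept only if the aggregate still has an active class, i.e. a non-zero next-packet length, and enough budget to send that packet; otherwise it is charged for the service received, its budget is recharged, and qfq_choose_next_agg() elects a new front aggregate from the ER groups.

#include <stdbool.h>
#include <stdio.h>

/* true: keep serving the current aggregate; false: pick a new one */
static bool agg_keeps_turn(unsigned int next_pkt_len, unsigned int budget)
{
        return next_pkt_len != 0 && budget >= next_pkt_len;
}

int main(void)
{
        printf("%d\n", agg_keeps_turn(1500, 2000));  /* 1: keep            */
        printf("%d\n", agg_keeps_turn(1500, 800));   /* 0: out of budget   */
        printf("%d\n", agg_keeps_turn(0, 5000));     /* 0: no active class */
        return 0;
}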
881 1108
882/* 1109/*
883 * Assign a reasonable start time for a new flow k in group i. 1110 * Assign a reasonable start time for a new aggregate in group i.
884 * Admissible values for \hat(F) are multiples of \sigma_i 1111 * Admissible values for \hat(F) are multiples of \sigma_i
885 * no greater than V+\sigma_i . Larger values mean that 1112 * no greater than V+\sigma_i . Larger values mean that
886 * we had a wraparound so we consider the timestamp to be stale. 1113 * we had a wraparound so we consider the timestamp to be stale.
887 * 1114 *
888 * If F is not stale and F >= V then we set S = F. 1115 * If F is not stale and F >= V then we set S = F.
889 * Otherwise we should assign S = V, but this may violate 1116 * Otherwise we should assign S = V, but this may violate
890 * the ordering in ER. So, if we have groups in ER, set S to 1117 * the ordering in EB (see [2]). So, if we have groups in ER,
891 * the F_j of the first group j which would be blocking us. 1118 * set S to the F_j of the first group j which would be blocking us.
892 * We are guaranteed not to move S backward because 1119 * We are guaranteed not to move S backward because
893 * otherwise our group i would still be blocked. 1120 * otherwise our group i would still be blocked.
894 */ 1121 */
895static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) 1122static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg)
896{ 1123{
897 unsigned long mask; 1124 unsigned long mask;
898 u64 limit, roundedF; 1125 u64 limit, roundedF;
899 int slot_shift = cl->grp->slot_shift; 1126 int slot_shift = agg->grp->slot_shift;
900 1127
901 roundedF = qfq_round_down(cl->F, slot_shift); 1128 roundedF = qfq_round_down(agg->F, slot_shift);
902 limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift); 1129 limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
903 1130
904 if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { 1131 if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) {
905 /* timestamp was stale */ 1132 /* timestamp was stale */
906 mask = mask_from(q->bitmaps[ER], cl->grp->index); 1133 mask = mask_from(q->bitmaps[ER], agg->grp->index);
907 if (mask) { 1134 if (mask) {
908 struct qfq_group *next = qfq_ffs(q, mask); 1135 struct qfq_group *next = qfq_ffs(q, mask);
909 if (qfq_gt(roundedF, next->F)) { 1136 if (qfq_gt(roundedF, next->F)) {
910 if (qfq_gt(limit, next->F)) 1137 if (qfq_gt(limit, next->F))
911 cl->S = next->F; 1138 agg->S = next->F;
912 else /* preserve timestamp correctness */ 1139 else /* preserve timestamp correctness */
913 cl->S = limit; 1140 agg->S = limit;
914 return; 1141 return;
915 } 1142 }
916 } 1143 }
917 cl->S = q->V; 1144 agg->S = q->V;
918 } else /* timestamp is not stale */ 1145 } else /* timestamp is not stale */
919 cl->S = cl->F; 1146 agg->S = agg->F;
920} 1147}
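
The stale-timestamp test at the top of qfq_update_start() can be exercised on its own. In the sketch below (helper names and numbers are invented; ts_gt mirrors the wrap-safe comparison that qfq_gt performs) a finish time at most one slot past V is reused as the new start time, while one further in the future is treated as stale.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t round_down(uint64_t ts, int shift)
{
        return ts & ~((1ULL << shift) - 1);
}

/* wrap-safe "a later than b" comparison */
static bool ts_gt(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) > 0;
}

static bool finish_time_is_stale(uint64_t F, uint64_t V, int slot_shift)
{
        uint64_t roundedF = round_down(F, slot_shift);
        uint64_t limit = round_down(V, slot_shift) + (1ULL << slot_shift);

        /* stale if F does not exceed V, or rounded F is past the limit */
        return !ts_gt(F, V) || ts_gt(roundedF, limit);
}

int main(void)
{
        /* sigma = 1 << 10 = 1024; V = 5000, so limit = 4096 + 1024 = 5120 */
        printf("%d\n", finish_time_is_stale(5100, 5000, 10)); /* 0: reuse F */
        printf("%d\n", finish_time_is_stale(9000, 5000, 10)); /* 1: stale   */
        return 0;
}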
921 1148
1149/*
1150 * Update the timestamps of agg before scheduling/rescheduling it for
1151 * service. In particular, assign to agg->F its maximum possible
1152 * value, i.e., the virtual finish time with which the aggregate
1153 * should be labeled if it used all its budget once in service.
1154 */
1155static inline void
1156qfq_update_agg_ts(struct qfq_sched *q,
1157 struct qfq_aggregate *agg, enum update_reason reason)
1158{
1159 if (reason != requeue)
1160 qfq_update_start(q, agg);
1161 else /* just charge agg for the service received */
1162 agg->S = agg->F;
1163
1164 agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w;
1165}
1166
1167static void qfq_schedule_agg(struct qfq_sched *, struct qfq_aggregate *);
1168
922static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) 1169static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
923{ 1170{
924 struct qfq_sched *q = qdisc_priv(sch); 1171 struct qfq_sched *q = qdisc_priv(sch);
925 struct qfq_class *cl; 1172 struct qfq_class *cl;
1173 struct qfq_aggregate *agg;
926 int err = 0; 1174 int err = 0;
927 1175
928 cl = qfq_classify(skb, sch, &err); 1176 cl = qfq_classify(skb, sch, &err);
@@ -934,11 +1182,13 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
934 } 1182 }
935 pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid); 1183 pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid);
936 1184
937 if (unlikely(cl->lmax < qdisc_pkt_len(skb))) { 1185 if (unlikely(cl->agg->lmax < qdisc_pkt_len(skb))) {
938 pr_debug("qfq: increasing maxpkt from %u to %u for class %u", 1186 pr_debug("qfq: increasing maxpkt from %u to %u for class %u",
939 cl->lmax, qdisc_pkt_len(skb), cl->common.classid); 1187 cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid);
940 qfq_update_reactivate_class(q, cl, cl->inv_w, 1188 err = qfq_change_agg(sch, cl, cl->agg->class_weight,
941 qdisc_pkt_len(skb), 0); 1189 qdisc_pkt_len(skb));
1190 if (err)
1191 return err;
942 } 1192 }
943 1193
944 err = qdisc_enqueue(skb, cl->qdisc); 1194 err = qdisc_enqueue(skb, cl->qdisc);
@@ -954,35 +1204,50 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
954 bstats_update(&cl->bstats, skb); 1204 bstats_update(&cl->bstats, skb);
955 ++sch->q.qlen; 1205 ++sch->q.qlen;
956 1206
957 /* If the new skb is not the head of queue, then done here. */ 1207 agg = cl->agg;
958 if (cl->qdisc->q.qlen != 1) 1208 /* if the queue was not empty, then done here */
1209 if (cl->qdisc->q.qlen != 1) {
1210 if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) &&
1211 list_first_entry(&agg->active, struct qfq_class, alist)
1212 == cl && cl->deficit < qdisc_pkt_len(skb))
1213 list_move_tail(&cl->alist, &agg->active);
1214
959 return err; 1215 return err;
1216 }
1217
1218 /* schedule class for service within the aggregate */
1219 cl->deficit = agg->lmax;
1220 list_add_tail(&cl->alist, &agg->active);
960 1221
961 /* If reach this point, queue q was idle */ 1222 if (list_first_entry(&agg->active, struct qfq_class, alist) != cl)
962 qfq_activate_class(q, cl, qdisc_pkt_len(skb)); 1223 return err; /* aggregate was not empty, nothing else to do */
1224
1225 /* recharge budget */
1226 agg->initial_budget = agg->budget = agg->budgetmax;
1227
1228 qfq_update_agg_ts(q, agg, enqueue);
1229 if (q->in_serv_agg == NULL)
1230 q->in_serv_agg = agg;
1231 else if (agg != q->in_serv_agg)
1232 qfq_schedule_agg(q, agg);
963 1233
964 return err; 1234 return err;
965} 1235}
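
The tail of qfq_enqueue() only activates the aggregate when the enqueued class is the first one to become backlogged in it. A sketch of that decision as a pure function; the enum and parameter names are invented for illustration and only approximate the list checks done in the real code.

#include <stdbool.h>
#include <stdio.h>

enum enq_action {
        ENQ_NOTHING,            /* aggregate already has backlogged classes  */
        ENQ_SET_IN_SERVICE,     /* no aggregate in service: serve this one   */
        ENQ_SCHEDULE_AGG,       /* timestamp it and insert it into its group */
};

static enum enq_action on_first_backlogged_class(bool agg_was_empty,
                                                 bool have_in_service,
                                                 bool agg_is_in_service)
{
        if (!agg_was_empty)
                return ENQ_NOTHING;
        if (!have_in_service)
                return ENQ_SET_IN_SERVICE;
        if (!agg_is_in_service)
                return ENQ_SCHEDULE_AGG;
        return ENQ_NOTHING;     /* it already is the in-service aggregate */
}

int main(void)
{
        printf("%d\n", on_first_backlogged_class(true, false, false)); /* 1 */
        printf("%d\n", on_first_backlogged_class(true, true, false));  /* 2 */
        printf("%d\n", on_first_backlogged_class(false, true, false)); /* 0 */
        return 0;
}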
966 1236
967/* 1237/*
968 * Handle class switch from idle to backlogged. 1238 * Schedule aggregate according to its timestamps.
969 */ 1239 */
970static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl, 1240static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
971 unsigned int pkt_len)
972{ 1241{
973 struct qfq_group *grp = cl->grp; 1242 struct qfq_group *grp = agg->grp;
974 u64 roundedS; 1243 u64 roundedS;
975 int s; 1244 int s;
976 1245
977 qfq_update_start(q, cl); 1246 roundedS = qfq_round_down(agg->S, grp->slot_shift);
978
979 /* compute new finish time and rounded start. */
980 cl->F = cl->S + (u64)pkt_len * cl->inv_w;
981 roundedS = qfq_round_down(cl->S, grp->slot_shift);
982 1247
983 /* 1248 /*
984 * insert cl in the correct bucket. 1249 * Insert agg in the correct bucket.
985 * If cl->S >= grp->S we don't need to adjust the 1250 * If agg->S >= grp->S we don't need to adjust the
986 * bucket list and simply go to the insertion phase. 1251 * bucket list and simply go to the insertion phase.
987 * Otherwise grp->S is decreasing, we must make room 1252 * Otherwise grp->S is decreasing, we must make room
988 * in the bucket list, and also recompute the group state. 1253 * in the bucket list, and also recompute the group state.
@@ -990,10 +1255,10 @@ static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl,
990 * was in ER make sure to adjust V. 1255 * was in ER make sure to adjust V.
991 */ 1256 */
992 if (grp->full_slots) { 1257 if (grp->full_slots) {
993 if (!qfq_gt(grp->S, cl->S)) 1258 if (!qfq_gt(grp->S, agg->S))
994 goto skip_update; 1259 goto skip_update;
995 1260
996 /* create a slot for this cl->S */ 1261 /* create a slot for this agg->S */
997 qfq_slot_rotate(grp, roundedS); 1262 qfq_slot_rotate(grp, roundedS);
998 /* group was surely ineligible, remove */ 1263 /* group was surely ineligible, remove */
999 __clear_bit(grp->index, &q->bitmaps[IR]); 1264 __clear_bit(grp->index, &q->bitmaps[IR]);
@@ -1008,46 +1273,61 @@ static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl,
1008 1273
1009 pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n", 1274 pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n",
1010 s, q->bitmaps[s], 1275 s, q->bitmaps[s],
1011 (unsigned long long) cl->S, 1276 (unsigned long long) agg->S,
1012 (unsigned long long) cl->F, 1277 (unsigned long long) agg->F,
1013 (unsigned long long) q->V); 1278 (unsigned long long) q->V);
1014 1279
1015skip_update: 1280skip_update:
1016 qfq_slot_insert(grp, cl, roundedS); 1281 qfq_slot_insert(grp, agg, roundedS);
1017} 1282}
1018 1283
1019 1284
1285/* Update agg ts and schedule agg for service */
1286static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
1287 enum update_reason reason)
1288{
1289 qfq_update_agg_ts(q, agg, reason);
1290 qfq_schedule_agg(q, agg);
1291}
1292
1020static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, 1293static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
1021 struct qfq_class *cl) 1294 struct qfq_aggregate *agg)
1022{ 1295{
1023 unsigned int i, offset; 1296 unsigned int i, offset;
1024 u64 roundedS; 1297 u64 roundedS;
1025 1298
1026 roundedS = qfq_round_down(cl->S, grp->slot_shift); 1299 roundedS = qfq_round_down(agg->S, grp->slot_shift);
1027 offset = (roundedS - grp->S) >> grp->slot_shift; 1300 offset = (roundedS - grp->S) >> grp->slot_shift;
1301
1028 i = (grp->front + offset) % QFQ_MAX_SLOTS; 1302 i = (grp->front + offset) % QFQ_MAX_SLOTS;
1029 1303
1030 hlist_del(&cl->next); 1304 hlist_del(&agg->next);
1031 if (hlist_empty(&grp->slots[i])) 1305 if (hlist_empty(&grp->slots[i]))
1032 __clear_bit(offset, &grp->full_slots); 1306 __clear_bit(offset, &grp->full_slots);
1033} 1307}
1034 1308
1035/* 1309/*
1036 * called to forcibly destroy a queue. 1310 * Called to forcibly deschedule an aggregate. If the aggregate is
1037 * If the queue is not in the front bucket, or if it has 1311 * not in the front bucket, or if the latter has other aggregates in
1038 * other queues in the front bucket, we can simply remove 1312 * the front bucket, we can simply remove the aggregate with no other
1039 * the queue with no other side effects. 1313 * side effects.
1040 * Otherwise we must propagate the event up. 1314 * Otherwise we must propagate the event up.
1041 */ 1315 */
1042static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) 1316static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
1043{ 1317{
1044 struct qfq_group *grp = cl->grp; 1318 struct qfq_group *grp = agg->grp;
1045 unsigned long mask; 1319 unsigned long mask;
1046 u64 roundedS; 1320 u64 roundedS;
1047 int s; 1321 int s;
1048 1322
1049 cl->F = cl->S; 1323 if (agg == q->in_serv_agg) {
1050 qfq_slot_remove(q, grp, cl); 1324 charge_actual_service(agg);
1325 q->in_serv_agg = qfq_choose_next_agg(q);
1326 return;
1327 }
1328
1329 agg->F = agg->S;
1330 qfq_slot_remove(q, grp, agg);
1051 1331
1052 if (!grp->full_slots) { 1332 if (!grp->full_slots) {
1053 __clear_bit(grp->index, &q->bitmaps[IR]); 1333 __clear_bit(grp->index, &q->bitmaps[IR]);
@@ -1066,8 +1346,8 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
1066 } 1346 }
1067 __clear_bit(grp->index, &q->bitmaps[ER]); 1347 __clear_bit(grp->index, &q->bitmaps[ER]);
1068 } else if (hlist_empty(&grp->slots[grp->front])) { 1348 } else if (hlist_empty(&grp->slots[grp->front])) {
1069 cl = qfq_slot_scan(grp); 1349 agg = qfq_slot_scan(grp);
1070 roundedS = qfq_round_down(cl->S, grp->slot_shift); 1350 roundedS = qfq_round_down(agg->S, grp->slot_shift);
1071 if (grp->S != roundedS) { 1351 if (grp->S != roundedS) {
1072 __clear_bit(grp->index, &q->bitmaps[ER]); 1352 __clear_bit(grp->index, &q->bitmaps[ER]);
1073 __clear_bit(grp->index, &q->bitmaps[IR]); 1353 __clear_bit(grp->index, &q->bitmaps[IR]);
@@ -1080,7 +1360,7 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
1080 } 1360 }
1081 } 1361 }
1082 1362
1083 qfq_update_eligible(q, q->V); 1363 qfq_update_eligible(q);
1084} 1364}
1085 1365
1086static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) 1366static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
@@ -1092,6 +1372,32 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1092 qfq_deactivate_class(q, cl); 1372 qfq_deactivate_class(q, cl);
1093} 1373}
1094 1374
1375static unsigned int qfq_drop_from_slot(struct qfq_sched *q,
1376 struct hlist_head *slot)
1377{
1378 struct qfq_aggregate *agg;
1379 struct hlist_node *n;
1380 struct qfq_class *cl;
1381 unsigned int len;
1382
1383 hlist_for_each_entry(agg, n, slot, next) {
1384 list_for_each_entry(cl, &agg->active, alist) {
1385
1386 if (!cl->qdisc->ops->drop)
1387 continue;
1388
1389 len = cl->qdisc->ops->drop(cl->qdisc);
1390 if (len > 0) {
1391 if (cl->qdisc->q.qlen == 0)
1392 qfq_deactivate_class(q, cl);
1393
1394 return len;
1395 }
1396 }
1397 }
1398 return 0;
1399}
1400
1095static unsigned int qfq_drop(struct Qdisc *sch) 1401static unsigned int qfq_drop(struct Qdisc *sch)
1096{ 1402{
1097 struct qfq_sched *q = qdisc_priv(sch); 1403 struct qfq_sched *q = qdisc_priv(sch);
@@ -1101,24 +1407,13 @@ static unsigned int qfq_drop(struct Qdisc *sch)
1101 for (i = 0; i <= QFQ_MAX_INDEX; i++) { 1407 for (i = 0; i <= QFQ_MAX_INDEX; i++) {
1102 grp = &q->groups[i]; 1408 grp = &q->groups[i];
1103 for (j = 0; j < QFQ_MAX_SLOTS; j++) { 1409 for (j = 0; j < QFQ_MAX_SLOTS; j++) {
1104 struct qfq_class *cl; 1410 len = qfq_drop_from_slot(q, &grp->slots[j]);
1105 struct hlist_node *n; 1411 if (len > 0) {
1106 1412 sch->q.qlen--;
1107 hlist_for_each_entry(cl, n, &grp->slots[j], next) { 1413 return len;
1108
1109 if (!cl->qdisc->ops->drop)
1110 continue;
1111
1112 len = cl->qdisc->ops->drop(cl->qdisc);
1113 if (len > 0) {
1114 sch->q.qlen--;
1115 if (!cl->qdisc->q.qlen)
1116 qfq_deactivate_class(q, cl);
1117
1118 return len;
1119 }
1120 } 1414 }
1121 } 1415 }
1416
1122 } 1417 }
1123 1418
1124 return 0; 1419 return 0;
@@ -1129,44 +1424,51 @@ static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
1129 struct qfq_sched *q = qdisc_priv(sch); 1424 struct qfq_sched *q = qdisc_priv(sch);
1130 struct qfq_group *grp; 1425 struct qfq_group *grp;
1131 int i, j, err; 1426 int i, j, err;
1427 u32 max_cl_shift, maxbudg_shift, max_classes;
1132 1428
1133 err = qdisc_class_hash_init(&q->clhash); 1429 err = qdisc_class_hash_init(&q->clhash);
1134 if (err < 0) 1430 if (err < 0)
1135 return err; 1431 return err;
1136 1432
1433 if (qdisc_dev(sch)->tx_queue_len + 1 > QFQ_MAX_AGG_CLASSES)
1434 max_classes = QFQ_MAX_AGG_CLASSES;
1435 else
1436 max_classes = qdisc_dev(sch)->tx_queue_len + 1;
1437 /* max_cl_shift = floor(log_2(max_classes)) */
1438 max_cl_shift = __fls(max_classes);
1439 q->max_agg_classes = 1<<max_cl_shift;
1440
1441 /* maxbudg_shift = log2(max_len * max_classes_per_agg) */
1442 maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift;
1443 q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX;
1444
1137 for (i = 0; i <= QFQ_MAX_INDEX; i++) { 1445 for (i = 0; i <= QFQ_MAX_INDEX; i++) {
1138 grp = &q->groups[i]; 1446 grp = &q->groups[i];
1139 grp->index = i; 1447 grp->index = i;
1140 grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS 1448 grp->slot_shift = q->min_slot_shift + i;
1141 - (QFQ_MAX_INDEX - i);
1142 for (j = 0; j < QFQ_MAX_SLOTS; j++) 1449 for (j = 0; j < QFQ_MAX_SLOTS; j++)
1143 INIT_HLIST_HEAD(&grp->slots[j]); 1450 INIT_HLIST_HEAD(&grp->slots[j]);
1144 } 1451 }
1145 1452
1453 INIT_HLIST_HEAD(&q->nonfull_aggs);
1454
1146 return 0; 1455 return 0;
1147} 1456}
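
qfq_init_qdisc() now derives the per-group slot_shift from the device's tx_queue_len. A standalone rendering of that arithmetic follows; the TOY_* constants stand in for QFQ_MAX_AGG_CLASSES, QFQ_MTU_SHIFT, FRAC_BITS and QFQ_MAX_INDEX and are illustrative assumptions, not necessarily the in-tree values.

#include <stdio.h>

#define TOY_MAX_AGG_CLASSES 8   /* stand-in for QFQ_MAX_AGG_CLASSES */
#define TOY_MTU_SHIFT       16  /* stand-in for QFQ_MTU_SHIFT       */
#define TOY_FRAC_BITS       30  /* stand-in for FRAC_BITS           */
#define TOY_MAX_INDEX       24  /* stand-in for QFQ_MAX_INDEX       */

static unsigned int fls_u32(unsigned int x)     /* like __fls(): floor(log2) */
{
        unsigned int r = 0;

        while (x >>= 1)
                r++;
        return r;
}

int main(void)
{
        unsigned int tx_queue_len = 1000;
        unsigned int max_classes, max_cl_shift, maxbudg_shift, min_slot_shift;

        max_classes = tx_queue_len + 1 > TOY_MAX_AGG_CLASSES ?
                      TOY_MAX_AGG_CLASSES : tx_queue_len + 1;
        max_cl_shift = fls_u32(max_classes);            /* 8  -> 3      */
        maxbudg_shift = TOY_MTU_SHIFT + max_cl_shift;   /* 16 + 3 = 19  */
        min_slot_shift = TOY_FRAC_BITS + maxbudg_shift - TOY_MAX_INDEX;

        printf("max_agg_classes=%u min_slot_shift=%u (group i uses %u+i)\n",
               1u << max_cl_shift, min_slot_shift, min_slot_shift);
        return 0;
}

With these assumed constants a 1000-packet tx queue yields max_agg_classes = 8 and min_slot_shift = 25, so group i is initialized with slot_shift = 25 + i.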
1148 1457
1149static void qfq_reset_qdisc(struct Qdisc *sch) 1458static void qfq_reset_qdisc(struct Qdisc *sch)
1150{ 1459{
1151 struct qfq_sched *q = qdisc_priv(sch); 1460 struct qfq_sched *q = qdisc_priv(sch);
1152 struct qfq_group *grp;
1153 struct qfq_class *cl; 1461 struct qfq_class *cl;
1154 struct hlist_node *n, *tmp; 1462 struct hlist_node *n;
1155 unsigned int i, j; 1463 unsigned int i;
1156 1464
1157 for (i = 0; i <= QFQ_MAX_INDEX; i++) { 1465 for (i = 0; i < q->clhash.hashsize; i++) {
1158 grp = &q->groups[i]; 1466 hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
1159 for (j = 0; j < QFQ_MAX_SLOTS; j++) { 1467 if (cl->qdisc->q.qlen > 0)
1160 hlist_for_each_entry_safe(cl, n, tmp,
1161 &grp->slots[j], next) {
1162 qfq_deactivate_class(q, cl); 1468 qfq_deactivate_class(q, cl);
1163 }
1164 }
1165 }
1166 1469
1167 for (i = 0; i < q->clhash.hashsize; i++) {
1168 hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode)
1169 qdisc_reset(cl->qdisc); 1470 qdisc_reset(cl->qdisc);
1471 }
1170 } 1472 }
1171 sch->q.qlen = 0; 1473 sch->q.qlen = 0;
1172} 1474}