aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorPaolo Valente <paolo.valente@unimore.it>2012-11-23 06:03:19 -0500
committerDavid S. Miller <davem@davemloft.net>2012-11-28 11:19:35 -0500
commit462dbc9101acd38e92eda93c0726857517a24bbd (patch)
treead0433c2d8a2b2449c5ba85fe0353086293bc0de /net
parent351f33d9e0d3e26adf0ef5180e1e28b4737c49ce (diff)
pkt_sched: QFQ Plus: fair-queueing service at DRR cost
This patch turns QFQ into QFQ+, a variant of QFQ that provides the following two benefits: 1) QFQ+ is faster than QFQ, 2) differently from QFQ, QFQ+ correctly schedules also non-leaves classes in a hierarchical setting. A detailed description of QFQ+, plus a performance comparison with DRR and QFQ, can be found in [1]. [1] P. Valente, "Reducing the Execution Time of Fair-Queueing Schedulers" http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf Signed-off-by: Paolo Valente <paolo.valente@unimore.it> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--net/sched/sch_qfq.c830
1 files changed, 566 insertions, 264 deletions
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 9687fa1c227..6ed37652a4c 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -1,7 +1,8 @@
1/* 1/*
2 * net/sched/sch_qfq.c Quick Fair Queueing Scheduler. 2 * net/sched/sch_qfq.c Quick Fair Queueing Plus Scheduler.
3 * 3 *
4 * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente. 4 * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
5 * Copyright (c) 2012 Paolo Valente.
5 * 6 *
6 * This program is free software; you can redistribute it and/or 7 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License 8 * modify it under the terms of the GNU General Public License
@@ -19,12 +20,18 @@
19#include <net/pkt_cls.h> 20#include <net/pkt_cls.h>
20 21
21 22
22/* Quick Fair Queueing 23/* Quick Fair Queueing Plus
23 =================== 24 ========================
24 25
25 Sources: 26 Sources:
26 27
27 Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient 28 [1] Paolo Valente,
29 "Reducing the Execution Time of Fair-Queueing Schedulers."
30 http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf
31
32 Sources for QFQ:
33
34 [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient
28 Packet Scheduling with Tight Bandwidth Distribution Guarantees." 35 Packet Scheduling with Tight Bandwidth Distribution Guarantees."
29 36
30 See also: 37 See also:
@@ -33,6 +40,20 @@
33 40
34/* 41/*
35 42
43 QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES
44 classes. Each aggregate is timestamped with a virtual start time S
45 and a virtual finish time F, and scheduled according to its
46 timestamps. S and F are computed as a function of a system virtual
47 time function V. The classes within each aggregate are instead
48 scheduled with DRR.
49
50 To speed up operations, QFQ+ divides also aggregates into a limited
51 number of groups. Which group a class belongs to depends on the
52 ratio between the maximum packet length for the class and the weight
53 of the class. Groups have their own S and F. In the end, QFQ+
54 schedules groups, then aggregates within groups, then classes within
55 aggregates. See [1] and [2] for a full description.
56
36 Virtual time computations. 57 Virtual time computations.
37 58
38 S, F and V are all computed in fixed point arithmetic with 59 S, F and V are all computed in fixed point arithmetic with
@@ -76,27 +97,28 @@
76#define QFQ_MAX_SLOTS 32 97#define QFQ_MAX_SLOTS 32
77 98
78/* 99/*
79 * Shifts used for class<->group mapping. We allow class weights that are 100 * Shifts used for aggregate<->group mapping. We allow class weights that are
80 * in the range [1, 2^MAX_WSHIFT], and we try to map each class i to the 101 * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the
81 * group with the smallest index that can support the L_i / r_i configured 102 * group with the smallest index that can support the L_i / r_i configured
82 * for the class. 103 * for the classes in the aggregate.
83 * 104 *
84 * grp->index is the index of the group; and grp->slot_shift 105 * grp->index is the index of the group; and grp->slot_shift
85 * is the shift for the corresponding (scaled) sigma_i. 106 * is the shift for the corresponding (scaled) sigma_i.
86 */ 107 */
87#define QFQ_MAX_INDEX 24 108#define QFQ_MAX_INDEX 24
88#define QFQ_MAX_WSHIFT 12 109#define QFQ_MAX_WSHIFT 10
89 110
90#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) 111#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */
91#define QFQ_MAX_WSUM (16*QFQ_MAX_WEIGHT) 112#define QFQ_MAX_WSUM (64*QFQ_MAX_WEIGHT)
92 113
93#define FRAC_BITS 30 /* fixed point arithmetic */ 114#define FRAC_BITS 30 /* fixed point arithmetic */
94#define ONE_FP (1UL << FRAC_BITS) 115#define ONE_FP (1UL << FRAC_BITS)
95#define IWSUM (ONE_FP/QFQ_MAX_WSUM) 116#define IWSUM (ONE_FP/QFQ_MAX_WSUM)
96 117
97#define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */ 118#define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */
98#define QFQ_MIN_SLOT_SHIFT (FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX) 119#define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */
99#define QFQ_MIN_LMAX 256 /* min possible lmax for a class */ 120
121#define QFQ_MAX_AGG_CLASSES 8 /* max num classes per aggregate allowed */
100 122
101/* 123/*
102 * Possible group states. These values are used as indexes for the bitmaps 124 * Possible group states. These values are used as indexes for the bitmaps
@@ -106,6 +128,8 @@ enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
106 128
107struct qfq_group; 129struct qfq_group;
108 130
131struct qfq_aggregate;
132
109struct qfq_class { 133struct qfq_class {
110 struct Qdisc_class_common common; 134 struct Qdisc_class_common common;
111 135
@@ -116,7 +140,12 @@ struct qfq_class {
116 struct gnet_stats_queue qstats; 140 struct gnet_stats_queue qstats;
117 struct gnet_stats_rate_est rate_est; 141 struct gnet_stats_rate_est rate_est;
118 struct Qdisc *qdisc; 142 struct Qdisc *qdisc;
143 struct list_head alist; /* Link for active-classes list. */
144 struct qfq_aggregate *agg; /* Parent aggregate. */
145 int deficit; /* DRR deficit counter. */
146};
119 147
148struct qfq_aggregate {
120 struct hlist_node next; /* Link for the slot list. */ 149 struct hlist_node next; /* Link for the slot list. */
121 u64 S, F; /* flow timestamps (exact) */ 150 u64 S, F; /* flow timestamps (exact) */
122 151
@@ -127,8 +156,18 @@ struct qfq_class {
127 struct qfq_group *grp; 156 struct qfq_group *grp;
128 157
129 /* these are copied from the flowset. */ 158 /* these are copied from the flowset. */
130 u32 inv_w; /* ONE_FP/weight */ 159 u32 class_weight; /* Weight of each class in this aggregate. */
131 u32 lmax; /* Max packet size for this flow. */ 160 /* Max pkt size for the classes in this aggregate, DRR quantum. */
161 int lmax;
162
163 u32 inv_w; /* ONE_FP/(sum of weights of classes in aggr.). */
164 u32 budgetmax; /* Max budget for this aggregate. */
165 u32 initial_budget, budget; /* Initial and current budget. */
166
167 int num_classes; /* Number of classes in this aggr. */
168 struct list_head active; /* DRR queue of active classes. */
169
170 struct hlist_node nonfull_next; /* See nonfull_aggs in qfq_sched. */
132}; 171};
133 172
134struct qfq_group { 173struct qfq_group {
@@ -138,7 +177,7 @@ struct qfq_group {
138 unsigned int front; /* Index of the front slot. */ 177 unsigned int front; /* Index of the front slot. */
139 unsigned long full_slots; /* non-empty slots */ 178 unsigned long full_slots; /* non-empty slots */
140 179
141 /* Array of RR lists of active classes. */ 180 /* Array of RR lists of active aggregates. */
142 struct hlist_head slots[QFQ_MAX_SLOTS]; 181 struct hlist_head slots[QFQ_MAX_SLOTS];
143}; 182};
144 183
@@ -146,13 +185,28 @@ struct qfq_sched {
146 struct tcf_proto *filter_list; 185 struct tcf_proto *filter_list;
147 struct Qdisc_class_hash clhash; 186 struct Qdisc_class_hash clhash;
148 187
149 u64 V; /* Precise virtual time. */ 188 u64 oldV, V; /* Precise virtual times. */
150 u32 wsum; /* weight sum */ 189 struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */
190 u32 num_active_agg; /* Num. of active aggregates */
191 u32 wsum; /* weight sum */
151 192
152 unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */ 193 unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */
153 struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */ 194 struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
195 u32 min_slot_shift; /* Index of the group-0 bit in the bitmaps. */
196
197 u32 max_agg_classes; /* Max number of classes per aggr. */
198 struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */
154}; 199};
155 200
201/*
202 * Possible reasons why the timestamps of an aggregate are updated
203 * enqueue: the aggregate switches from idle to active and must scheduled
204 * for service
205 * requeue: the aggregate finishes its budget, so it stops being served and
206 * must be rescheduled for service
207 */
208enum update_reason {enqueue, requeue};
209
156static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) 210static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
157{ 211{
158 struct qfq_sched *q = qdisc_priv(sch); 212 struct qfq_sched *q = qdisc_priv(sch);
@@ -182,18 +236,18 @@ static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
182 * index = log_2(maxlen/weight) but we need to apply the scaling. 236 * index = log_2(maxlen/weight) but we need to apply the scaling.
183 * This is used only once at flow creation. 237 * This is used only once at flow creation.
184 */ 238 */
185static int qfq_calc_index(u32 inv_w, unsigned int maxlen) 239static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift)
186{ 240{
187 u64 slot_size = (u64)maxlen * inv_w; 241 u64 slot_size = (u64)maxlen * inv_w;
188 unsigned long size_map; 242 unsigned long size_map;
189 int index = 0; 243 int index = 0;
190 244
191 size_map = slot_size >> QFQ_MIN_SLOT_SHIFT; 245 size_map = slot_size >> min_slot_shift;
192 if (!size_map) 246 if (!size_map)
193 goto out; 247 goto out;
194 248
195 index = __fls(size_map) + 1; /* basically a log_2 */ 249 index = __fls(size_map) + 1; /* basically a log_2 */
196 index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1))); 250 index -= !(slot_size - (1ULL << (index + min_slot_shift - 1)));
197 251
198 if (index < 0) 252 if (index < 0)
199 index = 0; 253 index = 0;
@@ -204,66 +258,150 @@ out:
204 return index; 258 return index;
205} 259}
206 260
207/* Length of the next packet (0 if the queue is empty). */ 261static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *);
208static unsigned int qdisc_peek_len(struct Qdisc *sch) 262static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *,
263 enum update_reason);
264
265static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
266 u32 lmax, u32 weight)
209{ 267{
210 struct sk_buff *skb; 268 INIT_LIST_HEAD(&agg->active);
269 hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
270
271 agg->lmax = lmax;
272 agg->class_weight = weight;
273}
274
275static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q,
276 u32 lmax, u32 weight)
277{
278 struct qfq_aggregate *agg;
279 struct hlist_node *n;
280
281 hlist_for_each_entry(agg, n, &q->nonfull_aggs, nonfull_next)
282 if (agg->lmax == lmax && agg->class_weight == weight)
283 return agg;
284
285 return NULL;
286}
287
211 288
212 skb = sch->ops->peek(sch); 289/* Update aggregate as a function of the new number of classes. */
213 return skb ? qdisc_pkt_len(skb) : 0; 290static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
291 int new_num_classes)
292{
293 u32 new_agg_weight;
294
295 if (new_num_classes == q->max_agg_classes)
296 hlist_del_init(&agg->nonfull_next);
297
298 if (agg->num_classes > new_num_classes &&
299 new_num_classes == q->max_agg_classes - 1) /* agg no more full */
300 hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
301
302 agg->budgetmax = new_num_classes * agg->lmax;
303 new_agg_weight = agg->class_weight * new_num_classes;
304 agg->inv_w = ONE_FP/new_agg_weight;
305
306 if (agg->grp == NULL) {
307 int i = qfq_calc_index(agg->inv_w, agg->budgetmax,
308 q->min_slot_shift);
309 agg->grp = &q->groups[i];
310 }
311
312 q->wsum +=
313 (int) agg->class_weight * (new_num_classes - agg->num_classes);
314
315 agg->num_classes = new_num_classes;
316}
317
318/* Add class to aggregate. */
319static void qfq_add_to_agg(struct qfq_sched *q,
320 struct qfq_aggregate *agg,
321 struct qfq_class *cl)
322{
323 cl->agg = agg;
324
325 qfq_update_agg(q, agg, agg->num_classes+1);
326 if (cl->qdisc->q.qlen > 0) { /* adding an active class */
327 list_add_tail(&cl->alist, &agg->active);
328 if (list_first_entry(&agg->active, struct qfq_class, alist) ==
329 cl && q->in_serv_agg != agg) /* agg was inactive */
330 qfq_activate_agg(q, agg, enqueue); /* schedule agg */
331 }
214} 332}
215 333
216static void qfq_deactivate_class(struct qfq_sched *, struct qfq_class *); 334static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *);
217static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl,
218 unsigned int len);
219 335
220static void qfq_update_class_params(struct qfq_sched *q, struct qfq_class *cl, 336static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
221 u32 lmax, u32 inv_w, int delta_w)
222{ 337{
223 int i; 338 if (!hlist_unhashed(&agg->nonfull_next))
339 hlist_del_init(&agg->nonfull_next);
340 if (q->in_serv_agg == agg)
341 q->in_serv_agg = qfq_choose_next_agg(q);
342 kfree(agg);
343}
224 344
225 /* update qfq-specific data */ 345/* Deschedule class from within its parent aggregate. */
226 cl->lmax = lmax; 346static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
227 cl->inv_w = inv_w; 347{
228 i = qfq_calc_index(cl->inv_w, cl->lmax); 348 struct qfq_aggregate *agg = cl->agg;
229 349
230 cl->grp = &q->groups[i];
231 350
232 q->wsum += delta_w; 351 list_del(&cl->alist); /* remove from RR queue of the aggregate */
352 if (list_empty(&agg->active)) /* agg is now inactive */
353 qfq_deactivate_agg(q, agg);
233} 354}
234 355
235static void qfq_update_reactivate_class(struct qfq_sched *q, 356/* Remove class from its parent aggregate. */
236 struct qfq_class *cl, 357static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
237 u32 inv_w, u32 lmax, int delta_w)
238{ 358{
239 bool need_reactivation = false; 359 struct qfq_aggregate *agg = cl->agg;
240 int i = qfq_calc_index(inv_w, lmax);
241 360
242 if (&q->groups[i] != cl->grp && cl->qdisc->q.qlen > 0) { 361 cl->agg = NULL;
243 /* 362 if (agg->num_classes == 1) { /* agg being emptied, destroy it */
244 * shift cl->F back, to not charge the 363 qfq_destroy_agg(q, agg);
245 * class for the not-yet-served head 364 return;
246 * packet
247 */
248 cl->F = cl->S;
249 /* remove class from its slot in the old group */
250 qfq_deactivate_class(q, cl);
251 need_reactivation = true;
252 } 365 }
366 qfq_update_agg(q, agg, agg->num_classes-1);
367}
253 368
254 qfq_update_class_params(q, cl, lmax, inv_w, delta_w); 369/* Deschedule class and remove it from its parent aggregate. */
370static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
371{
372 if (cl->qdisc->q.qlen > 0) /* class is active */
373 qfq_deactivate_class(q, cl);
255 374
256 if (need_reactivation) /* activate in new group */ 375 qfq_rm_from_agg(q, cl);
257 qfq_activate_class(q, cl, qdisc_peek_len(cl->qdisc));
258} 376}
259 377
378/* Move class to a new aggregate, matching the new class weight and/or lmax */
379static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight,
380 u32 lmax)
381{
382 struct qfq_sched *q = qdisc_priv(sch);
383 struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight);
384
385 if (new_agg == NULL) { /* create new aggregate */
386 new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC);
387 if (new_agg == NULL)
388 return -ENOBUFS;
389 qfq_init_agg(q, new_agg, lmax, weight);
390 }
391 qfq_deact_rm_from_agg(q, cl);
392 qfq_add_to_agg(q, new_agg, cl);
393
394 return 0;
395}
260 396
261static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, 397static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
262 struct nlattr **tca, unsigned long *arg) 398 struct nlattr **tca, unsigned long *arg)
263{ 399{
264 struct qfq_sched *q = qdisc_priv(sch); 400 struct qfq_sched *q = qdisc_priv(sch);
265 struct qfq_class *cl = (struct qfq_class *)*arg; 401 struct qfq_class *cl = (struct qfq_class *)*arg;
402 bool existing = false;
266 struct nlattr *tb[TCA_QFQ_MAX + 1]; 403 struct nlattr *tb[TCA_QFQ_MAX + 1];
404 struct qfq_aggregate *new_agg = NULL;
267 u32 weight, lmax, inv_w; 405 u32 weight, lmax, inv_w;
268 int err; 406 int err;
269 int delta_w; 407 int delta_w;
@@ -286,15 +424,6 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
286 } else 424 } else
287 weight = 1; 425 weight = 1;
288 426
289 inv_w = ONE_FP / weight;
290 weight = ONE_FP / inv_w;
291 delta_w = weight - (cl ? ONE_FP / cl->inv_w : 0);
292 if (q->wsum + delta_w > QFQ_MAX_WSUM) {
293 pr_notice("qfq: total weight out of range (%u + %u)\n",
294 delta_w, q->wsum);
295 return -EINVAL;
296 }
297
298 if (tb[TCA_QFQ_LMAX]) { 427 if (tb[TCA_QFQ_LMAX]) {
299 lmax = nla_get_u32(tb[TCA_QFQ_LMAX]); 428 lmax = nla_get_u32(tb[TCA_QFQ_LMAX]);
300 if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) { 429 if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) {
@@ -304,7 +433,23 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
304 } else 433 } else
305 lmax = psched_mtu(qdisc_dev(sch)); 434 lmax = psched_mtu(qdisc_dev(sch));
306 435
307 if (cl != NULL) { 436 inv_w = ONE_FP / weight;
437 weight = ONE_FP / inv_w;
438
439 if (cl != NULL &&
440 lmax == cl->agg->lmax &&
441 weight == cl->agg->class_weight)
442 return 0; /* nothing to change */
443
444 delta_w = weight - (cl ? cl->agg->class_weight : 0);
445
446 if (q->wsum + delta_w > QFQ_MAX_WSUM) {
447 pr_notice("qfq: total weight out of range (%d + %u)\n",
448 delta_w, q->wsum);
449 return -EINVAL;
450 }
451
452 if (cl != NULL) { /* modify existing class */
308 if (tca[TCA_RATE]) { 453 if (tca[TCA_RATE]) {
309 err = gen_replace_estimator(&cl->bstats, &cl->rate_est, 454 err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
310 qdisc_root_sleeping_lock(sch), 455 qdisc_root_sleeping_lock(sch),
@@ -312,25 +457,18 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
312 if (err) 457 if (err)
313 return err; 458 return err;
314 } 459 }
315 460 existing = true;
316 if (lmax == cl->lmax && inv_w == cl->inv_w) 461 goto set_change_agg;
317 return 0; /* nothing to update */
318
319 sch_tree_lock(sch);
320 qfq_update_reactivate_class(q, cl, inv_w, lmax, delta_w);
321 sch_tree_unlock(sch);
322
323 return 0;
324 } 462 }
325 463
464 /* create and init new class */
326 cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL); 465 cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL);
327 if (cl == NULL) 466 if (cl == NULL)
328 return -ENOBUFS; 467 return -ENOBUFS;
329 468
330 cl->refcnt = 1; 469 cl->refcnt = 1;
331 cl->common.classid = classid; 470 cl->common.classid = classid;
332 471 cl->deficit = lmax;
333 qfq_update_class_params(q, cl, lmax, inv_w, delta_w);
334 472
335 cl->qdisc = qdisc_create_dflt(sch->dev_queue, 473 cl->qdisc = qdisc_create_dflt(sch->dev_queue,
336 &pfifo_qdisc_ops, classid); 474 &pfifo_qdisc_ops, classid);
@@ -341,11 +479,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
341 err = gen_new_estimator(&cl->bstats, &cl->rate_est, 479 err = gen_new_estimator(&cl->bstats, &cl->rate_est,
342 qdisc_root_sleeping_lock(sch), 480 qdisc_root_sleeping_lock(sch),
343 tca[TCA_RATE]); 481 tca[TCA_RATE]);
344 if (err) { 482 if (err)
345 qdisc_destroy(cl->qdisc); 483 goto destroy_class;
346 kfree(cl);
347 return err;
348 }
349 } 484 }
350 485
351 sch_tree_lock(sch); 486 sch_tree_lock(sch);
@@ -354,19 +489,39 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
354 489
355 qdisc_class_hash_grow(sch, &q->clhash); 490 qdisc_class_hash_grow(sch, &q->clhash);
356 491
492set_change_agg:
493 sch_tree_lock(sch);
494 new_agg = qfq_find_agg(q, lmax, weight);
495 if (new_agg == NULL) { /* create new aggregate */
496 sch_tree_unlock(sch);
497 new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
498 if (new_agg == NULL) {
499 err = -ENOBUFS;
500 gen_kill_estimator(&cl->bstats, &cl->rate_est);
501 goto destroy_class;
502 }
503 sch_tree_lock(sch);
504 qfq_init_agg(q, new_agg, lmax, weight);
505 }
506 if (existing)
507 qfq_deact_rm_from_agg(q, cl);
508 qfq_add_to_agg(q, new_agg, cl);
509 sch_tree_unlock(sch);
510
357 *arg = (unsigned long)cl; 511 *arg = (unsigned long)cl;
358 return 0; 512 return 0;
513
514destroy_class:
515 qdisc_destroy(cl->qdisc);
516 kfree(cl);
517 return err;
359} 518}
360 519
361static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) 520static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
362{ 521{
363 struct qfq_sched *q = qdisc_priv(sch); 522 struct qfq_sched *q = qdisc_priv(sch);
364 523
365 if (cl->inv_w) { 524 qfq_rm_from_agg(q, cl);
366 q->wsum -= ONE_FP / cl->inv_w;
367 cl->inv_w = 0;
368 }
369
370 gen_kill_estimator(&cl->bstats, &cl->rate_est); 525 gen_kill_estimator(&cl->bstats, &cl->rate_est);
371 qdisc_destroy(cl->qdisc); 526 qdisc_destroy(cl->qdisc);
372 kfree(cl); 527 kfree(cl);
@@ -481,8 +636,8 @@ static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
481 nest = nla_nest_start(skb, TCA_OPTIONS); 636 nest = nla_nest_start(skb, TCA_OPTIONS);
482 if (nest == NULL) 637 if (nest == NULL)
483 goto nla_put_failure; 638 goto nla_put_failure;
484 if (nla_put_u32(skb, TCA_QFQ_WEIGHT, ONE_FP/cl->inv_w) || 639 if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) ||
485 nla_put_u32(skb, TCA_QFQ_LMAX, cl->lmax)) 640 nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax))
486 goto nla_put_failure; 641 goto nla_put_failure;
487 return nla_nest_end(skb, nest); 642 return nla_nest_end(skb, nest);
488 643
@@ -500,8 +655,8 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
500 memset(&xstats, 0, sizeof(xstats)); 655 memset(&xstats, 0, sizeof(xstats));
501 cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; 656 cl->qdisc->qstats.qlen = cl->qdisc->q.qlen;
502 657
503 xstats.weight = ONE_FP/cl->inv_w; 658 xstats.weight = cl->agg->class_weight;
504 xstats.lmax = cl->lmax; 659 xstats.lmax = cl->agg->lmax;
505 660
506 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || 661 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
507 gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || 662 gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
@@ -652,16 +807,16 @@ static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F)
652 * perhaps 807 * perhaps
653 * 808 *
654 old_V ^= q->V; 809 old_V ^= q->V;
655 old_V >>= QFQ_MIN_SLOT_SHIFT; 810 old_V >>= q->min_slot_shift;
656 if (old_V) { 811 if (old_V) {
657 ... 812 ...
658 } 813 }
659 * 814 *
660 */ 815 */
661static void qfq_make_eligible(struct qfq_sched *q, u64 old_V) 816static void qfq_make_eligible(struct qfq_sched *q)
662{ 817{
663 unsigned long vslot = q->V >> QFQ_MIN_SLOT_SHIFT; 818 unsigned long vslot = q->V >> q->min_slot_shift;
664 unsigned long old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT; 819 unsigned long old_vslot = q->oldV >> q->min_slot_shift;
665 820
666 if (vslot != old_vslot) { 821 if (vslot != old_vslot) {
667 unsigned long mask = (1UL << fls(vslot ^ old_vslot)) - 1; 822 unsigned long mask = (1UL << fls(vslot ^ old_vslot)) - 1;
@@ -672,34 +827,38 @@ static void qfq_make_eligible(struct qfq_sched *q, u64 old_V)
672 827
673 828
674/* 829/*
675 * If the weight and lmax (max_pkt_size) of the classes do not change, 830 * The index of the slot in which the aggregate is to be inserted must
676 * then QFQ guarantees that the slot index is never higher than 831 * not be higher than QFQ_MAX_SLOTS-2. There is a '-2' and not a '-1'
677 * 2 + ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) * (QFQ_MAX_WEIGHT/QFQ_MAX_WSUM). 832 * because the start time of the group may be moved backward by one
833 * slot after the aggregate has been inserted, and this would cause
834 * non-empty slots to be right-shifted by one position.
678 * 835 *
679 * With the current values of the above constants, the index is 836 * If the weight and lmax (max_pkt_size) of the classes do not change,
680 * then guaranteed to never be higher than 2 + 256 * (1 / 16) = 18. 837 * then QFQ+ does meet the above contraint according to the current
838 * values of its parameters. In fact, if the weight and lmax of the
839 * classes do not change, then, from the theory, QFQ+ guarantees that
840 * the slot index is never higher than
841 * 2 + QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
842 * (QFQ_MAX_WEIGHT/QFQ_MAX_WSUM) = 2 + 8 * 128 * (1 / 64) = 18
681 * 843 *
682 * When the weight of a class is increased or the lmax of the class is 844 * When the weight of a class is increased or the lmax of the class is
683 * decreased, a new class with smaller slot size may happen to be 845 * decreased, a new aggregate with smaller slot size than the original
684 * activated. The activation of this class should be properly delayed 846 * parent aggregate of the class may happen to be activated. The
685 * to when the service of the class has finished in the ideal system 847 * activation of this aggregate should be properly delayed to when the
686 * tracked by QFQ. If the activation of the class is not delayed to 848 * service of the class has finished in the ideal system tracked by
687 * this reference time instant, then this class may be unjustly served 849 * QFQ+. If the activation of the aggregate is not delayed to this
688 * before other classes waiting for service. This may cause 850 * reference time instant, then this aggregate may be unjustly served
689 * (unfrequently) the above bound to the slot index to be violated for 851 * before other aggregates waiting for service. This may cause the
690 * some of these unlucky classes. 852 * above bound to the slot index to be violated for some of these
853 * unlucky aggregates.
691 * 854 *
692 * Instead of delaying the activation of the new class, which is quite 855 * Instead of delaying the activation of the new aggregate, which is
693 * complex, the following inaccurate but simple solution is used: if 856 * quite complex, the following inaccurate but simple solution is used:
694 * the slot index is higher than QFQ_MAX_SLOTS-2, then the timestamps 857 * if the slot index is higher than QFQ_MAX_SLOTS-2, then the
695 * of the class are shifted backward so as to let the slot index 858 * timestamps of the aggregate are shifted backward so as to let the
696 * become equal to QFQ_MAX_SLOTS-2. This threshold is used because, if 859 * slot index become equal to QFQ_MAX_SLOTS-2.
697 * the slot index is above it, then the data structure implementing
698 * the bucket list either gets immediately corrupted or may get
699 * corrupted on a possible next packet arrival that causes the start
700 * time of the group to be shifted backward.
701 */ 860 */
702static void qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, 861static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg,
703 u64 roundedS) 862 u64 roundedS)
704{ 863{
705 u64 slot = (roundedS - grp->S) >> grp->slot_shift; 864 u64 slot = (roundedS - grp->S) >> grp->slot_shift;
@@ -708,22 +867,22 @@ static void qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl,
708 if (unlikely(slot > QFQ_MAX_SLOTS - 2)) { 867 if (unlikely(slot > QFQ_MAX_SLOTS - 2)) {
709 u64 deltaS = roundedS - grp->S - 868 u64 deltaS = roundedS - grp->S -
710 ((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift); 869 ((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift);
711 cl->S -= deltaS; 870 agg->S -= deltaS;
712 cl->F -= deltaS; 871 agg->F -= deltaS;
713 slot = QFQ_MAX_SLOTS - 2; 872 slot = QFQ_MAX_SLOTS - 2;
714 } 873 }
715 874
716 i = (grp->front + slot) % QFQ_MAX_SLOTS; 875 i = (grp->front + slot) % QFQ_MAX_SLOTS;
717 876
718 hlist_add_head(&cl->next, &grp->slots[i]); 877 hlist_add_head(&agg->next, &grp->slots[i]);
719 __set_bit(slot, &grp->full_slots); 878 __set_bit(slot, &grp->full_slots);
720} 879}
721 880
722/* Maybe introduce hlist_first_entry?? */ 881/* Maybe introduce hlist_first_entry?? */
723static struct qfq_class *qfq_slot_head(struct qfq_group *grp) 882static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp)
724{ 883{
725 return hlist_entry(grp->slots[grp->front].first, 884 return hlist_entry(grp->slots[grp->front].first,
726 struct qfq_class, next); 885 struct qfq_aggregate, next);
727} 886}
728 887
729/* 888/*
@@ -731,20 +890,20 @@ static struct qfq_class *qfq_slot_head(struct qfq_group *grp)
731 */ 890 */
732static void qfq_front_slot_remove(struct qfq_group *grp) 891static void qfq_front_slot_remove(struct qfq_group *grp)
733{ 892{
734 struct qfq_class *cl = qfq_slot_head(grp); 893 struct qfq_aggregate *agg = qfq_slot_head(grp);
735 894
736 BUG_ON(!cl); 895 BUG_ON(!agg);
737 hlist_del(&cl->next); 896 hlist_del(&agg->next);
738 if (hlist_empty(&grp->slots[grp->front])) 897 if (hlist_empty(&grp->slots[grp->front]))
739 __clear_bit(0, &grp->full_slots); 898 __clear_bit(0, &grp->full_slots);
740} 899}
741 900
742/* 901/*
743 * Returns the first full queue in a group. As a side effect, 902 * Returns the first aggregate in the first non-empty bucket of the
744 * adjust the bucket list so the first non-empty bucket is at 903 * group. As a side effect, adjusts the bucket list so the first
745 * position 0 in full_slots. 904 * non-empty bucket is at position 0 in full_slots.
746 */ 905 */
747static struct qfq_class *qfq_slot_scan(struct qfq_group *grp) 906static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp)
748{ 907{
749 unsigned int i; 908 unsigned int i;
750 909
@@ -780,7 +939,7 @@ static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS)
780 grp->front = (grp->front - i) % QFQ_MAX_SLOTS; 939 grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
781} 940}
782 941
783static void qfq_update_eligible(struct qfq_sched *q, u64 old_V) 942static void qfq_update_eligible(struct qfq_sched *q)
784{ 943{
785 struct qfq_group *grp; 944 struct qfq_group *grp;
786 unsigned long ineligible; 945 unsigned long ineligible;
@@ -792,137 +951,226 @@ static void qfq_update_eligible(struct qfq_sched *q, u64 old_V)
792 if (qfq_gt(grp->S, q->V)) 951 if (qfq_gt(grp->S, q->V))
793 q->V = grp->S; 952 q->V = grp->S;
794 } 953 }
795 qfq_make_eligible(q, old_V); 954 qfq_make_eligible(q);
796 } 955 }
797} 956}
798 957
799/* 958/* Dequeue head packet of the head class in the DRR queue of the aggregate. */
800 * Updates the class, returns true if also the group needs to be updated. 959static void agg_dequeue(struct qfq_aggregate *agg,
801 */ 960 struct qfq_class *cl, unsigned int len)
802static bool qfq_update_class(struct qfq_group *grp, struct qfq_class *cl)
803{ 961{
804 unsigned int len = qdisc_peek_len(cl->qdisc); 962 qdisc_dequeue_peeked(cl->qdisc);
805 963
806 cl->S = cl->F; 964 cl->deficit -= (int) len;
807 if (!len)
808 qfq_front_slot_remove(grp); /* queue is empty */
809 else {
810 u64 roundedS;
811 965
812 cl->F = cl->S + (u64)len * cl->inv_w; 966 if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */
813 roundedS = qfq_round_down(cl->S, grp->slot_shift); 967 list_del(&cl->alist);
814 if (roundedS == grp->S) 968 else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) {
815 return false; 969 cl->deficit += agg->lmax;
816 970 list_move_tail(&cl->alist, &agg->active);
817 qfq_front_slot_remove(grp);
818 qfq_slot_insert(grp, cl, roundedS);
819 } 971 }
972}
973
974static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg,
975 struct qfq_class **cl,
976 unsigned int *len)
977{
978 struct sk_buff *skb;
820 979
821 return true; 980 *cl = list_first_entry(&agg->active, struct qfq_class, alist);
981 skb = (*cl)->qdisc->ops->peek((*cl)->qdisc);
982 if (skb == NULL)
983 WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n");
984 else
985 *len = qdisc_pkt_len(skb);
986
987 return skb;
988}
989
990/* Update F according to the actual service received by the aggregate. */
991static inline void charge_actual_service(struct qfq_aggregate *agg)
992{
993 /* compute the service received by the aggregate */
994 u32 service_received = agg->initial_budget - agg->budget;
995
996 agg->F = agg->S + (u64)service_received * agg->inv_w;
822} 997}
823 998
824static struct sk_buff *qfq_dequeue(struct Qdisc *sch) 999static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
825{ 1000{
826 struct qfq_sched *q = qdisc_priv(sch); 1001 struct qfq_sched *q = qdisc_priv(sch);
827 struct qfq_group *grp; 1002 struct qfq_aggregate *in_serv_agg = q->in_serv_agg;
828 struct qfq_class *cl; 1003 struct qfq_class *cl;
829 struct sk_buff *skb; 1004 struct sk_buff *skb = NULL;
830 unsigned int len; 1005 /* next-packet len, 0 means no more active classes in in-service agg */
831 u64 old_V; 1006 unsigned int len = 0;
832 1007
833 if (!q->bitmaps[ER]) 1008 if (in_serv_agg == NULL)
834 return NULL; 1009 return NULL;
835 1010
836 grp = qfq_ffs(q, q->bitmaps[ER]); 1011 if (!list_empty(&in_serv_agg->active))
1012 skb = qfq_peek_skb(in_serv_agg, &cl, &len);
837 1013
838 cl = qfq_slot_head(grp); 1014 /*
839 skb = qdisc_dequeue_peeked(cl->qdisc); 1015 * If there are no active classes in the in-service aggregate,
840 if (!skb) { 1016 * or if the aggregate has not enough budget to serve its next
841 WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n"); 1017 * class, then choose the next aggregate to serve.
842 return NULL; 1018 */
1019 if (len == 0 || in_serv_agg->budget < len) {
1020 charge_actual_service(in_serv_agg);
1021
1022 /* recharge the budget of the aggregate */
1023 in_serv_agg->initial_budget = in_serv_agg->budget =
1024 in_serv_agg->budgetmax;
1025
1026 if (!list_empty(&in_serv_agg->active))
1027 /*
1028 * Still active: reschedule for
1029 * service. Possible optimization: if no other
1030 * aggregate is active, then there is no point
1031 * in rescheduling this aggregate, and we can
1032 * just keep it as the in-service one. This
1033 * should be however a corner case, and to
1034 * handle it, we would need to maintain an
1035 * extra num_active_aggs field.
1036 */
1037 qfq_activate_agg(q, in_serv_agg, requeue);
1038 else if (sch->q.qlen == 0) { /* no aggregate to serve */
1039 q->in_serv_agg = NULL;
1040 return NULL;
1041 }
1042
1043 /*
1044 * If we get here, there are other aggregates queued:
1045 * choose the new aggregate to serve.
1046 */
1047 in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q);
1048 skb = qfq_peek_skb(in_serv_agg, &cl, &len);
843 } 1049 }
1050 if (!skb)
1051 return NULL;
844 1052
845 sch->q.qlen--; 1053 sch->q.qlen--;
846 qdisc_bstats_update(sch, skb); 1054 qdisc_bstats_update(sch, skb);
847 1055
848 old_V = q->V; 1056 agg_dequeue(in_serv_agg, cl, len);
849 len = qdisc_pkt_len(skb); 1057 in_serv_agg->budget -= len;
850 q->V += (u64)len * IWSUM; 1058 q->V += (u64)len * IWSUM;
851 pr_debug("qfq dequeue: len %u F %lld now %lld\n", 1059 pr_debug("qfq dequeue: len %u F %lld now %lld\n",
852 len, (unsigned long long) cl->F, (unsigned long long) q->V); 1060 len, (unsigned long long) in_serv_agg->F,
1061 (unsigned long long) q->V);
853 1062
854 if (qfq_update_class(grp, cl)) { 1063 return skb;
855 u64 old_F = grp->F; 1064}
856 1065
857 cl = qfq_slot_scan(grp); 1066static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q)
858 if (!cl) 1067{
859 __clear_bit(grp->index, &q->bitmaps[ER]); 1068 struct qfq_group *grp;
860 else { 1069 struct qfq_aggregate *agg, *new_front_agg;
861 u64 roundedS = qfq_round_down(cl->S, grp->slot_shift); 1070 u64 old_F;
862 unsigned int s;
863 1071
864 if (grp->S == roundedS) 1072 qfq_update_eligible(q);
865 goto skip_unblock; 1073 q->oldV = q->V;
866 grp->S = roundedS; 1074
867 grp->F = roundedS + (2ULL << grp->slot_shift); 1075 if (!q->bitmaps[ER])
868 __clear_bit(grp->index, &q->bitmaps[ER]); 1076 return NULL;
869 s = qfq_calc_state(q, grp); 1077
870 __set_bit(grp->index, &q->bitmaps[s]); 1078 grp = qfq_ffs(q, q->bitmaps[ER]);
871 } 1079 old_F = grp->F;
1080
1081 agg = qfq_slot_head(grp);
872 1082
873 qfq_unblock_groups(q, grp->index, old_F); 1083 /* agg starts to be served, remove it from schedule */
1084 qfq_front_slot_remove(grp);
1085
1086 new_front_agg = qfq_slot_scan(grp);
1087
1088 if (new_front_agg == NULL) /* group is now inactive, remove from ER */
1089 __clear_bit(grp->index, &q->bitmaps[ER]);
1090 else {
1091 u64 roundedS = qfq_round_down(new_front_agg->S,
1092 grp->slot_shift);
1093 unsigned int s;
1094
1095 if (grp->S == roundedS)
1096 return agg;
1097 grp->S = roundedS;
1098 grp->F = roundedS + (2ULL << grp->slot_shift);
1099 __clear_bit(grp->index, &q->bitmaps[ER]);
1100 s = qfq_calc_state(q, grp);
1101 __set_bit(grp->index, &q->bitmaps[s]);
874 } 1102 }
875 1103
876skip_unblock: 1104 qfq_unblock_groups(q, grp->index, old_F);
877 qfq_update_eligible(q, old_V);
878 1105
879 return skb; 1106 return agg;
880} 1107}
881 1108
882/* 1109/*
883 * Assign a reasonable start time for a new flow k in group i. 1110 * Assign a reasonable start time for a new aggregate in group i.
884 * Admissible values for \hat(F) are multiples of \sigma_i 1111 * Admissible values for \hat(F) are multiples of \sigma_i
885 * no greater than V+\sigma_i . Larger values mean that 1112 * no greater than V+\sigma_i . Larger values mean that
886 * we had a wraparound so we consider the timestamp to be stale. 1113 * we had a wraparound so we consider the timestamp to be stale.
887 * 1114 *
888 * If F is not stale and F >= V then we set S = F. 1115 * If F is not stale and F >= V then we set S = F.
889 * Otherwise we should assign S = V, but this may violate 1116 * Otherwise we should assign S = V, but this may violate
890 * the ordering in ER. So, if we have groups in ER, set S to 1117 * the ordering in EB (see [2]). So, if we have groups in ER,
891 * the F_j of the first group j which would be blocking us. 1118 * set S to the F_j of the first group j which would be blocking us.
892 * We are guaranteed not to move S backward because 1119 * We are guaranteed not to move S backward because
893 * otherwise our group i would still be blocked. 1120 * otherwise our group i would still be blocked.
894 */ 1121 */
895static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) 1122static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg)
896{ 1123{
897 unsigned long mask; 1124 unsigned long mask;
898 u64 limit, roundedF; 1125 u64 limit, roundedF;
899 int slot_shift = cl->grp->slot_shift; 1126 int slot_shift = agg->grp->slot_shift;
900 1127
901 roundedF = qfq_round_down(cl->F, slot_shift); 1128 roundedF = qfq_round_down(agg->F, slot_shift);
902 limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift); 1129 limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
903 1130
904 if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { 1131 if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) {
905 /* timestamp was stale */ 1132 /* timestamp was stale */
906 mask = mask_from(q->bitmaps[ER], cl->grp->index); 1133 mask = mask_from(q->bitmaps[ER], agg->grp->index);
907 if (mask) { 1134 if (mask) {
908 struct qfq_group *next = qfq_ffs(q, mask); 1135 struct qfq_group *next = qfq_ffs(q, mask);
909 if (qfq_gt(roundedF, next->F)) { 1136 if (qfq_gt(roundedF, next->F)) {
910 if (qfq_gt(limit, next->F)) 1137 if (qfq_gt(limit, next->F))
911 cl->S = next->F; 1138 agg->S = next->F;
912 else /* preserve timestamp correctness */ 1139 else /* preserve timestamp correctness */
913 cl->S = limit; 1140 agg->S = limit;
914 return; 1141 return;
915 } 1142 }
916 } 1143 }
917 cl->S = q->V; 1144 agg->S = q->V;
918 } else /* timestamp is not stale */ 1145 } else /* timestamp is not stale */
919 cl->S = cl->F; 1146 agg->S = agg->F;
920} 1147}
921 1148
1149/*
1150 * Update the timestamps of agg before scheduling/rescheduling it for
1151 * service. In particular, assign to agg->F its maximum possible
1152 * value, i.e., the virtual finish time with which the aggregate
1153 * should be labeled if it used all its budget once in service.
1154 */
1155static inline void
1156qfq_update_agg_ts(struct qfq_sched *q,
1157 struct qfq_aggregate *agg, enum update_reason reason)
1158{
1159 if (reason != requeue)
1160 qfq_update_start(q, agg);
1161 else /* just charge agg for the service received */
1162 agg->S = agg->F;
1163
1164 agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w;
1165}
1166
1167static void qfq_schedule_agg(struct qfq_sched *, struct qfq_aggregate *);
1168
922static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) 1169static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
923{ 1170{
924 struct qfq_sched *q = qdisc_priv(sch); 1171 struct qfq_sched *q = qdisc_priv(sch);
925 struct qfq_class *cl; 1172 struct qfq_class *cl;
1173 struct qfq_aggregate *agg;
926 int err = 0; 1174 int err = 0;
927 1175
928 cl = qfq_classify(skb, sch, &err); 1176 cl = qfq_classify(skb, sch, &err);
@@ -934,11 +1182,13 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
934 } 1182 }
935 pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid); 1183 pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid);
936 1184
937 if (unlikely(cl->lmax < qdisc_pkt_len(skb))) { 1185 if (unlikely(cl->agg->lmax < qdisc_pkt_len(skb))) {
938 pr_debug("qfq: increasing maxpkt from %u to %u for class %u", 1186 pr_debug("qfq: increasing maxpkt from %u to %u for class %u",
939 cl->lmax, qdisc_pkt_len(skb), cl->common.classid); 1187 cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid);
940 qfq_update_reactivate_class(q, cl, cl->inv_w, 1188 err = qfq_change_agg(sch, cl, cl->agg->class_weight,
941 qdisc_pkt_len(skb), 0); 1189 qdisc_pkt_len(skb));
1190 if (err)
1191 return err;
942 } 1192 }
943 1193
944 err = qdisc_enqueue(skb, cl->qdisc); 1194 err = qdisc_enqueue(skb, cl->qdisc);
@@ -954,35 +1204,50 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
954 bstats_update(&cl->bstats, skb); 1204 bstats_update(&cl->bstats, skb);
955 ++sch->q.qlen; 1205 ++sch->q.qlen;
956 1206
957 /* If the new skb is not the head of queue, then done here. */ 1207 agg = cl->agg;
958 if (cl->qdisc->q.qlen != 1) 1208 /* if the queue was not empty, then done here */
1209 if (cl->qdisc->q.qlen != 1) {
1210 if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) &&
1211 list_first_entry(&agg->active, struct qfq_class, alist)
1212 == cl && cl->deficit < qdisc_pkt_len(skb))
1213 list_move_tail(&cl->alist, &agg->active);
1214
959 return err; 1215 return err;
1216 }
1217
1218 /* schedule class for service within the aggregate */
1219 cl->deficit = agg->lmax;
1220 list_add_tail(&cl->alist, &agg->active);
960 1221
961 /* If reach this point, queue q was idle */ 1222 if (list_first_entry(&agg->active, struct qfq_class, alist) != cl)
962 qfq_activate_class(q, cl, qdisc_pkt_len(skb)); 1223 return err; /* aggregate was not empty, nothing else to do */
1224
1225 /* recharge budget */
1226 agg->initial_budget = agg->budget = agg->budgetmax;
1227
1228 qfq_update_agg_ts(q, agg, enqueue);
1229 if (q->in_serv_agg == NULL)
1230 q->in_serv_agg = agg;
1231 else if (agg != q->in_serv_agg)
1232 qfq_schedule_agg(q, agg);
963 1233
964 return err; 1234 return err;
965} 1235}
966 1236
967/* 1237/*
968 * Handle class switch from idle to backlogged. 1238 * Schedule aggregate according to its timestamps.
969 */ 1239 */
970static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl, 1240static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
971 unsigned int pkt_len)
972{ 1241{
973 struct qfq_group *grp = cl->grp; 1242 struct qfq_group *grp = agg->grp;
974 u64 roundedS; 1243 u64 roundedS;
975 int s; 1244 int s;
976 1245
977 qfq_update_start(q, cl); 1246 roundedS = qfq_round_down(agg->S, grp->slot_shift);
978
979 /* compute new finish time and rounded start. */
980 cl->F = cl->S + (u64)pkt_len * cl->inv_w;
981 roundedS = qfq_round_down(cl->S, grp->slot_shift);
982 1247
983 /* 1248 /*
984 * insert cl in the correct bucket. 1249 * Insert agg in the correct bucket.
985 * If cl->S >= grp->S we don't need to adjust the 1250 * If agg->S >= grp->S we don't need to adjust the
986 * bucket list and simply go to the insertion phase. 1251 * bucket list and simply go to the insertion phase.
987 * Otherwise grp->S is decreasing, we must make room 1252 * Otherwise grp->S is decreasing, we must make room
988 * in the bucket list, and also recompute the group state. 1253 * in the bucket list, and also recompute the group state.
@@ -990,10 +1255,10 @@ static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl,
990 * was in ER make sure to adjust V. 1255 * was in ER make sure to adjust V.
991 */ 1256 */
992 if (grp->full_slots) { 1257 if (grp->full_slots) {
993 if (!qfq_gt(grp->S, cl->S)) 1258 if (!qfq_gt(grp->S, agg->S))
994 goto skip_update; 1259 goto skip_update;
995 1260
996 /* create a slot for this cl->S */ 1261 /* create a slot for this agg->S */
997 qfq_slot_rotate(grp, roundedS); 1262 qfq_slot_rotate(grp, roundedS);
998 /* group was surely ineligible, remove */ 1263 /* group was surely ineligible, remove */
999 __clear_bit(grp->index, &q->bitmaps[IR]); 1264 __clear_bit(grp->index, &q->bitmaps[IR]);
@@ -1008,46 +1273,61 @@ static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl,
1008 1273
1009 pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n", 1274 pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n",
1010 s, q->bitmaps[s], 1275 s, q->bitmaps[s],
1011 (unsigned long long) cl->S, 1276 (unsigned long long) agg->S,
1012 (unsigned long long) cl->F, 1277 (unsigned long long) agg->F,
1013 (unsigned long long) q->V); 1278 (unsigned long long) q->V);
1014 1279
1015skip_update: 1280skip_update:
1016 qfq_slot_insert(grp, cl, roundedS); 1281 qfq_slot_insert(grp, agg, roundedS);
1017} 1282}
1018 1283
1019 1284
1285/* Update agg ts and schedule agg for service */
1286static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
1287 enum update_reason reason)
1288{
1289 qfq_update_agg_ts(q, agg, reason);
1290 qfq_schedule_agg(q, agg);
1291}
1292
1020static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, 1293static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
1021 struct qfq_class *cl) 1294 struct qfq_aggregate *agg)
1022{ 1295{
1023 unsigned int i, offset; 1296 unsigned int i, offset;
1024 u64 roundedS; 1297 u64 roundedS;
1025 1298
1026 roundedS = qfq_round_down(cl->S, grp->slot_shift); 1299 roundedS = qfq_round_down(agg->S, grp->slot_shift);
1027 offset = (roundedS - grp->S) >> grp->slot_shift; 1300 offset = (roundedS - grp->S) >> grp->slot_shift;
1301
1028 i = (grp->front + offset) % QFQ_MAX_SLOTS; 1302 i = (grp->front + offset) % QFQ_MAX_SLOTS;
1029 1303
1030 hlist_del(&cl->next); 1304 hlist_del(&agg->next);
1031 if (hlist_empty(&grp->slots[i])) 1305 if (hlist_empty(&grp->slots[i]))
1032 __clear_bit(offset, &grp->full_slots); 1306 __clear_bit(offset, &grp->full_slots);
1033} 1307}
1034 1308
1035/* 1309/*
1036 * called to forcibly destroy a queue. 1310 * Called to forcibly deschedule an aggregate. If the aggregate is
1037 * If the queue is not in the front bucket, or if it has 1311 * not in the front bucket, or if the latter has other aggregates in
1038 * other queues in the front bucket, we can simply remove 1312 * the front bucket, we can simply remove the aggregate with no other
1039 * the queue with no other side effects. 1313 * side effects.
1040 * Otherwise we must propagate the event up. 1314 * Otherwise we must propagate the event up.
1041 */ 1315 */
1042static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) 1316static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
1043{ 1317{
1044 struct qfq_group *grp = cl->grp; 1318 struct qfq_group *grp = agg->grp;
1045 unsigned long mask; 1319 unsigned long mask;
1046 u64 roundedS; 1320 u64 roundedS;
1047 int s; 1321 int s;
1048 1322
1049 cl->F = cl->S; 1323 if (agg == q->in_serv_agg) {
1050 qfq_slot_remove(q, grp, cl); 1324 charge_actual_service(agg);
1325 q->in_serv_agg = qfq_choose_next_agg(q);
1326 return;
1327 }
1328
1329 agg->F = agg->S;
1330 qfq_slot_remove(q, grp, agg);
1051 1331
1052 if (!grp->full_slots) { 1332 if (!grp->full_slots) {
1053 __clear_bit(grp->index, &q->bitmaps[IR]); 1333 __clear_bit(grp->index, &q->bitmaps[IR]);
@@ -1066,8 +1346,8 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
1066 } 1346 }
1067 __clear_bit(grp->index, &q->bitmaps[ER]); 1347 __clear_bit(grp->index, &q->bitmaps[ER]);
1068 } else if (hlist_empty(&grp->slots[grp->front])) { 1348 } else if (hlist_empty(&grp->slots[grp->front])) {
1069 cl = qfq_slot_scan(grp); 1349 agg = qfq_slot_scan(grp);
1070 roundedS = qfq_round_down(cl->S, grp->slot_shift); 1350 roundedS = qfq_round_down(agg->S, grp->slot_shift);
1071 if (grp->S != roundedS) { 1351 if (grp->S != roundedS) {
1072 __clear_bit(grp->index, &q->bitmaps[ER]); 1352 __clear_bit(grp->index, &q->bitmaps[ER]);
1073 __clear_bit(grp->index, &q->bitmaps[IR]); 1353 __clear_bit(grp->index, &q->bitmaps[IR]);
@@ -1080,7 +1360,7 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
1080 } 1360 }
1081 } 1361 }
1082 1362
1083 qfq_update_eligible(q, q->V); 1363 qfq_update_eligible(q);
1084} 1364}
1085 1365
1086static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) 1366static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
@@ -1092,6 +1372,32 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1092 qfq_deactivate_class(q, cl); 1372 qfq_deactivate_class(q, cl);
1093} 1373}
1094 1374
1375static unsigned int qfq_drop_from_slot(struct qfq_sched *q,
1376 struct hlist_head *slot)
1377{
1378 struct qfq_aggregate *agg;
1379 struct hlist_node *n;
1380 struct qfq_class *cl;
1381 unsigned int len;
1382
1383 hlist_for_each_entry(agg, n, slot, next) {
1384 list_for_each_entry(cl, &agg->active, alist) {
1385
1386 if (!cl->qdisc->ops->drop)
1387 continue;
1388
1389 len = cl->qdisc->ops->drop(cl->qdisc);
1390 if (len > 0) {
1391 if (cl->qdisc->q.qlen == 0)
1392 qfq_deactivate_class(q, cl);
1393
1394 return len;
1395 }
1396 }
1397 }
1398 return 0;
1399}
1400
1095static unsigned int qfq_drop(struct Qdisc *sch) 1401static unsigned int qfq_drop(struct Qdisc *sch)
1096{ 1402{
1097 struct qfq_sched *q = qdisc_priv(sch); 1403 struct qfq_sched *q = qdisc_priv(sch);
@@ -1101,24 +1407,13 @@ static unsigned int qfq_drop(struct Qdisc *sch)
1101 for (i = 0; i <= QFQ_MAX_INDEX; i++) { 1407 for (i = 0; i <= QFQ_MAX_INDEX; i++) {
1102 grp = &q->groups[i]; 1408 grp = &q->groups[i];
1103 for (j = 0; j < QFQ_MAX_SLOTS; j++) { 1409 for (j = 0; j < QFQ_MAX_SLOTS; j++) {
1104 struct qfq_class *cl; 1410 len = qfq_drop_from_slot(q, &grp->slots[j]);
1105 struct hlist_node *n; 1411 if (len > 0) {
1106 1412 sch->q.qlen--;
1107 hlist_for_each_entry(cl, n, &grp->slots[j], next) { 1413 return len;
1108
1109 if (!cl->qdisc->ops->drop)
1110 continue;
1111
1112 len = cl->qdisc->ops->drop(cl->qdisc);
1113 if (len > 0) {
1114 sch->q.qlen--;
1115 if (!cl->qdisc->q.qlen)
1116 qfq_deactivate_class(q, cl);
1117
1118 return len;
1119 }
1120 } 1414 }
1121 } 1415 }
1416
1122 } 1417 }
1123 1418
1124 return 0; 1419 return 0;
@@ -1129,44 +1424,51 @@ static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
1129 struct qfq_sched *q = qdisc_priv(sch); 1424 struct qfq_sched *q = qdisc_priv(sch);
1130 struct qfq_group *grp; 1425 struct qfq_group *grp;
1131 int i, j, err; 1426 int i, j, err;
1427 u32 max_cl_shift, maxbudg_shift, max_classes;
1132 1428
1133 err = qdisc_class_hash_init(&q->clhash); 1429 err = qdisc_class_hash_init(&q->clhash);
1134 if (err < 0) 1430 if (err < 0)
1135 return err; 1431 return err;
1136 1432
1433 if (qdisc_dev(sch)->tx_queue_len + 1 > QFQ_MAX_AGG_CLASSES)
1434 max_classes = QFQ_MAX_AGG_CLASSES;
1435 else
1436 max_classes = qdisc_dev(sch)->tx_queue_len + 1;
1437 /* max_cl_shift = floor(log_2(max_classes)) */
1438 max_cl_shift = __fls(max_classes);
1439 q->max_agg_classes = 1<<max_cl_shift;
1440
1441 /* maxbudg_shift = log2(max_len * max_classes_per_agg) */
1442 maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift;
1443 q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX;
1444
1137 for (i = 0; i <= QFQ_MAX_INDEX; i++) { 1445 for (i = 0; i <= QFQ_MAX_INDEX; i++) {
1138 grp = &q->groups[i]; 1446 grp = &q->groups[i];
1139 grp->index = i; 1447 grp->index = i;
1140 grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS 1448 grp->slot_shift = q->min_slot_shift + i;
1141 - (QFQ_MAX_INDEX - i);
1142 for (j = 0; j < QFQ_MAX_SLOTS; j++) 1449 for (j = 0; j < QFQ_MAX_SLOTS; j++)
1143 INIT_HLIST_HEAD(&grp->slots[j]); 1450 INIT_HLIST_HEAD(&grp->slots[j]);
1144 } 1451 }
1145 1452
1453 INIT_HLIST_HEAD(&q->nonfull_aggs);
1454
1146 return 0; 1455 return 0;
1147} 1456}
1148 1457
1149static void qfq_reset_qdisc(struct Qdisc *sch) 1458static void qfq_reset_qdisc(struct Qdisc *sch)
1150{ 1459{
1151 struct qfq_sched *q = qdisc_priv(sch); 1460 struct qfq_sched *q = qdisc_priv(sch);
1152 struct qfq_group *grp;
1153 struct qfq_class *cl; 1461 struct qfq_class *cl;
1154 struct hlist_node *n, *tmp; 1462 struct hlist_node *n;
1155 unsigned int i, j; 1463 unsigned int i;
1156 1464
1157 for (i = 0; i <= QFQ_MAX_INDEX; i++) { 1465 for (i = 0; i < q->clhash.hashsize; i++) {
1158 grp = &q->groups[i]; 1466 hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
1159 for (j = 0; j < QFQ_MAX_SLOTS; j++) { 1467 if (cl->qdisc->q.qlen > 0)
1160 hlist_for_each_entry_safe(cl, n, tmp,
1161 &grp->slots[j], next) {
1162 qfq_deactivate_class(q, cl); 1468 qfq_deactivate_class(q, cl);
1163 }
1164 }
1165 }
1166 1469
1167 for (i = 0; i < q->clhash.hashsize; i++) {
1168 hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode)
1169 qdisc_reset(cl->qdisc); 1470 qdisc_reset(cl->qdisc);
1471 }
1170 } 1472 }
1171 sch->q.qlen = 0; 1473 sch->q.qlen = 0;
1172} 1474}