diff options
author | Paolo Valente <paolo.valente@unimore.it> | 2012-11-23 06:03:19 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2012-11-28 11:19:35 -0500 |
commit | 462dbc9101acd38e92eda93c0726857517a24bbd (patch) | |
tree | ad0433c2d8a2b2449c5ba85fe0353086293bc0de /net | |
parent | 351f33d9e0d3e26adf0ef5180e1e28b4737c49ce (diff) |
pkt_sched: QFQ Plus: fair-queueing service at DRR cost
This patch turns QFQ into QFQ+, a variant of QFQ that provides the
following two benefits: 1) QFQ+ is faster than QFQ, 2) differently
from QFQ, QFQ+ correctly schedules also non-leaves classes in a
hierarchical setting. A detailed description of QFQ+, plus a
performance comparison with DRR and QFQ, can be found in [1].
[1] P. Valente, "Reducing the Execution Time of Fair-Queueing Schedulers"
http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf
Signed-off-by: Paolo Valente <paolo.valente@unimore.it>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r-- | net/sched/sch_qfq.c | 830 |
1 files changed, 566 insertions, 264 deletions
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 9687fa1c227..6ed37652a4c 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c | |||
@@ -1,7 +1,8 @@ | |||
1 | /* | 1 | /* |
2 | * net/sched/sch_qfq.c Quick Fair Queueing Scheduler. | 2 | * net/sched/sch_qfq.c Quick Fair Queueing Plus Scheduler. |
3 | * | 3 | * |
4 | * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente. | 4 | * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente. |
5 | * Copyright (c) 2012 Paolo Valente. | ||
5 | * | 6 | * |
6 | * This program is free software; you can redistribute it and/or | 7 | * This program is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU General Public License | 8 | * modify it under the terms of the GNU General Public License |
@@ -19,12 +20,18 @@ | |||
19 | #include <net/pkt_cls.h> | 20 | #include <net/pkt_cls.h> |
20 | 21 | ||
21 | 22 | ||
22 | /* Quick Fair Queueing | 23 | /* Quick Fair Queueing Plus |
23 | =================== | 24 | ======================== |
24 | 25 | ||
25 | Sources: | 26 | Sources: |
26 | 27 | ||
27 | Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient | 28 | [1] Paolo Valente, |
29 | "Reducing the Execution Time of Fair-Queueing Schedulers." | ||
30 | http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf | ||
31 | |||
32 | Sources for QFQ: | ||
33 | |||
34 | [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient | ||
28 | Packet Scheduling with Tight Bandwidth Distribution Guarantees." | 35 | Packet Scheduling with Tight Bandwidth Distribution Guarantees." |
29 | 36 | ||
30 | See also: | 37 | See also: |
@@ -33,6 +40,20 @@ | |||
33 | 40 | ||
34 | /* | 41 | /* |
35 | 42 | ||
43 | QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES | ||
44 | classes. Each aggregate is timestamped with a virtual start time S | ||
45 | and a virtual finish time F, and scheduled according to its | ||
46 | timestamps. S and F are computed as a function of a system virtual | ||
47 | time function V. The classes within each aggregate are instead | ||
48 | scheduled with DRR. | ||
49 | |||
50 | To speed up operations, QFQ+ divides also aggregates into a limited | ||
51 | number of groups. Which group a class belongs to depends on the | ||
52 | ratio between the maximum packet length for the class and the weight | ||
53 | of the class. Groups have their own S and F. In the end, QFQ+ | ||
54 | schedules groups, then aggregates within groups, then classes within | ||
55 | aggregates. See [1] and [2] for a full description. | ||
56 | |||
36 | Virtual time computations. | 57 | Virtual time computations. |
37 | 58 | ||
38 | S, F and V are all computed in fixed point arithmetic with | 59 | S, F and V are all computed in fixed point arithmetic with |
@@ -76,27 +97,28 @@ | |||
76 | #define QFQ_MAX_SLOTS 32 | 97 | #define QFQ_MAX_SLOTS 32 |
77 | 98 | ||
78 | /* | 99 | /* |
79 | * Shifts used for class<->group mapping. We allow class weights that are | 100 | * Shifts used for aggregate<->group mapping. We allow class weights that are |
80 | * in the range [1, 2^MAX_WSHIFT], and we try to map each class i to the | 101 | * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the |
81 | * group with the smallest index that can support the L_i / r_i configured | 102 | * group with the smallest index that can support the L_i / r_i configured |
82 | * for the class. | 103 | * for the classes in the aggregate. |
83 | * | 104 | * |
84 | * grp->index is the index of the group; and grp->slot_shift | 105 | * grp->index is the index of the group; and grp->slot_shift |
85 | * is the shift for the corresponding (scaled) sigma_i. | 106 | * is the shift for the corresponding (scaled) sigma_i. |
86 | */ | 107 | */ |
87 | #define QFQ_MAX_INDEX 24 | 108 | #define QFQ_MAX_INDEX 24 |
88 | #define QFQ_MAX_WSHIFT 12 | 109 | #define QFQ_MAX_WSHIFT 10 |
89 | 110 | ||
90 | #define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) | 111 | #define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */ |
91 | #define QFQ_MAX_WSUM (16*QFQ_MAX_WEIGHT) | 112 | #define QFQ_MAX_WSUM (64*QFQ_MAX_WEIGHT) |
92 | 113 | ||
93 | #define FRAC_BITS 30 /* fixed point arithmetic */ | 114 | #define FRAC_BITS 30 /* fixed point arithmetic */ |
94 | #define ONE_FP (1UL << FRAC_BITS) | 115 | #define ONE_FP (1UL << FRAC_BITS) |
95 | #define IWSUM (ONE_FP/QFQ_MAX_WSUM) | 116 | #define IWSUM (ONE_FP/QFQ_MAX_WSUM) |
96 | 117 | ||
97 | #define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */ | 118 | #define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */ |
98 | #define QFQ_MIN_SLOT_SHIFT (FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX) | 119 | #define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */ |
99 | #define QFQ_MIN_LMAX 256 /* min possible lmax for a class */ | 120 | |
121 | #define QFQ_MAX_AGG_CLASSES 8 /* max num classes per aggregate allowed */ | ||
100 | 122 | ||
101 | /* | 123 | /* |
102 | * Possible group states. These values are used as indexes for the bitmaps | 124 | * Possible group states. These values are used as indexes for the bitmaps |
@@ -106,6 +128,8 @@ enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE }; | |||
106 | 128 | ||
107 | struct qfq_group; | 129 | struct qfq_group; |
108 | 130 | ||
131 | struct qfq_aggregate; | ||
132 | |||
109 | struct qfq_class { | 133 | struct qfq_class { |
110 | struct Qdisc_class_common common; | 134 | struct Qdisc_class_common common; |
111 | 135 | ||
@@ -116,7 +140,12 @@ struct qfq_class { | |||
116 | struct gnet_stats_queue qstats; | 140 | struct gnet_stats_queue qstats; |
117 | struct gnet_stats_rate_est rate_est; | 141 | struct gnet_stats_rate_est rate_est; |
118 | struct Qdisc *qdisc; | 142 | struct Qdisc *qdisc; |
143 | struct list_head alist; /* Link for active-classes list. */ | ||
144 | struct qfq_aggregate *agg; /* Parent aggregate. */ | ||
145 | int deficit; /* DRR deficit counter. */ | ||
146 | }; | ||
119 | 147 | ||
148 | struct qfq_aggregate { | ||
120 | struct hlist_node next; /* Link for the slot list. */ | 149 | struct hlist_node next; /* Link for the slot list. */ |
121 | u64 S, F; /* flow timestamps (exact) */ | 150 | u64 S, F; /* flow timestamps (exact) */ |
122 | 151 | ||
@@ -127,8 +156,18 @@ struct qfq_class { | |||
127 | struct qfq_group *grp; | 156 | struct qfq_group *grp; |
128 | 157 | ||
129 | /* these are copied from the flowset. */ | 158 | /* these are copied from the flowset. */ |
130 | u32 inv_w; /* ONE_FP/weight */ | 159 | u32 class_weight; /* Weight of each class in this aggregate. */ |
131 | u32 lmax; /* Max packet size for this flow. */ | 160 | /* Max pkt size for the classes in this aggregate, DRR quantum. */ |
161 | int lmax; | ||
162 | |||
163 | u32 inv_w; /* ONE_FP/(sum of weights of classes in aggr.). */ | ||
164 | u32 budgetmax; /* Max budget for this aggregate. */ | ||
165 | u32 initial_budget, budget; /* Initial and current budget. */ | ||
166 | |||
167 | int num_classes; /* Number of classes in this aggr. */ | ||
168 | struct list_head active; /* DRR queue of active classes. */ | ||
169 | |||
170 | struct hlist_node nonfull_next; /* See nonfull_aggs in qfq_sched. */ | ||
132 | }; | 171 | }; |
133 | 172 | ||
134 | struct qfq_group { | 173 | struct qfq_group { |
@@ -138,7 +177,7 @@ struct qfq_group { | |||
138 | unsigned int front; /* Index of the front slot. */ | 177 | unsigned int front; /* Index of the front slot. */ |
139 | unsigned long full_slots; /* non-empty slots */ | 178 | unsigned long full_slots; /* non-empty slots */ |
140 | 179 | ||
141 | /* Array of RR lists of active classes. */ | 180 | /* Array of RR lists of active aggregates. */ |
142 | struct hlist_head slots[QFQ_MAX_SLOTS]; | 181 | struct hlist_head slots[QFQ_MAX_SLOTS]; |
143 | }; | 182 | }; |
144 | 183 | ||
@@ -146,13 +185,28 @@ struct qfq_sched { | |||
146 | struct tcf_proto *filter_list; | 185 | struct tcf_proto *filter_list; |
147 | struct Qdisc_class_hash clhash; | 186 | struct Qdisc_class_hash clhash; |
148 | 187 | ||
149 | u64 V; /* Precise virtual time. */ | 188 | u64 oldV, V; /* Precise virtual times. */ |
150 | u32 wsum; /* weight sum */ | 189 | struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */ |
190 | u32 num_active_agg; /* Num. of active aggregates */ | ||
191 | u32 wsum; /* weight sum */ | ||
151 | 192 | ||
152 | unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */ | 193 | unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */ |
153 | struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */ | 194 | struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */ |
195 | u32 min_slot_shift; /* Index of the group-0 bit in the bitmaps. */ | ||
196 | |||
197 | u32 max_agg_classes; /* Max number of classes per aggr. */ | ||
198 | struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */ | ||
154 | }; | 199 | }; |
155 | 200 | ||
201 | /* | ||
202 | * Possible reasons why the timestamps of an aggregate are updated | ||
203 | * enqueue: the aggregate switches from idle to active and must scheduled | ||
204 | * for service | ||
205 | * requeue: the aggregate finishes its budget, so it stops being served and | ||
206 | * must be rescheduled for service | ||
207 | */ | ||
208 | enum update_reason {enqueue, requeue}; | ||
209 | |||
156 | static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) | 210 | static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) |
157 | { | 211 | { |
158 | struct qfq_sched *q = qdisc_priv(sch); | 212 | struct qfq_sched *q = qdisc_priv(sch); |
@@ -182,18 +236,18 @@ static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = { | |||
182 | * index = log_2(maxlen/weight) but we need to apply the scaling. | 236 | * index = log_2(maxlen/weight) but we need to apply the scaling. |
183 | * This is used only once at flow creation. | 237 | * This is used only once at flow creation. |
184 | */ | 238 | */ |
185 | static int qfq_calc_index(u32 inv_w, unsigned int maxlen) | 239 | static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift) |
186 | { | 240 | { |
187 | u64 slot_size = (u64)maxlen * inv_w; | 241 | u64 slot_size = (u64)maxlen * inv_w; |
188 | unsigned long size_map; | 242 | unsigned long size_map; |
189 | int index = 0; | 243 | int index = 0; |
190 | 244 | ||
191 | size_map = slot_size >> QFQ_MIN_SLOT_SHIFT; | 245 | size_map = slot_size >> min_slot_shift; |
192 | if (!size_map) | 246 | if (!size_map) |
193 | goto out; | 247 | goto out; |
194 | 248 | ||
195 | index = __fls(size_map) + 1; /* basically a log_2 */ | 249 | index = __fls(size_map) + 1; /* basically a log_2 */ |
196 | index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1))); | 250 | index -= !(slot_size - (1ULL << (index + min_slot_shift - 1))); |
197 | 251 | ||
198 | if (index < 0) | 252 | if (index < 0) |
199 | index = 0; | 253 | index = 0; |
@@ -204,66 +258,150 @@ out: | |||
204 | return index; | 258 | return index; |
205 | } | 259 | } |
206 | 260 | ||
207 | /* Length of the next packet (0 if the queue is empty). */ | 261 | static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *); |
208 | static unsigned int qdisc_peek_len(struct Qdisc *sch) | 262 | static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *, |
263 | enum update_reason); | ||
264 | |||
265 | static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg, | ||
266 | u32 lmax, u32 weight) | ||
209 | { | 267 | { |
210 | struct sk_buff *skb; | 268 | INIT_LIST_HEAD(&agg->active); |
269 | hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs); | ||
270 | |||
271 | agg->lmax = lmax; | ||
272 | agg->class_weight = weight; | ||
273 | } | ||
274 | |||
275 | static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q, | ||
276 | u32 lmax, u32 weight) | ||
277 | { | ||
278 | struct qfq_aggregate *agg; | ||
279 | struct hlist_node *n; | ||
280 | |||
281 | hlist_for_each_entry(agg, n, &q->nonfull_aggs, nonfull_next) | ||
282 | if (agg->lmax == lmax && agg->class_weight == weight) | ||
283 | return agg; | ||
284 | |||
285 | return NULL; | ||
286 | } | ||
287 | |||
211 | 288 | ||
212 | skb = sch->ops->peek(sch); | 289 | /* Update aggregate as a function of the new number of classes. */ |
213 | return skb ? qdisc_pkt_len(skb) : 0; | 290 | static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg, |
291 | int new_num_classes) | ||
292 | { | ||
293 | u32 new_agg_weight; | ||
294 | |||
295 | if (new_num_classes == q->max_agg_classes) | ||
296 | hlist_del_init(&agg->nonfull_next); | ||
297 | |||
298 | if (agg->num_classes > new_num_classes && | ||
299 | new_num_classes == q->max_agg_classes - 1) /* agg no more full */ | ||
300 | hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs); | ||
301 | |||
302 | agg->budgetmax = new_num_classes * agg->lmax; | ||
303 | new_agg_weight = agg->class_weight * new_num_classes; | ||
304 | agg->inv_w = ONE_FP/new_agg_weight; | ||
305 | |||
306 | if (agg->grp == NULL) { | ||
307 | int i = qfq_calc_index(agg->inv_w, agg->budgetmax, | ||
308 | q->min_slot_shift); | ||
309 | agg->grp = &q->groups[i]; | ||
310 | } | ||
311 | |||
312 | q->wsum += | ||
313 | (int) agg->class_weight * (new_num_classes - agg->num_classes); | ||
314 | |||
315 | agg->num_classes = new_num_classes; | ||
316 | } | ||
317 | |||
318 | /* Add class to aggregate. */ | ||
319 | static void qfq_add_to_agg(struct qfq_sched *q, | ||
320 | struct qfq_aggregate *agg, | ||
321 | struct qfq_class *cl) | ||
322 | { | ||
323 | cl->agg = agg; | ||
324 | |||
325 | qfq_update_agg(q, agg, agg->num_classes+1); | ||
326 | if (cl->qdisc->q.qlen > 0) { /* adding an active class */ | ||
327 | list_add_tail(&cl->alist, &agg->active); | ||
328 | if (list_first_entry(&agg->active, struct qfq_class, alist) == | ||
329 | cl && q->in_serv_agg != agg) /* agg was inactive */ | ||
330 | qfq_activate_agg(q, agg, enqueue); /* schedule agg */ | ||
331 | } | ||
214 | } | 332 | } |
215 | 333 | ||
216 | static void qfq_deactivate_class(struct qfq_sched *, struct qfq_class *); | 334 | static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *); |
217 | static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl, | ||
218 | unsigned int len); | ||
219 | 335 | ||
220 | static void qfq_update_class_params(struct qfq_sched *q, struct qfq_class *cl, | 336 | static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg) |
221 | u32 lmax, u32 inv_w, int delta_w) | ||
222 | { | 337 | { |
223 | int i; | 338 | if (!hlist_unhashed(&agg->nonfull_next)) |
339 | hlist_del_init(&agg->nonfull_next); | ||
340 | if (q->in_serv_agg == agg) | ||
341 | q->in_serv_agg = qfq_choose_next_agg(q); | ||
342 | kfree(agg); | ||
343 | } | ||
224 | 344 | ||
225 | /* update qfq-specific data */ | 345 | /* Deschedule class from within its parent aggregate. */ |
226 | cl->lmax = lmax; | 346 | static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) |
227 | cl->inv_w = inv_w; | 347 | { |
228 | i = qfq_calc_index(cl->inv_w, cl->lmax); | 348 | struct qfq_aggregate *agg = cl->agg; |
229 | 349 | ||
230 | cl->grp = &q->groups[i]; | ||
231 | 350 | ||
232 | q->wsum += delta_w; | 351 | list_del(&cl->alist); /* remove from RR queue of the aggregate */ |
352 | if (list_empty(&agg->active)) /* agg is now inactive */ | ||
353 | qfq_deactivate_agg(q, agg); | ||
233 | } | 354 | } |
234 | 355 | ||
235 | static void qfq_update_reactivate_class(struct qfq_sched *q, | 356 | /* Remove class from its parent aggregate. */ |
236 | struct qfq_class *cl, | 357 | static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) |
237 | u32 inv_w, u32 lmax, int delta_w) | ||
238 | { | 358 | { |
239 | bool need_reactivation = false; | 359 | struct qfq_aggregate *agg = cl->agg; |
240 | int i = qfq_calc_index(inv_w, lmax); | ||
241 | 360 | ||
242 | if (&q->groups[i] != cl->grp && cl->qdisc->q.qlen > 0) { | 361 | cl->agg = NULL; |
243 | /* | 362 | if (agg->num_classes == 1) { /* agg being emptied, destroy it */ |
244 | * shift cl->F back, to not charge the | 363 | qfq_destroy_agg(q, agg); |
245 | * class for the not-yet-served head | 364 | return; |
246 | * packet | ||
247 | */ | ||
248 | cl->F = cl->S; | ||
249 | /* remove class from its slot in the old group */ | ||
250 | qfq_deactivate_class(q, cl); | ||
251 | need_reactivation = true; | ||
252 | } | 365 | } |
366 | qfq_update_agg(q, agg, agg->num_classes-1); | ||
367 | } | ||
253 | 368 | ||
254 | qfq_update_class_params(q, cl, lmax, inv_w, delta_w); | 369 | /* Deschedule class and remove it from its parent aggregate. */ |
370 | static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) | ||
371 | { | ||
372 | if (cl->qdisc->q.qlen > 0) /* class is active */ | ||
373 | qfq_deactivate_class(q, cl); | ||
255 | 374 | ||
256 | if (need_reactivation) /* activate in new group */ | 375 | qfq_rm_from_agg(q, cl); |
257 | qfq_activate_class(q, cl, qdisc_peek_len(cl->qdisc)); | ||
258 | } | 376 | } |
259 | 377 | ||
378 | /* Move class to a new aggregate, matching the new class weight and/or lmax */ | ||
379 | static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight, | ||
380 | u32 lmax) | ||
381 | { | ||
382 | struct qfq_sched *q = qdisc_priv(sch); | ||
383 | struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight); | ||
384 | |||
385 | if (new_agg == NULL) { /* create new aggregate */ | ||
386 | new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC); | ||
387 | if (new_agg == NULL) | ||
388 | return -ENOBUFS; | ||
389 | qfq_init_agg(q, new_agg, lmax, weight); | ||
390 | } | ||
391 | qfq_deact_rm_from_agg(q, cl); | ||
392 | qfq_add_to_agg(q, new_agg, cl); | ||
393 | |||
394 | return 0; | ||
395 | } | ||
260 | 396 | ||
261 | static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, | 397 | static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, |
262 | struct nlattr **tca, unsigned long *arg) | 398 | struct nlattr **tca, unsigned long *arg) |
263 | { | 399 | { |
264 | struct qfq_sched *q = qdisc_priv(sch); | 400 | struct qfq_sched *q = qdisc_priv(sch); |
265 | struct qfq_class *cl = (struct qfq_class *)*arg; | 401 | struct qfq_class *cl = (struct qfq_class *)*arg; |
402 | bool existing = false; | ||
266 | struct nlattr *tb[TCA_QFQ_MAX + 1]; | 403 | struct nlattr *tb[TCA_QFQ_MAX + 1]; |
404 | struct qfq_aggregate *new_agg = NULL; | ||
267 | u32 weight, lmax, inv_w; | 405 | u32 weight, lmax, inv_w; |
268 | int err; | 406 | int err; |
269 | int delta_w; | 407 | int delta_w; |
@@ -286,15 +424,6 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, | |||
286 | } else | 424 | } else |
287 | weight = 1; | 425 | weight = 1; |
288 | 426 | ||
289 | inv_w = ONE_FP / weight; | ||
290 | weight = ONE_FP / inv_w; | ||
291 | delta_w = weight - (cl ? ONE_FP / cl->inv_w : 0); | ||
292 | if (q->wsum + delta_w > QFQ_MAX_WSUM) { | ||
293 | pr_notice("qfq: total weight out of range (%u + %u)\n", | ||
294 | delta_w, q->wsum); | ||
295 | return -EINVAL; | ||
296 | } | ||
297 | |||
298 | if (tb[TCA_QFQ_LMAX]) { | 427 | if (tb[TCA_QFQ_LMAX]) { |
299 | lmax = nla_get_u32(tb[TCA_QFQ_LMAX]); | 428 | lmax = nla_get_u32(tb[TCA_QFQ_LMAX]); |
300 | if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) { | 429 | if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) { |
@@ -304,7 +433,23 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, | |||
304 | } else | 433 | } else |
305 | lmax = psched_mtu(qdisc_dev(sch)); | 434 | lmax = psched_mtu(qdisc_dev(sch)); |
306 | 435 | ||
307 | if (cl != NULL) { | 436 | inv_w = ONE_FP / weight; |
437 | weight = ONE_FP / inv_w; | ||
438 | |||
439 | if (cl != NULL && | ||
440 | lmax == cl->agg->lmax && | ||
441 | weight == cl->agg->class_weight) | ||
442 | return 0; /* nothing to change */ | ||
443 | |||
444 | delta_w = weight - (cl ? cl->agg->class_weight : 0); | ||
445 | |||
446 | if (q->wsum + delta_w > QFQ_MAX_WSUM) { | ||
447 | pr_notice("qfq: total weight out of range (%d + %u)\n", | ||
448 | delta_w, q->wsum); | ||
449 | return -EINVAL; | ||
450 | } | ||
451 | |||
452 | if (cl != NULL) { /* modify existing class */ | ||
308 | if (tca[TCA_RATE]) { | 453 | if (tca[TCA_RATE]) { |
309 | err = gen_replace_estimator(&cl->bstats, &cl->rate_est, | 454 | err = gen_replace_estimator(&cl->bstats, &cl->rate_est, |
310 | qdisc_root_sleeping_lock(sch), | 455 | qdisc_root_sleeping_lock(sch), |
@@ -312,25 +457,18 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, | |||
312 | if (err) | 457 | if (err) |
313 | return err; | 458 | return err; |
314 | } | 459 | } |
315 | 460 | existing = true; | |
316 | if (lmax == cl->lmax && inv_w == cl->inv_w) | 461 | goto set_change_agg; |
317 | return 0; /* nothing to update */ | ||
318 | |||
319 | sch_tree_lock(sch); | ||
320 | qfq_update_reactivate_class(q, cl, inv_w, lmax, delta_w); | ||
321 | sch_tree_unlock(sch); | ||
322 | |||
323 | return 0; | ||
324 | } | 462 | } |
325 | 463 | ||
464 | /* create and init new class */ | ||
326 | cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL); | 465 | cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL); |
327 | if (cl == NULL) | 466 | if (cl == NULL) |
328 | return -ENOBUFS; | 467 | return -ENOBUFS; |
329 | 468 | ||
330 | cl->refcnt = 1; | 469 | cl->refcnt = 1; |
331 | cl->common.classid = classid; | 470 | cl->common.classid = classid; |
332 | 471 | cl->deficit = lmax; | |
333 | qfq_update_class_params(q, cl, lmax, inv_w, delta_w); | ||
334 | 472 | ||
335 | cl->qdisc = qdisc_create_dflt(sch->dev_queue, | 473 | cl->qdisc = qdisc_create_dflt(sch->dev_queue, |
336 | &pfifo_qdisc_ops, classid); | 474 | &pfifo_qdisc_ops, classid); |
@@ -341,11 +479,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, | |||
341 | err = gen_new_estimator(&cl->bstats, &cl->rate_est, | 479 | err = gen_new_estimator(&cl->bstats, &cl->rate_est, |
342 | qdisc_root_sleeping_lock(sch), | 480 | qdisc_root_sleeping_lock(sch), |
343 | tca[TCA_RATE]); | 481 | tca[TCA_RATE]); |
344 | if (err) { | 482 | if (err) |
345 | qdisc_destroy(cl->qdisc); | 483 | goto destroy_class; |
346 | kfree(cl); | ||
347 | return err; | ||
348 | } | ||
349 | } | 484 | } |
350 | 485 | ||
351 | sch_tree_lock(sch); | 486 | sch_tree_lock(sch); |
@@ -354,19 +489,39 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, | |||
354 | 489 | ||
355 | qdisc_class_hash_grow(sch, &q->clhash); | 490 | qdisc_class_hash_grow(sch, &q->clhash); |
356 | 491 | ||
492 | set_change_agg: | ||
493 | sch_tree_lock(sch); | ||
494 | new_agg = qfq_find_agg(q, lmax, weight); | ||
495 | if (new_agg == NULL) { /* create new aggregate */ | ||
496 | sch_tree_unlock(sch); | ||
497 | new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL); | ||
498 | if (new_agg == NULL) { | ||
499 | err = -ENOBUFS; | ||
500 | gen_kill_estimator(&cl->bstats, &cl->rate_est); | ||
501 | goto destroy_class; | ||
502 | } | ||
503 | sch_tree_lock(sch); | ||
504 | qfq_init_agg(q, new_agg, lmax, weight); | ||
505 | } | ||
506 | if (existing) | ||
507 | qfq_deact_rm_from_agg(q, cl); | ||
508 | qfq_add_to_agg(q, new_agg, cl); | ||
509 | sch_tree_unlock(sch); | ||
510 | |||
357 | *arg = (unsigned long)cl; | 511 | *arg = (unsigned long)cl; |
358 | return 0; | 512 | return 0; |
513 | |||
514 | destroy_class: | ||
515 | qdisc_destroy(cl->qdisc); | ||
516 | kfree(cl); | ||
517 | return err; | ||
359 | } | 518 | } |
360 | 519 | ||
361 | static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) | 520 | static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) |
362 | { | 521 | { |
363 | struct qfq_sched *q = qdisc_priv(sch); | 522 | struct qfq_sched *q = qdisc_priv(sch); |
364 | 523 | ||
365 | if (cl->inv_w) { | 524 | qfq_rm_from_agg(q, cl); |
366 | q->wsum -= ONE_FP / cl->inv_w; | ||
367 | cl->inv_w = 0; | ||
368 | } | ||
369 | |||
370 | gen_kill_estimator(&cl->bstats, &cl->rate_est); | 525 | gen_kill_estimator(&cl->bstats, &cl->rate_est); |
371 | qdisc_destroy(cl->qdisc); | 526 | qdisc_destroy(cl->qdisc); |
372 | kfree(cl); | 527 | kfree(cl); |
@@ -481,8 +636,8 @@ static int qfq_dump_class(struct Qdisc *sch, unsigned long arg, | |||
481 | nest = nla_nest_start(skb, TCA_OPTIONS); | 636 | nest = nla_nest_start(skb, TCA_OPTIONS); |
482 | if (nest == NULL) | 637 | if (nest == NULL) |
483 | goto nla_put_failure; | 638 | goto nla_put_failure; |
484 | if (nla_put_u32(skb, TCA_QFQ_WEIGHT, ONE_FP/cl->inv_w) || | 639 | if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) || |
485 | nla_put_u32(skb, TCA_QFQ_LMAX, cl->lmax)) | 640 | nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax)) |
486 | goto nla_put_failure; | 641 | goto nla_put_failure; |
487 | return nla_nest_end(skb, nest); | 642 | return nla_nest_end(skb, nest); |
488 | 643 | ||
@@ -500,8 +655,8 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, | |||
500 | memset(&xstats, 0, sizeof(xstats)); | 655 | memset(&xstats, 0, sizeof(xstats)); |
501 | cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; | 656 | cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; |
502 | 657 | ||
503 | xstats.weight = ONE_FP/cl->inv_w; | 658 | xstats.weight = cl->agg->class_weight; |
504 | xstats.lmax = cl->lmax; | 659 | xstats.lmax = cl->agg->lmax; |
505 | 660 | ||
506 | if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || | 661 | if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || |
507 | gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || | 662 | gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || |
@@ -652,16 +807,16 @@ static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F) | |||
652 | * perhaps | 807 | * perhaps |
653 | * | 808 | * |
654 | old_V ^= q->V; | 809 | old_V ^= q->V; |
655 | old_V >>= QFQ_MIN_SLOT_SHIFT; | 810 | old_V >>= q->min_slot_shift; |
656 | if (old_V) { | 811 | if (old_V) { |
657 | ... | 812 | ... |
658 | } | 813 | } |
659 | * | 814 | * |
660 | */ | 815 | */ |
661 | static void qfq_make_eligible(struct qfq_sched *q, u64 old_V) | 816 | static void qfq_make_eligible(struct qfq_sched *q) |
662 | { | 817 | { |
663 | unsigned long vslot = q->V >> QFQ_MIN_SLOT_SHIFT; | 818 | unsigned long vslot = q->V >> q->min_slot_shift; |
664 | unsigned long old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT; | 819 | unsigned long old_vslot = q->oldV >> q->min_slot_shift; |
665 | 820 | ||
666 | if (vslot != old_vslot) { | 821 | if (vslot != old_vslot) { |
667 | unsigned long mask = (1UL << fls(vslot ^ old_vslot)) - 1; | 822 | unsigned long mask = (1UL << fls(vslot ^ old_vslot)) - 1; |
@@ -672,34 +827,38 @@ static void qfq_make_eligible(struct qfq_sched *q, u64 old_V) | |||
672 | 827 | ||
673 | 828 | ||
674 | /* | 829 | /* |
675 | * If the weight and lmax (max_pkt_size) of the classes do not change, | 830 | * The index of the slot in which the aggregate is to be inserted must |
676 | * then QFQ guarantees that the slot index is never higher than | 831 | * not be higher than QFQ_MAX_SLOTS-2. There is a '-2' and not a '-1' |
677 | * 2 + ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) * (QFQ_MAX_WEIGHT/QFQ_MAX_WSUM). | 832 | * because the start time of the group may be moved backward by one |
833 | * slot after the aggregate has been inserted, and this would cause | ||
834 | * non-empty slots to be right-shifted by one position. | ||
678 | * | 835 | * |
679 | * With the current values of the above constants, the index is | 836 | * If the weight and lmax (max_pkt_size) of the classes do not change, |
680 | * then guaranteed to never be higher than 2 + 256 * (1 / 16) = 18. | 837 | * then QFQ+ does meet the above contraint according to the current |
838 | * values of its parameters. In fact, if the weight and lmax of the | ||
839 | * classes do not change, then, from the theory, QFQ+ guarantees that | ||
840 | * the slot index is never higher than | ||
841 | * 2 + QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) * | ||
842 | * (QFQ_MAX_WEIGHT/QFQ_MAX_WSUM) = 2 + 8 * 128 * (1 / 64) = 18 | ||
681 | * | 843 | * |
682 | * When the weight of a class is increased or the lmax of the class is | 844 | * When the weight of a class is increased or the lmax of the class is |
683 | * decreased, a new class with smaller slot size may happen to be | 845 | * decreased, a new aggregate with smaller slot size than the original |
684 | * activated. The activation of this class should be properly delayed | 846 | * parent aggregate of the class may happen to be activated. The |
685 | * to when the service of the class has finished in the ideal system | 847 | * activation of this aggregate should be properly delayed to when the |
686 | * tracked by QFQ. If the activation of the class is not delayed to | 848 | * service of the class has finished in the ideal system tracked by |
687 | * this reference time instant, then this class may be unjustly served | 849 | * QFQ+. If the activation of the aggregate is not delayed to this |
688 | * before other classes waiting for service. This may cause | 850 | * reference time instant, then this aggregate may be unjustly served |
689 | * (unfrequently) the above bound to the slot index to be violated for | 851 | * before other aggregates waiting for service. This may cause the |
690 | * some of these unlucky classes. | 852 | * above bound to the slot index to be violated for some of these |
853 | * unlucky aggregates. | ||
691 | * | 854 | * |
692 | * Instead of delaying the activation of the new class, which is quite | 855 | * Instead of delaying the activation of the new aggregate, which is |
693 | * complex, the following inaccurate but simple solution is used: if | 856 | * quite complex, the following inaccurate but simple solution is used: |
694 | * the slot index is higher than QFQ_MAX_SLOTS-2, then the timestamps | 857 | * if the slot index is higher than QFQ_MAX_SLOTS-2, then the |
695 | * of the class are shifted backward so as to let the slot index | 858 | * timestamps of the aggregate are shifted backward so as to let the |
696 | * become equal to QFQ_MAX_SLOTS-2. This threshold is used because, if | 859 | * slot index become equal to QFQ_MAX_SLOTS-2. |
697 | * the slot index is above it, then the data structure implementing | ||
698 | * the bucket list either gets immediately corrupted or may get | ||
699 | * corrupted on a possible next packet arrival that causes the start | ||
700 | * time of the group to be shifted backward. | ||
701 | */ | 860 | */ |
702 | static void qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, | 861 | static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg, |
703 | u64 roundedS) | 862 | u64 roundedS) |
704 | { | 863 | { |
705 | u64 slot = (roundedS - grp->S) >> grp->slot_shift; | 864 | u64 slot = (roundedS - grp->S) >> grp->slot_shift; |
@@ -708,22 +867,22 @@ static void qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, | |||
708 | if (unlikely(slot > QFQ_MAX_SLOTS - 2)) { | 867 | if (unlikely(slot > QFQ_MAX_SLOTS - 2)) { |
709 | u64 deltaS = roundedS - grp->S - | 868 | u64 deltaS = roundedS - grp->S - |
710 | ((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift); | 869 | ((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift); |
711 | cl->S -= deltaS; | 870 | agg->S -= deltaS; |
712 | cl->F -= deltaS; | 871 | agg->F -= deltaS; |
713 | slot = QFQ_MAX_SLOTS - 2; | 872 | slot = QFQ_MAX_SLOTS - 2; |
714 | } | 873 | } |
715 | 874 | ||
716 | i = (grp->front + slot) % QFQ_MAX_SLOTS; | 875 | i = (grp->front + slot) % QFQ_MAX_SLOTS; |
717 | 876 | ||
718 | hlist_add_head(&cl->next, &grp->slots[i]); | 877 | hlist_add_head(&agg->next, &grp->slots[i]); |
719 | __set_bit(slot, &grp->full_slots); | 878 | __set_bit(slot, &grp->full_slots); |
720 | } | 879 | } |
721 | 880 | ||
722 | /* Maybe introduce hlist_first_entry?? */ | 881 | /* Maybe introduce hlist_first_entry?? */ |
723 | static struct qfq_class *qfq_slot_head(struct qfq_group *grp) | 882 | static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp) |
724 | { | 883 | { |
725 | return hlist_entry(grp->slots[grp->front].first, | 884 | return hlist_entry(grp->slots[grp->front].first, |
726 | struct qfq_class, next); | 885 | struct qfq_aggregate, next); |
727 | } | 886 | } |
728 | 887 | ||
729 | /* | 888 | /* |
@@ -731,20 +890,20 @@ static struct qfq_class *qfq_slot_head(struct qfq_group *grp) | |||
731 | */ | 890 | */ |
732 | static void qfq_front_slot_remove(struct qfq_group *grp) | 891 | static void qfq_front_slot_remove(struct qfq_group *grp) |
733 | { | 892 | { |
734 | struct qfq_class *cl = qfq_slot_head(grp); | 893 | struct qfq_aggregate *agg = qfq_slot_head(grp); |
735 | 894 | ||
736 | BUG_ON(!cl); | 895 | BUG_ON(!agg); |
737 | hlist_del(&cl->next); | 896 | hlist_del(&agg->next); |
738 | if (hlist_empty(&grp->slots[grp->front])) | 897 | if (hlist_empty(&grp->slots[grp->front])) |
739 | __clear_bit(0, &grp->full_slots); | 898 | __clear_bit(0, &grp->full_slots); |
740 | } | 899 | } |
741 | 900 | ||
742 | /* | 901 | /* |
743 | * Returns the first full queue in a group. As a side effect, | 902 | * Returns the first aggregate in the first non-empty bucket of the |
744 | * adjust the bucket list so the first non-empty bucket is at | 903 | * group. As a side effect, adjusts the bucket list so the first |
745 | * position 0 in full_slots. | 904 | * non-empty bucket is at position 0 in full_slots. |
746 | */ | 905 | */ |
747 | static struct qfq_class *qfq_slot_scan(struct qfq_group *grp) | 906 | static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp) |
748 | { | 907 | { |
749 | unsigned int i; | 908 | unsigned int i; |
750 | 909 | ||
@@ -780,7 +939,7 @@ static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS) | |||
780 | grp->front = (grp->front - i) % QFQ_MAX_SLOTS; | 939 | grp->front = (grp->front - i) % QFQ_MAX_SLOTS; |
781 | } | 940 | } |
782 | 941 | ||
783 | static void qfq_update_eligible(struct qfq_sched *q, u64 old_V) | 942 | static void qfq_update_eligible(struct qfq_sched *q) |
784 | { | 943 | { |
785 | struct qfq_group *grp; | 944 | struct qfq_group *grp; |
786 | unsigned long ineligible; | 945 | unsigned long ineligible; |
@@ -792,137 +951,226 @@ static void qfq_update_eligible(struct qfq_sched *q, u64 old_V) | |||
792 | if (qfq_gt(grp->S, q->V)) | 951 | if (qfq_gt(grp->S, q->V)) |
793 | q->V = grp->S; | 952 | q->V = grp->S; |
794 | } | 953 | } |
795 | qfq_make_eligible(q, old_V); | 954 | qfq_make_eligible(q); |
796 | } | 955 | } |
797 | } | 956 | } |
798 | 957 | ||
799 | /* | 958 | /* Dequeue head packet of the head class in the DRR queue of the aggregate. */ |
800 | * Updates the class, returns true if also the group needs to be updated. | 959 | static void agg_dequeue(struct qfq_aggregate *agg, |
801 | */ | 960 | struct qfq_class *cl, unsigned int len) |
802 | static bool qfq_update_class(struct qfq_group *grp, struct qfq_class *cl) | ||
803 | { | 961 | { |
804 | unsigned int len = qdisc_peek_len(cl->qdisc); | 962 | qdisc_dequeue_peeked(cl->qdisc); |
805 | 963 | ||
806 | cl->S = cl->F; | 964 | cl->deficit -= (int) len; |
807 | if (!len) | ||
808 | qfq_front_slot_remove(grp); /* queue is empty */ | ||
809 | else { | ||
810 | u64 roundedS; | ||
811 | 965 | ||
812 | cl->F = cl->S + (u64)len * cl->inv_w; | 966 | if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */ |
813 | roundedS = qfq_round_down(cl->S, grp->slot_shift); | 967 | list_del(&cl->alist); |
814 | if (roundedS == grp->S) | 968 | else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) { |
815 | return false; | 969 | cl->deficit += agg->lmax; |
816 | 970 | list_move_tail(&cl->alist, &agg->active); | |
817 | qfq_front_slot_remove(grp); | ||
818 | qfq_slot_insert(grp, cl, roundedS); | ||
819 | } | 971 | } |
972 | } | ||
973 | |||
974 | static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg, | ||
975 | struct qfq_class **cl, | ||
976 | unsigned int *len) | ||
977 | { | ||
978 | struct sk_buff *skb; | ||
820 | 979 | ||
821 | return true; | 980 | *cl = list_first_entry(&agg->active, struct qfq_class, alist); |
981 | skb = (*cl)->qdisc->ops->peek((*cl)->qdisc); | ||
982 | if (skb == NULL) | ||
983 | WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n"); | ||
984 | else | ||
985 | *len = qdisc_pkt_len(skb); | ||
986 | |||
987 | return skb; | ||
988 | } | ||
989 | |||
990 | /* Update F according to the actual service received by the aggregate. */ | ||
991 | static inline void charge_actual_service(struct qfq_aggregate *agg) | ||
992 | { | ||
993 | /* compute the service received by the aggregate */ | ||
994 | u32 service_received = agg->initial_budget - agg->budget; | ||
995 | |||
996 | agg->F = agg->S + (u64)service_received * agg->inv_w; | ||
822 | } | 997 | } |
823 | 998 | ||
824 | static struct sk_buff *qfq_dequeue(struct Qdisc *sch) | 999 | static struct sk_buff *qfq_dequeue(struct Qdisc *sch) |
825 | { | 1000 | { |
826 | struct qfq_sched *q = qdisc_priv(sch); | 1001 | struct qfq_sched *q = qdisc_priv(sch); |
827 | struct qfq_group *grp; | 1002 | struct qfq_aggregate *in_serv_agg = q->in_serv_agg; |
828 | struct qfq_class *cl; | 1003 | struct qfq_class *cl; |
829 | struct sk_buff *skb; | 1004 | struct sk_buff *skb = NULL; |
830 | unsigned int len; | 1005 | /* next-packet len, 0 means no more active classes in in-service agg */ |
831 | u64 old_V; | 1006 | unsigned int len = 0; |
832 | 1007 | ||
833 | if (!q->bitmaps[ER]) | 1008 | if (in_serv_agg == NULL) |
834 | return NULL; | 1009 | return NULL; |
835 | 1010 | ||
836 | grp = qfq_ffs(q, q->bitmaps[ER]); | 1011 | if (!list_empty(&in_serv_agg->active)) |
1012 | skb = qfq_peek_skb(in_serv_agg, &cl, &len); | ||
837 | 1013 | ||
838 | cl = qfq_slot_head(grp); | 1014 | /* |
839 | skb = qdisc_dequeue_peeked(cl->qdisc); | 1015 | * If there are no active classes in the in-service aggregate, |
840 | if (!skb) { | 1016 | * or if the aggregate has not enough budget to serve its next |
841 | WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n"); | 1017 | * class, then choose the next aggregate to serve. |
842 | return NULL; | 1018 | */ |
1019 | if (len == 0 || in_serv_agg->budget < len) { | ||
1020 | charge_actual_service(in_serv_agg); | ||
1021 | |||
1022 | /* recharge the budget of the aggregate */ | ||
1023 | in_serv_agg->initial_budget = in_serv_agg->budget = | ||
1024 | in_serv_agg->budgetmax; | ||
1025 | |||
1026 | if (!list_empty(&in_serv_agg->active)) | ||
1027 | /* | ||
1028 | * Still active: reschedule for | ||
1029 | * service. Possible optimization: if no other | ||
1030 | * aggregate is active, then there is no point | ||
1031 | * in rescheduling this aggregate, and we can | ||
1032 | * just keep it as the in-service one. This | ||
1033 | * should be however a corner case, and to | ||
1034 | * handle it, we would need to maintain an | ||
1035 | * extra num_active_aggs field. | ||
1036 | */ | ||
1037 | qfq_activate_agg(q, in_serv_agg, requeue); | ||
1038 | else if (sch->q.qlen == 0) { /* no aggregate to serve */ | ||
1039 | q->in_serv_agg = NULL; | ||
1040 | return NULL; | ||
1041 | } | ||
1042 | |||
1043 | /* | ||
1044 | * If we get here, there are other aggregates queued: | ||
1045 | * choose the new aggregate to serve. | ||
1046 | */ | ||
1047 | in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q); | ||
1048 | skb = qfq_peek_skb(in_serv_agg, &cl, &len); | ||
843 | } | 1049 | } |
1050 | if (!skb) | ||
1051 | return NULL; | ||
844 | 1052 | ||
845 | sch->q.qlen--; | 1053 | sch->q.qlen--; |
846 | qdisc_bstats_update(sch, skb); | 1054 | qdisc_bstats_update(sch, skb); |
847 | 1055 | ||
848 | old_V = q->V; | 1056 | agg_dequeue(in_serv_agg, cl, len); |
849 | len = qdisc_pkt_len(skb); | 1057 | in_serv_agg->budget -= len; |
850 | q->V += (u64)len * IWSUM; | 1058 | q->V += (u64)len * IWSUM; |
851 | pr_debug("qfq dequeue: len %u F %lld now %lld\n", | 1059 | pr_debug("qfq dequeue: len %u F %lld now %lld\n", |
852 | len, (unsigned long long) cl->F, (unsigned long long) q->V); | 1060 | len, (unsigned long long) in_serv_agg->F, |
1061 | (unsigned long long) q->V); | ||
853 | 1062 | ||
854 | if (qfq_update_class(grp, cl)) { | 1063 | return skb; |
855 | u64 old_F = grp->F; | 1064 | } |
856 | 1065 | ||
857 | cl = qfq_slot_scan(grp); | 1066 | static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q) |
858 | if (!cl) | 1067 | { |
859 | __clear_bit(grp->index, &q->bitmaps[ER]); | 1068 | struct qfq_group *grp; |
860 | else { | 1069 | struct qfq_aggregate *agg, *new_front_agg; |
861 | u64 roundedS = qfq_round_down(cl->S, grp->slot_shift); | 1070 | u64 old_F; |
862 | unsigned int s; | ||
863 | 1071 | ||
864 | if (grp->S == roundedS) | 1072 | qfq_update_eligible(q); |
865 | goto skip_unblock; | 1073 | q->oldV = q->V; |
866 | grp->S = roundedS; | 1074 | |
867 | grp->F = roundedS + (2ULL << grp->slot_shift); | 1075 | if (!q->bitmaps[ER]) |
868 | __clear_bit(grp->index, &q->bitmaps[ER]); | 1076 | return NULL; |
869 | s = qfq_calc_state(q, grp); | 1077 | |
870 | __set_bit(grp->index, &q->bitmaps[s]); | 1078 | grp = qfq_ffs(q, q->bitmaps[ER]); |
871 | } | 1079 | old_F = grp->F; |
1080 | |||
1081 | agg = qfq_slot_head(grp); | ||
872 | 1082 | ||
873 | qfq_unblock_groups(q, grp->index, old_F); | 1083 | /* agg starts to be served, remove it from schedule */ |
1084 | qfq_front_slot_remove(grp); | ||
1085 | |||
1086 | new_front_agg = qfq_slot_scan(grp); | ||
1087 | |||
1088 | if (new_front_agg == NULL) /* group is now inactive, remove from ER */ | ||
1089 | __clear_bit(grp->index, &q->bitmaps[ER]); | ||
1090 | else { | ||
1091 | u64 roundedS = qfq_round_down(new_front_agg->S, | ||
1092 | grp->slot_shift); | ||
1093 | unsigned int s; | ||
1094 | |||
1095 | if (grp->S == roundedS) | ||
1096 | return agg; | ||
1097 | grp->S = roundedS; | ||
1098 | grp->F = roundedS + (2ULL << grp->slot_shift); | ||
1099 | __clear_bit(grp->index, &q->bitmaps[ER]); | ||
1100 | s = qfq_calc_state(q, grp); | ||
1101 | __set_bit(grp->index, &q->bitmaps[s]); | ||
874 | } | 1102 | } |
875 | 1103 | ||
876 | skip_unblock: | 1104 | qfq_unblock_groups(q, grp->index, old_F); |
877 | qfq_update_eligible(q, old_V); | ||
878 | 1105 | ||
879 | return skb; | 1106 | return agg; |
880 | } | 1107 | } |
881 | 1108 | ||
882 | /* | 1109 | /* |
883 | * Assign a reasonable start time for a new flow k in group i. | 1110 | * Assign a reasonable start time for a new aggregate in group i. |
884 | * Admissible values for \hat(F) are multiples of \sigma_i | 1111 | * Admissible values for \hat(F) are multiples of \sigma_i |
885 | * no greater than V+\sigma_i . Larger values mean that | 1112 | * no greater than V+\sigma_i . Larger values mean that |
886 | * we had a wraparound so we consider the timestamp to be stale. | 1113 | * we had a wraparound so we consider the timestamp to be stale. |
887 | * | 1114 | * |
888 | * If F is not stale and F >= V then we set S = F. | 1115 | * If F is not stale and F >= V then we set S = F. |
889 | * Otherwise we should assign S = V, but this may violate | 1116 | * Otherwise we should assign S = V, but this may violate |
890 | * the ordering in ER. So, if we have groups in ER, set S to | 1117 | * the ordering in EB (see [2]). So, if we have groups in ER, |
891 | * the F_j of the first group j which would be blocking us. | 1118 | * set S to the F_j of the first group j which would be blocking us. |
892 | * We are guaranteed not to move S backward because | 1119 | * We are guaranteed not to move S backward because |
893 | * otherwise our group i would still be blocked. | 1120 | * otherwise our group i would still be blocked. |
894 | */ | 1121 | */ |
895 | static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) | 1122 | static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg) |
896 | { | 1123 | { |
897 | unsigned long mask; | 1124 | unsigned long mask; |
898 | u64 limit, roundedF; | 1125 | u64 limit, roundedF; |
899 | int slot_shift = cl->grp->slot_shift; | 1126 | int slot_shift = agg->grp->slot_shift; |
900 | 1127 | ||
901 | roundedF = qfq_round_down(cl->F, slot_shift); | 1128 | roundedF = qfq_round_down(agg->F, slot_shift); |
902 | limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift); | 1129 | limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift); |
903 | 1130 | ||
904 | if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { | 1131 | if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) { |
905 | /* timestamp was stale */ | 1132 | /* timestamp was stale */ |
906 | mask = mask_from(q->bitmaps[ER], cl->grp->index); | 1133 | mask = mask_from(q->bitmaps[ER], agg->grp->index); |
907 | if (mask) { | 1134 | if (mask) { |
908 | struct qfq_group *next = qfq_ffs(q, mask); | 1135 | struct qfq_group *next = qfq_ffs(q, mask); |
909 | if (qfq_gt(roundedF, next->F)) { | 1136 | if (qfq_gt(roundedF, next->F)) { |
910 | if (qfq_gt(limit, next->F)) | 1137 | if (qfq_gt(limit, next->F)) |
911 | cl->S = next->F; | 1138 | agg->S = next->F; |
912 | else /* preserve timestamp correctness */ | 1139 | else /* preserve timestamp correctness */ |
913 | cl->S = limit; | 1140 | agg->S = limit; |
914 | return; | 1141 | return; |
915 | } | 1142 | } |
916 | } | 1143 | } |
917 | cl->S = q->V; | 1144 | agg->S = q->V; |
918 | } else /* timestamp is not stale */ | 1145 | } else /* timestamp is not stale */ |
919 | cl->S = cl->F; | 1146 | agg->S = agg->F; |
920 | } | 1147 | } |
921 | 1148 | ||
1149 | /* | ||
1150 | * Update the timestamps of agg before scheduling/rescheduling it for | ||
1151 | * service. In particular, assign to agg->F its maximum possible | ||
1152 | * value, i.e., the virtual finish time with which the aggregate | ||
1153 | * should be labeled if it used all its budget once in service. | ||
1154 | */ | ||
1155 | static inline void | ||
1156 | qfq_update_agg_ts(struct qfq_sched *q, | ||
1157 | struct qfq_aggregate *agg, enum update_reason reason) | ||
1158 | { | ||
1159 | if (reason != requeue) | ||
1160 | qfq_update_start(q, agg); | ||
1161 | else /* just charge agg for the service received */ | ||
1162 | agg->S = agg->F; | ||
1163 | |||
1164 | agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w; | ||
1165 | } | ||
1166 | |||
1167 | static void qfq_schedule_agg(struct qfq_sched *, struct qfq_aggregate *); | ||
1168 | |||
922 | static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) | 1169 | static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) |
923 | { | 1170 | { |
924 | struct qfq_sched *q = qdisc_priv(sch); | 1171 | struct qfq_sched *q = qdisc_priv(sch); |
925 | struct qfq_class *cl; | 1172 | struct qfq_class *cl; |
1173 | struct qfq_aggregate *agg; | ||
926 | int err = 0; | 1174 | int err = 0; |
927 | 1175 | ||
928 | cl = qfq_classify(skb, sch, &err); | 1176 | cl = qfq_classify(skb, sch, &err); |
@@ -934,11 +1182,13 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) | |||
934 | } | 1182 | } |
935 | pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid); | 1183 | pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid); |
936 | 1184 | ||
937 | if (unlikely(cl->lmax < qdisc_pkt_len(skb))) { | 1185 | if (unlikely(cl->agg->lmax < qdisc_pkt_len(skb))) { |
938 | pr_debug("qfq: increasing maxpkt from %u to %u for class %u", | 1186 | pr_debug("qfq: increasing maxpkt from %u to %u for class %u", |
939 | cl->lmax, qdisc_pkt_len(skb), cl->common.classid); | 1187 | cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid); |
940 | qfq_update_reactivate_class(q, cl, cl->inv_w, | 1188 | err = qfq_change_agg(sch, cl, cl->agg->class_weight, |
941 | qdisc_pkt_len(skb), 0); | 1189 | qdisc_pkt_len(skb)); |
1190 | if (err) | ||
1191 | return err; | ||
942 | } | 1192 | } |
943 | 1193 | ||
944 | err = qdisc_enqueue(skb, cl->qdisc); | 1194 | err = qdisc_enqueue(skb, cl->qdisc); |
@@ -954,35 +1204,50 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) | |||
954 | bstats_update(&cl->bstats, skb); | 1204 | bstats_update(&cl->bstats, skb); |
955 | ++sch->q.qlen; | 1205 | ++sch->q.qlen; |
956 | 1206 | ||
957 | /* If the new skb is not the head of queue, then done here. */ | 1207 | agg = cl->agg; |
958 | if (cl->qdisc->q.qlen != 1) | 1208 | /* if the queue was not empty, then done here */ |
1209 | if (cl->qdisc->q.qlen != 1) { | ||
1210 | if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) && | ||
1211 | list_first_entry(&agg->active, struct qfq_class, alist) | ||
1212 | == cl && cl->deficit < qdisc_pkt_len(skb)) | ||
1213 | list_move_tail(&cl->alist, &agg->active); | ||
1214 | |||
959 | return err; | 1215 | return err; |
1216 | } | ||
1217 | |||
1218 | /* schedule class for service within the aggregate */ | ||
1219 | cl->deficit = agg->lmax; | ||
1220 | list_add_tail(&cl->alist, &agg->active); | ||
960 | 1221 | ||
961 | /* If reach this point, queue q was idle */ | 1222 | if (list_first_entry(&agg->active, struct qfq_class, alist) != cl) |
962 | qfq_activate_class(q, cl, qdisc_pkt_len(skb)); | 1223 | return err; /* aggregate was not empty, nothing else to do */ |
1224 | |||
1225 | /* recharge budget */ | ||
1226 | agg->initial_budget = agg->budget = agg->budgetmax; | ||
1227 | |||
1228 | qfq_update_agg_ts(q, agg, enqueue); | ||
1229 | if (q->in_serv_agg == NULL) | ||
1230 | q->in_serv_agg = agg; | ||
1231 | else if (agg != q->in_serv_agg) | ||
1232 | qfq_schedule_agg(q, agg); | ||
963 | 1233 | ||
964 | return err; | 1234 | return err; |
965 | } | 1235 | } |
966 | 1236 | ||
967 | /* | 1237 | /* |
968 | * Handle class switch from idle to backlogged. | 1238 | * Schedule aggregate according to its timestamps. |
969 | */ | 1239 | */ |
970 | static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl, | 1240 | static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg) |
971 | unsigned int pkt_len) | ||
972 | { | 1241 | { |
973 | struct qfq_group *grp = cl->grp; | 1242 | struct qfq_group *grp = agg->grp; |
974 | u64 roundedS; | 1243 | u64 roundedS; |
975 | int s; | 1244 | int s; |
976 | 1245 | ||
977 | qfq_update_start(q, cl); | 1246 | roundedS = qfq_round_down(agg->S, grp->slot_shift); |
978 | |||
979 | /* compute new finish time and rounded start. */ | ||
980 | cl->F = cl->S + (u64)pkt_len * cl->inv_w; | ||
981 | roundedS = qfq_round_down(cl->S, grp->slot_shift); | ||
982 | 1247 | ||
983 | /* | 1248 | /* |
984 | * insert cl in the correct bucket. | 1249 | * Insert agg in the correct bucket. |
985 | * If cl->S >= grp->S we don't need to adjust the | 1250 | * If agg->S >= grp->S we don't need to adjust the |
986 | * bucket list and simply go to the insertion phase. | 1251 | * bucket list and simply go to the insertion phase. |
987 | * Otherwise grp->S is decreasing, we must make room | 1252 | * Otherwise grp->S is decreasing, we must make room |
988 | * in the bucket list, and also recompute the group state. | 1253 | * in the bucket list, and also recompute the group state. |
@@ -990,10 +1255,10 @@ static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl, | |||
990 | * was in ER make sure to adjust V. | 1255 | * was in ER make sure to adjust V. |
991 | */ | 1256 | */ |
992 | if (grp->full_slots) { | 1257 | if (grp->full_slots) { |
993 | if (!qfq_gt(grp->S, cl->S)) | 1258 | if (!qfq_gt(grp->S, agg->S)) |
994 | goto skip_update; | 1259 | goto skip_update; |
995 | 1260 | ||
996 | /* create a slot for this cl->S */ | 1261 | /* create a slot for this agg->S */ |
997 | qfq_slot_rotate(grp, roundedS); | 1262 | qfq_slot_rotate(grp, roundedS); |
998 | /* group was surely ineligible, remove */ | 1263 | /* group was surely ineligible, remove */ |
999 | __clear_bit(grp->index, &q->bitmaps[IR]); | 1264 | __clear_bit(grp->index, &q->bitmaps[IR]); |
@@ -1008,46 +1273,61 @@ static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl, | |||
1008 | 1273 | ||
1009 | pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n", | 1274 | pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n", |
1010 | s, q->bitmaps[s], | 1275 | s, q->bitmaps[s], |
1011 | (unsigned long long) cl->S, | 1276 | (unsigned long long) agg->S, |
1012 | (unsigned long long) cl->F, | 1277 | (unsigned long long) agg->F, |
1013 | (unsigned long long) q->V); | 1278 | (unsigned long long) q->V); |
1014 | 1279 | ||
1015 | skip_update: | 1280 | skip_update: |
1016 | qfq_slot_insert(grp, cl, roundedS); | 1281 | qfq_slot_insert(grp, agg, roundedS); |
1017 | } | 1282 | } |
1018 | 1283 | ||
1019 | 1284 | ||
1285 | /* Update agg ts and schedule agg for service */ | ||
1286 | static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg, | ||
1287 | enum update_reason reason) | ||
1288 | { | ||
1289 | qfq_update_agg_ts(q, agg, reason); | ||
1290 | qfq_schedule_agg(q, agg); | ||
1291 | } | ||
1292 | |||
1020 | static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, | 1293 | static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, |
1021 | struct qfq_class *cl) | 1294 | struct qfq_aggregate *agg) |
1022 | { | 1295 | { |
1023 | unsigned int i, offset; | 1296 | unsigned int i, offset; |
1024 | u64 roundedS; | 1297 | u64 roundedS; |
1025 | 1298 | ||
1026 | roundedS = qfq_round_down(cl->S, grp->slot_shift); | 1299 | roundedS = qfq_round_down(agg->S, grp->slot_shift); |
1027 | offset = (roundedS - grp->S) >> grp->slot_shift; | 1300 | offset = (roundedS - grp->S) >> grp->slot_shift; |
1301 | |||
1028 | i = (grp->front + offset) % QFQ_MAX_SLOTS; | 1302 | i = (grp->front + offset) % QFQ_MAX_SLOTS; |
1029 | 1303 | ||
1030 | hlist_del(&cl->next); | 1304 | hlist_del(&agg->next); |
1031 | if (hlist_empty(&grp->slots[i])) | 1305 | if (hlist_empty(&grp->slots[i])) |
1032 | __clear_bit(offset, &grp->full_slots); | 1306 | __clear_bit(offset, &grp->full_slots); |
1033 | } | 1307 | } |
1034 | 1308 | ||
1035 | /* | 1309 | /* |
1036 | * called to forcibly destroy a queue. | 1310 | * Called to forcibly deschedule an aggregate. If the aggregate is |
1037 | * If the queue is not in the front bucket, or if it has | 1311 | * not in the front bucket, or if the latter has other aggregates in |
1038 | * other queues in the front bucket, we can simply remove | 1312 | * the front bucket, we can simply remove the aggregate with no other |
1039 | * the queue with no other side effects. | 1313 | * side effects. |
1040 | * Otherwise we must propagate the event up. | 1314 | * Otherwise we must propagate the event up. |
1041 | */ | 1315 | */ |
1042 | static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) | 1316 | static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg) |
1043 | { | 1317 | { |
1044 | struct qfq_group *grp = cl->grp; | 1318 | struct qfq_group *grp = agg->grp; |
1045 | unsigned long mask; | 1319 | unsigned long mask; |
1046 | u64 roundedS; | 1320 | u64 roundedS; |
1047 | int s; | 1321 | int s; |
1048 | 1322 | ||
1049 | cl->F = cl->S; | 1323 | if (agg == q->in_serv_agg) { |
1050 | qfq_slot_remove(q, grp, cl); | 1324 | charge_actual_service(agg); |
1325 | q->in_serv_agg = qfq_choose_next_agg(q); | ||
1326 | return; | ||
1327 | } | ||
1328 | |||
1329 | agg->F = agg->S; | ||
1330 | qfq_slot_remove(q, grp, agg); | ||
1051 | 1331 | ||
1052 | if (!grp->full_slots) { | 1332 | if (!grp->full_slots) { |
1053 | __clear_bit(grp->index, &q->bitmaps[IR]); | 1333 | __clear_bit(grp->index, &q->bitmaps[IR]); |
@@ -1066,8 +1346,8 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) | |||
1066 | } | 1346 | } |
1067 | __clear_bit(grp->index, &q->bitmaps[ER]); | 1347 | __clear_bit(grp->index, &q->bitmaps[ER]); |
1068 | } else if (hlist_empty(&grp->slots[grp->front])) { | 1348 | } else if (hlist_empty(&grp->slots[grp->front])) { |
1069 | cl = qfq_slot_scan(grp); | 1349 | agg = qfq_slot_scan(grp); |
1070 | roundedS = qfq_round_down(cl->S, grp->slot_shift); | 1350 | roundedS = qfq_round_down(agg->S, grp->slot_shift); |
1071 | if (grp->S != roundedS) { | 1351 | if (grp->S != roundedS) { |
1072 | __clear_bit(grp->index, &q->bitmaps[ER]); | 1352 | __clear_bit(grp->index, &q->bitmaps[ER]); |
1073 | __clear_bit(grp->index, &q->bitmaps[IR]); | 1353 | __clear_bit(grp->index, &q->bitmaps[IR]); |
@@ -1080,7 +1360,7 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) | |||
1080 | } | 1360 | } |
1081 | } | 1361 | } |
1082 | 1362 | ||
1083 | qfq_update_eligible(q, q->V); | 1363 | qfq_update_eligible(q); |
1084 | } | 1364 | } |
1085 | 1365 | ||
1086 | static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) | 1366 | static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) |
@@ -1092,6 +1372,32 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) | |||
1092 | qfq_deactivate_class(q, cl); | 1372 | qfq_deactivate_class(q, cl); |
1093 | } | 1373 | } |
1094 | 1374 | ||
1375 | static unsigned int qfq_drop_from_slot(struct qfq_sched *q, | ||
1376 | struct hlist_head *slot) | ||
1377 | { | ||
1378 | struct qfq_aggregate *agg; | ||
1379 | struct hlist_node *n; | ||
1380 | struct qfq_class *cl; | ||
1381 | unsigned int len; | ||
1382 | |||
1383 | hlist_for_each_entry(agg, n, slot, next) { | ||
1384 | list_for_each_entry(cl, &agg->active, alist) { | ||
1385 | |||
1386 | if (!cl->qdisc->ops->drop) | ||
1387 | continue; | ||
1388 | |||
1389 | len = cl->qdisc->ops->drop(cl->qdisc); | ||
1390 | if (len > 0) { | ||
1391 | if (cl->qdisc->q.qlen == 0) | ||
1392 | qfq_deactivate_class(q, cl); | ||
1393 | |||
1394 | return len; | ||
1395 | } | ||
1396 | } | ||
1397 | } | ||
1398 | return 0; | ||
1399 | } | ||
1400 | |||
1095 | static unsigned int qfq_drop(struct Qdisc *sch) | 1401 | static unsigned int qfq_drop(struct Qdisc *sch) |
1096 | { | 1402 | { |
1097 | struct qfq_sched *q = qdisc_priv(sch); | 1403 | struct qfq_sched *q = qdisc_priv(sch); |
@@ -1101,24 +1407,13 @@ static unsigned int qfq_drop(struct Qdisc *sch) | |||
1101 | for (i = 0; i <= QFQ_MAX_INDEX; i++) { | 1407 | for (i = 0; i <= QFQ_MAX_INDEX; i++) { |
1102 | grp = &q->groups[i]; | 1408 | grp = &q->groups[i]; |
1103 | for (j = 0; j < QFQ_MAX_SLOTS; j++) { | 1409 | for (j = 0; j < QFQ_MAX_SLOTS; j++) { |
1104 | struct qfq_class *cl; | 1410 | len = qfq_drop_from_slot(q, &grp->slots[j]); |
1105 | struct hlist_node *n; | 1411 | if (len > 0) { |
1106 | 1412 | sch->q.qlen--; | |
1107 | hlist_for_each_entry(cl, n, &grp->slots[j], next) { | 1413 | return len; |
1108 | |||
1109 | if (!cl->qdisc->ops->drop) | ||
1110 | continue; | ||
1111 | |||
1112 | len = cl->qdisc->ops->drop(cl->qdisc); | ||
1113 | if (len > 0) { | ||
1114 | sch->q.qlen--; | ||
1115 | if (!cl->qdisc->q.qlen) | ||
1116 | qfq_deactivate_class(q, cl); | ||
1117 | |||
1118 | return len; | ||
1119 | } | ||
1120 | } | 1414 | } |
1121 | } | 1415 | } |
1416 | |||
1122 | } | 1417 | } |
1123 | 1418 | ||
1124 | return 0; | 1419 | return 0; |
@@ -1129,44 +1424,51 @@ static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt) | |||
1129 | struct qfq_sched *q = qdisc_priv(sch); | 1424 | struct qfq_sched *q = qdisc_priv(sch); |
1130 | struct qfq_group *grp; | 1425 | struct qfq_group *grp; |
1131 | int i, j, err; | 1426 | int i, j, err; |
1427 | u32 max_cl_shift, maxbudg_shift, max_classes; | ||
1132 | 1428 | ||
1133 | err = qdisc_class_hash_init(&q->clhash); | 1429 | err = qdisc_class_hash_init(&q->clhash); |
1134 | if (err < 0) | 1430 | if (err < 0) |
1135 | return err; | 1431 | return err; |
1136 | 1432 | ||
1433 | if (qdisc_dev(sch)->tx_queue_len + 1 > QFQ_MAX_AGG_CLASSES) | ||
1434 | max_classes = QFQ_MAX_AGG_CLASSES; | ||
1435 | else | ||
1436 | max_classes = qdisc_dev(sch)->tx_queue_len + 1; | ||
1437 | /* max_cl_shift = floor(log_2(max_classes)) */ | ||
1438 | max_cl_shift = __fls(max_classes); | ||
1439 | q->max_agg_classes = 1<<max_cl_shift; | ||
1440 | |||
1441 | /* maxbudg_shift = log2(max_len * max_classes_per_agg) */ | ||
1442 | maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift; | ||
1443 | q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX; | ||
1444 | |||
1137 | for (i = 0; i <= QFQ_MAX_INDEX; i++) { | 1445 | for (i = 0; i <= QFQ_MAX_INDEX; i++) { |
1138 | grp = &q->groups[i]; | 1446 | grp = &q->groups[i]; |
1139 | grp->index = i; | 1447 | grp->index = i; |
1140 | grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS | 1448 | grp->slot_shift = q->min_slot_shift + i; |
1141 | - (QFQ_MAX_INDEX - i); | ||
1142 | for (j = 0; j < QFQ_MAX_SLOTS; j++) | 1449 | for (j = 0; j < QFQ_MAX_SLOTS; j++) |
1143 | INIT_HLIST_HEAD(&grp->slots[j]); | 1450 | INIT_HLIST_HEAD(&grp->slots[j]); |
1144 | } | 1451 | } |
1145 | 1452 | ||
1453 | INIT_HLIST_HEAD(&q->nonfull_aggs); | ||
1454 | |||
1146 | return 0; | 1455 | return 0; |
1147 | } | 1456 | } |
1148 | 1457 | ||
1149 | static void qfq_reset_qdisc(struct Qdisc *sch) | 1458 | static void qfq_reset_qdisc(struct Qdisc *sch) |
1150 | { | 1459 | { |
1151 | struct qfq_sched *q = qdisc_priv(sch); | 1460 | struct qfq_sched *q = qdisc_priv(sch); |
1152 | struct qfq_group *grp; | ||
1153 | struct qfq_class *cl; | 1461 | struct qfq_class *cl; |
1154 | struct hlist_node *n, *tmp; | 1462 | struct hlist_node *n; |
1155 | unsigned int i, j; | 1463 | unsigned int i; |
1156 | 1464 | ||
1157 | for (i = 0; i <= QFQ_MAX_INDEX; i++) { | 1465 | for (i = 0; i < q->clhash.hashsize; i++) { |
1158 | grp = &q->groups[i]; | 1466 | hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { |
1159 | for (j = 0; j < QFQ_MAX_SLOTS; j++) { | 1467 | if (cl->qdisc->q.qlen > 0) |
1160 | hlist_for_each_entry_safe(cl, n, tmp, | ||
1161 | &grp->slots[j], next) { | ||
1162 | qfq_deactivate_class(q, cl); | 1468 | qfq_deactivate_class(q, cl); |
1163 | } | ||
1164 | } | ||
1165 | } | ||
1166 | 1469 | ||
1167 | for (i = 0; i < q->clhash.hashsize; i++) { | ||
1168 | hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) | ||
1169 | qdisc_reset(cl->qdisc); | 1470 | qdisc_reset(cl->qdisc); |
1471 | } | ||
1170 | } | 1472 | } |
1171 | sch->q.qlen = 0; | 1473 | sch->q.qlen = 0; |
1172 | } | 1474 | } |