diff options
author | Eric Dumazet <eric.dumazet@gmail.com> | 2011-02-23 05:56:17 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2011-02-23 17:05:11 -0500 |
commit | e13e02a3c68d899169c78d9a18689bd73491d59a (patch) | |
tree | 6e6b40ef37261df391cd445ec0f1b3d538b23a47 | |
parent | dee9f4bceb5fd9dbfcc1567148fccdbf16d6a38a (diff) |
net_sched: SFB flow scheduler
This is the Stochastic Fair Blue scheduler, based on work from :
W. Feng, D. Kandlur, D. Saha, K. Shin. Blue: A New Class of Active Queue
Management Algorithms. U. Michigan CSE-TR-387-99, April 1999.
http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
This implementation is based on work done by Juliusz Chroboczek
General SFB algorithm can be found in figure 14, page 15:
B[l][n] : L x N array of bins (L levels, N bins per level)
enqueue()
Calculate hash function values h{0}, h{1}, .. h{L-1}
Update bins at each level
for i = 0 to L - 1
if (B[i][h{i}].qlen > bin_size)
B[i][h{i}].p_mark += p_increment;
else if (B[i][h{i}].qlen == 0)
B[i][h{i}].p_mark -= p_decrement;
p_min = min(B[0][h{0}].p_mark ... B[L-1][h{L-1}].p_mark);
if (p_min == 1.0)
ratelimit();
else
mark/drop with probabilty p_min;
I did the adaptation of Juliusz code to meet current kernel standards,
and various changes to address previous comments :
http://thread.gmane.org/gmane.linux.network/90225
http://thread.gmane.org/gmane.linux.network/90375
Default flow classifier is the rxhash introduced by RPS in 2.6.35, but
we can use an external flow classifier if wanted.
tc qdisc add dev $DEV parent 1:11 handle 11: \
est 0.5sec 2sec sfb limit 128
tc filter add dev $DEV protocol ip parent 11: handle 3 \
flow hash keys dst divisor 1024
Notes:
1) SFB default child qdisc is pfifo_fast. It can be changed by another
qdisc but a child qdisc MUST not drop a packet previously queued. This
is because SFB needs to handle a dequeued packet in order to maintain
its virtual queue states. pfifo_head_drop or CHOKe should not be used.
2) ECN is enabled by default, unlike RED/CHOKe/GRED
With help from Patrick McHardy & Andi Kleen
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Juliusz Chroboczek <Juliusz.Chroboczek@pps.jussieu.fr>
CC: Stephen Hemminger <shemminger@vyatta.com>
CC: Patrick McHardy <kaber@trash.net>
CC: Andi Kleen <andi@firstfloor.org>
CC: John W. Linville <linville@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/linux/pkt_sched.h | 39 | ||||
-rw-r--r-- | net/sched/Kconfig | 11 | ||||
-rw-r--r-- | net/sched/Makefile | 1 | ||||
-rw-r--r-- | net/sched/sch_sfb.c | 709 |
4 files changed, 760 insertions, 0 deletions
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index d4bb6f58c90c..5afee2b238bd 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h | |||
@@ -522,4 +522,43 @@ struct tc_mqprio_qopt { | |||
522 | __u16 offset[TC_QOPT_MAX_QUEUE]; | 522 | __u16 offset[TC_QOPT_MAX_QUEUE]; |
523 | }; | 523 | }; |
524 | 524 | ||
525 | /* SFB */ | ||
526 | |||
527 | enum { | ||
528 | TCA_SFB_UNSPEC, | ||
529 | TCA_SFB_PARMS, | ||
530 | __TCA_SFB_MAX, | ||
531 | }; | ||
532 | |||
533 | #define TCA_SFB_MAX (__TCA_SFB_MAX - 1) | ||
534 | |||
535 | /* | ||
536 | * Note: increment, decrement are Q0.16 fixed-point values. | ||
537 | */ | ||
538 | struct tc_sfb_qopt { | ||
539 | __u32 rehash_interval; /* delay between hash move, in ms */ | ||
540 | __u32 warmup_time; /* double buffering warmup time in ms (warmup_time < rehash_interval) */ | ||
541 | __u32 max; /* max len of qlen_min */ | ||
542 | __u32 bin_size; /* maximum queue length per bin */ | ||
543 | __u32 increment; /* probability increment, (d1 in Blue) */ | ||
544 | __u32 decrement; /* probability decrement, (d2 in Blue) */ | ||
545 | __u32 limit; /* max SFB queue length */ | ||
546 | __u32 penalty_rate; /* inelastic flows are rate limited to 'rate' pps */ | ||
547 | __u32 penalty_burst; | ||
548 | }; | ||
549 | |||
550 | struct tc_sfb_xstats { | ||
551 | __u32 earlydrop; | ||
552 | __u32 penaltydrop; | ||
553 | __u32 bucketdrop; | ||
554 | __u32 queuedrop; | ||
555 | __u32 childdrop; /* drops in child qdisc */ | ||
556 | __u32 marked; | ||
557 | __u32 maxqlen; | ||
558 | __u32 maxprob; | ||
559 | __u32 avgprob; | ||
560 | }; | ||
561 | |||
562 | #define SFB_MAX_PROB 0xFFFF | ||
563 | |||
525 | #endif | 564 | #endif |
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 8c19b6e3201e..a7a5583d4f68 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig | |||
@@ -126,6 +126,17 @@ config NET_SCH_RED | |||
126 | To compile this code as a module, choose M here: the | 126 | To compile this code as a module, choose M here: the |
127 | module will be called sch_red. | 127 | module will be called sch_red. |
128 | 128 | ||
129 | config NET_SCH_SFB | ||
130 | tristate "Stochastic Fair Blue (SFB)" | ||
131 | ---help--- | ||
132 | Say Y here if you want to use the Stochastic Fair Blue (SFB) | ||
133 | packet scheduling algorithm. | ||
134 | |||
135 | See the top of <file:net/sched/sch_sfb.c> for more details. | ||
136 | |||
137 | To compile this code as a module, choose M here: the | ||
138 | module will be called sch_sfb. | ||
139 | |||
129 | config NET_SCH_SFQ | 140 | config NET_SCH_SFQ |
130 | tristate "Stochastic Fairness Queueing (SFQ)" | 141 | tristate "Stochastic Fairness Queueing (SFQ)" |
131 | ---help--- | 142 | ---help--- |
diff --git a/net/sched/Makefile b/net/sched/Makefile index 06c6cdfd1948..2e77b8dba22e 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile | |||
@@ -24,6 +24,7 @@ obj-$(CONFIG_NET_SCH_RED) += sch_red.o | |||
24 | obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o | 24 | obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o |
25 | obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o | 25 | obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o |
26 | obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o | 26 | obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o |
27 | obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o | ||
27 | obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o | 28 | obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o |
28 | obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o | 29 | obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o |
29 | obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o | 30 | obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o |
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c new file mode 100644 index 000000000000..0a833d0c1f61 --- /dev/null +++ b/net/sched/sch_sfb.c | |||
@@ -0,0 +1,709 @@ | |||
1 | /* | ||
2 | * net/sched/sch_sfb.c Stochastic Fair Blue | ||
3 | * | ||
4 | * Copyright (c) 2008-2011 Juliusz Chroboczek <jch@pps.jussieu.fr> | ||
5 | * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * version 2 as published by the Free Software Foundation. | ||
10 | * | ||
11 | * W. Feng, D. Kandlur, D. Saha, K. Shin. Blue: | ||
12 | * A New Class of Active Queue Management Algorithms. | ||
13 | * U. Michigan CSE-TR-387-99, April 1999. | ||
14 | * | ||
15 | * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf | ||
16 | * | ||
17 | */ | ||
18 | |||
19 | #include <linux/module.h> | ||
20 | #include <linux/types.h> | ||
21 | #include <linux/kernel.h> | ||
22 | #include <linux/errno.h> | ||
23 | #include <linux/skbuff.h> | ||
24 | #include <linux/random.h> | ||
25 | #include <linux/jhash.h> | ||
26 | #include <net/ip.h> | ||
27 | #include <net/pkt_sched.h> | ||
28 | #include <net/inet_ecn.h> | ||
29 | |||
30 | /* | ||
31 | * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level) | ||
32 | * This implementation uses L = 8 and N = 16 | ||
33 | * This permits us to split one 32bit hash (provided per packet by rxhash or | ||
34 | * external classifier) into 8 subhashes of 4 bits. | ||
35 | */ | ||
36 | #define SFB_BUCKET_SHIFT 4 | ||
37 | #define SFB_NUMBUCKETS (1 << SFB_BUCKET_SHIFT) /* N bins per Level */ | ||
38 | #define SFB_BUCKET_MASK (SFB_NUMBUCKETS - 1) | ||
39 | #define SFB_LEVELS (32 / SFB_BUCKET_SHIFT) /* L */ | ||
40 | |||
41 | /* SFB algo uses a virtual queue, named "bin" */ | ||
42 | struct sfb_bucket { | ||
43 | u16 qlen; /* length of virtual queue */ | ||
44 | u16 p_mark; /* marking probability */ | ||
45 | }; | ||
46 | |||
47 | /* We use a double buffering right before hash change | ||
48 | * (Section 4.4 of SFB reference : moving hash functions) | ||
49 | */ | ||
50 | struct sfb_bins { | ||
51 | u32 perturbation; /* jhash perturbation */ | ||
52 | struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS]; | ||
53 | }; | ||
54 | |||
55 | struct sfb_sched_data { | ||
56 | struct Qdisc *qdisc; | ||
57 | struct tcf_proto *filter_list; | ||
58 | unsigned long rehash_interval; | ||
59 | unsigned long warmup_time; /* double buffering warmup time in jiffies */ | ||
60 | u32 max; | ||
61 | u32 bin_size; /* maximum queue length per bin */ | ||
62 | u32 increment; /* d1 */ | ||
63 | u32 decrement; /* d2 */ | ||
64 | u32 limit; /* HARD maximal queue length */ | ||
65 | u32 penalty_rate; | ||
66 | u32 penalty_burst; | ||
67 | u32 tokens_avail; | ||
68 | unsigned long rehash_time; | ||
69 | unsigned long token_time; | ||
70 | |||
71 | u8 slot; /* current active bins (0 or 1) */ | ||
72 | bool double_buffering; | ||
73 | struct sfb_bins bins[2]; | ||
74 | |||
75 | struct { | ||
76 | u32 earlydrop; | ||
77 | u32 penaltydrop; | ||
78 | u32 bucketdrop; | ||
79 | u32 queuedrop; | ||
80 | u32 childdrop; /* drops in child qdisc */ | ||
81 | u32 marked; /* ECN mark */ | ||
82 | } stats; | ||
83 | }; | ||
84 | |||
85 | /* | ||
86 | * Each queued skb might be hashed on one or two bins | ||
87 | * We store in skb_cb the two hash values. | ||
88 | * (A zero value means double buffering was not used) | ||
89 | */ | ||
90 | struct sfb_skb_cb { | ||
91 | u32 hashes[2]; | ||
92 | }; | ||
93 | |||
94 | static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb) | ||
95 | { | ||
96 | BUILD_BUG_ON(sizeof(skb->cb) < | ||
97 | sizeof(struct qdisc_skb_cb) + sizeof(struct sfb_skb_cb)); | ||
98 | return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data; | ||
99 | } | ||
100 | |||
101 | /* | ||
102 | * If using 'internal' SFB flow classifier, hash comes from skb rxhash | ||
103 | * If using external classifier, hash comes from the classid. | ||
104 | */ | ||
105 | static u32 sfb_hash(const struct sk_buff *skb, u32 slot) | ||
106 | { | ||
107 | return sfb_skb_cb(skb)->hashes[slot]; | ||
108 | } | ||
109 | |||
110 | /* Probabilities are coded as Q0.16 fixed-point values, | ||
111 | * with 0xFFFF representing 65535/65536 (almost 1.0) | ||
112 | * Addition and subtraction are saturating in [0, 65535] | ||
113 | */ | ||
114 | static u32 prob_plus(u32 p1, u32 p2) | ||
115 | { | ||
116 | u32 res = p1 + p2; | ||
117 | |||
118 | return min_t(u32, res, SFB_MAX_PROB); | ||
119 | } | ||
120 | |||
121 | static u32 prob_minus(u32 p1, u32 p2) | ||
122 | { | ||
123 | return p1 > p2 ? p1 - p2 : 0; | ||
124 | } | ||
125 | |||
126 | static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q) | ||
127 | { | ||
128 | int i; | ||
129 | struct sfb_bucket *b = &q->bins[slot].bins[0][0]; | ||
130 | |||
131 | for (i = 0; i < SFB_LEVELS; i++) { | ||
132 | u32 hash = sfbhash & SFB_BUCKET_MASK; | ||
133 | |||
134 | sfbhash >>= SFB_BUCKET_SHIFT; | ||
135 | if (b[hash].qlen < 0xFFFF) | ||
136 | b[hash].qlen++; | ||
137 | b += SFB_NUMBUCKETS; /* next level */ | ||
138 | } | ||
139 | } | ||
140 | |||
141 | static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) | ||
142 | { | ||
143 | u32 sfbhash; | ||
144 | |||
145 | sfbhash = sfb_hash(skb, 0); | ||
146 | if (sfbhash) | ||
147 | increment_one_qlen(sfbhash, 0, q); | ||
148 | |||
149 | sfbhash = sfb_hash(skb, 1); | ||
150 | if (sfbhash) | ||
151 | increment_one_qlen(sfbhash, 1, q); | ||
152 | } | ||
153 | |||
154 | static void decrement_one_qlen(u32 sfbhash, u32 slot, | ||
155 | struct sfb_sched_data *q) | ||
156 | { | ||
157 | int i; | ||
158 | struct sfb_bucket *b = &q->bins[slot].bins[0][0]; | ||
159 | |||
160 | for (i = 0; i < SFB_LEVELS; i++) { | ||
161 | u32 hash = sfbhash & SFB_BUCKET_MASK; | ||
162 | |||
163 | sfbhash >>= SFB_BUCKET_SHIFT; | ||
164 | if (b[hash].qlen > 0) | ||
165 | b[hash].qlen--; | ||
166 | b += SFB_NUMBUCKETS; /* next level */ | ||
167 | } | ||
168 | } | ||
169 | |||
170 | static void decrement_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) | ||
171 | { | ||
172 | u32 sfbhash; | ||
173 | |||
174 | sfbhash = sfb_hash(skb, 0); | ||
175 | if (sfbhash) | ||
176 | decrement_one_qlen(sfbhash, 0, q); | ||
177 | |||
178 | sfbhash = sfb_hash(skb, 1); | ||
179 | if (sfbhash) | ||
180 | decrement_one_qlen(sfbhash, 1, q); | ||
181 | } | ||
182 | |||
183 | static void decrement_prob(struct sfb_bucket *b, struct sfb_sched_data *q) | ||
184 | { | ||
185 | b->p_mark = prob_minus(b->p_mark, q->decrement); | ||
186 | } | ||
187 | |||
188 | static void increment_prob(struct sfb_bucket *b, struct sfb_sched_data *q) | ||
189 | { | ||
190 | b->p_mark = prob_plus(b->p_mark, q->increment); | ||
191 | } | ||
192 | |||
193 | static void sfb_zero_all_buckets(struct sfb_sched_data *q) | ||
194 | { | ||
195 | memset(&q->bins, 0, sizeof(q->bins)); | ||
196 | } | ||
197 | |||
198 | /* | ||
199 | * compute max qlen, max p_mark, and avg p_mark | ||
200 | */ | ||
201 | static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_data *q) | ||
202 | { | ||
203 | int i; | ||
204 | u32 qlen = 0, prob = 0, totalpm = 0; | ||
205 | const struct sfb_bucket *b = &q->bins[q->slot].bins[0][0]; | ||
206 | |||
207 | for (i = 0; i < SFB_LEVELS * SFB_NUMBUCKETS; i++) { | ||
208 | if (qlen < b->qlen) | ||
209 | qlen = b->qlen; | ||
210 | totalpm += b->p_mark; | ||
211 | if (prob < b->p_mark) | ||
212 | prob = b->p_mark; | ||
213 | b++; | ||
214 | } | ||
215 | *prob_r = prob; | ||
216 | *avgpm_r = totalpm / (SFB_LEVELS * SFB_NUMBUCKETS); | ||
217 | return qlen; | ||
218 | } | ||
219 | |||
220 | |||
221 | static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q) | ||
222 | { | ||
223 | q->bins[slot].perturbation = net_random(); | ||
224 | } | ||
225 | |||
226 | static void sfb_swap_slot(struct sfb_sched_data *q) | ||
227 | { | ||
228 | sfb_init_perturbation(q->slot, q); | ||
229 | q->slot ^= 1; | ||
230 | q->double_buffering = false; | ||
231 | } | ||
232 | |||
233 | /* Non elastic flows are allowed to use part of the bandwidth, expressed | ||
234 | * in "penalty_rate" packets per second, with "penalty_burst" burst | ||
235 | */ | ||
236 | static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q) | ||
237 | { | ||
238 | if (q->penalty_rate == 0 || q->penalty_burst == 0) | ||
239 | return true; | ||
240 | |||
241 | if (q->tokens_avail < 1) { | ||
242 | unsigned long age = min(10UL * HZ, jiffies - q->token_time); | ||
243 | |||
244 | q->tokens_avail = (age * q->penalty_rate) / HZ; | ||
245 | if (q->tokens_avail > q->penalty_burst) | ||
246 | q->tokens_avail = q->penalty_burst; | ||
247 | q->token_time = jiffies; | ||
248 | if (q->tokens_avail < 1) | ||
249 | return true; | ||
250 | } | ||
251 | |||
252 | q->tokens_avail--; | ||
253 | return false; | ||
254 | } | ||
255 | |||
256 | static bool sfb_classify(struct sk_buff *skb, struct sfb_sched_data *q, | ||
257 | int *qerr, u32 *salt) | ||
258 | { | ||
259 | struct tcf_result res; | ||
260 | int result; | ||
261 | |||
262 | result = tc_classify(skb, q->filter_list, &res); | ||
263 | if (result >= 0) { | ||
264 | #ifdef CONFIG_NET_CLS_ACT | ||
265 | switch (result) { | ||
266 | case TC_ACT_STOLEN: | ||
267 | case TC_ACT_QUEUED: | ||
268 | *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; | ||
269 | case TC_ACT_SHOT: | ||
270 | return false; | ||
271 | } | ||
272 | #endif | ||
273 | *salt = TC_H_MIN(res.classid); | ||
274 | return true; | ||
275 | } | ||
276 | return false; | ||
277 | } | ||
278 | |||
279 | static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) | ||
280 | { | ||
281 | |||
282 | struct sfb_sched_data *q = qdisc_priv(sch); | ||
283 | struct Qdisc *child = q->qdisc; | ||
284 | int i; | ||
285 | u32 p_min = ~0; | ||
286 | u32 minqlen = ~0; | ||
287 | u32 r, slot, salt, sfbhash; | ||
288 | int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; | ||
289 | |||
290 | if (q->rehash_interval > 0) { | ||
291 | unsigned long limit = q->rehash_time + q->rehash_interval; | ||
292 | |||
293 | if (unlikely(time_after(jiffies, limit))) { | ||
294 | sfb_swap_slot(q); | ||
295 | q->rehash_time = jiffies; | ||
296 | } else if (unlikely(!q->double_buffering && q->warmup_time > 0 && | ||
297 | time_after(jiffies, limit - q->warmup_time))) { | ||
298 | q->double_buffering = true; | ||
299 | } | ||
300 | } | ||
301 | |||
302 | if (q->filter_list) { | ||
303 | /* If using external classifiers, get result and record it. */ | ||
304 | if (!sfb_classify(skb, q, &ret, &salt)) | ||
305 | goto other_drop; | ||
306 | } else { | ||
307 | salt = skb_get_rxhash(skb); | ||
308 | } | ||
309 | |||
310 | slot = q->slot; | ||
311 | |||
312 | sfbhash = jhash_1word(salt, q->bins[slot].perturbation); | ||
313 | if (!sfbhash) | ||
314 | sfbhash = 1; | ||
315 | sfb_skb_cb(skb)->hashes[slot] = sfbhash; | ||
316 | |||
317 | for (i = 0; i < SFB_LEVELS; i++) { | ||
318 | u32 hash = sfbhash & SFB_BUCKET_MASK; | ||
319 | struct sfb_bucket *b = &q->bins[slot].bins[i][hash]; | ||
320 | |||
321 | sfbhash >>= SFB_BUCKET_SHIFT; | ||
322 | if (b->qlen == 0) | ||
323 | decrement_prob(b, q); | ||
324 | else if (b->qlen >= q->bin_size) | ||
325 | increment_prob(b, q); | ||
326 | if (minqlen > b->qlen) | ||
327 | minqlen = b->qlen; | ||
328 | if (p_min > b->p_mark) | ||
329 | p_min = b->p_mark; | ||
330 | } | ||
331 | |||
332 | slot ^= 1; | ||
333 | sfb_skb_cb(skb)->hashes[slot] = 0; | ||
334 | |||
335 | if (unlikely(minqlen >= q->max || sch->q.qlen >= q->limit)) { | ||
336 | sch->qstats.overlimits++; | ||
337 | if (minqlen >= q->max) | ||
338 | q->stats.bucketdrop++; | ||
339 | else | ||
340 | q->stats.queuedrop++; | ||
341 | goto drop; | ||
342 | } | ||
343 | |||
344 | if (unlikely(p_min >= SFB_MAX_PROB)) { | ||
345 | /* Inelastic flow */ | ||
346 | if (q->double_buffering) { | ||
347 | sfbhash = jhash_1word(salt, q->bins[slot].perturbation); | ||
348 | if (!sfbhash) | ||
349 | sfbhash = 1; | ||
350 | sfb_skb_cb(skb)->hashes[slot] = sfbhash; | ||
351 | |||
352 | for (i = 0; i < SFB_LEVELS; i++) { | ||
353 | u32 hash = sfbhash & SFB_BUCKET_MASK; | ||
354 | struct sfb_bucket *b = &q->bins[slot].bins[i][hash]; | ||
355 | |||
356 | sfbhash >>= SFB_BUCKET_SHIFT; | ||
357 | if (b->qlen == 0) | ||
358 | decrement_prob(b, q); | ||
359 | else if (b->qlen >= q->bin_size) | ||
360 | increment_prob(b, q); | ||
361 | } | ||
362 | } | ||
363 | if (sfb_rate_limit(skb, q)) { | ||
364 | sch->qstats.overlimits++; | ||
365 | q->stats.penaltydrop++; | ||
366 | goto drop; | ||
367 | } | ||
368 | goto enqueue; | ||
369 | } | ||
370 | |||
371 | r = net_random() & SFB_MAX_PROB; | ||
372 | |||
373 | if (unlikely(r < p_min)) { | ||
374 | if (unlikely(p_min > SFB_MAX_PROB / 2)) { | ||
375 | /* If we're marking that many packets, then either | ||
376 | * this flow is unresponsive, or we're badly congested. | ||
377 | * In either case, we want to start dropping packets. | ||
378 | */ | ||
379 | if (r < (p_min - SFB_MAX_PROB / 2) * 2) { | ||
380 | q->stats.earlydrop++; | ||
381 | goto drop; | ||
382 | } | ||
383 | } | ||
384 | if (INET_ECN_set_ce(skb)) { | ||
385 | q->stats.marked++; | ||
386 | } else { | ||
387 | q->stats.earlydrop++; | ||
388 | goto drop; | ||
389 | } | ||
390 | } | ||
391 | |||
392 | enqueue: | ||
393 | ret = qdisc_enqueue(skb, child); | ||
394 | if (likely(ret == NET_XMIT_SUCCESS)) { | ||
395 | sch->q.qlen++; | ||
396 | increment_qlen(skb, q); | ||
397 | } else if (net_xmit_drop_count(ret)) { | ||
398 | q->stats.childdrop++; | ||
399 | sch->qstats.drops++; | ||
400 | } | ||
401 | return ret; | ||
402 | |||
403 | drop: | ||
404 | qdisc_drop(skb, sch); | ||
405 | return NET_XMIT_CN; | ||
406 | other_drop: | ||
407 | if (ret & __NET_XMIT_BYPASS) | ||
408 | sch->qstats.drops++; | ||
409 | kfree_skb(skb); | ||
410 | return ret; | ||
411 | } | ||
412 | |||
413 | static struct sk_buff *sfb_dequeue(struct Qdisc *sch) | ||
414 | { | ||
415 | struct sfb_sched_data *q = qdisc_priv(sch); | ||
416 | struct Qdisc *child = q->qdisc; | ||
417 | struct sk_buff *skb; | ||
418 | |||
419 | skb = child->dequeue(q->qdisc); | ||
420 | |||
421 | if (skb) { | ||
422 | qdisc_bstats_update(sch, skb); | ||
423 | sch->q.qlen--; | ||
424 | decrement_qlen(skb, q); | ||
425 | } | ||
426 | |||
427 | return skb; | ||
428 | } | ||
429 | |||
430 | static struct sk_buff *sfb_peek(struct Qdisc *sch) | ||
431 | { | ||
432 | struct sfb_sched_data *q = qdisc_priv(sch); | ||
433 | struct Qdisc *child = q->qdisc; | ||
434 | |||
435 | return child->ops->peek(child); | ||
436 | } | ||
437 | |||
438 | /* No sfb_drop -- impossible since the child doesn't return the dropped skb. */ | ||
439 | |||
440 | static void sfb_reset(struct Qdisc *sch) | ||
441 | { | ||
442 | struct sfb_sched_data *q = qdisc_priv(sch); | ||
443 | |||
444 | qdisc_reset(q->qdisc); | ||
445 | sch->q.qlen = 0; | ||
446 | q->slot = 0; | ||
447 | q->double_buffering = false; | ||
448 | sfb_zero_all_buckets(q); | ||
449 | sfb_init_perturbation(0, q); | ||
450 | } | ||
451 | |||
452 | static void sfb_destroy(struct Qdisc *sch) | ||
453 | { | ||
454 | struct sfb_sched_data *q = qdisc_priv(sch); | ||
455 | |||
456 | tcf_destroy_chain(&q->filter_list); | ||
457 | qdisc_destroy(q->qdisc); | ||
458 | } | ||
459 | |||
460 | static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = { | ||
461 | [TCA_SFB_PARMS] = { .len = sizeof(struct tc_sfb_qopt) }, | ||
462 | }; | ||
463 | |||
464 | static const struct tc_sfb_qopt sfb_default_ops = { | ||
465 | .rehash_interval = 600 * MSEC_PER_SEC, | ||
466 | .warmup_time = 60 * MSEC_PER_SEC, | ||
467 | .limit = 0, | ||
468 | .max = 25, | ||
469 | .bin_size = 20, | ||
470 | .increment = (SFB_MAX_PROB + 500) / 1000, /* 0.1 % */ | ||
471 | .decrement = (SFB_MAX_PROB + 3000) / 6000, | ||
472 | .penalty_rate = 10, | ||
473 | .penalty_burst = 20, | ||
474 | }; | ||
475 | |||
476 | static int sfb_change(struct Qdisc *sch, struct nlattr *opt) | ||
477 | { | ||
478 | struct sfb_sched_data *q = qdisc_priv(sch); | ||
479 | struct Qdisc *child; | ||
480 | struct nlattr *tb[TCA_SFB_MAX + 1]; | ||
481 | const struct tc_sfb_qopt *ctl = &sfb_default_ops; | ||
482 | u32 limit; | ||
483 | int err; | ||
484 | |||
485 | if (opt) { | ||
486 | err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy); | ||
487 | if (err < 0) | ||
488 | return -EINVAL; | ||
489 | |||
490 | if (tb[TCA_SFB_PARMS] == NULL) | ||
491 | return -EINVAL; | ||
492 | |||
493 | ctl = nla_data(tb[TCA_SFB_PARMS]); | ||
494 | } | ||
495 | |||
496 | limit = ctl->limit; | ||
497 | if (limit == 0) | ||
498 | limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1); | ||
499 | |||
500 | child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit); | ||
501 | if (IS_ERR(child)) | ||
502 | return PTR_ERR(child); | ||
503 | |||
504 | sch_tree_lock(sch); | ||
505 | |||
506 | qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); | ||
507 | qdisc_destroy(q->qdisc); | ||
508 | q->qdisc = child; | ||
509 | |||
510 | q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval); | ||
511 | q->warmup_time = msecs_to_jiffies(ctl->warmup_time); | ||
512 | q->rehash_time = jiffies; | ||
513 | q->limit = limit; | ||
514 | q->increment = ctl->increment; | ||
515 | q->decrement = ctl->decrement; | ||
516 | q->max = ctl->max; | ||
517 | q->bin_size = ctl->bin_size; | ||
518 | q->penalty_rate = ctl->penalty_rate; | ||
519 | q->penalty_burst = ctl->penalty_burst; | ||
520 | q->tokens_avail = ctl->penalty_burst; | ||
521 | q->token_time = jiffies; | ||
522 | |||
523 | q->slot = 0; | ||
524 | q->double_buffering = false; | ||
525 | sfb_zero_all_buckets(q); | ||
526 | sfb_init_perturbation(0, q); | ||
527 | sfb_init_perturbation(1, q); | ||
528 | |||
529 | sch_tree_unlock(sch); | ||
530 | |||
531 | return 0; | ||
532 | } | ||
533 | |||
534 | static int sfb_init(struct Qdisc *sch, struct nlattr *opt) | ||
535 | { | ||
536 | struct sfb_sched_data *q = qdisc_priv(sch); | ||
537 | |||
538 | q->qdisc = &noop_qdisc; | ||
539 | return sfb_change(sch, opt); | ||
540 | } | ||
541 | |||
542 | static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb) | ||
543 | { | ||
544 | struct sfb_sched_data *q = qdisc_priv(sch); | ||
545 | struct nlattr *opts; | ||
546 | struct tc_sfb_qopt opt = { | ||
547 | .rehash_interval = jiffies_to_msecs(q->rehash_interval), | ||
548 | .warmup_time = jiffies_to_msecs(q->warmup_time), | ||
549 | .limit = q->limit, | ||
550 | .max = q->max, | ||
551 | .bin_size = q->bin_size, | ||
552 | .increment = q->increment, | ||
553 | .decrement = q->decrement, | ||
554 | .penalty_rate = q->penalty_rate, | ||
555 | .penalty_burst = q->penalty_burst, | ||
556 | }; | ||
557 | |||
558 | sch->qstats.backlog = q->qdisc->qstats.backlog; | ||
559 | opts = nla_nest_start(skb, TCA_OPTIONS); | ||
560 | NLA_PUT(skb, TCA_SFB_PARMS, sizeof(opt), &opt); | ||
561 | return nla_nest_end(skb, opts); | ||
562 | |||
563 | nla_put_failure: | ||
564 | nla_nest_cancel(skb, opts); | ||
565 | return -EMSGSIZE; | ||
566 | } | ||
567 | |||
568 | static int sfb_dump_stats(struct Qdisc *sch, struct gnet_dump *d) | ||
569 | { | ||
570 | struct sfb_sched_data *q = qdisc_priv(sch); | ||
571 | struct tc_sfb_xstats st = { | ||
572 | .earlydrop = q->stats.earlydrop, | ||
573 | .penaltydrop = q->stats.penaltydrop, | ||
574 | .bucketdrop = q->stats.bucketdrop, | ||
575 | .queuedrop = q->stats.queuedrop, | ||
576 | .childdrop = q->stats.childdrop, | ||
577 | .marked = q->stats.marked, | ||
578 | }; | ||
579 | |||
580 | st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q); | ||
581 | |||
582 | return gnet_stats_copy_app(d, &st, sizeof(st)); | ||
583 | } | ||
584 | |||
585 | static int sfb_dump_class(struct Qdisc *sch, unsigned long cl, | ||
586 | struct sk_buff *skb, struct tcmsg *tcm) | ||
587 | { | ||
588 | return -ENOSYS; | ||
589 | } | ||
590 | |||
591 | static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, | ||
592 | struct Qdisc **old) | ||
593 | { | ||
594 | struct sfb_sched_data *q = qdisc_priv(sch); | ||
595 | |||
596 | if (new == NULL) | ||
597 | new = &noop_qdisc; | ||
598 | |||
599 | sch_tree_lock(sch); | ||
600 | *old = q->qdisc; | ||
601 | q->qdisc = new; | ||
602 | qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); | ||
603 | qdisc_reset(*old); | ||
604 | sch_tree_unlock(sch); | ||
605 | return 0; | ||
606 | } | ||
607 | |||
608 | static struct Qdisc *sfb_leaf(struct Qdisc *sch, unsigned long arg) | ||
609 | { | ||
610 | struct sfb_sched_data *q = qdisc_priv(sch); | ||
611 | |||
612 | return q->qdisc; | ||
613 | } | ||
614 | |||
615 | static unsigned long sfb_get(struct Qdisc *sch, u32 classid) | ||
616 | { | ||
617 | return 1; | ||
618 | } | ||
619 | |||
620 | static void sfb_put(struct Qdisc *sch, unsigned long arg) | ||
621 | { | ||
622 | } | ||
623 | |||
624 | static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid, | ||
625 | struct nlattr **tca, unsigned long *arg) | ||
626 | { | ||
627 | return -ENOSYS; | ||
628 | } | ||
629 | |||
630 | static int sfb_delete(struct Qdisc *sch, unsigned long cl) | ||
631 | { | ||
632 | return -ENOSYS; | ||
633 | } | ||
634 | |||
635 | static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker) | ||
636 | { | ||
637 | if (!walker->stop) { | ||
638 | if (walker->count >= walker->skip) | ||
639 | if (walker->fn(sch, 1, walker) < 0) { | ||
640 | walker->stop = 1; | ||
641 | return; | ||
642 | } | ||
643 | walker->count++; | ||
644 | } | ||
645 | } | ||
646 | |||
647 | static struct tcf_proto **sfb_find_tcf(struct Qdisc *sch, unsigned long cl) | ||
648 | { | ||
649 | struct sfb_sched_data *q = qdisc_priv(sch); | ||
650 | |||
651 | if (cl) | ||
652 | return NULL; | ||
653 | return &q->filter_list; | ||
654 | } | ||
655 | |||
656 | static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent, | ||
657 | u32 classid) | ||
658 | { | ||
659 | return 0; | ||
660 | } | ||
661 | |||
662 | |||
663 | static const struct Qdisc_class_ops sfb_class_ops = { | ||
664 | .graft = sfb_graft, | ||
665 | .leaf = sfb_leaf, | ||
666 | .get = sfb_get, | ||
667 | .put = sfb_put, | ||
668 | .change = sfb_change_class, | ||
669 | .delete = sfb_delete, | ||
670 | .walk = sfb_walk, | ||
671 | .tcf_chain = sfb_find_tcf, | ||
672 | .bind_tcf = sfb_bind, | ||
673 | .unbind_tcf = sfb_put, | ||
674 | .dump = sfb_dump_class, | ||
675 | }; | ||
676 | |||
677 | static struct Qdisc_ops sfb_qdisc_ops __read_mostly = { | ||
678 | .id = "sfb", | ||
679 | .priv_size = sizeof(struct sfb_sched_data), | ||
680 | .cl_ops = &sfb_class_ops, | ||
681 | .enqueue = sfb_enqueue, | ||
682 | .dequeue = sfb_dequeue, | ||
683 | .peek = sfb_peek, | ||
684 | .init = sfb_init, | ||
685 | .reset = sfb_reset, | ||
686 | .destroy = sfb_destroy, | ||
687 | .change = sfb_change, | ||
688 | .dump = sfb_dump, | ||
689 | .dump_stats = sfb_dump_stats, | ||
690 | .owner = THIS_MODULE, | ||
691 | }; | ||
692 | |||
693 | static int __init sfb_module_init(void) | ||
694 | { | ||
695 | return register_qdisc(&sfb_qdisc_ops); | ||
696 | } | ||
697 | |||
698 | static void __exit sfb_module_exit(void) | ||
699 | { | ||
700 | unregister_qdisc(&sfb_qdisc_ops); | ||
701 | } | ||
702 | |||
703 | module_init(sfb_module_init) | ||
704 | module_exit(sfb_module_exit) | ||
705 | |||
706 | MODULE_DESCRIPTION("Stochastic Fair Blue queue discipline"); | ||
707 | MODULE_AUTHOR("Juliusz Chroboczek"); | ||
708 | MODULE_AUTHOR("Eric Dumazet"); | ||
709 | MODULE_LICENSE("GPL"); | ||