aboutsummaryrefslogtreecommitdiffstats
path: root/net/sched/sch_red.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/sched/sch_red.c')
-rw-r--r--net/sched/sch_red.c418
1 files changed, 107 insertions, 311 deletions
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 7845d045eec4..dccfa44c2d71 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -9,76 +9,23 @@
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 * 10 *
11 * Changes: 11 * Changes:
12 * J Hadi Salim <hadi@nortel.com> 980914: computation fixes 12 * J Hadi Salim 980914: computation fixes
13 * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly. 13 * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
14 * J Hadi Salim <hadi@nortelnetworks.com> 980816: ECN support 14 * J Hadi Salim 980816: ECN support
15 */ 15 */
16 16
17#include <linux/config.h> 17#include <linux/config.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <asm/uaccess.h>
20#include <asm/system.h>
21#include <linux/bitops.h>
22#include <linux/types.h> 19#include <linux/types.h>
23#include <linux/kernel.h> 20#include <linux/kernel.h>
24#include <linux/sched.h>
25#include <linux/string.h>
26#include <linux/mm.h>
27#include <linux/socket.h>
28#include <linux/sockios.h>
29#include <linux/in.h>
30#include <linux/errno.h>
31#include <linux/interrupt.h>
32#include <linux/if_ether.h>
33#include <linux/inet.h>
34#include <linux/netdevice.h> 21#include <linux/netdevice.h>
35#include <linux/etherdevice.h>
36#include <linux/notifier.h>
37#include <net/ip.h>
38#include <net/route.h>
39#include <linux/skbuff.h> 22#include <linux/skbuff.h>
40#include <net/sock.h>
41#include <net/pkt_sched.h> 23#include <net/pkt_sched.h>
42#include <net/inet_ecn.h> 24#include <net/inet_ecn.h>
43#include <net/dsfield.h> 25#include <net/red.h>
44 26
45 27
46/* Random Early Detection (RED) algorithm. 28/* Parameters, settable by user:
47 =======================================
48
49 Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
50 for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
51
52 This file codes a "divisionless" version of RED algorithm
53 as written down in Fig.17 of the paper.
54
55Short description.
56------------------
57
58 When a new packet arrives we calculate the average queue length:
59
60 avg = (1-W)*avg + W*current_queue_len,
61
62 W is the filter time constant (chosen as 2^(-Wlog)), it controls
63 the inertia of the algorithm. To allow larger bursts, W should be
64 decreased.
65
66 if (avg > th_max) -> packet marked (dropped).
67 if (avg < th_min) -> packet passes.
68 if (th_min < avg < th_max) we calculate probability:
69
70 Pb = max_P * (avg - th_min)/(th_max-th_min)
71
72 and mark (drop) packet with this probability.
73 Pb changes from 0 (at avg==th_min) to max_P (avg==th_max).
74 max_P should be small (not 1), usually 0.01..0.02 is good value.
75
76 max_P is chosen as a number, so that max_P/(th_max-th_min)
77 is a negative power of two in order arithmetics to contain
78 only shifts.
79
80
81 Parameters, settable by user:
82 ----------------------------- 29 -----------------------------
83 30
84 limit - bytes (must be > qth_max + burst) 31 limit - bytes (must be > qth_max + burst)
@@ -89,243 +36,93 @@ Short description.
89 arbitrarily high (well, less than ram size) 36 arbitrarily high (well, less than ram size)
90 Really, this limit will never be reached 37 Really, this limit will never be reached
91 if RED works correctly. 38 if RED works correctly.
92
93 qth_min - bytes (should be < qth_max/2)
94 qth_max - bytes (should be at least 2*qth_min and less limit)
95 Wlog - bits (<32) log(1/W).
96 Plog - bits (<32)
97
98 Plog is related to max_P by formula:
99
100 max_P = (qth_max-qth_min)/2^Plog;
101
102 F.e. if qth_max=128K and qth_min=32K, then Plog=22
103 corresponds to max_P=0.02
104
105 Scell_log
106 Stab
107
108 Lookup table for log((1-W)^(t/t_ave).
109
110
111NOTES:
112
113Upper bound on W.
114-----------------
115
116 If you want to allow bursts of L packets of size S,
117 you should choose W:
118
119 L + 1 - th_min/S < (1-(1-W)^L)/W
120
121 th_min/S = 32 th_min/S = 4
122
123 log(W) L
124 -1 33
125 -2 35
126 -3 39
127 -4 46
128 -5 57
129 -6 75
130 -7 101
131 -8 135
132 -9 190
133 etc.
134 */ 39 */
135 40
136struct red_sched_data 41struct red_sched_data
137{ 42{
138/* Parameters */ 43 u32 limit; /* HARD maximal queue length */
139 u32 limit; /* HARD maximal queue length */ 44 unsigned char flags;
140 u32 qth_min; /* Min average length threshold: A scaled */ 45 struct red_parms parms;
141 u32 qth_max; /* Max average length threshold: A scaled */ 46 struct red_stats stats;
142 u32 Rmask;
143 u32 Scell_max;
144 unsigned char flags;
145 char Wlog; /* log(W) */
146 char Plog; /* random number bits */
147 char Scell_log;
148 u8 Stab[256];
149
150/* Variables */
151 unsigned long qave; /* Average queue length: A scaled */
152 int qcount; /* Packets since last random number generation */
153 u32 qR; /* Cached random number */
154
155 psched_time_t qidlestart; /* Start of idle period */
156 struct tc_red_xstats st;
157}; 47};
158 48
159static int red_ecn_mark(struct sk_buff *skb) 49static inline int red_use_ecn(struct red_sched_data *q)
160{ 50{
161 if (skb->nh.raw + 20 > skb->tail) 51 return q->flags & TC_RED_ECN;
162 return 0;
163
164 switch (skb->protocol) {
165 case __constant_htons(ETH_P_IP):
166 if (INET_ECN_is_not_ect(skb->nh.iph->tos))
167 return 0;
168 IP_ECN_set_ce(skb->nh.iph);
169 return 1;
170 case __constant_htons(ETH_P_IPV6):
171 if (INET_ECN_is_not_ect(ipv6_get_dsfield(skb->nh.ipv6h)))
172 return 0;
173 IP6_ECN_set_ce(skb->nh.ipv6h);
174 return 1;
175 default:
176 return 0;
177 }
178} 52}
179 53
180static int 54static inline int red_use_harddrop(struct red_sched_data *q)
181red_enqueue(struct sk_buff *skb, struct Qdisc* sch) 55{
56 return q->flags & TC_RED_HARDDROP;
57}
58
59static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
182{ 60{
183 struct red_sched_data *q = qdisc_priv(sch); 61 struct red_sched_data *q = qdisc_priv(sch);
184 62
185 psched_time_t now; 63 q->parms.qavg = red_calc_qavg(&q->parms, sch->qstats.backlog);
186 64
187 if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { 65 if (red_is_idling(&q->parms))
188 long us_idle; 66 red_end_of_idle_period(&q->parms);
189 int shift;
190 67
191 PSCHED_GET_TIME(now); 68 switch (red_action(&q->parms, q->parms.qavg)) {
192 us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); 69 case RED_DONT_MARK:
193 PSCHED_SET_PASTPERFECT(q->qidlestart); 70 break;
194 71
195/* 72 case RED_PROB_MARK:
196 The problem: ideally, average length queue recalcultion should 73 sch->qstats.overlimits++;
197 be done over constant clock intervals. This is too expensive, so that 74 if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) {
198 the calculation is driven by outgoing packets. 75 q->stats.prob_drop++;
199 When the queue is idle we have to model this clock by hand. 76 goto congestion_drop;
200 77 }
201 SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
202 dummy packets as a burst after idle time, i.e.
203
204 q->qave *= (1-W)^m
205
206 This is an apparently overcomplicated solution (f.e. we have to precompute
207 a table to make this calculation in reasonable time)
208 I believe that a simpler model may be used here,
209 but it is field for experiments.
210*/
211 shift = q->Stab[us_idle>>q->Scell_log];
212
213 if (shift) {
214 q->qave >>= shift;
215 } else {
216 /* Approximate initial part of exponent
217 with linear function:
218 (1-W)^m ~= 1-mW + ...
219
220 Seems, it is the best solution to
221 problem of too coarce exponent tabulation.
222 */
223
224 us_idle = (q->qave * us_idle)>>q->Scell_log;
225 if (us_idle < q->qave/2)
226 q->qave -= us_idle;
227 else
228 q->qave >>= 1;
229 }
230 } else {
231 q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
232 /* NOTE:
233 q->qave is fixed point number with point at Wlog.
234 The formulae above is equvalent to floating point
235 version:
236
237 qave = qave*(1-W) + sch->qstats.backlog*W;
238 --ANK (980924)
239 */
240 }
241 78
242 if (q->qave < q->qth_min) { 79 q->stats.prob_mark++;
243 q->qcount = -1; 80 break;
244enqueue: 81
245 if (sch->qstats.backlog + skb->len <= q->limit) { 82 case RED_HARD_MARK:
246 __skb_queue_tail(&sch->q, skb); 83 sch->qstats.overlimits++;
247 sch->qstats.backlog += skb->len; 84 if (red_use_harddrop(q) || !red_use_ecn(q) ||
248 sch->bstats.bytes += skb->len; 85 !INET_ECN_set_ce(skb)) {
249 sch->bstats.packets++; 86 q->stats.forced_drop++;
250 return NET_XMIT_SUCCESS; 87 goto congestion_drop;
251 } else { 88 }
252 q->st.pdrop++;
253 }
254 kfree_skb(skb);
255 sch->qstats.drops++;
256 return NET_XMIT_DROP;
257 }
258 if (q->qave >= q->qth_max) {
259 q->qcount = -1;
260 sch->qstats.overlimits++;
261mark:
262 if (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) {
263 q->st.early++;
264 goto drop;
265 }
266 q->st.marked++;
267 goto enqueue;
268 }
269 89
270 if (++q->qcount) { 90 q->stats.forced_mark++;
271 /* The formula used below causes questions. 91 break;
272
273 OK. qR is random number in the interval 0..Rmask
274 i.e. 0..(2^Plog). If we used floating point
275 arithmetics, it would be: (2^Plog)*rnd_num,
276 where rnd_num is less 1.
277
278 Taking into account, that qave have fixed
279 point at Wlog, and Plog is related to max_P by
280 max_P = (qth_max-qth_min)/2^Plog; two lines
281 below have the following floating point equivalent:
282
283 max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount
284
285 Any questions? --ANK (980924)
286 */
287 if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
288 goto enqueue;
289 q->qcount = 0;
290 q->qR = net_random()&q->Rmask;
291 sch->qstats.overlimits++;
292 goto mark;
293 } 92 }
294 q->qR = net_random()&q->Rmask;
295 goto enqueue;
296 93
297drop: 94 if (sch->qstats.backlog + skb->len <= q->limit)
298 kfree_skb(skb); 95 return qdisc_enqueue_tail(skb, sch);
299 sch->qstats.drops++; 96
97 q->stats.pdrop++;
98 return qdisc_drop(skb, sch);
99
100congestion_drop:
101 qdisc_drop(skb, sch);
300 return NET_XMIT_CN; 102 return NET_XMIT_CN;
301} 103}
302 104
303static int 105static int red_requeue(struct sk_buff *skb, struct Qdisc* sch)
304red_requeue(struct sk_buff *skb, struct Qdisc* sch)
305{ 106{
306 struct red_sched_data *q = qdisc_priv(sch); 107 struct red_sched_data *q = qdisc_priv(sch);
307 108
308 PSCHED_SET_PASTPERFECT(q->qidlestart); 109 if (red_is_idling(&q->parms))
110 red_end_of_idle_period(&q->parms);
309 111
310 __skb_queue_head(&sch->q, skb); 112 return qdisc_requeue(skb, sch);
311 sch->qstats.backlog += skb->len;
312 sch->qstats.requeues++;
313 return 0;
314} 113}
315 114
316static struct sk_buff * 115static struct sk_buff * red_dequeue(struct Qdisc* sch)
317red_dequeue(struct Qdisc* sch)
318{ 116{
319 struct sk_buff *skb; 117 struct sk_buff *skb;
320 struct red_sched_data *q = qdisc_priv(sch); 118 struct red_sched_data *q = qdisc_priv(sch);
321 119
322 skb = __skb_dequeue(&sch->q); 120 skb = qdisc_dequeue_head(sch);
323 if (skb) { 121
324 sch->qstats.backlog -= skb->len; 122 if (skb == NULL && !red_is_idling(&q->parms))
325 return skb; 123 red_start_of_idle_period(&q->parms);
326 } 124
327 PSCHED_GET_TIME(q->qidlestart); 125 return skb;
328 return NULL;
329} 126}
330 127
331static unsigned int red_drop(struct Qdisc* sch) 128static unsigned int red_drop(struct Qdisc* sch)
@@ -333,16 +130,17 @@ static unsigned int red_drop(struct Qdisc* sch)
333 struct sk_buff *skb; 130 struct sk_buff *skb;
334 struct red_sched_data *q = qdisc_priv(sch); 131 struct red_sched_data *q = qdisc_priv(sch);
335 132
336 skb = __skb_dequeue_tail(&sch->q); 133 skb = qdisc_dequeue_tail(sch);
337 if (skb) { 134 if (skb) {
338 unsigned int len = skb->len; 135 unsigned int len = skb->len;
339 sch->qstats.backlog -= len; 136 q->stats.other++;
340 sch->qstats.drops++; 137 qdisc_drop(skb, sch);
341 q->st.other++;
342 kfree_skb(skb);
343 return len; 138 return len;
344 } 139 }
345 PSCHED_GET_TIME(q->qidlestart); 140
141 if (!red_is_idling(&q->parms))
142 red_start_of_idle_period(&q->parms);
143
346 return 0; 144 return 0;
347} 145}
348 146
@@ -350,43 +148,38 @@ static void red_reset(struct Qdisc* sch)
350{ 148{
351 struct red_sched_data *q = qdisc_priv(sch); 149 struct red_sched_data *q = qdisc_priv(sch);
352 150
353 __skb_queue_purge(&sch->q); 151 qdisc_reset_queue(sch);
354 sch->qstats.backlog = 0; 152 red_restart(&q->parms);
355 PSCHED_SET_PASTPERFECT(q->qidlestart);
356 q->qave = 0;
357 q->qcount = -1;
358} 153}
359 154
360static int red_change(struct Qdisc *sch, struct rtattr *opt) 155static int red_change(struct Qdisc *sch, struct rtattr *opt)
361{ 156{
362 struct red_sched_data *q = qdisc_priv(sch); 157 struct red_sched_data *q = qdisc_priv(sch);
363 struct rtattr *tb[TCA_RED_STAB]; 158 struct rtattr *tb[TCA_RED_MAX];
364 struct tc_red_qopt *ctl; 159 struct tc_red_qopt *ctl;
365 160
366 if (opt == NULL || 161 if (opt == NULL || rtattr_parse_nested(tb, TCA_RED_MAX, opt))
367 rtattr_parse_nested(tb, TCA_RED_STAB, opt) || 162 return -EINVAL;
368 tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 || 163
164 if (tb[TCA_RED_PARMS-1] == NULL ||
369 RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) || 165 RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) ||
370 RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256) 166 tb[TCA_RED_STAB-1] == NULL ||
167 RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < RED_STAB_SIZE)
371 return -EINVAL; 168 return -EINVAL;
372 169
373 ctl = RTA_DATA(tb[TCA_RED_PARMS-1]); 170 ctl = RTA_DATA(tb[TCA_RED_PARMS-1]);
374 171
375 sch_tree_lock(sch); 172 sch_tree_lock(sch);
376 q->flags = ctl->flags; 173 q->flags = ctl->flags;
377 q->Wlog = ctl->Wlog;
378 q->Plog = ctl->Plog;
379 q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
380 q->Scell_log = ctl->Scell_log;
381 q->Scell_max = (255<<q->Scell_log);
382 q->qth_min = ctl->qth_min<<ctl->Wlog;
383 q->qth_max = ctl->qth_max<<ctl->Wlog;
384 q->limit = ctl->limit; 174 q->limit = ctl->limit;
385 memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
386 175
387 q->qcount = -1; 176 red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
177 ctl->Plog, ctl->Scell_log,
178 RTA_DATA(tb[TCA_RED_STAB-1]));
179
388 if (skb_queue_empty(&sch->q)) 180 if (skb_queue_empty(&sch->q))
389 PSCHED_SET_PASTPERFECT(q->qidlestart); 181 red_end_of_idle_period(&q->parms);
182
390 sch_tree_unlock(sch); 183 sch_tree_unlock(sch);
391 return 0; 184 return 0;
392} 185}
@@ -399,39 +192,39 @@ static int red_init(struct Qdisc* sch, struct rtattr *opt)
399static int red_dump(struct Qdisc *sch, struct sk_buff *skb) 192static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
400{ 193{
401 struct red_sched_data *q = qdisc_priv(sch); 194 struct red_sched_data *q = qdisc_priv(sch);
402 unsigned char *b = skb->tail; 195 struct rtattr *opts = NULL;
403 struct rtattr *rta; 196 struct tc_red_qopt opt = {
404 struct tc_red_qopt opt; 197 .limit = q->limit,
405 198 .flags = q->flags,
406 rta = (struct rtattr*)b; 199 .qth_min = q->parms.qth_min >> q->parms.Wlog,
407 RTA_PUT(skb, TCA_OPTIONS, 0, NULL); 200 .qth_max = q->parms.qth_max >> q->parms.Wlog,
408 opt.limit = q->limit; 201 .Wlog = q->parms.Wlog,
409 opt.qth_min = q->qth_min>>q->Wlog; 202 .Plog = q->parms.Plog,
410 opt.qth_max = q->qth_max>>q->Wlog; 203 .Scell_log = q->parms.Scell_log,
411 opt.Wlog = q->Wlog; 204 };
412 opt.Plog = q->Plog; 205
413 opt.Scell_log = q->Scell_log; 206 opts = RTA_NEST(skb, TCA_OPTIONS);
414 opt.flags = q->flags;
415 RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); 207 RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
416 rta->rta_len = skb->tail - b; 208 return RTA_NEST_END(skb, opts);
417
418 return skb->len;
419 209
420rtattr_failure: 210rtattr_failure:
421 skb_trim(skb, b - skb->data); 211 return RTA_NEST_CANCEL(skb, opts);
422 return -1;
423} 212}
424 213
425static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 214static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
426{ 215{
427 struct red_sched_data *q = qdisc_priv(sch); 216 struct red_sched_data *q = qdisc_priv(sch);
428 217 struct tc_red_xstats st = {
429 return gnet_stats_copy_app(d, &q->st, sizeof(q->st)); 218 .early = q->stats.prob_drop + q->stats.forced_drop,
219 .pdrop = q->stats.pdrop,
220 .other = q->stats.other,
221 .marked = q->stats.prob_mark + q->stats.forced_mark,
222 };
223
224 return gnet_stats_copy_app(d, &st, sizeof(st));
430} 225}
431 226
432static struct Qdisc_ops red_qdisc_ops = { 227static struct Qdisc_ops red_qdisc_ops = {
433 .next = NULL,
434 .cl_ops = NULL,
435 .id = "red", 228 .id = "red",
436 .priv_size = sizeof(struct red_sched_data), 229 .priv_size = sizeof(struct red_sched_data),
437 .enqueue = red_enqueue, 230 .enqueue = red_enqueue,
@@ -450,10 +243,13 @@ static int __init red_module_init(void)
450{ 243{
451 return register_qdisc(&red_qdisc_ops); 244 return register_qdisc(&red_qdisc_ops);
452} 245}
453static void __exit red_module_exit(void) 246
247static void __exit red_module_exit(void)
454{ 248{
455 unregister_qdisc(&red_qdisc_ops); 249 unregister_qdisc(&red_qdisc_ops);
456} 250}
251
457module_init(red_module_init) 252module_init(red_module_init)
458module_exit(red_module_exit) 253module_exit(red_module_exit)
254
459MODULE_LICENSE("GPL"); 255MODULE_LICENSE("GPL");