Diffstat (limited to 'net')
-rw-r--r--   net/sched/sch_gred.c   841
-rw-r--r--   net/sched/sch_red.c    418
2 files changed, 518 insertions, 741 deletions
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 25c171c32715..29a2dd9f3029 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -15,247 +15,281 @@
15 * from Ren Liu 15 * from Ren Liu
16 * - More error checks 16 * - More error checks
17 * 17 *
18 * 18 * For all the glorious comments look at include/net/red.h
19 *
20 * For all the glorious comments look at Alexey's sch_red.c
21 */ 19 */
22 20
23#include <linux/config.h> 21#include <linux/config.h>
24#include <linux/module.h> 22#include <linux/module.h>
25#include <asm/uaccess.h>
26#include <asm/system.h>
27#include <linux/bitops.h>
28#include <linux/types.h> 23#include <linux/types.h>
29#include <linux/kernel.h> 24#include <linux/kernel.h>
30#include <linux/sched.h>
31#include <linux/string.h>
32#include <linux/mm.h>
33#include <linux/socket.h>
34#include <linux/sockios.h>
35#include <linux/in.h>
36#include <linux/errno.h>
37#include <linux/interrupt.h>
38#include <linux/if_ether.h>
39#include <linux/inet.h>
40#include <linux/netdevice.h> 25#include <linux/netdevice.h>
41#include <linux/etherdevice.h>
42#include <linux/notifier.h>
43#include <net/ip.h>
44#include <net/route.h>
45#include <linux/skbuff.h> 26#include <linux/skbuff.h>
46#include <net/sock.h>
47#include <net/pkt_sched.h> 27#include <net/pkt_sched.h>
28#include <net/red.h>
48 29
49#if 1 /* control */ 30#define GRED_DEF_PRIO (MAX_DPs / 2)
50#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) 31#define GRED_VQ_MASK (MAX_DPs - 1)
51#else
52#define DPRINTK(format,args...)
53#endif
54
55#if 0 /* data */
56#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
57#else
58#define D2PRINTK(format,args...)
59#endif
60 32
61struct gred_sched_data; 33struct gred_sched_data;
62struct gred_sched; 34struct gred_sched;
63 35
64struct gred_sched_data 36struct gred_sched_data
65{ 37{
66/* Parameters */
67 u32 limit; /* HARD maximal queue length */ 38 u32 limit; /* HARD maximal queue length */
68 u32 qth_min; /* Min average length threshold: A scaled */
69 u32 qth_max; /* Max average length threshold: A scaled */
70 u32 DP; /* the drop pramaters */ 39 u32 DP; /* the drop pramaters */
71 char Wlog; /* log(W) */
72 char Plog; /* random number bits */
73 u32 Scell_max;
74 u32 Rmask;
75 u32 bytesin; /* bytes seen on virtualQ so far*/ 40 u32 bytesin; /* bytes seen on virtualQ so far*/
76 u32 packetsin; /* packets seen on virtualQ so far*/ 41 u32 packetsin; /* packets seen on virtualQ so far*/
77 u32 backlog; /* bytes on the virtualQ */ 42 u32 backlog; /* bytes on the virtualQ */
78 u32 forced; /* packets dropped for exceeding limits */ 43 u8 prio; /* the prio of this vq */
79 u32 early; /* packets dropped as a warning */ 44
80 u32 other; /* packets dropped by invoking drop() */ 45 struct red_parms parms;
81 u32 pdrop; /* packets dropped because we exceeded physical queue limits */ 46 struct red_stats stats;
82 char Scell_log; 47};
83 u8 Stab[256]; 48
84 u8 prio; /* the prio of this vq */ 49enum {
85 50 GRED_WRED_MODE = 1,
86/* Variables */ 51 GRED_RIO_MODE,
87 unsigned long qave; /* Average queue length: A scaled */
88 int qcount; /* Packets since last random number generation */
89 u32 qR; /* Cached random number */
90
91 psched_time_t qidlestart; /* Start of idle period */
92}; 52};
93 53
94struct gred_sched 54struct gred_sched
95{ 55{
96 struct gred_sched_data *tab[MAX_DPs]; 56 struct gred_sched_data *tab[MAX_DPs];
97 u32 DPs; 57 unsigned long flags;
98 u32 def; 58 u32 red_flags;
99 u8 initd; 59 u32 DPs;
100 u8 grio; 60 u32 def;
101 u8 eqp; 61 struct red_parms wred_set;
102}; 62};
103 63
104static int 64static inline int gred_wred_mode(struct gred_sched *table)
105gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
106{ 65{
107 psched_time_t now; 66 return test_bit(GRED_WRED_MODE, &table->flags);
108 struct gred_sched_data *q=NULL; 67}
109 struct gred_sched *t= qdisc_priv(sch); 68
110 unsigned long qave=0; 69static inline void gred_enable_wred_mode(struct gred_sched *table)
111 int i=0; 70{
71 __set_bit(GRED_WRED_MODE, &table->flags);
72}
73
74static inline void gred_disable_wred_mode(struct gred_sched *table)
75{
76 __clear_bit(GRED_WRED_MODE, &table->flags);
77}
78
79static inline int gred_rio_mode(struct gred_sched *table)
80{
81 return test_bit(GRED_RIO_MODE, &table->flags);
82}
83
84static inline void gred_enable_rio_mode(struct gred_sched *table)
85{
86 __set_bit(GRED_RIO_MODE, &table->flags);
87}
88
89static inline void gred_disable_rio_mode(struct gred_sched *table)
90{
91 __clear_bit(GRED_RIO_MODE, &table->flags);
92}
93
94static inline int gred_wred_mode_check(struct Qdisc *sch)
95{
96 struct gred_sched *table = qdisc_priv(sch);
97 int i;
112 98
113 if (!t->initd && skb_queue_len(&sch->q) < (sch->dev->tx_queue_len ? : 1)) { 99 /* Really ugly O(n^2) but shouldn't be necessary too frequent. */
114 D2PRINTK("NO GRED Queues setup yet! Enqueued anyway\n"); 100 for (i = 0; i < table->DPs; i++) {
115 goto do_enqueue; 101 struct gred_sched_data *q = table->tab[i];
102 int n;
103
104 if (q == NULL)
105 continue;
106
107 for (n = 0; n < table->DPs; n++)
108 if (table->tab[n] && table->tab[n] != q &&
109 table->tab[n]->prio == q->prio)
110 return 1;
116 } 111 }
117 112
113 return 0;
114}
115
116static inline unsigned int gred_backlog(struct gred_sched *table,
117 struct gred_sched_data *q,
118 struct Qdisc *sch)
119{
120 if (gred_wred_mode(table))
121 return sch->qstats.backlog;
122 else
123 return q->backlog;
124}
125
126static inline u16 tc_index_to_dp(struct sk_buff *skb)
127{
128 return skb->tc_index & GRED_VQ_MASK;
129}
130
131static inline void gred_load_wred_set(struct gred_sched *table,
132 struct gred_sched_data *q)
133{
134 q->parms.qavg = table->wred_set.qavg;
135 q->parms.qidlestart = table->wred_set.qidlestart;
136}
137
138static inline void gred_store_wred_set(struct gred_sched *table,
139 struct gred_sched_data *q)
140{
141 table->wred_set.qavg = q->parms.qavg;
142}
143
144static inline int gred_use_ecn(struct gred_sched *t)
145{
146 return t->red_flags & TC_RED_ECN;
147}
118 148
119 if ( ((skb->tc_index&0xf) > (t->DPs -1)) || !(q=t->tab[skb->tc_index&0xf])) { 149static inline int gred_use_harddrop(struct gred_sched *t)
120 printk("GRED: setting to default (%d)\n ",t->def); 150{
121 if (!(q=t->tab[t->def])) { 151 return t->red_flags & TC_RED_HARDDROP;
122 DPRINTK("GRED: setting to default FAILED! dropping!! " 152}
123 "(%d)\n ", t->def); 153
124 goto drop; 154static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
155{
156 struct gred_sched_data *q=NULL;
157 struct gred_sched *t= qdisc_priv(sch);
158 unsigned long qavg = 0;
159 u16 dp = tc_index_to_dp(skb);
160
161 if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
162 dp = t->def;
163
164 if ((q = t->tab[dp]) == NULL) {
165 /* Pass through packets not assigned to a DP
166 * if no default DP has been configured. This
167 * allows for DP flows to be left untouched.
168 */
169 if (skb_queue_len(&sch->q) < sch->dev->tx_queue_len)
170 return qdisc_enqueue_tail(skb, sch);
171 else
172 goto drop;
125 } 173 }
174
126 /* fix tc_index? --could be controvesial but needed for 175 /* fix tc_index? --could be controvesial but needed for
127 requeueing */ 176 requeueing */
128 skb->tc_index=(skb->tc_index&0xfffffff0) | t->def; 177 skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp;
129 } 178 }
130 179
131 D2PRINTK("gred_enqueue virtualQ 0x%x classid %x backlog %d " 180 /* sum up all the qaves of prios <= to ours to get the new qave */
132 "general backlog %d\n",skb->tc_index&0xf,sch->handle,q->backlog, 181 if (!gred_wred_mode(t) && gred_rio_mode(t)) {
133 sch->qstats.backlog); 182 int i;
134 /* sum up all the qaves of prios <= to ours to get the new qave*/ 183
135 if (!t->eqp && t->grio) { 184 for (i = 0; i < t->DPs; i++) {
136 for (i=0;i<t->DPs;i++) { 185 if (t->tab[i] && t->tab[i]->prio < q->prio &&
137 if ((!t->tab[i]) || (i==q->DP)) 186 !red_is_idling(&t->tab[i]->parms))
138 continue; 187 qavg +=t->tab[i]->parms.qavg;
139
140 if ((t->tab[i]->prio < q->prio) && (PSCHED_IS_PASTPERFECT(t->tab[i]->qidlestart)))
141 qave +=t->tab[i]->qave;
142 } 188 }
143 189
144 } 190 }
145 191
146 q->packetsin++; 192 q->packetsin++;
147 q->bytesin+=skb->len; 193 q->bytesin += skb->len;
148 194
149 if (t->eqp && t->grio) { 195 if (gred_wred_mode(t))
150 qave=0; 196 gred_load_wred_set(t, q);
151 q->qave=t->tab[t->def]->qave;
152 q->qidlestart=t->tab[t->def]->qidlestart;
153 }
154 197
155 if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { 198 q->parms.qavg = red_calc_qavg(&q->parms, gred_backlog(t, q, sch));
156 long us_idle;
157 PSCHED_GET_TIME(now);
158 us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
159 PSCHED_SET_PASTPERFECT(q->qidlestart);
160 199
161 q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF]; 200 if (red_is_idling(&q->parms))
162 } else { 201 red_end_of_idle_period(&q->parms);
163 if (t->eqp) {
164 q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
165 } else {
166 q->qave += q->backlog - (q->qave >> q->Wlog);
167 }
168 202
169 } 203 if (gred_wred_mode(t))
170 204 gred_store_wred_set(t, q);
171
172 if (t->eqp && t->grio)
173 t->tab[t->def]->qave=q->qave;
174
175 if ((q->qave+qave) < q->qth_min) {
176 q->qcount = -1;
177enqueue:
178 if (q->backlog + skb->len <= q->limit) {
179 q->backlog += skb->len;
180do_enqueue:
181 __skb_queue_tail(&sch->q, skb);
182 sch->qstats.backlog += skb->len;
183 sch->bstats.bytes += skb->len;
184 sch->bstats.packets++;
185 return 0;
186 } else {
187 q->pdrop++;
188 }
189 205
190drop: 206 switch (red_action(&q->parms, q->parms.qavg + qavg)) {
191 kfree_skb(skb); 207 case RED_DONT_MARK:
192 sch->qstats.drops++; 208 break;
193 return NET_XMIT_DROP; 209
194 } 210 case RED_PROB_MARK:
195 if ((q->qave+qave) >= q->qth_max) { 211 sch->qstats.overlimits++;
196 q->qcount = -1; 212 if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) {
197 sch->qstats.overlimits++; 213 q->stats.prob_drop++;
198 q->forced++; 214 goto congestion_drop;
199 goto drop; 215 }
216
217 q->stats.prob_mark++;
218 break;
219
220 case RED_HARD_MARK:
221 sch->qstats.overlimits++;
222 if (gred_use_harddrop(t) || !gred_use_ecn(t) ||
223 !INET_ECN_set_ce(skb)) {
224 q->stats.forced_drop++;
225 goto congestion_drop;
226 }
227 q->stats.forced_mark++;
228 break;
200 } 229 }
201 if (++q->qcount) { 230
202 if ((((qave+q->qave) - q->qth_min)>>q->Wlog)*q->qcount < q->qR) 231 if (q->backlog + skb->len <= q->limit) {
203 goto enqueue; 232 q->backlog += skb->len;
204 q->qcount = 0; 233 return qdisc_enqueue_tail(skb, sch);
205 q->qR = net_random()&q->Rmask;
206 sch->qstats.overlimits++;
207 q->early++;
208 goto drop;
209 } 234 }
210 q->qR = net_random()&q->Rmask; 235
211 goto enqueue; 236 q->stats.pdrop++;
237drop:
238 return qdisc_drop(skb, sch);
239
240congestion_drop:
241 qdisc_drop(skb, sch);
242 return NET_XMIT_CN;
212} 243}
213 244
214static int 245static int gred_requeue(struct sk_buff *skb, struct Qdisc* sch)
215gred_requeue(struct sk_buff *skb, struct Qdisc* sch)
216{ 246{
247 struct gred_sched *t = qdisc_priv(sch);
217 struct gred_sched_data *q; 248 struct gred_sched_data *q;
218 struct gred_sched *t= qdisc_priv(sch); 249 u16 dp = tc_index_to_dp(skb);
219 q= t->tab[(skb->tc_index&0xf)]; 250
220/* error checking here -- probably unnecessary */ 251 if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
221 PSCHED_SET_PASTPERFECT(q->qidlestart); 252 if (net_ratelimit())
222 253 printk(KERN_WARNING "GRED: Unable to relocate VQ 0x%x "
223 __skb_queue_head(&sch->q, skb); 254 "for requeue, screwing up backlog.\n",
224 sch->qstats.backlog += skb->len; 255 tc_index_to_dp(skb));
225 sch->qstats.requeues++; 256 } else {
226 q->backlog += skb->len; 257 if (red_is_idling(&q->parms))
227 return 0; 258 red_end_of_idle_period(&q->parms);
259 q->backlog += skb->len;
260 }
261
262 return qdisc_requeue(skb, sch);
228} 263}
229 264
230static struct sk_buff * 265static struct sk_buff *gred_dequeue(struct Qdisc* sch)
231gred_dequeue(struct Qdisc* sch)
232{ 266{
233 struct sk_buff *skb; 267 struct sk_buff *skb;
234 struct gred_sched_data *q; 268 struct gred_sched *t = qdisc_priv(sch);
235 struct gred_sched *t= qdisc_priv(sch); 269
270 skb = qdisc_dequeue_head(sch);
236 271
237 skb = __skb_dequeue(&sch->q);
238 if (skb) { 272 if (skb) {
239 sch->qstats.backlog -= skb->len; 273 struct gred_sched_data *q;
240 q= t->tab[(skb->tc_index&0xf)]; 274 u16 dp = tc_index_to_dp(skb);
241 if (q) { 275
242 q->backlog -= skb->len; 276 if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
243 if (!q->backlog && !t->eqp) 277 if (net_ratelimit())
244 PSCHED_GET_TIME(q->qidlestart); 278 printk(KERN_WARNING "GRED: Unable to relocate "
279 "VQ 0x%x after dequeue, screwing up "
280 "backlog.\n", tc_index_to_dp(skb));
245 } else { 281 } else {
246 D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); 282 q->backlog -= skb->len;
283
284 if (!q->backlog && !gred_wred_mode(t))
285 red_start_of_idle_period(&q->parms);
247 } 286 }
287
248 return skb; 288 return skb;
249 } 289 }
250 290
251 if (t->eqp) { 291 if (gred_wred_mode(t) && !red_is_idling(&t->wred_set))
252 q= t->tab[t->def]; 292 red_start_of_idle_period(&t->wred_set);
253 if (!q)
254 D2PRINTK("no default VQ set: Results will be "
255 "screwed up\n");
256 else
257 PSCHED_GET_TIME(q->qidlestart);
258 }
259 293
260 return NULL; 294 return NULL;
261} 295}
@@ -263,36 +297,34 @@ gred_dequeue(struct Qdisc* sch)
263static unsigned int gred_drop(struct Qdisc* sch) 297static unsigned int gred_drop(struct Qdisc* sch)
264{ 298{
265 struct sk_buff *skb; 299 struct sk_buff *skb;
300 struct gred_sched *t = qdisc_priv(sch);
266 301
267 struct gred_sched_data *q; 302 skb = qdisc_dequeue_tail(sch);
268 struct gred_sched *t= qdisc_priv(sch);
269
270 skb = __skb_dequeue_tail(&sch->q);
271 if (skb) { 303 if (skb) {
272 unsigned int len = skb->len; 304 unsigned int len = skb->len;
273 sch->qstats.backlog -= len; 305 struct gred_sched_data *q;
274 sch->qstats.drops++; 306 u16 dp = tc_index_to_dp(skb);
275 q= t->tab[(skb->tc_index&0xf)]; 307
276 if (q) { 308 if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
277 q->backlog -= len; 309 if (net_ratelimit())
278 q->other++; 310 printk(KERN_WARNING "GRED: Unable to relocate "
279 if (!q->backlog && !t->eqp) 311 "VQ 0x%x while dropping, screwing up "
280 PSCHED_GET_TIME(q->qidlestart); 312 "backlog.\n", tc_index_to_dp(skb));
281 } else { 313 } else {
282 D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); 314 q->backlog -= len;
315 q->stats.other++;
316
317 if (!q->backlog && !gred_wred_mode(t))
318 red_start_of_idle_period(&q->parms);
283 } 319 }
284 320
285 kfree_skb(skb); 321 qdisc_drop(skb, sch);
286 return len; 322 return len;
287 } 323 }
288 324
289 q=t->tab[t->def]; 325 if (gred_wred_mode(t) && !red_is_idling(&t->wred_set))
290 if (!q) { 326 red_start_of_idle_period(&t->wred_set);
291 D2PRINTK("no default VQ set: Results might be screwed up\n");
292 return 0;
293 }
294 327
295 PSCHED_GET_TIME(q->qidlestart);
296 return 0; 328 return 0;
297 329
298} 330}
@@ -300,293 +332,241 @@ static unsigned int gred_drop(struct Qdisc* sch)
300static void gred_reset(struct Qdisc* sch) 332static void gred_reset(struct Qdisc* sch)
301{ 333{
302 int i; 334 int i;
303 struct gred_sched_data *q; 335 struct gred_sched *t = qdisc_priv(sch);
304 struct gred_sched *t= qdisc_priv(sch); 336
337 qdisc_reset_queue(sch);
305 338
306 __skb_queue_purge(&sch->q); 339 for (i = 0; i < t->DPs; i++) {
340 struct gred_sched_data *q = t->tab[i];
307 341
308 sch->qstats.backlog = 0; 342 if (!q)
343 continue;
309 344
310 for (i=0;i<t->DPs;i++) { 345 red_restart(&q->parms);
311 q= t->tab[i];
312 if (!q)
313 continue;
314 PSCHED_SET_PASTPERFECT(q->qidlestart);
315 q->qave = 0;
316 q->qcount = -1;
317 q->backlog = 0; 346 q->backlog = 0;
318 q->other=0;
319 q->forced=0;
320 q->pdrop=0;
321 q->early=0;
322 } 347 }
323} 348}
324 349
325static int gred_change(struct Qdisc *sch, struct rtattr *opt) 350static inline void gred_destroy_vq(struct gred_sched_data *q)
351{
352 kfree(q);
353}
354
355static inline int gred_change_table_def(struct Qdisc *sch, struct rtattr *dps)
326{ 356{
327 struct gred_sched *table = qdisc_priv(sch); 357 struct gred_sched *table = qdisc_priv(sch);
328 struct gred_sched_data *q;
329 struct tc_gred_qopt *ctl;
330 struct tc_gred_sopt *sopt; 358 struct tc_gred_sopt *sopt;
331 struct rtattr *tb[TCA_GRED_STAB];
332 struct rtattr *tb2[TCA_GRED_DPS];
333 int i; 359 int i;
334 360
335 if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt)) 361 if (dps == NULL || RTA_PAYLOAD(dps) < sizeof(*sopt))
336 return -EINVAL; 362 return -EINVAL;
337 363
338 if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) { 364 sopt = RTA_DATA(dps);
339 rtattr_parse_nested(tb2, TCA_GRED_DPS, opt); 365
366 if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs)
367 return -EINVAL;
340 368
341 if (tb2[TCA_GRED_DPS-1] == 0) 369 sch_tree_lock(sch);
342 return -EINVAL; 370 table->DPs = sopt->DPs;
371 table->def = sopt->def_DP;
372 table->red_flags = sopt->flags;
373
374 /*
375 * Every entry point to GRED is synchronized with the above code
376 * and the DP is checked against DPs, i.e. shadowed VQs can no
377 * longer be found so we can unlock right here.
378 */
379 sch_tree_unlock(sch);
380
381 if (sopt->grio) {
382 gred_enable_rio_mode(table);
383 gred_disable_wred_mode(table);
384 if (gred_wred_mode_check(sch))
385 gred_enable_wred_mode(table);
386 } else {
387 gred_disable_rio_mode(table);
388 gred_disable_wred_mode(table);
389 }
343 390
344 sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]); 391 for (i = table->DPs; i < MAX_DPs; i++) {
345 table->DPs=sopt->DPs; 392 if (table->tab[i]) {
346 table->def=sopt->def_DP; 393 printk(KERN_WARNING "GRED: Warning: Destroying "
347 table->grio=sopt->grio; 394 "shadowed VQ 0x%x\n", i);
348 table->initd=0; 395 gred_destroy_vq(table->tab[i]);
349 /* probably need to clear all the table DP entries as well */ 396 table->tab[i] = NULL;
350 return 0; 397 }
351 } 398 }
352 399
400 return 0;
401}
353 402
354 if (!table->DPs || tb[TCA_GRED_PARMS-1] == 0 || tb[TCA_GRED_STAB-1] == 0 || 403static inline int gred_change_vq(struct Qdisc *sch, int dp,
355 RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) || 404 struct tc_gred_qopt *ctl, int prio, u8 *stab)
356 RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256) 405{
357 return -EINVAL; 406 struct gred_sched *table = qdisc_priv(sch);
407 struct gred_sched_data *q;
358 408
359 ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]); 409 if (table->tab[dp] == NULL) {
360 if (ctl->DP > MAX_DPs-1 ) { 410 table->tab[dp] = kmalloc(sizeof(*q), GFP_KERNEL);
361 /* misbehaving is punished! Put in the default drop probability */ 411 if (table->tab[dp] == NULL)
362 DPRINTK("\nGRED: DP %u not in the proper range fixed. New DP "
363 "set to default at %d\n",ctl->DP,table->def);
364 ctl->DP=table->def;
365 }
366
367 if (table->tab[ctl->DP] == NULL) {
368 table->tab[ctl->DP]=kmalloc(sizeof(struct gred_sched_data),
369 GFP_KERNEL);
370 if (NULL == table->tab[ctl->DP])
371 return -ENOMEM; 412 return -ENOMEM;
372 memset(table->tab[ctl->DP], 0, (sizeof(struct gred_sched_data))); 413 memset(table->tab[dp], 0, sizeof(*q));
373 }
374 q= table->tab[ctl->DP];
375
376 if (table->grio) {
377 if (ctl->prio <=0) {
378 if (table->def && table->tab[table->def]) {
379 DPRINTK("\nGRED: DP %u does not have a prio"
380 "setting default to %d\n",ctl->DP,
381 table->tab[table->def]->prio);
382 q->prio=table->tab[table->def]->prio;
383 } else {
384 DPRINTK("\nGRED: DP %u does not have a prio"
385 " setting default to 8\n",ctl->DP);
386 q->prio=8;
387 }
388 } else {
389 q->prio=ctl->prio;
390 }
391 } else {
392 q->prio=8;
393 } 414 }
394 415
395 416 q = table->tab[dp];
396 q->DP=ctl->DP; 417 q->DP = dp;
397 q->Wlog = ctl->Wlog; 418 q->prio = prio;
398 q->Plog = ctl->Plog;
399 q->limit = ctl->limit; 419 q->limit = ctl->limit;
400 q->Scell_log = ctl->Scell_log;
401 q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
402 q->Scell_max = (255<<q->Scell_log);
403 q->qth_min = ctl->qth_min<<ctl->Wlog;
404 q->qth_max = ctl->qth_max<<ctl->Wlog;
405 q->qave=0;
406 q->backlog=0;
407 q->qcount = -1;
408 q->other=0;
409 q->forced=0;
410 q->pdrop=0;
411 q->early=0;
412
413 PSCHED_SET_PASTPERFECT(q->qidlestart);
414 memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256);
415
416 if ( table->initd && table->grio) {
417 /* this looks ugly but it's not in the fast path */
418 for (i=0;i<table->DPs;i++) {
419 if ((!table->tab[i]) || (i==q->DP) )
420 continue;
421 if (table->tab[i]->prio == q->prio ){
422 /* WRED mode detected */
423 table->eqp=1;
424 break;
425 }
426 }
427 }
428 420
429 if (!table->initd) { 421 if (q->backlog == 0)
430 table->initd=1; 422 red_end_of_idle_period(&q->parms);
431 /*
432 the first entry also goes into the default until
433 over-written
434 */
435
436 if (table->tab[table->def] == NULL) {
437 table->tab[table->def]=
438 kmalloc(sizeof(struct gred_sched_data), GFP_KERNEL);
439 if (NULL == table->tab[table->def])
440 return -ENOMEM;
441
442 memset(table->tab[table->def], 0,
443 (sizeof(struct gred_sched_data)));
444 }
445 q= table->tab[table->def];
446 q->DP=table->def;
447 q->Wlog = ctl->Wlog;
448 q->Plog = ctl->Plog;
449 q->limit = ctl->limit;
450 q->Scell_log = ctl->Scell_log;
451 q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
452 q->Scell_max = (255<<q->Scell_log);
453 q->qth_min = ctl->qth_min<<ctl->Wlog;
454 q->qth_max = ctl->qth_max<<ctl->Wlog;
455
456 if (table->grio)
457 q->prio=table->tab[ctl->DP]->prio;
458 else
459 q->prio=8;
460
461 q->qcount = -1;
462 PSCHED_SET_PASTPERFECT(q->qidlestart);
463 memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256);
464 }
465 return 0;
466 423
424 red_set_parms(&q->parms,
425 ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog,
426 ctl->Scell_log, stab);
427
428 return 0;
467} 429}
468 430
469static int gred_init(struct Qdisc *sch, struct rtattr *opt) 431static int gred_change(struct Qdisc *sch, struct rtattr *opt)
470{ 432{
471 struct gred_sched *table = qdisc_priv(sch); 433 struct gred_sched *table = qdisc_priv(sch);
472 struct tc_gred_sopt *sopt; 434 struct tc_gred_qopt *ctl;
473 struct rtattr *tb[TCA_GRED_STAB]; 435 struct rtattr *tb[TCA_GRED_MAX];
474 struct rtattr *tb2[TCA_GRED_DPS]; 436 int err = -EINVAL, prio = GRED_DEF_PRIO;
437 u8 *stab;
475 438
476 if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt)) 439 if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt))
477 return -EINVAL; 440 return -EINVAL;
478 441
479 if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) { 442 if (tb[TCA_GRED_PARMS-1] == NULL && tb[TCA_GRED_STAB-1] == NULL)
480 rtattr_parse_nested(tb2, TCA_GRED_DPS, opt); 443 return gred_change_table_def(sch, opt);
444
445 if (tb[TCA_GRED_PARMS-1] == NULL ||
446 RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) ||
447 tb[TCA_GRED_STAB-1] == NULL ||
448 RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256)
449 return -EINVAL;
450
451 ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]);
452 stab = RTA_DATA(tb[TCA_GRED_STAB-1]);
453
454 if (ctl->DP >= table->DPs)
455 goto errout;
481 456
482 if (tb2[TCA_GRED_DPS-1] == 0) 457 if (gred_rio_mode(table)) {
483 return -EINVAL; 458 if (ctl->prio == 0) {
459 int def_prio = GRED_DEF_PRIO;
484 460
485 sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]); 461 if (table->tab[table->def])
486 table->DPs=sopt->DPs; 462 def_prio = table->tab[table->def]->prio;
487 table->def=sopt->def_DP; 463
488 table->grio=sopt->grio; 464 printk(KERN_DEBUG "GRED: DP %u does not have a prio "
489 table->initd=0; 465 "setting default to %d\n", ctl->DP, def_prio);
490 return 0; 466
467 prio = def_prio;
468 } else
469 prio = ctl->prio;
470 }
471
472 sch_tree_lock(sch);
473
474 err = gred_change_vq(sch, ctl->DP, ctl, prio, stab);
475 if (err < 0)
476 goto errout_locked;
477
478 if (gred_rio_mode(table)) {
479 gred_disable_wred_mode(table);
480 if (gred_wred_mode_check(sch))
481 gred_enable_wred_mode(table);
491 } 482 }
492 483
493 DPRINTK("\n GRED_INIT error!\n"); 484 err = 0;
494 return -EINVAL; 485
486errout_locked:
487 sch_tree_unlock(sch);
488errout:
489 return err;
495} 490}
496 491
497static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) 492static int gred_init(struct Qdisc *sch, struct rtattr *opt)
498{ 493{
499 unsigned long qave; 494 struct rtattr *tb[TCA_GRED_MAX];
500 struct rtattr *rta;
501 struct tc_gred_qopt *opt = NULL ;
502 struct tc_gred_qopt *dst;
503 struct gred_sched *table = qdisc_priv(sch);
504 struct gred_sched_data *q;
505 int i;
506 unsigned char *b = skb->tail;
507 495
508 rta = (struct rtattr*)b; 496 if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt))
509 RTA_PUT(skb, TCA_OPTIONS, 0, NULL); 497 return -EINVAL;
510 498
511 opt=kmalloc(sizeof(struct tc_gred_qopt)*MAX_DPs, GFP_KERNEL); 499 if (tb[TCA_GRED_PARMS-1] || tb[TCA_GRED_STAB-1])
500 return -EINVAL;
512 501
513 if (opt == NULL) { 502 return gred_change_table_def(sch, tb[TCA_GRED_DPS-1]);
514 DPRINTK("gred_dump:failed to malloc for %Zd\n", 503}
515 sizeof(struct tc_gred_qopt)*MAX_DPs);
516 goto rtattr_failure;
517 }
518 504
519 memset(opt, 0, (sizeof(struct tc_gred_qopt))*table->DPs); 505static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
506{
507 struct gred_sched *table = qdisc_priv(sch);
508 struct rtattr *parms, *opts = NULL;
509 int i;
510 struct tc_gred_sopt sopt = {
511 .DPs = table->DPs,
512 .def_DP = table->def,
513 .grio = gred_rio_mode(table),
514 .flags = table->red_flags,
515 };
520 516
521 if (!table->initd) { 517 opts = RTA_NEST(skb, TCA_OPTIONS);
522 DPRINTK("NO GRED Queues setup!\n"); 518 RTA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt);
523 } 519 parms = RTA_NEST(skb, TCA_GRED_PARMS);
520
521 for (i = 0; i < MAX_DPs; i++) {
522 struct gred_sched_data *q = table->tab[i];
523 struct tc_gred_qopt opt;
524 524
525 for (i=0;i<MAX_DPs;i++) { 525 memset(&opt, 0, sizeof(opt));
526 dst= &opt[i];
527 q= table->tab[i];
528 526
529 if (!q) { 527 if (!q) {
530 /* hack -- fix at some point with proper message 528 /* hack -- fix at some point with proper message
531 This is how we indicate to tc that there is no VQ 529 This is how we indicate to tc that there is no VQ
532 at this DP */ 530 at this DP */
533 531
534 dst->DP=MAX_DPs+i; 532 opt.DP = MAX_DPs + i;
535 continue; 533 goto append_opt;
536 } 534 }
537 535
538 dst->limit=q->limit; 536 opt.limit = q->limit;
539 dst->qth_min=q->qth_min>>q->Wlog; 537 opt.DP = q->DP;
540 dst->qth_max=q->qth_max>>q->Wlog; 538 opt.backlog = q->backlog;
541 dst->DP=q->DP; 539 opt.prio = q->prio;
542 dst->backlog=q->backlog; 540 opt.qth_min = q->parms.qth_min >> q->parms.Wlog;
543 if (q->qave) { 541 opt.qth_max = q->parms.qth_max >> q->parms.Wlog;
544 if (table->eqp && table->grio) { 542 opt.Wlog = q->parms.Wlog;
545 q->qidlestart=table->tab[table->def]->qidlestart; 543 opt.Plog = q->parms.Plog;
546 q->qave=table->tab[table->def]->qave; 544 opt.Scell_log = q->parms.Scell_log;
547 } 545 opt.other = q->stats.other;
548 if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { 546 opt.early = q->stats.prob_drop;
549 long idle; 547 opt.forced = q->stats.forced_drop;
550 psched_time_t now; 548 opt.pdrop = q->stats.pdrop;
551 PSCHED_GET_TIME(now); 549 opt.packets = q->packetsin;
552 idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); 550 opt.bytesin = q->bytesin;
553 qave = q->qave >> q->Stab[(idle>>q->Scell_log)&0xFF]; 551
554 dst->qave = qave >> q->Wlog; 552 if (gred_wred_mode(table)) {
555 553 q->parms.qidlestart =
556 } else { 554 table->tab[table->def]->parms.qidlestart;
557 dst->qave = q->qave >> q->Wlog; 555 q->parms.qavg = table->tab[table->def]->parms.qavg;
558 }
559 } else {
560 dst->qave = 0;
561 } 556 }
562 557
563 558 opt.qave = red_calc_qavg(&q->parms, q->parms.qavg);
564 dst->Wlog = q->Wlog; 559
565 dst->Plog = q->Plog; 560append_opt:
566 dst->Scell_log = q->Scell_log; 561 RTA_APPEND(skb, sizeof(opt), &opt);
567 dst->other = q->other;
568 dst->forced = q->forced;
569 dst->early = q->early;
570 dst->pdrop = q->pdrop;
571 dst->prio = q->prio;
572 dst->packets=q->packetsin;
573 dst->bytesin=q->bytesin;
574 } 562 }
575 563
576 RTA_PUT(skb, TCA_GRED_PARMS, sizeof(struct tc_gred_qopt)*MAX_DPs, opt); 564 RTA_NEST_END(skb, parms);
577 rta->rta_len = skb->tail - b;
578 565
579 kfree(opt); 566 return RTA_NEST_END(skb, opts);
580 return skb->len;
581 567
582rtattr_failure: 568rtattr_failure:
583 if (opt) 569 return RTA_NEST_CANCEL(skb, opts);
584 kfree(opt);
585 DPRINTK("gred_dump: FAILURE!!!!\n");
586
587/* also free the opt struct here */
588 skb_trim(skb, b - skb->data);
589 return -1;
590} 570}
591 571
592static void gred_destroy(struct Qdisc *sch) 572static void gred_destroy(struct Qdisc *sch)
@@ -594,15 +574,13 @@ static void gred_destroy(struct Qdisc *sch)
594 struct gred_sched *table = qdisc_priv(sch); 574 struct gred_sched *table = qdisc_priv(sch);
595 int i; 575 int i;
596 576
597 for (i = 0;i < table->DPs; i++) { 577 for (i = 0; i < table->DPs; i++) {
598 if (table->tab[i]) 578 if (table->tab[i])
599 kfree(table->tab[i]); 579 gred_destroy_vq(table->tab[i]);
600 } 580 }
601} 581}
602 582
603static struct Qdisc_ops gred_qdisc_ops = { 583static struct Qdisc_ops gred_qdisc_ops = {
604 .next = NULL,
605 .cl_ops = NULL,
606 .id = "gred", 584 .id = "gred",
607 .priv_size = sizeof(struct gred_sched), 585 .priv_size = sizeof(struct gred_sched),
608 .enqueue = gred_enqueue, 586 .enqueue = gred_enqueue,
@@ -621,10 +599,13 @@ static int __init gred_module_init(void)
621{ 599{
622 return register_qdisc(&gred_qdisc_ops); 600 return register_qdisc(&gred_qdisc_ops);
623} 601}
624static void __exit gred_module_exit(void) 602
603static void __exit gred_module_exit(void)
625{ 604{
626 unregister_qdisc(&gred_qdisc_ops); 605 unregister_qdisc(&gred_qdisc_ops);
627} 606}
607
628module_init(gred_module_init) 608module_init(gred_module_init)
629module_exit(gred_module_exit) 609module_exit(gred_module_exit)
610
630MODULE_LICENSE("GPL"); 611MODULE_LICENSE("GPL");
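
Both rewritten enqueue paths, gred_enqueue() above and red_enqueue() in sch_red.c below, now share one decision flow from include/net/red.h: recompute the averaged backlog with red_calc_qavg(), then let red_action() return RED_DONT_MARK, RED_PROB_MARK or RED_HARD_MARK, marking with ECN where the flags allow it and dropping otherwise. The fragment below is a minimal user-space sketch of that flow; the red_* helpers here are simplified stand-ins written for illustration (they mirror the call shapes visible in this diff, not the kernel implementations), and the threshold values are arbitrary.

    #include <stdio.h>

    /* Simplified stand-ins for the verdicts and state used by the new code
     * paths; the real definitions live in include/net/red.h. */
    enum { RED_DONT_MARK, RED_PROB_MARK, RED_HARD_MARK };

    struct red_parms {
        unsigned long qth_min;  /* min threshold, scaled by 2^Wlog */
        unsigned long qth_max;  /* max threshold, scaled by 2^Wlog */
        unsigned char Wlog;     /* log2(1/W), the EWMA weight */
        unsigned long qavg;     /* average backlog, fixed point at Wlog */
    };

    /* Scaled EWMA: with avg = qavg >> Wlog this computes
     * avg <- avg*(1 - W) + backlog*W, W = 2^-Wlog, using shifts only. */
    static unsigned long red_calc_qavg(const struct red_parms *p,
                                       unsigned long backlog)
    {
        return p->qavg + backlog - (p->qavg >> p->Wlog);
    }

    /* Map the averaged backlog onto the three verdicts used by the enqueue
     * paths. The kernel draws a random number inside the probabilistic
     * region; this sketch just reports that the packet landed in it. */
    static int red_action(const struct red_parms *p, unsigned long qavg)
    {
        if (qavg < p->qth_min)
            return RED_DONT_MARK;
        if (qavg >= p->qth_max)
            return RED_HARD_MARK;
        return RED_PROB_MARK;
    }

    int main(void)
    {
        struct red_parms p = {
            .qth_min = 30 << 2,  /* thresholds of 30 and 90 "bytes" */
            .qth_max = 90 << 2,
            .Wlog    = 2,        /* W = 1/4 so the average moves visibly */
            .qavg    = 0,
        };
        unsigned long backlog;

        for (backlog = 0; backlog <= 120; backlog += 10) {
            p.qavg = red_calc_qavg(&p, backlog);
            printf("backlog %3lu -> avg %3lu -> verdict %d\n",
                   backlog, p.qavg >> p.Wlog, red_action(&p, p.qavg));
        }
        return 0;
    }

With Wlog = 2 the EWMA weight W is 1/4, so the printed average lags the instantaneous backlog, and the verdict walks from RED_DONT_MARK through RED_PROB_MARK to RED_HARD_MARK as the sweep crosses the two thresholds.
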
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 7845d045eec4..dccfa44c2d71 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -9,76 +9,23 @@
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 * 10 *
11 * Changes: 11 * Changes:
12 * J Hadi Salim <hadi@nortel.com> 980914: computation fixes 12 * J Hadi Salim 980914: computation fixes
13 * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly. 13 * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
14 * J Hadi Salim <hadi@nortelnetworks.com> 980816: ECN support 14 * J Hadi Salim 980816: ECN support
15 */ 15 */
16 16
17#include <linux/config.h> 17#include <linux/config.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <asm/uaccess.h>
20#include <asm/system.h>
21#include <linux/bitops.h>
22#include <linux/types.h> 19#include <linux/types.h>
23#include <linux/kernel.h> 20#include <linux/kernel.h>
24#include <linux/sched.h>
25#include <linux/string.h>
26#include <linux/mm.h>
27#include <linux/socket.h>
28#include <linux/sockios.h>
29#include <linux/in.h>
30#include <linux/errno.h>
31#include <linux/interrupt.h>
32#include <linux/if_ether.h>
33#include <linux/inet.h>
34#include <linux/netdevice.h> 21#include <linux/netdevice.h>
35#include <linux/etherdevice.h>
36#include <linux/notifier.h>
37#include <net/ip.h>
38#include <net/route.h>
39#include <linux/skbuff.h> 22#include <linux/skbuff.h>
40#include <net/sock.h>
41#include <net/pkt_sched.h> 23#include <net/pkt_sched.h>
42#include <net/inet_ecn.h> 24#include <net/inet_ecn.h>
43#include <net/dsfield.h> 25#include <net/red.h>
44 26
45 27
46/* Random Early Detection (RED) algorithm. 28/* Parameters, settable by user:
47 =======================================
48
49 Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
50 for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
51
52 This file codes a "divisionless" version of RED algorithm
53 as written down in Fig.17 of the paper.
54
55Short description.
56------------------
57
58 When a new packet arrives we calculate the average queue length:
59
60 avg = (1-W)*avg + W*current_queue_len,
61
62 W is the filter time constant (chosen as 2^(-Wlog)), it controls
63 the inertia of the algorithm. To allow larger bursts, W should be
64 decreased.
65
66 if (avg > th_max) -> packet marked (dropped).
67 if (avg < th_min) -> packet passes.
68 if (th_min < avg < th_max) we calculate probability:
69
70 Pb = max_P * (avg - th_min)/(th_max-th_min)
71
72 and mark (drop) packet with this probability.
73 Pb changes from 0 (at avg==th_min) to max_P (avg==th_max).
74 max_P should be small (not 1), usually 0.01..0.02 is good value.
75
76 max_P is chosen as a number, so that max_P/(th_max-th_min)
77 is a negative power of two in order arithmetics to contain
78 only shifts.
79
80
81 Parameters, settable by user:
82 ----------------------------- 29 -----------------------------
83 30
84 limit - bytes (must be > qth_max + burst) 31 limit - bytes (must be > qth_max + burst)
@@ -89,243 +36,93 @@ Short description.
89 arbitrarily high (well, less than ram size) 36 arbitrarily high (well, less than ram size)
90 Really, this limit will never be reached 37 Really, this limit will never be reached
91 if RED works correctly. 38 if RED works correctly.
92
93 qth_min - bytes (should be < qth_max/2)
94 qth_max - bytes (should be at least 2*qth_min and less limit)
95 Wlog - bits (<32) log(1/W).
96 Plog - bits (<32)
97
98 Plog is related to max_P by formula:
99
100 max_P = (qth_max-qth_min)/2^Plog;
101
102 F.e. if qth_max=128K and qth_min=32K, then Plog=22
103 corresponds to max_P=0.02
104
105 Scell_log
106 Stab
107
108 Lookup table for log((1-W)^(t/t_ave).
109
110
111NOTES:
112
113Upper bound on W.
114-----------------
115
116 If you want to allow bursts of L packets of size S,
117 you should choose W:
118
119 L + 1 - th_min/S < (1-(1-W)^L)/W
120
121 th_min/S = 32 th_min/S = 4
122
123 log(W) L
124 -1 33
125 -2 35
126 -3 39
127 -4 46
128 -5 57
129 -6 75
130 -7 101
131 -8 135
132 -9 190
133 etc.
134 */ 39 */
135 40
136struct red_sched_data 41struct red_sched_data
137{ 42{
138/* Parameters */ 43 u32 limit; /* HARD maximal queue length */
139 u32 limit; /* HARD maximal queue length */ 44 unsigned char flags;
140 u32 qth_min; /* Min average length threshold: A scaled */ 45 struct red_parms parms;
141 u32 qth_max; /* Max average length threshold: A scaled */ 46 struct red_stats stats;
142 u32 Rmask;
143 u32 Scell_max;
144 unsigned char flags;
145 char Wlog; /* log(W) */
146 char Plog; /* random number bits */
147 char Scell_log;
148 u8 Stab[256];
149
150/* Variables */
151 unsigned long qave; /* Average queue length: A scaled */
152 int qcount; /* Packets since last random number generation */
153 u32 qR; /* Cached random number */
154
155 psched_time_t qidlestart; /* Start of idle period */
156 struct tc_red_xstats st;
157}; 47};
158 48
159static int red_ecn_mark(struct sk_buff *skb) 49static inline int red_use_ecn(struct red_sched_data *q)
160{ 50{
161 if (skb->nh.raw + 20 > skb->tail) 51 return q->flags & TC_RED_ECN;
162 return 0;
163
164 switch (skb->protocol) {
165 case __constant_htons(ETH_P_IP):
166 if (INET_ECN_is_not_ect(skb->nh.iph->tos))
167 return 0;
168 IP_ECN_set_ce(skb->nh.iph);
169 return 1;
170 case __constant_htons(ETH_P_IPV6):
171 if (INET_ECN_is_not_ect(ipv6_get_dsfield(skb->nh.ipv6h)))
172 return 0;
173 IP6_ECN_set_ce(skb->nh.ipv6h);
174 return 1;
175 default:
176 return 0;
177 }
178} 52}
179 53
180static int 54static inline int red_use_harddrop(struct red_sched_data *q)
181red_enqueue(struct sk_buff *skb, struct Qdisc* sch) 55{
56 return q->flags & TC_RED_HARDDROP;
57}
58
59static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
182{ 60{
183 struct red_sched_data *q = qdisc_priv(sch); 61 struct red_sched_data *q = qdisc_priv(sch);
184 62
185 psched_time_t now; 63 q->parms.qavg = red_calc_qavg(&q->parms, sch->qstats.backlog);
186 64
187 if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { 65 if (red_is_idling(&q->parms))
188 long us_idle; 66 red_end_of_idle_period(&q->parms);
189 int shift;
190 67
191 PSCHED_GET_TIME(now); 68 switch (red_action(&q->parms, q->parms.qavg)) {
192 us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); 69 case RED_DONT_MARK:
193 PSCHED_SET_PASTPERFECT(q->qidlestart); 70 break;
194 71
195/* 72 case RED_PROB_MARK:
196 The problem: ideally, average length queue recalcultion should 73 sch->qstats.overlimits++;
197 be done over constant clock intervals. This is too expensive, so that 74 if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) {
198 the calculation is driven by outgoing packets. 75 q->stats.prob_drop++;
199 When the queue is idle we have to model this clock by hand. 76 goto congestion_drop;
200 77 }
201 SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
202 dummy packets as a burst after idle time, i.e.
203
204 q->qave *= (1-W)^m
205
206 This is an apparently overcomplicated solution (f.e. we have to precompute
207 a table to make this calculation in reasonable time)
208 I believe that a simpler model may be used here,
209 but it is field for experiments.
210*/
211 shift = q->Stab[us_idle>>q->Scell_log];
212
213 if (shift) {
214 q->qave >>= shift;
215 } else {
216 /* Approximate initial part of exponent
217 with linear function:
218 (1-W)^m ~= 1-mW + ...
219
220 Seems, it is the best solution to
221 problem of too coarce exponent tabulation.
222 */
223
224 us_idle = (q->qave * us_idle)>>q->Scell_log;
225 if (us_idle < q->qave/2)
226 q->qave -= us_idle;
227 else
228 q->qave >>= 1;
229 }
230 } else {
231 q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
232 /* NOTE:
233 q->qave is fixed point number with point at Wlog.
234 The formulae above is equvalent to floating point
235 version:
236
237 qave = qave*(1-W) + sch->qstats.backlog*W;
238 --ANK (980924)
239 */
240 }
241 78
242 if (q->qave < q->qth_min) { 79 q->stats.prob_mark++;
243 q->qcount = -1; 80 break;
244enqueue: 81
245 if (sch->qstats.backlog + skb->len <= q->limit) { 82 case RED_HARD_MARK:
246 __skb_queue_tail(&sch->q, skb); 83 sch->qstats.overlimits++;
247 sch->qstats.backlog += skb->len; 84 if (red_use_harddrop(q) || !red_use_ecn(q) ||
248 sch->bstats.bytes += skb->len; 85 !INET_ECN_set_ce(skb)) {
249 sch->bstats.packets++; 86 q->stats.forced_drop++;
250 return NET_XMIT_SUCCESS; 87 goto congestion_drop;
251 } else { 88 }
252 q->st.pdrop++;
253 }
254 kfree_skb(skb);
255 sch->qstats.drops++;
256 return NET_XMIT_DROP;
257 }
258 if (q->qave >= q->qth_max) {
259 q->qcount = -1;
260 sch->qstats.overlimits++;
261mark:
262 if (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) {
263 q->st.early++;
264 goto drop;
265 }
266 q->st.marked++;
267 goto enqueue;
268 }
269 89
270 if (++q->qcount) { 90 q->stats.forced_mark++;
271 /* The formula used below causes questions. 91 break;
272
273 OK. qR is random number in the interval 0..Rmask
274 i.e. 0..(2^Plog). If we used floating point
275 arithmetics, it would be: (2^Plog)*rnd_num,
276 where rnd_num is less 1.
277
278 Taking into account, that qave have fixed
279 point at Wlog, and Plog is related to max_P by
280 max_P = (qth_max-qth_min)/2^Plog; two lines
281 below have the following floating point equivalent:
282
283 max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount
284
285 Any questions? --ANK (980924)
286 */
287 if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
288 goto enqueue;
289 q->qcount = 0;
290 q->qR = net_random()&q->Rmask;
291 sch->qstats.overlimits++;
292 goto mark;
293 } 92 }
294 q->qR = net_random()&q->Rmask;
295 goto enqueue;
296 93
297drop: 94 if (sch->qstats.backlog + skb->len <= q->limit)
298 kfree_skb(skb); 95 return qdisc_enqueue_tail(skb, sch);
299 sch->qstats.drops++; 96
97 q->stats.pdrop++;
98 return qdisc_drop(skb, sch);
99
100congestion_drop:
101 qdisc_drop(skb, sch);
300 return NET_XMIT_CN; 102 return NET_XMIT_CN;
301} 103}
302 104
303static int 105static int red_requeue(struct sk_buff *skb, struct Qdisc* sch)
304red_requeue(struct sk_buff *skb, struct Qdisc* sch)
305{ 106{
306 struct red_sched_data *q = qdisc_priv(sch); 107 struct red_sched_data *q = qdisc_priv(sch);
307 108
308 PSCHED_SET_PASTPERFECT(q->qidlestart); 109 if (red_is_idling(&q->parms))
110 red_end_of_idle_period(&q->parms);
309 111
310 __skb_queue_head(&sch->q, skb); 112 return qdisc_requeue(skb, sch);
311 sch->qstats.backlog += skb->len;
312 sch->qstats.requeues++;
313 return 0;
314} 113}
315 114
316static struct sk_buff * 115static struct sk_buff * red_dequeue(struct Qdisc* sch)
317red_dequeue(struct Qdisc* sch)
318{ 116{
319 struct sk_buff *skb; 117 struct sk_buff *skb;
320 struct red_sched_data *q = qdisc_priv(sch); 118 struct red_sched_data *q = qdisc_priv(sch);
321 119
322 skb = __skb_dequeue(&sch->q); 120 skb = qdisc_dequeue_head(sch);
323 if (skb) { 121
324 sch->qstats.backlog -= skb->len; 122 if (skb == NULL && !red_is_idling(&q->parms))
325 return skb; 123 red_start_of_idle_period(&q->parms);
326 } 124
327 PSCHED_GET_TIME(q->qidlestart); 125 return skb;
328 return NULL;
329} 126}
330 127
331static unsigned int red_drop(struct Qdisc* sch) 128static unsigned int red_drop(struct Qdisc* sch)
@@ -333,16 +130,17 @@ static unsigned int red_drop(struct Qdisc* sch)
333 struct sk_buff *skb; 130 struct sk_buff *skb;
334 struct red_sched_data *q = qdisc_priv(sch); 131 struct red_sched_data *q = qdisc_priv(sch);
335 132
336 skb = __skb_dequeue_tail(&sch->q); 133 skb = qdisc_dequeue_tail(sch);
337 if (skb) { 134 if (skb) {
338 unsigned int len = skb->len; 135 unsigned int len = skb->len;
339 sch->qstats.backlog -= len; 136 q->stats.other++;
340 sch->qstats.drops++; 137 qdisc_drop(skb, sch);
341 q->st.other++;
342 kfree_skb(skb);
343 return len; 138 return len;
344 } 139 }
345 PSCHED_GET_TIME(q->qidlestart); 140
141 if (!red_is_idling(&q->parms))
142 red_start_of_idle_period(&q->parms);
143
346 return 0; 144 return 0;
347} 145}
348 146
@@ -350,43 +148,38 @@ static void red_reset(struct Qdisc* sch)
350{ 148{
351 struct red_sched_data *q = qdisc_priv(sch); 149 struct red_sched_data *q = qdisc_priv(sch);
352 150
353 __skb_queue_purge(&sch->q); 151 qdisc_reset_queue(sch);
354 sch->qstats.backlog = 0; 152 red_restart(&q->parms);
355 PSCHED_SET_PASTPERFECT(q->qidlestart);
356 q->qave = 0;
357 q->qcount = -1;
358} 153}
359 154
360static int red_change(struct Qdisc *sch, struct rtattr *opt) 155static int red_change(struct Qdisc *sch, struct rtattr *opt)
361{ 156{
362 struct red_sched_data *q = qdisc_priv(sch); 157 struct red_sched_data *q = qdisc_priv(sch);
363 struct rtattr *tb[TCA_RED_STAB]; 158 struct rtattr *tb[TCA_RED_MAX];
364 struct tc_red_qopt *ctl; 159 struct tc_red_qopt *ctl;
365 160
366 if (opt == NULL || 161 if (opt == NULL || rtattr_parse_nested(tb, TCA_RED_MAX, opt))
367 rtattr_parse_nested(tb, TCA_RED_STAB, opt) || 162 return -EINVAL;
368 tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 || 163
164 if (tb[TCA_RED_PARMS-1] == NULL ||
369 RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) || 165 RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) ||
370 RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256) 166 tb[TCA_RED_STAB-1] == NULL ||
167 RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < RED_STAB_SIZE)
371 return -EINVAL; 168 return -EINVAL;
372 169
373 ctl = RTA_DATA(tb[TCA_RED_PARMS-1]); 170 ctl = RTA_DATA(tb[TCA_RED_PARMS-1]);
374 171
375 sch_tree_lock(sch); 172 sch_tree_lock(sch);
376 q->flags = ctl->flags; 173 q->flags = ctl->flags;
377 q->Wlog = ctl->Wlog;
378 q->Plog = ctl->Plog;
379 q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
380 q->Scell_log = ctl->Scell_log;
381 q->Scell_max = (255<<q->Scell_log);
382 q->qth_min = ctl->qth_min<<ctl->Wlog;
383 q->qth_max = ctl->qth_max<<ctl->Wlog;
384 q->limit = ctl->limit; 174 q->limit = ctl->limit;
385 memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
386 175
387 q->qcount = -1; 176 red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
177 ctl->Plog, ctl->Scell_log,
178 RTA_DATA(tb[TCA_RED_STAB-1]));
179
388 if (skb_queue_empty(&sch->q)) 180 if (skb_queue_empty(&sch->q))
389 PSCHED_SET_PASTPERFECT(q->qidlestart); 181 red_end_of_idle_period(&q->parms);
182
390 sch_tree_unlock(sch); 183 sch_tree_unlock(sch);
391 return 0; 184 return 0;
392} 185}
@@ -399,39 +192,39 @@ static int red_init(struct Qdisc* sch, struct rtattr *opt)
399static int red_dump(struct Qdisc *sch, struct sk_buff *skb) 192static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
400{ 193{
401 struct red_sched_data *q = qdisc_priv(sch); 194 struct red_sched_data *q = qdisc_priv(sch);
402 unsigned char *b = skb->tail; 195 struct rtattr *opts = NULL;
403 struct rtattr *rta; 196 struct tc_red_qopt opt = {
404 struct tc_red_qopt opt; 197 .limit = q->limit,
405 198 .flags = q->flags,
406 rta = (struct rtattr*)b; 199 .qth_min = q->parms.qth_min >> q->parms.Wlog,
407 RTA_PUT(skb, TCA_OPTIONS, 0, NULL); 200 .qth_max = q->parms.qth_max >> q->parms.Wlog,
408 opt.limit = q->limit; 201 .Wlog = q->parms.Wlog,
409 opt.qth_min = q->qth_min>>q->Wlog; 202 .Plog = q->parms.Plog,
410 opt.qth_max = q->qth_max>>q->Wlog; 203 .Scell_log = q->parms.Scell_log,
411 opt.Wlog = q->Wlog; 204 };
412 opt.Plog = q->Plog; 205
413 opt.Scell_log = q->Scell_log; 206 opts = RTA_NEST(skb, TCA_OPTIONS);
414 opt.flags = q->flags;
415 RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); 207 RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
416 rta->rta_len = skb->tail - b; 208 return RTA_NEST_END(skb, opts);
417
418 return skb->len;
419 209
420rtattr_failure: 210rtattr_failure:
421 skb_trim(skb, b - skb->data); 211 return RTA_NEST_CANCEL(skb, opts);
422 return -1;
423} 212}
424 213
425static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 214static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
426{ 215{
427 struct red_sched_data *q = qdisc_priv(sch); 216 struct red_sched_data *q = qdisc_priv(sch);
428 217 struct tc_red_xstats st = {
429 return gnet_stats_copy_app(d, &q->st, sizeof(q->st)); 218 .early = q->stats.prob_drop + q->stats.forced_drop,
219 .pdrop = q->stats.pdrop,
220 .other = q->stats.other,
221 .marked = q->stats.prob_mark + q->stats.forced_mark,
222 };
223
224 return gnet_stats_copy_app(d, &st, sizeof(st));
430} 225}
431 226
432static struct Qdisc_ops red_qdisc_ops = { 227static struct Qdisc_ops red_qdisc_ops = {
433 .next = NULL,
434 .cl_ops = NULL,
435 .id = "red", 228 .id = "red",
436 .priv_size = sizeof(struct red_sched_data), 229 .priv_size = sizeof(struct red_sched_data),
437 .enqueue = red_enqueue, 230 .enqueue = red_enqueue,
@@ -450,10 +243,13 @@ static int __init red_module_init(void)
450{ 243{
451 return register_qdisc(&red_qdisc_ops); 244 return register_qdisc(&red_qdisc_ops);
452} 245}
453static void __exit red_module_exit(void) 246
247static void __exit red_module_exit(void)
454{ 248{
455 unregister_qdisc(&red_qdisc_ops); 249 unregister_qdisc(&red_qdisc_ops);
456} 250}
251
457module_init(red_module_init) 252module_init(red_module_init)
458module_exit(red_module_exit) 253module_exit(red_module_exit)
254
459MODULE_LICENSE("GPL"); 255MODULE_LICENSE("GPL");
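
The comment block removed from sch_red.c documents the fixed point conventions the new red.h helpers inherit: max_P = (qth_max - qth_min) / 2^Plog, and the divisionless enqueue test ((qave - qth_min) >> Wlog) * qcount < qR as the integer equivalent of max_P * (qave - qth_min) / (qth_max - qth_min) < rnd / qcount. As a quick sanity check of the example quoted there (qth_min = 32K, qth_max = 128K, Plog = 22, giving max_P of about 0.02), the sketch below recomputes max_P and compares the two forms of the test on a few random samples; the sampling loop and the seed are illustrative only, not taken from the source.

    #include <stdio.h>
    #include <stdlib.h>

    /* Example parameters from the comment block removed above. */
    #define QTH_MIN (32 * 1024)
    #define QTH_MAX (128 * 1024)
    #define WLOG    9
    #define PLOG    22

    int main(void)
    {
        double max_p = (double)(QTH_MAX - QTH_MIN) / (1UL << PLOG);
        int i;

        /* Prints ~0.0234, i.e. the "max_P=0.02" quoted in the old comment. */
        printf("max_P = %.4f\n", max_p);

        srand(1);
        for (i = 0; i < 5; i++) {
            unsigned long qave   = QTH_MIN + rand() % (QTH_MAX - QTH_MIN);
            unsigned long qcount = 1 + rand() % 50;
            unsigned long qR     = (unsigned long)rand() & ((1UL << PLOG) - 1);

            /* Shift form: qave and qth_min carry a fixed point at Wlog,
             * so the right shift just removes that scaling again. */
            int pass_shift = ((((qave - QTH_MIN) << WLOG) >> WLOG) * qcount) < qR;

            /* Floating point form from the removed comment:
             * max_P * (qave - qth_min) / (qth_max - qth_min) < rnd / qcount */
            int pass_float = max_p * (qave - QTH_MIN) / (QTH_MAX - QTH_MIN)
                             < (double)qR / (1UL << PLOG) / qcount;

            printf("qave=%6lu qcount=%2lu qR=%7lu -> shift:%d float:%d\n",
                   qave, qcount, qR, pass_shift, pass_float);
        }
        return 0;
    }

Apart from possible floating point rounding at exact ties, the two columns agree, which is the point of the original comment: the shift form reaches the same verdict without any division.
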