1 files changed, 459 insertions, 0 deletions
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
new file mode 100644
index 000000000000..664d0e47374f
--- /dev/null
+++ b/net/sched/sch_red.c
@@ -0,0 +1,459 @@
+/*
+ * net/sched/sch_red.c  Random Early Detection queue.
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ * J Hadi Salim <hadi@nortel.com> 980914:       computation fixes
+ * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
+ * J Hadi Salim <hadi@nortelnetworks.com> 980816:  ECN support  
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/dsfield.h>
+/*      Random Early Detection (RED) algorithm.
+        =======================================
+        Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
+        for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
+        This file codes a "divisionless" version of RED algorithm
+        as written down in Fig.17 of the paper.
+Short description.
+------------------
+        When a new packet arrives we calculate the average queue length:
+        avg = (1-W)*avg + W*current_queue_len,
+        W is the filter time constant (chosen as 2^(-Wlog)), it controls
+        the inertia of the algorithm. To allow larger bursts, W should be
+        decreased.
+        if (avg > th_max) -> packet marked (dropped).
+        if (avg < th_min) -> packet passes.
+        if (th_min < avg < th_max) we calculate probability:
+        Pb = max_P * (avg - th_min)/(th_max-th_min)
+        and mark (drop) packet with this probability.
+        Pb changes from 0 (at avg==th_min) to max_P (avg==th_max).
+        max_P should be small (not 1), usually 0.01..0.02 is good value.
+        max_P is chosen as a number, so that max_P/(th_max-th_min)
+        is a negative power of two in order arithmetics to contain
+        only shifts.
+        Parameters, settable by user:
+        -----------------------------
+        limit           - bytes (must be > qth_max + burst)
+        Hard limit on queue length, should be chosen >qth_max
+        to allow packet bursts. This parameter does not
+        affect the algorithms behaviour and can be chosen
+        arbitrarily high (well, less than ram size)
+        Really, this limit will never be reached
+        if RED works correctly.
+        qth_min         - bytes (should be < qth_max/2)
+        qth_max         - bytes (should be at least 2*qth_min and less limit)
+        Wlog            - bits (<32) log(1/W).
+        Plog            - bits (<32)
+        Plog is related to max_P by formula:
+        max_P = (qth_max-qth_min)/2^Plog;
+        F.e. if qth_max=128K and qth_min=32K, then Plog=22
+        corresponds to max_P=0.02
+        Scell_log
+        Stab
+        Lookup table for log((1-W)^(t/t_ave).
+NOTES:
+Upper bound on W.
+-----------------
+        If you want to allow bursts of L packets of size S,
+        you should choose W:
+        L + 1 - th_min/S < (1-(1-W)^L)/W
+        th_min/S = 32         th_min/S = 4
+                                               
+        log(W)  L
+        -1      33
+        -2      35
+        -3      39
+        -4      46
+        -5      57
+        -6      75
+        -7      101
+        -8      135
+        -9      190
+        etc.
+ */
+struct red_sched_data
+{
+/* Parameters */
+        u32             limit;          /* HARD maximal queue length    */
+        u32             qth_min;        /* Min average length threshold: A scaled */
+        u32             qth_max;        /* Max average length threshold: A scaled */
+        u32             Rmask;
+        u32             Scell_max;
+        unsigned char   flags;
+        char            Wlog;           /* log(W)               */
+        char            Plog;           /* random number bits   */
+        char            Scell_log;
+        u8              Stab[256];
+/* Variables */
+        unsigned long   qave;           /* Average queue length: A scaled */
+        int             qcount;         /* Packets since last random number generation */
+        u32             qR;             /* Cached random number */
+        psched_time_t   qidlestart;     /* Start of idle period         */
+        struct tc_red_xstats st;
+};
+static int red_ecn_mark(struct sk_buff *skb)
+{
+        if (skb->nh.raw + 20 > skb->tail)
+                return 0;
+        switch (skb->protocol) {
+        case __constant_htons(ETH_P_IP):
+                if (INET_ECN_is_not_ect(skb->nh.iph->tos))
+                        return 0;
+                IP_ECN_set_ce(skb->nh.iph);
+                return 1;
+        case __constant_htons(ETH_P_IPV6):
+                if (INET_ECN_is_not_ect(ipv6_get_dsfield(skb->nh.ipv6h)))
+                        return 0;
+                IP6_ECN_set_ce(skb->nh.ipv6h);
+                return 1;
+        default:
+                return 0;
+        }
+}
+static int
+red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+        struct red_sched_data *q = qdisc_priv(sch);
+        psched_time_t now;
+        if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
+                long us_idle;
+                int  shift;
+                PSCHED_GET_TIME(now);
+                us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
+                PSCHED_SET_PASTPERFECT(q->qidlestart);
+/*
+   The problem: ideally, average length queue recalcultion should
+   be done over constant clock intervals. This is too expensive, so that
+   the calculation is driven by outgoing packets.
+   When the queue is idle we have to model this clock by hand.
+   SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
+   dummy packets as a burst after idle time, i.e.
+          q->qave *= (1-W)^m
+   This is an apparently overcomplicated solution (f.e. we have to precompute
+   a table to make this calculation in reasonable time)
+   I believe that a simpler model may be used here,
+   but it is field for experiments.
+*/
+                shift = q->Stab[us_idle>>q->Scell_log];
+                if (shift) {
+                        q->qave >>= shift;
+                } else {
+                        /* Approximate initial part of exponent
+                           with linear function:
+                           (1-W)^m ~= 1-mW + ...
+                           Seems, it is the best solution to
+                           problem of too coarce exponent tabulation.
+                         */
+                        us_idle = (q->qave * us_idle)>>q->Scell_log;
+                        if (us_idle < q->qave/2)
+                                q->qave -= us_idle;
+                        else
+                                q->qave >>= 1;
+                }
+        } else {
+                q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
+                /* NOTE:
+                   q->qave is fixed point number with point at Wlog.
+                   The formulae above is equvalent to floating point
+                   version:
+                   qave = qave*(1-W) + sch->qstats.backlog*W;
+                                                           --ANK (980924)
+                 */
+        }
+        if (q->qave < q->qth_min) {
+                q->qcount = -1;
+enqueue:
+                if (sch->qstats.backlog + skb->len <= q->limit) {
+                        __skb_queue_tail(&sch->q, skb);
+                        sch->qstats.backlog += skb->len;
+                        sch->bstats.bytes += skb->len;
+                        sch->bstats.packets++;
+                        return NET_XMIT_SUCCESS;
+                } else {
+                        q->st.pdrop++;
+                }
+                kfree_skb(skb);
+                sch->qstats.drops++;
+                return NET_XMIT_DROP;
+        }
+        if (q->qave >= q->qth_max) {
+                q->qcount = -1;
+                sch->qstats.overlimits++;
+mark:
+                if  (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) {
+                        q->st.early++;
+                        goto drop;
+                }
+                q->st.marked++;
+                goto enqueue;
+        }
+        if (++q->qcount) {
+                /* The formula used below causes questions.
+                   OK. qR is random number in the interval 0..Rmask
+                   i.e. 0..(2^Plog). If we used floating point
+                   arithmetics, it would be: (2^Plog)*rnd_num,
+                   where rnd_num is less 1.
+                   Taking into account, that qave have fixed
+                   point at Wlog, and Plog is related to max_P by
+                   max_P = (qth_max-qth_min)/2^Plog; two lines
+                   below have the following floating point equivalent:
+                   
+                   max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount
+                   Any questions? --ANK (980924)
+                 */
+                if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
+                        goto enqueue;
+                q->qcount = 0;
+                q->qR = net_random()&q->Rmask;
+                sch->qstats.overlimits++;
+                goto mark;
+        }
+        q->qR = net_random()&q->Rmask;
+        goto enqueue;
+drop:
+        kfree_skb(skb);
+        sch->qstats.drops++;
+        return NET_XMIT_CN;
+}
+static int
+red_requeue(struct sk_buff *skb, struct Qdisc* sch)
+{
+        struct red_sched_data *q = qdisc_priv(sch);
+        PSCHED_SET_PASTPERFECT(q->qidlestart);
+        __skb_queue_head(&sch->q, skb);
+        sch->qstats.backlog += skb->len;
+        sch->qstats.requeues++;
+        return 0;
+}
+static struct sk_buff *
+red_dequeue(struct Qdisc* sch)
+{
+        struct sk_buff *skb;
+        struct red_sched_data *q = qdisc_priv(sch);
+        skb = __skb_dequeue(&sch->q);
+        if (skb) {
+                sch->qstats.backlog -= skb->len;
+                return skb;
+        }
+        PSCHED_GET_TIME(q->qidlestart);
+        return NULL;
+}
+static unsigned int red_drop(struct Qdisc* sch)
+{
+        struct sk_buff *skb;
+        struct red_sched_data *q = qdisc_priv(sch);
+        skb = __skb_dequeue_tail(&sch->q);
+        if (skb) {
+                unsigned int len = skb->len;
+                sch->qstats.backlog -= len;
+                sch->qstats.drops++;
+                q->st.other++;
+                kfree_skb(skb);
+                return len;
+        }
+        PSCHED_GET_TIME(q->qidlestart);
+        return 0;
+}
+static void red_reset(struct Qdisc* sch)
+{
+        struct red_sched_data *q = qdisc_priv(sch);
+        __skb_queue_purge(&sch->q);
+        sch->qstats.backlog = 0;
+        PSCHED_SET_PASTPERFECT(q->qidlestart);
+        q->qave = 0;
+        q->qcount = -1;
+}
+static int red_change(struct Qdisc *sch, struct rtattr *opt)
+{
+        struct red_sched_data *q = qdisc_priv(sch);
+        struct rtattr *tb[TCA_RED_STAB];
+        struct tc_red_qopt *ctl;
+        if (opt == NULL ||
+            rtattr_parse_nested(tb, TCA_RED_STAB, opt) ||
+            tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 ||
+            RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) ||
+            RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256)
+                return -EINVAL;
+        ctl = RTA_DATA(tb[TCA_RED_PARMS-1]);
+        sch_tree_lock(sch);
+        q->flags = ctl->flags;
+        q->Wlog = ctl->Wlog;
+        q->Plog = ctl->Plog;
+        q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
+        q->Scell_log = ctl->Scell_log;
+        q->Scell_max = (255<<q->Scell_log);
+        q->qth_min = ctl->qth_min<<ctl->Wlog;
+        q->qth_max = ctl->qth_max<<ctl->Wlog;
+        q->limit = ctl->limit;
+        memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
+        q->qcount = -1;
+        if (skb_queue_len(&sch->q) == 0)
+                PSCHED_SET_PASTPERFECT(q->qidlestart);
+        sch_tree_unlock(sch);
+        return 0;
+}
+static int red_init(struct Qdisc* sch, struct rtattr *opt)
+{
+        return red_change(sch, opt);
+}
+static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+        struct red_sched_data *q = qdisc_priv(sch);
+        unsigned char    *b = skb->tail;
+        struct rtattr *rta;
+        struct tc_red_qopt opt;
+        rta = (struct rtattr*)b;
+        RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
+        opt.limit = q->limit;
+        opt.qth_min = q->qth_min>>q->Wlog;
+        opt.qth_max = q->qth_max>>q->Wlog;
+        opt.Wlog = q->Wlog;
+        opt.Plog = q->Plog;
+        opt.Scell_log = q->Scell_log;
+        opt.flags = q->flags;
+        RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
+        rta->rta_len = skb->tail - b;
+        return skb->len;
+rtattr_failure:
+        skb_trim(skb, b - skb->data);
+        return -1;
+}
+static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+        struct red_sched_data *q = qdisc_priv(sch);
+        return gnet_stats_copy_app(d, &q->st, sizeof(q->st));
+}
+static struct Qdisc_ops red_qdisc_ops = {
+        .next           =       NULL,
+        .cl_ops         =       NULL,
+        .id             =       "red",
+        .priv_size      =       sizeof(struct red_sched_data),
+        .enqueue        =       red_enqueue,
+        .dequeue        =       red_dequeue,
+        .requeue        =       red_requeue,
+        .drop           =       red_drop,
+        .init           =       red_init,
+        .reset          =       red_reset,
+        .change         =       red_change,
+        .dump           =       red_dump,
+        .dump_stats     =       red_dump_stats,
+        .owner          =       THIS_MODULE,
+};
+static int __init red_module_init(void)
+{
+        return register_qdisc(&red_qdisc_ops);
+}
+static void __exit red_module_exit(void) 
+{
+        unregister_qdisc(&red_qdisc_ops);
+}
+module_init(red_module_init)
+module_exit(red_module_exit)
+MODULE_LICENSE("GPL");

diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c new file mode 100644 index 000000000000..664d0e47374f --- /dev/null +++ b/net/sched/sch_red.c
@@ -0,0 +1,459 @@
	1	/*
	2	* net/sched/sch_red.c Random Early Detection queue.
	3	*
	4	* This program is free software; you can redistribute it and/or
	5	* modify it under the terms of the GNU General Public License
	6	* as published by the Free Software Foundation; either version
	7	* 2 of the License, or (at your option) any later version.
	8	*
	9	* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
	10	*
	11	* Changes:
	12	* J Hadi Salim <hadi@nortel.com> 980914: computation fixes
	13	* Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
	14	* J Hadi Salim <hadi@nortelnetworks.com> 980816: ECN support
	15	*/
	16
	17	#include <linux/config.h>
	18	#include <linux/module.h>
	19	#include <asm/uaccess.h>
	20	#include <asm/system.h>
	21	#include <linux/bitops.h>
	22	#include <linux/types.h>
	23	#include <linux/kernel.h>
	24	#include <linux/sched.h>
	25	#include <linux/string.h>
	26	#include <linux/mm.h>
	27	#include <linux/socket.h>
	28	#include <linux/sockios.h>
	29	#include <linux/in.h>
	30	#include <linux/errno.h>
	31	#include <linux/interrupt.h>
	32	#include <linux/if_ether.h>
	33	#include <linux/inet.h>
	34	#include <linux/netdevice.h>
	35	#include <linux/etherdevice.h>
	36	#include <linux/notifier.h>
	37	#include <net/ip.h>
	38	#include <net/route.h>
	39	#include <linux/skbuff.h>
	40	#include <net/sock.h>
	41	#include <net/pkt_sched.h>
	42	#include <net/inet_ecn.h>
	43	#include <net/dsfield.h>
	44
	45
	46	/* Random Early Detection (RED) algorithm.
	47	=======================================
	48
	49	Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
	50	for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
	51
	52	This file codes a "divisionless" version of RED algorithm
	53	as written down in Fig.17 of the paper.
	54
	55	Short description.
	56	------------------
	57
	58	When a new packet arrives we calculate the average queue length:
	59
	60	avg = (1-W)avg + Wcurrent_queue_len,
	61
	62	W is the filter time constant (chosen as 2^(-Wlog)), it controls
	63	the inertia of the algorithm. To allow larger bursts, W should be
	64	decreased.
	65
	66	if (avg > th_max) -> packet marked (dropped).
	67	if (avg < th_min) -> packet passes.
	68	if (th_min < avg < th_max) we calculate probability:
	69
	70	Pb = max_P * (avg - th_min)/(th_max-th_min)
	71
	72	and mark (drop) packet with this probability.
	73	Pb changes from 0 (at avg==th_min) to max_P (avg==th_max).
	74	max_P should be small (not 1), usually 0.01..0.02 is good value.
	75
	76	max_P is chosen as a number, so that max_P/(th_max-th_min)
	77	is a negative power of two in order arithmetics to contain
	78	only shifts.
	79
	80
	81	Parameters, settable by user:
	82	-----------------------------
	83
	84	limit - bytes (must be > qth_max + burst)
	85
	86	Hard limit on queue length, should be chosen >qth_max
	87	to allow packet bursts. This parameter does not
	88	affect the algorithms behaviour and can be chosen
	89	arbitrarily high (well, less than ram size)
	90	Really, this limit will never be reached
	91	if RED works correctly.
	92
	93	qth_min - bytes (should be < qth_max/2)
	94	qth_max - bytes (should be at least 2*qth_min and less limit)
	95	Wlog - bits (<32) log(1/W).
	96	Plog - bits (<32)
	97
	98	Plog is related to max_P by formula:
	99
	100	max_P = (qth_max-qth_min)/2^Plog;
	101
	102	F.e. if qth_max=128K and qth_min=32K, then Plog=22
	103	corresponds to max_P=0.02
	104
	105	Scell_log
	106	Stab
	107
	108	Lookup table for log((1-W)^(t/t_ave).
	109
	110
	111	NOTES:
	112
	113	Upper bound on W.
	114	-----------------
	115
	116	If you want to allow bursts of L packets of size S,
	117	you should choose W:
	118
	119	L + 1 - th_min/S < (1-(1-W)^L)/W
	120
	121	th_min/S = 32 th_min/S = 4
	122
	123	log(W) L
	124	-1 33
	125	-2 35
	126	-3 39
	127	-4 46
	128	-5 57
	129	-6 75
	130	-7 101
	131	-8 135
	132	-9 190
	133	etc.
	134	*/
	135
	136	struct red_sched_data
	137	{
	138	/* Parameters */
	139	u32 limit; /* HARD maximal queue length */
	140	u32 qth_min; /* Min average length threshold: A scaled */
	141	u32 qth_max; /* Max average length threshold: A scaled */
	142	u32 Rmask;
	143	u32 Scell_max;
	144	unsigned char flags;
	145	char Wlog; /* log(W) */
	146	char Plog; /* random number bits */
	147	char Scell_log;
	148	u8 Stab[256];
	149
	150	/* Variables */
	151	unsigned long qave; /* Average queue length: A scaled */
	152	int qcount; /* Packets since last random number generation */
	153	u32 qR; /* Cached random number */
	154
	155	psched_time_t qidlestart; /* Start of idle period */
	156	struct tc_red_xstats st;
	157	};
	158
	159	static int red_ecn_mark(struct sk_buff *skb)
	160	{
	161	if (skb->nh.raw + 20 > skb->tail)
	162	return 0;
	163
	164	switch (skb->protocol) {
	165	case __constant_htons(ETH_P_IP):
	166	if (INET_ECN_is_not_ect(skb->nh.iph->tos))
	167	return 0;
	168	IP_ECN_set_ce(skb->nh.iph);
	169	return 1;
	170	case __constant_htons(ETH_P_IPV6):
	171	if (INET_ECN_is_not_ect(ipv6_get_dsfield(skb->nh.ipv6h)))
	172	return 0;
	173	IP6_ECN_set_ce(skb->nh.ipv6h);
	174	return 1;
	175	default:
	176	return 0;
	177	}
	178	}
	179
	180	static int
	181	red_enqueue(struct sk_buff skb, struct Qdisc sch)
	182	{
	183	struct red_sched_data *q = qdisc_priv(sch);
	184
	185	psched_time_t now;
	186
	187	if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
	188	long us_idle;
	189	int shift;
	190
	191	PSCHED_GET_TIME(now);
	192	us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
	193	PSCHED_SET_PASTPERFECT(q->qidlestart);
	194
	195	/*
	196	The problem: ideally, average length queue recalcultion should
	197	be done over constant clock intervals. This is too expensive, so that
	198	the calculation is driven by outgoing packets.
	199	When the queue is idle we have to model this clock by hand.
	200
	201	SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
	202	dummy packets as a burst after idle time, i.e.
	203
	204	q->qave *= (1-W)^m
	205
	206	This is an apparently overcomplicated solution (f.e. we have to precompute
	207	a table to make this calculation in reasonable time)
	208	I believe that a simpler model may be used here,
	209	but it is field for experiments.
	210	*/
	211	shift = q->Stab[us_idle>>q->Scell_log];
	212
	213	if (shift) {
	214	q->qave >>= shift;
	215	} else {
	216	/* Approximate initial part of exponent
	217	with linear function:
	218	(1-W)^m ~= 1-mW + ...
	219
	220	Seems, it is the best solution to
	221	problem of too coarce exponent tabulation.
	222	*/
	223
	224	us_idle = (q->qave * us_idle)>>q->Scell_log;
	225	if (us_idle < q->qave/2)
	226	q->qave -= us_idle;
	227	else
	228	q->qave >>= 1;
	229	}
	230	} else {
	231	q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
	232	/* NOTE:
	233	q->qave is fixed point number with point at Wlog.
	234	The formulae above is equvalent to floating point
	235	version:
	236
	237	qave = qave(1-W) + sch->qstats.backlogW;
	238	--ANK (980924)
	239	*/
	240	}
	241
	242	if (q->qave < q->qth_min) {
	243	q->qcount = -1;
	244	enqueue:
	245	if (sch->qstats.backlog + skb->len <= q->limit) {
	246	__skb_queue_tail(&sch->q, skb);
	247	sch->qstats.backlog += skb->len;
	248	sch->bstats.bytes += skb->len;
	249	sch->bstats.packets++;
	250	return NET_XMIT_SUCCESS;
	251	} else {
	252	q->st.pdrop++;
	253	}
	254	kfree_skb(skb);
	255	sch->qstats.drops++;
	256	return NET_XMIT_DROP;
	257	}
	258	if (q->qave >= q->qth_max) {
	259	q->qcount = -1;
	260	sch->qstats.overlimits++;
	261	mark:
	262	if (!(q->flags&TC_RED_ECN) \|\| !red_ecn_mark(skb)) {
	263	q->st.early++;
	264	goto drop;
	265	}
	266	q->st.marked++;
	267	goto enqueue;
	268	}
	269
	270	if (++q->qcount) {
	271	/* The formula used below causes questions.
	272
	273	OK. qR is random number in the interval 0..Rmask
	274	i.e. 0..(2^Plog). If we used floating point
	275	arithmetics, it would be: (2^Plog)*rnd_num,
	276	where rnd_num is less 1.
	277
	278	Taking into account, that qave have fixed
	279	point at Wlog, and Plog is related to max_P by
	280	max_P = (qth_max-qth_min)/2^Plog; two lines
	281	below have the following floating point equivalent:
	282
	283	max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount
	284
	285	Any questions? --ANK (980924)
	286	*/
	287	if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
	288	goto enqueue;
	289	q->qcount = 0;
	290	q->qR = net_random()&q->Rmask;
	291	sch->qstats.overlimits++;
	292	goto mark;
	293	}
	294	q->qR = net_random()&q->Rmask;
	295	goto enqueue;
	296
	297	drop:
	298	kfree_skb(skb);
	299	sch->qstats.drops++;
	300	return NET_XMIT_CN;
	301	}
	302
	303	static int
	304	red_requeue(struct sk_buff skb, struct Qdisc sch)
	305	{
	306	struct red_sched_data *q = qdisc_priv(sch);
	307
	308	PSCHED_SET_PASTPERFECT(q->qidlestart);
	309
	310	__skb_queue_head(&sch->q, skb);
	311	sch->qstats.backlog += skb->len;
	312	sch->qstats.requeues++;
	313	return 0;
	314	}
	315
	316	static struct sk_buff *
	317	red_dequeue(struct Qdisc* sch)
	318	{
	319	struct sk_buff *skb;
	320	struct red_sched_data *q = qdisc_priv(sch);
	321
	322	skb = __skb_dequeue(&sch->q);
	323	if (skb) {
	324	sch->qstats.backlog -= skb->len;
	325	return skb;
	326	}
	327	PSCHED_GET_TIME(q->qidlestart);
	328	return NULL;
	329	}
	330
	331	static unsigned int red_drop(struct Qdisc* sch)
	332	{
	333	struct sk_buff *skb;
	334	struct red_sched_data *q = qdisc_priv(sch);
	335
	336	skb = __skb_dequeue_tail(&sch->q);
	337	if (skb) {
	338	unsigned int len = skb->len;
	339	sch->qstats.backlog -= len;
	340	sch->qstats.drops++;
	341	q->st.other++;
	342	kfree_skb(skb);
	343	return len;
	344	}
	345	PSCHED_GET_TIME(q->qidlestart);
	346	return 0;
	347	}
	348
	349	static void red_reset(struct Qdisc* sch)
	350	{
	351	struct red_sched_data *q = qdisc_priv(sch);
	352
	353	__skb_queue_purge(&sch->q);
	354	sch->qstats.backlog = 0;
	355	PSCHED_SET_PASTPERFECT(q->qidlestart);
	356	q->qave = 0;
	357	q->qcount = -1;
	358	}
	359
	360	static int red_change(struct Qdisc sch, struct rtattr opt)
	361	{
	362	struct red_sched_data *q = qdisc_priv(sch);
	363	struct rtattr *tb[TCA_RED_STAB];
	364	struct tc_red_qopt *ctl;
	365
	366	if (opt == NULL \|\|
	367	rtattr_parse_nested(tb, TCA_RED_STAB, opt) \|\|
	368	tb[TCA_RED_PARMS-1] == 0 \|\| tb[TCA_RED_STAB-1] == 0 \|\|
	369	RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) \|\|
	370	RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256)
	371	return -EINVAL;
	372
	373	ctl = RTA_DATA(tb[TCA_RED_PARMS-1]);
	374
	375	sch_tree_lock(sch);
	376	q->flags = ctl->flags;
	377	q->Wlog = ctl->Wlog;
	378	q->Plog = ctl->Plog;
	379	q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
	380	q->Scell_log = ctl->Scell_log;
	381	q->Scell_max = (255<<q->Scell_log);
	382	q->qth_min = ctl->qth_min<<ctl->Wlog;
	383	q->qth_max = ctl->qth_max<<ctl->Wlog;
	384	q->limit = ctl->limit;
	385	memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
	386
	387	q->qcount = -1;
	388	if (skb_queue_len(&sch->q) == 0)
	389	PSCHED_SET_PASTPERFECT(q->qidlestart);
	390	sch_tree_unlock(sch);
	391	return 0;
	392	}
	393
	394	static int red_init(struct Qdisc* sch, struct rtattr *opt)
	395	{
	396	return red_change(sch, opt);
	397	}
	398
	399	static int red_dump(struct Qdisc sch, struct sk_buff skb)
	400	{
	401	struct red_sched_data *q = qdisc_priv(sch);
	402	unsigned char *b = skb->tail;
	403	struct rtattr *rta;
	404	struct tc_red_qopt opt;
	405
	406	rta = (struct rtattr*)b;
	407	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
	408	opt.limit = q->limit;
	409	opt.qth_min = q->qth_min>>q->Wlog;
	410	opt.qth_max = q->qth_max>>q->Wlog;
	411	opt.Wlog = q->Wlog;
	412	opt.Plog = q->Plog;
	413	opt.Scell_log = q->Scell_log;
	414	opt.flags = q->flags;
	415	RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
	416	rta->rta_len = skb->tail - b;
	417
	418	return skb->len;
	419
	420	rtattr_failure:
	421	skb_trim(skb, b - skb->data);
	422	return -1;
	423	}
	424
	425	static int red_dump_stats(struct Qdisc sch, struct gnet_dump d)
	426	{
	427	struct red_sched_data *q = qdisc_priv(sch);
	428
	429	return gnet_stats_copy_app(d, &q->st, sizeof(q->st));
	430	}
	431
	432	static struct Qdisc_ops red_qdisc_ops = {
	433	.next = NULL,
	434	.cl_ops = NULL,
	435	.id = "red",
	436	.priv_size = sizeof(struct red_sched_data),
	437	.enqueue = red_enqueue,
	438	.dequeue = red_dequeue,
	439	.requeue = red_requeue,
	440	.drop = red_drop,
	441	.init = red_init,
	442	.reset = red_reset,
	443	.change = red_change,
	444	.dump = red_dump,
	445	.dump_stats = red_dump_stats,
	446	.owner = THIS_MODULE,
	447	};
	448
	449	static int __init red_module_init(void)
	450	{
	451	return register_qdisc(&red_qdisc_ops);
	452	}
	453	static void __exit red_module_exit(void)
	454	{
	455	unregister_qdisc(&red_qdisc_ops);
	456	}
	457	module_init(red_module_init)
	458	module_exit(red_module_exit)
	459	MODULE_LICENSE("GPL");