Merge master.kernel.org:/pub/scm/linux/kernel/git/acme/net-2.6

author: Linus Torvalds <torvalds@g5.osdl.org> 2005-11-07 11:05:11 -0500
committer: Linus Torvalds <torvalds@g5.osdl.org> 2005-11-07 11:05:11 -0500
commit: 8e33ba49765484bc6de3a2f8143733713fa93bc1 (patch)
tree: 2ea080e478e4ee86a893b75db2d5c81ce14cbf10 /net/sched/sch_red.c
parent: 8cde0776ec1e86c270f65bf482f96288e6bf0023 (diff)
parent: 2d43f1128a4282fbe8442f40b4cbbac05d8f10aa (diff)
1 files changed, 107 insertions, 311 deletions
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 7845d045eec4..dccfa44c2d71 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -9,76 +9,23 @@
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Changes:
- * J Hadi Salim <hadi@nortel.com> 980914:       computation fixes
+ * J Hadi Salim 980914: computation fixes
 * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
- * J Hadi Salim <hadi@nortelnetworks.com> 980816:  ECN support  
+ * J Hadi Salim 980816:  ECN support
 */
 #include <linux/config.h>
 #include <linux/module.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/bitops.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/in.h>
-#include <linux/errno.h>
-#include <linux/interrupt.h>
-#include <linux/if_ether.h>
-#include <linux/inet.h>
 #include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/notifier.h>
-#include <net/ip.h>
-#include <net/route.h>
 #include <linux/skbuff.h>
-#include <net/sock.h>
 #include <net/pkt_sched.h>
 #include <net/inet_ecn.h>
-#include <net/dsfield.h>
+#include <net/red.h>
-/*      Random Early Detection (RED) algorithm.
+/*      Parameters, settable by user:
-        =======================================
-        Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
-        for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
-        This file codes a "divisionless" version of RED algorithm
-        as written down in Fig.17 of the paper.
-Short description.
------------------
-        When a new packet arrives we calculate the average queue length:
-        avg = (1-W)*avg + W*current_queue_len,
-        W is the filter time constant (chosen as 2^(-Wlog)), it controls
-        the inertia of the algorithm. To allow larger bursts, W should be
-        decreased.
-        if (avg > th_max) -> packet marked (dropped).
-        if (avg < th_min) -> packet passes.
-        if (th_min < avg < th_max) we calculate probability:
-        Pb = max_P * (avg - th_min)/(th_max-th_min)
-        and mark (drop) packet with this probability.
-        Pb changes from 0 (at avg==th_min) to max_P (avg==th_max).
-        max_P should be small (not 1), usually 0.01..0.02 is good value.
-        max_P is chosen as a number, so that max_P/(th_max-th_min)
-        is a negative power of two in order arithmetics to contain
-        only shifts.
-        Parameters, settable by user:
        -----------------------------
        limit           - bytes (must be > qth_max + burst)
@@ -89,243 +36,93 @@ Short description.
        arbitrarily high (well, less than ram size)
        Really, this limit will never be reached
        if RED works correctly.
-        qth_min         - bytes (should be < qth_max/2)
-        qth_max         - bytes (should be at least 2*qth_min and less limit)
-        Wlog            - bits (<32) log(1/W).
-        Plog            - bits (<32)
-        Plog is related to max_P by formula:
-        max_P = (qth_max-qth_min)/2^Plog;
-        F.e. if qth_max=128K and qth_min=32K, then Plog=22
-        corresponds to max_P=0.02
-        Scell_log
-        Stab
-        Lookup table for log((1-W)^(t/t_ave).
-NOTES:
-Upper bound on W.
-----------------
-        If you want to allow bursts of L packets of size S,
-        you should choose W:
-        L + 1 - th_min/S < (1-(1-W)^L)/W
-        th_min/S = 32         th_min/S = 4
-                                               
-        log(W)  L
-        -1      33
-        -2      35
-        -3      39
-        -4      46
-        -5      57
-        -6      75
-        -7      101
-        -8      135
-        -9      190
-        etc.
 */
 struct red_sched_data
 {
-/* Parameters */
+        u32                     limit;          /* HARD maximal queue length */
-        u32             limit;          /* HARD maximal queue length    */
+        unsigned char           flags;
-        u32             qth_min;        /* Min average length threshold: A scaled */
+        struct red_parms        parms;
-        u32             qth_max;        /* Max average length threshold: A scaled */
+        struct red_stats        stats;
-        u32             Rmask;
-        u32             Scell_max;
-        unsigned char   flags;
-        char            Wlog;           /* log(W)               */
-        char            Plog;           /* random number bits   */
-        char            Scell_log;
-        u8              Stab[256];
-/* Variables */
-        unsigned long   qave;           /* Average queue length: A scaled */
-        int             qcount;         /* Packets since last random number generation */
-        u32             qR;             /* Cached random number */
-        psched_time_t   qidlestart;     /* Start of idle period         */
-        struct tc_red_xstats st;
 };
-static int red_ecn_mark(struct sk_buff *skb)
+static inline int red_use_ecn(struct red_sched_data *q)
 {
-        if (skb->nh.raw + 20 > skb->tail)
+        return q->flags & TC_RED_ECN;
-                return 0;
-        switch (skb->protocol) {
-        case __constant_htons(ETH_P_IP):
-                if (INET_ECN_is_not_ect(skb->nh.iph->tos))
-                        return 0;
-                IP_ECN_set_ce(skb->nh.iph);
-                return 1;
-        case __constant_htons(ETH_P_IPV6):
-                if (INET_ECN_is_not_ect(ipv6_get_dsfield(skb->nh.ipv6h)))
-                        return 0;
-                IP6_ECN_set_ce(skb->nh.ipv6h);
-                return 1;
-        default:
-                return 0;
-        }
 }
-static int
+static inline int red_use_harddrop(struct red_sched_data *q)
-red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+        return q->flags & TC_RED_HARDDROP;
+}
+static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 {
        struct red_sched_data *q = qdisc_priv(sch);
-        psched_time_t now;
+        q->parms.qavg = red_calc_qavg(&q->parms, sch->qstats.backlog);
-        if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
+        if (red_is_idling(&q->parms))
-                long us_idle;
+                red_end_of_idle_period(&q->parms);
-                int  shift;
-                PSCHED_GET_TIME(now);
+        switch (red_action(&q->parms, q->parms.qavg)) {
-                us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
+                case RED_DONT_MARK:
-                PSCHED_SET_PASTPERFECT(q->qidlestart);
+                        break;
-/*
+                case RED_PROB_MARK:
-   The problem: ideally, average length queue recalcultion should
+                        sch->qstats.overlimits++;
-   be done over constant clock intervals. This is too expensive, so that
+                        if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) {
-   the calculation is driven by outgoing packets.
+                                q->stats.prob_drop++;
-   When the queue is idle we have to model this clock by hand.
+                                goto congestion_drop;
+                        }
-   SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
-   dummy packets as a burst after idle time, i.e.
-          q->qave *= (1-W)^m
-   This is an apparently overcomplicated solution (f.e. we have to precompute
-   a table to make this calculation in reasonable time)
-   I believe that a simpler model may be used here,
-   but it is field for experiments.
-*/
-                shift = q->Stab[us_idle>>q->Scell_log];
-                if (shift) {
-                        q->qave >>= shift;
-                } else {
-                        /* Approximate initial part of exponent
-                           with linear function:
-                           (1-W)^m ~= 1-mW + ...
-                           Seems, it is the best solution to
-                           problem of too coarce exponent tabulation.
-                         */
-                        us_idle = (q->qave * us_idle)>>q->Scell_log;
-                        if (us_idle < q->qave/2)
-                                q->qave -= us_idle;
-                        else
-                                q->qave >>= 1;
-                }
-        } else {
-                q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
-                /* NOTE:
-                   q->qave is fixed point number with point at Wlog.
-                   The formulae above is equvalent to floating point
-                   version:
-                   qave = qave*(1-W) + sch->qstats.backlog*W;
-                                                           --ANK (980924)
-                 */
-        }
-        if (q->qave < q->qth_min) {
+                        q->stats.prob_mark++;
-                q->qcount = -1;
+                        break;
-enqueue:
-                if (sch->qstats.backlog + skb->len <= q->limit) {
+                case RED_HARD_MARK:
-                        __skb_queue_tail(&sch->q, skb);
+                        sch->qstats.overlimits++;
-                        sch->qstats.backlog += skb->len;
+                        if (red_use_harddrop(q) || !red_use_ecn(q) ||
-                        sch->bstats.bytes += skb->len;
+                            !INET_ECN_set_ce(skb)) {
-                        sch->bstats.packets++;
+                                q->stats.forced_drop++;
-                        return NET_XMIT_SUCCESS;
+                                goto congestion_drop;
-                } else {
+                        }
-                        q->st.pdrop++;
-                }
-                kfree_skb(skb);
-                sch->qstats.drops++;
-                return NET_XMIT_DROP;
-        }
-        if (q->qave >= q->qth_max) {
-                q->qcount = -1;
-                sch->qstats.overlimits++;
-mark:
-                if  (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) {
-                        q->st.early++;
-                        goto drop;
-                }
-                q->st.marked++;
-                goto enqueue;
-        }
-        if (++q->qcount) {
+                        q->stats.forced_mark++;
-                /* The formula used below causes questions.
+                        break;
-                   OK. qR is random number in the interval 0..Rmask
-                   i.e. 0..(2^Plog). If we used floating point
-                   arithmetics, it would be: (2^Plog)*rnd_num,
-                   where rnd_num is less 1.
-                   Taking into account, that qave have fixed
-                   point at Wlog, and Plog is related to max_P by
-                   max_P = (qth_max-qth_min)/2^Plog; two lines
-                   below have the following floating point equivalent:
-                   
-                   max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount
-                   Any questions? --ANK (980924)
-                 */
-                if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
-                        goto enqueue;
-                q->qcount = 0;
-                q->qR = net_random()&q->Rmask;
-                sch->qstats.overlimits++;
-                goto mark;
        }
-        q->qR = net_random()&q->Rmask;
-        goto enqueue;
-drop:
+        if (sch->qstats.backlog + skb->len <= q->limit)
-        kfree_skb(skb);
+                return qdisc_enqueue_tail(skb, sch);
-        sch->qstats.drops++;
+        q->stats.pdrop++;
+        return qdisc_drop(skb, sch);
+congestion_drop:
+        qdisc_drop(skb, sch);
        return NET_XMIT_CN;
 }
-static int
+static int red_requeue(struct sk_buff *skb, struct Qdisc* sch)
-red_requeue(struct sk_buff *skb, struct Qdisc* sch)
 {
        struct red_sched_data *q = qdisc_priv(sch);
-        PSCHED_SET_PASTPERFECT(q->qidlestart);
+        if (red_is_idling(&q->parms))
+                red_end_of_idle_period(&q->parms);
-        __skb_queue_head(&sch->q, skb);
+        return qdisc_requeue(skb, sch);
-        sch->qstats.backlog += skb->len;
-        sch->qstats.requeues++;
-        return 0;
 }
-static struct sk_buff *
+static struct sk_buff * red_dequeue(struct Qdisc* sch)
-red_dequeue(struct Qdisc* sch)
 {
        struct sk_buff *skb;
        struct red_sched_data *q = qdisc_priv(sch);
-        skb = __skb_dequeue(&sch->q);
+        skb = qdisc_dequeue_head(sch);
-        if (skb) {
-                sch->qstats.backlog -= skb->len;
+        if (skb == NULL && !red_is_idling(&q->parms))
-                return skb;
+                red_start_of_idle_period(&q->parms);
-        }
-        PSCHED_GET_TIME(q->qidlestart);
+        return skb;
-        return NULL;
 }
 static unsigned int red_drop(struct Qdisc* sch)
@@ -333,16 +130,17 @@ static unsigned int red_drop(struct Qdisc* sch)
        struct sk_buff *skb;
        struct red_sched_data *q = qdisc_priv(sch);
-        skb = __skb_dequeue_tail(&sch->q);
+        skb = qdisc_dequeue_tail(sch);
        if (skb) {
                unsigned int len = skb->len;
-                sch->qstats.backlog -= len;
+                q->stats.other++;
-                sch->qstats.drops++;
+                qdisc_drop(skb, sch);
-                q->st.other++;
-                kfree_skb(skb);
                return len;
        }
-        PSCHED_GET_TIME(q->qidlestart);
+        if (!red_is_idling(&q->parms))
+                red_start_of_idle_period(&q->parms);
        return 0;
 }
@@ -350,43 +148,38 @@ static void red_reset(struct Qdisc* sch)
 {
        struct red_sched_data *q = qdisc_priv(sch);
-        __skb_queue_purge(&sch->q);
+        qdisc_reset_queue(sch);
-        sch->qstats.backlog = 0;
+        red_restart(&q->parms);
-        PSCHED_SET_PASTPERFECT(q->qidlestart);
-        q->qave = 0;
-        q->qcount = -1;
 }
 static int red_change(struct Qdisc *sch, struct rtattr *opt)
 {
        struct red_sched_data *q = qdisc_priv(sch);
-        struct rtattr *tb[TCA_RED_STAB];
+        struct rtattr *tb[TCA_RED_MAX];
        struct tc_red_qopt *ctl;
-        if (opt == NULL ||
+        if (opt == NULL || rtattr_parse_nested(tb, TCA_RED_MAX, opt))
-            rtattr_parse_nested(tb, TCA_RED_STAB, opt) ||
+                return -EINVAL;
-            tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 ||
+        if (tb[TCA_RED_PARMS-1] == NULL ||
            RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) ||
-            RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256)
+            tb[TCA_RED_STAB-1] == NULL ||
+            RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < RED_STAB_SIZE)
                return -EINVAL;
        ctl = RTA_DATA(tb[TCA_RED_PARMS-1]);
        sch_tree_lock(sch);
        q->flags = ctl->flags;
-        q->Wlog = ctl->Wlog;
-        q->Plog = ctl->Plog;
-        q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
-        q->Scell_log = ctl->Scell_log;
-        q->Scell_max = (255<<q->Scell_log);
-        q->qth_min = ctl->qth_min<<ctl->Wlog;
-        q->qth_max = ctl->qth_max<<ctl->Wlog;
        q->limit = ctl->limit;
-        memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
-        q->qcount = -1;
+        red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
+                                 ctl->Plog, ctl->Scell_log,
+                                 RTA_DATA(tb[TCA_RED_STAB-1]));
        if (skb_queue_empty(&sch->q))
-                PSCHED_SET_PASTPERFECT(q->qidlestart);
+                red_end_of_idle_period(&q->parms);
        sch_tree_unlock(sch);
        return 0;
 }
@@ -399,39 +192,39 @@ static int red_init(struct Qdisc* sch, struct rtattr *opt)
 static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
        struct red_sched_data *q = qdisc_priv(sch);
-        unsigned char    *b = skb->tail;
+        struct rtattr *opts = NULL;
-        struct rtattr *rta;
+        struct tc_red_qopt opt = {
-        struct tc_red_qopt opt;
+                .limit          = q->limit,
+                .flags          = q->flags,
-        rta = (struct rtattr*)b;
+                .qth_min        = q->parms.qth_min >> q->parms.Wlog,
-        RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
+                .qth_max        = q->parms.qth_max >> q->parms.Wlog,
-        opt.limit = q->limit;
+                .Wlog           = q->parms.Wlog,
-        opt.qth_min = q->qth_min>>q->Wlog;
+                .Plog           = q->parms.Plog,
-        opt.qth_max = q->qth_max>>q->Wlog;
+                .Scell_log      = q->parms.Scell_log,
-        opt.Wlog = q->Wlog;
+        };
-        opt.Plog = q->Plog;
-        opt.Scell_log = q->Scell_log;
+        opts = RTA_NEST(skb, TCA_OPTIONS);
-        opt.flags = q->flags;
        RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
-        rta->rta_len = skb->tail - b;
+        return RTA_NEST_END(skb, opts);
-        return skb->len;
 rtattr_failure:
-        skb_trim(skb, b - skb->data);
+        return RTA_NEST_CANCEL(skb, opts);
-        return -1;
 }
 static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 {
        struct red_sched_data *q = qdisc_priv(sch);
+        struct tc_red_xstats st = {
-        return gnet_stats_copy_app(d, &q->st, sizeof(q->st));
+                .early  = q->stats.prob_drop + q->stats.forced_drop,
+                .pdrop  = q->stats.pdrop,
+                .other  = q->stats.other,
+                .marked = q->stats.prob_mark + q->stats.forced_mark,
+        };
+        return gnet_stats_copy_app(d, &st, sizeof(st));
 }
 static struct Qdisc_ops red_qdisc_ops = {
-        .next           =       NULL,
-        .cl_ops         =       NULL,
        .id             =       "red",
        .priv_size      =       sizeof(struct red_sched_data),
        .enqueue        =       red_enqueue,
@@ -450,10 +243,13 @@ static int __init red_module_init(void)
 {
        return register_qdisc(&red_qdisc_ops);
 }
-static void __exit red_module_exit(void) 
+static void __exit red_module_exit(void)
 {
        unregister_qdisc(&red_qdisc_ops);
 }
 module_init(red_module_init)
 module_exit(red_module_exit)
 MODULE_LICENSE("GPL");
author	Linus Torvalds <torvalds@g5.osdl.org>	2005-11-07 11:05:11 -0500
committer	Linus Torvalds <torvalds@g5.osdl.org>	2005-11-07 11:05:11 -0500
commit	8e33ba49765484bc6de3a2f8143733713fa93bc1 (patch)
tree	2ea080e478e4ee86a893b75db2d5c81ce14cbf10 /net/sched/sch_red.c
parent	8cde0776ec1e86c270f65bf482f96288e6bf0023 (diff)
parent	2d43f1128a4282fbe8442f40b4cbbac05d8f10aa (diff)

diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 7845d045eec4..dccfa44c2d71 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c
@@ -9,76 +9,23 @@
9	* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>	9	* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10	*	10	*
11	* Changes:	11	* Changes:
12	* J Hadi Salim <hadi@nortel.com> 980914: computation fixes	12	* J Hadi Salim 980914: computation fixes
13	* Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.	13	* Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
14	* J Hadi Salim <hadi@nortelnetworks.com> 980816: ECN support	14	* J Hadi Salim 980816: ECN support
15	*/	15	*/
16		16
17	#include <linux/config.h>	17	#include <linux/config.h>
18	#include <linux/module.h>	18	#include <linux/module.h>
19	#include <asm/uaccess.h>
20	#include <asm/system.h>
21	#include <linux/bitops.h>
22	#include <linux/types.h>	19	#include <linux/types.h>
23	#include <linux/kernel.h>	20	#include <linux/kernel.h>
24	#include <linux/sched.h>
25	#include <linux/string.h>
26	#include <linux/mm.h>
27	#include <linux/socket.h>
28	#include <linux/sockios.h>
29	#include <linux/in.h>
30	#include <linux/errno.h>
31	#include <linux/interrupt.h>
32	#include <linux/if_ether.h>
33	#include <linux/inet.h>
34	#include <linux/netdevice.h>	21	#include <linux/netdevice.h>
35	#include <linux/etherdevice.h>
36	#include <linux/notifier.h>
37	#include <net/ip.h>
38	#include <net/route.h>
39	#include <linux/skbuff.h>	22	#include <linux/skbuff.h>
40	#include <net/sock.h>
41	#include <net/pkt_sched.h>	23	#include <net/pkt_sched.h>
42	#include <net/inet_ecn.h>	24	#include <net/inet_ecn.h>
43	#include <net/dsfield.h>	25	#include <net/red.h>
44		26
45		27
46	/* Random Early Detection (RED) algorithm.	28	/* Parameters, settable by user:
47	=======================================
48
49	Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
50	for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
51
52	This file codes a "divisionless" version of RED algorithm
53	as written down in Fig.17 of the paper.
54
55	Short description.
56	------------------
57
58	When a new packet arrives we calculate the average queue length:
59
60	avg = (1-W)avg + Wcurrent_queue_len,
61
62	W is the filter time constant (chosen as 2^(-Wlog)), it controls
63	the inertia of the algorithm. To allow larger bursts, W should be
64	decreased.
65
66	if (avg > th_max) -> packet marked (dropped).
67	if (avg < th_min) -> packet passes.
68	if (th_min < avg < th_max) we calculate probability:
69
70	Pb = max_P * (avg - th_min)/(th_max-th_min)
71
72	and mark (drop) packet with this probability.
73	Pb changes from 0 (at avg==th_min) to max_P (avg==th_max).
74	max_P should be small (not 1), usually 0.01..0.02 is good value.
75
76	max_P is chosen as a number, so that max_P/(th_max-th_min)
77	is a negative power of two in order arithmetics to contain
78	only shifts.
79
80
81	Parameters, settable by user:
82	-----------------------------	29	-----------------------------
83		30
84	limit - bytes (must be > qth_max + burst)	31	limit - bytes (must be > qth_max + burst)
@@ -89,243 +36,93 @@ Short description.
89	arbitrarily high (well, less than ram size)	36	arbitrarily high (well, less than ram size)
90	Really, this limit will never be reached	37	Really, this limit will never be reached
91	if RED works correctly.	38	if RED works correctly.
92
93	qth_min - bytes (should be < qth_max/2)
94	qth_max - bytes (should be at least 2*qth_min and less limit)
95	Wlog - bits (<32) log(1/W).
96	Plog - bits (<32)
97
98	Plog is related to max_P by formula:
99
100	max_P = (qth_max-qth_min)/2^Plog;
101
102	F.e. if qth_max=128K and qth_min=32K, then Plog=22
103	corresponds to max_P=0.02
104
105	Scell_log
106	Stab
107
108	Lookup table for log((1-W)^(t/t_ave).
109
110
111	NOTES:
112
113	Upper bound on W.
114	-----------------
115
116	If you want to allow bursts of L packets of size S,
117	you should choose W:
118
119	L + 1 - th_min/S < (1-(1-W)^L)/W
120
121	th_min/S = 32 th_min/S = 4
122
123	log(W) L
124	-1 33
125	-2 35
126	-3 39
127	-4 46
128	-5 57
129	-6 75
130	-7 101
131	-8 135
132	-9 190
133	etc.
134	*/	39	*/
135		40
136	struct red_sched_data	41	struct red_sched_data
137	{	42	{
138	/* Parameters */	43	u32 limit; /* HARD maximal queue length */
139	u32 limit; /* HARD maximal queue length */	44	unsigned char flags;
140	u32 qth_min; /* Min average length threshold: A scaled */	45	struct red_parms parms;
141	u32 qth_max; /* Max average length threshold: A scaled */	46	struct red_stats stats;
142	u32 Rmask;
143	u32 Scell_max;
144	unsigned char flags;
145	char Wlog; /* log(W) */
146	char Plog; /* random number bits */
147	char Scell_log;
148	u8 Stab[256];
149
150	/* Variables */
151	unsigned long qave; /* Average queue length: A scaled */
152	int qcount; /* Packets since last random number generation */
153	u32 qR; /* Cached random number */
154
155	psched_time_t qidlestart; /* Start of idle period */
156	struct tc_red_xstats st;
157	};	47	};
158		48
159	static int red_ecn_mark(struct sk_buff *skb)	49	static inline int red_use_ecn(struct red_sched_data *q)
160	{	50	{
161	if (skb->nh.raw + 20 > skb->tail)	51	return q->flags & TC_RED_ECN;
162	return 0;
163
164	switch (skb->protocol) {
165	case __constant_htons(ETH_P_IP):
166	if (INET_ECN_is_not_ect(skb->nh.iph->tos))
167	return 0;
168	IP_ECN_set_ce(skb->nh.iph);
169	return 1;
170	case __constant_htons(ETH_P_IPV6):
171	if (INET_ECN_is_not_ect(ipv6_get_dsfield(skb->nh.ipv6h)))
172	return 0;
173	IP6_ECN_set_ce(skb->nh.ipv6h);
174	return 1;
175	default:
176	return 0;
177	}
178	}	52	}
179		53
180	static int	54	static inline int red_use_harddrop(struct red_sched_data *q)
181	red_enqueue(struct sk_buff skb, struct Qdisc sch)	55	{
		56	return q->flags & TC_RED_HARDDROP;
		57	}
		58
		59	static int red_enqueue(struct sk_buff skb, struct Qdisc sch)
182	{	60	{
183	struct red_sched_data *q = qdisc_priv(sch);	61	struct red_sched_data *q = qdisc_priv(sch);
184		62
185	psched_time_t now;	63	q->parms.qavg = red_calc_qavg(&q->parms, sch->qstats.backlog);
186		64
187	if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {	65	if (red_is_idling(&q->parms))
188	long us_idle;	66	red_end_of_idle_period(&q->parms);
189	int shift;
190		67
191	PSCHED_GET_TIME(now);	68	switch (red_action(&q->parms, q->parms.qavg)) {
192	us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);	69	case RED_DONT_MARK:
193	PSCHED_SET_PASTPERFECT(q->qidlestart);	70	break;
194		71
195	/*	72	case RED_PROB_MARK:
196	The problem: ideally, average length queue recalcultion should	73	sch->qstats.overlimits++;
197	be done over constant clock intervals. This is too expensive, so that	74	if (!red_use_ecn(q) \|\| !INET_ECN_set_ce(skb)) {
198	the calculation is driven by outgoing packets.	75	q->stats.prob_drop++;
199	When the queue is idle we have to model this clock by hand.	76	goto congestion_drop;
200		77	}
201	SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
202	dummy packets as a burst after idle time, i.e.
203
204	q->qave *= (1-W)^m
205
206	This is an apparently overcomplicated solution (f.e. we have to precompute
207	a table to make this calculation in reasonable time)
208	I believe that a simpler model may be used here,
209	but it is field for experiments.
210	*/
211	shift = q->Stab[us_idle>>q->Scell_log];
212
213	if (shift) {
214	q->qave >>= shift;
215	} else {
216	/* Approximate initial part of exponent
217	with linear function:
218	(1-W)^m ~= 1-mW + ...
219
220	Seems, it is the best solution to
221	problem of too coarce exponent tabulation.
222	*/
223
224	us_idle = (q->qave * us_idle)>>q->Scell_log;
225	if (us_idle < q->qave/2)
226	q->qave -= us_idle;
227	else
228	q->qave >>= 1;
229	}
230	} else {
231	q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
232	/* NOTE:
233	q->qave is fixed point number with point at Wlog.
234	The formulae above is equvalent to floating point
235	version:
236
237	qave = qave(1-W) + sch->qstats.backlogW;
238	--ANK (980924)
239	*/
240	}
241		78
242	if (q->qave < q->qth_min) {	79	q->stats.prob_mark++;
243	q->qcount = -1;	80	break;
244	enqueue:	81
245	if (sch->qstats.backlog + skb->len <= q->limit) {	82	case RED_HARD_MARK:
246	__skb_queue_tail(&sch->q, skb);	83	sch->qstats.overlimits++;
247	sch->qstats.backlog += skb->len;	84	if (red_use_harddrop(q) \|\| !red_use_ecn(q) \|\|
248	sch->bstats.bytes += skb->len;	85	!INET_ECN_set_ce(skb)) {
249	sch->bstats.packets++;	86	q->stats.forced_drop++;
250	return NET_XMIT_SUCCESS;	87	goto congestion_drop;
251	} else {	88	}
252	q->st.pdrop++;
253	}
254	kfree_skb(skb);
255	sch->qstats.drops++;
256	return NET_XMIT_DROP;
257	}
258	if (q->qave >= q->qth_max) {
259	q->qcount = -1;
260	sch->qstats.overlimits++;
261	mark:
262	if (!(q->flags&TC_RED_ECN) \|\| !red_ecn_mark(skb)) {
263	q->st.early++;
264	goto drop;
265	}
266	q->st.marked++;
267	goto enqueue;
268	}
269		89
270	if (++q->qcount) {	90	q->stats.forced_mark++;
271	/* The formula used below causes questions.	91	break;
272
273	OK. qR is random number in the interval 0..Rmask
274	i.e. 0..(2^Plog). If we used floating point
275	arithmetics, it would be: (2^Plog)*rnd_num,
276	where rnd_num is less 1.
277
278	Taking into account, that qave have fixed
279	point at Wlog, and Plog is related to max_P by
280	max_P = (qth_max-qth_min)/2^Plog; two lines
281	below have the following floating point equivalent:
282
283	max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount
284
285	Any questions? --ANK (980924)
286	*/
287	if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
288	goto enqueue;
289	q->qcount = 0;
290	q->qR = net_random()&q->Rmask;
291	sch->qstats.overlimits++;
292	goto mark;
293	}	92	}
294	q->qR = net_random()&q->Rmask;
295	goto enqueue;
296		93
297	drop:	94	if (sch->qstats.backlog + skb->len <= q->limit)
298	kfree_skb(skb);	95	return qdisc_enqueue_tail(skb, sch);
299	sch->qstats.drops++;	96
		97	q->stats.pdrop++;
		98	return qdisc_drop(skb, sch);
		99
		100	congestion_drop:
		101	qdisc_drop(skb, sch);
300	return NET_XMIT_CN;	102	return NET_XMIT_CN;
301	}	103	}
302		104
303	static int	105	static int red_requeue(struct sk_buff skb, struct Qdisc sch)
304	red_requeue(struct sk_buff skb, struct Qdisc sch)
305	{	106	{
306	struct red_sched_data *q = qdisc_priv(sch);	107	struct red_sched_data *q = qdisc_priv(sch);
307		108
308	PSCHED_SET_PASTPERFECT(q->qidlestart);	109	if (red_is_idling(&q->parms))
		110	red_end_of_idle_period(&q->parms);
309		111
310	__skb_queue_head(&sch->q, skb);	112	return qdisc_requeue(skb, sch);
311	sch->qstats.backlog += skb->len;
312	sch->qstats.requeues++;
313	return 0;
314	}	113	}
315		114
316	static struct sk_buff *	115	static struct sk_buff * red_dequeue(struct Qdisc* sch)
317	red_dequeue(struct Qdisc* sch)
318	{	116	{
319	struct sk_buff *skb;	117	struct sk_buff *skb;
320	struct red_sched_data *q = qdisc_priv(sch);	118	struct red_sched_data *q = qdisc_priv(sch);
321		119
322	skb = __skb_dequeue(&sch->q);	120	skb = qdisc_dequeue_head(sch);
323	if (skb) {	121
324	sch->qstats.backlog -= skb->len;	122	if (skb == NULL && !red_is_idling(&q->parms))
325	return skb;	123	red_start_of_idle_period(&q->parms);
326	}	124
327	PSCHED_GET_TIME(q->qidlestart);	125	return skb;
328	return NULL;
329	}	126	}
330		127
331	static unsigned int red_drop(struct Qdisc* sch)	128	static unsigned int red_drop(struct Qdisc* sch)
@@ -333,16 +130,17 @@ static unsigned int red_drop(struct Qdisc* sch)
333	struct sk_buff *skb;	130	struct sk_buff *skb;
334	struct red_sched_data *q = qdisc_priv(sch);	131	struct red_sched_data *q = qdisc_priv(sch);
335		132
336	skb = __skb_dequeue_tail(&sch->q);	133	skb = qdisc_dequeue_tail(sch);
337	if (skb) {	134	if (skb) {
338	unsigned int len = skb->len;	135	unsigned int len = skb->len;
339	sch->qstats.backlog -= len;	136	q->stats.other++;
340	sch->qstats.drops++;	137	qdisc_drop(skb, sch);
341	q->st.other++;
342	kfree_skb(skb);
343	return len;	138	return len;
344	}	139	}
345	PSCHED_GET_TIME(q->qidlestart);	140
		141	if (!red_is_idling(&q->parms))
		142	red_start_of_idle_period(&q->parms);
		143
346	return 0;	144	return 0;
347	}	145	}
348		146
@@ -350,43 +148,38 @@ static void red_reset(struct Qdisc* sch)
350	{	148	{
351	struct red_sched_data *q = qdisc_priv(sch);	149	struct red_sched_data *q = qdisc_priv(sch);
352		150
353	__skb_queue_purge(&sch->q);	151	qdisc_reset_queue(sch);
354	sch->qstats.backlog = 0;	152	red_restart(&q->parms);
355	PSCHED_SET_PASTPERFECT(q->qidlestart);
356	q->qave = 0;
357	q->qcount = -1;
358	}	153	}
359		154
360	static int red_change(struct Qdisc sch, struct rtattr opt)	155	static int red_change(struct Qdisc sch, struct rtattr opt)
361	{	156	{
362	struct red_sched_data *q = qdisc_priv(sch);	157	struct red_sched_data *q = qdisc_priv(sch);
363	struct rtattr *tb[TCA_RED_STAB];	158	struct rtattr *tb[TCA_RED_MAX];
364	struct tc_red_qopt *ctl;	159	struct tc_red_qopt *ctl;
365		160
366	if (opt == NULL \|\|	161	if (opt == NULL \|\| rtattr_parse_nested(tb, TCA_RED_MAX, opt))
367	rtattr_parse_nested(tb, TCA_RED_STAB, opt) \|\|	162	return -EINVAL;
368	tb[TCA_RED_PARMS-1] == 0 \|\| tb[TCA_RED_STAB-1] == 0 \|\|	163
		164	if (tb[TCA_RED_PARMS-1] == NULL \|\|
369	RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) \|\|	165	RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) \|\|
370	RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256)	166	tb[TCA_RED_STAB-1] == NULL \|\|
		167	RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < RED_STAB_SIZE)
371	return -EINVAL;	168	return -EINVAL;
372		169
373	ctl = RTA_DATA(tb[TCA_RED_PARMS-1]);	170	ctl = RTA_DATA(tb[TCA_RED_PARMS-1]);
374		171
375	sch_tree_lock(sch);	172	sch_tree_lock(sch);
376	q->flags = ctl->flags;	173	q->flags = ctl->flags;
377	q->Wlog = ctl->Wlog;
378	q->Plog = ctl->Plog;
379	q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
380	q->Scell_log = ctl->Scell_log;
381	q->Scell_max = (255<<q->Scell_log);
382	q->qth_min = ctl->qth_min<<ctl->Wlog;
383	q->qth_max = ctl->qth_max<<ctl->Wlog;
384	q->limit = ctl->limit;	174	q->limit = ctl->limit;
385	memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
386		175
387	q->qcount = -1;	176	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
		177	ctl->Plog, ctl->Scell_log,
		178	RTA_DATA(tb[TCA_RED_STAB-1]));
		179
388	if (skb_queue_empty(&sch->q))	180	if (skb_queue_empty(&sch->q))
389	PSCHED_SET_PASTPERFECT(q->qidlestart);	181	red_end_of_idle_period(&q->parms);
		182
390	sch_tree_unlock(sch);	183	sch_tree_unlock(sch);
391	return 0;	184	return 0;
392	}	185	}
@@ -399,39 +192,39 @@ static int red_init(struct Qdisc* sch, struct rtattr *opt)
399	static int red_dump(struct Qdisc sch, struct sk_buff skb)	192	static int red_dump(struct Qdisc sch, struct sk_buff skb)
400	{	193	{
401	struct red_sched_data *q = qdisc_priv(sch);	194	struct red_sched_data *q = qdisc_priv(sch);
402	unsigned char *b = skb->tail;	195	struct rtattr *opts = NULL;
403	struct rtattr *rta;	196	struct tc_red_qopt opt = {
404	struct tc_red_qopt opt;	197	.limit = q->limit,
405		198	.flags = q->flags,
406	rta = (struct rtattr*)b;	199	.qth_min = q->parms.qth_min >> q->parms.Wlog,
407	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);	200	.qth_max = q->parms.qth_max >> q->parms.Wlog,
408	opt.limit = q->limit;	201	.Wlog = q->parms.Wlog,
409	opt.qth_min = q->qth_min>>q->Wlog;	202	.Plog = q->parms.Plog,
410	opt.qth_max = q->qth_max>>q->Wlog;	203	.Scell_log = q->parms.Scell_log,
411	opt.Wlog = q->Wlog;	204	};
412	opt.Plog = q->Plog;	205
413	opt.Scell_log = q->Scell_log;	206	opts = RTA_NEST(skb, TCA_OPTIONS);
414	opt.flags = q->flags;
415	RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);	207	RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
416	rta->rta_len = skb->tail - b;	208	return RTA_NEST_END(skb, opts);
417
418	return skb->len;
419		209
420	rtattr_failure:	210	rtattr_failure:
421	skb_trim(skb, b - skb->data);	211	return RTA_NEST_CANCEL(skb, opts);
422	return -1;
423	}	212	}
424		213
425	static int red_dump_stats(struct Qdisc sch, struct gnet_dump d)	214	static int red_dump_stats(struct Qdisc sch, struct gnet_dump d)
426	{	215	{
427	struct red_sched_data *q = qdisc_priv(sch);	216	struct red_sched_data *q = qdisc_priv(sch);
428		217	struct tc_red_xstats st = {
429	return gnet_stats_copy_app(d, &q->st, sizeof(q->st));	218	.early = q->stats.prob_drop + q->stats.forced_drop,
		219	.pdrop = q->stats.pdrop,
		220	.other = q->stats.other,
		221	.marked = q->stats.prob_mark + q->stats.forced_mark,
		222	};
		223
		224	return gnet_stats_copy_app(d, &st, sizeof(st));
430	}	225	}
431		226
432	static struct Qdisc_ops red_qdisc_ops = {	227	static struct Qdisc_ops red_qdisc_ops = {
433	.next = NULL,
434	.cl_ops = NULL,
435	.id = "red",	228	.id = "red",
436	.priv_size = sizeof(struct red_sched_data),	229	.priv_size = sizeof(struct red_sched_data),
437	.enqueue = red_enqueue,	230	.enqueue = red_enqueue,
@@ -450,10 +243,13 @@ static int __init red_module_init(void)
450	{	243	{
451	return register_qdisc(&red_qdisc_ops);	244	return register_qdisc(&red_qdisc_ops);
452	}	245	}
453	static void __exit red_module_exit(void)	246
		247	static void __exit red_module_exit(void)
454	{	248	{
455	unregister_qdisc(&red_qdisc_ops);	249	unregister_qdisc(&red_qdisc_ops);
456	}	250	}
		251
457	module_init(red_module_init)	252	module_init(red_module_init)
458	module_exit(red_module_exit)	253	module_exit(red_module_exit)
		254
459	MODULE_LICENSE("GPL");	255	MODULE_LICENSE("GPL");