net_sched: SFB flow scheduler

This is the Stochastic Fair Blue scheduler, based on work from : W. Feng, D. Kandlur, D. Saha, K. Shin. Blue: A New Class of Active Queue Management Algorithms. U. Michigan CSE-TR-387-99, April 1999. http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf This implementation is based on work done by Juliusz Chroboczek General SFB algorithm can be found in figure 14, page 15: B[l][n] : L x N array of bins (L levels, N bins per level) enqueue() Calculate hash function values h{0}, h{1}, .. h{L-1} Update bins at each level for i = 0 to L - 1 if (B[i][h{i}].qlen > bin_size) B[i][h{i}].p_mark += p_increment; else if (B[i][h{i}].qlen == 0) B[i][h{i}].p_mark -= p_decrement; p_min = min(B[0][h{0}].p_mark ... B[L-1][h{L-1}].p_mark); if (p_min == 1.0) ratelimit(); else mark/drop with probabilty p_min; I did the adaptation of Juliusz code to meet current kernel standards, and various changes to address previous comments : http://thread.gmane.org/gmane.linux.network/90225 http://thread.gmane.org/gmane.linux.network/90375 Default flow classifier is the rxhash introduced by RPS in 2.6.35, but we can use an external flow classifier if wanted. tc qdisc add dev $DEV parent 1:11 handle 11: \ est 0.5sec 2sec sfb limit 128 tc filter add dev $DEV protocol ip parent 11: handle 3 \ flow hash keys dst divisor 1024 Notes: 1) SFB default child qdisc is pfifo_fast. It can be changed by another qdisc but a child qdisc MUST not drop a packet previously queued. This is because SFB needs to handle a dequeued packet in order to maintain its virtual queue states. pfifo_head_drop or CHOKe should not be used. 2) ECN is enabled by default, unlike RED/CHOKe/GRED With help from Patrick McHardy & Andi Kleen Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> CC: Juliusz Chroboczek <Juliusz.Chroboczek@pps.jussieu.fr> CC: Stephen Hemminger <shemminger@vyatta.com> CC: Patrick McHardy <kaber@trash.net> CC: Andi Kleen <andi@firstfloor.org> CC: John W. Linville <linville@tuxdriver.com> Signed-off-by: David S. Miller <davem@davemloft.net>
author: Eric Dumazet <eric.dumazet@gmail.com> 2011-02-23 05:56:17 -0500
committer: David S. Miller <davem@davemloft.net> 2011-02-23 17:05:11 -0500
commit: e13e02a3c68d899169c78d9a18689bd73491d59a (patch)
tree: 6e6b40ef37261df391cd445ec0f1b3d538b23a47 /net/sched
parent: dee9f4bceb5fd9dbfcc1567148fccdbf16d6a38a (diff)
3 files changed, 721 insertions, 0 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 8c19b6e3201e..a7a5583d4f68 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -126,6 +126,17 @@ config NET_SCH_RED
          To compile this code as a module, choose M here: the
          module will be called sch_red.
+config NET_SCH_SFB
+        tristate "Stochastic Fair Blue (SFB)"
+        ---help---
+          Say Y here if you want to use the Stochastic Fair Blue (SFB)
+          packet scheduling algorithm.
+          See the top of <file:net/sched/sch_sfb.c> for more details.
+          To compile this code as a module, choose M here: the
+          module will be called sch_sfb.
 config NET_SCH_SFQ
        tristate "Stochastic Fairness Queueing (SFQ)"
        ---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 06c6cdfd1948..2e77b8dba22e 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_NET_SCH_RED)	+= sch_red.o
 obj-$(CONFIG_NET_SCH_GRED)      += sch_gred.o
 obj-$(CONFIG_NET_SCH_INGRESS)   += sch_ingress.o 
 obj-$(CONFIG_NET_SCH_DSMARK)    += sch_dsmark.o
+obj-$(CONFIG_NET_SCH_SFB)       += sch_sfb.o
 obj-$(CONFIG_NET_SCH_SFQ)       += sch_sfq.o
 obj-$(CONFIG_NET_SCH_TBF)       += sch_tbf.o
 obj-$(CONFIG_NET_SCH_TEQL)      += sch_teql.o
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
new file mode 100644
index 000000000000..0a833d0c1f61
--- /dev/null
+++ b/net/sched/sch_sfb.c
@@ -0,0 +1,709 @@
+/*
+ * net/sched/sch_sfb.c    Stochastic Fair Blue
+ *
+ * Copyright (c) 2008-2011 Juliusz Chroboczek <jch@pps.jussieu.fr>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * W. Feng, D. Kandlur, D. Saha, K. Shin. Blue:
+ * A New Class of Active Queue Management Algorithms.
+ * U. Michigan CSE-TR-387-99, April 1999.
+ *
+ * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
+ *
+ */
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <net/ip.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+/*
+ * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level)
+ * This implementation uses L = 8 and N = 16
+ * This permits us to split one 32bit hash (provided per packet by rxhash or
+ * external classifier) into 8 subhashes of 4 bits.
+ */
+#define SFB_BUCKET_SHIFT 4
+#define SFB_NUMBUCKETS  (1 << SFB_BUCKET_SHIFT) /* N bins per Level */
+#define SFB_BUCKET_MASK (SFB_NUMBUCKETS - 1)
+#define SFB_LEVELS      (32 / SFB_BUCKET_SHIFT) /* L */
+/* SFB algo uses a virtual queue, named "bin" */
+struct sfb_bucket {
+        u16             qlen; /* length of virtual queue */
+        u16             p_mark; /* marking probability */
+};
+/* We use a double buffering right before hash change
+ * (Section 4.4 of SFB reference : moving hash functions)
+ */
+struct sfb_bins {
+        u32               perturbation; /* jhash perturbation */
+        struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS];
+};
+struct sfb_sched_data {
+        struct Qdisc    *qdisc;
+        struct tcf_proto *filter_list;
+        unsigned long   rehash_interval;
+        unsigned long   warmup_time;    /* double buffering warmup time in jiffies */
+        u32             max;
+        u32             bin_size;       /* maximum queue length per bin */
+        u32             increment;      /* d1 */
+        u32             decrement;      /* d2 */
+        u32             limit;          /* HARD maximal queue length */
+        u32             penalty_rate;
+        u32             penalty_burst;
+        u32             tokens_avail;
+        unsigned long   rehash_time;
+        unsigned long   token_time;
+        u8              slot;           /* current active bins (0 or 1) */
+        bool            double_buffering;
+        struct sfb_bins bins[2];
+        struct {
+                u32     earlydrop;
+                u32     penaltydrop;
+                u32     bucketdrop;
+                u32     queuedrop;
+                u32     childdrop;      /* drops in child qdisc */
+                u32     marked;         /* ECN mark */
+        } stats;
+};
+/*
+ * Each queued skb might be hashed on one or two bins
+ * We store in skb_cb the two hash values.
+ * (A zero value means double buffering was not used)
+ */
+struct sfb_skb_cb {
+        u32 hashes[2];
+};
+static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb)
+{
+        BUILD_BUG_ON(sizeof(skb->cb) <
+                sizeof(struct qdisc_skb_cb) + sizeof(struct sfb_skb_cb));
+        return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+/*
+ * If using 'internal' SFB flow classifier, hash comes from skb rxhash
+ * If using external classifier, hash comes from the classid.
+ */
+static u32 sfb_hash(const struct sk_buff *skb, u32 slot)
+{
+        return sfb_skb_cb(skb)->hashes[slot];
+}
+/* Probabilities are coded as Q0.16 fixed-point values,
+ * with 0xFFFF representing 65535/65536 (almost 1.0)
+ * Addition and subtraction are saturating in [0, 65535]
+ */
+static u32 prob_plus(u32 p1, u32 p2)
+{
+        u32 res = p1 + p2;
+        return min_t(u32, res, SFB_MAX_PROB);
+}
+static u32 prob_minus(u32 p1, u32 p2)
+{
+        return p1 > p2 ? p1 - p2 : 0;
+}
+static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q)
+{
+        int i;
+        struct sfb_bucket *b = &q->bins[slot].bins[0][0];
+        for (i = 0; i < SFB_LEVELS; i++) {
+                u32 hash = sfbhash & SFB_BUCKET_MASK;
+                sfbhash >>= SFB_BUCKET_SHIFT;
+                if (b[hash].qlen < 0xFFFF)
+                        b[hash].qlen++;
+                b += SFB_NUMBUCKETS; /* next level */
+        }
+}
+static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
+{
+        u32 sfbhash;
+        sfbhash = sfb_hash(skb, 0);
+        if (sfbhash)
+                increment_one_qlen(sfbhash, 0, q);
+        sfbhash = sfb_hash(skb, 1);
+        if (sfbhash)
+                increment_one_qlen(sfbhash, 1, q);
+}
+static void decrement_one_qlen(u32 sfbhash, u32 slot,
+                               struct sfb_sched_data *q)
+{
+        int i;
+        struct sfb_bucket *b = &q->bins[slot].bins[0][0];
+        for (i = 0; i < SFB_LEVELS; i++) {
+                u32 hash = sfbhash & SFB_BUCKET_MASK;
+                sfbhash >>= SFB_BUCKET_SHIFT;
+                if (b[hash].qlen > 0)
+                        b[hash].qlen--;
+                b += SFB_NUMBUCKETS; /* next level */
+        }
+}
+static void decrement_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
+{
+        u32 sfbhash;
+        sfbhash = sfb_hash(skb, 0);
+        if (sfbhash)
+                decrement_one_qlen(sfbhash, 0, q);
+        sfbhash = sfb_hash(skb, 1);
+        if (sfbhash)
+                decrement_one_qlen(sfbhash, 1, q);
+}
+static void decrement_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
+{
+        b->p_mark = prob_minus(b->p_mark, q->decrement);
+}
+static void increment_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
+{
+        b->p_mark = prob_plus(b->p_mark, q->increment);
+}
+static void sfb_zero_all_buckets(struct sfb_sched_data *q)
+{
+        memset(&q->bins, 0, sizeof(q->bins));
+}
+/*
+ * compute max qlen, max p_mark, and avg p_mark
+ */
+static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_data *q)
+{
+        int i;
+        u32 qlen = 0, prob = 0, totalpm = 0;
+        const struct sfb_bucket *b = &q->bins[q->slot].bins[0][0];
+        for (i = 0; i < SFB_LEVELS * SFB_NUMBUCKETS; i++) {
+                if (qlen < b->qlen)
+                        qlen = b->qlen;
+                totalpm += b->p_mark;
+                if (prob < b->p_mark)
+                        prob = b->p_mark;
+                b++;
+        }
+        *prob_r = prob;
+        *avgpm_r = totalpm / (SFB_LEVELS * SFB_NUMBUCKETS);
+        return qlen;
+}
+static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q)
+{
+        q->bins[slot].perturbation = net_random();
+}
+static void sfb_swap_slot(struct sfb_sched_data *q)
+{
+        sfb_init_perturbation(q->slot, q);
+        q->slot ^= 1;
+        q->double_buffering = false;
+}
+/* Non elastic flows are allowed to use part of the bandwidth, expressed
+ * in "penalty_rate" packets per second, with "penalty_burst" burst
+ */
+static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q)
+{
+        if (q->penalty_rate == 0 || q->penalty_burst == 0)
+                return true;
+        if (q->tokens_avail < 1) {
+                unsigned long age = min(10UL * HZ, jiffies - q->token_time);
+                q->tokens_avail = (age * q->penalty_rate) / HZ;
+                if (q->tokens_avail > q->penalty_burst)
+                        q->tokens_avail = q->penalty_burst;
+                q->token_time = jiffies;
+                if (q->tokens_avail < 1)
+                        return true;
+        }
+        q->tokens_avail--;
+        return false;
+}
+static bool sfb_classify(struct sk_buff *skb, struct sfb_sched_data *q,
+                         int *qerr, u32 *salt)
+{
+        struct tcf_result res;
+        int result;
+        result = tc_classify(skb, q->filter_list, &res);
+        if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+                switch (result) {
+                case TC_ACT_STOLEN:
+                case TC_ACT_QUEUED:
+                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+                case TC_ACT_SHOT:
+                        return false;
+                }
+#endif
+                *salt = TC_H_MIN(res.classid);
+                return true;
+        }
+        return false;
+}
+static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+        struct sfb_sched_data *q = qdisc_priv(sch);
+        struct Qdisc *child = q->qdisc;
+        int i;
+        u32 p_min = ~0;
+        u32 minqlen = ~0;
+        u32 r, slot, salt, sfbhash;
+        int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+        if (q->rehash_interval > 0) {
+                unsigned long limit = q->rehash_time + q->rehash_interval;
+                if (unlikely(time_after(jiffies, limit))) {
+                        sfb_swap_slot(q);
+                        q->rehash_time = jiffies;
+                } else if (unlikely(!q->double_buffering && q->warmup_time > 0 &&
+                                    time_after(jiffies, limit - q->warmup_time))) {
+                        q->double_buffering = true;
+                }
+        }
+        if (q->filter_list) {
+                /* If using external classifiers, get result and record it. */
+                if (!sfb_classify(skb, q, &ret, &salt))
+                        goto other_drop;
+        } else {
+                salt = skb_get_rxhash(skb);
+        }
+        slot = q->slot;
+        sfbhash = jhash_1word(salt, q->bins[slot].perturbation);
+        if (!sfbhash)
+                sfbhash = 1;
+        sfb_skb_cb(skb)->hashes[slot] = sfbhash;
+        for (i = 0; i < SFB_LEVELS; i++) {
+                u32 hash = sfbhash & SFB_BUCKET_MASK;
+                struct sfb_bucket *b = &q->bins[slot].bins[i][hash];
+                sfbhash >>= SFB_BUCKET_SHIFT;
+                if (b->qlen == 0)
+                        decrement_prob(b, q);
+                else if (b->qlen >= q->bin_size)
+                        increment_prob(b, q);
+                if (minqlen > b->qlen)
+                        minqlen = b->qlen;
+                if (p_min > b->p_mark)
+                        p_min = b->p_mark;
+        }
+        slot ^= 1;
+        sfb_skb_cb(skb)->hashes[slot] = 0;
+        if (unlikely(minqlen >= q->max || sch->q.qlen >= q->limit)) {
+                sch->qstats.overlimits++;
+                if (minqlen >= q->max)
+                        q->stats.bucketdrop++;
+                else
+                        q->stats.queuedrop++;
+                goto drop;
+        }
+        if (unlikely(p_min >= SFB_MAX_PROB)) {
+                /* Inelastic flow */
+                if (q->double_buffering) {
+                        sfbhash = jhash_1word(salt, q->bins[slot].perturbation);
+                        if (!sfbhash)
+                                sfbhash = 1;
+                        sfb_skb_cb(skb)->hashes[slot] = sfbhash;
+                        for (i = 0; i < SFB_LEVELS; i++) {
+                                u32 hash = sfbhash & SFB_BUCKET_MASK;
+                                struct sfb_bucket *b = &q->bins[slot].bins[i][hash];
+                                sfbhash >>= SFB_BUCKET_SHIFT;
+                                if (b->qlen == 0)
+                                        decrement_prob(b, q);
+                                else if (b->qlen >= q->bin_size)
+                                        increment_prob(b, q);
+                        }
+                }
+                if (sfb_rate_limit(skb, q)) {
+                        sch->qstats.overlimits++;
+                        q->stats.penaltydrop++;
+                        goto drop;
+                }
+                goto enqueue;
+        }
+        r = net_random() & SFB_MAX_PROB;
+        if (unlikely(r < p_min)) {
+                if (unlikely(p_min > SFB_MAX_PROB / 2)) {
+                        /* If we're marking that many packets, then either
+                         * this flow is unresponsive, or we're badly congested.
+                         * In either case, we want to start dropping packets.
+                         */
+                        if (r < (p_min - SFB_MAX_PROB / 2) * 2) {
+                                q->stats.earlydrop++;
+                                goto drop;
+                        }
+                }
+                if (INET_ECN_set_ce(skb)) {
+                        q->stats.marked++;
+                } else {
+                        q->stats.earlydrop++;
+                        goto drop;
+                }
+        }
+enqueue:
+        ret = qdisc_enqueue(skb, child);
+        if (likely(ret == NET_XMIT_SUCCESS)) {
+                sch->q.qlen++;
+                increment_qlen(skb, q);
+        } else if (net_xmit_drop_count(ret)) {
+                q->stats.childdrop++;
+                sch->qstats.drops++;
+        }
+        return ret;
+drop:
+        qdisc_drop(skb, sch);
+        return NET_XMIT_CN;
+other_drop:
+        if (ret & __NET_XMIT_BYPASS)
+                sch->qstats.drops++;
+        kfree_skb(skb);
+        return ret;
+}
+static struct sk_buff *sfb_dequeue(struct Qdisc *sch)
+{
+        struct sfb_sched_data *q = qdisc_priv(sch);
+        struct Qdisc *child = q->qdisc;
+        struct sk_buff *skb;
+        skb = child->dequeue(q->qdisc);
+        if (skb) {
+                qdisc_bstats_update(sch, skb);
+                sch->q.qlen--;
+                decrement_qlen(skb, q);
+        }
+        return skb;
+}
+static struct sk_buff *sfb_peek(struct Qdisc *sch)
+{
+        struct sfb_sched_data *q = qdisc_priv(sch);
+        struct Qdisc *child = q->qdisc;
+        return child->ops->peek(child);
+}
+/* No sfb_drop -- impossible since the child doesn't return the dropped skb. */
+static void sfb_reset(struct Qdisc *sch)
+{
+        struct sfb_sched_data *q = qdisc_priv(sch);
+        qdisc_reset(q->qdisc);
+        sch->q.qlen = 0;
+        q->slot = 0;
+        q->double_buffering = false;
+        sfb_zero_all_buckets(q);
+        sfb_init_perturbation(0, q);
+}
+static void sfb_destroy(struct Qdisc *sch)
+{
+        struct sfb_sched_data *q = qdisc_priv(sch);
+        tcf_destroy_chain(&q->filter_list);
+        qdisc_destroy(q->qdisc);
+}
+static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = {
+        [TCA_SFB_PARMS] = { .len = sizeof(struct tc_sfb_qopt) },
+};
+static const struct tc_sfb_qopt sfb_default_ops = {
+        .rehash_interval = 600 * MSEC_PER_SEC,
+        .warmup_time = 60 * MSEC_PER_SEC,
+        .limit = 0,
+        .max = 25,
+        .bin_size = 20,
+        .increment = (SFB_MAX_PROB + 500) / 1000, /* 0.1 % */
+        .decrement = (SFB_MAX_PROB + 3000) / 6000,
+        .penalty_rate = 10,
+        .penalty_burst = 20,
+};
+static int sfb_change(struct Qdisc *sch, struct nlattr *opt)
+{
+        struct sfb_sched_data *q = qdisc_priv(sch);
+        struct Qdisc *child;
+        struct nlattr *tb[TCA_SFB_MAX + 1];
+        const struct tc_sfb_qopt *ctl = &sfb_default_ops;
+        u32 limit;
+        int err;
+        if (opt) {
+                err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy);
+                if (err < 0)
+                        return -EINVAL;
+                if (tb[TCA_SFB_PARMS] == NULL)
+                        return -EINVAL;
+                ctl = nla_data(tb[TCA_SFB_PARMS]);
+        }
+        limit = ctl->limit;
+        if (limit == 0)
+                limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);
+        child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit);
+        if (IS_ERR(child))
+                return PTR_ERR(child);
+        sch_tree_lock(sch);
+        qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
+        qdisc_destroy(q->qdisc);
+        q->qdisc = child;
+        q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval);
+        q->warmup_time = msecs_to_jiffies(ctl->warmup_time);
+        q->rehash_time = jiffies;
+        q->limit = limit;
+        q->increment = ctl->increment;
+        q->decrement = ctl->decrement;
+        q->max = ctl->max;
+        q->bin_size = ctl->bin_size;
+        q->penalty_rate = ctl->penalty_rate;
+        q->penalty_burst = ctl->penalty_burst;
+        q->tokens_avail = ctl->penalty_burst;
+        q->token_time = jiffies;
+        q->slot = 0;
+        q->double_buffering = false;
+        sfb_zero_all_buckets(q);
+        sfb_init_perturbation(0, q);
+        sfb_init_perturbation(1, q);
+        sch_tree_unlock(sch);
+        return 0;
+}
+static int sfb_init(struct Qdisc *sch, struct nlattr *opt)
+{
+        struct sfb_sched_data *q = qdisc_priv(sch);
+        q->qdisc = &noop_qdisc;
+        return sfb_change(sch, opt);
+}
+static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+        struct sfb_sched_data *q = qdisc_priv(sch);
+        struct nlattr *opts;
+        struct tc_sfb_qopt opt = {
+                .rehash_interval = jiffies_to_msecs(q->rehash_interval),
+                .warmup_time = jiffies_to_msecs(q->warmup_time),
+                .limit = q->limit,
+                .max = q->max,
+                .bin_size = q->bin_size,
+                .increment = q->increment,
+                .decrement = q->decrement,
+                .penalty_rate = q->penalty_rate,
+                .penalty_burst = q->penalty_burst,
+        };
+        sch->qstats.backlog = q->qdisc->qstats.backlog;
+        opts = nla_nest_start(skb, TCA_OPTIONS);
+        NLA_PUT(skb, TCA_SFB_PARMS, sizeof(opt), &opt);
+        return nla_nest_end(skb, opts);
+nla_put_failure:
+        nla_nest_cancel(skb, opts);
+        return -EMSGSIZE;
+}
+static int sfb_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+        struct sfb_sched_data *q = qdisc_priv(sch);
+        struct tc_sfb_xstats st = {
+                .earlydrop = q->stats.earlydrop,
+                .penaltydrop = q->stats.penaltydrop,
+                .bucketdrop = q->stats.bucketdrop,
+                .queuedrop = q->stats.queuedrop,
+                .childdrop = q->stats.childdrop,
+                .marked = q->stats.marked,
+        };
+        st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q);
+        return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+static int sfb_dump_class(struct Qdisc *sch, unsigned long cl,
+                          struct sk_buff *skb, struct tcmsg *tcm)
+{
+        return -ENOSYS;
+}
+static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+                     struct Qdisc **old)
+{
+        struct sfb_sched_data *q = qdisc_priv(sch);
+        if (new == NULL)
+                new = &noop_qdisc;
+        sch_tree_lock(sch);
+        *old = q->qdisc;
+        q->qdisc = new;
+        qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+        qdisc_reset(*old);
+        sch_tree_unlock(sch);
+        return 0;
+}
+static struct Qdisc *sfb_leaf(struct Qdisc *sch, unsigned long arg)
+{
+        struct sfb_sched_data *q = qdisc_priv(sch);
+        return q->qdisc;
+}
+static unsigned long sfb_get(struct Qdisc *sch, u32 classid)
+{
+        return 1;
+}
+static void sfb_put(struct Qdisc *sch, unsigned long arg)
+{
+}
+static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
+                            struct nlattr **tca, unsigned long *arg)
+{
+        return -ENOSYS;
+}
+static int sfb_delete(struct Qdisc *sch, unsigned long cl)
+{
+        return -ENOSYS;
+}
+static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+        if (!walker->stop) {
+                if (walker->count >= walker->skip)
+                        if (walker->fn(sch, 1, walker) < 0) {
+                                walker->stop = 1;
+                                return;
+                        }
+                walker->count++;
+        }
+}
+static struct tcf_proto **sfb_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+        struct sfb_sched_data *q = qdisc_priv(sch);
+        if (cl)
+                return NULL;
+        return &q->filter_list;
+}
+static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent,
+                              u32 classid)
+{
+        return 0;
+}
+static const struct Qdisc_class_ops sfb_class_ops = {
+        .graft          =       sfb_graft,
+        .leaf           =       sfb_leaf,
+        .get            =       sfb_get,
+        .put            =       sfb_put,
+        .change         =       sfb_change_class,
+        .delete         =       sfb_delete,
+        .walk           =       sfb_walk,
+        .tcf_chain      =       sfb_find_tcf,
+        .bind_tcf       =       sfb_bind,
+        .unbind_tcf     =       sfb_put,
+        .dump           =       sfb_dump_class,
+};
+static struct Qdisc_ops sfb_qdisc_ops __read_mostly = {
+        .id             =       "sfb",
+        .priv_size      =       sizeof(struct sfb_sched_data),
+        .cl_ops         =       &sfb_class_ops,
+        .enqueue        =       sfb_enqueue,
+        .dequeue        =       sfb_dequeue,
+        .peek           =       sfb_peek,
+        .init           =       sfb_init,
+        .reset          =       sfb_reset,
+        .destroy        =       sfb_destroy,
+        .change         =       sfb_change,
+        .dump           =       sfb_dump,
+        .dump_stats     =       sfb_dump_stats,
+        .owner          =       THIS_MODULE,
+};
+static int __init sfb_module_init(void)
+{
+        return register_qdisc(&sfb_qdisc_ops);
+}
+static void __exit sfb_module_exit(void)
+{
+        unregister_qdisc(&sfb_qdisc_ops);
+}
+module_init(sfb_module_init)
+module_exit(sfb_module_exit)
+MODULE_DESCRIPTION("Stochastic Fair Blue queue discipline");
+MODULE_AUTHOR("Juliusz Chroboczek");
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("GPL");
author	Eric Dumazet <eric.dumazet@gmail.com>	2011-02-23 05:56:17 -0500
committer	David S. Miller <davem@davemloft.net>	2011-02-23 17:05:11 -0500
commit	e13e02a3c68d899169c78d9a18689bd73491d59a (patch)
tree	6e6b40ef37261df391cd445ec0f1b3d538b23a47 /net/sched
parent	dee9f4bceb5fd9dbfcc1567148fccdbf16d6a38a (diff)

diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 8c19b6e3201e..a7a5583d4f68 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig
@@ -126,6 +126,17 @@ config NET_SCH_RED
126	To compile this code as a module, choose M here: the	126	To compile this code as a module, choose M here: the
127	module will be called sch_red.	127	module will be called sch_red.
128		128
		129	config NET_SCH_SFB
		130	tristate "Stochastic Fair Blue (SFB)"
		131	---help---
		132	Say Y here if you want to use the Stochastic Fair Blue (SFB)
		133	packet scheduling algorithm.
		134
		135	See the top of <file:net/sched/sch_sfb.c> for more details.
		136
		137	To compile this code as a module, choose M here: the
		138	module will be called sch_sfb.
		139
129	config NET_SCH_SFQ	140	config NET_SCH_SFQ
130	tristate "Stochastic Fairness Queueing (SFQ)"	141	tristate "Stochastic Fairness Queueing (SFQ)"
131	---help---	142	---help---


diff --git a/net/sched/Makefile b/net/sched/Makefile index 06c6cdfd1948..2e77b8dba22e 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_NET_SCH_RED) += sch_red.o
24	obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o	24	obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o
25	obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o	25	obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o
26	obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o	26	obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o
		27	obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o
27	obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o	28	obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o
28	obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o	29	obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o
29	obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o	30	obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o


diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c new file mode 100644 index 000000000000..0a833d0c1f61 --- /dev/null +++ b/net/sched/sch_sfb.c
@@ -0,0 +1,709 @@
		1	/*
		2	* net/sched/sch_sfb.c Stochastic Fair Blue
		3	*
		4	* Copyright (c) 2008-2011 Juliusz Chroboczek <jch@pps.jussieu.fr>
		5	* Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
		6	*
		7	* This program is free software; you can redistribute it and/or
		8	* modify it under the terms of the GNU General Public License
		9	* version 2 as published by the Free Software Foundation.
		10	*
		11	* W. Feng, D. Kandlur, D. Saha, K. Shin. Blue:
		12	* A New Class of Active Queue Management Algorithms.
		13	* U. Michigan CSE-TR-387-99, April 1999.
		14	*
		15	* http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
		16	*
		17	*/
		18
		19	#include <linux/module.h>
		20	#include <linux/types.h>
		21	#include <linux/kernel.h>
		22	#include <linux/errno.h>
		23	#include <linux/skbuff.h>
		24	#include <linux/random.h>
		25	#include <linux/jhash.h>
		26	#include <net/ip.h>
		27	#include <net/pkt_sched.h>
		28	#include <net/inet_ecn.h>
		29
		30	/*
		31	* SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level)
		32	* This implementation uses L = 8 and N = 16
		33	* This permits us to split one 32bit hash (provided per packet by rxhash or
		34	* external classifier) into 8 subhashes of 4 bits.
		35	*/
		36	#define SFB_BUCKET_SHIFT 4
		37	#define SFB_NUMBUCKETS (1 << SFB_BUCKET_SHIFT) /* N bins per Level */
		38	#define SFB_BUCKET_MASK (SFB_NUMBUCKETS - 1)
		39	#define SFB_LEVELS (32 / SFB_BUCKET_SHIFT) /* L */
		40
		41	/* SFB algo uses a virtual queue, named "bin" */
		42	struct sfb_bucket {
		43	u16 qlen; /* length of virtual queue */
		44	u16 p_mark; /* marking probability */
		45	};
		46
		47	/* We use a double buffering right before hash change
		48	* (Section 4.4 of SFB reference : moving hash functions)
		49	*/
		50	struct sfb_bins {
		51	u32 perturbation; /* jhash perturbation */
		52	struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS];
		53	};
		54
		55	struct sfb_sched_data {
		56	struct Qdisc *qdisc;
		57	struct tcf_proto *filter_list;
		58	unsigned long rehash_interval;
		59	unsigned long warmup_time; /* double buffering warmup time in jiffies */
		60	u32 max;
		61	u32 bin_size; /* maximum queue length per bin */
		62	u32 increment; /* d1 */
		63	u32 decrement; /* d2 */
		64	u32 limit; /* HARD maximal queue length */
		65	u32 penalty_rate;
		66	u32 penalty_burst;
		67	u32 tokens_avail;
		68	unsigned long rehash_time;
		69	unsigned long token_time;
		70
		71	u8 slot; /* current active bins (0 or 1) */
		72	bool double_buffering;
		73	struct sfb_bins bins[2];
		74
		75	struct {
		76	u32 earlydrop;
		77	u32 penaltydrop;
		78	u32 bucketdrop;
		79	u32 queuedrop;
		80	u32 childdrop; /* drops in child qdisc */
		81	u32 marked; /* ECN mark */
		82	} stats;
		83	};
		84
		85	/*
		86	* Each queued skb might be hashed on one or two bins
		87	* We store in skb_cb the two hash values.
		88	* (A zero value means double buffering was not used)
		89	*/
		90	struct sfb_skb_cb {
		91	u32 hashes[2];
		92	};
		93
		94	static inline struct sfb_skb_cb sfb_skb_cb(const struct sk_buff skb)
		95	{
		96	BUILD_BUG_ON(sizeof(skb->cb) <
		97	sizeof(struct qdisc_skb_cb) + sizeof(struct sfb_skb_cb));
		98	return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data;
		99	}
		100
		101	/*
		102	* If using 'internal' SFB flow classifier, hash comes from skb rxhash
		103	* If using external classifier, hash comes from the classid.
		104	*/
		105	static u32 sfb_hash(const struct sk_buff *skb, u32 slot)
		106	{
		107	return sfb_skb_cb(skb)->hashes[slot];
		108	}
		109
		110	/* Probabilities are coded as Q0.16 fixed-point values,
		111	* with 0xFFFF representing 65535/65536 (almost 1.0)
		112	* Addition and subtraction are saturating in [0, 65535]
		113	*/
		114	static u32 prob_plus(u32 p1, u32 p2)
		115	{
		116	u32 res = p1 + p2;
		117
		118	return min_t(u32, res, SFB_MAX_PROB);
		119	}
		120
		121	static u32 prob_minus(u32 p1, u32 p2)
		122	{
		123	return p1 > p2 ? p1 - p2 : 0;
		124	}
		125
		126	static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q)
		127	{
		128	int i;
		129	struct sfb_bucket *b = &q->bins[slot].bins[0][0];
		130
		131	for (i = 0; i < SFB_LEVELS; i++) {
		132	u32 hash = sfbhash & SFB_BUCKET_MASK;
		133
		134	sfbhash >>= SFB_BUCKET_SHIFT;
		135	if (b[hash].qlen < 0xFFFF)
		136	b[hash].qlen++;
		137	b += SFB_NUMBUCKETS; /* next level */
		138	}
		139	}
		140
		141	static void increment_qlen(const struct sk_buff skb, struct sfb_sched_data q)
		142	{
		143	u32 sfbhash;
		144
		145	sfbhash = sfb_hash(skb, 0);
		146	if (sfbhash)
		147	increment_one_qlen(sfbhash, 0, q);
		148
		149	sfbhash = sfb_hash(skb, 1);
		150	if (sfbhash)
		151	increment_one_qlen(sfbhash, 1, q);
		152	}
		153
		154	static void decrement_one_qlen(u32 sfbhash, u32 slot,
		155	struct sfb_sched_data *q)
		156	{
		157	int i;
		158	struct sfb_bucket *b = &q->bins[slot].bins[0][0];
		159
		160	for (i = 0; i < SFB_LEVELS; i++) {
		161	u32 hash = sfbhash & SFB_BUCKET_MASK;
		162
		163	sfbhash >>= SFB_BUCKET_SHIFT;
		164	if (b[hash].qlen > 0)
		165	b[hash].qlen--;
		166	b += SFB_NUMBUCKETS; /* next level */
		167	}
		168	}
		169
		170	static void decrement_qlen(const struct sk_buff skb, struct sfb_sched_data q)
		171	{
		172	u32 sfbhash;
		173
		174	sfbhash = sfb_hash(skb, 0);
		175	if (sfbhash)
		176	decrement_one_qlen(sfbhash, 0, q);
		177
		178	sfbhash = sfb_hash(skb, 1);
		179	if (sfbhash)
		180	decrement_one_qlen(sfbhash, 1, q);
		181	}
		182
		183	static void decrement_prob(struct sfb_bucket b, struct sfb_sched_data q)
		184	{
		185	b->p_mark = prob_minus(b->p_mark, q->decrement);
		186	}
		187
		188	static void increment_prob(struct sfb_bucket b, struct sfb_sched_data q)
		189	{
		190	b->p_mark = prob_plus(b->p_mark, q->increment);
		191	}
		192
		193	static void sfb_zero_all_buckets(struct sfb_sched_data *q)
		194	{
		195	memset(&q->bins, 0, sizeof(q->bins));
		196	}
		197
		198	/*
		199	* compute max qlen, max p_mark, and avg p_mark
		200	*/
		201	static u32 sfb_compute_qlen(u32 prob_r, u32 avgpm_r, const struct sfb_sched_data *q)
		202	{
		203	int i;
		204	u32 qlen = 0, prob = 0, totalpm = 0;
		205	const struct sfb_bucket *b = &q->bins[q->slot].bins[0][0];
		206
		207	for (i = 0; i < SFB_LEVELS * SFB_NUMBUCKETS; i++) {
		208	if (qlen < b->qlen)
		209	qlen = b->qlen;
		210	totalpm += b->p_mark;
		211	if (prob < b->p_mark)
		212	prob = b->p_mark;
		213	b++;
		214	}
		215	*prob_r = prob;
		216	avgpm_r = totalpm / (SFB_LEVELS SFB_NUMBUCKETS);
		217	return qlen;
		218	}
		219
		220
		221	static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q)
		222	{
		223	q->bins[slot].perturbation = net_random();
		224	}
		225
		226	static void sfb_swap_slot(struct sfb_sched_data *q)
		227	{
		228	sfb_init_perturbation(q->slot, q);
		229	q->slot ^= 1;
		230	q->double_buffering = false;
		231	}
		232
		233	/* Non elastic flows are allowed to use part of the bandwidth, expressed
		234	* in "penalty_rate" packets per second, with "penalty_burst" burst
		235	*/
		236	static bool sfb_rate_limit(struct sk_buff skb, struct sfb_sched_data q)
		237	{
		238	if (q->penalty_rate == 0 \|\| q->penalty_burst == 0)
		239	return true;
		240
		241	if (q->tokens_avail < 1) {
		242	unsigned long age = min(10UL * HZ, jiffies - q->token_time);
		243
		244	q->tokens_avail = (age * q->penalty_rate) / HZ;
		245	if (q->tokens_avail > q->penalty_burst)
		246	q->tokens_avail = q->penalty_burst;
		247	q->token_time = jiffies;
		248	if (q->tokens_avail < 1)
		249	return true;
		250	}
		251
		252	q->tokens_avail--;
		253	return false;
		254	}
		255
		256	static bool sfb_classify(struct sk_buff skb, struct sfb_sched_data q,
		257	int qerr, u32 salt)
		258	{
		259	struct tcf_result res;
		260	int result;
		261
		262	result = tc_classify(skb, q->filter_list, &res);
		263	if (result >= 0) {
		264	#ifdef CONFIG_NET_CLS_ACT
		265	switch (result) {
		266	case TC_ACT_STOLEN:
		267	case TC_ACT_QUEUED:
		268	*qerr = NET_XMIT_SUCCESS \| __NET_XMIT_STOLEN;
		269	case TC_ACT_SHOT:
		270	return false;
		271	}
		272	#endif
		273	*salt = TC_H_MIN(res.classid);
		274	return true;
		275	}
		276	return false;
		277	}
		278
		279	static int sfb_enqueue(struct sk_buff skb, struct Qdisc sch)
		280	{
		281
		282	struct sfb_sched_data *q = qdisc_priv(sch);
		283	struct Qdisc *child = q->qdisc;
		284	int i;
		285	u32 p_min = ~0;
		286	u32 minqlen = ~0;
		287	u32 r, slot, salt, sfbhash;
		288	int ret = NET_XMIT_SUCCESS \| __NET_XMIT_BYPASS;
		289
		290	if (q->rehash_interval > 0) {
		291	unsigned long limit = q->rehash_time + q->rehash_interval;
		292
		293	if (unlikely(time_after(jiffies, limit))) {
		294	sfb_swap_slot(q);
		295	q->rehash_time = jiffies;
		296	} else if (unlikely(!q->double_buffering && q->warmup_time > 0 &&
		297	time_after(jiffies, limit - q->warmup_time))) {
		298	q->double_buffering = true;
		299	}
		300	}
		301
		302	if (q->filter_list) {
		303	/* If using external classifiers, get result and record it. */
		304	if (!sfb_classify(skb, q, &ret, &salt))
		305	goto other_drop;
		306	} else {
		307	salt = skb_get_rxhash(skb);
		308	}
		309
		310	slot = q->slot;
		311
		312	sfbhash = jhash_1word(salt, q->bins[slot].perturbation);
		313	if (!sfbhash)
		314	sfbhash = 1;
		315	sfb_skb_cb(skb)->hashes[slot] = sfbhash;
		316
		317	for (i = 0; i < SFB_LEVELS; i++) {
		318	u32 hash = sfbhash & SFB_BUCKET_MASK;
		319	struct sfb_bucket *b = &q->bins[slot].bins[i][hash];
		320
		321	sfbhash >>= SFB_BUCKET_SHIFT;
		322	if (b->qlen == 0)
		323	decrement_prob(b, q);
		324	else if (b->qlen >= q->bin_size)
		325	increment_prob(b, q);
		326	if (minqlen > b->qlen)
		327	minqlen = b->qlen;
		328	if (p_min > b->p_mark)
		329	p_min = b->p_mark;
		330	}
		331
		332	slot ^= 1;
		333	sfb_skb_cb(skb)->hashes[slot] = 0;
		334
		335	if (unlikely(minqlen >= q->max \|\| sch->q.qlen >= q->limit)) {
		336	sch->qstats.overlimits++;
		337	if (minqlen >= q->max)
		338	q->stats.bucketdrop++;
		339	else
		340	q->stats.queuedrop++;
		341	goto drop;
		342	}
		343
		344	if (unlikely(p_min >= SFB_MAX_PROB)) {
		345	/* Inelastic flow */
		346	if (q->double_buffering) {
		347	sfbhash = jhash_1word(salt, q->bins[slot].perturbation);
		348	if (!sfbhash)
		349	sfbhash = 1;
		350	sfb_skb_cb(skb)->hashes[slot] = sfbhash;
		351
		352	for (i = 0; i < SFB_LEVELS; i++) {
		353	u32 hash = sfbhash & SFB_BUCKET_MASK;
		354	struct sfb_bucket *b = &q->bins[slot].bins[i][hash];
		355
		356	sfbhash >>= SFB_BUCKET_SHIFT;
		357	if (b->qlen == 0)
		358	decrement_prob(b, q);
		359	else if (b->qlen >= q->bin_size)
		360	increment_prob(b, q);
		361	}
		362	}
		363	if (sfb_rate_limit(skb, q)) {
		364	sch->qstats.overlimits++;
		365	q->stats.penaltydrop++;
		366	goto drop;
		367	}
		368	goto enqueue;
		369	}
		370
		371	r = net_random() & SFB_MAX_PROB;
		372
		373	if (unlikely(r < p_min)) {
		374	if (unlikely(p_min > SFB_MAX_PROB / 2)) {
		375	/* If we're marking that many packets, then either
		376	* this flow is unresponsive, or we're badly congested.
		377	* In either case, we want to start dropping packets.
		378	*/
		379	if (r < (p_min - SFB_MAX_PROB / 2) * 2) {
		380	q->stats.earlydrop++;
		381	goto drop;
		382	}
		383	}
		384	if (INET_ECN_set_ce(skb)) {
		385	q->stats.marked++;
		386	} else {
		387	q->stats.earlydrop++;
		388	goto drop;
		389	}
		390	}
		391
		392	enqueue:
		393	ret = qdisc_enqueue(skb, child);
		394	if (likely(ret == NET_XMIT_SUCCESS)) {
		395	sch->q.qlen++;
		396	increment_qlen(skb, q);
		397	} else if (net_xmit_drop_count(ret)) {
		398	q->stats.childdrop++;
		399	sch->qstats.drops++;
		400	}
		401	return ret;
		402
		403	drop:
		404	qdisc_drop(skb, sch);
		405	return NET_XMIT_CN;
		406	other_drop:
		407	if (ret & __NET_XMIT_BYPASS)
		408	sch->qstats.drops++;
		409	kfree_skb(skb);
		410	return ret;
		411	}
		412
		413	static struct sk_buff sfb_dequeue(struct Qdisc sch)
		414	{
		415	struct sfb_sched_data *q = qdisc_priv(sch);
		416	struct Qdisc *child = q->qdisc;
		417	struct sk_buff *skb;
		418
		419	skb = child->dequeue(q->qdisc);
		420
		421	if (skb) {
		422	qdisc_bstats_update(sch, skb);
		423	sch->q.qlen--;
		424	decrement_qlen(skb, q);
		425	}
		426
		427	return skb;
		428	}
		429
		430	static struct sk_buff sfb_peek(struct Qdisc sch)
		431	{
		432	struct sfb_sched_data *q = qdisc_priv(sch);
		433	struct Qdisc *child = q->qdisc;
		434
		435	return child->ops->peek(child);
		436	}
		437
		438	/* No sfb_drop -- impossible since the child doesn't return the dropped skb. */
		439
		440	static void sfb_reset(struct Qdisc *sch)
		441	{
		442	struct sfb_sched_data *q = qdisc_priv(sch);
		443
		444	qdisc_reset(q->qdisc);
		445	sch->q.qlen = 0;
		446	q->slot = 0;
		447	q->double_buffering = false;
		448	sfb_zero_all_buckets(q);
		449	sfb_init_perturbation(0, q);
		450	}
		451
		452	static void sfb_destroy(struct Qdisc *sch)
		453	{
		454	struct sfb_sched_data *q = qdisc_priv(sch);
		455
		456	tcf_destroy_chain(&q->filter_list);
		457	qdisc_destroy(q->qdisc);
		458	}
		459
		460	static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = {
		461	[TCA_SFB_PARMS] = { .len = sizeof(struct tc_sfb_qopt) },
		462	};
		463
		464	static const struct tc_sfb_qopt sfb_default_ops = {
		465	.rehash_interval = 600 * MSEC_PER_SEC,
		466	.warmup_time = 60 * MSEC_PER_SEC,
		467	.limit = 0,
		468	.max = 25,
		469	.bin_size = 20,
		470	.increment = (SFB_MAX_PROB + 500) / 1000, /* 0.1 % */
		471	.decrement = (SFB_MAX_PROB + 3000) / 6000,
		472	.penalty_rate = 10,
		473	.penalty_burst = 20,
		474	};
		475
		476	static int sfb_change(struct Qdisc sch, struct nlattr opt)
		477	{
		478	struct sfb_sched_data *q = qdisc_priv(sch);
		479	struct Qdisc *child;
		480	struct nlattr *tb[TCA_SFB_MAX + 1];
		481	const struct tc_sfb_qopt *ctl = &sfb_default_ops;
		482	u32 limit;
		483	int err;
		484
		485	if (opt) {
		486	err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy);
		487	if (err < 0)
		488	return -EINVAL;
		489
		490	if (tb[TCA_SFB_PARMS] == NULL)
		491	return -EINVAL;
		492
		493	ctl = nla_data(tb[TCA_SFB_PARMS]);
		494	}
		495
		496	limit = ctl->limit;
		497	if (limit == 0)
		498	limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);
		499
		500	child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit);
		501	if (IS_ERR(child))
		502	return PTR_ERR(child);
		503
		504	sch_tree_lock(sch);
		505
		506	qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
		507	qdisc_destroy(q->qdisc);
		508	q->qdisc = child;
		509
		510	q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval);
		511	q->warmup_time = msecs_to_jiffies(ctl->warmup_time);
		512	q->rehash_time = jiffies;
		513	q->limit = limit;
		514	q->increment = ctl->increment;
		515	q->decrement = ctl->decrement;
		516	q->max = ctl->max;
		517	q->bin_size = ctl->bin_size;
		518	q->penalty_rate = ctl->penalty_rate;
		519	q->penalty_burst = ctl->penalty_burst;
		520	q->tokens_avail = ctl->penalty_burst;
		521	q->token_time = jiffies;
		522
		523	q->slot = 0;
		524	q->double_buffering = false;
		525	sfb_zero_all_buckets(q);
		526	sfb_init_perturbation(0, q);
		527	sfb_init_perturbation(1, q);
		528
		529	sch_tree_unlock(sch);
		530
		531	return 0;
		532	}
		533
		534	static int sfb_init(struct Qdisc sch, struct nlattr opt)
		535	{
		536	struct sfb_sched_data *q = qdisc_priv(sch);
		537
		538	q->qdisc = &noop_qdisc;
		539	return sfb_change(sch, opt);
		540	}
		541
		542	static int sfb_dump(struct Qdisc sch, struct sk_buff skb)
		543	{
		544	struct sfb_sched_data *q = qdisc_priv(sch);
		545	struct nlattr *opts;
		546	struct tc_sfb_qopt opt = {
		547	.rehash_interval = jiffies_to_msecs(q->rehash_interval),
		548	.warmup_time = jiffies_to_msecs(q->warmup_time),
		549	.limit = q->limit,
		550	.max = q->max,
		551	.bin_size = q->bin_size,
		552	.increment = q->increment,
		553	.decrement = q->decrement,
		554	.penalty_rate = q->penalty_rate,
		555	.penalty_burst = q->penalty_burst,
		556	};
		557
		558	sch->qstats.backlog = q->qdisc->qstats.backlog;
		559	opts = nla_nest_start(skb, TCA_OPTIONS);
		560	NLA_PUT(skb, TCA_SFB_PARMS, sizeof(opt), &opt);
		561	return nla_nest_end(skb, opts);
		562
		563	nla_put_failure:
		564	nla_nest_cancel(skb, opts);
		565	return -EMSGSIZE;
		566	}
		567
		568	static int sfb_dump_stats(struct Qdisc sch, struct gnet_dump d)
		569	{
		570	struct sfb_sched_data *q = qdisc_priv(sch);
		571	struct tc_sfb_xstats st = {
		572	.earlydrop = q->stats.earlydrop,
		573	.penaltydrop = q->stats.penaltydrop,
		574	.bucketdrop = q->stats.bucketdrop,
		575	.queuedrop = q->stats.queuedrop,
		576	.childdrop = q->stats.childdrop,
		577	.marked = q->stats.marked,
		578	};
		579
		580	st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q);
		581
		582	return gnet_stats_copy_app(d, &st, sizeof(st));
		583	}
		584
		585	static int sfb_dump_class(struct Qdisc *sch, unsigned long cl,
		586	struct sk_buff skb, struct tcmsg tcm)
		587	{
		588	return -ENOSYS;
		589	}
		590
		591	static int sfb_graft(struct Qdisc sch, unsigned long arg, struct Qdisc new,
		592	struct Qdisc **old)
		593	{
		594	struct sfb_sched_data *q = qdisc_priv(sch);
		595
		596	if (new == NULL)
		597	new = &noop_qdisc;
		598
		599	sch_tree_lock(sch);
		600	*old = q->qdisc;
		601	q->qdisc = new;
		602	qdisc_tree_decrease_qlen(old, (old)->q.qlen);
		603	qdisc_reset(*old);
		604	sch_tree_unlock(sch);
		605	return 0;
		606	}
		607
		608	static struct Qdisc sfb_leaf(struct Qdisc sch, unsigned long arg)
		609	{
		610	struct sfb_sched_data *q = qdisc_priv(sch);
		611
		612	return q->qdisc;
		613	}
		614
		615	static unsigned long sfb_get(struct Qdisc *sch, u32 classid)
		616	{
		617	return 1;
		618	}
		619
		620	static void sfb_put(struct Qdisc *sch, unsigned long arg)
		621	{
		622	}
		623
		624	static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
		625	struct nlattr *tca, unsigned long arg)
		626	{
		627	return -ENOSYS;
		628	}
		629
		630	static int sfb_delete(struct Qdisc *sch, unsigned long cl)
		631	{
		632	return -ENOSYS;
		633	}
		634
		635	static void sfb_walk(struct Qdisc sch, struct qdisc_walker walker)
		636	{
		637	if (!walker->stop) {
		638	if (walker->count >= walker->skip)
		639	if (walker->fn(sch, 1, walker) < 0) {
		640	walker->stop = 1;
		641	return;
		642	}
		643	walker->count++;
		644	}
		645	}
		646
		647	static struct tcf_proto *sfb_find_tcf(struct Qdisc sch, unsigned long cl)
		648	{
		649	struct sfb_sched_data *q = qdisc_priv(sch);
		650
		651	if (cl)
		652	return NULL;
		653	return &q->filter_list;
		654	}
		655
		656	static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent,
		657	u32 classid)
		658	{
		659	return 0;
		660	}
		661
		662
		663	static const struct Qdisc_class_ops sfb_class_ops = {
		664	.graft = sfb_graft,
		665	.leaf = sfb_leaf,
		666	.get = sfb_get,
		667	.put = sfb_put,
		668	.change = sfb_change_class,
		669	.delete = sfb_delete,
		670	.walk = sfb_walk,
		671	.tcf_chain = sfb_find_tcf,
		672	.bind_tcf = sfb_bind,
		673	.unbind_tcf = sfb_put,
		674	.dump = sfb_dump_class,
		675	};
		676
		677	static struct Qdisc_ops sfb_qdisc_ops __read_mostly = {
		678	.id = "sfb",
		679	.priv_size = sizeof(struct sfb_sched_data),
		680	.cl_ops = &sfb_class_ops,
		681	.enqueue = sfb_enqueue,
		682	.dequeue = sfb_dequeue,
		683	.peek = sfb_peek,
		684	.init = sfb_init,
		685	.reset = sfb_reset,
		686	.destroy = sfb_destroy,
		687	.change = sfb_change,
		688	.dump = sfb_dump,
		689	.dump_stats = sfb_dump_stats,
		690	.owner = THIS_MODULE,
		691	};
		692
		693	static int __init sfb_module_init(void)
		694	{
		695	return register_qdisc(&sfb_qdisc_ops);
		696	}
		697
		698	static void __exit sfb_module_exit(void)
		699	{
		700	unregister_qdisc(&sfb_qdisc_ops);
		701	}
		702
		703	module_init(sfb_module_init)
		704	module_exit(sfb_module_exit)
		705
		706	MODULE_DESCRIPTION("Stochastic Fair Blue queue discipline");
		707	MODULE_AUTHOR("Juliusz Chroboczek");
		708	MODULE_AUTHOR("Eric Dumazet");
		709	MODULE_LICENSE("GPL");