path: root/net/sched
author    Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
committer Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
commit    1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree      0bba044c4ce775e45a88a51686b5d9f90697ea9d /net/sched
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'net/sched')
-rw-r--r--  net/sched/Kconfig          508
-rw-r--r--  net/sched/Makefile          41
-rw-r--r--  net/sched/act_api.c        894
-rw-r--r--  net/sched/cls_api.c        642
-rw-r--r--  net/sched/cls_basic.c      303
-rw-r--r--  net/sched/cls_fw.c         378
-rw-r--r--  net/sched/cls_route.c      639
-rw-r--r--  net/sched/cls_rsvp.c        43
-rw-r--r--  net/sched/cls_rsvp.h       667
-rw-r--r--  net/sched/cls_rsvp6.c       44
-rw-r--r--  net/sched/cls_tcindex.c    537
-rw-r--r--  net/sched/cls_u32.c        828
-rw-r--r--  net/sched/em_cmp.c         101
-rw-r--r--  net/sched/em_meta.c        661
-rw-r--r--  net/sched/em_nbyte.c        82
-rw-r--r--  net/sched/em_u32.c          63
-rw-r--r--  net/sched/ematch.c         524
-rw-r--r--  net/sched/estimator.c      197
-rw-r--r--  net/sched/gact.c           231
-rw-r--r--  net/sched/ipt.c            326
-rw-r--r--  net/sched/mirred.c         276
-rw-r--r--  net/sched/pedit.c          288
-rw-r--r--  net/sched/police.c         612
-rw-r--r--  net/sched/sch_api.c       1296
-rw-r--r--  net/sched/sch_atm.c        735
-rw-r--r--  net/sched/sch_cbq.c       2124
-rw-r--r--  net/sched/sch_dsmark.c     479
-rw-r--r--  net/sched/sch_fifo.c       212
-rw-r--r--  net/sched/sch_generic.c    609
-rw-r--r--  net/sched/sch_gred.c       630
-rw-r--r--  net/sched/sch_hfsc.c      1822
-rw-r--r--  net/sched/sch_htb.c       1759
-rw-r--r--  net/sched/sch_ingress.c    436
-rw-r--r--  net/sched/sch_netem.c      598
-rw-r--r--  net/sched/sch_prio.c       444
-rw-r--r--  net/sched/sch_red.c        459
-rw-r--r--  net/sched/sch_sfq.c        497
-rw-r--r--  net/sched/sch_tbf.c        543
-rw-r--r--  net/sched/sch_teql.c       511
39 files changed, 22039 insertions, 0 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
new file mode 100644
index 000000000000..3d1d902dd1a1
--- /dev/null
+++ b/net/sched/Kconfig
@@ -0,0 +1,508 @@
1#
2# Traffic control configuration.
3#
4choice
5 prompt "Packet scheduler clock source"
6 depends on NET_SCHED
7 default NET_SCH_CLK_JIFFIES
8 help
9 Packet schedulers need a monotonic clock that increments at a static
10 rate. The kernel provides several suitable interfaces, each with
11 different properties:
12
13 - high resolution (us or better)
14 - fast to read (minimal locking, no i/o access)
15 - synchronized on all processors
16 - handles cpu clock frequency changes
17
18 but nothing provides all of the above.
19
20config NET_SCH_CLK_JIFFIES
21 bool "Timer interrupt"
22 help
23 Say Y here if you want to use the timer interrupt (jiffies) as clock
24 source. This clock source is fast, synchronized on all processors and
25 handles cpu clock frequency changes, but its resolution is too low
26 for accurate shaping except at very low speed.
27
28config NET_SCH_CLK_GETTIMEOFDAY
29 bool "gettimeofday"
30 help
31 Say Y here if you want to use gettimeofday as clock source. This clock
32 source has high resolution, is synchronized on all processors and
33 handles cpu clock frequency changes, but it is slow.
34
35 Choose this if you need a high resolution clock source but can't use
36 the CPU's cycle counter.
37
38config NET_SCH_CLK_CPU
39 bool "CPU cycle counter"
40 depends on X86_TSC || X86_64 || ALPHA || SPARC64 || PPC64 || IA64
41 help
42 Say Y here if you want to use the CPU's cycle counter as clock source.
43 This is a cheap and high resolution clock source, but on some
44 architectures it is not synchronized on all processors and doesn't
45 handle cpu clock frequency changes.
46
47 The useable cycle counters are:
48
49 x86/x86_64 - Timestamp Counter
50 alpha - Cycle Counter
51 sparc64 - %ticks register
52 ppc64 - Time base
53 ia64 - Interval Time Counter
54
55 Choose this if your CPU's cycle counter is working properly.
56
57endchoice
58
59config NET_SCH_CBQ
60 tristate "CBQ packet scheduler"
61 depends on NET_SCHED
62 ---help---
63 Say Y here if you want to use the Class-Based Queueing (CBQ) packet
64 scheduling algorithm for some of your network devices. This
65 algorithm classifies the waiting packets into a tree-like hierarchy
66 of classes; the leaves of this tree are in turn scheduled by
67 separate algorithms (called "disciplines" in this context).
68
69 See the top of <file:net/sched/sch_cbq.c> for references about the
70 CBQ algorithm.
71
72 CBQ is a commonly used scheduler, so if you're unsure, you should
73 say Y here. Then say Y to all the queueing algorithms below that you
74 want to use as CBQ disciplines. Then say Y to "Packet classifier
75 API" and say Y to all the classifiers you want to use; a classifier
76 is a routine that allows you to sort your outgoing traffic into
77 classes based on a certain criterion.
78
79 To compile this code as a module, choose M here: the
80 module will be called sch_cbq.
81
82config NET_SCH_HTB
83 tristate "HTB packet scheduler"
84 depends on NET_SCHED
85 ---help---
86 Say Y here if you want to use the Hierarchical Token Buckets (HTB)
87 packet scheduling algorithm for some of your network devices. See
88 <http://luxik.cdi.cz/~devik/qos/htb/> for complete manual and
89 in-depth articles.
90
91 HTB is very similar to CBQ regarding its goals, however it has
92 different properties and a different algorithm.
93
94 To compile this code as a module, choose M here: the
95 module will be called sch_htb.
96
97config NET_SCH_HFSC
98 tristate "HFSC packet scheduler"
99 depends on NET_SCHED
100 ---help---
101 Say Y here if you want to use the Hierarchical Fair Service Curve
102 (HFSC) packet scheduling algorithm for some of your network devices.
103
104 To compile this code as a module, choose M here: the
105 module will be called sch_hfsc.
106
107#tristate ' H-PFQ packet scheduler' CONFIG_NET_SCH_HPFQ
108config NET_SCH_ATM
109 tristate "ATM pseudo-scheduler"
110 depends on NET_SCHED && ATM
111 ---help---
112 Say Y here if you want to use the ATM pseudo-scheduler. This
113 provides a framework for invoking classifiers (aka "filters"), which
114 in turn select classes of this queuing discipline. Each class maps
115 the flow(s) it is handling to a given virtual circuit (see the top of
116 <file:net/sched/sch_atm.c>).
117
118 To compile this code as a module, choose M here: the
119 module will be called sch_atm.
120
121config NET_SCH_PRIO
122 tristate "The simplest PRIO pseudoscheduler"
123 depends on NET_SCHED
124 help
125 Say Y here if you want to use an n-band priority queue packet
126 "scheduler" for some of your network devices or as a leaf discipline
127 for the CBQ scheduling algorithm. If unsure, say Y.
128
129 To compile this code as a module, choose M here: the
130 module will be called sch_prio.
131
132config NET_SCH_RED
133 tristate "RED queue"
134 depends on NET_SCHED
135 help
136 Say Y here if you want to use the Random Early Detection (RED)
137 packet scheduling algorithm for some of your network devices (see
138 the top of <file:net/sched/sch_red.c> for details and references
139 about the algorithm).
140
141 To compile this code as a module, choose M here: the
142 module will be called sch_red.
143
144config NET_SCH_SFQ
145 tristate "SFQ queue"
146 depends on NET_SCHED
147 ---help---
148 Say Y here if you want to use the Stochastic Fairness Queueing (SFQ)
149 packet scheduling algorithm for some of your network devices or as a
150 leaf discipline for the CBQ scheduling algorithm (see the top of
151 <file:net/sched/sch_sfq.c> for details and references about the SFQ
152 algorithm).
153
154 To compile this code as a module, choose M here: the
155 module will be called sch_sfq.
156
157config NET_SCH_TEQL
158 tristate "TEQL queue"
159 depends on NET_SCHED
160 ---help---
161 Say Y here if you want to use the True Link Equalizer (TLE) packet
162 scheduling algorithm for some of your network devices or as a leaf
163 discipline for the CBQ scheduling algorithm. This queueing
164 discipline allows the combination of several physical devices into
165 one virtual device. (see the top of <file:net/sched/sch_teql.c> for
166 details).
167
168 To compile this code as a module, choose M here: the
169 module will be called sch_teql.
170
171config NET_SCH_TBF
172 tristate "TBF queue"
173 depends on NET_SCHED
174 help
175 Say Y here if you want to use the Simple Token Bucket Filter (TBF)
176 packet scheduling algorithm for some of your network devices or as a
177 leaf discipline for the CBQ scheduling algorithm (see the top of
178 <file:net/sched/sch_tbf.c> for a description of the TBF algorithm).
179
180 To compile this code as a module, choose M here: the
181 module will be called sch_tbf.
182
183config NET_SCH_GRED
184 tristate "GRED queue"
185 depends on NET_SCHED
186 help
187 Say Y here if you want to use the Generic Random Early Detection
188 (GRED) packet scheduling algorithm for some of your network devices
189 (see the top of <file:net/sched/sch_red.c> for details and
190 references about the algorithm).
191
192 To compile this code as a module, choose M here: the
193 module will be called sch_gred.
194
195config NET_SCH_DSMARK
196 tristate "Diffserv field marker"
197 depends on NET_SCHED
198 help
199 Say Y if you want to schedule packets according to the
200 Differentiated Services architecture proposed in RFC 2475.
201 Technical information on this method, with pointers to associated
202 RFCs, is available at <http://www.gta.ufrj.br/diffserv/>.
203
204 To compile this code as a module, choose M here: the
205 module will be called sch_dsmark.
206
207config NET_SCH_NETEM
208 tristate "Network emulator"
209 depends on NET_SCHED
210 help
211 Say Y if you want to emulate network delay, loss, and packet
212 re-ordering. This is often useful to simulate networks when
213 testing applications or protocols.
214
215 To compile this driver as a module, choose M here: the module
216 will be called sch_netem.
217
218 If unsure, say N.
219
220config NET_SCH_INGRESS
221 tristate "Ingress Qdisc"
222 depends on NET_SCHED
223 help
224 If you say Y here, you will be able to police incoming bandwidth
225 and drop packets when this bandwidth exceeds your desired rate.
226 If unsure, say Y.
227
228 To compile this code as a module, choose M here: the
229 module will be called sch_ingress.
230
231config NET_QOS
232 bool "QoS support"
233 depends on NET_SCHED
234 ---help---
235 Say Y here if you want to include Quality Of Service scheduling
236 features, which means that you will be able to request certain
237 rate-of-flow limits for your network devices.
238
239 This Quality of Service (QoS) support will enable you to use
240 Differentiated Services (diffserv) and Resource Reservation Protocol
241 (RSVP) on your Linux router if you also say Y to "Packet classifier
242 API" and to some classifiers below. Documentation and software is at
243 <http://diffserv.sourceforge.net/>.
244
245 Note that the answer to this question won't directly affect the
246 kernel: saying N will just cause the configurator to skip all
247 the questions about QoS support.
248
249config NET_ESTIMATOR
250 bool "Rate estimator"
251 depends on NET_QOS
252 help
253 In order for Quality of Service scheduling to work, the current
254 rate-of-flow for a network device has to be estimated; if you say Y
255 here, the kernel will do just that.
256
257config NET_CLS
258 bool "Packet classifier API"
259 depends on NET_SCHED
260 ---help---
261 The CBQ scheduling algorithm requires that network packets which are
262 scheduled to be sent out over a network device be classified
263 according to some criterion. If you say Y here, you will get a
264 choice of several different packet classifiers with the following
265 questions.
266
267 This will enable you to use Differentiated Services (diffserv) and
268 Resource Reservation Protocol (RSVP) on your Linux router.
269 Documentation and software is at
270 <http://diffserv.sourceforge.net/>.
271
272config NET_CLS_BASIC
273 tristate "Basic classifier"
274 depends on NET_CLS
275 ---help---
276 Say Y here if you want to be able to classify packets using
277 only extended matches and actions.
278
279 To compile this code as a module, choose M here: the
280 module will be called cls_basic.
281
282config NET_CLS_TCINDEX
283 tristate "TC index classifier"
284 depends on NET_CLS
285 help
286 If you say Y here, you will be able to classify outgoing packets
287 according to the tc_index field of the skb. You will want this
288 feature if you want to implement Differentiated Services using
289 sch_dsmark. If unsure, say Y.
290
291 To compile this code as a module, choose M here: the
292 module will be called cls_tcindex.
293
294config NET_CLS_ROUTE4
295 tristate "Routing table based classifier"
296 depends on NET_CLS
297 select NET_CLS_ROUTE
298 help
299 If you say Y here, you will be able to classify outgoing packets
300 according to the route table entry they matched. If unsure, say Y.
301
302 To compile this code as a module, choose M here: the
303 module will be called cls_route.
304
305config NET_CLS_ROUTE
306 bool
307 default n
308
309config NET_CLS_FW
310 tristate "Firewall based classifier"
311 depends on NET_CLS
312 help
313 If you say Y here, you will be able to classify outgoing packets
314 according to firewall criteria you specified.
315
316 To compile this code as a module, choose M here: the
317 module will be called cls_fw.
318
319config NET_CLS_U32
320 tristate "U32 classifier"
321 depends on NET_CLS
322 help
323 If you say Y here, you will be able to classify outgoing packets
324 according to their destination address. If unsure, say Y.
325
326 To compile this code as a module, choose M here: the
327 module will be called cls_u32.
328
329config CLS_U32_PERF
330 bool "U32 classifier performance counters"
331 depends on NET_CLS_U32
332 help
333 Gathers statistics that can be used to tune u32 classifier performance.
334 Requires a recent iproute2.
335 You MUST NOT turn this on if you don't have an updated iproute2.
336
337config NET_CLS_IND
338 bool "classify input device (slows things for u32/fw)"
339 depends on NET_CLS_U32 || NET_CLS_FW
340 help
341 This option will eventually be removed when a
342 metadata action appears, because it slows things down a little.
343 Available only for the u32 and fw classifiers.
344 Requires a recent iproute2.
345 You MUST NOT turn this on if you don't have an updated iproute2.
346
347config CLS_U32_MARK
348 bool "Use nfmark as a key in U32 classifier"
349 depends on NET_CLS_U32 && NETFILTER
350 help
351 This allows you to match the nfmark in a u32 filter.
352 Example:
353 tc filter add dev eth0 protocol ip parent 1:0 prio 5 u32 \
354 match mark 0x0090 0xffff \
355 match ip dst 4.4.4.4 \
356 flowid 1:90
357 You must use a new iproute2 to use this feature.
358
359config NET_CLS_RSVP
360 tristate "Special RSVP classifier"
361 depends on NET_CLS && NET_QOS
362 ---help---
363 The Resource Reservation Protocol (RSVP) permits end systems to
364 request a minimum and maximum data flow rate for a connection; this
365 is important for real time data such as streaming sound or video.
366
367 Say Y here if you want to be able to classify outgoing packets based
368 on their RSVP requests.
369
370 To compile this code as a module, choose M here: the
371 module will be called cls_rsvp.
372
373config NET_CLS_RSVP6
374 tristate "Special RSVP classifier for IPv6"
375 depends on NET_CLS && NET_QOS
376 ---help---
377 The Resource Reservation Protocol (RSVP) permits end systems to
378 request a minimum and maximum data flow rate for a connection; this
379 is important for real time data such as streaming sound or video.
380
381 Say Y here if you want to be able to classify outgoing packets based
382 on their RSVP requests and you are using the new Internet Protocol
383 IPv6 as opposed to the older and more common IPv4.
384
385 To compile this code as a module, choose M here: the
386 module will be called cls_rsvp6.
387
388config NET_EMATCH
389 bool "Extended Matches"
390 depends on NET_CLS
391 ---help---
392 Say Y here if you want to use extended matches on top of classifiers
393 and select the extended matches below.
394
395 Extended matches are small classification helpers that are not worth
396 writing a separate classifier for.
397
398 You must have a recent version of the iproute2 tools in order to use
399 extended matches.
400
401config NET_EMATCH_STACK
402 int "Stack size"
403 depends on NET_EMATCH
404 default "32"
405 ---help---
406 Size of the local stack variable used while evaluating the tree of
407 ematches. Limits the depth of the tree, i.e. the number of
408 encapsulated precedences. Every level requires 4 bytes of additional
409 stack space.
410
411config NET_EMATCH_CMP
412 tristate "Simple packet data comparison"
413 depends on NET_EMATCH
414 ---help---
415 Say Y here if you want to be able to classify packets based on
416 simple packet data comparisons for 8, 16, and 32bit values.
417
418 To compile this code as a module, choose M here: the
419 module will be called em_cmp.
420
421config NET_EMATCH_NBYTE
422 tristate "Multi byte comparison"
423 depends on NET_EMATCH
424 ---help---
425 Say Y here if you want to be able to classify packets based on
426 multiple byte comparisons mainly useful for IPv6 address comparisons.
427
428 To compile this code as a module, choose M here: the
429 module will be called em_nbyte.
430
431config NET_EMATCH_U32
432 tristate "U32 hashing key"
433 depends on NET_EMATCH
434 ---help---
435 Say Y here if you want to be able to classify packets using
436 the famous u32 key in combination with logic relations.
437
438 To compile this code as a module, choose M here: the
439 module will be called em_u32.
440
441config NET_EMATCH_META
442 tristate "Metadata"
443 depends on NET_EMATCH
444 ---help---
445 Say Y here if you want to be able to classify packets based on
446 metadata such as load average, netfilter attributes, socket
447 attributes and routing decisions.
448
449 To compile this code as a module, choose M here: the
450 module will be called em_meta.
451
452config NET_CLS_ACT
453 bool "Packet ACTION"
454 depends on EXPERIMENTAL && NET_CLS && NET_QOS
455 ---help---
456 This option requires a recent iproute2. It enables
457 tc extensions which can be used with tc classifiers.
458 You MUST NOT turn this on if you don't have an updated iproute2.
459
460config NET_ACT_POLICE
461 tristate "Policing Actions"
462 depends on NET_CLS_ACT
463 ---help---
464 If you are using a newer iproute2, select this one; otherwise use the
465 option below to select a policer.
466 You MUST NOT turn this on if you don't have an updated iproute2.
467
468config NET_ACT_GACT
469 tristate "generic Actions"
470 depends on NET_CLS_ACT
471 ---help---
472 You must have a recent iproute2 to use this feature.
473 This adds simple filtering actions like drop, accept, etc.
474
475config GACT_PROB
476 bool "generic Actions probability"
477 depends on NET_ACT_GACT
478 ---help---
479 Allows generic actions to be randomly or deterministically used.
480
481config NET_ACT_MIRRED
482 tristate "Packet In/Egress redirection/mirror Actions"
483 depends on NET_CLS_ACT
484 ---help---
485 Requires a recent iproute2.
486 This allows packets to be mirrored or redirected to other netdevices.
487
488config NET_ACT_IPT
489 tristate "iptables Actions"
490 depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
491 ---help---
492 Requires a recent iproute2.
493 This allows iptables targets to be used by tc filters.
494
495config NET_ACT_PEDIT
496 tristate "Generic Packet Editor Actions"
497 depends on NET_CLS_ACT
498 ---help---
499 Requires a recent iproute2.
500 This allows packets to be generically edited.
501
502config NET_CLS_POLICE
503 bool "Traffic policing (needed for in/egress)"
504 depends on NET_CLS && NET_QOS && NET_CLS_ACT!=y
505 help
506 Say Y to support traffic policing (bandwidth limits). Needed for
507 ingress and egress rate limiting.
508
diff --git a/net/sched/Makefile b/net/sched/Makefile
new file mode 100644
index 000000000000..431e55786efd
--- /dev/null
+++ b/net/sched/Makefile
@@ -0,0 +1,41 @@
1#
2# Makefile for the Linux Traffic Control Unit.
3#
4
5obj-y := sch_generic.o
6
7obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o
8obj-$(CONFIG_NET_CLS) += cls_api.o
9obj-$(CONFIG_NET_CLS_ACT) += act_api.o
10obj-$(CONFIG_NET_ACT_POLICE) += police.o
11obj-$(CONFIG_NET_CLS_POLICE) += police.o
12obj-$(CONFIG_NET_ACT_GACT) += gact.o
13obj-$(CONFIG_NET_ACT_MIRRED) += mirred.o
14obj-$(CONFIG_NET_ACT_IPT) += ipt.o
15obj-$(CONFIG_NET_ACT_PEDIT) += pedit.o
16obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
17obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
18obj-$(CONFIG_NET_SCH_HPFQ) += sch_hpfq.o
19obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o
20obj-$(CONFIG_NET_SCH_RED) += sch_red.o
21obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o
22obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o
23obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o
24obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o
25obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o
26obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o
27obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o
28obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
29obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
30obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
31obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
32obj-$(CONFIG_NET_CLS_FW) += cls_fw.o
33obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o
34obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o
35obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o
36obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o
37obj-$(CONFIG_NET_EMATCH) += ematch.o
38obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
39obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
40obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
41obj-$(CONFIG_NET_EMATCH_META) += em_meta.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
new file mode 100644
index 000000000000..5e6cc371b39e
--- /dev/null
+++ b/net/sched/act_api.c
@@ -0,0 +1,894 @@
1/*
2 * net/sched/act_api.c Packet action API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Author: Jamal Hadi Salim
10 *
11 *
12 */
13
14#include <asm/uaccess.h>
15#include <asm/system.h>
16#include <linux/bitops.h>
17#include <linux/config.h>
18#include <linux/types.h>
19#include <linux/kernel.h>
20#include <linux/sched.h>
21#include <linux/string.h>
22#include <linux/mm.h>
23#include <linux/socket.h>
24#include <linux/sockios.h>
25#include <linux/in.h>
26#include <linux/errno.h>
27#include <linux/interrupt.h>
28#include <linux/netdevice.h>
29#include <linux/skbuff.h>
30#include <linux/rtnetlink.h>
31#include <linux/init.h>
32#include <linux/kmod.h>
33#include <net/sock.h>
34#include <net/sch_generic.h>
35#include <net/act_api.h>
36
37#if 1 /* control */
38#define DPRINTK(format, args...) printk(KERN_DEBUG format, ##args)
39#else
40#define DPRINTK(format, args...)
41#endif
42#if 0 /* data */
43#define D2PRINTK(format, args...) printk(KERN_DEBUG format, ##args)
44#else
45#define D2PRINTK(format, args...)
46#endif
47
48static struct tc_action_ops *act_base = NULL;
49static DEFINE_RWLOCK(act_mod_lock);
50
51int tcf_register_action(struct tc_action_ops *act)
52{
53 struct tc_action_ops *a, **ap;
54
55 write_lock(&act_mod_lock);
56 for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) {
57 if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {
58 write_unlock(&act_mod_lock);
59 return -EEXIST;
60 }
61 }
62 act->next = NULL;
63 *ap = act;
64 write_unlock(&act_mod_lock);
65 return 0;
66}
67
68int tcf_unregister_action(struct tc_action_ops *act)
69{
70 struct tc_action_ops *a, **ap;
71 int err = -ENOENT;
72
73 write_lock(&act_mod_lock);
74 for (ap = &act_base; (a = *ap) != NULL; ap = &a->next)
75 if (a == act)
76 break;
77 if (a) {
78 *ap = a->next;
79 a->next = NULL;
80 err = 0;
81 }
82 write_unlock(&act_mod_lock);
83 return err;
84}
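
Not part of this patch, but for orientation: an action module (gact.c, mirred.c and the others added later in this commit) hands a struct tc_action_ops to the two functions above from its module init/exit hooks. The sketch below is hypothetical — every "my_*" name is invented and only fields the code above actually dereferences (kind, owner, act) are populated; the headers are assumed to be the same ones act_api.c itself pulls in.

/* Hypothetical sketch (not from this patch) of an action module wiring
 * itself into the registry maintained by tcf_register_action() and
 * tcf_unregister_action() above.
 */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/pkt_cls.h>
#include <net/act_api.h>

static int my_act(struct sk_buff **pskb, struct tc_action *a)
{
	/* A real action would inspect or modify *pskb here. */
	return TC_ACT_OK;
}

static struct tc_action_ops my_act_ops = {
	.kind	= "myact",		/* looked up by tc_lookup_action_n() */
	.owner	= THIS_MODULE,		/* pinned via try_module_get() */
	.act	= my_act,		/* called from tcf_action_exec() */
};

static int __init my_act_module_init(void)
{
	/* Fails with -EEXIST if the kind (or type) is already registered. */
	return tcf_register_action(&my_act_ops);
}

static void __exit my_act_module_exit(void)
{
	tcf_unregister_action(&my_act_ops);
}

module_init(my_act_module_init);
module_exit(my_act_module_exit);
MODULE_LICENSE("GPL");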
85
86/* lookup by name */
87static struct tc_action_ops *tc_lookup_action_n(char *kind)
88{
89 struct tc_action_ops *a = NULL;
90
91 if (kind) {
92 read_lock(&act_mod_lock);
93 for (a = act_base; a; a = a->next) {
94 if (strcmp(kind, a->kind) == 0) {
95 if (!try_module_get(a->owner)) {
96 read_unlock(&act_mod_lock);
97 return NULL;
98 }
99 break;
100 }
101 }
102 read_unlock(&act_mod_lock);
103 }
104 return a;
105}
106
107/* lookup by rtattr */
108static struct tc_action_ops *tc_lookup_action(struct rtattr *kind)
109{
110 struct tc_action_ops *a = NULL;
111
112 if (kind) {
113 read_lock(&act_mod_lock);
114 for (a = act_base; a; a = a->next) {
115 if (rtattr_strcmp(kind, a->kind) == 0) {
116 if (!try_module_get(a->owner)) {
117 read_unlock(&act_mod_lock);
118 return NULL;
119 }
120 break;
121 }
122 }
123 read_unlock(&act_mod_lock);
124 }
125 return a;
126}
127
128#if 0
129/* lookup by id */
130static struct tc_action_ops *tc_lookup_action_id(u32 type)
131{
132 struct tc_action_ops *a = NULL;
133
134 if (type) {
135 read_lock(&act_mod_lock);
136 for (a = act_base; a; a = a->next) {
137 if (a->type == type) {
138 if (!try_module_get(a->owner)) {
139 read_unlock(&act_mod_lock);
140 return NULL;
141 }
142 break;
143 }
144 }
145 read_unlock(&act_mod_lock);
146 }
147 return a;
148}
149#endif
150
151int tcf_action_exec(struct sk_buff *skb, struct tc_action *act,
152 struct tcf_result *res)
153{
154 struct tc_action *a;
155 int ret = -1;
156
157 if (skb->tc_verd & TC_NCLS) {
158 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
159 D2PRINTK("(%p)tcf_action_exec: cleared TC_NCLS in %s out %s\n",
160 skb, skb->input_dev ? skb->input_dev->name : "xxx",
161 skb->dev->name);
162 ret = TC_ACT_OK;
163 goto exec_done;
164 }
165 while ((a = act) != NULL) {
166repeat:
167 if (a->ops && a->ops->act) {
168 ret = a->ops->act(&skb, a);
169 if (TC_MUNGED & skb->tc_verd) {
170 /* copied already, allow trampling */
171 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
172 skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd);
173 }
174 if (ret != TC_ACT_PIPE)
175 goto exec_done;
176 if (ret == TC_ACT_REPEAT)
177 goto repeat; /* we need a ttl - JHS */
178 }
179 act = a->next;
180 }
181exec_done:
182 if (skb->tc_classid > 0) {
183 res->classid = skb->tc_classid;
184 res->class = 0;
185 skb->tc_classid = 0;
186 }
187 return ret;
188}
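
Again purely illustrative and not from this patch: a caller holding the action chain of a matched filter would invoke tcf_action_exec() per packet and act on the verdict. Only the TC_ACT_* codes from <linux/pkt_cls.h> and the calling convention visible above are assumed; my_run_actions() and its accept/drop policy are invented.

/* Hypothetical caller of tcf_action_exec(): run the action chain bound to
 * a matched filter and turn the verdict into an accept/drop decision.
 * This is a simplified sketch, not the real classifier/qdisc glue.
 */
static int my_run_actions(struct sk_buff *skb, struct tc_action *chain,
			  struct tcf_result *res)
{
	int verdict = tcf_action_exec(skb, chain, res);

	switch (verdict) {
	case TC_ACT_SHOT:
		return -1;	/* tell the caller to drop the packet */
	case TC_ACT_OK:
	default:
		return 0;	/* accept; res->classid may have been filled in */
	}
}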
189
190void tcf_action_destroy(struct tc_action *act, int bind)
191{
192 struct tc_action *a;
193
194 for (a = act; a; a = act) {
195 if (a->ops && a->ops->cleanup) {
196 DPRINTK("tcf_action_destroy destroying %p next %p\n",
197 a, a->next);
198 if (a->ops->cleanup(a, bind) == ACT_P_DELETED)
199 module_put(a->ops->owner);
200 act = act->next;
201 kfree(a);
202 } else { /*FIXME: Remove later - catch insertion bugs*/
203 printk("tcf_action_destroy: BUG? destroying NULL ops\n");
204 act = act->next;
205 kfree(a);
206 }
207 }
208}
209
210int
211tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
212{
213 int err = -EINVAL;
214
215 if (a->ops == NULL || a->ops->dump == NULL)
216 return err;
217 return a->ops->dump(skb, a, bind, ref);
218}
219
220int
221tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
222{
223 int err = -EINVAL;
224 unsigned char *b = skb->tail;
225 struct rtattr *r;
226
227 if (a->ops == NULL || a->ops->dump == NULL)
228 return err;
229
230 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, a->ops->kind);
231 if (tcf_action_copy_stats(skb, a, 0))
232 goto rtattr_failure;
233 r = (struct rtattr*) skb->tail;
234 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
235 if ((err = tcf_action_dump_old(skb, a, bind, ref)) > 0) {
236 r->rta_len = skb->tail - (u8*)r;
237 return err;
238 }
239
240rtattr_failure:
241 skb_trim(skb, b - skb->data);
242 return -1;
243}
244
245int
246tcf_action_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref)
247{
248 struct tc_action *a;
249 int err = -EINVAL;
250 unsigned char *b = skb->tail;
251 struct rtattr *r ;
252
253 while ((a = act) != NULL) {
254 r = (struct rtattr*) skb->tail;
255 act = a->next;
256 RTA_PUT(skb, a->order, 0, NULL);
257 err = tcf_action_dump_1(skb, a, bind, ref);
258 if (err < 0)
259 goto rtattr_failure;
260 r->rta_len = skb->tail - (u8*)r;
261 }
262
263 return 0;
264
265rtattr_failure:
266 skb_trim(skb, b - skb->data);
267 return -err;
268}
269
270struct tc_action *tcf_action_init_1(struct rtattr *rta, struct rtattr *est,
271 char *name, int ovr, int bind, int *err)
272{
273 struct tc_action *a;
274 struct tc_action_ops *a_o;
275 char act_name[IFNAMSIZ];
276 struct rtattr *tb[TCA_ACT_MAX+1];
277 struct rtattr *kind;
278
279 *err = -EINVAL;
280
281 if (name == NULL) {
282 if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0)
283 goto err_out;
284 kind = tb[TCA_ACT_KIND-1];
285 if (kind == NULL)
286 goto err_out;
287 if (rtattr_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ)
288 goto err_out;
289 } else {
290 if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ)
291 goto err_out;
292 }
293
294 a_o = tc_lookup_action_n(act_name);
295 if (a_o == NULL) {
296#ifdef CONFIG_KMOD
297 rtnl_unlock();
298 request_module(act_name);
299 rtnl_lock();
300
301 a_o = tc_lookup_action_n(act_name);
302
303 /* We dropped the RTNL semaphore in order to
304 * perform the module load. So, even if we
305 * succeeded in loading the module we have to
306 * tell the caller to replay the request. We
307 * indicate this using -EAGAIN.
308 */
309 if (a_o != NULL) {
310 *err = -EAGAIN;
311 goto err_mod;
312 }
313#endif
314 goto err_out;
315 }
316
317 *err = -ENOMEM;
318 a = kmalloc(sizeof(*a), GFP_KERNEL);
319 if (a == NULL)
320 goto err_mod;
321 memset(a, 0, sizeof(*a));
322
323 /* backward compatibility for policer */
324 if (name == NULL)
325 *err = a_o->init(tb[TCA_ACT_OPTIONS-1], est, a, ovr, bind);
326 else
327 *err = a_o->init(rta, est, a, ovr, bind);
328 if (*err < 0)
329 goto err_free;
330
331 /* The module count goes up only when a brand new policy is created;
332 if it already exists and is only bound to in a_o->init(), then
333 ACT_P_CREATED is not returned (zero is).
334 */
335 if (*err != ACT_P_CREATED)
336 module_put(a_o->owner);
337 a->ops = a_o;
338 DPRINTK("tcf_action_init_1: successful %s\n", act_name);
339
340 *err = 0;
341 return a;
342
343err_free:
344 kfree(a);
345err_mod:
346 module_put(a_o->owner);
347err_out:
348 return NULL;
349}
350
351struct tc_action *tcf_action_init(struct rtattr *rta, struct rtattr *est,
352 char *name, int ovr, int bind, int *err)
353{
354 struct rtattr *tb[TCA_ACT_MAX_PRIO+1];
355 struct tc_action *head = NULL, *act, *act_prev = NULL;
356 int i;
357
358 if (rtattr_parse_nested(tb, TCA_ACT_MAX_PRIO, rta) < 0) {
359 *err = -EINVAL;
360 return head;
361 }
362
363 for (i=0; i < TCA_ACT_MAX_PRIO && tb[i]; i++) {
364 act = tcf_action_init_1(tb[i], est, name, ovr, bind, err);
365 if (act == NULL)
366 goto err;
367 act->order = i+1;
368
369 if (head == NULL)
370 head = act;
371 else
372 act_prev->next = act;
373 act_prev = act;
374 }
375 return head;
376
377err:
378 if (head != NULL)
379 tcf_action_destroy(head, bind);
380 return NULL;
381}
382
383int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
384 int compat_mode)
385{
386 int err = 0;
387 struct gnet_dump d;
388 struct tcf_act_hdr *h = a->priv;
389
390 if (h == NULL)
391 goto errout;
392
393 /* compat_mode being true specifies a call that is supposed
394 * to add additional backward compatibility statistic TLVs.
395 */
396 if (compat_mode) {
397 if (a->type == TCA_OLD_COMPAT)
398 err = gnet_stats_start_copy_compat(skb, 0,
399 TCA_STATS, TCA_XSTATS, h->stats_lock, &d);
400 else
401 return 0;
402 } else
403 err = gnet_stats_start_copy(skb, TCA_ACT_STATS,
404 h->stats_lock, &d);
405
406 if (err < 0)
407 goto errout;
408
409 if (a->ops != NULL && a->ops->get_stats != NULL)
410 if (a->ops->get_stats(skb, a) < 0)
411 goto errout;
412
413 if (gnet_stats_copy_basic(&d, &h->bstats) < 0 ||
414#ifdef CONFIG_NET_ESTIMATOR
415 gnet_stats_copy_rate_est(&d, &h->rate_est) < 0 ||
416#endif
417 gnet_stats_copy_queue(&d, &h->qstats) < 0)
418 goto errout;
419
420 if (gnet_stats_finish_copy(&d) < 0)
421 goto errout;
422
423 return 0;
424
425errout:
426 return -1;
427}
428
429static int
430tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq,
431 unsigned flags, int event, int bind, int ref)
432{
433 struct tcamsg *t;
434 struct nlmsghdr *nlh;
435 unsigned char *b = skb->tail;
436 struct rtattr *x;
437
438 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*t));
439 nlh->nlmsg_flags = flags;
440 t = NLMSG_DATA(nlh);
441 t->tca_family = AF_UNSPEC;
442
443 x = (struct rtattr*) skb->tail;
444 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
445
446 if (tcf_action_dump(skb, a, bind, ref) < 0)
447 goto rtattr_failure;
448
449 x->rta_len = skb->tail - (u8*)x;
450
451 nlh->nlmsg_len = skb->tail - b;
452 return skb->len;
453
454rtattr_failure:
455nlmsg_failure:
456 skb_trim(skb, b - skb->data);
457 return -1;
458}
459
460static int
461act_get_notify(u32 pid, struct nlmsghdr *n, struct tc_action *a, int event)
462{
463 struct sk_buff *skb;
464 int err = 0;
465
466 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
467 if (!skb)
468 return -ENOBUFS;
469 if (tca_get_fill(skb, a, pid, n->nlmsg_seq, 0, event, 0, 0) <= 0) {
470 kfree_skb(skb);
471 return -EINVAL;
472 }
473 err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
474 if (err > 0)
475 err = 0;
476 return err;
477}
478
479static struct tc_action *
480tcf_action_get_1(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int *err)
481{
482 struct rtattr *tb[TCA_ACT_MAX+1];
483 struct tc_action *a;
484 int index;
485
486 *err = -EINVAL;
487 if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0)
488 return NULL;
489
490 if (tb[TCA_ACT_INDEX - 1] == NULL ||
491 RTA_PAYLOAD(tb[TCA_ACT_INDEX - 1]) < sizeof(index))
492 return NULL;
493 index = *(int *)RTA_DATA(tb[TCA_ACT_INDEX - 1]);
494
495 *err = -ENOMEM;
496 a = kmalloc(sizeof(struct tc_action), GFP_KERNEL);
497 if (a == NULL)
498 return NULL;
499 memset(a, 0, sizeof(struct tc_action));
500
501 *err = -EINVAL;
502 a->ops = tc_lookup_action(tb[TCA_ACT_KIND - 1]);
503 if (a->ops == NULL)
504 goto err_free;
505 if (a->ops->lookup == NULL)
506 goto err_mod;
507 *err = -ENOENT;
508 if (a->ops->lookup(a, index) == 0)
509 goto err_mod;
510
511 module_put(a->ops->owner);
512 *err = 0;
513 return a;
514err_mod:
515 module_put(a->ops->owner);
516err_free:
517 kfree(a);
518 return NULL;
519}
520
521static void cleanup_a(struct tc_action *act)
522{
523 struct tc_action *a;
524
525 for (a = act; a; a = act) {
526 act = a->next;
527 kfree(a);
528 }
529}
530
531static struct tc_action *create_a(int i)
532{
533 struct tc_action *act;
534
535 act = kmalloc(sizeof(*act), GFP_KERNEL);
536 if (act == NULL) {
537 printk("create_a: failed to alloc!\n");
538 return NULL;
539 }
540 memset(act, 0, sizeof(*act));
541 act->order = i;
542 return act;
543}
544
545static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid)
546{
547 struct sk_buff *skb;
548 unsigned char *b;
549 struct nlmsghdr *nlh;
550 struct tcamsg *t;
551 struct netlink_callback dcb;
552 struct rtattr *x;
553 struct rtattr *tb[TCA_ACT_MAX+1];
554 struct rtattr *kind;
555 struct tc_action *a = create_a(0);
556 int err = -EINVAL;
557
558 if (a == NULL) {
559 printk("tca_action_flush: couldn't create tc_action\n");
560 return err;
561 }
562
563 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
564 if (!skb) {
565 printk("tca_action_flush: failed skb alloc\n");
566 kfree(a);
567 return -ENOBUFS;
568 }
569
570 b = (unsigned char *)skb->tail;
571
572 if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0)
573 goto err_out;
574
575 kind = tb[TCA_ACT_KIND-1];
576 a->ops = tc_lookup_action(kind);
577 if (a->ops == NULL)
578 goto err_out;
579
580 nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t));
581 t = NLMSG_DATA(nlh);
582 t->tca_family = AF_UNSPEC;
583
584 x = (struct rtattr *) skb->tail;
585 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
586
587 err = a->ops->walk(skb, &dcb, RTM_DELACTION, a);
588 if (err < 0)
589 goto rtattr_failure;
590
591 x->rta_len = skb->tail - (u8 *) x;
592
593 nlh->nlmsg_len = skb->tail - b;
594 nlh->nlmsg_flags |= NLM_F_ROOT;
595 module_put(a->ops->owner);
596 kfree(a);
597 err = rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
598 if (err > 0)
599 return 0;
600
601 return err;
602
603rtattr_failure:
604 module_put(a->ops->owner);
605nlmsg_failure:
606err_out:
607 kfree_skb(skb);
608 kfree(a);
609 return err;
610}
611
612static int
613tca_action_gd(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int event)
614{
615 int i, ret = 0;
616 struct rtattr *tb[TCA_ACT_MAX_PRIO+1];
617 struct tc_action *head = NULL, *act, *act_prev = NULL;
618
619 if (rtattr_parse_nested(tb, TCA_ACT_MAX_PRIO, rta) < 0)
620 return -EINVAL;
621
622 if (event == RTM_DELACTION && n->nlmsg_flags&NLM_F_ROOT) {
623 if (tb[0] != NULL && tb[1] == NULL)
624 return tca_action_flush(tb[0], n, pid);
625 }
626
627 for (i=0; i < TCA_ACT_MAX_PRIO && tb[i]; i++) {
628 act = tcf_action_get_1(tb[i], n, pid, &ret);
629 if (act == NULL)
630 goto err;
631 act->order = i+1;
632
633 if (head == NULL)
634 head = act;
635 else
636 act_prev->next = act;
637 act_prev = act;
638 }
639
640 if (event == RTM_GETACTION)
641 ret = act_get_notify(pid, n, head, event);
642 else { /* delete */
643 struct sk_buff *skb;
644
645 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
646 if (!skb) {
647 ret = -ENOBUFS;
648 goto err;
649 }
650
651 if (tca_get_fill(skb, head, pid, n->nlmsg_seq, 0, event,
652 0, 1) <= 0) {
653 kfree_skb(skb);
654 ret = -EINVAL;
655 goto err;
656 }
657
658 /* now do the delete */
659 tcf_action_destroy(head, 0);
660 ret = rtnetlink_send(skb, pid, RTMGRP_TC,
661 n->nlmsg_flags&NLM_F_ECHO);
662 if (ret > 0)
663 return 0;
664 return ret;
665 }
666err:
667 cleanup_a(head);
668 return ret;
669}
670
671static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event,
672 unsigned flags)
673{
674 struct tcamsg *t;
675 struct nlmsghdr *nlh;
676 struct sk_buff *skb;
677 struct rtattr *x;
678 unsigned char *b;
679 int err = 0;
680
681 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
682 if (!skb)
683 return -ENOBUFS;
684
685 b = (unsigned char *)skb->tail;
686
687 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*t));
688 nlh->nlmsg_flags = flags;
689 t = NLMSG_DATA(nlh);
690 t->tca_family = AF_UNSPEC;
691
692 x = (struct rtattr*) skb->tail;
693 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
694
695 if (tcf_action_dump(skb, a, 0, 0) < 0)
696 goto rtattr_failure;
697
698 x->rta_len = skb->tail - (u8*)x;
699
700 nlh->nlmsg_len = skb->tail - b;
701 NETLINK_CB(skb).dst_groups = RTMGRP_TC;
702
703 err = rtnetlink_send(skb, pid, RTMGRP_TC, flags&NLM_F_ECHO);
704 if (err > 0)
705 err = 0;
706 return err;
707
708rtattr_failure:
709nlmsg_failure:
710 skb_trim(skb, b - skb->data);
711 return -1;
712}
713
714
715static int
716tcf_action_add(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int ovr)
717{
718 int ret = 0;
719 struct tc_action *act;
720 struct tc_action *a;
721 u32 seq = n->nlmsg_seq;
722
723 act = tcf_action_init(rta, NULL, NULL, ovr, 0, &ret);
724 if (act == NULL)
725 goto done;
726
727 /* dump then free all the actions after update; inserted policy
728 * stays intact
729 * */
730 ret = tcf_add_notify(act, pid, seq, RTM_NEWACTION, n->nlmsg_flags);
731 for (a = act; a; a = act) {
732 act = a->next;
733 kfree(a);
734 }
735done:
736 return ret;
737}
738
739static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
740{
741 struct rtattr **tca = arg;
742 u32 pid = skb ? NETLINK_CB(skb).pid : 0;
743 int ret = 0, ovr = 0;
744
745 if (tca[TCA_ACT_TAB-1] == NULL) {
746 printk("tc_ctl_action: received NO action attribs\n");
747 return -EINVAL;
748 }
749
750 /* n->nlmsg_flags&NLM_F_CREATE
751 * */
752 switch (n->nlmsg_type) {
753 case RTM_NEWACTION:
754 /* We are going to assume that all other flags
755 * imply create only if it doesn't exist.
756 * Note that CREATE | EXCL implies that,
757 * but since we want to avoid ambiguity (e.g. when flags
758 * is zero) we just set this.
759 */
760 if (n->nlmsg_flags&NLM_F_REPLACE)
761 ovr = 1;
762replay:
763 ret = tcf_action_add(tca[TCA_ACT_TAB-1], n, pid, ovr);
764 if (ret == -EAGAIN)
765 goto replay;
766 break;
767 case RTM_DELACTION:
768 ret = tca_action_gd(tca[TCA_ACT_TAB-1], n, pid, RTM_DELACTION);
769 break;
770 case RTM_GETACTION:
771 ret = tca_action_gd(tca[TCA_ACT_TAB-1], n, pid, RTM_GETACTION);
772 break;
773 default:
774 BUG();
775 }
776
777 return ret;
778}
779
780static char *
781find_dump_kind(struct nlmsghdr *n)
782{
783 struct rtattr *tb1, *tb2[TCA_ACT_MAX+1];
784 struct rtattr *tb[TCA_ACT_MAX_PRIO + 1];
785 struct rtattr *rta[TCAA_MAX + 1];
786 struct rtattr *kind;
787 int min_len = NLMSG_LENGTH(sizeof(struct tcamsg));
788 int attrlen = n->nlmsg_len - NLMSG_ALIGN(min_len);
789 struct rtattr *attr = (void *) n + NLMSG_ALIGN(min_len);
790
791 if (rtattr_parse(rta, TCAA_MAX, attr, attrlen) < 0)
792 return NULL;
793 tb1 = rta[TCA_ACT_TAB - 1];
794 if (tb1 == NULL)
795 return NULL;
796
797 if (rtattr_parse(tb, TCA_ACT_MAX_PRIO, RTA_DATA(tb1),
798 NLMSG_ALIGN(RTA_PAYLOAD(tb1))) < 0)
799 return NULL;
800 if (tb[0] == NULL)
801 return NULL;
802
803 if (rtattr_parse(tb2, TCA_ACT_MAX, RTA_DATA(tb[0]),
804 RTA_PAYLOAD(tb[0])) < 0)
805 return NULL;
806 kind = tb2[TCA_ACT_KIND-1];
807
808 return (char *) RTA_DATA(kind);
809}
810
811static int
812tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
813{
814 struct nlmsghdr *nlh;
815 unsigned char *b = skb->tail;
816 struct rtattr *x;
817 struct tc_action_ops *a_o;
818 struct tc_action a;
819 int ret = 0;
820 struct tcamsg *t = (struct tcamsg *) NLMSG_DATA(cb->nlh);
821 char *kind = find_dump_kind(cb->nlh);
822
823 if (kind == NULL) {
824 printk("tc_dump_action: action bad kind\n");
825 return 0;
826 }
827
828 a_o = tc_lookup_action_n(kind);
829 if (a_o == NULL) {
830 printk("failed to find %s\n", kind);
831 return 0;
832 }
833
834 memset(&a, 0, sizeof(struct tc_action));
835 a.ops = a_o;
836
837 if (a_o->walk == NULL) {
838 printk("tc_dump_action: %s !capable of dumping table\n", kind);
839 goto rtattr_failure;
840 }
841
842 nlh = NLMSG_PUT(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
843 cb->nlh->nlmsg_type, sizeof(*t));
844 t = NLMSG_DATA(nlh);
845 t->tca_family = AF_UNSPEC;
846
847 x = (struct rtattr *) skb->tail;
848 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
849
850 ret = a_o->walk(skb, cb, RTM_GETACTION, &a);
851 if (ret < 0)
852 goto rtattr_failure;
853
854 if (ret > 0) {
855 x->rta_len = skb->tail - (u8 *) x;
856 ret = skb->len;
857 } else
858 skb_trim(skb, (u8*)x - skb->data);
859
860 nlh->nlmsg_len = skb->tail - b;
861 if (NETLINK_CB(cb->skb).pid && ret)
862 nlh->nlmsg_flags |= NLM_F_MULTI;
863 module_put(a_o->owner);
864 return skb->len;
865
866rtattr_failure:
867nlmsg_failure:
868 module_put(a_o->owner);
869 skb_trim(skb, b - skb->data);
870 return skb->len;
871}
872
873static int __init tc_action_init(void)
874{
875 struct rtnetlink_link *link_p = rtnetlink_links[PF_UNSPEC];
876
877 if (link_p) {
878 link_p[RTM_NEWACTION-RTM_BASE].doit = tc_ctl_action;
879 link_p[RTM_DELACTION-RTM_BASE].doit = tc_ctl_action;
880 link_p[RTM_GETACTION-RTM_BASE].doit = tc_ctl_action;
881 link_p[RTM_GETACTION-RTM_BASE].dumpit = tc_dump_action;
882 }
883
884 printk("TC classifier action (bugs to netdev@oss.sgi.com cc "
885 "hadi@cyberus.ca)\n");
886 return 0;
887}
888
889subsys_initcall(tc_action_init);
890
891EXPORT_SYMBOL(tcf_register_action);
892EXPORT_SYMBOL(tcf_unregister_action);
893EXPORT_SYMBOL(tcf_action_exec);
894EXPORT_SYMBOL(tcf_action_dump_1);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
new file mode 100644
index 000000000000..56e66c3fe0fa
--- /dev/null
+++ b/net/sched/cls_api.c
@@ -0,0 +1,642 @@
1/*
2 * net/sched/cls_api.c Packet classifier API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Changes:
12 *
13 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
14 *
15 */
16
17#include <asm/uaccess.h>
18#include <asm/system.h>
19#include <linux/bitops.h>
20#include <linux/config.h>
21#include <linux/module.h>
22#include <linux/types.h>
23#include <linux/kernel.h>
24#include <linux/sched.h>
25#include <linux/string.h>
26#include <linux/mm.h>
27#include <linux/socket.h>
28#include <linux/sockios.h>
29#include <linux/in.h>
30#include <linux/errno.h>
31#include <linux/interrupt.h>
32#include <linux/netdevice.h>
33#include <linux/skbuff.h>
34#include <linux/rtnetlink.h>
35#include <linux/init.h>
36#include <linux/kmod.h>
37#include <net/sock.h>
38#include <net/pkt_sched.h>
39#include <net/pkt_cls.h>
40
41#if 0 /* control */
42#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
43#else
44#define DPRINTK(format,args...)
45#endif
46
47/* The list of all installed classifier types */
48
49static struct tcf_proto_ops *tcf_proto_base;
50
51/* Protects the list of registered TC modules. It is a pure SMP lock. */
52static DEFINE_RWLOCK(cls_mod_lock);
53
54/* Find classifier type by string name */
55
56static struct tcf_proto_ops * tcf_proto_lookup_ops(struct rtattr *kind)
57{
58 struct tcf_proto_ops *t = NULL;
59
60 if (kind) {
61 read_lock(&cls_mod_lock);
62 for (t = tcf_proto_base; t; t = t->next) {
63 if (rtattr_strcmp(kind, t->kind) == 0) {
64 if (!try_module_get(t->owner))
65 t = NULL;
66 break;
67 }
68 }
69 read_unlock(&cls_mod_lock);
70 }
71 return t;
72}
73
74/* Register(unregister) new classifier type */
75
76int register_tcf_proto_ops(struct tcf_proto_ops *ops)
77{
78 struct tcf_proto_ops *t, **tp;
79 int rc = -EEXIST;
80
81 write_lock(&cls_mod_lock);
82 for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next)
83 if (!strcmp(ops->kind, t->kind))
84 goto out;
85
86 ops->next = NULL;
87 *tp = ops;
88 rc = 0;
89out:
90 write_unlock(&cls_mod_lock);
91 return rc;
92}
93
94int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
95{
96 struct tcf_proto_ops *t, **tp;
97 int rc = -ENOENT;
98
99 write_lock(&cls_mod_lock);
100 for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next)
101 if (t == ops)
102 break;
103
104 if (!t)
105 goto out;
106 *tp = t->next;
107 rc = 0;
108out:
109 write_unlock(&cls_mod_lock);
110 return rc;
111}
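
As with the action API earlier in this commit, a classifier module passes a struct tcf_proto_ops to the pair of functions above from its init/exit hooks. The skeleton below is only a sketch: the "my_*" names are invented, the callbacks are stubs that merely satisfy the unconditional calls made later in this file, and the includes simply mirror the ones cls_api.c uses. A real classifier (cls_basic.c and friends added later in this commit) does much more.

/* Hypothetical, heavily stripped-down classifier module.  Only the
 * callbacks that tc_ctl_tfilter()/tcf_fill_node() below invoke
 * unconditionally are stubbed; .walk and .dump are left NULL, which the
 * callers in this file explicitly tolerate.
 */
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>

static int my_cls_classify(struct sk_buff *skb, struct tcf_proto *tp,
			   struct tcf_result *res)
{
	return -1;			/* never matches in this sketch */
}

static int my_cls_init(struct tcf_proto *tp)
{
	return 0;			/* no per-tp state to set up */
}

static unsigned long my_cls_get(struct tcf_proto *tp, u32 handle)
{
	return 0;			/* "no such handle" (see the fh == 0 check below) */
}

static int my_cls_change(struct tcf_proto *tp, unsigned long base, u32 handle,
			 struct rtattr **tca, unsigned long *arg)
{
	return -EOPNOTSUPP;		/* filter creation not shown here */
}

static int my_cls_delete(struct tcf_proto *tp, unsigned long arg)
{
	return -EOPNOTSUPP;
}

static struct tcf_proto_ops my_cls_ops = {
	.kind		= "mycls",		/* matched by tcf_proto_lookup_ops() */
	.owner		= THIS_MODULE,
	.classify	= my_cls_classify,	/* cached into tp->classify */
	.init		= my_cls_init,
	.get		= my_cls_get,
	.change		= my_cls_change,
	.delete		= my_cls_delete,
};

static int __init my_cls_module_init(void)
{
	return register_tcf_proto_ops(&my_cls_ops);
}

static void __exit my_cls_module_exit(void)
{
	unregister_tcf_proto_ops(&my_cls_ops);
}

module_init(my_cls_module_init);
module_exit(my_cls_module_exit);
MODULE_LICENSE("GPL");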
112
113static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n,
114 struct tcf_proto *tp, unsigned long fh, int event);
115
116
117/* Select a new prio value from the range managed by the kernel. */
118
119static __inline__ u32 tcf_auto_prio(struct tcf_proto *tp)
120{
121 u32 first = TC_H_MAKE(0xC0000000U,0U);
122
123 if (tp)
124 first = tp->prio-1;
125
126 return first;
127}
128
129/* Add/change/delete/get a filter node */
130
131static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
132{
133 struct rtattr **tca;
134 struct tcmsg *t;
135 u32 protocol;
136 u32 prio;
137 u32 nprio;
138 u32 parent;
139 struct net_device *dev;
140 struct Qdisc *q;
141 struct tcf_proto **back, **chain;
142 struct tcf_proto *tp;
143 struct tcf_proto_ops *tp_ops;
144 struct Qdisc_class_ops *cops;
145 unsigned long cl;
146 unsigned long fh;
147 int err;
148
149replay:
150 tca = arg;
151 t = NLMSG_DATA(n);
152 protocol = TC_H_MIN(t->tcm_info);
153 prio = TC_H_MAJ(t->tcm_info);
154 nprio = prio;
155 parent = t->tcm_parent;
156 cl = 0;
157
158 if (prio == 0) {
159 /* If no priority is given, the user wants us to allocate it. */
160 if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
161 return -ENOENT;
162 prio = TC_H_MAKE(0x80000000U,0U);
163 }
164
165 /* Find head of filter chain. */
166
167 /* Find link */
168 if ((dev = __dev_get_by_index(t->tcm_ifindex)) == NULL)
169 return -ENODEV;
170
171 /* Find qdisc */
172 if (!parent) {
173 q = dev->qdisc_sleeping;
174 parent = q->handle;
175 } else if ((q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent))) == NULL)
176 return -EINVAL;
177
178 /* Is it classful? */
179 if ((cops = q->ops->cl_ops) == NULL)
180 return -EINVAL;
181
182 /* Are we searching for a filter attached to a class? */
183 if (TC_H_MIN(parent)) {
184 cl = cops->get(q, parent);
185 if (cl == 0)
186 return -ENOENT;
187 }
188
189 /* And the last stroke */
190 chain = cops->tcf_chain(q, cl);
191 err = -EINVAL;
192 if (chain == NULL)
193 goto errout;
194
195 /* Check the chain for existence of proto-tcf with this priority */
196 for (back = chain; (tp=*back) != NULL; back = &tp->next) {
197 if (tp->prio >= prio) {
198 if (tp->prio == prio) {
199 if (!nprio || (tp->protocol != protocol && protocol))
200 goto errout;
201 } else
202 tp = NULL;
203 break;
204 }
205 }
206
207 if (tp == NULL) {
208 /* Proto-tcf does not exist, create new one */
209
210 if (tca[TCA_KIND-1] == NULL || !protocol)
211 goto errout;
212
213 err = -ENOENT;
214 if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
215 goto errout;
216
217
218 /* Create new proto tcf */
219
220 err = -ENOBUFS;
221 if ((tp = kmalloc(sizeof(*tp), GFP_KERNEL)) == NULL)
222 goto errout;
223 err = -EINVAL;
224 tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND-1]);
225 if (tp_ops == NULL) {
226#ifdef CONFIG_KMOD
227 struct rtattr *kind = tca[TCA_KIND-1];
228 char name[IFNAMSIZ];
229
230 if (kind != NULL &&
231 rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
232 rtnl_unlock();
233 request_module("cls_%s", name);
234 rtnl_lock();
235 tp_ops = tcf_proto_lookup_ops(kind);
236 /* We dropped the RTNL semaphore in order to
237 * perform the module load. So, even if we
238 * succeeded in loading the module we have to
239 * replay the request. We indicate this using
240 * -EAGAIN.
241 */
242 if (tp_ops != NULL) {
243 module_put(tp_ops->owner);
244 err = -EAGAIN;
245 }
246 }
247#endif
248 kfree(tp);
249 goto errout;
250 }
251 memset(tp, 0, sizeof(*tp));
252 tp->ops = tp_ops;
253 tp->protocol = protocol;
254 tp->prio = nprio ? : tcf_auto_prio(*back);
255 tp->q = q;
256 tp->classify = tp_ops->classify;
257 tp->classid = parent;
258 if ((err = tp_ops->init(tp)) != 0) {
259 module_put(tp_ops->owner);
260 kfree(tp);
261 goto errout;
262 }
263
264 qdisc_lock_tree(dev);
265 tp->next = *back;
266 *back = tp;
267 qdisc_unlock_tree(dev);
268
269 } else if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], tp->ops->kind))
270 goto errout;
271
272 fh = tp->ops->get(tp, t->tcm_handle);
273
274 if (fh == 0) {
275 if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
276 qdisc_lock_tree(dev);
277 *back = tp->next;
278 qdisc_unlock_tree(dev);
279
280 tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER);
281 tcf_destroy(tp);
282 err = 0;
283 goto errout;
284 }
285
286 err = -ENOENT;
287 if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
288 goto errout;
289 } else {
290 switch (n->nlmsg_type) {
291 case RTM_NEWTFILTER:
292 err = -EEXIST;
293 if (n->nlmsg_flags&NLM_F_EXCL)
294 goto errout;
295 break;
296 case RTM_DELTFILTER:
297 err = tp->ops->delete(tp, fh);
298 if (err == 0)
299 tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER);
300 goto errout;
301 case RTM_GETTFILTER:
302 err = tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER);
303 goto errout;
304 default:
305 err = -EINVAL;
306 goto errout;
307 }
308 }
309
310 err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh);
311 if (err == 0)
312 tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER);
313
314errout:
315 if (cl)
316 cops->put(q, cl);
317 if (err == -EAGAIN)
318 /* Replay the request. */
319 goto replay;
320 return err;
321}
322
323static int
324tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh,
325 u32 pid, u32 seq, unsigned flags, int event)
326{
327 struct tcmsg *tcm;
328 struct nlmsghdr *nlh;
329 unsigned char *b = skb->tail;
330
331 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
332 nlh->nlmsg_flags = flags;
333 tcm = NLMSG_DATA(nlh);
334 tcm->tcm_family = AF_UNSPEC;
335 tcm->tcm_ifindex = tp->q->dev->ifindex;
336 tcm->tcm_parent = tp->classid;
337 tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
338 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, tp->ops->kind);
339 tcm->tcm_handle = fh;
340 if (RTM_DELTFILTER != event) {
341 tcm->tcm_handle = 0;
342 if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0)
343 goto rtattr_failure;
344 }
345 nlh->nlmsg_len = skb->tail - b;
346 return skb->len;
347
348nlmsg_failure:
349rtattr_failure:
350 skb_trim(skb, b - skb->data);
351 return -1;
352}
353
354static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n,
355 struct tcf_proto *tp, unsigned long fh, int event)
356{
357 struct sk_buff *skb;
358 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
359
360 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
361 if (!skb)
362 return -ENOBUFS;
363
364 if (tcf_fill_node(skb, tp, fh, pid, n->nlmsg_seq, 0, event) <= 0) {
365 kfree_skb(skb);
366 return -EINVAL;
367 }
368
369 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
370}
371
372struct tcf_dump_args
373{
374 struct tcf_walker w;
375 struct sk_buff *skb;
376 struct netlink_callback *cb;
377};
378
379static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, struct tcf_walker *arg)
380{
381 struct tcf_dump_args *a = (void*)arg;
382
383 return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).pid,
384 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER);
385}
386
387static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
388{
389 int t;
390 int s_t;
391 struct net_device *dev;
392 struct Qdisc *q;
393 struct tcf_proto *tp, **chain;
394 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
395 unsigned long cl = 0;
396 struct Qdisc_class_ops *cops;
397 struct tcf_dump_args arg;
398
399 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
400 return skb->len;
401 if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
402 return skb->len;
403
404 read_lock_bh(&qdisc_tree_lock);
405 if (!tcm->tcm_parent)
406 q = dev->qdisc_sleeping;
407 else
408 q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
409 if (!q)
410 goto out;
411 if ((cops = q->ops->cl_ops) == NULL)
412 goto errout;
413 if (TC_H_MIN(tcm->tcm_parent)) {
414 cl = cops->get(q, tcm->tcm_parent);
415 if (cl == 0)
416 goto errout;
417 }
418 chain = cops->tcf_chain(q, cl);
419 if (chain == NULL)
420 goto errout;
421
422 s_t = cb->args[0];
423
424 for (tp=*chain, t=0; tp; tp = tp->next, t++) {
425 if (t < s_t) continue;
426 if (TC_H_MAJ(tcm->tcm_info) &&
427 TC_H_MAJ(tcm->tcm_info) != tp->prio)
428 continue;
429 if (TC_H_MIN(tcm->tcm_info) &&
430 TC_H_MIN(tcm->tcm_info) != tp->protocol)
431 continue;
432 if (t > s_t)
433 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
434 if (cb->args[1] == 0) {
435 if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid,
436 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) {
437 break;
438 }
439 cb->args[1] = 1;
440 }
441 if (tp->ops->walk == NULL)
442 continue;
443 arg.w.fn = tcf_node_dump;
444 arg.skb = skb;
445 arg.cb = cb;
446 arg.w.stop = 0;
447 arg.w.skip = cb->args[1]-1;
448 arg.w.count = 0;
449 tp->ops->walk(tp, &arg.w);
450 cb->args[1] = arg.w.count+1;
451 if (arg.w.stop)
452 break;
453 }
454
455 cb->args[0] = t;
456
457errout:
458 if (cl)
459 cops->put(q, cl);
460out:
461 read_unlock_bh(&qdisc_tree_lock);
462 dev_put(dev);
463 return skb->len;
464}
465
466void
467tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts)
468{
469#ifdef CONFIG_NET_CLS_ACT
470 if (exts->action) {
471 tcf_action_destroy(exts->action, TCA_ACT_UNBIND);
472 exts->action = NULL;
473 }
474#elif defined CONFIG_NET_CLS_POLICE
475 if (exts->police) {
476 tcf_police_release(exts->police, TCA_ACT_UNBIND);
477 exts->police = NULL;
478 }
479#endif
480}
481
482
483int
484tcf_exts_validate(struct tcf_proto *tp, struct rtattr **tb,
485 struct rtattr *rate_tlv, struct tcf_exts *exts,
486 struct tcf_ext_map *map)
487{
488 memset(exts, 0, sizeof(*exts));
489
490#ifdef CONFIG_NET_CLS_ACT
491 {
492 int err;
493 struct tc_action *act;
494
495 if (map->police && tb[map->police-1]) {
496 act = tcf_action_init_1(tb[map->police-1], rate_tlv, "police",
497 TCA_ACT_NOREPLACE, TCA_ACT_BIND, &err);
498 if (act == NULL)
499 return err;
500
501 act->type = TCA_OLD_COMPAT;
502 exts->action = act;
503 } else if (map->action && tb[map->action-1]) {
504 act = tcf_action_init(tb[map->action-1], rate_tlv, NULL,
505 TCA_ACT_NOREPLACE, TCA_ACT_BIND, &err);
506 if (act == NULL)
507 return err;
508
509 exts->action = act;
510 }
511 }
512#elif defined CONFIG_NET_CLS_POLICE
513 if (map->police && tb[map->police-1]) {
514 struct tcf_police *p;
515
516 p = tcf_police_locate(tb[map->police-1], rate_tlv);
517 if (p == NULL)
518 return -EINVAL;
519
520 exts->police = p;
521 } else if (map->action && tb[map->action-1])
522 return -EOPNOTSUPP;
523#else
524 if ((map->action && tb[map->action-1]) ||
525 (map->police && tb[map->police-1]))
526 return -EOPNOTSUPP;
527#endif
528
529 return 0;
530}
531
532void
533tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
534 struct tcf_exts *src)
535{
536#ifdef CONFIG_NET_CLS_ACT
537 if (src->action) {
538 struct tc_action *act;
539 tcf_tree_lock(tp);
540 act = xchg(&dst->action, src->action);
541 tcf_tree_unlock(tp);
542 if (act)
543 tcf_action_destroy(act, TCA_ACT_UNBIND);
544 }
545#elif defined CONFIG_NET_CLS_POLICE
546 if (src->police) {
547 struct tcf_police *p;
548 tcf_tree_lock(tp);
549 p = xchg(&dst->police, src->police);
550 tcf_tree_unlock(tp);
551 if (p)
552 tcf_police_release(p, TCA_ACT_UNBIND);
553 }
554#endif
555}
556
557int
558tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts,
559 struct tcf_ext_map *map)
560{
561#ifdef CONFIG_NET_CLS_ACT
562 if (map->action && exts->action) {
563 /*
564 * again for backward compatible mode - we want
565 * to work with both old and new modes of entering
566 * tc data even if iproute2 was newer - jhs
567 */
568 struct rtattr * p_rta = (struct rtattr*) skb->tail;
569
570 if (exts->action->type != TCA_OLD_COMPAT) {
571 RTA_PUT(skb, map->action, 0, NULL);
572 if (tcf_action_dump(skb, exts->action, 0, 0) < 0)
573 goto rtattr_failure;
574 p_rta->rta_len = skb->tail - (u8*)p_rta;
575 } else if (map->police) {
576 RTA_PUT(skb, map->police, 0, NULL);
577 if (tcf_action_dump_old(skb, exts->action, 0, 0) < 0)
578 goto rtattr_failure;
579 p_rta->rta_len = skb->tail - (u8*)p_rta;
580 }
581 }
582#elif defined CONFIG_NET_CLS_POLICE
583 if (map->police && exts->police) {
584 struct rtattr * p_rta = (struct rtattr*) skb->tail;
585
586 RTA_PUT(skb, map->police, 0, NULL);
587
588 if (tcf_police_dump(skb, exts->police) < 0)
589 goto rtattr_failure;
590
591 p_rta->rta_len = skb->tail - (u8*)p_rta;
592 }
593#endif
594 return 0;
595rtattr_failure: __attribute__ ((unused))
596 return -1;
597}
598
599int
600tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts,
601 struct tcf_ext_map *map)
602{
603#ifdef CONFIG_NET_CLS_ACT
604 if (exts->action)
605 if (tcf_action_copy_stats(skb, exts->action, 1) < 0)
606 goto rtattr_failure;
607#elif defined CONFIG_NET_CLS_POLICE
608 if (exts->police)
609 if (tcf_police_dump_stats(skb, exts->police) < 0)
610 goto rtattr_failure;
611#endif
612 return 0;
613rtattr_failure: __attribute__ ((unused))
614 return -1;
615}
616
617static int __init tc_filter_init(void)
618{
619 struct rtnetlink_link *link_p = rtnetlink_links[PF_UNSPEC];
620
621	/* Setup rtnetlink links. It is done here to avoid
622	   exporting a large number of public symbols.
623	 */
624
625 if (link_p) {
626 link_p[RTM_NEWTFILTER-RTM_BASE].doit = tc_ctl_tfilter;
627 link_p[RTM_DELTFILTER-RTM_BASE].doit = tc_ctl_tfilter;
628 link_p[RTM_GETTFILTER-RTM_BASE].doit = tc_ctl_tfilter;
629 link_p[RTM_GETTFILTER-RTM_BASE].dumpit = tc_dump_tfilter;
630 }
631 return 0;
632}
633
634subsys_initcall(tc_filter_init);
635
636EXPORT_SYMBOL(register_tcf_proto_ops);
637EXPORT_SYMBOL(unregister_tcf_proto_ops);
638EXPORT_SYMBOL(tcf_exts_validate);
639EXPORT_SYMBOL(tcf_exts_destroy);
640EXPORT_SYMBOL(tcf_exts_change);
641EXPORT_SYMBOL(tcf_exts_dump);
642EXPORT_SYMBOL(tcf_exts_dump_stats);
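The tcf_exts_* helpers exported above are the shared glue that lets every classifier in this directory carry optional actions (CONFIG_NET_CLS_ACT) or a policer (CONFIG_NET_CLS_POLICE) without open-coding either case. The following is a minimal sketch of the typical call pattern, modelled on cls_basic below; my_ext_map, my_filter, my_change and my_classify are placeholder names, not symbols introduced by this patch, and the attribute slots would normally be the classifier's own TCA_FOO_ACT/TCA_FOO_POLICE values.

	#include <linux/rtnetlink.h>
	#include <linux/skbuff.h>
	#include <net/act_api.h>
	#include <net/pkt_cls.h>

	static struct tcf_ext_map my_ext_map = {
		.action	= 1,	/* e.g. TCA_FOO_ACT    */
		.police	= 2,	/* e.g. TCA_FOO_POLICE */
	};

	struct my_filter {
		struct tcf_exts		exts;
		struct tcf_result	res;
	};

	/* ->change() path: validate the user attributes into a temporary
	 * tcf_exts, then commit it; tcf_exts_change() swaps out the old
	 * extensions under the tree lock and releases them. */
	static int my_change(struct tcf_proto *tp, struct my_filter *f,
			     struct rtattr **tb, struct rtattr *rate_tlv)
	{
		struct tcf_exts e;
		int err;

		err = tcf_exts_validate(tp, tb, rate_tlv, &e, &my_ext_map);
		if (err < 0)
			return err;
		tcf_exts_change(tp, &f->exts, &e);
		return 0;
	}

	/* ->classify() path: a negative return from tcf_exts_exec() means
	 * "no verdict", so callers typically move on to the next filter. */
	static int my_classify(struct sk_buff *skb, struct tcf_proto *tp,
			       struct my_filter *f, struct tcf_result *res)
	{
		*res = f->res;
		return tcf_exts_exec(skb, &f->exts, res);
	}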
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
new file mode 100644
index 000000000000..0d2d4415f334
--- /dev/null
+++ b/net/sched/cls_basic.c
@@ -0,0 +1,303 @@
1/*
2 * net/sched/cls_basic.c Basic Packet Classifier.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 */
11
12#include <linux/config.h>
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/kernel.h>
16#include <linux/sched.h>
17#include <linux/string.h>
18#include <linux/mm.h>
19#include <linux/errno.h>
20#include <linux/rtnetlink.h>
21#include <linux/skbuff.h>
22#include <net/act_api.h>
23#include <net/pkt_cls.h>
24
25struct basic_head
26{
27 u32 hgenerator;
28 struct list_head flist;
29};
30
31struct basic_filter
32{
33 u32 handle;
34 struct tcf_exts exts;
35 struct tcf_ematch_tree ematches;
36 struct tcf_result res;
37 struct list_head link;
38};
39
40static struct tcf_ext_map basic_ext_map = {
41 .action = TCA_BASIC_ACT,
42 .police = TCA_BASIC_POLICE
43};
44
45static int basic_classify(struct sk_buff *skb, struct tcf_proto *tp,
46 struct tcf_result *res)
47{
48 int r;
49 struct basic_head *head = (struct basic_head *) tp->root;
50 struct basic_filter *f;
51
52 list_for_each_entry(f, &head->flist, link) {
53 if (!tcf_em_tree_match(skb, &f->ematches, NULL))
54 continue;
55 *res = f->res;
56 r = tcf_exts_exec(skb, &f->exts, res);
57 if (r < 0)
58 continue;
59 return r;
60 }
61 return -1;
62}
63
64static unsigned long basic_get(struct tcf_proto *tp, u32 handle)
65{
66 unsigned long l = 0UL;
67 struct basic_head *head = (struct basic_head *) tp->root;
68 struct basic_filter *f;
69
70 if (head == NULL)
71 return 0UL;
72
73 list_for_each_entry(f, &head->flist, link)
74 if (f->handle == handle)
75 l = (unsigned long) f;
76
77 return l;
78}
79
80static void basic_put(struct tcf_proto *tp, unsigned long f)
81{
82}
83
84static int basic_init(struct tcf_proto *tp)
85{
86 return 0;
87}
88
89static inline void basic_delete_filter(struct tcf_proto *tp,
90 struct basic_filter *f)
91{
92 tcf_unbind_filter(tp, &f->res);
93 tcf_exts_destroy(tp, &f->exts);
94 tcf_em_tree_destroy(tp, &f->ematches);
95 kfree(f);
96}
97
98static void basic_destroy(struct tcf_proto *tp)
99{
100 struct basic_head *head = (struct basic_head *) xchg(&tp->root, NULL);
101 struct basic_filter *f, *n;
102
103 list_for_each_entry_safe(f, n, &head->flist, link) {
104 list_del(&f->link);
105 basic_delete_filter(tp, f);
106 }
107}
108
109static int basic_delete(struct tcf_proto *tp, unsigned long arg)
110{
111 struct basic_head *head = (struct basic_head *) tp->root;
112 struct basic_filter *t, *f = (struct basic_filter *) arg;
113
114 list_for_each_entry(t, &head->flist, link)
115 if (t == f) {
116 tcf_tree_lock(tp);
117 list_del(&t->link);
118 tcf_tree_unlock(tp);
119 basic_delete_filter(tp, t);
120 return 0;
121 }
122
123 return -ENOENT;
124}
125
126static inline int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f,
127 unsigned long base, struct rtattr **tb,
128 struct rtattr *est)
129{
130 int err = -EINVAL;
131 struct tcf_exts e;
132 struct tcf_ematch_tree t;
133
134 if (tb[TCA_BASIC_CLASSID-1])
135 if (RTA_PAYLOAD(tb[TCA_BASIC_CLASSID-1]) < sizeof(u32))
136 return err;
137
138 err = tcf_exts_validate(tp, tb, est, &e, &basic_ext_map);
139 if (err < 0)
140 return err;
141
142 err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES-1], &t);
143 if (err < 0)
144 goto errout;
145
146 if (tb[TCA_BASIC_CLASSID-1]) {
147 f->res.classid = *(u32*)RTA_DATA(tb[TCA_BASIC_CLASSID-1]);
148 tcf_bind_filter(tp, &f->res, base);
149 }
150
151 tcf_exts_change(tp, &f->exts, &e);
152 tcf_em_tree_change(tp, &f->ematches, &t);
153
154 return 0;
155errout:
156 tcf_exts_destroy(tp, &e);
157 return err;
158}
159
160static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle,
161 struct rtattr **tca, unsigned long *arg)
162{
163 int err = -EINVAL;
164 struct basic_head *head = (struct basic_head *) tp->root;
165 struct rtattr *tb[TCA_BASIC_MAX];
166 struct basic_filter *f = (struct basic_filter *) *arg;
167
168 if (tca[TCA_OPTIONS-1] == NULL)
169 return -EINVAL;
170
171 if (rtattr_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS-1]) < 0)
172 return -EINVAL;
173
174 if (f != NULL) {
175 if (handle && f->handle != handle)
176 return -EINVAL;
177 return basic_set_parms(tp, f, base, tb, tca[TCA_RATE-1]);
178 }
179
180 err = -ENOBUFS;
181 if (head == NULL) {
182 head = kmalloc(sizeof(*head), GFP_KERNEL);
183 if (head == NULL)
184 goto errout;
185
186 memset(head, 0, sizeof(*head));
187 INIT_LIST_HEAD(&head->flist);
188 tp->root = head;
189 }
190
191 f = kmalloc(sizeof(*f), GFP_KERNEL);
192 if (f == NULL)
193 goto errout;
194 memset(f, 0, sizeof(*f));
195
196 err = -EINVAL;
197 if (handle)
198 f->handle = handle;
199 else {
200 int i = 0x80000000;
201 do {
202 if (++head->hgenerator == 0x7FFFFFFF)
203 head->hgenerator = 1;
204 } while (--i > 0 && basic_get(tp, head->hgenerator));
205
206 if (i <= 0) {
207 printk(KERN_ERR "Insufficient number of handles\n");
208 goto errout;
209 }
210
211 f->handle = head->hgenerator;
212 }
213
214 err = basic_set_parms(tp, f, base, tb, tca[TCA_RATE-1]);
215 if (err < 0)
216 goto errout;
217
218 tcf_tree_lock(tp);
219 list_add(&f->link, &head->flist);
220 tcf_tree_unlock(tp);
221 *arg = (unsigned long) f;
222
223 return 0;
224errout:
225 if (*arg == 0UL && f)
226 kfree(f);
227
228 return err;
229}
230
231static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg)
232{
233 struct basic_head *head = (struct basic_head *) tp->root;
234 struct basic_filter *f;
235
236 list_for_each_entry(f, &head->flist, link) {
237 if (arg->count < arg->skip)
238 goto skip;
239
240 if (arg->fn(tp, (unsigned long) f, arg) < 0) {
241 arg->stop = 1;
242 break;
243 }
244skip:
245 arg->count++;
246 }
247}
248
249static int basic_dump(struct tcf_proto *tp, unsigned long fh,
250 struct sk_buff *skb, struct tcmsg *t)
251{
252 struct basic_filter *f = (struct basic_filter *) fh;
253 unsigned char *b = skb->tail;
254 struct rtattr *rta;
255
256 if (f == NULL)
257 return skb->len;
258
259 t->tcm_handle = f->handle;
260
261 rta = (struct rtattr *) b;
262 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
263
264 if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 ||
265 tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0)
266 goto rtattr_failure;
267
268 rta->rta_len = (skb->tail - b);
269 return skb->len;
270
271rtattr_failure:
272 skb_trim(skb, b - skb->data);
273 return -1;
274}
275
276static struct tcf_proto_ops cls_basic_ops = {
277 .kind = "basic",
278 .classify = basic_classify,
279 .init = basic_init,
280 .destroy = basic_destroy,
281 .get = basic_get,
282 .put = basic_put,
283 .change = basic_change,
284 .delete = basic_delete,
285 .walk = basic_walk,
286 .dump = basic_dump,
287 .owner = THIS_MODULE,
288};
289
290static int __init init_basic(void)
291{
292 return register_tcf_proto_ops(&cls_basic_ops);
293}
294
295static void __exit exit_basic(void)
296{
297 unregister_tcf_proto_ops(&cls_basic_ops);
298}
299
300module_init(init_basic)
301module_exit(exit_basic)
302MODULE_LICENSE("GPL");
303
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
new file mode 100644
index 000000000000..fdfc83af3d1f
--- /dev/null
+++ b/net/sched/cls_fw.c
@@ -0,0 +1,378 @@
1/*
2 * net/sched/cls_fw.c Classifier mapping ipchains' fwmark to traffic class.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Changes:
12 * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one
13 * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel).
14 * Alex <alex@pilotsoft.com> : 2004xxyy: Added Action extension
15 *
16 * JHS: We should remove the CONFIG_NET_CLS_IND from here
17 * eventually when the meta match extension is made available
18 *
19 */
20
21#include <linux/config.h>
22#include <linux/module.h>
23#include <asm/uaccess.h>
24#include <asm/system.h>
25#include <linux/bitops.h>
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/sched.h>
29#include <linux/string.h>
30#include <linux/mm.h>
31#include <linux/socket.h>
32#include <linux/sockios.h>
33#include <linux/in.h>
34#include <linux/errno.h>
35#include <linux/interrupt.h>
36#include <linux/if_ether.h>
37#include <linux/inet.h>
38#include <linux/netdevice.h>
39#include <linux/etherdevice.h>
40#include <linux/notifier.h>
41#include <linux/netfilter.h>
42#include <net/ip.h>
43#include <net/route.h>
44#include <linux/skbuff.h>
45#include <net/sock.h>
46#include <net/act_api.h>
47#include <net/pkt_cls.h>
48
49struct fw_head
50{
51 struct fw_filter *ht[256];
52};
53
54struct fw_filter
55{
56 struct fw_filter *next;
57 u32 id;
58 struct tcf_result res;
59#ifdef CONFIG_NET_CLS_IND
60 char indev[IFNAMSIZ];
61#endif /* CONFIG_NET_CLS_IND */
62 struct tcf_exts exts;
63};
64
65static struct tcf_ext_map fw_ext_map = {
66 .action = TCA_FW_ACT,
67 .police = TCA_FW_POLICE
68};
69
70static __inline__ int fw_hash(u32 handle)
71{
72 return handle&0xFF;
73}
74
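/* Note on fw_hash(): only the low eight bits of the mark pick a bucket,
 * so marks that differ only above bit 7 share a chain; the exact match
 * on f->id in fw_classify() below still tells them apart. */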
75static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
76 struct tcf_result *res)
77{
78 struct fw_head *head = (struct fw_head*)tp->root;
79 struct fw_filter *f;
80 int r;
81#ifdef CONFIG_NETFILTER
82 u32 id = skb->nfmark;
83#else
84 u32 id = 0;
85#endif
86
87 if (head != NULL) {
88 for (f=head->ht[fw_hash(id)]; f; f=f->next) {
89 if (f->id == id) {
90 *res = f->res;
91#ifdef CONFIG_NET_CLS_IND
92 if (!tcf_match_indev(skb, f->indev))
93 continue;
94#endif /* CONFIG_NET_CLS_IND */
95 r = tcf_exts_exec(skb, &f->exts, res);
96 if (r < 0)
97 continue;
98
99 return r;
100 }
101 }
102 } else {
103 /* old method */
104 if (id && (TC_H_MAJ(id) == 0 || !(TC_H_MAJ(id^tp->q->handle)))) {
105 res->classid = id;
106 res->class = 0;
107 return 0;
108 }
109 }
110
111 return -1;
112}
113
114static unsigned long fw_get(struct tcf_proto *tp, u32 handle)
115{
116 struct fw_head *head = (struct fw_head*)tp->root;
117 struct fw_filter *f;
118
119 if (head == NULL)
120 return 0;
121
122 for (f=head->ht[fw_hash(handle)]; f; f=f->next) {
123 if (f->id == handle)
124 return (unsigned long)f;
125 }
126 return 0;
127}
128
129static void fw_put(struct tcf_proto *tp, unsigned long f)
130{
131}
132
133static int fw_init(struct tcf_proto *tp)
134{
135 return 0;
136}
137
138static inline void
139fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f)
140{
141 tcf_unbind_filter(tp, &f->res);
142 tcf_exts_destroy(tp, &f->exts);
143 kfree(f);
144}
145
146static void fw_destroy(struct tcf_proto *tp)
147{
148 struct fw_head *head = (struct fw_head*)xchg(&tp->root, NULL);
149 struct fw_filter *f;
150 int h;
151
152 if (head == NULL)
153 return;
154
155 for (h=0; h<256; h++) {
156 while ((f=head->ht[h]) != NULL) {
157 head->ht[h] = f->next;
158 fw_delete_filter(tp, f);
159 }
160 }
161 kfree(head);
162}
163
164static int fw_delete(struct tcf_proto *tp, unsigned long arg)
165{
166 struct fw_head *head = (struct fw_head*)tp->root;
167 struct fw_filter *f = (struct fw_filter*)arg;
168 struct fw_filter **fp;
169
170 if (head == NULL || f == NULL)
171 goto out;
172
173 for (fp=&head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) {
174 if (*fp == f) {
175 tcf_tree_lock(tp);
176 *fp = f->next;
177 tcf_tree_unlock(tp);
178 fw_delete_filter(tp, f);
179 return 0;
180 }
181 }
182out:
183 return -EINVAL;
184}
185
186static int
187fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f,
188 struct rtattr **tb, struct rtattr **tca, unsigned long base)
189{
190 struct tcf_exts e;
191 int err;
192
193 err = tcf_exts_validate(tp, tb, tca[TCA_RATE-1], &e, &fw_ext_map);
194 if (err < 0)
195 return err;
196
197 err = -EINVAL;
198 if (tb[TCA_FW_CLASSID-1]) {
199 if (RTA_PAYLOAD(tb[TCA_FW_CLASSID-1]) != sizeof(u32))
200 goto errout;
201 f->res.classid = *(u32*)RTA_DATA(tb[TCA_FW_CLASSID-1]);
202 tcf_bind_filter(tp, &f->res, base);
203 }
204
205#ifdef CONFIG_NET_CLS_IND
206 if (tb[TCA_FW_INDEV-1]) {
207 err = tcf_change_indev(tp, f->indev, tb[TCA_FW_INDEV-1]);
208 if (err < 0)
209 goto errout;
210 }
211#endif /* CONFIG_NET_CLS_IND */
212
213 tcf_exts_change(tp, &f->exts, &e);
214
215 return 0;
216errout:
217 tcf_exts_destroy(tp, &e);
218 return err;
219}
220
221static int fw_change(struct tcf_proto *tp, unsigned long base,
222 u32 handle,
223 struct rtattr **tca,
224 unsigned long *arg)
225{
226 struct fw_head *head = (struct fw_head*)tp->root;
227 struct fw_filter *f = (struct fw_filter *) *arg;
228 struct rtattr *opt = tca[TCA_OPTIONS-1];
229 struct rtattr *tb[TCA_FW_MAX];
230 int err;
231
232 if (!opt)
233 return handle ? -EINVAL : 0;
234
235 if (rtattr_parse_nested(tb, TCA_FW_MAX, opt) < 0)
236 return -EINVAL;
237
238 if (f != NULL) {
239 if (f->id != handle && handle)
240 return -EINVAL;
241 return fw_change_attrs(tp, f, tb, tca, base);
242 }
243
244 if (!handle)
245 return -EINVAL;
246
247 if (head == NULL) {
248 head = kmalloc(sizeof(struct fw_head), GFP_KERNEL);
249 if (head == NULL)
250 return -ENOBUFS;
251 memset(head, 0, sizeof(*head));
252
253 tcf_tree_lock(tp);
254 tp->root = head;
255 tcf_tree_unlock(tp);
256 }
257
258 f = kmalloc(sizeof(struct fw_filter), GFP_KERNEL);
259 if (f == NULL)
260 return -ENOBUFS;
261 memset(f, 0, sizeof(*f));
262
263 f->id = handle;
264
265 err = fw_change_attrs(tp, f, tb, tca, base);
266 if (err < 0)
267 goto errout;
268
269 f->next = head->ht[fw_hash(handle)];
270 tcf_tree_lock(tp);
271 head->ht[fw_hash(handle)] = f;
272 tcf_tree_unlock(tp);
273
274 *arg = (unsigned long)f;
275 return 0;
276
277errout:
278 if (f)
279 kfree(f);
280 return err;
281}
282
283static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
284{
285 struct fw_head *head = (struct fw_head*)tp->root;
286 int h;
287
288 if (head == NULL)
289 arg->stop = 1;
290
291 if (arg->stop)
292 return;
293
294 for (h = 0; h < 256; h++) {
295 struct fw_filter *f;
296
297 for (f = head->ht[h]; f; f = f->next) {
298 if (arg->count < arg->skip) {
299 arg->count++;
300 continue;
301 }
302 if (arg->fn(tp, (unsigned long)f, arg) < 0) {
303 arg->stop = 1;
304 return;
305 }
306 arg->count++;
307 }
308 }
309}
310
311static int fw_dump(struct tcf_proto *tp, unsigned long fh,
312 struct sk_buff *skb, struct tcmsg *t)
313{
314 struct fw_filter *f = (struct fw_filter*)fh;
315 unsigned char *b = skb->tail;
316 struct rtattr *rta;
317
318 if (f == NULL)
319 return skb->len;
320
321 t->tcm_handle = f->id;
322
323 if (!f->res.classid && !tcf_exts_is_available(&f->exts))
324 return skb->len;
325
326 rta = (struct rtattr*)b;
327 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
328
329 if (f->res.classid)
330 RTA_PUT(skb, TCA_FW_CLASSID, 4, &f->res.classid);
331#ifdef CONFIG_NET_CLS_IND
332 if (strlen(f->indev))
333 RTA_PUT(skb, TCA_FW_INDEV, IFNAMSIZ, f->indev);
334#endif /* CONFIG_NET_CLS_IND */
335
336 if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0)
337 goto rtattr_failure;
338
339 rta->rta_len = skb->tail - b;
340
341 if (tcf_exts_dump_stats(skb, &f->exts, &fw_ext_map) < 0)
342 goto rtattr_failure;
343
344 return skb->len;
345
346rtattr_failure:
347 skb_trim(skb, b - skb->data);
348 return -1;
349}
350
351static struct tcf_proto_ops cls_fw_ops = {
352 .next = NULL,
353 .kind = "fw",
354 .classify = fw_classify,
355 .init = fw_init,
356 .destroy = fw_destroy,
357 .get = fw_get,
358 .put = fw_put,
359 .change = fw_change,
360 .delete = fw_delete,
361 .walk = fw_walk,
362 .dump = fw_dump,
363 .owner = THIS_MODULE,
364};
365
366static int __init init_fw(void)
367{
368 return register_tcf_proto_ops(&cls_fw_ops);
369}
370
371static void __exit exit_fw(void)
372{
373 unregister_tcf_proto_ops(&cls_fw_ops);
374}
375
376module_init(init_fw)
377module_exit(exit_fw)
378MODULE_LICENSE("GPL");
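When no fw filters have been installed yet (head == NULL), fw_classify() above falls back to treating the firewall mark itself as a class handle. Below is a small userspace sketch of that check, using local copies of the 16:16 major/minor split behind the kernel's TC_H_MAJ/TC_H_MIN macros; the qdisc handle and mark values are purely illustrative.

	#include <stdio.h>
	#include <stdint.h>

	/* Same bit layout as TC_H_MAJ()/TC_H_MIN() in <linux/pkt_sched.h>. */
	#define TC_H_MAJ(h)	((h) & 0xFFFF0000U)
	#define TC_H_MIN(h)	((h) & 0x0000FFFFU)

	int main(void)
	{
		uint32_t qdisc_handle = 0x00010000;	/* qdisc 1:0         */
		uint32_t nfmark       = 0x00010002;	/* mark meant as 1:2 */

		/* Mirrors the "old method" branch: accept the mark as a
		 * classid when its major part is zero or matches the major
		 * number of the qdisc the filter is attached to. */
		if (nfmark && (TC_H_MAJ(nfmark) == 0 ||
			       !TC_H_MAJ(nfmark ^ qdisc_handle)))
			printf("classified to %x:%x\n",
			       (unsigned)(TC_H_MAJ(nfmark) >> 16),
			       (unsigned)TC_H_MIN(nfmark));
		else
			printf("no match\n");
		return 0;
	}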
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
new file mode 100644
index 000000000000..02996ac05c75
--- /dev/null
+++ b/net/sched/cls_route.c
@@ -0,0 +1,639 @@
1/*
2 * net/sched/cls_route.c ROUTE4 classifier.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 */
11
12#include <linux/module.h>
13#include <linux/config.h>
14#include <asm/uaccess.h>
15#include <asm/system.h>
16#include <linux/bitops.h>
17#include <linux/types.h>
18#include <linux/kernel.h>
19#include <linux/sched.h>
20#include <linux/string.h>
21#include <linux/mm.h>
22#include <linux/socket.h>
23#include <linux/sockios.h>
24#include <linux/in.h>
25#include <linux/errno.h>
26#include <linux/interrupt.h>
27#include <linux/if_ether.h>
28#include <linux/inet.h>
29#include <linux/netdevice.h>
30#include <linux/etherdevice.h>
31#include <linux/notifier.h>
32#include <net/ip.h>
33#include <net/route.h>
34#include <linux/skbuff.h>
35#include <net/sock.h>
36#include <net/act_api.h>
37#include <net/pkt_cls.h>
38
39/*
40 1. For now we assume that route tags < 256.
41    This allows direct table lookups instead of hash tables.
42 2. For now we assume that "from TAG" and "fromdev DEV" statements
43 are mutually exclusive.
44 3. "to TAG from ANY" has higher priority than "to ANY from XXX"
45 */
46
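/*
 * Rough map of the 32-bit filter handle, as implied by to_hash(),
 * from_hash() and route4_set_parms() below (derived from the code, not
 * spelled out elsewhere in this file):
 *
 *   bits  0..7   destination realm ("to TAG"); 0x8000 is set in the low
 *                half when no "to" was given
 *   bits  8..14  an optional caller-chosen discriminator (handle & 0x7F00),
 *                apparently so that several filters can share one realm pair
 *   bits 16..31  source realm ("from TAG"), the input interface with 0x8000
 *                set ("fromdev DEV"), or 0xFFFF for a wildcard source
 *
 * The fastmap below is only a small result cache keyed on (tclassid, iif);
 * it is flushed in its entirety whenever a filter is added, changed or
 * deleted.
 */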
47struct route4_fastmap
48{
49 struct route4_filter *filter;
50 u32 id;
51 int iif;
52};
53
54struct route4_head
55{
56 struct route4_fastmap fastmap[16];
57 struct route4_bucket *table[256+1];
58};
59
60struct route4_bucket
61{
62 /* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */
63 struct route4_filter *ht[16+16+1];
64};
65
66struct route4_filter
67{
68 struct route4_filter *next;
69 u32 id;
70 int iif;
71
72 struct tcf_result res;
73 struct tcf_exts exts;
74 u32 handle;
75 struct route4_bucket *bkt;
76};
77
78#define ROUTE4_FAILURE ((struct route4_filter*)(-1L))
79
80static struct tcf_ext_map route_ext_map = {
81 .police = TCA_ROUTE4_POLICE,
82 .action = TCA_ROUTE4_ACT
83};
84
85static __inline__ int route4_fastmap_hash(u32 id, int iif)
86{
87 return id&0xF;
88}
89
90static inline
91void route4_reset_fastmap(struct net_device *dev, struct route4_head *head, u32 id)
92{
93 spin_lock_bh(&dev->queue_lock);
94 memset(head->fastmap, 0, sizeof(head->fastmap));
95 spin_unlock_bh(&dev->queue_lock);
96}
97
98static void __inline__
99route4_set_fastmap(struct route4_head *head, u32 id, int iif,
100 struct route4_filter *f)
101{
102 int h = route4_fastmap_hash(id, iif);
103 head->fastmap[h].id = id;
104 head->fastmap[h].iif = iif;
105 head->fastmap[h].filter = f;
106}
107
108static __inline__ int route4_hash_to(u32 id)
109{
110 return id&0xFF;
111}
112
113static __inline__ int route4_hash_from(u32 id)
114{
115 return (id>>16)&0xF;
116}
117
118static __inline__ int route4_hash_iif(int iif)
119{
120 return 16 + ((iif>>16)&0xF);
121}
122
123static __inline__ int route4_hash_wild(void)
124{
125 return 32;
126}
127
128#define ROUTE4_APPLY_RESULT() \
129{ \
130 *res = f->res; \
131 if (tcf_exts_is_available(&f->exts)) { \
132 int r = tcf_exts_exec(skb, &f->exts, res); \
133 if (r < 0) { \
134 dont_cache = 1; \
135 continue; \
136 } \
137 return r; \
138 } else if (!dont_cache) \
139 route4_set_fastmap(head, id, iif, f); \
140 return 0; \
141}
142
143static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp,
144 struct tcf_result *res)
145{
146 struct route4_head *head = (struct route4_head*)tp->root;
147 struct dst_entry *dst;
148 struct route4_bucket *b;
149 struct route4_filter *f;
150 u32 id, h;
151 int iif, dont_cache = 0;
152
153 if ((dst = skb->dst) == NULL)
154 goto failure;
155
156 id = dst->tclassid;
157 if (head == NULL)
158 goto old_method;
159
160 iif = ((struct rtable*)dst)->fl.iif;
161
162 h = route4_fastmap_hash(id, iif);
163 if (id == head->fastmap[h].id &&
164 iif == head->fastmap[h].iif &&
165 (f = head->fastmap[h].filter) != NULL) {
166 if (f == ROUTE4_FAILURE)
167 goto failure;
168
169 *res = f->res;
170 return 0;
171 }
172
173 h = route4_hash_to(id);
174
175restart:
176 if ((b = head->table[h]) != NULL) {
177 for (f = b->ht[route4_hash_from(id)]; f; f = f->next)
178 if (f->id == id)
179 ROUTE4_APPLY_RESULT();
180
181 for (f = b->ht[route4_hash_iif(iif)]; f; f = f->next)
182 if (f->iif == iif)
183 ROUTE4_APPLY_RESULT();
184
185 for (f = b->ht[route4_hash_wild()]; f; f = f->next)
186 ROUTE4_APPLY_RESULT();
187
188 }
189 if (h < 256) {
190 h = 256;
191 id &= ~0xFFFF;
192 goto restart;
193 }
194
195 if (!dont_cache)
196 route4_set_fastmap(head, id, iif, ROUTE4_FAILURE);
197failure:
198 return -1;
199
200old_method:
201 if (id && (TC_H_MAJ(id) == 0 ||
202 !(TC_H_MAJ(id^tp->q->handle)))) {
203 res->classid = id;
204 res->class = 0;
205 return 0;
206 }
207 return -1;
208}
209
210static inline u32 to_hash(u32 id)
211{
212 u32 h = id&0xFF;
213 if (id&0x8000)
214 h += 256;
215 return h;
216}
217
218static inline u32 from_hash(u32 id)
219{
220 id &= 0xFFFF;
221 if (id == 0xFFFF)
222 return 32;
223 if (!(id & 0x8000)) {
224 if (id > 255)
225 return 256;
226 return id&0xF;
227 }
228 return 16 + (id&0xF);
229}
230
231static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
232{
233 struct route4_head *head = (struct route4_head*)tp->root;
234 struct route4_bucket *b;
235 struct route4_filter *f;
236 unsigned h1, h2;
237
238 if (!head)
239 return 0;
240
241 h1 = to_hash(handle);
242 if (h1 > 256)
243 return 0;
244
245 h2 = from_hash(handle>>16);
246 if (h2 > 32)
247 return 0;
248
249 if ((b = head->table[h1]) != NULL) {
250 for (f = b->ht[h2]; f; f = f->next)
251 if (f->handle == handle)
252 return (unsigned long)f;
253 }
254 return 0;
255}
256
257static void route4_put(struct tcf_proto *tp, unsigned long f)
258{
259}
260
261static int route4_init(struct tcf_proto *tp)
262{
263 return 0;
264}
265
266static inline void
267route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f)
268{
269 tcf_unbind_filter(tp, &f->res);
270 tcf_exts_destroy(tp, &f->exts);
271 kfree(f);
272}
273
274static void route4_destroy(struct tcf_proto *tp)
275{
276 struct route4_head *head = xchg(&tp->root, NULL);
277 int h1, h2;
278
279 if (head == NULL)
280 return;
281
282 for (h1=0; h1<=256; h1++) {
283 struct route4_bucket *b;
284
285 if ((b = head->table[h1]) != NULL) {
286 for (h2=0; h2<=32; h2++) {
287 struct route4_filter *f;
288
289 while ((f = b->ht[h2]) != NULL) {
290 b->ht[h2] = f->next;
291 route4_delete_filter(tp, f);
292 }
293 }
294 kfree(b);
295 }
296 }
297 kfree(head);
298}
299
300static int route4_delete(struct tcf_proto *tp, unsigned long arg)
301{
302 struct route4_head *head = (struct route4_head*)tp->root;
303 struct route4_filter **fp, *f = (struct route4_filter*)arg;
304 unsigned h = 0;
305 struct route4_bucket *b;
306 int i;
307
308 if (!head || !f)
309 return -EINVAL;
310
311 h = f->handle;
312 b = f->bkt;
313
314 for (fp = &b->ht[from_hash(h>>16)]; *fp; fp = &(*fp)->next) {
315 if (*fp == f) {
316 tcf_tree_lock(tp);
317 *fp = f->next;
318 tcf_tree_unlock(tp);
319
320 route4_reset_fastmap(tp->q->dev, head, f->id);
321 route4_delete_filter(tp, f);
322
323 /* Strip tree */
324
325 for (i=0; i<=32; i++)
326 if (b->ht[i])
327 return 0;
328
329 /* OK, session has no flows */
330 tcf_tree_lock(tp);
331 head->table[to_hash(h)] = NULL;
332 tcf_tree_unlock(tp);
333
334 kfree(b);
335 return 0;
336 }
337 }
338 return 0;
339}
340
341static int route4_set_parms(struct tcf_proto *tp, unsigned long base,
342 struct route4_filter *f, u32 handle, struct route4_head *head,
343 struct rtattr **tb, struct rtattr *est, int new)
344{
345 int err;
346 u32 id = 0, to = 0, nhandle = 0x8000;
347 struct route4_filter *fp;
348 unsigned int h1;
349 struct route4_bucket *b;
350 struct tcf_exts e;
351
352 err = tcf_exts_validate(tp, tb, est, &e, &route_ext_map);
353 if (err < 0)
354 return err;
355
356 err = -EINVAL;
357 if (tb[TCA_ROUTE4_CLASSID-1])
358 if (RTA_PAYLOAD(tb[TCA_ROUTE4_CLASSID-1]) < sizeof(u32))
359 goto errout;
360
361 if (tb[TCA_ROUTE4_TO-1]) {
362 if (new && handle & 0x8000)
363 goto errout;
364 if (RTA_PAYLOAD(tb[TCA_ROUTE4_TO-1]) < sizeof(u32))
365 goto errout;
366 to = *(u32*)RTA_DATA(tb[TCA_ROUTE4_TO-1]);
367 if (to > 0xFF)
368 goto errout;
369 nhandle = to;
370 }
371
372 if (tb[TCA_ROUTE4_FROM-1]) {
373 if (tb[TCA_ROUTE4_IIF-1])
374 goto errout;
375 if (RTA_PAYLOAD(tb[TCA_ROUTE4_FROM-1]) < sizeof(u32))
376 goto errout;
377 id = *(u32*)RTA_DATA(tb[TCA_ROUTE4_FROM-1]);
378 if (id > 0xFF)
379 goto errout;
380 nhandle |= id << 16;
381 } else if (tb[TCA_ROUTE4_IIF-1]) {
382 if (RTA_PAYLOAD(tb[TCA_ROUTE4_IIF-1]) < sizeof(u32))
383 goto errout;
384 id = *(u32*)RTA_DATA(tb[TCA_ROUTE4_IIF-1]);
385 if (id > 0x7FFF)
386 goto errout;
387 nhandle |= (id | 0x8000) << 16;
388 } else
389 nhandle |= 0xFFFF << 16;
390
391 if (handle && new) {
392 nhandle |= handle & 0x7F00;
393 if (nhandle != handle)
394 goto errout;
395 }
396
397 h1 = to_hash(nhandle);
398 if ((b = head->table[h1]) == NULL) {
399 err = -ENOBUFS;
400 b = kmalloc(sizeof(struct route4_bucket), GFP_KERNEL);
401 if (b == NULL)
402 goto errout;
403 memset(b, 0, sizeof(*b));
404
405 tcf_tree_lock(tp);
406 head->table[h1] = b;
407 tcf_tree_unlock(tp);
408 } else {
409 unsigned int h2 = from_hash(nhandle >> 16);
410 err = -EEXIST;
411 for (fp = b->ht[h2]; fp; fp = fp->next)
412 if (fp->handle == f->handle)
413 goto errout;
414 }
415
416 tcf_tree_lock(tp);
417 if (tb[TCA_ROUTE4_TO-1])
418 f->id = to;
419
420 if (tb[TCA_ROUTE4_FROM-1])
421 f->id = to | id<<16;
422 else if (tb[TCA_ROUTE4_IIF-1])
423 f->iif = id;
424
425 f->handle = nhandle;
426 f->bkt = b;
427 tcf_tree_unlock(tp);
428
429 if (tb[TCA_ROUTE4_CLASSID-1]) {
430 f->res.classid = *(u32*)RTA_DATA(tb[TCA_ROUTE4_CLASSID-1]);
431 tcf_bind_filter(tp, &f->res, base);
432 }
433
434 tcf_exts_change(tp, &f->exts, &e);
435
436 return 0;
437errout:
438 tcf_exts_destroy(tp, &e);
439 return err;
440}
441
442static int route4_change(struct tcf_proto *tp, unsigned long base,
443 u32 handle,
444 struct rtattr **tca,
445 unsigned long *arg)
446{
447 struct route4_head *head = tp->root;
448 struct route4_filter *f, *f1, **fp;
449 struct route4_bucket *b;
450 struct rtattr *opt = tca[TCA_OPTIONS-1];
451 struct rtattr *tb[TCA_ROUTE4_MAX];
452 unsigned int h, th;
453 u32 old_handle = 0;
454 int err;
455
456 if (opt == NULL)
457 return handle ? -EINVAL : 0;
458
459 if (rtattr_parse_nested(tb, TCA_ROUTE4_MAX, opt) < 0)
460 return -EINVAL;
461
462 if ((f = (struct route4_filter*)*arg) != NULL) {
463 if (f->handle != handle && handle)
464 return -EINVAL;
465
466 if (f->bkt)
467 old_handle = f->handle;
468
469 err = route4_set_parms(tp, base, f, handle, head, tb,
470 tca[TCA_RATE-1], 0);
471 if (err < 0)
472 return err;
473
474 goto reinsert;
475 }
476
477 err = -ENOBUFS;
478 if (head == NULL) {
479 head = kmalloc(sizeof(struct route4_head), GFP_KERNEL);
480 if (head == NULL)
481 goto errout;
482 memset(head, 0, sizeof(struct route4_head));
483
484 tcf_tree_lock(tp);
485 tp->root = head;
486 tcf_tree_unlock(tp);
487 }
488
489 f = kmalloc(sizeof(struct route4_filter), GFP_KERNEL);
490 if (f == NULL)
491 goto errout;
492 memset(f, 0, sizeof(*f));
493
494 err = route4_set_parms(tp, base, f, handle, head, tb,
495 tca[TCA_RATE-1], 1);
496 if (err < 0)
497 goto errout;
498
499reinsert:
500 h = from_hash(f->handle >> 16);
501 for (fp = &f->bkt->ht[h]; (f1=*fp) != NULL; fp = &f1->next)
502 if (f->handle < f1->handle)
503 break;
504
505 f->next = f1;
506 tcf_tree_lock(tp);
507 *fp = f;
508
509 if (old_handle && f->handle != old_handle) {
510 th = to_hash(old_handle);
511 h = from_hash(old_handle >> 16);
512 if ((b = head->table[th]) != NULL) {
513 for (fp = &b->ht[h]; *fp; fp = &(*fp)->next) {
514 if (*fp == f) {
515 *fp = f->next;
516 break;
517 }
518 }
519 }
520 }
521 tcf_tree_unlock(tp);
522
523 route4_reset_fastmap(tp->q->dev, head, f->id);
524 *arg = (unsigned long)f;
525 return 0;
526
527errout:
528 if (f)
529 kfree(f);
530 return err;
531}
532
533static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
534{
535 struct route4_head *head = tp->root;
536 unsigned h, h1;
537
538 if (head == NULL)
539 arg->stop = 1;
540
541 if (arg->stop)
542 return;
543
544 for (h = 0; h <= 256; h++) {
545 struct route4_bucket *b = head->table[h];
546
547 if (b) {
548 for (h1 = 0; h1 <= 32; h1++) {
549 struct route4_filter *f;
550
551 for (f = b->ht[h1]; f; f = f->next) {
552 if (arg->count < arg->skip) {
553 arg->count++;
554 continue;
555 }
556 if (arg->fn(tp, (unsigned long)f, arg) < 0) {
557 arg->stop = 1;
558 return;
559 }
560 arg->count++;
561 }
562 }
563 }
564 }
565}
566
567static int route4_dump(struct tcf_proto *tp, unsigned long fh,
568 struct sk_buff *skb, struct tcmsg *t)
569{
570 struct route4_filter *f = (struct route4_filter*)fh;
571 unsigned char *b = skb->tail;
572 struct rtattr *rta;
573 u32 id;
574
575 if (f == NULL)
576 return skb->len;
577
578 t->tcm_handle = f->handle;
579
580 rta = (struct rtattr*)b;
581 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
582
583 if (!(f->handle&0x8000)) {
584 id = f->id&0xFF;
585 RTA_PUT(skb, TCA_ROUTE4_TO, sizeof(id), &id);
586 }
587 if (f->handle&0x80000000) {
588 if ((f->handle>>16) != 0xFFFF)
589 RTA_PUT(skb, TCA_ROUTE4_IIF, sizeof(f->iif), &f->iif);
590 } else {
591 id = f->id>>16;
592 RTA_PUT(skb, TCA_ROUTE4_FROM, sizeof(id), &id);
593 }
594 if (f->res.classid)
595 RTA_PUT(skb, TCA_ROUTE4_CLASSID, 4, &f->res.classid);
596
597 if (tcf_exts_dump(skb, &f->exts, &route_ext_map) < 0)
598 goto rtattr_failure;
599
600 rta->rta_len = skb->tail - b;
601
602 if (tcf_exts_dump_stats(skb, &f->exts, &route_ext_map) < 0)
603 goto rtattr_failure;
604
605 return skb->len;
606
607rtattr_failure:
608 skb_trim(skb, b - skb->data);
609 return -1;
610}
611
612static struct tcf_proto_ops cls_route4_ops = {
613 .next = NULL,
614 .kind = "route",
615 .classify = route4_classify,
616 .init = route4_init,
617 .destroy = route4_destroy,
618 .get = route4_get,
619 .put = route4_put,
620 .change = route4_change,
621 .delete = route4_delete,
622 .walk = route4_walk,
623 .dump = route4_dump,
624 .owner = THIS_MODULE,
625};
626
627static int __init init_route4(void)
628{
629 return register_tcf_proto_ops(&cls_route4_ops);
630}
631
632static void __exit exit_route4(void)
633{
634 unregister_tcf_proto_ops(&cls_route4_ops);
635}
636
637module_init(init_route4)
638module_exit(exit_route4)
639MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c
new file mode 100644
index 000000000000..ad2613790d85
--- /dev/null
+++ b/net/sched/cls_rsvp.c
@@ -0,0 +1,43 @@
1/*
2 * net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 */
11
12#include <linux/module.h>
13#include <asm/uaccess.h>
14#include <asm/system.h>
15#include <linux/bitops.h>
16#include <linux/types.h>
17#include <linux/kernel.h>
18#include <linux/sched.h>
19#include <linux/string.h>
20#include <linux/mm.h>
21#include <linux/socket.h>
22#include <linux/sockios.h>
23#include <linux/in.h>
24#include <linux/errno.h>
25#include <linux/interrupt.h>
26#include <linux/if_ether.h>
27#include <linux/inet.h>
28#include <linux/netdevice.h>
29#include <linux/etherdevice.h>
30#include <linux/notifier.h>
31#include <net/ip.h>
32#include <net/route.h>
33#include <linux/skbuff.h>
34#include <net/sock.h>
35#include <net/act_api.h>
36#include <net/pkt_cls.h>
37
38#define RSVP_DST_LEN 1
39#define RSVP_ID "rsvp"
40#define RSVP_OPS cls_rsvp_ops
41
42#include "cls_rsvp.h"
43MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
new file mode 100644
index 000000000000..232fb9196810
--- /dev/null
+++ b/net/sched/cls_rsvp.h
@@ -0,0 +1,667 @@
1/*
2 * net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 */
11
12/*
13 Compared to the general packet classification problem,
14 RSVP needs only several relatively simple rules:
15
16 * (dst, protocol) are always specified,
17 so that we are able to hash them.
18 * src may be exact, or may be wildcard, so that
19 we can keep a hash table plus one wildcard entry.
20 * source port (or flow label) is important only if src is given.
21
22 IMPLEMENTATION.
23
24 We use a two level hash table: The top level is keyed by
25 destination address and protocol ID, every bucket contains a list
26 of "rsvp sessions", identified by destination address, protocol and
27 DPI(="Destination Port ID"): triple (key, mask, offset).
28
29 Every bucket has a smaller hash table keyed by source address
30 (cf. RSVP flowspec) and one wildcard entry for wildcard reservations.
31 Every bucket is again a list of "RSVP flows", selected by
32 source address and SPI(="Source Port ID" here rather than
33 "security parameter index"): triple (key, mask, offset).
34
35
36 NOTE 1. All packets with IPv6 extension headers (except AH and ESP)
37 and all fragmented packets go to the best-effort traffic class.
38
39
40 NOTE 2. Two "port id"s seem to be redundant; rfc2207 requires
41 only one "Generalized Port Identifier". So for classic
42 ah, esp (and udp,tcp) both *pi should coincide, or one of them
43 should be a wildcard.
44
45 At first sight, this redundancy is just a waste of CPU
46 resources. But DPI and SPI make it possible to assign different
47 priorities to GPIs. See also note 4 about tunnels below.
48
49
50 NOTE 3. One complication is the case of tunneled packets.
51 We implement it as follows: if the first lookup
52 matches a special session with "tunnelhdr" value not zero,
53 flowid doesn't contain the true flow ID, but the tunnel ID (1...255).
54 In this case, we pull tunnelhdr bytes and restart lookup
55 with tunnel ID added to the list of keys. Simple and stupid 8)8)
56 It's enough for PIMREG and IPIP.
57
58
59 NOTE 4. Two GPIs make it possible to parse even GRE packets.
60 E.g. DPI can select ETH_P_IP (and the flags needed to make
61 tunnelhdr correct) in the GRE protocol field and SPI matches the
62 GRE key. Is it not nice? 8)8)
63
64
65 Well, as a result, despite its simplicity, we get a pretty
66 powerful classification engine. */
67
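/*
 * How a GPI (struct tc_rsvp_gpi: key, mask, offset) is matched, as done
 * in rsvp_classify() below: one 32-bit word is read at the given byte
 * offset into the transport header and accepted when it equals the key
 * under the mask, i.e.
 *
 *	!(gpi->mask & (*(u32 *)(xprt + gpi->offset) ^ gpi->key))
 *
 * (gpi here is just shorthand for a session's dpi or a filter's spi).
 * A session's DPI is checked first, then each filter's SPI; the ht[16]
 * slot of a session holds the wildcard-source filters.  Filter handles
 * pack the two bucket indices: bits 0..7 are the destination-hash
 * bucket, bits 8..15 the source-hash slot (16 meaning wildcard), and
 * gen_handle() fills the upper bits from hgenerator.
 */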
68#include <linux/config.h>
69
70struct rsvp_head
71{
72 u32 tmap[256/32];
73 u32 hgenerator;
74 u8 tgenerator;
75 struct rsvp_session *ht[256];
76};
77
78struct rsvp_session
79{
80 struct rsvp_session *next;
81 u32 dst[RSVP_DST_LEN];
82 struct tc_rsvp_gpi dpi;
83 u8 protocol;
84 u8 tunnelid;
85 /* 16 (src,sport) hash slots, and one wildcard source slot */
86 struct rsvp_filter *ht[16+1];
87};
88
89
90struct rsvp_filter
91{
92 struct rsvp_filter *next;
93 u32 src[RSVP_DST_LEN];
94 struct tc_rsvp_gpi spi;
95 u8 tunnelhdr;
96
97 struct tcf_result res;
98 struct tcf_exts exts;
99
100 u32 handle;
101 struct rsvp_session *sess;
102};
103
104static __inline__ unsigned hash_dst(u32 *dst, u8 protocol, u8 tunnelid)
105{
106 unsigned h = dst[RSVP_DST_LEN-1];
107 h ^= h>>16;
108 h ^= h>>8;
109 return (h ^ protocol ^ tunnelid) & 0xFF;
110}
111
112static __inline__ unsigned hash_src(u32 *src)
113{
114 unsigned h = src[RSVP_DST_LEN-1];
115 h ^= h>>16;
116 h ^= h>>8;
117 h ^= h>>4;
118 return h & 0xF;
119}
120
121static struct tcf_ext_map rsvp_ext_map = {
122 .police = TCA_RSVP_POLICE,
123 .action = TCA_RSVP_ACT
124};
125
126#define RSVP_APPLY_RESULT() \
127{ \
128 int r = tcf_exts_exec(skb, &f->exts, res); \
129 if (r < 0) \
130 continue; \
131 else if (r > 0) \
132 return r; \
133}
134
135static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp,
136 struct tcf_result *res)
137{
138 struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht;
139 struct rsvp_session *s;
140 struct rsvp_filter *f;
141 unsigned h1, h2;
142 u32 *dst, *src;
143 u8 protocol;
144 u8 tunnelid = 0;
145 u8 *xprt;
146#if RSVP_DST_LEN == 4
147 struct ipv6hdr *nhptr = skb->nh.ipv6h;
148#else
149 struct iphdr *nhptr = skb->nh.iph;
150#endif
151
152restart:
153
154#if RSVP_DST_LEN == 4
155 src = &nhptr->saddr.s6_addr32[0];
156 dst = &nhptr->daddr.s6_addr32[0];
157 protocol = nhptr->nexthdr;
158 xprt = ((u8*)nhptr) + sizeof(struct ipv6hdr);
159#else
160 src = &nhptr->saddr;
161 dst = &nhptr->daddr;
162 protocol = nhptr->protocol;
163 xprt = ((u8*)nhptr) + (nhptr->ihl<<2);
164 if (nhptr->frag_off&__constant_htons(IP_MF|IP_OFFSET))
165 return -1;
166#endif
167
168 h1 = hash_dst(dst, protocol, tunnelid);
169 h2 = hash_src(src);
170
171 for (s = sht[h1]; s; s = s->next) {
172 if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
173 protocol == s->protocol &&
174 !(s->dpi.mask & (*(u32*)(xprt+s->dpi.offset)^s->dpi.key))
175#if RSVP_DST_LEN == 4
176 && dst[0] == s->dst[0]
177 && dst[1] == s->dst[1]
178 && dst[2] == s->dst[2]
179#endif
180 && tunnelid == s->tunnelid) {
181
182 for (f = s->ht[h2]; f; f = f->next) {
183 if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN-1] &&
184 !(f->spi.mask & (*(u32*)(xprt+f->spi.offset)^f->spi.key))
185#if RSVP_DST_LEN == 4
186 && src[0] == f->src[0]
187 && src[1] == f->src[1]
188 && src[2] == f->src[2]
189#endif
190 ) {
191 *res = f->res;
192 RSVP_APPLY_RESULT();
193
194matched:
195 if (f->tunnelhdr == 0)
196 return 0;
197
198 tunnelid = f->res.classid;
199 nhptr = (void*)(xprt + f->tunnelhdr - sizeof(*nhptr));
200 goto restart;
201 }
202 }
203
204 /* And wildcard bucket... */
205 for (f = s->ht[16]; f; f = f->next) {
206 *res = f->res;
207 RSVP_APPLY_RESULT();
208 goto matched;
209 }
210 return -1;
211 }
212 }
213 return -1;
214}
215
216static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle)
217{
218 struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht;
219 struct rsvp_session *s;
220 struct rsvp_filter *f;
221 unsigned h1 = handle&0xFF;
222 unsigned h2 = (handle>>8)&0xFF;
223
224 if (h2 > 16)
225 return 0;
226
227 for (s = sht[h1]; s; s = s->next) {
228 for (f = s->ht[h2]; f; f = f->next) {
229 if (f->handle == handle)
230 return (unsigned long)f;
231 }
232 }
233 return 0;
234}
235
236static void rsvp_put(struct tcf_proto *tp, unsigned long f)
237{
238}
239
240static int rsvp_init(struct tcf_proto *tp)
241{
242 struct rsvp_head *data;
243
244 data = kmalloc(sizeof(struct rsvp_head), GFP_KERNEL);
245 if (data) {
246 memset(data, 0, sizeof(struct rsvp_head));
247 tp->root = data;
248 return 0;
249 }
250 return -ENOBUFS;
251}
252
253static inline void
254rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
255{
256 tcf_unbind_filter(tp, &f->res);
257 tcf_exts_destroy(tp, &f->exts);
258 kfree(f);
259}
260
261static void rsvp_destroy(struct tcf_proto *tp)
262{
263 struct rsvp_head *data = xchg(&tp->root, NULL);
264 struct rsvp_session **sht;
265 int h1, h2;
266
267 if (data == NULL)
268 return;
269
270 sht = data->ht;
271
272 for (h1=0; h1<256; h1++) {
273 struct rsvp_session *s;
274
275 while ((s = sht[h1]) != NULL) {
276 sht[h1] = s->next;
277
278 for (h2=0; h2<=16; h2++) {
279 struct rsvp_filter *f;
280
281 while ((f = s->ht[h2]) != NULL) {
282 s->ht[h2] = f->next;
283 rsvp_delete_filter(tp, f);
284 }
285 }
286 kfree(s);
287 }
288 }
289 kfree(data);
290}
291
292static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
293{
294 struct rsvp_filter **fp, *f = (struct rsvp_filter*)arg;
295 unsigned h = f->handle;
296 struct rsvp_session **sp;
297 struct rsvp_session *s = f->sess;
298 int i;
299
300 for (fp = &s->ht[(h>>8)&0xFF]; *fp; fp = &(*fp)->next) {
301 if (*fp == f) {
302 tcf_tree_lock(tp);
303 *fp = f->next;
304 tcf_tree_unlock(tp);
305 rsvp_delete_filter(tp, f);
306
307 /* Strip tree */
308
309 for (i=0; i<=16; i++)
310 if (s->ht[i])
311 return 0;
312
313 /* OK, session has no flows */
314 for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF];
315 *sp; sp = &(*sp)->next) {
316 if (*sp == s) {
317 tcf_tree_lock(tp);
318 *sp = s->next;
319 tcf_tree_unlock(tp);
320
321 kfree(s);
322 return 0;
323 }
324 }
325
326 return 0;
327 }
328 }
329 return 0;
330}
331
332static unsigned gen_handle(struct tcf_proto *tp, unsigned salt)
333{
334 struct rsvp_head *data = tp->root;
335 int i = 0xFFFF;
336
337 while (i-- > 0) {
338 u32 h;
339 if ((data->hgenerator += 0x10000) == 0)
340 data->hgenerator = 0x10000;
341 h = data->hgenerator|salt;
342 if (rsvp_get(tp, h) == 0)
343 return h;
344 }
345 return 0;
346}
347
348static int tunnel_bts(struct rsvp_head *data)
349{
350 int n = data->tgenerator>>5;
351 u32 b = 1<<(data->tgenerator&0x1F);
352
353 if (data->tmap[n]&b)
354 return 0;
355 data->tmap[n] |= b;
356 return 1;
357}
358
359static void tunnel_recycle(struct rsvp_head *data)
360{
361 struct rsvp_session **sht = data->ht;
362 u32 tmap[256/32];
363 int h1, h2;
364
365 memset(tmap, 0, sizeof(tmap));
366
367 for (h1=0; h1<256; h1++) {
368 struct rsvp_session *s;
369 for (s = sht[h1]; s; s = s->next) {
370 for (h2=0; h2<=16; h2++) {
371 struct rsvp_filter *f;
372
373 for (f = s->ht[h2]; f; f = f->next) {
374 if (f->tunnelhdr == 0)
375 continue;
376 data->tgenerator = f->res.classid;
377 tunnel_bts(data);
378 }
379 }
380 }
381 }
382
383 memcpy(data->tmap, tmap, sizeof(tmap));
384}
385
386static u32 gen_tunnel(struct rsvp_head *data)
387{
388 int i, k;
389
390 for (k=0; k<2; k++) {
391 for (i=255; i>0; i--) {
392 if (++data->tgenerator == 0)
393 data->tgenerator = 1;
394 if (tunnel_bts(data))
395 return data->tgenerator;
396 }
397 tunnel_recycle(data);
398 }
399 return 0;
400}
401
402static int rsvp_change(struct tcf_proto *tp, unsigned long base,
403 u32 handle,
404 struct rtattr **tca,
405 unsigned long *arg)
406{
407 struct rsvp_head *data = tp->root;
408 struct rsvp_filter *f, **fp;
409 struct rsvp_session *s, **sp;
410 struct tc_rsvp_pinfo *pinfo = NULL;
411 struct rtattr *opt = tca[TCA_OPTIONS-1];
412 struct rtattr *tb[TCA_RSVP_MAX];
413 struct tcf_exts e;
414 unsigned h1, h2;
415 u32 *dst;
416 int err;
417
418 if (opt == NULL)
419 return handle ? -EINVAL : 0;
420
421 if (rtattr_parse_nested(tb, TCA_RSVP_MAX, opt) < 0)
422 return -EINVAL;
423
424 err = tcf_exts_validate(tp, tb, tca[TCA_RATE-1], &e, &rsvp_ext_map);
425 if (err < 0)
426 return err;
427
428 if ((f = (struct rsvp_filter*)*arg) != NULL) {
429 /* Node exists: adjust only classid */
430
431 if (f->handle != handle && handle)
432 goto errout2;
433 if (tb[TCA_RSVP_CLASSID-1]) {
434 f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]);
435 tcf_bind_filter(tp, &f->res, base);
436 }
437
438 tcf_exts_change(tp, &f->exts, &e);
439 return 0;
440 }
441
442 /* Now more serious part... */
443 err = -EINVAL;
444 if (handle)
445 goto errout2;
446 if (tb[TCA_RSVP_DST-1] == NULL)
447 goto errout2;
448
449 err = -ENOBUFS;
450 f = kmalloc(sizeof(struct rsvp_filter), GFP_KERNEL);
451 if (f == NULL)
452 goto errout2;
453
454 memset(f, 0, sizeof(*f));
455 h2 = 16;
456 if (tb[TCA_RSVP_SRC-1]) {
457 err = -EINVAL;
458 if (RTA_PAYLOAD(tb[TCA_RSVP_SRC-1]) != sizeof(f->src))
459 goto errout;
460 memcpy(f->src, RTA_DATA(tb[TCA_RSVP_SRC-1]), sizeof(f->src));
461 h2 = hash_src(f->src);
462 }
463 if (tb[TCA_RSVP_PINFO-1]) {
464 err = -EINVAL;
465 if (RTA_PAYLOAD(tb[TCA_RSVP_PINFO-1]) < sizeof(struct tc_rsvp_pinfo))
466 goto errout;
467 pinfo = RTA_DATA(tb[TCA_RSVP_PINFO-1]);
468 f->spi = pinfo->spi;
469 f->tunnelhdr = pinfo->tunnelhdr;
470 }
471 if (tb[TCA_RSVP_CLASSID-1]) {
472 err = -EINVAL;
473 if (RTA_PAYLOAD(tb[TCA_RSVP_CLASSID-1]) != 4)
474 goto errout;
475 f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]);
476 }
477
478 err = -EINVAL;
479 if (RTA_PAYLOAD(tb[TCA_RSVP_DST-1]) != sizeof(f->src))
480 goto errout;
481 dst = RTA_DATA(tb[TCA_RSVP_DST-1]);
482 h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0);
483
484 err = -ENOMEM;
485 if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0)
486 goto errout;
487
488 if (f->tunnelhdr) {
489 err = -EINVAL;
490 if (f->res.classid > 255)
491 goto errout;
492
493 err = -ENOMEM;
494 if (f->res.classid == 0 &&
495 (f->res.classid = gen_tunnel(data)) == 0)
496 goto errout;
497 }
498
499 for (sp = &data->ht[h1]; (s=*sp) != NULL; sp = &s->next) {
500 if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
501 pinfo && pinfo->protocol == s->protocol &&
502 memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0
503#if RSVP_DST_LEN == 4
504 && dst[0] == s->dst[0]
505 && dst[1] == s->dst[1]
506 && dst[2] == s->dst[2]
507#endif
508 && pinfo->tunnelid == s->tunnelid) {
509
510insert:
511 /* OK, we found appropriate session */
512
513 fp = &s->ht[h2];
514
515 f->sess = s;
516 if (f->tunnelhdr == 0)
517 tcf_bind_filter(tp, &f->res, base);
518
519 tcf_exts_change(tp, &f->exts, &e);
520
521 for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next)
522 if (((*fp)->spi.mask&f->spi.mask) != f->spi.mask)
523 break;
524 f->next = *fp;
525 wmb();
526 *fp = f;
527
528 *arg = (unsigned long)f;
529 return 0;
530 }
531 }
532
533 /* No session found. Create new one. */
534
535 err = -ENOBUFS;
536 s = kmalloc(sizeof(struct rsvp_session), GFP_KERNEL);
537 if (s == NULL)
538 goto errout;
539 memset(s, 0, sizeof(*s));
540 memcpy(s->dst, dst, sizeof(s->dst));
541
542 if (pinfo) {
543 s->dpi = pinfo->dpi;
544 s->protocol = pinfo->protocol;
545 s->tunnelid = pinfo->tunnelid;
546 }
547 for (sp = &data->ht[h1]; *sp; sp = &(*sp)->next) {
548 if (((*sp)->dpi.mask&s->dpi.mask) != s->dpi.mask)
549 break;
550 }
551 s->next = *sp;
552 wmb();
553 *sp = s;
554
555 goto insert;
556
557errout:
558 if (f)
559 kfree(f);
560errout2:
561 tcf_exts_destroy(tp, &e);
562 return err;
563}
564
565static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
566{
567 struct rsvp_head *head = tp->root;
568 unsigned h, h1;
569
570 if (arg->stop)
571 return;
572
573 for (h = 0; h < 256; h++) {
574 struct rsvp_session *s;
575
576 for (s = head->ht[h]; s; s = s->next) {
577 for (h1 = 0; h1 <= 16; h1++) {
578 struct rsvp_filter *f;
579
580 for (f = s->ht[h1]; f; f = f->next) {
581 if (arg->count < arg->skip) {
582 arg->count++;
583 continue;
584 }
585 if (arg->fn(tp, (unsigned long)f, arg) < 0) {
586 arg->stop = 1;
587 return;
588 }
589 arg->count++;
590 }
591 }
592 }
593 }
594}
595
596static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
597 struct sk_buff *skb, struct tcmsg *t)
598{
599 struct rsvp_filter *f = (struct rsvp_filter*)fh;
600 struct rsvp_session *s;
601 unsigned char *b = skb->tail;
602 struct rtattr *rta;
603 struct tc_rsvp_pinfo pinfo;
604
605 if (f == NULL)
606 return skb->len;
607 s = f->sess;
608
609 t->tcm_handle = f->handle;
610
611
612 rta = (struct rtattr*)b;
613 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
614
615 RTA_PUT(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst);
616 pinfo.dpi = s->dpi;
617 pinfo.spi = f->spi;
618 pinfo.protocol = s->protocol;
619 pinfo.tunnelid = s->tunnelid;
620 pinfo.tunnelhdr = f->tunnelhdr;
621 RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo);
622 if (f->res.classid)
623 RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid);
624 if (((f->handle>>8)&0xFF) != 16)
625 RTA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src);
626
627 if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0)
628 goto rtattr_failure;
629
630 rta->rta_len = skb->tail - b;
631
632 if (tcf_exts_dump_stats(skb, &f->exts, &rsvp_ext_map) < 0)
633 goto rtattr_failure;
634 return skb->len;
635
636rtattr_failure:
637 skb_trim(skb, b - skb->data);
638 return -1;
639}
640
641static struct tcf_proto_ops RSVP_OPS = {
642 .next = NULL,
643 .kind = RSVP_ID,
644 .classify = rsvp_classify,
645 .init = rsvp_init,
646 .destroy = rsvp_destroy,
647 .get = rsvp_get,
648 .put = rsvp_put,
649 .change = rsvp_change,
650 .delete = rsvp_delete,
651 .walk = rsvp_walk,
652 .dump = rsvp_dump,
653 .owner = THIS_MODULE,
654};
655
656static int __init init_rsvp(void)
657{
658 return register_tcf_proto_ops(&RSVP_OPS);
659}
660
661static void __exit exit_rsvp(void)
662{
663 unregister_tcf_proto_ops(&RSVP_OPS);
664}
665
666module_init(init_rsvp)
667module_exit(exit_rsvp)
diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c
new file mode 100644
index 000000000000..fde51f7848eb
--- /dev/null
+++ b/net/sched/cls_rsvp6.c
@@ -0,0 +1,44 @@
1/*
2 * net/sched/cls_rsvp6.c Special RSVP packet classifier for IPv6.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 */
11
12#include <linux/module.h>
13#include <asm/uaccess.h>
14#include <asm/system.h>
15#include <linux/bitops.h>
16#include <linux/types.h>
17#include <linux/kernel.h>
18#include <linux/sched.h>
19#include <linux/string.h>
20#include <linux/mm.h>
21#include <linux/socket.h>
22#include <linux/sockios.h>
23#include <linux/in.h>
24#include <linux/errno.h>
25#include <linux/interrupt.h>
26#include <linux/if_ether.h>
27#include <linux/inet.h>
28#include <linux/netdevice.h>
29#include <linux/etherdevice.h>
30#include <linux/notifier.h>
31#include <net/ip.h>
32#include <linux/ipv6.h>
33#include <net/route.h>
34#include <linux/skbuff.h>
35#include <net/sock.h>
36#include <net/act_api.h>
37#include <net/pkt_cls.h>
38
39#define RSVP_DST_LEN 4
40#define RSVP_ID "rsvp6"
41#define RSVP_OPS cls_rsvp6_ops
42
43#include "cls_rsvp.h"
44MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
new file mode 100644
index 000000000000..404d9d83a7fa
--- /dev/null
+++ b/net/sched/cls_tcindex.c
@@ -0,0 +1,537 @@
1/*
2 * net/sched/cls_tcindex.c Packet classifier for skb->tc_index
3 *
4 * Written 1998,1999 by Werner Almesberger, EPFL ICA
5 */
6
7#include <linux/config.h>
8#include <linux/module.h>
9#include <linux/types.h>
10#include <linux/kernel.h>
11#include <linux/skbuff.h>
12#include <linux/errno.h>
13#include <linux/netdevice.h>
14#include <net/ip.h>
15#include <net/act_api.h>
16#include <net/pkt_cls.h>
17#include <net/route.h>
18
19
20/*
21 * Not quite sure if we need all the xchgs Alexey uses when accessing things.
22 * Can always add them later ... :)
23 */
24
25/*
26 * Passing parameters to the root seems to be done more awkwardly than really
27 * necessary. At least, u32 doesn't seem to use such dirty hacks. To be
28 * verified. FIXME.
29 */
30
31#define PERFECT_HASH_THRESHOLD 64 /* use perfect hash if not bigger */
32#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */
33
34
35#if 1 /* control */
36#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
37#else
38#define DPRINTK(format,args...)
39#endif
40
41#if 0 /* data */
42#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
43#else
44#define D2PRINTK(format,args...)
45#endif
46
47
48#define PRIV(tp) ((struct tcindex_data *) (tp)->root)
49
50
51struct tcindex_filter_result {
52 struct tcf_exts exts;
53 struct tcf_result res;
54};
55
56struct tcindex_filter {
57 u16 key;
58 struct tcindex_filter_result result;
59 struct tcindex_filter *next;
60};
61
62
63struct tcindex_data {
64 struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */
65 struct tcindex_filter **h; /* imperfect hash; only used if !perfect;
66 NULL if unused */
67 u16 mask; /* AND key with mask */
68 int shift; /* shift ANDed key to the right */
69 int hash; /* hash table size; 0 if undefined */
70 int alloc_hash; /* allocated size */
71 int fall_through; /* 0: only classify if explicit match */
72};
73
74static struct tcf_ext_map tcindex_ext_map = {
75 .police = TCA_TCINDEX_POLICE,
76 .action = TCA_TCINDEX_ACT
77};
78
79static inline int
80tcindex_filter_is_set(struct tcindex_filter_result *r)
81{
82 return tcf_exts_is_predicative(&r->exts) || r->res.classid;
83}
84
85static struct tcindex_filter_result *
86tcindex_lookup(struct tcindex_data *p, u16 key)
87{
88 struct tcindex_filter *f;
89
90 if (p->perfect)
91 return tcindex_filter_is_set(p->perfect + key) ?
92 p->perfect + key : NULL;
93 else if (p->h) {
94 for (f = p->h[key % p->hash]; f; f = f->next)
95 if (f->key == key)
96 return &f->result;
97 }
98
99 return NULL;
100}
101
102
103static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp,
104 struct tcf_result *res)
105{
106 struct tcindex_data *p = PRIV(tp);
107 struct tcindex_filter_result *f;
108 int key = (skb->tc_index & p->mask) >> p->shift;
109
110 D2PRINTK("tcindex_classify(skb %p,tp %p,res %p),p %p\n",skb,tp,res,p);
111
112 f = tcindex_lookup(p, key);
113 if (!f) {
114 if (!p->fall_through)
115 return -1;
116 res->classid = TC_H_MAKE(TC_H_MAJ(tp->q->handle), key);
117 res->class = 0;
118 D2PRINTK("alg 0x%x\n",res->classid);
119 return 0;
120 }
121 *res = f->res;
122 D2PRINTK("map 0x%x\n",res->classid);
123
124 return tcf_exts_exec(skb, &f->exts, res);
125}
126
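/*
 * Worked example for the lookup above (the numbers are only an
 * illustration): with mask 0xf0 and shift 4, a packet carrying
 * skb->tc_index == 0x2a yields key = (0x2a & 0xf0) >> 4 = 2.  If no
 * result is set for key 2 and fall_through is enabled, the packet is
 * mapped to TC_H_MAKE(TC_H_MAJ(tp->q->handle), 2), i.e. minor class 2
 * under the attached qdisc's major number.
 */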
127
128static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle)
129{
130 struct tcindex_data *p = PRIV(tp);
131 struct tcindex_filter_result *r;
132
133 DPRINTK("tcindex_get(tp %p,handle 0x%08x)\n",tp,handle);
134 if (p->perfect && handle >= p->alloc_hash)
135 return 0;
136 r = tcindex_lookup(p, handle);
137 return r && tcindex_filter_is_set(r) ? (unsigned long) r : 0UL;
138}
139
140
141static void tcindex_put(struct tcf_proto *tp, unsigned long f)
142{
143 DPRINTK("tcindex_put(tp %p,f 0x%lx)\n",tp,f);
144}
145
146
147static int tcindex_init(struct tcf_proto *tp)
148{
149 struct tcindex_data *p;
150
151 DPRINTK("tcindex_init(tp %p)\n",tp);
152 p = kmalloc(sizeof(struct tcindex_data),GFP_KERNEL);
153 if (!p)
154 return -ENOMEM;
155
156 memset(p, 0, sizeof(*p));
157 p->mask = 0xffff;
158 p->hash = DEFAULT_HASH_SIZE;
159 p->fall_through = 1;
160
161 tp->root = p;
162 return 0;
163}
164
165
166static int
167__tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock)
168{
169 struct tcindex_data *p = PRIV(tp);
170 struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg;
171 struct tcindex_filter *f = NULL;
172
173 DPRINTK("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n",tp,arg,p,f);
174 if (p->perfect) {
175 if (!r->res.class)
176 return -ENOENT;
177 } else {
178 int i;
179 struct tcindex_filter **walk = NULL;
180
181 for (i = 0; i < p->hash; i++)
182 for (walk = p->h+i; *walk; walk = &(*walk)->next)
183 if (&(*walk)->result == r)
184 goto found;
185 return -ENOENT;
186
187found:
188 f = *walk;
189 if (lock)
190 tcf_tree_lock(tp);
191 *walk = f->next;
192 if (lock)
193 tcf_tree_unlock(tp);
194 }
195 tcf_unbind_filter(tp, &r->res);
196 tcf_exts_destroy(tp, &r->exts);
197 if (f)
198 kfree(f);
199 return 0;
200}
201
202static int tcindex_delete(struct tcf_proto *tp, unsigned long arg)
203{
204 return __tcindex_delete(tp, arg, 1);
205}
206
207static inline int
208valid_perfect_hash(struct tcindex_data *p)
209{
210 return p->hash > (p->mask >> p->shift);
211}
212
213static int
214tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle,
215 struct tcindex_data *p, struct tcindex_filter_result *r,
216 struct rtattr **tb, struct rtattr *est)
217{
218 int err, balloc = 0;
219 struct tcindex_filter_result new_filter_result, *old_r = r;
220 struct tcindex_filter_result cr;
221 struct tcindex_data cp;
222 struct tcindex_filter *f = NULL; /* make gcc behave */
223 struct tcf_exts e;
224
225 err = tcf_exts_validate(tp, tb, est, &e, &tcindex_ext_map);
226 if (err < 0)
227 return err;
228
229 memcpy(&cp, p, sizeof(cp));
230 memset(&new_filter_result, 0, sizeof(new_filter_result));
231
232 if (old_r)
233 memcpy(&cr, r, sizeof(cr));
234 else
235 memset(&cr, 0, sizeof(cr));
236
237 err = -EINVAL;
238 if (tb[TCA_TCINDEX_HASH-1]) {
239 if (RTA_PAYLOAD(tb[TCA_TCINDEX_HASH-1]) < sizeof(u32))
240 goto errout;
241 cp.hash = *(u32 *) RTA_DATA(tb[TCA_TCINDEX_HASH-1]);
242 }
243
244 if (tb[TCA_TCINDEX_MASK-1]) {
245 if (RTA_PAYLOAD(tb[TCA_TCINDEX_MASK-1]) < sizeof(u16))
246 goto errout;
247 cp.mask = *(u16 *) RTA_DATA(tb[TCA_TCINDEX_MASK-1]);
248 }
249
250 if (tb[TCA_TCINDEX_SHIFT-1]) {
251 if (RTA_PAYLOAD(tb[TCA_TCINDEX_SHIFT-1]) < sizeof(u16))
252 goto errout;
253 cp.shift = *(u16 *) RTA_DATA(tb[TCA_TCINDEX_SHIFT-1]);
254 }
255
256 err = -EBUSY;
257 /* Hash already allocated, make sure that we still meet the
258 * requirements for the allocated hash.
259 */
260 if (cp.perfect) {
261 if (!valid_perfect_hash(&cp) ||
262 cp.hash > cp.alloc_hash)
263 goto errout;
264 } else if (cp.h && cp.hash != cp.alloc_hash)
265 goto errout;
266
267 err = -EINVAL;
268 if (tb[TCA_TCINDEX_FALL_THROUGH-1]) {
269 if (RTA_PAYLOAD(tb[TCA_TCINDEX_FALL_THROUGH-1]) < sizeof(u32))
270 goto errout;
271 cp.fall_through =
272 *(u32 *) RTA_DATA(tb[TCA_TCINDEX_FALL_THROUGH-1]);
273 }
274
275 if (!cp.hash) {
276 /* Hash not specified, use perfect hash if the upper limit
277 * of the hashing index is below the threshold.
278 */
279 if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD)
280 cp.hash = (cp.mask >> cp.shift)+1;
281 else
282 cp.hash = DEFAULT_HASH_SIZE;
283 }
284
285 if (!cp.perfect && !cp.h)
286 cp.alloc_hash = cp.hash;
287
288 /* Note: this could be as restrictive as if (handle & ~(mask >> shift))
289 * but then, we'd fail handles that may become valid after some future
290 * mask change. While this is extremely unlikely to ever matter,
291 * the check below is safer (and also more backwards-compatible).
292 */
293 if (cp.perfect || valid_perfect_hash(&cp))
294 if (handle >= cp.alloc_hash)
295 goto errout;
296
297
298 err = -ENOMEM;
299 if (!cp.perfect && !cp.h) {
300 if (valid_perfect_hash(&cp)) {
301 cp.perfect = kmalloc(cp.hash * sizeof(*r), GFP_KERNEL);
302 if (!cp.perfect)
303 goto errout;
304 memset(cp.perfect, 0, cp.hash * sizeof(*r));
305 balloc = 1;
306 } else {
307 cp.h = kmalloc(cp.hash * sizeof(f), GFP_KERNEL);
308 if (!cp.h)
309 goto errout;
310 memset(cp.h, 0, cp.hash * sizeof(f));
311 balloc = 2;
312 }
313 }
314
315 if (cp.perfect)
316 r = cp.perfect + handle;
317 else
318 r = tcindex_lookup(&cp, handle) ? : &new_filter_result;
319
320 if (r == &new_filter_result) {
321 f = kmalloc(sizeof(*f), GFP_KERNEL);
322 if (!f)
323 goto errout_alloc;
324 memset(f, 0, sizeof(*f));
325 }
326
327 if (tb[TCA_TCINDEX_CLASSID-1]) {
328 cr.res.classid = *(u32 *) RTA_DATA(tb[TCA_TCINDEX_CLASSID-1]);
329 tcf_bind_filter(tp, &cr.res, base);
330 }
331
332 tcf_exts_change(tp, &cr.exts, &e);
333
334 tcf_tree_lock(tp);
335 if (old_r && old_r != r)
336 memset(old_r, 0, sizeof(*old_r));
337
338 memcpy(p, &cp, sizeof(cp));
339 memcpy(r, &cr, sizeof(cr));
340
341 if (r == &new_filter_result) {
342 struct tcindex_filter **fp;
343
344 f->key = handle;
345 f->result = new_filter_result;
346 f->next = NULL;
347 for (fp = p->h+(handle % p->hash); *fp; fp = &(*fp)->next)
348 /* nothing */;
349 *fp = f;
350 }
351 tcf_tree_unlock(tp);
352
353 return 0;
354
355errout_alloc:
356 if (balloc == 1)
357 kfree(cp.perfect);
358 else if (balloc == 2)
359 kfree(cp.h);
360errout:
361 tcf_exts_destroy(tp, &e);
362 return err;
363}
364
365static int
366tcindex_change(struct tcf_proto *tp, unsigned long base, u32 handle,
367 struct rtattr **tca, unsigned long *arg)
368{
369 struct rtattr *opt = tca[TCA_OPTIONS-1];
370 struct rtattr *tb[TCA_TCINDEX_MAX];
371 struct tcindex_data *p = PRIV(tp);
372 struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg;
373
374 DPRINTK("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p,"
375 "p %p,r %p,*arg 0x%lx\n",
376 tp, handle, tca, arg, opt, p, r, arg ? *arg : 0L);
377
378 if (!opt)
379 return 0;
380
381 if (rtattr_parse_nested(tb, TCA_TCINDEX_MAX, opt) < 0)
382 return -EINVAL;
383
384 return tcindex_set_parms(tp, base, handle, p, r, tb, tca[TCA_RATE-1]);
385}
386
387
388static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
389{
390 struct tcindex_data *p = PRIV(tp);
391 struct tcindex_filter *f,*next;
392 int i;
393
394 DPRINTK("tcindex_walk(tp %p,walker %p),p %p\n",tp,walker,p);
395 if (p->perfect) {
396 for (i = 0; i < p->hash; i++) {
397 if (!p->perfect[i].res.class)
398 continue;
399 if (walker->count >= walker->skip) {
400 if (walker->fn(tp,
401 (unsigned long) (p->perfect+i), walker)
402 < 0) {
403 walker->stop = 1;
404 return;
405 }
406 }
407 walker->count++;
408 }
409 }
410 if (!p->h)
411 return;
412 for (i = 0; i < p->hash; i++) {
413 for (f = p->h[i]; f; f = next) {
414 next = f->next;
415 if (walker->count >= walker->skip) {
416 if (walker->fn(tp,(unsigned long) &f->result,
417 walker) < 0) {
418 walker->stop = 1;
419 return;
420 }
421 }
422 walker->count++;
423 }
424 }
425}
426
427
428static int tcindex_destroy_element(struct tcf_proto *tp,
429 unsigned long arg, struct tcf_walker *walker)
430{
431 return __tcindex_delete(tp, arg, 0);
432}
433
434
435static void tcindex_destroy(struct tcf_proto *tp)
436{
437 struct tcindex_data *p = PRIV(tp);
438 struct tcf_walker walker;
439
440 DPRINTK("tcindex_destroy(tp %p),p %p\n",tp,p);
441 walker.count = 0;
442 walker.skip = 0;
443 walker.fn = &tcindex_destroy_element;
444 tcindex_walk(tp,&walker);
445 if (p->perfect)
446 kfree(p->perfect);
447 if (p->h)
448 kfree(p->h);
449 kfree(p);
450 tp->root = NULL;
451}
452
453
454static int tcindex_dump(struct tcf_proto *tp, unsigned long fh,
455 struct sk_buff *skb, struct tcmsg *t)
456{
457 struct tcindex_data *p = PRIV(tp);
458 struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh;
459 unsigned char *b = skb->tail;
460 struct rtattr *rta;
461
462 DPRINTK("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p,b %p\n",
463 tp,fh,skb,t,p,r,b);
464 DPRINTK("p->perfect %p p->h %p\n",p->perfect,p->h);
465 rta = (struct rtattr *) b;
466 RTA_PUT(skb,TCA_OPTIONS,0,NULL);
467 if (!fh) {
468 t->tcm_handle = ~0; /* whatever ... */
469 RTA_PUT(skb,TCA_TCINDEX_HASH,sizeof(p->hash),&p->hash);
470 RTA_PUT(skb,TCA_TCINDEX_MASK,sizeof(p->mask),&p->mask);
471 RTA_PUT(skb,TCA_TCINDEX_SHIFT,sizeof(p->shift),&p->shift);
472 RTA_PUT(skb,TCA_TCINDEX_FALL_THROUGH,sizeof(p->fall_through),
473 &p->fall_through);
474 rta->rta_len = skb->tail-b;
475 } else {
476 if (p->perfect) {
477 t->tcm_handle = r-p->perfect;
478 } else {
479 struct tcindex_filter *f;
480 int i;
481
482 t->tcm_handle = 0;
483 for (i = 0; !t->tcm_handle && i < p->hash; i++) {
484 for (f = p->h[i]; !t->tcm_handle && f;
485 f = f->next) {
486 if (&f->result == r)
487 t->tcm_handle = f->key;
488 }
489 }
490 }
491 DPRINTK("handle = %d\n",t->tcm_handle);
492 if (r->res.class)
493 RTA_PUT(skb, TCA_TCINDEX_CLASSID, 4, &r->res.classid);
494
495 if (tcf_exts_dump(skb, &r->exts, &tcindex_ext_map) < 0)
496 goto rtattr_failure;
497 rta->rta_len = skb->tail-b;
498
499 if (tcf_exts_dump_stats(skb, &r->exts, &tcindex_ext_map) < 0)
500 goto rtattr_failure;
501 }
502
503 return skb->len;
504
505rtattr_failure:
506 skb_trim(skb, b - skb->data);
507 return -1;
508}
509
510static struct tcf_proto_ops cls_tcindex_ops = {
511 .next = NULL,
512 .kind = "tcindex",
513 .classify = tcindex_classify,
514 .init = tcindex_init,
515 .destroy = tcindex_destroy,
516 .get = tcindex_get,
517 .put = tcindex_put,
518 .change = tcindex_change,
519 .delete = tcindex_delete,
520 .walk = tcindex_walk,
521 .dump = tcindex_dump,
522 .owner = THIS_MODULE,
523};
524
525static int __init init_tcindex(void)
526{
527 return register_tcf_proto_ops(&cls_tcindex_ops);
528}
529
530static void __exit exit_tcindex(void)
531{
532 unregister_tcf_proto_ops(&cls_tcindex_ops);
533}
534
535module_init(init_tcindex)
536module_exit(exit_tcindex)
537MODULE_LICENSE("GPL");
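
A rough standalone illustration of the classifier above, under made-up mask/shift values: tcindex_classify() reduces skb->tc_index to a key via (tc_index & mask) >> shift, and tcindex_set_parms() prefers the perfect hash whenever the largest possible key stays below PERFECT_HASH_THRESHOLD. The program below is an editorial userspace sketch, not kernel code.

	#include <stdio.h>
	#include <stdint.h>

	#define PERFECT_HASH_THRESHOLD	64	/* same constants as above */
	#define DEFAULT_HASH_SIZE	64

	/* Key derivation as in tcindex_classify(). */
	static unsigned int tcindex_key(uint16_t tc_index, uint16_t mask, int shift)
	{
		return (tc_index & mask) >> shift;
	}

	int main(void)
	{
		uint16_t mask = 0x00f0;			/* made-up example parameters */
		int shift = 4;
		unsigned int max_key = mask >> shift;	/* largest key the filter can see */

		/* Same decision as in tcindex_set_parms() when no hash size is given. */
		if (max_key < PERFECT_HASH_THRESHOLD)
			printf("perfect hash with %u slots\n", max_key + 1);
		else
			printf("imperfect hash with %d buckets\n", DEFAULT_HASH_SIZE);

		printf("tc_index 0x00a5 -> key %u\n", tcindex_key(0x00a5, mask, shift));
		return 0;
	}

Compiled on its own it only prints the chosen hash flavour and the derived key; nothing here touches the kernel data structures above.
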
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
new file mode 100644
index 000000000000..364b87d86455
--- /dev/null
+++ b/net/sched/cls_u32.c
@@ -0,0 +1,828 @@
1/*
2 * net/sched/cls_u32.c Ugly (or Universal) 32bit key Packet Classifier.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * The filters are packed to hash tables of key nodes
12 * with a set of 32bit key/mask pairs at every node.
13 * Nodes reference next level hash tables etc.
14 *
15 * This scheme is the best universal classifier I managed to
16 * invent; it is not super-fast, but it is not slow (provided you
17 * program it correctly), and general enough. And its relative
18 * speed grows as the number of rules becomes larger.
19 *
20 * It seems that it represents the best middle point between
21 * speed and manageability both by human and by machine.
22 *
23 * It is especially useful for link sharing combined with QoS;
24 * pure RSVP doesn't need such a general approach and can use
25 * much simpler (and faster) schemes, sort of cls_rsvp.c.
26 *
27 * JHS: We should remove the CONFIG_NET_CLS_IND from here
28 * eventually when the meta match extension is made available
29 *
30 * nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro>
31 */
32
33#include <asm/uaccess.h>
34#include <asm/system.h>
35#include <linux/bitops.h>
36#include <linux/config.h>
37#include <linux/module.h>
38#include <linux/types.h>
39#include <linux/kernel.h>
40#include <linux/sched.h>
41#include <linux/string.h>
42#include <linux/mm.h>
43#include <linux/socket.h>
44#include <linux/sockios.h>
45#include <linux/in.h>
46#include <linux/errno.h>
47#include <linux/interrupt.h>
48#include <linux/if_ether.h>
49#include <linux/inet.h>
50#include <linux/netdevice.h>
51#include <linux/etherdevice.h>
52#include <linux/notifier.h>
53#include <linux/rtnetlink.h>
54#include <net/ip.h>
55#include <net/route.h>
56#include <linux/skbuff.h>
57#include <net/sock.h>
58#include <net/act_api.h>
59#include <net/pkt_cls.h>
60
61struct tc_u_knode
62{
63 struct tc_u_knode *next;
64 u32 handle;
65 struct tc_u_hnode *ht_up;
66 struct tcf_exts exts;
67#ifdef CONFIG_NET_CLS_IND
68 char indev[IFNAMSIZ];
69#endif
70 u8 fshift;
71 struct tcf_result res;
72 struct tc_u_hnode *ht_down;
73#ifdef CONFIG_CLS_U32_PERF
74 struct tc_u32_pcnt *pf;
75#endif
76#ifdef CONFIG_CLS_U32_MARK
77 struct tc_u32_mark mark;
78#endif
79 struct tc_u32_sel sel;
80};
81
82struct tc_u_hnode
83{
84 struct tc_u_hnode *next;
85 u32 handle;
86 u32 prio;
87 struct tc_u_common *tp_c;
88 int refcnt;
89 unsigned divisor;
90 struct tc_u_knode *ht[1];
91};
92
93struct tc_u_common
94{
95 struct tc_u_common *next;
96 struct tc_u_hnode *hlist;
97 struct Qdisc *q;
98 int refcnt;
99 u32 hgenerator;
100};
101
102static struct tcf_ext_map u32_ext_map = {
103 .action = TCA_U32_ACT,
104 .police = TCA_U32_POLICE
105};
106
107static struct tc_u_common *u32_list;
108
109static __inline__ unsigned u32_hash_fold(u32 key, struct tc_u32_sel *sel, u8 fshift)
110{
111 unsigned h = (key & sel->hmask)>>fshift;
112
113 return h;
114}
115
116static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res)
117{
118 struct {
119 struct tc_u_knode *knode;
120 u8 *ptr;
121 } stack[TC_U32_MAXDEPTH];
122
123 struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root;
124 u8 *ptr = skb->nh.raw;
125 struct tc_u_knode *n;
126 int sdepth = 0;
127 int off2 = 0;
128 int sel = 0;
129#ifdef CONFIG_CLS_U32_PERF
130 int j;
131#endif
132 int i, r;
133
134next_ht:
135 n = ht->ht[sel];
136
137next_knode:
138 if (n) {
139 struct tc_u32_key *key = n->sel.keys;
140
141#ifdef CONFIG_CLS_U32_PERF
142 n->pf->rcnt +=1;
143 j = 0;
144#endif
145
146#ifdef CONFIG_CLS_U32_MARK
147 if ((skb->nfmark & n->mark.mask) != n->mark.val) {
148 n = n->next;
149 goto next_knode;
150 } else {
151 n->mark.success++;
152 }
153#endif
154
155 for (i = n->sel.nkeys; i>0; i--, key++) {
156
157 if ((*(u32*)(ptr+key->off+(off2&key->offmask))^key->val)&key->mask) {
158 n = n->next;
159 goto next_knode;
160 }
161#ifdef CONFIG_CLS_U32_PERF
162 n->pf->kcnts[j] +=1;
163 j++;
164#endif
165 }
166 if (n->ht_down == NULL) {
167check_terminal:
168 if (n->sel.flags&TC_U32_TERMINAL) {
169
170 *res = n->res;
171#ifdef CONFIG_NET_CLS_IND
172 if (!tcf_match_indev(skb, n->indev)) {
173 n = n->next;
174 goto next_knode;
175 }
176#endif
177#ifdef CONFIG_CLS_U32_PERF
178 n->pf->rhit +=1;
179#endif
180 r = tcf_exts_exec(skb, &n->exts, res);
181 if (r < 0) {
182 n = n->next;
183 goto next_knode;
184 }
185
186 return r;
187 }
188 n = n->next;
189 goto next_knode;
190 }
191
192 /* PUSH */
193 if (sdepth >= TC_U32_MAXDEPTH)
194 goto deadloop;
195 stack[sdepth].knode = n;
196 stack[sdepth].ptr = ptr;
197 sdepth++;
198
199 ht = n->ht_down;
200 sel = 0;
201 if (ht->divisor)
202 sel = ht->divisor&u32_hash_fold(*(u32*)(ptr+n->sel.hoff), &n->sel,n->fshift);
203
204 if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT)))
205 goto next_ht;
206
207 if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) {
208 off2 = n->sel.off + 3;
209 if (n->sel.flags&TC_U32_VAROFFSET)
210 off2 += ntohs(n->sel.offmask & *(u16*)(ptr+n->sel.offoff)) >>n->sel.offshift;
211 off2 &= ~3;
212 }
213 if (n->sel.flags&TC_U32_EAT) {
214 ptr += off2;
215 off2 = 0;
216 }
217
218 if (ptr < skb->tail)
219 goto next_ht;
220 }
221
222 /* POP */
223 if (sdepth--) {
224 n = stack[sdepth].knode;
225 ht = n->ht_up;
226 ptr = stack[sdepth].ptr;
227 goto check_terminal;
228 }
229 return -1;
230
231deadloop:
232 if (net_ratelimit())
233 printk("cls_u32: dead loop\n");
234 return -1;
235}
236
237static __inline__ struct tc_u_hnode *
238u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
239{
240 struct tc_u_hnode *ht;
241
242 for (ht = tp_c->hlist; ht; ht = ht->next)
243 if (ht->handle == handle)
244 break;
245
246 return ht;
247}
248
249static __inline__ struct tc_u_knode *
250u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
251{
252 unsigned sel;
253 struct tc_u_knode *n = NULL;
254
255 sel = TC_U32_HASH(handle);
256 if (sel > ht->divisor)
257 goto out;
258
259 for (n = ht->ht[sel]; n; n = n->next)
260 if (n->handle == handle)
261 break;
262out:
263 return n;
264}
265
266
267static unsigned long u32_get(struct tcf_proto *tp, u32 handle)
268{
269 struct tc_u_hnode *ht;
270 struct tc_u_common *tp_c = tp->data;
271
272 if (TC_U32_HTID(handle) == TC_U32_ROOT)
273 ht = tp->root;
274 else
275 ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle));
276
277 if (!ht)
278 return 0;
279
280 if (TC_U32_KEY(handle) == 0)
281 return (unsigned long)ht;
282
283 return (unsigned long)u32_lookup_key(ht, handle);
284}
285
286static void u32_put(struct tcf_proto *tp, unsigned long f)
287{
288}
289
290static u32 gen_new_htid(struct tc_u_common *tp_c)
291{
292 int i = 0x800;
293
294 do {
295 if (++tp_c->hgenerator == 0x7FF)
296 tp_c->hgenerator = 1;
297 } while (--i>0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));
298
299 return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0;
300}
301
302static int u32_init(struct tcf_proto *tp)
303{
304 struct tc_u_hnode *root_ht;
305 struct tc_u_common *tp_c;
306
307 for (tp_c = u32_list; tp_c; tp_c = tp_c->next)
308 if (tp_c->q == tp->q)
309 break;
310
311 root_ht = kmalloc(sizeof(*root_ht), GFP_KERNEL);
312 if (root_ht == NULL)
313 return -ENOBUFS;
314
315 memset(root_ht, 0, sizeof(*root_ht));
316 root_ht->divisor = 0;
317 root_ht->refcnt++;
318 root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000;
319 root_ht->prio = tp->prio;
320
321 if (tp_c == NULL) {
322 tp_c = kmalloc(sizeof(*tp_c), GFP_KERNEL);
323 if (tp_c == NULL) {
324 kfree(root_ht);
325 return -ENOBUFS;
326 }
327 memset(tp_c, 0, sizeof(*tp_c));
328 tp_c->q = tp->q;
329 tp_c->next = u32_list;
330 u32_list = tp_c;
331 }
332
333 tp_c->refcnt++;
334 root_ht->next = tp_c->hlist;
335 tp_c->hlist = root_ht;
336 root_ht->tp_c = tp_c;
337
338 tp->root = root_ht;
339 tp->data = tp_c;
340 return 0;
341}
342
343static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n)
344{
345 tcf_unbind_filter(tp, &n->res);
346 tcf_exts_destroy(tp, &n->exts);
347 if (n->ht_down)
348 n->ht_down->refcnt--;
349#ifdef CONFIG_CLS_U32_PERF
350 if (n && (NULL != n->pf))
351 kfree(n->pf);
352#endif
353 kfree(n);
354 return 0;
355}
356
357static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key)
358{
359 struct tc_u_knode **kp;
360 struct tc_u_hnode *ht = key->ht_up;
361
362 if (ht) {
363 for (kp = &ht->ht[TC_U32_HASH(key->handle)]; *kp; kp = &(*kp)->next) {
364 if (*kp == key) {
365 tcf_tree_lock(tp);
366 *kp = key->next;
367 tcf_tree_unlock(tp);
368
369 u32_destroy_key(tp, key);
370 return 0;
371 }
372 }
373 }
374 BUG_TRAP(0);
375 return 0;
376}
377
378static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
379{
380 struct tc_u_knode *n;
381 unsigned h;
382
383 for (h=0; h<=ht->divisor; h++) {
384 while ((n = ht->ht[h]) != NULL) {
385 ht->ht[h] = n->next;
386
387 u32_destroy_key(tp, n);
388 }
389 }
390}
391
392static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
393{
394 struct tc_u_common *tp_c = tp->data;
395 struct tc_u_hnode **hn;
396
397 BUG_TRAP(!ht->refcnt);
398
399 u32_clear_hnode(tp, ht);
400
401 for (hn = &tp_c->hlist; *hn; hn = &(*hn)->next) {
402 if (*hn == ht) {
403 *hn = ht->next;
404 kfree(ht);
405 return 0;
406 }
407 }
408
409 BUG_TRAP(0);
410 return -ENOENT;
411}
412
413static void u32_destroy(struct tcf_proto *tp)
414{
415 struct tc_u_common *tp_c = tp->data;
416 struct tc_u_hnode *root_ht = xchg(&tp->root, NULL);
417
418 BUG_TRAP(root_ht != NULL);
419
420 if (root_ht && --root_ht->refcnt == 0)
421 u32_destroy_hnode(tp, root_ht);
422
423 if (--tp_c->refcnt == 0) {
424 struct tc_u_hnode *ht;
425 struct tc_u_common **tp_cp;
426
427 for (tp_cp = &u32_list; *tp_cp; tp_cp = &(*tp_cp)->next) {
428 if (*tp_cp == tp_c) {
429 *tp_cp = tp_c->next;
430 break;
431 }
432 }
433
434 for (ht=tp_c->hlist; ht; ht = ht->next)
435 u32_clear_hnode(tp, ht);
436
437 while ((ht = tp_c->hlist) != NULL) {
438 tp_c->hlist = ht->next;
439
440 BUG_TRAP(ht->refcnt == 0);
441
442 kfree(ht);
443 };
444
445 kfree(tp_c);
446 }
447
448 tp->data = NULL;
449}
450
451static int u32_delete(struct tcf_proto *tp, unsigned long arg)
452{
453 struct tc_u_hnode *ht = (struct tc_u_hnode*)arg;
454
455 if (ht == NULL)
456 return 0;
457
458 if (TC_U32_KEY(ht->handle))
459 return u32_delete_key(tp, (struct tc_u_knode*)ht);
460
461 if (tp->root == ht)
462 return -EINVAL;
463
464 if (--ht->refcnt == 0)
465 u32_destroy_hnode(tp, ht);
466
467 return 0;
468}
469
470static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)
471{
472 struct tc_u_knode *n;
473 unsigned i = 0x7FF;
474
475 for (n=ht->ht[TC_U32_HASH(handle)]; n; n = n->next)
476 if (i < TC_U32_NODE(n->handle))
477 i = TC_U32_NODE(n->handle);
478 i++;
479
480 return handle|(i>0xFFF ? 0xFFF : i);
481}
482
483static int u32_set_parms(struct tcf_proto *tp, unsigned long base,
484 struct tc_u_hnode *ht,
485 struct tc_u_knode *n, struct rtattr **tb,
486 struct rtattr *est)
487{
488 int err;
489 struct tcf_exts e;
490
491 err = tcf_exts_validate(tp, tb, est, &e, &u32_ext_map);
492 if (err < 0)
493 return err;
494
495 err = -EINVAL;
496 if (tb[TCA_U32_LINK-1]) {
497 u32 handle = *(u32*)RTA_DATA(tb[TCA_U32_LINK-1]);
498 struct tc_u_hnode *ht_down = NULL;
499
500 if (TC_U32_KEY(handle))
501 goto errout;
502
503 if (handle) {
504 ht_down = u32_lookup_ht(ht->tp_c, handle);
505
506 if (ht_down == NULL)
507 goto errout;
508 ht_down->refcnt++;
509 }
510
511 tcf_tree_lock(tp);
512 ht_down = xchg(&n->ht_down, ht_down);
513 tcf_tree_unlock(tp);
514
515 if (ht_down)
516 ht_down->refcnt--;
517 }
518 if (tb[TCA_U32_CLASSID-1]) {
519 n->res.classid = *(u32*)RTA_DATA(tb[TCA_U32_CLASSID-1]);
520 tcf_bind_filter(tp, &n->res, base);
521 }
522
523#ifdef CONFIG_NET_CLS_IND
524 if (tb[TCA_U32_INDEV-1]) {
525 int err = tcf_change_indev(tp, n->indev, tb[TCA_U32_INDEV-1]);
526 if (err < 0)
527 goto errout;
528 }
529#endif
530 tcf_exts_change(tp, &n->exts, &e);
531
532 return 0;
533errout:
534 tcf_exts_destroy(tp, &e);
535 return err;
536}
537
538static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
539 struct rtattr **tca,
540 unsigned long *arg)
541{
542 struct tc_u_common *tp_c = tp->data;
543 struct tc_u_hnode *ht;
544 struct tc_u_knode *n;
545 struct tc_u32_sel *s;
546 struct rtattr *opt = tca[TCA_OPTIONS-1];
547 struct rtattr *tb[TCA_U32_MAX];
548 u32 htid;
549 int err;
550
551 if (opt == NULL)
552 return handle ? -EINVAL : 0;
553
554 if (rtattr_parse_nested(tb, TCA_U32_MAX, opt) < 0)
555 return -EINVAL;
556
557 if ((n = (struct tc_u_knode*)*arg) != NULL) {
558 if (TC_U32_KEY(n->handle) == 0)
559 return -EINVAL;
560
561 return u32_set_parms(tp, base, n->ht_up, n, tb, tca[TCA_RATE-1]);
562 }
563
564 if (tb[TCA_U32_DIVISOR-1]) {
565 unsigned divisor = *(unsigned*)RTA_DATA(tb[TCA_U32_DIVISOR-1]);
566
567 if (--divisor > 0x100)
568 return -EINVAL;
569 if (TC_U32_KEY(handle))
570 return -EINVAL;
571 if (handle == 0) {
572 handle = gen_new_htid(tp->data);
573 if (handle == 0)
574 return -ENOMEM;
575 }
576 ht = kmalloc(sizeof(*ht) + divisor*sizeof(void*), GFP_KERNEL);
577 if (ht == NULL)
578 return -ENOBUFS;
579 memset(ht, 0, sizeof(*ht) + divisor*sizeof(void*));
580 ht->tp_c = tp_c;
581 ht->refcnt = 0;
582 ht->divisor = divisor;
583 ht->handle = handle;
584 ht->prio = tp->prio;
585 ht->next = tp_c->hlist;
586 tp_c->hlist = ht;
587 *arg = (unsigned long)ht;
588 return 0;
589 }
590
591 if (tb[TCA_U32_HASH-1]) {
592 htid = *(unsigned*)RTA_DATA(tb[TCA_U32_HASH-1]);
593 if (TC_U32_HTID(htid) == TC_U32_ROOT) {
594 ht = tp->root;
595 htid = ht->handle;
596 } else {
597 ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid));
598 if (ht == NULL)
599 return -EINVAL;
600 }
601 } else {
602 ht = tp->root;
603 htid = ht->handle;
604 }
605
606 if (ht->divisor < TC_U32_HASH(htid))
607 return -EINVAL;
608
609 if (handle) {
610 if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid))
611 return -EINVAL;
612 handle = htid | TC_U32_NODE(handle);
613 } else
614 handle = gen_new_kid(ht, htid);
615
616 if (tb[TCA_U32_SEL-1] == 0 ||
617 RTA_PAYLOAD(tb[TCA_U32_SEL-1]) < sizeof(struct tc_u32_sel))
618 return -EINVAL;
619
620 s = RTA_DATA(tb[TCA_U32_SEL-1]);
621
622 n = kmalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL);
623 if (n == NULL)
624 return -ENOBUFS;
625
626 memset(n, 0, sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key));
627#ifdef CONFIG_CLS_U32_PERF
628 n->pf = kmalloc(sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64), GFP_KERNEL);
629 if (n->pf == NULL) {
630 kfree(n);
631 return -ENOBUFS;
632 }
633 memset(n->pf, 0, sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64));
634#endif
635
636 memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
637 n->ht_up = ht;
638 n->handle = handle;
639{
640 u8 i = 0;
641 u32 mask = s->hmask;
642 if (mask) {
643 while (!(mask & 1)) {
644 i++;
645 mask>>=1;
646 }
647 }
648 n->fshift = i;
649}
650
651#ifdef CONFIG_CLS_U32_MARK
652 if (tb[TCA_U32_MARK-1]) {
653 struct tc_u32_mark *mark;
654
655 if (RTA_PAYLOAD(tb[TCA_U32_MARK-1]) < sizeof(struct tc_u32_mark)) {
656#ifdef CONFIG_CLS_U32_PERF
657 kfree(n->pf);
658#endif
659 kfree(n);
660 return -EINVAL;
661 }
662 mark = RTA_DATA(tb[TCA_U32_MARK-1]);
663 memcpy(&n->mark, mark, sizeof(struct tc_u32_mark));
664 n->mark.success = 0;
665 }
666#endif
667
668 err = u32_set_parms(tp, base, ht, n, tb, tca[TCA_RATE-1]);
669 if (err == 0) {
670 struct tc_u_knode **ins;
671 for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next)
672 if (TC_U32_NODE(handle) < TC_U32_NODE((*ins)->handle))
673 break;
674
675 n->next = *ins;
676 wmb();
677 *ins = n;
678
679 *arg = (unsigned long)n;
680 return 0;
681 }
682#ifdef CONFIG_CLS_U32_PERF
683 if (n && (NULL != n->pf))
684 kfree(n->pf);
685#endif
686 kfree(n);
687 return err;
688}
689
690static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
691{
692 struct tc_u_common *tp_c = tp->data;
693 struct tc_u_hnode *ht;
694 struct tc_u_knode *n;
695 unsigned h;
696
697 if (arg->stop)
698 return;
699
700 for (ht = tp_c->hlist; ht; ht = ht->next) {
701 if (ht->prio != tp->prio)
702 continue;
703 if (arg->count >= arg->skip) {
704 if (arg->fn(tp, (unsigned long)ht, arg) < 0) {
705 arg->stop = 1;
706 return;
707 }
708 }
709 arg->count++;
710 for (h = 0; h <= ht->divisor; h++) {
711 for (n = ht->ht[h]; n; n = n->next) {
712 if (arg->count < arg->skip) {
713 arg->count++;
714 continue;
715 }
716 if (arg->fn(tp, (unsigned long)n, arg) < 0) {
717 arg->stop = 1;
718 return;
719 }
720 arg->count++;
721 }
722 }
723 }
724}
725
726static int u32_dump(struct tcf_proto *tp, unsigned long fh,
727 struct sk_buff *skb, struct tcmsg *t)
728{
729 struct tc_u_knode *n = (struct tc_u_knode*)fh;
730 unsigned char *b = skb->tail;
731 struct rtattr *rta;
732
733 if (n == NULL)
734 return skb->len;
735
736 t->tcm_handle = n->handle;
737
738 rta = (struct rtattr*)b;
739 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
740
741 if (TC_U32_KEY(n->handle) == 0) {
742 struct tc_u_hnode *ht = (struct tc_u_hnode*)fh;
743 u32 divisor = ht->divisor+1;
744 RTA_PUT(skb, TCA_U32_DIVISOR, 4, &divisor);
745 } else {
746 RTA_PUT(skb, TCA_U32_SEL,
747 sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key),
748 &n->sel);
749 if (n->ht_up) {
750 u32 htid = n->handle & 0xFFFFF000;
751 RTA_PUT(skb, TCA_U32_HASH, 4, &htid);
752 }
753 if (n->res.classid)
754 RTA_PUT(skb, TCA_U32_CLASSID, 4, &n->res.classid);
755 if (n->ht_down)
756 RTA_PUT(skb, TCA_U32_LINK, 4, &n->ht_down->handle);
757
758#ifdef CONFIG_CLS_U32_MARK
759 if (n->mark.val || n->mark.mask)
760 RTA_PUT(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark);
761#endif
762
763 if (tcf_exts_dump(skb, &n->exts, &u32_ext_map) < 0)
764 goto rtattr_failure;
765
766#ifdef CONFIG_NET_CLS_IND
767 if(strlen(n->indev))
768 RTA_PUT(skb, TCA_U32_INDEV, IFNAMSIZ, n->indev);
769#endif
770#ifdef CONFIG_CLS_U32_PERF
771 RTA_PUT(skb, TCA_U32_PCNT,
772 sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64),
773 n->pf);
774#endif
775 }
776
777 rta->rta_len = skb->tail - b;
778 if (TC_U32_KEY(n->handle))
779 if (tcf_exts_dump_stats(skb, &n->exts, &u32_ext_map) < 0)
780 goto rtattr_failure;
781 return skb->len;
782
783rtattr_failure:
784 skb_trim(skb, b - skb->data);
785 return -1;
786}
787
788static struct tcf_proto_ops cls_u32_ops = {
789 .next = NULL,
790 .kind = "u32",
791 .classify = u32_classify,
792 .init = u32_init,
793 .destroy = u32_destroy,
794 .get = u32_get,
795 .put = u32_put,
796 .change = u32_change,
797 .delete = u32_delete,
798 .walk = u32_walk,
799 .dump = u32_dump,
800 .owner = THIS_MODULE,
801};
802
803static int __init init_u32(void)
804{
805 printk("u32 classifier\n");
806#ifdef CONFIG_CLS_U32_PERF
807	printk(" Performance counters on\n");
808#endif
809#ifdef CONFIG_NET_CLS_POLICE
810 printk(" OLD policer on \n");
811#endif
812#ifdef CONFIG_NET_CLS_IND
813 printk(" input device check on \n");
814#endif
815#ifdef CONFIG_NET_CLS_ACT
816 printk(" Actions configured \n");
817#endif
818 return register_tcf_proto_ops(&cls_u32_ops);
819}
820
821static void __exit exit_u32(void)
822{
823 unregister_tcf_proto_ops(&cls_u32_ops);
824}
825
826module_init(init_u32)
827module_exit(exit_u32)
828MODULE_LICENSE("GPL");
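
The heart of the classifier above is the per-key test in u32_classify(), ((*(u32*)(ptr+off) ^ val) & mask) == 0, together with the fshift derivation in u32_change(). The sketch below restates both in a standalone userspace program; the example key (an IPv4 "protocol == TCP" match) and the packet bytes are made up for illustration and are not taken from the kernel sources.

	#include <stdio.h>
	#include <string.h>
	#include <stdint.h>
	#include <arpa/inet.h>

	/* One u32 key: the packet matches if the masked 32-bit word at 'off'
	 * equals 'val'. 'mask' and 'val' stay in network byte order, as
	 * userspace supplies them. */
	struct u32_key_sketch {
		uint32_t mask;
		uint32_t val;
		int off;
	};

	static int u32_key_match(const unsigned char *pkt, const struct u32_key_sketch *k)
	{
		uint32_t w;

		memcpy(&w, pkt + k->off, sizeof(w));
		return ((w ^ k->val) & k->mask) == 0;
	}

	/* fshift derivation as in u32_change(): index of the lowest set bit
	 * of hmask, so that u32_hash_fold() yields a small bucket index. */
	static uint8_t fshift_from_hmask(uint32_t hmask)
	{
		uint8_t i = 0;

		if (hmask)
			while (!(hmask & 1)) {
				i++;
				hmask >>= 1;
			}
		return i;
	}

	int main(void)
	{
		/* Hypothetical key: "IPv4 protocol == TCP", i.e. byte 9 of the header. */
		unsigned char iphdr[20] = { 0x45, 0, 0, 40, 0, 0, 0, 0, 64, 6 };
		struct u32_key_sketch k = {
			.mask = htonl(0x00ff0000),
			.val  = htonl(0x00060000),
			.off  = 8,
		};

		printf("match=%d fshift=%u\n",
		       u32_key_match(iphdr, &k), fshift_from_hmask(0x0000ff00));
		return 0;
	}
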
diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c
new file mode 100644
index 000000000000..bf1f00f8b1bf
--- /dev/null
+++ b/net/sched/em_cmp.c
@@ -0,0 +1,101 @@
1/*
2 * net/sched/em_cmp.c Simple packet data comparison ematch
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 */
11
12#include <linux/config.h>
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/kernel.h>
16#include <linux/skbuff.h>
17#include <linux/tc_ematch/tc_em_cmp.h>
18#include <net/pkt_cls.h>
19
20static inline int cmp_needs_transformation(struct tcf_em_cmp *cmp)
21{
22 return unlikely(cmp->flags & TCF_EM_CMP_TRANS);
23}
24
25static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em,
26 struct tcf_pkt_info *info)
27{
28 struct tcf_em_cmp *cmp = (struct tcf_em_cmp *) em->data;
29 unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer) + cmp->off;
30 u32 val = 0;
31
32 if (!tcf_valid_offset(skb, ptr, cmp->align))
33 return 0;
34
35 switch (cmp->align) {
36 case TCF_EM_ALIGN_U8:
37 val = *ptr;
38 break;
39
40 case TCF_EM_ALIGN_U16:
41 val = *ptr << 8;
42 val |= *(ptr+1);
43
44 if (cmp_needs_transformation(cmp))
45 val = be16_to_cpu(val);
46 break;
47
48 case TCF_EM_ALIGN_U32:
49		/* Worth checking boundaries? The branching seems
50 * to get worse. Visit again. */
51 val = *ptr << 24;
52 val |= *(ptr+1) << 16;
53 val |= *(ptr+2) << 8;
54 val |= *(ptr+3);
55
56 if (cmp_needs_transformation(cmp))
57 val = be32_to_cpu(val);
58 break;
59
60 default:
61 return 0;
62 }
63
64 if (cmp->mask)
65 val &= cmp->mask;
66
67 switch (cmp->opnd) {
68 case TCF_EM_OPND_EQ:
69 return val == cmp->val;
70 case TCF_EM_OPND_LT:
71 return val < cmp->val;
72 case TCF_EM_OPND_GT:
73 return val > cmp->val;
74 }
75
76 return 0;
77}
78
79static struct tcf_ematch_ops em_cmp_ops = {
80 .kind = TCF_EM_CMP,
81 .datalen = sizeof(struct tcf_em_cmp),
82 .match = em_cmp_match,
83 .owner = THIS_MODULE,
84 .link = LIST_HEAD_INIT(em_cmp_ops.link)
85};
86
87static int __init init_em_cmp(void)
88{
89 return tcf_em_register(&em_cmp_ops);
90}
91
92static void __exit exit_em_cmp(void)
93{
94 tcf_em_unregister(&em_cmp_ops);
95}
96
97MODULE_LICENSE("GPL");
98
99module_init(init_em_cmp);
100module_exit(exit_em_cmp);
101
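
A minimal userspace sketch of the comparison performed by em_cmp_match() above: read a U8/U16/U32 field at the configured offset as a big-endian value, apply the optional mask, then evaluate the EQ/LT/GT operand. The TCF_EM_CMP_TRANS flag and the skb bounds check are left out, and all names below are made up.

	#include <stdio.h>
	#include <stdint.h>

	enum { OPND_EQ, OPND_LT, OPND_GT };	/* stand-ins for TCF_EM_OPND_* */

	/* Assemble a big-endian field of 'align' bytes starting at ptr, apply
	 * the optional mask, then evaluate the configured relation. */
	static int cmp_eval(const unsigned char *ptr, int align, uint32_t mask,
			    uint32_t cmpval, int opnd)
	{
		uint32_t val = 0;
		int i;

		for (i = 0; i < align; i++)
			val = (val << 8) | ptr[i];
		if (mask)
			val &= mask;

		switch (opnd) {
		case OPND_EQ: return val == cmpval;
		case OPND_LT: return val < cmpval;
		case OPND_GT: return val > cmpval;
		}
		return 0;
	}

	int main(void)
	{
		unsigned char payload[] = { 0x01, 0xf4 };	/* 16-bit field 0x01f4 = 500 */

		printf("%d\n", cmp_eval(payload, 2, 0, 500, OPND_EQ));		/* 1 */
		printf("%d\n", cmp_eval(payload, 2, 0xff, 500, OPND_LT));	/* masked to 0xf4 = 244 < 500 -> 1 */
		return 0;
	}
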
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
new file mode 100644
index 000000000000..f1eeaf65cee5
--- /dev/null
+++ b/net/sched/em_meta.c
@@ -0,0 +1,661 @@
1/*
2 * net/sched/em_meta.c Metadata ematch
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 *
11 * ==========================================================================
12 *
13 * The metadata ematch compares two meta objects where each object
14 * represents either a meta value stored in the kernel or a static
15 *	value provided by userspace. Userspace does not provide the objects
16 *	themselves but rather a definition containing the information needed
17 *	to build them. Every object is of a certain type, which must be
18 *	equal to the type of the object it is being compared to.
19 *
20 *	The definition of an object consists of the type (meta type), an
21 *	identifier (meta id) and additional type specific information.
22 *	The meta id is either TCF_META_ID_VALUE for values provided by
23 *	userspace or an index into the meta operations table consisting of
24 * function pointers to type specific meta data collectors returning
25 * the value of the requested meta value.
26 *
27 * lvalue rvalue
28 * +-----------+ +-----------+
29 * | type: INT | | type: INT |
30 * def | id: INDEV | | id: VALUE |
31 * | data: | | data: 3 |
32 * +-----------+ +-----------+
33 * | |
34 * ---> meta_ops[INT][INDEV](...) |
35 * | |
36 * ----------- |
37 * V V
38 * +-----------+ +-----------+
39 * | type: INT | | type: INT |
40 * obj | id: INDEV | | id: VALUE |
41 * | data: 2 |<--data got filled out | data: 3 |
42 * +-----------+ +-----------+
43 * | |
44 * --------------> 2 equals 3 <--------------
45 *
46 * This is a simplified schema, the complexity varies depending
47 * on the meta type. Obviously, the length of the data must also
48 * be provided for non-numeric types.
49 *
50 *	Additionally, type dependent modifiers such as shift operators
51 *	or mask may be applied to extend the functionality. As of now,
52 * the variable length type supports shifting the byte string to
53 * the right, eating up any number of octets and thus supporting
54 * wildcard interface name comparisons such as "ppp%" matching
55 * ppp0..9.
56 *
57 * NOTE: Certain meta values depend on other subsystems and are
58 *	only available if that subsystem is enabled in the kernel.
59 */
60
61#include <linux/config.h>
62#include <linux/module.h>
63#include <linux/types.h>
64#include <linux/kernel.h>
65#include <linux/sched.h>
66#include <linux/string.h>
67#include <linux/skbuff.h>
68#include <linux/random.h>
69#include <linux/tc_ematch/tc_em_meta.h>
70#include <net/dst.h>
71#include <net/route.h>
72#include <net/pkt_cls.h>
73
74struct meta_obj
75{
76 unsigned long value;
77 unsigned int len;
78};
79
80struct meta_value
81{
82 struct tcf_meta_val hdr;
83 unsigned long val;
84 unsigned int len;
85};
86
87struct meta_match
88{
89 struct meta_value lvalue;
90 struct meta_value rvalue;
91};
92
93static inline int meta_id(struct meta_value *v)
94{
95 return TCF_META_ID(v->hdr.kind);
96}
97
98static inline int meta_type(struct meta_value *v)
99{
100 return TCF_META_TYPE(v->hdr.kind);
101}
102
103#define META_COLLECTOR(FUNC) static void meta_##FUNC(struct sk_buff *skb, \
104 struct tcf_pkt_info *info, struct meta_value *v, \
105 struct meta_obj *dst, int *err)
106
107/**************************************************************************
108 * System status & misc
109 **************************************************************************/
110
111META_COLLECTOR(int_random)
112{
113 get_random_bytes(&dst->value, sizeof(dst->value));
114}
115
116static inline unsigned long fixed_loadavg(int load)
117{
118 int rnd_load = load + (FIXED_1/200);
119 int rnd_frac = ((rnd_load & (FIXED_1-1)) * 100) >> FSHIFT;
120
121 return ((rnd_load >> FSHIFT) * 100) + rnd_frac;
122}
123
124META_COLLECTOR(int_loadavg_0)
125{
126 dst->value = fixed_loadavg(avenrun[0]);
127}
128
129META_COLLECTOR(int_loadavg_1)
130{
131 dst->value = fixed_loadavg(avenrun[1]);
132}
133
134META_COLLECTOR(int_loadavg_2)
135{
136 dst->value = fixed_loadavg(avenrun[2]);
137}
138
139/**************************************************************************
140 * Device names & indices
141 **************************************************************************/
142
143static inline int int_dev(struct net_device *dev, struct meta_obj *dst)
144{
145 if (unlikely(dev == NULL))
146 return -1;
147
148 dst->value = dev->ifindex;
149 return 0;
150}
151
152static inline int var_dev(struct net_device *dev, struct meta_obj *dst)
153{
154 if (unlikely(dev == NULL))
155 return -1;
156
157 dst->value = (unsigned long) dev->name;
158 dst->len = strlen(dev->name);
159 return 0;
160}
161
162META_COLLECTOR(int_dev)
163{
164 *err = int_dev(skb->dev, dst);
165}
166
167META_COLLECTOR(var_dev)
168{
169 *err = var_dev(skb->dev, dst);
170}
171
172META_COLLECTOR(int_indev)
173{
174 *err = int_dev(skb->input_dev, dst);
175}
176
177META_COLLECTOR(var_indev)
178{
179 *err = var_dev(skb->input_dev, dst);
180}
181
182META_COLLECTOR(int_realdev)
183{
184 *err = int_dev(skb->real_dev, dst);
185}
186
187META_COLLECTOR(var_realdev)
188{
189 *err = var_dev(skb->real_dev, dst);
190}
191
192/**************************************************************************
193 * skb attributes
194 **************************************************************************/
195
196META_COLLECTOR(int_priority)
197{
198 dst->value = skb->priority;
199}
200
201META_COLLECTOR(int_protocol)
202{
203 /* Let userspace take care of the byte ordering */
204 dst->value = skb->protocol;
205}
206
207META_COLLECTOR(int_security)
208{
209 dst->value = skb->security;
210}
211
212META_COLLECTOR(int_pkttype)
213{
214 dst->value = skb->pkt_type;
215}
216
217META_COLLECTOR(int_pktlen)
218{
219 dst->value = skb->len;
220}
221
222META_COLLECTOR(int_datalen)
223{
224 dst->value = skb->data_len;
225}
226
227META_COLLECTOR(int_maclen)
228{
229 dst->value = skb->mac_len;
230}
231
232/**************************************************************************
233 * Netfilter
234 **************************************************************************/
235
236#ifdef CONFIG_NETFILTER
237META_COLLECTOR(int_nfmark)
238{
239 dst->value = skb->nfmark;
240}
241#endif
242
243/**************************************************************************
244 * Traffic Control
245 **************************************************************************/
246
247META_COLLECTOR(int_tcindex)
248{
249 dst->value = skb->tc_index;
250}
251
252#ifdef CONFIG_NET_CLS_ACT
253META_COLLECTOR(int_tcverd)
254{
255 dst->value = skb->tc_verd;
256}
257
258META_COLLECTOR(int_tcclassid)
259{
260 dst->value = skb->tc_classid;
261}
262#endif
263
264/**************************************************************************
265 * Routing
266 **************************************************************************/
267
268#ifdef CONFIG_NET_CLS_ROUTE
269META_COLLECTOR(int_rtclassid)
270{
271 if (unlikely(skb->dst == NULL))
272 *err = -1;
273 else
274 dst->value = skb->dst->tclassid;
275}
276#endif
277
278META_COLLECTOR(int_rtiif)
279{
280 if (unlikely(skb->dst == NULL))
281 *err = -1;
282 else
283 dst->value = ((struct rtable*) skb->dst)->fl.iif;
284}
285
286/**************************************************************************
287 * Meta value collectors assignment table
288 **************************************************************************/
289
290struct meta_ops
291{
292 void (*get)(struct sk_buff *, struct tcf_pkt_info *,
293 struct meta_value *, struct meta_obj *, int *);
294};
295
296/* Meta value operations table listing all meta value collectors and
297 * assigning them to a type and meta id. */
298static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
299 [TCF_META_TYPE_VAR] = {
300 [TCF_META_ID_DEV] = { .get = meta_var_dev },
301 [TCF_META_ID_INDEV] = { .get = meta_var_indev },
302 [TCF_META_ID_REALDEV] = { .get = meta_var_realdev }
303 },
304 [TCF_META_TYPE_INT] = {
305 [TCF_META_ID_RANDOM] = { .get = meta_int_random },
306 [TCF_META_ID_LOADAVG_0] = { .get = meta_int_loadavg_0 },
307 [TCF_META_ID_LOADAVG_1] = { .get = meta_int_loadavg_1 },
308 [TCF_META_ID_LOADAVG_2] = { .get = meta_int_loadavg_2 },
309 [TCF_META_ID_DEV] = { .get = meta_int_dev },
310 [TCF_META_ID_INDEV] = { .get = meta_int_indev },
311 [TCF_META_ID_REALDEV] = { .get = meta_int_realdev },
312 [TCF_META_ID_PRIORITY] = { .get = meta_int_priority },
313 [TCF_META_ID_PROTOCOL] = { .get = meta_int_protocol },
314 [TCF_META_ID_SECURITY] = { .get = meta_int_security },
315 [TCF_META_ID_PKTTYPE] = { .get = meta_int_pkttype },
316 [TCF_META_ID_PKTLEN] = { .get = meta_int_pktlen },
317 [TCF_META_ID_DATALEN] = { .get = meta_int_datalen },
318 [TCF_META_ID_MACLEN] = { .get = meta_int_maclen },
319#ifdef CONFIG_NETFILTER
320 [TCF_META_ID_NFMARK] = { .get = meta_int_nfmark },
321#endif
322 [TCF_META_ID_TCINDEX] = { .get = meta_int_tcindex },
323#ifdef CONFIG_NET_CLS_ACT
324 [TCF_META_ID_TCVERDICT] = { .get = meta_int_tcverd },
325 [TCF_META_ID_TCCLASSID] = { .get = meta_int_tcclassid },
326#endif
327#ifdef CONFIG_NET_CLS_ROUTE
328 [TCF_META_ID_RTCLASSID] = { .get = meta_int_rtclassid },
329#endif
330 [TCF_META_ID_RTIIF] = { .get = meta_int_rtiif }
331 }
332};
333
334static inline struct meta_ops * meta_ops(struct meta_value *val)
335{
336 return &__meta_ops[meta_type(val)][meta_id(val)];
337}
338
339/**************************************************************************
340 * Type specific operations for TCF_META_TYPE_VAR
341 **************************************************************************/
342
343static int meta_var_compare(struct meta_obj *a, struct meta_obj *b)
344{
345 int r = a->len - b->len;
346
347 if (r == 0)
348 r = memcmp((void *) a->value, (void *) b->value, a->len);
349
350 return r;
351}
352
353static int meta_var_change(struct meta_value *dst, struct rtattr *rta)
354{
355 int len = RTA_PAYLOAD(rta);
356
357 dst->val = (unsigned long) kmalloc(len, GFP_KERNEL);
358 if (dst->val == 0UL)
359 return -ENOMEM;
360 memcpy((void *) dst->val, RTA_DATA(rta), len);
361 dst->len = len;
362 return 0;
363}
364
365static void meta_var_destroy(struct meta_value *v)
366{
367 if (v->val)
368 kfree((void *) v->val);
369}
370
371static void meta_var_apply_extras(struct meta_value *v,
372 struct meta_obj *dst)
373{
374 int shift = v->hdr.shift;
375
376 if (shift && shift < dst->len)
377 dst->len -= shift;
378}
379
380static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
381{
382 if (v->val && v->len)
383 RTA_PUT(skb, tlv, v->len, (void *) v->val);
384 return 0;
385
386rtattr_failure:
387 return -1;
388}
389
390/**************************************************************************
391 * Type specific operations for TCF_META_TYPE_INT
392 **************************************************************************/
393
394static int meta_int_compare(struct meta_obj *a, struct meta_obj *b)
395{
396 /* Let gcc optimize it, the unlikely is not really based on
397 * some numbers but jump free code for mismatches seems
398 * more logical. */
399	if (unlikely(a->value == b->value))
400		return 0;
401	else if (a->value < b->value)
402 return -1;
403 else
404 return 1;
405}
406
407static int meta_int_change(struct meta_value *dst, struct rtattr *rta)
408{
409 if (RTA_PAYLOAD(rta) >= sizeof(unsigned long)) {
410 dst->val = *(unsigned long *) RTA_DATA(rta);
411 dst->len = sizeof(unsigned long);
412 } else if (RTA_PAYLOAD(rta) == sizeof(u32)) {
413 dst->val = *(u32 *) RTA_DATA(rta);
414 dst->len = sizeof(u32);
415 } else
416 return -EINVAL;
417
418 return 0;
419}
420
421static void meta_int_apply_extras(struct meta_value *v,
422 struct meta_obj *dst)
423{
424 if (v->hdr.shift)
425 dst->value >>= v->hdr.shift;
426
427 if (v->val)
428 dst->value &= v->val;
429}
430
431static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
432{
433 if (v->len == sizeof(unsigned long))
434 RTA_PUT(skb, tlv, sizeof(unsigned long), &v->val);
435 else if (v->len == sizeof(u32)) {
436 u32 d = v->val;
437 RTA_PUT(skb, tlv, sizeof(d), &d);
438 }
439
440 return 0;
441
442rtattr_failure:
443 return -1;
444}
445
446/**************************************************************************
447 * Type specific operations table
448 **************************************************************************/
449
450struct meta_type_ops
451{
452 void (*destroy)(struct meta_value *);
453 int (*compare)(struct meta_obj *, struct meta_obj *);
454 int (*change)(struct meta_value *, struct rtattr *);
455 void (*apply_extras)(struct meta_value *, struct meta_obj *);
456 int (*dump)(struct sk_buff *, struct meta_value *, int);
457};
458
459static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = {
460 [TCF_META_TYPE_VAR] = {
461 .destroy = meta_var_destroy,
462 .compare = meta_var_compare,
463 .change = meta_var_change,
464 .apply_extras = meta_var_apply_extras,
465 .dump = meta_var_dump
466 },
467 [TCF_META_TYPE_INT] = {
468 .compare = meta_int_compare,
469 .change = meta_int_change,
470 .apply_extras = meta_int_apply_extras,
471 .dump = meta_int_dump
472 }
473};
474
475static inline struct meta_type_ops * meta_type_ops(struct meta_value *v)
476{
477 return &__meta_type_ops[meta_type(v)];
478}
479
480/**************************************************************************
481 * Core
482 **************************************************************************/
483
484static inline int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info,
485 struct meta_value *v, struct meta_obj *dst)
486{
487 int err = 0;
488
489 if (meta_id(v) == TCF_META_ID_VALUE) {
490 dst->value = v->val;
491 dst->len = v->len;
492 return 0;
493 }
494
495 meta_ops(v)->get(skb, info, v, dst, &err);
496 if (err < 0)
497 return err;
498
499 if (meta_type_ops(v)->apply_extras)
500 meta_type_ops(v)->apply_extras(v, dst);
501
502 return 0;
503}
504
505static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m,
506 struct tcf_pkt_info *info)
507{
508 int r;
509 struct meta_match *meta = (struct meta_match *) m->data;
510 struct meta_obj l_value, r_value;
511
512 if (meta_get(skb, info, &meta->lvalue, &l_value) < 0 ||
513 meta_get(skb, info, &meta->rvalue, &r_value) < 0)
514 return 0;
515
516 r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value);
517
518 switch (meta->lvalue.hdr.op) {
519 case TCF_EM_OPND_EQ:
520 return !r;
521 case TCF_EM_OPND_LT:
522 return r < 0;
523 case TCF_EM_OPND_GT:
524 return r > 0;
525 }
526
527 return 0;
528}
529
530static inline void meta_delete(struct meta_match *meta)
531{
532 struct meta_type_ops *ops = meta_type_ops(&meta->lvalue);
533
534 if (ops && ops->destroy) {
535 ops->destroy(&meta->lvalue);
536 ops->destroy(&meta->rvalue);
537 }
538
539 kfree(meta);
540}
541
542static inline int meta_change_data(struct meta_value *dst, struct rtattr *rta)
543{
544 if (rta) {
545 if (RTA_PAYLOAD(rta) == 0)
546 return -EINVAL;
547
548 return meta_type_ops(dst)->change(dst, rta);
549 }
550
551 return 0;
552}
553
554static inline int meta_is_supported(struct meta_value *val)
555{
556 return (!meta_id(val) || meta_ops(val)->get);
557}
558
559static int em_meta_change(struct tcf_proto *tp, void *data, int len,
560 struct tcf_ematch *m)
561{
562 int err = -EINVAL;
563 struct rtattr *tb[TCA_EM_META_MAX];
564 struct tcf_meta_hdr *hdr;
565 struct meta_match *meta = NULL;
566
567 if (rtattr_parse(tb, TCA_EM_META_MAX, data, len) < 0)
568 goto errout;
569
570 if (tb[TCA_EM_META_HDR-1] == NULL ||
571 RTA_PAYLOAD(tb[TCA_EM_META_HDR-1]) < sizeof(*hdr))
572 goto errout;
573 hdr = RTA_DATA(tb[TCA_EM_META_HDR-1]);
574
575 if (TCF_META_TYPE(hdr->left.kind) != TCF_META_TYPE(hdr->right.kind) ||
576 TCF_META_TYPE(hdr->left.kind) > TCF_META_TYPE_MAX ||
577 TCF_META_ID(hdr->left.kind) > TCF_META_ID_MAX ||
578 TCF_META_ID(hdr->right.kind) > TCF_META_ID_MAX)
579 goto errout;
580
581 meta = kmalloc(sizeof(*meta), GFP_KERNEL);
582 if (meta == NULL)
583 goto errout;
584 memset(meta, 0, sizeof(*meta));
585
586 memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left));
587 memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right));
588
589 if (!meta_is_supported(&meta->lvalue) ||
590 !meta_is_supported(&meta->rvalue)) {
591 err = -EOPNOTSUPP;
592 goto errout;
593 }
594
595 if (meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE-1]) < 0 ||
596 meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE-1]) < 0)
597 goto errout;
598
599 m->datalen = sizeof(*meta);
600 m->data = (unsigned long) meta;
601
602 err = 0;
603errout:
604 if (err && meta)
605 meta_delete(meta);
606 return err;
607}
608
609static void em_meta_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
610{
611 if (m)
612 meta_delete((struct meta_match *) m->data);
613}
614
615static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em)
616{
617 struct meta_match *meta = (struct meta_match *) em->data;
618 struct tcf_meta_hdr hdr;
619 struct meta_type_ops *ops;
620
621 memset(&hdr, 0, sizeof(hdr));
622 memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left));
623 memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right));
624
625 RTA_PUT(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr);
626
627 ops = meta_type_ops(&meta->lvalue);
628 if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 ||
629 ops->dump(skb, &meta->rvalue, TCA_EM_META_RVALUE) < 0)
630 goto rtattr_failure;
631
632 return 0;
633
634rtattr_failure:
635 return -1;
636}
637
638static struct tcf_ematch_ops em_meta_ops = {
639 .kind = TCF_EM_META,
640 .change = em_meta_change,
641 .match = em_meta_match,
642 .destroy = em_meta_destroy,
643 .dump = em_meta_dump,
644 .owner = THIS_MODULE,
645 .link = LIST_HEAD_INIT(em_meta_ops.link)
646};
647
648static int __init init_em_meta(void)
649{
650 return tcf_em_register(&em_meta_ops);
651}
652
653static void __exit exit_em_meta(void)
654{
655 tcf_em_unregister(&em_meta_ops);
656}
657
658MODULE_LICENSE("GPL");
659
660module_init(init_em_meta);
661module_exit(exit_em_meta);
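
A condensed sketch of the flow in em_meta_match() above: the left object is filled by a collector from kernel state, the right object carries a constant supplied by userspace, the type-specific compare returns a three-way result, and the operand turns that into a verdict. Everything below is illustrative userspace code with invented names; the fake collector simply pretends skb->len is 1400.

	#include <stdio.h>

	enum { OPND_EQ, OPND_LT, OPND_GT };	/* stand-ins for TCF_EM_OPND_* */

	struct obj { unsigned long value; };

	/* A collector fills the left object from kernel state; here a fake
	 * one standing in for meta_int_pktlen(). The right object is a
	 * userspace-provided constant. */
	static void collect_pktlen(struct obj *dst) { dst->value = 1400; }

	static int int_compare(const struct obj *a, const struct obj *b)
	{
		return a->value == b->value ? 0 : (a->value < b->value ? -1 : 1);
	}

	static int meta_match_sketch(int opnd)
	{
		struct obj l, r = { .value = 1000 };
		int rel;

		collect_pktlen(&l);
		rel = int_compare(&l, &r);

		switch (opnd) {
		case OPND_EQ: return rel == 0;
		case OPND_LT: return rel < 0;
		case OPND_GT: return rel > 0;
		}
		return 0;
	}

	int main(void)
	{
		printf("pktlen > 1000: %d\n", meta_match_sketch(OPND_GT));	/* 1 */
		return 0;
	}
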
diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
new file mode 100644
index 000000000000..71ea926a9f09
--- /dev/null
+++ b/net/sched/em_nbyte.c
@@ -0,0 +1,82 @@
1/*
2 * net/sched/em_nbyte.c N-Byte ematch
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 */
11
12#include <linux/config.h>
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/kernel.h>
16#include <linux/sched.h>
17#include <linux/string.h>
18#include <linux/skbuff.h>
19#include <linux/tc_ematch/tc_em_nbyte.h>
20#include <net/pkt_cls.h>
21
22struct nbyte_data
23{
24 struct tcf_em_nbyte hdr;
25 char pattern[0];
26};
27
28static int em_nbyte_change(struct tcf_proto *tp, void *data, int data_len,
29 struct tcf_ematch *em)
30{
31 struct tcf_em_nbyte *nbyte = data;
32
33 if (data_len < sizeof(*nbyte) ||
34 data_len < (sizeof(*nbyte) + nbyte->len))
35 return -EINVAL;
36
37 em->datalen = sizeof(*nbyte) + nbyte->len;
38 em->data = (unsigned long) kmalloc(em->datalen, GFP_KERNEL);
39 if (em->data == 0UL)
40 return -ENOBUFS;
41
42 memcpy((void *) em->data, data, em->datalen);
43
44 return 0;
45}
46
47static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em,
48 struct tcf_pkt_info *info)
49{
50 struct nbyte_data *nbyte = (struct nbyte_data *) em->data;
51 unsigned char *ptr = tcf_get_base_ptr(skb, nbyte->hdr.layer);
52
53 ptr += nbyte->hdr.off;
54
55 if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len))
56 return 0;
57
58	return !memcmp(ptr, nbyte->pattern, nbyte->hdr.len);
59}
60
61static struct tcf_ematch_ops em_nbyte_ops = {
62 .kind = TCF_EM_NBYTE,
63 .change = em_nbyte_change,
64 .match = em_nbyte_match,
65 .owner = THIS_MODULE,
66 .link = LIST_HEAD_INIT(em_nbyte_ops.link)
67};
68
69static int __init init_em_nbyte(void)
70{
71 return tcf_em_register(&em_nbyte_ops);
72}
73
74static void __exit exit_em_nbyte(void)
75{
76 tcf_em_unregister(&em_nbyte_ops);
77}
78
79MODULE_LICENSE("GPL");
80
81module_init(init_em_nbyte);
82module_exit(exit_em_nbyte);
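
A userspace analogue of em_nbyte_match() above: compare a byte pattern against packet data at a fixed offset, with the offset applied once and a simple bounds check standing in for tcf_valid_offset(). The buffer, pattern and offsets are made up.

	#include <stdio.h>
	#include <string.h>

	/* Compare 'len' bytes of 'pattern' against packet data at 'off',
	 * refusing to read past the end of the buffer. */
	static int nbyte_match(const unsigned char *data, size_t data_len,
			       size_t off, const unsigned char *pattern, size_t len)
	{
		if (off + len > data_len)
			return 0;
		return memcmp(data + off, pattern, len) == 0;
	}

	int main(void)
	{
		unsigned char pkt[] = "GET /index.html HTTP/1.0";

		printf("%d\n", nbyte_match(pkt, sizeof(pkt), 0,
					   (unsigned char *)"GET ", 4));	/* 1 */
		printf("%d\n", nbyte_match(pkt, sizeof(pkt), 30,
					   (unsigned char *)"GET ", 4));	/* 0: out of range */
		return 0;
	}
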
diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c
new file mode 100644
index 000000000000..34e7e51e601e
--- /dev/null
+++ b/net/sched/em_u32.c
@@ -0,0 +1,63 @@
1/*
2 * net/sched/em_u32.c U32 Ematch
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11 *
12 * Based on net/sched/cls_u32.c
13 */
14
15#include <linux/config.h>
16#include <linux/module.h>
17#include <linux/types.h>
18#include <linux/kernel.h>
19#include <linux/skbuff.h>
20#include <net/pkt_cls.h>
21
22static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em,
23 struct tcf_pkt_info *info)
24{
25 struct tc_u32_key *key = (struct tc_u32_key *) em->data;
26 unsigned char *ptr = skb->nh.raw;
27
28 if (info) {
29 if (info->ptr)
30 ptr = info->ptr;
31 ptr += (info->nexthdr & key->offmask);
32 }
33
34 ptr += key->off;
35
36 if (!tcf_valid_offset(skb, ptr, sizeof(u32)))
37 return 0;
38
39 return !(((*(u32*) ptr) ^ key->val) & key->mask);
40}
41
42static struct tcf_ematch_ops em_u32_ops = {
43 .kind = TCF_EM_U32,
44 .datalen = sizeof(struct tc_u32_key),
45 .match = em_u32_match,
46 .owner = THIS_MODULE,
47 .link = LIST_HEAD_INIT(em_u32_ops.link)
48};
49
50static int __init init_em_u32(void)
51{
52 return tcf_em_register(&em_u32_ops);
53}
54
55static void __exit exit_em_u32(void)
56{
57 tcf_em_unregister(&em_u32_ops);
58}
59
60MODULE_LICENSE("GPL");
61
62module_init(init_em_u32);
63module_exit(exit_em_u32);
diff --git a/net/sched/ematch.c b/net/sched/ematch.c
new file mode 100644
index 000000000000..ebfe2e7d21bd
--- /dev/null
+++ b/net/sched/ematch.c
@@ -0,0 +1,524 @@
1/*
2 * net/sched/ematch.c Extended Match API
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 *
11 * ==========================================================================
12 *
13 * An extended match (ematch) is a small classification tool not worth
14 * writing a full classifier for. Ematches can be interconnected to form
15 * a logic expression and get attached to classifiers to extend their
16 *	functionality.
17 *
18 * The userspace part transforms the logic expressions into an array
19 * consisting of multiple sequences of interconnected ematches separated
20 *	by markers. Precedence is implemented by a special ematch kind that
21 *	references a sequence beyond the marker of the current sequence: the
22 *	current position is pushed onto a stack and then overwritten by the
23 *	position referenced in the special ematch. Matching continues in the
24 *	new sequence until a marker is reached, at which point the position
25 *	is restored from the stack.
26 *
27 * Example:
28 * A AND (B1 OR B2) AND C AND D
29 *
30 * ------->-PUSH-------
31 * -->-- / -->-- \ -->--
32 * / \ / / \ \ / \
33 * +-------+-------+-------+-------+-------+--------+
34 * | A AND | B AND | C AND | D END | B1 OR | B2 END |
35 * +-------+-------+-------+-------+-------+--------+
36 * \ /
37 * --------<-POP---------
38 *
39 *	where B is a virtual ematch referencing the sequence starting with B1.
40 *
41 * ==========================================================================
42 *
43 * How to write an ematch in 60 seconds
44 * ------------------------------------
45 *
46 * 1) Provide a matcher function:
47 * static int my_match(struct sk_buff *skb, struct tcf_ematch *m,
48 * struct tcf_pkt_info *info)
49 * {
50 * struct mydata *d = (struct mydata *) m->data;
51 *
52 * if (...matching goes here...)
53 * return 1;
54 * else
55 * return 0;
56 * }
57 *
58 * 2) Fill out a struct tcf_ematch_ops:
59 * static struct tcf_ematch_ops my_ops = {
60 * .kind = unique id,
61 * .datalen = sizeof(struct mydata),
62 * .match = my_match,
63 * .owner = THIS_MODULE,
64 * };
65 *
66 * 3) Register/Unregister your ematch:
67 * static int __init init_my_ematch(void)
68 * {
69 * return tcf_em_register(&my_ops);
70 * }
71 *
72 * static void __exit exit_my_ematch(void)
73 * {
 74 * tcf_em_unregister(&my_ops);
75 * }
76 *
77 * module_init(init_my_ematch);
78 * module_exit(exit_my_ematch);
79 *
80 * 4) By now you should have two more seconds left, barely enough to
81 * open up a beer to watch the compilation going.
82 */
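/* Informal walk-through of the example above, as __tcf_em_tree_match()
 * below evaluates it.  The flattened array for A AND (B1 OR B2) AND C AND D
 * is:
 *   [0]=A AND  [1]=B AND (container -> 4)  [2]=C AND  [3]=D END
 *   [4]=B1 OR  [5]=B2 END
 * A is evaluated at index 0; the container at index 1 pushes 1 onto the
 * stack and jumps to index 4.  If B1 matches, its OR relation ends the
 * subsequence early, the stack pops back to index 1 and matching resumes
 * at index 2 with C and D.  If B1 fails, B2 is tried before popping. */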
83
84#include <linux/config.h>
85#include <linux/module.h>
86#include <linux/types.h>
87#include <linux/kernel.h>
88#include <linux/sched.h>
89#include <linux/mm.h>
90#include <linux/errno.h>
91#include <linux/interrupt.h>
92#include <linux/rtnetlink.h>
93#include <linux/skbuff.h>
94#include <net/pkt_cls.h>
95#include <config/net/ematch/stack.h>
96
97static LIST_HEAD(ematch_ops);
98static DEFINE_RWLOCK(ematch_mod_lock);
99
100static inline struct tcf_ematch_ops * tcf_em_lookup(u16 kind)
101{
102 struct tcf_ematch_ops *e = NULL;
103
104 read_lock(&ematch_mod_lock);
105 list_for_each_entry(e, &ematch_ops, link) {
106 if (kind == e->kind) {
107 if (!try_module_get(e->owner))
108 e = NULL;
109 read_unlock(&ematch_mod_lock);
110 return e;
111 }
112 }
113 read_unlock(&ematch_mod_lock);
114
115 return NULL;
116}
117
118/**
119 * tcf_em_register - register an extended match
120 *
121 * @ops: ematch operations lookup table
122 *
123 * This function must be called by ematches to announce their presence.
124 * The given @ops must have kind set to a unique identifier and the
125 * callback match() must be implemented. All other callbacks are optional
126 * and a fallback implementation is used instead.
127 *
 128 * Returns -EEXIST if an ematch of the same kind has already been registered.
129 */
130int tcf_em_register(struct tcf_ematch_ops *ops)
131{
132 int err = -EEXIST;
133 struct tcf_ematch_ops *e;
134
135 if (ops->match == NULL)
136 return -EINVAL;
137
138 write_lock(&ematch_mod_lock);
139 list_for_each_entry(e, &ematch_ops, link)
140 if (ops->kind == e->kind)
141 goto errout;
142
143 list_add_tail(&ops->link, &ematch_ops);
144 err = 0;
145errout:
146 write_unlock(&ematch_mod_lock);
147 return err;
148}
149
150/**
 151 * tcf_em_unregister - unregister an extended match
152 *
153 * @ops: ematch operations lookup table
154 *
155 * This function must be called by ematches to announce their disappearance
 156 * for example when the module gets unloaded. The @ops parameter must be
157 * the same as the one used for registration.
158 *
159 * Returns -ENOENT if no matching ematch was found.
160 */
161int tcf_em_unregister(struct tcf_ematch_ops *ops)
162{
163 int err = 0;
164 struct tcf_ematch_ops *e;
165
166 write_lock(&ematch_mod_lock);
167 list_for_each_entry(e, &ematch_ops, link) {
168 if (e == ops) {
169 list_del(&e->link);
170 goto out;
171 }
172 }
173
174 err = -ENOENT;
175out:
176 write_unlock(&ematch_mod_lock);
177 return err;
178}
179
180static inline struct tcf_ematch * tcf_em_get_match(struct tcf_ematch_tree *tree,
181 int index)
182{
183 return &tree->matches[index];
184}
185
186
187static int tcf_em_validate(struct tcf_proto *tp,
188 struct tcf_ematch_tree_hdr *tree_hdr,
189 struct tcf_ematch *em, struct rtattr *rta, int idx)
190{
191 int err = -EINVAL;
192 struct tcf_ematch_hdr *em_hdr = RTA_DATA(rta);
193 int data_len = RTA_PAYLOAD(rta) - sizeof(*em_hdr);
194 void *data = (void *) em_hdr + sizeof(*em_hdr);
195
196 if (!TCF_EM_REL_VALID(em_hdr->flags))
197 goto errout;
198
199 if (em_hdr->kind == TCF_EM_CONTAINER) {
200 /* Special ematch called "container", carries an index
201 * referencing an external ematch sequence. */
202 u32 ref;
203
204 if (data_len < sizeof(ref))
205 goto errout;
206 ref = *(u32 *) data;
207
208 if (ref >= tree_hdr->nmatches)
209 goto errout;
210
211 /* We do not allow backward jumps to avoid loops and jumps
212 * to our own position are of course illegal. */
213 if (ref <= idx)
214 goto errout;
215
216
217 em->data = ref;
218 } else {
219 /* Note: This lookup will increase the module refcnt
220 * of the ematch module referenced. In case of a failure,
221 * a destroy function is called by the underlying layer
222 * which automatically releases the reference again, therefore
223 * the module MUST not be given back under any circumstances
224 * here. Be aware, the destroy function assumes that the
225 * module is held if the ops field is non zero. */
226 em->ops = tcf_em_lookup(em_hdr->kind);
227
228 if (em->ops == NULL) {
229 err = -ENOENT;
230 goto errout;
231 }
232
233 /* ematch module provides expected length of data, so we
234 * can do a basic sanity check. */
235 if (em->ops->datalen && data_len < em->ops->datalen)
236 goto errout;
237
238 if (em->ops->change) {
239 err = em->ops->change(tp, data, data_len, em);
240 if (err < 0)
241 goto errout;
242 } else if (data_len > 0) {
 243 /* ematch module doesn't provide its own change
244 * procedure and expects us to allocate and copy
245 * the ematch data.
246 *
247 * TCF_EM_SIMPLE may be specified stating that the
248 * data only consists of a u32 integer and the module
 249 * does not expect a memory reference but rather
250 * the value carried. */
251 if (em_hdr->flags & TCF_EM_SIMPLE) {
252 if (data_len < sizeof(u32))
253 goto errout;
254 em->data = *(u32 *) data;
255 } else {
256 void *v = kmalloc(data_len, GFP_KERNEL);
257 if (v == NULL) {
258 err = -ENOBUFS;
259 goto errout;
260 }
261 memcpy(v, data, data_len);
262 em->data = (unsigned long) v;
263 }
264 }
265 }
266
267 em->matchid = em_hdr->matchid;
268 em->flags = em_hdr->flags;
269 em->datalen = data_len;
270
271 err = 0;
272errout:
273 return err;
274}
275
276/**
277 * tcf_em_tree_validate - validate ematch config TLV and build ematch tree
278 *
279 * @tp: classifier kind handle
280 * @rta: ematch tree configuration TLV
281 * @tree: destination ematch tree variable to store the resulting
282 * ematch tree.
283 *
284 * This function validates the given configuration TLV @rta and builds an
285 * ematch tree in @tree. The resulting tree must later be copied into
286 * the private classifier data using tcf_em_tree_change(). You MUST NOT
 287 * provide the ematch tree variable of the private classifier data directly;
 288 * the changes would not be locked properly.
289 *
290 * Returns a negative error code if the configuration TLV contains errors.
291 */
292int tcf_em_tree_validate(struct tcf_proto *tp, struct rtattr *rta,
293 struct tcf_ematch_tree *tree)
294{
295 int idx, list_len, matches_len, err = -EINVAL;
296 struct rtattr *tb[TCA_EMATCH_TREE_MAX];
297 struct rtattr *rt_match, *rt_hdr, *rt_list;
298 struct tcf_ematch_tree_hdr *tree_hdr;
299 struct tcf_ematch *em;
300
301 if (rtattr_parse_nested(tb, TCA_EMATCH_TREE_MAX, rta) < 0)
302 goto errout;
303
304 rt_hdr = tb[TCA_EMATCH_TREE_HDR-1];
305 rt_list = tb[TCA_EMATCH_TREE_LIST-1];
306
307 if (rt_hdr == NULL || rt_list == NULL)
308 goto errout;
309
310 if (RTA_PAYLOAD(rt_hdr) < sizeof(*tree_hdr) ||
311 RTA_PAYLOAD(rt_list) < sizeof(*rt_match))
312 goto errout;
313
314 tree_hdr = RTA_DATA(rt_hdr);
315 memcpy(&tree->hdr, tree_hdr, sizeof(*tree_hdr));
316
317 rt_match = RTA_DATA(rt_list);
318 list_len = RTA_PAYLOAD(rt_list);
319 matches_len = tree_hdr->nmatches * sizeof(*em);
320
321 tree->matches = kmalloc(matches_len, GFP_KERNEL);
322 if (tree->matches == NULL)
323 goto errout;
324 memset(tree->matches, 0, matches_len);
325
326 /* We do not use rtattr_parse_nested here because the maximum
327 * number of attributes is unknown. This saves us the allocation
328 * for a tb buffer which would serve no purpose at all.
329 *
 330 * The array of rt attributes is parsed in the order in which they
 331 * are provided; their types must increase from 1 to n. Even though
 332 * this serves no real purpose in itself, failing to stick to this
 333 * policy will result in a parsing failure. */
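	/* The expected layout of the TCA_EMATCH_TREE_LIST payload, shown
	 * informally (it is also what tcf_em_tree_dump() below produces):
	 *
	 *   [RTA type 1][struct tcf_ematch_hdr][ematch data]
	 *   [RTA type 2][struct tcf_ematch_hdr][ematch data]
	 *   ...
	 *   [RTA type n][struct tcf_ematch_hdr][ematch data]
	 *
	 * where n == tree_hdr->nmatches. */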
334 for (idx = 0; RTA_OK(rt_match, list_len); idx++) {
335 err = -EINVAL;
336
337 if (rt_match->rta_type != (idx + 1))
338 goto errout_abort;
339
340 if (idx >= tree_hdr->nmatches)
341 goto errout_abort;
342
343 if (RTA_PAYLOAD(rt_match) < sizeof(struct tcf_ematch_hdr))
344 goto errout_abort;
345
346 em = tcf_em_get_match(tree, idx);
347
348 err = tcf_em_validate(tp, tree_hdr, em, rt_match, idx);
349 if (err < 0)
350 goto errout_abort;
351
352 rt_match = RTA_NEXT(rt_match, list_len);
353 }
354
355 /* Check if the number of matches provided by userspace actually
356 * complies with the array of matches. The number was used for
357 * the validation of references and a mismatch could lead to
358 * undefined references during the matching process. */
359 if (idx != tree_hdr->nmatches) {
360 err = -EINVAL;
361 goto errout_abort;
362 }
363
364 err = 0;
365errout:
366 return err;
367
368errout_abort:
369 tcf_em_tree_destroy(tp, tree);
370 return err;
371}
372
373/**
374 * tcf_em_tree_destroy - destroy an ematch tree
375 *
376 * @tp: classifier kind handle
377 * @tree: ematch tree to be deleted
378 *
379 * This functions destroys an ematch tree previously created by
380 * tcf_em_tree_validate()/tcf_em_tree_change(). You must ensure that
381 * the ematch tree is not in use before calling this function.
382 */
383void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *tree)
384{
385 int i;
386
387 if (tree->matches == NULL)
388 return;
389
390 for (i = 0; i < tree->hdr.nmatches; i++) {
391 struct tcf_ematch *em = tcf_em_get_match(tree, i);
392
393 if (em->ops) {
394 if (em->ops->destroy)
395 em->ops->destroy(tp, em);
396 else if (!tcf_em_is_simple(em) && em->data)
397 kfree((void *) em->data);
398 module_put(em->ops->owner);
399 }
400 }
401
402 tree->hdr.nmatches = 0;
403 kfree(tree->matches);
404}
405
406/**
407 * tcf_em_tree_dump - dump ematch tree into a rtnl message
408 *
409 * @skb: skb holding the rtnl message
410 * @t: ematch tree to be dumped
411 * @tlv: TLV type to be used to encapsulate the tree
412 *
 413 * This function dumps an ematch tree into an rtnl message. It is valid to
414 * call this function while the ematch tree is in use.
415 *
416 * Returns -1 if the skb tailroom is insufficient.
417 */
418int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv)
419{
420 int i;
421 struct rtattr * top_start = (struct rtattr*) skb->tail;
422 struct rtattr * list_start;
423
424 RTA_PUT(skb, tlv, 0, NULL);
425 RTA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr);
426
427 list_start = (struct rtattr *) skb->tail;
428 RTA_PUT(skb, TCA_EMATCH_TREE_LIST, 0, NULL);
429
430 for (i = 0; i < tree->hdr.nmatches; i++) {
431 struct rtattr *match_start = (struct rtattr*) skb->tail;
432 struct tcf_ematch *em = tcf_em_get_match(tree, i);
433 struct tcf_ematch_hdr em_hdr = {
434 .kind = em->ops ? em->ops->kind : TCF_EM_CONTAINER,
435 .matchid = em->matchid,
436 .flags = em->flags
437 };
438
439 RTA_PUT(skb, i+1, sizeof(em_hdr), &em_hdr);
440
441 if (em->ops && em->ops->dump) {
442 if (em->ops->dump(skb, em) < 0)
443 goto rtattr_failure;
444 } else if (tcf_em_is_container(em) || tcf_em_is_simple(em)) {
445 u32 u = em->data;
446 RTA_PUT_NOHDR(skb, sizeof(u), &u);
447 } else if (em->datalen > 0)
448 RTA_PUT_NOHDR(skb, em->datalen, (void *) em->data);
449
450 match_start->rta_len = skb->tail - (u8*) match_start;
451 }
452
453 list_start->rta_len = skb->tail - (u8 *) list_start;
454 top_start->rta_len = skb->tail - (u8 *) top_start;
455
456 return 0;
457
458rtattr_failure:
459 return -1;
460}
461
462static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em,
463 struct tcf_pkt_info *info)
464{
465 int r = em->ops->match(skb, em, info);
466 return tcf_em_is_inverted(em) ? !r : r;
467}
468
469/* Do not use this function directly, use tcf_em_tree_match instead */
470int __tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree,
471 struct tcf_pkt_info *info)
472{
473 int stackp = 0, match_idx = 0, res = 0;
474 struct tcf_ematch *cur_match;
475 int stack[CONFIG_NET_EMATCH_STACK];
476
477proceed:
478 while (match_idx < tree->hdr.nmatches) {
479 cur_match = tcf_em_get_match(tree, match_idx);
480
481 if (tcf_em_is_container(cur_match)) {
482 if (unlikely(stackp >= CONFIG_NET_EMATCH_STACK))
483 goto stack_overflow;
484
485 stack[stackp++] = match_idx;
486 match_idx = cur_match->data;
487 goto proceed;
488 }
489
490 res = tcf_em_match(skb, cur_match, info);
491
492 if (tcf_em_early_end(cur_match, res))
493 break;
494
495 match_idx++;
496 }
497
498pop_stack:
499 if (stackp > 0) {
500 match_idx = stack[--stackp];
501 cur_match = tcf_em_get_match(tree, match_idx);
502
503 if (tcf_em_early_end(cur_match, res))
504 goto pop_stack;
505 else {
506 match_idx++;
507 goto proceed;
508 }
509 }
510
511 return res;
512
513stack_overflow:
514 if (net_ratelimit())
515 printk("Local stack overflow, increase NET_EMATCH_STACK\n");
516 return -1;
517}
518
519EXPORT_SYMBOL(tcf_em_register);
520EXPORT_SYMBOL(tcf_em_unregister);
521EXPORT_SYMBOL(tcf_em_tree_validate);
522EXPORT_SYMBOL(tcf_em_tree_destroy);
523EXPORT_SYMBOL(tcf_em_tree_dump);
524EXPORT_SYMBOL(__tcf_em_tree_match);
diff --git a/net/sched/estimator.c b/net/sched/estimator.c
new file mode 100644
index 000000000000..5d3ae03e22a7
--- /dev/null
+++ b/net/sched/estimator.c
@@ -0,0 +1,197 @@
1/*
2 * net/sched/estimator.c Simple rate estimator.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 */
11
12#include <asm/uaccess.h>
13#include <asm/system.h>
14#include <linux/bitops.h>
15#include <linux/module.h>
16#include <linux/types.h>
17#include <linux/kernel.h>
18#include <linux/jiffies.h>
19#include <linux/string.h>
20#include <linux/mm.h>
21#include <linux/socket.h>
22#include <linux/sockios.h>
23#include <linux/in.h>
24#include <linux/errno.h>
25#include <linux/interrupt.h>
26#include <linux/netdevice.h>
27#include <linux/skbuff.h>
28#include <linux/rtnetlink.h>
29#include <linux/init.h>
30#include <net/sock.h>
31#include <net/pkt_sched.h>
32
33/*
34 This code is NOT intended to be used for statistics collection,
35 its purpose is to provide a base for statistical multiplexing
36 for controlled load service.
37 If you need only statistics, run a user level daemon which
38 periodically reads byte counters.
39
40 Unfortunately, rate estimation is not a very easy task.
41 F.e. I did not find a simple way to estimate the current peak rate
42 and even failed to formulate the problem 8)8)
43
 44 So I preferred not to build an estimator into the scheduler,
45 but run this task separately.
46 Ideally, it should be kernel thread(s), but for now it runs
 47 from timers, which puts an apparent upper bound on the number of rated
 48 flows, has minimal overhead when that number is small, but is enough
 49 to handle controlled load service and sets of aggregates.
50
51 We measure rate over A=(1<<interval) seconds and evaluate EWMA:
52
53 avrate = avrate*(1-W) + rate*W
54
55 where W is chosen as negative power of 2: W = 2^(-ewma_log)
56
57 The resulting time constant is:
58
59 T = A/(-ln(1-W))
60
61
62 NOTES.
63
64 * The stored value for avbps is scaled by 2^5, so that maximal
65 rate is ~1Gbit, avpps is scaled by 2^10.
66
67 * Minimal interval is HZ/4=250msec (it is the greatest common divisor
68 for HZ=100 and HZ=1024 8)), maximal interval
69 is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
70 are too expensive, longer ones can be implemented
71 at user level painlessly.
72 */
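/* A worked example of the scaling described above (values chosen purely
 * for illustration), assuming idx = 2, i.e. a 1 second measurement period,
 * and ewma_log = 1:
 *
 *   the timer fires every (HZ<<2)/4 jiffies = 1 s;
 *   1,000,000 bytes seen in that period give
 *       rate  = 1000000 << (7 - 2) = 32000000   (bytes/s scaled by 2^5)
 *   and the EWMA step
 *       avbps += (rate - avbps) >> 1
 *   moves avbps half way towards the new rate each period, so the reported
 *   value st->bps = (avbps + 0xF) >> 5 converges to ~1,000,000 bytes/s.
 */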
73
74#define EST_MAX_INTERVAL 5
75
76struct qdisc_estimator
77{
78 struct qdisc_estimator *next;
79 struct tc_stats *stats;
80 spinlock_t *stats_lock;
81 unsigned interval;
82 int ewma_log;
83 u64 last_bytes;
84 u32 last_packets;
85 u32 avpps;
86 u32 avbps;
87};
88
89struct qdisc_estimator_head
90{
91 struct timer_list timer;
92 struct qdisc_estimator *list;
93};
94
95static struct qdisc_estimator_head elist[EST_MAX_INTERVAL+1];
96
97/* Estimator array lock */
98static DEFINE_RWLOCK(est_lock);
99
100static void est_timer(unsigned long arg)
101{
102 int idx = (int)arg;
103 struct qdisc_estimator *e;
104
105 read_lock(&est_lock);
106 for (e = elist[idx].list; e; e = e->next) {
107 struct tc_stats *st = e->stats;
108 u64 nbytes;
109 u32 npackets;
110 u32 rate;
111
112 spin_lock(e->stats_lock);
113 nbytes = st->bytes;
114 npackets = st->packets;
115 rate = (nbytes - e->last_bytes)<<(7 - idx);
116 e->last_bytes = nbytes;
117 e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log;
118 st->bps = (e->avbps+0xF)>>5;
119
120 rate = (npackets - e->last_packets)<<(12 - idx);
121 e->last_packets = npackets;
122 e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
123 e->stats->pps = (e->avpps+0x1FF)>>10;
124 spin_unlock(e->stats_lock);
125 }
126
127 mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
128 read_unlock(&est_lock);
129}
130
131int qdisc_new_estimator(struct tc_stats *stats, spinlock_t *stats_lock, struct rtattr *opt)
132{
133 struct qdisc_estimator *est;
134 struct tc_estimator *parm = RTA_DATA(opt);
135
136 if (RTA_PAYLOAD(opt) < sizeof(*parm))
137 return -EINVAL;
138
139 if (parm->interval < -2 || parm->interval > 3)
140 return -EINVAL;
141
142 est = kmalloc(sizeof(*est), GFP_KERNEL);
143 if (est == NULL)
144 return -ENOBUFS;
145
146 memset(est, 0, sizeof(*est));
147 est->interval = parm->interval + 2;
148 est->stats = stats;
149 est->stats_lock = stats_lock;
150 est->ewma_log = parm->ewma_log;
151 est->last_bytes = stats->bytes;
152 est->avbps = stats->bps<<5;
153 est->last_packets = stats->packets;
154 est->avpps = stats->pps<<10;
155
156 est->next = elist[est->interval].list;
157 if (est->next == NULL) {
158 init_timer(&elist[est->interval].timer);
159 elist[est->interval].timer.data = est->interval;
160 elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4);
161 elist[est->interval].timer.function = est_timer;
162 add_timer(&elist[est->interval].timer);
163 }
164 write_lock_bh(&est_lock);
165 elist[est->interval].list = est;
166 write_unlock_bh(&est_lock);
167 return 0;
168}
169
170void qdisc_kill_estimator(struct tc_stats *stats)
171{
172 int idx;
173 struct qdisc_estimator *est, **pest;
174
175 for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
176 int killed = 0;
177 pest = &elist[idx].list;
178 while ((est=*pest) != NULL) {
179 if (est->stats != stats) {
180 pest = &est->next;
181 continue;
182 }
183
184 write_lock_bh(&est_lock);
185 *pest = est->next;
186 write_unlock_bh(&est_lock);
187
188 kfree(est);
189 killed++;
190 }
191 if (killed && elist[idx].list == NULL)
192 del_timer(&elist[idx].timer);
193 }
194}
195
196EXPORT_SYMBOL(qdisc_kill_estimator);
197EXPORT_SYMBOL(qdisc_new_estimator);
diff --git a/net/sched/gact.c b/net/sched/gact.c
new file mode 100644
index 000000000000..a811c89fef7f
--- /dev/null
+++ b/net/sched/gact.c
@@ -0,0 +1,231 @@
1/*
2 * net/sched/gact.c Generic actions
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * copyright Jamal Hadi Salim (2002-4)
10 *
11 */
12
13#include <asm/uaccess.h>
14#include <asm/system.h>
15#include <linux/bitops.h>
16#include <linux/config.h>
17#include <linux/types.h>
18#include <linux/kernel.h>
19#include <linux/sched.h>
20#include <linux/string.h>
21#include <linux/mm.h>
22#include <linux/socket.h>
23#include <linux/sockios.h>
24#include <linux/in.h>
25#include <linux/errno.h>
26#include <linux/interrupt.h>
27#include <linux/netdevice.h>
28#include <linux/skbuff.h>
29#include <linux/rtnetlink.h>
30#include <linux/module.h>
31#include <linux/init.h>
32#include <linux/proc_fs.h>
33#include <net/sock.h>
34#include <net/pkt_sched.h>
35#include <linux/tc_act/tc_gact.h>
36#include <net/tc_act/tc_gact.h>
37
38/* use generic hash table */
39#define MY_TAB_SIZE 16
40#define MY_TAB_MASK 15
41
42static u32 idx_gen;
43static struct tcf_gact *tcf_gact_ht[MY_TAB_SIZE];
44static DEFINE_RWLOCK(gact_lock);
45
 46/* override the defaults */
47#define tcf_st tcf_gact
48#define tc_st tc_gact
49#define tcf_t_lock gact_lock
50#define tcf_ht tcf_gact_ht
51
52#define CONFIG_NET_ACT_INIT 1
53#include <net/pkt_act.h>
54
55#ifdef CONFIG_GACT_PROB
56static int gact_net_rand(struct tcf_gact *p)
57{
58 if (net_random()%p->pval)
59 return p->action;
60 return p->paction;
61}
62
63static int gact_determ(struct tcf_gact *p)
64{
65 if (p->bstats.packets%p->pval)
66 return p->action;
67 return p->paction;
68}
69
70typedef int (*g_rand)(struct tcf_gact *p);
71static g_rand gact_rand[MAX_RAND]= { NULL, gact_net_rand, gact_determ };
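/* With the array above, p->ptype selects the probability mode: ptype 1
 * (gact_net_rand) returns the alternate action p->paction on roughly one
 * out of every p->pval packets at random, while ptype 2 (gact_determ)
 * returns it deterministically on every p->pval-th packet; all other
 * packets get the primary p->action. */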
72#endif
73
74static int tcf_gact_init(struct rtattr *rta, struct rtattr *est,
75 struct tc_action *a, int ovr, int bind)
76{
77 struct rtattr *tb[TCA_GACT_MAX];
78 struct tc_gact *parm;
79 struct tcf_gact *p;
80 int ret = 0;
81
82 if (rta == NULL || rtattr_parse_nested(tb, TCA_GACT_MAX, rta) < 0)
83 return -EINVAL;
84
85 if (tb[TCA_GACT_PARMS - 1] == NULL ||
86 RTA_PAYLOAD(tb[TCA_GACT_PARMS - 1]) < sizeof(*parm))
87 return -EINVAL;
88 parm = RTA_DATA(tb[TCA_GACT_PARMS - 1]);
89
90 if (tb[TCA_GACT_PROB-1] != NULL)
91#ifdef CONFIG_GACT_PROB
92 if (RTA_PAYLOAD(tb[TCA_GACT_PROB-1]) < sizeof(struct tc_gact_p))
93 return -EINVAL;
94#else
95 return -EOPNOTSUPP;
96#endif
97
98 p = tcf_hash_check(parm->index, a, ovr, bind);
99 if (p == NULL) {
100 p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind);
101 if (p == NULL)
102 return -ENOMEM;
103 ret = ACT_P_CREATED;
104 } else {
105 if (!ovr) {
106 tcf_hash_release(p, bind);
107 return -EEXIST;
108 }
109 }
110
111 spin_lock_bh(&p->lock);
112 p->action = parm->action;
113#ifdef CONFIG_GACT_PROB
114 if (tb[TCA_GACT_PROB-1] != NULL) {
115 struct tc_gact_p *p_parm = RTA_DATA(tb[TCA_GACT_PROB-1]);
116 p->paction = p_parm->paction;
117 p->pval = p_parm->pval;
118 p->ptype = p_parm->ptype;
119 }
120#endif
121 spin_unlock_bh(&p->lock);
122 if (ret == ACT_P_CREATED)
123 tcf_hash_insert(p);
124 return ret;
125}
126
127static int
128tcf_gact_cleanup(struct tc_action *a, int bind)
129{
130 struct tcf_gact *p = PRIV(a, gact);
131
132 if (p != NULL)
133 return tcf_hash_release(p, bind);
134 return 0;
135}
136
137static int
138tcf_gact(struct sk_buff **pskb, struct tc_action *a)
139{
140 struct tcf_gact *p = PRIV(a, gact);
141 struct sk_buff *skb = *pskb;
142 int action = TC_ACT_SHOT;
143
144 spin_lock(&p->lock);
145#ifdef CONFIG_GACT_PROB
146 if (p->ptype && gact_rand[p->ptype] != NULL)
147 action = gact_rand[p->ptype](p);
148 else
149 action = p->action;
150#else
151 action = p->action;
152#endif
153 p->bstats.bytes += skb->len;
154 p->bstats.packets++;
155 if (action == TC_ACT_SHOT)
156 p->qstats.drops++;
157 p->tm.lastuse = jiffies;
158 spin_unlock(&p->lock);
159
160 return action;
161}
162
163static int
164tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
165{
166 unsigned char *b = skb->tail;
167 struct tc_gact opt;
168 struct tcf_gact *p = PRIV(a, gact);
169 struct tcf_t t;
170
171 opt.index = p->index;
172 opt.refcnt = p->refcnt - ref;
173 opt.bindcnt = p->bindcnt - bind;
174 opt.action = p->action;
175 RTA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt);
176#ifdef CONFIG_GACT_PROB
177 if (p->ptype) {
178 struct tc_gact_p p_opt;
179 p_opt.paction = p->paction;
180 p_opt.pval = p->pval;
181 p_opt.ptype = p->ptype;
182 RTA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt);
183 }
184#endif
185 t.install = jiffies_to_clock_t(jiffies - p->tm.install);
186 t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse);
187 t.expires = jiffies_to_clock_t(p->tm.expires);
188 RTA_PUT(skb, TCA_GACT_TM, sizeof(t), &t);
189 return skb->len;
190
191 rtattr_failure:
192 skb_trim(skb, b - skb->data);
193 return -1;
194}
195
196static struct tc_action_ops act_gact_ops = {
197 .kind = "gact",
198 .type = TCA_ACT_GACT,
199 .capab = TCA_CAP_NONE,
200 .owner = THIS_MODULE,
201 .act = tcf_gact,
202 .dump = tcf_gact_dump,
203 .cleanup = tcf_gact_cleanup,
204 .lookup = tcf_hash_search,
205 .init = tcf_gact_init,
206 .walk = tcf_generic_walker
207};
208
209MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
210MODULE_DESCRIPTION("Generic Classifier actions");
211MODULE_LICENSE("GPL");
212
213static int __init
214gact_init_module(void)
215{
216#ifdef CONFIG_GACT_PROB
217 printk("GACT probability on\n");
218#else
219 printk("GACT probability NOT on\n");
220#endif
221 return tcf_register_action(&act_gact_ops);
222}
223
224static void __exit
225gact_cleanup_module(void)
226{
227 tcf_unregister_action(&act_gact_ops);
228}
229
230module_init(gact_init_module);
231module_exit(gact_cleanup_module);
diff --git a/net/sched/ipt.c b/net/sched/ipt.c
new file mode 100644
index 000000000000..b114d994d523
--- /dev/null
+++ b/net/sched/ipt.c
@@ -0,0 +1,326 @@
1/*
2 * net/sched/ipt.c iptables target interface
3 *
 4 * TODO: Add other tables. For now we only support the ipv4 table targets
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Copyright: Jamal Hadi Salim (2002-4)
12 */
13
14#include <asm/uaccess.h>
15#include <asm/system.h>
16#include <asm/bitops.h>
17#include <linux/config.h>
18#include <linux/types.h>
19#include <linux/kernel.h>
20#include <linux/sched.h>
21#include <linux/string.h>
22#include <linux/mm.h>
23#include <linux/socket.h>
24#include <linux/sockios.h>
25#include <linux/in.h>
26#include <linux/errno.h>
27#include <linux/interrupt.h>
28#include <linux/netdevice.h>
29#include <linux/skbuff.h>
30#include <linux/rtnetlink.h>
31#include <linux/module.h>
32#include <linux/init.h>
33#include <linux/proc_fs.h>
34#include <linux/kmod.h>
35#include <net/sock.h>
36#include <net/pkt_sched.h>
37#include <linux/tc_act/tc_ipt.h>
38#include <net/tc_act/tc_ipt.h>
39
40#include <linux/netfilter_ipv4/ip_tables.h>
41
42/* use generic hash table */
43#define MY_TAB_SIZE 16
44#define MY_TAB_MASK 15
45
46static u32 idx_gen;
47static struct tcf_ipt *tcf_ipt_ht[MY_TAB_SIZE];
48/* ipt hash table lock */
49static DEFINE_RWLOCK(ipt_lock);
50
 51/* override the defaults */
52#define tcf_st tcf_ipt
53#define tcf_t_lock ipt_lock
54#define tcf_ht tcf_ipt_ht
55
56#define CONFIG_NET_ACT_INIT
57#include <net/pkt_act.h>
58
59static int
60ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook)
61{
62 struct ipt_target *target;
63 int ret = 0;
64
65 target = ipt_find_target(t->u.user.name, t->u.user.revision);
66 if (!target)
67 return -ENOENT;
68
69 DPRINTK("ipt_init_target: found %s\n", target->name);
70 t->u.kernel.target = target;
71
72 if (t->u.kernel.target->checkentry
73 && !t->u.kernel.target->checkentry(table, NULL, t->data,
74 t->u.target_size - sizeof(*t),
75 hook)) {
76 DPRINTK("ipt_init_target: check failed for `%s'.\n",
77 t->u.kernel.target->name);
78 module_put(t->u.kernel.target->me);
79 ret = -EINVAL;
80 }
81
82 return ret;
83}
84
85static void
86ipt_destroy_target(struct ipt_entry_target *t)
87{
88 if (t->u.kernel.target->destroy)
89 t->u.kernel.target->destroy(t->data,
90 t->u.target_size - sizeof(*t));
91 module_put(t->u.kernel.target->me);
92}
93
94static int
95tcf_ipt_release(struct tcf_ipt *p, int bind)
96{
97 int ret = 0;
98 if (p) {
99 if (bind)
100 p->bindcnt--;
101 p->refcnt--;
102 if (p->bindcnt <= 0 && p->refcnt <= 0) {
103 ipt_destroy_target(p->t);
104 kfree(p->tname);
105 kfree(p->t);
106 tcf_hash_destroy(p);
107 ret = ACT_P_DELETED;
108 }
109 }
110 return ret;
111}
112
113static int
114tcf_ipt_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a,
115 int ovr, int bind)
116{
117 struct rtattr *tb[TCA_IPT_MAX];
118 struct tcf_ipt *p;
119 struct ipt_entry_target *td, *t;
120 char *tname;
121 int ret = 0, err;
122 u32 hook = 0;
123 u32 index = 0;
124
125 if (rta == NULL || rtattr_parse_nested(tb, TCA_IPT_MAX, rta) < 0)
126 return -EINVAL;
127
128 if (tb[TCA_IPT_HOOK-1] == NULL ||
129 RTA_PAYLOAD(tb[TCA_IPT_HOOK-1]) < sizeof(u32))
130 return -EINVAL;
131 if (tb[TCA_IPT_TARG-1] == NULL ||
132 RTA_PAYLOAD(tb[TCA_IPT_TARG-1]) < sizeof(*t))
133 return -EINVAL;
134 td = (struct ipt_entry_target *)RTA_DATA(tb[TCA_IPT_TARG-1]);
135 if (RTA_PAYLOAD(tb[TCA_IPT_TARG-1]) < td->u.target_size)
136 return -EINVAL;
137
138 if (tb[TCA_IPT_INDEX-1] != NULL &&
139 RTA_PAYLOAD(tb[TCA_IPT_INDEX-1]) >= sizeof(u32))
140 index = *(u32 *)RTA_DATA(tb[TCA_IPT_INDEX-1]);
141
142 p = tcf_hash_check(index, a, ovr, bind);
143 if (p == NULL) {
144 p = tcf_hash_create(index, est, a, sizeof(*p), ovr, bind);
145 if (p == NULL)
146 return -ENOMEM;
147 ret = ACT_P_CREATED;
148 } else {
149 if (!ovr) {
150 tcf_ipt_release(p, bind);
151 return -EEXIST;
152 }
153 }
154
155 hook = *(u32 *)RTA_DATA(tb[TCA_IPT_HOOK-1]);
156
157 err = -ENOMEM;
158 tname = kmalloc(IFNAMSIZ, GFP_KERNEL);
159 if (tname == NULL)
160 goto err1;
161 if (tb[TCA_IPT_TABLE - 1] == NULL ||
162 rtattr_strlcpy(tname, tb[TCA_IPT_TABLE-1], IFNAMSIZ) >= IFNAMSIZ)
163 strcpy(tname, "mangle");
164
165 t = kmalloc(td->u.target_size, GFP_KERNEL);
166 if (t == NULL)
167 goto err2;
168 memcpy(t, td, td->u.target_size);
169
170 if ((err = ipt_init_target(t, tname, hook)) < 0)
171 goto err3;
172
173 spin_lock_bh(&p->lock);
174 if (ret != ACT_P_CREATED) {
175 ipt_destroy_target(p->t);
176 kfree(p->tname);
177 kfree(p->t);
178 }
179 p->tname = tname;
180 p->t = t;
181 p->hook = hook;
182 spin_unlock_bh(&p->lock);
183 if (ret == ACT_P_CREATED)
184 tcf_hash_insert(p);
185 return ret;
186
187err3:
188 kfree(t);
189err2:
190 kfree(tname);
191err1:
192 kfree(p);
193 return err;
194}
195
196static int
197tcf_ipt_cleanup(struct tc_action *a, int bind)
198{
199 struct tcf_ipt *p = PRIV(a, ipt);
200 return tcf_ipt_release(p, bind);
201}
202
203static int
204tcf_ipt(struct sk_buff **pskb, struct tc_action *a)
205{
206 int ret = 0, result = 0;
207 struct tcf_ipt *p = PRIV(a, ipt);
208 struct sk_buff *skb = *pskb;
209
210 if (skb_cloned(skb)) {
211 if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
212 return TC_ACT_UNSPEC;
213 }
214
215 spin_lock(&p->lock);
216
217 p->tm.lastuse = jiffies;
218 p->bstats.bytes += skb->len;
219 p->bstats.packets++;
220
 221 /* yes, we have to worry about both the in and the out dev;
 222 worry about that later - danger - this API seems to have changed
 223 from earlier kernels */
224
225 ret = p->t->u.kernel.target->target(&skb, skb->dev, NULL,
226 p->hook, p->t->data, NULL);
227 switch (ret) {
228 case NF_ACCEPT:
229 result = TC_ACT_OK;
230 break;
231 case NF_DROP:
232 result = TC_ACT_SHOT;
233 p->qstats.drops++;
234 break;
235 case IPT_CONTINUE:
236 result = TC_ACT_PIPE;
237 break;
238 default:
239 if (net_ratelimit())
240 printk("Bogus netfilter code %d assume ACCEPT\n", ret);
241 result = TC_POLICE_OK;
242 break;
243 }
244 spin_unlock(&p->lock);
245 return result;
246
247}
248
249static int
250tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
251{
252 struct ipt_entry_target *t;
253 struct tcf_t tm;
254 struct tc_cnt c;
255 unsigned char *b = skb->tail;
256 struct tcf_ipt *p = PRIV(a, ipt);
257
258 /* for simple targets kernel size == user size
259 ** user name = target name
 260 ** to be foolproof you must not assume this
261 */
262
263 t = kmalloc(p->t->u.user.target_size, GFP_ATOMIC);
264 if (t == NULL)
265 goto rtattr_failure;
266
267 c.bindcnt = p->bindcnt - bind;
268 c.refcnt = p->refcnt - ref;
269 memcpy(t, p->t, p->t->u.user.target_size);
270 strcpy(t->u.user.name, p->t->u.kernel.target->name);
271
272 DPRINTK("\ttcf_ipt_dump tablename %s length %d\n", p->tname,
273 strlen(p->tname));
274 DPRINTK("\tdump target name %s size %d size user %d "
275 "data[0] %x data[1] %x\n", p->t->u.kernel.target->name,
276 p->t->u.target_size, p->t->u.user.target_size,
277 p->t->data[0], p->t->data[1]);
278 RTA_PUT(skb, TCA_IPT_TARG, p->t->u.user.target_size, t);
279 RTA_PUT(skb, TCA_IPT_INDEX, 4, &p->index);
280 RTA_PUT(skb, TCA_IPT_HOOK, 4, &p->hook);
281 RTA_PUT(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c);
282 RTA_PUT(skb, TCA_IPT_TABLE, IFNAMSIZ, p->tname);
283 tm.install = jiffies_to_clock_t(jiffies - p->tm.install);
284 tm.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse);
285 tm.expires = jiffies_to_clock_t(p->tm.expires);
286 RTA_PUT(skb, TCA_IPT_TM, sizeof (tm), &tm);
287 kfree(t);
288 return skb->len;
289
290 rtattr_failure:
291 skb_trim(skb, b - skb->data);
292 kfree(t);
293 return -1;
294}
295
296static struct tc_action_ops act_ipt_ops = {
297 .kind = "ipt",
298 .type = TCA_ACT_IPT,
299 .capab = TCA_CAP_NONE,
300 .owner = THIS_MODULE,
301 .act = tcf_ipt,
302 .dump = tcf_ipt_dump,
303 .cleanup = tcf_ipt_cleanup,
304 .lookup = tcf_hash_search,
305 .init = tcf_ipt_init,
306 .walk = tcf_generic_walker
307};
308
309MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
310MODULE_DESCRIPTION("Iptables target actions");
311MODULE_LICENSE("GPL");
312
313static int __init
314ipt_init_module(void)
315{
316 return tcf_register_action(&act_ipt_ops);
317}
318
319static void __exit
320ipt_cleanup_module(void)
321{
322 tcf_unregister_action(&act_ipt_ops);
323}
324
325module_init(ipt_init_module);
326module_exit(ipt_cleanup_module);
diff --git a/net/sched/mirred.c b/net/sched/mirred.c
new file mode 100644
index 000000000000..f309ce336803
--- /dev/null
+++ b/net/sched/mirred.c
@@ -0,0 +1,276 @@
1/*
2 * net/sched/mirred.c packet mirroring and redirect actions
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Jamal Hadi Salim (2002-4)
10 *
11 * TODO: Add ingress support (and socket redirect support)
12 *
13 */
14
15#include <asm/uaccess.h>
16#include <asm/system.h>
17#include <asm/bitops.h>
18#include <linux/config.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/sched.h>
22#include <linux/string.h>
23#include <linux/mm.h>
24#include <linux/socket.h>
25#include <linux/sockios.h>
26#include <linux/in.h>
27#include <linux/errno.h>
28#include <linux/interrupt.h>
29#include <linux/netdevice.h>
30#include <linux/skbuff.h>
31#include <linux/rtnetlink.h>
32#include <linux/module.h>
33#include <linux/init.h>
34#include <linux/proc_fs.h>
35#include <net/sock.h>
36#include <net/pkt_sched.h>
37#include <linux/tc_act/tc_mirred.h>
38#include <net/tc_act/tc_mirred.h>
39
40#include <linux/etherdevice.h>
41#include <linux/if_arp.h>
42
43
44/* use generic hash table */
45#define MY_TAB_SIZE 8
46#define MY_TAB_MASK (MY_TAB_SIZE - 1)
47static u32 idx_gen;
48static struct tcf_mirred *tcf_mirred_ht[MY_TAB_SIZE];
49static DEFINE_RWLOCK(mirred_lock);
50
 51/* override the defaults */
52#define tcf_st tcf_mirred
53#define tc_st tc_mirred
54#define tcf_t_lock mirred_lock
55#define tcf_ht tcf_mirred_ht
56
57#define CONFIG_NET_ACT_INIT 1
58#include <net/pkt_act.h>
59
60static inline int
61tcf_mirred_release(struct tcf_mirred *p, int bind)
62{
63 if (p) {
64 if (bind)
65 p->bindcnt--;
66 p->refcnt--;
 67 if (!p->bindcnt && p->refcnt <= 0) {
68 dev_put(p->dev);
69 tcf_hash_destroy(p);
70 return 1;
71 }
72 }
73 return 0;
74}
75
76static int
77tcf_mirred_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a,
78 int ovr, int bind)
79{
80 struct rtattr *tb[TCA_MIRRED_MAX];
81 struct tc_mirred *parm;
82 struct tcf_mirred *p;
83 struct net_device *dev = NULL;
84 int ret = 0;
85 int ok_push = 0;
86
87 if (rta == NULL || rtattr_parse_nested(tb, TCA_MIRRED_MAX, rta) < 0)
88 return -EINVAL;
89
90 if (tb[TCA_MIRRED_PARMS-1] == NULL ||
91 RTA_PAYLOAD(tb[TCA_MIRRED_PARMS-1]) < sizeof(*parm))
92 return -EINVAL;
93 parm = RTA_DATA(tb[TCA_MIRRED_PARMS-1]);
94
95 if (parm->ifindex) {
96 dev = __dev_get_by_index(parm->ifindex);
97 if (dev == NULL)
98 return -ENODEV;
99 switch (dev->type) {
100 case ARPHRD_TUNNEL:
101 case ARPHRD_TUNNEL6:
102 case ARPHRD_SIT:
103 case ARPHRD_IPGRE:
104 case ARPHRD_VOID:
105 case ARPHRD_NONE:
106 ok_push = 0;
107 break;
108 default:
109 ok_push = 1;
110 break;
111 }
112 }
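	/* ok_push records whether the target device type expects a
	 * link-layer header: for such devices tcf_mirred() below pushes the
	 * hardware header back onto clones of packets grabbed at ingress
	 * before transmitting them, whereas the tunnel-like and header-less
	 * types in the switch above are sent as-is. */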
113
114 p = tcf_hash_check(parm->index, a, ovr, bind);
115 if (p == NULL) {
116 if (!parm->ifindex)
117 return -EINVAL;
118 p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind);
119 if (p == NULL)
120 return -ENOMEM;
121 ret = ACT_P_CREATED;
122 } else {
123 if (!ovr) {
124 tcf_mirred_release(p, bind);
125 return -EEXIST;
126 }
127 }
128
129 spin_lock_bh(&p->lock);
130 p->action = parm->action;
131 p->eaction = parm->eaction;
132 if (parm->ifindex) {
133 p->ifindex = parm->ifindex;
134 if (ret != ACT_P_CREATED)
135 dev_put(p->dev);
136 p->dev = dev;
137 dev_hold(dev);
138 p->ok_push = ok_push;
139 }
140 spin_unlock_bh(&p->lock);
141 if (ret == ACT_P_CREATED)
142 tcf_hash_insert(p);
143
144 DPRINTK("tcf_mirred_init index %d action %d eaction %d device %s "
145 "ifindex %d\n", parm->index, parm->action, parm->eaction,
146 dev->name, parm->ifindex);
147 return ret;
148}
149
150static int
151tcf_mirred_cleanup(struct tc_action *a, int bind)
152{
153 struct tcf_mirred *p = PRIV(a, mirred);
154
155 if (p != NULL)
156 return tcf_mirred_release(p, bind);
157 return 0;
158}
159
160static int
161tcf_mirred(struct sk_buff **pskb, struct tc_action *a)
162{
163 struct tcf_mirred *p = PRIV(a, mirred);
164 struct net_device *dev;
165 struct sk_buff *skb2 = NULL;
166 struct sk_buff *skb = *pskb;
167 u32 at = G_TC_AT(skb->tc_verd);
168
169 spin_lock(&p->lock);
170
171 dev = p->dev;
172 p->tm.lastuse = jiffies;
173
174 if (!(dev->flags&IFF_UP) ) {
175 if (net_ratelimit())
176 printk("mirred to Houston: device %s is gone!\n",
177 dev->name);
178bad_mirred:
179 if (skb2 != NULL)
180 kfree_skb(skb2);
181 p->qstats.overlimits++;
182 p->bstats.bytes += skb->len;
183 p->bstats.packets++;
184 spin_unlock(&p->lock);
185 /* should we be asking for packet to be dropped?
186 * may make sense for redirect case only
187 */
188 return TC_ACT_SHOT;
189 }
190
191 skb2 = skb_clone(skb, GFP_ATOMIC);
192 if (skb2 == NULL)
193 goto bad_mirred;
194 if (p->eaction != TCA_EGRESS_MIRROR && p->eaction != TCA_EGRESS_REDIR) {
195 if (net_ratelimit())
196 printk("tcf_mirred unknown action %d\n", p->eaction);
197 goto bad_mirred;
198 }
199
200 p->bstats.bytes += skb2->len;
201 p->bstats.packets++;
202 if (!(at & AT_EGRESS))
203 if (p->ok_push)
204 skb_push(skb2, skb2->dev->hard_header_len);
205
206 /* mirror is always swallowed */
207 if (p->eaction != TCA_EGRESS_MIRROR)
208 skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at);
209
210 skb2->dev = dev;
211 skb2->input_dev = skb->dev;
212 dev_queue_xmit(skb2);
213 spin_unlock(&p->lock);
214 return p->action;
215}
216
217static int
218tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
219{
220 unsigned char *b = skb->tail;
221 struct tc_mirred opt;
222 struct tcf_mirred *p = PRIV(a, mirred);
223 struct tcf_t t;
224
225 opt.index = p->index;
226 opt.action = p->action;
227 opt.refcnt = p->refcnt - ref;
228 opt.bindcnt = p->bindcnt - bind;
229 opt.eaction = p->eaction;
230 opt.ifindex = p->ifindex;
231 DPRINTK("tcf_mirred_dump index %d action %d eaction %d ifindex %d\n",
232 p->index, p->action, p->eaction, p->ifindex);
233 RTA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt);
234 t.install = jiffies_to_clock_t(jiffies - p->tm.install);
235 t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse);
236 t.expires = jiffies_to_clock_t(p->tm.expires);
237 RTA_PUT(skb, TCA_MIRRED_TM, sizeof(t), &t);
238 return skb->len;
239
240 rtattr_failure:
241 skb_trim(skb, b - skb->data);
242 return -1;
243}
244
245static struct tc_action_ops act_mirred_ops = {
246 .kind = "mirred",
247 .type = TCA_ACT_MIRRED,
248 .capab = TCA_CAP_NONE,
249 .owner = THIS_MODULE,
250 .act = tcf_mirred,
251 .dump = tcf_mirred_dump,
252 .cleanup = tcf_mirred_cleanup,
253 .lookup = tcf_hash_search,
254 .init = tcf_mirred_init,
255 .walk = tcf_generic_walker
256};
257
258MODULE_AUTHOR("Jamal Hadi Salim(2002)");
259MODULE_DESCRIPTION("Device Mirror/redirect actions");
260MODULE_LICENSE("GPL");
261
262static int __init
263mirred_init_module(void)
264{
265 printk("Mirror/redirect action on\n");
266 return tcf_register_action(&act_mirred_ops);
267}
268
269static void __exit
270mirred_cleanup_module(void)
271{
272 tcf_unregister_action(&act_mirred_ops);
273}
274
275module_init(mirred_init_module);
276module_exit(mirred_cleanup_module);
diff --git a/net/sched/pedit.c b/net/sched/pedit.c
new file mode 100644
index 000000000000..678be6a645fb
--- /dev/null
+++ b/net/sched/pedit.c
@@ -0,0 +1,288 @@
1/*
2 * net/sched/pedit.c Generic packet editor
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Jamal Hadi Salim (2002-4)
10 */
11
12#include <asm/uaccess.h>
13#include <asm/system.h>
14#include <asm/bitops.h>
15#include <linux/config.h>
16#include <linux/types.h>
17#include <linux/kernel.h>
18#include <linux/sched.h>
19#include <linux/string.h>
20#include <linux/mm.h>
21#include <linux/socket.h>
22#include <linux/sockios.h>
23#include <linux/in.h>
24#include <linux/errno.h>
25#include <linux/interrupt.h>
26#include <linux/netdevice.h>
27#include <linux/skbuff.h>
28#include <linux/rtnetlink.h>
29#include <linux/module.h>
30#include <linux/init.h>
31#include <linux/proc_fs.h>
32#include <net/sock.h>
33#include <net/pkt_sched.h>
34#include <linux/tc_act/tc_pedit.h>
35#include <net/tc_act/tc_pedit.h>
36
37
38#define PEDIT_DEB 1
39
40/* use generic hash table */
41#define MY_TAB_SIZE 16
42#define MY_TAB_MASK 15
43static u32 idx_gen;
44static struct tcf_pedit *tcf_pedit_ht[MY_TAB_SIZE];
45static DEFINE_RWLOCK(pedit_lock);
46
47#define tcf_st tcf_pedit
48#define tc_st tc_pedit
49#define tcf_t_lock pedit_lock
50#define tcf_ht tcf_pedit_ht
51
52#define CONFIG_NET_ACT_INIT 1
53#include <net/pkt_act.h>
54
55static int
56tcf_pedit_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a,
57 int ovr, int bind)
58{
59 struct rtattr *tb[TCA_PEDIT_MAX];
60 struct tc_pedit *parm;
61 int ret = 0;
62 struct tcf_pedit *p;
63 struct tc_pedit_key *keys = NULL;
64 int ksize;
65
66 if (rta == NULL || rtattr_parse_nested(tb, TCA_PEDIT_MAX, rta) < 0)
67 return -EINVAL;
68
69 if (tb[TCA_PEDIT_PARMS - 1] == NULL ||
70 RTA_PAYLOAD(tb[TCA_PEDIT_PARMS-1]) < sizeof(*parm))
71 return -EINVAL;
72 parm = RTA_DATA(tb[TCA_PEDIT_PARMS-1]);
73 ksize = parm->nkeys * sizeof(struct tc_pedit_key);
74 if (RTA_PAYLOAD(tb[TCA_PEDIT_PARMS-1]) < sizeof(*parm) + ksize)
75 return -EINVAL;
76
77 p = tcf_hash_check(parm->index, a, ovr, bind);
78 if (p == NULL) {
79 if (!parm->nkeys)
80 return -EINVAL;
81 p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind);
82 if (p == NULL)
83 return -ENOMEM;
84 keys = kmalloc(ksize, GFP_KERNEL);
85 if (keys == NULL) {
86 kfree(p);
87 return -ENOMEM;
88 }
89 ret = ACT_P_CREATED;
90 } else {
91 if (!ovr) {
92 tcf_hash_release(p, bind);
93 return -EEXIST;
94 }
95 if (p->nkeys && p->nkeys != parm->nkeys) {
96 keys = kmalloc(ksize, GFP_KERNEL);
97 if (keys == NULL)
98 return -ENOMEM;
99 }
100 }
101
102 spin_lock_bh(&p->lock);
103 p->flags = parm->flags;
104 p->action = parm->action;
105 if (keys) {
106 kfree(p->keys);
107 p->keys = keys;
108 p->nkeys = parm->nkeys;
109 }
110 memcpy(p->keys, parm->keys, ksize);
111 spin_unlock_bh(&p->lock);
112 if (ret == ACT_P_CREATED)
113 tcf_hash_insert(p);
114 return ret;
115}
116
117static int
118tcf_pedit_cleanup(struct tc_action *a, int bind)
119{
120 struct tcf_pedit *p = PRIV(a, pedit);
121
122 if (p != NULL) {
123 struct tc_pedit_key *keys = p->keys;
124 if (tcf_hash_release(p, bind)) {
125 kfree(keys);
126 return 1;
127 }
128 }
129 return 0;
130}
131
132static int
133tcf_pedit(struct sk_buff **pskb, struct tc_action *a)
134{
135 struct tcf_pedit *p = PRIV(a, pedit);
136 struct sk_buff *skb = *pskb;
137 int i, munged = 0;
138 u8 *pptr;
139
140 if (!(skb->tc_verd & TC_OK2MUNGE)) {
141 /* should we set skb->cloned? */
142 if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
143 return p->action;
144 }
145 }
146
147 pptr = skb->nh.raw;
148
149 spin_lock(&p->lock);
150
151 p->tm.lastuse = jiffies;
152
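	/* Each key rewrites one aligned 32-bit word at pptr + offset using
	 *     *ptr = (*ptr & mask) ^ val
	 * so mask selects the bits to keep and val supplies the bits to set.
	 * For instance (hypothetical values), mask 0xffffff00 with
	 * val 0x0000002a preserves the upper 24 bits of the word and replaces
	 * its low 8 bits with 0x2a; mask 0 with val x overwrites the whole
	 * word with x. */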
153 if (p->nkeys > 0) {
154 struct tc_pedit_key *tkey = p->keys;
155
156 for (i = p->nkeys; i > 0; i--, tkey++) {
157 u32 *ptr;
158 int offset = tkey->off;
159
160 if (tkey->offmask) {
161 if (skb->len > tkey->at) {
162 char *j = pptr + tkey->at;
163 offset += ((*j & tkey->offmask) >>
164 tkey->shift);
165 } else {
166 goto bad;
167 }
168 }
169
170 if (offset % 4) {
171 printk("offset must be on 32 bit boundaries\n");
172 goto bad;
173 }
174 if (skb->len < 0 || (offset > 0 && offset > skb->len)) {
 175 printk("offset %d can't exceed pkt length %d\n",
176 offset, skb->len);
177 goto bad;
178 }
179
180 ptr = (u32 *)(pptr+offset);
181 /* just do it, baby */
182 *ptr = ((*ptr & tkey->mask) ^ tkey->val);
183 munged++;
184 }
185
186 if (munged)
187 skb->tc_verd = SET_TC_MUNGED(skb->tc_verd);
188 goto done;
189 } else {
 190 printk("pedit BUG: index %d\n", p->index);
191 }
192
193bad:
194 p->qstats.overlimits++;
195done:
196 p->bstats.bytes += skb->len;
197 p->bstats.packets++;
198 spin_unlock(&p->lock);
199 return p->action;
200}
201
202static int
 203tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
204{
205 unsigned char *b = skb->tail;
206 struct tc_pedit *opt;
207 struct tcf_pedit *p = PRIV(a, pedit);
208 struct tcf_t t;
209 int s;
210
211 s = sizeof(*opt) + p->nkeys * sizeof(struct tc_pedit_key);
212
213 /* netlink spinlocks held above us - must use ATOMIC */
214 opt = kmalloc(s, GFP_ATOMIC);
215 if (opt == NULL)
216 return -ENOBUFS;
217 memset(opt, 0, s);
218
219 memcpy(opt->keys, p->keys, p->nkeys * sizeof(struct tc_pedit_key));
220 opt->index = p->index;
221 opt->nkeys = p->nkeys;
222 opt->flags = p->flags;
223 opt->action = p->action;
224 opt->refcnt = p->refcnt - ref;
225 opt->bindcnt = p->bindcnt - bind;
226
227
228#ifdef PEDIT_DEB
229 {
230 /* Debug - get rid of later */
231 int i;
232 struct tc_pedit_key *key = opt->keys;
233
234 for (i=0; i<opt->nkeys; i++, key++) {
235 printk( "\n key #%d",i);
236 printk( " at %d: val %08x mask %08x",
237 (unsigned int)key->off,
238 (unsigned int)key->val,
239 (unsigned int)key->mask);
240 }
241 }
242#endif
243
244 RTA_PUT(skb, TCA_PEDIT_PARMS, s, opt);
245 t.install = jiffies_to_clock_t(jiffies - p->tm.install);
246 t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse);
247 t.expires = jiffies_to_clock_t(p->tm.expires);
248 RTA_PUT(skb, TCA_PEDIT_TM, sizeof(t), &t);
249 return skb->len;
250
251rtattr_failure:
252 skb_trim(skb, b - skb->data);
253 return -1;
254}
255
256static
257struct tc_action_ops act_pedit_ops = {
258 .kind = "pedit",
259 .type = TCA_ACT_PEDIT,
260 .capab = TCA_CAP_NONE,
261 .owner = THIS_MODULE,
262 .act = tcf_pedit,
263 .dump = tcf_pedit_dump,
264 .cleanup = tcf_pedit_cleanup,
265 .lookup = tcf_hash_search,
266 .init = tcf_pedit_init,
267 .walk = tcf_generic_walker
268};
269
270MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
271MODULE_DESCRIPTION("Generic Packet Editor actions");
272MODULE_LICENSE("GPL");
273
274static int __init
275pedit_init_module(void)
276{
277 return tcf_register_action(&act_pedit_ops);
278}
279
280static void __exit
281pedit_cleanup_module(void)
282{
283 tcf_unregister_action(&act_pedit_ops);
284}
285
286module_init(pedit_init_module);
287module_exit(pedit_cleanup_module);
288
diff --git a/net/sched/police.c b/net/sched/police.c
new file mode 100644
index 000000000000..c03545faf523
--- /dev/null
+++ b/net/sched/police.c
@@ -0,0 +1,612 @@
1/*
2 * net/sched/police.c Input police filter.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 * J Hadi Salim (action changes)
11 */
12
13#include <asm/uaccess.h>
14#include <asm/system.h>
15#include <linux/bitops.h>
16#include <linux/config.h>
17#include <linux/module.h>
18#include <linux/types.h>
19#include <linux/kernel.h>
20#include <linux/sched.h>
21#include <linux/string.h>
22#include <linux/mm.h>
23#include <linux/socket.h>
24#include <linux/sockios.h>
25#include <linux/in.h>
26#include <linux/errno.h>
27#include <linux/interrupt.h>
28#include <linux/netdevice.h>
29#include <linux/skbuff.h>
30#include <linux/module.h>
31#include <linux/rtnetlink.h>
32#include <linux/init.h>
33#include <net/sock.h>
34#include <net/act_api.h>
35
36#define L2T(p,L) ((p)->R_tab->data[(L)>>(p)->R_tab->rate.cell_log])
37#define L2T_P(p,L) ((p)->P_tab->data[(L)>>(p)->P_tab->rate.cell_log])
38#define PRIV(a) ((struct tcf_police *) (a)->priv)
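/* Roughly speaking, L2T()/L2T_P() look up, in the rate tables supplied by
 * userspace, the time needed to transmit L bytes at the configured rate
 * and peak rate respectively (the tables are indexed by packet size in
 * cells of 2^cell_log bytes).  tcf_act_police()/tcf_police() below run a
 * classic token bucket on top of this: the time elapsed since t_c is added
 * to the token counters, capped at burst (and, for the peak bucket, at the
 * time to send one mtu at peak rate), the cost of the current packet is
 * subtracted, and the test (toks|ptoks) >= 0 passes only when both
 * counters remain non-negative. */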
39
40/* use generic hash table */
41#define MY_TAB_SIZE 16
42#define MY_TAB_MASK 15
43static u32 idx_gen;
44static struct tcf_police *tcf_police_ht[MY_TAB_SIZE];
45/* Policer hash table lock */
46static DEFINE_RWLOCK(police_lock);
47
48/* Each policer is serialized by its individual spinlock */
49
50static __inline__ unsigned tcf_police_hash(u32 index)
51{
52 return index&0xF;
53}
54
55static __inline__ struct tcf_police * tcf_police_lookup(u32 index)
56{
57 struct tcf_police *p;
58
59 read_lock(&police_lock);
60 for (p = tcf_police_ht[tcf_police_hash(index)]; p; p = p->next) {
61 if (p->index == index)
62 break;
63 }
64 read_unlock(&police_lock);
65 return p;
66}
67
68#ifdef CONFIG_NET_CLS_ACT
69static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb,
70 int type, struct tc_action *a)
71{
72 struct tcf_police *p;
73 int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
74 struct rtattr *r;
75
76 read_lock(&police_lock);
77
78 s_i = cb->args[0];
79
80 for (i = 0; i < MY_TAB_SIZE; i++) {
81 p = tcf_police_ht[tcf_police_hash(i)];
82
83 for (; p; p = p->next) {
84 index++;
85 if (index < s_i)
86 continue;
87 a->priv = p;
88 a->order = index;
89 r = (struct rtattr*) skb->tail;
90 RTA_PUT(skb, a->order, 0, NULL);
91 if (type == RTM_DELACTION)
92 err = tcf_action_dump_1(skb, a, 0, 1);
93 else
94 err = tcf_action_dump_1(skb, a, 0, 0);
95 if (err < 0) {
96 index--;
97 skb_trim(skb, (u8*)r - skb->data);
98 goto done;
99 }
100 r->rta_len = skb->tail - (u8*)r;
101 n_i++;
102 }
103 }
104done:
105 read_unlock(&police_lock);
106 if (n_i)
107 cb->args[0] += n_i;
108 return n_i;
109
110rtattr_failure:
111 skb_trim(skb, (u8*)r - skb->data);
112 goto done;
113}
114
115static inline int
116tcf_hash_search(struct tc_action *a, u32 index)
117{
118 struct tcf_police *p = tcf_police_lookup(index);
119
120 if (p != NULL) {
121 a->priv = p;
122 return 1;
123 } else {
124 return 0;
125 }
126}
127#endif
128
129static inline u32 tcf_police_new_index(void)
130{
131 do {
132 if (++idx_gen == 0)
133 idx_gen = 1;
134 } while (tcf_police_lookup(idx_gen));
135
136 return idx_gen;
137}
138
139void tcf_police_destroy(struct tcf_police *p)
140{
141 unsigned h = tcf_police_hash(p->index);
142 struct tcf_police **p1p;
143
144 for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->next) {
145 if (*p1p == p) {
146 write_lock_bh(&police_lock);
147 *p1p = p->next;
148 write_unlock_bh(&police_lock);
149#ifdef CONFIG_NET_ESTIMATOR
150 gen_kill_estimator(&p->bstats, &p->rate_est);
151#endif
152 if (p->R_tab)
153 qdisc_put_rtab(p->R_tab);
154 if (p->P_tab)
155 qdisc_put_rtab(p->P_tab);
156 kfree(p);
157 return;
158 }
159 }
160 BUG_TRAP(0);
161}
162
163#ifdef CONFIG_NET_CLS_ACT
164static int tcf_act_police_locate(struct rtattr *rta, struct rtattr *est,
165 struct tc_action *a, int ovr, int bind)
166{
167 unsigned h;
168 int ret = 0, err;
169 struct rtattr *tb[TCA_POLICE_MAX];
170 struct tc_police *parm;
171 struct tcf_police *p;
172 struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
173
174 if (rta == NULL || rtattr_parse_nested(tb, TCA_POLICE_MAX, rta) < 0)
175 return -EINVAL;
176
177 if (tb[TCA_POLICE_TBF-1] == NULL ||
178 RTA_PAYLOAD(tb[TCA_POLICE_TBF-1]) != sizeof(*parm))
179 return -EINVAL;
180 parm = RTA_DATA(tb[TCA_POLICE_TBF-1]);
181
182 if (tb[TCA_POLICE_RESULT-1] != NULL &&
183 RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32))
184 return -EINVAL;
188
189 if (parm->index && (p = tcf_police_lookup(parm->index)) != NULL) {
190 a->priv = p;
191 if (bind) {
192 p->bindcnt += 1;
193 p->refcnt += 1;
194 }
195 if (ovr)
196 goto override;
197 return ret;
198 }
199
200 p = kmalloc(sizeof(*p), GFP_KERNEL);
201 if (p == NULL)
202 return -ENOMEM;
203 memset(p, 0, sizeof(*p));
204
205 ret = ACT_P_CREATED;
206 p->refcnt = 1;
207 spin_lock_init(&p->lock);
208 p->stats_lock = &p->lock;
209 if (bind)
210 p->bindcnt = 1;
211override:
212 if (parm->rate.rate) {
213 err = -ENOMEM;
214 R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1]);
215 if (R_tab == NULL)
216 goto failure;
217 if (parm->peakrate.rate) {
218 P_tab = qdisc_get_rtab(&parm->peakrate,
219 tb[TCA_POLICE_PEAKRATE-1]);
 220 if (P_tab == NULL) {
221 qdisc_put_rtab(R_tab);
222 goto failure;
223 }
224 }
225 }
226 /* No failure allowed after this point */
227 spin_lock_bh(&p->lock);
228 if (R_tab != NULL) {
229 qdisc_put_rtab(p->R_tab);
230 p->R_tab = R_tab;
231 }
232 if (P_tab != NULL) {
233 qdisc_put_rtab(p->P_tab);
234 p->P_tab = P_tab;
235 }
236
237 if (tb[TCA_POLICE_RESULT-1])
238 p->result = *(u32*)RTA_DATA(tb[TCA_POLICE_RESULT-1]);
239 p->toks = p->burst = parm->burst;
240 p->mtu = parm->mtu;
241 if (p->mtu == 0) {
242 p->mtu = ~0;
243 if (p->R_tab)
244 p->mtu = 255<<p->R_tab->rate.cell_log;
245 }
246 if (p->P_tab)
247 p->ptoks = L2T_P(p, p->mtu);
248 p->action = parm->action;
249
250#ifdef CONFIG_NET_ESTIMATOR
251 if (tb[TCA_POLICE_AVRATE-1])
252 p->ewma_rate = *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]);
253 if (est)
254 gen_replace_estimator(&p->bstats, &p->rate_est, p->stats_lock, est);
255#endif
256
257 spin_unlock_bh(&p->lock);
258 if (ret != ACT_P_CREATED)
259 return ret;
260
261 PSCHED_GET_TIME(p->t_c);
262 p->index = parm->index ? : tcf_police_new_index();
263 h = tcf_police_hash(p->index);
264 write_lock_bh(&police_lock);
265 p->next = tcf_police_ht[h];
266 tcf_police_ht[h] = p;
267 write_unlock_bh(&police_lock);
268
269 a->priv = p;
270 return ret;
271
272failure:
273 if (ret == ACT_P_CREATED)
274 kfree(p);
275 return err;
276}
277
278static int tcf_act_police_cleanup(struct tc_action *a, int bind)
279{
280 struct tcf_police *p = PRIV(a);
281
282 if (p != NULL)
283 return tcf_police_release(p, bind);
284 return 0;
285}
286
287static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a)
288{
289 psched_time_t now;
290 struct sk_buff *skb = *pskb;
291 struct tcf_police *p = PRIV(a);
292 long toks;
293 long ptoks = 0;
294
295 spin_lock(&p->lock);
296
297 p->bstats.bytes += skb->len;
298 p->bstats.packets++;
299
300#ifdef CONFIG_NET_ESTIMATOR
301 if (p->ewma_rate && p->rate_est.bps >= p->ewma_rate) {
302 p->qstats.overlimits++;
303 spin_unlock(&p->lock);
304 return p->action;
305 }
306#endif
307
308 if (skb->len <= p->mtu) {
309 if (p->R_tab == NULL) {
310 spin_unlock(&p->lock);
311 return p->result;
312 }
313
314 PSCHED_GET_TIME(now);
315
316 toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst);
317
318 if (p->P_tab) {
319 ptoks = toks + p->ptoks;
320 if (ptoks > (long)L2T_P(p, p->mtu))
321 ptoks = (long)L2T_P(p, p->mtu);
322 ptoks -= L2T_P(p, skb->len);
323 }
324 toks += p->toks;
325 if (toks > (long)p->burst)
326 toks = p->burst;
327 toks -= L2T(p, skb->len);
328
329 if ((toks|ptoks) >= 0) {
330 p->t_c = now;
331 p->toks = toks;
332 p->ptoks = ptoks;
333 spin_unlock(&p->lock);
334 return p->result;
335 }
336 }
337
338 p->qstats.overlimits++;
339 spin_unlock(&p->lock);
340 return p->action;
341}
342
343static int
344tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
345{
346 unsigned char *b = skb->tail;
347 struct tc_police opt;
348 struct tcf_police *p = PRIV(a);
349
350 opt.index = p->index;
351 opt.action = p->action;
352 opt.mtu = p->mtu;
353 opt.burst = p->burst;
354 opt.refcnt = p->refcnt - ref;
355 opt.bindcnt = p->bindcnt - bind;
356 if (p->R_tab)
357 opt.rate = p->R_tab->rate;
358 else
359 memset(&opt.rate, 0, sizeof(opt.rate));
360 if (p->P_tab)
361 opt.peakrate = p->P_tab->rate;
362 else
363 memset(&opt.peakrate, 0, sizeof(opt.peakrate));
364 RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt);
365 if (p->result)
366 RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), &p->result);
367#ifdef CONFIG_NET_ESTIMATOR
368 if (p->ewma_rate)
369 RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &p->ewma_rate);
370#endif
371 return skb->len;
372
373rtattr_failure:
374 skb_trim(skb, b - skb->data);
375 return -1;
376}
377
378MODULE_AUTHOR("Alexey Kuznetsov");
379MODULE_DESCRIPTION("Policing actions");
380MODULE_LICENSE("GPL");
381
382static struct tc_action_ops act_police_ops = {
383 .kind = "police",
384 .type = TCA_ID_POLICE,
385 .capab = TCA_CAP_NONE,
386 .owner = THIS_MODULE,
387 .act = tcf_act_police,
388 .dump = tcf_act_police_dump,
389 .cleanup = tcf_act_police_cleanup,
390 .lookup = tcf_hash_search,
391 .init = tcf_act_police_locate,
392 .walk = tcf_generic_walker
393};
394
395static int __init
396police_init_module(void)
397{
398 return tcf_register_action(&act_police_ops);
399}
400
401static void __exit
402police_cleanup_module(void)
403{
404 tcf_unregister_action(&act_police_ops);
405}
406
407module_init(police_init_module);
408module_exit(police_cleanup_module);
409
410#endif
411
412struct tcf_police * tcf_police_locate(struct rtattr *rta, struct rtattr *est)
413{
414 unsigned h;
415 struct tcf_police *p;
416 struct rtattr *tb[TCA_POLICE_MAX];
417 struct tc_police *parm;
418
419 if (rtattr_parse_nested(tb, TCA_POLICE_MAX, rta) < 0)
420 return NULL;
421
422 if (tb[TCA_POLICE_TBF-1] == NULL ||
423 RTA_PAYLOAD(tb[TCA_POLICE_TBF-1]) != sizeof(*parm))
424 return NULL;
425
426 parm = RTA_DATA(tb[TCA_POLICE_TBF-1]);
427
428 if (parm->index && (p = tcf_police_lookup(parm->index)) != NULL) {
429 p->refcnt++;
430 return p;
431 }
432
433 p = kmalloc(sizeof(*p), GFP_KERNEL);
434 if (p == NULL)
435 return NULL;
436
437 memset(p, 0, sizeof(*p));
438 p->refcnt = 1;
439 spin_lock_init(&p->lock);
440 p->stats_lock = &p->lock;
441 if (parm->rate.rate) {
442 p->R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1]);
443 if (p->R_tab == NULL)
444 goto failure;
445 if (parm->peakrate.rate) {
446 p->P_tab = qdisc_get_rtab(&parm->peakrate,
447 tb[TCA_POLICE_PEAKRATE-1]);
448 if (p->P_tab == NULL)
449 goto failure;
450 }
451 }
452 if (tb[TCA_POLICE_RESULT-1]) {
453 if (RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32))
454 goto failure;
455 p->result = *(u32*)RTA_DATA(tb[TCA_POLICE_RESULT-1]);
456 }
457#ifdef CONFIG_NET_ESTIMATOR
458 if (tb[TCA_POLICE_AVRATE-1]) {
459 if (RTA_PAYLOAD(tb[TCA_POLICE_AVRATE-1]) != sizeof(u32))
460 goto failure;
461 p->ewma_rate = *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]);
462 }
463#endif
464 p->toks = p->burst = parm->burst;
465 p->mtu = parm->mtu;
466 if (p->mtu == 0) {
467 p->mtu = ~0;
468 if (p->R_tab)
469 p->mtu = 255<<p->R_tab->rate.cell_log;
470 }
471 if (p->P_tab)
472 p->ptoks = L2T_P(p, p->mtu);
473 PSCHED_GET_TIME(p->t_c);
474 p->index = parm->index ? : tcf_police_new_index();
475 p->action = parm->action;
476#ifdef CONFIG_NET_ESTIMATOR
477 if (est)
478 gen_new_estimator(&p->bstats, &p->rate_est, p->stats_lock, est);
479#endif
480 h = tcf_police_hash(p->index);
481 write_lock_bh(&police_lock);
482 p->next = tcf_police_ht[h];
483 tcf_police_ht[h] = p;
484 write_unlock_bh(&police_lock);
485 return p;
486
487failure:
488 if (p->R_tab)
489 qdisc_put_rtab(p->R_tab);
490 kfree(p);
491 return NULL;
492}
493
494int tcf_police(struct sk_buff *skb, struct tcf_police *p)
495{
496 psched_time_t now;
497 long toks;
498 long ptoks = 0;
499
500 spin_lock(&p->lock);
501
502 p->bstats.bytes += skb->len;
503 p->bstats.packets++;
504
505#ifdef CONFIG_NET_ESTIMATOR
506 if (p->ewma_rate && p->rate_est.bps >= p->ewma_rate) {
507 p->qstats.overlimits++;
508 spin_unlock(&p->lock);
509 return p->action;
510 }
511#endif
512
513 if (skb->len <= p->mtu) {
514 if (p->R_tab == NULL) {
515 spin_unlock(&p->lock);
516 return p->result;
517 }
518
519 PSCHED_GET_TIME(now);
520
521 toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst);
522
523 if (p->P_tab) {
524 ptoks = toks + p->ptoks;
525 if (ptoks > (long)L2T_P(p, p->mtu))
526 ptoks = (long)L2T_P(p, p->mtu);
527 ptoks -= L2T_P(p, skb->len);
528 }
529 toks += p->toks;
530 if (toks > (long)p->burst)
531 toks = p->burst;
532 toks -= L2T(p, skb->len);
533
534 if ((toks|ptoks) >= 0) {
535 p->t_c = now;
536 p->toks = toks;
537 p->ptoks = ptoks;
538 spin_unlock(&p->lock);
539 return p->result;
540 }
541 }
542
543 p->qstats.overlimits++;
544 spin_unlock(&p->lock);
545 return p->action;
546}
547
548int tcf_police_dump(struct sk_buff *skb, struct tcf_police *p)
549{
550 unsigned char *b = skb->tail;
551 struct tc_police opt;
552
553 opt.index = p->index;
554 opt.action = p->action;
555 opt.mtu = p->mtu;
556 opt.burst = p->burst;
557 if (p->R_tab)
558 opt.rate = p->R_tab->rate;
559 else
560 memset(&opt.rate, 0, sizeof(opt.rate));
561 if (p->P_tab)
562 opt.peakrate = p->P_tab->rate;
563 else
564 memset(&opt.peakrate, 0, sizeof(opt.peakrate));
565 RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt);
566 if (p->result)
567 RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), &p->result);
568#ifdef CONFIG_NET_ESTIMATOR
569 if (p->ewma_rate)
570 RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &p->ewma_rate);
571#endif
572 return skb->len;
573
574rtattr_failure:
575 skb_trim(skb, b - skb->data);
576 return -1;
577}
578
579int tcf_police_dump_stats(struct sk_buff *skb, struct tcf_police *p)
580{
581 struct gnet_dump d;
582
583 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
584 TCA_XSTATS, p->stats_lock, &d) < 0)
585 goto errout;
586
587 if (gnet_stats_copy_basic(&d, &p->bstats) < 0 ||
588#ifdef CONFIG_NET_ESTIMATOR
589 gnet_stats_copy_rate_est(&d, &p->rate_est) < 0 ||
590#endif
591 gnet_stats_copy_queue(&d, &p->qstats) < 0)
592 goto errout;
593
594 if (gnet_stats_finish_copy(&d) < 0)
595 goto errout;
596
597 return 0;
598
599errout:
600 return -1;
601}
602
603
604EXPORT_SYMBOL(tcf_police);
605EXPORT_SYMBOL(tcf_police_destroy);
606EXPORT_SYMBOL(tcf_police_dump);
607EXPORT_SYMBOL(tcf_police_dump_stats);
608EXPORT_SYMBOL(tcf_police_hash);
609EXPORT_SYMBOL(tcf_police_ht);
610EXPORT_SYMBOL(tcf_police_locate);
611EXPORT_SYMBOL(tcf_police_lookup);
612EXPORT_SYMBOL(tcf_police_new_index);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
new file mode 100644
index 000000000000..4323a74eea30
--- /dev/null
+++ b/net/sched/sch_api.c
@@ -0,0 +1,1296 @@
1/*
2 * net/sched/sch_api.c Packet scheduler API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Fixes:
12 *
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16 */
17
18#include <linux/config.h>
19#include <linux/module.h>
20#include <linux/types.h>
21#include <linux/kernel.h>
22#include <linux/sched.h>
23#include <linux/string.h>
24#include <linux/mm.h>
25#include <linux/socket.h>
26#include <linux/sockios.h>
27#include <linux/in.h>
28#include <linux/errno.h>
29#include <linux/interrupt.h>
30#include <linux/netdevice.h>
31#include <linux/skbuff.h>
32#include <linux/rtnetlink.h>
33#include <linux/init.h>
34#include <linux/proc_fs.h>
35#include <linux/seq_file.h>
36#include <linux/kmod.h>
37#include <linux/list.h>
38#include <linux/bitops.h>
39
40#include <net/sock.h>
41#include <net/pkt_sched.h>
42
43#include <asm/processor.h>
44#include <asm/uaccess.h>
45#include <asm/system.h>
46
47static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
48 struct Qdisc *old, struct Qdisc *new);
49static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
50 struct Qdisc *q, unsigned long cl, int event);
51
52/*
53
54 Short review.
55 -------------
56
57 This file consists of two interrelated parts:
58
59 1. queueing disciplines manager frontend.
60 2. traffic classes manager frontend.
61
 62 Generally, a queueing discipline ("qdisc") is a black box,
 63 which is able to enqueue packets and to dequeue them (when
 64 the device is ready to send something) in an order and at times
 65 determined by the algorithm hidden inside it.
 66
 67 qdiscs are divided into two categories:
 68 - "queues", which have no internal structure visible from outside.
 69 - "schedulers", which split all the packets into "traffic classes",
 70 using "packet classifiers" (look at cls_api.c)
 71
 72 In turn, classes may have child qdiscs (as a rule, queues)
 73 attached to them, and so on recursively.
 74
 75 The goal of the routines in this file is to translate
 76 the information supplied by the user in the form of handles
 77 into a form more intelligible to the kernel, to perform some
 78 sanity checks and the work that is common to all qdiscs,
 79 and to provide rtnetlink notifications.
80
81 All real intelligent work is done inside qdisc modules.
82
83
84
85 Every discipline has two major routines: enqueue and dequeue.
86
87 ---dequeue
88
 89 dequeue usually returns an skb to send. It is allowed to return NULL,
 90 but that does not mean the queue is empty; it just means that the
 91 discipline does not want to send anything at this time.
 92 The queue is really empty only if q->q.qlen == 0.
 93 For complicated disciplines with multiple queues, q->q is not the
 94 real packet queue, but q->q.qlen must nevertheless be valid.
95
96 ---enqueue
97
 98 enqueue returns 0 if the packet was enqueued successfully.
 99 If a packet (this one or another one) was dropped, it returns
 100 a non-zero error code:
 101 NET_XMIT_DROP - this packet was dropped.
 102 Expected action: do not back off, but wait until the queue clears.
 103 NET_XMIT_CN - this packet was probably enqueued, but another one was dropped.
 104 Expected action: back off or ignore.
 105 NET_XMIT_POLICED - dropped by the policer.
 106 Expected action: back off or report an error to real-time apps.
107
108 Auxiliary routines:
109
110 ---requeue
111
 112 requeues a packet that has already been dequeued once. It is used for
 113 non-standard or just buggy devices, which can defer output even if dev->tbusy=0.
114
115 ---reset
116
 117 returns the qdisc to its initial state: purges all buffers, clears all
 118 timers and counters (except statistics), etc.
119
120 ---init
121
122 initializes newly created qdisc.
123
124 ---destroy
125
126 destroys resources allocated by init and during lifetime of qdisc.
127
128 ---change
129
130 changes qdisc parameters.
131 */
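/*
 * A minimal sketch of what the two mandatory hooks of a "queue"-type
 * discipline look like; the function and struct names below are
 * illustrative only and are not used anywhere in this file.  A real
 * discipline such as pfifo additionally enforces a length limit and
 * fills in init/reset/dump.
 */
#if 0
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	/* Unconditionally tail-queue; a real qdisc would check a limit
	 * and return NET_XMIT_DROP when it is exceeded. */
	__skb_queue_tail(&sch->q, skb);
	sch->bstats.bytes += skb->len;
	sch->bstats.packets++;
	return 0;
}

static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
	return __skb_dequeue(&sch->q);
}

static struct Qdisc_ops example_qdisc_ops = {
	.id		= "example",
	.priv_size	= 0,
	.enqueue	= example_enqueue,
	.dequeue	= example_dequeue,
	.owner		= THIS_MODULE,
};

/* register_qdisc(&example_qdisc_ops) would make the discipline visible
 * to qdisc_lookup_ops() and hence to tc_modify_qdisc() below. */
#endif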
132
133/* Protects the list of registered TC modules. It is a pure SMP lock. */
134static DEFINE_RWLOCK(qdisc_mod_lock);
135
136
137/************************************************
138 * Queueing disciplines manipulation. *
139 ************************************************/
140
141
142/* The list of all installed queueing disciplines. */
143
144static struct Qdisc_ops *qdisc_base;
145
146/* Register/unregister queueing discipline */
147
148int register_qdisc(struct Qdisc_ops *qops)
149{
150 struct Qdisc_ops *q, **qp;
151 int rc = -EEXIST;
152
153 write_lock(&qdisc_mod_lock);
154 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
155 if (!strcmp(qops->id, q->id))
156 goto out;
157
158 if (qops->enqueue == NULL)
159 qops->enqueue = noop_qdisc_ops.enqueue;
160 if (qops->requeue == NULL)
161 qops->requeue = noop_qdisc_ops.requeue;
162 if (qops->dequeue == NULL)
163 qops->dequeue = noop_qdisc_ops.dequeue;
164
165 qops->next = NULL;
166 *qp = qops;
167 rc = 0;
168out:
169 write_unlock(&qdisc_mod_lock);
170 return rc;
171}
172
173int unregister_qdisc(struct Qdisc_ops *qops)
174{
175 struct Qdisc_ops *q, **qp;
176 int err = -ENOENT;
177
178 write_lock(&qdisc_mod_lock);
179 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
180 if (q == qops)
181 break;
182 if (q) {
183 *qp = q->next;
184 q->next = NULL;
185 err = 0;
186 }
187 write_unlock(&qdisc_mod_lock);
188 return err;
189}
190
191/* We know the handle. Find the qdisc among all qdiscs attached to the device
192 (root qdisc, all its children, children of children, etc.)
193 */
194
195struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
196{
197 struct Qdisc *q;
198
199 read_lock_bh(&qdisc_tree_lock);
200 list_for_each_entry(q, &dev->qdisc_list, list) {
201 if (q->handle == handle) {
202 read_unlock_bh(&qdisc_tree_lock);
203 return q;
204 }
205 }
206 read_unlock_bh(&qdisc_tree_lock);
207 return NULL;
208}
209
210static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
211{
212 unsigned long cl;
213 struct Qdisc *leaf;
214 struct Qdisc_class_ops *cops = p->ops->cl_ops;
215
216 if (cops == NULL)
217 return NULL;
218 cl = cops->get(p, classid);
219
220 if (cl == 0)
221 return NULL;
222 leaf = cops->leaf(p, cl);
223 cops->put(p, cl);
224 return leaf;
225}
226
227/* Find queueing discipline by name */
228
229static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
230{
231 struct Qdisc_ops *q = NULL;
232
233 if (kind) {
234 read_lock(&qdisc_mod_lock);
235 for (q = qdisc_base; q; q = q->next) {
236 if (rtattr_strcmp(kind, q->id) == 0) {
237 if (!try_module_get(q->owner))
238 q = NULL;
239 break;
240 }
241 }
242 read_unlock(&qdisc_mod_lock);
243 }
244 return q;
245}
246
247static struct qdisc_rate_table *qdisc_rtab_list;
248
249struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
250{
251 struct qdisc_rate_table *rtab;
252
253 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
254 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
255 rtab->refcnt++;
256 return rtab;
257 }
258 }
259
260 if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
261 return NULL;
262
263 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
264 if (rtab) {
265 rtab->rate = *r;
266 rtab->refcnt = 1;
267 memcpy(rtab->data, RTA_DATA(tab), 1024);
268 rtab->next = qdisc_rtab_list;
269 qdisc_rtab_list = rtab;
270 }
271 return rtab;
272}
273
274void qdisc_put_rtab(struct qdisc_rate_table *tab)
275{
276 struct qdisc_rate_table *rtab, **rtabp;
277
278 if (!tab || --tab->refcnt)
279 return;
280
281 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
282 if (rtab == tab) {
283 *rtabp = rtab->next;
284 kfree(rtab);
285 return;
286 }
287 }
288}
289
290
291/* Allocate a unique handle from the space managed by the kernel */
292
293static u32 qdisc_alloc_handle(struct net_device *dev)
294{
295 int i = 0x10000;
296 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
297
298 do {
299 autohandle += TC_H_MAKE(0x10000U, 0);
300 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
301 autohandle = TC_H_MAKE(0x80000000U, 0);
302 } while (qdisc_lookup(dev, autohandle) && --i > 0);
303
304 return i>0 ? autohandle : 0;
305}
306
307/* Attach toplevel qdisc to device dev */
308
309static struct Qdisc *
310dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
311{
312 struct Qdisc *oqdisc;
313
314 if (dev->flags & IFF_UP)
315 dev_deactivate(dev);
316
317 qdisc_lock_tree(dev);
318 if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
319 oqdisc = dev->qdisc_ingress;
320 /* Prune old scheduler */
321 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
322 /* delete */
323 qdisc_reset(oqdisc);
324 dev->qdisc_ingress = NULL;
325 } else { /* new */
326 dev->qdisc_ingress = qdisc;
327 }
328
329 } else {
330
331 oqdisc = dev->qdisc_sleeping;
332
333 /* Prune old scheduler */
334 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
335 qdisc_reset(oqdisc);
336
337 /* ... and graft new one */
338 if (qdisc == NULL)
339 qdisc = &noop_qdisc;
340 dev->qdisc_sleeping = qdisc;
341 dev->qdisc = &noop_qdisc;
342 }
343
344 qdisc_unlock_tree(dev);
345
346 if (dev->flags & IFF_UP)
347 dev_activate(dev);
348
349 return oqdisc;
350}
351
352
353/* Graft qdisc "new" to class "classid" of qdisc "parent" or
354 to device "dev".
355
356 Old qdisc is not destroyed but returned in *old.
357 */
358
359static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
360 u32 classid,
361 struct Qdisc *new, struct Qdisc **old)
362{
363 int err = 0;
364 struct Qdisc *q = *old;
365
366
367 if (parent == NULL) {
368 if (q && q->flags&TCQ_F_INGRESS) {
369 *old = dev_graft_qdisc(dev, q);
370 } else {
371 *old = dev_graft_qdisc(dev, new);
372 }
373 } else {
374 struct Qdisc_class_ops *cops = parent->ops->cl_ops;
375
376 err = -EINVAL;
377
378 if (cops) {
379 unsigned long cl = cops->get(parent, classid);
380 if (cl) {
381 err = cops->graft(parent, cl, new, old);
382 if (new)
383 new->parent = classid;
384 cops->put(parent, cl);
385 }
386 }
387 }
388 return err;
389}
390
391/*
392 Allocate and initialize new qdisc.
393
394 Parameters are passed via opt.
395 */
396
397static struct Qdisc *
398qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
399{
400 int err;
401 struct rtattr *kind = tca[TCA_KIND-1];
402 void *p = NULL;
403 struct Qdisc *sch;
404 struct Qdisc_ops *ops;
405 int size;
406
407 ops = qdisc_lookup_ops(kind);
408#ifdef CONFIG_KMOD
409 if (ops == NULL && kind != NULL) {
410 char name[IFNAMSIZ];
411 if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
412 /* We dropped the RTNL semaphore in order to
413 * perform the module load. So, even if we
414 * succeeded in loading the module we have to
415 * tell the caller to replay the request. We
416 * indicate this using -EAGAIN.
417 * We replay the request because the device may
418 * go away in the mean time.
419 */
420 rtnl_unlock();
421 request_module("sch_%s", name);
422 rtnl_lock();
423 ops = qdisc_lookup_ops(kind);
424 if (ops != NULL) {
425 /* We will try qdisc_lookup_ops again,
426 * so don't keep a reference.
427 */
428 module_put(ops->owner);
429 err = -EAGAIN;
430 goto err_out;
431 }
432 }
433 }
434#endif
435
436 err = -EINVAL;
437 if (ops == NULL)
438 goto err_out;
439
440 /* ensure that the Qdisc and the private data are 32-byte aligned */
441 size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST);
442 size += ops->priv_size + QDISC_ALIGN_CONST;
443
444 p = kmalloc(size, GFP_KERNEL);
445 err = -ENOBUFS;
446 if (!p)
447 goto err_out2;
448 memset(p, 0, size);
449 sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
450 & ~QDISC_ALIGN_CONST);
451 sch->padded = (char *)sch - (char *)p;
452
453 INIT_LIST_HEAD(&sch->list);
454 skb_queue_head_init(&sch->q);
455
456 if (handle == TC_H_INGRESS)
457 sch->flags |= TCQ_F_INGRESS;
458
459 sch->ops = ops;
460 sch->enqueue = ops->enqueue;
461 sch->dequeue = ops->dequeue;
462 sch->dev = dev;
463 dev_hold(dev);
464 atomic_set(&sch->refcnt, 1);
465 sch->stats_lock = &dev->queue_lock;
466 if (handle == 0) {
467 handle = qdisc_alloc_handle(dev);
468 err = -ENOMEM;
469 if (handle == 0)
470 goto err_out3;
471 }
472
473 if (handle == TC_H_INGRESS)
474 sch->handle =TC_H_MAKE(TC_H_INGRESS, 0);
475 else
476 sch->handle = handle;
477
478 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
479 qdisc_lock_tree(dev);
480 list_add_tail(&sch->list, &dev->qdisc_list);
481 qdisc_unlock_tree(dev);
482
483#ifdef CONFIG_NET_ESTIMATOR
484 if (tca[TCA_RATE-1])
485 gen_new_estimator(&sch->bstats, &sch->rate_est,
486 sch->stats_lock, tca[TCA_RATE-1]);
487#endif
488 return sch;
489 }
490err_out3:
491 dev_put(dev);
492err_out2:
493 module_put(ops->owner);
494err_out:
495 *errp = err;
496 if (p)
497 kfree(p);
498 return NULL;
499}
500
501static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
502{
503 if (tca[TCA_OPTIONS-1]) {
504 int err;
505
506 if (sch->ops->change == NULL)
507 return -EINVAL;
508 err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
509 if (err)
510 return err;
511 }
512#ifdef CONFIG_NET_ESTIMATOR
513 if (tca[TCA_RATE-1])
514 gen_replace_estimator(&sch->bstats, &sch->rate_est,
515 sch->stats_lock, tca[TCA_RATE-1]);
516#endif
517 return 0;
518}
519
520struct check_loop_arg
521{
522 struct qdisc_walker w;
523 struct Qdisc *p;
524 int depth;
525};
526
527static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
528
529static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
530{
531 struct check_loop_arg arg;
532
533 if (q->ops->cl_ops == NULL)
534 return 0;
535
536 arg.w.stop = arg.w.skip = arg.w.count = 0;
537 arg.w.fn = check_loop_fn;
538 arg.depth = depth;
539 arg.p = p;
540 q->ops->cl_ops->walk(q, &arg.w);
541 return arg.w.stop ? -ELOOP : 0;
542}
543
544static int
545check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
546{
547 struct Qdisc *leaf;
548 struct Qdisc_class_ops *cops = q->ops->cl_ops;
549 struct check_loop_arg *arg = (struct check_loop_arg *)w;
550
551 leaf = cops->leaf(q, cl);
552 if (leaf) {
553 if (leaf == arg->p || arg->depth > 7)
554 return -ELOOP;
555 return check_loop(leaf, arg->p, arg->depth + 1);
556 }
557 return 0;
558}
559
560/*
561 * Delete/get qdisc.
562 */
563
564static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
565{
566 struct tcmsg *tcm = NLMSG_DATA(n);
567 struct rtattr **tca = arg;
568 struct net_device *dev;
569 u32 clid = tcm->tcm_parent;
570 struct Qdisc *q = NULL;
571 struct Qdisc *p = NULL;
572 int err;
573
574 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
575 return -ENODEV;
576
577 if (clid) {
578 if (clid != TC_H_ROOT) {
579 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
580 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
581 return -ENOENT;
582 q = qdisc_leaf(p, clid);
583 } else { /* ingress */
584 q = dev->qdisc_ingress;
585 }
586 } else {
587 q = dev->qdisc_sleeping;
588 }
589 if (!q)
590 return -ENOENT;
591
592 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
593 return -EINVAL;
594 } else {
595 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
596 return -ENOENT;
597 }
598
599 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
600 return -EINVAL;
601
602 if (n->nlmsg_type == RTM_DELQDISC) {
603 if (!clid)
604 return -EINVAL;
605 if (q->handle == 0)
606 return -ENOENT;
607 if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
608 return err;
609 if (q) {
610 qdisc_notify(skb, n, clid, q, NULL);
611 spin_lock_bh(&dev->queue_lock);
612 qdisc_destroy(q);
613 spin_unlock_bh(&dev->queue_lock);
614 }
615 } else {
616 qdisc_notify(skb, n, clid, NULL, q);
617 }
618 return 0;
619}
620
621/*
622 Create/change qdisc.
623 */
624
625static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
626{
627 struct tcmsg *tcm;
628 struct rtattr **tca;
629 struct net_device *dev;
630 u32 clid;
631 struct Qdisc *q, *p;
632 int err;
633
634replay:
635 /* Reinit, just in case something touches this. */
636 tcm = NLMSG_DATA(n);
637 tca = arg;
638 clid = tcm->tcm_parent;
639 q = p = NULL;
640
641 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
642 return -ENODEV;
643
644 if (clid) {
645 if (clid != TC_H_ROOT) {
646 if (clid != TC_H_INGRESS) {
647 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
648 return -ENOENT;
649 q = qdisc_leaf(p, clid);
650 } else { /*ingress */
651 q = dev->qdisc_ingress;
652 }
653 } else {
654 q = dev->qdisc_sleeping;
655 }
656
657 /* It may be the default qdisc; ignore it */
658 if (q && q->handle == 0)
659 q = NULL;
660
661 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
662 if (tcm->tcm_handle) {
663 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
664 return -EEXIST;
665 if (TC_H_MIN(tcm->tcm_handle))
666 return -EINVAL;
667 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
668 goto create_n_graft;
669 if (n->nlmsg_flags&NLM_F_EXCL)
670 return -EEXIST;
671 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
672 return -EINVAL;
673 if (q == p ||
674 (p && check_loop(q, p, 0)))
675 return -ELOOP;
676 atomic_inc(&q->refcnt);
677 goto graft;
678 } else {
679 if (q == NULL)
680 goto create_n_graft;
681
682 /* This magic test requires explanation.
683 *
684 * We know that some child q is already
685 * attached to this parent and we have a choice:
686 * either change it or create/graft a new one.
687 *
688 * 1. We are allowed to create/graft only
689 * if CREATE and REPLACE flags are set.
690 *
691 * 2. If EXCL is set, the requestor meant that
692 * the qdisc tcm_handle is not expected
693 * to exist, so we choose create/graft too.
694 *
695 * 3. The last case is when no flags are set.
696 * Alas, it is a sort of hole in the API; we
697 * cannot decide what to do unambiguously.
698 * For now we select create/graft if the
699 * user gave a KIND that does not match the existing one.
700 */
701 if ((n->nlmsg_flags&NLM_F_CREATE) &&
702 (n->nlmsg_flags&NLM_F_REPLACE) &&
703 ((n->nlmsg_flags&NLM_F_EXCL) ||
704 (tca[TCA_KIND-1] &&
705 rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
706 goto create_n_graft;
707 }
708 }
709 } else {
710 if (!tcm->tcm_handle)
711 return -EINVAL;
712 q = qdisc_lookup(dev, tcm->tcm_handle);
713 }
714
715 /* Change qdisc parameters */
716 if (q == NULL)
717 return -ENOENT;
718 if (n->nlmsg_flags&NLM_F_EXCL)
719 return -EEXIST;
720 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
721 return -EINVAL;
722 err = qdisc_change(q, tca);
723 if (err == 0)
724 qdisc_notify(skb, n, clid, NULL, q);
725 return err;
726
727create_n_graft:
728 if (!(n->nlmsg_flags&NLM_F_CREATE))
729 return -ENOENT;
730 if (clid == TC_H_INGRESS)
731 q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
732 else
733 q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
734 if (q == NULL) {
735 if (err == -EAGAIN)
736 goto replay;
737 return err;
738 }
739
740graft:
741 if (1) {
742 struct Qdisc *old_q = NULL;
743 err = qdisc_graft(dev, p, clid, q, &old_q);
744 if (err) {
745 if (q) {
746 spin_lock_bh(&dev->queue_lock);
747 qdisc_destroy(q);
748 spin_unlock_bh(&dev->queue_lock);
749 }
750 return err;
751 }
752 qdisc_notify(skb, n, clid, old_q, q);
753 if (old_q) {
754 spin_lock_bh(&dev->queue_lock);
755 qdisc_destroy(old_q);
756 spin_unlock_bh(&dev->queue_lock);
757 }
758 }
759 return 0;
760}
761
762static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
763 u32 pid, u32 seq, unsigned flags, int event)
764{
765 struct tcmsg *tcm;
766 struct nlmsghdr *nlh;
767 unsigned char *b = skb->tail;
768 struct gnet_dump d;
769
770 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
771 nlh->nlmsg_flags = flags;
772 tcm = NLMSG_DATA(nlh);
773 tcm->tcm_family = AF_UNSPEC;
774 tcm->tcm_ifindex = q->dev->ifindex;
775 tcm->tcm_parent = clid;
776 tcm->tcm_handle = q->handle;
777 tcm->tcm_info = atomic_read(&q->refcnt);
778 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
779 if (q->ops->dump && q->ops->dump(q, skb) < 0)
780 goto rtattr_failure;
781 q->qstats.qlen = q->q.qlen;
782
783 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
784 TCA_XSTATS, q->stats_lock, &d) < 0)
785 goto rtattr_failure;
786
787 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
788 goto rtattr_failure;
789
790 if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
791#ifdef CONFIG_NET_ESTIMATOR
792 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
793#endif
794 gnet_stats_copy_queue(&d, &q->qstats) < 0)
795 goto rtattr_failure;
796
797 if (gnet_stats_finish_copy(&d) < 0)
798 goto rtattr_failure;
799
800 nlh->nlmsg_len = skb->tail - b;
801 return skb->len;
802
803nlmsg_failure:
804rtattr_failure:
805 skb_trim(skb, b - skb->data);
806 return -1;
807}
808
809static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
810 u32 clid, struct Qdisc *old, struct Qdisc *new)
811{
812 struct sk_buff *skb;
813 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
814
815 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
816 if (!skb)
817 return -ENOBUFS;
818
819 if (old && old->handle) {
820 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
821 goto err_out;
822 }
823 if (new) {
824 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
825 goto err_out;
826 }
827
828 if (skb->len)
829 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
830
831err_out:
832 kfree_skb(skb);
833 return -EINVAL;
834}
835
836static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
837{
838 int idx, q_idx;
839 int s_idx, s_q_idx;
840 struct net_device *dev;
841 struct Qdisc *q;
842
843 s_idx = cb->args[0];
844 s_q_idx = q_idx = cb->args[1];
845 read_lock(&dev_base_lock);
846 for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
847 if (idx < s_idx)
848 continue;
849 if (idx > s_idx)
850 s_q_idx = 0;
851 read_lock_bh(&qdisc_tree_lock);
852 q_idx = 0;
853 list_for_each_entry(q, &dev->qdisc_list, list) {
854 if (q_idx < s_q_idx) {
855 q_idx++;
856 continue;
857 }
858 if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
859 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
860 read_unlock_bh(&qdisc_tree_lock);
861 goto done;
862 }
863 q_idx++;
864 }
865 read_unlock_bh(&qdisc_tree_lock);
866 }
867
868done:
869 read_unlock(&dev_base_lock);
870
871 cb->args[0] = idx;
872 cb->args[1] = q_idx;
873
874 return skb->len;
875}
876
877
878
879/************************************************
880 * Traffic classes manipulation. *
881 ************************************************/
882
883
884
885static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
886{
887 struct tcmsg *tcm = NLMSG_DATA(n);
888 struct rtattr **tca = arg;
889 struct net_device *dev;
890 struct Qdisc *q = NULL;
891 struct Qdisc_class_ops *cops;
892 unsigned long cl = 0;
893 unsigned long new_cl;
894 u32 pid = tcm->tcm_parent;
895 u32 clid = tcm->tcm_handle;
896 u32 qid = TC_H_MAJ(clid);
897 int err;
898
899 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
900 return -ENODEV;
901
902 /*
903 parent == TC_H_UNSPEC - unspecified parent.
904 parent == TC_H_ROOT - class is root, which has no parent.
905 parent == X:0 - parent is root class.
906 parent == X:Y - parent is a node in hierarchy.
907 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
908
909 handle == 0:0 - generate handle from kernel pool.
910 handle == 0:Y - class is X:Y, where X:0 is qdisc.
911 handle == X:Y - clear.
912 handle == X:0 - root class.
913 */
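/*
 * A worked example with illustrative values: handles pack the major
 * number into the upper 16 bits and the minor into the lower 16 bits,
 * so for a request with parent 1:1 and classid 1:10 (minors in hex)
 *
 *	pid  = tcm_parent = 0x00010001
 *	clid = tcm_handle = 0x00010010
 *	qid  = TC_H_MAJ(clid) = 0x00010000	(i.e. the qdisc 1:0)
 *
 * and TC_H_MAKE(qid, ...) below is what completes a 0:Y handle into a
 * full X:Y one.
 */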
914
915 /* Step 1. Determine qdisc handle X:0 */
916
917 if (pid != TC_H_ROOT) {
918 u32 qid1 = TC_H_MAJ(pid);
919
920 if (qid && qid1) {
921 /* If both majors are known, they must be identical. */
922 if (qid != qid1)
923 return -EINVAL;
924 } else if (qid1) {
925 qid = qid1;
926 } else if (qid == 0)
927 qid = dev->qdisc_sleeping->handle;
928
929 /* Now qid is a genuine qdisc handle consistent
930 with both parent and child.
931
932 TC_H_MAJ(pid) may still be unspecified; complete it now.
933 */
934 if (pid)
935 pid = TC_H_MAKE(qid, pid);
936 } else {
937 if (qid == 0)
938 qid = dev->qdisc_sleeping->handle;
939 }
940
941 /* OK. Locate qdisc */
942 if ((q = qdisc_lookup(dev, qid)) == NULL)
943 return -ENOENT;
944
945 /* And check that it supports classes */
946 cops = q->ops->cl_ops;
947 if (cops == NULL)
948 return -EINVAL;
949
950 /* Now try to get class */
951 if (clid == 0) {
952 if (pid == TC_H_ROOT)
953 clid = qid;
954 } else
955 clid = TC_H_MAKE(qid, clid);
956
957 if (clid)
958 cl = cops->get(q, clid);
959
960 if (cl == 0) {
961 err = -ENOENT;
962 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
963 goto out;
964 } else {
965 switch (n->nlmsg_type) {
966 case RTM_NEWTCLASS:
967 err = -EEXIST;
968 if (n->nlmsg_flags&NLM_F_EXCL)
969 goto out;
970 break;
971 case RTM_DELTCLASS:
972 err = cops->delete(q, cl);
973 if (err == 0)
974 tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
975 goto out;
976 case RTM_GETTCLASS:
977 err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
978 goto out;
979 default:
980 err = -EINVAL;
981 goto out;
982 }
983 }
984
985 new_cl = cl;
986 err = cops->change(q, clid, pid, tca, &new_cl);
987 if (err == 0)
988 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
989
990out:
991 if (cl)
992 cops->put(q, cl);
993
994 return err;
995}
996
997
998static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
999 unsigned long cl,
1000 u32 pid, u32 seq, unsigned flags, int event)
1001{
1002 struct tcmsg *tcm;
1003 struct nlmsghdr *nlh;
1004 unsigned char *b = skb->tail;
1005 struct gnet_dump d;
1006 struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1007
1008 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
1009 nlh->nlmsg_flags = flags;
1010 tcm = NLMSG_DATA(nlh);
1011 tcm->tcm_family = AF_UNSPEC;
1012 tcm->tcm_ifindex = q->dev->ifindex;
1013 tcm->tcm_parent = q->handle;
1014 tcm->tcm_handle = q->handle;
1015 tcm->tcm_info = 0;
1016 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
1017 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1018 goto rtattr_failure;
1019
1020 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
1021 TCA_XSTATS, q->stats_lock, &d) < 0)
1022 goto rtattr_failure;
1023
1024 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1025 goto rtattr_failure;
1026
1027 if (gnet_stats_finish_copy(&d) < 0)
1028 goto rtattr_failure;
1029
1030 nlh->nlmsg_len = skb->tail - b;
1031 return skb->len;
1032
1033nlmsg_failure:
1034rtattr_failure:
1035 skb_trim(skb, b - skb->data);
1036 return -1;
1037}
1038
1039static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1040 struct Qdisc *q, unsigned long cl, int event)
1041{
1042 struct sk_buff *skb;
1043 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1044
1045 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1046 if (!skb)
1047 return -ENOBUFS;
1048
1049 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1050 kfree_skb(skb);
1051 return -EINVAL;
1052 }
1053
1054 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1055}
1056
1057struct qdisc_dump_args
1058{
1059 struct qdisc_walker w;
1060 struct sk_buff *skb;
1061 struct netlink_callback *cb;
1062};
1063
1064static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1065{
1066 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1067
1068 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1069 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1070}
1071
1072static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1073{
1074 int t;
1075 int s_t;
1076 struct net_device *dev;
1077 struct Qdisc *q;
1078 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1079 struct qdisc_dump_args arg;
1080
1081 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1082 return 0;
1083 if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
1084 return 0;
1085
1086 s_t = cb->args[0];
1087 t = 0;
1088
1089 read_lock_bh(&qdisc_tree_lock);
1090 list_for_each_entry(q, &dev->qdisc_list, list) {
1091 if (t < s_t || !q->ops->cl_ops ||
1092 (tcm->tcm_parent &&
1093 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1094 t++;
1095 continue;
1096 }
1097 if (t > s_t)
1098 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1099 arg.w.fn = qdisc_class_dump;
1100 arg.skb = skb;
1101 arg.cb = cb;
1102 arg.w.stop = 0;
1103 arg.w.skip = cb->args[1];
1104 arg.w.count = 0;
1105 q->ops->cl_ops->walk(q, &arg.w);
1106 cb->args[1] = arg.w.count;
1107 if (arg.w.stop)
1108 break;
1109 t++;
1110 }
1111 read_unlock_bh(&qdisc_tree_lock);
1112
1113 cb->args[0] = t;
1114
1115 dev_put(dev);
1116 return skb->len;
1117}
1118
1119/* Main classifier routine: scans the classifier chain attached
1120 to this qdisc, (optionally) tests for the protocol, and asks the
1121 specific classifiers.
1122 */
1123int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1124 struct tcf_result *res)
1125{
1126 int err = 0;
1127 u32 protocol = skb->protocol;
1128#ifdef CONFIG_NET_CLS_ACT
1129 struct tcf_proto *otp = tp;
1130reclassify:
1131#endif
1132 protocol = skb->protocol;
1133
1134 for ( ; tp; tp = tp->next) {
1135 if ((tp->protocol == protocol ||
1136 tp->protocol == __constant_htons(ETH_P_ALL)) &&
1137 (err = tp->classify(skb, tp, res)) >= 0) {
1138#ifdef CONFIG_NET_CLS_ACT
1139 if ( TC_ACT_RECLASSIFY == err) {
1140 __u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
1141 tp = otp;
1142
1143 if (MAX_REC_LOOP < verd++) {
1144 printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
1145 tp->prio&0xffff, ntohs(tp->protocol));
1146 return TC_ACT_SHOT;
1147 }
1148 skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd);
1149 goto reclassify;
1150 } else {
1151 if (skb->tc_verd)
1152 skb->tc_verd = SET_TC_VERD(skb->tc_verd,0);
1153 return err;
1154 }
1155#else
1156
1157 return err;
1158#endif
1159 }
1160
1161 }
1162 return -1;
1163}
1164
1165static int psched_us_per_tick = 1;
1166static int psched_tick_per_us = 1;
1167
1168#ifdef CONFIG_PROC_FS
1169static int psched_show(struct seq_file *seq, void *v)
1170{
1171 seq_printf(seq, "%08x %08x %08x %08x\n",
1172 psched_tick_per_us, psched_us_per_tick,
1173 1000000, HZ);
1174
1175 return 0;
1176}
1177
1178static int psched_open(struct inode *inode, struct file *file)
1179{
1180 return single_open(file, psched_show, PDE(inode)->data);
1181}
1182
1183static struct file_operations psched_fops = {
1184 .owner = THIS_MODULE,
1185 .open = psched_open,
1186 .read = seq_read,
1187 .llseek = seq_lseek,
1188 .release = single_release,
1189};
1190#endif
1191
1192#ifdef CONFIG_NET_SCH_CLK_CPU
1193psched_tdiff_t psched_clock_per_hz;
1194int psched_clock_scale;
1195EXPORT_SYMBOL(psched_clock_per_hz);
1196EXPORT_SYMBOL(psched_clock_scale);
1197
1198psched_time_t psched_time_base;
1199cycles_t psched_time_mark;
1200EXPORT_SYMBOL(psched_time_mark);
1201EXPORT_SYMBOL(psched_time_base);
1202
1203/*
1204 * Periodically adjust psched_time_base to avoid overflow with 32-bit
1205 * get_cycles(). Safe up to a 4GHz CPU: 2^32 cycles take about 1.07s,
1206 * which is longer than the 1*HZ refresh interval below. */
1207static void psched_tick(unsigned long);
1208static struct timer_list psched_timer = TIMER_INITIALIZER(psched_tick, 0, 0);
1209
1210static void psched_tick(unsigned long dummy)
1211{
1212 if (sizeof(cycles_t) == sizeof(u32)) {
1213 psched_time_t dummy_stamp;
1214 PSCHED_GET_TIME(dummy_stamp);
1215 psched_timer.expires = jiffies + 1*HZ;
1216 add_timer(&psched_timer);
1217 }
1218}
1219
1220int __init psched_calibrate_clock(void)
1221{
1222 psched_time_t stamp, stamp1;
1223 struct timeval tv, tv1;
1224 psched_tdiff_t delay;
1225 long rdelay;
1226 unsigned long stop;
1227
1228 psched_tick(0);
1229 stop = jiffies + HZ/10;
1230 PSCHED_GET_TIME(stamp);
1231 do_gettimeofday(&tv);
1232 while (time_before(jiffies, stop)) {
1233 barrier();
1234 cpu_relax();
1235 }
1236 PSCHED_GET_TIME(stamp1);
1237 do_gettimeofday(&tv1);
1238
1239 delay = PSCHED_TDIFF(stamp1, stamp);
1240 rdelay = tv1.tv_usec - tv.tv_usec;
1241 rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
1242 if (rdelay > delay)
1243 return -1;
1244 delay /= rdelay;
1245 psched_tick_per_us = delay;
1246 while ((delay>>=1) != 0)
1247 psched_clock_scale++;
1248 psched_us_per_tick = 1<<psched_clock_scale;
1249 psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
1250 return 0;
1251}
1252#endif
1253
1254static int __init pktsched_init(void)
1255{
1256 struct rtnetlink_link *link_p;
1257
1258#ifdef CONFIG_NET_SCH_CLK_CPU
1259 if (psched_calibrate_clock() < 0)
1260 return -1;
1261#elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
1262 psched_tick_per_us = HZ<<PSCHED_JSCALE;
1263 psched_us_per_tick = 1000000;
1264#endif
1265
1266 link_p = rtnetlink_links[PF_UNSPEC];
1267
1268 /* Set up the rtnetlink links. It is done here to avoid
1269 exporting a large number of public symbols.
1270 */
1271
1272 if (link_p) {
1273 link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
1274 link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
1275 link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
1276 link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
1277 link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1278 link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1279 link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1280 link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
1281 }
1282
1283 register_qdisc(&pfifo_qdisc_ops);
1284 register_qdisc(&bfifo_qdisc_ops);
1285 proc_net_fops_create("psched", 0, &psched_fops);
1286
1287 return 0;
1288}
1289
1290subsys_initcall(pktsched_init);
1291
1292EXPORT_SYMBOL(qdisc_get_rtab);
1293EXPORT_SYMBOL(qdisc_put_rtab);
1294EXPORT_SYMBOL(register_qdisc);
1295EXPORT_SYMBOL(unregister_qdisc);
1296EXPORT_SYMBOL(tc_classify);
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
new file mode 100644
index 000000000000..93ebce40acac
--- /dev/null
+++ b/net/sched/sch_atm.c
@@ -0,0 +1,735 @@
1/* net/sched/sch_atm.c - ATM VC selection "queueing discipline" */
2
3/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
4
5
6#include <linux/config.h>
7#include <linux/module.h>
8#include <linux/init.h>
9#include <linux/string.h>
10#include <linux/errno.h>
11#include <linux/skbuff.h>
12#include <linux/interrupt.h>
13#include <linux/atmdev.h>
14#include <linux/atmclip.h>
15#include <linux/netdevice.h>
16#include <linux/rtnetlink.h>
17#include <linux/file.h> /* for fput */
18#include <net/pkt_sched.h>
19#include <net/sock.h>
20
21
22extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */
23
24#if 0 /* control */
25#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
26#else
27#define DPRINTK(format,args...)
28#endif
29
30#if 0 /* data */
31#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
32#else
33#define D2PRINTK(format,args...)
34#endif
35
36
37/*
38 * The ATM queuing discipline provides a framework for invoking classifiers
39 * (aka "filters"), which in turn select classes of this queuing discipline.
40 * Each class maps the flow(s) it is handling to a given VC. Multiple classes
41 * may share the same VC.
42 *
43 * When creating a class, VCs are specified by passing the number of the open
44 * socket descriptor by which the calling process references the VC. The kernel
45 * keeps the VC open at least until all classes using it are removed.
46 *
47 * In this file, most functions are named atm_tc_* to avoid confusion with all
48 * the atm_* in net/atm. This naming convention differs from what's used in the
49 * rest of net/sched.
50 *
51 * Known bugs:
52 * - sometimes messes up the IP stack
53 * - any manipulations besides the few operations described in the README are
54 * untested and likely to crash the system
55 * - should lock the flow while there is data in the queue (?)
56 */
57
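/*
 * Configuration sketch (a summary of what atm_tc_change() below
 * accepts, not a normative description): a class is set up through a
 * nested TCA_OPTIONS attribute carrying
 *
 *	TCA_ATM_FD	int	descriptor of an open PF_ATMSVC/PF_ATMPVC
 *				socket referencing the VC (required)
 *	TCA_ATM_HDR	bytes	link-layer header prepended to each packet
 *				(optional; defaults to LLC/SNAP for IP)
 *	TCA_ATM_EXCESS	u32	classid of the flow that receives excess
 *				traffic (optional; otherwise CLP is set)
 */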
58
59#define PRIV(sch) qdisc_priv(sch)
60#define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back))
61
62
63struct atm_flow_data {
64 struct Qdisc *q; /* FIFO, TBF, etc. */
65 struct tcf_proto *filter_list;
66 struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */
67 void (*old_pop)(struct atm_vcc *vcc,struct sk_buff *skb); /* chaining */
68 struct atm_qdisc_data *parent; /* parent qdisc */
69 struct socket *sock; /* for closing */
70 u32 classid; /* x:y type ID */
71 int ref; /* reference count */
72 struct gnet_stats_basic bstats;
73 struct gnet_stats_queue qstats;
74 spinlock_t *stats_lock;
75 struct atm_flow_data *next;
76 struct atm_flow_data *excess; /* flow for excess traffic;
77 NULL to set CLP instead */
78 int hdr_len;
79 unsigned char hdr[0]; /* header data; MUST BE LAST */
80};
81
82struct atm_qdisc_data {
83 struct atm_flow_data link; /* unclassified skbs go here */
84 struct atm_flow_data *flows; /* NB: "link" is also on this
85 list */
86 struct tasklet_struct task; /* requeue tasklet */
87};
88
89
90/* ------------------------- Class/flow operations ------------------------- */
91
92
93static int find_flow(struct atm_qdisc_data *qdisc,struct atm_flow_data *flow)
94{
95 struct atm_flow_data *walk;
96
97 DPRINTK("find_flow(qdisc %p,flow %p)\n",qdisc,flow);
98 for (walk = qdisc->flows; walk; walk = walk->next)
99 if (walk == flow) return 1;
100 DPRINTK("find_flow: not found\n");
101 return 0;
102}
103
104
105static __inline__ struct atm_flow_data *lookup_flow(struct Qdisc *sch,
106 u32 classid)
107{
108 struct atm_qdisc_data *p = PRIV(sch);
109 struct atm_flow_data *flow;
110
111 for (flow = p->flows; flow; flow = flow->next)
112 if (flow->classid == classid) break;
113 return flow;
114}
115
116
117static int atm_tc_graft(struct Qdisc *sch,unsigned long arg,
118 struct Qdisc *new,struct Qdisc **old)
119{
120 struct atm_qdisc_data *p = PRIV(sch);
121 struct atm_flow_data *flow = (struct atm_flow_data *) arg;
122
123 DPRINTK("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n",sch,
124 p,flow,new,old);
125 if (!find_flow(p,flow)) return -EINVAL;
126 if (!new) new = &noop_qdisc;
127 *old = xchg(&flow->q,new);
128 if (*old) qdisc_reset(*old);
129 return 0;
130}
131
132
133static struct Qdisc *atm_tc_leaf(struct Qdisc *sch,unsigned long cl)
134{
135 struct atm_flow_data *flow = (struct atm_flow_data *) cl;
136
137 DPRINTK("atm_tc_leaf(sch %p,flow %p)\n",sch,flow);
138 return flow ? flow->q : NULL;
139}
140
141
142static unsigned long atm_tc_get(struct Qdisc *sch,u32 classid)
143{
144 struct atm_qdisc_data *p __attribute__((unused)) = PRIV(sch);
145 struct atm_flow_data *flow;
146
147 DPRINTK("atm_tc_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid);
148 flow = lookup_flow(sch,classid);
149 if (flow) flow->ref++;
150 DPRINTK("atm_tc_get: flow %p\n",flow);
151 return (unsigned long) flow;
152}
153
154
155static unsigned long atm_tc_bind_filter(struct Qdisc *sch,
156 unsigned long parent, u32 classid)
157{
158 return atm_tc_get(sch,classid);
159}
160
161
162static void destroy_filters(struct atm_flow_data *flow)
163{
164 struct tcf_proto *filter;
165
166 while ((filter = flow->filter_list)) {
167 DPRINTK("destroy_filters: destroying filter %p\n",filter);
168 flow->filter_list = filter->next;
169 tcf_destroy(filter);
170 }
171}
172
173
174/*
175 * atm_tc_put handles all destructions, including the ones that are explicitly
176 * requested (atm_tc_destroy, etc.). The assumption here is that we never drop
177 * anything that still seems to be in use.
178 */
179
180static void atm_tc_put(struct Qdisc *sch, unsigned long cl)
181{
182 struct atm_qdisc_data *p = PRIV(sch);
183 struct atm_flow_data *flow = (struct atm_flow_data *) cl;
184 struct atm_flow_data **prev;
185
186 DPRINTK("atm_tc_put(sch %p,[qdisc %p],flow %p)\n",sch,p,flow);
187 if (--flow->ref) return;
188 DPRINTK("atm_tc_put: destroying\n");
189 for (prev = &p->flows; *prev; prev = &(*prev)->next)
190 if (*prev == flow) break;
191 if (!*prev) {
192 printk(KERN_CRIT "atm_tc_put: class %p not found\n",flow);
193 return;
194 }
195 *prev = flow->next;
196 DPRINTK("atm_tc_put: qdisc %p\n",flow->q);
197 qdisc_destroy(flow->q);
198 destroy_filters(flow);
199 if (flow->sock) {
200 DPRINTK("atm_tc_put: f_count %d\n",
201 file_count(flow->sock->file));
202 flow->vcc->pop = flow->old_pop;
203 sockfd_put(flow->sock);
204 }
205 if (flow->excess) atm_tc_put(sch,(unsigned long) flow->excess);
206 if (flow != &p->link) kfree(flow);
207 /*
208 * If flow == &p->link, the qdisc no longer works at this point and
209 * needs to be removed. (By the caller of atm_tc_put.)
210 */
211}
212
213
214static void sch_atm_pop(struct atm_vcc *vcc,struct sk_buff *skb)
215{
216 struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent;
217
218 D2PRINTK("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n",vcc,skb,p);
219 VCC2FLOW(vcc)->old_pop(vcc,skb);
220 tasklet_schedule(&p->task);
221}
222
223static const u8 llc_oui_ip[] = {
224 0xaa, /* DSAP: non-ISO */
225 0xaa, /* SSAP: non-ISO */
226 0x03, /* Ctrl: Unnumbered Information Command PDU */
227 0x00, /* OUI: EtherType */
228 0x00, 0x00,
229 0x08, 0x00 }; /* Ethertype IP (0800) */
230
231static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
232 struct rtattr **tca, unsigned long *arg)
233{
234 struct atm_qdisc_data *p = PRIV(sch);
235 struct atm_flow_data *flow = (struct atm_flow_data *) *arg;
236 struct atm_flow_data *excess = NULL;
237 struct rtattr *opt = tca[TCA_OPTIONS-1];
238 struct rtattr *tb[TCA_ATM_MAX];
239 struct socket *sock;
240 int fd,error,hdr_len;
241 void *hdr;
242
243 DPRINTK("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x,"
244 "flow %p,opt %p)\n",sch,p,classid,parent,flow,opt);
245 /*
246 * The concept of parents doesn't apply to this qdisc.
247 */
248 if (parent && parent != TC_H_ROOT && parent != sch->handle)
249 return -EINVAL;
250 /*
251 * ATM classes cannot be changed. In order to change properties of the
252 * ATM connection, that socket needs to be modified directly (via the
253 * native ATM API). In order to send a flow to a different VC, the old
254 * class needs to be removed and a new one added. (This may be changed
255 * later.)
256 */
257 if (flow) return -EBUSY;
258 if (opt == NULL || rtattr_parse_nested(tb, TCA_ATM_MAX, opt))
259 return -EINVAL;
260 if (!tb[TCA_ATM_FD-1] || RTA_PAYLOAD(tb[TCA_ATM_FD-1]) < sizeof(fd))
261 return -EINVAL;
262 fd = *(int *) RTA_DATA(tb[TCA_ATM_FD-1]);
263 DPRINTK("atm_tc_change: fd %d\n",fd);
264 if (tb[TCA_ATM_HDR-1]) {
265 hdr_len = RTA_PAYLOAD(tb[TCA_ATM_HDR-1]);
266 hdr = RTA_DATA(tb[TCA_ATM_HDR-1]);
267 }
268 else {
269 hdr_len = RFC1483LLC_LEN;
270 hdr = NULL; /* default LLC/SNAP for IP */
271 }
272 if (!tb[TCA_ATM_EXCESS-1]) excess = NULL;
273 else {
274 if (RTA_PAYLOAD(tb[TCA_ATM_EXCESS-1]) != sizeof(u32))
275 return -EINVAL;
276 excess = (struct atm_flow_data *) atm_tc_get(sch,
277 *(u32 *) RTA_DATA(tb[TCA_ATM_EXCESS-1]));
278 if (!excess) return -ENOENT;
279 }
280 DPRINTK("atm_tc_change: type %d, payload %d, hdr_len %d\n",
281 opt->rta_type,RTA_PAYLOAD(opt),hdr_len);
282 if (!(sock = sockfd_lookup(fd,&error))) return error; /* f_count++ */
283 DPRINTK("atm_tc_change: f_count %d\n",file_count(sock->file));
284 if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) {
285 error = -EPROTOTYPE;
286 goto err_out;
287 }
288 /* @@@ should check if the socket is really operational or we'll crash
289 on vcc->send */
290 if (classid) {
291 if (TC_H_MAJ(classid ^ sch->handle)) {
292 DPRINTK("atm_tc_change: classid mismatch\n");
293 error = -EINVAL;
294 goto err_out;
295 }
296 if (find_flow(p,flow)) {
297 error = -EEXIST;
298 goto err_out;
299 }
300 }
301 else {
302 int i;
303 unsigned long cl;
304
305 for (i = 1; i < 0x8000; i++) {
306 classid = TC_H_MAKE(sch->handle,0x8000 | i);
307 if (!(cl = atm_tc_get(sch,classid))) break;
308 atm_tc_put(sch,cl);
309 }
310 }
311 DPRINTK("atm_tc_change: new id %x\n",classid);
312 flow = kmalloc(sizeof(struct atm_flow_data)+hdr_len,GFP_KERNEL);
313 DPRINTK("atm_tc_change: flow %p\n",flow);
314 if (!flow) {
315 error = -ENOBUFS;
316 goto err_out;
317 }
318 memset(flow,0,sizeof(*flow));
319 flow->filter_list = NULL;
320 if (!(flow->q = qdisc_create_dflt(sch->dev,&pfifo_qdisc_ops)))
321 flow->q = &noop_qdisc;
322 DPRINTK("atm_tc_change: qdisc %p\n",flow->q);
323 flow->sock = sock;
324 flow->vcc = ATM_SD(sock); /* speedup */
325 flow->vcc->user_back = flow;
326 DPRINTK("atm_tc_change: vcc %p\n",flow->vcc);
327 flow->old_pop = flow->vcc->pop;
328 flow->parent = p;
329 flow->vcc->pop = sch_atm_pop;
330 flow->classid = classid;
331 flow->ref = 1;
332 flow->excess = excess;
333 flow->next = p->link.next;
334 p->link.next = flow;
335 flow->hdr_len = hdr_len;
336 if (hdr)
337 memcpy(flow->hdr,hdr,hdr_len);
338 else
339 memcpy(flow->hdr,llc_oui_ip,sizeof(llc_oui_ip));
340 *arg = (unsigned long) flow;
341 return 0;
342err_out:
343 if (excess) atm_tc_put(sch,(unsigned long) excess);
344 sockfd_put(sock);
345 return error;
346}
347
348
349static int atm_tc_delete(struct Qdisc *sch,unsigned long arg)
350{
351 struct atm_qdisc_data *p = PRIV(sch);
352 struct atm_flow_data *flow = (struct atm_flow_data *) arg;
353
354 DPRINTK("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n",sch,p,flow);
355 if (!find_flow(PRIV(sch),flow)) return -EINVAL;
356 if (flow->filter_list || flow == &p->link) return -EBUSY;
357 /*
358 * Reference count must be 2: one for "keepalive" (set at class
359 * creation), and one for the reference held when calling delete.
360 */
361 if (flow->ref < 2) {
362 printk(KERN_ERR "atm_tc_delete: flow->ref == %d\n",flow->ref);
363 return -EINVAL;
364 }
365 if (flow->ref > 2) return -EBUSY; /* catch references via excess, etc.*/
366 atm_tc_put(sch,arg);
367 return 0;
368}
369
370
371static void atm_tc_walk(struct Qdisc *sch,struct qdisc_walker *walker)
372{
373 struct atm_qdisc_data *p = PRIV(sch);
374 struct atm_flow_data *flow;
375
376 DPRINTK("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker);
377 if (walker->stop) return;
378 for (flow = p->flows; flow; flow = flow->next) {
379 if (walker->count >= walker->skip)
380 if (walker->fn(sch,(unsigned long) flow,walker) < 0) {
381 walker->stop = 1;
382 break;
383 }
384 walker->count++;
385 }
386}
387
388
389static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch,unsigned long cl)
390{
391 struct atm_qdisc_data *p = PRIV(sch);
392 struct atm_flow_data *flow = (struct atm_flow_data *) cl;
393
394 DPRINTK("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n",sch,p,flow);
395 return flow ? &flow->filter_list : &p->link.filter_list;
396}
397
398
399/* --------------------------- Qdisc operations ---------------------------- */
400
401
402static int atm_tc_enqueue(struct sk_buff *skb,struct Qdisc *sch)
403{
404 struct atm_qdisc_data *p = PRIV(sch);
405 struct atm_flow_data *flow = NULL ; /* @@@ */
406 struct tcf_result res;
407 int result;
408 int ret = NET_XMIT_POLICED;
409
410 D2PRINTK("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p);
411 result = TC_POLICE_OK; /* be nice to gcc */
412 if (TC_H_MAJ(skb->priority) != sch->handle ||
413 !(flow = (struct atm_flow_data *) atm_tc_get(sch,skb->priority)))
414 for (flow = p->flows; flow; flow = flow->next)
415 if (flow->filter_list) {
416 result = tc_classify(skb,flow->filter_list,
417 &res);
418 if (result < 0) continue;
419 flow = (struct atm_flow_data *) res.class;
420 if (!flow) flow = lookup_flow(sch,res.classid);
421 break;
422 }
423 if (!flow) flow = &p->link;
424 else {
425 if (flow->vcc)
426 ATM_SKB(skb)->atm_options = flow->vcc->atm_options;
427 /*@@@ looks good ... but it's not supposed to work :-)*/
428#ifdef CONFIG_NET_CLS_POLICE
429 switch (result) {
430 case TC_POLICE_SHOT:
431 kfree_skb(skb);
432 break;
433 case TC_POLICE_RECLASSIFY:
434 if (flow->excess) flow = flow->excess;
435 else {
436 ATM_SKB(skb)->atm_options |=
437 ATM_ATMOPT_CLP;
438 break;
439 }
440 /* fall through */
441 case TC_POLICE_OK:
442 /* fall through */
443 default:
444 break;
445 }
446#endif
447 }
448 if (
449#ifdef CONFIG_NET_CLS_POLICE
450 result == TC_POLICE_SHOT ||
451#endif
452 (ret = flow->q->enqueue(skb,flow->q)) != 0) {
453 sch->qstats.drops++;
454 if (flow) flow->qstats.drops++;
455 return ret;
456 }
457 sch->bstats.bytes += skb->len;
458 sch->bstats.packets++;
459 flow->bstats.bytes += skb->len;
460 flow->bstats.packets++;
461 /*
462 * Okay, this may seem weird. We pretend we've dropped the packet if
463 * it goes via ATM. The reason for this is that the outer qdisc
464 * expects to be able to q->dequeue the packet later on if we return
465 * success at this place. Also, sch->q.qlen needs to reflect whether
466 * there is a packet eligible for dequeuing or not. Note that the
467 * statistics of the outer qdisc are necessarily wrong because of all
468 * this. There's currently no correct solution for this.
469 */
470 if (flow == &p->link) {
471 sch->q.qlen++;
472 return 0;
473 }
474 tasklet_schedule(&p->task);
475 return NET_XMIT_BYPASS;
476}
477
478
479/*
480 * Dequeue packets and send them over ATM. Note that we quite deliberately
481 * avoid checking net_device's flow control here, simply because sch_atm
482 * uses its own channels, which have nothing to do with any CLIP, LANE, or
483 * non-ATM interfaces.
484 */
485
486
487static void sch_atm_dequeue(unsigned long data)
488{
489 struct Qdisc *sch = (struct Qdisc *) data;
490 struct atm_qdisc_data *p = PRIV(sch);
491 struct atm_flow_data *flow;
492 struct sk_buff *skb;
493
494 D2PRINTK("sch_atm_dequeue(sch %p,[qdisc %p])\n",sch,p);
495 for (flow = p->link.next; flow; flow = flow->next)
496 /*
497 * If traffic is properly shaped, this won't generate nasty
498 * little bursts. Otherwise, it may ... (but that's okay)
499 */
500 while ((skb = flow->q->dequeue(flow->q))) {
501 if (!atm_may_send(flow->vcc,skb->truesize)) {
502 (void) flow->q->ops->requeue(skb,flow->q);
503 break;
504 }
505 D2PRINTK("atm_tc_dequeue: sending on class %p\n",flow);
506 /* remove any LL header somebody else has attached */
507 skb_pull(skb,(char *) skb->nh.iph-(char *) skb->data);
508 if (skb_headroom(skb) < flow->hdr_len) {
509 struct sk_buff *new;
510
511 new = skb_realloc_headroom(skb,flow->hdr_len);
512 dev_kfree_skb(skb);
513 if (!new) continue;
514 skb = new;
515 }
516 D2PRINTK("sch_atm_dequeue: ip %p, data %p\n",
517 skb->nh.iph,skb->data);
518 ATM_SKB(skb)->vcc = flow->vcc;
519 memcpy(skb_push(skb,flow->hdr_len),flow->hdr,
520 flow->hdr_len);
521 atomic_add(skb->truesize,
522 &sk_atm(flow->vcc)->sk_wmem_alloc);
523 /* atm.atm_options are already set by atm_tc_enqueue */
524 (void) flow->vcc->send(flow->vcc,skb);
525 }
526}
527
528
529static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch)
530{
531 struct atm_qdisc_data *p = PRIV(sch);
532 struct sk_buff *skb;
533
534 D2PRINTK("atm_tc_dequeue(sch %p,[qdisc %p])\n",sch,p);
535 tasklet_schedule(&p->task);
536 skb = p->link.q->dequeue(p->link.q);
537 if (skb) sch->q.qlen--;
538 return skb;
539}
540
541
542static int atm_tc_requeue(struct sk_buff *skb,struct Qdisc *sch)
543{
544 struct atm_qdisc_data *p = PRIV(sch);
545 int ret;
546
547 D2PRINTK("atm_tc_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p);
548 ret = p->link.q->ops->requeue(skb,p->link.q);
549 if (!ret) {
550 sch->q.qlen++;
551 sch->qstats.requeues++;
552 } else {
553 sch->qstats.drops++;
554 p->link.qstats.drops++;
555 }
556 return ret;
557}
558
559
560static unsigned int atm_tc_drop(struct Qdisc *sch)
561{
562 struct atm_qdisc_data *p = PRIV(sch);
563 struct atm_flow_data *flow;
564 unsigned int len;
565
566 DPRINTK("atm_tc_drop(sch %p,[qdisc %p])\n",sch,p);
567 for (flow = p->flows; flow; flow = flow->next)
568 if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q)))
569 return len;
570 return 0;
571}
572
573
574static int atm_tc_init(struct Qdisc *sch,struct rtattr *opt)
575{
576 struct atm_qdisc_data *p = PRIV(sch);
577
578 DPRINTK("atm_tc_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt);
579 p->flows = &p->link;
580 if(!(p->link.q = qdisc_create_dflt(sch->dev,&pfifo_qdisc_ops)))
581 p->link.q = &noop_qdisc;
582 DPRINTK("atm_tc_init: link (%p) qdisc %p\n",&p->link,p->link.q);
583 p->link.filter_list = NULL;
584 p->link.vcc = NULL;
585 p->link.sock = NULL;
586 p->link.classid = sch->handle;
587 p->link.ref = 1;
588 p->link.next = NULL;
589 tasklet_init(&p->task,sch_atm_dequeue,(unsigned long) sch);
590 return 0;
591}
592
593
594static void atm_tc_reset(struct Qdisc *sch)
595{
596 struct atm_qdisc_data *p = PRIV(sch);
597 struct atm_flow_data *flow;
598
599 DPRINTK("atm_tc_reset(sch %p,[qdisc %p])\n",sch,p);
600 for (flow = p->flows; flow; flow = flow->next) qdisc_reset(flow->q);
601 sch->q.qlen = 0;
602}
603
604
605static void atm_tc_destroy(struct Qdisc *sch)
606{
607 struct atm_qdisc_data *p = PRIV(sch);
608 struct atm_flow_data *flow;
609
610 DPRINTK("atm_tc_destroy(sch %p,[qdisc %p])\n",sch,p);
611 /* races ? */
612 while ((flow = p->flows)) {
613 destroy_filters(flow);
614 if (flow->ref > 1)
615 printk(KERN_ERR "atm_destroy: %p->ref = %d\n",flow,
616 flow->ref);
617 atm_tc_put(sch,(unsigned long) flow);
618 if (p->flows == flow) {
619 printk(KERN_ERR "atm_destroy: putting flow %p didn't "
620 "kill it\n",flow);
621 p->flows = flow->next; /* brute force */
622 break;
623 }
624 }
625 tasklet_kill(&p->task);
626}
627
628
629static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl,
630 struct sk_buff *skb, struct tcmsg *tcm)
631{
632 struct atm_qdisc_data *p = PRIV(sch);
633 struct atm_flow_data *flow = (struct atm_flow_data *) cl;
634 unsigned char *b = skb->tail;
635 struct rtattr *rta;
636
637 DPRINTK("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n",
638 sch,p,flow,skb,tcm);
639 if (!find_flow(p,flow)) return -EINVAL;
640 tcm->tcm_handle = flow->classid;
641 rta = (struct rtattr *) b;
642 RTA_PUT(skb,TCA_OPTIONS,0,NULL);
643 RTA_PUT(skb,TCA_ATM_HDR,flow->hdr_len,flow->hdr);
644 if (flow->vcc) {
645 struct sockaddr_atmpvc pvc;
646 int state;
647
648 pvc.sap_family = AF_ATMPVC;
649 pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1;
650 pvc.sap_addr.vpi = flow->vcc->vpi;
651 pvc.sap_addr.vci = flow->vcc->vci;
652 RTA_PUT(skb,TCA_ATM_ADDR,sizeof(pvc),&pvc);
653 state = ATM_VF2VS(flow->vcc->flags);
654 RTA_PUT(skb,TCA_ATM_STATE,sizeof(state),&state);
655 }
656 if (flow->excess)
657 RTA_PUT(skb,TCA_ATM_EXCESS,sizeof(u32),&flow->classid);
658 else {
659 static u32 zero;
660
661 RTA_PUT(skb,TCA_ATM_EXCESS,sizeof(zero),&zero);
662 }
663 rta->rta_len = skb->tail-b;
664 return skb->len;
665
666rtattr_failure:
667 skb_trim(skb,b-skb->data);
668 return -1;
669}
670static int
671atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
672 struct gnet_dump *d)
673{
674 struct atm_flow_data *flow = (struct atm_flow_data *) arg;
675
676 flow->qstats.qlen = flow->q->q.qlen;
677
678 if (gnet_stats_copy_basic(d, &flow->bstats) < 0 ||
679 gnet_stats_copy_queue(d, &flow->qstats) < 0)
680 return -1;
681
682 return 0;
683}
684
685static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb)
686{
687 return 0;
688}
689
690static struct Qdisc_class_ops atm_class_ops = {
691 .graft = atm_tc_graft,
692 .leaf = atm_tc_leaf,
693 .get = atm_tc_get,
694 .put = atm_tc_put,
695 .change = atm_tc_change,
696 .delete = atm_tc_delete,
697 .walk = atm_tc_walk,
698 .tcf_chain = atm_tc_find_tcf,
699 .bind_tcf = atm_tc_bind_filter,
700 .unbind_tcf = atm_tc_put,
701 .dump = atm_tc_dump_class,
702 .dump_stats = atm_tc_dump_class_stats,
703};
704
705static struct Qdisc_ops atm_qdisc_ops = {
706 .next = NULL,
707 .cl_ops = &atm_class_ops,
708 .id = "atm",
709 .priv_size = sizeof(struct atm_qdisc_data),
710 .enqueue = atm_tc_enqueue,
711 .dequeue = atm_tc_dequeue,
712 .requeue = atm_tc_requeue,
713 .drop = atm_tc_drop,
714 .init = atm_tc_init,
715 .reset = atm_tc_reset,
716 .destroy = atm_tc_destroy,
717 .change = NULL,
718 .dump = atm_tc_dump,
719 .owner = THIS_MODULE,
720};
721
722
723static int __init atm_init(void)
724{
725 return register_qdisc(&atm_qdisc_ops);
726}
727
728static void __exit atm_exit(void)
729{
730 unregister_qdisc(&atm_qdisc_ops);
731}
732
733module_init(atm_init)
734module_exit(atm_exit)
735MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
new file mode 100644
index 000000000000..d43e3b8cbf6a
--- /dev/null
+++ b/net/sched/sch_cbq.c
@@ -0,0 +1,2124 @@
1/*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 */
12
13#include <linux/config.h>
14#include <linux/module.h>
15#include <asm/uaccess.h>
16#include <asm/system.h>
17#include <linux/bitops.h>
18#include <linux/types.h>
19#include <linux/kernel.h>
20#include <linux/sched.h>
21#include <linux/string.h>
22#include <linux/mm.h>
23#include <linux/socket.h>
24#include <linux/sockios.h>
25#include <linux/in.h>
26#include <linux/errno.h>
27#include <linux/interrupt.h>
28#include <linux/if_ether.h>
29#include <linux/inet.h>
30#include <linux/netdevice.h>
31#include <linux/etherdevice.h>
32#include <linux/notifier.h>
33#include <net/ip.h>
34#include <net/route.h>
35#include <linux/skbuff.h>
36#include <net/sock.h>
37#include <net/pkt_sched.h>
38
39
40/* Class-Based Queueing (CBQ) algorithm.
41 =======================================
42
43 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
44 Management Models for Packet Networks",
45 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
46
47 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
48
49 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
50 Parameters", 1996
51
52 [4] Sally Floyd and Michael Speer, "Experimental Results
53 for Class-Based Queueing", 1998, not published.
54
55 -----------------------------------------------------------------------
56
57 Algorithm skeleton was taken from NS simulator cbq.cc.
58 If someone wants to check this code against the LBL version,
59 they should take into account that ONLY the skeleton was borrowed;
60 the implementation is different. In particular:
61
62 --- The WRR algorithm is different. Our version looks more
63 reasonable (I hope) and works when quanta are allowed to be
64 less than MTU, which is always the case when real-time classes
65 have small rates. Note that the statement of [3] is
66 incomplete: delay may actually be estimated even if the class
67 per-round allotment is less than MTU. Namely, if the per-round
68 allotment is W*r_i, and r_1+...+r_k = r < 1
69
70 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
71
72 In the worst case we have IntServ estimate with D = W*r+k*MTU
73 and C = MTU*r. The proof (if correct at all) is trivial.
74
75
76 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
77 interpret some places, which look like wrong translations
78 from NS. Anyone is advised to find these differences
79 and explain to me why I am wrong 8).
80
81 --- Linux has no EOI event, so we cannot estimate the true class
82 idle time. The workaround is to treat the next dequeue event
83 as a sign that the previous packet has finished. This is wrong because of
84 internal device queueing, but on a permanently loaded link it holds.
85 Moreover, combined with the clock integrator, this scheme comes
86 very close to an ideal solution. */
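/*
 * Editor's illustrative sketch (not part of the kernel source): a plain
 * userspace reading of the WRR delay bound quoted in the comment above,
 *
 *     delay_i <= (ceil(MTU/(W*r_i))*W*r + W*r + k*MTU) / B,
 *
 * assuming the brackets denote rounding up. All numbers below (MTU, W,
 * the r_i shares, and the link bandwidth B) are hypothetical and only
 * show the shape of the calculation; they are not taken from the code.
 */
#include <math.h>
#include <stdio.h>

static double wrr_delay_bound(double mtu, double W, double r_i,
			      double r_sum, int k, double B)
{
	double rounds = ceil(mtu / (W * r_i));	/* rounds to send one MTU */

	return (rounds * W * r_sum + W * r_sum + k * mtu) / B;
}

int main(void)
{
	/* hypothetical example: 3 classes sharing a 10 Mbit/s link */
	double B = 10e6 / 8;			/* bytes per second */
	double mtu = 1500, W = 1000;		/* bytes */
	double r[3] = { 0.5, 0.3, 0.1 };	/* rate shares, sum < 1 */
	double r_sum = r[0] + r[1] + r[2];
	int i;

	for (i = 0; i < 3; i++)
		printf("class %d: delay bound ~ %.1f ms\n", i,
		       1e3 * wrr_delay_bound(mtu, W, r[i], r_sum, 3, B));
	return 0;
}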
87
88struct cbq_sched_data;
89
90
91struct cbq_class
92{
93 struct cbq_class *next; /* hash table link */
94 struct cbq_class *next_alive; /* next class with backlog in this priority band */
95
96/* Parameters */
97 u32 classid;
98 unsigned char priority; /* class priority */
99 unsigned char priority2; /* priority to be used after overlimit */
100 unsigned char ewma_log; /* time constant for idle time calculation */
101 unsigned char ovl_strategy;
102#ifdef CONFIG_NET_CLS_POLICE
103 unsigned char police;
104#endif
105
106 u32 defmap;
107
108 /* Link-sharing scheduler parameters */
109 long maxidle; /* Class parameters: see below. */
110 long offtime;
111 long minidle;
112 u32 avpkt;
113 struct qdisc_rate_table *R_tab;
114
115 /* Overlimit strategy parameters */
116 void (*overlimit)(struct cbq_class *cl);
117 long penalty;
118
119 /* General scheduler (WRR) parameters */
120 long allot;
121 long quantum; /* Allotment per WRR round */
122 long weight; /* Relative allotment: see below */
123
124 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
125 struct cbq_class *split; /* Ptr to split node */
126 struct cbq_class *share; /* Ptr to LS parent in the class tree */
127 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
128 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
129 parent otherwise */
130 struct cbq_class *sibling; /* Sibling chain */
131 struct cbq_class *children; /* Pointer to children chain */
132
133 struct Qdisc *q; /* Elementary queueing discipline */
134
135
136/* Variables */
137 unsigned char cpriority; /* Effective priority */
138 unsigned char delayed;
139 unsigned char level; /* level of the class in hierarchy:
140 0 for leaf classes, and maximal
141 level of children + 1 for nodes.
142 */
143
144 psched_time_t last; /* Last end of service */
145 psched_time_t undertime;
146 long avgidle;
147 long deficit; /* Saved deficit for WRR */
148 unsigned long penalized;
149 struct gnet_stats_basic bstats;
150 struct gnet_stats_queue qstats;
151 struct gnet_stats_rate_est rate_est;
152 spinlock_t *stats_lock;
153 struct tc_cbq_xstats xstats;
154
155 struct tcf_proto *filter_list;
156
157 int refcnt;
158 int filters;
159
160 struct cbq_class *defaults[TC_PRIO_MAX+1];
161};
162
163struct cbq_sched_data
164{
165 struct cbq_class *classes[16]; /* Hash table of all classes */
166 int nclasses[TC_CBQ_MAXPRIO+1];
167 unsigned quanta[TC_CBQ_MAXPRIO+1];
168
169 struct cbq_class link;
170
171 unsigned activemask;
172 struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes
173 with backlog */
174
175#ifdef CONFIG_NET_CLS_POLICE
176 struct cbq_class *rx_class;
177#endif
178 struct cbq_class *tx_class;
179 struct cbq_class *tx_borrowed;
180 int tx_len;
181 psched_time_t now; /* Cached timestamp */
182 psched_time_t now_rt; /* Cached real time */
183 unsigned pmask;
184
185 struct timer_list delay_timer;
186 struct timer_list wd_timer; /* Watchdog timer,
187 started when CBQ has
188 backlog, but cannot
189 transmit just now */
190 long wd_expires;
191 int toplevel;
192 u32 hgenerator;
193};
194
195
196#define L2T(cl,len) ((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log])
197
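/*
 * Editor's illustrative sketch (not part of the kernel source): what the
 * L2T() lookup above does conceptually. A rate table maps a packet
 * length to its transmission time by indexing a precomputed array with
 * (len >> cell_log). The toy table below is filled here by hand purely
 * for illustration; in the real system the table contents come from
 * userspace (tc), and the time units are scheduler ticks, not the
 * microseconds used here.
 */
#include <stdio.h>

#define TOY_CELL_LOG	3		/* one cell = 8 bytes (hypothetical) */
#define TOY_CELLS	256

static unsigned int toy_rtab[TOY_CELLS];	/* usec to send each size bucket */

static void toy_fill_rtab(unsigned int bytes_per_sec)
{
	int i;

	for (i = 0; i < TOY_CELLS; i++) {
		unsigned int len = (i + 1) << TOY_CELL_LOG;

		toy_rtab[i] = (unsigned long long)len * 1000000 / bytes_per_sec;
	}
}

/* toy counterpart of L2T(cl, len) */
static unsigned int toy_l2t(unsigned int len)
{
	return toy_rtab[len >> TOY_CELL_LOG];
}

int main(void)
{
	toy_fill_rtab(125000);			/* 1 Mbit/s */
	printf("1500-byte packet: ~%u us\n", toy_l2t(1500));
	return 0;
}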
198
199static __inline__ unsigned cbq_hash(u32 h)
200{
201 h ^= h>>8;
202 h ^= h>>4;
203 return h&0xF;
204}
205
206static __inline__ struct cbq_class *
207cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
208{
209 struct cbq_class *cl;
210
211 for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next)
212 if (cl->classid == classid)
213 return cl;
214 return NULL;
215}
216
217#ifdef CONFIG_NET_CLS_POLICE
218
219static struct cbq_class *
220cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
221{
222 struct cbq_class *cl, *new;
223
224 for (cl = this->tparent; cl; cl = cl->tparent)
225 if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
226 return new;
227
228 return NULL;
229}
230
231#endif
232
233/* Classify packet. The procedure is pretty complicated, but
234 it allows us to combine link sharing and priority scheduling
235 transparently.
236
237 Namely, you can put link-sharing rules (e.g. route based) at the root of CBQ,
238 so that it resolves to split nodes. Then packets are classified
239 by logical priority, or a more specific classifier may be attached
240 to the split node.
241 */
242
243static struct cbq_class *
244cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
245{
246 struct cbq_sched_data *q = qdisc_priv(sch);
247 struct cbq_class *head = &q->link;
248 struct cbq_class **defmap;
249 struct cbq_class *cl = NULL;
250 u32 prio = skb->priority;
251 struct tcf_result res;
252
253 /*
254 * Step 1. If skb->priority points to one of our classes, use it.
255 */
256 if (TC_H_MAJ(prio^sch->handle) == 0 &&
257 (cl = cbq_class_lookup(q, prio)) != NULL)
258 return cl;
259
260 *qerr = NET_XMIT_DROP;
261 for (;;) {
262 int result = 0;
263 defmap = head->defaults;
264
265 /*
266 * Step 2+n. Apply classifier.
267 */
268 if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0)
269 goto fallback;
270
271 if ((cl = (void*)res.class) == NULL) {
272 if (TC_H_MAJ(res.classid))
273 cl = cbq_class_lookup(q, res.classid);
274 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
275 cl = defmap[TC_PRIO_BESTEFFORT];
276
277 if (cl == NULL || cl->level >= head->level)
278 goto fallback;
279 }
280
281#ifdef CONFIG_NET_CLS_ACT
282 switch (result) {
283 case TC_ACT_QUEUED:
284 case TC_ACT_STOLEN:
285 *qerr = NET_XMIT_SUCCESS;
286 case TC_ACT_SHOT:
287 return NULL;
288 }
289#elif defined(CONFIG_NET_CLS_POLICE)
290 switch (result) {
291 case TC_POLICE_RECLASSIFY:
292 return cbq_reclassify(skb, cl);
293 case TC_POLICE_SHOT:
294 return NULL;
295 default:
296 break;
297 }
298#endif
299 if (cl->level == 0)
300 return cl;
301
302 /*
303 * Step 3+n. If the classifier selected a link-sharing class,
304 * apply that class's own classifier.
305 * Repeat this procedure until we hit a leaf node.
306 */
307 head = cl;
308 }
309
310fallback:
311 cl = head;
312
313 /*
314 * Step 4. No success...
315 */
316 if (TC_H_MAJ(prio) == 0 &&
317 !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
318 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
319 return head;
320
321 return cl;
322}
323
324/*
325 A packet has just been enqueued on an empty class.
326 cbq_activate_class adds the class to the tail of the active class list
327 of its priority band.
328 */
329
330static __inline__ void cbq_activate_class(struct cbq_class *cl)
331{
332 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
333 int prio = cl->cpriority;
334 struct cbq_class *cl_tail;
335
336 cl_tail = q->active[prio];
337 q->active[prio] = cl;
338
339 if (cl_tail != NULL) {
340 cl->next_alive = cl_tail->next_alive;
341 cl_tail->next_alive = cl;
342 } else {
343 cl->next_alive = cl;
344 q->activemask |= (1<<prio);
345 }
346}
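/*
 * Editor's illustrative sketch (not part of the kernel source): the
 * convention used by the active lists above. q->active[prio] points at
 * the TAIL of a circular singly linked list, so tail->next_alive is the
 * head and the round-robin walk starts there. The toy types below are
 * invented for the demo only.
 */
#include <stdio.h>

struct toy_class {
	int id;
	struct toy_class *next_alive;
};

/* append at the tail of a circular list whose tail pointer is *tailp */
static void toy_activate(struct toy_class **tailp, struct toy_class *cl)
{
	struct toy_class *tail = *tailp;

	if (tail) {
		cl->next_alive = tail->next_alive;	/* new node points at head */
		tail->next_alive = cl;
	} else {
		cl->next_alive = cl;			/* single-element ring */
	}
	*tailp = cl;					/* new node becomes the tail */
}

int main(void)
{
	struct toy_class a = { 1 }, b = { 2 }, c = { 3 };
	struct toy_class *tail = NULL, *p;
	int i;

	toy_activate(&tail, &a);
	toy_activate(&tail, &b);
	toy_activate(&tail, &c);

	/* walk one full round starting from the head (tail->next_alive) */
	for (i = 0, p = tail->next_alive; i < 3; i++, p = p->next_alive)
		printf("%d ", p->id);		/* prints: 1 2 3 */
	printf("\n");
	return 0;
}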
347
348/*
349 Unlink the class from the active chain.
350 Note that the same thing is also done directly in cbq_dequeue*
351 during the round-robin procedure.
352 */
353
354static void cbq_deactivate_class(struct cbq_class *this)
355{
356 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
357 int prio = this->cpriority;
358 struct cbq_class *cl;
359 struct cbq_class *cl_prev = q->active[prio];
360
361 do {
362 cl = cl_prev->next_alive;
363 if (cl == this) {
364 cl_prev->next_alive = cl->next_alive;
365 cl->next_alive = NULL;
366
367 if (cl == q->active[prio]) {
368 q->active[prio] = cl_prev;
369 if (cl == q->active[prio]) {
370 q->active[prio] = NULL;
371 q->activemask &= ~(1<<prio);
372 return;
373 }
374 }
375
376 cl = cl_prev->next_alive;
377 return;
378 }
379 } while ((cl_prev = cl) != q->active[prio]);
380}
381
382static void
383cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
384{
385 int toplevel = q->toplevel;
386
387 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
388 psched_time_t now;
389 psched_tdiff_t incr;
390
391 PSCHED_GET_TIME(now);
392 incr = PSCHED_TDIFF(now, q->now_rt);
393 PSCHED_TADD2(q->now, incr, now);
394
395 do {
396 if (PSCHED_TLESS(cl->undertime, now)) {
397 q->toplevel = cl->level;
398 return;
399 }
400 } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
401 }
402}
403
404static int
405cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
406{
407 struct cbq_sched_data *q = qdisc_priv(sch);
408 int len = skb->len;
409 int ret;
410 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
411
412#ifdef CONFIG_NET_CLS_POLICE
413 q->rx_class = cl;
414#endif
415 if (cl == NULL) {
416 if (ret == NET_XMIT_DROP)
417 sch->qstats.drops++;
418 kfree_skb(skb);
419 return ret;
420 }
421
422#ifdef CONFIG_NET_CLS_POLICE
423 cl->q->__parent = sch;
424#endif
425 if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) {
426 sch->q.qlen++;
427 sch->bstats.packets++;
428 sch->bstats.bytes+=len;
429 cbq_mark_toplevel(q, cl);
430 if (!cl->next_alive)
431 cbq_activate_class(cl);
432 return ret;
433 }
434
435 sch->qstats.drops++;
436 cbq_mark_toplevel(q, cl);
437 cl->qstats.drops++;
438 return ret;
439}
440
441static int
442cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
443{
444 struct cbq_sched_data *q = qdisc_priv(sch);
445 struct cbq_class *cl;
446 int ret;
447
448 if ((cl = q->tx_class) == NULL) {
449 kfree_skb(skb);
450 sch->qstats.drops++;
451 return NET_XMIT_CN;
452 }
453 q->tx_class = NULL;
454
455 cbq_mark_toplevel(q, cl);
456
457#ifdef CONFIG_NET_CLS_POLICE
458 q->rx_class = cl;
459 cl->q->__parent = sch;
460#endif
461 if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
462 sch->q.qlen++;
463 sch->qstats.requeues++;
464 if (!cl->next_alive)
465 cbq_activate_class(cl);
466 return 0;
467 }
468 sch->qstats.drops++;
469 cl->qstats.drops++;
470 return ret;
471}
472
473/* Overlimit actions */
474
475/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
476
477static void cbq_ovl_classic(struct cbq_class *cl)
478{
479 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
480 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
481
482 if (!cl->delayed) {
483 delay += cl->offtime;
484
485 /*
486 The class goes to sleep, so it will have no
487 chance to work off its avgidle. Let's forgive it 8)
488
489 BTW cbq-2.0 has a bug in this
490 place; apparently they forgot to shift it by cl->ewma_log.
491 */
492 if (cl->avgidle < 0)
493 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
494 if (cl->avgidle < cl->minidle)
495 cl->avgidle = cl->minidle;
496 if (delay <= 0)
497 delay = 1;
498 PSCHED_TADD2(q->now, delay, cl->undertime);
499
500 cl->xstats.overactions++;
501 cl->delayed = 1;
502 }
503 if (q->wd_expires == 0 || q->wd_expires > delay)
504 q->wd_expires = delay;
505
506 /* Dirty work! We must schedule wakeups based on the
507 real available rate, rather than the leaf rate,
508 which may be tiny (even zero).
509 */
510 if (q->toplevel == TC_CBQ_MAXLEVEL) {
511 struct cbq_class *b;
512 psched_tdiff_t base_delay = q->wd_expires;
513
514 for (b = cl->borrow; b; b = b->borrow) {
515 delay = PSCHED_TDIFF(b->undertime, q->now);
516 if (delay < base_delay) {
517 if (delay <= 0)
518 delay = 1;
519 base_delay = delay;
520 }
521 }
522
523 q->wd_expires = base_delay;
524 }
525}
526
527/* TC_CBQ_OVL_RCLASSIC: penalize classes in the hierarchy by offtime when
528 they go overlimit
529 */
530
531static void cbq_ovl_rclassic(struct cbq_class *cl)
532{
533 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
534 struct cbq_class *this = cl;
535
536 do {
537 if (cl->level > q->toplevel) {
538 cl = NULL;
539 break;
540 }
541 } while ((cl = cl->borrow) != NULL);
542
543 if (cl == NULL)
544 cl = this;
545 cbq_ovl_classic(cl);
546}
547
548/* TC_CBQ_OVL_DELAY: delay until the class goes underlimit */
549
550static void cbq_ovl_delay(struct cbq_class *cl)
551{
552 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
553 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
554
555 if (!cl->delayed) {
556 unsigned long sched = jiffies;
557
558 delay += cl->offtime;
559 if (cl->avgidle < 0)
560 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
561 if (cl->avgidle < cl->minidle)
562 cl->avgidle = cl->minidle;
563 PSCHED_TADD2(q->now, delay, cl->undertime);
564
565 if (delay > 0) {
566 sched += PSCHED_US2JIFFIE(delay) + cl->penalty;
567 cl->penalized = sched;
568 cl->cpriority = TC_CBQ_MAXPRIO;
569 q->pmask |= (1<<TC_CBQ_MAXPRIO);
570 if (del_timer(&q->delay_timer) &&
571 (long)(q->delay_timer.expires - sched) > 0)
572 q->delay_timer.expires = sched;
573 add_timer(&q->delay_timer);
574 cl->delayed = 1;
575 cl->xstats.overactions++;
576 return;
577 }
578 delay = 1;
579 }
580 if (q->wd_expires == 0 || q->wd_expires > delay)
581 q->wd_expires = delay;
582}
583
584/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
585
586static void cbq_ovl_lowprio(struct cbq_class *cl)
587{
588 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
589
590 cl->penalized = jiffies + cl->penalty;
591
592 if (cl->cpriority != cl->priority2) {
593 cl->cpriority = cl->priority2;
594 q->pmask |= (1<<cl->cpriority);
595 cl->xstats.overactions++;
596 }
597 cbq_ovl_classic(cl);
598}
599
600/* TC_CBQ_OVL_DROP: penalize class by dropping */
601
602static void cbq_ovl_drop(struct cbq_class *cl)
603{
604 if (cl->q->ops->drop)
605 if (cl->q->ops->drop(cl->q))
606 cl->qdisc->q.qlen--;
607 cl->xstats.overactions++;
608 cbq_ovl_classic(cl);
609}
610
611static void cbq_watchdog(unsigned long arg)
612{
613 struct Qdisc *sch = (struct Qdisc*)arg;
614
615 sch->flags &= ~TCQ_F_THROTTLED;
616 netif_schedule(sch->dev);
617}
618
619static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio)
620{
621 struct cbq_class *cl;
622 struct cbq_class *cl_prev = q->active[prio];
623 unsigned long now = jiffies;
624 unsigned long sched = now;
625
626 if (cl_prev == NULL)
627 return now;
628
629 do {
630 cl = cl_prev->next_alive;
631 if ((long)(now - cl->penalized) > 0) {
632 cl_prev->next_alive = cl->next_alive;
633 cl->next_alive = NULL;
634 cl->cpriority = cl->priority;
635 cl->delayed = 0;
636 cbq_activate_class(cl);
637
638 if (cl == q->active[prio]) {
639 q->active[prio] = cl_prev;
640 if (cl == q->active[prio]) {
641 q->active[prio] = NULL;
642 return 0;
643 }
644 }
645
646 cl = cl_prev->next_alive;
647 } else if ((long)(sched - cl->penalized) > 0)
648 sched = cl->penalized;
649 } while ((cl_prev = cl) != q->active[prio]);
650
651 return (long)(sched - now);
652}
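/*
 * Editor's illustrative sketch (not part of the kernel source): why the
 * comparisons above are written as (long)(now - cl->penalized) > 0
 * rather than now > cl->penalized. The signed difference keeps working
 * when the jiffies counter wraps around. Standalone toy; a 32-bit
 * counter is assumed for the demo.
 */
#include <stdio.h>
#include <stdint.h>

static int toy_time_after(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) > 0;	/* true if a is later than b */
}

int main(void)
{
	uint32_t before_wrap = 0xFFFFFFF0u;	/* just before the counter wraps */
	uint32_t after_wrap  = 0x00000010u;	/* shortly after the wrap */

	/* naive comparison gets this wrong: after_wrap < before_wrap */
	printf("naive:  %d\n", after_wrap > before_wrap);		 /* 0 */
	printf("signed: %d\n", toy_time_after(after_wrap, before_wrap)); /* 1 */
	return 0;
}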
653
654static void cbq_undelay(unsigned long arg)
655{
656 struct Qdisc *sch = (struct Qdisc*)arg;
657 struct cbq_sched_data *q = qdisc_priv(sch);
658 long delay = 0;
659 unsigned pmask;
660
661 pmask = q->pmask;
662 q->pmask = 0;
663
664 while (pmask) {
665 int prio = ffz(~pmask);
666 long tmp;
667
668 pmask &= ~(1<<prio);
669
670 tmp = cbq_undelay_prio(q, prio);
671 if (tmp > 0) {
672 q->pmask |= 1<<prio;
673 if (tmp < delay || delay == 0)
674 delay = tmp;
675 }
676 }
677
678 if (delay) {
679 q->delay_timer.expires = jiffies + delay;
680 add_timer(&q->delay_timer);
681 }
682
683 sch->flags &= ~TCQ_F_THROTTLED;
684 netif_schedule(sch->dev);
685}
686
687
688#ifdef CONFIG_NET_CLS_POLICE
689
690static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
691{
692 int len = skb->len;
693 struct Qdisc *sch = child->__parent;
694 struct cbq_sched_data *q = qdisc_priv(sch);
695 struct cbq_class *cl = q->rx_class;
696
697 q->rx_class = NULL;
698
699 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
700
701 cbq_mark_toplevel(q, cl);
702
703 q->rx_class = cl;
704 cl->q->__parent = sch;
705
706 if (cl->q->enqueue(skb, cl->q) == 0) {
707 sch->q.qlen++;
708 sch->bstats.packets++;
709 sch->bstats.bytes+=len;
710 if (!cl->next_alive)
711 cbq_activate_class(cl);
712 return 0;
713 }
714 sch->qstats.drops++;
715 return 0;
716 }
717
718 sch->qstats.drops++;
719 return -1;
720}
721#endif
722
723/*
724 This is a mission-critical procedure.
725
726 We "regenerate" the toplevel cutoff if the transmitting class
727 has backlog and is not regulated. This is not part of the
728 original CBQ description, but looks more reasonable.
729 Probably it is wrong. This question needs further investigation.
730*/
731
732static __inline__ void
733cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
734 struct cbq_class *borrowed)
735{
736 if (cl && q->toplevel >= borrowed->level) {
737 if (cl->q->q.qlen > 1) {
738 do {
739 if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) {
740 q->toplevel = borrowed->level;
741 return;
742 }
743 } while ((borrowed=borrowed->borrow) != NULL);
744 }
745#if 0
746 /* It is not necessary now. Uncommenting it
747 will save CPU cycles, but decrease fairness.
748 */
749 q->toplevel = TC_CBQ_MAXLEVEL;
750#endif
751 }
752}
753
754static void
755cbq_update(struct cbq_sched_data *q)
756{
757 struct cbq_class *this = q->tx_class;
758 struct cbq_class *cl = this;
759 int len = q->tx_len;
760
761 q->tx_class = NULL;
762
763 for ( ; cl; cl = cl->share) {
764 long avgidle = cl->avgidle;
765 long idle;
766
767 cl->bstats.packets++;
768 cl->bstats.bytes += len;
769
770 /*
771 (now - last) is the total time between packet right edges.
772 (last_pktlen/rate) is the "virtual" busy time, so that
773
774 idle = (now - last) - last_pktlen/rate
775 */
776
777 idle = PSCHED_TDIFF(q->now, cl->last);
778 if ((unsigned long)idle > 128*1024*1024) {
779 avgidle = cl->maxidle;
780 } else {
781 idle -= L2T(cl, len);
782
783 /* true_avgidle := (1-W)*true_avgidle + W*idle,
784 where W=2^{-ewma_log}. But cl->avgidle is scaled:
785 cl->avgidle == true_avgidle/W,
786 hence:
787 */
788 avgidle += idle - (avgidle>>cl->ewma_log);
789 }
790
791 if (avgidle <= 0) {
792 /* Overlimit or at-limit */
793
794 if (avgidle < cl->minidle)
795 avgidle = cl->minidle;
796
797 cl->avgidle = avgidle;
798
799 /* Calculate the expected time when this class
800 will be allowed to send.
801 That will occur when:
802 (1-W)*true_avgidle + W*delay = 0, i.e.
803 idle = (1/W - 1)*(-true_avgidle)
804 or
805 idle = (1 - W)*(-cl->avgidle);
806 */
807 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
808
809 /*
810 That is not all.
811 To maintain the rate allocated to the class,
812 we add to undertime the virtual clock time
813 needed to complete the transmitted packet.
814 (len/phys_bandwidth has already elapsed
815 by the time cbq_update runs.)
816 */
817
818 idle -= L2T(&q->link, len);
819 idle += L2T(cl, len);
820
821 PSCHED_AUDIT_TDIFF(idle);
822
823 PSCHED_TADD2(q->now, idle, cl->undertime);
824 } else {
825 /* Underlimit */
826
827 PSCHED_SET_PASTPERFECT(cl->undertime);
828 if (avgidle > cl->maxidle)
829 cl->avgidle = cl->maxidle;
830 else
831 cl->avgidle = avgidle;
832 }
833 cl->last = q->now;
834 }
835
836 cbq_update_toplevel(q, this, q->tx_borrowed);
837}
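/*
 * Editor's illustrative sketch (not part of the kernel source): the
 * scaled EWMA update used in cbq_update() above. With W = 2^-ewma_log,
 * the true average obeys  true_avgidle = (1-W)*true_avgidle + W*idle,
 * and the code keeps avgidle == true_avgidle / W, so the update becomes
 * avgidle += idle - (avgidle >> ewma_log). The toy below checks that the
 * two forms track each other; the sample idle values are made up.
 */
#include <stdio.h>

int main(void)
{
	const int ewma_log = 5;			/* W = 1/32 */
	const double W = 1.0 / 32.0;
	long scaled = 0;			/* cl->avgidle analogue */
	double true_avg = 0.0;			/* unscaled EWMA */
	long samples[] = { 400, -150, 90, 1200, -300 };	/* made-up idle times */
	unsigned i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		long idle = samples[i];

		scaled += idle - (scaled >> ewma_log);	/* kernel-style update */
		true_avg = (1.0 - W) * true_avg + W * idle;
		printf("step %u: scaled*W = %.1f  true = %.1f\n",
		       i, scaled * W, true_avg);
	}
	return 0;
}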
838
839static __inline__ struct cbq_class *
840cbq_under_limit(struct cbq_class *cl)
841{
842 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
843 struct cbq_class *this_cl = cl;
844
845 if (cl->tparent == NULL)
846 return cl;
847
848 if (PSCHED_IS_PASTPERFECT(cl->undertime) ||
849 !PSCHED_TLESS(q->now, cl->undertime)) {
850 cl->delayed = 0;
851 return cl;
852 }
853
854 do {
855 /* This is a very suspicious place. Currently the overlimit
856 action is generated for non-bounded classes
857 only if the link is completely congested.
858 Though this agrees with the ancestor-only paradigm,
859 it looks very stupid. In particular,
860 it means that this chunk of code will either
861 never be called or will strongly amplify
862 burstiness. Dangerous, silly, and yet
863 no other solution exists.
864 */
865 if ((cl = cl->borrow) == NULL) {
866 this_cl->qstats.overlimits++;
867 this_cl->overlimit(this_cl);
868 return NULL;
869 }
870 if (cl->level > q->toplevel)
871 return NULL;
872 } while (!PSCHED_IS_PASTPERFECT(cl->undertime) &&
873 PSCHED_TLESS(q->now, cl->undertime));
874
875 cl->delayed = 0;
876 return cl;
877}
878
879static __inline__ struct sk_buff *
880cbq_dequeue_prio(struct Qdisc *sch, int prio)
881{
882 struct cbq_sched_data *q = qdisc_priv(sch);
883 struct cbq_class *cl_tail, *cl_prev, *cl;
884 struct sk_buff *skb;
885 int deficit;
886
887 cl_tail = cl_prev = q->active[prio];
888 cl = cl_prev->next_alive;
889
890 do {
891 deficit = 0;
892
893 /* Start round */
894 do {
895 struct cbq_class *borrow = cl;
896
897 if (cl->q->q.qlen &&
898 (borrow = cbq_under_limit(cl)) == NULL)
899 goto skip_class;
900
901 if (cl->deficit <= 0) {
902 /* Class exhausted its allotment per
903 this round. Switch to the next one.
904 */
905 deficit = 1;
906 cl->deficit += cl->quantum;
907 goto next_class;
908 }
909
910 skb = cl->q->dequeue(cl->q);
911
912 /* The class did not give us any skb :-(
913 This can occur even if cl->q->q.qlen != 0,
914 e.g. if cl->q == "tbf"
915 */
916 if (skb == NULL)
917 goto skip_class;
918
919 cl->deficit -= skb->len;
920 q->tx_class = cl;
921 q->tx_borrowed = borrow;
922 if (borrow != cl) {
923#ifndef CBQ_XSTATS_BORROWS_BYTES
924 borrow->xstats.borrows++;
925 cl->xstats.borrows++;
926#else
927 borrow->xstats.borrows += skb->len;
928 cl->xstats.borrows += skb->len;
929#endif
930 }
931 q->tx_len = skb->len;
932
933 if (cl->deficit <= 0) {
934 q->active[prio] = cl;
935 cl = cl->next_alive;
936 cl->deficit += cl->quantum;
937 }
938 return skb;
939
940skip_class:
941 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
942 /* Class is empty or penalized.
943 Unlink it from active chain.
944 */
945 cl_prev->next_alive = cl->next_alive;
946 cl->next_alive = NULL;
947
948 /* Did cl_tail point to it? */
949 if (cl == cl_tail) {
950 /* Repair it! */
951 cl_tail = cl_prev;
952
953 /* Was it the last class in this band? */
954 if (cl == cl_tail) {
955 /* Kill the band! */
956 q->active[prio] = NULL;
957 q->activemask &= ~(1<<prio);
958 if (cl->q->q.qlen)
959 cbq_activate_class(cl);
960 return NULL;
961 }
962
963 q->active[prio] = cl_tail;
964 }
965 if (cl->q->q.qlen)
966 cbq_activate_class(cl);
967
968 cl = cl_prev;
969 }
970
971next_class:
972 cl_prev = cl;
973 cl = cl->next_alive;
974 } while (cl_prev != cl_tail);
975 } while (deficit);
976
977 q->active[prio] = cl_prev;
978
979 return NULL;
980}
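/*
 * Editor's illustrative sketch (not part of the kernel source): a
 * simplified deficit round robin in the spirit of the quantum/deficit
 * bookkeeping in cbq_dequeue_prio() above. A class may send while its
 * deficit is positive; when the deficit is exhausted it is topped up by
 * the quantum and the round moves on. Unlike the kernel code, this toy
 * batches a full round per class (the kernel returns after every packet)
 * and ignores borrowing, penalties and empty-class handling; the packet
 * sizes are made up.
 */
#include <stdio.h>

#define NCLASSES 2

int main(void)
{
	long quantum[NCLASSES] = { 1500, 1500 };
	long deficit[NCLASSES] = { 1500, 1500 };
	/* made-up backlogs: class 0 sends big packets, class 1 small ones */
	long pktlen[NCLASSES] = { 1400, 200 };
	long sent[NCLASSES] = { 0, 0 };
	int i, round;

	for (round = 0; round < 8; round++) {
		for (i = 0; i < NCLASSES; i++) {
			while (deficit[i] > 0) {	/* class may transmit */
				deficit[i] -= pktlen[i];
				sent[i] += pktlen[i];
			}
			deficit[i] += quantum[i];	/* top up for next round */
		}
	}
	/* despite very different packet sizes, the byte totals come out close */
	for (i = 0; i < NCLASSES; i++)
		printf("class %d sent %ld bytes\n", i, sent[i]);
	return 0;
}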
981
982static __inline__ struct sk_buff *
983cbq_dequeue_1(struct Qdisc *sch)
984{
985 struct cbq_sched_data *q = qdisc_priv(sch);
986 struct sk_buff *skb;
987 unsigned activemask;
988
989 activemask = q->activemask&0xFF;
990 while (activemask) {
991 int prio = ffz(~activemask);
992 activemask &= ~(1<<prio);
993 skb = cbq_dequeue_prio(sch, prio);
994 if (skb)
995 return skb;
996 }
997 return NULL;
998}
999
1000static struct sk_buff *
1001cbq_dequeue(struct Qdisc *sch)
1002{
1003 struct sk_buff *skb;
1004 struct cbq_sched_data *q = qdisc_priv(sch);
1005 psched_time_t now;
1006 psched_tdiff_t incr;
1007
1008 PSCHED_GET_TIME(now);
1009 incr = PSCHED_TDIFF(now, q->now_rt);
1010
1011 if (q->tx_class) {
1012 psched_tdiff_t incr2;
1013 /* Time integrator. We calculate the EOS time
1014 by adding the expected packet transmission time.
1015 If real time is greater, we warp the artificial clock,
1016 so that:
1017
1018 cbq_time = max(real_time, work);
1019 */
1020 incr2 = L2T(&q->link, q->tx_len);
1021 PSCHED_TADD(q->now, incr2);
1022 cbq_update(q);
1023 if ((incr -= incr2) < 0)
1024 incr = 0;
1025 }
1026 PSCHED_TADD(q->now, incr);
1027 q->now_rt = now;
1028
1029 for (;;) {
1030 q->wd_expires = 0;
1031
1032 skb = cbq_dequeue_1(sch);
1033 if (skb) {
1034 sch->q.qlen--;
1035 sch->flags &= ~TCQ_F_THROTTLED;
1036 return skb;
1037 }
1038
1039 /* All the classes are overlimit.
1040
1041 This is possible if:
1042
1043 1. The scheduler is empty.
1044 2. The toplevel cutoff inhibited borrowing.
1045 3. The root class is overlimit.
1046
1047 Reset the 2nd and 3rd conditions and retry.
1048
1049 Note that NS and cbq-2.0 are buggy here: peeking at
1050 an arbitrary class is appropriate for ancestor-only
1051 sharing, but not for the toplevel algorithm.
1052
1053 Our version is better but slower, because it requires
1054 two passes; that is unavoidable with top-level sharing.
1055 */
1056
1057 if (q->toplevel == TC_CBQ_MAXLEVEL &&
1058 PSCHED_IS_PASTPERFECT(q->link.undertime))
1059 break;
1060
1061 q->toplevel = TC_CBQ_MAXLEVEL;
1062 PSCHED_SET_PASTPERFECT(q->link.undertime);
1063 }
1064
1065 /* No packets in the scheduler, or nobody wants to give them to us :-(
1066 Sigh... start the watchdog timer in the latter case. */
1067
1068 if (sch->q.qlen) {
1069 sch->qstats.overlimits++;
1070 if (q->wd_expires) {
1071 long delay = PSCHED_US2JIFFIE(q->wd_expires);
1072 if (delay <= 0)
1073 delay = 1;
1074 mod_timer(&q->wd_timer, jiffies + delay);
1075 sch->flags |= TCQ_F_THROTTLED;
1076 }
1077 }
1078 return NULL;
1079}
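/*
 * Editor's illustrative sketch (not part of the kernel source): the
 * "time integrator" idea from the comment in cbq_dequeue() above. The
 * scheduler clock advances by the expected transmission time of the
 * previous packet, but is never allowed to fall behind real time, i.e.
 * cbq_time = max(real_time, cbq_time + expected_tx_time). Standalone toy
 * with made-up numbers.
 */
#include <stdio.h>

static long integrate_clock(long cbq_time, long real_time, long exp_tx_time)
{
	long work = cbq_time + exp_tx_time;

	return work > real_time ? work : real_time;	/* max(real, work) */
}

int main(void)
{
	long cbq_time = 0, real_time = 0;
	/* made-up (real elapsed, expected tx) pairs in microseconds */
	long steps[][2] = { {120, 100}, {90, 100}, {500, 100} };
	unsigned i;

	for (i = 0; i < sizeof(steps) / sizeof(steps[0]); i++) {
		real_time += steps[i][0];
		cbq_time = integrate_clock(cbq_time, real_time, steps[i][1]);
		printf("real=%ld cbq=%ld\n", real_time, cbq_time);
	}
	return 0;
}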
1080
1081/* CBQ class maintenance routines */
1082
1083static void cbq_adjust_levels(struct cbq_class *this)
1084{
1085 if (this == NULL)
1086 return;
1087
1088 do {
1089 int level = 0;
1090 struct cbq_class *cl;
1091
1092 if ((cl = this->children) != NULL) {
1093 do {
1094 if (cl->level > level)
1095 level = cl->level;
1096 } while ((cl = cl->sibling) != this->children);
1097 }
1098 this->level = level+1;
1099 } while ((this = this->tparent) != NULL);
1100}
1101
1102static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1103{
1104 struct cbq_class *cl;
1105 unsigned h;
1106
1107 if (q->quanta[prio] == 0)
1108 return;
1109
1110 for (h=0; h<16; h++) {
1111 for (cl = q->classes[h]; cl; cl = cl->next) {
1112 /* BUGGGG... Beware! This expression suffers from
1113 arithmetic overflow!
1114 */
1115 if (cl->priority == prio) {
1116 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1117 q->quanta[prio];
1118 }
1119 if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) {
1120 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum);
1121 cl->quantum = cl->qdisc->dev->mtu/2 + 1;
1122 }
1123 }
1124 }
1125}
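/*
 * Editor's illustrative sketch (not part of the kernel source): one way
 * to sidestep the arithmetic overflow that the comment in
 * cbq_normalize_quanta() warns about, by doing the intermediate product
 * in 64 bits. This is only a userspace illustration of the arithmetic,
 * not a proposed kernel change; the sample values are made up.
 */
#include <stdio.h>
#include <stdint.h>

static long quantum_32bit(long weight, long allot, int nclasses, long quanta)
{
	return (weight * allot * nclasses) / quanta;	/* can overflow a 32-bit long */
}

static long quantum_64bit(long weight, long allot, int nclasses, long quanta)
{
	return (long)((int64_t)weight * allot * nclasses / quanta);
}

int main(void)
{
	/* hypothetical: large weight and allot, many classes in the band */
	long weight = 100000, allot = 64000, quanta = 400000;
	int nclasses = 40;

	printf("64-bit intermediate: %ld\n",
	       quantum_64bit(weight, allot, nclasses, quanta));
	/* with a 32-bit long, quantum_32bit() would have wrapped around here */
	(void)quantum_32bit;
	return 0;
}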
1126
1127static void cbq_sync_defmap(struct cbq_class *cl)
1128{
1129 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1130 struct cbq_class *split = cl->split;
1131 unsigned h;
1132 int i;
1133
1134 if (split == NULL)
1135 return;
1136
1137 for (i=0; i<=TC_PRIO_MAX; i++) {
1138 if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
1139 split->defaults[i] = NULL;
1140 }
1141
1142 for (i=0; i<=TC_PRIO_MAX; i++) {
1143 int level = split->level;
1144
1145 if (split->defaults[i])
1146 continue;
1147
1148 for (h=0; h<16; h++) {
1149 struct cbq_class *c;
1150
1151 for (c = q->classes[h]; c; c = c->next) {
1152 if (c->split == split && c->level < level &&
1153 c->defmap&(1<<i)) {
1154 split->defaults[i] = c;
1155 level = c->level;
1156 }
1157 }
1158 }
1159 }
1160}
1161
1162static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1163{
1164 struct cbq_class *split = NULL;
1165
1166 if (splitid == 0) {
1167 if ((split = cl->split) == NULL)
1168 return;
1169 splitid = split->classid;
1170 }
1171
1172 if (split == NULL || split->classid != splitid) {
1173 for (split = cl->tparent; split; split = split->tparent)
1174 if (split->classid == splitid)
1175 break;
1176 }
1177
1178 if (split == NULL)
1179 return;
1180
1181 if (cl->split != split) {
1182 cl->defmap = 0;
1183 cbq_sync_defmap(cl);
1184 cl->split = split;
1185 cl->defmap = def&mask;
1186 } else
1187 cl->defmap = (cl->defmap&~mask)|(def&mask);
1188
1189 cbq_sync_defmap(cl);
1190}
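/*
 * Editor's illustrative sketch (not part of the kernel source): the
 * masked-merge idiom used in cbq_change_defmap() above,
 * new = (old & ~mask) | (def & mask), which rewrites only the priority
 * bits selected by mask and leaves the rest of the defmap untouched.
 * The values below are made up.
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t merge_defmap(uint32_t old, uint32_t def, uint32_t mask)
{
	return (old & ~mask) | (def & mask);
}

int main(void)
{
	uint32_t old = 0x0F;			/* default for priorities 0-3 */
	uint32_t def = 0x30, mask = 0x33;	/* rewrite prios 0,1,4,5 only */

	/* prios 0,1 cleared, 4,5 set, 2,3 untouched -> 0x3C */
	printf("0x%02X\n", (unsigned)merge_defmap(old, def, mask));
	return 0;
}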
1191
1192static void cbq_unlink_class(struct cbq_class *this)
1193{
1194 struct cbq_class *cl, **clp;
1195 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1196
1197 for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) {
1198 if (cl == this) {
1199 *clp = cl->next;
1200 cl->next = NULL;
1201 break;
1202 }
1203 }
1204
1205 if (this->tparent) {
1206 clp=&this->sibling;
1207 cl = *clp;
1208 do {
1209 if (cl == this) {
1210 *clp = cl->sibling;
1211 break;
1212 }
1213 clp = &cl->sibling;
1214 } while ((cl = *clp) != this->sibling);
1215
1216 if (this->tparent->children == this) {
1217 this->tparent->children = this->sibling;
1218 if (this->sibling == this)
1219 this->tparent->children = NULL;
1220 }
1221 } else {
1222 BUG_TRAP(this->sibling == this);
1223 }
1224}
1225
1226static void cbq_link_class(struct cbq_class *this)
1227{
1228 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1229 unsigned h = cbq_hash(this->classid);
1230 struct cbq_class *parent = this->tparent;
1231
1232 this->sibling = this;
1233 this->next = q->classes[h];
1234 q->classes[h] = this;
1235
1236 if (parent == NULL)
1237 return;
1238
1239 if (parent->children == NULL) {
1240 parent->children = this;
1241 } else {
1242 this->sibling = parent->children->sibling;
1243 parent->children->sibling = this;
1244 }
1245}
1246
1247static unsigned int cbq_drop(struct Qdisc* sch)
1248{
1249 struct cbq_sched_data *q = qdisc_priv(sch);
1250 struct cbq_class *cl, *cl_head;
1251 int prio;
1252 unsigned int len;
1253
1254 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1255 if ((cl_head = q->active[prio]) == NULL)
1256 continue;
1257
1258 cl = cl_head;
1259 do {
1260 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1261 sch->q.qlen--;
1262 return len;
1263 }
1264 } while ((cl = cl->next_alive) != cl_head);
1265 }
1266 return 0;
1267}
1268
1269static void
1270cbq_reset(struct Qdisc* sch)
1271{
1272 struct cbq_sched_data *q = qdisc_priv(sch);
1273 struct cbq_class *cl;
1274 int prio;
1275 unsigned h;
1276
1277 q->activemask = 0;
1278 q->pmask = 0;
1279 q->tx_class = NULL;
1280 q->tx_borrowed = NULL;
1281 del_timer(&q->wd_timer);
1282 del_timer(&q->delay_timer);
1283 q->toplevel = TC_CBQ_MAXLEVEL;
1284 PSCHED_GET_TIME(q->now);
1285 q->now_rt = q->now;
1286
1287 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1288 q->active[prio] = NULL;
1289
1290 for (h = 0; h < 16; h++) {
1291 for (cl = q->classes[h]; cl; cl = cl->next) {
1292 qdisc_reset(cl->q);
1293
1294 cl->next_alive = NULL;
1295 PSCHED_SET_PASTPERFECT(cl->undertime);
1296 cl->avgidle = cl->maxidle;
1297 cl->deficit = cl->quantum;
1298 cl->cpriority = cl->priority;
1299 }
1300 }
1301 sch->q.qlen = 0;
1302}
1303
1304
1305static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1306{
1307 if (lss->change&TCF_CBQ_LSS_FLAGS) {
1308 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1309 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1310 }
1311 if (lss->change&TCF_CBQ_LSS_EWMA)
1312 cl->ewma_log = lss->ewma_log;
1313 if (lss->change&TCF_CBQ_LSS_AVPKT)
1314 cl->avpkt = lss->avpkt;
1315 if (lss->change&TCF_CBQ_LSS_MINIDLE)
1316 cl->minidle = -(long)lss->minidle;
1317 if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
1318 cl->maxidle = lss->maxidle;
1319 cl->avgidle = lss->maxidle;
1320 }
1321 if (lss->change&TCF_CBQ_LSS_OFFTIME)
1322 cl->offtime = lss->offtime;
1323 return 0;
1324}
1325
1326static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1327{
1328 q->nclasses[cl->priority]--;
1329 q->quanta[cl->priority] -= cl->weight;
1330 cbq_normalize_quanta(q, cl->priority);
1331}
1332
1333static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1334{
1335 q->nclasses[cl->priority]++;
1336 q->quanta[cl->priority] += cl->weight;
1337 cbq_normalize_quanta(q, cl->priority);
1338}
1339
1340static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1341{
1342 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1343
1344 if (wrr->allot)
1345 cl->allot = wrr->allot;
1346 if (wrr->weight)
1347 cl->weight = wrr->weight;
1348 if (wrr->priority) {
1349 cl->priority = wrr->priority-1;
1350 cl->cpriority = cl->priority;
1351 if (cl->priority >= cl->priority2)
1352 cl->priority2 = TC_CBQ_MAXPRIO-1;
1353 }
1354
1355 cbq_addprio(q, cl);
1356 return 0;
1357}
1358
1359static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1360{
1361 switch (ovl->strategy) {
1362 case TC_CBQ_OVL_CLASSIC:
1363 cl->overlimit = cbq_ovl_classic;
1364 break;
1365 case TC_CBQ_OVL_DELAY:
1366 cl->overlimit = cbq_ovl_delay;
1367 break;
1368 case TC_CBQ_OVL_LOWPRIO:
1369 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
1370 ovl->priority2-1 <= cl->priority)
1371 return -EINVAL;
1372 cl->priority2 = ovl->priority2-1;
1373 cl->overlimit = cbq_ovl_lowprio;
1374 break;
1375 case TC_CBQ_OVL_DROP:
1376 cl->overlimit = cbq_ovl_drop;
1377 break;
1378 case TC_CBQ_OVL_RCLASSIC:
1379 cl->overlimit = cbq_ovl_rclassic;
1380 break;
1381 default:
1382 return -EINVAL;
1383 }
1384 cl->penalty = (ovl->penalty*HZ)/1000;
1385 return 0;
1386}
1387
1388#ifdef CONFIG_NET_CLS_POLICE
1389static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1390{
1391 cl->police = p->police;
1392
1393 if (cl->q->handle) {
1394 if (p->police == TC_POLICE_RECLASSIFY)
1395 cl->q->reshape_fail = cbq_reshape_fail;
1396 else
1397 cl->q->reshape_fail = NULL;
1398 }
1399 return 0;
1400}
1401#endif
1402
1403static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1404{
1405 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1406 return 0;
1407}
1408
1409static int cbq_init(struct Qdisc *sch, struct rtattr *opt)
1410{
1411 struct cbq_sched_data *q = qdisc_priv(sch);
1412 struct rtattr *tb[TCA_CBQ_MAX];
1413 struct tc_ratespec *r;
1414
1415 if (rtattr_parse_nested(tb, TCA_CBQ_MAX, opt) < 0 ||
1416 tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1417 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1418 return -EINVAL;
1419
1420 if (tb[TCA_CBQ_LSSOPT-1] &&
1421 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1422 return -EINVAL;
1423
1424 r = RTA_DATA(tb[TCA_CBQ_RATE-1]);
1425
1426 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL)
1427 return -EINVAL;
1428
1429 q->link.refcnt = 1;
1430 q->link.sibling = &q->link;
1431 q->link.classid = sch->handle;
1432 q->link.qdisc = sch;
1433 if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)))
1434 q->link.q = &noop_qdisc;
1435
1436 q->link.priority = TC_CBQ_MAXPRIO-1;
1437 q->link.priority2 = TC_CBQ_MAXPRIO-1;
1438 q->link.cpriority = TC_CBQ_MAXPRIO-1;
1439 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1440 q->link.overlimit = cbq_ovl_classic;
1441 q->link.allot = psched_mtu(sch->dev);
1442 q->link.quantum = q->link.allot;
1443 q->link.weight = q->link.R_tab->rate.rate;
1444
1445 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1446 q->link.avpkt = q->link.allot/2;
1447 q->link.minidle = -0x7FFFFFFF;
1448 q->link.stats_lock = &sch->dev->queue_lock;
1449
1450 init_timer(&q->wd_timer);
1451 q->wd_timer.data = (unsigned long)sch;
1452 q->wd_timer.function = cbq_watchdog;
1453 init_timer(&q->delay_timer);
1454 q->delay_timer.data = (unsigned long)sch;
1455 q->delay_timer.function = cbq_undelay;
1456 q->toplevel = TC_CBQ_MAXLEVEL;
1457 PSCHED_GET_TIME(q->now);
1458 q->now_rt = q->now;
1459
1460 cbq_link_class(&q->link);
1461
1462 if (tb[TCA_CBQ_LSSOPT-1])
1463 cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1464
1465 cbq_addprio(q, &q->link);
1466 return 0;
1467}
1468
1469static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1470{
1471 unsigned char *b = skb->tail;
1472
1473 RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
1474 return skb->len;
1475
1476rtattr_failure:
1477 skb_trim(skb, b - skb->data);
1478 return -1;
1479}
1480
1481static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1482{
1483 unsigned char *b = skb->tail;
1484 struct tc_cbq_lssopt opt;
1485
1486 opt.flags = 0;
1487 if (cl->borrow == NULL)
1488 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1489 if (cl->share == NULL)
1490 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1491 opt.ewma_log = cl->ewma_log;
1492 opt.level = cl->level;
1493 opt.avpkt = cl->avpkt;
1494 opt.maxidle = cl->maxidle;
1495 opt.minidle = (u32)(-cl->minidle);
1496 opt.offtime = cl->offtime;
1497 opt.change = ~0;
1498 RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
1499 return skb->len;
1500
1501rtattr_failure:
1502 skb_trim(skb, b - skb->data);
1503 return -1;
1504}
1505
1506static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1507{
1508 unsigned char *b = skb->tail;
1509 struct tc_cbq_wrropt opt;
1510
1511 opt.flags = 0;
1512 opt.allot = cl->allot;
1513 opt.priority = cl->priority+1;
1514 opt.cpriority = cl->cpriority+1;
1515 opt.weight = cl->weight;
1516 RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1517 return skb->len;
1518
1519rtattr_failure:
1520 skb_trim(skb, b - skb->data);
1521 return -1;
1522}
1523
1524static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1525{
1526 unsigned char *b = skb->tail;
1527 struct tc_cbq_ovl opt;
1528
1529 opt.strategy = cl->ovl_strategy;
1530 opt.priority2 = cl->priority2+1;
1531 opt.penalty = (cl->penalty*1000)/HZ;
1532 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1533 return skb->len;
1534
1535rtattr_failure:
1536 skb_trim(skb, b - skb->data);
1537 return -1;
1538}
1539
1540static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1541{
1542 unsigned char *b = skb->tail;
1543 struct tc_cbq_fopt opt;
1544
1545 if (cl->split || cl->defmap) {
1546 opt.split = cl->split ? cl->split->classid : 0;
1547 opt.defmap = cl->defmap;
1548 opt.defchange = ~0;
1549 RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
1550 }
1551 return skb->len;
1552
1553rtattr_failure:
1554 skb_trim(skb, b - skb->data);
1555 return -1;
1556}
1557
1558#ifdef CONFIG_NET_CLS_POLICE
1559static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1560{
1561 unsigned char *b = skb->tail;
1562 struct tc_cbq_police opt;
1563
1564 if (cl->police) {
1565 opt.police = cl->police;
1566 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1567 }
1568 return skb->len;
1569
1570rtattr_failure:
1571 skb_trim(skb, b - skb->data);
1572 return -1;
1573}
1574#endif
1575
1576static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1577{
1578 if (cbq_dump_lss(skb, cl) < 0 ||
1579 cbq_dump_rate(skb, cl) < 0 ||
1580 cbq_dump_wrr(skb, cl) < 0 ||
1581 cbq_dump_ovl(skb, cl) < 0 ||
1582#ifdef CONFIG_NET_CLS_POLICE
1583 cbq_dump_police(skb, cl) < 0 ||
1584#endif
1585 cbq_dump_fopt(skb, cl) < 0)
1586 return -1;
1587 return 0;
1588}
1589
1590static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1591{
1592 struct cbq_sched_data *q = qdisc_priv(sch);
1593 unsigned char *b = skb->tail;
1594 struct rtattr *rta;
1595
1596 rta = (struct rtattr*)b;
1597 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1598 if (cbq_dump_attr(skb, &q->link) < 0)
1599 goto rtattr_failure;
1600 rta->rta_len = skb->tail - b;
1601 return skb->len;
1602
1603rtattr_failure:
1604 skb_trim(skb, b - skb->data);
1605 return -1;
1606}
1607
1608static int
1609cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1610{
1611 struct cbq_sched_data *q = qdisc_priv(sch);
1612
1613 q->link.xstats.avgidle = q->link.avgidle;
1614 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1615}
1616
1617static int
1618cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1619 struct sk_buff *skb, struct tcmsg *tcm)
1620{
1621 struct cbq_class *cl = (struct cbq_class*)arg;
1622 unsigned char *b = skb->tail;
1623 struct rtattr *rta;
1624
1625 if (cl->tparent)
1626 tcm->tcm_parent = cl->tparent->classid;
1627 else
1628 tcm->tcm_parent = TC_H_ROOT;
1629 tcm->tcm_handle = cl->classid;
1630 tcm->tcm_info = cl->q->handle;
1631
1632 rta = (struct rtattr*)b;
1633 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1634 if (cbq_dump_attr(skb, cl) < 0)
1635 goto rtattr_failure;
1636 rta->rta_len = skb->tail - b;
1637 return skb->len;
1638
1639rtattr_failure:
1640 skb_trim(skb, b - skb->data);
1641 return -1;
1642}
1643
1644static int
1645cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1646 struct gnet_dump *d)
1647{
1648 struct cbq_sched_data *q = qdisc_priv(sch);
1649 struct cbq_class *cl = (struct cbq_class*)arg;
1650
1651 cl->qstats.qlen = cl->q->q.qlen;
1652 cl->xstats.avgidle = cl->avgidle;
1653 cl->xstats.undertime = 0;
1654
1655 if (!PSCHED_IS_PASTPERFECT(cl->undertime))
1656 cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now);
1657
1658 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1659#ifdef CONFIG_NET_ESTIMATOR
1660 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1661#endif
1662 gnet_stats_copy_queue(d, &cl->qstats) < 0)
1663 return -1;
1664
1665 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1666}
1667
1668static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1669 struct Qdisc **old)
1670{
1671 struct cbq_class *cl = (struct cbq_class*)arg;
1672
1673 if (cl) {
1674 if (new == NULL) {
1675 if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)) == NULL)
1676 return -ENOBUFS;
1677 } else {
1678#ifdef CONFIG_NET_CLS_POLICE
1679 if (cl->police == TC_POLICE_RECLASSIFY)
1680 new->reshape_fail = cbq_reshape_fail;
1681#endif
1682 }
1683 sch_tree_lock(sch);
1684 *old = cl->q;
1685 cl->q = new;
1686 sch->q.qlen -= (*old)->q.qlen;
1687 qdisc_reset(*old);
1688 sch_tree_unlock(sch);
1689
1690 return 0;
1691 }
1692 return -ENOENT;
1693}
1694
1695static struct Qdisc *
1696cbq_leaf(struct Qdisc *sch, unsigned long arg)
1697{
1698 struct cbq_class *cl = (struct cbq_class*)arg;
1699
1700 return cl ? cl->q : NULL;
1701}
1702
1703static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1704{
1705 struct cbq_sched_data *q = qdisc_priv(sch);
1706 struct cbq_class *cl = cbq_class_lookup(q, classid);
1707
1708 if (cl) {
1709 cl->refcnt++;
1710 return (unsigned long)cl;
1711 }
1712 return 0;
1713}
1714
1715static void cbq_destroy_filters(struct cbq_class *cl)
1716{
1717 struct tcf_proto *tp;
1718
1719 while ((tp = cl->filter_list) != NULL) {
1720 cl->filter_list = tp->next;
1721 tcf_destroy(tp);
1722 }
1723}
1724
1725static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1726{
1727 struct cbq_sched_data *q = qdisc_priv(sch);
1728
1729 BUG_TRAP(!cl->filters);
1730
1731 cbq_destroy_filters(cl);
1732 qdisc_destroy(cl->q);
1733 qdisc_put_rtab(cl->R_tab);
1734#ifdef CONFIG_NET_ESTIMATOR
1735 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1736#endif
1737 if (cl != &q->link)
1738 kfree(cl);
1739}
1740
1741static void
1742cbq_destroy(struct Qdisc* sch)
1743{
1744 struct cbq_sched_data *q = qdisc_priv(sch);
1745 struct cbq_class *cl;
1746 unsigned h;
1747
1748#ifdef CONFIG_NET_CLS_POLICE
1749 q->rx_class = NULL;
1750#endif
1751 /*
1752 * Filters must be destroyed first because we don't destroy the
1753 * classes from root to leaves, which means that filters can still
1754 * be bound to classes which have been destroyed already. --TGR '04
1755 */
1756 for (h = 0; h < 16; h++)
1757 for (cl = q->classes[h]; cl; cl = cl->next)
1758 cbq_destroy_filters(cl);
1759
1760 for (h = 0; h < 16; h++) {
1761 struct cbq_class *next;
1762
1763 for (cl = q->classes[h]; cl; cl = next) {
1764 next = cl->next;
1765 cbq_destroy_class(sch, cl);
1766 }
1767 }
1768}
1769
1770static void cbq_put(struct Qdisc *sch, unsigned long arg)
1771{
1772 struct cbq_class *cl = (struct cbq_class*)arg;
1773
1774 if (--cl->refcnt == 0) {
1775#ifdef CONFIG_NET_CLS_POLICE
1776 struct cbq_sched_data *q = qdisc_priv(sch);
1777
1778 spin_lock_bh(&sch->dev->queue_lock);
1779 if (q->rx_class == cl)
1780 q->rx_class = NULL;
1781 spin_unlock_bh(&sch->dev->queue_lock);
1782#endif
1783
1784 cbq_destroy_class(sch, cl);
1785 }
1786}
1787
1788static int
1789cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
1790 unsigned long *arg)
1791{
1792 int err;
1793 struct cbq_sched_data *q = qdisc_priv(sch);
1794 struct cbq_class *cl = (struct cbq_class*)*arg;
1795 struct rtattr *opt = tca[TCA_OPTIONS-1];
1796 struct rtattr *tb[TCA_CBQ_MAX];
1797 struct cbq_class *parent;
1798 struct qdisc_rate_table *rtab = NULL;
1799
1800 if (opt==NULL || rtattr_parse_nested(tb, TCA_CBQ_MAX, opt))
1801 return -EINVAL;
1802
1803 if (tb[TCA_CBQ_OVL_STRATEGY-1] &&
1804 RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl))
1805 return -EINVAL;
1806
1807 if (tb[TCA_CBQ_FOPT-1] &&
1808 RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt))
1809 return -EINVAL;
1810
1811 if (tb[TCA_CBQ_RATE-1] &&
1812 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1813 return -EINVAL;
1814
1815 if (tb[TCA_CBQ_LSSOPT-1] &&
1816 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1817 return -EINVAL;
1818
1819 if (tb[TCA_CBQ_WRROPT-1] &&
1820 RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt))
1821 return -EINVAL;
1822
1823#ifdef CONFIG_NET_CLS_POLICE
1824 if (tb[TCA_CBQ_POLICE-1] &&
1825 RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police))
1826 return -EINVAL;
1827#endif
1828
1829 if (cl) {
1830 /* Check parent */
1831 if (parentid) {
1832 if (cl->tparent && cl->tparent->classid != parentid)
1833 return -EINVAL;
1834 if (!cl->tparent && parentid != TC_H_ROOT)
1835 return -EINVAL;
1836 }
1837
1838 if (tb[TCA_CBQ_RATE-1]) {
1839 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1840 if (rtab == NULL)
1841 return -EINVAL;
1842 }
1843
1844 /* Change class parameters */
1845 sch_tree_lock(sch);
1846
1847 if (cl->next_alive != NULL)
1848 cbq_deactivate_class(cl);
1849
1850 if (rtab) {
1851 rtab = xchg(&cl->R_tab, rtab);
1852 qdisc_put_rtab(rtab);
1853 }
1854
1855 if (tb[TCA_CBQ_LSSOPT-1])
1856 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1857
1858 if (tb[TCA_CBQ_WRROPT-1]) {
1859 cbq_rmprio(q, cl);
1860 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1861 }
1862
1863 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1864 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1865
1866#ifdef CONFIG_NET_CLS_POLICE
1867 if (tb[TCA_CBQ_POLICE-1])
1868 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1869#endif
1870
1871 if (tb[TCA_CBQ_FOPT-1])
1872 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1873
1874 if (cl->q->q.qlen)
1875 cbq_activate_class(cl);
1876
1877 sch_tree_unlock(sch);
1878
1879#ifdef CONFIG_NET_ESTIMATOR
1880 if (tca[TCA_RATE-1])
1881 gen_replace_estimator(&cl->bstats, &cl->rate_est,
1882 cl->stats_lock, tca[TCA_RATE-1]);
1883#endif
1884 return 0;
1885 }
1886
1887 if (parentid == TC_H_ROOT)
1888 return -EINVAL;
1889
1890 if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1891 tb[TCA_CBQ_LSSOPT-1] == NULL)
1892 return -EINVAL;
1893
1894 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1895 if (rtab == NULL)
1896 return -EINVAL;
1897
1898 if (classid) {
1899 err = -EINVAL;
1900 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
1901 goto failure;
1902 } else {
1903 int i;
1904 classid = TC_H_MAKE(sch->handle,0x8000);
1905
1906 for (i=0; i<0x8000; i++) {
1907 if (++q->hgenerator >= 0x8000)
1908 q->hgenerator = 1;
1909 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1910 break;
1911 }
1912 err = -ENOSR;
1913 if (i >= 0x8000)
1914 goto failure;
1915 classid = classid|q->hgenerator;
1916 }
1917
1918 parent = &q->link;
1919 if (parentid) {
1920 parent = cbq_class_lookup(q, parentid);
1921 err = -EINVAL;
1922 if (parent == NULL)
1923 goto failure;
1924 }
1925
1926 err = -ENOBUFS;
1927 cl = kmalloc(sizeof(*cl), GFP_KERNEL);
1928 if (cl == NULL)
1929 goto failure;
1930 memset(cl, 0, sizeof(*cl));
1931 cl->R_tab = rtab;
1932 rtab = NULL;
1933 cl->refcnt = 1;
1934 if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)))
1935 cl->q = &noop_qdisc;
1936 cl->classid = classid;
1937 cl->tparent = parent;
1938 cl->qdisc = sch;
1939 cl->allot = parent->allot;
1940 cl->quantum = cl->allot;
1941 cl->weight = cl->R_tab->rate.rate;
1942 cl->stats_lock = &sch->dev->queue_lock;
1943
1944 sch_tree_lock(sch);
1945 cbq_link_class(cl);
1946 cl->borrow = cl->tparent;
1947 if (cl->tparent != &q->link)
1948 cl->share = cl->tparent;
1949 cbq_adjust_levels(parent);
1950 cl->minidle = -0x7FFFFFFF;
1951 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1952 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1953 if (cl->ewma_log==0)
1954 cl->ewma_log = q->link.ewma_log;
1955 if (cl->maxidle==0)
1956 cl->maxidle = q->link.maxidle;
1957 if (cl->avpkt==0)
1958 cl->avpkt = q->link.avpkt;
1959 cl->overlimit = cbq_ovl_classic;
1960 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1961 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1962#ifdef CONFIG_NET_CLS_POLICE
1963 if (tb[TCA_CBQ_POLICE-1])
1964 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1965#endif
1966 if (tb[TCA_CBQ_FOPT-1])
1967 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1968 sch_tree_unlock(sch);
1969
1970#ifdef CONFIG_NET_ESTIMATOR
1971 if (tca[TCA_RATE-1])
1972 gen_new_estimator(&cl->bstats, &cl->rate_est,
1973 cl->stats_lock, tca[TCA_RATE-1]);
1974#endif
1975
1976 *arg = (unsigned long)cl;
1977 return 0;
1978
1979failure:
1980 qdisc_put_rtab(rtab);
1981 return err;
1982}
1983
1984static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1985{
1986 struct cbq_sched_data *q = qdisc_priv(sch);
1987 struct cbq_class *cl = (struct cbq_class*)arg;
1988
1989 if (cl->filters || cl->children || cl == &q->link)
1990 return -EBUSY;
1991
1992 sch_tree_lock(sch);
1993
1994 if (cl->next_alive)
1995 cbq_deactivate_class(cl);
1996
1997 if (q->tx_borrowed == cl)
1998 q->tx_borrowed = q->tx_class;
1999 if (q->tx_class == cl) {
2000 q->tx_class = NULL;
2001 q->tx_borrowed = NULL;
2002 }
2003#ifdef CONFIG_NET_CLS_POLICE
2004 if (q->rx_class == cl)
2005 q->rx_class = NULL;
2006#endif
2007
2008 cbq_unlink_class(cl);
2009 cbq_adjust_levels(cl->tparent);
2010 cl->defmap = 0;
2011 cbq_sync_defmap(cl);
2012
2013 cbq_rmprio(q, cl);
2014 sch_tree_unlock(sch);
2015
2016 if (--cl->refcnt == 0)
2017 cbq_destroy_class(sch, cl);
2018
2019 return 0;
2020}
2021
2022static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
2023{
2024 struct cbq_sched_data *q = qdisc_priv(sch);
2025 struct cbq_class *cl = (struct cbq_class *)arg;
2026
2027 if (cl == NULL)
2028 cl = &q->link;
2029
2030 return &cl->filter_list;
2031}
2032
2033static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
2034 u32 classid)
2035{
2036 struct cbq_sched_data *q = qdisc_priv(sch);
2037 struct cbq_class *p = (struct cbq_class*)parent;
2038 struct cbq_class *cl = cbq_class_lookup(q, classid);
2039
2040 if (cl) {
2041 if (p && p->level <= cl->level)
2042 return 0;
2043 cl->filters++;
2044 return (unsigned long)cl;
2045 }
2046 return 0;
2047}
2048
2049static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
2050{
2051 struct cbq_class *cl = (struct cbq_class*)arg;
2052
2053 cl->filters--;
2054}
2055
2056static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2057{
2058 struct cbq_sched_data *q = qdisc_priv(sch);
2059 unsigned h;
2060
2061 if (arg->stop)
2062 return;
2063
2064 for (h = 0; h < 16; h++) {
2065 struct cbq_class *cl;
2066
2067 for (cl = q->classes[h]; cl; cl = cl->next) {
2068 if (arg->count < arg->skip) {
2069 arg->count++;
2070 continue;
2071 }
2072 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2073 arg->stop = 1;
2074 return;
2075 }
2076 arg->count++;
2077 }
2078 }
2079}
2080
2081static struct Qdisc_class_ops cbq_class_ops = {
2082 .graft = cbq_graft,
2083 .leaf = cbq_leaf,
2084 .get = cbq_get,
2085 .put = cbq_put,
2086 .change = cbq_change_class,
2087 .delete = cbq_delete,
2088 .walk = cbq_walk,
2089 .tcf_chain = cbq_find_tcf,
2090 .bind_tcf = cbq_bind_filter,
2091 .unbind_tcf = cbq_unbind_filter,
2092 .dump = cbq_dump_class,
2093 .dump_stats = cbq_dump_class_stats,
2094};
2095
2096static struct Qdisc_ops cbq_qdisc_ops = {
2097 .next = NULL,
2098 .cl_ops = &cbq_class_ops,
2099 .id = "cbq",
2100 .priv_size = sizeof(struct cbq_sched_data),
2101 .enqueue = cbq_enqueue,
2102 .dequeue = cbq_dequeue,
2103 .requeue = cbq_requeue,
2104 .drop = cbq_drop,
2105 .init = cbq_init,
2106 .reset = cbq_reset,
2107 .destroy = cbq_destroy,
2108 .change = NULL,
2109 .dump = cbq_dump,
2110 .dump_stats = cbq_dump_stats,
2111 .owner = THIS_MODULE,
2112};
2113
2114static int __init cbq_module_init(void)
2115{
2116 return register_qdisc(&cbq_qdisc_ops);
2117}
2118static void __exit cbq_module_exit(void)
2119{
2120 unregister_qdisc(&cbq_qdisc_ops);
2121}
2122module_init(cbq_module_init)
2123module_exit(cbq_module_exit)
2124MODULE_LICENSE("GPL");
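
When no classid is supplied, cbq_change_class() above picks a free minor handle by scanning a wrapping 15-bit generator and gives up with -ENOSR after 0x8000 attempts. A stand-alone user-space sketch of that search follows; minor_in_use() is a hypothetical stand-in for cbq_class_lookup(), and the pre-allocated ids are invented for the example.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for cbq_class_lookup(): true if the minor id is taken. */
static bool minor_in_use(unsigned int minor)
{
        return minor < 3;               /* pretend minors 1 and 2 are already allocated */
}

/* Scan the 15-bit minor space, starting after the last handed-out id. */
static int pick_minor(unsigned int *hgenerator)
{
        int i;

        for (i = 0; i < 0x8000; i++) {
                if (++(*hgenerator) >= 0x8000)
                        *hgenerator = 1;        /* wrap, minor 0 is never used */
                if (!minor_in_use(*hgenerator))
                        return *hgenerator;
        }
        return -1;                      /* all 0x7fff minors busy: cbq returns -ENOSR */
}

int main(void)
{
        unsigned int hgen = 0;

        printf("allocated minor 0x%x\n", (unsigned)pick_minor(&hgen));  /* -> 0x3 */
        return 0;
}
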
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
new file mode 100644
index 000000000000..8a3db9d95bab
--- /dev/null
+++ b/net/sched/sch_dsmark.c
@@ -0,0 +1,479 @@
1/* net/sched/sch_dsmark.c - Differentiated Services field marker */
2
3/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
4
5
6#include <linux/config.h>
7#include <linux/module.h>
8#include <linux/init.h>
9#include <linux/types.h>
10#include <linux/string.h>
11#include <linux/errno.h>
12#include <linux/skbuff.h>
13#include <linux/netdevice.h> /* for pkt_sched */
14#include <linux/rtnetlink.h>
15#include <net/pkt_sched.h>
16#include <net/dsfield.h>
17#include <net/inet_ecn.h>
18#include <asm/byteorder.h>
19
20
21#if 1 /* control */
22#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
23#else
24#define DPRINTK(format,args...)
25#endif
26
27#if 0 /* data */
28#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
29#else
30#define D2PRINTK(format,args...)
31#endif
32
33
34#define PRIV(sch) qdisc_priv(sch)
35
36
37/*
38 * classid        class        marking
39 * -------        -----        -------
40 *   n/a            0          n/a
41 *   x:0            1          use entry [0]
42 *   ...           ...         ...
43 *   x:y (y>0)     y+1         use entry [y]
44 *   ...           ...         ...
45 *   x:indices-1   indices     use entry [indices-1]
46 *   ...           ...         ...
47 *   x:y           y+1         use entry [y & (indices-1)]
48 *   ...           ...         ...
49 *   0xffff        0x10000     use entry [indices-1]
50 */
51
52
53#define NO_DEFAULT_INDEX (1 << 16)
54
55struct dsmark_qdisc_data {
56 struct Qdisc *q;
57 struct tcf_proto *filter_list;
58 __u8 *mask; /* "owns" the array */
59 __u8 *value;
60 __u16 indices;
61 __u32 default_index; /* index range is 0...0xffff */
62 int set_tc_index;
63};
64
65
66/* ------------------------- Class/flow operations ------------------------- */
67
68
69static int dsmark_graft(struct Qdisc *sch,unsigned long arg,
70 struct Qdisc *new,struct Qdisc **old)
71{
72 struct dsmark_qdisc_data *p = PRIV(sch);
73
74 DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",sch,p,new,
75 old);
76 if (!new)
77 new = &noop_qdisc;
78 sch_tree_lock(sch);
79 *old = xchg(&p->q,new);
80 if (*old)
81 qdisc_reset(*old);
82 sch->q.qlen = 0;
83 sch_tree_unlock(sch); /* @@@ move up ? */
84 return 0;
85}
86
87
88static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg)
89{
90 struct dsmark_qdisc_data *p = PRIV(sch);
91
92 return p->q;
93}
94
95
96static unsigned long dsmark_get(struct Qdisc *sch,u32 classid)
97{
98 struct dsmark_qdisc_data *p __attribute__((unused)) = PRIV(sch);
99
100 DPRINTK("dsmark_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid);
101 return TC_H_MIN(classid)+1;
102}
103
104
105static unsigned long dsmark_bind_filter(struct Qdisc *sch,
106 unsigned long parent, u32 classid)
107{
108 return dsmark_get(sch,classid);
109}
110
111
112static void dsmark_put(struct Qdisc *sch, unsigned long cl)
113{
114}
115
116
117static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
118 struct rtattr **tca, unsigned long *arg)
119{
120 struct dsmark_qdisc_data *p = PRIV(sch);
121 struct rtattr *opt = tca[TCA_OPTIONS-1];
122 struct rtattr *tb[TCA_DSMARK_MAX];
123
124 DPRINTK("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x),"
125 "arg 0x%lx\n",sch,p,classid,parent,*arg);
126 if (*arg > p->indices)
127 return -ENOENT;
128 if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt))
129 return -EINVAL;
130 if (tb[TCA_DSMARK_MASK-1]) {
131 if (!RTA_PAYLOAD(tb[TCA_DSMARK_MASK-1]))
132 return -EINVAL;
133 p->mask[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_MASK-1]);
134 }
135 if (tb[TCA_DSMARK_VALUE-1]) {
136 if (!RTA_PAYLOAD(tb[TCA_DSMARK_VALUE-1]))
137 return -EINVAL;
138 p->value[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_VALUE-1]);
139 }
140 return 0;
141}
142
143
144static int dsmark_delete(struct Qdisc *sch,unsigned long arg)
145{
146 struct dsmark_qdisc_data *p = PRIV(sch);
147
148 if (!arg || arg > p->indices)
149 return -EINVAL;
150 p->mask[arg-1] = 0xff;
151 p->value[arg-1] = 0;
152 return 0;
153}
154
155
156static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker)
157{
158 struct dsmark_qdisc_data *p = PRIV(sch);
159 int i;
160
161 DPRINTK("dsmark_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker);
162 if (walker->stop)
163 return;
164 for (i = 0; i < p->indices; i++) {
165 if (p->mask[i] == 0xff && !p->value[i])
166 continue;
167 if (walker->count >= walker->skip) {
168 if (walker->fn(sch, i+1, walker) < 0) {
169 walker->stop = 1;
170 break;
171 }
172 }
173 walker->count++;
174 }
175}
176
177
178static struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch,unsigned long cl)
179{
180 struct dsmark_qdisc_data *p = PRIV(sch);
181
182 return &p->filter_list;
183}
184
185
186/* --------------------------- Qdisc operations ---------------------------- */
187
188
189static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch)
190{
191 struct dsmark_qdisc_data *p = PRIV(sch);
192 struct tcf_result res;
193 int result;
194 int ret = NET_XMIT_POLICED;
195
196 D2PRINTK("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p);
197 if (p->set_tc_index) {
198 /* FIXME: Safe with non-linear skbs? --RR */
199 switch (skb->protocol) {
200 case __constant_htons(ETH_P_IP):
201 skb->tc_index = ipv4_get_dsfield(skb->nh.iph)
202 & ~INET_ECN_MASK;
203 break;
204 case __constant_htons(ETH_P_IPV6):
205 skb->tc_index = ipv6_get_dsfield(skb->nh.ipv6h)
206 & ~INET_ECN_MASK;
207 break;
208 default:
209 skb->tc_index = 0;
210 break;
211 };
212 }
213 result = TC_POLICE_OK; /* be nice to gcc */
214 if (TC_H_MAJ(skb->priority) == sch->handle) {
215 skb->tc_index = TC_H_MIN(skb->priority);
216 } else {
217 result = tc_classify(skb,p->filter_list,&res);
218 D2PRINTK("result %d class 0x%04x\n",result,res.classid);
219 switch (result) {
220#ifdef CONFIG_NET_CLS_POLICE
221 case TC_POLICE_SHOT:
222 kfree_skb(skb);
223 break;
224#if 0
225 case TC_POLICE_RECLASSIFY:
226 /* FIXME: what to do here ??? */
227#endif
228#endif
229 case TC_POLICE_OK:
230 skb->tc_index = TC_H_MIN(res.classid);
231 break;
232 case TC_POLICE_UNSPEC:
233 /* fall through */
234 default:
235 if (p->default_index != NO_DEFAULT_INDEX)
236 skb->tc_index = p->default_index;
237 break;
238 };
239 }
240 if (
241#ifdef CONFIG_NET_CLS_POLICE
242 result == TC_POLICE_SHOT ||
243#endif
244
245 ((ret = p->q->enqueue(skb,p->q)) != 0)) {
246 sch->qstats.drops++;
247 return ret;
248 }
249 sch->bstats.bytes += skb->len;
250 sch->bstats.packets++;
251 sch->q.qlen++;
252 return ret;
253}
254
255
256static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
257{
258 struct dsmark_qdisc_data *p = PRIV(sch);
259 struct sk_buff *skb;
260 int index;
261
262 D2PRINTK("dsmark_dequeue(sch %p,[qdisc %p])\n",sch,p);
263 skb = p->q->ops->dequeue(p->q);
264 if (!skb)
265 return NULL;
266 sch->q.qlen--;
267 index = skb->tc_index & (p->indices-1);
268 D2PRINTK("index %d->%d\n",skb->tc_index,index);
269 switch (skb->protocol) {
270 case __constant_htons(ETH_P_IP):
271 ipv4_change_dsfield(skb->nh.iph,
272 p->mask[index],p->value[index]);
273 break;
274 case __constant_htons(ETH_P_IPV6):
275 ipv6_change_dsfield(skb->nh.ipv6h,
276 p->mask[index],p->value[index]);
277 break;
278 default:
279 /*
280 * Only complain if a change was actually attempted.
281 * This way, we can send non-IP traffic through dsmark
282 * and don't need yet another qdisc as a bypass.
283 */
284 if (p->mask[index] != 0xff || p->value[index])
285 printk(KERN_WARNING "dsmark_dequeue: "
286 "unsupported protocol %d\n",
287 htons(skb->protocol));
288 break;
289 };
290 return skb;
291}
292
293
294static int dsmark_requeue(struct sk_buff *skb,struct Qdisc *sch)
295{
296 int ret;
297 struct dsmark_qdisc_data *p = PRIV(sch);
298
299 D2PRINTK("dsmark_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p);
300 if ((ret = p->q->ops->requeue(skb, p->q)) == 0) {
301 sch->q.qlen++;
302 sch->qstats.requeues++;
303 return 0;
304 }
305 sch->qstats.drops++;
306 return ret;
307}
308
309
310static unsigned int dsmark_drop(struct Qdisc *sch)
311{
312 struct dsmark_qdisc_data *p = PRIV(sch);
313 unsigned int len;
314
315 DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p);
316 if (!p->q->ops->drop)
317 return 0;
318 if (!(len = p->q->ops->drop(p->q)))
319 return 0;
320 sch->q.qlen--;
321 return len;
322}
323
324
325static int dsmark_init(struct Qdisc *sch,struct rtattr *opt)
326{
327 struct dsmark_qdisc_data *p = PRIV(sch);
328 struct rtattr *tb[TCA_DSMARK_MAX];
329 __u16 tmp;
330
331 DPRINTK("dsmark_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt);
332 if (!opt ||
333 rtattr_parse(tb,TCA_DSMARK_MAX,RTA_DATA(opt),RTA_PAYLOAD(opt)) < 0 ||
334 !tb[TCA_DSMARK_INDICES-1] ||
335 RTA_PAYLOAD(tb[TCA_DSMARK_INDICES-1]) < sizeof(__u16))
336 return -EINVAL;
337 p->indices = *(__u16 *) RTA_DATA(tb[TCA_DSMARK_INDICES-1]);
338 if (!p->indices)
339 return -EINVAL;
340 for (tmp = p->indices; tmp != 1; tmp >>= 1) {
341 if (tmp & 1)
342 return -EINVAL;
343 }
344 p->default_index = NO_DEFAULT_INDEX;
345 if (tb[TCA_DSMARK_DEFAULT_INDEX-1]) {
346 if (RTA_PAYLOAD(tb[TCA_DSMARK_DEFAULT_INDEX-1]) < sizeof(__u16))
347 return -EINVAL;
348 p->default_index =
349 *(__u16 *) RTA_DATA(tb[TCA_DSMARK_DEFAULT_INDEX-1]);
350 }
351 p->set_tc_index = !!tb[TCA_DSMARK_SET_TC_INDEX-1];
352 p->mask = kmalloc(p->indices*2,GFP_KERNEL);
353 if (!p->mask)
354 return -ENOMEM;
355 p->value = p->mask+p->indices;
356 memset(p->mask,0xff,p->indices);
357 memset(p->value,0,p->indices);
358 if (!(p->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)))
359 p->q = &noop_qdisc;
360 DPRINTK("dsmark_init: qdisc %p\n",&p->q);
361 return 0;
362}
363
364
365static void dsmark_reset(struct Qdisc *sch)
366{
367 struct dsmark_qdisc_data *p = PRIV(sch);
368
369 DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p);
370 qdisc_reset(p->q);
371 sch->q.qlen = 0;
372}
373
374
375static void dsmark_destroy(struct Qdisc *sch)
376{
377 struct dsmark_qdisc_data *p = PRIV(sch);
378 struct tcf_proto *tp;
379
380 DPRINTK("dsmark_destroy(sch %p,[qdisc %p])\n",sch,p);
381 while (p->filter_list) {
382 tp = p->filter_list;
383 p->filter_list = tp->next;
384 tcf_destroy(tp);
385 }
386 qdisc_destroy(p->q);
387 kfree(p->mask);
388}
389
390
391static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
392 struct sk_buff *skb, struct tcmsg *tcm)
393{
394 struct dsmark_qdisc_data *p = PRIV(sch);
395 unsigned char *b = skb->tail;
396 struct rtattr *rta;
397
398 DPRINTK("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n",sch,p,cl);
399 if (!cl || cl > p->indices)
400 return -EINVAL;
401 tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle),cl-1);
402 rta = (struct rtattr *) b;
403 RTA_PUT(skb,TCA_OPTIONS,0,NULL);
404 RTA_PUT(skb,TCA_DSMARK_MASK,1,&p->mask[cl-1]);
405 RTA_PUT(skb,TCA_DSMARK_VALUE,1,&p->value[cl-1]);
406 rta->rta_len = skb->tail-b;
407 return skb->len;
408
409rtattr_failure:
410 skb_trim(skb,b-skb->data);
411 return -1;
412}
413
414static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
415{
416 struct dsmark_qdisc_data *p = PRIV(sch);
417 unsigned char *b = skb->tail;
418 struct rtattr *rta;
419
420 rta = (struct rtattr *) b;
421 RTA_PUT(skb,TCA_OPTIONS,0,NULL);
422 RTA_PUT(skb,TCA_DSMARK_INDICES,sizeof(__u16),&p->indices);
423 if (p->default_index != NO_DEFAULT_INDEX) {
424 __u16 tmp = p->default_index;
425
426 RTA_PUT(skb,TCA_DSMARK_DEFAULT_INDEX, sizeof(__u16), &tmp);
427 }
428 if (p->set_tc_index)
429 RTA_PUT(skb, TCA_DSMARK_SET_TC_INDEX, 0, NULL);
430 rta->rta_len = skb->tail-b;
431 return skb->len;
432
433rtattr_failure:
434 skb_trim(skb,b-skb->data);
435 return -1;
436}
437
438static struct Qdisc_class_ops dsmark_class_ops = {
439 .graft = dsmark_graft,
440 .leaf = dsmark_leaf,
441 .get = dsmark_get,
442 .put = dsmark_put,
443 .change = dsmark_change,
444 .delete = dsmark_delete,
445 .walk = dsmark_walk,
446 .tcf_chain = dsmark_find_tcf,
447 .bind_tcf = dsmark_bind_filter,
448 .unbind_tcf = dsmark_put,
449 .dump = dsmark_dump_class,
450};
451
452static struct Qdisc_ops dsmark_qdisc_ops = {
453 .next = NULL,
454 .cl_ops = &dsmark_class_ops,
455 .id = "dsmark",
456 .priv_size = sizeof(struct dsmark_qdisc_data),
457 .enqueue = dsmark_enqueue,
458 .dequeue = dsmark_dequeue,
459 .requeue = dsmark_requeue,
460 .drop = dsmark_drop,
461 .init = dsmark_init,
462 .reset = dsmark_reset,
463 .destroy = dsmark_destroy,
464 .change = NULL,
465 .dump = dsmark_dump,
466 .owner = THIS_MODULE,
467};
468
469static int __init dsmark_module_init(void)
470{
471 return register_qdisc(&dsmark_qdisc_ops);
472}
473static void __exit dsmark_module_exit(void)
474{
475 unregister_qdisc(&dsmark_qdisc_ops);
476}
477module_init(dsmark_module_init)
478module_exit(dsmark_module_exit)
479MODULE_LICENSE("GPL");
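
The classid/marking table at the top of sch_dsmark.c and the switch in dsmark_dequeue() reduce to two steps: fold skb->tc_index into a table slot with a power-of-two mask, then rewrite the DS byte as (old & mask[slot]) | value[slot]. The following minimal user-space sketch shows that arithmetic only; the four-entry table, the EF value 0xb8 and the ECN-preserving mask 0x03 are example values, not part of the kernel file.

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: remark a DS byte the way sch_dsmark's dequeue path does. */
static uint8_t dsmark_apply(uint8_t ds, uint16_t tc_index, uint16_t indices,
                            const uint8_t *mask, const uint8_t *value)
{
        unsigned int slot = tc_index & (indices - 1);   /* wrap into the table */
        return (ds & mask[slot]) | value[slot];
}

int main(void)
{
        uint8_t mask[4]  = { 0xff, 0x03, 0xff, 0xff };  /* slot 1: keep only the ECN bits */
        uint8_t value[4] = { 0x00, 0xb8, 0x00, 0x00 };  /* slot 1: mark as EF (0x2e << 2) */

        printf("0x%02x\n", (unsigned)dsmark_apply(0x00, 1, 4, mask, value)); /* -> 0xb8 */
        return 0;
}
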
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
new file mode 100644
index 000000000000..4888305c96da
--- /dev/null
+++ b/net/sched/sch_fifo.c
@@ -0,0 +1,212 @@
1/*
2 * net/sched/sch_fifo.c The simplest FIFO queue.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 */
11
12#include <linux/config.h>
13#include <linux/module.h>
14#include <asm/uaccess.h>
15#include <asm/system.h>
16#include <linux/bitops.h>
17#include <linux/types.h>
18#include <linux/kernel.h>
19#include <linux/sched.h>
20#include <linux/string.h>
21#include <linux/mm.h>
22#include <linux/socket.h>
23#include <linux/sockios.h>
24#include <linux/in.h>
25#include <linux/errno.h>
26#include <linux/interrupt.h>
27#include <linux/if_ether.h>
28#include <linux/inet.h>
29#include <linux/netdevice.h>
30#include <linux/etherdevice.h>
31#include <linux/notifier.h>
32#include <net/ip.h>
33#include <net/route.h>
34#include <linux/skbuff.h>
35#include <net/sock.h>
36#include <net/pkt_sched.h>
37
38/* 1 band FIFO pseudo-"scheduler" */
39
40struct fifo_sched_data
41{
42 unsigned limit;
43};
44
45static int
46bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
47{
48 struct fifo_sched_data *q = qdisc_priv(sch);
49
50 if (sch->qstats.backlog + skb->len <= q->limit) {
51 __skb_queue_tail(&sch->q, skb);
52 sch->qstats.backlog += skb->len;
53 sch->bstats.bytes += skb->len;
54 sch->bstats.packets++;
55 return 0;
56 }
57 sch->qstats.drops++;
58#ifdef CONFIG_NET_CLS_POLICE
59 if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
60#endif
61 kfree_skb(skb);
62 return NET_XMIT_DROP;
63}
64
65static int
66bfifo_requeue(struct sk_buff *skb, struct Qdisc* sch)
67{
68 __skb_queue_head(&sch->q, skb);
69 sch->qstats.backlog += skb->len;
70 sch->qstats.requeues++;
71 return 0;
72}
73
74static struct sk_buff *
75bfifo_dequeue(struct Qdisc* sch)
76{
77 struct sk_buff *skb;
78
79 skb = __skb_dequeue(&sch->q);
80 if (skb)
81 sch->qstats.backlog -= skb->len;
82 return skb;
83}
84
85static unsigned int
86fifo_drop(struct Qdisc* sch)
87{
88 struct sk_buff *skb;
89
90 skb = __skb_dequeue_tail(&sch->q);
91 if (skb) {
92 unsigned int len = skb->len;
93 sch->qstats.backlog -= len;
94 kfree_skb(skb);
95 return len;
96 }
97 return 0;
98}
99
100static void
101fifo_reset(struct Qdisc* sch)
102{
103 skb_queue_purge(&sch->q);
104 sch->qstats.backlog = 0;
105}
106
107static int
108pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
109{
110 struct fifo_sched_data *q = qdisc_priv(sch);
111
112 if (sch->q.qlen < q->limit) {
113 __skb_queue_tail(&sch->q, skb);
114 sch->bstats.bytes += skb->len;
115 sch->bstats.packets++;
116 return 0;
117 }
118 sch->qstats.drops++;
119#ifdef CONFIG_NET_CLS_POLICE
120 if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
121#endif
122 kfree_skb(skb);
123 return NET_XMIT_DROP;
124}
125
126static int
127pfifo_requeue(struct sk_buff *skb, struct Qdisc* sch)
128{
129 __skb_queue_head(&sch->q, skb);
130 sch->qstats.requeues++;
131 return 0;
132}
133
134
135static struct sk_buff *
136pfifo_dequeue(struct Qdisc* sch)
137{
138 return __skb_dequeue(&sch->q);
139}
140
141static int fifo_init(struct Qdisc *sch, struct rtattr *opt)
142{
143 struct fifo_sched_data *q = qdisc_priv(sch);
144
145 if (opt == NULL) {
146 unsigned int limit = sch->dev->tx_queue_len ? : 1;
147
148 if (sch->ops == &bfifo_qdisc_ops)
149 q->limit = limit*sch->dev->mtu;
150 else
151 q->limit = limit;
152 } else {
153 struct tc_fifo_qopt *ctl = RTA_DATA(opt);
154 if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
155 return -EINVAL;
156 q->limit = ctl->limit;
157 }
158 return 0;
159}
160
161static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
162{
163 struct fifo_sched_data *q = qdisc_priv(sch);
164 unsigned char *b = skb->tail;
165 struct tc_fifo_qopt opt;
166
167 opt.limit = q->limit;
168 RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
169
170 return skb->len;
171
172rtattr_failure:
173 skb_trim(skb, b - skb->data);
174 return -1;
175}
176
177struct Qdisc_ops pfifo_qdisc_ops = {
178 .next = NULL,
179 .cl_ops = NULL,
180 .id = "pfifo",
181 .priv_size = sizeof(struct fifo_sched_data),
182 .enqueue = pfifo_enqueue,
183 .dequeue = pfifo_dequeue,
184 .requeue = pfifo_requeue,
185 .drop = fifo_drop,
186 .init = fifo_init,
187 .reset = fifo_reset,
188 .destroy = NULL,
189 .change = fifo_init,
190 .dump = fifo_dump,
191 .owner = THIS_MODULE,
192};
193
194struct Qdisc_ops bfifo_qdisc_ops = {
195 .next = NULL,
196 .cl_ops = NULL,
197 .id = "bfifo",
198 .priv_size = sizeof(struct fifo_sched_data),
199 .enqueue = bfifo_enqueue,
200 .dequeue = bfifo_dequeue,
201 .requeue = bfifo_requeue,
202 .drop = fifo_drop,
203 .init = fifo_init,
204 .reset = fifo_reset,
205 .destroy = NULL,
206 .change = fifo_init,
207 .dump = fifo_dump,
208 .owner = THIS_MODULE,
209};
210
211EXPORT_SYMBOL(bfifo_qdisc_ops);
212EXPORT_SYMBOL(pfifo_qdisc_ops);
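
The two FIFO flavours above differ only in what they count: bfifo_enqueue() admits a packet while the byte backlog stays within q->limit, pfifo_enqueue() while the packet count stays below it, and fifo_init() derives the default limit from the device's tx_queue_len (scaled by the MTU for bfifo). A small user-space sketch of those checks; the device parameters (tx_queue_len 1000, MTU 1500) are assumed for the example.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: the two admission tests used above. */
static bool bfifo_admit(unsigned int backlog_bytes, unsigned int skb_len,
                        unsigned int limit_bytes)
{
        return backlog_bytes + skb_len <= limit_bytes;  /* byte-count FIFO */
}

static bool pfifo_admit(unsigned int qlen, unsigned int limit_pkts)
{
        return qlen < limit_pkts;                       /* packet-count FIFO */
}

int main(void)
{
        /* Defaults as in fifo_init() when no option is given,
         * for a hypothetical device with tx_queue_len = 1000 and mtu = 1500. */
        unsigned int tx_queue_len = 1000, mtu = 1500;
        unsigned int pfifo_limit = tx_queue_len;
        unsigned int bfifo_limit = tx_queue_len * mtu;

        printf("pfifo admits packet at qlen 1000? %d\n",
               pfifo_admit(1000, pfifo_limit));                 /* 0: at the limit */
        printf("bfifo admits 1500B at 1498501B backlog? %d\n",
               bfifo_admit(1498501, 1500, bfifo_limit));        /* 0: would exceed limit */
        return 0;
}
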
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
new file mode 100644
index 000000000000..8c01e023f02e
--- /dev/null
+++ b/net/sched/sch_generic.c
@@ -0,0 +1,609 @@
1/*
2 * net/sched/sch_generic.c Generic packet scheduler routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 * Jamal Hadi Salim, <hadi@cyberus.ca> 990601
11 * - Ingress support
12 */
13
14#include <asm/uaccess.h>
15#include <asm/system.h>
16#include <linux/bitops.h>
17#include <linux/config.h>
18#include <linux/module.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/sched.h>
22#include <linux/string.h>
23#include <linux/mm.h>
24#include <linux/socket.h>
25#include <linux/sockios.h>
26#include <linux/in.h>
27#include <linux/errno.h>
28#include <linux/interrupt.h>
29#include <linux/netdevice.h>
30#include <linux/skbuff.h>
31#include <linux/rtnetlink.h>
32#include <linux/init.h>
33#include <linux/rcupdate.h>
34#include <linux/list.h>
35#include <net/sock.h>
36#include <net/pkt_sched.h>
37
38/* Main transmission queue. */
39
40/* Main qdisc structure lock.
41
42   Modifications to data that participates in scheduling
43   must additionally be protected with the
44   dev->queue_lock spinlock.
45
46   The idea is the following:
47   - enqueue and dequeue are serialized via the top-level device
48     spinlock dev->queue_lock.
49   - tree walking is protected by read_lock_bh(qdisc_tree_lock)
50     and this lock is used only in process context.
51   - updates to the tree are made under the rtnl semaphore or
52     from softirq context (the __qdisc_destroy rcu callback),
53     hence this lock needs local bh disabling.
54
55   qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
56 */
57DEFINE_RWLOCK(qdisc_tree_lock);
58
59void qdisc_lock_tree(struct net_device *dev)
60{
61 write_lock_bh(&qdisc_tree_lock);
62 spin_lock_bh(&dev->queue_lock);
63}
64
65void qdisc_unlock_tree(struct net_device *dev)
66{
67 spin_unlock_bh(&dev->queue_lock);
68 write_unlock_bh(&qdisc_tree_lock);
69}
70
71/*
72 dev->queue_lock serializes queue accesses for this device
73 AND dev->qdisc pointer itself.
74
75 dev->xmit_lock serializes accesses to device driver.
76
77 dev->queue_lock and dev->xmit_lock are mutually exclusive,
78 if one is grabbed, another must be free.
79 */
80
81
82/* Kick the device.
83   Note that this procedure can be called by a watchdog timer,
84   so we do not check the dev->tbusy flag here.
85
86   Returns:  0 - queue is empty.
87            >0 - queue is not empty, but throttled.
88            <0 - queue is not empty; the device is throttled if dev->tbusy != 0.
89
90   NOTE: Called under dev->queue_lock with locally disabled BH.
91*/
92
93int qdisc_restart(struct net_device *dev)
94{
95 struct Qdisc *q = dev->qdisc;
96 struct sk_buff *skb;
97
98 /* Dequeue packet */
99 if ((skb = q->dequeue(q)) != NULL) {
100 unsigned nolock = (dev->features & NETIF_F_LLTX);
101 /*
102 * When the driver has LLTX set it does its own locking
103 * in start_xmit. No need to add additional overhead by
104 * locking again. These checks are worth it because
105 * even uncongested locks can be quite expensive.
106 * The driver can do trylock like here too, in case
107 * of lock congestion it should return -1 and the packet
108 * will be requeued.
109 */
110 if (!nolock) {
111 if (!spin_trylock(&dev->xmit_lock)) {
112 collision:
113 /* So, someone grabbed the driver. */
114
115 /* It may be transient configuration error,
116 when hard_start_xmit() recurses. We detect
117 it by checking xmit owner and drop the
118 packet when deadloop is detected.
119 */
120 if (dev->xmit_lock_owner == smp_processor_id()) {
121 kfree_skb(skb);
122 if (net_ratelimit())
123 printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
124 return -1;
125 }
126 __get_cpu_var(netdev_rx_stat).cpu_collision++;
127 goto requeue;
128 }
129 /* Remember that the driver is grabbed by us. */
130 dev->xmit_lock_owner = smp_processor_id();
131 }
132
133 {
134 /* And release queue */
135 spin_unlock(&dev->queue_lock);
136
137 if (!netif_queue_stopped(dev)) {
138 int ret;
139 if (netdev_nit)
140 dev_queue_xmit_nit(skb, dev);
141
142 ret = dev->hard_start_xmit(skb, dev);
143 if (ret == NETDEV_TX_OK) {
144 if (!nolock) {
145 dev->xmit_lock_owner = -1;
146 spin_unlock(&dev->xmit_lock);
147 }
148 spin_lock(&dev->queue_lock);
149 return -1;
150 }
151 if (ret == NETDEV_TX_LOCKED && nolock) {
152 spin_lock(&dev->queue_lock);
153 goto collision;
154 }
155 }
156
157 /* NETDEV_TX_BUSY - we need to requeue */
158 /* Release the driver */
159 if (!nolock) {
160 dev->xmit_lock_owner = -1;
161 spin_unlock(&dev->xmit_lock);
162 }
163 spin_lock(&dev->queue_lock);
164 q = dev->qdisc;
165 }
166
167 /* Device kicked us out :(
168 This is possible in the following cases:
169
170 0. driver is locked
171 1. fastroute is enabled
172 2. device cannot determine busy state
173 before start of transmission (f.e. dialout)
174 3. device is buggy (ppp)
175 */
176
177requeue:
178 q->ops->requeue(skb, q);
179 netif_schedule(dev);
180 return 1;
181 }
182 return q->q.qlen;
183}
184
185static void dev_watchdog(unsigned long arg)
186{
187 struct net_device *dev = (struct net_device *)arg;
188
189 spin_lock(&dev->xmit_lock);
190 if (dev->qdisc != &noop_qdisc) {
191 if (netif_device_present(dev) &&
192 netif_running(dev) &&
193 netif_carrier_ok(dev)) {
194 if (netif_queue_stopped(dev) &&
195 (jiffies - dev->trans_start) > dev->watchdog_timeo) {
196 printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n", dev->name);
197 dev->tx_timeout(dev);
198 }
199 if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
200 dev_hold(dev);
201 }
202 }
203 spin_unlock(&dev->xmit_lock);
204
205 dev_put(dev);
206}
207
208static void dev_watchdog_init(struct net_device *dev)
209{
210 init_timer(&dev->watchdog_timer);
211 dev->watchdog_timer.data = (unsigned long)dev;
212 dev->watchdog_timer.function = dev_watchdog;
213}
214
215void __netdev_watchdog_up(struct net_device *dev)
216{
217 if (dev->tx_timeout) {
218 if (dev->watchdog_timeo <= 0)
219 dev->watchdog_timeo = 5*HZ;
220 if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
221 dev_hold(dev);
222 }
223}
224
225static void dev_watchdog_up(struct net_device *dev)
226{
227 spin_lock_bh(&dev->xmit_lock);
228 __netdev_watchdog_up(dev);
229 spin_unlock_bh(&dev->xmit_lock);
230}
231
232static void dev_watchdog_down(struct net_device *dev)
233{
234 spin_lock_bh(&dev->xmit_lock);
235 if (del_timer(&dev->watchdog_timer))
236 __dev_put(dev);
237 spin_unlock_bh(&dev->xmit_lock);
238}
239
240/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
241 under all circumstances. It is difficult to invent anything faster or
242 cheaper.
243 */
244
245static int
246noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
247{
248 kfree_skb(skb);
249 return NET_XMIT_CN;
250}
251
252static struct sk_buff *
253noop_dequeue(struct Qdisc * qdisc)
254{
255 return NULL;
256}
257
258static int
259noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
260{
261 if (net_ratelimit())
262 printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name);
263 kfree_skb(skb);
264 return NET_XMIT_CN;
265}
266
267struct Qdisc_ops noop_qdisc_ops = {
268 .next = NULL,
269 .cl_ops = NULL,
270 .id = "noop",
271 .priv_size = 0,
272 .enqueue = noop_enqueue,
273 .dequeue = noop_dequeue,
274 .requeue = noop_requeue,
275 .owner = THIS_MODULE,
276};
277
278struct Qdisc noop_qdisc = {
279 .enqueue = noop_enqueue,
280 .dequeue = noop_dequeue,
281 .flags = TCQ_F_BUILTIN,
282 .ops = &noop_qdisc_ops,
283 .list = LIST_HEAD_INIT(noop_qdisc.list),
284};
285
286static struct Qdisc_ops noqueue_qdisc_ops = {
287 .next = NULL,
288 .cl_ops = NULL,
289 .id = "noqueue",
290 .priv_size = 0,
291 .enqueue = noop_enqueue,
292 .dequeue = noop_dequeue,
293 .requeue = noop_requeue,
294 .owner = THIS_MODULE,
295};
296
297static struct Qdisc noqueue_qdisc = {
298 .enqueue = NULL,
299 .dequeue = noop_dequeue,
300 .flags = TCQ_F_BUILTIN,
301 .ops = &noqueue_qdisc_ops,
302 .list = LIST_HEAD_INIT(noqueue_qdisc.list),
303};
304
305
306static const u8 prio2band[TC_PRIO_MAX+1] =
307 { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 };
308
309/* 3-band FIFO queue: old style, but should be a bit faster than
310 generic prio+fifo combination.
311 */
312
313static int
314pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
315{
316 struct sk_buff_head *list = qdisc_priv(qdisc);
317
318 list += prio2band[skb->priority&TC_PRIO_MAX];
319
320 if (list->qlen < qdisc->dev->tx_queue_len) {
321 __skb_queue_tail(list, skb);
322 qdisc->q.qlen++;
323 qdisc->bstats.bytes += skb->len;
324 qdisc->bstats.packets++;
325 return 0;
326 }
327 qdisc->qstats.drops++;
328 kfree_skb(skb);
329 return NET_XMIT_DROP;
330}
331
332static struct sk_buff *
333pfifo_fast_dequeue(struct Qdisc* qdisc)
334{
335 int prio;
336 struct sk_buff_head *list = qdisc_priv(qdisc);
337 struct sk_buff *skb;
338
339 for (prio = 0; prio < 3; prio++, list++) {
340 skb = __skb_dequeue(list);
341 if (skb) {
342 qdisc->q.qlen--;
343 return skb;
344 }
345 }
346 return NULL;
347}
348
349static int
350pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
351{
352 struct sk_buff_head *list = qdisc_priv(qdisc);
353
354 list += prio2band[skb->priority&TC_PRIO_MAX];
355
356 __skb_queue_head(list, skb);
357 qdisc->q.qlen++;
358 qdisc->qstats.requeues++;
359 return 0;
360}
361
362static void
363pfifo_fast_reset(struct Qdisc* qdisc)
364{
365 int prio;
366 struct sk_buff_head *list = qdisc_priv(qdisc);
367
368 for (prio=0; prio < 3; prio++)
369 skb_queue_purge(list+prio);
370 qdisc->q.qlen = 0;
371}
372
373static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
374{
375 unsigned char *b = skb->tail;
376 struct tc_prio_qopt opt;
377
378 opt.bands = 3;
379 memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
380 RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
381 return skb->len;
382
383rtattr_failure:
384 skb_trim(skb, b - skb->data);
385 return -1;
386}
387
388static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
389{
390 int i;
391 struct sk_buff_head *list = qdisc_priv(qdisc);
392
393 for (i=0; i<3; i++)
394 skb_queue_head_init(list+i);
395
396 return 0;
397}
398
399static struct Qdisc_ops pfifo_fast_ops = {
400 .next = NULL,
401 .cl_ops = NULL,
402 .id = "pfifo_fast",
403 .priv_size = 3 * sizeof(struct sk_buff_head),
404 .enqueue = pfifo_fast_enqueue,
405 .dequeue = pfifo_fast_dequeue,
406 .requeue = pfifo_fast_requeue,
407 .init = pfifo_fast_init,
408 .reset = pfifo_fast_reset,
409 .dump = pfifo_fast_dump,
410 .owner = THIS_MODULE,
411};
412
413struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
414{
415 void *p;
416 struct Qdisc *sch;
417 int size;
418
419 /* ensure that the Qdisc and the private data are 32-byte aligned */
420 size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST);
421 size += ops->priv_size + QDISC_ALIGN_CONST;
422
423 p = kmalloc(size, GFP_KERNEL);
424 if (!p)
425 return NULL;
426 memset(p, 0, size);
427
428 sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
429 & ~QDISC_ALIGN_CONST);
430 sch->padded = (char *)sch - (char *)p;
431
432 INIT_LIST_HEAD(&sch->list);
433 skb_queue_head_init(&sch->q);
434 sch->ops = ops;
435 sch->enqueue = ops->enqueue;
436 sch->dequeue = ops->dequeue;
437 sch->dev = dev;
438 dev_hold(dev);
439 sch->stats_lock = &dev->queue_lock;
440 atomic_set(&sch->refcnt, 1);
441 if (!ops->init || ops->init(sch, NULL) == 0)
442 return sch;
443
444 dev_put(dev);
445 kfree(p);
446 return NULL;
447}
448
449/* Under dev->queue_lock and BH! */
450
451void qdisc_reset(struct Qdisc *qdisc)
452{
453 struct Qdisc_ops *ops = qdisc->ops;
454
455 if (ops->reset)
456 ops->reset(qdisc);
457}
458
459/* this is the rcu callback function to clean up a qdisc when there
460 * are no further references to it */
461
462static void __qdisc_destroy(struct rcu_head *head)
463{
464 struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
465 struct Qdisc_ops *ops = qdisc->ops;
466
467#ifdef CONFIG_NET_ESTIMATOR
468 gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
469#endif
470 write_lock(&qdisc_tree_lock);
471 if (ops->reset)
472 ops->reset(qdisc);
473 if (ops->destroy)
474 ops->destroy(qdisc);
475 write_unlock(&qdisc_tree_lock);
476 module_put(ops->owner);
477
478 dev_put(qdisc->dev);
479 kfree((char *) qdisc - qdisc->padded);
480}
481
482/* Under dev->queue_lock and BH! */
483
484void qdisc_destroy(struct Qdisc *qdisc)
485{
486 struct list_head cql = LIST_HEAD_INIT(cql);
487 struct Qdisc *cq, *q, *n;
488
489 if (qdisc->flags & TCQ_F_BUILTIN ||
490 !atomic_dec_and_test(&qdisc->refcnt))
491 return;
492
493 if (!list_empty(&qdisc->list)) {
494 if (qdisc->ops->cl_ops == NULL)
495 list_del(&qdisc->list);
496 else
497 list_move(&qdisc->list, &cql);
498 }
499
500 /* unlink inner qdiscs from dev->qdisc_list immediately */
501 list_for_each_entry(cq, &cql, list)
502 list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list)
503 if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) {
504 if (q->ops->cl_ops == NULL)
505 list_del_init(&q->list);
506 else
507 list_move_tail(&q->list, &cql);
508 }
509 list_for_each_entry_safe(cq, n, &cql, list)
510 list_del_init(&cq->list);
511
512 call_rcu(&qdisc->q_rcu, __qdisc_destroy);
513}
514
515void dev_activate(struct net_device *dev)
516{
517 /* If no queueing discipline is attached to the device,
518    create a default one: pfifo_fast for devices that
519    need queueing, and noqueue_qdisc for virtual
520    interfaces.
521 */
522
523 if (dev->qdisc_sleeping == &noop_qdisc) {
524 struct Qdisc *qdisc;
525 if (dev->tx_queue_len) {
526 qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
527 if (qdisc == NULL) {
528 printk(KERN_INFO "%s: activation failed\n", dev->name);
529 return;
530 }
531 write_lock_bh(&qdisc_tree_lock);
532 list_add_tail(&qdisc->list, &dev->qdisc_list);
533 write_unlock_bh(&qdisc_tree_lock);
534 } else {
535 qdisc = &noqueue_qdisc;
536 }
537 write_lock_bh(&qdisc_tree_lock);
538 dev->qdisc_sleeping = qdisc;
539 write_unlock_bh(&qdisc_tree_lock);
540 }
541
542 spin_lock_bh(&dev->queue_lock);
543 rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
544 if (dev->qdisc != &noqueue_qdisc) {
545 dev->trans_start = jiffies;
546 dev_watchdog_up(dev);
547 }
548 spin_unlock_bh(&dev->queue_lock);
549}
550
551void dev_deactivate(struct net_device *dev)
552{
553 struct Qdisc *qdisc;
554
555 spin_lock_bh(&dev->queue_lock);
556 qdisc = dev->qdisc;
557 dev->qdisc = &noop_qdisc;
558
559 qdisc_reset(qdisc);
560
561 spin_unlock_bh(&dev->queue_lock);
562
563 dev_watchdog_down(dev);
564
565 while (test_bit(__LINK_STATE_SCHED, &dev->state))
566 yield();
567
568 spin_unlock_wait(&dev->xmit_lock);
569}
570
571void dev_init_scheduler(struct net_device *dev)
572{
573 qdisc_lock_tree(dev);
574 dev->qdisc = &noop_qdisc;
575 dev->qdisc_sleeping = &noop_qdisc;
576 INIT_LIST_HEAD(&dev->qdisc_list);
577 qdisc_unlock_tree(dev);
578
579 dev_watchdog_init(dev);
580}
581
582void dev_shutdown(struct net_device *dev)
583{
584 struct Qdisc *qdisc;
585
586 qdisc_lock_tree(dev);
587 qdisc = dev->qdisc_sleeping;
588 dev->qdisc = &noop_qdisc;
589 dev->qdisc_sleeping = &noop_qdisc;
590 qdisc_destroy(qdisc);
591#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
592 if ((qdisc = dev->qdisc_ingress) != NULL) {
593 dev->qdisc_ingress = NULL;
594 qdisc_destroy(qdisc);
595 }
596#endif
597 BUG_TRAP(!timer_pending(&dev->watchdog_timer));
598 qdisc_unlock_tree(dev);
599}
600
601EXPORT_SYMBOL(__netdev_watchdog_up);
602EXPORT_SYMBOL(noop_qdisc);
603EXPORT_SYMBOL(noop_qdisc_ops);
604EXPORT_SYMBOL(qdisc_create_dflt);
605EXPORT_SYMBOL(qdisc_destroy);
606EXPORT_SYMBOL(qdisc_reset);
607EXPORT_SYMBOL(qdisc_restart);
608EXPORT_SYMBOL(qdisc_lock_tree);
609EXPORT_SYMBOL(qdisc_unlock_tree);
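
The default pfifo_fast qdisc above keeps three bands and chooses a band from the low four bits of skb->priority via the prio2band[] table; dequeue always drains band 0 before band 1 before band 2. The sketch below merely prints that mapping; the table and TC_PRIO_MAX (15) are copied from the code above, the rest is illustrative.

#include <stdio.h>

#define TC_PRIO_MAX 15  /* matches the kernel's definition */

/* Same table as pfifo_fast above: priority -> band 0 (served first) .. 2 (served last). */
static const unsigned char prio2band[TC_PRIO_MAX + 1] =
        { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

int main(void)
{
        unsigned int priority;

        for (priority = 0; priority <= TC_PRIO_MAX; priority++)
                printf("priority %2u -> band %u\n",
                       priority, prio2band[priority & TC_PRIO_MAX]);
        return 0;
}
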
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
new file mode 100644
index 000000000000..25c171c32715
--- /dev/null
+++ b/net/sched/sch_gred.c
@@ -0,0 +1,630 @@
1/*
2 * net/sched/sch_gred.c Generic Random Early Detection queue.
3 *
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version
8 * 2 of the License, or (at your option) any later version.
9 *
10 * Authors: J Hadi Salim (hadi@cyberus.ca) 1998-2002
11 *
12 * 991129: - Bug fix with grio mode
13 * - a better single AvgQ mode with Grio (WRED)
14 * - a finer-grained VQ dequeue based on a suggestion
15 * from Ren Liu
16 * - More error checks
17 *
18 *
19 *
20 * For all the glorious comments look at Alexey's sch_red.c
21 */
22
23#include <linux/config.h>
24#include <linux/module.h>
25#include <asm/uaccess.h>
26#include <asm/system.h>
27#include <linux/bitops.h>
28#include <linux/types.h>
29#include <linux/kernel.h>
30#include <linux/sched.h>
31#include <linux/string.h>
32#include <linux/mm.h>
33#include <linux/socket.h>
34#include <linux/sockios.h>
35#include <linux/in.h>
36#include <linux/errno.h>
37#include <linux/interrupt.h>
38#include <linux/if_ether.h>
39#include <linux/inet.h>
40#include <linux/netdevice.h>
41#include <linux/etherdevice.h>
42#include <linux/notifier.h>
43#include <net/ip.h>
44#include <net/route.h>
45#include <linux/skbuff.h>
46#include <net/sock.h>
47#include <net/pkt_sched.h>
48
49#if 1 /* control */
50#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
51#else
52#define DPRINTK(format,args...)
53#endif
54
55#if 0 /* data */
56#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
57#else
58#define D2PRINTK(format,args...)
59#endif
60
61struct gred_sched_data;
62struct gred_sched;
63
64struct gred_sched_data
65{
66/* Parameters */
67 u32 limit; /* HARD maximal queue length */
68 u32 qth_min; /* Min average length threshold: A scaled */
69 u32 qth_max; /* Max average length threshold: A scaled */
70 u32 DP; /* the drop parameters */
71 char Wlog; /* log(W) */
72 char Plog; /* random number bits */
73 u32 Scell_max;
74 u32 Rmask;
75 u32 bytesin; /* bytes seen on virtualQ so far*/
76 u32 packetsin; /* packets seen on virtualQ so far*/
77 u32 backlog; /* bytes on the virtualQ */
78 u32 forced; /* packets dropped for exceeding limits */
79 u32 early; /* packets dropped as a warning */
80 u32 other; /* packets dropped by invoking drop() */
81 u32 pdrop; /* packets dropped because we exceeded physical queue limits */
82 char Scell_log;
83 u8 Stab[256];
84 u8 prio; /* the prio of this vq */
85
86/* Variables */
87 unsigned long qave; /* Average queue length: A scaled */
88 int qcount; /* Packets since last random number generation */
89 u32 qR; /* Cached random number */
90
91 psched_time_t qidlestart; /* Start of idle period */
92};
93
94struct gred_sched
95{
96 struct gred_sched_data *tab[MAX_DPs];
97 u32 DPs;
98 u32 def;
99 u8 initd;
100 u8 grio;
101 u8 eqp;
102};
103
104static int
105gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
106{
107 psched_time_t now;
108 struct gred_sched_data *q=NULL;
109 struct gred_sched *t= qdisc_priv(sch);
110 unsigned long qave=0;
111 int i=0;
112
113 if (!t->initd && skb_queue_len(&sch->q) < (sch->dev->tx_queue_len ? : 1)) {
114 D2PRINTK("NO GRED Queues setup yet! Enqueued anyway\n");
115 goto do_enqueue;
116 }
117
118
119 if ( ((skb->tc_index&0xf) > (t->DPs -1)) || !(q=t->tab[skb->tc_index&0xf])) {
120 printk("GRED: setting to default (%d)\n ",t->def);
121 if (!(q=t->tab[t->def])) {
122 DPRINTK("GRED: setting to default FAILED! dropping!! "
123 "(%d)\n ", t->def);
124 goto drop;
125 }
126 /* fix tc_index? --could be controversial but needed for
127 requeueing */
128 skb->tc_index=(skb->tc_index&0xfffffff0) | t->def;
129 }
130
131 D2PRINTK("gred_enqueue virtualQ 0x%x classid %x backlog %d "
132 "general backlog %d\n",skb->tc_index&0xf,sch->handle,q->backlog,
133 sch->qstats.backlog);
134 /* sum up all the qaves of prios <= to ours to get the new qave*/
135 if (!t->eqp && t->grio) {
136 for (i=0;i<t->DPs;i++) {
137 if ((!t->tab[i]) || (i==q->DP))
138 continue;
139
140 if ((t->tab[i]->prio < q->prio) && (PSCHED_IS_PASTPERFECT(t->tab[i]->qidlestart)))
141 qave +=t->tab[i]->qave;
142 }
143
144 }
145
146 q->packetsin++;
147 q->bytesin+=skb->len;
148
149 if (t->eqp && t->grio) {
150 qave=0;
151 q->qave=t->tab[t->def]->qave;
152 q->qidlestart=t->tab[t->def]->qidlestart;
153 }
154
155 if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
156 long us_idle;
157 PSCHED_GET_TIME(now);
158 us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
159 PSCHED_SET_PASTPERFECT(q->qidlestart);
160
161 q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF];
162 } else {
163 if (t->eqp) {
164 q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
165 } else {
166 q->qave += q->backlog - (q->qave >> q->Wlog);
167 }
168
169 }
170
171
172 if (t->eqp && t->grio)
173 t->tab[t->def]->qave=q->qave;
174
175 if ((q->qave+qave) < q->qth_min) {
176 q->qcount = -1;
177enqueue:
178 if (q->backlog + skb->len <= q->limit) {
179 q->backlog += skb->len;
180do_enqueue:
181 __skb_queue_tail(&sch->q, skb);
182 sch->qstats.backlog += skb->len;
183 sch->bstats.bytes += skb->len;
184 sch->bstats.packets++;
185 return 0;
186 } else {
187 q->pdrop++;
188 }
189
190drop:
191 kfree_skb(skb);
192 sch->qstats.drops++;
193 return NET_XMIT_DROP;
194 }
195 if ((q->qave+qave) >= q->qth_max) {
196 q->qcount = -1;
197 sch->qstats.overlimits++;
198 q->forced++;
199 goto drop;
200 }
201 if (++q->qcount) {
202 if ((((qave+q->qave) - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
203 goto enqueue;
204 q->qcount = 0;
205 q->qR = net_random()&q->Rmask;
206 sch->qstats.overlimits++;
207 q->early++;
208 goto drop;
209 }
210 q->qR = net_random()&q->Rmask;
211 goto enqueue;
212}
213
214static int
215gred_requeue(struct sk_buff *skb, struct Qdisc* sch)
216{
217 struct gred_sched_data *q;
218 struct gred_sched *t= qdisc_priv(sch);
219 q= t->tab[(skb->tc_index&0xf)];
220/* error checking here -- probably unnecessary */
221 PSCHED_SET_PASTPERFECT(q->qidlestart);
222
223 __skb_queue_head(&sch->q, skb);
224 sch->qstats.backlog += skb->len;
225 sch->qstats.requeues++;
226 q->backlog += skb->len;
227 return 0;
228}
229
230static struct sk_buff *
231gred_dequeue(struct Qdisc* sch)
232{
233 struct sk_buff *skb;
234 struct gred_sched_data *q;
235 struct gred_sched *t= qdisc_priv(sch);
236
237 skb = __skb_dequeue(&sch->q);
238 if (skb) {
239 sch->qstats.backlog -= skb->len;
240 q= t->tab[(skb->tc_index&0xf)];
241 if (q) {
242 q->backlog -= skb->len;
243 if (!q->backlog && !t->eqp)
244 PSCHED_GET_TIME(q->qidlestart);
245 } else {
246 D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf);
247 }
248 return skb;
249 }
250
251 if (t->eqp) {
252 q= t->tab[t->def];
253 if (!q)
254 D2PRINTK("no default VQ set: Results will be "
255 "screwed up\n");
256 else
257 PSCHED_GET_TIME(q->qidlestart);
258 }
259
260 return NULL;
261}
262
263static unsigned int gred_drop(struct Qdisc* sch)
264{
265 struct sk_buff *skb;
266
267 struct gred_sched_data *q;
268 struct gred_sched *t= qdisc_priv(sch);
269
270 skb = __skb_dequeue_tail(&sch->q);
271 if (skb) {
272 unsigned int len = skb->len;
273 sch->qstats.backlog -= len;
274 sch->qstats.drops++;
275 q= t->tab[(skb->tc_index&0xf)];
276 if (q) {
277 q->backlog -= len;
278 q->other++;
279 if (!q->backlog && !t->eqp)
280 PSCHED_GET_TIME(q->qidlestart);
281 } else {
282 D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf);
283 }
284
285 kfree_skb(skb);
286 return len;
287 }
288
289 q=t->tab[t->def];
290 if (!q) {
291 D2PRINTK("no default VQ set: Results might be screwed up\n");
292 return 0;
293 }
294
295 PSCHED_GET_TIME(q->qidlestart);
296 return 0;
297
298}
299
300static void gred_reset(struct Qdisc* sch)
301{
302 int i;
303 struct gred_sched_data *q;
304 struct gred_sched *t= qdisc_priv(sch);
305
306 __skb_queue_purge(&sch->q);
307
308 sch->qstats.backlog = 0;
309
310 for (i=0;i<t->DPs;i++) {
311 q= t->tab[i];
312 if (!q)
313 continue;
314 PSCHED_SET_PASTPERFECT(q->qidlestart);
315 q->qave = 0;
316 q->qcount = -1;
317 q->backlog = 0;
318 q->other=0;
319 q->forced=0;
320 q->pdrop=0;
321 q->early=0;
322 }
323}
324
325static int gred_change(struct Qdisc *sch, struct rtattr *opt)
326{
327 struct gred_sched *table = qdisc_priv(sch);
328 struct gred_sched_data *q;
329 struct tc_gred_qopt *ctl;
330 struct tc_gred_sopt *sopt;
331 struct rtattr *tb[TCA_GRED_STAB];
332 struct rtattr *tb2[TCA_GRED_DPS];
333 int i;
334
335 if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt))
336 return -EINVAL;
337
338 if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) {
339 rtattr_parse_nested(tb2, TCA_GRED_DPS, opt);
340
341 if (tb2[TCA_GRED_DPS-1] == 0)
342 return -EINVAL;
343
344 sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]);
345 table->DPs=sopt->DPs;
346 table->def=sopt->def_DP;
347 table->grio=sopt->grio;
348 table->initd=0;
349 /* probably need to clear all the table DP entries as well */
350 return 0;
351 }
352
353
354 if (!table->DPs || tb[TCA_GRED_PARMS-1] == 0 || tb[TCA_GRED_STAB-1] == 0 ||
355 RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) ||
356 RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256)
357 return -EINVAL;
358
359 ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]);
360 if (ctl->DP > MAX_DPs-1 ) {
361 /* misbehaving is punished! Put in the default drop probability */
362 DPRINTK("\nGRED: DP %u not in the proper range fixed. New DP "
363 "set to default at %d\n",ctl->DP,table->def);
364 ctl->DP=table->def;
365 }
366
367 if (table->tab[ctl->DP] == NULL) {
368 table->tab[ctl->DP]=kmalloc(sizeof(struct gred_sched_data),
369 GFP_KERNEL);
370 if (NULL == table->tab[ctl->DP])
371 return -ENOMEM;
372 memset(table->tab[ctl->DP], 0, (sizeof(struct gred_sched_data)));
373 }
374 q= table->tab[ctl->DP];
375
376 if (table->grio) {
377 if (ctl->prio <=0) {
378 if (table->def && table->tab[table->def]) {
379 DPRINTK("\nGRED: DP %u does not have a prio, "
380 "setting default to %d\n",ctl->DP,
381 table->tab[table->def]->prio);
382 q->prio=table->tab[table->def]->prio;
383 } else {
384 DPRINTK("\nGRED: DP %u does not have a prio"
385 " setting default to 8\n",ctl->DP);
386 q->prio=8;
387 }
388 } else {
389 q->prio=ctl->prio;
390 }
391 } else {
392 q->prio=8;
393 }
394
395
396 q->DP=ctl->DP;
397 q->Wlog = ctl->Wlog;
398 q->Plog = ctl->Plog;
399 q->limit = ctl->limit;
400 q->Scell_log = ctl->Scell_log;
401 q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
402 q->Scell_max = (255<<q->Scell_log);
403 q->qth_min = ctl->qth_min<<ctl->Wlog;
404 q->qth_max = ctl->qth_max<<ctl->Wlog;
405 q->qave=0;
406 q->backlog=0;
407 q->qcount = -1;
408 q->other=0;
409 q->forced=0;
410 q->pdrop=0;
411 q->early=0;
412
413 PSCHED_SET_PASTPERFECT(q->qidlestart);
414 memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256);
415
416 if ( table->initd && table->grio) {
417 /* this looks ugly but it's not in the fast path */
418 for (i=0;i<table->DPs;i++) {
419 if ((!table->tab[i]) || (i==q->DP) )
420 continue;
421 if (table->tab[i]->prio == q->prio ){
422 /* WRED mode detected */
423 table->eqp=1;
424 break;
425 }
426 }
427 }
428
429 if (!table->initd) {
430 table->initd=1;
431 /*
432 the first entry also goes into the default until
433 over-written
434 */
435
436 if (table->tab[table->def] == NULL) {
437 table->tab[table->def]=
438 kmalloc(sizeof(struct gred_sched_data), GFP_KERNEL);
439 if (NULL == table->tab[table->def])
440 return -ENOMEM;
441
442 memset(table->tab[table->def], 0,
443 (sizeof(struct gred_sched_data)));
444 }
445 q= table->tab[table->def];
446 q->DP=table->def;
447 q->Wlog = ctl->Wlog;
448 q->Plog = ctl->Plog;
449 q->limit = ctl->limit;
450 q->Scell_log = ctl->Scell_log;
451 q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
452 q->Scell_max = (255<<q->Scell_log);
453 q->qth_min = ctl->qth_min<<ctl->Wlog;
454 q->qth_max = ctl->qth_max<<ctl->Wlog;
455
456 if (table->grio)
457 q->prio=table->tab[ctl->DP]->prio;
458 else
459 q->prio=8;
460
461 q->qcount = -1;
462 PSCHED_SET_PASTPERFECT(q->qidlestart);
463 memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256);
464 }
465 return 0;
466
467}
468
469static int gred_init(struct Qdisc *sch, struct rtattr *opt)
470{
471 struct gred_sched *table = qdisc_priv(sch);
472 struct tc_gred_sopt *sopt;
473 struct rtattr *tb[TCA_GRED_STAB];
474 struct rtattr *tb2[TCA_GRED_DPS];
475
476 if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt))
477 return -EINVAL;
478
479 if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) {
480 rtattr_parse_nested(tb2, TCA_GRED_DPS, opt);
481
482 if (tb2[TCA_GRED_DPS-1] == 0)
483 return -EINVAL;
484
485 sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]);
486 table->DPs=sopt->DPs;
487 table->def=sopt->def_DP;
488 table->grio=sopt->grio;
489 table->initd=0;
490 return 0;
491 }
492
493 DPRINTK("\n GRED_INIT error!\n");
494 return -EINVAL;
495}
496
497static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
498{
499 unsigned long qave;
500 struct rtattr *rta;
501 struct tc_gred_qopt *opt = NULL ;
502 struct tc_gred_qopt *dst;
503 struct gred_sched *table = qdisc_priv(sch);
504 struct gred_sched_data *q;
505 int i;
506 unsigned char *b = skb->tail;
507
508 rta = (struct rtattr*)b;
509 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
510
511 opt=kmalloc(sizeof(struct tc_gred_qopt)*MAX_DPs, GFP_KERNEL);
512
513 if (opt == NULL) {
514 DPRINTK("gred_dump:failed to malloc for %Zd\n",
515 sizeof(struct tc_gred_qopt)*MAX_DPs);
516 goto rtattr_failure;
517 }
518
519 memset(opt, 0, (sizeof(struct tc_gred_qopt))*table->DPs);
520
521 if (!table->initd) {
522 DPRINTK("NO GRED Queues setup!\n");
523 }
524
525 for (i=0;i<MAX_DPs;i++) {
526 dst= &opt[i];
527 q= table->tab[i];
528
529 if (!q) {
530 /* hack -- fix at some point with proper message
531 This is how we indicate to tc that there is no VQ
532 at this DP */
533
534 dst->DP=MAX_DPs+i;
535 continue;
536 }
537
538 dst->limit=q->limit;
539 dst->qth_min=q->qth_min>>q->Wlog;
540 dst->qth_max=q->qth_max>>q->Wlog;
541 dst->DP=q->DP;
542 dst->backlog=q->backlog;
543 if (q->qave) {
544 if (table->eqp && table->grio) {
545 q->qidlestart=table->tab[table->def]->qidlestart;
546 q->qave=table->tab[table->def]->qave;
547 }
548 if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
549 long idle;
550 psched_time_t now;
551 PSCHED_GET_TIME(now);
552 idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
553 qave = q->qave >> q->Stab[(idle>>q->Scell_log)&0xFF];
554 dst->qave = qave >> q->Wlog;
555
556 } else {
557 dst->qave = q->qave >> q->Wlog;
558 }
559 } else {
560 dst->qave = 0;
561 }
562
563
564 dst->Wlog = q->Wlog;
565 dst->Plog = q->Plog;
566 dst->Scell_log = q->Scell_log;
567 dst->other = q->other;
568 dst->forced = q->forced;
569 dst->early = q->early;
570 dst->pdrop = q->pdrop;
571 dst->prio = q->prio;
572 dst->packets=q->packetsin;
573 dst->bytesin=q->bytesin;
574 }
575
576 RTA_PUT(skb, TCA_GRED_PARMS, sizeof(struct tc_gred_qopt)*MAX_DPs, opt);
577 rta->rta_len = skb->tail - b;
578
579 kfree(opt);
580 return skb->len;
581
582rtattr_failure:
583 if (opt)
584 kfree(opt);
585 DPRINTK("gred_dump: FAILURE!!!!\n");
586
587/* also free the opt struct here */
588 skb_trim(skb, b - skb->data);
589 return -1;
590}
591
592static void gred_destroy(struct Qdisc *sch)
593{
594 struct gred_sched *table = qdisc_priv(sch);
595 int i;
596
597 for (i = 0;i < table->DPs; i++) {
598 if (table->tab[i])
599 kfree(table->tab[i]);
600 }
601}
602
603static struct Qdisc_ops gred_qdisc_ops = {
604 .next = NULL,
605 .cl_ops = NULL,
606 .id = "gred",
607 .priv_size = sizeof(struct gred_sched),
608 .enqueue = gred_enqueue,
609 .dequeue = gred_dequeue,
610 .requeue = gred_requeue,
611 .drop = gred_drop,
612 .init = gred_init,
613 .reset = gred_reset,
614 .destroy = gred_destroy,
615 .change = gred_change,
616 .dump = gred_dump,
617 .owner = THIS_MODULE,
618};
619
620static int __init gred_module_init(void)
621{
622 return register_qdisc(&gred_qdisc_ops);
623}
624static void __exit gred_module_exit(void)
625{
626 unregister_qdisc(&gred_qdisc_ops);
627}
628module_init(gred_module_init)
629module_exit(gred_module_exit)
630MODULE_LICENSE("GPL");
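
gred_enqueue() above keeps the RED average queue length as a fixed-point EWMA: qave is stored scaled by 2^Wlog, and qave += backlog - (qave >> Wlog) moves the unscaled average a fraction 2^-Wlog of the way toward the current backlog; gred_change() pre-shifts qth_min and qth_max by Wlog so they compare directly against the scaled value. A stand-alone sketch of that update; Wlog = 9 and the constant 20000-byte backlog are arbitrary example values.

#include <stdio.h>

/* Fixed-point EWMA as used above: qave is kept scaled by 2^Wlog, so adding
 * backlog - (qave >> Wlog) moves the (unscaled) average a fraction 2^-Wlog
 * of the way toward the current backlog. */
static unsigned long ewma_update(unsigned long qave, unsigned long backlog, int Wlog)
{
        return qave + backlog - (qave >> Wlog);
}

int main(void)
{
        const int Wlog = 9;             /* a typical RED averaging constant */
        unsigned long qave = 0;
        int i;

        for (i = 1; i <= 4096; i++) {
                qave = ewma_update(qave, 20000, Wlog);  /* constant 20000-byte backlog */
                if ((i & (i - 1)) == 0)                 /* report at powers of two */
                        printf("after %4d packets: avg ~ %lu bytes\n", i, qave >> Wlog);
        }
        return 0;
}
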
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
new file mode 100644
index 000000000000..c26764bc4103
--- /dev/null
+++ b/net/sched/sch_hfsc.c
@@ -0,0 +1,1822 @@
1/*
2 * Copyright (c) 2003 Patrick McHardy, <kaber@trash.net>
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version 2
7 * of the License, or (at your option) any later version.
8 *
9 * 2003-10-17 - Ported from altq
10 */
11/*
12 * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved.
13 *
14 * Permission to use, copy, modify, and distribute this software and
15 * its documentation is hereby granted (including for commercial or
16 * for-profit use), provided that both the copyright notice and this
17 * permission notice appear in all copies of the software, derivative
18 * works, or modified versions, and any portions thereof.
19 *
20 * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF
21 * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS
22 * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED
23 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
28 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
29 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
32 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
33 * DAMAGE.
34 *
35 * Carnegie Mellon encourages (but does not require) users of this
36 * software to return any improvements or extensions that they make,
37 * and to grant Carnegie Mellon the rights to redistribute these
38 * changes without encumbrance.
39 */
40/*
41 * H-FSC is described in Proceedings of SIGCOMM'97,
42 * "A Hierarchical Fair Service Curve Algorithm for Link-Sharing,
43 * Real-Time and Priority Service"
44 * by Ion Stoica, Hui Zhang, and T. S. Eugene Ng.
45 *
46 * Oleg Cherevko <olwi@aq.ml.com.ua> added the upperlimit for link-sharing.
47 * when a class has an upperlimit, the fit-time is computed from the
48 * upperlimit service curve. the link-sharing scheduler does not schedule
49 * a class whose fit-time exceeds the current time.
50 */
51
52#include <linux/kernel.h>
53#include <linux/config.h>
54#include <linux/module.h>
55#include <linux/types.h>
56#include <linux/errno.h>
57#include <linux/jiffies.h>
58#include <linux/compiler.h>
59#include <linux/spinlock.h>
60#include <linux/skbuff.h>
61#include <linux/string.h>
62#include <linux/slab.h>
63#include <linux/timer.h>
64#include <linux/list.h>
65#include <linux/rbtree.h>
66#include <linux/init.h>
67#include <linux/netdevice.h>
68#include <linux/rtnetlink.h>
69#include <linux/pkt_sched.h>
70#include <net/pkt_sched.h>
71#include <net/pkt_cls.h>
72#include <asm/system.h>
73#include <asm/div64.h>
74
75#define HFSC_DEBUG 1
76
77/*
78 * kernel internal service curve representation:
79 * coordinates are given by 64 bit unsigned integers.
80 * x-axis: unit is clock count.
81 * y-axis: unit is byte.
82 *
83 * The service curve parameters are converted to the internal
84 * representation. The slope values are scaled to avoid overflow.
85 * The inverse slope values as well as the y-projection of the 1st
86 * segment are kept in order to avoid 64-bit divide operations
87 * that are expensive on 32-bit architectures.
88 */
89
90struct internal_sc
91{
92 u64 sm1; /* scaled slope of the 1st segment */
93 u64 ism1; /* scaled inverse-slope of the 1st segment */
94 u64 dx; /* the x-projection of the 1st segment */
95 u64 dy; /* the y-projection of the 1st segment */
96 u64 sm2; /* scaled slope of the 2nd segment */
97 u64 ism2; /* scaled inverse-slope of the 2nd segment */
98};
99
100/* runtime service curve */
101struct runtime_sc
102{
103 u64 x; /* current starting position on x-axis */
104 u64 y; /* current starting position on y-axis */
105 u64 sm1; /* scaled slope of the 1st segment */
106 u64 ism1; /* scaled inverse-slope of the 1st segment */
107 u64 dx; /* the x-projection of the 1st segment */
108 u64 dy; /* the y-projection of the 1st segment */
109 u64 sm2; /* scaled slope of the 2nd segment */
110 u64 ism2; /* scaled inverse-slope of the 2nd segment */
111};
112
113enum hfsc_class_flags
114{
115 HFSC_RSC = 0x1,
116 HFSC_FSC = 0x2,
117 HFSC_USC = 0x4
118};
119
120struct hfsc_class
121{
122 u32 classid; /* class id */
123 unsigned int refcnt; /* usage count */
124
125 struct gnet_stats_basic bstats;
126 struct gnet_stats_queue qstats;
127 struct gnet_stats_rate_est rate_est;
128 spinlock_t *stats_lock;
129 unsigned int level; /* class level in hierarchy */
130 struct tcf_proto *filter_list; /* filter list */
131 unsigned int filter_cnt; /* filter count */
132
133 struct hfsc_sched *sched; /* scheduler data */
134 struct hfsc_class *cl_parent; /* parent class */
135 struct list_head siblings; /* sibling classes */
136 struct list_head children; /* child classes */
137 struct Qdisc *qdisc; /* leaf qdisc */
138
139 struct rb_node el_node; /* qdisc's eligible tree member */
140 struct rb_root vt_tree; /* active children sorted by cl_vt */
141 struct rb_node vt_node; /* parent's vt_tree member */
142 struct rb_root cf_tree; /* active children sorted by cl_f */
143 struct rb_node cf_node; /* parent's cf_tree member */
144 struct list_head hlist; /* hash list member */
145 struct list_head dlist; /* drop list member */
146
147 u64 cl_total; /* total work in bytes */
148 u64 cl_cumul; /* cumulative work in bytes done by
149 real-time criteria */
150
151 u64 cl_d; /* deadline */
152 u64 cl_e; /* eligible time */
153 u64 cl_vt; /* virtual time */
154 u64 cl_f; /* time when this class will fit for
155 link-sharing, max(myf, cfmin) */
156 u64 cl_myf; /* my fit-time (calculated from this
157 class's own upperlimit curve) */
158 u64 cl_myfadj; /* my fit-time adjustment (to cancel
159 history dependence) */
160 u64 cl_cfmin; /* earliest children's fit-time (used
161 with cl_myf to obtain cl_f) */
162 u64 cl_cvtmin; /* minimal virtual time among the
163 children fit for link-sharing
164 (monotonic within a period) */
165 u64 cl_vtadj; /* intra-period cumulative vt
166 adjustment */
167 u64 cl_vtoff; /* inter-period cumulative vt offset */
168 u64 cl_cvtmax; /* max child's vt in the last period */
169 u64 cl_cvtoff; /* cumulative cvtmax of all periods */
170 u64 cl_pcvtoff; /* parent's cvtoff at initialization
171 time */
172
173 struct internal_sc cl_rsc; /* internal real-time service curve */
174 struct internal_sc cl_fsc; /* internal fair service curve */
175 struct internal_sc cl_usc; /* internal upperlimit service curve */
176 struct runtime_sc cl_deadline; /* deadline curve */
177 struct runtime_sc cl_eligible; /* eligible curve */
178 struct runtime_sc cl_virtual; /* virtual curve */
179 struct runtime_sc cl_ulimit; /* upperlimit curve */
180
181 unsigned long cl_flags; /* which curves are valid */
182 unsigned long cl_vtperiod; /* vt period sequence number */
183 unsigned long cl_parentperiod; /* parent's vt period sequence number */
184 unsigned long cl_nactive; /* number of active children */
185};
186
187#define HFSC_HSIZE 16
188
189struct hfsc_sched
190{
191 u16 defcls; /* default class id */
192 struct hfsc_class root; /* root class */
193 struct list_head clhash[HFSC_HSIZE]; /* class hash */
194 struct rb_root eligible; /* eligible tree */
195 struct list_head droplist; /* active leaf class list (for
196 dropping) */
197 struct sk_buff_head requeue; /* requeued packet */
198 struct timer_list wd_timer; /* watchdog timer */
199};
200
201/*
202 * macros
203 */
204#ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY
205#include <linux/time.h>
206#undef PSCHED_GET_TIME
207#define PSCHED_GET_TIME(stamp) \
208do { \
209 struct timeval tv; \
210 do_gettimeofday(&tv); \
211 (stamp) = 1000000ULL * tv.tv_sec + tv.tv_usec; \
212} while (0)
213#endif
214
215#if HFSC_DEBUG
216#define ASSERT(cond) \
217do { \
218 if (unlikely(!(cond))) \
219 printk("assertion %s failed at %s:%i (%s)\n", \
220 #cond, __FILE__, __LINE__, __FUNCTION__); \
221} while (0)
222#else
223#define ASSERT(cond)
224#endif /* HFSC_DEBUG */
225
226#define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */
227
228
229/*
230 * eligible tree holds backlogged classes sorted by their eligible times.
231 * there is one eligible tree per hfsc instance.
232 */
233
234static void
235eltree_insert(struct hfsc_class *cl)
236{
237 struct rb_node **p = &cl->sched->eligible.rb_node;
238 struct rb_node *parent = NULL;
239 struct hfsc_class *cl1;
240
241 while (*p != NULL) {
242 parent = *p;
243 cl1 = rb_entry(parent, struct hfsc_class, el_node);
244 if (cl->cl_e >= cl1->cl_e)
245 p = &parent->rb_right;
246 else
247 p = &parent->rb_left;
248 }
249 rb_link_node(&cl->el_node, parent, p);
250 rb_insert_color(&cl->el_node, &cl->sched->eligible);
251}
252
253static inline void
254eltree_remove(struct hfsc_class *cl)
255{
256 rb_erase(&cl->el_node, &cl->sched->eligible);
257}
258
259static inline void
260eltree_update(struct hfsc_class *cl)
261{
262 eltree_remove(cl);
263 eltree_insert(cl);
264}
265
266/* find the class with the minimum deadline among the eligible classes */
267static inline struct hfsc_class *
268eltree_get_mindl(struct hfsc_sched *q, u64 cur_time)
269{
270 struct hfsc_class *p, *cl = NULL;
271 struct rb_node *n;
272
273 for (n = rb_first(&q->eligible); n != NULL; n = rb_next(n)) {
274 p = rb_entry(n, struct hfsc_class, el_node);
275 if (p->cl_e > cur_time)
276 break;
277 if (cl == NULL || p->cl_d < cl->cl_d)
278 cl = p;
279 }
280 return cl;
281}
282
283/* find the class with minimum eligible time among the eligible classes */
284static inline struct hfsc_class *
285eltree_get_minel(struct hfsc_sched *q)
286{
287 struct rb_node *n;
288
289 n = rb_first(&q->eligible);
290 if (n == NULL)
291 return NULL;
292 return rb_entry(n, struct hfsc_class, el_node);
293}
294
295/*
296 * vttree holds backlogged child classes sorted by their virtual
297 * time. each intermediate class has one vttree.
298 */
299static void
300vttree_insert(struct hfsc_class *cl)
301{
302 struct rb_node **p = &cl->cl_parent->vt_tree.rb_node;
303 struct rb_node *parent = NULL;
304 struct hfsc_class *cl1;
305
306 while (*p != NULL) {
307 parent = *p;
308 cl1 = rb_entry(parent, struct hfsc_class, vt_node);
309 if (cl->cl_vt >= cl1->cl_vt)
310 p = &parent->rb_right;
311 else
312 p = &parent->rb_left;
313 }
314 rb_link_node(&cl->vt_node, parent, p);
315 rb_insert_color(&cl->vt_node, &cl->cl_parent->vt_tree);
316}
317
318static inline void
319vttree_remove(struct hfsc_class *cl)
320{
321 rb_erase(&cl->vt_node, &cl->cl_parent->vt_tree);
322}
323
324static inline void
325vttree_update(struct hfsc_class *cl)
326{
327 vttree_remove(cl);
328 vttree_insert(cl);
329}
330
331static inline struct hfsc_class *
332vttree_firstfit(struct hfsc_class *cl, u64 cur_time)
333{
334 struct hfsc_class *p;
335 struct rb_node *n;
336
337 for (n = rb_first(&cl->vt_tree); n != NULL; n = rb_next(n)) {
338 p = rb_entry(n, struct hfsc_class, vt_node);
339 if (p->cl_f <= cur_time)
340 return p;
341 }
342 return NULL;
343}
344
345/*
346 * get the leaf class with the minimum vt in the hierarchy
347 */
348static struct hfsc_class *
349vttree_get_minvt(struct hfsc_class *cl, u64 cur_time)
350{
351 /* if root-class's cfmin is bigger than cur_time nothing to do */
352 if (cl->cl_cfmin > cur_time)
353 return NULL;
354
355 while (cl->level > 0) {
356 cl = vttree_firstfit(cl, cur_time);
357 if (cl == NULL)
358 return NULL;
359 /*
360 * update parent's cl_cvtmin.
361 */
362 if (cl->cl_parent->cl_cvtmin < cl->cl_vt)
363 cl->cl_parent->cl_cvtmin = cl->cl_vt;
364 }
365 return cl;
366}
367
368static void
369cftree_insert(struct hfsc_class *cl)
370{
371 struct rb_node **p = &cl->cl_parent->cf_tree.rb_node;
372 struct rb_node *parent = NULL;
373 struct hfsc_class *cl1;
374
375 while (*p != NULL) {
376 parent = *p;
377 cl1 = rb_entry(parent, struct hfsc_class, cf_node);
378 if (cl->cl_f >= cl1->cl_f)
379 p = &parent->rb_right;
380 else
381 p = &parent->rb_left;
382 }
383 rb_link_node(&cl->cf_node, parent, p);
384 rb_insert_color(&cl->cf_node, &cl->cl_parent->cf_tree);
385}
386
387static inline void
388cftree_remove(struct hfsc_class *cl)
389{
390 rb_erase(&cl->cf_node, &cl->cl_parent->cf_tree);
391}
392
393static inline void
394cftree_update(struct hfsc_class *cl)
395{
396 cftree_remove(cl);
397 cftree_insert(cl);
398}
399
400/*
401 * service curve support functions
402 *
403 * external service curve parameters
404 * m: bps
405 * d: us
406 * internal service curve parameters
407 * sm: (bytes/psched_us) << SM_SHIFT
408 * ism: (psched_us/byte) << ISM_SHIFT
409 * dx: psched_us
410 *
411 * Clock source resolution (CONFIG_NET_SCH_CLK_*)
412 * JIFFIES: for 48<=HZ<=1534 resolution is between 0.63us and 1.27us.
413 * CPU: resolution is between 0.5us and 1us.
414 * GETTIMEOFDAY: resolution is exactly 1us.
415 *
416 * sm and ism are scaled in order to keep effective digits.
417 * SM_SHIFT and ISM_SHIFT are selected to keep at least 4 effective
418 * decimal digits, as shown in the table below.
419 *
420 * Note: We can afford the additional accuracy (altq hfsc keeps at most
421 * 3 effective digits) because the Linux clock is bounded
422 * much more tightly.
423 *
424 * bits/sec 100Kbps 1Mbps 10Mbps 100Mbps 1Gbps
425 * ------------+-------------------------------------------------------
426 * bytes/0.5us 6.25e-3 62.5e-3 625e-3 6250e-3 62500e-3
427 * bytes/us 12.5e-3 125e-3 1250e-3 12500e-3 125000e-3
428 * bytes/1.27us 15.875e-3 158.75e-3 1587.5e-3 15875e-3 158750e-3
429 *
430 * 0.5us/byte 160 16 1.6 0.16 0.016
431 * us/byte 80 8 0.8 0.08 0.008
432 * 1.27us/byte 63 6.3 0.63 0.063 0.0063
433 */
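/*
 * Worked example (illustrative; it assumes 1 psched tick == 1us, i.e.
 * PSCHED_JIFFIE2US(HZ) == 1000000 as with the GETTIMEOFDAY clock, and
 * treats the m argument as a byte rate, matching the byte-unit y-axis
 * described earlier): for 125000 bytes/s (the 1Mbps column), the
 * conversion helpers below give
 *	sm  = m2sm(125000)  = (125000 << SM_SHIFT)   / 1000000 = 131072  (0.125 << 20)
 *	ism = m2ism(125000) = (1000000 << ISM_SHIFT) / 125000  = 2097152 (8 << 18)
 * and indeed seg_x2y(8, sm) == 1 and seg_y2x(1, ism) == 8, i.e. 8 ticks
 * per byte, matching the 125e-3 bytes/us and 8 us/byte table entries.
 */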
434#define SM_SHIFT 20
435#define ISM_SHIFT 18
436
437#define SM_MASK ((1ULL << SM_SHIFT) - 1)
438#define ISM_MASK ((1ULL << ISM_SHIFT) - 1)
439
440static inline u64
441seg_x2y(u64 x, u64 sm)
442{
443 u64 y;
444
445 /*
446 * compute
447 * y = x * sm >> SM_SHIFT
448 * but divide it for the upper and lower bits to avoid overflow
449 */
450 y = (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT);
451 return y;
452}
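/*
 * Note that the split above is exact: since (x >> SM_SHIFT) * sm is an
 * integer,
 *	(x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT)
 * equals (x * sm) >> SM_SHIFT computed with unbounded precision, so the
 * split only keeps the intermediate products within 64 bits and does not
 * introduce any additional rounding error. The same reasoning applies to
 * seg_y2x() below.
 */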
453
454static inline u64
455seg_y2x(u64 y, u64 ism)
456{
457 u64 x;
458
459 if (y == 0)
460 x = 0;
461 else if (ism == HT_INFINITY)
462 x = HT_INFINITY;
463 else {
464 x = (y >> ISM_SHIFT) * ism
465 + (((y & ISM_MASK) * ism) >> ISM_SHIFT);
466 }
467 return x;
468}
469
470/* Convert m (bps) into sm (bytes/psched us) */
471static u64
472m2sm(u32 m)
473{
474 u64 sm;
475
476 sm = ((u64)m << SM_SHIFT);
477 sm += PSCHED_JIFFIE2US(HZ) - 1;
478 do_div(sm, PSCHED_JIFFIE2US(HZ));
479 return sm;
480}
481
482/* convert m (bps) into ism (psched us/byte) */
483static u64
484m2ism(u32 m)
485{
486 u64 ism;
487
488 if (m == 0)
489 ism = HT_INFINITY;
490 else {
491 ism = ((u64)PSCHED_JIFFIE2US(HZ) << ISM_SHIFT);
492 ism += m - 1;
493 do_div(ism, m);
494 }
495 return ism;
496}
497
498/* convert d (us) into dx (psched us) */
499static u64
500d2dx(u32 d)
501{
502 u64 dx;
503
504 dx = ((u64)d * PSCHED_JIFFIE2US(HZ));
505 dx += 1000000 - 1;
506 do_div(dx, 1000000);
507 return dx;
508}
509
510/* convert sm (bytes/psched us) into m (bps) */
511static u32
512sm2m(u64 sm)
513{
514 u64 m;
515
516 m = (sm * PSCHED_JIFFIE2US(HZ)) >> SM_SHIFT;
517 return (u32)m;
518}
519
520/* convert dx (psched us) into d (us) */
521static u32
522dx2d(u64 dx)
523{
524 u64 d;
525
526 d = dx * 1000000;
527 do_div(d, PSCHED_JIFFIE2US(HZ));
528 return (u32)d;
529}
530
531static void
532sc2isc(struct tc_service_curve *sc, struct internal_sc *isc)
533{
534 isc->sm1 = m2sm(sc->m1);
535 isc->ism1 = m2ism(sc->m1);
536 isc->dx = d2dx(sc->d);
537 isc->dy = seg_x2y(isc->dx, isc->sm1);
538 isc->sm2 = m2sm(sc->m2);
539 isc->ism2 = m2ism(sc->m2);
540}
541
542/*
543 * initialize the runtime service curve with the given internal
544 * service curve starting at (x, y).
545 */
546static void
547rtsc_init(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y)
548{
549 rtsc->x = x;
550 rtsc->y = y;
551 rtsc->sm1 = isc->sm1;
552 rtsc->ism1 = isc->ism1;
553 rtsc->dx = isc->dx;
554 rtsc->dy = isc->dy;
555 rtsc->sm2 = isc->sm2;
556 rtsc->ism2 = isc->ism2;
557}
558
559/*
560 * calculate the x-projection (i.e. the time) of the runtime service
561 * curve for the given y-projection value
562 */
563static u64
564rtsc_y2x(struct runtime_sc *rtsc, u64 y)
565{
566 u64 x;
567
568 if (y < rtsc->y)
569 x = rtsc->x;
570 else if (y <= rtsc->y + rtsc->dy) {
571 /* x belongs to the 1st segment */
572 if (rtsc->dy == 0)
573 x = rtsc->x + rtsc->dx;
574 else
575 x = rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1);
576 } else {
577 /* x belongs to the 2nd segment */
578 x = rtsc->x + rtsc->dx
579 + seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2);
580 }
581 return x;
582}
583
584static u64
585rtsc_x2y(struct runtime_sc *rtsc, u64 x)
586{
587 u64 y;
588
589 if (x <= rtsc->x)
590 y = rtsc->y;
591 else if (x <= rtsc->x + rtsc->dx)
592 /* y belongs to the 1st segment */
593 y = rtsc->y + seg_x2y(x - rtsc->x, rtsc->sm1);
594 else
595 /* y belongs to the 2nd segment */
596 y = rtsc->y + rtsc->dy
597 + seg_x2y(x - rtsc->x - rtsc->dx, rtsc->sm2);
598 return y;
599}
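/*
 * Illustrative example (made-up values in the internal units described
 * above): take a runtime curve anchored at (x, y) = (0, 0) with
 * sm1 = 2 << SM_SHIFT, dx = 100, dy = seg_x2y(dx, sm1) = 200,
 * sm2 = 1 << SM_SHIFT and ism2 = 1 << ISM_SHIFT. Then
 *	rtsc_x2y(rtsc, 150) = 200 + seg_x2y(50, sm2)  = 250
 *	rtsc_y2x(rtsc, 250) = 100 + seg_y2x(50, ism2) = 150
 * i.e. on a given curve the two functions invert each other (up to the
 * flat spot of a zero-slope first segment).
 */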
600
601/*
602 * update the runtime service curve by taking the minimum of the current
603 * runtime service curve and the service curve starting at (x, y).
604 */
605static void
606rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y)
607{
608 u64 y1, y2, dx, dy;
609 u32 dsm;
610
611 if (isc->sm1 <= isc->sm2) {
612 /* service curve is convex */
613 y1 = rtsc_x2y(rtsc, x);
614 if (y1 < y)
615 /* the current rtsc is smaller */
616 return;
617 rtsc->x = x;
618 rtsc->y = y;
619 return;
620 }
621
622 /*
623 * service curve is concave
624 * compute the two y values of the current rtsc
625 * y1: at x
626 * y2: at (x + dx)
627 */
628 y1 = rtsc_x2y(rtsc, x);
629 if (y1 <= y) {
630 /* rtsc is below isc, no change to rtsc */
631 return;
632 }
633
634 y2 = rtsc_x2y(rtsc, x + isc->dx);
635 if (y2 >= y + isc->dy) {
636 /* rtsc is above isc, replace rtsc by isc */
637 rtsc->x = x;
638 rtsc->y = y;
639 rtsc->dx = isc->dx;
640 rtsc->dy = isc->dy;
641 return;
642 }
643
644 /*
645 * the two curves intersect
646 * compute the offsets (dx, dy) using the reverse
647 * function of seg_x2y()
648 * seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y)
649 */
650 dx = (y1 - y) << SM_SHIFT;
651 dsm = isc->sm1 - isc->sm2;
652 do_div(dx, dsm);
653 /*
654 * check if (x, y1) belongs to the 1st segment of rtsc.
655 * if so, add the offset.
656 */
657 if (rtsc->x + rtsc->dx > x)
658 dx += rtsc->x + rtsc->dx - x;
659 dy = seg_x2y(dx, isc->sm1);
660
661 rtsc->x = x;
662 rtsc->y = y;
663 rtsc->dx = dx;
664 rtsc->dy = dy;
665 return;
666}
667
668static void
669init_ed(struct hfsc_class *cl, unsigned int next_len)
670{
671 u64 cur_time;
672
673 PSCHED_GET_TIME(cur_time);
674
675 /* update the deadline curve */
676 rtsc_min(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul);
677
678 /*
679 * update the eligible curve.
680 * for concave, it is equal to the deadline curve.
681 * for convex, it is a linear curve with slope m2.
682 */
683 cl->cl_eligible = cl->cl_deadline;
684 if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) {
685 cl->cl_eligible.dx = 0;
686 cl->cl_eligible.dy = 0;
687 }
688
689 /* compute e and d */
690 cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul);
691 cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
692
693 eltree_insert(cl);
694}
695
696static void
697update_ed(struct hfsc_class *cl, unsigned int next_len)
698{
699 cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul);
700 cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
701
702 eltree_update(cl);
703}
704
705static inline void
706update_d(struct hfsc_class *cl, unsigned int next_len)
707{
708 cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
709}
710
711static inline void
712update_cfmin(struct hfsc_class *cl)
713{
714 struct rb_node *n = rb_first(&cl->cf_tree);
715 struct hfsc_class *p;
716
717 if (n == NULL) {
718 cl->cl_cfmin = 0;
719 return;
720 }
721 p = rb_entry(n, struct hfsc_class, cf_node);
722 cl->cl_cfmin = p->cl_f;
723}
724
725static void
726init_vf(struct hfsc_class *cl, unsigned int len)
727{
728 struct hfsc_class *max_cl;
729 struct rb_node *n;
730 u64 vt, f, cur_time;
731 int go_active;
732
733 cur_time = 0;
734 go_active = 1;
735 for (; cl->cl_parent != NULL; cl = cl->cl_parent) {
736 if (go_active && cl->cl_nactive++ == 0)
737 go_active = 1;
738 else
739 go_active = 0;
740
741 if (go_active) {
742 n = rb_last(&cl->cl_parent->vt_tree);
743 if (n != NULL) {
744 max_cl = rb_entry(n, struct hfsc_class,vt_node);
745 /*
746 * set vt to the average of the min and max
747 * classes. if the parent's period didn't
748 * change, don't decrease vt of the class.
749 */
750 vt = max_cl->cl_vt;
751 if (cl->cl_parent->cl_cvtmin != 0)
752 vt = (cl->cl_parent->cl_cvtmin + vt)/2;
753
754 if (cl->cl_parent->cl_vtperiod !=
755 cl->cl_parentperiod || vt > cl->cl_vt)
756 cl->cl_vt = vt;
757 } else {
758 /*
759 * first child for a new parent backlog period.
760 * add parent's cvtmax to cvtoff to make a new
761 * vt (vtoff + vt) larger than the vt in the
762 * last period for all children.
763 */
764 vt = cl->cl_parent->cl_cvtmax;
765 cl->cl_parent->cl_cvtoff += vt;
766 cl->cl_parent->cl_cvtmax = 0;
767 cl->cl_parent->cl_cvtmin = 0;
768 cl->cl_vt = 0;
769 }
770
771 cl->cl_vtoff = cl->cl_parent->cl_cvtoff -
772 cl->cl_pcvtoff;
773
774 /* update the virtual curve */
775 vt = cl->cl_vt + cl->cl_vtoff;
776 rtsc_min(&cl->cl_virtual, &cl->cl_fsc, vt,
777 cl->cl_total);
778 if (cl->cl_virtual.x == vt) {
779 cl->cl_virtual.x -= cl->cl_vtoff;
780 cl->cl_vtoff = 0;
781 }
782 cl->cl_vtadj = 0;
783
784 cl->cl_vtperiod++; /* increment vt period */
785 cl->cl_parentperiod = cl->cl_parent->cl_vtperiod;
786 if (cl->cl_parent->cl_nactive == 0)
787 cl->cl_parentperiod++;
788 cl->cl_f = 0;
789
790 vttree_insert(cl);
791 cftree_insert(cl);
792
793 if (cl->cl_flags & HFSC_USC) {
794 /* class has upper limit curve */
795 if (cur_time == 0)
796 PSCHED_GET_TIME(cur_time);
797
798 /* update the ulimit curve */
799 rtsc_min(&cl->cl_ulimit, &cl->cl_usc, cur_time,
800 cl->cl_total);
801 /* compute myf */
802 cl->cl_myf = rtsc_y2x(&cl->cl_ulimit,
803 cl->cl_total);
804 cl->cl_myfadj = 0;
805 }
806 }
807
808 f = max(cl->cl_myf, cl->cl_cfmin);
809 if (f != cl->cl_f) {
810 cl->cl_f = f;
811 cftree_update(cl);
812 update_cfmin(cl->cl_parent);
813 }
814 }
815}
816
817static void
818update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
819{
820 u64 f; /* , myf_bound, delta; */
821 int go_passive = 0;
822
823 if (cl->qdisc->q.qlen == 0 && cl->cl_flags & HFSC_FSC)
824 go_passive = 1;
825
826 for (; cl->cl_parent != NULL; cl = cl->cl_parent) {
827 cl->cl_total += len;
828
829 if (!(cl->cl_flags & HFSC_FSC) || cl->cl_nactive == 0)
830 continue;
831
832 if (go_passive && --cl->cl_nactive == 0)
833 go_passive = 1;
834 else
835 go_passive = 0;
836
837 if (go_passive) {
838 /* no more active child, going passive */
839
840 /* update cvtmax of the parent class */
841 if (cl->cl_vt > cl->cl_parent->cl_cvtmax)
842 cl->cl_parent->cl_cvtmax = cl->cl_vt;
843
844 /* remove this class from the vt tree */
845 vttree_remove(cl);
846
847 cftree_remove(cl);
848 update_cfmin(cl->cl_parent);
849
850 continue;
851 }
852
853 /*
854 * update vt and f
855 */
856 cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total)
857 - cl->cl_vtoff + cl->cl_vtadj;
858
859 /*
860 * if vt of the class is smaller than cvtmin,
861 * the class was skipped in the past due to non-fit.
862 * if so, we need to adjust vtadj.
863 */
864 if (cl->cl_vt < cl->cl_parent->cl_cvtmin) {
865 cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt;
866 cl->cl_vt = cl->cl_parent->cl_cvtmin;
867 }
868
869 /* update the vt tree */
870 vttree_update(cl);
871
872 if (cl->cl_flags & HFSC_USC) {
873 cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit,
874 cl->cl_total);
875#if 0
876 /*
877 * This code causes classes to stay way under their
878 * limit when multiple classes are used at gigabit
879 * speed. needs investigation. -kaber
880 */
881 /*
882 * if myf lags behind by more than one clock tick
883 * from the current time, adjust myfadj to prevent
884 * a rate-limited class from going greedy.
885 * in a steady state under rate-limiting, myf
886 * fluctuates within one clock tick.
887 */
888 myf_bound = cur_time - PSCHED_JIFFIE2US(1);
889 if (cl->cl_myf < myf_bound) {
890 delta = cur_time - cl->cl_myf;
891 cl->cl_myfadj += delta;
892 cl->cl_myf += delta;
893 }
894#endif
895 }
896
897 f = max(cl->cl_myf, cl->cl_cfmin);
898 if (f != cl->cl_f) {
899 cl->cl_f = f;
900 cftree_update(cl);
901 update_cfmin(cl->cl_parent);
902 }
903 }
904}
905
906static void
907set_active(struct hfsc_class *cl, unsigned int len)
908{
909 if (cl->cl_flags & HFSC_RSC)
910 init_ed(cl, len);
911 if (cl->cl_flags & HFSC_FSC)
912 init_vf(cl, len);
913
914 list_add_tail(&cl->dlist, &cl->sched->droplist);
915}
916
917static void
918set_passive(struct hfsc_class *cl)
919{
920 if (cl->cl_flags & HFSC_RSC)
921 eltree_remove(cl);
922
923 list_del(&cl->dlist);
924
925 /*
926 * vttree is now handled in update_vf() so that update_vf(cl, 0, 0)
927 * needs to be called explicitly to remove a class from vttree.
928 */
929}
930
931/*
932 * hack to get the length of the first packet in the queue.
933 */
934static unsigned int
935qdisc_peek_len(struct Qdisc *sch)
936{
937 struct sk_buff *skb;
938 unsigned int len;
939
940 skb = sch->dequeue(sch);
941 if (skb == NULL) {
942 if (net_ratelimit())
943 printk("qdisc_peek_len: non work-conserving qdisc ?\n");
944 return 0;
945 }
946 len = skb->len;
947 if (unlikely(sch->ops->requeue(skb, sch) != NET_XMIT_SUCCESS)) {
948 if (net_ratelimit())
949 printk("qdisc_peek_len: failed to requeue\n");
950 return 0;
951 }
952 return len;
953}
954
955static void
956hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl)
957{
958 unsigned int len = cl->qdisc->q.qlen;
959
960 qdisc_reset(cl->qdisc);
961 if (len > 0) {
962 update_vf(cl, 0, 0);
963 set_passive(cl);
964 sch->q.qlen -= len;
965 }
966}
967
968static void
969hfsc_adjust_levels(struct hfsc_class *cl)
970{
971 struct hfsc_class *p;
972 unsigned int level;
973
974 do {
975 level = 0;
976 list_for_each_entry(p, &cl->children, siblings) {
977 if (p->level > level)
978 level = p->level;
979 }
980 cl->level = level + 1;
981 } while ((cl = cl->cl_parent) != NULL);
982}
983
984static inline unsigned int
985hfsc_hash(u32 h)
986{
987 h ^= h >> 8;
988 h ^= h >> 4;
989
990 return h & (HFSC_HSIZE - 1);
991}
992
993static inline struct hfsc_class *
994hfsc_find_class(u32 classid, struct Qdisc *sch)
995{
996 struct hfsc_sched *q = qdisc_priv(sch);
997 struct hfsc_class *cl;
998
999 list_for_each_entry(cl, &q->clhash[hfsc_hash(classid)], hlist) {
1000 if (cl->classid == classid)
1001 return cl;
1002 }
1003 return NULL;
1004}
1005
1006static void
1007hfsc_change_rsc(struct hfsc_class *cl, struct tc_service_curve *rsc,
1008 u64 cur_time)
1009{
1010 sc2isc(rsc, &cl->cl_rsc);
1011 rtsc_init(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul);
1012 cl->cl_eligible = cl->cl_deadline;
1013 if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) {
1014 cl->cl_eligible.dx = 0;
1015 cl->cl_eligible.dy = 0;
1016 }
1017 cl->cl_flags |= HFSC_RSC;
1018}
1019
1020static void
1021hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc)
1022{
1023 sc2isc(fsc, &cl->cl_fsc);
1024 rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total);
1025 cl->cl_flags |= HFSC_FSC;
1026}
1027
1028static void
1029hfsc_change_usc(struct hfsc_class *cl, struct tc_service_curve *usc,
1030 u64 cur_time)
1031{
1032 sc2isc(usc, &cl->cl_usc);
1033 rtsc_init(&cl->cl_ulimit, &cl->cl_usc, cur_time, cl->cl_total);
1034 cl->cl_flags |= HFSC_USC;
1035}
1036
1037static int
1038hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
1039 struct rtattr **tca, unsigned long *arg)
1040{
1041 struct hfsc_sched *q = qdisc_priv(sch);
1042 struct hfsc_class *cl = (struct hfsc_class *)*arg;
1043 struct hfsc_class *parent = NULL;
1044 struct rtattr *opt = tca[TCA_OPTIONS-1];
1045 struct rtattr *tb[TCA_HFSC_MAX];
1046 struct tc_service_curve *rsc = NULL, *fsc = NULL, *usc = NULL;
1047 u64 cur_time;
1048
1049 if (opt == NULL || rtattr_parse_nested(tb, TCA_HFSC_MAX, opt))
1050 return -EINVAL;
1051
1052 if (tb[TCA_HFSC_RSC-1]) {
1053 if (RTA_PAYLOAD(tb[TCA_HFSC_RSC-1]) < sizeof(*rsc))
1054 return -EINVAL;
1055 rsc = RTA_DATA(tb[TCA_HFSC_RSC-1]);
1056 if (rsc->m1 == 0 && rsc->m2 == 0)
1057 rsc = NULL;
1058 }
1059
1060 if (tb[TCA_HFSC_FSC-1]) {
1061 if (RTA_PAYLOAD(tb[TCA_HFSC_FSC-1]) < sizeof(*fsc))
1062 return -EINVAL;
1063 fsc = RTA_DATA(tb[TCA_HFSC_FSC-1]);
1064 if (fsc->m1 == 0 && fsc->m2 == 0)
1065 fsc = NULL;
1066 }
1067
1068 if (tb[TCA_HFSC_USC-1]) {
1069 if (RTA_PAYLOAD(tb[TCA_HFSC_USC-1]) < sizeof(*usc))
1070 return -EINVAL;
1071 usc = RTA_DATA(tb[TCA_HFSC_USC-1]);
1072 if (usc->m1 == 0 && usc->m2 == 0)
1073 usc = NULL;
1074 }
1075
1076 if (cl != NULL) {
1077 if (parentid) {
1078 if (cl->cl_parent && cl->cl_parent->classid != parentid)
1079 return -EINVAL;
1080 if (cl->cl_parent == NULL && parentid != TC_H_ROOT)
1081 return -EINVAL;
1082 }
1083 PSCHED_GET_TIME(cur_time);
1084
1085 sch_tree_lock(sch);
1086 if (rsc != NULL)
1087 hfsc_change_rsc(cl, rsc, cur_time);
1088 if (fsc != NULL)
1089 hfsc_change_fsc(cl, fsc);
1090 if (usc != NULL)
1091 hfsc_change_usc(cl, usc, cur_time);
1092
1093 if (cl->qdisc->q.qlen != 0) {
1094 if (cl->cl_flags & HFSC_RSC)
1095 update_ed(cl, qdisc_peek_len(cl->qdisc));
1096 if (cl->cl_flags & HFSC_FSC)
1097 update_vf(cl, 0, cur_time);
1098 }
1099 sch_tree_unlock(sch);
1100
1101#ifdef CONFIG_NET_ESTIMATOR
1102 if (tca[TCA_RATE-1])
1103 gen_replace_estimator(&cl->bstats, &cl->rate_est,
1104 cl->stats_lock, tca[TCA_RATE-1]);
1105#endif
1106 return 0;
1107 }
1108
1109 if (parentid == TC_H_ROOT)
1110 return -EEXIST;
1111
1112 parent = &q->root;
1113 if (parentid) {
1114 parent = hfsc_find_class(parentid, sch);
1115 if (parent == NULL)
1116 return -ENOENT;
1117 }
1118
1119 if (classid == 0 || TC_H_MAJ(classid ^ sch->handle) != 0)
1120 return -EINVAL;
1121 if (hfsc_find_class(classid, sch))
1122 return -EEXIST;
1123
1124 if (rsc == NULL && fsc == NULL)
1125 return -EINVAL;
1126
1127 cl = kmalloc(sizeof(struct hfsc_class), GFP_KERNEL);
1128 if (cl == NULL)
1129 return -ENOBUFS;
1130 memset(cl, 0, sizeof(struct hfsc_class));
1131
1132 if (rsc != NULL)
1133 hfsc_change_rsc(cl, rsc, 0);
1134 if (fsc != NULL)
1135 hfsc_change_fsc(cl, fsc);
1136 if (usc != NULL)
1137 hfsc_change_usc(cl, usc, 0);
1138
1139 cl->refcnt = 1;
1140 cl->classid = classid;
1141 cl->sched = q;
1142 cl->cl_parent = parent;
1143 cl->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
1144 if (cl->qdisc == NULL)
1145 cl->qdisc = &noop_qdisc;
1146 cl->stats_lock = &sch->dev->queue_lock;
1147 INIT_LIST_HEAD(&cl->children);
1148 cl->vt_tree = RB_ROOT;
1149 cl->cf_tree = RB_ROOT;
1150
1151 sch_tree_lock(sch);
1152 list_add_tail(&cl->hlist, &q->clhash[hfsc_hash(classid)]);
1153 list_add_tail(&cl->siblings, &parent->children);
1154 if (parent->level == 0)
1155 hfsc_purge_queue(sch, parent);
1156 hfsc_adjust_levels(parent);
1157 cl->cl_pcvtoff = parent->cl_cvtoff;
1158 sch_tree_unlock(sch);
1159
1160#ifdef CONFIG_NET_ESTIMATOR
1161 if (tca[TCA_RATE-1])
1162 gen_new_estimator(&cl->bstats, &cl->rate_est,
1163 cl->stats_lock, tca[TCA_RATE-1]);
1164#endif
1165 *arg = (unsigned long)cl;
1166 return 0;
1167}
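/*
 * For orientation: the three netlink attributes parsed above correspond
 * to the service curve keywords of the iproute2 tc hfsc support, roughly
 *	TCA_HFSC_RSC  -  "rt" (real-time service curve)
 *	TCA_HFSC_FSC  -  "ls" (link-sharing service curve)
 *	TCA_HFSC_USC  -  "ul" (upper-limit service curve)
 * so a command along the lines of
 *	tc class add dev eth0 parent 1: classid 1:10 hfsc \
 *		ls m1 0 d 50ms m2 1mbit ul m2 2mbit
 * is expected to arrive here with fsc and usc set and rsc NULL. The exact
 * userspace syntax depends on the tc version and is only sketched here.
 */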
1168
1169static void
1170hfsc_destroy_filters(struct tcf_proto **fl)
1171{
1172 struct tcf_proto *tp;
1173
1174 while ((tp = *fl) != NULL) {
1175 *fl = tp->next;
1176 tcf_destroy(tp);
1177 }
1178}
1179
1180static void
1181hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
1182{
1183 struct hfsc_sched *q = qdisc_priv(sch);
1184
1185 hfsc_destroy_filters(&cl->filter_list);
1186 qdisc_destroy(cl->qdisc);
1187#ifdef CONFIG_NET_ESTIMATOR
1188 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1189#endif
1190 if (cl != &q->root)
1191 kfree(cl);
1192}
1193
1194static int
1195hfsc_delete_class(struct Qdisc *sch, unsigned long arg)
1196{
1197 struct hfsc_sched *q = qdisc_priv(sch);
1198 struct hfsc_class *cl = (struct hfsc_class *)arg;
1199
1200 if (cl->level > 0 || cl->filter_cnt > 0 || cl == &q->root)
1201 return -EBUSY;
1202
1203 sch_tree_lock(sch);
1204
1205 list_del(&cl->hlist);
1206 list_del(&cl->siblings);
1207 hfsc_adjust_levels(cl->cl_parent);
1208 hfsc_purge_queue(sch, cl);
1209 if (--cl->refcnt == 0)
1210 hfsc_destroy_class(sch, cl);
1211
1212 sch_tree_unlock(sch);
1213 return 0;
1214}
1215
1216static struct hfsc_class *
1217hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
1218{
1219 struct hfsc_sched *q = qdisc_priv(sch);
1220 struct hfsc_class *cl;
1221 struct tcf_result res;
1222 struct tcf_proto *tcf;
1223 int result;
1224
1225 if (TC_H_MAJ(skb->priority ^ sch->handle) == 0 &&
1226 (cl = hfsc_find_class(skb->priority, sch)) != NULL)
1227 if (cl->level == 0)
1228 return cl;
1229
1230 *qerr = NET_XMIT_DROP;
1231 tcf = q->root.filter_list;
1232 while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
1233#ifdef CONFIG_NET_CLS_ACT
1234 switch (result) {
1235 case TC_ACT_QUEUED:
1236 case TC_ACT_STOLEN:
1237 *qerr = NET_XMIT_SUCCESS;
1238 case TC_ACT_SHOT:
1239 return NULL;
1240 }
1241#elif defined(CONFIG_NET_CLS_POLICE)
1242 if (result == TC_POLICE_SHOT)
1243 return NULL;
1244#endif
1245 if ((cl = (struct hfsc_class *)res.class) == NULL) {
1246 if ((cl = hfsc_find_class(res.classid, sch)) == NULL)
1247 break; /* filter selected invalid classid */
1248 }
1249
1250 if (cl->level == 0)
1251 return cl; /* hit leaf class */
1252
1253 /* apply inner filter chain */
1254 tcf = cl->filter_list;
1255 }
1256
1257 /* classification failed, try default class */
1258 cl = hfsc_find_class(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
1259 if (cl == NULL || cl->level > 0)
1260 return NULL;
1261
1262 return cl;
1263}
1264
1265static int
1266hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1267 struct Qdisc **old)
1268{
1269 struct hfsc_class *cl = (struct hfsc_class *)arg;
1270
1271 if (cl == NULL)
1272 return -ENOENT;
1273 if (cl->level > 0)
1274 return -EINVAL;
1275 if (new == NULL) {
1276 new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
1277 if (new == NULL)
1278 new = &noop_qdisc;
1279 }
1280
1281 sch_tree_lock(sch);
1282 hfsc_purge_queue(sch, cl);
1283 *old = xchg(&cl->qdisc, new);
1284 sch_tree_unlock(sch);
1285 return 0;
1286}
1287
1288static struct Qdisc *
1289hfsc_class_leaf(struct Qdisc *sch, unsigned long arg)
1290{
1291 struct hfsc_class *cl = (struct hfsc_class *)arg;
1292
1293 if (cl != NULL && cl->level == 0)
1294 return cl->qdisc;
1295
1296 return NULL;
1297}
1298
1299static unsigned long
1300hfsc_get_class(struct Qdisc *sch, u32 classid)
1301{
1302 struct hfsc_class *cl = hfsc_find_class(classid, sch);
1303
1304 if (cl != NULL)
1305 cl->refcnt++;
1306
1307 return (unsigned long)cl;
1308}
1309
1310static void
1311hfsc_put_class(struct Qdisc *sch, unsigned long arg)
1312{
1313 struct hfsc_class *cl = (struct hfsc_class *)arg;
1314
1315 if (--cl->refcnt == 0)
1316 hfsc_destroy_class(sch, cl);
1317}
1318
1319static unsigned long
1320hfsc_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid)
1321{
1322 struct hfsc_class *p = (struct hfsc_class *)parent;
1323 struct hfsc_class *cl = hfsc_find_class(classid, sch);
1324
1325 if (cl != NULL) {
1326 if (p != NULL && p->level <= cl->level)
1327 return 0;
1328 cl->filter_cnt++;
1329 }
1330
1331 return (unsigned long)cl;
1332}
1333
1334static void
1335hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg)
1336{
1337 struct hfsc_class *cl = (struct hfsc_class *)arg;
1338
1339 cl->filter_cnt--;
1340}
1341
1342static struct tcf_proto **
1343hfsc_tcf_chain(struct Qdisc *sch, unsigned long arg)
1344{
1345 struct hfsc_sched *q = qdisc_priv(sch);
1346 struct hfsc_class *cl = (struct hfsc_class *)arg;
1347
1348 if (cl == NULL)
1349 cl = &q->root;
1350
1351 return &cl->filter_list;
1352}
1353
1354static int
1355hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc)
1356{
1357 struct tc_service_curve tsc;
1358
1359 tsc.m1 = sm2m(sc->sm1);
1360 tsc.d = dx2d(sc->dx);
1361 tsc.m2 = sm2m(sc->sm2);
1362 RTA_PUT(skb, attr, sizeof(tsc), &tsc);
1363
1364 return skb->len;
1365
1366 rtattr_failure:
1367 return -1;
1368}
1369
1370static inline int
1371hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl)
1372{
1373 if ((cl->cl_flags & HFSC_RSC) &&
1374 (hfsc_dump_sc(skb, TCA_HFSC_RSC, &cl->cl_rsc) < 0))
1375 goto rtattr_failure;
1376
1377 if ((cl->cl_flags & HFSC_FSC) &&
1378 (hfsc_dump_sc(skb, TCA_HFSC_FSC, &cl->cl_fsc) < 0))
1379 goto rtattr_failure;
1380
1381 if ((cl->cl_flags & HFSC_USC) &&
1382 (hfsc_dump_sc(skb, TCA_HFSC_USC, &cl->cl_usc) < 0))
1383 goto rtattr_failure;
1384
1385 return skb->len;
1386
1387 rtattr_failure:
1388 return -1;
1389}
1390
1391static int
1392hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb,
1393 struct tcmsg *tcm)
1394{
1395 struct hfsc_class *cl = (struct hfsc_class *)arg;
1396 unsigned char *b = skb->tail;
1397 struct rtattr *rta = (struct rtattr *)b;
1398
1399 tcm->tcm_parent = cl->cl_parent ? cl->cl_parent->classid : TC_H_ROOT;
1400 tcm->tcm_handle = cl->classid;
1401 if (cl->level == 0)
1402 tcm->tcm_info = cl->qdisc->handle;
1403
1404 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1405 if (hfsc_dump_curves(skb, cl) < 0)
1406 goto rtattr_failure;
1407 rta->rta_len = skb->tail - b;
1408 return skb->len;
1409
1410 rtattr_failure:
1411 skb_trim(skb, b - skb->data);
1412 return -1;
1413}
1414
1415static int
1416hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1417 struct gnet_dump *d)
1418{
1419 struct hfsc_class *cl = (struct hfsc_class *)arg;
1420 struct tc_hfsc_stats xstats;
1421
1422 cl->qstats.qlen = cl->qdisc->q.qlen;
1423 xstats.level = cl->level;
1424 xstats.period = cl->cl_vtperiod;
1425 xstats.work = cl->cl_total;
1426 xstats.rtwork = cl->cl_cumul;
1427
1428 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1429#ifdef CONFIG_NET_ESTIMATOR
1430 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1431#endif
1432 gnet_stats_copy_queue(d, &cl->qstats) < 0)
1433 return -1;
1434
1435 return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
1436}
1437
1438
1439
1440static void
1441hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1442{
1443 struct hfsc_sched *q = qdisc_priv(sch);
1444 struct hfsc_class *cl;
1445 unsigned int i;
1446
1447 if (arg->stop)
1448 return;
1449
1450 for (i = 0; i < HFSC_HSIZE; i++) {
1451 list_for_each_entry(cl, &q->clhash[i], hlist) {
1452 if (arg->count < arg->skip) {
1453 arg->count++;
1454 continue;
1455 }
1456 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1457 arg->stop = 1;
1458 return;
1459 }
1460 arg->count++;
1461 }
1462 }
1463}
1464
1465static void
1466hfsc_watchdog(unsigned long arg)
1467{
1468 struct Qdisc *sch = (struct Qdisc *)arg;
1469
1470 sch->flags &= ~TCQ_F_THROTTLED;
1471 netif_schedule(sch->dev);
1472}
1473
1474static void
1475hfsc_schedule_watchdog(struct Qdisc *sch, u64 cur_time)
1476{
1477 struct hfsc_sched *q = qdisc_priv(sch);
1478 struct hfsc_class *cl;
1479 u64 next_time = 0;
1480 long delay;
1481
1482 if ((cl = eltree_get_minel(q)) != NULL)
1483 next_time = cl->cl_e;
1484 if (q->root.cl_cfmin != 0) {
1485 if (next_time == 0 || next_time > q->root.cl_cfmin)
1486 next_time = q->root.cl_cfmin;
1487 }
1488 ASSERT(next_time != 0);
1489 delay = next_time - cur_time;
1490 delay = PSCHED_US2JIFFIE(delay);
1491
1492 sch->flags |= TCQ_F_THROTTLED;
1493 mod_timer(&q->wd_timer, jiffies + delay);
1494}
1495
1496static int
1497hfsc_init_qdisc(struct Qdisc *sch, struct rtattr *opt)
1498{
1499 struct hfsc_sched *q = qdisc_priv(sch);
1500 struct tc_hfsc_qopt *qopt;
1501 unsigned int i;
1502
1503 if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
1504 return -EINVAL;
1505 qopt = RTA_DATA(opt);
1506
1507 sch->stats_lock = &sch->dev->queue_lock;
1508
1509 q->defcls = qopt->defcls;
1510 for (i = 0; i < HFSC_HSIZE; i++)
1511 INIT_LIST_HEAD(&q->clhash[i]);
1512 q->eligible = RB_ROOT;
1513 INIT_LIST_HEAD(&q->droplist);
1514 skb_queue_head_init(&q->requeue);
1515
1516 q->root.refcnt = 1;
1517 q->root.classid = sch->handle;
1518 q->root.sched = q;
1519 q->root.qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
1520 if (q->root.qdisc == NULL)
1521 q->root.qdisc = &noop_qdisc;
1522 q->root.stats_lock = &sch->dev->queue_lock;
1523 INIT_LIST_HEAD(&q->root.children);
1524 q->root.vt_tree = RB_ROOT;
1525 q->root.cf_tree = RB_ROOT;
1526
1527 list_add(&q->root.hlist, &q->clhash[hfsc_hash(q->root.classid)]);
1528
1529 init_timer(&q->wd_timer);
1530 q->wd_timer.function = hfsc_watchdog;
1531 q->wd_timer.data = (unsigned long)sch;
1532
1533 return 0;
1534}
1535
1536static int
1537hfsc_change_qdisc(struct Qdisc *sch, struct rtattr *opt)
1538{
1539 struct hfsc_sched *q = qdisc_priv(sch);
1540 struct tc_hfsc_qopt *qopt;
1541
1542 if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
1543 return -EINVAL;
1544 qopt = RTA_DATA(opt);
1545
1546 sch_tree_lock(sch);
1547 q->defcls = qopt->defcls;
1548 sch_tree_unlock(sch);
1549
1550 return 0;
1551}
1552
1553static void
1554hfsc_reset_class(struct hfsc_class *cl)
1555{
1556 cl->cl_total = 0;
1557 cl->cl_cumul = 0;
1558 cl->cl_d = 0;
1559 cl->cl_e = 0;
1560 cl->cl_vt = 0;
1561 cl->cl_vtadj = 0;
1562 cl->cl_vtoff = 0;
1563 cl->cl_cvtmin = 0;
1564 cl->cl_cvtmax = 0;
1565 cl->cl_cvtoff = 0;
1566 cl->cl_pcvtoff = 0;
1567 cl->cl_vtperiod = 0;
1568 cl->cl_parentperiod = 0;
1569 cl->cl_f = 0;
1570 cl->cl_myf = 0;
1571 cl->cl_myfadj = 0;
1572 cl->cl_cfmin = 0;
1573 cl->cl_nactive = 0;
1574
1575 cl->vt_tree = RB_ROOT;
1576 cl->cf_tree = RB_ROOT;
1577 qdisc_reset(cl->qdisc);
1578
1579 if (cl->cl_flags & HFSC_RSC)
1580 rtsc_init(&cl->cl_deadline, &cl->cl_rsc, 0, 0);
1581 if (cl->cl_flags & HFSC_FSC)
1582 rtsc_init(&cl->cl_virtual, &cl->cl_fsc, 0, 0);
1583 if (cl->cl_flags & HFSC_USC)
1584 rtsc_init(&cl->cl_ulimit, &cl->cl_usc, 0, 0);
1585}
1586
1587static void
1588hfsc_reset_qdisc(struct Qdisc *sch)
1589{
1590 struct hfsc_sched *q = qdisc_priv(sch);
1591 struct hfsc_class *cl;
1592 unsigned int i;
1593
1594 for (i = 0; i < HFSC_HSIZE; i++) {
1595 list_for_each_entry(cl, &q->clhash[i], hlist)
1596 hfsc_reset_class(cl);
1597 }
1598 __skb_queue_purge(&q->requeue);
1599 q->eligible = RB_ROOT;
1600 INIT_LIST_HEAD(&q->droplist);
1601 del_timer(&q->wd_timer);
1602 sch->flags &= ~TCQ_F_THROTTLED;
1603 sch->q.qlen = 0;
1604}
1605
1606static void
1607hfsc_destroy_qdisc(struct Qdisc *sch)
1608{
1609 struct hfsc_sched *q = qdisc_priv(sch);
1610 struct hfsc_class *cl, *next;
1611 unsigned int i;
1612
1613 for (i = 0; i < HFSC_HSIZE; i++) {
1614 list_for_each_entry_safe(cl, next, &q->clhash[i], hlist)
1615 hfsc_destroy_class(sch, cl);
1616 }
1617 __skb_queue_purge(&q->requeue);
1618 del_timer(&q->wd_timer);
1619}
1620
1621static int
1622hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb)
1623{
1624 struct hfsc_sched *q = qdisc_priv(sch);
1625 unsigned char *b = skb->tail;
1626 struct tc_hfsc_qopt qopt;
1627
1628 qopt.defcls = q->defcls;
1629 RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
1630 return skb->len;
1631
1632 rtattr_failure:
1633 skb_trim(skb, b - skb->data);
1634 return -1;
1635}
1636
1637static int
1638hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
1639{
1640 struct hfsc_class *cl;
1641 unsigned int len;
1642 int err;
1643
1644 cl = hfsc_classify(skb, sch, &err);
1645 if (cl == NULL) {
1646 if (err == NET_XMIT_DROP)
1647 sch->qstats.drops++;
1648 kfree_skb(skb);
1649 return err;
1650 }
1651
1652 len = skb->len;
1653 err = cl->qdisc->enqueue(skb, cl->qdisc);
1654 if (unlikely(err != NET_XMIT_SUCCESS)) {
1655 cl->qstats.drops++;
1656 sch->qstats.drops++;
1657 return err;
1658 }
1659
1660 if (cl->qdisc->q.qlen == 1)
1661 set_active(cl, len);
1662
1663 cl->bstats.packets++;
1664 cl->bstats.bytes += len;
1665 sch->bstats.packets++;
1666 sch->bstats.bytes += len;
1667 sch->q.qlen++;
1668
1669 return NET_XMIT_SUCCESS;
1670}
1671
1672static struct sk_buff *
1673hfsc_dequeue(struct Qdisc *sch)
1674{
1675 struct hfsc_sched *q = qdisc_priv(sch);
1676 struct hfsc_class *cl;
1677 struct sk_buff *skb;
1678 u64 cur_time;
1679 unsigned int next_len;
1680 int realtime = 0;
1681
1682 if (sch->q.qlen == 0)
1683 return NULL;
1684 if ((skb = __skb_dequeue(&q->requeue)))
1685 goto out;
1686
1687 PSCHED_GET_TIME(cur_time);
1688
1689 /*
1690 * if there are eligible classes, use real-time criteria.
1691 * find the class with the minimum deadline among
1692 * the eligible classes.
1693 */
1694 if ((cl = eltree_get_mindl(q, cur_time)) != NULL) {
1695 realtime = 1;
1696 } else {
1697 /*
1698 * use link-sharing criteria
1699 * get the class with the minimum vt in the hierarchy
1700 */
1701 cl = vttree_get_minvt(&q->root, cur_time);
1702 if (cl == NULL) {
1703 sch->qstats.overlimits++;
1704 hfsc_schedule_watchdog(sch, cur_time);
1705 return NULL;
1706 }
1707 }
1708
1709 skb = cl->qdisc->dequeue(cl->qdisc);
1710 if (skb == NULL) {
1711 if (net_ratelimit())
1712 printk("HFSC: Non-work-conserving qdisc ?\n");
1713 return NULL;
1714 }
1715
1716 update_vf(cl, skb->len, cur_time);
1717 if (realtime)
1718 cl->cl_cumul += skb->len;
1719
1720 if (cl->qdisc->q.qlen != 0) {
1721 if (cl->cl_flags & HFSC_RSC) {
1722 /* update ed */
1723 next_len = qdisc_peek_len(cl->qdisc);
1724 if (realtime)
1725 update_ed(cl, next_len);
1726 else
1727 update_d(cl, next_len);
1728 }
1729 } else {
1730 /* the class becomes passive */
1731 set_passive(cl);
1732 }
1733
1734 out:
1735 sch->flags &= ~TCQ_F_THROTTLED;
1736 sch->q.qlen--;
1737
1738 return skb;
1739}
1740
1741static int
1742hfsc_requeue(struct sk_buff *skb, struct Qdisc *sch)
1743{
1744 struct hfsc_sched *q = qdisc_priv(sch);
1745
1746 __skb_queue_head(&q->requeue, skb);
1747 sch->q.qlen++;
1748 sch->qstats.requeues++;
1749 return NET_XMIT_SUCCESS;
1750}
1751
1752static unsigned int
1753hfsc_drop(struct Qdisc *sch)
1754{
1755 struct hfsc_sched *q = qdisc_priv(sch);
1756 struct hfsc_class *cl;
1757 unsigned int len;
1758
1759 list_for_each_entry(cl, &q->droplist, dlist) {
1760 if (cl->qdisc->ops->drop != NULL &&
1761 (len = cl->qdisc->ops->drop(cl->qdisc)) > 0) {
1762 if (cl->qdisc->q.qlen == 0) {
1763 update_vf(cl, 0, 0);
1764 set_passive(cl);
1765 } else {
1766 list_move_tail(&cl->dlist, &q->droplist);
1767 }
1768 cl->qstats.drops++;
1769 sch->qstats.drops++;
1770 sch->q.qlen--;
1771 return len;
1772 }
1773 }
1774 return 0;
1775}
1776
1777static struct Qdisc_class_ops hfsc_class_ops = {
1778 .change = hfsc_change_class,
1779 .delete = hfsc_delete_class,
1780 .graft = hfsc_graft_class,
1781 .leaf = hfsc_class_leaf,
1782 .get = hfsc_get_class,
1783 .put = hfsc_put_class,
1784 .bind_tcf = hfsc_bind_tcf,
1785 .unbind_tcf = hfsc_unbind_tcf,
1786 .tcf_chain = hfsc_tcf_chain,
1787 .dump = hfsc_dump_class,
1788 .dump_stats = hfsc_dump_class_stats,
1789 .walk = hfsc_walk
1790};
1791
1792static struct Qdisc_ops hfsc_qdisc_ops = {
1793 .id = "hfsc",
1794 .init = hfsc_init_qdisc,
1795 .change = hfsc_change_qdisc,
1796 .reset = hfsc_reset_qdisc,
1797 .destroy = hfsc_destroy_qdisc,
1798 .dump = hfsc_dump_qdisc,
1799 .enqueue = hfsc_enqueue,
1800 .dequeue = hfsc_dequeue,
1801 .requeue = hfsc_requeue,
1802 .drop = hfsc_drop,
1803 .cl_ops = &hfsc_class_ops,
1804 .priv_size = sizeof(struct hfsc_sched),
1805 .owner = THIS_MODULE
1806};
1807
1808static int __init
1809hfsc_init(void)
1810{
1811 return register_qdisc(&hfsc_qdisc_ops);
1812}
1813
1814static void __exit
1815hfsc_cleanup(void)
1816{
1817 unregister_qdisc(&hfsc_qdisc_ops);
1818}
1819
1820MODULE_LICENSE("GPL");
1821module_init(hfsc_init);
1822module_exit(hfsc_cleanup);
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
new file mode 100644
index 000000000000..a85935e7d53d
--- /dev/null
+++ b/net/sched/sch_htb.c
@@ -0,0 +1,1759 @@
1/* vim: ts=8 sw=8
2 * net/sched/sch_htb.c Hierarchical token bucket, feed tree version
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Martin Devera, <devik@cdi.cz>
10 *
11 * Credits (in time order) for older HTB versions:
12 * Stef Coene <stef.coene@docum.org>
13 * HTB support at LARTC mailing list
14 * Ondrej Kraus, <krauso@barr.cz>
15 * found missing INIT_QDISC(htb)
16 * Vladimir Smelhaus, Aamer Akhter, Bert Hubert
17 * helped a lot to locate nasty class stall bug
18 * Andi Kleen, Jamal Hadi, Bert Hubert
19 * code review and helpful comments on shaping
20 * Tomasz Wrona, <tw@eter.tym.pl>
21 * created test case so that I was able to fix nasty bug
22 * Wilfried Weissmann
23 * spotted bug in dequeue code and helped with fix
24 * Jiri Fojtasek
25 * fixed requeue routine
26 * and many others. thanks.
27 *
28 * $Id: sch_htb.c,v 1.25 2003/12/07 11:08:25 devik Exp devik $
29 */
30#include <linux/config.h>
31#include <linux/module.h>
32#include <asm/uaccess.h>
33#include <asm/system.h>
34#include <linux/bitops.h>
35#include <linux/types.h>
36#include <linux/kernel.h>
37#include <linux/sched.h>
38#include <linux/string.h>
39#include <linux/mm.h>
40#include <linux/socket.h>
41#include <linux/sockios.h>
42#include <linux/in.h>
43#include <linux/errno.h>
44#include <linux/interrupt.h>
45#include <linux/if_ether.h>
46#include <linux/inet.h>
47#include <linux/netdevice.h>
48#include <linux/etherdevice.h>
49#include <linux/notifier.h>
50#include <net/ip.h>
51#include <net/route.h>
52#include <linux/skbuff.h>
53#include <linux/list.h>
54#include <linux/compiler.h>
55#include <net/sock.h>
56#include <net/pkt_sched.h>
57#include <linux/rbtree.h>
58
59/* HTB algorithm.
60 Author: devik@cdi.cz
61 ========================================================================
62 HTB is like TBF with multiple classes. It is also similar to CBQ because
63 it allows assigning a priority to each class in the hierarchy.
64 In fact it is another implementation of Floyd's formal sharing.
65
66 Levels:
67 Each class is assigned a level. Leaves ALWAYS have level 0 and root
68 classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
69 one less than their parent.
70*/
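/* An illustrative reading of the level rule above (sketch only):
 *
 *	root class		level TC_HTB_MAXDEPTH-1
 *	  inner class		one level less than its parent
 *	    leaf class		level 0, always
 *
 * Only leaves carry a packet queue; inner classes exist to lend and
 * borrow bandwidth between their children.
 */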
71
72#define HTB_HSIZE 16 /* classid hash size */
73#define HTB_EWMAC 2 /* rate average over HTB_EWMAC*HTB_HSIZE sec */
74#undef HTB_DEBUG /* compile debugging support (activated by tc tool) */
75#define HTB_RATECM 1 /* whether to use rate computer */
76#define HTB_HYSTERESIS 1 /* whether to use mode hysteresis for speedup */
77#define HTB_QLOCK(S) spin_lock_bh(&(S)->dev->queue_lock)
78#define HTB_QUNLOCK(S) spin_unlock_bh(&(S)->dev->queue_lock)
79#define HTB_VER 0x30011 /* major must be matched with number supplied by TC as version */
80
81#if HTB_VER >> 16 != TC_HTB_PROTOVER
82#error "Mismatched sch_htb.c and pkt_sch.h"
83#endif
84
85/* debugging support; S is subsystem, these are defined:
86 0 - netlink messages
87 1 - enqueue
88 2 - drop & requeue
89 3 - dequeue main
90 4 - dequeue one prio DRR part
91 5 - dequeue class accounting
92 6 - class overlimit status computation
93 7 - hint tree
94 8 - event queue
95 10 - rate estimator
96 11 - classifier
97 12 - fast dequeue cache
98
99 L is level; 0 = none, 1 = basic info, 2 = detailed, 3 = full
100 the q->debug uint32 contains 16 2-bit fields, one per subsystem, starting
101 from the LSB
102 */
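/*
 * Example of the bit layout (illustrative; q->debug is normally supplied
 * by the tc tool as noted above): to get full (level 3) traces from
 * subsystem 3 (dequeue main) and basic (level 1) traces from subsystem 8
 * (event queue), q->debug would be
 *	(3 << (2*3)) | (1 << (2*8)) == 0x100c0
 * making HTB_DBG_COND(3,L) true for every L and HTB_DBG_COND(8,1) true.
 */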
103#ifdef HTB_DEBUG
104#define HTB_DBG_COND(S,L) (((q->debug>>(2*S))&3) >= L)
105#define HTB_DBG(S,L,FMT,ARG...) if (HTB_DBG_COND(S,L)) \
106 printk(KERN_DEBUG FMT,##ARG)
107#define HTB_CHCL(cl) BUG_TRAP((cl)->magic == HTB_CMAGIC)
108#define HTB_PASSQ q,
109#define HTB_ARGQ struct htb_sched *q,
110#define static
111#undef __inline__
112#define __inline__
113#undef inline
114#define inline
115#define HTB_CMAGIC 0xFEFAFEF1
116#define htb_safe_rb_erase(N,R) do { BUG_TRAP((N)->rb_color != -1); \
117 if ((N)->rb_color == -1) break; \
118 rb_erase(N,R); \
119 (N)->rb_color = -1; } while (0)
120#else
121#define HTB_DBG_COND(S,L) (0)
122#define HTB_DBG(S,L,FMT,ARG...)
123#define HTB_PASSQ
124#define HTB_ARGQ
125#define HTB_CHCL(cl)
126#define htb_safe_rb_erase(N,R) rb_erase(N,R)
127#endif
128
129
130/* used internally to keep the status of a single class */
131enum htb_cmode {
132 HTB_CANT_SEND, /* class can't send and can't borrow */
133 HTB_MAY_BORROW, /* class can't send but may borrow */
134 HTB_CAN_SEND /* class can send */
135};
136
137/* interior & leaf nodes; props specific to leaves are marked L: */
138struct htb_class
139{
140#ifdef HTB_DEBUG
141 unsigned magic;
142#endif
143 /* general class parameters */
144 u32 classid;
145 struct gnet_stats_basic bstats;
146 struct gnet_stats_queue qstats;
147 struct gnet_stats_rate_est rate_est;
148 struct tc_htb_xstats xstats;/* our special stats */
149 int refcnt; /* usage count of this class */
150
151#ifdef HTB_RATECM
152 /* rate measurement counters */
153 unsigned long rate_bytes,sum_bytes;
154 unsigned long rate_packets,sum_packets;
155#endif
156
157 /* topology */
158 int level; /* our level (see above) */
159 struct htb_class *parent; /* parent class */
160 struct list_head hlist; /* classid hash list item */
161 struct list_head sibling; /* sibling list item */
162 struct list_head children; /* children list */
163
164 union {
165 struct htb_class_leaf {
166 struct Qdisc *q;
167 int prio;
168 int aprio;
169 int quantum;
170 int deficit[TC_HTB_MAXDEPTH];
171 struct list_head drop_list;
172 } leaf;
173 struct htb_class_inner {
174 struct rb_root feed[TC_HTB_NUMPRIO]; /* feed trees */
175 struct rb_node *ptr[TC_HTB_NUMPRIO]; /* current class ptr */
176 /* When a class changes from state 1->2 and disconnects from its
177 parent's feed, we lose the ptr value and start from the
178 first child again. Here we store the classid of the
179 last valid ptr (used when ptr is NULL). */
180 u32 last_ptr_id[TC_HTB_NUMPRIO];
181 } inner;
182 } un;
183 struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */
184 struct rb_node pq_node; /* node for event queue */
185 unsigned long pq_key; /* the same type as jiffies global */
186
187 int prio_activity; /* for which prios are we active */
188 enum htb_cmode cmode; /* current mode of the class */
189
190 /* class attached filters */
191 struct tcf_proto *filter_list;
192 int filter_cnt;
193
194 int warned; /* only one warning about non work conserving .. */
195
196 /* token bucket parameters */
197 struct qdisc_rate_table *rate; /* rate table of the class itself */
198 struct qdisc_rate_table *ceil; /* ceiling rate (limits borrows too) */
199 long buffer,cbuffer; /* token bucket depth/rate */
200 long mbuffer; /* max wait time */
201 long tokens,ctokens; /* current number of tokens */
202 psched_time_t t_c; /* checkpoint time */
203};
204
205/* TODO: maybe compute rate when size is too large .. or drop ? */
206static __inline__ long L2T(struct htb_class *cl,struct qdisc_rate_table *rate,
207 int size)
208{
209 int slot = size >> rate->rate.cell_log;
210 if (slot > 255) {
211 cl->xstats.giants++;
212 slot = 255;
213 }
214 return rate->data[slot];
215}
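/*
 * Illustrative note: rate->data[slot] holds the transmission time (in
 * psched ticks) for a packet whose size falls into bucket "slot", as
 * precomputed by the tc tool for the class's rate. With cell_log == 3,
 * a 1000-byte packet maps to slot 1000 >> 3 == 125; anything larger than
 * 255 << cell_log bytes saturates at slot 255 and is counted in
 * xstats.giants, as done above.
 */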
216
217struct htb_sched
218{
219 struct list_head root; /* root classes list */
220 struct list_head hash[HTB_HSIZE]; /* hashed by classid */
221 struct list_head drops[TC_HTB_NUMPRIO]; /* active leaves (for drops) */
222
223 /* self list - roots of self generating tree */
224 struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
225 int row_mask[TC_HTB_MAXDEPTH];
226 struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
227 u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
228
229 /* self wait list - roots of wait PQs per row */
230 struct rb_root wait_pq[TC_HTB_MAXDEPTH];
231
232 /* time of nearest event per level (row) */
233 unsigned long near_ev_cache[TC_HTB_MAXDEPTH];
234
235 /* cached value of jiffies in dequeue */
236 unsigned long jiffies;
237
238 /* whether we hit a non-work-conserving class during this dequeue; we use */
239 int nwc_hit; /* this to disable mindelay complaint in dequeue */
240
241 int defcls; /* class where unclassified flows go to */
242 u32 debug; /* subsystem debug levels */
243
244 /* filters for qdisc itself */
245 struct tcf_proto *filter_list;
246 int filter_cnt;
247
248 int rate2quantum; /* quant = rate / rate2quantum */
249 psched_time_t now; /* cached dequeue time */
250 struct timer_list timer; /* send delay timer */
251#ifdef HTB_RATECM
252 struct timer_list rttim; /* rate computer timer */
253 int recmp_bucket; /* which hash bucket to recompute next */
254#endif
255
256 /* non shaped skbs; let them go directly thru */
257 struct sk_buff_head direct_queue;
258 int direct_qlen; /* max qlen of above */
259
260 long direct_pkts;
261};
262
263/* compute hash of size HTB_HSIZE for given handle */
264static __inline__ int htb_hash(u32 h)
265{
266#if HTB_HSIZE != 16
267 #error "Declare new hash for your HTB_HSIZE"
268#endif
269 h ^= h>>8; /* stolen from cbq_hash */
270 h ^= h>>4;
271 return h & 0xf;
272}
273
274/* find class in global hash table using given handle */
275static __inline__ struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
276{
277 struct htb_sched *q = qdisc_priv(sch);
278 struct list_head *p;
279 if (TC_H_MAJ(handle) != sch->handle)
280 return NULL;
281
282 list_for_each (p,q->hash+htb_hash(handle)) {
283 struct htb_class *cl = list_entry(p,struct htb_class,hlist);
284 if (cl->classid == handle)
285 return cl;
286 }
287 return NULL;
288}
289
290/**
291 * htb_classify - classify a packet into class
292 *
293 * It returns NULL if the packet should be dropped or -1 if the packet
294 * should be passed directly thru. In all other cases a leaf class is returned.
295 * We allow direct class selection by classid in skb->priority. Then we examine
296 * filters in the qdisc and in inner nodes (if a higher filter points to the
297 * inner node). If we end up with classid MAJOR:0 we enqueue the skb into the
298 * special internal fifo (direct). These packets then go directly thru. If we
299 * still have no valid leaf we try to use the MAJOR:default leaf. If that is
300 * still unsuccessful we finish and return the direct queue.
301 */
302#define HTB_DIRECT (struct htb_class*)-1
303static inline u32 htb_classid(struct htb_class *cl)
304{
305 return (cl && cl != HTB_DIRECT) ? cl->classid : TC_H_UNSPEC;
306}
307
308static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
309{
310 struct htb_sched *q = qdisc_priv(sch);
311 struct htb_class *cl;
312 struct tcf_result res;
313 struct tcf_proto *tcf;
314 int result;
315
316 /* allow selecting the class by setting skb->priority to a valid classid;
317 note that nfmark can be used too, by attaching the fw filter with no
318 rules in it */
319 if (skb->priority == sch->handle)
320 return HTB_DIRECT; /* X:0 (direct flow) selected */
321 if ((cl = htb_find(skb->priority,sch)) != NULL && cl->level == 0)
322 return cl;
323
324 *qerr = NET_XMIT_DROP;
325 tcf = q->filter_list;
326 while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
327#ifdef CONFIG_NET_CLS_ACT
328 switch (result) {
329 case TC_ACT_QUEUED:
330 case TC_ACT_STOLEN:
331 *qerr = NET_XMIT_SUCCESS;
332 case TC_ACT_SHOT:
333 return NULL;
334 }
335#elif defined(CONFIG_NET_CLS_POLICE)
336 if (result == TC_POLICE_SHOT)
337 return HTB_DIRECT;
338#endif
339 if ((cl = (void*)res.class) == NULL) {
340 if (res.classid == sch->handle)
341 return HTB_DIRECT; /* X:0 (direct flow) */
342 if ((cl = htb_find(res.classid,sch)) == NULL)
343 break; /* filter selected invalid classid */
344 }
345 if (!cl->level)
346 return cl; /* we hit leaf; return it */
347
348 /* we have got inner class; apply inner filter chain */
349 tcf = cl->filter_list;
350 }
351 /* classification failed; try to use default class */
352 cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle),q->defcls),sch);
353 if (!cl || cl->level)
354 return HTB_DIRECT; /* bad default .. this is safe bet */
355 return cl;
356}
357
358#ifdef HTB_DEBUG
359static void htb_next_rb_node(struct rb_node **n);
360#define HTB_DUMTREE(root,memb) if(root) { \
361 struct rb_node *n = (root)->rb_node; \
362 while (n->rb_left) n = n->rb_left; \
363 while (n) { \
364 struct htb_class *cl = rb_entry(n, struct htb_class, memb); \
365 printk(" %x",cl->classid); htb_next_rb_node (&n); \
366 } }
367
368static void htb_debug_dump (struct htb_sched *q)
369{
370 int i,p;
371 printk(KERN_DEBUG "htb*g j=%lu lj=%lu\n",jiffies,q->jiffies);
372 /* rows */
373 for (i=TC_HTB_MAXDEPTH-1;i>=0;i--) {
374 printk(KERN_DEBUG "htb*r%d m=%x",i,q->row_mask[i]);
375 for (p=0;p<TC_HTB_NUMPRIO;p++) {
376 if (!q->row[i][p].rb_node) continue;
377 printk(" p%d:",p);
378 HTB_DUMTREE(q->row[i]+p,node[p]);
379 }
380 printk("\n");
381 }
382 /* classes */
383 for (i = 0; i < HTB_HSIZE; i++) {
384 struct list_head *l;
385 list_for_each (l,q->hash+i) {
386 struct htb_class *cl = list_entry(l,struct htb_class,hlist);
387 long diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer);
388 printk(KERN_DEBUG "htb*c%x m=%d t=%ld c=%ld pq=%lu df=%ld ql=%d "
389 "pa=%x f:",
390 cl->classid,cl->cmode,cl->tokens,cl->ctokens,
391 cl->pq_node.rb_color==-1?0:cl->pq_key,diff,
392 cl->level?0:cl->un.leaf.q->q.qlen,cl->prio_activity);
393 if (cl->level)
394 for (p=0;p<TC_HTB_NUMPRIO;p++) {
395 if (!cl->un.inner.feed[p].rb_node) continue;
396 printk(" p%d a=%x:",p,cl->un.inner.ptr[p]?rb_entry(cl->un.inner.ptr[p], struct htb_class,node[p])->classid:0);
397 HTB_DUMTREE(cl->un.inner.feed+p,node[p]);
398 }
399 printk("\n");
400 }
401 }
402}
403#endif
404/**
405 * htb_add_to_id_tree - adds class to the round robin list
406 *
407 * The routine adds the class to the list (actually a tree) sorted by classid.
408 * Make sure that the class is not already on such a list for the given prio.
409 */
410static void htb_add_to_id_tree (HTB_ARGQ struct rb_root *root,
411 struct htb_class *cl,int prio)
412{
413 struct rb_node **p = &root->rb_node, *parent = NULL;
414 HTB_DBG(7,3,"htb_add_id_tree cl=%X prio=%d\n",cl->classid,prio);
415#ifdef HTB_DEBUG
416 if (cl->node[prio].rb_color != -1) { BUG_TRAP(0); return; }
417 HTB_CHCL(cl);
418 if (*p) {
419 struct htb_class *x = rb_entry(*p,struct htb_class,node[prio]);
420 HTB_CHCL(x);
421 }
422#endif
423 while (*p) {
424 struct htb_class *c; parent = *p;
425 c = rb_entry(parent, struct htb_class, node[prio]);
426 HTB_CHCL(c);
427 if (cl->classid > c->classid)
428 p = &parent->rb_right;
429 else
430 p = &parent->rb_left;
431 }
432 rb_link_node(&cl->node[prio], parent, p);
433 rb_insert_color(&cl->node[prio], root);
434}
435
436/**
437 * htb_add_to_wait_tree - adds class to the event queue with delay
438 *
439 * The class is added to the priority event queue to indicate that the class
440 * will change its mode after "delay" microseconds, at jiffy cl->pq_key. Make
441 * sure that the class is not already in the queue.
442 */
443static void htb_add_to_wait_tree (struct htb_sched *q,
444 struct htb_class *cl,long delay,int debug_hint)
445{
446 struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
447 HTB_DBG(7,3,"htb_add_wt cl=%X key=%lu\n",cl->classid,cl->pq_key);
448#ifdef HTB_DEBUG
449 if (cl->pq_node.rb_color != -1) { BUG_TRAP(0); return; }
450 HTB_CHCL(cl);
451 if ((delay <= 0 || delay > cl->mbuffer) && net_ratelimit())
452 printk(KERN_ERR "HTB: suspicious delay in wait_tree d=%ld cl=%X h=%d\n",delay,cl->classid,debug_hint);
453#endif
454 cl->pq_key = q->jiffies + PSCHED_US2JIFFIE(delay);
455 if (cl->pq_key == q->jiffies)
456 cl->pq_key++;
457
458 /* update the nearest event cache */
459 if (time_after(q->near_ev_cache[cl->level], cl->pq_key))
460 q->near_ev_cache[cl->level] = cl->pq_key;
461
462 while (*p) {
463 struct htb_class *c; parent = *p;
464 c = rb_entry(parent, struct htb_class, pq_node);
465 if (time_after_eq(cl->pq_key, c->pq_key))
466 p = &parent->rb_right;
467 else
468 p = &parent->rb_left;
469 }
470 rb_link_node(&cl->pq_node, parent, p);
471 rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]);
472}
473
474/**
475 * htb_next_rb_node - finds the next node in the rbtree
476 *
477 * When we are past the last key, *n is set to NULL.
478 * Average complexity is 2 steps per call.
479 */
480static void htb_next_rb_node(struct rb_node **n)
481{
482 *n = rb_next(*n);
483}
484
485/**
486 * htb_add_class_to_row - add class to its row
487 *
488 * The class is added to row at priorities marked in mask.
489 * It does nothing if mask == 0.
490 */
491static inline void htb_add_class_to_row(struct htb_sched *q,
492 struct htb_class *cl,int mask)
493{
494 HTB_DBG(7,2,"htb_addrow cl=%X mask=%X rmask=%X\n",
495 cl->classid,mask,q->row_mask[cl->level]);
496 HTB_CHCL(cl);
497 q->row_mask[cl->level] |= mask;
498 while (mask) {
499 int prio = ffz(~mask);
500 mask &= ~(1 << prio);
501 htb_add_to_id_tree(HTB_PASSQ q->row[cl->level]+prio,cl,prio);
502 }
503}
504
505/**
506 * htb_remove_class_from_row - removes class from its row
507 *
508 * The class is removed from row at priorities marked in mask.
509 * It does nothing if mask == 0.
510 */
511static __inline__ void htb_remove_class_from_row(struct htb_sched *q,
512 struct htb_class *cl,int mask)
513{
514 int m = 0;
515 HTB_CHCL(cl);
516 while (mask) {
517 int prio = ffz(~mask);
518 mask &= ~(1 << prio);
519 if (q->ptr[cl->level][prio] == cl->node+prio)
520 htb_next_rb_node(q->ptr[cl->level]+prio);
521 htb_safe_rb_erase(cl->node + prio,q->row[cl->level]+prio);
522 if (!q->row[cl->level][prio].rb_node)
523 m |= 1 << prio;
524 }
525 HTB_DBG(7,2,"htb_delrow cl=%X mask=%X rmask=%X maskdel=%X\n",
526 cl->classid,mask,q->row_mask[cl->level],m);
527 q->row_mask[cl->level] &= ~m;
528}
529
530/**
531 * htb_activate_prios - creates the active class's feed chain
532 *
533 * The class is connected to its ancestors and/or the appropriate rows
534 * for the priorities it participates in. cl->cmode must be the new
535 * (activated) mode. It does nothing if cl->prio_activity == 0.
536 */
537static void htb_activate_prios(struct htb_sched *q,struct htb_class *cl)
538{
539 struct htb_class *p = cl->parent;
540 long m,mask = cl->prio_activity;
541 HTB_DBG(7,2,"htb_act_prios cl=%X mask=%lX cmode=%d\n",cl->classid,mask,cl->cmode);
542 HTB_CHCL(cl);
543
544 while (cl->cmode == HTB_MAY_BORROW && p && mask) {
545 HTB_CHCL(p);
546 m = mask; while (m) {
547 int prio = ffz(~m);
548 m &= ~(1 << prio);
549
550 if (p->un.inner.feed[prio].rb_node)
551				/* parent already has its feed in use, so
552				   reset the bit in mask as the parent is ok */
553 mask &= ~(1 << prio);
554
555 htb_add_to_id_tree(HTB_PASSQ p->un.inner.feed+prio,cl,prio);
556 }
557 HTB_DBG(7,3,"htb_act_pr_aft p=%X pact=%X mask=%lX pmode=%d\n",
558 p->classid,p->prio_activity,mask,p->cmode);
559 p->prio_activity |= mask;
560 cl = p; p = cl->parent;
561 HTB_CHCL(cl);
562 }
563 if (cl->cmode == HTB_CAN_SEND && mask)
564 htb_add_class_to_row(q,cl,mask);
565}
566
567/**
568 * htb_deactivate_prios - remove class from feed chain
569 *
570 * cl->cmode must represent old mode (before deactivation). It does
571 * nothing if cl->prio_activity == 0. Class is removed from all feed
572 * chains and rows.
573 */
574static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
575{
576 struct htb_class *p = cl->parent;
577 long m,mask = cl->prio_activity;
578 HTB_DBG(7,2,"htb_deact_prios cl=%X mask=%lX cmode=%d\n",cl->classid,mask,cl->cmode);
579 HTB_CHCL(cl);
580
581 while (cl->cmode == HTB_MAY_BORROW && p && mask) {
582 m = mask; mask = 0;
583 while (m) {
584 int prio = ffz(~m);
585 m &= ~(1 << prio);
586
587 if (p->un.inner.ptr[prio] == cl->node+prio) {
588				/* we are removing a child that the parent
589				   feed points to - forget the pointer but
590				   remember the classid */
591 p->un.inner.last_ptr_id[prio] = cl->classid;
592 p->un.inner.ptr[prio] = NULL;
593 }
594
595 htb_safe_rb_erase(cl->node + prio,p->un.inner.feed + prio);
596
597 if (!p->un.inner.feed[prio].rb_node)
598 mask |= 1 << prio;
599 }
600 HTB_DBG(7,3,"htb_deact_pr_aft p=%X pact=%X mask=%lX pmode=%d\n",
601 p->classid,p->prio_activity,mask,p->cmode);
602 p->prio_activity &= ~mask;
603 cl = p; p = cl->parent;
604 HTB_CHCL(cl);
605 }
606 if (cl->cmode == HTB_CAN_SEND && mask)
607 htb_remove_class_from_row(q,cl,mask);
608}
609
610/**
611 * htb_class_mode - computes and returns current class mode
612 *
613 * It computes cl's mode at time cl->t_c+diff and returns it. If the mode
614 * is not HTB_CAN_SEND then *diff is set to the time difference from now to
615 * the time when cl will change its state (the caller uses it for cl->pq_key).
616 * Also it is worth noting that the class mode doesn't change simply at
617 * cl->{c,}tokens == 0; rather, there is hysteresis over the
618 * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
619 * mode transitions per time unit. The speed gain is about 1/6.
620 */
621static __inline__ enum htb_cmode
622htb_class_mode(struct htb_class *cl,long *diff)
623{
624 long toks;
625
626 if ((toks = (cl->ctokens + *diff)) < (
627#if HTB_HYSTERESIS
628 cl->cmode != HTB_CANT_SEND ? -cl->cbuffer :
629#endif
630 0)) {
631 *diff = -toks;
632 return HTB_CANT_SEND;
633 }
634 if ((toks = (cl->tokens + *diff)) >= (
635#if HTB_HYSTERESIS
636 cl->cmode == HTB_CAN_SEND ? -cl->buffer :
637#endif
638 0))
639 return HTB_CAN_SEND;
640
641 *diff = -toks;
642 return HTB_MAY_BORROW;
643}
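
/* Editorial sketch (guarded out of the build): the same three-way decision as
 * htb_class_mode() above with HTB_HYSTERESIS compiled out, written as a plain
 * function over made-up token values. Negative ctokens+diff means the ceil is
 * exceeded, non-negative tokens+diff means the class is within its own rate,
 * and anything in between may borrow from its ancestors. */
#if 0
static const char *demo_class_mode(long tokens, long ctokens, long diff)
{
	if (ctokens + diff < 0)
		return "HTB_CANT_SEND";		/* over ceil: must wait */
	if (tokens + diff >= 0)
		return "HTB_CAN_SEND";		/* within rate: send freely */
	return "HTB_MAY_BORROW";		/* over rate, under ceil */
}
#endif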
644
645/**
646 * htb_change_class_mode - changes the class's mode
647 *
648 * This should be the only way to change a class's mode under normal
649 * circumstances. The routine will update the feed list linkage, change the
650 * mode and add the class to the wait event queue if appropriate. The new
651 * mode should differ from the old one, and cl->pq_key has to be valid if
652 * changing to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
653 */
654static void
655htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
656{
657 enum htb_cmode new_mode = htb_class_mode(cl,diff);
658
659 HTB_CHCL(cl);
660 HTB_DBG(7,1,"htb_chging_clmode %d->%d cl=%X\n",cl->cmode,new_mode,cl->classid);
661
662 if (new_mode == cl->cmode)
663 return;
664
665 if (cl->prio_activity) { /* not necessary: speed optimization */
666 if (cl->cmode != HTB_CANT_SEND)
667 htb_deactivate_prios(q,cl);
668 cl->cmode = new_mode;
669 if (new_mode != HTB_CANT_SEND)
670 htb_activate_prios(q,cl);
671 } else
672 cl->cmode = new_mode;
673}
674
675/**
676 * htb_activate - inserts leaf cl into appropriate active feeds
677 *
678 * The routine learns the (new) priority of the leaf and activates the feed
679 * chain for that prio. It can safely be called on an already active leaf.
680 * It also adds the leaf to the drop list.
681 */
682static __inline__ void htb_activate(struct htb_sched *q,struct htb_class *cl)
683{
684 BUG_TRAP(!cl->level && cl->un.leaf.q && cl->un.leaf.q->q.qlen);
685 HTB_CHCL(cl);
686 if (!cl->prio_activity) {
687 cl->prio_activity = 1 << (cl->un.leaf.aprio = cl->un.leaf.prio);
688 htb_activate_prios(q,cl);
689 list_add_tail(&cl->un.leaf.drop_list,q->drops+cl->un.leaf.aprio);
690 }
691}
692
693/**
694 * htb_deactivate - remove leaf cl from active feeds
695 *
696 * Make sure that the leaf is active. In other words, it can't be called
697 * with a non-active leaf. It also removes the class from the drop list.
698 */
699static __inline__ void
700htb_deactivate(struct htb_sched *q,struct htb_class *cl)
701{
702 BUG_TRAP(cl->prio_activity);
703 HTB_CHCL(cl);
704 htb_deactivate_prios(q,cl);
705 cl->prio_activity = 0;
706 list_del_init(&cl->un.leaf.drop_list);
707}
708
709static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
710{
711 int ret;
712 struct htb_sched *q = qdisc_priv(sch);
713 struct htb_class *cl = htb_classify(skb,sch,&ret);
714
715 if (cl == HTB_DIRECT) {
716 /* enqueue to helper queue */
717 if (q->direct_queue.qlen < q->direct_qlen) {
718 __skb_queue_tail(&q->direct_queue, skb);
719 q->direct_pkts++;
720 }
721#ifdef CONFIG_NET_CLS_ACT
722 } else if (!cl) {
723 if (ret == NET_XMIT_DROP)
724 sch->qstats.drops++;
725 kfree_skb (skb);
726 return ret;
727#endif
728 } else if (cl->un.leaf.q->enqueue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) {
729 sch->qstats.drops++;
730 cl->qstats.drops++;
731 return NET_XMIT_DROP;
732 } else {
733 cl->bstats.packets++; cl->bstats.bytes += skb->len;
734 htb_activate (q,cl);
735 }
736
737 sch->q.qlen++;
738 sch->bstats.packets++; sch->bstats.bytes += skb->len;
739 HTB_DBG(1,1,"htb_enq_ok cl=%X skb=%p\n",(cl && cl != HTB_DIRECT)?cl->classid:0,skb);
740 return NET_XMIT_SUCCESS;
741}
742
743/* TODO: requeuing packet charges it to policers again !! */
744static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch)
745{
746 struct htb_sched *q = qdisc_priv(sch);
747 int ret = NET_XMIT_SUCCESS;
748 struct htb_class *cl = htb_classify(skb,sch, &ret);
749 struct sk_buff *tskb;
750
751 if (cl == HTB_DIRECT || !cl) {
752 /* enqueue to helper queue */
753 if (q->direct_queue.qlen < q->direct_qlen && cl) {
754 __skb_queue_head(&q->direct_queue, skb);
755 } else {
756 __skb_queue_head(&q->direct_queue, skb);
757 tskb = __skb_dequeue_tail(&q->direct_queue);
758 kfree_skb (tskb);
759 sch->qstats.drops++;
760 return NET_XMIT_CN;
761 }
762 } else if (cl->un.leaf.q->ops->requeue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) {
763 sch->qstats.drops++;
764 cl->qstats.drops++;
765 return NET_XMIT_DROP;
766 } else
767 htb_activate (q,cl);
768
769 sch->q.qlen++;
770 sch->qstats.requeues++;
771 HTB_DBG(1,1,"htb_req_ok cl=%X skb=%p\n",(cl && cl != HTB_DIRECT)?cl->classid:0,skb);
772 return NET_XMIT_SUCCESS;
773}
774
775static void htb_timer(unsigned long arg)
776{
777 struct Qdisc *sch = (struct Qdisc*)arg;
778 sch->flags &= ~TCQ_F_THROTTLED;
779 wmb();
780 netif_schedule(sch->dev);
781}
782
783#ifdef HTB_RATECM
784#define RT_GEN(D,R) R+=D-(R/HTB_EWMAC);D=0
785static void htb_rate_timer(unsigned long arg)
786{
787 struct Qdisc *sch = (struct Qdisc*)arg;
788 struct htb_sched *q = qdisc_priv(sch);
789 struct list_head *p;
790
791 /* lock queue so that we can muck with it */
792 HTB_QLOCK(sch);
793 HTB_DBG(10,1,"htb_rttmr j=%ld\n",jiffies);
794
795 q->rttim.expires = jiffies + HZ;
796 add_timer(&q->rttim);
797
798 /* scan and recompute one bucket at time */
799 if (++q->recmp_bucket >= HTB_HSIZE)
800 q->recmp_bucket = 0;
801 list_for_each (p,q->hash+q->recmp_bucket) {
802 struct htb_class *cl = list_entry(p,struct htb_class,hlist);
803 HTB_DBG(10,2,"htb_rttmr_cl cl=%X sbyte=%lu spkt=%lu\n",
804 cl->classid,cl->sum_bytes,cl->sum_packets);
805 RT_GEN (cl->sum_bytes,cl->rate_bytes);
806 RT_GEN (cl->sum_packets,cl->rate_packets);
807 }
808 HTB_QUNLOCK(sch);
809}
810#endif
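
/* Editorial note: RT_GEN() above is a fixed-point exponentially weighted
 * moving average: each pass folds the accumulated delta D into the estimate R
 * with weight 1/HTB_EWMAC and clears D, so R settles at HTB_EWMAC times the
 * per-interval count. Since each hash bucket is revisited once every
 * HTB_HSIZE seconds, htb_dump_class_stats() later divides by
 * HTB_EWMAC * HTB_HSIZE to report a per-second rate. */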
811
812/**
813 * htb_charge_class - charges amount "bytes" to leaf and ancestors
814 *
815 * The routine assumes that a packet "bytes" long was dequeued from leaf cl
816 * borrowing from "level". It accounts the bytes to the ceil leaky bucket for
817 * the leaf and all ancestors, and to the rate bucket for ancestors at levels
818 * "level" and higher. It also handles a possible change of mode resulting
819 * from the update. Note that the mode can also increase here (MAY_BORROW to
820 * CAN_SEND) because we can use a more precise clock than the event queue.
821 * In such a case we remove the class from the event queue first.
822 */
823static void htb_charge_class(struct htb_sched *q,struct htb_class *cl,
824 int level,int bytes)
825{
826 long toks,diff;
827 enum htb_cmode old_mode;
828 HTB_DBG(5,1,"htb_chrg_cl cl=%X lev=%d len=%d\n",cl->classid,level,bytes);
829
830#define HTB_ACCNT(T,B,R) toks = diff + cl->T; \
831 if (toks > cl->B) toks = cl->B; \
832 toks -= L2T(cl, cl->R, bytes); \
833 if (toks <= -cl->mbuffer) toks = 1-cl->mbuffer; \
834 cl->T = toks
835
836 while (cl) {
837 HTB_CHCL(cl);
838 diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer);
839#ifdef HTB_DEBUG
840 if (diff > cl->mbuffer || diff < 0 || PSCHED_TLESS(q->now, cl->t_c)) {
841 if (net_ratelimit())
842 printk(KERN_ERR "HTB: bad diff in charge, cl=%X diff=%lX now=%Lu then=%Lu j=%lu\n",
843 cl->classid, diff,
844#ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY
845 q->now.tv_sec * 1000000ULL + q->now.tv_usec,
846 cl->t_c.tv_sec * 1000000ULL + cl->t_c.tv_usec,
847#else
848 (unsigned long long) q->now,
849 (unsigned long long) cl->t_c,
850#endif
851 q->jiffies);
852 diff = 1000;
853 }
854#endif
855 if (cl->level >= level) {
856 if (cl->level == level) cl->xstats.lends++;
857 HTB_ACCNT (tokens,buffer,rate);
858 } else {
859 cl->xstats.borrows++;
860 cl->tokens += diff; /* we moved t_c; update tokens */
861 }
862 HTB_ACCNT (ctokens,cbuffer,ceil);
863 cl->t_c = q->now;
864 HTB_DBG(5,2,"htb_chrg_clp cl=%X diff=%ld tok=%ld ctok=%ld\n",cl->classid,diff,cl->tokens,cl->ctokens);
865
866 old_mode = cl->cmode; diff = 0;
867 htb_change_class_mode(q,cl,&diff);
868 if (old_mode != cl->cmode) {
869 if (old_mode != HTB_CAN_SEND)
870 htb_safe_rb_erase(&cl->pq_node,q->wait_pq+cl->level);
871 if (cl->cmode != HTB_CAN_SEND)
872 htb_add_to_wait_tree (q,cl,diff,1);
873 }
874
875#ifdef HTB_RATECM
876 /* update rate counters */
877 cl->sum_bytes += bytes; cl->sum_packets++;
878#endif
879
880 /* update byte stats except for leaves which are already updated */
881 if (cl->level) {
882 cl->bstats.bytes += bytes;
883 cl->bstats.packets++;
884 }
885 cl = cl->parent;
886 }
887}
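
/* Editorial sketch (guarded out of the build): the token update performed by
 * the HTB_ACCNT() macro above, written out as a plain function. Here "buffer"
 * caps the credit that can be saved up, "cost" stands in for the L2T()
 * transmission-time lookup of the dequeued packet, and "mbuffer" bounds how
 * far into debt the bucket may go. */
#if 0
static long demo_htb_account(long tokens, long diff, long buffer,
			     long cost, long mbuffer)
{
	long toks = tokens + diff;	/* credit earned since the last update */

	if (toks > buffer)
		toks = buffer;		/* never save up more than one burst */
	toks -= cost;			/* pay for the packet just dequeued */
	if (toks <= -mbuffer)
		toks = 1 - mbuffer;	/* bound the accumulated debt */
	return toks;
}
#endif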
888
889/**
890 * htb_do_events - make mode changes to classes at the level
891 *
892 * Scans the event queue for pending events and applies them. Returns the
893 * number of jiffies to the next pending event (0 if there is none in the pq).
894 * Note: applied are events whose cl->pq_key <= q->jiffies.
895 */
896static long htb_do_events(struct htb_sched *q,int level)
897{
898 int i;
899 HTB_DBG(8,1,"htb_do_events l=%d root=%p rmask=%X\n",
900 level,q->wait_pq[level].rb_node,q->row_mask[level]);
901 for (i = 0; i < 500; i++) {
902 struct htb_class *cl;
903 long diff;
904 struct rb_node *p = q->wait_pq[level].rb_node;
905 if (!p) return 0;
906 while (p->rb_left) p = p->rb_left;
907
908 cl = rb_entry(p, struct htb_class, pq_node);
909 if (time_after(cl->pq_key, q->jiffies)) {
910 HTB_DBG(8,3,"htb_do_ev_ret delay=%ld\n",cl->pq_key - q->jiffies);
911 return cl->pq_key - q->jiffies;
912 }
913 htb_safe_rb_erase(p,q->wait_pq+level);
914 diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer);
915#ifdef HTB_DEBUG
916 if (diff > cl->mbuffer || diff < 0 || PSCHED_TLESS(q->now, cl->t_c)) {
917 if (net_ratelimit())
918 printk(KERN_ERR "HTB: bad diff in events, cl=%X diff=%lX now=%Lu then=%Lu j=%lu\n",
919 cl->classid, diff,
920#ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY
921 q->now.tv_sec * 1000000ULL + q->now.tv_usec,
922 cl->t_c.tv_sec * 1000000ULL + cl->t_c.tv_usec,
923#else
924 (unsigned long long) q->now,
925 (unsigned long long) cl->t_c,
926#endif
927 q->jiffies);
928 diff = 1000;
929 }
930#endif
931 htb_change_class_mode(q,cl,&diff);
932 if (cl->cmode != HTB_CAN_SEND)
933 htb_add_to_wait_tree (q,cl,diff,2);
934 }
935 if (net_ratelimit())
936 printk(KERN_WARNING "htb: too many events !\n");
937 return HZ/10;
938}
939
940/* Returns class->node+prio from the id-tree where the class's id is >= id.
941   NULL if no such class exists. */
942static struct rb_node *
943htb_id_find_next_upper(int prio,struct rb_node *n,u32 id)
944{
945 struct rb_node *r = NULL;
946 while (n) {
947 struct htb_class *cl = rb_entry(n,struct htb_class,node[prio]);
948 if (id == cl->classid) return n;
949
950 if (id > cl->classid) {
951 n = n->rb_right;
952 } else {
953 r = n;
954 n = n->rb_left;
955 }
956 }
957 return r;
958}
959
960/**
961 * htb_lookup_leaf - returns next leaf class in DRR order
962 *
963 * Finds the leaf that the current feed pointer points to.
964 */
965static struct htb_class *
966htb_lookup_leaf(HTB_ARGQ struct rb_root *tree,int prio,struct rb_node **pptr,u32 *pid)
967{
968 int i;
969 struct {
970 struct rb_node *root;
971 struct rb_node **pptr;
972 u32 *pid;
973 } stk[TC_HTB_MAXDEPTH],*sp = stk;
974
975 BUG_TRAP(tree->rb_node);
976 sp->root = tree->rb_node;
977 sp->pptr = pptr;
978 sp->pid = pid;
979
980 for (i = 0; i < 65535; i++) {
981 HTB_DBG(4,2,"htb_lleaf ptr=%p pid=%X\n",*sp->pptr,*sp->pid);
982
983 if (!*sp->pptr && *sp->pid) {
984 /* ptr was invalidated but id is valid - try to recover
985 the original or next ptr */
986 *sp->pptr = htb_id_find_next_upper(prio,sp->root,*sp->pid);
987 }
988		*sp->pid = 0; /* ptr is valid now, so remove this hint as it
989				 can become out of date quickly */
990 if (!*sp->pptr) { /* we are at right end; rewind & go up */
991 *sp->pptr = sp->root;
992 while ((*sp->pptr)->rb_left)
993 *sp->pptr = (*sp->pptr)->rb_left;
994 if (sp > stk) {
995 sp--;
996 BUG_TRAP(*sp->pptr); if(!*sp->pptr) return NULL;
997 htb_next_rb_node (sp->pptr);
998 }
999 } else {
1000 struct htb_class *cl;
1001 cl = rb_entry(*sp->pptr,struct htb_class,node[prio]);
1002 HTB_CHCL(cl);
1003 if (!cl->level)
1004 return cl;
1005 (++sp)->root = cl->un.inner.feed[prio].rb_node;
1006 sp->pptr = cl->un.inner.ptr+prio;
1007 sp->pid = cl->un.inner.last_ptr_id+prio;
1008 }
1009 }
1010 BUG_TRAP(0);
1011 return NULL;
1012}
1013
1014/* dequeues packet at given priority and level; call only if
1015 you are sure that there is active class at prio/level */
1016static struct sk_buff *
1017htb_dequeue_tree(struct htb_sched *q,int prio,int level)
1018{
1019 struct sk_buff *skb = NULL;
1020 struct htb_class *cl,*start;
1021 /* look initial class up in the row */
1022 start = cl = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio,prio,
1023 q->ptr[level]+prio,q->last_ptr_id[level]+prio);
1024
1025 do {
1026next:
1027 BUG_TRAP(cl);
1028 if (!cl) return NULL;
1029 HTB_DBG(4,1,"htb_deq_tr prio=%d lev=%d cl=%X defic=%d\n",
1030 prio,level,cl->classid,cl->un.leaf.deficit[level]);
1031
1032 /* class can be empty - it is unlikely but can be true if leaf
1033 qdisc drops packets in enqueue routine or if someone used
1034 graft operation on the leaf since last dequeue;
1035 simply deactivate and skip such class */
1036 if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
1037 struct htb_class *next;
1038 htb_deactivate(q,cl);
1039
1040 /* row/level might become empty */
1041 if ((q->row_mask[level] & (1 << prio)) == 0)
1042 return NULL;
1043
1044 next = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio,
1045 prio,q->ptr[level]+prio,q->last_ptr_id[level]+prio);
1046
1047 if (cl == start) /* fix start if we just deleted it */
1048 start = next;
1049 cl = next;
1050 goto next;
1051 }
1052
1053 if (likely((skb = cl->un.leaf.q->dequeue(cl->un.leaf.q)) != NULL))
1054 break;
1055 if (!cl->warned) {
1056 printk(KERN_WARNING "htb: class %X isn't work conserving ?!\n",cl->classid);
1057 cl->warned = 1;
1058 }
1059 q->nwc_hit++;
1060 htb_next_rb_node((level?cl->parent->un.inner.ptr:q->ptr[0])+prio);
1061 cl = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio,prio,q->ptr[level]+prio,
1062 q->last_ptr_id[level]+prio);
1063
1064 } while (cl != start);
1065
1066 if (likely(skb != NULL)) {
1067 if ((cl->un.leaf.deficit[level] -= skb->len) < 0) {
1068 HTB_DBG(4,2,"htb_next_cl oldptr=%p quant_add=%d\n",
1069 level?cl->parent->un.inner.ptr[prio]:q->ptr[0][prio],cl->un.leaf.quantum);
1070 cl->un.leaf.deficit[level] += cl->un.leaf.quantum;
1071 htb_next_rb_node((level?cl->parent->un.inner.ptr:q->ptr[0])+prio);
1072 }
1073		/* this used to be after charge_class, but this arrangement
1074		   gives us slightly better performance */
1075 if (!cl->un.leaf.q->q.qlen)
1076 htb_deactivate (q,cl);
1077 htb_charge_class (q,cl,level,skb->len);
1078 }
1079 return skb;
1080}
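
/* Editorial sketch (guarded out of the build): the deficit round-robin step
 * applied to the dequeued leaf above. A leaf keeps its turn while its deficit
 * stays non-negative; once the deficit goes negative it is topped up by the
 * class quantum and the round-robin pointer advances to the next class. */
#if 0
static int demo_drr_step(int *deficit, int quantum, int pkt_len)
{
	*deficit -= pkt_len;
	if (*deficit < 0) {
		*deficit += quantum;
		return 1;	/* advance to the next class at this prio */
	}
	return 0;		/* the same leaf keeps the turn */
}
#endif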
1081
1082static void htb_delay_by(struct Qdisc *sch,long delay)
1083{
1084 struct htb_sched *q = qdisc_priv(sch);
1085 if (delay <= 0) delay = 1;
1086 if (unlikely(delay > 5*HZ)) {
1087 if (net_ratelimit())
1088 printk(KERN_INFO "HTB delay %ld > 5sec\n", delay);
1089 delay = 5*HZ;
1090 }
1091	/* why not use jiffies here? because expires can be in the past */
1092 mod_timer(&q->timer, q->jiffies + delay);
1093 sch->flags |= TCQ_F_THROTTLED;
1094 sch->qstats.overlimits++;
1095 HTB_DBG(3,1,"htb_deq t_delay=%ld\n",delay);
1096}
1097
1098static struct sk_buff *htb_dequeue(struct Qdisc *sch)
1099{
1100 struct sk_buff *skb = NULL;
1101 struct htb_sched *q = qdisc_priv(sch);
1102 int level;
1103 long min_delay;
1104#ifdef HTB_DEBUG
1105 int evs_used = 0;
1106#endif
1107
1108 q->jiffies = jiffies;
1109 HTB_DBG(3,1,"htb_deq dircnt=%d qlen=%d\n",skb_queue_len(&q->direct_queue),
1110 sch->q.qlen);
1111
1112 /* try to dequeue direct packets as high prio (!) to minimize cpu work */
1113 if ((skb = __skb_dequeue(&q->direct_queue)) != NULL) {
1114 sch->flags &= ~TCQ_F_THROTTLED;
1115 sch->q.qlen--;
1116 return skb;
1117 }
1118
1119 if (!sch->q.qlen) goto fin;
1120 PSCHED_GET_TIME(q->now);
1121
1122 min_delay = LONG_MAX;
1123 q->nwc_hit = 0;
1124 for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
1125 /* common case optimization - skip event handler quickly */
1126 int m;
1127 long delay;
1128 if (time_after_eq(q->jiffies, q->near_ev_cache[level])) {
1129 delay = htb_do_events(q,level);
1130 q->near_ev_cache[level] = q->jiffies + (delay ? delay : HZ);
1131#ifdef HTB_DEBUG
1132 evs_used++;
1133#endif
1134 } else
1135 delay = q->near_ev_cache[level] - q->jiffies;
1136
1137 if (delay && min_delay > delay)
1138 min_delay = delay;
1139 m = ~q->row_mask[level];
1140 while (m != (int)(-1)) {
1141 int prio = ffz (m);
1142 m |= 1 << prio;
1143 skb = htb_dequeue_tree(q,prio,level);
1144 if (likely(skb != NULL)) {
1145 sch->q.qlen--;
1146 sch->flags &= ~TCQ_F_THROTTLED;
1147 goto fin;
1148 }
1149 }
1150 }
1151#ifdef HTB_DEBUG
1152 if (!q->nwc_hit && min_delay >= 10*HZ && net_ratelimit()) {
1153 if (min_delay == LONG_MAX) {
1154 printk(KERN_ERR "HTB: dequeue bug (%d,%lu,%lu), report it please !\n",
1155 evs_used,q->jiffies,jiffies);
1156 htb_debug_dump(q);
1157 } else
1158 printk(KERN_WARNING "HTB: mindelay=%ld, some class has "
1159 "too small rate\n",min_delay);
1160 }
1161#endif
1162 htb_delay_by (sch,min_delay > 5*HZ ? 5*HZ : min_delay);
1163fin:
1164 HTB_DBG(3,1,"htb_deq_end %s j=%lu skb=%p\n",sch->dev->name,q->jiffies,skb);
1165 return skb;
1166}
1167
1168/* try to drop from each class (by prio) until one succeeds */
1169static unsigned int htb_drop(struct Qdisc* sch)
1170{
1171 struct htb_sched *q = qdisc_priv(sch);
1172 int prio;
1173
1174 for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
1175 struct list_head *p;
1176 list_for_each (p,q->drops+prio) {
1177 struct htb_class *cl = list_entry(p, struct htb_class,
1178 un.leaf.drop_list);
1179 unsigned int len;
1180 if (cl->un.leaf.q->ops->drop &&
1181 (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
1182 sch->q.qlen--;
1183 if (!cl->un.leaf.q->q.qlen)
1184 htb_deactivate (q,cl);
1185 return len;
1186 }
1187 }
1188 }
1189 return 0;
1190}
1191
1192/* reset all classes */
1193/* always called under BH & queue lock */
1194static void htb_reset(struct Qdisc* sch)
1195{
1196 struct htb_sched *q = qdisc_priv(sch);
1197 int i;
1198 HTB_DBG(0,1,"htb_reset sch=%p, handle=%X\n",sch,sch->handle);
1199
1200 for (i = 0; i < HTB_HSIZE; i++) {
1201 struct list_head *p;
1202 list_for_each (p,q->hash+i) {
1203 struct htb_class *cl = list_entry(p,struct htb_class,hlist);
1204 if (cl->level)
1205 memset(&cl->un.inner,0,sizeof(cl->un.inner));
1206 else {
1207 if (cl->un.leaf.q)
1208 qdisc_reset(cl->un.leaf.q);
1209 INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1210 }
1211 cl->prio_activity = 0;
1212 cl->cmode = HTB_CAN_SEND;
1213#ifdef HTB_DEBUG
1214 cl->pq_node.rb_color = -1;
1215 memset(cl->node,255,sizeof(cl->node));
1216#endif
1217
1218 }
1219 }
1220 sch->flags &= ~TCQ_F_THROTTLED;
1221 del_timer(&q->timer);
1222 __skb_queue_purge(&q->direct_queue);
1223 sch->q.qlen = 0;
1224 memset(q->row,0,sizeof(q->row));
1225 memset(q->row_mask,0,sizeof(q->row_mask));
1226 memset(q->wait_pq,0,sizeof(q->wait_pq));
1227 memset(q->ptr,0,sizeof(q->ptr));
1228 for (i = 0; i < TC_HTB_NUMPRIO; i++)
1229 INIT_LIST_HEAD(q->drops+i);
1230}
1231
1232static int htb_init(struct Qdisc *sch, struct rtattr *opt)
1233{
1234 struct htb_sched *q = qdisc_priv(sch);
1235 struct rtattr *tb[TCA_HTB_INIT];
1236 struct tc_htb_glob *gopt;
1237 int i;
1238#ifdef HTB_DEBUG
1239 printk(KERN_INFO "HTB init, kernel part version %d.%d\n",
1240 HTB_VER >> 16,HTB_VER & 0xffff);
1241#endif
1242 if (!opt || rtattr_parse_nested(tb, TCA_HTB_INIT, opt) ||
1243 tb[TCA_HTB_INIT-1] == NULL ||
1244 RTA_PAYLOAD(tb[TCA_HTB_INIT-1]) < sizeof(*gopt)) {
1245 printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n");
1246 return -EINVAL;
1247 }
1248 gopt = RTA_DATA(tb[TCA_HTB_INIT-1]);
1249 if (gopt->version != HTB_VER >> 16) {
1250 printk(KERN_ERR "HTB: need tc/htb version %d (minor is %d), you have %d\n",
1251 HTB_VER >> 16,HTB_VER & 0xffff,gopt->version);
1252 return -EINVAL;
1253 }
1254 q->debug = gopt->debug;
1255 HTB_DBG(0,1,"htb_init sch=%p handle=%X r2q=%d\n",sch,sch->handle,gopt->rate2quantum);
1256
1257 INIT_LIST_HEAD(&q->root);
1258 for (i = 0; i < HTB_HSIZE; i++)
1259 INIT_LIST_HEAD(q->hash+i);
1260 for (i = 0; i < TC_HTB_NUMPRIO; i++)
1261 INIT_LIST_HEAD(q->drops+i);
1262
1263 init_timer(&q->timer);
1264 skb_queue_head_init(&q->direct_queue);
1265
1266 q->direct_qlen = sch->dev->tx_queue_len;
1267 if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */
1268 q->direct_qlen = 2;
1269 q->timer.function = htb_timer;
1270 q->timer.data = (unsigned long)sch;
1271
1272#ifdef HTB_RATECM
1273 init_timer(&q->rttim);
1274 q->rttim.function = htb_rate_timer;
1275 q->rttim.data = (unsigned long)sch;
1276 q->rttim.expires = jiffies + HZ;
1277 add_timer(&q->rttim);
1278#endif
1279 if ((q->rate2quantum = gopt->rate2quantum) < 1)
1280 q->rate2quantum = 1;
1281 q->defcls = gopt->defcls;
1282
1283 return 0;
1284}
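
/* Editorial note: a hypothetical configuration exercising the parameters
 * parsed above (the device name, handles and rates are arbitrary
 * illustrations, not a recommendation):
 *
 *   tc qdisc add dev eth0 root handle 1: htb default 20 r2q 10
 *   tc class add dev eth0 parent 1:  classid 1:1  htb rate 1mbit ceil 1mbit
 *   tc class add dev eth0 parent 1:1 classid 1:20 htb rate 500kbit ceil 1mbit
 *
 * "default 20" ends up in q->defcls, "r2q 10" in q->rate2quantum, and
 * unclassified traffic falls back to class 1:20 via htb_classify(). */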
1285
1286static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1287{
1288 struct htb_sched *q = qdisc_priv(sch);
1289 unsigned char *b = skb->tail;
1290 struct rtattr *rta;
1291 struct tc_htb_glob gopt;
1292 HTB_DBG(0,1,"htb_dump sch=%p, handle=%X\n",sch,sch->handle);
1293 HTB_QLOCK(sch);
1294 gopt.direct_pkts = q->direct_pkts;
1295
1296#ifdef HTB_DEBUG
1297 if (HTB_DBG_COND(0,2))
1298 htb_debug_dump(q);
1299#endif
1300 gopt.version = HTB_VER;
1301 gopt.rate2quantum = q->rate2quantum;
1302 gopt.defcls = q->defcls;
1303 gopt.debug = q->debug;
1304 rta = (struct rtattr*)b;
1305 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1306 RTA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt);
1307 rta->rta_len = skb->tail - b;
1308 HTB_QUNLOCK(sch);
1309 return skb->len;
1310rtattr_failure:
1311 HTB_QUNLOCK(sch);
1312 skb_trim(skb, skb->tail - skb->data);
1313 return -1;
1314}
1315
1316static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1317 struct sk_buff *skb, struct tcmsg *tcm)
1318{
1319#ifdef HTB_DEBUG
1320 struct htb_sched *q = qdisc_priv(sch);
1321#endif
1322 struct htb_class *cl = (struct htb_class*)arg;
1323 unsigned char *b = skb->tail;
1324 struct rtattr *rta;
1325 struct tc_htb_opt opt;
1326
1327 HTB_DBG(0,1,"htb_dump_class handle=%X clid=%X\n",sch->handle,cl->classid);
1328
1329 HTB_QLOCK(sch);
1330 tcm->tcm_parent = cl->parent ? cl->parent->classid : TC_H_ROOT;
1331 tcm->tcm_handle = cl->classid;
1332 if (!cl->level && cl->un.leaf.q)
1333 tcm->tcm_info = cl->un.leaf.q->handle;
1334
1335 rta = (struct rtattr*)b;
1336 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1337
1338 memset (&opt,0,sizeof(opt));
1339
1340 opt.rate = cl->rate->rate; opt.buffer = cl->buffer;
1341 opt.ceil = cl->ceil->rate; opt.cbuffer = cl->cbuffer;
1342 opt.quantum = cl->un.leaf.quantum; opt.prio = cl->un.leaf.prio;
1343 opt.level = cl->level;
1344 RTA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt);
1345 rta->rta_len = skb->tail - b;
1346 HTB_QUNLOCK(sch);
1347 return skb->len;
1348rtattr_failure:
1349 HTB_QUNLOCK(sch);
1350 skb_trim(skb, b - skb->data);
1351 return -1;
1352}
1353
1354static int
1355htb_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1356 struct gnet_dump *d)
1357{
1358 struct htb_class *cl = (struct htb_class*)arg;
1359
1360#ifdef HTB_RATECM
1361 cl->rate_est.bps = cl->rate_bytes/(HTB_EWMAC*HTB_HSIZE);
1362 cl->rate_est.pps = cl->rate_packets/(HTB_EWMAC*HTB_HSIZE);
1363#endif
1364
1365 if (!cl->level && cl->un.leaf.q)
1366 cl->qstats.qlen = cl->un.leaf.q->q.qlen;
1367 cl->xstats.tokens = cl->tokens;
1368 cl->xstats.ctokens = cl->ctokens;
1369
1370 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1371 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1372 gnet_stats_copy_queue(d, &cl->qstats) < 0)
1373 return -1;
1374
1375 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1376}
1377
1378static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1379 struct Qdisc **old)
1380{
1381 struct htb_class *cl = (struct htb_class*)arg;
1382
1383 if (cl && !cl->level) {
1384 if (new == NULL && (new = qdisc_create_dflt(sch->dev,
1385 &pfifo_qdisc_ops)) == NULL)
1386 return -ENOBUFS;
1387 sch_tree_lock(sch);
1388 if ((*old = xchg(&cl->un.leaf.q, new)) != NULL) {
1389 if (cl->prio_activity)
1390 htb_deactivate (qdisc_priv(sch),cl);
1391
1392			/* TODO: is this correct? Why doesn't CBQ do it? */
1393 sch->q.qlen -= (*old)->q.qlen;
1394 qdisc_reset(*old);
1395 }
1396 sch_tree_unlock(sch);
1397 return 0;
1398 }
1399 return -ENOENT;
1400}
1401
1402static struct Qdisc * htb_leaf(struct Qdisc *sch, unsigned long arg)
1403{
1404 struct htb_class *cl = (struct htb_class*)arg;
1405 return (cl && !cl->level) ? cl->un.leaf.q : NULL;
1406}
1407
1408static unsigned long htb_get(struct Qdisc *sch, u32 classid)
1409{
1410#ifdef HTB_DEBUG
1411 struct htb_sched *q = qdisc_priv(sch);
1412#endif
1413 struct htb_class *cl = htb_find(classid,sch);
1414 HTB_DBG(0,1,"htb_get clid=%X q=%p cl=%p ref=%d\n",classid,q,cl,cl?cl->refcnt:0);
1415 if (cl)
1416 cl->refcnt++;
1417 return (unsigned long)cl;
1418}
1419
1420static void htb_destroy_filters(struct tcf_proto **fl)
1421{
1422 struct tcf_proto *tp;
1423
1424 while ((tp = *fl) != NULL) {
1425 *fl = tp->next;
1426 tcf_destroy(tp);
1427 }
1428}
1429
1430static void htb_destroy_class(struct Qdisc* sch,struct htb_class *cl)
1431{
1432 struct htb_sched *q = qdisc_priv(sch);
1433 HTB_DBG(0,1,"htb_destrycls clid=%X ref=%d\n", cl?cl->classid:0,cl?cl->refcnt:0);
1434 if (!cl->level) {
1435 BUG_TRAP(cl->un.leaf.q);
1436 sch->q.qlen -= cl->un.leaf.q->q.qlen;
1437 qdisc_destroy(cl->un.leaf.q);
1438 }
1439 qdisc_put_rtab(cl->rate);
1440 qdisc_put_rtab(cl->ceil);
1441
1442 htb_destroy_filters (&cl->filter_list);
1443
1444 while (!list_empty(&cl->children))
1445 htb_destroy_class (sch,list_entry(cl->children.next,
1446 struct htb_class,sibling));
1447
1448 /* note: this delete may happen twice (see htb_delete) */
1449 list_del(&cl->hlist);
1450 list_del(&cl->sibling);
1451
1452 if (cl->prio_activity)
1453 htb_deactivate (q,cl);
1454
1455 if (cl->cmode != HTB_CAN_SEND)
1456 htb_safe_rb_erase(&cl->pq_node,q->wait_pq+cl->level);
1457
1458 kfree(cl);
1459}
1460
1461/* always called under BH & queue lock */
1462static void htb_destroy(struct Qdisc* sch)
1463{
1464 struct htb_sched *q = qdisc_priv(sch);
1465 HTB_DBG(0,1,"htb_destroy q=%p\n",q);
1466
1467 del_timer_sync (&q->timer);
1468#ifdef HTB_RATECM
1469 del_timer_sync (&q->rttim);
1470#endif
1471	/* This line used to be after the htb_destroy_class call below,
1472	   and surprisingly it worked in 2.4. But it must precede it
1473	   because filters need their target class alive to be able to call
1474	   unbind_filter on it (without an Oops). */
1475 htb_destroy_filters(&q->filter_list);
1476
1477 while (!list_empty(&q->root))
1478 htb_destroy_class (sch,list_entry(q->root.next,
1479 struct htb_class,sibling));
1480
1481 __skb_queue_purge(&q->direct_queue);
1482}
1483
1484static int htb_delete(struct Qdisc *sch, unsigned long arg)
1485{
1486 struct htb_sched *q = qdisc_priv(sch);
1487 struct htb_class *cl = (struct htb_class*)arg;
1488 HTB_DBG(0,1,"htb_delete q=%p cl=%X ref=%d\n",q,cl?cl->classid:0,cl?cl->refcnt:0);
1489
1490	// TODO: why not allow deleting a subtree? references? does the
1491	// tc subsystem guarantee us that in htb_destroy it holds no class
1492	// refs, so that we can remove children safely there?
1493 if (!list_empty(&cl->children) || cl->filter_cnt)
1494 return -EBUSY;
1495
1496 sch_tree_lock(sch);
1497
1498 /* delete from hash and active; remainder in destroy_class */
1499 list_del_init(&cl->hlist);
1500 if (cl->prio_activity)
1501 htb_deactivate (q,cl);
1502
1503 if (--cl->refcnt == 0)
1504 htb_destroy_class(sch,cl);
1505
1506 sch_tree_unlock(sch);
1507 return 0;
1508}
1509
1510static void htb_put(struct Qdisc *sch, unsigned long arg)
1511{
1512#ifdef HTB_DEBUG
1513 struct htb_sched *q = qdisc_priv(sch);
1514#endif
1515 struct htb_class *cl = (struct htb_class*)arg;
1516 HTB_DBG(0,1,"htb_put q=%p cl=%X ref=%d\n",q,cl?cl->classid:0,cl?cl->refcnt:0);
1517
1518 if (--cl->refcnt == 0)
1519 htb_destroy_class(sch,cl);
1520}
1521
1522static int htb_change_class(struct Qdisc *sch, u32 classid,
1523 u32 parentid, struct rtattr **tca, unsigned long *arg)
1524{
1525 int err = -EINVAL;
1526 struct htb_sched *q = qdisc_priv(sch);
1527 struct htb_class *cl = (struct htb_class*)*arg,*parent;
1528 struct rtattr *opt = tca[TCA_OPTIONS-1];
1529 struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
1530 struct rtattr *tb[TCA_HTB_RTAB];
1531 struct tc_htb_opt *hopt;
1532
1533 /* extract all subattrs from opt attr */
1534 if (!opt || rtattr_parse_nested(tb, TCA_HTB_RTAB, opt) ||
1535 tb[TCA_HTB_PARMS-1] == NULL ||
1536 RTA_PAYLOAD(tb[TCA_HTB_PARMS-1]) < sizeof(*hopt))
1537 goto failure;
1538
1539 parent = parentid == TC_H_ROOT ? NULL : htb_find (parentid,sch);
1540
1541 hopt = RTA_DATA(tb[TCA_HTB_PARMS-1]);
1542 HTB_DBG(0,1,"htb_chg cl=%p(%X), clid=%X, parid=%X, opt/prio=%d, rate=%u, buff=%d, quant=%d\n", cl,cl?cl->classid:0,classid,parentid,(int)hopt->prio,hopt->rate.rate,hopt->buffer,hopt->quantum);
1543 rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB-1]);
1544 ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB-1]);
1545 if (!rtab || !ctab) goto failure;
1546
1547 if (!cl) { /* new class */
1548 struct Qdisc *new_q;
1549 /* check for valid classid */
1550 if (!classid || TC_H_MAJ(classid^sch->handle) || htb_find(classid,sch))
1551 goto failure;
1552
1553 /* check maximal depth */
1554 if (parent && parent->parent && parent->parent->level < 2) {
1555 printk(KERN_ERR "htb: tree is too deep\n");
1556 goto failure;
1557 }
1558 err = -ENOBUFS;
1559 if ((cl = kmalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
1560 goto failure;
1561
1562 memset(cl, 0, sizeof(*cl));
1563 cl->refcnt = 1;
1564 INIT_LIST_HEAD(&cl->sibling);
1565 INIT_LIST_HEAD(&cl->hlist);
1566 INIT_LIST_HEAD(&cl->children);
1567 INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1568#ifdef HTB_DEBUG
1569 cl->magic = HTB_CMAGIC;
1570#endif
1571
1572		/* create the leaf qdisc early because it uses kmalloc(GFP_KERNEL),
1573		   which can't be used inside sch_tree_lock
1574		   -- thanks to Karlis Peisenieks */
1575 new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
1576 sch_tree_lock(sch);
1577 if (parent && !parent->level) {
1578 /* turn parent into inner node */
1579 sch->q.qlen -= parent->un.leaf.q->q.qlen;
1580 qdisc_destroy (parent->un.leaf.q);
1581 if (parent->prio_activity)
1582 htb_deactivate (q,parent);
1583
1584 /* remove from evt list because of level change */
1585 if (parent->cmode != HTB_CAN_SEND) {
1586 htb_safe_rb_erase(&parent->pq_node,q->wait_pq /*+0*/);
1587 parent->cmode = HTB_CAN_SEND;
1588 }
1589 parent->level = (parent->parent ? parent->parent->level
1590 : TC_HTB_MAXDEPTH) - 1;
1591 memset (&parent->un.inner,0,sizeof(parent->un.inner));
1592 }
1593 /* leaf (we) needs elementary qdisc */
1594 cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1595
1596 cl->classid = classid; cl->parent = parent;
1597
1598 /* set class to be in HTB_CAN_SEND state */
1599 cl->tokens = hopt->buffer;
1600 cl->ctokens = hopt->cbuffer;
1601 cl->mbuffer = 60000000; /* 1min */
1602 PSCHED_GET_TIME(cl->t_c);
1603 cl->cmode = HTB_CAN_SEND;
1604
1605 /* attach to the hash list and parent's family */
1606 list_add_tail(&cl->hlist, q->hash+htb_hash(classid));
1607 list_add_tail(&cl->sibling, parent ? &parent->children : &q->root);
1608#ifdef HTB_DEBUG
1609 {
1610 int i;
1611 for (i = 0; i < TC_HTB_NUMPRIO; i++) cl->node[i].rb_color = -1;
1612 cl->pq_node.rb_color = -1;
1613 }
1614#endif
1615 } else sch_tree_lock(sch);
1616
1617	/* there used to be a nasty bug here: we have to check that the node
1618	   is really a leaf before changing cl->un.leaf! */
1619 if (!cl->level) {
1620 cl->un.leaf.quantum = rtab->rate.rate / q->rate2quantum;
1621 if (!hopt->quantum && cl->un.leaf.quantum < 1000) {
1622 printk(KERN_WARNING "HTB: quantum of class %X is small. Consider r2q change.\n", cl->classid);
1623 cl->un.leaf.quantum = 1000;
1624 }
1625 if (!hopt->quantum && cl->un.leaf.quantum > 200000) {
1626 printk(KERN_WARNING "HTB: quantum of class %X is big. Consider r2q change.\n", cl->classid);
1627 cl->un.leaf.quantum = 200000;
1628 }
1629 if (hopt->quantum)
1630 cl->un.leaf.quantum = hopt->quantum;
1631 if ((cl->un.leaf.prio = hopt->prio) >= TC_HTB_NUMPRIO)
1632 cl->un.leaf.prio = TC_HTB_NUMPRIO - 1;
1633 }
1634
1635 cl->buffer = hopt->buffer;
1636 cl->cbuffer = hopt->cbuffer;
1637 if (cl->rate) qdisc_put_rtab(cl->rate); cl->rate = rtab;
1638 if (cl->ceil) qdisc_put_rtab(cl->ceil); cl->ceil = ctab;
1639 sch_tree_unlock(sch);
1640
1641 *arg = (unsigned long)cl;
1642 return 0;
1643
1644failure:
1645 if (rtab) qdisc_put_rtab(rtab);
1646 if (ctab) qdisc_put_rtab(ctab);
1647 return err;
1648}
1649
1650static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg)
1651{
1652 struct htb_sched *q = qdisc_priv(sch);
1653 struct htb_class *cl = (struct htb_class *)arg;
1654 struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list;
1655 HTB_DBG(0,2,"htb_tcf q=%p clid=%X fref=%d fl=%p\n",q,cl?cl->classid:0,cl?cl->filter_cnt:q->filter_cnt,*fl);
1656 return fl;
1657}
1658
1659static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1660 u32 classid)
1661{
1662 struct htb_sched *q = qdisc_priv(sch);
1663 struct htb_class *cl = htb_find (classid,sch);
1664 HTB_DBG(0,2,"htb_bind q=%p clid=%X cl=%p fref=%d\n",q,classid,cl,cl?cl->filter_cnt:q->filter_cnt);
1665 /*if (cl && !cl->level) return 0;
1666 The line above used to be there to prevent attaching filters to
1667	   leaves. But at least the tc_index filter uses this just to get the
1668	   class for other reasons, so we have to allow for it.
1669	   ----
1670	   19.6.2002 As Werner explained, it is ok - bind_filter is just
1671	   another way to "lock" the class - unlike "get", this lock can
1672	   be broken by the class during destroy, IIUC.
1673 */
1674 if (cl)
1675 cl->filter_cnt++;
1676 else
1677 q->filter_cnt++;
1678 return (unsigned long)cl;
1679}
1680
1681static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1682{
1683 struct htb_sched *q = qdisc_priv(sch);
1684 struct htb_class *cl = (struct htb_class *)arg;
1685 HTB_DBG(0,2,"htb_unbind q=%p cl=%p fref=%d\n",q,cl,cl?cl->filter_cnt:q->filter_cnt);
1686 if (cl)
1687 cl->filter_cnt--;
1688 else
1689 q->filter_cnt--;
1690}
1691
1692static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1693{
1694 struct htb_sched *q = qdisc_priv(sch);
1695 int i;
1696
1697 if (arg->stop)
1698 return;
1699
1700 for (i = 0; i < HTB_HSIZE; i++) {
1701 struct list_head *p;
1702 list_for_each (p,q->hash+i) {
1703 struct htb_class *cl = list_entry(p,struct htb_class,hlist);
1704 if (arg->count < arg->skip) {
1705 arg->count++;
1706 continue;
1707 }
1708 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1709 arg->stop = 1;
1710 return;
1711 }
1712 arg->count++;
1713 }
1714 }
1715}
1716
1717static struct Qdisc_class_ops htb_class_ops = {
1718 .graft = htb_graft,
1719 .leaf = htb_leaf,
1720 .get = htb_get,
1721 .put = htb_put,
1722 .change = htb_change_class,
1723 .delete = htb_delete,
1724 .walk = htb_walk,
1725 .tcf_chain = htb_find_tcf,
1726 .bind_tcf = htb_bind_filter,
1727 .unbind_tcf = htb_unbind_filter,
1728 .dump = htb_dump_class,
1729 .dump_stats = htb_dump_class_stats,
1730};
1731
1732static struct Qdisc_ops htb_qdisc_ops = {
1733 .next = NULL,
1734 .cl_ops = &htb_class_ops,
1735 .id = "htb",
1736 .priv_size = sizeof(struct htb_sched),
1737 .enqueue = htb_enqueue,
1738 .dequeue = htb_dequeue,
1739 .requeue = htb_requeue,
1740 .drop = htb_drop,
1741 .init = htb_init,
1742 .reset = htb_reset,
1743 .destroy = htb_destroy,
1744 .change = NULL /* htb_change */,
1745 .dump = htb_dump,
1746 .owner = THIS_MODULE,
1747};
1748
1749static int __init htb_module_init(void)
1750{
1751 return register_qdisc(&htb_qdisc_ops);
1752}
1753static void __exit htb_module_exit(void)
1754{
1755 unregister_qdisc(&htb_qdisc_ops);
1756}
1757module_init(htb_module_init)
1758module_exit(htb_module_exit)
1759MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
new file mode 100644
index 000000000000..8edc32a6ad2f
--- /dev/null
+++ b/net/sched/sch_ingress.c
@@ -0,0 +1,436 @@
1/* net/sched/sch_ingress.c - Ingress qdisc
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Authors: Jamal Hadi Salim 1999
8 */
9
10#include <linux/config.h>
11#include <linux/module.h>
12#include <linux/types.h>
13#include <linux/skbuff.h>
14#include <linux/netdevice.h>
15#include <linux/rtnetlink.h>
16#include <linux/netfilter_ipv4.h>
17#include <linux/netfilter_ipv6.h>
18#include <linux/netfilter.h>
19#include <linux/smp.h>
20#include <net/pkt_sched.h>
21#include <asm/byteorder.h>
22#include <asm/uaccess.h>
23#include <linux/kmod.h>
24#include <linux/stat.h>
25#include <linux/interrupt.h>
26#include <linux/list.h>
27
28
29#undef DEBUG_INGRESS
30
31#ifdef DEBUG_INGRESS /* control */
32#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
33#else
34#define DPRINTK(format,args...)
35#endif
36
37#if 0 /* data */
38#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
39#else
40#define D2PRINTK(format,args...)
41#endif
42
43
44#define PRIV(sch) qdisc_priv(sch)
45
46
47/* Thanks to Doron Oz for this hack
48*/
49#ifndef CONFIG_NET_CLS_ACT
50#ifdef CONFIG_NETFILTER
51static int nf_registered;
52#endif
53#endif
54
55struct ingress_qdisc_data {
56 struct Qdisc *q;
57 struct tcf_proto *filter_list;
58};
59
60
61/* ------------------------- Class/flow operations ------------------------- */
62
63
64static int ingress_graft(struct Qdisc *sch,unsigned long arg,
65 struct Qdisc *new,struct Qdisc **old)
66{
67#ifdef DEBUG_INGRESS
68 struct ingress_qdisc_data *p = PRIV(sch);
69#endif
70
71 DPRINTK("ingress_graft(sch %p,[qdisc %p],new %p,old %p)\n",
72 sch, p, new, old);
73 DPRINTK("\n ingress_graft: You cannot add qdiscs to classes");
74 return 1;
75}
76
77
78static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg)
79{
80 return NULL;
81}
82
83
84static unsigned long ingress_get(struct Qdisc *sch,u32 classid)
85{
86#ifdef DEBUG_INGRESS
87 struct ingress_qdisc_data *p = PRIV(sch);
88#endif
89 DPRINTK("ingress_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid);
90 return TC_H_MIN(classid) + 1;
91}
92
93
94static unsigned long ingress_bind_filter(struct Qdisc *sch,
95 unsigned long parent, u32 classid)
96{
97 return ingress_get(sch, classid);
98}
99
100
101static void ingress_put(struct Qdisc *sch, unsigned long cl)
102{
103}
104
105
106static int ingress_change(struct Qdisc *sch, u32 classid, u32 parent,
107 struct rtattr **tca, unsigned long *arg)
108{
109#ifdef DEBUG_INGRESS
110 struct ingress_qdisc_data *p = PRIV(sch);
111#endif
112 DPRINTK("ingress_change(sch %p,[qdisc %p],classid %x,parent %x),"
113 "arg 0x%lx\n", sch, p, classid, parent, *arg);
114 DPRINTK("No effect. sch_ingress doesn't maintain classes at the moment");
115 return 0;
116}
117
118
119
120static void ingress_walk(struct Qdisc *sch,struct qdisc_walker *walker)
121{
122#ifdef DEBUG_INGRESS
123 struct ingress_qdisc_data *p = PRIV(sch);
124#endif
125 DPRINTK("ingress_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);
126 DPRINTK("No effect. sch_ingress doesn't maintain classes at the moment");
127}
128
129
130static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch,unsigned long cl)
131{
132 struct ingress_qdisc_data *p = PRIV(sch);
133
134 return &p->filter_list;
135}
136
137
138/* --------------------------- Qdisc operations ---------------------------- */
139
140
141static int ingress_enqueue(struct sk_buff *skb,struct Qdisc *sch)
142{
143 struct ingress_qdisc_data *p = PRIV(sch);
144 struct tcf_result res;
145 int result;
146
147 D2PRINTK("ingress_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
148 result = tc_classify(skb, p->filter_list, &res);
149 D2PRINTK("result %d class 0x%04x\n", result, res.classid);
150 /*
151 * Unlike normal "enqueue" functions, ingress_enqueue returns a
152 * firewall FW_* code.
153 */
154#ifdef CONFIG_NET_CLS_ACT
155 sch->bstats.packets++;
156 sch->bstats.bytes += skb->len;
157 switch (result) {
158 case TC_ACT_SHOT:
159 result = TC_ACT_SHOT;
160 sch->qstats.drops++;
161 break;
162 case TC_ACT_STOLEN:
163 case TC_ACT_QUEUED:
164 result = TC_ACT_STOLEN;
165 break;
166 case TC_ACT_RECLASSIFY:
167 case TC_ACT_OK:
168 case TC_ACT_UNSPEC:
169 default:
170 skb->tc_index = TC_H_MIN(res.classid);
171 result = TC_ACT_OK;
172 break;
173 };
174/* backward compat */
175#else
176#ifdef CONFIG_NET_CLS_POLICE
177 switch (result) {
178 case TC_POLICE_SHOT:
179 result = NF_DROP;
180 sch->qstats.drops++;
181 break;
182 case TC_POLICE_RECLASSIFY: /* DSCP remarking here ? */
183 case TC_POLICE_OK:
184 case TC_POLICE_UNSPEC:
185 default:
186 sch->bstats.packets++;
187 sch->bstats.bytes += skb->len;
188 result = NF_ACCEPT;
189 break;
190 };
191
192#else
193 D2PRINTK("Overriding result to ACCEPT\n");
194 result = NF_ACCEPT;
195 sch->bstats.packets++;
196 sch->bstats.bytes += skb->len;
197#endif
198#endif
199
200 return result;
201}
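
/* Editorial note: the value returned above is not the usual NET_XMIT_* code.
 * With CONFIG_NET_CLS_ACT it is a TC_ACT_* verdict for the classifier-action
 * ingress path; in the netfilter-only build it is an NF_* verdict that
 * ing_hook() below hands straight back to netfilter as its return value. */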
202
203
204static struct sk_buff *ingress_dequeue(struct Qdisc *sch)
205{
206/*
207 struct ingress_qdisc_data *p = PRIV(sch);
208 D2PRINTK("ingress_dequeue(sch %p,[qdisc %p])\n",sch,PRIV(p));
209*/
210 return NULL;
211}
212
213
214static int ingress_requeue(struct sk_buff *skb,struct Qdisc *sch)
215{
216/*
217 struct ingress_qdisc_data *p = PRIV(sch);
218 D2PRINTK("ingress_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,PRIV(p));
219*/
220 return 0;
221}
222
223static unsigned int ingress_drop(struct Qdisc *sch)
224{
225#ifdef DEBUG_INGRESS
226 struct ingress_qdisc_data *p = PRIV(sch);
227#endif
228 DPRINTK("ingress_drop(sch %p,[qdisc %p])\n", sch, p);
229 return 0;
230}
231
232#ifndef CONFIG_NET_CLS_ACT
233#ifdef CONFIG_NETFILTER
234static unsigned int
235ing_hook(unsigned int hook, struct sk_buff **pskb,
236 const struct net_device *indev,
237 const struct net_device *outdev,
238 int (*okfn)(struct sk_buff *))
239{
240
241 struct Qdisc *q;
242 struct sk_buff *skb = *pskb;
243 struct net_device *dev = skb->dev;
244 int fwres=NF_ACCEPT;
245
246 DPRINTK("ing_hook: skb %s dev=%s len=%u\n",
247 skb->sk ? "(owned)" : "(unowned)",
248 skb->dev ? (*pskb)->dev->name : "(no dev)",
249 skb->len);
250
251/*
252revisit later: use a private lock, since dev->queue_lock is also
253used on the egress path (it might slow things down by an iota)
254*/
255
256 if (dev->qdisc_ingress) {
257 spin_lock(&dev->queue_lock);
258 if ((q = dev->qdisc_ingress) != NULL)
259 fwres = q->enqueue(skb, q);
260 spin_unlock(&dev->queue_lock);
261 }
262
263 return fwres;
264}
265
266/* after ipt_filter */
267static struct nf_hook_ops ing_ops = {
268 .hook = ing_hook,
269 .owner = THIS_MODULE,
270 .pf = PF_INET,
271 .hooknum = NF_IP_PRE_ROUTING,
272 .priority = NF_IP_PRI_FILTER + 1,
273};
274
275static struct nf_hook_ops ing6_ops = {
276 .hook = ing_hook,
277 .owner = THIS_MODULE,
278 .pf = PF_INET6,
279 .hooknum = NF_IP6_PRE_ROUTING,
280 .priority = NF_IP6_PRI_FILTER + 1,
281};
282
283#endif
284#endif
285
286static int ingress_init(struct Qdisc *sch,struct rtattr *opt)
287{
288 struct ingress_qdisc_data *p = PRIV(sch);
289
290/* Make sure either netfilter or preferably CLS_ACT is
291* compiled in */
292#ifndef CONFIG_NET_CLS_ACT
293#ifndef CONFIG_NETFILTER
294 printk("You MUST compile classifier actions into the kernel\n");
295 return -EINVAL;
296#else
297	printk("Ingress scheduler: Classifier actions preferred over netfilter\n");
298#endif
299#endif
300
301#ifndef CONFIG_NET_CLS_ACT
302#ifdef CONFIG_NETFILTER
303 if (!nf_registered) {
304 if (nf_register_hook(&ing_ops) < 0) {
305 printk("ingress qdisc registration error \n");
306 return -EINVAL;
307 }
308 nf_registered++;
309
310 if (nf_register_hook(&ing6_ops) < 0) {
311 printk("IPv6 ingress qdisc registration error, " \
312 "disabling IPv6 support.\n");
313 } else
314 nf_registered++;
315 }
316#endif
317#endif
318
319 DPRINTK("ingress_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt);
320 p->q = &noop_qdisc;
321 return 0;
322}
323
324
325static void ingress_reset(struct Qdisc *sch)
326{
327 struct ingress_qdisc_data *p = PRIV(sch);
328
329 DPRINTK("ingress_reset(sch %p,[qdisc %p])\n", sch, p);
330
331/*
332#if 0
333*/
334/* for future use */
335 qdisc_reset(p->q);
336/*
337#endif
338*/
339}
340
341/* ------------------------------------------------------------- */
342
343
344/* ------------------------------------------------------------- */
345
346static void ingress_destroy(struct Qdisc *sch)
347{
348 struct ingress_qdisc_data *p = PRIV(sch);
349 struct tcf_proto *tp;
350
351 DPRINTK("ingress_destroy(sch %p,[qdisc %p])\n", sch, p);
352 while (p->filter_list) {
353 tp = p->filter_list;
354 p->filter_list = tp->next;
355 tcf_destroy(tp);
356 }
357#if 0
358/* for future use */
359 qdisc_destroy(p->q);
360#endif
361}
362
363
364static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb)
365{
366 unsigned char *b = skb->tail;
367 struct rtattr *rta;
368
369 rta = (struct rtattr *) b;
370 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
371 rta->rta_len = skb->tail - b;
372 return skb->len;
373
374rtattr_failure:
375 skb_trim(skb, b - skb->data);
376 return -1;
377}
378
379static struct Qdisc_class_ops ingress_class_ops = {
380 .graft = ingress_graft,
381 .leaf = ingress_leaf,
382 .get = ingress_get,
383 .put = ingress_put,
384 .change = ingress_change,
385 .delete = NULL,
386 .walk = ingress_walk,
387 .tcf_chain = ingress_find_tcf,
388 .bind_tcf = ingress_bind_filter,
389 .unbind_tcf = ingress_put,
390 .dump = NULL,
391};
392
393static struct Qdisc_ops ingress_qdisc_ops = {
394 .next = NULL,
395 .cl_ops = &ingress_class_ops,
396 .id = "ingress",
397 .priv_size = sizeof(struct ingress_qdisc_data),
398 .enqueue = ingress_enqueue,
399 .dequeue = ingress_dequeue,
400 .requeue = ingress_requeue,
401 .drop = ingress_drop,
402 .init = ingress_init,
403 .reset = ingress_reset,
404 .destroy = ingress_destroy,
405 .change = NULL,
406 .dump = ingress_dump,
407 .owner = THIS_MODULE,
408};
409
410static int __init ingress_module_init(void)
411{
412 int ret = 0;
413
414 if ((ret = register_qdisc(&ingress_qdisc_ops)) < 0) {
415 printk("Unable to register Ingress qdisc\n");
416 return ret;
417 }
418
419 return ret;
420}
421static void __exit ingress_module_exit(void)
422{
423 unregister_qdisc(&ingress_qdisc_ops);
424#ifndef CONFIG_NET_CLS_ACT
425#ifdef CONFIG_NETFILTER
426 if (nf_registered) {
427 nf_unregister_hook(&ing_ops);
428 if (nf_registered > 1)
429 nf_unregister_hook(&ing6_ops);
430 }
431#endif
432#endif
433}
434module_init(ingress_module_init)
435module_exit(ingress_module_exit)
436MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
new file mode 100644
index 000000000000..31c29deb139d
--- /dev/null
+++ b/net/sched/sch_netem.c
@@ -0,0 +1,598 @@
1/*
2 * net/sched/sch_netem.c Network emulator
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Many of the algorithms and ideas for this came from
10 * NIST Net which is not copyrighted.
11 *
12 * Authors: Stephen Hemminger <shemminger@osdl.org>
13 * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
14 */
15
16#include <linux/config.h>
17#include <linux/module.h>
18#include <linux/bitops.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/errno.h>
22#include <linux/netdevice.h>
23#include <linux/skbuff.h>
24#include <linux/rtnetlink.h>
25
26#include <net/pkt_sched.h>
27
28/* Network Emulation Queuing algorithm.
29 ====================================
30
31 Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
32		 Network Emulation Tool"
33 [2] Luigi Rizzo, DummyNet for FreeBSD
34
35 ----------------------------------------------------------------
36
37 This started out as a simple way to delay outgoing packets to
38 test TCP but has grown to include most of the functionality
39 of a full blown network emulator like NISTnet. It can delay
40 packets and add random jitter (and correlation). The random
41 distribution can be loaded from a table as well to provide
42 normal, Pareto, or experimental curves. Packet loss,
43 duplication, and reordering can also be emulated.
44
45	 This qdisc does not do classification; that can be handled by
46	 layering other disciplines on top of it. It does not need to do
47	 bandwidth control either, since that can be handled by using a
48	 token bucket or other rate control.
49
50 The simulator is limited by the Linux timer resolution
51 and will create packet bursts on the HZ boundary (1ms).
52*/
53
54struct netem_sched_data {
55 struct Qdisc *qdisc;
56 struct sk_buff_head delayed;
57 struct timer_list timer;
58
59 u32 latency;
60 u32 loss;
61 u32 limit;
62 u32 counter;
63 u32 gap;
64 u32 jitter;
65 u32 duplicate;
66
67 struct crndstate {
68 unsigned long last;
69 unsigned long rho;
70 } delay_cor, loss_cor, dup_cor;
71
72 struct disttable {
73 u32 size;
74 s16 table[0];
75 } *delay_dist;
76};
77
78/* Time stamp put into socket buffer control block */
79struct netem_skb_cb {
80 psched_time_t time_to_send;
81};
82
83/* init_crandom - initialize correlated random number generator
84 * Use entropy source for initial seed.
85 */
86static void init_crandom(struct crndstate *state, unsigned long rho)
87{
88 state->rho = rho;
89 state->last = net_random();
90}
91
92/* get_crandom - correlated random number generator
93 * Next number depends on last value.
94 * rho is scaled to avoid floating point.
95 */
96static unsigned long get_crandom(struct crndstate *state)
97{
98 u64 value, rho;
99 unsigned long answer;
100
101	if (state->rho == 0)	/* no correlation */
102 return net_random();
103
104 value = net_random();
105 rho = (u64)state->rho + 1;
106 answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
107 state->last = answer;
108 return answer;
109}
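/*
 * A minimal sketch of the blend above, for illustration only (hypothetical
 * helper, not used by this qdisc): the result is a fixed-point weighted
 * average of a fresh random value and the previous output, with rho read
 * as a fraction of 2^32.  For rho = 3*(1u<<30) (i.e. 0.75), last = 1000
 * and value = 2000 it yields roughly 2000*0.25 + 1000*0.75 = 1250, so each
 * new sample is pulled three quarters of the way toward the previous one.
 */
#if 0	/* illustrative only, never compiled */
static u32 crandom_blend(u32 value, u32 last, u32 rho_frac)
{
	u64 rho = (u64)rho_frac + 1;

	return (value * ((1ull << 32) - rho) + last * rho) >> 32;
}
#endif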
110
111/* tabledist - return a pseudo-randomly distributed value with mean mu and
112 * std deviation sigma. Uses table lookup to approximate the desired
113 * distribution, and a uniformly-distributed pseudo-random source.
114 */
115static long tabledist(unsigned long mu, long sigma,
116 struct crndstate *state, const struct disttable *dist)
117{
118 long t, x;
119 unsigned long rnd;
120
121 if (sigma == 0)
122 return mu;
123
124 rnd = get_crandom(state);
125
126 /* default uniform distribution */
127 if (dist == NULL)
128 return (rnd % (2*sigma)) - sigma + mu;
129
130 t = dist->table[rnd % dist->size];
131 x = (sigma % NETEM_DIST_SCALE) * t;
132 if (x >= 0)
133 x += NETEM_DIST_SCALE/2;
134 else
135 x -= NETEM_DIST_SCALE/2;
136
137 return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
138}
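/*
 * A worked example of the fallback path above, for illustration only
 * (hypothetical helper, not used by this qdisc): with no table loaded the
 * distribution is plainly uniform.  For mu = 100, sigma = 10 and rnd = 7
 * it gives (7 % 20) - 10 + 100 = 97, i.e. a sample in [mu-sigma, mu+sigma).
 * A loaded table replaces that step with precomputed samples scaled by
 * NETEM_DIST_SCALE, which is how the normal/Pareto/experimental curves
 * mentioned in the header comment are realised.
 */
#if 0	/* illustrative only, never compiled */
static long uniform_tabledist(unsigned long mu, long sigma, unsigned long rnd)
{
	if (sigma == 0)
		return mu;
	return (rnd % (2 * sigma)) - sigma + mu;	/* same as the dist == NULL case */
}
#endif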
139
140/* Put skb in the private delayed queue. */
141static int delay_skb(struct Qdisc *sch, struct sk_buff *skb)
142{
143 struct netem_sched_data *q = qdisc_priv(sch);
144 struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;
145 psched_tdiff_t td;
146 psched_time_t now;
147
148 PSCHED_GET_TIME(now);
149 td = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist);
150 PSCHED_TADD2(now, td, cb->time_to_send);
151
152 /* Always queue at tail to keep packets in order */
153 if (likely(q->delayed.qlen < q->limit)) {
154 __skb_queue_tail(&q->delayed, skb);
155 if (!timer_pending(&q->timer)) {
156 q->timer.expires = jiffies + PSCHED_US2JIFFIE(td);
157 add_timer(&q->timer);
158 }
159 return NET_XMIT_SUCCESS;
160 }
161
162 kfree_skb(skb);
163 return NET_XMIT_DROP;
164}
165
166static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
167{
168 struct netem_sched_data *q = qdisc_priv(sch);
169 struct sk_buff *skb2;
170 int ret;
171
172 pr_debug("netem_enqueue skb=%p @%lu\n", skb, jiffies);
173
174 /* Random packet drop 0 => none, ~0 => all */
175 if (q->loss && q->loss >= get_crandom(&q->loss_cor)) {
176 pr_debug("netem_enqueue: random loss\n");
177 sch->qstats.drops++;
178 kfree_skb(skb);
179 return 0; /* lie about loss so TCP doesn't know */
180 }
181
182 /* Random duplication */
183 if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)
184 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
185 pr_debug("netem_enqueue: dup %p\n", skb2);
186
187		if (delay_skb(sch, skb2) == NET_XMIT_SUCCESS) {
188 sch->q.qlen++;
189 sch->bstats.bytes += skb2->len;
190 sch->bstats.packets++;
191 } else
192 sch->qstats.drops++;
193 }
194
195	/* If doing simple delay then gap == 0, so all packets
196	 * go into the delayed holding queue; otherwise, when doing
197	 * out-of-order delivery, only one packet per cycle is delayed
198	 * (e.g. with gap == 4: four packets go straight through and
199	 *  the fifth is held in the delay queue). */
200 if (q->counter < q->gap) {
201 ++q->counter;
202 ret = q->qdisc->enqueue(skb, q->qdisc);
203 } else {
204 q->counter = 0;
205 ret = delay_skb(sch, skb);
206 }
207
208 if (likely(ret == NET_XMIT_SUCCESS)) {
209 sch->q.qlen++;
210 sch->bstats.bytes += skb->len;
211 sch->bstats.packets++;
212 } else
213 sch->qstats.drops++;
214
215 return ret;
216}
217
218/* Requeue packets but don't change time stamp */
219static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch)
220{
221 struct netem_sched_data *q = qdisc_priv(sch);
222 int ret;
223
224 if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
225 sch->q.qlen++;
226 sch->qstats.requeues++;
227 }
228
229 return ret;
230}
231
232static unsigned int netem_drop(struct Qdisc* sch)
233{
234 struct netem_sched_data *q = qdisc_priv(sch);
235 unsigned int len;
236
237 if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) {
238 sch->q.qlen--;
239 sch->qstats.drops++;
240 }
241 return len;
242}
243
244/* Dequeue packet.
245 *  The watchdog timer moves packets that are ready to send from the
246 *  delay holding list to the inner qdisc; here we just dequeue them.
247 */
248static struct sk_buff *netem_dequeue(struct Qdisc *sch)
249{
250 struct netem_sched_data *q = qdisc_priv(sch);
251 struct sk_buff *skb;
252
253 skb = q->qdisc->dequeue(q->qdisc);
254 if (skb)
255 sch->q.qlen--;
256 return skb;
257}
258
259static void netem_watchdog(unsigned long arg)
260{
261 struct Qdisc *sch = (struct Qdisc *)arg;
262 struct netem_sched_data *q = qdisc_priv(sch);
263 struct net_device *dev = sch->dev;
264 struct sk_buff *skb;
265 psched_time_t now;
266
267 pr_debug("netem_watchdog: fired @%lu\n", jiffies);
268
269 spin_lock_bh(&dev->queue_lock);
270 PSCHED_GET_TIME(now);
271
272 while ((skb = skb_peek(&q->delayed)) != NULL) {
273 const struct netem_skb_cb *cb
274 = (const struct netem_skb_cb *)skb->cb;
275 long delay
276 = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));
277 pr_debug("netem_watchdog: skb %p@%lu %ld\n",
278 skb, jiffies, delay);
279
280		/* is more time remaining? */
281 if (delay > 0) {
282 mod_timer(&q->timer, jiffies + delay);
283 break;
284 }
285 __skb_unlink(skb, &q->delayed);
286
287 if (q->qdisc->enqueue(skb, q->qdisc)) {
288 sch->q.qlen--;
289 sch->qstats.drops++;
290 }
291 }
292 qdisc_run(dev);
293 spin_unlock_bh(&dev->queue_lock);
294}
295
296static void netem_reset(struct Qdisc *sch)
297{
298 struct netem_sched_data *q = qdisc_priv(sch);
299
300 qdisc_reset(q->qdisc);
301 skb_queue_purge(&q->delayed);
302
303 sch->q.qlen = 0;
304 del_timer_sync(&q->timer);
305}
306
307static int set_fifo_limit(struct Qdisc *q, int limit)
308{
309 struct rtattr *rta;
310 int ret = -ENOMEM;
311
312 rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
313 if (rta) {
314 rta->rta_type = RTM_NEWQDISC;
315 rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
316 ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;
317
318 ret = q->ops->change(q, rta);
319 kfree(rta);
320 }
321 return ret;
322}
323
324/*
325 * Distribution data is a variable size payload containing
326 * signed 16 bit values.
327 */
328static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr)
329{
330 struct netem_sched_data *q = qdisc_priv(sch);
331 unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16);
332 const __s16 *data = RTA_DATA(attr);
333 struct disttable *d;
334 int i;
335
336 if (n > 65536)
337 return -EINVAL;
338
339 d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
340 if (!d)
341 return -ENOMEM;
342
343 d->size = n;
344 for (i = 0; i < n; i++)
345 d->table[i] = data[i];
346
347 spin_lock_bh(&sch->dev->queue_lock);
348 d = xchg(&q->delay_dist, d);
349 spin_unlock_bh(&sch->dev->queue_lock);
350
351 kfree(d);
352 return 0;
353}
354
355static int get_correlation(struct Qdisc *sch, const struct rtattr *attr)
356{
357 struct netem_sched_data *q = qdisc_priv(sch);
358 const struct tc_netem_corr *c = RTA_DATA(attr);
359
360 if (RTA_PAYLOAD(attr) != sizeof(*c))
361 return -EINVAL;
362
363 init_crandom(&q->delay_cor, c->delay_corr);
364 init_crandom(&q->loss_cor, c->loss_corr);
365 init_crandom(&q->dup_cor, c->dup_corr);
366 return 0;
367}
368
369static int netem_change(struct Qdisc *sch, struct rtattr *opt)
370{
371 struct netem_sched_data *q = qdisc_priv(sch);
372 struct tc_netem_qopt *qopt;
373 int ret;
374
375 if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
376 return -EINVAL;
377
378 qopt = RTA_DATA(opt);
379 ret = set_fifo_limit(q->qdisc, qopt->limit);
380 if (ret) {
381 pr_debug("netem: can't set fifo limit\n");
382 return ret;
383 }
384
385 q->latency = qopt->latency;
386 q->jitter = qopt->jitter;
387 q->limit = qopt->limit;
388 q->gap = qopt->gap;
389 q->loss = qopt->loss;
390 q->duplicate = qopt->duplicate;
391
392 /* Handle nested options after initial queue options.
393 * Should have put all options in nested format but too late now.
394 */
395 if (RTA_PAYLOAD(opt) > sizeof(*qopt)) {
396 struct rtattr *tb[TCA_NETEM_MAX];
397 if (rtattr_parse(tb, TCA_NETEM_MAX,
398 RTA_DATA(opt) + sizeof(*qopt),
399 RTA_PAYLOAD(opt) - sizeof(*qopt)))
400 return -EINVAL;
401
402 if (tb[TCA_NETEM_CORR-1]) {
403 ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]);
404 if (ret)
405 return ret;
406 }
407
408 if (tb[TCA_NETEM_DELAY_DIST-1]) {
409 ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]);
410 if (ret)
411 return ret;
412 }
413 }
414
415
416 return 0;
417}
418
419static int netem_init(struct Qdisc *sch, struct rtattr *opt)
420{
421 struct netem_sched_data *q = qdisc_priv(sch);
422 int ret;
423
424 if (!opt)
425 return -EINVAL;
426
427 skb_queue_head_init(&q->delayed);
428 init_timer(&q->timer);
429 q->timer.function = netem_watchdog;
430 q->timer.data = (unsigned long) sch;
431 q->counter = 0;
432
433 q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
434 if (!q->qdisc) {
435 pr_debug("netem: qdisc create failed\n");
436 return -ENOMEM;
437 }
438
439 ret = netem_change(sch, opt);
440 if (ret) {
441 pr_debug("netem: change failed\n");
442 qdisc_destroy(q->qdisc);
443 }
444 return ret;
445}
446
447static void netem_destroy(struct Qdisc *sch)
448{
449 struct netem_sched_data *q = qdisc_priv(sch);
450
451 del_timer_sync(&q->timer);
452 qdisc_destroy(q->qdisc);
453 kfree(q->delay_dist);
454}
455
456static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
457{
458 const struct netem_sched_data *q = qdisc_priv(sch);
459 unsigned char *b = skb->tail;
460 struct rtattr *rta = (struct rtattr *) b;
461 struct tc_netem_qopt qopt;
462 struct tc_netem_corr cor;
463
464 qopt.latency = q->latency;
465 qopt.jitter = q->jitter;
466 qopt.limit = q->limit;
467 qopt.loss = q->loss;
468 qopt.gap = q->gap;
469 qopt.duplicate = q->duplicate;
470 RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
471
472 cor.delay_corr = q->delay_cor.rho;
473 cor.loss_corr = q->loss_cor.rho;
474 cor.dup_corr = q->dup_cor.rho;
475 RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
476 rta->rta_len = skb->tail - b;
477
478 return skb->len;
479
480rtattr_failure:
481 skb_trim(skb, b - skb->data);
482 return -1;
483}
484
485static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
486 struct sk_buff *skb, struct tcmsg *tcm)
487{
488 struct netem_sched_data *q = qdisc_priv(sch);
489
490 if (cl != 1) /* only one class */
491 return -ENOENT;
492
493 tcm->tcm_handle |= TC_H_MIN(1);
494 tcm->tcm_info = q->qdisc->handle;
495
496 return 0;
497}
498
499static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
500 struct Qdisc **old)
501{
502 struct netem_sched_data *q = qdisc_priv(sch);
503
504 if (new == NULL)
505 new = &noop_qdisc;
506
507 sch_tree_lock(sch);
508 *old = xchg(&q->qdisc, new);
509 qdisc_reset(*old);
510 sch->q.qlen = 0;
511 sch_tree_unlock(sch);
512
513 return 0;
514}
515
516static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
517{
518 struct netem_sched_data *q = qdisc_priv(sch);
519 return q->qdisc;
520}
521
522static unsigned long netem_get(struct Qdisc *sch, u32 classid)
523{
524 return 1;
525}
526
527static void netem_put(struct Qdisc *sch, unsigned long arg)
528{
529}
530
531static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
532 struct rtattr **tca, unsigned long *arg)
533{
534 return -ENOSYS;
535}
536
537static int netem_delete(struct Qdisc *sch, unsigned long arg)
538{
539 return -ENOSYS;
540}
541
542static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
543{
544 if (!walker->stop) {
545 if (walker->count >= walker->skip)
546 if (walker->fn(sch, 1, walker) < 0) {
547 walker->stop = 1;
548 return;
549 }
550 walker->count++;
551 }
552}
553
554static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl)
555{
556 return NULL;
557}
558
559static struct Qdisc_class_ops netem_class_ops = {
560 .graft = netem_graft,
561 .leaf = netem_leaf,
562 .get = netem_get,
563 .put = netem_put,
564 .change = netem_change_class,
565 .delete = netem_delete,
566 .walk = netem_walk,
567 .tcf_chain = netem_find_tcf,
568 .dump = netem_dump_class,
569};
570
571static struct Qdisc_ops netem_qdisc_ops = {
572 .id = "netem",
573 .cl_ops = &netem_class_ops,
574 .priv_size = sizeof(struct netem_sched_data),
575 .enqueue = netem_enqueue,
576 .dequeue = netem_dequeue,
577 .requeue = netem_requeue,
578 .drop = netem_drop,
579 .init = netem_init,
580 .reset = netem_reset,
581 .destroy = netem_destroy,
582 .change = netem_change,
583 .dump = netem_dump,
584 .owner = THIS_MODULE,
585};
586
587
588static int __init netem_module_init(void)
589{
590 return register_qdisc(&netem_qdisc_ops);
591}
592static void __exit netem_module_exit(void)
593{
594 unregister_qdisc(&netem_qdisc_ops);
595}
596module_init(netem_module_init)
597module_exit(netem_module_exit)
598MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
new file mode 100644
index 000000000000..3ac0f495bad0
--- /dev/null
+++ b/net/sched/sch_prio.c
@@ -0,0 +1,444 @@
1/*
2 * net/sched/sch_prio.c Simple 3-band priority "scheduler".
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 * Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>:
11 * Init -- EINVAL when opt undefined
12 */
13
14#include <linux/config.h>
15#include <linux/module.h>
16#include <asm/uaccess.h>
17#include <asm/system.h>
18#include <linux/bitops.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/sched.h>
22#include <linux/string.h>
23#include <linux/mm.h>
24#include <linux/socket.h>
25#include <linux/sockios.h>
26#include <linux/in.h>
27#include <linux/errno.h>
28#include <linux/interrupt.h>
29#include <linux/if_ether.h>
30#include <linux/inet.h>
31#include <linux/netdevice.h>
32#include <linux/etherdevice.h>
33#include <linux/notifier.h>
34#include <net/ip.h>
35#include <net/route.h>
36#include <linux/skbuff.h>
37#include <net/sock.h>
38#include <net/pkt_sched.h>
39
40
41struct prio_sched_data
42{
43 int bands;
44 struct tcf_proto *filter_list;
45 u8 prio2band[TC_PRIO_MAX+1];
46 struct Qdisc *queues[TCQ_PRIO_BANDS];
47};
48
49
50static struct Qdisc *
51prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
52{
53 struct prio_sched_data *q = qdisc_priv(sch);
54 u32 band = skb->priority;
55 struct tcf_result res;
56
57 *qerr = NET_XMIT_DROP;
58 if (TC_H_MAJ(skb->priority) != sch->handle) {
59#ifdef CONFIG_NET_CLS_ACT
60 switch (tc_classify(skb, q->filter_list, &res)) {
61 case TC_ACT_STOLEN:
62 case TC_ACT_QUEUED:
63 *qerr = NET_XMIT_SUCCESS;
64 case TC_ACT_SHOT:
65 return NULL;
66 };
67
68 if (!q->filter_list ) {
69#else
70 if (!q->filter_list || tc_classify(skb, q->filter_list, &res)) {
71#endif
72 if (TC_H_MAJ(band))
73 band = 0;
74 return q->queues[q->prio2band[band&TC_PRIO_MAX]];
75 }
76 band = res.classid;
77 }
78 band = TC_H_MIN(band) - 1;
79 if (band > q->bands)
80 return q->queues[q->prio2band[0]];
81
82 return q->queues[band];
83}
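/*
 * A minimal sketch of the default band selection above, for illustration
 * only (hypothetical helper and example priomap, not used by this qdisc):
 * when no filter overrides the choice, the band is picked by indexing
 * prio2band[] with the low bits of skb->priority.  With an example priomap
 * of {1,2,2,2,1,2,0,0,1,1,1,1,1,1,1,1}, priority 6 lands in band 0,
 * priority 0 in band 1 and priority 2 in band 2; prio_dequeue() below
 * always serves lower-numbered bands first.
 */
#if 0	/* illustrative only, never compiled */
static int demo_pick_band(const u8 prio2band[TC_PRIO_MAX + 1], u32 priority)
{
	return prio2band[priority & TC_PRIO_MAX];
}
#endif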
84
85static int
86prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
87{
88 struct Qdisc *qdisc;
89 int ret;
90
91 qdisc = prio_classify(skb, sch, &ret);
92#ifdef CONFIG_NET_CLS_ACT
93 if (qdisc == NULL) {
94 if (ret == NET_XMIT_DROP)
95 sch->qstats.drops++;
96 kfree_skb(skb);
97 return ret;
98 }
99#endif
100
101 if ((ret = qdisc->enqueue(skb, qdisc)) == NET_XMIT_SUCCESS) {
102 sch->bstats.bytes += skb->len;
103 sch->bstats.packets++;
104 sch->q.qlen++;
105 return NET_XMIT_SUCCESS;
106 }
107 sch->qstats.drops++;
108 return ret;
109}
110
111
112static int
113prio_requeue(struct sk_buff *skb, struct Qdisc* sch)
114{
115 struct Qdisc *qdisc;
116 int ret;
117
118 qdisc = prio_classify(skb, sch, &ret);
119#ifdef CONFIG_NET_CLS_ACT
120 if (qdisc == NULL) {
121 if (ret == NET_XMIT_DROP)
122 sch->qstats.drops++;
123 kfree_skb(skb);
124 return ret;
125 }
126#endif
127
128 if ((ret = qdisc->ops->requeue(skb, qdisc)) == NET_XMIT_SUCCESS) {
129 sch->q.qlen++;
130 sch->qstats.requeues++;
131 return 0;
132 }
133 sch->qstats.drops++;
134 return NET_XMIT_DROP;
135}
136
137
138static struct sk_buff *
139prio_dequeue(struct Qdisc* sch)
140{
141 struct sk_buff *skb;
142 struct prio_sched_data *q = qdisc_priv(sch);
143 int prio;
144 struct Qdisc *qdisc;
145
146 for (prio = 0; prio < q->bands; prio++) {
147 qdisc = q->queues[prio];
148 skb = qdisc->dequeue(qdisc);
149 if (skb) {
150 sch->q.qlen--;
151 return skb;
152 }
153 }
154 return NULL;
155
156}
157
158static unsigned int prio_drop(struct Qdisc* sch)
159{
160 struct prio_sched_data *q = qdisc_priv(sch);
161 int prio;
162 unsigned int len;
163 struct Qdisc *qdisc;
164
165 for (prio = q->bands-1; prio >= 0; prio--) {
166 qdisc = q->queues[prio];
167 if ((len = qdisc->ops->drop(qdisc)) != 0) {
168 sch->q.qlen--;
169 return len;
170 }
171 }
172 return 0;
173}
174
175
176static void
177prio_reset(struct Qdisc* sch)
178{
179 int prio;
180 struct prio_sched_data *q = qdisc_priv(sch);
181
182 for (prio=0; prio<q->bands; prio++)
183 qdisc_reset(q->queues[prio]);
184 sch->q.qlen = 0;
185}
186
187static void
188prio_destroy(struct Qdisc* sch)
189{
190 int prio;
191 struct prio_sched_data *q = qdisc_priv(sch);
192 struct tcf_proto *tp;
193
194 while ((tp = q->filter_list) != NULL) {
195 q->filter_list = tp->next;
196 tcf_destroy(tp);
197 }
198
199 for (prio=0; prio<q->bands; prio++)
200 qdisc_destroy(q->queues[prio]);
201}
202
203static int prio_tune(struct Qdisc *sch, struct rtattr *opt)
204{
205 struct prio_sched_data *q = qdisc_priv(sch);
206 struct tc_prio_qopt *qopt = RTA_DATA(opt);
207 int i;
208
209 if (opt->rta_len < RTA_LENGTH(sizeof(*qopt)))
210 return -EINVAL;
211 if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
212 return -EINVAL;
213
214 for (i=0; i<=TC_PRIO_MAX; i++) {
215 if (qopt->priomap[i] >= qopt->bands)
216 return -EINVAL;
217 }
218
219 sch_tree_lock(sch);
220 q->bands = qopt->bands;
221 memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
222
223 for (i=q->bands; i<TCQ_PRIO_BANDS; i++) {
224 struct Qdisc *child = xchg(&q->queues[i], &noop_qdisc);
225 if (child != &noop_qdisc)
226 qdisc_destroy(child);
227 }
228 sch_tree_unlock(sch);
229
230 for (i=0; i<=TC_PRIO_MAX; i++) {
231 int band = q->prio2band[i];
232 if (q->queues[band] == &noop_qdisc) {
233 struct Qdisc *child;
234 child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
235 if (child) {
236 sch_tree_lock(sch);
237 child = xchg(&q->queues[band], child);
238
239 if (child != &noop_qdisc)
240 qdisc_destroy(child);
241 sch_tree_unlock(sch);
242 }
243 }
244 }
245 return 0;
246}
247
248static int prio_init(struct Qdisc *sch, struct rtattr *opt)
249{
250 struct prio_sched_data *q = qdisc_priv(sch);
251 int i;
252
253 for (i=0; i<TCQ_PRIO_BANDS; i++)
254 q->queues[i] = &noop_qdisc;
255
256 if (opt == NULL) {
257 return -EINVAL;
258 } else {
259 int err;
260
261 if ((err= prio_tune(sch, opt)) != 0)
262 return err;
263 }
264 return 0;
265}
266
267static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
268{
269 struct prio_sched_data *q = qdisc_priv(sch);
270 unsigned char *b = skb->tail;
271 struct tc_prio_qopt opt;
272
273 opt.bands = q->bands;
274 memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1);
275 RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
276 return skb->len;
277
278rtattr_failure:
279 skb_trim(skb, b - skb->data);
280 return -1;
281}
282
283static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
284 struct Qdisc **old)
285{
286 struct prio_sched_data *q = qdisc_priv(sch);
287 unsigned long band = arg - 1;
288
289 if (band >= q->bands)
290 return -EINVAL;
291
292 if (new == NULL)
293 new = &noop_qdisc;
294
295 sch_tree_lock(sch);
296 *old = q->queues[band];
297 q->queues[band] = new;
298 sch->q.qlen -= (*old)->q.qlen;
299 qdisc_reset(*old);
300 sch_tree_unlock(sch);
301
302 return 0;
303}
304
305static struct Qdisc *
306prio_leaf(struct Qdisc *sch, unsigned long arg)
307{
308 struct prio_sched_data *q = qdisc_priv(sch);
309 unsigned long band = arg - 1;
310
311 if (band >= q->bands)
312 return NULL;
313
314 return q->queues[band];
315}
316
317static unsigned long prio_get(struct Qdisc *sch, u32 classid)
318{
319 struct prio_sched_data *q = qdisc_priv(sch);
320 unsigned long band = TC_H_MIN(classid);
321
322 if (band - 1 >= q->bands)
323 return 0;
324 return band;
325}
326
327static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 classid)
328{
329 return prio_get(sch, classid);
330}
331
332
333static void prio_put(struct Qdisc *q, unsigned long cl)
334{
335 return;
336}
337
338static int prio_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg)
339{
340 unsigned long cl = *arg;
341 struct prio_sched_data *q = qdisc_priv(sch);
342
343 if (cl - 1 > q->bands)
344 return -ENOENT;
345 return 0;
346}
347
348static int prio_delete(struct Qdisc *sch, unsigned long cl)
349{
350 struct prio_sched_data *q = qdisc_priv(sch);
351 if (cl - 1 > q->bands)
352 return -ENOENT;
353 return 0;
354}
355
356
357static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb,
358 struct tcmsg *tcm)
359{
360 struct prio_sched_data *q = qdisc_priv(sch);
361
362 if (cl - 1 > q->bands)
363 return -ENOENT;
364 tcm->tcm_handle |= TC_H_MIN(cl);
365 if (q->queues[cl-1])
366 tcm->tcm_info = q->queues[cl-1]->handle;
367 return 0;
368}
369
370static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
371{
372 struct prio_sched_data *q = qdisc_priv(sch);
373 int prio;
374
375 if (arg->stop)
376 return;
377
378 for (prio = 0; prio < q->bands; prio++) {
379 if (arg->count < arg->skip) {
380 arg->count++;
381 continue;
382 }
383 if (arg->fn(sch, prio+1, arg) < 0) {
384 arg->stop = 1;
385 break;
386 }
387 arg->count++;
388 }
389}
390
391static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl)
392{
393 struct prio_sched_data *q = qdisc_priv(sch);
394
395 if (cl)
396 return NULL;
397 return &q->filter_list;
398}
399
400static struct Qdisc_class_ops prio_class_ops = {
401 .graft = prio_graft,
402 .leaf = prio_leaf,
403 .get = prio_get,
404 .put = prio_put,
405 .change = prio_change,
406 .delete = prio_delete,
407 .walk = prio_walk,
408 .tcf_chain = prio_find_tcf,
409 .bind_tcf = prio_bind,
410 .unbind_tcf = prio_put,
411 .dump = prio_dump_class,
412};
413
414static struct Qdisc_ops prio_qdisc_ops = {
415 .next = NULL,
416 .cl_ops = &prio_class_ops,
417 .id = "prio",
418 .priv_size = sizeof(struct prio_sched_data),
419 .enqueue = prio_enqueue,
420 .dequeue = prio_dequeue,
421 .requeue = prio_requeue,
422 .drop = prio_drop,
423 .init = prio_init,
424 .reset = prio_reset,
425 .destroy = prio_destroy,
426 .change = prio_tune,
427 .dump = prio_dump,
428 .owner = THIS_MODULE,
429};
430
431static int __init prio_module_init(void)
432{
433 return register_qdisc(&prio_qdisc_ops);
434}
435
436static void __exit prio_module_exit(void)
437{
438 unregister_qdisc(&prio_qdisc_ops);
439}
440
441module_init(prio_module_init)
442module_exit(prio_module_exit)
443
444MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
new file mode 100644
index 000000000000..664d0e47374f
--- /dev/null
+++ b/net/sched/sch_red.c
@@ -0,0 +1,459 @@
1/*
2 * net/sched/sch_red.c Random Early Detection queue.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Changes:
12 * J Hadi Salim <hadi@nortel.com> 980914: computation fixes
13 * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
14 * J Hadi Salim <hadi@nortelnetworks.com> 980816: ECN support
15 */
16
17#include <linux/config.h>
18#include <linux/module.h>
19#include <asm/uaccess.h>
20#include <asm/system.h>
21#include <linux/bitops.h>
22#include <linux/types.h>
23#include <linux/kernel.h>
24#include <linux/sched.h>
25#include <linux/string.h>
26#include <linux/mm.h>
27#include <linux/socket.h>
28#include <linux/sockios.h>
29#include <linux/in.h>
30#include <linux/errno.h>
31#include <linux/interrupt.h>
32#include <linux/if_ether.h>
33#include <linux/inet.h>
34#include <linux/netdevice.h>
35#include <linux/etherdevice.h>
36#include <linux/notifier.h>
37#include <net/ip.h>
38#include <net/route.h>
39#include <linux/skbuff.h>
40#include <net/sock.h>
41#include <net/pkt_sched.h>
42#include <net/inet_ecn.h>
43#include <net/dsfield.h>
44
45
46/* Random Early Detection (RED) algorithm.
47 =======================================
48
49 Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
50 for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
51
52 This file codes a "divisionless" version of RED algorithm
53 as written down in Fig.17 of the paper.
54
55Short description.
56------------------
57
58 When a new packet arrives we calculate the average queue length:
59
60 avg = (1-W)*avg + W*current_queue_len,
61
 62 	W is the filter time constant (chosen as 2^(-Wlog)); it controls
63 the inertia of the algorithm. To allow larger bursts, W should be
64 decreased.
65
66 if (avg > th_max) -> packet marked (dropped).
67 if (avg < th_min) -> packet passes.
68 if (th_min < avg < th_max) we calculate probability:
69
70 Pb = max_P * (avg - th_min)/(th_max-th_min)
71
 72 	and mark (drop) the packet with this probability.
 73 	Pb changes from 0 (at avg==th_min) to max_P (at avg==th_max).
 74 	max_P should be small (not 1); usually 0.01..0.02 is a good value.
 75
 76 	max_P is chosen so that max_P/(th_max-th_min) is a negative
 77 	power of two, so that the arithmetic needs only shifts and
 78 	no divisions.
79
80
81 Parameters, settable by user:
82 -----------------------------
83
84 limit - bytes (must be > qth_max + burst)
85
86 Hard limit on queue length, should be chosen >qth_max
87 to allow packet bursts. This parameter does not
 88 	affect the algorithm's behaviour and can be chosen
 89 	arbitrarily high (well, less than RAM size).
90 Really, this limit will never be reached
91 if RED works correctly.
92
93 qth_min - bytes (should be < qth_max/2)
 94 	qth_max - bytes (should be at least 2*qth_min and less than limit)
95 Wlog - bits (<32) log(1/W).
96 Plog - bits (<32)
97
98 Plog is related to max_P by formula:
99
100 max_P = (qth_max-qth_min)/2^Plog;
101
102 F.e. if qth_max=128K and qth_min=32K, then Plog=22
103 corresponds to max_P=0.02
104
105 Scell_log
106 Stab
107
 108 	Lookup table for log((1-W)^(t/t_ave)).
109
110
111NOTES:
112
113Upper bound on W.
114-----------------
115
116 If you want to allow bursts of L packets of size S,
117 you should choose W:
118
119 L + 1 - th_min/S < (1-(1-W)^L)/W
120
121 th_min/S = 32 th_min/S = 4
122
123 log(W) L
124 -1 33
125 -2 35
126 -3 39
127 -4 46
128 -5 57
129 -6 75
130 -7 101
131 -8 135
132 -9 190
133 etc.
134 */
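/*
 * A worked example of the divisionless average used in red_enqueue() below,
 * for illustration only (hypothetical helper, not part of the qdisc): qave
 * is kept scaled by 2^Wlog, so one step of
 *	qave += backlog - (qave >> Wlog)
 * with Wlog = 9, qave = 512000 (i.e. avg = 1000 bytes) and backlog = 2000
 * gives 512000 + 2000 - 1000 = 513000, which is avg*(1 - 1/512) +
 * 2000*(1/512) ~= 1001.95 scaled back up by 512 -- no division needed.
 */
#if 0	/* illustrative only, never compiled */
static unsigned long red_avg_step(unsigned long qave, unsigned long backlog,
				  int Wlog)
{
	return qave + backlog - (qave >> Wlog);
}
#endif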
135
136struct red_sched_data
137{
138/* Parameters */
139 u32 limit; /* HARD maximal queue length */
140 u32 qth_min; /* Min average length threshold: A scaled */
141 u32 qth_max; /* Max average length threshold: A scaled */
142 u32 Rmask;
143 u32 Scell_max;
144 unsigned char flags;
145 char Wlog; /* log(W) */
146 char Plog; /* random number bits */
147 char Scell_log;
148 u8 Stab[256];
149
150/* Variables */
151 unsigned long qave; /* Average queue length: A scaled */
152 int qcount; /* Packets since last random number generation */
153 u32 qR; /* Cached random number */
154
155 psched_time_t qidlestart; /* Start of idle period */
156 struct tc_red_xstats st;
157};
158
159static int red_ecn_mark(struct sk_buff *skb)
160{
161 if (skb->nh.raw + 20 > skb->tail)
162 return 0;
163
164 switch (skb->protocol) {
165 case __constant_htons(ETH_P_IP):
166 if (INET_ECN_is_not_ect(skb->nh.iph->tos))
167 return 0;
168 IP_ECN_set_ce(skb->nh.iph);
169 return 1;
170 case __constant_htons(ETH_P_IPV6):
171 if (INET_ECN_is_not_ect(ipv6_get_dsfield(skb->nh.ipv6h)))
172 return 0;
173 IP6_ECN_set_ce(skb->nh.ipv6h);
174 return 1;
175 default:
176 return 0;
177 }
178}
179
180static int
181red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
182{
183 struct red_sched_data *q = qdisc_priv(sch);
184
185 psched_time_t now;
186
187 if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
188 long us_idle;
189 int shift;
190
191 PSCHED_GET_TIME(now);
192 us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
193 PSCHED_SET_PASTPERFECT(q->qidlestart);
194
195/*
196   The problem: ideally, average queue length recalculation should
197   be done over constant clock intervals. This is too expensive, so
198   the calculation is driven by outgoing packets instead.
199 When the queue is idle we have to model this clock by hand.
200
201 SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
202 dummy packets as a burst after idle time, i.e.
203
204 q->qave *= (1-W)^m
205
206   This is an apparently overcomplicated solution (f.e. we have to precompute
207   a table to make this calculation in reasonable time).
208   I believe that a simpler model may be used here,
209   but it is a field for experiments.
210*/
211 shift = q->Stab[us_idle>>q->Scell_log];
212
213 if (shift) {
214 q->qave >>= shift;
215 } else {
216 /* Approximate initial part of exponent
217 with linear function:
218 (1-W)^m ~= 1-mW + ...
219
220			   This seems to be the best solution to the
221			   problem of too coarse an exponent tabulation.
222 */
223
224 us_idle = (q->qave * us_idle)>>q->Scell_log;
225 if (us_idle < q->qave/2)
226 q->qave -= us_idle;
227 else
228 q->qave >>= 1;
229 }
230 } else {
231 q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
232 /* NOTE:
233		   q->qave is a fixed point number with the point at Wlog.
234		   The formula above is equivalent to the floating point
235		   version:
236
237 qave = qave*(1-W) + sch->qstats.backlog*W;
238 --ANK (980924)
239 */
240 }
241
242 if (q->qave < q->qth_min) {
243 q->qcount = -1;
244enqueue:
245 if (sch->qstats.backlog + skb->len <= q->limit) {
246 __skb_queue_tail(&sch->q, skb);
247 sch->qstats.backlog += skb->len;
248 sch->bstats.bytes += skb->len;
249 sch->bstats.packets++;
250 return NET_XMIT_SUCCESS;
251 } else {
252 q->st.pdrop++;
253 }
254 kfree_skb(skb);
255 sch->qstats.drops++;
256 return NET_XMIT_DROP;
257 }
258 if (q->qave >= q->qth_max) {
259 q->qcount = -1;
260 sch->qstats.overlimits++;
261mark:
262 if (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) {
263 q->st.early++;
264 goto drop;
265 }
266 q->st.marked++;
267 goto enqueue;
268 }
269
270 if (++q->qcount) {
271 /* The formula used below causes questions.
272
273		   OK. qR is a random number in the interval 0..Rmask,
274		   i.e. 0..(2^Plog - 1). If we used floating point
275		   arithmetic, it would be (2^Plog)*rnd_num,
276		   where rnd_num is less than 1.
277
278		   Taking into account that qave has its fixed
279		   point at Wlog, and that Plog is related to max_P by
280		   max_P = (qth_max-qth_min)/2^Plog, the two lines
281		   below have the following floating point equivalent:
282
283 max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount
284
285 Any questions? --ANK (980924)
286 */
287 if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
288 goto enqueue;
289 q->qcount = 0;
290 q->qR = net_random()&q->Rmask;
291 sch->qstats.overlimits++;
292 goto mark;
293 }
294 q->qR = net_random()&q->Rmask;
295 goto enqueue;
296
297drop:
298 kfree_skb(skb);
299 sch->qstats.drops++;
300 return NET_XMIT_CN;
301}
302
303static int
304red_requeue(struct sk_buff *skb, struct Qdisc* sch)
305{
306 struct red_sched_data *q = qdisc_priv(sch);
307
308 PSCHED_SET_PASTPERFECT(q->qidlestart);
309
310 __skb_queue_head(&sch->q, skb);
311 sch->qstats.backlog += skb->len;
312 sch->qstats.requeues++;
313 return 0;
314}
315
316static struct sk_buff *
317red_dequeue(struct Qdisc* sch)
318{
319 struct sk_buff *skb;
320 struct red_sched_data *q = qdisc_priv(sch);
321
322 skb = __skb_dequeue(&sch->q);
323 if (skb) {
324 sch->qstats.backlog -= skb->len;
325 return skb;
326 }
327 PSCHED_GET_TIME(q->qidlestart);
328 return NULL;
329}
330
331static unsigned int red_drop(struct Qdisc* sch)
332{
333 struct sk_buff *skb;
334 struct red_sched_data *q = qdisc_priv(sch);
335
336 skb = __skb_dequeue_tail(&sch->q);
337 if (skb) {
338 unsigned int len = skb->len;
339 sch->qstats.backlog -= len;
340 sch->qstats.drops++;
341 q->st.other++;
342 kfree_skb(skb);
343 return len;
344 }
345 PSCHED_GET_TIME(q->qidlestart);
346 return 0;
347}
348
349static void red_reset(struct Qdisc* sch)
350{
351 struct red_sched_data *q = qdisc_priv(sch);
352
353 __skb_queue_purge(&sch->q);
354 sch->qstats.backlog = 0;
355 PSCHED_SET_PASTPERFECT(q->qidlestart);
356 q->qave = 0;
357 q->qcount = -1;
358}
359
360static int red_change(struct Qdisc *sch, struct rtattr *opt)
361{
362 struct red_sched_data *q = qdisc_priv(sch);
363 struct rtattr *tb[TCA_RED_STAB];
364 struct tc_red_qopt *ctl;
365
366 if (opt == NULL ||
367 rtattr_parse_nested(tb, TCA_RED_STAB, opt) ||
368 tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 ||
369 RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) ||
370 RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256)
371 return -EINVAL;
372
373 ctl = RTA_DATA(tb[TCA_RED_PARMS-1]);
374
375 sch_tree_lock(sch);
376 q->flags = ctl->flags;
377 q->Wlog = ctl->Wlog;
378 q->Plog = ctl->Plog;
379 q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
380 q->Scell_log = ctl->Scell_log;
381 q->Scell_max = (255<<q->Scell_log);
382 q->qth_min = ctl->qth_min<<ctl->Wlog;
383 q->qth_max = ctl->qth_max<<ctl->Wlog;
384 q->limit = ctl->limit;
385 memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
386
387 q->qcount = -1;
388 if (skb_queue_len(&sch->q) == 0)
389 PSCHED_SET_PASTPERFECT(q->qidlestart);
390 sch_tree_unlock(sch);
391 return 0;
392}
393
394static int red_init(struct Qdisc* sch, struct rtattr *opt)
395{
396 return red_change(sch, opt);
397}
398
399static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
400{
401 struct red_sched_data *q = qdisc_priv(sch);
402 unsigned char *b = skb->tail;
403 struct rtattr *rta;
404 struct tc_red_qopt opt;
405
406 rta = (struct rtattr*)b;
407 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
408 opt.limit = q->limit;
409 opt.qth_min = q->qth_min>>q->Wlog;
410 opt.qth_max = q->qth_max>>q->Wlog;
411 opt.Wlog = q->Wlog;
412 opt.Plog = q->Plog;
413 opt.Scell_log = q->Scell_log;
414 opt.flags = q->flags;
415 RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
416 rta->rta_len = skb->tail - b;
417
418 return skb->len;
419
420rtattr_failure:
421 skb_trim(skb, b - skb->data);
422 return -1;
423}
424
425static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
426{
427 struct red_sched_data *q = qdisc_priv(sch);
428
429 return gnet_stats_copy_app(d, &q->st, sizeof(q->st));
430}
431
432static struct Qdisc_ops red_qdisc_ops = {
433 .next = NULL,
434 .cl_ops = NULL,
435 .id = "red",
436 .priv_size = sizeof(struct red_sched_data),
437 .enqueue = red_enqueue,
438 .dequeue = red_dequeue,
439 .requeue = red_requeue,
440 .drop = red_drop,
441 .init = red_init,
442 .reset = red_reset,
443 .change = red_change,
444 .dump = red_dump,
445 .dump_stats = red_dump_stats,
446 .owner = THIS_MODULE,
447};
448
449static int __init red_module_init(void)
450{
451 return register_qdisc(&red_qdisc_ops);
452}
453static void __exit red_module_exit(void)
454{
455 unregister_qdisc(&red_qdisc_ops);
456}
457module_init(red_module_init)
458module_exit(red_module_exit)
459MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
new file mode 100644
index 000000000000..8734bb7280e3
--- /dev/null
+++ b/net/sched/sch_sfq.c
@@ -0,0 +1,497 @@
1/*
2 * net/sched/sch_sfq.c Stochastic Fairness Queueing discipline.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 */
11
12#include <linux/config.h>
13#include <linux/module.h>
14#include <asm/uaccess.h>
15#include <asm/system.h>
16#include <linux/bitops.h>
17#include <linux/types.h>
18#include <linux/kernel.h>
19#include <linux/jiffies.h>
20#include <linux/string.h>
21#include <linux/mm.h>
22#include <linux/socket.h>
23#include <linux/sockios.h>
24#include <linux/in.h>
25#include <linux/errno.h>
26#include <linux/interrupt.h>
27#include <linux/if_ether.h>
28#include <linux/inet.h>
29#include <linux/netdevice.h>
30#include <linux/etherdevice.h>
31#include <linux/notifier.h>
32#include <linux/init.h>
33#include <net/ip.h>
34#include <linux/ipv6.h>
35#include <net/route.h>
36#include <linux/skbuff.h>
37#include <net/sock.h>
38#include <net/pkt_sched.h>
39
40
41/* Stochastic Fairness Queuing algorithm.
42 =======================================
43
44 Source:
45 Paul E. McKenney "Stochastic Fairness Queuing",
46 IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
47
48 Paul E. McKenney "Stochastic Fairness Queuing",
49 "Interworking: Research and Experience", v.2, 1991, p.113-131.
50
51
52 See also:
53 M. Shreedhar and George Varghese "Efficient Fair
54 Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
55
56
57 This is not the thing that is usually called (W)FQ nowadays.
58 It does not use any timestamp mechanism, but instead
59 processes queues in round-robin order.
60
61 ADVANTAGE:
62
63 - It is very cheap. Both CPU and memory requirements are minimal.
64
65 DRAWBACKS:
66
67 - "Stochastic" -> It is not 100% fair.
68 When hash collisions occur, several flows are considered as one.
69
70 - "Round-robin" -> It introduces larger delays than virtual clock
71 based schemes, and should not be used for isolating interactive
 72 	traffic from non-interactive. This means that this scheduler
 73 	should be used as a leaf of CBQ or P3, which puts interactive
 74 	traffic into a higher priority band.
75
76 We still need true WFQ for top level CSZ, but using WFQ
77 for the best effort traffic is absolutely pointless:
78 SFQ is superior for this purpose.
79
80 IMPLEMENTATION:
81 This implementation limits maximal queue length to 128;
82 maximal mtu to 2^15-1; number of hash buckets to 1024.
 83 	The only goal of these restrictions was that all the data
 84 	fit into one 4K page :-). Struct sfq_sched_data is
 85 	organized in an anti-cache manner: all the data for a bucket
 86 	are scattered over different locations. This is not good,
87 but it allowed me to put it into 4K.
88
89 It is easy to increase these values, but not in flight. */
90
91#define SFQ_DEPTH 128
92#define SFQ_HASH_DIVISOR 1024
93
94/* This type should contain at least SFQ_DEPTH*2 values */
95typedef unsigned char sfq_index;
96
97struct sfq_head
98{
99 sfq_index next;
100 sfq_index prev;
101};
102
103struct sfq_sched_data
104{
105/* Parameters */
106 int perturb_period;
107 unsigned quantum; /* Allotment per round: MUST BE >= MTU */
108 int limit;
109
110/* Variables */
111 struct timer_list perturb_timer;
112 int perturbation;
113 sfq_index tail; /* Index of current slot in round */
114 sfq_index max_depth; /* Maximal depth */
115
116 sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */
117 sfq_index next[SFQ_DEPTH]; /* Active slots link */
118 short allot[SFQ_DEPTH]; /* Current allotment per slot */
119 unsigned short hash[SFQ_DEPTH]; /* Hash value indexed by slots */
120 struct sk_buff_head qs[SFQ_DEPTH]; /* Slot queue */
121 struct sfq_head dep[SFQ_DEPTH*2]; /* Linked list of slots, indexed by depth */
122};
123
124static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
125{
126 int pert = q->perturbation;
127
128 /* Have we any rotation primitives? If not, WHY? */
129 h ^= (h1<<pert) ^ (h1>>(0x1F - pert));
130 h ^= h>>10;
131 return h & 0x3FF;
132}
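/*
 * A standalone copy of the fold above, for illustration only (hypothetical
 * helper, not used by this qdisc): the result is always a bucket index in
 * 0..SFQ_HASH_DIVISOR-1, so distinct flows whose keys collide share one
 * slot -- the "stochastic" caveat from the header comment.  For example,
 * h = 0, h1 = 1 maps to bucket 32 with pert = 5 and to bucket 64 with
 * pert = 6, which is how re-perturbation reshuffles colliding flows.
 */
#if 0	/* illustrative only, never compiled */
static unsigned demo_fold(u32 h, u32 h1, int pert)
{
	h ^= (h1 << pert) ^ (h1 >> (0x1F - pert));
	h ^= h >> 10;
	return h & 0x3FF;	/* 10 bits -> 1024 buckets */
}
#endif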
133
134static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
135{
136 u32 h, h2;
137
138 switch (skb->protocol) {
139 case __constant_htons(ETH_P_IP):
140 {
141 struct iphdr *iph = skb->nh.iph;
142 h = iph->daddr;
143 h2 = iph->saddr^iph->protocol;
144 if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
145 (iph->protocol == IPPROTO_TCP ||
146 iph->protocol == IPPROTO_UDP ||
147 iph->protocol == IPPROTO_ESP))
148 h2 ^= *(((u32*)iph) + iph->ihl);
149 break;
150 }
151 case __constant_htons(ETH_P_IPV6):
152 {
153 struct ipv6hdr *iph = skb->nh.ipv6h;
154 h = iph->daddr.s6_addr32[3];
155 h2 = iph->saddr.s6_addr32[3]^iph->nexthdr;
156 if (iph->nexthdr == IPPROTO_TCP ||
157 iph->nexthdr == IPPROTO_UDP ||
158 iph->nexthdr == IPPROTO_ESP)
159 h2 ^= *(u32*)&iph[1];
160 break;
161 }
162 default:
163 h = (u32)(unsigned long)skb->dst^skb->protocol;
164 h2 = (u32)(unsigned long)skb->sk;
165 }
166 return sfq_fold_hash(q, h, h2);
167}
168
169static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
170{
171 sfq_index p, n;
172 int d = q->qs[x].qlen + SFQ_DEPTH;
173
174 p = d;
175 n = q->dep[d].next;
176 q->dep[x].next = n;
177 q->dep[x].prev = p;
178 q->dep[p].next = q->dep[n].prev = x;
179}
180
181static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
182{
183 sfq_index p, n;
184
185 n = q->dep[x].next;
186 p = q->dep[x].prev;
187 q->dep[p].next = n;
188 q->dep[n].prev = p;
189
190 if (n == p && q->max_depth == q->qs[x].qlen + 1)
191 q->max_depth--;
192
193 sfq_link(q, x);
194}
195
196static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
197{
198 sfq_index p, n;
199 int d;
200
201 n = q->dep[x].next;
202 p = q->dep[x].prev;
203 q->dep[p].next = n;
204 q->dep[n].prev = p;
205 d = q->qs[x].qlen;
206 if (q->max_depth < d)
207 q->max_depth = d;
208
209 sfq_link(q, x);
210}
211
212static unsigned int sfq_drop(struct Qdisc *sch)
213{
214 struct sfq_sched_data *q = qdisc_priv(sch);
215 sfq_index d = q->max_depth;
216 struct sk_buff *skb;
217 unsigned int len;
218
219 /* Queue is full! Find the longest slot and
220 drop a packet from it */
221
222 if (d > 1) {
223 sfq_index x = q->dep[d+SFQ_DEPTH].next;
224 skb = q->qs[x].prev;
225 len = skb->len;
226 __skb_unlink(skb, &q->qs[x]);
227 kfree_skb(skb);
228 sfq_dec(q, x);
229 sch->q.qlen--;
230 sch->qstats.drops++;
231 return len;
232 }
233
234 if (d == 1) {
235 /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
236 d = q->next[q->tail];
237 q->next[q->tail] = q->next[d];
238 q->allot[q->next[d]] += q->quantum;
239 skb = q->qs[d].prev;
240 len = skb->len;
241 __skb_unlink(skb, &q->qs[d]);
242 kfree_skb(skb);
243 sfq_dec(q, d);
244 sch->q.qlen--;
245 q->ht[q->hash[d]] = SFQ_DEPTH;
246 sch->qstats.drops++;
247 return len;
248 }
249
250 return 0;
251}
252
253static int
254sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch)
255{
256 struct sfq_sched_data *q = qdisc_priv(sch);
257 unsigned hash = sfq_hash(q, skb);
258 sfq_index x;
259
260 x = q->ht[hash];
261 if (x == SFQ_DEPTH) {
262 q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
263 q->hash[x] = hash;
264 }
265 __skb_queue_tail(&q->qs[x], skb);
266 sfq_inc(q, x);
267 if (q->qs[x].qlen == 1) { /* The flow is new */
268 if (q->tail == SFQ_DEPTH) { /* It is the first flow */
269 q->tail = x;
270 q->next[x] = x;
271 q->allot[x] = q->quantum;
272 } else {
273 q->next[x] = q->next[q->tail];
274 q->next[q->tail] = x;
275 q->tail = x;
276 }
277 }
278 if (++sch->q.qlen < q->limit-1) {
279 sch->bstats.bytes += skb->len;
280 sch->bstats.packets++;
281 return 0;
282 }
283
284 sfq_drop(sch);
285 return NET_XMIT_CN;
286}
287
288static int
289sfq_requeue(struct sk_buff *skb, struct Qdisc* sch)
290{
291 struct sfq_sched_data *q = qdisc_priv(sch);
292 unsigned hash = sfq_hash(q, skb);
293 sfq_index x;
294
295 x = q->ht[hash];
296 if (x == SFQ_DEPTH) {
297 q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
298 q->hash[x] = hash;
299 }
300 __skb_queue_head(&q->qs[x], skb);
301 sfq_inc(q, x);
302 if (q->qs[x].qlen == 1) { /* The flow is new */
303 if (q->tail == SFQ_DEPTH) { /* It is the first flow */
304 q->tail = x;
305 q->next[x] = x;
306 q->allot[x] = q->quantum;
307 } else {
308 q->next[x] = q->next[q->tail];
309 q->next[q->tail] = x;
310 q->tail = x;
311 }
312 }
313 if (++sch->q.qlen < q->limit - 1) {
314 sch->qstats.requeues++;
315 return 0;
316 }
317
318 sch->qstats.drops++;
319 sfq_drop(sch);
320 return NET_XMIT_CN;
321}
322
323
324
325
326static struct sk_buff *
327sfq_dequeue(struct Qdisc* sch)
328{
329 struct sfq_sched_data *q = qdisc_priv(sch);
330 struct sk_buff *skb;
331 sfq_index a, old_a;
332
333 /* No active slots */
334 if (q->tail == SFQ_DEPTH)
335 return NULL;
336
337 a = old_a = q->next[q->tail];
338
339 /* Grab packet */
340 skb = __skb_dequeue(&q->qs[a]);
341 sfq_dec(q, a);
342 sch->q.qlen--;
343
344 /* Is the slot empty? */
345 if (q->qs[a].qlen == 0) {
346 q->ht[q->hash[a]] = SFQ_DEPTH;
347 a = q->next[a];
348 if (a == old_a) {
349 q->tail = SFQ_DEPTH;
350 return skb;
351 }
352 q->next[q->tail] = a;
353 q->allot[a] += q->quantum;
354 } else if ((q->allot[a] -= skb->len) <= 0) {
355 q->tail = a;
356 a = q->next[a];
357 q->allot[a] += q->quantum;
358 }
359 return skb;
360}
361
362static void
363sfq_reset(struct Qdisc* sch)
364{
365 struct sk_buff *skb;
366
367 while ((skb = sfq_dequeue(sch)) != NULL)
368 kfree_skb(skb);
369}
370
371static void sfq_perturbation(unsigned long arg)
372{
373 struct Qdisc *sch = (struct Qdisc*)arg;
374 struct sfq_sched_data *q = qdisc_priv(sch);
375
376 q->perturbation = net_random()&0x1F;
377
378 if (q->perturb_period) {
379 q->perturb_timer.expires = jiffies + q->perturb_period;
380 add_timer(&q->perturb_timer);
381 }
382}
383
384static int sfq_change(struct Qdisc *sch, struct rtattr *opt)
385{
386 struct sfq_sched_data *q = qdisc_priv(sch);
387 struct tc_sfq_qopt *ctl = RTA_DATA(opt);
388
389 if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
390 return -EINVAL;
391
392 sch_tree_lock(sch);
393 q->quantum = ctl->quantum ? : psched_mtu(sch->dev);
394 q->perturb_period = ctl->perturb_period*HZ;
395 if (ctl->limit)
396 q->limit = min_t(u32, ctl->limit, SFQ_DEPTH);
397
398 while (sch->q.qlen >= q->limit-1)
399 sfq_drop(sch);
400
401 del_timer(&q->perturb_timer);
402 if (q->perturb_period) {
403 q->perturb_timer.expires = jiffies + q->perturb_period;
404 add_timer(&q->perturb_timer);
405 }
406 sch_tree_unlock(sch);
407 return 0;
408}
409
410static int sfq_init(struct Qdisc *sch, struct rtattr *opt)
411{
412 struct sfq_sched_data *q = qdisc_priv(sch);
413 int i;
414
415 init_timer(&q->perturb_timer);
416 q->perturb_timer.data = (unsigned long)sch;
417 q->perturb_timer.function = sfq_perturbation;
418
419 for (i=0; i<SFQ_HASH_DIVISOR; i++)
420 q->ht[i] = SFQ_DEPTH;
421 for (i=0; i<SFQ_DEPTH; i++) {
422 skb_queue_head_init(&q->qs[i]);
423 q->dep[i+SFQ_DEPTH].next = i+SFQ_DEPTH;
424 q->dep[i+SFQ_DEPTH].prev = i+SFQ_DEPTH;
425 }
426 q->limit = SFQ_DEPTH;
427 q->max_depth = 0;
428 q->tail = SFQ_DEPTH;
429 if (opt == NULL) {
430 q->quantum = psched_mtu(sch->dev);
431 q->perturb_period = 0;
432 } else {
433 int err = sfq_change(sch, opt);
434 if (err)
435 return err;
436 }
437 for (i=0; i<SFQ_DEPTH; i++)
438 sfq_link(q, i);
439 return 0;
440}
441
442static void sfq_destroy(struct Qdisc *sch)
443{
444 struct sfq_sched_data *q = qdisc_priv(sch);
445 del_timer(&q->perturb_timer);
446}
447
448static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
449{
450 struct sfq_sched_data *q = qdisc_priv(sch);
451 unsigned char *b = skb->tail;
452 struct tc_sfq_qopt opt;
453
454 opt.quantum = q->quantum;
455 opt.perturb_period = q->perturb_period/HZ;
456
457 opt.limit = q->limit;
458 opt.divisor = SFQ_HASH_DIVISOR;
459 opt.flows = q->limit;
460
461 RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
462
463 return skb->len;
464
465rtattr_failure:
466 skb_trim(skb, b - skb->data);
467 return -1;
468}
469
470static struct Qdisc_ops sfq_qdisc_ops = {
471 .next = NULL,
472 .cl_ops = NULL,
473 .id = "sfq",
474 .priv_size = sizeof(struct sfq_sched_data),
475 .enqueue = sfq_enqueue,
476 .dequeue = sfq_dequeue,
477 .requeue = sfq_requeue,
478 .drop = sfq_drop,
479 .init = sfq_init,
480 .reset = sfq_reset,
481 .destroy = sfq_destroy,
482 .change = NULL,
483 .dump = sfq_dump,
484 .owner = THIS_MODULE,
485};
486
487static int __init sfq_module_init(void)
488{
489 return register_qdisc(&sfq_qdisc_ops);
490}
491static void __exit sfq_module_exit(void)
492{
493 unregister_qdisc(&sfq_qdisc_ops);
494}
495module_init(sfq_module_init)
496module_exit(sfq_module_exit)
497MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
new file mode 100644
index 000000000000..cb9711ea8c6c
--- /dev/null
+++ b/net/sched/sch_tbf.c
@@ -0,0 +1,543 @@
1/*
2 * net/sched/sch_tbf.c Token Bucket Filter queue.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 * Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
11 * original idea by Martin Devera
12 *
13 */
14
15#include <linux/config.h>
16#include <linux/module.h>
17#include <asm/uaccess.h>
18#include <asm/system.h>
19#include <linux/bitops.h>
20#include <linux/types.h>
21#include <linux/kernel.h>
22#include <linux/jiffies.h>
23#include <linux/string.h>
24#include <linux/mm.h>
25#include <linux/socket.h>
26#include <linux/sockios.h>
27#include <linux/in.h>
28#include <linux/errno.h>
29#include <linux/interrupt.h>
30#include <linux/if_ether.h>
31#include <linux/inet.h>
32#include <linux/netdevice.h>
33#include <linux/etherdevice.h>
34#include <linux/notifier.h>
35#include <net/ip.h>
36#include <net/route.h>
37#include <linux/skbuff.h>
38#include <net/sock.h>
39#include <net/pkt_sched.h>
40
41
42/* Simple Token Bucket Filter.
43 =======================================
44
45 SOURCE.
46 -------
47
48 None.
49
50 Description.
51 ------------
52
53 A data flow obeys TBF with rate R and depth B, if for any
54 time interval t_i...t_f the number of transmitted bits
55 does not exceed B + R*(t_f-t_i).
56
57 Packetized version of this definition:
58 The sequence of packets of sizes s_i served at moments t_i
59 obeys TBF, if for any i<=k:
60
61 s_i+....+s_k <= B + R*(t_k - t_i)
62
63 Algorithm.
64 ----------
65
66 Let N(t_i) be B/R initially and N(t) grow continuously with time as:
67
68 N(t+delta) = min{B/R, N(t) + delta}
69
70 If the first packet in queue has length S, it may be
71 transmitted only at the time t_* when S/R <= N(t_*),
72 and in this case N(t) jumps:
73
74 N(t_* + 0) = N(t_* - 0) - S/R.
75
76
77
 78 	Actually, QoS requires two TBFs to be applied to a data stream.
79 One of them controls steady state burst size, another
80 one with rate P (peak rate) and depth M (equal to link MTU)
81 limits bursts at a smaller time scale.
82
83 It is easy to see that P>R, and B>M. If P is infinity, this double
84 TBF is equivalent to a single one.
85
86 When TBF works in reshaping mode, latency is estimated as:
87
88 lat = max ((L-B)/R, (L-M)/P)
89
90
91 NOTES.
92 ------
93
94 If TBF throttles, it starts a watchdog timer, which will wake it up
95 when it is ready to transmit.
96 Note that the minimal timer resolution is 1/HZ.
97 If no new packets arrive during this period,
 98 	or if the device is not awakened by EOI for some previous packet,
99 TBF can stop its activity for 1/HZ.
100
101
102 This means, that with depth B, the maximal rate is
103
104 R_crit = B*HZ
105
106 F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.
107
 108 	Note that the peak rate TBF is much tougher: with MTU 1500
109 P_crit = 150Kbytes/sec. So, if you need greater peak
110 rates, use alpha with HZ=1000 :-)
111
112 With classful TBF, limit is just kept for backwards compatibility.
113 It is passed to the default bfifo qdisc - if the inner qdisc is
114 changed the limit is not effective anymore.
115*/
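/*
 * Worked numbers for the notes above, for illustration only (hypothetical
 * helper, not used by this qdisc): with HZ = 100 a bucket of B = 10 kbytes
 * bounds the rate at R_crit = B*HZ = 1 Mbyte/s (~8 Mbit/s), which is why
 * ~10 kbytes is quoted as the minimal depth for 10 Mbit ethernet.  For the
 * reshaping latency, L = 100 kbytes of backlog with R = 1 Mbyte/s,
 * B = 10 kbytes and no peak rate gives lat = (L-B)/R = 90 ms.
 */
#if 0	/* illustrative only, never compiled */
static unsigned long tbf_latency_ms(unsigned long L, unsigned long B,
				    unsigned long R, unsigned long M,
				    unsigned long P)
{
	unsigned long lat = (L > B) ? ((L - B) * 1000UL) / R : 0;

	if (P) {	/* peak rate term: (L-M)/P */
		unsigned long plat = (L > M) ? ((L - M) * 1000UL) / P : 0;

		if (plat > lat)
			lat = plat;
	}
	return lat;
}
#endif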
116
117struct tbf_sched_data
118{
119/* Parameters */
120 u32 limit; /* Maximal length of backlog: bytes */
121 u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */
122 u32 mtu;
123 u32 max_size;
124 struct qdisc_rate_table *R_tab;
125 struct qdisc_rate_table *P_tab;
126
127/* Variables */
128 long tokens; /* Current number of B tokens */
129 long ptokens; /* Current number of P tokens */
130 psched_time_t t_c; /* Time check-point */
131 struct timer_list wd_timer; /* Watchdog timer */
132 struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */
133};
134
135#define L2T(q,L) ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log])
136#define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log])
137
138static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
139{
140 struct tbf_sched_data *q = qdisc_priv(sch);
141 int ret;
142
143 if (skb->len > q->max_size) {
144 sch->qstats.drops++;
145#ifdef CONFIG_NET_CLS_POLICE
146 if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch))
147#endif
148 kfree_skb(skb);
149
150 return NET_XMIT_DROP;
151 }
152
153 if ((ret = q->qdisc->enqueue(skb, q->qdisc)) != 0) {
154 sch->qstats.drops++;
155 return ret;
156 }
157
158 sch->q.qlen++;
159 sch->bstats.bytes += skb->len;
160 sch->bstats.packets++;
161 return 0;
162}
163
164static int tbf_requeue(struct sk_buff *skb, struct Qdisc* sch)
165{
166 struct tbf_sched_data *q = qdisc_priv(sch);
167 int ret;
168
169 if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
170 sch->q.qlen++;
171 sch->qstats.requeues++;
172 }
173
174 return ret;
175}
176
177static unsigned int tbf_drop(struct Qdisc* sch)
178{
179 struct tbf_sched_data *q = qdisc_priv(sch);
180 unsigned int len;
181
182 if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) {
183 sch->q.qlen--;
184 sch->qstats.drops++;
185 }
186 return len;
187}
188
189static void tbf_watchdog(unsigned long arg)
190{
191 struct Qdisc *sch = (struct Qdisc*)arg;
192
193 sch->flags &= ~TCQ_F_THROTTLED;
194 netif_schedule(sch->dev);
195}
196
197static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
198{
199 struct tbf_sched_data *q = qdisc_priv(sch);
200 struct sk_buff *skb;
201
202 skb = q->qdisc->dequeue(q->qdisc);
203
204 if (skb) {
205 psched_time_t now;
206 long toks, delay;
207 long ptoks = 0;
208 unsigned int len = skb->len;
209
210 PSCHED_GET_TIME(now);
211
212 toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->buffer);
213
214 if (q->P_tab) {
215 ptoks = toks + q->ptokens;
216 if (ptoks > (long)q->mtu)
217 ptoks = q->mtu;
218 ptoks -= L2T_P(q, len);
219 }
220 toks += q->tokens;
221 if (toks > (long)q->buffer)
222 toks = q->buffer;
223 toks -= L2T(q, len);
224
225 if ((toks|ptoks) >= 0) {
226 q->t_c = now;
227 q->tokens = toks;
228 q->ptokens = ptoks;
229 sch->q.qlen--;
230 sch->flags &= ~TCQ_F_THROTTLED;
231 return skb;
232 }
233
234 delay = PSCHED_US2JIFFIE(max_t(long, -toks, -ptoks));
235
236 if (delay == 0)
237 delay = 1;
238
239 mod_timer(&q->wd_timer, jiffies+delay);
240
241 /* Maybe we have a shorter packet in the queue,
242		   which can be sent now. It sounds cool,
243		   but this is wrong in principle.
244 We MUST NOT reorder packets under these circumstances.
245
246 Really, if we split the flow into independent
247 subflows, it would be a very good solution.
248 This is the main idea of all FQ algorithms
249 (cf. CSZ, HPFQ, HFSC)
250 */
251
252 if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
253 /* When requeue fails skb is dropped */
254 sch->q.qlen--;
255 sch->qstats.drops++;
256 }
257
258 sch->flags |= TCQ_F_THROTTLED;
259 sch->qstats.overlimits++;
260 }
261 return NULL;
262}
263
264static void tbf_reset(struct Qdisc* sch)
265{
266 struct tbf_sched_data *q = qdisc_priv(sch);
267
268 qdisc_reset(q->qdisc);
269 sch->q.qlen = 0;
270 PSCHED_GET_TIME(q->t_c);
271 q->tokens = q->buffer;
272 q->ptokens = q->mtu;
273 sch->flags &= ~TCQ_F_THROTTLED;
274 del_timer(&q->wd_timer);
275}
276
277static struct Qdisc *tbf_create_dflt_qdisc(struct net_device *dev, u32 limit)
278{
279 struct Qdisc *q = qdisc_create_dflt(dev, &bfifo_qdisc_ops);
280 struct rtattr *rta;
281 int ret;
282
283 if (q) {
284 rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
285 if (rta) {
286 rta->rta_type = RTM_NEWQDISC;
287 rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
288 ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;
289
290 ret = q->ops->change(q, rta);
291 kfree(rta);
292
293 if (ret == 0)
294 return q;
295 }
296 qdisc_destroy(q);
297 }
298
299 return NULL;
300}
301
302static int tbf_change(struct Qdisc* sch, struct rtattr *opt)
303{
304 int err = -EINVAL;
305 struct tbf_sched_data *q = qdisc_priv(sch);
306 struct rtattr *tb[TCA_TBF_PTAB];
307 struct tc_tbf_qopt *qopt;
308 struct qdisc_rate_table *rtab = NULL;
309 struct qdisc_rate_table *ptab = NULL;
310 struct Qdisc *child = NULL;
311 int max_size,n;
312
313 if (rtattr_parse_nested(tb, TCA_TBF_PTAB, opt) ||
314 tb[TCA_TBF_PARMS-1] == NULL ||
315 RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt))
316 goto done;
317
318 qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]);
319 rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
320 if (rtab == NULL)
321 goto done;
322
323 if (qopt->peakrate.rate) {
324 if (qopt->peakrate.rate > qopt->rate.rate)
325 ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]);
326 if (ptab == NULL)
327 goto done;
328 }
329
330 for (n = 0; n < 256; n++)
331 if (rtab->data[n] > qopt->buffer) break;
332 max_size = (n << qopt->rate.cell_log)-1;
333 if (ptab) {
334 int size;
335
336 for (n = 0; n < 256; n++)
337 if (ptab->data[n] > qopt->mtu) break;
338 size = (n << qopt->peakrate.cell_log)-1;
339 if (size < max_size) max_size = size;
340 }
341 if (max_size < 0)
342 goto done;
343
344 if (q->qdisc == &noop_qdisc) {
345 if ((child = tbf_create_dflt_qdisc(sch->dev, qopt->limit)) == NULL)
346 goto done;
347 }
348
349 sch_tree_lock(sch);
350 if (child) q->qdisc = child;
351 q->limit = qopt->limit;
352 q->mtu = qopt->mtu;
353 q->max_size = max_size;
354 q->buffer = qopt->buffer;
355 q->tokens = q->buffer;
356 q->ptokens = q->mtu;
357 rtab = xchg(&q->R_tab, rtab);
358 ptab = xchg(&q->P_tab, ptab);
359 sch_tree_unlock(sch);
360 err = 0;
361done:
362 if (rtab)
363 qdisc_put_rtab(rtab);
364 if (ptab)
365 qdisc_put_rtab(ptab);
366 return err;
367}
368
369static int tbf_init(struct Qdisc* sch, struct rtattr *opt)
370{
371 struct tbf_sched_data *q = qdisc_priv(sch);
372
373 if (opt == NULL)
374 return -EINVAL;
375
376 PSCHED_GET_TIME(q->t_c);
377 init_timer(&q->wd_timer);
378 q->wd_timer.function = tbf_watchdog;
379 q->wd_timer.data = (unsigned long)sch;
380
381 q->qdisc = &noop_qdisc;
382
383 return tbf_change(sch, opt);
384}
385
386static void tbf_destroy(struct Qdisc *sch)
387{
388 struct tbf_sched_data *q = qdisc_priv(sch);
389
390 del_timer(&q->wd_timer);
391
392 if (q->P_tab)
393 qdisc_put_rtab(q->P_tab);
394 if (q->R_tab)
395 qdisc_put_rtab(q->R_tab);
396
397 qdisc_destroy(q->qdisc);
398}
399
400static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
401{
402 struct tbf_sched_data *q = qdisc_priv(sch);
403 unsigned char *b = skb->tail;
404 struct rtattr *rta;
405 struct tc_tbf_qopt opt;
406
407 rta = (struct rtattr*)b;
408 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
409
410 opt.limit = q->limit;
411 opt.rate = q->R_tab->rate;
412 if (q->P_tab)
413 opt.peakrate = q->P_tab->rate;
414 else
415 memset(&opt.peakrate, 0, sizeof(opt.peakrate));
416 opt.mtu = q->mtu;
417 opt.buffer = q->buffer;
418 RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
419 rta->rta_len = skb->tail - b;
420
421 return skb->len;
422
423rtattr_failure:
424 skb_trim(skb, b - skb->data);
425 return -1;
426}
427
428static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
429 struct sk_buff *skb, struct tcmsg *tcm)
430{
431 struct tbf_sched_data *q = qdisc_priv(sch);
432
433 if (cl != 1) /* only one class */
434 return -ENOENT;
435
436 tcm->tcm_handle |= TC_H_MIN(1);
437 tcm->tcm_info = q->qdisc->handle;
438
439 return 0;
440}
441
442static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
443 struct Qdisc **old)
444{
445 struct tbf_sched_data *q = qdisc_priv(sch);
446
447 if (new == NULL)
448 new = &noop_qdisc;
449
450 sch_tree_lock(sch);
451 *old = xchg(&q->qdisc, new);
452 qdisc_reset(*old);
453 sch->q.qlen = 0;
454 sch_tree_unlock(sch);
455
456 return 0;
457}
458
459static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
460{
461 struct tbf_sched_data *q = qdisc_priv(sch);
462 return q->qdisc;
463}
464
465static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
466{
467 return 1;
468}
469
470static void tbf_put(struct Qdisc *sch, unsigned long arg)
471{
472}
473
474static int tbf_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
475 struct rtattr **tca, unsigned long *arg)
476{
477 return -ENOSYS;
478}
479
480static int tbf_delete(struct Qdisc *sch, unsigned long arg)
481{
482 return -ENOSYS;
483}
484
485static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
486{
487 if (!walker->stop) {
488 if (walker->count >= walker->skip)
489 if (walker->fn(sch, 1, walker) < 0) {
490 walker->stop = 1;
491 return;
492 }
493 walker->count++;
494 }
495}
496
497static struct tcf_proto **tbf_find_tcf(struct Qdisc *sch, unsigned long cl)
498{
499 return NULL;
500}
501
502static struct Qdisc_class_ops tbf_class_ops =
503{
504 .graft = tbf_graft,
505 .leaf = tbf_leaf,
506 .get = tbf_get,
507 .put = tbf_put,
508 .change = tbf_change_class,
509 .delete = tbf_delete,
510 .walk = tbf_walk,
511 .tcf_chain = tbf_find_tcf,
512 .dump = tbf_dump_class,
513};
514
515static struct Qdisc_ops tbf_qdisc_ops = {
516 .next = NULL,
517 .cl_ops = &tbf_class_ops,
518 .id = "tbf",
519 .priv_size = sizeof(struct tbf_sched_data),
520 .enqueue = tbf_enqueue,
521 .dequeue = tbf_dequeue,
522 .requeue = tbf_requeue,
523 .drop = tbf_drop,
524 .init = tbf_init,
525 .reset = tbf_reset,
526 .destroy = tbf_destroy,
527 .change = tbf_change,
528 .dump = tbf_dump,
529 .owner = THIS_MODULE,
530};
531
532static int __init tbf_module_init(void)
533{
534 return register_qdisc(&tbf_qdisc_ops);
535}
536
537static void __exit tbf_module_exit(void)
538{
539 unregister_qdisc(&tbf_qdisc_ops);
540}
541module_init(tbf_module_init)
542module_exit(tbf_module_exit)
543MODULE_LICENSE("GPL");
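The dequeue path above is the heart of TBF: tokens accumulate with wall-clock time (capped at the configured burst), each packet spends L2T(len) worth of tokens, and a packet is released only if both the committed and the peak buckets stay non-negative; otherwise the watchdog is armed for the shortfall. The following is a minimal userspace sketch of that test, not kernel code: times are plain microseconds, the rate-table lookups L2T()/L2T_P() are replaced by a direct len/rate computation, and all names (tbf_sim, tbf_may_send, l2t) are invented for the illustration.

/*
 * Userspace sketch of the token-bucket admission test in tbf_dequeue().
 * Times are in microseconds; l2t() stands in for the L2T()/L2T_P()
 * rate-table lookups.
 */
#include <stdio.h>

struct tbf_sim {
	long long rate;    /* committed rate, bytes per second (R_tab) */
	long long peak;    /* peak rate, bytes per second (P_tab); 0 = none */
	long long buffer;  /* burst allowance, usec of transmit time */
	long long mtu;     /* peak-rate burst allowance, usec */
	long long tokens;  /* current committed tokens, usec */
	long long ptokens; /* current peak tokens, usec */
	long long t_c;     /* time of last check-point, usec */
};

/* usec needed to send len bytes at the given byte rate */
static long long l2t(long long bytes_per_sec, unsigned int len)
{
	return (long long)len * 1000000 / bytes_per_sec;
}

/*
 * Returns 1 if a packet of length len may be sent at time now (charging
 * the buckets), or 0 if it must be delayed; *delay is then the number of
 * usec to wait, mirroring the watchdog delay computed in tbf_dequeue().
 */
static int tbf_may_send(struct tbf_sim *q, long long now,
			unsigned int len, long long *delay)
{
	long long toks = now - q->t_c;
	long long ptoks = 0;

	if (toks > q->buffer)
		toks = q->buffer;

	if (q->peak) {
		ptoks = toks + q->ptokens;
		if (ptoks > q->mtu)
			ptoks = q->mtu;
		ptoks -= l2t(q->peak, len);
	}
	toks += q->tokens;
	if (toks > q->buffer)
		toks = q->buffer;
	toks -= l2t(q->rate, len);

	if ((toks | ptoks) >= 0) {	/* both non-negative: packet conforms */
		q->t_c = now;
		q->tokens = toks;
		q->ptokens = ptoks;
		return 1;
	}
	*delay = -toks > -ptoks ? -toks : -ptoks;
	return 0;
}

int main(void)
{
	/* 125000 bytes/s (1 Mbit/s), 20 ms burst, no peak rate */
	struct tbf_sim q = { 125000, 0, 20000, 0, 20000, 0, 0 };
	long long now = 0, delay = 0;
	unsigned int len = 1500;
	int i;

	for (i = 0; i < 5; i++) {
		if (tbf_may_send(&q, now, len, &delay))
			printf("t=%lldus: sent %u bytes, %lldus of tokens left\n",
			       now, len, q.tokens);
		else {
			printf("t=%lldus: throttled for %lldus\n", now, delay);
			now += delay;
			i--;	/* retry the same packet after the delay */
		}
	}
	return 0;
}

Run as shown, the sketch sends the first 1500-byte packet immediately from the burst allowance and then settles into one packet every 12 ms, i.e. exactly the configured 1 Mbit/s.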
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
new file mode 100644
index 000000000000..6cf0342706b5
--- /dev/null
+++ b/net/sched/sch_teql.c
@@ -0,0 +1,511 @@
1/* net/sched/sch_teql.c "True" (or "trivial") link equalizer.
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU General Public License
5 * as published by the Free Software Foundation; either version
6 * 2 of the License, or (at your option) any later version.
7 *
8 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 */
10
11#include <linux/module.h>
12#include <asm/uaccess.h>
13#include <asm/system.h>
14#include <linux/bitops.h>
15#include <linux/types.h>
16#include <linux/kernel.h>
17#include <linux/sched.h>
18#include <linux/string.h>
19#include <linux/mm.h>
20#include <linux/socket.h>
21#include <linux/sockios.h>
22#include <linux/in.h>
23#include <linux/errno.h>
24#include <linux/interrupt.h>
25#include <linux/if_ether.h>
26#include <linux/inet.h>
27#include <linux/netdevice.h>
28#include <linux/etherdevice.h>
29#include <linux/notifier.h>
30#include <linux/init.h>
31#include <net/ip.h>
32#include <net/route.h>
33#include <linux/skbuff.h>
34#include <linux/moduleparam.h>
35#include <net/sock.h>
36#include <net/pkt_sched.h>
37
38/*
39 How to set it up
40 ----------------
41
42 After loading this module you will find a new device, teqlN,
43 and a new qdisc with the same name. To join a slave to the equalizer,
44 simply attach this qdisc as the root qdisc of the slave device, e.g.:
45
46 # tc qdisc add dev eth0 root teql0
47 # tc qdisc add dev eth1 root teql0
48
49 That's all. Full PnP 8)
50
51 Applicability
52 -------------
53
54 1. Slave devices MUST be active devices, i.e., they must raise the tbusy
55 signal and generate EOI events. If you want to equalize virtual devices
56 like tunnels, use a normal eql device.
57 2. This device puts no limits on the physical characteristics of its
58 slaves; e.g. it will happily equalize a 9600 baud line and 100Mb Ethernet :-)
59 Certainly, a large difference in link speeds will make the resulting
60 equalized link unusable because of massive packet reordering;
61 I estimate the useful upper bound on the difference at roughly a factor of 10.
62 3. If a slave requires address resolution, only protocols using the
63 neighbour cache (IPv4/IPv6) will work over the equalized link.
64 Other protocols may still use the slave devices directly, which does
65 not break load balancing, though native slave traffic will have
66 the highest priority. */
67
68struct teql_master
69{
70 struct Qdisc_ops qops;
71 struct net_device *dev;
72 struct Qdisc *slaves;
73 struct list_head master_list;
74 struct net_device_stats stats;
75};
76
77struct teql_sched_data
78{
79 struct Qdisc *next;
80 struct teql_master *m;
81 struct neighbour *ncache;
82 struct sk_buff_head q;
83};
84
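/* The slave qdiscs are kept on a circular singly linked list threaded
   through their private data; NEXT_SLAVE(q) follows that list. */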
85#define NEXT_SLAVE(q) (((struct teql_sched_data*)qdisc_priv(q))->next)
86
87#define FMASK (IFF_BROADCAST|IFF_POINTOPOINT)
88
89/* "teql*" qdisc routines */
90
91static int
92teql_enqueue(struct sk_buff *skb, struct Qdisc* sch)
93{
94 struct net_device *dev = sch->dev;
95 struct teql_sched_data *q = qdisc_priv(sch);
96
97 __skb_queue_tail(&q->q, skb);
98 if (q->q.qlen <= dev->tx_queue_len) {
99 sch->bstats.bytes += skb->len;
100 sch->bstats.packets++;
101 return 0;
102 }
103
104 __skb_unlink(skb, &q->q);
105 kfree_skb(skb);
106 sch->qstats.drops++;
107 return NET_XMIT_DROP;
108}
109
110static int
111teql_requeue(struct sk_buff *skb, struct Qdisc* sch)
112{
113 struct teql_sched_data *q = qdisc_priv(sch);
114
115 __skb_queue_head(&q->q, skb);
116 sch->qstats.requeues++;
117 return 0;
118}
119
120static struct sk_buff *
121teql_dequeue(struct Qdisc* sch)
122{
123 struct teql_sched_data *dat = qdisc_priv(sch);
124 struct sk_buff *skb;
125
126 skb = __skb_dequeue(&dat->q);
127 if (skb == NULL) {
128 struct net_device *m = dat->m->dev->qdisc->dev;
129 if (m) {
130 dat->m->slaves = sch;
131 netif_wake_queue(m);
132 }
133 }
134 sch->q.qlen = dat->q.qlen + dat->m->dev->qdisc->q.qlen;
135 return skb;
136}
137
138static __inline__ void
139teql_neigh_release(struct neighbour *n)
140{
141 if (n)
142 neigh_release(n);
143}
144
145static void
146teql_reset(struct Qdisc* sch)
147{
148 struct teql_sched_data *dat = qdisc_priv(sch);
149
150 skb_queue_purge(&dat->q);
151 sch->q.qlen = 0;
152 teql_neigh_release(xchg(&dat->ncache, NULL));
153}
154
155static void
156teql_destroy(struct Qdisc* sch)
157{
158 struct Qdisc *q, *prev;
159 struct teql_sched_data *dat = qdisc_priv(sch);
160 struct teql_master *master = dat->m;
161
162 if ((prev = master->slaves) != NULL) {
163 do {
164 q = NEXT_SLAVE(prev);
165 if (q == sch) {
166 NEXT_SLAVE(prev) = NEXT_SLAVE(q);
167 if (q == master->slaves) {
168 master->slaves = NEXT_SLAVE(q);
169 if (q == master->slaves) {
170 master->slaves = NULL;
171 spin_lock_bh(&master->dev->queue_lock);
172 qdisc_reset(master->dev->qdisc);
173 spin_unlock_bh(&master->dev->queue_lock);
174 }
175 }
176 skb_queue_purge(&dat->q);
177 teql_neigh_release(xchg(&dat->ncache, NULL));
178 break;
179 }
180
181 } while ((prev = q) != master->slaves);
182 }
183}
184
185static int teql_qdisc_init(struct Qdisc *sch, struct rtattr *opt)
186{
187 struct net_device *dev = sch->dev;
188 struct teql_master *m = (struct teql_master*)sch->ops;
189 struct teql_sched_data *q = qdisc_priv(sch);
190
191 if (dev->hard_header_len > m->dev->hard_header_len)
192 return -EINVAL;
193
194 if (m->dev == dev)
195 return -ELOOP;
196
197 q->m = m;
198
199 skb_queue_head_init(&q->q);
200
201 if (m->slaves) {
202 if (m->dev->flags & IFF_UP) {
203 if ((m->dev->flags&IFF_POINTOPOINT && !(dev->flags&IFF_POINTOPOINT))
204 || (m->dev->flags&IFF_BROADCAST && !(dev->flags&IFF_BROADCAST))
205 || (m->dev->flags&IFF_MULTICAST && !(dev->flags&IFF_MULTICAST))
206 || dev->mtu < m->dev->mtu)
207 return -EINVAL;
208 } else {
209 if (!(dev->flags&IFF_POINTOPOINT))
210 m->dev->flags &= ~IFF_POINTOPOINT;
211 if (!(dev->flags&IFF_BROADCAST))
212 m->dev->flags &= ~IFF_BROADCAST;
213 if (!(dev->flags&IFF_MULTICAST))
214 m->dev->flags &= ~IFF_MULTICAST;
215 if (dev->mtu < m->dev->mtu)
216 m->dev->mtu = dev->mtu;
217 }
218 q->next = NEXT_SLAVE(m->slaves);
219 NEXT_SLAVE(m->slaves) = sch;
220 } else {
221 q->next = sch;
222 m->slaves = sch;
223 m->dev->mtu = dev->mtu;
224 m->dev->flags = (m->dev->flags&~FMASK)|(dev->flags&FMASK);
225 }
226 return 0;
227}
228
229/* "teql*" netdevice routines */
230
231static int
232__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev)
233{
234 struct teql_sched_data *q = qdisc_priv(dev->qdisc);
235 struct neighbour *mn = skb->dst->neighbour;
236 struct neighbour *n = q->ncache;
237
238 if (mn->tbl == NULL)
239 return -EINVAL;
240 if (n && n->tbl == mn->tbl &&
241 memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) {
242 atomic_inc(&n->refcnt);
243 } else {
244 n = __neigh_lookup_errno(mn->tbl, mn->primary_key, dev);
245 if (IS_ERR(n))
246 return PTR_ERR(n);
247 }
248 if (neigh_event_send(n, skb_res) == 0) {
249 int err;
250 read_lock(&n->lock);
251 err = dev->hard_header(skb, dev, ntohs(skb->protocol), n->ha, NULL, skb->len);
252 read_unlock(&n->lock);
253 if (err < 0) {
254 neigh_release(n);
255 return -EINVAL;
256 }
257 teql_neigh_release(xchg(&q->ncache, n));
258 return 0;
259 }
260 neigh_release(n);
261 return (skb_res == NULL) ? -EAGAIN : 1;
262}
263
264static __inline__ int
265teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev)
266{
267 if (dev->hard_header == NULL ||
268 skb->dst == NULL ||
269 skb->dst->neighbour == NULL)
270 return 0;
271 return __teql_resolve(skb, skb_res, dev);
272}
273
274static int teql_master_xmit(struct sk_buff *skb, struct net_device *dev)
275{
276 struct teql_master *master = (void*)dev->priv;
277 struct Qdisc *start, *q;
278 int busy;
279 int nores;
280 int len = skb->len;
281 struct sk_buff *skb_res = NULL;
282
283 start = master->slaves;
284
285restart:
286 nores = 0;
287 busy = 0;
288
289 if ((q = start) == NULL)
290 goto drop;
291
292 do {
293 struct net_device *slave = q->dev;
294
295 if (slave->qdisc_sleeping != q)
296 continue;
297 if (netif_queue_stopped(slave) || ! netif_running(slave)) {
298 busy = 1;
299 continue;
300 }
301
302 switch (teql_resolve(skb, skb_res, slave)) {
303 case 0:
304 if (spin_trylock(&slave->xmit_lock)) {
305 slave->xmit_lock_owner = smp_processor_id();
306 if (!netif_queue_stopped(slave) &&
307 slave->hard_start_xmit(skb, slave) == 0) {
308 slave->xmit_lock_owner = -1;
309 spin_unlock(&slave->xmit_lock);
310 master->slaves = NEXT_SLAVE(q);
311 netif_wake_queue(dev);
312 master->stats.tx_packets++;
313 master->stats.tx_bytes += len;
314 return 0;
315 }
316 slave->xmit_lock_owner = -1;
317 spin_unlock(&slave->xmit_lock);
318 }
319 if (netif_queue_stopped(dev))
320 busy = 1;
321 break;
322 case 1:
323 master->slaves = NEXT_SLAVE(q);
324 return 0;
325 default:
326 nores = 1;
327 break;
328 }
329 __skb_pull(skb, skb->nh.raw - skb->data);
330 } while ((q = NEXT_SLAVE(q)) != start);
331
332 if (nores && skb_res == NULL) {
333 skb_res = skb;
334 goto restart;
335 }
336
337 if (busy) {
338 netif_stop_queue(dev);
339 return 1;
340 }
341 master->stats.tx_errors++;
342
343drop:
344 master->stats.tx_dropped++;
345 dev_kfree_skb(skb);
346 return 0;
347}
348
349static int teql_master_open(struct net_device *dev)
350{
351 struct Qdisc * q;
352 struct teql_master *m = (void*)dev->priv;
353 int mtu = 0xFFFE;
354 unsigned flags = IFF_NOARP|IFF_MULTICAST;
355
356 if (m->slaves == NULL)
357 return -EUNATCH;
358
359 flags = FMASK;
360
361 q = m->slaves;
362 do {
363 struct net_device *slave = q->dev;
364
365 if (slave == NULL)
366 return -EUNATCH;
367
368 if (slave->mtu < mtu)
369 mtu = slave->mtu;
370 if (slave->hard_header_len > LL_MAX_HEADER)
371 return -EINVAL;
372
373 /* If all the slaves are BROADCAST, the master is BROADCAST;
374    if all the slaves are PtP, the master is PtP;
375    otherwise, the master is NBMA.
376  */
377 if (!(slave->flags&IFF_POINTOPOINT))
378 flags &= ~IFF_POINTOPOINT;
379 if (!(slave->flags&IFF_BROADCAST))
380 flags &= ~IFF_BROADCAST;
381 if (!(slave->flags&IFF_MULTICAST))
382 flags &= ~IFF_MULTICAST;
383 } while ((q = NEXT_SLAVE(q)) != m->slaves);
384
385 m->dev->mtu = mtu;
386 m->dev->flags = (m->dev->flags&~FMASK) | flags;
387 netif_start_queue(m->dev);
388 return 0;
389}
390
391static int teql_master_close(struct net_device *dev)
392{
393 netif_stop_queue(dev);
394 return 0;
395}
396
397static struct net_device_stats *teql_master_stats(struct net_device *dev)
398{
399 struct teql_master *m = (void*)dev->priv;
400 return &m->stats;
401}
402
403static int teql_master_mtu(struct net_device *dev, int new_mtu)
404{
405 struct teql_master *m = (void*)dev->priv;
406 struct Qdisc *q;
407
408 if (new_mtu < 68)
409 return -EINVAL;
410
411 q = m->slaves;
412 if (q) {
413 do {
414 if (new_mtu > q->dev->mtu)
415 return -EINVAL;
416 } while ((q=NEXT_SLAVE(q)) != m->slaves);
417 }
418
419 dev->mtu = new_mtu;
420 return 0;
421}
422
423static __init void teql_master_setup(struct net_device *dev)
424{
425 struct teql_master *master = dev->priv;
426 struct Qdisc_ops *ops = &master->qops;
427
428 master->dev = dev;
429 ops->priv_size = sizeof(struct teql_sched_data);
430
431 ops->enqueue = teql_enqueue;
432 ops->dequeue = teql_dequeue;
433 ops->requeue = teql_requeue;
434 ops->init = teql_qdisc_init;
435 ops->reset = teql_reset;
436 ops->destroy = teql_destroy;
437 ops->owner = THIS_MODULE;
438
439 dev->open = teql_master_open;
440 dev->hard_start_xmit = teql_master_xmit;
441 dev->stop = teql_master_close;
442 dev->get_stats = teql_master_stats;
443 dev->change_mtu = teql_master_mtu;
444 dev->type = ARPHRD_VOID;
445 dev->mtu = 1500;
446 dev->tx_queue_len = 100;
447 dev->flags = IFF_NOARP;
448 dev->hard_header_len = LL_MAX_HEADER;
449 SET_MODULE_OWNER(dev);
450}
451
452static LIST_HEAD(master_dev_list);
453static int max_equalizers = 1;
454module_param(max_equalizers, int, 0);
455MODULE_PARM_DESC(max_equalizers, "Max number of link equalizers");
456
457static int __init teql_init(void)
458{
459 int i;
460 int err = -ENODEV;
461
462 for (i = 0; i < max_equalizers; i++) {
463 struct net_device *dev;
464 struct teql_master *master;
465
466 dev = alloc_netdev(sizeof(struct teql_master),
467 "teql%d", teql_master_setup);
468 if (!dev) {
469 err = -ENOMEM;
470 break;
471 }
472
473 if ((err = register_netdev(dev))) {
474 free_netdev(dev);
475 break;
476 }
477
478 master = dev->priv;
479
480 strlcpy(master->qops.id, dev->name, IFNAMSIZ);
481 err = register_qdisc(&master->qops);
482
483 if (err) {
484 unregister_netdev(dev);
485 free_netdev(dev);
486 break;
487 }
488
489 list_add_tail(&master->master_list, &master_dev_list);
490 }
491 return i ? 0 : err;
492}
493
494static void __exit teql_exit(void)
495{
496 struct teql_master *master, *nxt;
497
498 list_for_each_entry_safe(master, nxt, &master_dev_list, master_list) {
499
500 list_del(&master->master_list);
501
502 unregister_qdisc(&master->qops);
503 unregister_netdev(master->dev);
504 free_netdev(master->dev);
505 }
506}
507
508module_init(teql_init);
509module_exit(teql_exit);
510
511MODULE_LICENSE("GPL");
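teql_master_xmit() above walks the slaves as a circular list threaded through each slave qdisc's private data (NEXT_SLAVE()), starting at master->slaves, skipping slaves whose queues are stopped, and advancing master->slaves past whichever slave accepted the packet so that successive packets rotate across the links. Below is a minimal userspace sketch of just that rotation; the slave array, the busy flags and the names (struct slave, teql_xmit_sketch) are invented for the example, and locking, neighbour resolution and requeueing are left out.

/*
 * Userspace sketch of the round-robin slave selection performed by
 * teql_master_xmit() over the NEXT_SLAVE() ring.
 */
#include <stdio.h>

struct slave {
	const char   *name;
	int           busy;  /* stands in for netif_queue_stopped() */
	struct slave *next;  /* stands in for NEXT_SLAVE() */
};

/*
 * Try to transmit one packet.  Starting from *start, walk the ring until
 * a slave accepts the packet; on success advance *start to the slave
 * after the one used, so the next packet begins its search there
 * (this mirrors "master->slaves = NEXT_SLAVE(q)" in the kernel code).
 * Returns 0 on success, 1 if every slave was busy.
 */
static int teql_xmit_sketch(struct slave **start, int pktno)
{
	struct slave *q = *start;

	do {
		if (!q->busy) {
			printf("packet %d sent via %s\n", pktno, q->name);
			*start = q->next;
			return 0;
		}
	} while ((q = q->next) != *start);

	printf("packet %d: all slaves busy, stopping master queue\n", pktno);
	return 1;
}

int main(void)
{
	struct slave s[3] = {
		{ "eth0", 0, &s[1] },
		{ "eth1", 0, &s[2] },
		{ "eth2", 1, &s[0] },	/* eth2's queue is stopped */
	};
	struct slave *head = &s[0];
	int i;

	for (i = 0; i < 5; i++)
		teql_xmit_sketch(&head, i);
	return 0;
}

With eth2's queue stopped, the output alternates between eth0 and eth1 while eth2 is skipped, which mirrors how a teql master behaves when one slave is flow-controlled.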