Diffstat (limited to 'net')
-rw-r--r--net/core/datagram.c4
-rw-r--r--net/core/stream.c12
-rw-r--r--net/dccp/ipv4.c32
-rw-r--r--net/ieee80211/ieee80211_crypt.c1
-rw-r--r--net/ieee80211/ieee80211_crypt_ccmp.c1
-rw-r--r--net/ieee80211/ieee80211_crypt_tkip.c1
-rw-r--r--net/ieee80211/ieee80211_crypt_wep.c1
-rw-r--r--net/ieee80211/ieee80211_geo.c1
-rw-r--r--net/ieee80211/ieee80211_module.c1
-rw-r--r--net/ieee80211/ieee80211_rx.c1
-rw-r--r--net/ieee80211/ieee80211_tx.c1
-rw-r--r--net/ipv4/igmp.c5
-rw-r--r--net/ipv4/inet_connection_sock.c14
-rw-r--r--net/ipv4/netfilter/ip_conntrack_helper_pptp.c4
-rw-r--r--net/ipv4/netfilter/ip_conntrack_netlink.c19
-rw-r--r--net/ipv4/netfilter/ip_nat_core.c6
-rw-r--r--net/ipv4/netfilter/ip_nat_helper_pptp.c2
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_gre.c4
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_unknown.c2
-rw-r--r--net/ipv4/netfilter/ipt_CONNMARK.c1
-rw-r--r--net/ipv4/tcp.c1
-rw-r--r--net/ipv4/tcp_bic.c2
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv6/addrconf.c2
-rw-r--r--net/ipv6/mcast.c4
-rw-r--r--net/ipv6/tcp_ipv6.c15
-rw-r--r--net/netfilter/nf_queue.c2
-rw-r--r--net/netfilter/nfnetlink_log.c6
-rw-r--r--net/netfilter/nfnetlink_queue.c6
-rw-r--r--net/sched/Kconfig394
-rw-r--r--net/sched/sch_gred.c841
-rw-r--r--net/sched/sch_netem.c122
-rw-r--r--net/sched/sch_red.c418
-rw-r--r--net/sunrpc/auth.c15
-rw-r--r--net/sunrpc/sunrpc_syms.c2
-rw-r--r--net/sunrpc/svc.c12
-rw-r--r--net/sunrpc/sysctl.c7
-rw-r--r--net/sunrpc/xprtsock.c9
38 files changed, 894 insertions, 1079 deletions
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 81987df536..d219435d08 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -213,6 +213,10 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
213{ 213{
214 int i, err, fraglen, end = 0; 214 int i, err, fraglen, end = 0;
215 struct sk_buff *next = skb_shinfo(skb)->frag_list; 215 struct sk_buff *next = skb_shinfo(skb)->frag_list;
216
217 if (!len)
218 return 0;
219
216next_skb: 220next_skb:
217 fraglen = skb_headlen(skb); 221 fraglen = skb_headlen(skb);
218 i = -1; 222 i = -1;
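
A minimal userspace sketch of the zero-length guard added above, with illustrative names rather than the real skb API: a request to copy nothing succeeds immediately, before any fragment walking starts.

#include <string.h>

/* illustrative stand-in for skb_copy_datagram_iovec() */
int copy_from_fragments(const char *frag, int offset, char *to, int len)
{
    if (!len)                           /* nothing to copy: succeed early */
        return 0;

    memcpy(to, frag + offset, len);     /* stands in for the fragment walk */
    return 0;
}
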
diff --git a/net/core/stream.c b/net/core/stream.c
index ac9edfdf87..15bfd03e80 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -52,8 +52,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
52{ 52{
53 struct task_struct *tsk = current; 53 struct task_struct *tsk = current;
54 DEFINE_WAIT(wait); 54 DEFINE_WAIT(wait);
55 int done;
55 56
56 while (1) { 57 do {
57 if (sk->sk_err) 58 if (sk->sk_err)
58 return sock_error(sk); 59 return sock_error(sk);
59 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) 60 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
@@ -65,13 +66,12 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
65 66
66 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); 67 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
67 sk->sk_write_pending++; 68 sk->sk_write_pending++;
68 if (sk_wait_event(sk, timeo_p, 69 done = sk_wait_event(sk, timeo_p,
69 !((1 << sk->sk_state) & 70 !((1 << sk->sk_state) &
70 ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))) 71 ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
71 break;
72 finish_wait(sk->sk_sleep, &wait); 72 finish_wait(sk->sk_sleep, &wait);
73 sk->sk_write_pending--; 73 sk->sk_write_pending--;
74 } 74 } while (!done);
75 return 0; 75 return 0;
76} 76}
77 77
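
For readability of the hunk above, a hedged userspace sketch of the control-flow change: the open-coded while (1)/break loop becomes a do/while driven by the wait predicate's result, so the prepare/finish bookkeeping runs in matched pairs around every wait. condition_met() is a hypothetical stand-in for sk_wait_event().

#include <stdbool.h>
#include <stdio.h>

/* hypothetical stand-in for sk_wait_event(): true on the 3rd try */
static bool condition_met(int *attempts)
{
    return ++(*attempts) >= 3;
}

int wait_for_connect(void)
{
    int attempts = 0;
    bool done;

    do {
        printf("prepare_to_wait\n");    /* bookkeeping before the wait */
        done = condition_met(&attempts);
        printf("finish_wait\n");        /* now paired on every iteration */
    } while (!done);

    return 0;
}

int main(void)
{
    return wait_for_connect();
}
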
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 6298cf58ff..4b9bc81ae1 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -31,8 +31,6 @@ struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
31 .lhash_lock = RW_LOCK_UNLOCKED, 31 .lhash_lock = RW_LOCK_UNLOCKED,
32 .lhash_users = ATOMIC_INIT(0), 32 .lhash_users = ATOMIC_INIT(0),
33 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait), 33 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait),
34 .portalloc_lock = SPIN_LOCK_UNLOCKED,
35 .port_rover = 1024 - 1,
36}; 34};
37 35
38EXPORT_SYMBOL_GPL(dccp_hashinfo); 36EXPORT_SYMBOL_GPL(dccp_hashinfo);
@@ -125,36 +123,15 @@ static int dccp_v4_hash_connect(struct sock *sk)
125 int ret; 123 int ret;
126 124
127 if (snum == 0) { 125 if (snum == 0) {
128 int rover;
129 int low = sysctl_local_port_range[0]; 126 int low = sysctl_local_port_range[0];
130 int high = sysctl_local_port_range[1]; 127 int high = sysctl_local_port_range[1];
131 int remaining = (high - low) + 1; 128 int remaining = (high - low) + 1;
129 int rover = net_random() % (high - low) + low;
132 struct hlist_node *node; 130 struct hlist_node *node;
133 struct inet_timewait_sock *tw = NULL; 131 struct inet_timewait_sock *tw = NULL;
134 132
135 local_bh_disable(); 133 local_bh_disable();
136
137 /* TODO. Actually it is not so bad idea to remove
138 * dccp_hashinfo.portalloc_lock before next submission to
139 * Linus.
140 * As soon as we touch this place at all it is time to think.
141 *
142 * Now it protects single _advisory_ variable
143 * dccp_hashinfo.port_rover, hence it is mostly useless.
144 * Code will work nicely if we just delete it, but
145 * I am afraid in contented case it will work not better or
146 * even worse: another cpu just will hit the same bucket
147 * and spin there.
148 * So some cpu salt could remove both contention and
149 * memory pingpong. Any ideas how to do this in a nice way?
150 */
151 spin_lock(&dccp_hashinfo.portalloc_lock);
152 rover = dccp_hashinfo.port_rover;
153
154 do { 134 do {
155 rover++;
156 if ((rover < low) || (rover > high))
157 rover = low;
158 head = &dccp_hashinfo.bhash[inet_bhashfn(rover, 135 head = &dccp_hashinfo.bhash[inet_bhashfn(rover,
159 dccp_hashinfo.bhash_size)]; 136 dccp_hashinfo.bhash_size)];
160 spin_lock(&head->lock); 137 spin_lock(&head->lock);
@@ -187,9 +164,9 @@ static int dccp_v4_hash_connect(struct sock *sk)
187 164
188 next_port: 165 next_port:
189 spin_unlock(&head->lock); 166 spin_unlock(&head->lock);
167 if (++rover > high)
168 rover = low;
190 } while (--remaining > 0); 169 } while (--remaining > 0);
191 dccp_hashinfo.port_rover = rover;
192 spin_unlock(&dccp_hashinfo.portalloc_lock);
193 170
194 local_bh_enable(); 171 local_bh_enable();
195 172
@@ -197,9 +174,6 @@ static int dccp_v4_hash_connect(struct sock *sk)
197 174
198ok: 175ok:
199 /* All locks still held and bhs disabled */ 176 /* All locks still held and bhs disabled */
200 dccp_hashinfo.port_rover = rover;
201 spin_unlock(&dccp_hashinfo.portalloc_lock);
202
203 inet_bind_hash(sk, tb, rover); 177 inet_bind_hash(sk, tb, rover);
204 if (sk_unhashed(sk)) { 178 if (sk_unhashed(sk)) {
205 inet_sk(sk)->sport = htons(rover); 179 inet_sk(sk)->sport = htons(rover);
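
The hunks above drop the shared port_rover/portalloc_lock pair in favour of a per-call random starting point, answering the removed TODO comment. A standalone sketch of the new search, under stated assumptions: rand() stands in for net_random(), and a toy predicate replaces the bind-hash bucket walk.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* toy predicate; the kernel walks the bind-hash bucket instead */
static int port_in_use(int port)
{
    return port % 7 == 0;
}

int pick_local_port(int low, int high)
{
    int remaining = (high - low) + 1;
    int rover = rand() % (high - low) + low;   /* random start */

    do {
        if (!port_in_use(rover))
            return rover;          /* the "ok:" path in the diff */
        if (++rover > high)
            rover = low;           /* wrap around, as in the new code */
    } while (--remaining > 0);

    return -1;                     /* local port range exhausted */
}

int main(void)
{
    srand(time(NULL));
    printf("picked port %d\n", pick_local_port(1024, 4999));
    return 0;
}

The identical rewrite is applied to inet_csk_get_port() and tcp_v6_get_port() later in this patch.
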
diff --git a/net/ieee80211/ieee80211_crypt.c b/net/ieee80211/ieee80211_crypt.c
index f3b6aa3be6..20cc580a07 100644
--- a/net/ieee80211/ieee80211_crypt.c
+++ b/net/ieee80211/ieee80211_crypt.c
@@ -12,7 +12,6 @@
12 */ 12 */
13 13
14#include <linux/config.h> 14#include <linux/config.h>
15#include <linux/version.h>
16#include <linux/module.h> 15#include <linux/module.h>
17#include <linux/init.h> 16#include <linux/init.h>
18#include <linux/slab.h> 17#include <linux/slab.h>
diff --git a/net/ieee80211/ieee80211_crypt_ccmp.c b/net/ieee80211/ieee80211_crypt_ccmp.c
index 05a853c130..4702217285 100644
--- a/net/ieee80211/ieee80211_crypt_ccmp.c
+++ b/net/ieee80211/ieee80211_crypt_ccmp.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/config.h> 12#include <linux/config.h>
13#include <linux/version.h>
14#include <linux/module.h> 13#include <linux/module.h>
15#include <linux/init.h> 14#include <linux/init.h>
16#include <linux/slab.h> 15#include <linux/slab.h>
diff --git a/net/ieee80211/ieee80211_crypt_tkip.c b/net/ieee80211/ieee80211_crypt_tkip.c
index 2e34f29b79..e0988320ef 100644
--- a/net/ieee80211/ieee80211_crypt_tkip.c
+++ b/net/ieee80211/ieee80211_crypt_tkip.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/config.h> 12#include <linux/config.h>
13#include <linux/version.h>
14#include <linux/module.h> 13#include <linux/module.h>
15#include <linux/init.h> 14#include <linux/init.h>
16#include <linux/slab.h> 15#include <linux/slab.h>
diff --git a/net/ieee80211/ieee80211_crypt_wep.c b/net/ieee80211/ieee80211_crypt_wep.c
index 7c08ed2f26..073aebdf0f 100644
--- a/net/ieee80211/ieee80211_crypt_wep.c
+++ b/net/ieee80211/ieee80211_crypt_wep.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/config.h> 12#include <linux/config.h>
13#include <linux/version.h>
14#include <linux/module.h> 13#include <linux/module.h>
15#include <linux/init.h> 14#include <linux/init.h>
16#include <linux/slab.h> 15#include <linux/slab.h>
diff --git a/net/ieee80211/ieee80211_geo.c b/net/ieee80211/ieee80211_geo.c
index c4b54ef8f6..610cc5cbc2 100644
--- a/net/ieee80211/ieee80211_geo.c
+++ b/net/ieee80211/ieee80211_geo.c
@@ -38,7 +38,6 @@
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/tcp.h> 39#include <linux/tcp.h>
40#include <linux/types.h> 40#include <linux/types.h>
41#include <linux/version.h>
42#include <linux/wireless.h> 41#include <linux/wireless.h>
43#include <linux/etherdevice.h> 42#include <linux/etherdevice.h>
44#include <asm/uaccess.h> 43#include <asm/uaccess.h>
diff --git a/net/ieee80211/ieee80211_module.c b/net/ieee80211/ieee80211_module.c
index f66d792cd2..321287bc88 100644
--- a/net/ieee80211/ieee80211_module.c
+++ b/net/ieee80211/ieee80211_module.c
@@ -45,7 +45,6 @@
45#include <linux/slab.h> 45#include <linux/slab.h>
46#include <linux/tcp.h> 46#include <linux/tcp.h>
47#include <linux/types.h> 47#include <linux/types.h>
48#include <linux/version.h>
49#include <linux/wireless.h> 48#include <linux/wireless.h>
50#include <linux/etherdevice.h> 49#include <linux/etherdevice.h>
51#include <asm/uaccess.h> 50#include <asm/uaccess.h>
diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c
index ce694cf5c1..6ad88218f5 100644
--- a/net/ieee80211/ieee80211_rx.c
+++ b/net/ieee80211/ieee80211_rx.c
@@ -28,7 +28,6 @@
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/tcp.h> 29#include <linux/tcp.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/version.h>
32#include <linux/wireless.h> 31#include <linux/wireless.h>
33#include <linux/etherdevice.h> 32#include <linux/etherdevice.h>
34#include <asm/uaccess.h> 33#include <asm/uaccess.h>
diff --git a/net/ieee80211/ieee80211_tx.c b/net/ieee80211/ieee80211_tx.c
index 95ccbadbf5..445f206e65 100644
--- a/net/ieee80211/ieee80211_tx.c
+++ b/net/ieee80211/ieee80211_tx.c
@@ -38,7 +38,6 @@
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/tcp.h> 39#include <linux/tcp.h>
40#include <linux/types.h> 40#include <linux/types.h>
41#include <linux/version.h>
42#include <linux/wireless.h> 41#include <linux/wireless.h>
43#include <linux/etherdevice.h> 42#include <linux/etherdevice.h>
44#include <asm/uaccess.h> 43#include <asm/uaccess.h>
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 8b6d3939e1..c6247fc840 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1908,8 +1908,11 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
1908 sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max)); 1908 sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max));
1909 goto done; 1909 goto done;
1910 } 1910 }
1911 } else 1911 } else {
1912 newpsl = NULL; 1912 newpsl = NULL;
1913 (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
1914 msf->imsf_fmode, 0, NULL, 0);
1915 }
1913 psl = pmc->sflist; 1916 psl = pmc->sflist;
1914 if (psl) { 1917 if (psl) {
1915 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 1918 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 94468a76c5..3fe021f1a5 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -78,17 +78,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo,
78 int low = sysctl_local_port_range[0]; 78 int low = sysctl_local_port_range[0];
79 int high = sysctl_local_port_range[1]; 79 int high = sysctl_local_port_range[1];
80 int remaining = (high - low) + 1; 80 int remaining = (high - low) + 1;
81 int rover; 81 int rover = net_random() % (high - low) + low;
82 82
83 spin_lock(&hashinfo->portalloc_lock);
84 if (hashinfo->port_rover < low)
85 rover = low;
86 else
87 rover = hashinfo->port_rover;
88 do { 83 do {
89 rover++;
90 if (rover > high)
91 rover = low;
92 head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; 84 head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
93 spin_lock(&head->lock); 85 spin_lock(&head->lock);
94 inet_bind_bucket_for_each(tb, node, &head->chain) 86 inet_bind_bucket_for_each(tb, node, &head->chain)
@@ -97,9 +89,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo,
97 break; 89 break;
98 next: 90 next:
99 spin_unlock(&head->lock); 91 spin_unlock(&head->lock);
92 if (++rover > high)
93 rover = low;
100 } while (--remaining > 0); 94 } while (--remaining > 0);
101 hashinfo->port_rover = rover;
102 spin_unlock(&hashinfo->portalloc_lock);
103 95
104 /* Exhausted local port range during search? It is not 96 /* Exhausted local port range during search? It is not
105 * possible for us to be holding one of the bind hash 97 * possible for us to be holding one of the bind hash
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
index 926a668464..4108a5e12b 100644
--- a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
+++ b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
@@ -270,14 +270,10 @@ exp_gre(struct ip_conntrack *master,
270 exp_orig->expectfn = pptp_expectfn; 270 exp_orig->expectfn = pptp_expectfn;
271 exp_orig->flags = 0; 271 exp_orig->flags = 0;
272 272
273 exp_orig->dir = IP_CT_DIR_ORIGINAL;
274
275 /* both expectations are identical apart from tuple */ 273 /* both expectations are identical apart from tuple */
276 memcpy(exp_reply, exp_orig, sizeof(*exp_reply)); 274 memcpy(exp_reply, exp_orig, sizeof(*exp_reply));
277 memcpy(&exp_reply->tuple, &exp_tuples[1], sizeof(exp_reply->tuple)); 275 memcpy(&exp_reply->tuple, &exp_tuples[1], sizeof(exp_reply->tuple));
278 276
279 exp_reply->dir = !exp_orig->dir;
280
281 if (ip_nat_pptp_hook_exp_gre) 277 if (ip_nat_pptp_hook_exp_gre)
282 ret = ip_nat_pptp_hook_exp_gre(exp_orig, exp_reply); 278 ret = ip_nat_pptp_hook_exp_gre(exp_orig, exp_reply);
283 else { 279 else {
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
index 166e6069f1..82a65043a8 100644
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -815,7 +815,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
815 IPCTNL_MSG_CT_NEW, 1, ct); 815 IPCTNL_MSG_CT_NEW, 1, ct);
816 ip_conntrack_put(ct); 816 ip_conntrack_put(ct);
817 if (err <= 0) 817 if (err <= 0)
818 goto out; 818 goto free;
819 819
820 err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); 820 err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
821 if (err < 0) 821 if (err < 0)
@@ -824,9 +824,9 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
824 DEBUGP("leaving\n"); 824 DEBUGP("leaving\n");
825 return 0; 825 return 0;
826 826
827free:
828 kfree_skb(skb2);
827out: 829out:
828 if (skb2)
829 kfree_skb(skb2);
830 return -1; 830 return -1;
831} 831}
832 832
@@ -1322,21 +1322,16 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
1322 nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, 1322 nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
1323 1, exp); 1323 1, exp);
1324 if (err <= 0) 1324 if (err <= 0)
1325 goto out; 1325 goto free;
1326 1326
1327 ip_conntrack_expect_put(exp); 1327 ip_conntrack_expect_put(exp);
1328 1328
1329 err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); 1329 return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
1330 if (err < 0)
1331 goto free;
1332
1333 return err;
1334 1330
1331free:
1332 kfree_skb(skb2);
1335out: 1333out:
1336 ip_conntrack_expect_put(exp); 1334 ip_conntrack_expect_put(exp);
1337free:
1338 if (skb2)
1339 kfree_skb(skb2);
1340 return err; 1335 return err;
1341} 1336}
1342 1337
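
Both hunks above reorder the error labels so the free path falls through into the common exit, and drop the NULL check now that the label is reached only after a successful allocation. A compilable sketch of the pattern with illustrative names (struct buf, put_ref(); not the conntrack API):

#include <stdlib.h>

struct buf { char data[64]; };

static void put_ref(void) { /* e.g. ip_conntrack_expect_put() */ }

int build_and_send(int fill_fails)
{
    int err = -1;
    struct buf *skb2 = malloc(sizeof(*skb2));

    if (!skb2)
        goto out;           /* nothing allocated: skip the free */
    if (fill_fails)
        goto free;          /* allocated but unusable: free it */

    free(skb2);             /* stands in for the consuming send */
    put_ref();
    return 0;

free:
    free(skb2);             /* reached only when skb2 is valid */
out:                        /* "free:" falls through to here */
    put_ref();
    return err;
}
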
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index c5e3abd246..762f4d9393 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -66,10 +66,8 @@ ip_nat_proto_find_get(u_int8_t protonum)
66 * removed until we've grabbed the reference */ 66 * removed until we've grabbed the reference */
67 preempt_disable(); 67 preempt_disable();
68 p = __ip_nat_proto_find(protonum); 68 p = __ip_nat_proto_find(protonum);
69 if (p) { 69 if (!try_module_get(p->me))
70 if (!try_module_get(p->me)) 70 p = &ip_nat_unknown_protocol;
71 p = &ip_nat_unknown_protocol;
72 }
73 preempt_enable(); 71 preempt_enable();
74 72
75 return p; 73 return p;
diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c
index 3cdd0684d3..ee6ab74ad3 100644
--- a/net/ipv4/netfilter/ip_nat_helper_pptp.c
+++ b/net/ipv4/netfilter/ip_nat_helper_pptp.c
@@ -216,6 +216,7 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig,
216 expect_orig->saved_proto.gre.key = htons(nat_pptp_info->pac_call_id); 216 expect_orig->saved_proto.gre.key = htons(nat_pptp_info->pac_call_id);
217 expect_orig->tuple.src.u.gre.key = htons(nat_pptp_info->pns_call_id); 217 expect_orig->tuple.src.u.gre.key = htons(nat_pptp_info->pns_call_id);
218 expect_orig->tuple.dst.u.gre.key = htons(ct_pptp_info->pac_call_id); 218 expect_orig->tuple.dst.u.gre.key = htons(ct_pptp_info->pac_call_id);
219 expect_orig->dir = IP_CT_DIR_ORIGINAL;
219 inv_t.src.ip = reply_t->src.ip; 220 inv_t.src.ip = reply_t->src.ip;
220 inv_t.dst.ip = reply_t->dst.ip; 221 inv_t.dst.ip = reply_t->dst.ip;
221 inv_t.src.u.gre.key = htons(nat_pptp_info->pac_call_id); 222 inv_t.src.u.gre.key = htons(nat_pptp_info->pac_call_id);
@@ -233,6 +234,7 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig,
233 expect_reply->saved_proto.gre.key = htons(nat_pptp_info->pns_call_id); 234 expect_reply->saved_proto.gre.key = htons(nat_pptp_info->pns_call_id);
234 expect_reply->tuple.src.u.gre.key = htons(nat_pptp_info->pac_call_id); 235 expect_reply->tuple.src.u.gre.key = htons(nat_pptp_info->pac_call_id);
235 expect_reply->tuple.dst.u.gre.key = htons(ct_pptp_info->pns_call_id); 236 expect_reply->tuple.dst.u.gre.key = htons(ct_pptp_info->pns_call_id);
237 expect_reply->dir = IP_CT_DIR_REPLY;
236 inv_t.src.ip = orig_t->src.ip; 238 inv_t.src.ip = orig_t->src.ip;
237 inv_t.dst.ip = orig_t->dst.ip; 239 inv_t.dst.ip = orig_t->dst.ip;
238 inv_t.src.u.gre.key = htons(nat_pptp_info->pns_call_id); 240 inv_t.src.u.gre.key = htons(nat_pptp_info->pns_call_id);
diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c
index 7c12854016..f7cad7cf1a 100644
--- a/net/ipv4/netfilter/ip_nat_proto_gre.c
+++ b/net/ipv4/netfilter/ip_nat_proto_gre.c
@@ -139,8 +139,8 @@ gre_manip_pkt(struct sk_buff **pskb,
139 break; 139 break;
140 case GRE_VERSION_PPTP: 140 case GRE_VERSION_PPTP:
141 DEBUGP("call_id -> 0x%04x\n", 141 DEBUGP("call_id -> 0x%04x\n",
142 ntohl(tuple->dst.u.gre.key)); 142 ntohs(tuple->dst.u.gre.key));
143 pgreh->call_id = htons(ntohl(tuple->dst.u.gre.key)); 143 pgreh->call_id = tuple->dst.u.gre.key;
144 break; 144 break;
145 default: 145 default:
146 DEBUGP("can't nat unknown GRE version\n"); 146 DEBUGP("can't nat unknown GRE version\n");
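
The fix above matters because the tuple's GRE key is stored as a 16-bit value already in network byte order; the old htons(ntohl(...)) round trip swapped 32 bits of a 16-bit quantity. A standalone demonstration with an illustrative value (on a little-endian host the mangled result comes out as 0):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint16_t key = htons(0x1234);          /* 16-bit, network order */

    uint16_t fixed   = key;                /* new code: copy verbatim */
    uint16_t mangled = htons(ntohl(key));  /* old code: wrong width */

    printf("fixed 0x%04x, mangled 0x%04x\n",
           ntohs(fixed), ntohs(mangled));
    return 0;
}
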
diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c
index 99bbef56f8..f0099a646a 100644
--- a/net/ipv4/netfilter/ip_nat_proto_unknown.c
+++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c
@@ -62,7 +62,7 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range)
62 62
63struct ip_nat_protocol ip_nat_unknown_protocol = { 63struct ip_nat_protocol ip_nat_unknown_protocol = {
64 .name = "unknown", 64 .name = "unknown",
65 .me = THIS_MODULE, 65 /* .me isn't set: getting a ref to this cannot fail. */
66 .manip_pkt = unknown_manip_pkt, 66 .manip_pkt = unknown_manip_pkt,
67 .in_range = unknown_in_range, 67 .in_range = unknown_in_range,
68 .unique_tuple = unknown_unique_tuple, 68 .unique_tuple = unknown_unique_tuple,
diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c
index 1346380213..05d66ab594 100644
--- a/net/ipv4/netfilter/ipt_CONNMARK.c
+++ b/net/ipv4/netfilter/ipt_CONNMARK.c
@@ -109,6 +109,7 @@ static struct ipt_target ipt_connmark_reg = {
109 109
110static int __init init(void) 110static int __init init(void)
111{ 111{
112 need_ip_conntrack();
112 return ipt_register_target(&ipt_connmark_reg); 113 return ipt_register_target(&ipt_connmark_reg);
113} 114}
114 115
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f3f0013a95..72b7c22e1e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2112,7 +2112,6 @@ void __init tcp_init(void)
2112 sysctl_tcp_max_orphans >>= (3 - order); 2112 sysctl_tcp_max_orphans >>= (3 - order);
2113 sysctl_max_syn_backlog = 128; 2113 sysctl_max_syn_backlog = 128;
2114 } 2114 }
2115 tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1;
2116 2115
2117 sysctl_tcp_mem[0] = 768 << order; 2116 sysctl_tcp_mem[0] = 768 << order;
2118 sysctl_tcp_mem[1] = 1024 << order; 2117 sysctl_tcp_mem[1] = 1024 << order;
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 6d80e063c1..ae35e06090 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -27,7 +27,7 @@
27 */ 27 */
28 28
29static int fast_convergence = 1; 29static int fast_convergence = 1;
30static int max_increment = 32; 30static int max_increment = 16;
31static int low_window = 14; 31static int low_window = 14;
32static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ 32static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
33static int low_utilization_threshold = 153; 33static int low_utilization_threshold = 153;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c85819d847..49d67cd75e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -93,8 +93,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .lhash_lock = RW_LOCK_UNLOCKED, 93 .lhash_lock = RW_LOCK_UNLOCKED,
94 .lhash_users = ATOMIC_INIT(0), 94 .lhash_users = ATOMIC_INIT(0),
95 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), 95 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 .portalloc_lock = SPIN_LOCK_UNLOCKED,
97 .port_rover = 1024 - 1,
98}; 96};
99 97
100static int tcp_v4_get_port(struct sock *sk, unsigned short snum) 98static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 41edc14851..2c5f57299d 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2163,7 +2163,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
2163 2163
2164 /* Step 5: netlink notification of this interface */ 2164 /* Step 5: netlink notification of this interface */
2165 idev->tstamp = jiffies; 2165 idev->tstamp = jiffies;
2166 inet6_ifinfo_notify(RTM_NEWLINK, idev); 2166 inet6_ifinfo_notify(RTM_DELLINK, idev);
2167 2167
2168 /* Shot the device (if unregistered) */ 2168 /* Shot the device (if unregistered) */
2169 2169
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 966b2372aa..f15e04ad02 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -545,8 +545,10 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
545 sock_kfree_s(sk, newpsl, IP6_SFLSIZE(newpsl->sl_max)); 545 sock_kfree_s(sk, newpsl, IP6_SFLSIZE(newpsl->sl_max));
546 goto done; 546 goto done;
547 } 547 }
548 } else 548 } else {
549 newpsl = NULL; 549 newpsl = NULL;
550 (void) ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0);
551 }
550 psl = pmc->sflist; 552 psl = pmc->sflist;
551 if (psl) { 553 if (psl) {
552 (void) ip6_mc_del_src(idev, group, pmc->sfmode, 554 (void) ip6_mc_del_src(idev, group, pmc->sfmode,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d693cb988b..d746d3b27e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -114,16 +114,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
114 int low = sysctl_local_port_range[0]; 114 int low = sysctl_local_port_range[0];
115 int high = sysctl_local_port_range[1]; 115 int high = sysctl_local_port_range[1];
116 int remaining = (high - low) + 1; 116 int remaining = (high - low) + 1;
117 int rover; 117 int rover = net_random() % (high - low) + low;
118 118
119 spin_lock(&tcp_hashinfo.portalloc_lock); 119 do {
120 if (tcp_hashinfo.port_rover < low)
121 rover = low;
122 else
123 rover = tcp_hashinfo.port_rover;
124 do { rover++;
125 if (rover > high)
126 rover = low;
127 head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)]; 120 head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
128 spin_lock(&head->lock); 121 spin_lock(&head->lock);
129 inet_bind_bucket_for_each(tb, node, &head->chain) 122 inet_bind_bucket_for_each(tb, node, &head->chain)
@@ -132,9 +125,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
132 break; 125 break;
133 next: 126 next:
134 spin_unlock(&head->lock); 127 spin_unlock(&head->lock);
128 if (++rover > high)
129 rover = low;
135 } while (--remaining > 0); 130 } while (--remaining > 0);
136 tcp_hashinfo.port_rover = rover;
137 spin_unlock(&tcp_hashinfo.portalloc_lock);
138 131
139 /* Exhausted local port range during search? It is not 132 /* Exhausted local port range during search? It is not
140 * possible for us to be holding one of the bind hash 133 * possible for us to be holding one of the bind hash
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index d10d552d9c..d3a4f30a7f 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -117,7 +117,7 @@ int nf_queue(struct sk_buff **skb,
117 117
118 /* QUEUE == DROP if noone is waiting, to be safe. */ 118 /* QUEUE == DROP if noone is waiting, to be safe. */
119 read_lock(&queue_handler_lock); 119 read_lock(&queue_handler_lock);
120 if (!queue_handler[pf]->outfn) { 120 if (!queue_handler[pf] || !queue_handler[pf]->outfn) {
121 read_unlock(&queue_handler_lock); 121 read_unlock(&queue_handler_lock);
122 kfree_skb(*skb); 122 kfree_skb(*skb);
123 return 1; 123 return 1;
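
A minimal sketch of the guard added above, with hypothetical names: && short-circuits, so a NULL table slot is never dereferenced when no handler is registered for the protocol family.

struct handler {
    int (*outfn)(void);
};

static struct handler *handlers[32];    /* illustrative handler table */

static int can_enqueue(int pf)
{
    /* before the fix, handlers[pf]->outfn oopsed on a NULL slot */
    return handlers[pf] && handlers[pf]->outfn;
}
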
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index efcd10f996..d194676f36 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -146,11 +146,10 @@ instance_create(u_int16_t group_num, int pid)
146 goto out_unlock; 146 goto out_unlock;
147 } 147 }
148 148
149 inst = kmalloc(sizeof(*inst), GFP_ATOMIC); 149 inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
150 if (!inst) 150 if (!inst)
151 goto out_unlock; 151 goto out_unlock;
152 152
153 memset(inst, 0, sizeof(*inst));
154 INIT_HLIST_NODE(&inst->hlist); 153 INIT_HLIST_NODE(&inst->hlist);
155 inst->lock = SPIN_LOCK_UNLOCKED; 154 inst->lock = SPIN_LOCK_UNLOCKED;
156 /* needs to be two, since we _put() after creation */ 155 /* needs to be two, since we _put() after creation */
@@ -962,10 +961,9 @@ static int nful_open(struct inode *inode, struct file *file)
962 struct iter_state *is; 961 struct iter_state *is;
963 int ret; 962 int ret;
964 963
965 is = kmalloc(sizeof(*is), GFP_KERNEL); 964 is = kzalloc(sizeof(*is), GFP_KERNEL);
966 if (!is) 965 if (!is)
967 return -ENOMEM; 966 return -ENOMEM;
968 memset(is, 0, sizeof(*is));
969 ret = seq_open(file, &nful_seq_ops); 967 ret = seq_open(file, &nful_seq_ops);
970 if (ret < 0) 968 if (ret < 0)
971 goto out_free; 969 goto out_free;
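
kzalloc() is the kernel's zeroing allocator, so each kmalloc() + memset() pair above collapses into one call with the same GFP flags. A userspace analogue of the before/after shapes, with calloc() playing the kzalloc() role:

#include <stdlib.h>
#include <string.h>

struct instance { int id; char name[32]; };

/* before: allocate, then clear by hand */
static struct instance *alloc_then_clear(void)
{
    struct instance *inst = malloc(sizeof(*inst));

    if (inst)
        memset(inst, 0, sizeof(*inst));
    return inst;
}

/* after: a single zeroing allocation, like kzalloc() */
static struct instance *alloc_zeroed(void)
{
    return calloc(1, sizeof(struct instance));
}
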
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index eaa44c4956..f065a6c949 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -136,11 +136,10 @@ instance_create(u_int16_t queue_num, int pid)
136 goto out_unlock; 136 goto out_unlock;
137 } 137 }
138 138
139 inst = kmalloc(sizeof(*inst), GFP_ATOMIC); 139 inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
140 if (!inst) 140 if (!inst)
141 goto out_unlock; 141 goto out_unlock;
142 142
143 memset(inst, 0, sizeof(*inst));
144 inst->queue_num = queue_num; 143 inst->queue_num = queue_num;
145 inst->peer_pid = pid; 144 inst->peer_pid = pid;
146 inst->queue_maxlen = NFQNL_QMAX_DEFAULT; 145 inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
@@ -1036,10 +1035,9 @@ static int nfqnl_open(struct inode *inode, struct file *file)
1036 struct iter_state *is; 1035 struct iter_state *is;
1037 int ret; 1036 int ret;
1038 1037
1039 is = kmalloc(sizeof(*is), GFP_KERNEL); 1038 is = kzalloc(sizeof(*is), GFP_KERNEL);
1040 if (!is) 1039 if (!is)
1041 return -ENOMEM; 1040 return -ENOMEM;
1042 memset(is, 0, sizeof(*is));
1043 ret = seq_open(file, &nfqnl_seq_ops); 1041 ret = seq_open(file, &nfqnl_seq_ops);
1044 if (ret < 0) 1042 if (ret < 0)
1045 goto out_free; 1043 goto out_free;
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 81510da317..7f34e7fd76 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -2,13 +2,15 @@
2# Traffic control configuration. 2# Traffic control configuration.
3# 3#
4 4
5menuconfig NET_SCHED 5menu "QoS and/or fair queueing"
6
7config NET_SCHED
6 bool "QoS and/or fair queueing" 8 bool "QoS and/or fair queueing"
7 ---help--- 9 ---help---
8 When the kernel has several packets to send out over a network 10 When the kernel has several packets to send out over a network
9 device, it has to decide which ones to send first, which ones to 11 device, it has to decide which ones to send first, which ones to
10 delay, and which ones to drop. This is the job of the packet 12 delay, and which ones to drop. This is the job of the queueing
11 scheduler, and several different algorithms for how to do this 13 disciplines; several different algorithms for how to do this
12 "fairly" have been proposed. 14 "fairly" have been proposed.
13 15
14 If you say N here, you will get the standard packet scheduler, which 16 If you say N here, you will get the standard packet scheduler, which
@@ -23,13 +25,13 @@ menuconfig NET_SCHED
23 To administer these schedulers, you'll need the user-level utilities 25 To administer these schedulers, you'll need the user-level utilities
24 from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>. 26 from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>.
25 That package also contains some documentation; for more, check out 27 That package also contains some documentation; for more, check out
26 <http://snafu.freedom.org/linux2.2/iproute-notes.html>. 28 <http://linux-net.osdl.org/index.php/Iproute2>.
27 29
28 This Quality of Service (QoS) support will enable you to use 30 This Quality of Service (QoS) support will enable you to use
29 Differentiated Services (diffserv) and Resource Reservation Protocol 31 Differentiated Services (diffserv) and Resource Reservation Protocol
30 (RSVP) on your Linux router if you also say Y to "QoS support", 32 (RSVP) on your Linux router if you also say Y to the corresponding
31 "Packet classifier API" and to some classifiers below. Documentation 33 classifiers below. Documentation and software is at
32 and software is at <http://diffserv.sourceforge.net/>. 34 <http://diffserv.sourceforge.net/>.
33 35
34 If you say Y here and to "/proc file system" below, you will be able 36 If you say Y here and to "/proc file system" below, you will be able
35 to read status information about packet schedulers from the file 37 to read status information about packet schedulers from the file
@@ -42,7 +44,7 @@ choice
42 prompt "Packet scheduler clock source" 44 prompt "Packet scheduler clock source"
43 depends on NET_SCHED 45 depends on NET_SCHED
44 default NET_SCH_CLK_JIFFIES 46 default NET_SCH_CLK_JIFFIES
45 help 47 ---help---
46 Packet schedulers need a monotonic clock that increments at a static 48 Packet schedulers need a monotonic clock that increments at a static
47 rate. The kernel provides several suitable interfaces, each with 49 rate. The kernel provides several suitable interfaces, each with
48 different properties: 50 different properties:
@@ -56,7 +58,7 @@ choice
56 58
57config NET_SCH_CLK_JIFFIES 59config NET_SCH_CLK_JIFFIES
58 bool "Timer interrupt" 60 bool "Timer interrupt"
59 help 61 ---help---
60 Say Y here if you want to use the timer interrupt (jiffies) as clock 62 Say Y here if you want to use the timer interrupt (jiffies) as clock
61 source. This clock source is fast, synchronized on all processors and 63 source. This clock source is fast, synchronized on all processors and
62 handles cpu clock frequency changes, but its resolution is too low 64 handles cpu clock frequency changes, but its resolution is too low
@@ -64,7 +66,7 @@ config NET_SCH_CLK_JIFFIES
64 66
65config NET_SCH_CLK_GETTIMEOFDAY 67config NET_SCH_CLK_GETTIMEOFDAY
66 bool "gettimeofday" 68 bool "gettimeofday"
67 help 69 ---help---
68 Say Y here if you want to use gettimeofday as clock source. This clock 70 Say Y here if you want to use gettimeofday as clock source. This clock
69 source has high resolution, is synchronized on all processors and 71 source has high resolution, is synchronized on all processors and
70 handles cpu clock frequency changes, but it is slow. 72 handles cpu clock frequency changes, but it is slow.
@@ -77,7 +79,7 @@ config NET_SCH_CLK_GETTIMEOFDAY
77config NET_SCH_CLK_CPU 79config NET_SCH_CLK_CPU
78 bool "CPU cycle counter" 80 bool "CPU cycle counter"
79 depends on ((X86_TSC || X86_64) && !SMP) || ALPHA || SPARC64 || PPC64 || IA64 81 depends on ((X86_TSC || X86_64) && !SMP) || ALPHA || SPARC64 || PPC64 || IA64
80 help 82 ---help---
81 Say Y here if you want to use the CPU's cycle counter as clock source. 83 Say Y here if you want to use the CPU's cycle counter as clock source.
82 This is a cheap and high resolution clock source, but on some 84 This is a cheap and high resolution clock source, but on some
83 architectures it is not synchronized on all processors and doesn't 85 architectures it is not synchronized on all processors and doesn't
@@ -95,134 +97,129 @@ config NET_SCH_CLK_CPU
95 97
96endchoice 98endchoice
97 99
100comment "Queueing/Scheduling"
101 depends on NET_SCHED
102
98config NET_SCH_CBQ 103config NET_SCH_CBQ
99 tristate "CBQ packet scheduler" 104 tristate "Class Based Queueing (CBQ)"
100 depends on NET_SCHED 105 depends on NET_SCHED
101 ---help--- 106 ---help---
102 Say Y here if you want to use the Class-Based Queueing (CBQ) packet 107 Say Y here if you want to use the Class-Based Queueing (CBQ) packet
103 scheduling algorithm for some of your network devices. This 108 scheduling algorithm. This algorithm classifies the waiting packets
104 algorithm classifies the waiting packets into a tree-like hierarchy 109 into a tree-like hierarchy of classes; the leaves of this tree are
105 of classes; the leaves of this tree are in turn scheduled by 110 in turn scheduled by separate algorithms.
106 separate algorithms (called "disciplines" in this context).
107 111
108 See the top of <file:net/sched/sch_cbq.c> for references about the 112 See the top of <file:net/sched/sch_cbq.c> for more details.
109 CBQ algorithm.
110 113
111 CBQ is a commonly used scheduler, so if you're unsure, you should 114 CBQ is a commonly used scheduler, so if you're unsure, you should
112 say Y here. Then say Y to all the queueing algorithms below that you 115 say Y here. Then say Y to all the queueing algorithms below that you
113 want to use as CBQ disciplines. Then say Y to "Packet classifier 116 want to use as leaf disciplines.
114 API" and say Y to all the classifiers you want to use; a classifier
115 is a routine that allows you to sort your outgoing traffic into
116 classes based on a certain criterion.
117 117
118 To compile this code as a module, choose M here: the 118 To compile this code as a module, choose M here: the
119 module will be called sch_cbq. 119 module will be called sch_cbq.
120 120
121config NET_SCH_HTB 121config NET_SCH_HTB
122 tristate "HTB packet scheduler" 122 tristate "Hierarchical Token Bucket (HTB)"
123 depends on NET_SCHED 123 depends on NET_SCHED
124 ---help--- 124 ---help---
125 Say Y here if you want to use the Hierarchical Token Buckets (HTB) 125 Say Y here if you want to use the Hierarchical Token Buckets (HTB)
126 packet scheduling algorithm for some of your network devices. See 126 packet scheduling algorithm. See
127 <http://luxik.cdi.cz/~devik/qos/htb/> for complete manual and 127 <http://luxik.cdi.cz/~devik/qos/htb/> for complete manual and
128 in-depth articles. 128 in-depth articles.
129 129
130 HTB is very similar to the CBQ regarding its goals however is has 130 HTB is very similar to CBQ regarding its goals, however it has
131 different properties and different algorithm. 131 different properties and a different algorithm.
132 132
133 To compile this code as a module, choose M here: the 133 To compile this code as a module, choose M here: the
134 module will be called sch_htb. 134 module will be called sch_htb.
135 135
136config NET_SCH_HFSC 136config NET_SCH_HFSC
137 tristate "HFSC packet scheduler" 137 tristate "Hierarchical Fair Service Curve (HFSC)"
138 depends on NET_SCHED 138 depends on NET_SCHED
139 ---help--- 139 ---help---
140 Say Y here if you want to use the Hierarchical Fair Service Curve 140 Say Y here if you want to use the Hierarchical Fair Service Curve
141 (HFSC) packet scheduling algorithm for some of your network devices. 141 (HFSC) packet scheduling algorithm.
142 142
143 To compile this code as a module, choose M here: the 143 To compile this code as a module, choose M here: the
144 module will be called sch_hfsc. 144 module will be called sch_hfsc.
145 145
146#tristate ' H-PFQ packet scheduler' CONFIG_NET_SCH_HPFQ
147config NET_SCH_ATM 146config NET_SCH_ATM
148 tristate "ATM pseudo-scheduler" 147 tristate "ATM Virtual Circuits (ATM)"
149 depends on NET_SCHED && ATM 148 depends on NET_SCHED && ATM
150 ---help--- 149 ---help---
151 Say Y here if you want to use the ATM pseudo-scheduler. This 150 Say Y here if you want to use the ATM pseudo-scheduler. This
152 provides a framework for invoking classifiers (aka "filters"), which 151 provides a framework for invoking classifiers, which in turn
153 in turn select classes of this queuing discipline. Each class maps 152 select classes of this queuing discipline. Each class maps
154 the flow(s) it is handling to a given virtual circuit (see the top of 153 the flow(s) it is handling to a given virtual circuit.
155 <file:net/sched/sch_atm.c>). 154
155 See the top of <file:net/sched/sch_atm.c>) for more details.
156 156
157 To compile this code as a module, choose M here: the 157 To compile this code as a module, choose M here: the
158 module will be called sch_atm. 158 module will be called sch_atm.
159 159
160config NET_SCH_PRIO 160config NET_SCH_PRIO
161 tristate "The simplest PRIO pseudoscheduler" 161 tristate "Multi Band Priority Queueing (PRIO)"
162 depends on NET_SCHED 162 depends on NET_SCHED
163 help 163 ---help---
164 Say Y here if you want to use an n-band priority queue packet 164 Say Y here if you want to use an n-band priority queue packet
165 "scheduler" for some of your network devices or as a leaf discipline 165 scheduler.
166 for the CBQ scheduling algorithm. If unsure, say Y.
167 166
168 To compile this code as a module, choose M here: the 167 To compile this code as a module, choose M here: the
169 module will be called sch_prio. 168 module will be called sch_prio.
170 169
171config NET_SCH_RED 170config NET_SCH_RED
172 tristate "RED queue" 171 tristate "Random Early Detection (RED)"
173 depends on NET_SCHED 172 depends on NET_SCHED
174 help 173 ---help---
175 Say Y here if you want to use the Random Early Detection (RED) 174 Say Y here if you want to use the Random Early Detection (RED)
176 packet scheduling algorithm for some of your network devices (see 175 packet scheduling algorithm.
177 the top of <file:net/sched/sch_red.c> for details and references 176
178 about the algorithm). 177 See the top of <file:net/sched/sch_red.c> for more details.
179 178
180 To compile this code as a module, choose M here: the 179 To compile this code as a module, choose M here: the
181 module will be called sch_red. 180 module will be called sch_red.
182 181
183config NET_SCH_SFQ 182config NET_SCH_SFQ
184 tristate "SFQ queue" 183 tristate "Stochastic Fairness Queueing (SFQ)"
185 depends on NET_SCHED 184 depends on NET_SCHED
186 ---help--- 185 ---help---
187 Say Y here if you want to use the Stochastic Fairness Queueing (SFQ) 186 Say Y here if you want to use the Stochastic Fairness Queueing (SFQ)
188 packet scheduling algorithm for some of your network devices or as a 187 packet scheduling algorithm.
189 leaf discipline for the CBQ scheduling algorithm (see the top of 188
190 <file:net/sched/sch_sfq.c> for details and references about the SFQ 189 See the top of <file:net/sched/sch_sfq.c> for more details.
191 algorithm).
192 190
193 To compile this code as a module, choose M here: the 191 To compile this code as a module, choose M here: the
194 module will be called sch_sfq. 192 module will be called sch_sfq.
195 193
196config NET_SCH_TEQL 194config NET_SCH_TEQL
197 tristate "TEQL queue" 195 tristate "True Link Equalizer (TEQL)"
198 depends on NET_SCHED 196 depends on NET_SCHED
199 ---help--- 197 ---help---
200 Say Y here if you want to use the True Link Equalizer (TLE) packet 198 Say Y here if you want to use the True Link Equalizer (TLE) packet
201 scheduling algorithm for some of your network devices or as a leaf 199 scheduling algorithm. This queueing discipline allows the combination
202 discipline for the CBQ scheduling algorithm. This queueing 200 of several physical devices into one virtual device.
203 discipline allows the combination of several physical devices into 201
204 one virtual device. (see the top of <file:net/sched/sch_teql.c> for 202 See the top of <file:net/sched/sch_teql.c> for more details.
205 details).
206 203
207 To compile this code as a module, choose M here: the 204 To compile this code as a module, choose M here: the
208 module will be called sch_teql. 205 module will be called sch_teql.
209 206
210config NET_SCH_TBF 207config NET_SCH_TBF
211 tristate "TBF queue" 208 tristate "Token Bucket Filter (TBF)"
212 depends on NET_SCHED 209 depends on NET_SCHED
213 help 210 ---help---
214 Say Y here if you want to use the Simple Token Bucket Filter (TBF) 211 Say Y here if you want to use the Token Bucket Filter (TBF) packet
215 packet scheduling algorithm for some of your network devices or as a 212 scheduling algorithm.
216 leaf discipline for the CBQ scheduling algorithm (see the top of 213
217 <file:net/sched/sch_tbf.c> for a description of the TBF algorithm). 214 See the top of <file:net/sched/sch_tbf.c> for more details.
218 215
219 To compile this code as a module, choose M here: the 216 To compile this code as a module, choose M here: the
220 module will be called sch_tbf. 217 module will be called sch_tbf.
221 218
222config NET_SCH_GRED 219config NET_SCH_GRED
223 tristate "GRED queue" 220 tristate "Generic Random Early Detection (GRED)"
224 depends on NET_SCHED 221 depends on NET_SCHED
225 help 222 ---help---
226 Say Y here if you want to use the Generic Random Early Detection 223 Say Y here if you want to use the Generic Random Early Detection
227 (GRED) packet scheduling algorithm for some of your network devices 224 (GRED) packet scheduling algorithm for some of your network devices
228 (see the top of <file:net/sched/sch_red.c> for details and 225 (see the top of <file:net/sched/sch_red.c> for details and
@@ -232,9 +229,9 @@ config NET_SCH_GRED
232 module will be called sch_gred. 229 module will be called sch_gred.
233 230
234config NET_SCH_DSMARK 231config NET_SCH_DSMARK
235 tristate "Diffserv field marker" 232 tristate "Differentiated Services marker (DSMARK)"
236 depends on NET_SCHED 233 depends on NET_SCHED
237 help 234 ---help---
238 Say Y if you want to schedule packets according to the 235 Say Y if you want to schedule packets according to the
239 Differentiated Services architecture proposed in RFC 2475. 236 Differentiated Services architecture proposed in RFC 2475.
240 Technical information on this method, with pointers to associated 237 Technical information on this method, with pointers to associated
@@ -244,9 +241,9 @@ config NET_SCH_DSMARK
244 module will be called sch_dsmark. 241 module will be called sch_dsmark.
245 242
246config NET_SCH_NETEM 243config NET_SCH_NETEM
247 tristate "Network emulator" 244 tristate "Network emulator (NETEM)"
248 depends on NET_SCHED 245 depends on NET_SCHED
249 help 246 ---help---
250 Say Y if you want to emulate network delay, loss, and packet 247 Say Y if you want to emulate network delay, loss, and packet
251 re-ordering. This is often useful to simulate networks when 248 re-ordering. This is often useful to simulate networks when
252 testing applications or protocols. 249 testing applications or protocols.
@@ -259,58 +256,23 @@ config NET_SCH_NETEM
259config NET_SCH_INGRESS 256config NET_SCH_INGRESS
260 tristate "Ingress Qdisc" 257 tristate "Ingress Qdisc"
261 depends on NET_SCHED 258 depends on NET_SCHED
262 help 259 ---help---
263 If you say Y here, you will be able to police incoming bandwidth 260 Say Y here if you want to use classifiers for incoming packets.
264 and drop packets when this bandwidth exceeds your desired rate.
265 If unsure, say Y. 261 If unsure, say Y.
266 262
267 To compile this code as a module, choose M here: the 263 To compile this code as a module, choose M here: the
268 module will be called sch_ingress. 264 module will be called sch_ingress.
269 265
270config NET_QOS 266comment "Classification"
271 bool "QoS support"
272 depends on NET_SCHED 267 depends on NET_SCHED
273 ---help---
274 Say Y here if you want to include Quality Of Service scheduling
275 features, which means that you will be able to request certain
276 rate-of-flow limits for your network devices.
277
278 This Quality of Service (QoS) support will enable you to use
279 Differentiated Services (diffserv) and Resource Reservation Protocol
280 (RSVP) on your Linux router if you also say Y to "Packet classifier
281 API" and to some classifiers below. Documentation and software is at
282 <http://diffserv.sourceforge.net/>.
283
284 Note that the answer to this question won't directly affect the
285 kernel: saying N will just cause the configurator to skip all
286 the questions about QoS support.
287
288config NET_ESTIMATOR
289 bool "Rate estimator"
290 depends on NET_QOS
291 help
292 In order for Quality of Service scheduling to work, the current
293 rate-of-flow for a network device has to be estimated; if you say Y
294 here, the kernel will do just that.
295 268
296config NET_CLS 269config NET_CLS
297 bool "Packet classifier API" 270 boolean
298 depends on NET_SCHED
299 ---help---
300 The CBQ scheduling algorithm requires that network packets which are
301 scheduled to be sent out over a network device be classified
302 according to some criterion. If you say Y here, you will get a
303 choice of several different packet classifiers with the following
304 questions.
305
306 This will enable you to use Differentiated Services (diffserv) and
307 Resource Reservation Protocol (RSVP) on your Linux router.
308 Documentation and software is at
309 <http://diffserv.sourceforge.net/>.
310 271
311config NET_CLS_BASIC 272config NET_CLS_BASIC
312 tristate "Basic classifier" 273 tristate "Elementary classification (BASIC)"
313 depends on NET_CLS 274 depends NET_SCHED
275 select NET_CLS
314 ---help--- 276 ---help---
315 Say Y here if you want to be able to classify packets using 277 Say Y here if you want to be able to classify packets using
316 only extended matches and actions. 278 only extended matches and actions.
@@ -319,24 +281,25 @@ config NET_CLS_BASIC
319 module will be called cls_basic. 281 module will be called cls_basic.
320 282
321config NET_CLS_TCINDEX 283config NET_CLS_TCINDEX
322 tristate "TC index classifier" 284 tristate "Traffic-Control Index (TCINDEX)"
323 depends on NET_CLS 285 depends NET_SCHED
324 help 286 select NET_CLS
325 If you say Y here, you will be able to classify outgoing packets 287 ---help---
326 according to the tc_index field of the skb. You will want this 288 Say Y here if you want to be able to classify packets based on
327 feature if you want to implement Differentiated Services using 289 traffic control indices. You will want this feature if you want
328 sch_dsmark. If unsure, say Y. 290 to implement Differentiated Services together with DSMARK.
329 291
330 To compile this code as a module, choose M here: the 292 To compile this code as a module, choose M here: the
331 module will be called cls_tcindex. 293 module will be called cls_tcindex.
332 294
333config NET_CLS_ROUTE4 295config NET_CLS_ROUTE4
334 tristate "Routing table based classifier" 296 tristate "Routing decision (ROUTE)"
335 depends on NET_CLS 297 depends NET_SCHED
336 select NET_CLS_ROUTE 298 select NET_CLS_ROUTE
337 help 299 select NET_CLS
338 If you say Y here, you will be able to classify outgoing packets 300 ---help---
339 according to the route table entry they matched. If unsure, say Y. 301 If you say Y here, you will be able to classify packets
302 according to the route table entry they matched.
340 303
341 To compile this code as a module, choose M here: the 304 To compile this code as a module, choose M here: the
342 module will be called cls_route. 305 module will be called cls_route.
@@ -346,58 +309,45 @@ config NET_CLS_ROUTE
346 default n 309 default n
347 310
348config NET_CLS_FW 311config NET_CLS_FW
349 tristate "Firewall based classifier" 312 tristate "Netfilter mark (FW)"
350 depends on NET_CLS 313 depends NET_SCHED
351 help 314 select NET_CLS
352 If you say Y here, you will be able to classify outgoing packets 315 ---help---
353 according to firewall criteria you specified. 316 If you say Y here, you will be able to classify packets
317 according to netfilter/firewall marks.
354 318
355 To compile this code as a module, choose M here: the 319 To compile this code as a module, choose M here: the
356 module will be called cls_fw. 320 module will be called cls_fw.
357 321
358config NET_CLS_U32 322config NET_CLS_U32
359 tristate "U32 classifier" 323 tristate "Universal 32bit comparisons w/ hashing (U32)"
360 depends on NET_CLS 324 depends NET_SCHED
361 help 325 select NET_CLS
362 If you say Y here, you will be able to classify outgoing packets 326 ---help---
363 according to their destination address. If unsure, say Y. 327 Say Y here to be able to classify packets using a universal
328 32bit piece-based comparison scheme.
364 329
365 To compile this code as a module, choose M here: the 330 To compile this code as a module, choose M here: the
366 module will be called cls_u32. 331 module will be called cls_u32.
367 332
368config CLS_U32_PERF 333config CLS_U32_PERF
369 bool "U32 classifier performance counters" 334 bool "Performance counters support"
370 depends on NET_CLS_U32 335 depends on NET_CLS_U32
371 help 336 ---help---
372 gathers stats that could be used to tune u32 classifier performance. 337 Say Y here to make u32 gather additional statistics useful for
373 Requires a new iproute2 338 fine tuning u32 classifiers.
374 You MUST NOT turn this on if you dont have an update iproute2.
375
376config NET_CLS_IND
377 bool "classify input device (slows things u32/fw) "
378 depends on NET_CLS_U32 || NET_CLS_FW
379 help
380 This option will be killed eventually when a
381 metadata action appears because it slows things a little
382 Available only for u32 and fw classifiers.
383 Requires a new iproute2
384 You MUST NOT turn this on if you dont have an update iproute2.
385 339
386config CLS_U32_MARK 340config CLS_U32_MARK
387 bool "Use nfmark as a key in U32 classifier" 341 bool "Netfilter marks support"
388 depends on NET_CLS_U32 && NETFILTER 342 depends on NET_CLS_U32 && NETFILTER
389 help 343 ---help---
390 This allows you to match mark in a u32 filter. 344 Say Y here to be able to use netfilter marks as u32 key.
391 Example:
392 tc filter add dev eth0 protocol ip parent 1:0 prio 5 u32 \
393 match mark 0x0090 0xffff \
394 match ip dst 4.4.4.4 \
395 flowid 1:90
396 You must use a new iproute2 to use this feature.
397 345
398config NET_CLS_RSVP 346config NET_CLS_RSVP
399 tristate "Special RSVP classifier" 347 tristate "IPv4 Resource Reservation Protocol (RSVP)"
400 depends on NET_CLS && NET_QOS 348 depends on NET_SCHED
349 select NET_CLS
350 select NET_ESTIMATOR
401 ---help--- 351 ---help---
402 The Resource Reservation Protocol (RSVP) permits end systems to 352 The Resource Reservation Protocol (RSVP) permits end systems to
403 request a minimum and maximum data flow rate for a connection; this 353 request a minimum and maximum data flow rate for a connection; this
@@ -410,31 +360,33 @@ config NET_CLS_RSVP
410 module will be called cls_rsvp. 360 module will be called cls_rsvp.
411 361
412config NET_CLS_RSVP6 362config NET_CLS_RSVP6
413 tristate "Special RSVP classifier for IPv6" 363 tristate "IPv6 Resource Reservation Protocol (RSVP6)"
414 depends on NET_CLS && NET_QOS 364 depends on NET_SCHED
365 select NET_CLS
366 select NET_ESTIMATOR
415 ---help--- 367 ---help---
416 The Resource Reservation Protocol (RSVP) permits end systems to 368 The Resource Reservation Protocol (RSVP) permits end systems to
417 request a minimum and maximum data flow rate for a connection; this 369 request a minimum and maximum data flow rate for a connection; this
418 is important for real time data such as streaming sound or video. 370 is important for real time data such as streaming sound or video.
419 371
420 Say Y here if you want to be able to classify outgoing packets based 372 Say Y here if you want to be able to classify outgoing packets based
421 on their RSVP requests and you are using the new Internet Protocol 373 on their RSVP requests and you are using IPv6.
422 IPv6 as opposed to the older and more common IPv4.
423 374
424 To compile this code as a module, choose M here: the 375 To compile this code as a module, choose M here: the
425 module will be called cls_rsvp6. 376 module will be called cls_rsvp6.
426 377
427config NET_EMATCH 378config NET_EMATCH
428 bool "Extended Matches" 379 bool "Extended Matches"
429 depends on NET_CLS 380 depends NET_SCHED
381 select NET_CLS
430 ---help--- 382 ---help---
431 Say Y here if you want to use extended matches on top of classifiers 383 Say Y here if you want to use extended matches on top of classifiers
432 and select the extended matches below. 384 and select the extended matches below.
433 385
434 Extended matches are small classification helpers not worth writing 386 Extended matches are small classification helpers not worth writing
435 a separate classifier. 387 a separate classifier for.
436 388
437 You must have a recent version of the iproute2 tools in order to use 389 A recent version of the iproute2 package is required to use
438 extended matches. 390 extended matches.
439 391
440config NET_EMATCH_STACK 392config NET_EMATCH_STACK
@@ -468,7 +420,7 @@ config NET_EMATCH_NBYTE
468 module will be called em_nbyte. 420 module will be called em_nbyte.
469 421
470config NET_EMATCH_U32 422config NET_EMATCH_U32
471 tristate "U32 hashing key" 423 tristate "U32 key"
472 depends on NET_EMATCH 424 depends on NET_EMATCH
473 ---help--- 425 ---help---
474 Say Y here if you want to be able to classify packets using 426 Say Y here if you want to be able to classify packets using
@@ -496,76 +448,120 @@ config NET_EMATCH_TEXT
496 select TEXTSEARCH_BM 448 select TEXTSEARCH_BM
497 select TEXTSEARCH_FSM 449 select TEXTSEARCH_FSM
498 ---help--- 450 ---help---
499 Say Y here if you want to be ablt to classify packets based on 451 Say Y here if you want to be able to classify packets based on
500 textsearch comparisons. 452 textsearch comparisons.
501 453
502 To compile this code as a module, choose M here: the 454 To compile this code as a module, choose M here: the
503 module will be called em_text. 455 module will be called em_text.
504 456
505config NET_CLS_ACT 457config NET_CLS_ACT
506 bool "Packet ACTION" 458 bool "Actions"
507 depends on EXPERIMENTAL && NET_CLS && NET_QOS 459 depends on EXPERIMENTAL && NET_SCHED
460 select NET_ESTIMATOR
508 ---help--- 461 ---help---
509 This option requires you have a new iproute2. It enables 462 Say Y here if you want to use traffic control actions. Actions
510 tc extensions which can be used with tc classifiers. 463 get attached to classifiers and are invoked after a successful
511 You MUST NOT turn this on if you dont have an update iproute2. 464 classification. They are used to overwrite the classification
465 result, instantly drop or redirect packets, etc.
466
467 A recent version of the iproute2 package is required to use
468 actions.
512 469
513config NET_ACT_POLICE 470config NET_ACT_POLICE
514 tristate "Policing Actions" 471 tristate "Traffic Policing"
515 depends on NET_CLS_ACT 472 depends on NET_CLS_ACT
516 ---help--- 473 ---help---
517 If you are using a newer iproute2 select this one, otherwise use one 474 Say Y here if you want to do traffic policing, i.e. strict
518 below to select a policer. 475 bandwidth limiting. This action replaces the existing policing
519 You MUST NOT turn this on if you dont have an update iproute2. 476 module.
477
478 To compile this code as a module, choose M here: the
479 module will be called police.
520 480
521config NET_ACT_GACT 481config NET_ACT_GACT
522 tristate "generic Actions" 482 tristate "Generic actions"
523 depends on NET_CLS_ACT 483 depends on NET_CLS_ACT
524 ---help--- 484 ---help---
525 You must have new iproute2 to use this feature. 485 Say Y here to take generic actions such as dropping and
526 This adds simple filtering actions like drop, accept etc. 486 accepting packets.
487
488 To compile this code as a module, choose M here: the
489 module will be called gact.
527 490
528config GACT_PROB 491config GACT_PROB
529 bool "generic Actions probability" 492 bool "Probability support"
530 depends on NET_ACT_GACT 493 depends on NET_ACT_GACT
531 ---help--- 494 ---help---
532 Allows generic actions to be randomly or deterministically used. 495 Say Y here to use the generic action randomly or deterministically.
533 496
534config NET_ACT_MIRRED 497config NET_ACT_MIRRED
535 tristate "Packet In/Egress redirecton/mirror Actions" 498 tristate "Redirecting and Mirroring"
536 depends on NET_CLS_ACT 499 depends on NET_CLS_ACT
537 ---help--- 500 ---help---
538 requires new iproute2 501 Say Y here to allow packets to be mirrored or redirected to
539 This allows packets to be mirrored or redirected to netdevices 502 other devices.
503
504 To compile this code as a module, choose M here: the
505 module will be called mirred.
540 506
541config NET_ACT_IPT 507config NET_ACT_IPT
542 tristate "iptables Actions" 508 tristate "IPtables targets"
543 depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES 509 depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
544 ---help--- 510 ---help---
545 requires new iproute2 511 Say Y here to be able to invoke iptables targets after successful
546 This allows iptables targets to be used by tc filters 512 classification.
513
514 To compile this code as a module, choose M here: the
515 module will be called ipt.
547 516
548config NET_ACT_PEDIT 517config NET_ACT_PEDIT
549 tristate "Generic Packet Editor Actions" 518 tristate "Packet Editing"
550 depends on NET_CLS_ACT 519 depends on NET_CLS_ACT
551 ---help--- 520 ---help---
552 requires new iproute2 521 Say Y here if you want to mangle the content of packets.
553 This allows for packets to be generically edited
554 522
555config NET_CLS_POLICE 523 To compile this code as a module, choose M here: the
556 bool "Traffic policing (needed for in/egress)" 524 module will be called pedit.
557 depends on NET_CLS && NET_QOS && NET_CLS_ACT!=y
558 help
559 Say Y to support traffic policing (bandwidth limits). Needed for
560 ingress and egress rate limiting.
561 525
562config NET_ACT_SIMP 526config NET_ACT_SIMP
563 tristate "Simple action" 527 tristate "Simple Example (Debug)"
564 depends on NET_CLS_ACT 528 depends on NET_CLS_ACT
565 ---help--- 529 ---help---
566 You must have new iproute2 to use this feature. 530 Say Y here to add a simple action for demonstration purposes.
567 This adds a very simple action for demonstration purposes 531 It is meant as an example and for debugging purposes. It will
568 The idea is to give action authors a basic example to look at. 532 print a configured policy string followed by the packet count
569 All this action will do is print on the console the configured 533 to the console for every packet that passes by.
570 policy string followed by _ then packet count. 534
535 If unsure, say N.
536
537 To compile this code as a module, choose M here: the
538 module will be called simple.
539
540config NET_CLS_POLICE
541 bool "Traffic Policing (obsolete)"
542 depends on NET_SCHED && NET_CLS_ACT!=y
543 select NET_ESTIMATOR
544 ---help---
545 Say Y here if you want to do traffic policing, i.e. strict
546 bandwidth limiting. This option is obsoleted by the traffic
547 policer implemented as an action; it stays here for compatibility
548 reasons.
549
550config NET_CLS_IND
551 bool "Incoming device classification"
552 depends on NET_SCHED && (NET_CLS_U32 || NET_CLS_FW)
553 ---help---
554 Say Y here to extend the u32 and fw classifier to support
555 classification based on the incoming device. This option is
556 likely to disappear in favour of the metadata ematch.
557
558config NET_ESTIMATOR
559 bool "Rate estimator"
560 depends on NET_SCHED
561 ---help---
562 Say Y here to allow using rate estimators to estimate the current
563 rate-of-flow for network devices, queues, etc. This module is
564 automatically selected if needed but can be selected manually for
565 statistical purposes.
571 566
567endmenu
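As a concrete usage example for the action options above (syntax recalled from memory and dependent on the iproute2 version, so treat it as a hedged sketch rather than the canonical form), a recent tc can attach a generic drop action to a match-all u32 filter roughly like:

tc filter add dev eth0 parent 1: protocol ip u32 match u32 0 0 action drop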
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 25c171c327..29a2dd9f30 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -15,247 +15,281 @@
15 * from Ren Liu 15 * from Ren Liu
16 * - More error checks 16 * - More error checks
17 * 17 *
18 * 18 * For all the glorious comments look at include/net/red.h
19 *
20 * For all the glorious comments look at Alexey's sch_red.c
21 */ 19 */
22 20
23#include <linux/config.h> 21#include <linux/config.h>
24#include <linux/module.h> 22#include <linux/module.h>
25#include <asm/uaccess.h>
26#include <asm/system.h>
27#include <linux/bitops.h>
28#include <linux/types.h> 23#include <linux/types.h>
29#include <linux/kernel.h> 24#include <linux/kernel.h>
30#include <linux/sched.h>
31#include <linux/string.h>
32#include <linux/mm.h>
33#include <linux/socket.h>
34#include <linux/sockios.h>
35#include <linux/in.h>
36#include <linux/errno.h>
37#include <linux/interrupt.h>
38#include <linux/if_ether.h>
39#include <linux/inet.h>
40#include <linux/netdevice.h> 25#include <linux/netdevice.h>
41#include <linux/etherdevice.h>
42#include <linux/notifier.h>
43#include <net/ip.h>
44#include <net/route.h>
45#include <linux/skbuff.h> 26#include <linux/skbuff.h>
46#include <net/sock.h>
47#include <net/pkt_sched.h> 27#include <net/pkt_sched.h>
28#include <net/red.h>
48 29
49#if 1 /* control */ 30#define GRED_DEF_PRIO (MAX_DPs / 2)
50#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) 31#define GRED_VQ_MASK (MAX_DPs - 1)
51#else
52#define DPRINTK(format,args...)
53#endif
54
55#if 0 /* data */
56#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
57#else
58#define D2PRINTK(format,args...)
59#endif
60 32
61struct gred_sched_data; 33struct gred_sched_data;
62struct gred_sched; 34struct gred_sched;
63 35
64struct gred_sched_data 36struct gred_sched_data
65{ 37{
66/* Parameters */
67 u32 limit; /* HARD maximal queue length */ 38 u32 limit; /* HARD maximal queue length */
68 u32 qth_min; /* Min average length threshold: A scaled */
69 u32 qth_max; /* Max average length threshold: A scaled */
70 u32 DP; /* the drop parameters */ 39 u32 DP; /* the drop parameters */
71 char Wlog; /* log(W) */
72 char Plog; /* random number bits */
73 u32 Scell_max;
74 u32 Rmask;
75 u32 bytesin; /* bytes seen on virtualQ so far*/ 40 u32 bytesin; /* bytes seen on virtualQ so far*/
76 u32 packetsin; /* packets seen on virtualQ so far*/ 41 u32 packetsin; /* packets seen on virtualQ so far*/
77 u32 backlog; /* bytes on the virtualQ */ 42 u32 backlog; /* bytes on the virtualQ */
78 u32 forced; /* packets dropped for exceeding limits */ 43 u8 prio; /* the prio of this vq */
79 u32 early; /* packets dropped as a warning */ 44
80 u32 other; /* packets dropped by invoking drop() */ 45 struct red_parms parms;
81 u32 pdrop; /* packets dropped because we exceeded physical queue limits */ 46 struct red_stats stats;
82 char Scell_log; 47};
83 u8 Stab[256]; 48
84 u8 prio; /* the prio of this vq */ 49enum {
85 50 GRED_WRED_MODE = 1,
86/* Variables */ 51 GRED_RIO_MODE,
87 unsigned long qave; /* Average queue length: A scaled */
88 int qcount; /* Packets since last random number generation */
89 u32 qR; /* Cached random number */
90
91 psched_time_t qidlestart; /* Start of idle period */
92}; 52};
93 53
94struct gred_sched 54struct gred_sched
95{ 55{
96 struct gred_sched_data *tab[MAX_DPs]; 56 struct gred_sched_data *tab[MAX_DPs];
97 u32 DPs; 57 unsigned long flags;
98 u32 def; 58 u32 red_flags;
99 u8 initd; 59 u32 DPs;
100 u8 grio; 60 u32 def;
101 u8 eqp; 61 struct red_parms wred_set;
102}; 62};
103 63
104static int 64static inline int gred_wred_mode(struct gred_sched *table)
105gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
106{ 65{
107 psched_time_t now; 66 return test_bit(GRED_WRED_MODE, &table->flags);
108 struct gred_sched_data *q=NULL; 67}
109 struct gred_sched *t= qdisc_priv(sch); 68
110 unsigned long qave=0; 69static inline void gred_enable_wred_mode(struct gred_sched *table)
111 int i=0; 70{
71 __set_bit(GRED_WRED_MODE, &table->flags);
72}
73
74static inline void gred_disable_wred_mode(struct gred_sched *table)
75{
76 __clear_bit(GRED_WRED_MODE, &table->flags);
77}
78
79static inline int gred_rio_mode(struct gred_sched *table)
80{
81 return test_bit(GRED_RIO_MODE, &table->flags);
82}
83
84static inline void gred_enable_rio_mode(struct gred_sched *table)
85{
86 __set_bit(GRED_RIO_MODE, &table->flags);
87}
88
89static inline void gred_disable_rio_mode(struct gred_sched *table)
90{
91 __clear_bit(GRED_RIO_MODE, &table->flags);
92}
93
94static inline int gred_wred_mode_check(struct Qdisc *sch)
95{
96 struct gred_sched *table = qdisc_priv(sch);
97 int i;
112 98
113 if (!t->initd && skb_queue_len(&sch->q) < (sch->dev->tx_queue_len ? : 1)) { 99 /* Really ugly O(n^2) but shouldn't be needed too frequently. */
114 D2PRINTK("NO GRED Queues setup yet! Enqueued anyway\n"); 100 for (i = 0; i < table->DPs; i++) {
115 goto do_enqueue; 101 struct gred_sched_data *q = table->tab[i];
102 int n;
103
104 if (q == NULL)
105 continue;
106
107 for (n = 0; n < table->DPs; n++)
108 if (table->tab[n] && table->tab[n] != q &&
109 table->tab[n]->prio == q->prio)
110 return 1;
116 } 111 }
117 112
113 return 0;
114}
115
116static inline unsigned int gred_backlog(struct gred_sched *table,
117 struct gred_sched_data *q,
118 struct Qdisc *sch)
119{
120 if (gred_wred_mode(table))
121 return sch->qstats.backlog;
122 else
123 return q->backlog;
124}
125
126static inline u16 tc_index_to_dp(struct sk_buff *skb)
127{
128 return skb->tc_index & GRED_VQ_MASK;
129}
130
131static inline void gred_load_wred_set(struct gred_sched *table,
132 struct gred_sched_data *q)
133{
134 q->parms.qavg = table->wred_set.qavg;
135 q->parms.qidlestart = table->wred_set.qidlestart;
136}
137
138static inline void gred_store_wred_set(struct gred_sched *table,
139 struct gred_sched_data *q)
140{
141 table->wred_set.qavg = q->parms.qavg;
142}
143
144static inline int gred_use_ecn(struct gred_sched *t)
145{
146 return t->red_flags & TC_RED_ECN;
147}
118 148
119 if ( ((skb->tc_index&0xf) > (t->DPs -1)) || !(q=t->tab[skb->tc_index&0xf])) { 149static inline int gred_use_harddrop(struct gred_sched *t)
120 printk("GRED: setting to default (%d)\n ",t->def); 150{
121 if (!(q=t->tab[t->def])) { 151 return t->red_flags & TC_RED_HARDDROP;
122 DPRINTK("GRED: setting to default FAILED! dropping!! " 152}
123 "(%d)\n ", t->def); 153
124 goto drop; 154static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
155{
156 struct gred_sched_data *q=NULL;
157 struct gred_sched *t= qdisc_priv(sch);
158 unsigned long qavg = 0;
159 u16 dp = tc_index_to_dp(skb);
160
161 if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
162 dp = t->def;
163
164 if ((q = t->tab[dp]) == NULL) {
165 /* Pass through packets not assigned to a DP
166 * if no default DP has been configured. This
167 * allows for DP flows to be left untouched.
168 */
169 if (skb_queue_len(&sch->q) < sch->dev->tx_queue_len)
170 return qdisc_enqueue_tail(skb, sch);
171 else
172 goto drop;
125 } 173 }
174
126 /* fix tc_index? --could be controversial but needed for 175 /* fix tc_index? --could be controversial but needed for
127 requeueing */ 176 requeueing */
128 skb->tc_index=(skb->tc_index&0xfffffff0) | t->def; 177 skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp;
129 } 178 }
130 179
131 D2PRINTK("gred_enqueue virtualQ 0x%x classid %x backlog %d " 180 /* sum up all the qaves of prios <= to ours to get the new qave */
132 "general backlog %d\n",skb->tc_index&0xf,sch->handle,q->backlog, 181 if (!gred_wred_mode(t) && gred_rio_mode(t)) {
133 sch->qstats.backlog); 182 int i;
134 /* sum up all the qaves of prios <= to ours to get the new qave*/ 183
135 if (!t->eqp && t->grio) { 184 for (i = 0; i < t->DPs; i++) {
136 for (i=0;i<t->DPs;i++) { 185 if (t->tab[i] && t->tab[i]->prio < q->prio &&
137 if ((!t->tab[i]) || (i==q->DP)) 186 !red_is_idling(&t->tab[i]->parms))
138 continue; 187 qavg += t->tab[i]->parms.qavg;
139
140 if ((t->tab[i]->prio < q->prio) && (PSCHED_IS_PASTPERFECT(t->tab[i]->qidlestart)))
141 qave +=t->tab[i]->qave;
142 } 188 }
143 189
144 } 190 }
145 191
146 q->packetsin++; 192 q->packetsin++;
147 q->bytesin+=skb->len; 193 q->bytesin += skb->len;
148 194
149 if (t->eqp && t->grio) { 195 if (gred_wred_mode(t))
150 qave=0; 196 gred_load_wred_set(t, q);
151 q->qave=t->tab[t->def]->qave;
152 q->qidlestart=t->tab[t->def]->qidlestart;
153 }
154 197
155 if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { 198 q->parms.qavg = red_calc_qavg(&q->parms, gred_backlog(t, q, sch));
156 long us_idle;
157 PSCHED_GET_TIME(now);
158 us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
159 PSCHED_SET_PASTPERFECT(q->qidlestart);
160 199
161 q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF]; 200 if (red_is_idling(&q->parms))
162 } else { 201 red_end_of_idle_period(&q->parms);
163 if (t->eqp) {
164 q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
165 } else {
166 q->qave += q->backlog - (q->qave >> q->Wlog);
167 }
168 202
169 } 203 if (gred_wred_mode(t))
170 204 gred_store_wred_set(t, q);
171
172 if (t->eqp && t->grio)
173 t->tab[t->def]->qave=q->qave;
174
175 if ((q->qave+qave) < q->qth_min) {
176 q->qcount = -1;
177enqueue:
178 if (q->backlog + skb->len <= q->limit) {
179 q->backlog += skb->len;
180do_enqueue:
181 __skb_queue_tail(&sch->q, skb);
182 sch->qstats.backlog += skb->len;
183 sch->bstats.bytes += skb->len;
184 sch->bstats.packets++;
185 return 0;
186 } else {
187 q->pdrop++;
188 }
189 205
190drop: 206 switch (red_action(&q->parms, q->parms.qavg + qavg)) {
191 kfree_skb(skb); 207 case RED_DONT_MARK:
192 sch->qstats.drops++; 208 break;
193 return NET_XMIT_DROP; 209
194 } 210 case RED_PROB_MARK:
195 if ((q->qave+qave) >= q->qth_max) { 211 sch->qstats.overlimits++;
196 q->qcount = -1; 212 if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) {
197 sch->qstats.overlimits++; 213 q->stats.prob_drop++;
198 q->forced++; 214 goto congestion_drop;
199 goto drop; 215 }
216
217 q->stats.prob_mark++;
218 break;
219
220 case RED_HARD_MARK:
221 sch->qstats.overlimits++;
222 if (gred_use_harddrop(t) || !gred_use_ecn(t) ||
223 !INET_ECN_set_ce(skb)) {
224 q->stats.forced_drop++;
225 goto congestion_drop;
226 }
227 q->stats.forced_mark++;
228 break;
200 } 229 }
201 if (++q->qcount) { 230
202 if ((((qave+q->qave) - q->qth_min)>>q->Wlog)*q->qcount < q->qR) 231 if (q->backlog + skb->len <= q->limit) {
203 goto enqueue; 232 q->backlog += skb->len;
204 q->qcount = 0; 233 return qdisc_enqueue_tail(skb, sch);
205 q->qR = net_random()&q->Rmask;
206 sch->qstats.overlimits++;
207 q->early++;
208 goto drop;
209 } 234 }
210 q->qR = net_random()&q->Rmask; 235
211 goto enqueue; 236 q->stats.pdrop++;
237drop:
238 return qdisc_drop(skb, sch);
239
240congestion_drop:
241 qdisc_drop(skb, sch);
242 return NET_XMIT_CN;
212} 243}
213 244
214static int 245static int gred_requeue(struct sk_buff *skb, struct Qdisc* sch)
215gred_requeue(struct sk_buff *skb, struct Qdisc* sch)
216{ 246{
247 struct gred_sched *t = qdisc_priv(sch);
217 struct gred_sched_data *q; 248 struct gred_sched_data *q;
218 struct gred_sched *t= qdisc_priv(sch); 249 u16 dp = tc_index_to_dp(skb);
219 q= t->tab[(skb->tc_index&0xf)]; 250
220/* error checking here -- probably unnecessary */ 251 if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
221 PSCHED_SET_PASTPERFECT(q->qidlestart); 252 if (net_ratelimit())
222 253 printk(KERN_WARNING "GRED: Unable to relocate VQ 0x%x "
223 __skb_queue_head(&sch->q, skb); 254 "for requeue, screwing up backlog.\n",
224 sch->qstats.backlog += skb->len; 255 tc_index_to_dp(skb));
225 sch->qstats.requeues++; 256 } else {
226 q->backlog += skb->len; 257 if (red_is_idling(&q->parms))
227 return 0; 258 red_end_of_idle_period(&q->parms);
259 q->backlog += skb->len;
260 }
261
262 return qdisc_requeue(skb, sch);
228} 263}
229 264
230static struct sk_buff * 265static struct sk_buff *gred_dequeue(struct Qdisc* sch)
231gred_dequeue(struct Qdisc* sch)
232{ 266{
233 struct sk_buff *skb; 267 struct sk_buff *skb;
234 struct gred_sched_data *q; 268 struct gred_sched *t = qdisc_priv(sch);
235 struct gred_sched *t= qdisc_priv(sch); 269
270 skb = qdisc_dequeue_head(sch);
236 271
237 skb = __skb_dequeue(&sch->q);
238 if (skb) { 272 if (skb) {
239 sch->qstats.backlog -= skb->len; 273 struct gred_sched_data *q;
240 q= t->tab[(skb->tc_index&0xf)]; 274 u16 dp = tc_index_to_dp(skb);
241 if (q) { 275
242 q->backlog -= skb->len; 276 if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
243 if (!q->backlog && !t->eqp) 277 if (net_ratelimit())
244 PSCHED_GET_TIME(q->qidlestart); 278 printk(KERN_WARNING "GRED: Unable to relocate "
279 "VQ 0x%x after dequeue, screwing up "
280 "backlog.\n", tc_index_to_dp(skb));
245 } else { 281 } else {
246 D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); 282 q->backlog -= skb->len;
283
284 if (!q->backlog && !gred_wred_mode(t))
285 red_start_of_idle_period(&q->parms);
247 } 286 }
287
248 return skb; 288 return skb;
249 } 289 }
250 290
251 if (t->eqp) { 291 if (gred_wred_mode(t) && !red_is_idling(&t->wred_set))
252 q= t->tab[t->def]; 292 red_start_of_idle_period(&t->wred_set);
253 if (!q)
254 D2PRINTK("no default VQ set: Results will be "
255 "screwed up\n");
256 else
257 PSCHED_GET_TIME(q->qidlestart);
258 }
259 293
260 return NULL; 294 return NULL;
261} 295}
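A minimal standalone sketch (not part of the patch) of the new tc_index_to_dp() mapping used throughout the hunk above: GRED_VQ_MASK is MAX_DPs - 1, so with MAX_DPs of 16 the low four bits of skb->tc_index select the virtual queue, matching the old open-coded `skb->tc_index & 0xf`.

#include <stdio.h>

#define MAX_DPs      16              /* default table size at this time */
#define GRED_VQ_MASK (MAX_DPs - 1)

/* Same mapping as the kernel helper, lifted into userspace. */
static unsigned int tc_index_to_dp(unsigned int tc_index)
{
        return tc_index & GRED_VQ_MASK;
}

int main(void)
{
        printf("dp = %u\n", tc_index_to_dp(0x23));   /* prints dp = 3 */
        return 0;
}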
@@ -263,36 +297,34 @@ gred_dequeue(struct Qdisc* sch)
263static unsigned int gred_drop(struct Qdisc* sch) 297static unsigned int gred_drop(struct Qdisc* sch)
264{ 298{
265 struct sk_buff *skb; 299 struct sk_buff *skb;
300 struct gred_sched *t = qdisc_priv(sch);
266 301
267 struct gred_sched_data *q; 302 skb = qdisc_dequeue_tail(sch);
268 struct gred_sched *t= qdisc_priv(sch);
269
270 skb = __skb_dequeue_tail(&sch->q);
271 if (skb) { 303 if (skb) {
272 unsigned int len = skb->len; 304 unsigned int len = skb->len;
273 sch->qstats.backlog -= len; 305 struct gred_sched_data *q;
274 sch->qstats.drops++; 306 u16 dp = tc_index_to_dp(skb);
275 q= t->tab[(skb->tc_index&0xf)]; 307
276 if (q) { 308 if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
277 q->backlog -= len; 309 if (net_ratelimit())
278 q->other++; 310 printk(KERN_WARNING "GRED: Unable to relocate "
279 if (!q->backlog && !t->eqp) 311 "VQ 0x%x while dropping, screwing up "
280 PSCHED_GET_TIME(q->qidlestart); 312 "backlog.\n", tc_index_to_dp(skb));
281 } else { 313 } else {
282 D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); 314 q->backlog -= len;
315 q->stats.other++;
316
317 if (!q->backlog && !gred_wred_mode(t))
318 red_start_of_idle_period(&q->parms);
283 } 319 }
284 320
285 kfree_skb(skb); 321 qdisc_drop(skb, sch);
286 return len; 322 return len;
287 } 323 }
288 324
289 q=t->tab[t->def]; 325 if (gred_wred_mode(t) && !red_is_idling(&t->wred_set))
290 if (!q) { 326 red_start_of_idle_period(&t->wred_set);
291 D2PRINTK("no default VQ set: Results might be screwed up\n");
292 return 0;
293 }
294 327
295 PSCHED_GET_TIME(q->qidlestart);
296 return 0; 328 return 0;
297 329
298} 330}
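The mark-or-drop ladder in gred_enqueue() above (and red_enqueue() further below) follows one rule: on a probabilistic or hard mark, try ECN marking first and only drop when ECN is disabled, the packet cannot be marked, or harddrop forces it. A hypothetical userspace distillation, with verdict names mirroring the values returned by red_action() in the new include/net/red.h:

#include <stdio.h>

enum red_verdict { RED_DONT_MARK, RED_PROB_MARK, RED_HARD_MARK };

/* Returns 1 when the packet may be queued (possibly ECN-marked),
 * 0 when it must be congestion-dropped. */
static int survives(enum red_verdict v, int use_ecn, int use_harddrop,
                    int ecn_mark_ok)
{
        switch (v) {
        case RED_DONT_MARK:
                return 1;
        case RED_PROB_MARK:
                return use_ecn && ecn_mark_ok;
        case RED_HARD_MARK:
                return !use_harddrop && use_ecn && ecn_mark_ok;
        }
        return 0;
}

int main(void)
{
        printf("%d\n", survives(RED_PROB_MARK, 1, 0, 1));  /* 1: marked */
        printf("%d\n", survives(RED_HARD_MARK, 1, 1, 1));  /* 0: harddrop */
        return 0;
}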
@@ -300,293 +332,241 @@ static unsigned int gred_drop(struct Qdisc* sch)
300static void gred_reset(struct Qdisc* sch) 332static void gred_reset(struct Qdisc* sch)
301{ 333{
302 int i; 334 int i;
303 struct gred_sched_data *q; 335 struct gred_sched *t = qdisc_priv(sch);
304 struct gred_sched *t= qdisc_priv(sch); 336
337 qdisc_reset_queue(sch);
305 338
306 __skb_queue_purge(&sch->q); 339 for (i = 0; i < t->DPs; i++) {
340 struct gred_sched_data *q = t->tab[i];
307 341
308 sch->qstats.backlog = 0; 342 if (!q)
343 continue;
309 344
310 for (i=0;i<t->DPs;i++) { 345 red_restart(&q->parms);
311 q= t->tab[i];
312 if (!q)
313 continue;
314 PSCHED_SET_PASTPERFECT(q->qidlestart);
315 q->qave = 0;
316 q->qcount = -1;
317 q->backlog = 0; 346 q->backlog = 0;
318 q->other=0;
319 q->forced=0;
320 q->pdrop=0;
321 q->early=0;
322 } 347 }
323} 348}
324 349
325static int gred_change(struct Qdisc *sch, struct rtattr *opt) 350static inline void gred_destroy_vq(struct gred_sched_data *q)
351{
352 kfree(q);
353}
354
355static inline int gred_change_table_def(struct Qdisc *sch, struct rtattr *dps)
326{ 356{
327 struct gred_sched *table = qdisc_priv(sch); 357 struct gred_sched *table = qdisc_priv(sch);
328 struct gred_sched_data *q;
329 struct tc_gred_qopt *ctl;
330 struct tc_gred_sopt *sopt; 358 struct tc_gred_sopt *sopt;
331 struct rtattr *tb[TCA_GRED_STAB];
332 struct rtattr *tb2[TCA_GRED_DPS];
333 int i; 359 int i;
334 360
335 if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt)) 361 if (dps == NULL || RTA_PAYLOAD(dps) < sizeof(*sopt))
336 return -EINVAL; 362 return -EINVAL;
337 363
338 if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) { 364 sopt = RTA_DATA(dps);
339 rtattr_parse_nested(tb2, TCA_GRED_DPS, opt); 365
366 if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs)
367 return -EINVAL;
340 368
341 if (tb2[TCA_GRED_DPS-1] == 0) 369 sch_tree_lock(sch);
342 return -EINVAL; 370 table->DPs = sopt->DPs;
371 table->def = sopt->def_DP;
372 table->red_flags = sopt->flags;
373
374 /*
375 * Every entry point to GRED is synchronized with the above code
376 * and the DP is checked against DPs, i.e. shadowed VQs can no
377 * longer be found so we can unlock right here.
378 */
379 sch_tree_unlock(sch);
380
381 if (sopt->grio) {
382 gred_enable_rio_mode(table);
383 gred_disable_wred_mode(table);
384 if (gred_wred_mode_check(sch))
385 gred_enable_wred_mode(table);
386 } else {
387 gred_disable_rio_mode(table);
388 gred_disable_wred_mode(table);
389 }
343 390
344 sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]); 391 for (i = table->DPs; i < MAX_DPs; i++) {
345 table->DPs=sopt->DPs; 392 if (table->tab[i]) {
346 table->def=sopt->def_DP; 393 printk(KERN_WARNING "GRED: Warning: Destroying "
347 table->grio=sopt->grio; 394 "shadowed VQ 0x%x\n", i);
348 table->initd=0; 395 gred_destroy_vq(table->tab[i]);
349 /* probably need to clear all the table DP entries as well */ 396 table->tab[i] = NULL;
350 return 0; 397 }
351 } 398 }
352 399
400 return 0;
401}
353 402
354 if (!table->DPs || tb[TCA_GRED_PARMS-1] == 0 || tb[TCA_GRED_STAB-1] == 0 || 403static inline int gred_change_vq(struct Qdisc *sch, int dp,
355 RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) || 404 struct tc_gred_qopt *ctl, int prio, u8 *stab)
356 RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256) 405{
357 return -EINVAL; 406 struct gred_sched *table = qdisc_priv(sch);
407 struct gred_sched_data *q;
358 408
359 ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]); 409 if (table->tab[dp] == NULL) {
360 if (ctl->DP > MAX_DPs-1 ) { 410 table->tab[dp] = kmalloc(sizeof(*q), GFP_KERNEL);
361 /* misbehaving is punished! Put in the default drop probability */ 411 if (table->tab[dp] == NULL)
362 DPRINTK("\nGRED: DP %u not in the proper range fixed. New DP "
363 "set to default at %d\n",ctl->DP,table->def);
364 ctl->DP=table->def;
365 }
366
367 if (table->tab[ctl->DP] == NULL) {
368 table->tab[ctl->DP]=kmalloc(sizeof(struct gred_sched_data),
369 GFP_KERNEL);
370 if (NULL == table->tab[ctl->DP])
371 return -ENOMEM; 412 return -ENOMEM;
372 memset(table->tab[ctl->DP], 0, (sizeof(struct gred_sched_data))); 413 memset(table->tab[dp], 0, sizeof(*q));
373 }
374 q= table->tab[ctl->DP];
375
376 if (table->grio) {
377 if (ctl->prio <=0) {
378 if (table->def && table->tab[table->def]) {
379 DPRINTK("\nGRED: DP %u does not have a prio"
380 "setting default to %d\n",ctl->DP,
381 table->tab[table->def]->prio);
382 q->prio=table->tab[table->def]->prio;
383 } else {
384 DPRINTK("\nGRED: DP %u does not have a prio"
385 " setting default to 8\n",ctl->DP);
386 q->prio=8;
387 }
388 } else {
389 q->prio=ctl->prio;
390 }
391 } else {
392 q->prio=8;
393 } 414 }
394 415
395 416 q = table->tab[dp];
396 q->DP=ctl->DP; 417 q->DP = dp;
397 q->Wlog = ctl->Wlog; 418 q->prio = prio;
398 q->Plog = ctl->Plog;
399 q->limit = ctl->limit; 419 q->limit = ctl->limit;
400 q->Scell_log = ctl->Scell_log;
401 q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
402 q->Scell_max = (255<<q->Scell_log);
403 q->qth_min = ctl->qth_min<<ctl->Wlog;
404 q->qth_max = ctl->qth_max<<ctl->Wlog;
405 q->qave=0;
406 q->backlog=0;
407 q->qcount = -1;
408 q->other=0;
409 q->forced=0;
410 q->pdrop=0;
411 q->early=0;
412
413 PSCHED_SET_PASTPERFECT(q->qidlestart);
414 memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256);
415
416 if ( table->initd && table->grio) {
417 /* this looks ugly but it's not in the fast path */
418 for (i=0;i<table->DPs;i++) {
419 if ((!table->tab[i]) || (i==q->DP) )
420 continue;
421 if (table->tab[i]->prio == q->prio ){
422 /* WRED mode detected */
423 table->eqp=1;
424 break;
425 }
426 }
427 }
428 420
429 if (!table->initd) { 421 if (q->backlog == 0)
430 table->initd=1; 422 red_end_of_idle_period(&q->parms);
431 /*
432 the first entry also goes into the default until
433 over-written
434 */
435
436 if (table->tab[table->def] == NULL) {
437 table->tab[table->def]=
438 kmalloc(sizeof(struct gred_sched_data), GFP_KERNEL);
439 if (NULL == table->tab[table->def])
440 return -ENOMEM;
441
442 memset(table->tab[table->def], 0,
443 (sizeof(struct gred_sched_data)));
444 }
445 q= table->tab[table->def];
446 q->DP=table->def;
447 q->Wlog = ctl->Wlog;
448 q->Plog = ctl->Plog;
449 q->limit = ctl->limit;
450 q->Scell_log = ctl->Scell_log;
451 q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
452 q->Scell_max = (255<<q->Scell_log);
453 q->qth_min = ctl->qth_min<<ctl->Wlog;
454 q->qth_max = ctl->qth_max<<ctl->Wlog;
455
456 if (table->grio)
457 q->prio=table->tab[ctl->DP]->prio;
458 else
459 q->prio=8;
460
461 q->qcount = -1;
462 PSCHED_SET_PASTPERFECT(q->qidlestart);
463 memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256);
464 }
465 return 0;
466 423
424 red_set_parms(&q->parms,
425 ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog,
426 ctl->Scell_log, stab);
427
428 return 0;
467} 429}
468 430
469static int gred_init(struct Qdisc *sch, struct rtattr *opt) 431static int gred_change(struct Qdisc *sch, struct rtattr *opt)
470{ 432{
471 struct gred_sched *table = qdisc_priv(sch); 433 struct gred_sched *table = qdisc_priv(sch);
472 struct tc_gred_sopt *sopt; 434 struct tc_gred_qopt *ctl;
473 struct rtattr *tb[TCA_GRED_STAB]; 435 struct rtattr *tb[TCA_GRED_MAX];
474 struct rtattr *tb2[TCA_GRED_DPS]; 436 int err = -EINVAL, prio = GRED_DEF_PRIO;
437 u8 *stab;
475 438
476 if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt)) 439 if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt))
477 return -EINVAL; 440 return -EINVAL;
478 441
479 if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) { 442 if (tb[TCA_GRED_PARMS-1] == NULL && tb[TCA_GRED_STAB-1] == NULL)
480 rtattr_parse_nested(tb2, TCA_GRED_DPS, opt); 443 return gred_change_table_def(sch, opt);
444
445 if (tb[TCA_GRED_PARMS-1] == NULL ||
446 RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) ||
447 tb[TCA_GRED_STAB-1] == NULL ||
448 RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256)
449 return -EINVAL;
450
451 ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]);
452 stab = RTA_DATA(tb[TCA_GRED_STAB-1]);
453
454 if (ctl->DP >= table->DPs)
455 goto errout;
481 456
482 if (tb2[TCA_GRED_DPS-1] == 0) 457 if (gred_rio_mode(table)) {
483 return -EINVAL; 458 if (ctl->prio == 0) {
459 int def_prio = GRED_DEF_PRIO;
484 460
485 sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]); 461 if (table->tab[table->def])
486 table->DPs=sopt->DPs; 462 def_prio = table->tab[table->def]->prio;
487 table->def=sopt->def_DP; 463
488 table->grio=sopt->grio; 464 printk(KERN_DEBUG "GRED: DP %u does not have a prio "
489 table->initd=0; 465 "setting default to %d\n", ctl->DP, def_prio);
490 return 0; 466
467 prio = def_prio;
468 } else
469 prio = ctl->prio;
470 }
471
472 sch_tree_lock(sch);
473
474 err = gred_change_vq(sch, ctl->DP, ctl, prio, stab);
475 if (err < 0)
476 goto errout_locked;
477
478 if (gred_rio_mode(table)) {
479 gred_disable_wred_mode(table);
480 if (gred_wred_mode_check(sch))
481 gred_enable_wred_mode(table);
491 } 482 }
492 483
493 DPRINTK("\n GRED_INIT error!\n"); 484 err = 0;
494 return -EINVAL; 485
486errout_locked:
487 sch_tree_unlock(sch);
488errout:
489 return err;
495} 490}
496 491
497static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) 492static int gred_init(struct Qdisc *sch, struct rtattr *opt)
498{ 493{
499 unsigned long qave; 494 struct rtattr *tb[TCA_GRED_MAX];
500 struct rtattr *rta;
501 struct tc_gred_qopt *opt = NULL ;
502 struct tc_gred_qopt *dst;
503 struct gred_sched *table = qdisc_priv(sch);
504 struct gred_sched_data *q;
505 int i;
506 unsigned char *b = skb->tail;
507 495
508 rta = (struct rtattr*)b; 496 if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt))
509 RTA_PUT(skb, TCA_OPTIONS, 0, NULL); 497 return -EINVAL;
510 498
511 opt=kmalloc(sizeof(struct tc_gred_qopt)*MAX_DPs, GFP_KERNEL); 499 if (tb[TCA_GRED_PARMS-1] || tb[TCA_GRED_STAB-1])
500 return -EINVAL;
512 501
513 if (opt == NULL) { 502 return gred_change_table_def(sch, tb[TCA_GRED_DPS-1]);
514 DPRINTK("gred_dump:failed to malloc for %Zd\n", 503}
515 sizeof(struct tc_gred_qopt)*MAX_DPs);
516 goto rtattr_failure;
517 }
518 504
519 memset(opt, 0, (sizeof(struct tc_gred_qopt))*table->DPs); 505static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
506{
507 struct gred_sched *table = qdisc_priv(sch);
508 struct rtattr *parms, *opts = NULL;
509 int i;
510 struct tc_gred_sopt sopt = {
511 .DPs = table->DPs,
512 .def_DP = table->def,
513 .grio = gred_rio_mode(table),
514 .flags = table->red_flags,
515 };
520 516
521 if (!table->initd) { 517 opts = RTA_NEST(skb, TCA_OPTIONS);
522 DPRINTK("NO GRED Queues setup!\n"); 518 RTA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt);
523 } 519 parms = RTA_NEST(skb, TCA_GRED_PARMS);
520
521 for (i = 0; i < MAX_DPs; i++) {
522 struct gred_sched_data *q = table->tab[i];
523 struct tc_gred_qopt opt;
524 524
525 for (i=0;i<MAX_DPs;i++) { 525 memset(&opt, 0, sizeof(opt));
526 dst= &opt[i];
527 q= table->tab[i];
528 526
529 if (!q) { 527 if (!q) {
530 /* hack -- fix at some point with proper message 528 /* hack -- fix at some point with proper message
531 This is how we indicate to tc that there is no VQ 529 This is how we indicate to tc that there is no VQ
532 at this DP */ 530 at this DP */
533 531
534 dst->DP=MAX_DPs+i; 532 opt.DP = MAX_DPs + i;
535 continue; 533 goto append_opt;
536 } 534 }
537 535
538 dst->limit=q->limit; 536 opt.limit = q->limit;
539 dst->qth_min=q->qth_min>>q->Wlog; 537 opt.DP = q->DP;
540 dst->qth_max=q->qth_max>>q->Wlog; 538 opt.backlog = q->backlog;
541 dst->DP=q->DP; 539 opt.prio = q->prio;
542 dst->backlog=q->backlog; 540 opt.qth_min = q->parms.qth_min >> q->parms.Wlog;
543 if (q->qave) { 541 opt.qth_max = q->parms.qth_max >> q->parms.Wlog;
544 if (table->eqp && table->grio) { 542 opt.Wlog = q->parms.Wlog;
545 q->qidlestart=table->tab[table->def]->qidlestart; 543 opt.Plog = q->parms.Plog;
546 q->qave=table->tab[table->def]->qave; 544 opt.Scell_log = q->parms.Scell_log;
547 } 545 opt.other = q->stats.other;
548 if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { 546 opt.early = q->stats.prob_drop;
549 long idle; 547 opt.forced = q->stats.forced_drop;
550 psched_time_t now; 548 opt.pdrop = q->stats.pdrop;
551 PSCHED_GET_TIME(now); 549 opt.packets = q->packetsin;
552 idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); 550 opt.bytesin = q->bytesin;
553 qave = q->qave >> q->Stab[(idle>>q->Scell_log)&0xFF]; 551
554 dst->qave = qave >> q->Wlog; 552 if (gred_wred_mode(table)) {
555 553 q->parms.qidlestart =
556 } else { 554 table->tab[table->def]->parms.qidlestart;
557 dst->qave = q->qave >> q->Wlog; 555 q->parms.qavg = table->tab[table->def]->parms.qavg;
558 }
559 } else {
560 dst->qave = 0;
561 } 556 }
562 557
563 558 opt.qave = red_calc_qavg(&q->parms, q->parms.qavg);
564 dst->Wlog = q->Wlog; 559
565 dst->Plog = q->Plog; 560append_opt:
566 dst->Scell_log = q->Scell_log; 561 RTA_APPEND(skb, sizeof(opt), &opt);
567 dst->other = q->other;
568 dst->forced = q->forced;
569 dst->early = q->early;
570 dst->pdrop = q->pdrop;
571 dst->prio = q->prio;
572 dst->packets=q->packetsin;
573 dst->bytesin=q->bytesin;
574 } 562 }
575 563
576 RTA_PUT(skb, TCA_GRED_PARMS, sizeof(struct tc_gred_qopt)*MAX_DPs, opt); 564 RTA_NEST_END(skb, parms);
577 rta->rta_len = skb->tail - b;
578 565
579 kfree(opt); 566 return RTA_NEST_END(skb, opts);
580 return skb->len;
581 567
582rtattr_failure: 568rtattr_failure:
583 if (opt) 569 return RTA_NEST_CANCEL(skb, opts);
584 kfree(opt);
585 DPRINTK("gred_dump: FAILURE!!!!\n");
586
587/* also free the opt struct here */
588 skb_trim(skb, b - skb->data);
589 return -1;
590} 570}
591 571
592static void gred_destroy(struct Qdisc *sch) 572static void gred_destroy(struct Qdisc *sch)
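A brief worked example of the Wlog scaling visible in the dump above: red_set_parms() keeps qth_min and qth_max pre-shifted so they compare directly against the scaled average, so a qth_min of 15000 bytes with Wlog = 9 is stored as 15000 << 9 = 7680000, and gred_dump() shifts it back down before reporting. (Numbers are illustrative.)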
@@ -594,15 +574,13 @@ static void gred_destroy(struct Qdisc *sch)
594 struct gred_sched *table = qdisc_priv(sch); 574 struct gred_sched *table = qdisc_priv(sch);
595 int i; 575 int i;
596 576
597 for (i = 0;i < table->DPs; i++) { 577 for (i = 0; i < table->DPs; i++) {
598 if (table->tab[i]) 578 if (table->tab[i])
599 kfree(table->tab[i]); 579 gred_destroy_vq(table->tab[i]);
600 } 580 }
601} 581}
602 582
603static struct Qdisc_ops gred_qdisc_ops = { 583static struct Qdisc_ops gred_qdisc_ops = {
604 .next = NULL,
605 .cl_ops = NULL,
606 .id = "gred", 584 .id = "gred",
607 .priv_size = sizeof(struct gred_sched), 585 .priv_size = sizeof(struct gred_sched),
608 .enqueue = gred_enqueue, 586 .enqueue = gred_enqueue,
@@ -621,10 +599,13 @@ static int __init gred_module_init(void)
621{ 599{
622 return register_qdisc(&gred_qdisc_ops); 600 return register_qdisc(&gred_qdisc_ops);
623} 601}
624static void __exit gred_module_exit(void) 602
603static void __exit gred_module_exit(void)
625{ 604{
626 unregister_qdisc(&gred_qdisc_ops); 605 unregister_qdisc(&gred_qdisc_ops);
627} 606}
607
628module_init(gred_module_init) 608module_init(gred_module_init)
629module_exit(gred_module_exit) 609module_exit(gred_module_exit)
610
630MODULE_LICENSE("GPL"); 611MODULE_LICENSE("GPL");
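A hypothetical userspace view of the table-definition message that gred_change_table_def() parses: a struct tc_gred_sopt nested under TCA_GRED_DPS. The struct below is a simplified local mirror of the fields actually read above, not the real header layout, and the values are made up.

#include <stdio.h>

struct tc_gred_sopt {                /* simplified mirror, illustration only */
        unsigned int  DPs;           /* number of virtual queues */
        unsigned int  def_DP;        /* default virtual queue */
        unsigned char grio;          /* enable RIO (priority) mode */
        unsigned int  flags;         /* e.g. TC_RED_ECN */
};

#define MAX_DPs 16

/* Same sanity check as the kernel: 1 <= DPs <= MAX_DPs, def_DP valid. */
static int sopt_valid(const struct tc_gred_sopt *s)
{
        return s->DPs != 0 && s->DPs <= MAX_DPs && s->def_DP < s->DPs;
}

int main(void)
{
        struct tc_gred_sopt sopt = { .DPs = 4, .def_DP = 0, .grio = 1 };
        printf("valid = %d\n", sopt_valid(&sopt));   /* prints valid = 1 */
        return 0;
}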
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index bb9bf8d500..cdc8d28379 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -25,6 +25,8 @@
25 25
26#include <net/pkt_sched.h> 26#include <net/pkt_sched.h>
27 27
28#define VERSION "1.1"
29
28/* Network Emulation Queuing algorithm. 30/* Network Emulation Queuing algorithm.
29 ==================================== 31 ====================================
30 32
@@ -185,10 +187,13 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
185 || q->counter < q->gap /* inside last reordering gap */ 187 || q->counter < q->gap /* inside last reordering gap */
186 || q->reorder < get_crandom(&q->reorder_cor)) { 188 || q->reorder < get_crandom(&q->reorder_cor)) {
187 psched_time_t now; 189 psched_time_t now;
190 psched_tdiff_t delay;
191
192 delay = tabledist(q->latency, q->jitter,
193 &q->delay_cor, q->delay_dist);
194
188 PSCHED_GET_TIME(now); 195 PSCHED_GET_TIME(now);
189 PSCHED_TADD2(now, tabledist(q->latency, q->jitter, 196 PSCHED_TADD2(now, delay, cb->time_to_send);
190 &q->delay_cor, q->delay_dist),
191 cb->time_to_send);
192 ++q->counter; 197 ++q->counter;
193 ret = q->qdisc->enqueue(skb, q->qdisc); 198 ret = q->qdisc->enqueue(skb, q->qdisc);
194 } else { 199 } else {
@@ -248,24 +253,31 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
248 const struct netem_skb_cb *cb 253 const struct netem_skb_cb *cb
249 = (const struct netem_skb_cb *)skb->cb; 254 = (const struct netem_skb_cb *)skb->cb;
250 psched_time_t now; 255 psched_time_t now;
251 long delay;
252 256
253 /* is there more time remaining? */ 257 /* is there more time remaining? */
254 PSCHED_GET_TIME(now); 258 PSCHED_GET_TIME(now);
255 delay = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now)); 259
256 pr_debug("netem_run: skb=%p delay=%ld\n", skb, delay); 260 if (PSCHED_TLESS(cb->time_to_send, now)) {
257 if (delay <= 0) {
258 pr_debug("netem_dequeue: return skb=%p\n", skb); 261 pr_debug("netem_dequeue: return skb=%p\n", skb);
259 sch->q.qlen--; 262 sch->q.qlen--;
260 sch->flags &= ~TCQ_F_THROTTLED; 263 sch->flags &= ~TCQ_F_THROTTLED;
261 return skb; 264 return skb;
262 } 265 } else {
266 psched_tdiff_t delay = PSCHED_TDIFF(cb->time_to_send, now);
267
268 if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
269 sch->qstats.drops++;
263 270
264 mod_timer(&q->timer, jiffies + delay); 271 /* After this qlen is confused */
265 sch->flags |= TCQ_F_THROTTLED; 272 printk(KERN_ERR "netem: queue discpline %s could not requeue\n",
273 q->qdisc->ops->id);
266 274
267 if (q->qdisc->ops->requeue(skb, q->qdisc) != 0) 275 sch->q.qlen--;
268 sch->qstats.drops++; 276 }
277
278 mod_timer(&q->timer, jiffies + PSCHED_US2JIFFIE(delay));
279 sch->flags |= TCQ_F_THROTTLED;
280 }
269 } 281 }
270 282
271 return NULL; 283 return NULL;
@@ -290,11 +302,16 @@ static void netem_reset(struct Qdisc *sch)
290 del_timer_sync(&q->timer); 302 del_timer_sync(&q->timer);
291} 303}
292 304
305/* Pass size change message down to embedded FIFO */
293static int set_fifo_limit(struct Qdisc *q, int limit) 306static int set_fifo_limit(struct Qdisc *q, int limit)
294{ 307{
295 struct rtattr *rta; 308 struct rtattr *rta;
296 int ret = -ENOMEM; 309 int ret = -ENOMEM;
297 310
311 /* Hack to avoid sending change message to non-FIFO */
312 if (strncmp(q->ops->id + 1, "fifo", 4) != 0)
313 return 0;
314
298 rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); 315 rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
299 if (rta) { 316 if (rta) {
300 rta->rta_type = RTM_NEWQDISC; 317 rta->rta_type = RTM_NEWQDISC;
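The `q->ops->id + 1` comparison added above skips the first letter so that "pfifo", "bfifo" and the new "tfifo" all compare equal to "fifo", while anything else is left alone. A quick standalone check (not from the patch):

#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *ids[] = { "pfifo", "bfifo", "tfifo", "netem" };

        /* Mirrors the set_fifo_limit() test on each qdisc id. */
        for (int i = 0; i < 4; i++)
                printf("%s -> %s\n", ids[i],
                       strncmp(ids[i] + 1, "fifo", 4) == 0 ? "fifo" : "other");
        return 0;
}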
@@ -426,6 +443,84 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt)
426 return 0; 443 return 0;
427} 444}
428 445
446/*
447 * Special case version of FIFO queue for use by netem.
448 * It queues in order based on timestamps in skb's
449 */
450struct fifo_sched_data {
451 u32 limit;
452};
453
454static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
455{
456 struct fifo_sched_data *q = qdisc_priv(sch);
457 struct sk_buff_head *list = &sch->q;
458 const struct netem_skb_cb *ncb
459 = (const struct netem_skb_cb *)nskb->cb;
460 struct sk_buff *skb;
461
462 if (likely(skb_queue_len(list) < q->limit)) {
463 skb_queue_reverse_walk(list, skb) {
464 const struct netem_skb_cb *cb
465 = (const struct netem_skb_cb *)skb->cb;
466
467 if (PSCHED_TLESS(cb->time_to_send, ncb->time_to_send))
468 break;
469 }
470
471 __skb_queue_after(list, skb, nskb);
472
473 sch->qstats.backlog += nskb->len;
474 sch->bstats.bytes += nskb->len;
475 sch->bstats.packets++;
476
477 return NET_XMIT_SUCCESS;
478 }
479
480 return qdisc_drop(nskb, sch);
481}
482
483static int tfifo_init(struct Qdisc *sch, struct rtattr *opt)
484{
485 struct fifo_sched_data *q = qdisc_priv(sch);
486
487 if (opt) {
488 struct tc_fifo_qopt *ctl = RTA_DATA(opt);
489 if (RTA_PAYLOAD(opt) < sizeof(*ctl))
490 return -EINVAL;
491
492 q->limit = ctl->limit;
493 } else
494 q->limit = max_t(u32, sch->dev->tx_queue_len, 1);
495
496 return 0;
497}
498
499static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
500{
501 struct fifo_sched_data *q = qdisc_priv(sch);
502 struct tc_fifo_qopt opt = { .limit = q->limit };
503
504 RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
505 return skb->len;
506
507rtattr_failure:
508 return -1;
509}
510
511static struct Qdisc_ops tfifo_qdisc_ops = {
512 .id = "tfifo",
513 .priv_size = sizeof(struct fifo_sched_data),
514 .enqueue = tfifo_enqueue,
515 .dequeue = qdisc_dequeue_head,
516 .requeue = qdisc_requeue,
517 .drop = qdisc_queue_drop,
518 .init = tfifo_init,
519 .reset = qdisc_reset_queue,
520 .change = tfifo_init,
521 .dump = tfifo_dump,
522};
523
429static int netem_init(struct Qdisc *sch, struct rtattr *opt) 524static int netem_init(struct Qdisc *sch, struct rtattr *opt)
430{ 525{
431 struct netem_sched_data *q = qdisc_priv(sch); 526 struct netem_sched_data *q = qdisc_priv(sch);
@@ -438,7 +533,7 @@ static int netem_init(struct Qdisc *sch, struct rtattr *opt)
438 q->timer.function = netem_watchdog; 533 q->timer.function = netem_watchdog;
439 q->timer.data = (unsigned long) sch; 534 q->timer.data = (unsigned long) sch;
440 535
441 q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); 536 q->qdisc = qdisc_create_dflt(sch->dev, &tfifo_qdisc_ops);
442 if (!q->qdisc) { 537 if (!q->qdisc) {
443 pr_debug("netem: qdisc create failed\n"); 538 pr_debug("netem: qdisc create failed\n");
444 return -ENOMEM; 539 return -ENOMEM;
@@ -601,6 +696,7 @@ static struct Qdisc_ops netem_qdisc_ops = {
601 696
602static int __init netem_module_init(void) 697static int __init netem_module_init(void)
603{ 698{
699 pr_info("netem: version " VERSION "\n");
604 return register_qdisc(&netem_qdisc_ops); 700 return register_qdisc(&netem_qdisc_ops);
605} 701}
606static void __exit netem_module_exit(void) 702static void __exit netem_module_exit(void)
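The ordering rule behind tfifo_enqueue() above: walk the queue from the tail and insert the new packet after the first element with a strictly earlier send time, so the queue stays sorted by time_to_send even when netem hands it deliberately reordered packets. A minimal array-based userspace analogue with made-up timestamps:

#include <stdio.h>

int main(void)
{
        long q[8] = { 10, 20, 40 };          /* send times, kept sorted */
        int len = 3;
        long t = 30;                         /* new packet's send time */
        int pos = len;

        while (pos > 0 && q[pos - 1] >= t)   /* walk from the tail */
                pos--;
        for (int i = len; i > pos; i--)      /* shift and insert */
                q[i] = q[i - 1];
        q[pos] = t;
        len++;

        for (int i = 0; i < len; i++)
                printf("%ld ", q[i]);        /* prints: 10 20 30 40 */
        printf("\n");
        return 0;
}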
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 7845d045ee..dccfa44c2d 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -9,76 +9,23 @@
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 * 10 *
11 * Changes: 11 * Changes:
12 * J Hadi Salim <hadi@nortel.com> 980914: computation fixes 12 * J Hadi Salim 980914: computation fixes
13 * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly. 13 * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
14 * J Hadi Salim <hadi@nortelnetworks.com> 980816: ECN support 14 * J Hadi Salim 980816: ECN support
15 */ 15 */
16 16
17#include <linux/config.h> 17#include <linux/config.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <asm/uaccess.h>
20#include <asm/system.h>
21#include <linux/bitops.h>
22#include <linux/types.h> 19#include <linux/types.h>
23#include <linux/kernel.h> 20#include <linux/kernel.h>
24#include <linux/sched.h>
25#include <linux/string.h>
26#include <linux/mm.h>
27#include <linux/socket.h>
28#include <linux/sockios.h>
29#include <linux/in.h>
30#include <linux/errno.h>
31#include <linux/interrupt.h>
32#include <linux/if_ether.h>
33#include <linux/inet.h>
34#include <linux/netdevice.h> 21#include <linux/netdevice.h>
35#include <linux/etherdevice.h>
36#include <linux/notifier.h>
37#include <net/ip.h>
38#include <net/route.h>
39#include <linux/skbuff.h> 22#include <linux/skbuff.h>
40#include <net/sock.h>
41#include <net/pkt_sched.h> 23#include <net/pkt_sched.h>
42#include <net/inet_ecn.h> 24#include <net/inet_ecn.h>
43#include <net/dsfield.h> 25#include <net/red.h>
44 26
45 27
46/* Random Early Detection (RED) algorithm. 28/* Parameters, settable by user:
47 =======================================
48
49 Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
50 for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
51
52 This file codes a "divisionless" version of RED algorithm
53 as written down in Fig.17 of the paper.
54
55Short description.
56------------------
57
58 When a new packet arrives we calculate the average queue length:
59
60 avg = (1-W)*avg + W*current_queue_len,
61
62 W is the filter time constant (chosen as 2^(-Wlog)), it controls
63 the inertia of the algorithm. To allow larger bursts, W should be
64 decreased.
65
66 if (avg > th_max) -> packet marked (dropped).
67 if (avg < th_min) -> packet passes.
68 if (th_min < avg < th_max) we calculate probability:
69
70 Pb = max_P * (avg - th_min)/(th_max-th_min)
71
72 and mark (drop) packet with this probability.
73 Pb changes from 0 (at avg==th_min) to max_P (avg==th_max).
74 max_P should be small (not 1), usually 0.01..0.02 is good value.
75
76 max_P is chosen as a number, so that max_P/(th_max-th_min)
77 is a negative power of two in order arithmetics to contain
78 only shifts.
79
80
81 Parameters, settable by user:
82 ----------------------------- 29 -----------------------------
83 30
84 limit - bytes (must be > qth_max + burst) 31 limit - bytes (must be > qth_max + burst)
@@ -89,243 +36,93 @@ Short description.
89 arbitrarily high (well, less than ram size) 36 arbitrarily high (well, less than ram size)
90 Really, this limit will never be reached 37 Really, this limit will never be reached
91 if RED works correctly. 38 if RED works correctly.
92
93 qth_min - bytes (should be < qth_max/2)
94 qth_max - bytes (should be at least 2*qth_min and less limit)
95 Wlog - bits (<32) log(1/W).
96 Plog - bits (<32)
97
98 Plog is related to max_P by formula:
99
100 max_P = (qth_max-qth_min)/2^Plog;
101
102 F.e. if qth_max=128K and qth_min=32K, then Plog=22
103 corresponds to max_P=0.02
104
105 Scell_log
106 Stab
107
108 Lookup table for log((1-W)^(t/t_ave).
109
110
111NOTES:
112
113Upper bound on W.
114-----------------
115
116 If you want to allow bursts of L packets of size S,
117 you should choose W:
118
119 L + 1 - th_min/S < (1-(1-W)^L)/W
120
121 th_min/S = 32 th_min/S = 4
122
123 log(W) L
124 -1 33
125 -2 35
126 -3 39
127 -4 46
128 -5 57
129 -6 75
130 -7 101
131 -8 135
132 -9 190
133 etc.
134 */ 39 */
135 40
136struct red_sched_data 41struct red_sched_data
137{ 42{
138/* Parameters */ 43 u32 limit; /* HARD maximal queue length */
139 u32 limit; /* HARD maximal queue length */ 44 unsigned char flags;
140 u32 qth_min; /* Min average length threshold: A scaled */ 45 struct red_parms parms;
141 u32 qth_max; /* Max average length threshold: A scaled */ 46 struct red_stats stats;
142 u32 Rmask;
143 u32 Scell_max;
144 unsigned char flags;
145 char Wlog; /* log(W) */
146 char Plog; /* random number bits */
147 char Scell_log;
148 u8 Stab[256];
149
150/* Variables */
151 unsigned long qave; /* Average queue length: A scaled */
152 int qcount; /* Packets since last random number generation */
153 u32 qR; /* Cached random number */
154
155 psched_time_t qidlestart; /* Start of idle period */
156 struct tc_red_xstats st;
157}; 47};
158 48
159static int red_ecn_mark(struct sk_buff *skb) 49static inline int red_use_ecn(struct red_sched_data *q)
160{ 50{
161 if (skb->nh.raw + 20 > skb->tail) 51 return q->flags & TC_RED_ECN;
162 return 0;
163
164 switch (skb->protocol) {
165 case __constant_htons(ETH_P_IP):
166 if (INET_ECN_is_not_ect(skb->nh.iph->tos))
167 return 0;
168 IP_ECN_set_ce(skb->nh.iph);
169 return 1;
170 case __constant_htons(ETH_P_IPV6):
171 if (INET_ECN_is_not_ect(ipv6_get_dsfield(skb->nh.ipv6h)))
172 return 0;
173 IP6_ECN_set_ce(skb->nh.ipv6h);
174 return 1;
175 default:
176 return 0;
177 }
178} 52}
179 53
180static int 54static inline int red_use_harddrop(struct red_sched_data *q)
181red_enqueue(struct sk_buff *skb, struct Qdisc* sch) 55{
56 return q->flags & TC_RED_HARDDROP;
57}
58
59static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
182{ 60{
183 struct red_sched_data *q = qdisc_priv(sch); 61 struct red_sched_data *q = qdisc_priv(sch);
184 62
185 psched_time_t now; 63 q->parms.qavg = red_calc_qavg(&q->parms, sch->qstats.backlog);
186 64
187 if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { 65 if (red_is_idling(&q->parms))
188 long us_idle; 66 red_end_of_idle_period(&q->parms);
189 int shift;
190 67
191 PSCHED_GET_TIME(now); 68 switch (red_action(&q->parms, q->parms.qavg)) {
192 us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); 69 case RED_DONT_MARK:
193 PSCHED_SET_PASTPERFECT(q->qidlestart); 70 break;
194 71
195/* 72 case RED_PROB_MARK:
196 The problem: ideally, average length queue recalcultion should 73 sch->qstats.overlimits++;
197 be done over constant clock intervals. This is too expensive, so that 74 if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) {
198 the calculation is driven by outgoing packets. 75 q->stats.prob_drop++;
199 When the queue is idle we have to model this clock by hand. 76 goto congestion_drop;
200 77 }
201 SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
202 dummy packets as a burst after idle time, i.e.
203
204 q->qave *= (1-W)^m
205
206 This is an apparently overcomplicated solution (f.e. we have to precompute
207 a table to make this calculation in reasonable time)
208 I believe that a simpler model may be used here,
209 but it is field for experiments.
210*/
211 shift = q->Stab[us_idle>>q->Scell_log];
212
213 if (shift) {
214 q->qave >>= shift;
215 } else {
216 /* Approximate initial part of exponent
217 with linear function:
218 (1-W)^m ~= 1-mW + ...
219
220 Seems, it is the best solution to
221 problem of too coarce exponent tabulation.
222 */
223
224 us_idle = (q->qave * us_idle)>>q->Scell_log;
225 if (us_idle < q->qave/2)
226 q->qave -= us_idle;
227 else
228 q->qave >>= 1;
229 }
230 } else {
231 q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
232 /* NOTE:
233 q->qave is fixed point number with point at Wlog.
234 The formulae above is equvalent to floating point
235 version:
236
237 qave = qave*(1-W) + sch->qstats.backlog*W;
238 --ANK (980924)
239 */
240 }
241 78
242 if (q->qave < q->qth_min) { 79 q->stats.prob_mark++;
243 q->qcount = -1; 80 break;
244enqueue: 81
245 if (sch->qstats.backlog + skb->len <= q->limit) { 82 case RED_HARD_MARK:
246 __skb_queue_tail(&sch->q, skb); 83 sch->qstats.overlimits++;
247 sch->qstats.backlog += skb->len; 84 if (red_use_harddrop(q) || !red_use_ecn(q) ||
248 sch->bstats.bytes += skb->len; 85 !INET_ECN_set_ce(skb)) {
249 sch->bstats.packets++; 86 q->stats.forced_drop++;
250 return NET_XMIT_SUCCESS; 87 goto congestion_drop;
251 } else { 88 }
252 q->st.pdrop++;
253 }
254 kfree_skb(skb);
255 sch->qstats.drops++;
256 return NET_XMIT_DROP;
257 }
258 if (q->qave >= q->qth_max) {
259 q->qcount = -1;
260 sch->qstats.overlimits++;
261mark:
262 if (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) {
263 q->st.early++;
264 goto drop;
265 }
266 q->st.marked++;
267 goto enqueue;
268 }
269 89
270 if (++q->qcount) { 90 q->stats.forced_mark++;
271 /* The formula used below causes questions. 91 break;
272
273 OK. qR is random number in the interval 0..Rmask
274 i.e. 0..(2^Plog). If we used floating point
275 arithmetics, it would be: (2^Plog)*rnd_num,
276 where rnd_num is less 1.
277
278 Taking into account, that qave have fixed
279 point at Wlog, and Plog is related to max_P by
280 max_P = (qth_max-qth_min)/2^Plog; two lines
281 below have the following floating point equivalent:
282
283 max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount
284
285 Any questions? --ANK (980924)
286 */
287 if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
288 goto enqueue;
289 q->qcount = 0;
290 q->qR = net_random()&q->Rmask;
291 sch->qstats.overlimits++;
292 goto mark;
293 } 92 }
294 q->qR = net_random()&q->Rmask;
295 goto enqueue;
296 93
297drop: 94 if (sch->qstats.backlog + skb->len <= q->limit)
298 kfree_skb(skb); 95 return qdisc_enqueue_tail(skb, sch);
299 sch->qstats.drops++; 96
97 q->stats.pdrop++;
98 return qdisc_drop(skb, sch);
99
100congestion_drop:
101 qdisc_drop(skb, sch);
300 return NET_XMIT_CN; 102 return NET_XMIT_CN;
301} 103}
302 104
303static int 105static int red_requeue(struct sk_buff *skb, struct Qdisc* sch)
304red_requeue(struct sk_buff *skb, struct Qdisc* sch)
305{ 106{
306 struct red_sched_data *q = qdisc_priv(sch); 107 struct red_sched_data *q = qdisc_priv(sch);
307 108
308 PSCHED_SET_PASTPERFECT(q->qidlestart); 109 if (red_is_idling(&q->parms))
110 red_end_of_idle_period(&q->parms);
309 111
310 __skb_queue_head(&sch->q, skb); 112 return qdisc_requeue(skb, sch);
311 sch->qstats.backlog += skb->len;
312 sch->qstats.requeues++;
313 return 0;
314} 113}
315 114
316static struct sk_buff * 115static struct sk_buff * red_dequeue(struct Qdisc* sch)
317red_dequeue(struct Qdisc* sch)
318{ 116{
319 struct sk_buff *skb; 117 struct sk_buff *skb;
320 struct red_sched_data *q = qdisc_priv(sch); 118 struct red_sched_data *q = qdisc_priv(sch);
321 119
322 skb = __skb_dequeue(&sch->q); 120 skb = qdisc_dequeue_head(sch);
323 if (skb) { 121
324 sch->qstats.backlog -= skb->len; 122 if (skb == NULL && !red_is_idling(&q->parms))
325 return skb; 123 red_start_of_idle_period(&q->parms);
326 } 124
327 PSCHED_GET_TIME(q->qidlestart); 125 return skb;
328 return NULL;
329} 126}
330 127
331static unsigned int red_drop(struct Qdisc* sch) 128static unsigned int red_drop(struct Qdisc* sch)
@@ -333,16 +130,17 @@ static unsigned int red_drop(struct Qdisc* sch)
333 struct sk_buff *skb; 130 struct sk_buff *skb;
334 struct red_sched_data *q = qdisc_priv(sch); 131 struct red_sched_data *q = qdisc_priv(sch);
335 132
336 skb = __skb_dequeue_tail(&sch->q); 133 skb = qdisc_dequeue_tail(sch);
337 if (skb) { 134 if (skb) {
338 unsigned int len = skb->len; 135 unsigned int len = skb->len;
339 sch->qstats.backlog -= len; 136 q->stats.other++;
340 sch->qstats.drops++; 137 qdisc_drop(skb, sch);
341 q->st.other++;
342 kfree_skb(skb);
343 return len; 138 return len;
344 } 139 }
345 PSCHED_GET_TIME(q->qidlestart); 140
141 if (!red_is_idling(&q->parms))
142 red_start_of_idle_period(&q->parms);
143
346 return 0; 144 return 0;
347} 145}
348 146
@@ -350,43 +148,38 @@ static void red_reset(struct Qdisc* sch)
350{ 148{
351 struct red_sched_data *q = qdisc_priv(sch); 149 struct red_sched_data *q = qdisc_priv(sch);
352 150
353 __skb_queue_purge(&sch->q); 151 qdisc_reset_queue(sch);
354 sch->qstats.backlog = 0; 152 red_restart(&q->parms);
355 PSCHED_SET_PASTPERFECT(q->qidlestart);
356 q->qave = 0;
357 q->qcount = -1;
358} 153}
359 154
360static int red_change(struct Qdisc *sch, struct rtattr *opt) 155static int red_change(struct Qdisc *sch, struct rtattr *opt)
361{ 156{
362 struct red_sched_data *q = qdisc_priv(sch); 157 struct red_sched_data *q = qdisc_priv(sch);
363 struct rtattr *tb[TCA_RED_STAB]; 158 struct rtattr *tb[TCA_RED_MAX];
364 struct tc_red_qopt *ctl; 159 struct tc_red_qopt *ctl;
365 160
366 if (opt == NULL || 161 if (opt == NULL || rtattr_parse_nested(tb, TCA_RED_MAX, opt))
367 rtattr_parse_nested(tb, TCA_RED_STAB, opt) || 162 return -EINVAL;
368 tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 || 163
164 if (tb[TCA_RED_PARMS-1] == NULL ||
369 RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) || 165 RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) ||
370 RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256) 166 tb[TCA_RED_STAB-1] == NULL ||
167 RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < RED_STAB_SIZE)
371 return -EINVAL; 168 return -EINVAL;
372 169
373 ctl = RTA_DATA(tb[TCA_RED_PARMS-1]); 170 ctl = RTA_DATA(tb[TCA_RED_PARMS-1]);
374 171
375 sch_tree_lock(sch); 172 sch_tree_lock(sch);
376 q->flags = ctl->flags; 173 q->flags = ctl->flags;
377 q->Wlog = ctl->Wlog;
378 q->Plog = ctl->Plog;
379 q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
380 q->Scell_log = ctl->Scell_log;
381 q->Scell_max = (255<<q->Scell_log);
382 q->qth_min = ctl->qth_min<<ctl->Wlog;
383 q->qth_max = ctl->qth_max<<ctl->Wlog;
384 q->limit = ctl->limit; 174 q->limit = ctl->limit;
385 memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
386 175
387 q->qcount = -1; 176 red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
177 ctl->Plog, ctl->Scell_log,
178 RTA_DATA(tb[TCA_RED_STAB-1]));
179
388 if (skb_queue_empty(&sch->q)) 180 if (skb_queue_empty(&sch->q))
389 PSCHED_SET_PASTPERFECT(q->qidlestart); 181 red_end_of_idle_period(&q->parms);
182
390 sch_tree_unlock(sch); 183 sch_tree_unlock(sch);
391 return 0; 184 return 0;
392} 185}
@@ -399,39 +192,39 @@ static int red_init(struct Qdisc* sch, struct rtattr *opt)
 static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct red_sched_data *q = qdisc_priv(sch);
-	unsigned char *b = skb->tail;
-	struct rtattr *rta;
-	struct tc_red_qopt opt;
-
-	rta = (struct rtattr*)b;
-	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
-	opt.limit = q->limit;
-	opt.qth_min = q->qth_min>>q->Wlog;
-	opt.qth_max = q->qth_max>>q->Wlog;
-	opt.Wlog = q->Wlog;
-	opt.Plog = q->Plog;
-	opt.Scell_log = q->Scell_log;
-	opt.flags = q->flags;
+	struct rtattr *opts = NULL;
+	struct tc_red_qopt opt = {
+		.limit		= q->limit,
+		.flags		= q->flags,
+		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
+		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
+		.Wlog		= q->parms.Wlog,
+		.Plog		= q->parms.Plog,
+		.Scell_log	= q->parms.Scell_log,
+	};
+
+	opts = RTA_NEST(skb, TCA_OPTIONS);
 	RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
-	rta->rta_len = skb->tail - b;
-
-	return skb->len;
+	return RTA_NEST_END(skb, opts);
 
 rtattr_failure:
-	skb_trim(skb, b - skb->data);
-	return -1;
+	return RTA_NEST_CANCEL(skb, opts);
 }
 
 static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 {
 	struct red_sched_data *q = qdisc_priv(sch);
-
-	return gnet_stats_copy_app(d, &q->st, sizeof(q->st));
+	struct tc_red_xstats st = {
+		.early	= q->stats.prob_drop + q->stats.forced_drop,
+		.pdrop	= q->stats.pdrop,
+		.other	= q->stats.other,
+		.marked	= q->stats.prob_mark + q->stats.forced_mark,
+	};
+
+	return gnet_stats_copy_app(d, &st, sizeof(st));
 }
 
 static struct Qdisc_ops red_qdisc_ops = {
-	.next		= NULL,
-	.cl_ops		= NULL,
 	.id		= "red",
 	.priv_size	= sizeof(struct red_sched_data),
 	.enqueue	= red_enqueue,
@@ -450,10 +243,13 @@ static int __init red_module_init(void)
 {
 	return register_qdisc(&red_qdisc_ops);
 }
-static void __exit red_module_exit(void)
+
+static void __exit red_module_exit(void)
 {
 	unregister_qdisc(&red_qdisc_ops);
 }
+
 module_init(red_module_init)
 module_exit(red_module_exit)
+
 MODULE_LICENSE("GPL");
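The sch_red.c hunks above replace the qdisc's open-coded EWMA and marking state with the shared RED library (the q->parms field and the red_* helpers) and rewrite red_dump() to use the RTA_NEST()/RTA_NEST_END()/RTA_NEST_CANCEL() netlink macros. A minimal user-space sketch of the idle-period bookkeeping those helpers encapsulate follows; the struct and clock below are illustrative mocks, not the kernel's include/net/red.h, and the real red_end_of_idle_period() also ages the averaged queue length by the time spent idle.

/* Mock sketch -- types and clock are stand-ins, not kernel API. */
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct mock_red_parms {
	bool   idling;          /* inside an idle period? */
	time_t qidlestart;      /* when the queue went empty */
};

static bool red_is_idling(const struct mock_red_parms *p)
{
	return p->idling;
}

static void red_start_of_idle_period(struct mock_red_parms *p)
{
	p->idling = true;
	p->qidlestart = time(NULL);
}

static void red_end_of_idle_period(struct mock_red_parms *p)
{
	/* The real helper first decays the average queue length by
	 * the time spent idle; the mock only clears the flag. */
	p->idling = false;
}

int main(void)
{
	struct mock_red_parms parms = { .idling = false };

	/* Queue ran dry, as at the end of red_drop()/red_dequeue(). */
	if (!red_is_idling(&parms))
		red_start_of_idle_period(&parms);

	/* Traffic state reset, as in red_change() on an empty queue. */
	if (red_is_idling(&parms))
		red_end_of_idle_period(&parms);

	printf("idling=%d\n", red_is_idling(&parms));
	return 0;
}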
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index a415d99c39..8c7756036e 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -299,11 +299,10 @@ put_rpccred(struct rpc_cred *cred)
 void
 rpcauth_unbindcred(struct rpc_task *task)
 {
-	struct rpc_auth *auth = task->tk_auth;
 	struct rpc_cred *cred = task->tk_msg.rpc_cred;
 
 	dprintk("RPC: %4d releasing %s cred %p\n",
-		task->tk_pid, auth->au_ops->au_name, cred);
+		task->tk_pid, task->tk_auth->au_ops->au_name, cred);
 
 	put_rpccred(cred);
 	task->tk_msg.rpc_cred = NULL;
@@ -312,22 +311,22 @@ rpcauth_unbindcred(struct rpc_task *task)
 u32 *
 rpcauth_marshcred(struct rpc_task *task, u32 *p)
 {
-	struct rpc_auth *auth = task->tk_auth;
 	struct rpc_cred *cred = task->tk_msg.rpc_cred;
 
 	dprintk("RPC: %4d marshaling %s cred %p\n",
-		task->tk_pid, auth->au_ops->au_name, cred);
+		task->tk_pid, task->tk_auth->au_ops->au_name, cred);
+
 	return cred->cr_ops->crmarshal(task, p);
 }
 
 u32 *
 rpcauth_checkverf(struct rpc_task *task, u32 *p)
 {
-	struct rpc_auth *auth = task->tk_auth;
 	struct rpc_cred *cred = task->tk_msg.rpc_cred;
 
 	dprintk("RPC: %4d validating %s cred %p\n",
-		task->tk_pid, auth->au_ops->au_name, cred);
+		task->tk_pid, task->tk_auth->au_ops->au_name, cred);
+
 	return cred->cr_ops->crvalidate(task, p);
 }
 
@@ -363,12 +362,12 @@ rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp,
 int
 rpcauth_refreshcred(struct rpc_task *task)
 {
-	struct rpc_auth *auth = task->tk_auth;
 	struct rpc_cred *cred = task->tk_msg.rpc_cred;
 	int err;
 
 	dprintk("RPC: %4d refreshing %s cred %p\n",
-		task->tk_pid, auth->au_ops->au_name, cred);
+		task->tk_pid, task->tk_auth->au_ops->au_name, cred);
+
 	err = cred->cr_ops->crrefresh(task);
 	if (err < 0)
 		task->tk_status = err;
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 2387e7b823..a03d4b600c 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -63,8 +63,6 @@ EXPORT_SYMBOL(rpc_mkpipe);
 /* Client transport */
 EXPORT_SYMBOL(xprt_create_proto);
 EXPORT_SYMBOL(xprt_set_timeout);
-EXPORT_SYMBOL(xprt_udp_slot_table_entries);
-EXPORT_SYMBOL(xprt_tcp_slot_table_entries);
 
 /* Client credential cache */
 EXPORT_SYMBOL(rpcauth_register);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index e9bd91265f..5a220b2bb3 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -313,6 +313,11 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp)
 	rqstp->rq_proc = proc = ntohl(svc_getu32(argv));	/* procedure number */
 
 	progp = serv->sv_program;
+
+	for (progp = serv->sv_program; progp; progp = progp->pg_next)
+		if (prog == progp->pg_prog)
+			break;
+
 	/*
 	 * Decode auth data, and add verifier to reply buffer.
 	 * We do this before anything else in order to get a decent
@@ -320,7 +325,7 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp)
 	 */
 	auth_res = svc_authenticate(rqstp, &auth_stat);
 	/* Also give the program a chance to reject this call: */
-	if (auth_res == SVC_OK) {
+	if (auth_res == SVC_OK && progp) {
 		auth_stat = rpc_autherr_badcred;
 		auth_res = progp->pg_authenticate(rqstp);
 	}
@@ -340,10 +345,7 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp)
 	case SVC_COMPLETE:
 		goto sendit;
 	}
 
-	for (progp = serv->sv_program; progp; progp = progp->pg_next)
-		if (prog == progp->pg_prog)
-			break;
 	if (progp == NULL)
 		goto err_bad_prog;
 
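The svc.c hunks hoist the RPC program lookup ahead of authentication, so pg_authenticate() is only reached with a resolved progp (hence the added "&& progp" guard). A stand-alone sketch of that ordering, with trimmed stand-ins for svc_program and the request plumbing:

/* Mock sketch -- structures are stand-ins for svc_program et al. */
#include <stdio.h>

struct mock_program {
	unsigned int pg_prog;
	const char *pg_name;
	struct mock_program *pg_next;
	int (*pg_authenticate)(const struct mock_program *);
};

static int always_ok(const struct mock_program *p)
{
	printf("authenticating against %s\n", p->pg_name);
	return 1;
}

int main(void)
{
	struct mock_program nfsd = { 100003, "nfsd", NULL, always_ok };
	struct mock_program mountd = { 100005, "mountd", &nfsd, always_ok };
	unsigned int prog = 100003;
	struct mock_program *progp;

	/* Resolve the program first, as the patch does, so the
	 * per-program authenticate hook is never called through NULL. */
	for (progp = &mountd; progp; progp = progp->pg_next)
		if (prog == progp->pg_prog)
			break;

	if (progp)
		progp->pg_authenticate(progp);
	else
		fprintf(stderr, "program %u unavailable\n", prog);
	return 0;
}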
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index d0c9f460e4..1065904841 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -119,13 +119,6 @@ done:
 	return 0;
 }
 
-unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE;
-unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE;
-unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT;
-EXPORT_SYMBOL(xprt_min_resvport);
-unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT;
-EXPORT_SYMBOL(xprt_max_resvport);
-
 
 static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE;
 static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 2e1529217e..0a51fd46a8 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -36,6 +36,15 @@
 #include <net/tcp.h>
 
 /*
+ * xprtsock tunables
+ */
+unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE;
+unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE;
+
+unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT;
+unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT;
+
+/*
  * How many times to try sending a request on a socket before waiting
  * for the socket buffer to clear.
  */
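Taken together, the last three hunks move the transport tunables out of sysctl.c into xprtsock.c and drop their EXPORT_SYMBOL lines, leaving one definition point next to the code that consumes the values. A two-file sketch of the resulting linkage pattern; the file names and the literal 16 (standing in for RPC_DEF_SLOT_TABLE) are illustrative assumptions:

/* tunables.c -- the single definition point, as xprtsock.c now is. */
unsigned int xprt_udp_slot_table_entries = 16;	/* RPC_DEF_SLOT_TABLE */
unsigned int xprt_tcp_slot_table_entries = 16;	/* RPC_DEF_SLOT_TABLE */

/* consumer.c -- every other file only declares the symbols. */
#include <stdio.h>

extern unsigned int xprt_udp_slot_table_entries;
extern unsigned int xprt_tcp_slot_table_entries;

int main(void)
{
	printf("udp=%u tcp=%u\n", xprt_udp_slot_table_entries,
	       xprt_tcp_slot_table_entries);
	return 0;
}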