diff options
author | Tejun Heo <tj@kernel.org> | 2012-04-01 15:30:01 -0400 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2012-04-01 15:55:00 -0400 |
commit | 959d851caa48829eb85cb85aa949fd6b4c5d5bc6 (patch) | |
tree | 3ba9c94ec346275fb44c4f0d1cd2537cdff8d811 /net/sched | |
parent | a5567932fc926739e29e98487128080f40c61710 (diff) | |
parent | 48ddbe194623ae089cc0576e60363f2d2e85662a (diff) |
Merge branch 'for-3.5' of ../cgroup into block/for-3.5/core-merged
cgroup/for-3.5 contains the following changes which blk-cgroup needs
to proceed with the on-going cleanup.
* Dynamic addition and removal of cftypes to make config/stat file
handling modular for policies.
* cgroup removal update to not wait for css references to drain to fix
blkcg removal hang caused by cfq caching cfqgs.
Pull in cgroup/for-3.5 into block/for-3.5/core. This causes the
following conflicts in block/blk-cgroup.c.
* 761b3ef50e "cgroup: remove cgroup_subsys argument from callbacks"
conflicts with blkiocg_pre_destroy() addition and blkiocg_attach()
removal. Resolved by removing @subsys from all subsys methods.
* 676f7c8f84 "cgroup: relocate cftype and cgroup_subsys definitions in
controllers" conflicts with ->pre_destroy() and ->attach() updates
and removal of modular config. Resolved by dropping forward
declarations of the methods and applying updates to the relocated
blkio_subsys.
* 4baf6e3325 "cgroup: convert all non-memcg controllers to the new
cftype interface" builds upon the previous item. Resolved by adding
->base_cftypes to the relocated blkio_subsys.
Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'net/sched')
-rw-r--r-- | net/sched/Kconfig | 26 | ||||
-rw-r--r-- | net/sched/Makefile | 1 | ||||
-rw-r--r-- | net/sched/cls_cgroup.c | 37 | ||||
-rw-r--r-- | net/sched/sch_netem.c | 6 | ||||
-rw-r--r-- | net/sched/sch_plug.c | 233 | ||||
-rw-r--r-- | net/sched/sch_sfq.c | 6 |
6 files changed, 280 insertions, 29 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2590e91b3289..75b58f81d53d 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig | |||
@@ -260,6 +260,32 @@ config NET_SCH_INGRESS | |||
260 | To compile this code as a module, choose M here: the | 260 | To compile this code as a module, choose M here: the |
261 | module will be called sch_ingress. | 261 | module will be called sch_ingress. |
262 | 262 | ||
263 | config NET_SCH_PLUG | ||
264 | tristate "Plug network traffic until release (PLUG)" | ||
265 | ---help--- | ||
266 | |||
267 | This queuing discipline allows userspace to plug/unplug a network | ||
268 | output queue, using the netlink interface. When it receives an | ||
269 | enqueue command it inserts a plug into the outbound queue that | ||
270 | causes following packets to enqueue until a dequeue command arrives | ||
271 | over netlink, causing the plug to be removed and resuming the normal | ||
272 | packet flow. | ||
273 | |||
274 | This module also provides a generic "network output buffering" | ||
275 | functionality (aka output commit), wherein upon arrival of a dequeue | ||
276 | command, only packets up to the first plug are released for delivery. | ||
277 | The Remus HA project uses this module to enable speculative execution | ||
278 | of virtual machines by allowing the generated network output to be rolled | ||
279 | back if needed. | ||
280 | |||
281 | For more information, please refer to http://wiki.xensource.com/xenwiki/Remus | ||
282 | |||
283 | Say Y here if you are using this kernel for Xen dom0 and | ||
284 | want to protect Xen guests with Remus. | ||
285 | |||
286 | To compile this code as a module, choose M here: the | ||
287 | module will be called sch_plug. | ||
288 | |||
263 | comment "Classification" | 289 | comment "Classification" |
264 | 290 | ||
265 | config NET_CLS | 291 | config NET_CLS |
diff --git a/net/sched/Makefile b/net/sched/Makefile index dc5889c0a15a..8cdf4e2b51d3 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile | |||
@@ -33,6 +33,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o | |||
33 | obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o | 33 | obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o |
34 | obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o | 34 | obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o |
35 | obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o | 35 | obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o |
36 | obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o | ||
36 | obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o | 37 | obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o |
37 | obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o | 38 | obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o |
38 | obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o | 39 | obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o |
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index f84fdc3a7f27..7743ea8d1d38 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c | |||
@@ -22,23 +22,6 @@ | |||
22 | #include <net/sock.h> | 22 | #include <net/sock.h> |
23 | #include <net/cls_cgroup.h> | 23 | #include <net/cls_cgroup.h> |
24 | 24 | ||
25 | static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, | ||
26 | struct cgroup *cgrp); | ||
27 | static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); | ||
28 | static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp); | ||
29 | |||
30 | struct cgroup_subsys net_cls_subsys = { | ||
31 | .name = "net_cls", | ||
32 | .create = cgrp_create, | ||
33 | .destroy = cgrp_destroy, | ||
34 | .populate = cgrp_populate, | ||
35 | #ifdef CONFIG_NET_CLS_CGROUP | ||
36 | .subsys_id = net_cls_subsys_id, | ||
37 | #endif | ||
38 | .module = THIS_MODULE, | ||
39 | }; | ||
40 | |||
41 | |||
42 | static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp) | 25 | static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp) |
43 | { | 26 | { |
44 | return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id), | 27 | return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id), |
@@ -51,8 +34,7 @@ static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) | |||
51 | struct cgroup_cls_state, css); | 34 | struct cgroup_cls_state, css); |
52 | } | 35 | } |
53 | 36 | ||
54 | static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, | 37 | static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp) |
55 | struct cgroup *cgrp) | ||
56 | { | 38 | { |
57 | struct cgroup_cls_state *cs; | 39 | struct cgroup_cls_state *cs; |
58 | 40 | ||
@@ -66,7 +48,7 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, | |||
66 | return &cs->css; | 48 | return &cs->css; |
67 | } | 49 | } |
68 | 50 | ||
69 | static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) | 51 | static void cgrp_destroy(struct cgroup *cgrp) |
70 | { | 52 | { |
71 | kfree(cgrp_cls_state(cgrp)); | 53 | kfree(cgrp_cls_state(cgrp)); |
72 | } | 54 | } |
@@ -88,12 +70,19 @@ static struct cftype ss_files[] = { | |||
88 | .read_u64 = read_classid, | 70 | .read_u64 = read_classid, |
89 | .write_u64 = write_classid, | 71 | .write_u64 = write_classid, |
90 | }, | 72 | }, |
73 | { } /* terminate */ | ||
91 | }; | 74 | }; |
92 | 75 | ||
93 | static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) | 76 | struct cgroup_subsys net_cls_subsys = { |
94 | { | 77 | .name = "net_cls", |
95 | return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files)); | 78 | .create = cgrp_create, |
96 | } | 79 | .destroy = cgrp_destroy, |
80 | #ifdef CONFIG_NET_CLS_CGROUP | ||
81 | .subsys_id = net_cls_subsys_id, | ||
82 | #endif | ||
83 | .base_cftypes = ss_files, | ||
84 | .module = THIS_MODULE, | ||
85 | }; | ||
97 | 86 | ||
98 | struct cls_cgroup_head { | 87 | struct cls_cgroup_head { |
99 | u32 handle; | 88 | u32 handle; |
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index e83d61ca78ca..5da548fa7ae9 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c | |||
@@ -501,9 +501,8 @@ tfifo_dequeue: | |||
501 | 501 | ||
502 | /* if more time remaining? */ | 502 | /* if more time remaining? */ |
503 | if (cb->time_to_send <= psched_get_time()) { | 503 | if (cb->time_to_send <= psched_get_time()) { |
504 | skb = qdisc_dequeue_tail(sch); | 504 | __skb_unlink(skb, &sch->q); |
505 | if (unlikely(!skb)) | 505 | sch->qstats.backlog -= qdisc_pkt_len(skb); |
506 | goto qdisc_dequeue; | ||
507 | 506 | ||
508 | #ifdef CONFIG_NET_CLS_ACT | 507 | #ifdef CONFIG_NET_CLS_ACT |
509 | /* | 508 | /* |
@@ -539,7 +538,6 @@ deliver: | |||
539 | qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send); | 538 | qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send); |
540 | } | 539 | } |
541 | 540 | ||
542 | qdisc_dequeue: | ||
543 | if (q->qdisc) { | 541 | if (q->qdisc) { |
544 | skb = q->qdisc->ops->dequeue(q->qdisc); | 542 | skb = q->qdisc->ops->dequeue(q->qdisc); |
545 | if (skb) | 543 | if (skb) |
diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c new file mode 100644 index 000000000000..89f8fcf73f18 --- /dev/null +++ b/net/sched/sch_plug.c | |||
@@ -0,0 +1,233 @@ | |||
1 | /* | ||
2 | * sch_plug.c Queue traffic until an explicit release command | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * There are two ways to use this qdisc: | ||
10 | * 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating | ||
11 | * sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands. | ||
12 | * | ||
13 | * 2. For network output buffering (a.k.a output commit) functionality. | ||
14 | * Output commit property is commonly used by applications using checkpoint | ||
15 | * based fault-tolerance to ensure that the checkpoint from which a system | ||
16 | * is being restored is consistent w.r.t outside world. | ||
17 | * | ||
18 | * Consider for e.g. Remus - a Virtual Machine checkpointing system, | ||
19 | * wherein a VM is checkpointed, say every 50ms. The checkpoint is replicated | ||
20 | * asynchronously to the backup host, while the VM continues executing the | ||
21 | * next epoch speculatively. | ||
22 | * | ||
23 | * The following is a typical sequence of output buffer operations: | ||
24 | * 1.At epoch i, start_buffer(i) | ||
25 | * 2. At end of epoch i (i.e. after 50ms): | ||
26 | * 2.1 Stop VM and take checkpoint(i). | ||
27 | * 2.2 start_buffer(i+1) and Resume VM | ||
28 | * 3. While speculatively executing epoch(i+1), asynchronously replicate | ||
29 | * checkpoint(i) to backup host. | ||
30 | * 4. When checkpoint_ack(i) is received from backup, release_buffer(i) | ||
31 | * Thus, this Qdisc would receive the following sequence of commands: | ||
32 | * TCQ_PLUG_BUFFER (epoch i) | ||
33 | * .. TCQ_PLUG_BUFFER (epoch i+1) | ||
34 | * ....TCQ_PLUG_RELEASE_ONE (epoch i) | ||
35 | * ......TCQ_PLUG_BUFFER (epoch i+2) | ||
36 | * ........ | ||
37 | */ | ||
38 | |||
39 | #include <linux/module.h> | ||
40 | #include <linux/types.h> | ||
41 | #include <linux/kernel.h> | ||
42 | #include <linux/errno.h> | ||
43 | #include <linux/netdevice.h> | ||
44 | #include <linux/skbuff.h> | ||
45 | #include <net/pkt_sched.h> | ||
46 | |||
47 | /* | ||
48 | * State of the queue, when used for network output buffering: | ||
49 | * | ||
50 | * plug(i+1) plug(i) head | ||
51 | * ------------------+--------------------+----------------> | ||
52 | * | | | ||
53 | * | | | ||
54 | * pkts_current_epoch| pkts_last_epoch |pkts_to_release | ||
55 | * ----------------->|<--------+--------->|+---------------> | ||
56 | * v v | ||
57 | * | ||
58 | */ | ||
59 | |||
60 | struct plug_sched_data { | ||
61 | /* If true, the dequeue function releases all packets | ||
62 | * from head to end of the queue. The queue turns into | ||
63 | * a pass-through queue for newly arriving packets. | ||
64 | */ | ||
65 | bool unplug_indefinite; | ||
66 | |||
67 | /* Queue Limit in bytes */ | ||
68 | u32 limit; | ||
69 | |||
70 | /* Number of packets (output) from the current speculatively | ||
71 | * executing epoch. | ||
72 | */ | ||
73 | u32 pkts_current_epoch; | ||
74 | |||
75 | /* Number of packets corresponding to the recently finished | ||
76 | * epoch. These will be released when we receive a | ||
77 | * TCQ_PLUG_RELEASE_ONE command. This command is typically | ||
78 | * issued after committing a checkpoint at the target. | ||
79 | */ | ||
80 | u32 pkts_last_epoch; | ||
81 | |||
82 | /* | ||
83 | * Number of packets from the head of the queue, that can | ||
84 | * be released (committed checkpoint). | ||
85 | */ | ||
86 | u32 pkts_to_release; | ||
87 | }; | ||
88 | |||
89 | static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch) | ||
90 | { | ||
91 | struct plug_sched_data *q = qdisc_priv(sch); | ||
92 | |||
93 | if (likely(sch->qstats.backlog + skb->len <= q->limit)) { | ||
94 | if (!q->unplug_indefinite) | ||
95 | q->pkts_current_epoch++; | ||
96 | return qdisc_enqueue_tail(skb, sch); | ||
97 | } | ||
98 | |||
99 | return qdisc_reshape_fail(skb, sch); | ||
100 | } | ||
101 | |||
102 | static struct sk_buff *plug_dequeue(struct Qdisc *sch) | ||
103 | { | ||
104 | struct plug_sched_data *q = qdisc_priv(sch); | ||
105 | |||
106 | if (qdisc_is_throttled(sch)) | ||
107 | return NULL; | ||
108 | |||
109 | if (!q->unplug_indefinite) { | ||
110 | if (!q->pkts_to_release) { | ||
111 | /* No more packets to dequeue. Block the queue | ||
112 | * and wait for the next release command. | ||
113 | */ | ||
114 | qdisc_throttled(sch); | ||
115 | return NULL; | ||
116 | } | ||
117 | q->pkts_to_release--; | ||
118 | } | ||
119 | |||
120 | return qdisc_dequeue_head(sch); | ||
121 | } | ||
122 | |||
123 | static int plug_init(struct Qdisc *sch, struct nlattr *opt) | ||
124 | { | ||
125 | struct plug_sched_data *q = qdisc_priv(sch); | ||
126 | |||
127 | q->pkts_current_epoch = 0; | ||
128 | q->pkts_last_epoch = 0; | ||
129 | q->pkts_to_release = 0; | ||
130 | q->unplug_indefinite = false; | ||
131 | |||
132 | if (opt == NULL) { | ||
133 | /* We will set a default limit of 100 pkts (~150kB) | ||
134 | * in case tx_queue_len is not available. The | ||
135 | * default value is completely arbitrary. | ||
136 | */ | ||
137 | u32 pkt_limit = qdisc_dev(sch)->tx_queue_len ? : 100; | ||
138 | q->limit = pkt_limit * psched_mtu(qdisc_dev(sch)); | ||
139 | } else { | ||
140 | struct tc_plug_qopt *ctl = nla_data(opt); | ||
141 | |||
142 | if (nla_len(opt) < sizeof(*ctl)) | ||
143 | return -EINVAL; | ||
144 | |||
145 | q->limit = ctl->limit; | ||
146 | } | ||
147 | |||
148 | qdisc_throttled(sch); | ||
149 | return 0; | ||
150 | } | ||
151 | |||
152 | /* Receives 4 types of messages: | ||
153 | * TCQ_PLUG_BUFFER: Insert a plug into the queue and | ||
154 | * buffer any incoming packets | ||
155 | * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head | ||
156 | * to beginning of the next plug. | ||
157 | * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. | ||
158 | * Stop buffering packets until the next TCQ_PLUG_BUFFER | ||
159 | * command is received (just act as a pass-thru queue). | ||
160 | * TCQ_PLUG_LIMIT: Increase/decrease queue size | ||
161 | */ | ||
162 | static int plug_change(struct Qdisc *sch, struct nlattr *opt) | ||
163 | { | ||
164 | struct plug_sched_data *q = qdisc_priv(sch); | ||
165 | struct tc_plug_qopt *msg; | ||
166 | |||
167 | if (opt == NULL) | ||
168 | return -EINVAL; | ||
169 | |||
170 | msg = nla_data(opt); | ||
171 | if (nla_len(opt) < sizeof(*msg)) | ||
172 | return -EINVAL; | ||
173 | |||
174 | switch (msg->action) { | ||
175 | case TCQ_PLUG_BUFFER: | ||
176 | /* Save size of the current buffer */ | ||
177 | q->pkts_last_epoch = q->pkts_current_epoch; | ||
178 | q->pkts_current_epoch = 0; | ||
179 | if (q->unplug_indefinite) | ||
180 | qdisc_throttled(sch); | ||
181 | q->unplug_indefinite = false; | ||
182 | break; | ||
183 | case TCQ_PLUG_RELEASE_ONE: | ||
184 | /* Add packets from the last complete buffer to the | ||
185 | * packets to be released set. | ||
186 | */ | ||
187 | q->pkts_to_release += q->pkts_last_epoch; | ||
188 | q->pkts_last_epoch = 0; | ||
189 | qdisc_unthrottled(sch); | ||
190 | netif_schedule_queue(sch->dev_queue); | ||
191 | break; | ||
192 | case TCQ_PLUG_RELEASE_INDEFINITE: | ||
193 | q->unplug_indefinite = true; | ||
194 | q->pkts_to_release = 0; | ||
195 | q->pkts_last_epoch = 0; | ||
196 | q->pkts_current_epoch = 0; | ||
197 | qdisc_unthrottled(sch); | ||
198 | netif_schedule_queue(sch->dev_queue); | ||
199 | break; | ||
200 | case TCQ_PLUG_LIMIT: | ||
201 | /* Limit is supplied in bytes */ | ||
202 | q->limit = msg->limit; | ||
203 | break; | ||
204 | default: | ||
205 | return -EINVAL; | ||
206 | } | ||
207 | |||
208 | return 0; | ||
209 | } | ||
210 | |||
211 | static struct Qdisc_ops plug_qdisc_ops __read_mostly = { | ||
212 | .id = "plug", | ||
213 | .priv_size = sizeof(struct plug_sched_data), | ||
214 | .enqueue = plug_enqueue, | ||
215 | .dequeue = plug_dequeue, | ||
216 | .peek = qdisc_peek_head, | ||
217 | .init = plug_init, | ||
218 | .change = plug_change, | ||
219 | .owner = THIS_MODULE, | ||
220 | }; | ||
221 | |||
222 | static int __init plug_module_init(void) | ||
223 | { | ||
224 | return register_qdisc(&plug_qdisc_ops); | ||
225 | } | ||
226 | |||
227 | static void __exit plug_module_exit(void) | ||
228 | { | ||
229 | unregister_qdisc(&plug_qdisc_ops); | ||
230 | } | ||
231 | module_init(plug_module_init) | ||
232 | module_exit(plug_module_exit) | ||
233 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 60d47180f043..02a21abea65e 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c | |||
@@ -469,11 +469,15 @@ enqueue: | |||
469 | if (slot->qlen == 1) { /* The flow is new */ | 469 | if (slot->qlen == 1) { /* The flow is new */ |
470 | if (q->tail == NULL) { /* It is the first flow */ | 470 | if (q->tail == NULL) { /* It is the first flow */ |
471 | slot->next = x; | 471 | slot->next = x; |
472 | q->tail = slot; | ||
473 | } else { | 472 | } else { |
474 | slot->next = q->tail->next; | 473 | slot->next = q->tail->next; |
475 | q->tail->next = x; | 474 | q->tail->next = x; |
476 | } | 475 | } |
476 | /* We put this flow at the end of our flow list. | ||
477 | * This might sound unfair for a new flow to wait after old ones, | ||
478 | * but we could end up servicing new flows only, and freeze old ones. | ||
479 | */ | ||
480 | q->tail = slot; | ||
477 | /* We could use a bigger initial quantum for new flows */ | 481 | /* We could use a bigger initial quantum for new flows */ |
478 | slot->allot = q->scaled_quantum; | 482 | slot->allot = q->scaled_quantum; |
479 | } | 483 | } |