aboutsummaryrefslogtreecommitdiffstats
path: root/net/sched
diff options
context:
space:
mode:
authorShriram Rajagopalan <rshriram@cs.ubc.ca>2012-02-05 08:51:32 -0500
committerDavid S. Miller <davem@davemloft.net>2012-02-07 12:54:56 -0500
commitc3059be16c9ef29c05f0876a9df5fea21f29724f (patch)
tree18e5635982b69c7d8369e441d3233c437d2262df /net/sched
parent17b8a74f00474fb4fe6154aa426a80bcf1220997 (diff)
net/sched: sch_plug - Queue traffic until an explicit release command
The qdisc supports two operations - plug and unplug. When the qdisc receives a plug command via netlink request, packets arriving henceforth are buffered until a corresponding unplug command is received. Depending on the type of unplug command, the queue can be unplugged indefinitely or selectively. This qdisc can be used to implement output buffering, an essential functionality required for consistent recovery in checkpoint based fault-tolerance systems. Output buffering enables speculative execution by allowing generated network traffic to be rolled back. It is used to provide network protection for Xen Guests in the Remus high availability project, available as part of Xen. This module is generic enough to be used by any other system that wishes to add speculative execution and output buffering to its applications. This module was originally available in the linux 2.6.32 PV-OPS tree, used as dom0 for Xen. For more information, please refer to http://nss.cs.ubc.ca/remus/ and http://wiki.xensource.com/xenwiki/Remus Changes in V3: * Removed debug output (printk) on queue overflow * Added TCQ_PLUG_RELEASE_INDEFINITE - that allows the user to use this qdisc, for simple plug/unplug operations. * Use of packet counts instead of pointers to keep track of the buffers in the queue. Signed-off-by: Shriram Rajagopalan <rshriram@cs.ubc.ca> Signed-off-by: Brendan Cully <brendan@cs.ubc.ca> [author of the code in the linux 2.6.32 pvops tree] Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/sched')
-rw-r--r--net/sched/Kconfig26
-rw-r--r--net/sched/Makefile1
-rw-r--r--net/sched/sch_plug.c233
3 files changed, 260 insertions, 0 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 2590e91b3289..75b58f81d53d 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -260,6 +260,32 @@ config NET_SCH_INGRESS
260 To compile this code as a module, choose M here: the 260 To compile this code as a module, choose M here: the
261 module will be called sch_ingress. 261 module will be called sch_ingress.
262 262
263config NET_SCH_PLUG
264 tristate "Plug network traffic until release (PLUG)"
265 ---help---
266
267 This queuing discipline allows userspace to plug/unplug a network
268 output queue, using the netlink interface. When it receives an
269 enqueue command it inserts a plug into the outbound queue that
270 causes following packets to enqueue until a dequeue command arrives
271 over netlink, causing the plug to be removed and resuming the normal
272 packet flow.
273
274 This module also provides a generic "network output buffering"
275 functionality (aka output commit), wherein upon arrival of a dequeue
276 command, only packets up to the first plug are released for delivery.
277 The Remus HA project uses this module to enable speculative execution
278 of virtual machines by allowing the generated network output to be rolled
279 back if needed.
280
281 For more information, please refer to http://wiki.xensource.com/xenwiki/Remus
282
283 Say Y here if you are using this kernel for Xen dom0 and
284 want to protect Xen guests with Remus.
285
286 To compile this code as a module, choose M here: the
287 module will be called sch_plug.
288
263comment "Classification" 289comment "Classification"
264 290
265config NET_CLS 291config NET_CLS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index dc5889c0a15a..8cdf4e2b51d3 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o
33obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o 33obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
34obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o 34obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
35obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o 35obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
36obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o
36obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o 37obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
37obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o 38obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
38obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o 39obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o
diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c
new file mode 100644
index 000000000000..ba7b737e4055
--- /dev/null
+++ b/net/sched/sch_plug.c
@@ -0,0 +1,233 @@
1/*
2 * sch_plug.c Queue traffic until an explicit release command
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * There are two ways to use this qdisc:
10 * 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating
11 * sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands.
12 *
13 * 2. For network output buffering (a.k.a output commit) functionality.
14 * Output commit property is commonly used by applications using checkpoint
15 * based fault-tolerance to ensure that the checkpoint from which a system
16 * is being restored is consistent w.r.t outside world.
17 *
18 * Consider for e.g. Remus - a Virtual Machine checkpointing system,
19 * wherein a VM is checkpointed, say every 50ms. The checkpoint is replicated
20 * asynchronously to the backup host, while the VM continues executing the
21 * next epoch speculatively.
22 *
23 * The following is a typical sequence of output buffer operations:
24 * 1. At epoch i, start_buffer(i)
25 * 2. At end of epoch i (i.e. after 50ms):
26 * 2.1 Stop VM and take checkpoint(i).
27 * 2.2 start_buffer(i+1) and Resume VM
28 * 3. While speculatively executing epoch(i+1), asynchronously replicate
29 * checkpoint(i) to backup host.
30 * 4. When checkpoint_ack(i) is received from backup, release_buffer(i)
31 * Thus, this Qdisc would receive the following sequence of commands:
32 * TCQ_PLUG_BUFFER (epoch i)
33 * .. TCQ_PLUG_BUFFER (epoch i+1)
34 * ....TCQ_PLUG_RELEASE_ONE (epoch i)
35 * ......TCQ_PLUG_BUFFER (epoch i+2)
36 * ........
37 */
38
39#include <linux/module.h>
40#include <linux/types.h>
41#include <linux/kernel.h>
42#include <linux/errno.h>
43#include <linux/netdevice.h>
44#include <linux/skbuff.h>
45#include <net/pkt_sched.h>
46
47/*
48 * State of the queue, when used for network output buffering:
49 *
50 * plug(i+1) plug(i) head
51 * ------------------+--------------------+---------------->
52 * | |
53 * | |
54 * pkts_current_epoch| pkts_last_epoch |pkts_to_release
55 * ----------------->|<--------+--------->|+--------------->
56 * v v
57 *
58 */
59
60struct plug_sched_data {
61 /* If true, the dequeue function releases all packets
62 * from head to end of the queue. The queue turns into
63 * a pass-through queue for newly arriving packets.
64 */
65 bool unplug_indefinite;
66
67 /* Queue Limit in bytes */
68 u32 limit;
69
70 /* Number of packets (output) from the current speculatively
71 * executing epoch.
72 */
73 u32 pkts_current_epoch;
74
75 /* Number of packets corresponding to the recently finished
76 * epoch. These will be released when we receive a
77 * TCQ_PLUG_RELEASE_ONE command. This command is typically
78 * issued after committing a checkpoint at the target.
79 */
80 u32 pkts_last_epoch;
81
82 /*
83 * Number of packets from the head of the queue, that can
84 * be released (committed checkpoint).
85 */
86 u32 pkts_to_release;
87};
88
89static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch)
90{
91 struct plug_sched_data *q = qdisc_priv(sch);
92
93 if (likely(sch->qstats.backlog + skb->len <= q->limit)) {
94 if (!q->unplug_indefinite)
95 q->pkts_current_epoch++;
96 return qdisc_enqueue_tail(skb, sch);
97 }
98
99 return qdisc_reshape_fail(skb, sch);
100}
101
102static struct sk_buff *plug_dequeue(struct Qdisc *sch)
103{
104 struct plug_sched_data *q = qdisc_priv(sch);
105
106 if (qdisc_is_throttled(sch))
107 return NULL;
108
109 if (!q->unplug_indefinite) {
110 if (!q->pkts_to_release) {
111 /* No more packets to dequeue. Block the queue
112 * and wait for the next release command.
113 */
114 qdisc_throttled(sch);
115 return NULL;
116 }
117 q->pkts_to_release--;
118 }
119
120 return qdisc_dequeue_head(sch);
121}
122
123static int plug_init(struct Qdisc *sch, struct nlattr *opt)
124{
125 struct plug_sched_data *q = qdisc_priv(sch);
126
127 q->pkts_current_epoch = 0;
128 q->pkts_last_epoch = 0;
129 q->pkts_to_release = 0;
130 q->unplug_indefinite = false;
131
132 if (opt == NULL) {
133 /* We will set a default limit of 100 pkts (~150kB)
134 * in case tx_queue_len is not available. The
135 * default value is completely arbitrary.
136 */
137 u32 pkt_limit = qdisc_dev(sch)->tx_queue_len ? : 100;
138 q->limit = pkt_limit * psched_mtu(qdisc_dev(sch));
139 } else {
140 struct tc_plug_qopt *ctl = nla_data(opt);
141
142 if (nla_len(opt) < sizeof(*ctl))
143 return -EINVAL;
144
145 q->limit = ctl->limit;
146 }
147
148 qdisc_throttled(sch);
149 return 0;
150}
151
152/* Receives 4 types of messages:
153 * TCQ_PLUG_BUFFER: Inset a plug into the queue and
154 * buffer any incoming packets
155 * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head
156 * to beginning of the next plug.
157 * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue.
158 * Stop buffering packets until the next TCQ_PLUG_BUFFER
159 * command is received (just act as a pass-thru queue).
160 * TCQ_PLUG_LIMIT: Increase/decrease queue size
161 */
162static int plug_change(struct Qdisc *sch, struct nlattr *opt)
163{
164 struct plug_sched_data *q = qdisc_priv(sch);
165 struct tc_plug_qopt *msg;
166
167 if (opt == NULL)
168 return -EINVAL;
169
170 msg = nla_data(opt);
171 if (nla_len(opt) < sizeof(*msg))
172 return -EINVAL;
173
174 switch (msg->action) {
175 case TCQ_PLUG_BUFFER:
176 /* Save size of the current buffer */
177 q->pkts_last_epoch = q->pkts_current_epoch;
178 q->pkts_current_epoch = 0;
179 if (q->unplug_indefinite)
180 qdisc_throttled(sch);
181 q->unplug_indefinite = false;
182 break;
183 case TCQ_PLUG_RELEASE_ONE:
184 /* Add packets from the last complete buffer to the
185 * packets to be released set.
186 */
187 q->pkts_to_release += q->pkts_last_epoch;
188 q->pkts_last_epoch = 0;
189 qdisc_unthrottled(sch);
190 netif_schedule_queue(sch->dev_queue);
191 break;
192 case TCQ_PLUG_RELEASE_INDEFINITE:
193 q->unplug_indefinite = true;
194 q->pkts_to_release = 0;
195 q->pkts_last_epoch = 0;
196 q->pkts_current_epoch = 0;
197 qdisc_unthrottled(sch);
198 netif_schedule_queue(sch->dev_queue);
199 break;
200 case TCQ_PLUG_LIMIT:
201 /* Limit is supplied in bytes */
202 q->limit = msg->limit;
203 break;
204 default:
205 return -EINVAL;
206 }
207
208 return 0;
209}
210
211struct Qdisc_ops plug_qdisc_ops = {
212 .id = "plug",
213 .priv_size = sizeof(struct plug_sched_data),
214 .enqueue = plug_enqueue,
215 .dequeue = plug_dequeue,
216 .peek = qdisc_peek_head,
217 .init = plug_init,
218 .change = plug_change,
219 .owner = THIS_MODULE,
220};
221
222static int __init plug_module_init(void)
223{
224 return register_qdisc(&plug_qdisc_ops);
225}
226
227static void __exit plug_module_exit(void)
228{
229 unregister_qdisc(&plug_qdisc_ops);
230}
231module_init(plug_module_init)
232module_exit(plug_module_exit)
233MODULE_LICENSE("GPL");