diff options
Diffstat (limited to 'net/sched')
-rw-r--r-- | net/sched/Kconfig | 26 | ||||
-rw-r--r-- | net/sched/Makefile | 1 | ||||
-rw-r--r-- | net/sched/sch_plug.c | 233 |
3 files changed, 260 insertions, 0 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2590e91b3289..75b58f81d53d 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig | |||
@@ -260,6 +260,32 @@ config NET_SCH_INGRESS | |||
260 | To compile this code as a module, choose M here: the | 260 | To compile this code as a module, choose M here: the |
261 | module will be called sch_ingress. | 261 | module will be called sch_ingress. |
262 | 262 | ||
263 | config NET_SCH_PLUG | ||
264 | tristate "Plug network traffic until release (PLUG)" | ||
265 | ---help--- | ||
266 | |||
267 | This queuing discipline allows userspace to plug/unplug a network | ||
268 | output queue, using the netlink interface. When it receives an | ||
269 | enqueue command it inserts a plug into the outbound queue that | ||
270 | causes following packets to enqueue until a dequeue command arrives | ||
271 | over netlink, causing the plug to be removed and resuming the normal | ||
272 | packet flow. | ||
273 | |||
274 | This module also provides a generic "network output buffering" | ||
275 | functionality (aka output commit), wherein upon arrival of a dequeue | ||
276 | command, only packets up to the first plug are released for delivery. | ||
277 | The Remus HA project uses this module to enable speculative execution | ||
278 | of virtual machines by allowing the generated network output to be rolled | ||
279 | back if needed. | ||
280 | |||
281 | For more information, please refer to http://wiki.xensource.com/xenwiki/Remus | ||
282 | |||
283 | Say Y here if you are using this kernel for Xen dom0 and | ||
284 | want to protect Xen guests with Remus. | ||
285 | |||
286 | To compile this code as a module, choose M here: the | ||
287 | module will be called sch_plug. | ||
288 | |||
263 | comment "Classification" | 289 | comment "Classification" |
264 | 290 | ||
265 | config NET_CLS | 291 | config NET_CLS |
diff --git a/net/sched/Makefile b/net/sched/Makefile index dc5889c0a15a..8cdf4e2b51d3 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile | |||
@@ -33,6 +33,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o | |||
33 | obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o | 33 | obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o |
34 | obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o | 34 | obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o |
35 | obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o | 35 | obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o |
36 | obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o | ||
36 | obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o | 37 | obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o |
37 | obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o | 38 | obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o |
38 | obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o | 39 | obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o |
diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c new file mode 100644 index 000000000000..89f8fcf73f18 --- /dev/null +++ b/net/sched/sch_plug.c | |||
@@ -0,0 +1,233 @@ | |||
1 | /* | ||
2 | * sch_plug.c Queue traffic until an explicit release command | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * There are two ways to use this qdisc: | ||
10 | * 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating | ||
11 | * sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands. | ||
12 | * | ||
13 | * 2. For network output buffering (a.k.a output commit) functionality. | ||
14 | * Output commit property is commonly used by applications using checkpoint | ||
15 | * based fault-tolerance to ensure that the checkpoint from which a system | ||
16 | * is being restored is consistent w.r.t outside world. | ||
17 | * | ||
18 | * Consider for e.g. Remus - a Virtual Machine checkpointing system, | ||
19 | * wherein a VM is checkpointed, say every 50ms. The checkpoint is replicated | ||
20 | * asynchronously to the backup host, while the VM continues executing the | ||
21 | * next epoch speculatively. | ||
22 | * | ||
23 | * The following is a typical sequence of output buffer operations: | ||
24 | * 1. At epoch i, start_buffer(i) | ||
25 | * 2. At end of epoch i (i.e. after 50ms): | ||
26 | * 2.1 Stop VM and take checkpoint(i). | ||
27 | * 2.2 start_buffer(i+1) and Resume VM | ||
28 | * 3. While speculatively executing epoch(i+1), asynchronously replicate | ||
29 | * checkpoint(i) to backup host. | ||
30 | * 4. When checkpoint_ack(i) is received from backup, release_buffer(i) | ||
31 | * Thus, this Qdisc would receive the following sequence of commands: | ||
32 | * TCQ_PLUG_BUFFER (epoch i) | ||
33 | * .. TCQ_PLUG_BUFFER (epoch i+1) | ||
34 | * ....TCQ_PLUG_RELEASE_ONE (epoch i) | ||
35 | * ......TCQ_PLUG_BUFFER (epoch i+2) | ||
36 | * ........ | ||
37 | */ | ||
38 | |||
39 | #include <linux/module.h> | ||
40 | #include <linux/types.h> | ||
41 | #include <linux/kernel.h> | ||
42 | #include <linux/errno.h> | ||
43 | #include <linux/netdevice.h> | ||
44 | #include <linux/skbuff.h> | ||
45 | #include <net/pkt_sched.h> | ||
46 | |||
47 | /* | ||
48 | * State of the queue, when used for network output buffering: | ||
49 | * | ||
50 | * plug(i+1) plug(i) head | ||
51 | * ------------------+--------------------+----------------> | ||
52 | * | | | ||
53 | * | | | ||
54 | * pkts_current_epoch| pkts_last_epoch |pkts_to_release | ||
55 | * ----------------->|<--------+--------->|+---------------> | ||
56 | * v v | ||
57 | * | ||
58 | */ | ||
59 | |||
60 | struct plug_sched_data { | ||
61 | /* If true, the dequeue function releases all packets | ||
62 | * from head to end of the queue. The queue turns into | ||
63 | * a pass-through queue for newly arriving packets. | ||
64 | */ | ||
65 | bool unplug_indefinite; | ||
66 | |||
67 | /* Queue Limit in bytes */ | ||
68 | u32 limit; | ||
69 | |||
70 | /* Number of packets (output) from the current speculatively | ||
71 | * executing epoch. | ||
72 | */ | ||
73 | u32 pkts_current_epoch; | ||
74 | |||
75 | /* Number of packets corresponding to the recently finished | ||
76 | * epoch. These will be released when we receive a | ||
77 | * TCQ_PLUG_RELEASE_ONE command. This command is typically | ||
78 | * issued after committing a checkpoint at the target. | ||
79 | */ | ||
80 | u32 pkts_last_epoch; | ||
81 | |||
82 | /* | ||
83 | * Number of packets from the head of the queue, that can | ||
84 | * be released (committed checkpoint). | ||
85 | */ | ||
86 | u32 pkts_to_release; | ||
87 | }; | ||
88 | |||
89 | static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch) | ||
90 | { | ||
91 | struct plug_sched_data *q = qdisc_priv(sch); | ||
92 | |||
93 | if (likely(sch->qstats.backlog + skb->len <= q->limit)) { | ||
94 | if (!q->unplug_indefinite) | ||
95 | q->pkts_current_epoch++; | ||
96 | return qdisc_enqueue_tail(skb, sch); | ||
97 | } | ||
98 | |||
99 | return qdisc_reshape_fail(skb, sch); | ||
100 | } | ||
101 | |||
102 | static struct sk_buff *plug_dequeue(struct Qdisc *sch) | ||
103 | { | ||
104 | struct plug_sched_data *q = qdisc_priv(sch); | ||
105 | |||
106 | if (qdisc_is_throttled(sch)) | ||
107 | return NULL; | ||
108 | |||
109 | if (!q->unplug_indefinite) { | ||
110 | if (!q->pkts_to_release) { | ||
111 | /* No more packets to dequeue. Block the queue | ||
112 | * and wait for the next release command. | ||
113 | */ | ||
114 | qdisc_throttled(sch); | ||
115 | return NULL; | ||
116 | } | ||
117 | q->pkts_to_release--; | ||
118 | } | ||
119 | |||
120 | return qdisc_dequeue_head(sch); | ||
121 | } | ||
122 | |||
123 | static int plug_init(struct Qdisc *sch, struct nlattr *opt) | ||
124 | { | ||
125 | struct plug_sched_data *q = qdisc_priv(sch); | ||
126 | |||
127 | q->pkts_current_epoch = 0; | ||
128 | q->pkts_last_epoch = 0; | ||
129 | q->pkts_to_release = 0; | ||
130 | q->unplug_indefinite = false; | ||
131 | |||
132 | if (opt == NULL) { | ||
133 | /* We will set a default limit of 100 pkts (~150kB) | ||
134 | * in case tx_queue_len is not available. The | ||
135 | * default value is completely arbitrary. | ||
136 | */ | ||
137 | u32 pkt_limit = qdisc_dev(sch)->tx_queue_len ? : 100; | ||
138 | q->limit = pkt_limit * psched_mtu(qdisc_dev(sch)); | ||
139 | } else { | ||
140 | struct tc_plug_qopt *ctl = nla_data(opt); | ||
141 | |||
142 | if (nla_len(opt) < sizeof(*ctl)) | ||
143 | return -EINVAL; | ||
144 | |||
145 | q->limit = ctl->limit; | ||
146 | } | ||
147 | |||
148 | qdisc_throttled(sch); | ||
149 | return 0; | ||
150 | } | ||
151 | |||
152 | /* Receives 4 types of messages: | ||
153 | * TCQ_PLUG_BUFFER: Inset a plug into the queue and | ||
154 | * buffer any incoming packets | ||
155 | * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head | ||
156 | * to beginning of the next plug. | ||
157 | * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. | ||
158 | * Stop buffering packets until the next TCQ_PLUG_BUFFER | ||
159 | * command is received (just act as a pass-thru queue). | ||
160 | * TCQ_PLUG_LIMIT: Increase/decrease queue size | ||
161 | */ | ||
162 | static int plug_change(struct Qdisc *sch, struct nlattr *opt) | ||
163 | { | ||
164 | struct plug_sched_data *q = qdisc_priv(sch); | ||
165 | struct tc_plug_qopt *msg; | ||
166 | |||
167 | if (opt == NULL) | ||
168 | return -EINVAL; | ||
169 | |||
170 | msg = nla_data(opt); | ||
171 | if (nla_len(opt) < sizeof(*msg)) | ||
172 | return -EINVAL; | ||
173 | |||
174 | switch (msg->action) { | ||
175 | case TCQ_PLUG_BUFFER: | ||
176 | /* Save size of the current buffer */ | ||
177 | q->pkts_last_epoch = q->pkts_current_epoch; | ||
178 | q->pkts_current_epoch = 0; | ||
179 | if (q->unplug_indefinite) | ||
180 | qdisc_throttled(sch); | ||
181 | q->unplug_indefinite = false; | ||
182 | break; | ||
183 | case TCQ_PLUG_RELEASE_ONE: | ||
184 | /* Add packets from the last complete buffer to the | ||
185 | * packets to be released set. | ||
186 | */ | ||
187 | q->pkts_to_release += q->pkts_last_epoch; | ||
188 | q->pkts_last_epoch = 0; | ||
189 | qdisc_unthrottled(sch); | ||
190 | netif_schedule_queue(sch->dev_queue); | ||
191 | break; | ||
192 | case TCQ_PLUG_RELEASE_INDEFINITE: | ||
193 | q->unplug_indefinite = true; | ||
194 | q->pkts_to_release = 0; | ||
195 | q->pkts_last_epoch = 0; | ||
196 | q->pkts_current_epoch = 0; | ||
197 | qdisc_unthrottled(sch); | ||
198 | netif_schedule_queue(sch->dev_queue); | ||
199 | break; | ||
200 | case TCQ_PLUG_LIMIT: | ||
201 | /* Limit is supplied in bytes */ | ||
202 | q->limit = msg->limit; | ||
203 | break; | ||
204 | default: | ||
205 | return -EINVAL; | ||
206 | } | ||
207 | |||
208 | return 0; | ||
209 | } | ||
210 | |||
211 | static struct Qdisc_ops plug_qdisc_ops __read_mostly = { | ||
212 | .id = "plug", | ||
213 | .priv_size = sizeof(struct plug_sched_data), | ||
214 | .enqueue = plug_enqueue, | ||
215 | .dequeue = plug_dequeue, | ||
216 | .peek = qdisc_peek_head, | ||
217 | .init = plug_init, | ||
218 | .change = plug_change, | ||
219 | .owner = THIS_MODULE, | ||
220 | }; | ||
221 | |||
222 | static int __init plug_module_init(void) | ||
223 | { | ||
224 | return register_qdisc(&plug_qdisc_ops); | ||
225 | } | ||
226 | |||
227 | static void __exit plug_module_exit(void) | ||
228 | { | ||
229 | unregister_qdisc(&plug_qdisc_ops); | ||
230 | } | ||
231 | module_init(plug_module_init) | ||
232 | module_exit(plug_module_exit) | ||
233 | MODULE_LICENSE("GPL"); | ||