author    Tejun Heo <tj@kernel.org>	2012-04-01 15:30:01 -0400
committer Tejun Heo <tj@kernel.org>	2012-04-01 15:55:00 -0400
commit    959d851caa48829eb85cb85aa949fd6b4c5d5bc6 (patch)
tree      3ba9c94ec346275fb44c4f0d1cd2537cdff8d811 /net/sched
parent    a5567932fc926739e29e98487128080f40c61710 (diff)
parent    48ddbe194623ae089cc0576e60363f2d2e85662a (diff)
Merge branch 'for-3.5' of ../cgroup into block/for-3.5/core-merged
cgroup/for-3.5 contains the following changes which blk-cgroup needs to
proceed with the on-going cleanup.

* Dynamic addition and removal of cftypes to make config/stat file
  handling modular for policies.

* cgroup removal update to not wait for css references to drain to fix
  blkcg removal hang caused by cfq caching cfqgs.

Pull in cgroup/for-3.5 into block/for-3.5/core.  This causes the
following conflicts in block/blk-cgroup.c.

* 761b3ef50e "cgroup: remove cgroup_subsys argument from callbacks"
  conflicts with blkiocg_pre_destroy() addition and blkiocg_attach()
  removal.  Resolved by removing @subsys from all subsys methods.

* 676f7c8f84 "cgroup: relocate cftype and cgroup_subsys definitions in
  controllers" conflicts with ->pre_destroy() and ->attach() updates
  and removal of modular config.  Resolved by dropping forward
  declarations of the methods and applying updates to the relocated
  blkio_subsys.

* 4baf6e3325 "cgroup: convert all non-memcg controllers to the new
  cftype interface" builds upon the previous item.  Resolved by adding
  ->base_cftypes to the relocated blkio_subsys.

Signed-off-by: Tejun Heo <tj@kernel.org>
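The "new cftype interface" referred to above replaces the per-subsystem ->populate() callback and its cgroup_add_files() call with a sentinel-terminated cftype array hung off ->base_cftypes, which the cgroup core registers itself. As a rough sketch only (not part of this commit; the identifiers mirror the cls_cgroup.c conversion further down in this diff), a converted controller ends up in this shape:

/* Sketch of the post-conversion registration pattern, assuming the
 * cgroup/for-3.5 cftype API (base_cftypes plus empty-struct terminator).
 */
static struct cftype example_files[] = {
	{
		.name		= "classid",
		.read_u64	= read_classid,
		.write_u64	= write_classid,
	},
	{ }	/* terminator; replaces the old cgroup_add_files() call */
};

struct cgroup_subsys example_subsys = {
	.name		= "net_cls",
	.create		= cgrp_create,
	.destroy	= cgrp_destroy,
	.base_cftypes	= example_files,	/* cgroup core adds the files */
	.module		= THIS_MODULE,
};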
Diffstat (limited to 'net/sched')
-rw-r--r--  net/sched/Kconfig        26
-rw-r--r--  net/sched/Makefile        1
-rw-r--r--  net/sched/cls_cgroup.c   37
-rw-r--r--  net/sched/sch_netem.c     6
-rw-r--r--  net/sched/sch_plug.c    233
-rw-r--r--  net/sched/sch_sfq.c       6
6 files changed, 280 insertions, 29 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 2590e91b3289..75b58f81d53d 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -260,6 +260,32 @@ config NET_SCH_INGRESS
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_ingress.
 
+config NET_SCH_PLUG
+	tristate "Plug network traffic until release (PLUG)"
+	---help---
+
+	  This queuing discipline allows userspace to plug/unplug a network
+	  output queue, using the netlink interface.  When it receives an
+	  enqueue command it inserts a plug into the outbound queue that
+	  causes following packets to enqueue until a dequeue command arrives
+	  over netlink, causing the plug to be removed and resuming the normal
+	  packet flow.
+
+	  This module also provides a generic "network output buffering"
+	  functionality (aka output commit), wherein upon arrival of a dequeue
+	  command, only packets up to the first plug are released for delivery.
+	  The Remus HA project uses this module to enable speculative execution
+	  of virtual machines by allowing the generated network output to be rolled
+	  back if needed.
+
+	  For more information, please refer to http://wiki.xensource.com/xenwiki/Remus
+
+	  Say Y here if you are using this kernel for Xen dom0 and
+	  want to protect Xen guests with Remus.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_plug.
+
 comment "Classification"
 
 config NET_CLS
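The help text above says the qdisc is driven over netlink. Concretely, the ->change() handler added later in this diff (plug_change()) parses a struct tc_plug_qopt carried in TCA_OPTIONS. The fragment below is a hypothetical userspace sketch of just that payload; it assumes the companion uapi additions from this patch series (struct tc_plug_qopt and the TCQ_PLUG_* actions in linux/pkt_sched.h) and deliberately leaves out the rtnetlink plumbing that would deliver it:

/* Hypothetical userspace fragment: the TCA_OPTIONS payload consumed by
 * plug_change().  Assumes struct tc_plug_qopt and the TCQ_PLUG_* enum
 * from this series' pkt_sched.h changes; netlink transport is omitted.
 */
#include <linux/pkt_sched.h>

static const struct tc_plug_qopt start_next_epoch = {
	.action = TCQ_PLUG_BUFFER,		/* insert a plug, keep buffering */
};

static const struct tc_plug_qopt commit_previous_epoch = {
	.action = TCQ_PLUG_RELEASE_ONE,		/* release up to the next plug */
};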
diff --git a/net/sched/Makefile b/net/sched/Makefile
index dc5889c0a15a..8cdf4e2b51d3 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o
 obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
 obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
 obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
+obj-$(CONFIG_NET_SCH_PLUG)	+= sch_plug.o
 obj-$(CONFIG_NET_SCH_MQPRIO)	+= sch_mqprio.o
 obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_SCH_QFQ)	+= sch_qfq.o
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index f84fdc3a7f27..7743ea8d1d38 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -22,23 +22,6 @@
 #include <net/sock.h>
 #include <net/cls_cgroup.h>
 
-static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
-					       struct cgroup *cgrp);
-static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
-static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
-
-struct cgroup_subsys net_cls_subsys = {
-	.name		= "net_cls",
-	.create		= cgrp_create,
-	.destroy	= cgrp_destroy,
-	.populate	= cgrp_populate,
-#ifdef CONFIG_NET_CLS_CGROUP
-	.subsys_id	= net_cls_subsys_id,
-#endif
-	.module		= THIS_MODULE,
-};
-
-
 static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp)
 {
 	return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id),
@@ -51,8 +34,7 @@ static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p)
 			    struct cgroup_cls_state, css);
 }
 
-static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
-					       struct cgroup *cgrp)
+static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp)
 {
 	struct cgroup_cls_state *cs;
 
@@ -66,7 +48,7 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
 	return &cs->css;
 }
 
-static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+static void cgrp_destroy(struct cgroup *cgrp)
 {
 	kfree(cgrp_cls_state(cgrp));
 }
@@ -88,12 +70,19 @@ static struct cftype ss_files[] = {
 		.read_u64 = read_classid,
 		.write_u64 = write_classid,
 	},
+	{ }	/* terminate */
 };
 
-static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
-	return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
-}
+struct cgroup_subsys net_cls_subsys = {
+	.name		= "net_cls",
+	.create		= cgrp_create,
+	.destroy	= cgrp_destroy,
+#ifdef CONFIG_NET_CLS_CGROUP
+	.subsys_id	= net_cls_subsys_id,
+#endif
+	.base_cftypes	= ss_files,
+	.module		= THIS_MODULE,
+};
 
 struct cls_cgroup_head {
 	u32 handle;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index e83d61ca78ca..5da548fa7ae9 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -501,9 +501,8 @@ tfifo_dequeue:
 
 		/* if more time remaining? */
 		if (cb->time_to_send <= psched_get_time()) {
-			skb = qdisc_dequeue_tail(sch);
-			if (unlikely(!skb))
-				goto qdisc_dequeue;
+			__skb_unlink(skb, &sch->q);
+			sch->qstats.backlog -= qdisc_pkt_len(skb);
 
 #ifdef CONFIG_NET_CLS_ACT
 			/*
@@ -539,7 +538,6 @@ deliver:
 		qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
 	}
 
-qdisc_dequeue:
 	if (q->qdisc) {
 		skb = q->qdisc->ops->dequeue(q->qdisc);
 		if (skb)
diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c
new file mode 100644
index 000000000000..89f8fcf73f18
--- /dev/null
+++ b/net/sched/sch_plug.c
@@ -0,0 +1,233 @@
+/*
+ * sch_plug.c Queue traffic until an explicit release command
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * There are two ways to use this qdisc:
+ * 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating
+ *    sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands.
+ *
+ * 2. For network output buffering (a.k.a output commit) functionality.
+ *    Output commit property is commonly used by applications using checkpoint
+ *    based fault-tolerance to ensure that the checkpoint from which a system
+ *    is being restored is consistent w.r.t outside world.
+ *
+ *    Consider for e.g. Remus - a Virtual Machine checkpointing system,
+ *    wherein a VM is checkpointed, say every 50ms. The checkpoint is replicated
+ *    asynchronously to the backup host, while the VM continues executing the
+ *    next epoch speculatively.
+ *
+ *    The following is a typical sequence of output buffer operations:
+ *       1. At epoch i, start_buffer(i)
+ *       2. At end of epoch i (i.e. after 50ms):
+ *          2.1 Stop VM and take checkpoint(i).
+ *          2.2 start_buffer(i+1) and Resume VM
+ *       3. While speculatively executing epoch(i+1), asynchronously replicate
+ *          checkpoint(i) to backup host.
+ *       4. When checkpoint_ack(i) is received from backup, release_buffer(i)
+ *    Thus, this Qdisc would receive the following sequence of commands:
+ *       TCQ_PLUG_BUFFER (epoch i)
+ *       .. TCQ_PLUG_BUFFER (epoch i+1)
+ *       ....TCQ_PLUG_RELEASE_ONE (epoch i)
+ *       ......TCQ_PLUG_BUFFER (epoch i+2)
+ *       ........
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+
+/*
+ * State of the queue, when used for network output buffering:
+ *
+ *                 plug(i+1)            plug(i)          head
+ * ------------------+--------------------+---------------->
+ *                   |                    |
+ *                   |                    |
+ * pkts_current_epoch| pkts_last_epoch    |pkts_to_release
+ * ----------------->|<--------+--------->|+--------------->
+ *                   v                    v
+ *
+ */
+
+struct plug_sched_data {
+	/* If true, the dequeue function releases all packets
+	 * from head to end of the queue. The queue turns into
+	 * a pass-through queue for newly arriving packets.
+	 */
+	bool unplug_indefinite;
+
+	/* Queue Limit in bytes */
+	u32 limit;
+
+	/* Number of packets (output) from the current speculatively
+	 * executing epoch.
+	 */
+	u32 pkts_current_epoch;
+
+	/* Number of packets corresponding to the recently finished
+	 * epoch. These will be released when we receive a
+	 * TCQ_PLUG_RELEASE_ONE command. This command is typically
+	 * issued after committing a checkpoint at the target.
+	 */
+	u32 pkts_last_epoch;
+
+	/*
+	 * Number of packets from the head of the queue, that can
+	 * be released (committed checkpoint).
+	 */
+	u32 pkts_to_release;
+};
+
+static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct plug_sched_data *q = qdisc_priv(sch);
+
+	if (likely(sch->qstats.backlog + skb->len <= q->limit)) {
+		if (!q->unplug_indefinite)
+			q->pkts_current_epoch++;
+		return qdisc_enqueue_tail(skb, sch);
+	}
+
+	return qdisc_reshape_fail(skb, sch);
+}
+
+static struct sk_buff *plug_dequeue(struct Qdisc *sch)
+{
+	struct plug_sched_data *q = qdisc_priv(sch);
+
+	if (qdisc_is_throttled(sch))
+		return NULL;
+
+	if (!q->unplug_indefinite) {
+		if (!q->pkts_to_release) {
+			/* No more packets to dequeue. Block the queue
+			 * and wait for the next release command.
+			 */
+			qdisc_throttled(sch);
+			return NULL;
+		}
+		q->pkts_to_release--;
+	}
+
+	return qdisc_dequeue_head(sch);
+}
+
+static int plug_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct plug_sched_data *q = qdisc_priv(sch);
+
+	q->pkts_current_epoch = 0;
+	q->pkts_last_epoch = 0;
+	q->pkts_to_release = 0;
+	q->unplug_indefinite = false;
+
+	if (opt == NULL) {
+		/* We will set a default limit of 100 pkts (~150kB)
+		 * in case tx_queue_len is not available. The
+		 * default value is completely arbitrary.
+		 */
+		u32 pkt_limit = qdisc_dev(sch)->tx_queue_len ? : 100;
+		q->limit = pkt_limit * psched_mtu(qdisc_dev(sch));
+	} else {
+		struct tc_plug_qopt *ctl = nla_data(opt);
+
+		if (nla_len(opt) < sizeof(*ctl))
+			return -EINVAL;
+
+		q->limit = ctl->limit;
+	}
+
+	qdisc_throttled(sch);
+	return 0;
+}
+
+/* Receives 4 types of messages:
+ * TCQ_PLUG_BUFFER: Inset a plug into the queue and
+ *  buffer any incoming packets
+ * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head
+ *   to beginning of the next plug.
+ * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue.
+ *   Stop buffering packets until the next TCQ_PLUG_BUFFER
+ *   command is received (just act as a pass-thru queue).
+ * TCQ_PLUG_LIMIT: Increase/decrease queue size
+ */
+static int plug_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct plug_sched_data *q = qdisc_priv(sch);
+	struct tc_plug_qopt *msg;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	msg = nla_data(opt);
+	if (nla_len(opt) < sizeof(*msg))
+		return -EINVAL;
+
+	switch (msg->action) {
+	case TCQ_PLUG_BUFFER:
+		/* Save size of the current buffer */
+		q->pkts_last_epoch = q->pkts_current_epoch;
+		q->pkts_current_epoch = 0;
+		if (q->unplug_indefinite)
+			qdisc_throttled(sch);
+		q->unplug_indefinite = false;
+		break;
+	case TCQ_PLUG_RELEASE_ONE:
+		/* Add packets from the last complete buffer to the
+		 * packets to be released set.
+		 */
+		q->pkts_to_release += q->pkts_last_epoch;
+		q->pkts_last_epoch = 0;
+		qdisc_unthrottled(sch);
+		netif_schedule_queue(sch->dev_queue);
+		break;
+	case TCQ_PLUG_RELEASE_INDEFINITE:
+		q->unplug_indefinite = true;
+		q->pkts_to_release = 0;
+		q->pkts_last_epoch = 0;
+		q->pkts_current_epoch = 0;
+		qdisc_unthrottled(sch);
+		netif_schedule_queue(sch->dev_queue);
+		break;
+	case TCQ_PLUG_LIMIT:
+		/* Limit is supplied in bytes */
+		q->limit = msg->limit;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct Qdisc_ops plug_qdisc_ops __read_mostly = {
+	.id		= "plug",
+	.priv_size	= sizeof(struct plug_sched_data),
+	.enqueue	= plug_enqueue,
+	.dequeue	= plug_dequeue,
+	.peek		= qdisc_peek_head,
+	.init		= plug_init,
+	.change		= plug_change,
+	.owner		= THIS_MODULE,
+};
+
+static int __init plug_module_init(void)
+{
+	return register_qdisc(&plug_qdisc_ops);
+}
+
+static void __exit plug_module_exit(void)
+{
+	unregister_qdisc(&plug_qdisc_ops);
+}
+module_init(plug_module_init)
+module_exit(plug_module_exit)
+MODULE_LICENSE("GPL");
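As a cross-check of the epoch bookkeeping implemented above, here is a small stand-alone userspace model (a sketch only, not kernel code and not part of this commit) of how pkts_current_epoch, pkts_last_epoch and pkts_to_release move under TCQ_PLUG_BUFFER and TCQ_PLUG_RELEASE_ONE; the indefinite-release mode is left out. It prints that only the packets queued before the released plug get out, which is the "output commit" property described in the file header.

/* Hypothetical stand-alone model of sch_plug's epoch accounting.
 * It reuses the field names from struct plug_sched_data but none of
 * the kernel APIs; queue contents are reduced to packet counters.
 */
#include <stdbool.h>
#include <stdio.h>

struct plug_model {
	bool throttled;			/* mirrors the qdisc throttled state */
	unsigned pkts_current_epoch;
	unsigned pkts_last_epoch;
	unsigned pkts_to_release;
};

static void enqueue(struct plug_model *q, unsigned n)
{
	q->pkts_current_epoch += n;	/* output of the running epoch */
}

static void plug_buffer(struct plug_model *q)		/* TCQ_PLUG_BUFFER */
{
	q->pkts_last_epoch = q->pkts_current_epoch;	/* close the epoch */
	q->pkts_current_epoch = 0;
}

static void release_one(struct plug_model *q)		/* TCQ_PLUG_RELEASE_ONE */
{
	q->pkts_to_release += q->pkts_last_epoch;	/* commit last epoch */
	q->pkts_last_epoch = 0;
	q->throttled = false;
}

static int dequeue(struct plug_model *q)		/* plug_dequeue() */
{
	if (q->throttled || !q->pkts_to_release) {
		q->throttled = true;			/* block at the plug */
		return -1;
	}
	q->pkts_to_release--;
	return 0;
}

int main(void)
{
	struct plug_model q = { .throttled = true };	/* plug_init() throttles */
	int released = 0;

	plug_buffer(&q);	/* start epoch i */
	enqueue(&q, 3);		/* epoch i output */
	plug_buffer(&q);	/* start epoch i+1 */
	enqueue(&q, 2);		/* epoch i+1 output, still speculative */
	release_one(&q);	/* checkpoint(i) acked: release epoch i */

	while (dequeue(&q) == 0)
		released++;

	/* Only the 3 packets from epoch i get out; epoch i+1 stays plugged. */
	printf("released %d, still buffered %u\n",
	       released, q.pkts_current_epoch + q.pkts_last_epoch);
	return 0;
}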
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 60d47180f043..02a21abea65e 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -469,11 +469,15 @@ enqueue:
 	if (slot->qlen == 1) {		/* The flow is new */
 		if (q->tail == NULL) {	/* It is the first flow */
 			slot->next = x;
-			q->tail = slot;
 		} else {
 			slot->next = q->tail->next;
 			q->tail->next = x;
 		}
+		/* We put this flow at the end of our flow list.
+		 * This might sound unfair for a new flow to wait after old ones,
+		 * but we could endup servicing new flows only, and freeze old ones.
+		 */
+		q->tail = slot;
 		/* We could use a bigger initial quantum for new flows */
 		slot->allot = q->scaled_quantum;
 	}