diff options
author | Tejun Heo <tj@kernel.org> | 2012-04-01 15:30:01 -0400 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2012-04-01 15:55:00 -0400 |
commit | 959d851caa48829eb85cb85aa949fd6b4c5d5bc6 (patch) | |
tree | 3ba9c94ec346275fb44c4f0d1cd2537cdff8d811 /net/sched | |
parent | a5567932fc926739e29e98487128080f40c61710 (diff) | |
parent | 48ddbe194623ae089cc0576e60363f2d2e85662a (diff) |
Merge branch 'for-3.5' of ../cgroup into block/for-3.5/core-merged
cgroup/for-3.5 contains the following changes which blk-cgroup needs
to proceed with the on-going cleanup.
* Dynamic addition and removal of cftypes to make config/stat file
handling modular for policies.
* cgroup removal update to not wait for css references to drain to fix
blkcg removal hang caused by cfq caching cfqgs.
Pull in cgroup/for-3.5 into block/for-3.5/core. This causes the
following conflicts in block/blk-cgroup.c.
* 761b3ef50e "cgroup: remove cgroup_subsys argument from callbacks"
conflicts with blkiocg_pre_destroy() addition and blkiocg_attach()
removal. Resolved by removing @subsys from all subsys methods.
* 676f7c8f84 "cgroup: relocate cftype and cgroup_subsys definitions in
controllers" conflicts with ->pre_destroy() and ->attach() updates
and removal of modular config. Resolved by dropping forward
declarations of the methods and applying updates to the relocated
blkio_subsys.
* 4baf6e3325 "cgroup: convert all non-memcg controllers to the new
cftype interface" builds upon the previous item. Resolved by adding
->base_cftypes to the relocated blkio_subsys.
Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'net/sched')
-rw-r--r-- | net/sched/Kconfig | 26 | ||||
-rw-r--r-- | net/sched/Makefile | 1 | ||||
-rw-r--r-- | net/sched/cls_cgroup.c | 37 | ||||
-rw-r--r-- | net/sched/sch_netem.c | 6 | ||||
-rw-r--r-- | net/sched/sch_plug.c | 233 | ||||
-rw-r--r-- | net/sched/sch_sfq.c | 6 |
6 files changed, 280 insertions, 29 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2590e91b3289..75b58f81d53d 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig | |||
@@ -260,6 +260,32 @@ config NET_SCH_INGRESS | |||
260 | To compile this code as a module, choose M here: the | 260 | To compile this code as a module, choose M here: the |
261 | module will be called sch_ingress. | 261 | module will be called sch_ingress. |
262 | 262 | ||
263 | config NET_SCH_PLUG | ||
264 | tristate "Plug network traffic until release (PLUG)" | ||
265 | ---help--- | ||
266 | |||
267 | This queuing discipline allows userspace to plug/unplug a network | ||
268 | output queue, using the netlink interface. When it receives an | ||
269 | enqueue command it inserts a plug into the outbound queue that | ||
270 | causes following packets to enqueue until a dequeue command arrives | ||
271 | over netlink, causing the plug to be removed and resuming the normal | ||
272 | packet flow. | ||
273 | |||
274 | This module also provides a generic "network output buffering" | ||
275 | functionality (aka output commit), wherein upon arrival of a dequeue | ||
276 | command, only packets up to the first plug are released for delivery. | ||
277 | The Remus HA project uses this module to enable speculative execution | ||
278 | of virtual machines by allowing the generated network output to be rolled | ||
279 | back if needed. | ||
280 | |||
281 | For more information, please refer to http://wiki.xensource.com/xenwiki/Remus | ||
282 | |||
283 | Say Y here if you are using this kernel for Xen dom0 and | ||
284 | want to protect Xen guests with Remus. | ||
285 | |||
286 | To compile this code as a module, choose M here: the | ||
287 | module will be called sch_plug. | ||
288 | |||
263 | comment "Classification" | 289 | comment "Classification" |
264 | 290 | ||
265 | config NET_CLS | 291 | config NET_CLS |
diff --git a/net/sched/Makefile b/net/sched/Makefile index dc5889c0a15a..8cdf4e2b51d3 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile | |||
@@ -33,6 +33,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o | |||
33 | obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o | 33 | obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o |
34 | obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o | 34 | obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o |
35 | obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o | 35 | obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o |
36 | obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o | ||
36 | obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o | 37 | obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o |
37 | obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o | 38 | obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o |
38 | obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o | 39 | obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o |
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index f84fdc3a7f27..7743ea8d1d38 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c | |||
@@ -22,23 +22,6 @@ | |||
22 | #include <net/sock.h> | 22 | #include <net/sock.h> |
23 | #include <net/cls_cgroup.h> | 23 | #include <net/cls_cgroup.h> |
24 | 24 | ||
25 | static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, | ||
26 | struct cgroup *cgrp); | ||
27 | static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); | ||
28 | static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp); | ||
29 | |||
30 | struct cgroup_subsys net_cls_subsys = { | ||
31 | .name = "net_cls", | ||
32 | .create = cgrp_create, | ||
33 | .destroy = cgrp_destroy, | ||
34 | .populate = cgrp_populate, | ||
35 | #ifdef CONFIG_NET_CLS_CGROUP | ||
36 | .subsys_id = net_cls_subsys_id, | ||
37 | #endif | ||
38 | .module = THIS_MODULE, | ||
39 | }; | ||
40 | |||
41 | |||
42 | static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp) | 25 | static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp) |
43 | { | 26 | { |
44 | return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id), | 27 | return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id), |
@@ -51,8 +34,7 @@ static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) | |||
51 | struct cgroup_cls_state, css); | 34 | struct cgroup_cls_state, css); |
52 | } | 35 | } |
53 | 36 | ||
54 | static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, | 37 | static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp) |
55 | struct cgroup *cgrp) | ||
56 | { | 38 | { |
57 | struct cgroup_cls_state *cs; | 39 | struct cgroup_cls_state *cs; |
58 | 40 | ||
@@ -66,7 +48,7 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, | |||
66 | return &cs->css; | 48 | return &cs->css; |
67 | } | 49 | } |
68 | 50 | ||
69 | static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) | 51 | static void cgrp_destroy(struct cgroup *cgrp) |
70 | { | 52 | { |
71 | kfree(cgrp_cls_state(cgrp)); | 53 | kfree(cgrp_cls_state(cgrp)); |
72 | } | 54 | } |
@@ -88,12 +70,19 @@ static struct cftype ss_files[] = { | |||
88 | .read_u64 = read_classid, | 70 | .read_u64 = read_classid, |
89 | .write_u64 = write_classid, | 71 | .write_u64 = write_classid, |
90 | }, | 72 | }, |
73 | { } /* terminate */ | ||
91 | }; | 74 | }; |
92 | 75 | ||
93 | static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) | 76 | struct cgroup_subsys net_cls_subsys = { |
94 | { | 77 | .name = "net_cls", |
95 | return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files)); | 78 | .create = cgrp_create, |
96 | } | 79 | .destroy = cgrp_destroy, |
80 | #ifdef CONFIG_NET_CLS_CGROUP | ||
81 | .subsys_id = net_cls_subsys_id, | ||
82 | #endif | ||
83 | .base_cftypes = ss_files, | ||
84 | .module = THIS_MODULE, | ||
85 | }; | ||
97 | 86 | ||
98 | struct cls_cgroup_head { | 87 | struct cls_cgroup_head { |
99 | u32 handle; | 88 | u32 handle; |
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index e83d61ca78ca..5da548fa7ae9 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c | |||
@@ -501,9 +501,8 @@ tfifo_dequeue: | |||
501 | 501 | ||
502 | /* if more time remaining? */ | 502 | /* if more time remaining? */ |
503 | if (cb->time_to_send <= psched_get_time()) { | 503 | if (cb->time_to_send <= psched_get_time()) { |
504 | skb = qdisc_dequeue_tail(sch); | 504 | __skb_unlink(skb, &sch->q); |
505 | if (unlikely(!skb)) | 505 | sch->qstats.backlog -= qdisc_pkt_len(skb); |
506 | goto qdisc_dequeue; | ||
507 | 506 | ||
508 | #ifdef CONFIG_NET_CLS_ACT | 507 | #ifdef CONFIG_NET_CLS_ACT |
509 | /* | 508 | /* |
@@ -539,7 +538,6 @@ deliver: | |||
539 | qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send); | 538 | qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send); |
540 | } | 539 | } |
541 | 540 | ||
542 | qdisc_dequeue: | ||
543 | if (q->qdisc) { | 541 | if (q->qdisc) { |
544 | skb = q->qdisc->ops->dequeue(q->qdisc); | 542 | skb = q->qdisc->ops->dequeue(q->qdisc); |
545 | if (skb) | 543 | if (skb) |
diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c new file mode 100644 index 000000000000..89f8fcf73f18 --- /dev/null +++ b/net/sched/sch_plug.c | |||
@@ -0,0 +1,233 @@ | |||
1 | /* | ||
2 | * sch_plug.c Queue traffic until an explicit release command | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * There are two ways to use this qdisc: | ||
10 | * 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating | ||
11 | * sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands. | ||
12 | * | ||
13 | * 2. For network output buffering (a.k.a output commit) functionality. | ||
14 | * Output commit property is commonly used by applications using checkpoint | ||
15 | * based fault-tolerance to ensure that the checkpoint from which a system | ||
16 | * is being restored is consistent w.r.t outside world. | ||
17 | * | ||
18 | * Consider for e.g. Remus - a Virtual Machine checkpointing system, | ||
19 | * wherein a VM is checkpointed, say every 50ms. The checkpoint is replicated | ||
20 | * asynchronously to the backup host, while the VM continues executing the | ||
21 | * next epoch speculatively. | ||
22 | * | ||
23 | * The following is a typical sequence of output buffer operations: | ||
24 | * 1.At epoch i, start_buffer(i) | ||
25 | * 2. At end of epoch i (i.e. after 50ms): | ||
26 | * 2.1 Stop VM and take checkpoint(i). | ||
27 | * 2.2 start_buffer(i+1) and Resume VM | ||
28 | * 3. While speculatively executing epoch(i+1), asynchronously replicate | ||
29 | * checkpoint(i) to backup host. | ||
30 | * 4. When checkpoint_ack(i) is received from backup, release_buffer(i) | ||
31 | * Thus, this Qdisc would receive the following sequence of commands: | ||
32 | * TCQ_PLUG_BUFFER (epoch i) | ||
33 | * .. TCQ_PLUG_BUFFER (epoch i+1) | ||
34 | * ....TCQ_PLUG_RELEASE_ONE (epoch i) | ||
35 | * ......TCQ_PLUG_BUFFER (epoch i+2) | ||
36 | * ........ | ||
37 | */ | ||
38 | |||
39 | #include <linux/module.h> | ||
40 | #include <linux/types.h> | ||
41 | #include <linux/kernel.h> | ||
42 | #include <linux/errno.h> | ||
43 | #include <linux/netdevice.h> | ||
44 | #include <linux/skbuff.h> | ||
45 | #include <net/pkt_sched.h> | ||
46 | |||
47 | /* | ||
48 | * State of the queue, when used for network output buffering: | ||
49 | * | ||
50 | * plug(i+1) plug(i) head | ||
51 | * ------------------+--------------------+----------------> | ||
52 | * | | | ||
53 | * | | | ||
54 | * pkts_current_epoch| pkts_last_epoch |pkts_to_release | ||
55 | * ----------------->|<--------+--------->|+---------------> | ||
56 | * v v | ||
57 | * | ||
58 | */ | ||
59 | |||
60 | struct plug_sched_data { | ||
61 | /* If true, the dequeue function releases all packets | ||
62 | * from head to end of the queue. The queue turns into | ||
63 | * a pass-through queue for newly arriving packets. | ||
64 | */ | ||
65 | bool unplug_indefinite; | ||
66 | |||
67 | /* Queue Limit in bytes */ | ||
68 | u32 limit; | ||
69 | |||
70 | /* Number of packets (output) from the current speculatively | ||
71 | * executing epoch. | ||
72 | */ | ||
73 | u32 pkts_current_epoch; | ||
74 | |||
75 | /* Number of packets corresponding to the recently finished | ||
76 | * epoch. These will be released when we receive a | ||
77 | * TCQ_PLUG_RELEASE_ONE command. This command is typically | ||
78 | * issued after committing a checkpoint at the target. | ||
79 | */ | ||
80 | u32 pkts_last_epoch; | ||
81 | |||
82 | /* | ||
83 | * Number of packets from the head of the queue, that can | ||
84 | * be released (committed checkpoint). | ||
85 | */ | ||
86 | u32 pkts_to_release; | ||
87 | }; | ||
88 | |||
89 | static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch) | ||
90 | { | ||
91 | struct plug_sched_data *q = qdisc_priv(sch); | ||
92 | |||
93 | if (likely(sch->qstats.backlog + skb->len <= q->limit)) { | ||
94 | if (!q->unplug_indefinite) | ||
95 | q->pkts_current_epoch++; | ||
96 | return qdisc_enqueue_tail(skb, sch); | ||
97 | } | ||
98 | |||
99 | return qdisc_reshape_fail(skb, sch); | ||
100 | } | ||
101 | |||
102 | static struct sk_buff *plug_dequeue(struct Qdisc *sch) | ||
103 | { | ||
104 | struct plug_sched_data *q = qdisc_priv(sch); | ||
105 | |||
106 | if (qdisc_is_throttled(sch)) | ||
107 | return NULL; | ||
108 | |||
109 | if (!q->unplug_indefinite) { | ||
110 | if (!q->pkts_to_release) { | ||
111 | /* No more packets to dequeue. Block the queue | ||
112 | * and wait for the next release command. | ||
113 | */ | ||
114 | qdisc_throttled(sch); | ||
115 | return NULL; | ||
116 | } | ||
117 | q->pkts_to_release--; | ||
118 | } | ||
119 | |||
120 | return qdisc_dequeue_head(sch); | ||
121 | } | ||
122 | |||
123 | static int plug_init(struct Qdisc *sch, struct nlattr *opt) | ||
124 | { | ||
125 | struct plug_sched_data *q = qdisc_priv(sch); | ||
126 | |||
127 | q->pkts_current_epoch = 0; | ||
128 | q->pkts_last_epoch = 0; | ||
129 | q->pkts_to_release = 0; | ||
130 | q->unplug_indefinite = false; | ||
131 | |||
132 | if (opt == NULL) { | ||
133 | /* We will set a default limit of 100 pkts (~150kB) | ||
134 | * in case tx_queue_len is not available. The | ||
135 | * default value is completely arbitrary. | ||
136 | */ | ||
137 | u32 pkt_limit = qdisc_dev(sch)->tx_queue_len ? : 100; | ||
138 | q->limit = pkt_limit * psched_mtu(qdisc_dev(sch)); | ||
139 | } else { | ||
140 | struct tc_plug_qopt *ctl = nla_data(opt); | ||
141 | |||
142 | if (nla_len(opt) < sizeof(*ctl)) | ||
143 | return -EINVAL; | ||
144 | |||
145 | q->limit = ctl->limit; | ||
146 | } | ||
147 | |||
148 | qdisc_throttled(sch); | ||
149 | return 0; | ||
150 | } | ||
151 | |||
152 | /* Receives 4 types of messages: | ||
153 | * TCQ_PLUG_BUFFER: Insert a plug into the queue and | ||
154 | * buffer any incoming packets | ||
155 | * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head | ||
156 | * to beginning of the next plug. | ||
157 | * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. | ||
158 | * Stop buffering packets until the next TCQ_PLUG_BUFFER | ||
159 | * command is received (just act as a pass-thru queue). | ||
160 | * TCQ_PLUG_LIMIT: Increase/decrease queue size | ||
161 | */ | ||
162 | static int plug_change(struct Qdisc *sch, struct nlattr *opt) | ||
163 | { | ||
164 | struct plug_sched_data *q = qdisc_priv(sch); | ||
165 | struct tc_plug_qopt *msg; | ||
166 | |||
167 | if (opt == NULL) | ||
168 | return -EINVAL; | ||
169 | |||
170 | msg = nla_data(opt); | ||
171 | if (nla_len(opt) < sizeof(*msg)) | ||
172 | return -EINVAL; | ||
173 | |||
174 | switch (msg->action) { | ||
175 | case TCQ_PLUG_BUFFER: | ||
176 | /* Save size of the current buffer */ | ||
177 | q->pkts_last_epoch = q->pkts_current_epoch; | ||
178 | q->pkts_current_epoch = 0; | ||
179 | if (q->unplug_indefinite) | ||
180 | qdisc_throttled(sch); | ||
181 | q->unplug_indefinite = false; | ||
182 | break; | ||
183 | case TCQ_PLUG_RELEASE_ONE: | ||
184 | /* Add packets from the last complete buffer to the | ||
185 | * packets to be released set. | ||
186 | */ | ||
187 | q->pkts_to_release += q->pkts_last_epoch; | ||
188 | q->pkts_last_epoch = 0; | ||
189 | qdisc_unthrottled(sch); | ||
190 | netif_schedule_queue(sch->dev_queue); | ||
191 | break; | ||
192 | case TCQ_PLUG_RELEASE_INDEFINITE: | ||
193 | q->unplug_indefinite = true; | ||
194 | q->pkts_to_release = 0; | ||
195 | q->pkts_last_epoch = 0; | ||
196 | q->pkts_current_epoch = 0; | ||
197 | qdisc_unthrottled(sch); | ||
198 | netif_schedule_queue(sch->dev_queue); | ||
199 | break; | ||
200 | case TCQ_PLUG_LIMIT: | ||
201 | /* Limit is supplied in bytes */ | ||
202 | q->limit = msg->limit; | ||
203 | break; | ||
204 | default: | ||
205 | return -EINVAL; | ||
206 | } | ||
207 | |||
208 | return 0; | ||
209 | } | ||
210 | |||
211 | static struct Qdisc_ops plug_qdisc_ops __read_mostly = { | ||
212 | .id = "plug", | ||
213 | .priv_size = sizeof(struct plug_sched_data), | ||
214 | .enqueue = plug_enqueue, | ||
215 | .dequeue = plug_dequeue, | ||
216 | .peek = qdisc_peek_head, | ||
217 | .init = plug_init, | ||
218 | .change = plug_change, | ||
219 | .owner = THIS_MODULE, | ||
220 | }; | ||
221 | |||
222 | static int __init plug_module_init(void) | ||
223 | { | ||
224 | return register_qdisc(&plug_qdisc_ops); | ||
225 | } | ||
226 | |||
227 | static void __exit plug_module_exit(void) | ||
228 | { | ||
229 | unregister_qdisc(&plug_qdisc_ops); | ||
230 | } | ||
231 | module_init(plug_module_init) | ||
232 | module_exit(plug_module_exit) | ||
233 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 60d47180f043..02a21abea65e 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c | |||
@@ -469,11 +469,15 @@ enqueue: | |||
469 | if (slot->qlen == 1) { /* The flow is new */ | 469 | if (slot->qlen == 1) { /* The flow is new */ |
470 | if (q->tail == NULL) { /* It is the first flow */ | 470 | if (q->tail == NULL) { /* It is the first flow */ |
471 | slot->next = x; | 471 | slot->next = x; |
472 | q->tail = slot; | ||
473 | } else { | 472 | } else { |
474 | slot->next = q->tail->next; | 473 | slot->next = q->tail->next; |
475 | q->tail->next = x; | 474 | q->tail->next = x; |
476 | } | 475 | } |
476 | /* We put this flow at the end of our flow list. | ||
477 | * This might sound unfair for a new flow to wait after old ones, | ||
478 | * but we could end up servicing new flows only, and freeze old ones. | ||
479 | */ | ||
480 | q->tail = slot; | ||
477 | /* We could use a bigger initial quantum for new flows */ | 481 | /* We could use a bigger initial quantum for new flows */ |
478 | slot->allot = q->scaled_quantum; | 482 | slot->allot = q->scaled_quantum; |
479 | } | 483 | } |