aboutsummaryrefslogtreecommitdiffstats
path: root/net/core
diff options
context:
space:
mode:
authorNeil Horman <nhorman@tuxdriver.com>2011-11-22 00:10:51 -0500
committerDavid S. Miller <davem@davemloft.net>2011-11-22 15:22:23 -0500
commit5bc1421e34ecfe0bd4b26dc3232b7d5e25179144 (patch)
tree783ed95187915c06757a260b637308919b35d5a0 /net/core
parent202ff1c26c768efeead20b388556eda265dc8352 (diff)
net: add network priority cgroup infrastructure (v4)
This patch adds the infrastructure code to create the network priority cgroup. The cgroup, in addition to the standard processes file, creates two control files: 1) prioidx - This is a read-only file that exports the index of this cgroup. This is a value that is both arbitrary and unique to a cgroup in this subsystem, and is used to index the per-device priority map. 2) priomap - This is a writeable file. On read it reports a table of 2-tuples <name:priority>, where name is the name of a network interface and priority indicates the priority assigned to frames egressing on the named interface and originating from a pid in this cgroup. This cgroup allows for skb priority to be set prior to a root qdisc getting selected. This is beneficial for DCB-enabled systems, in that it allows any application to use DCB-configured priorities without application modification. Signed-off-by: Neil Horman <nhorman@tuxdriver.com> Signed-off-by: John Fastabend <john.r.fastabend@intel.com> CC: Robert Love <robert.w.love@intel.com> CC: "David S. Miller" <davem@davemloft.net> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/core')
-rw-r--r--net/core/Makefile1
-rw-r--r--net/core/dev.c14
-rw-r--r--net/core/netprio_cgroup.c344
-rw-r--r--net/core/sock.c22
4 files changed, 380 insertions, 1 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 0d357b1c4e57..3606d40aae62 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o
19obj-$(CONFIG_TRACEPOINTS) += net-traces.o 19obj-$(CONFIG_TRACEPOINTS) += net-traces.o
20obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o 20obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
21obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o 21obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
22obj-$(CONFIG_NETPRIO_CGROUP) += netprio_cgroup.o
diff --git a/net/core/dev.c b/net/core/dev.c
index f78959996148..8afb244b205f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2449,6 +2449,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2449 return rc; 2449 return rc;
2450} 2450}
2451 2451
2452#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2453static void skb_update_prio(struct sk_buff *skb)
2454{
2455 struct netprio_map *map = rcu_dereference(skb->dev->priomap);
2456
2457 if ((!skb->priority) && (skb->sk) && map)
2458 skb->priority = map->priomap[skb->sk->sk_cgrp_prioidx];
2459}
2460#else
2461#define skb_update_prio(skb)
2462#endif
2463
2452static DEFINE_PER_CPU(int, xmit_recursion); 2464static DEFINE_PER_CPU(int, xmit_recursion);
2453#define RECURSION_LIMIT 10 2465#define RECURSION_LIMIT 10
2454 2466
@@ -2489,6 +2501,8 @@ int dev_queue_xmit(struct sk_buff *skb)
2489 */ 2501 */
2490 rcu_read_lock_bh(); 2502 rcu_read_lock_bh();
2491 2503
2504 skb_update_prio(skb);
2505
2492 txq = dev_pick_tx(dev, skb); 2506 txq = dev_pick_tx(dev, skb);
2493 q = rcu_dereference_bh(txq->qdisc); 2507 q = rcu_dereference_bh(txq->qdisc);
2494 2508
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
new file mode 100644
index 000000000000..72ad0bc6841e
--- /dev/null
+++ b/net/core/netprio_cgroup.c
@@ -0,0 +1,344 @@
1/*
2 * net/core/netprio_cgroup.c Priority Control Group
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Neil Horman <nhorman@tuxdriver.com>
10 */
11
12#include <linux/module.h>
13#include <linux/slab.h>
14#include <linux/types.h>
15#include <linux/string.h>
16#include <linux/errno.h>
17#include <linux/skbuff.h>
18#include <linux/cgroup.h>
19#include <linux/rcupdate.h>
20#include <linux/atomic.h>
21#include <net/rtnetlink.h>
22#include <net/pkt_cls.h>
23#include <net/sock.h>
24#include <net/netprio_cgroup.h>
25
26static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
27 struct cgroup *cgrp);
28static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
29static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
30
31struct cgroup_subsys net_prio_subsys = {
32 .name = "net_prio",
33 .create = cgrp_create,
34 .destroy = cgrp_destroy,
35 .populate = cgrp_populate,
36#ifdef CONFIG_NETPRIO_CGROUP
37 .subsys_id = net_prio_subsys_id,
38#endif
39 .module = THIS_MODULE
40};
41
42#define PRIOIDX_SZ 128
43
44static unsigned long prioidx_map[PRIOIDX_SZ];
45static DEFINE_SPINLOCK(prioidx_map_lock);
46static atomic_t max_prioidx = ATOMIC_INIT(0);
47
48static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp)
49{
50 return container_of(cgroup_subsys_state(cgrp, net_prio_subsys_id),
51 struct cgroup_netprio_state, css);
52}
53
54static int get_prioidx(u32 *prio)
55{
56 unsigned long flags;
57 u32 prioidx;
58
59 spin_lock_irqsave(&prioidx_map_lock, flags);
60 prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ);
61 set_bit(prioidx, prioidx_map);
62 spin_unlock_irqrestore(&prioidx_map_lock, flags);
63 if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ)
64 return -ENOSPC;
65
66 atomic_set(&max_prioidx, prioidx);
67 *prio = prioidx;
68 return 0;
69}
70
71static void put_prioidx(u32 idx)
72{
73 unsigned long flags;
74
75 spin_lock_irqsave(&prioidx_map_lock, flags);
76 clear_bit(idx, prioidx_map);
77 spin_unlock_irqrestore(&prioidx_map_lock, flags);
78}
79
80static void extend_netdev_table(struct net_device *dev, u32 new_len)
81{
82 size_t new_size = sizeof(struct netprio_map) +
83 ((sizeof(u32) * new_len));
84 struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL);
85 struct netprio_map *old_priomap;
86 int i;
87
88 old_priomap = rtnl_dereference(dev->priomap);
89
90 if (!new_priomap) {
91 printk(KERN_WARNING "Unable to alloc new priomap!\n");
92 return;
93 }
94
95 for (i = 0;
96 old_priomap && (i < old_priomap->priomap_len);
97 i++)
98 new_priomap->priomap[i] = old_priomap->priomap[i];
99
100 new_priomap->priomap_len = new_len;
101
102 rcu_assign_pointer(dev->priomap, new_priomap);
103 if (old_priomap)
104 kfree_rcu(old_priomap, rcu);
105}
106
107static void update_netdev_tables(void)
108{
109 struct net_device *dev;
110 u32 max_len = atomic_read(&max_prioidx);
111 struct netprio_map *map;
112
113 rtnl_lock();
114 for_each_netdev(&init_net, dev) {
115 map = rtnl_dereference(dev->priomap);
116 if ((!map) ||
117 (map->priomap_len < max_len))
118 extend_netdev_table(dev, max_len);
119 }
120 rtnl_unlock();
121}
122
123static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
124 struct cgroup *cgrp)
125{
126 struct cgroup_netprio_state *cs;
127 int ret;
128
129 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
130 if (!cs)
131 return ERR_PTR(-ENOMEM);
132
133 if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx) {
134 kfree(cs);
135 return ERR_PTR(-EINVAL);
136 }
137
138 ret = get_prioidx(&cs->prioidx);
139 if (ret != 0) {
140 printk(KERN_WARNING "No space in priority index array\n");
141 kfree(cs);
142 return ERR_PTR(ret);
143 }
144
145 return &cs->css;
146}
147
148static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
149{
150 struct cgroup_netprio_state *cs;
151 struct net_device *dev;
152 struct netprio_map *map;
153
154 cs = cgrp_netprio_state(cgrp);
155 rtnl_lock();
156 for_each_netdev(&init_net, dev) {
157 map = rtnl_dereference(dev->priomap);
158 if (map)
159 map->priomap[cs->prioidx] = 0;
160 }
161 rtnl_unlock();
162 put_prioidx(cs->prioidx);
163 kfree(cs);
164}
165
166static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft)
167{
168 return (u64)cgrp_netprio_state(cgrp)->prioidx;
169}
170
171static int read_priomap(struct cgroup *cont, struct cftype *cft,
172 struct cgroup_map_cb *cb)
173{
174 struct net_device *dev;
175 u32 prioidx = cgrp_netprio_state(cont)->prioidx;
176 u32 priority;
177 struct netprio_map *map;
178
179 rcu_read_lock();
180 for_each_netdev_rcu(&init_net, dev) {
181 map = rcu_dereference(dev->priomap);
182 priority = map ? map->priomap[prioidx] : 0;
183 cb->fill(cb, dev->name, priority);
184 }
185 rcu_read_unlock();
186 return 0;
187}
188
189static int write_priomap(struct cgroup *cgrp, struct cftype *cft,
190 const char *buffer)
191{
192 char *devname = kstrdup(buffer, GFP_KERNEL);
193 int ret = -EINVAL;
194 u32 prioidx = cgrp_netprio_state(cgrp)->prioidx;
195 unsigned long priority;
196 char *priostr;
197 struct net_device *dev;
198 struct netprio_map *map;
199
200 if (!devname)
201 return -ENOMEM;
202
203 /*
204 * Minimally sized valid priomap string
205 */
206 if (strlen(devname) < 3)
207 goto out_free_devname;
208
209 priostr = strstr(devname, " ");
210 if (!priostr)
211 goto out_free_devname;
212
213 /*
214 *Separate the devname from the associated priority
215 *and advance the priostr poitner to the priority value
216 */
217 *priostr = '\0';
218 priostr++;
219
220 /*
221 * If the priostr points to NULL, we're at the end of the passed
222 * in string, and its not a valid write
223 */
224 if (*priostr == '\0')
225 goto out_free_devname;
226
227 ret = kstrtoul(priostr, 10, &priority);
228 if (ret < 0)
229 goto out_free_devname;
230
231 ret = -ENODEV;
232
233 dev = dev_get_by_name(&init_net, devname);
234 if (!dev)
235 goto out_free_devname;
236
237 update_netdev_tables();
238 ret = 0;
239 rcu_read_lock();
240 map = rcu_dereference(dev->priomap);
241 if (map)
242 map->priomap[prioidx] = priority;
243 rcu_read_unlock();
244 dev_put(dev);
245
246out_free_devname:
247 kfree(devname);
248 return ret;
249}
250
251static struct cftype ss_files[] = {
252 {
253 .name = "prioidx",
254 .read_u64 = read_prioidx,
255 },
256 {
257 .name = "ifpriomap",
258 .read_map = read_priomap,
259 .write_string = write_priomap,
260 },
261};
262
263static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
264{
265 return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
266}
267
268static int netprio_device_event(struct notifier_block *unused,
269 unsigned long event, void *ptr)
270{
271 struct net_device *dev = ptr;
272 struct netprio_map *old;
273 u32 max_len = atomic_read(&max_prioidx);
274
275 /*
276 * Note this is called with rtnl_lock held so we have update side
277 * protection on our rcu assignments
278 */
279
280 switch (event) {
281
282 case NETDEV_REGISTER:
283 if (max_len)
284 extend_netdev_table(dev, max_len);
285 break;
286 case NETDEV_UNREGISTER:
287 old = rtnl_dereference(dev->priomap);
288 rcu_assign_pointer(dev->priomap, NULL);
289 if (old)
290 kfree_rcu(old, rcu);
291 break;
292 }
293 return NOTIFY_DONE;
294}
295
296static struct notifier_block netprio_device_notifier = {
297 .notifier_call = netprio_device_event
298};
299
300static int __init init_cgroup_netprio(void)
301{
302 int ret;
303
304 ret = cgroup_load_subsys(&net_prio_subsys);
305 if (ret)
306 goto out;
307#ifndef CONFIG_NETPRIO_CGROUP
308 smp_wmb();
309 net_prio_subsys_id = net_prio_subsys.subsys_id;
310#endif
311
312 register_netdevice_notifier(&netprio_device_notifier);
313
314out:
315 return ret;
316}
317
318static void __exit exit_cgroup_netprio(void)
319{
320 struct netprio_map *old;
321 struct net_device *dev;
322
323 unregister_netdevice_notifier(&netprio_device_notifier);
324
325 cgroup_unload_subsys(&net_prio_subsys);
326
327#ifndef CONFIG_NETPRIO_CGROUP
328 net_prio_subsys_id = -1;
329 synchronize_rcu();
330#endif
331
332 rtnl_lock();
333 for_each_netdev(&init_net, dev) {
334 old = rtnl_dereference(dev->priomap);
335 rcu_assign_pointer(dev->priomap, NULL);
336 if (old)
337 kfree_rcu(old, rcu);
338 }
339 rtnl_unlock();
340}
341
342module_init(init_cgroup_netprio);
343module_exit(exit_cgroup_netprio);
344MODULE_LICENSE("GPL v2");
diff --git a/net/core/sock.c b/net/core/sock.c
index 9a8b3fac1401..16069139797c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -125,6 +125,7 @@
125#include <net/xfrm.h> 125#include <net/xfrm.h>
126#include <linux/ipsec.h> 126#include <linux/ipsec.h>
127#include <net/cls_cgroup.h> 127#include <net/cls_cgroup.h>
128#include <net/netprio_cgroup.h>
128 129
129#include <linux/filter.h> 130#include <linux/filter.h>
130 131
@@ -221,10 +222,16 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
221int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); 222int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
222EXPORT_SYMBOL(sysctl_optmem_max); 223EXPORT_SYMBOL(sysctl_optmem_max);
223 224
224#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP) 225#if defined(CONFIG_CGROUPS)
226#if !defined(CONFIG_NET_CLS_CGROUP)
225int net_cls_subsys_id = -1; 227int net_cls_subsys_id = -1;
226EXPORT_SYMBOL_GPL(net_cls_subsys_id); 228EXPORT_SYMBOL_GPL(net_cls_subsys_id);
227#endif 229#endif
230#if !defined(CONFIG_NETPRIO_CGROUP)
231int net_prio_subsys_id = -1;
232EXPORT_SYMBOL_GPL(net_prio_subsys_id);
233#endif
234#endif
228 235
229static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) 236static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
230{ 237{
@@ -1120,6 +1127,18 @@ void sock_update_classid(struct sock *sk)
1120 sk->sk_classid = classid; 1127 sk->sk_classid = classid;
1121} 1128}
1122EXPORT_SYMBOL(sock_update_classid); 1129EXPORT_SYMBOL(sock_update_classid);
1130
1131void sock_update_netprioidx(struct sock *sk)
1132{
1133 struct cgroup_netprio_state *state;
1134 if (in_interrupt())
1135 return;
1136 rcu_read_lock();
1137 state = task_netprio_state(current);
1138 sk->sk_cgrp_prioidx = state ? state->prioidx : 0;
1139 rcu_read_unlock();
1140}
1141EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1123#endif 1142#endif
1124 1143
1125/** 1144/**
@@ -1147,6 +1166,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1147 atomic_set(&sk->sk_wmem_alloc, 1); 1166 atomic_set(&sk->sk_wmem_alloc, 1);
1148 1167
1149 sock_update_classid(sk); 1168 sock_update_classid(sk);
1169 sock_update_netprioidx(sk);
1150 } 1170 }
1151 1171
1152 return sk; 1172 return sk;