author	Tejun Heo <tj@kernel.org>	2015-12-07 17:38:53 -0500
committer	David S. Miller <davem@davemloft.net>	2015-12-08 22:02:33 -0500
commit	bd1060a1d67128bb8fbe2e1384c518912cbe54e7 (patch)
tree	d0b4047bb6257f74780caa9732ac41ce4fb5a62a
parent	2a56a1fec290bf0bc4676bbf4efdb3744953a3e7 (diff)

sock, cgroup: add sock->sk_cgroup
In cgroup v1, dealing with cgroup membership was difficult because the
number of membership associations was unbound.  As a result, cgroup v1
grew several controllers whose primary purpose is either tagging
membership or pulling in configuration knobs from other subsystems so
that cgroup membership tests can be avoided.

net_cls and net_prio controllers are examples of the latter.  They
allow configuring network-specific attributes from the cgroup side so
that the network subsystem can avoid testing cgroup membership;
unfortunately, these are not only cumbersome but also problematic.

Both net_cls and net_prio aren't properly hierarchical.  Both inherit
configuration from the parent on creation but there's no interaction
afterwards.  An ancestor doesn't restrict the behavior in its subtree
in any way and configuration changes aren't propagated downwards.
Especially when combined with cgroup delegation, this is problematic
because delegatees can mess up whatever network configuration is
implemented at the system level.  net_prio would allow the delegatees
to set whatever priority value regardless of CAP_NET_ADMIN, and
net_cls the same for classid.

While it is possible to solve these issues from the controller side by
implementing hierarchical allowable ranges in both controllers, it
would involve quite a bit of complexity in the controllers and further
obfuscate network configuration as it becomes even more difficult to
tell what's actually being configured looking from the network side.
While not much can be done for v1 at this point, as membership
handling is sane on cgroup v2, it'd be better to make cgroup matching
behave like other network matches and classifiers than introducing
further complications.

In preparation, this patch updates sock->sk_cgrp_data handling so that
it points to the v2 cgroup that the sock was created in until either
net_prio or net_cls is used.  Once either of the two is used,
sock->sk_cgrp_data reverts to its previous role of carrying prioidx
and classid.  This is to avoid adding yet another cgroup-related field
to struct sock.

As the mode switching can happen at most once per boot, the switching
mechanism is aimed at lowering hot-path overhead.  It may leak a
finite, likely small, number of cgroup refs and report spurious
prioidx or classid on switching; however, dynamic updates of prioidx
and classid have always been racy and lossy - socks between creation
and fd installation are never updated, config changes don't update
existing sockets at all, and prioidx may index with dead and recycled
cgroup IDs.  Non-critical inaccuracies from small race windows won't
make any noticeable difference.

This patch doesn't make use of the pointer yet.  The following patch
will implement netfilter match for cgroup2 membership.

v2: Use sock_cgroup_data to avoid inflating struct sock w/ another
    cgroup specific field.

v3: Add comments explaining why sock_data_prioidx() and
    sock_data_classid() use different fallback values.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Daniel Wagner <daniel.wagner@bmw-carit.de>
CC: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
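[Editor's note: the following is a minimal userspace sketch of the
overloading the commit message describes, not kernel code.  It assumes
the little-endian layout the patch uses below; the union name `scd` and
the fake cgroup allocation are made up for illustration.  The key trick:
bit 0 of the 64-bit word doubles as the mode flag, and any properly
aligned pointer has that bit clear.]

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Userspace model of sock_cgroup_data (little-endian layout only). */
	union scd {
		struct {
			uint8_t  is_data;	/* bit 0 set => prioidx/classid mode */
			uint8_t  padding;
			uint16_t prioidx;
			uint32_t classid;
		};
		uint64_t val;			/* pointer mode: holds the cgroup pointer */
	};

	int main(void)
	{
		/* Stand-in for the cgroup the sock was created in; malloc's
		 * alignment guarantees bit 0 of the pointer is clear. */
		void *cgrp = malloc(16);
		union scd scd = { .val = (uintptr_t)cgrp };

		assert(!(scd.is_data & 1));	/* boot mode: cgroup pointer */

		/* First net_prio/net_cls write flips the sock to data mode;
		 * the pointer is dropped and never restored. */
		scd.val = 0;
		scd.is_data = 1;
		scd.prioidx = 42;

		printf("data mode: prioidx=%u classid=%u\n", scd.prioidx, scd.classid);
		free(cgrp);
		return 0;
	}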
-rw-r--r--	include/linux/cgroup-defs.h	88
-rw-r--r--	include/linux/cgroup.h	41
-rw-r--r--	kernel/cgroup.c	55
-rw-r--r--	net/core/netclassid_cgroup.c	7
-rw-r--r--	net/core/netprio_cgroup.c	7
-rw-r--r--	net/core/sock.c	2
6 files changed, 191 insertions(+), 9 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index ed128fed0335..9dc226345e4e 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -544,31 +544,107 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
 
 #ifdef CONFIG_SOCK_CGROUP_DATA
 
+/*
+ * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains
+ * per-socket cgroup information except for memcg association.
+ *
+ * On legacy hierarchies, net_prio and net_cls controllers directly set
+ * attributes on each sock which can then be tested by the network layer.
+ * On the default hierarchy, each sock is associated with the cgroup it was
+ * created in and the networking layer can match the cgroup directly.
+ *
+ * To avoid carrying all three cgroup related fields separately in sock,
+ * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer.
+ * On boot, sock_cgroup_data records the cgroup that the sock was created
+ * in so that cgroup2 matches can be made; however, once either net_prio or
+ * net_cls starts being used, the area is overridden to carry prioidx and/or
+ * classid.  The two modes are distinguished by whether the lowest bit is
+ * set.  Clear bit indicates cgroup pointer while set bit prioidx and
+ * classid.
+ *
+ * While userland may start using net_prio or net_cls at any time, once
+ * either is used, cgroup2 matching no longer works.  There is no reason to
+ * mix the two and this is in line with how legacy and v2 compatibility is
+ * handled.  On mode switch, cgroup references which are already being
+ * pointed to by socks may be leaked.  While this can be remedied by adding
+ * synchronization around sock_cgroup_data, given that the number of leaked
+ * cgroups is bound and highly unlikely to be high, this seems to be the
+ * better trade-off.
+ */
 struct sock_cgroup_data {
-	u16	prioidx;
-	u32	classid;
+	union {
+#ifdef __LITTLE_ENDIAN
+		struct {
+			u8	is_data;
+			u8	padding;
+			u16	prioidx;
+			u32	classid;
+		} __packed;
+#else
+		struct {
+			u32	classid;
+			u16	prioidx;
+			u8	padding;
+			u8	is_data;
+		} __packed;
+#endif
+		u64		val;
+	};
 };
 
+/*
+ * There's a theoretical window where the following accessors race with
+ * updaters and return part of the previous pointer as the prioidx or
+ * classid.  Such races are short-lived and the result isn't critical.
+ */
 static inline u16 sock_cgroup_prioidx(struct sock_cgroup_data *skcd)
 {
-	return skcd->prioidx;
+	/* fallback to 1 which is always the ID of the root cgroup */
+	return (skcd->is_data & 1) ? skcd->prioidx : 1;
 }
 
 static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd)
 {
-	return skcd->classid;
+	/* fallback to 0 which is the unconfigured default classid */
+	return (skcd->is_data & 1) ? skcd->classid : 0;
 }
 
+/*
+ * If invoked concurrently, the updaters may clobber each other.  The
+ * caller is responsible for synchronization.
+ */
 static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
 					   u16 prioidx)
 {
-	skcd->prioidx = prioidx;
+	struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) };
+
+	if (sock_cgroup_prioidx(&skcd_buf) == prioidx)
+		return;
+
+	if (!(skcd_buf.is_data & 1)) {
+		skcd_buf.val = 0;
+		skcd_buf.is_data = 1;
+	}
+
+	skcd_buf.prioidx = prioidx;
+	WRITE_ONCE(skcd->val, skcd_buf.val);	/* see sock_cgroup_ptr() */
 }
 
 static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
 					   u32 classid)
 {
-	skcd->classid = classid;
+	struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) };
+
+	if (sock_cgroup_classid(&skcd_buf) == classid)
+		return;
+
+	if (!(skcd_buf.is_data & 1)) {
+		skcd_buf.val = 0;
+		skcd_buf.is_data = 1;
+	}
+
+	skcd_buf.classid = classid;
+	WRITE_ONCE(skcd->val, skcd_buf.val);	/* see sock_cgroup_ptr() */
 }
 
 #else	/* CONFIG_SOCK_CGROUP_DATA */
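[Editor's note: the updaters above snapshot the whole 64-bit word, modify
a local copy, and publish it with a single WRITE_ONCE().  The following
userspace approximation uses C11 atomics in place of READ_ONCE/WRITE_ONCE
and assumes the little-endian layout; it is an illustrative sketch, not
the kernel's implementation.  It shows why a concurrent reader can only
observe a complete old or new encoding, never a torn mix.]

	#include <stdatomic.h>
	#include <stdint.h>

	union scd {
		struct {
			uint8_t  is_data;
			uint8_t  padding;
			uint16_t prioidx;
			uint32_t classid;
		};
		uint64_t val;
	};

	static _Atomic uint64_t skcd_val;	/* stands in for sock->sk_cgrp_data */

	/* Mirrors sock_cgroup_prioidx(): fall back to 1 (root) in pointer mode. */
	static uint16_t scd_prioidx(union scd s)
	{
		return (s.is_data & 1) ? s.prioidx : 1;
	}

	/* Mirrors sock_cgroup_set_prioidx(): copy, modify, publish in one store. */
	static void scd_set_prioidx(uint16_t prioidx)
	{
		union scd buf = { .val = atomic_load(&skcd_val) };

		if (scd_prioidx(buf) == prioidx)
			return;			/* nothing to change */

		if (!(buf.is_data & 1)) {
			buf.val = 0;		/* drop the cgroup pointer */
			buf.is_data = 1;	/* switch to data mode */
		}

		buf.prioidx = prioidx;
		atomic_store(&skcd_val, buf.val);	/* single 64-bit publish */
	}

This trades a possible spurious prioidx during the switch window for a
lock-free read path - exactly the bounded inaccuracy the commit message
accepts.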
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 4c3ffab81ba7..a8ba1ea0ea5a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -578,4 +578,45 @@ static inline int cgroup_init(void) { return 0; }
 
 #endif /* !CONFIG_CGROUPS */
 
+/*
+ * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
+ * definition in cgroup-defs.h.
+ */
+#ifdef CONFIG_SOCK_CGROUP_DATA
+
+#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
+extern spinlock_t cgroup_sk_update_lock;
+#endif
+
+void cgroup_sk_alloc_disable(void);
+void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
+void cgroup_sk_free(struct sock_cgroup_data *skcd);
+
+static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
+{
+#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
+	unsigned long v;
+
+	/*
+	 * @skcd->val is 64bit but the following is safe on 32bit too as we
+	 * just need the lower ulong to be written and read atomically.
+	 */
+	v = READ_ONCE(skcd->val);
+
+	if (v & 1)
+		return &cgrp_dfl_root.cgrp;
+
+	return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp;
+#else
+	return (struct cgroup *)(unsigned long)skcd->val;
+#endif
+}
+
+#else	/* CONFIG_SOCK_CGROUP_DATA */
+
+static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {}
+static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {}
+
+#endif	/* CONFIG_SOCK_CGROUP_DATA */
+
 #endif /* _LINUX_CGROUP_H */
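[Editor's note: sock_cgroup_ptr() above is the read-side decoder for the
union: a set low bit means the pointer is gone, and a zero word means the
sock never had a cgroup assigned (e.g. allocated while matching was
disabled); both fall back to the default root.  A hypothetical standalone
rendering of the same decision follows - `struct cgroup` and `root_cgrp`
are stand-ins, not the kernel's types.]

	#include <stdint.h>

	struct cgroup { int dummy; };

	static struct cgroup root_cgrp;	/* plays the role of cgrp_dfl_root.cgrp */

	static struct cgroup *scd_to_cgroup(uint64_t val)
	{
		/* The lower ulong is sufficient: pointers fit in it and it is
		 * read/written atomically even on 32-bit. */
		unsigned long v = (unsigned long)val;

		if (v & 1)			/* data mode: no pointer stored */
			return &root_cgrp;

		/* NULL means cgroup_sk_alloc() never ran for this sock. */
		return v ? (struct cgroup *)v : &root_cgrp;
	}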
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3db5e8f5b702..4f8f7927b422 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,8 +57,8 @@
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 #include <linux/kthread.h>
 #include <linux/delay.h>
-
 #include <linux/atomic.h>
+#include <net/sock.h>
 
 /*
  * pidlists linger the following amount before being destroyed.  The goal
@@ -5782,6 +5782,59 @@ struct cgroup *cgroup_get_from_path(const char *path)
 }
 EXPORT_SYMBOL_GPL(cgroup_get_from_path);
 
+/*
+ * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
+ * definition in cgroup-defs.h.
+ */
+#ifdef CONFIG_SOCK_CGROUP_DATA
+
+#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
+
+spinlock_t cgroup_sk_update_lock;
+static bool cgroup_sk_alloc_disabled __read_mostly;
+
+void cgroup_sk_alloc_disable(void)
+{
+	if (cgroup_sk_alloc_disabled)
+		return;
+	pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
+	cgroup_sk_alloc_disabled = true;
+}
+
+#else
+
+#define cgroup_sk_alloc_disabled	false
+
+#endif
+
+void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
+{
+	if (cgroup_sk_alloc_disabled)
+		return;
+
+	rcu_read_lock();
+
+	while (true) {
+		struct css_set *cset;
+
+		cset = task_css_set(current);
+		if (likely(cgroup_tryget(cset->dfl_cgrp))) {
+			skcd->val = (unsigned long)cset->dfl_cgrp;
+			break;
+		}
+		cpu_relax();
+	}
+
+	rcu_read_unlock();
+}
+
+void cgroup_sk_free(struct sock_cgroup_data *skcd)
+{
+	cgroup_put(sock_cgroup_ptr(skcd));
+}
+
+#endif	/* CONFIG_SOCK_CGROUP_DATA */
+
 #ifdef CONFIG_CGROUP_DEBUG
 static struct cgroup_subsys_state *
 debug_css_alloc(struct cgroup_subsys_state *parent_css)
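[Editor's note: cgroup_sk_alloc() above retries because under RCU the
task's css_set may be mid-release: cgroup_tryget() fails once the
refcount has dropped to zero, and the loop re-fetches the by-then updated
css_set.  A userspace sketch of the same tryget idiom over a plain atomic
refcount - hypothetical names, not the kernel's percpu_ref machinery:]

	#include <stdatomic.h>
	#include <stdbool.h>

	struct obj {
		_Atomic long refcnt;	/* 0 means the object is being destroyed */
	};

	/* Take a reference only if the object is still live. */
	static bool obj_tryget(struct obj *o)
	{
		long c = atomic_load(&o->refcnt);

		while (c > 0) {
			/* On CAS failure, c is reloaded with the current value. */
			if (atomic_compare_exchange_weak(&o->refcnt, &c, c + 1))
				return true;
		}
		return false;		/* lost the race with the last put */
	}

When the tryget fails, cgroup_sk_alloc()'s loop re-reads
task_css_set(current) and tries again, so the sock always ends up
referencing a live default-hierarchy cgroup.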
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index e60ded46b3ac..04257a0e3534 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -61,9 +61,12 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n)
 	int err;
 	struct socket *sock = sock_from_file(file, &err);
 
-	if (sock)
+	if (sock) {
+		spin_lock(&cgroup_sk_update_lock);
 		sock_cgroup_set_classid(&sock->sk->sk_cgrp_data,
 					(unsigned long)v);
+		spin_unlock(&cgroup_sk_update_lock);
+	}
 	return 0;
 }
 
@@ -98,6 +101,8 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
 {
 	struct cgroup_cls_state *cs = css_cls_state(css);
 
+	cgroup_sk_alloc_disable();
+
 	cs->classid = (u32)value;
 
 	update_classid(css, (void *)(unsigned long)cs->classid);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index de42aa7f6c77..053d60c33395 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -209,6 +209,8 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
 	if (!dev)
 		return -ENODEV;
 
+	cgroup_sk_alloc_disable();
+
 	rtnl_lock();
 
 	ret = netprio_set_prio(of_css(of), dev, prio);
@@ -222,9 +224,12 @@ static int update_netprio(const void *v, struct file *file, unsigned n)
 {
 	int err;
 	struct socket *sock = sock_from_file(file, &err);
-	if (sock)
+	if (sock) {
+		spin_lock(&cgroup_sk_update_lock);
 		sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
 					(unsigned long)v);
+		spin_unlock(&cgroup_sk_update_lock);
+	}
 	return 0;
 }
 
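[Editor's note: both update paths now take cgroup_sk_update_lock because,
as the cgroup-defs.h comment warns, the copy/modify/publish updaters can
clobber each other if they interleave.  A self-contained userspace model
of the rule, with a pthread mutex standing in for the kernel spinlock and
the little-endian layout assumed as before:]

	#include <pthread.h>
	#include <stdint.h>

	union scd {
		struct { uint8_t is_data, padding; uint16_t prioidx; uint32_t classid; };
		uint64_t val;
	};

	/* Plays the role of cgroup_sk_update_lock: without it, two interleaved
	 * copy/modify/publish sequences could each snapshot the old word and
	 * one update would silently overwrite the other. */
	static pthread_mutex_t scd_update_lock = PTHREAD_MUTEX_INITIALIZER;
	static union scd skcd;		/* stands in for sock->sk_cgrp_data */

	static void scd_set_classid_locked(uint32_t classid)
	{
		pthread_mutex_lock(&scd_update_lock);

		union scd buf = skcd;		/* snapshot */
		if (!(buf.is_data & 1)) {	/* first use: leave pointer mode */
			buf.val = 0;
			buf.is_data = 1;
		}
		buf.classid = classid;
		skcd = buf;			/* publish */

		pthread_mutex_unlock(&scd_update_lock);
	}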
diff --git a/net/core/sock.c b/net/core/sock.c
index 947741dc43fa..1278d7b7bd9a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1363,6 +1363,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 		if (!try_module_get(prot->owner))
 			goto out_free_sec;
 		sk_tx_queue_clear(sk);
+		cgroup_sk_alloc(&sk->sk_cgrp_data);
 	}
 
 	return sk;
@@ -1385,6 +1386,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
 	owner = prot->owner;
 	slab = prot->slab;
 
+	cgroup_sk_free(&sk->sk_cgrp_data);
 	security_sk_free(sk);
 	if (slab != NULL)
 		kmem_cache_free(slab, sk);
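[Editor's note: the two hunks above pair one cgroup reference per sock -
taken at allocation, dropped at free.  The leak the commit message
accepts happens when a mode switch occurs in between: cgroup_sk_free()
then resolves sock_cgroup_ptr() to the default root and puts a reference
there instead of on the originally gotten cgroup.  A trivial userspace
model of the pairing, with a hypothetical refcount in place of
cgroup_get/cgroup_put:]

	#include <assert.h>
	#include <stdatomic.h>

	struct cgrp {
		_Atomic int refcnt;
	};

	static void sk_cgrp_alloc(struct cgrp *c)	/* cgroup_sk_alloc() analogue */
	{
		atomic_fetch_add(&c->refcnt, 1);
	}

	static void sk_cgrp_free(struct cgrp *c)	/* cgroup_sk_free() analogue */
	{
		atomic_fetch_sub(&c->refcnt, 1);
	}

	int main(void)
	{
		struct cgrp c = { 1 };	/* base reference held by the hierarchy */

		sk_cgrp_alloc(&c);	/* sk_prot_alloc() path */
		sk_cgrp_free(&c);	/* sk_prot_free() path */
		assert(atomic_load(&c.refcnt) == 1);	/* balanced: no mode switch */
		return 0;
	}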