summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRoman Gushchin <guro@fb.com>2019-05-25 12:37:39 -0400
committerAlexei Starovoitov <ast@kernel.org>2019-05-28 12:30:02 -0400
commit4bfc0bb2c60e2f4cc8eb60f03cf8dfa72336272a (patch)
treef392f37509246a1155c20d9f1c69857d0241e08a
parent37b54aed123faa19eb21d7ef2534756c5a152a7c (diff)
bpf: decouple the lifetime of cgroup_bpf from cgroup itself
Currently the lifetime of bpf programs attached to a cgroup is bound to the lifetime of the cgroup itself. It means that if a user forgets (or intentionally avoids) to detach a bpf program before removing the cgroup, it will stay attached up to the release of the cgroup. Since the cgroup can stay in the dying state (the state between being rmdir()'ed and being released) for a very long time, it leads to a waste of memory. Also, it blocks the possibility of implementing the memcg-based memory accounting for bpf objects, because a circular reference dependency will occur. Charged memory pages are pinning the corresponding memory cgroup, and if the memory cgroup is pinning the attached bpf program, nothing will be ever released. A dying cgroup cannot contain any processes, so the only chance for an attached bpf program to be executed is a live socket associated with the cgroup. So in order to release all bpf data early, let's count associated sockets using a new percpu refcounter. On cgroup removal the counter is transitioned to the atomic mode, and as soon as it reaches 0, all bpf programs are detached. Because cgroup_bpf_release() can block, it can't be called from the percpu ref counter callback directly, so instead an asynchronous work is scheduled. The reference counter is not socket specific, and can be used for any other types of programs, which can be executed from a cgroup-bpf hook outside of the process context, should such a need arise in the future. Signed-off-by: Roman Gushchin <guro@fb.com> Cc: jolsa@redhat.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
-rw-r--r--include/linux/bpf-cgroup.h11
-rw-r--r--include/linux/cgroup.h18
-rw-r--r--kernel/bpf/cgroup.c41
-rw-r--r--kernel/cgroup/cgroup.c11
4 files changed, 72 insertions, 9 deletions
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index cb3c6b3b89c8..9f100fc422c3 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -6,6 +6,7 @@
6#include <linux/errno.h> 6#include <linux/errno.h>
7#include <linux/jump_label.h> 7#include <linux/jump_label.h>
8#include <linux/percpu.h> 8#include <linux/percpu.h>
9#include <linux/percpu-refcount.h>
9#include <linux/rbtree.h> 10#include <linux/rbtree.h>
10#include <uapi/linux/bpf.h> 11#include <uapi/linux/bpf.h>
11 12
@@ -72,10 +73,16 @@ struct cgroup_bpf {
72 73
73 /* temp storage for effective prog array used by prog_attach/detach */ 74 /* temp storage for effective prog array used by prog_attach/detach */
74 struct bpf_prog_array __rcu *inactive; 75 struct bpf_prog_array __rcu *inactive;
76
77 /* reference counter used to detach bpf programs after cgroup removal */
78 struct percpu_ref refcnt;
79
80 /* cgroup_bpf is released using a work queue */
81 struct work_struct release_work;
75}; 82};
76 83
77void cgroup_bpf_put(struct cgroup *cgrp);
78int cgroup_bpf_inherit(struct cgroup *cgrp); 84int cgroup_bpf_inherit(struct cgroup *cgrp);
85void cgroup_bpf_offline(struct cgroup *cgrp);
79 86
80int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, 87int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
81 enum bpf_attach_type type, u32 flags); 88 enum bpf_attach_type type, u32 flags);
@@ -283,8 +290,8 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
283 290
284struct bpf_prog; 291struct bpf_prog;
285struct cgroup_bpf {}; 292struct cgroup_bpf {};
286static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
287static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } 293static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
294static inline void cgroup_bpf_offline(struct cgroup *cgrp) {}
288 295
289static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr, 296static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr,
290 enum bpf_prog_type ptype, 297 enum bpf_prog_type ptype,
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c0077adeea83..49e8facf7c4a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -924,4 +924,22 @@ static inline bool cgroup_task_frozen(struct task_struct *task)
924 924
925#endif /* !CONFIG_CGROUPS */ 925#endif /* !CONFIG_CGROUPS */
926 926
927#ifdef CONFIG_CGROUP_BPF
928static inline void cgroup_bpf_get(struct cgroup *cgrp)
929{
930 percpu_ref_get(&cgrp->bpf.refcnt);
931}
932
933static inline void cgroup_bpf_put(struct cgroup *cgrp)
934{
935 percpu_ref_put(&cgrp->bpf.refcnt);
936}
937
938#else /* CONFIG_CGROUP_BPF */
939
940static inline void cgroup_bpf_get(struct cgroup *cgrp) {}
941static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
942
943#endif /* CONFIG_CGROUP_BPF */
944
927#endif /* _LINUX_CGROUP_H */ 945#endif /* _LINUX_CGROUP_H */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index fcde0f7b2585..d995edbe816d 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -22,12 +22,21 @@
22DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); 22DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
23EXPORT_SYMBOL(cgroup_bpf_enabled_key); 23EXPORT_SYMBOL(cgroup_bpf_enabled_key);
24 24
25void cgroup_bpf_offline(struct cgroup *cgrp)
26{
27 cgroup_get(cgrp);
28 percpu_ref_kill(&cgrp->bpf.refcnt);
29}
30
25/** 31/**
26 * cgroup_bpf_put() - put references of all bpf programs 32 * cgroup_bpf_release() - put references of all bpf programs and
27 * @cgrp: the cgroup to modify 33 * release all cgroup bpf data
34 * @work: work structure embedded into the cgroup to modify
28 */ 35 */
29void cgroup_bpf_put(struct cgroup *cgrp) 36static void cgroup_bpf_release(struct work_struct *work)
30{ 37{
38 struct cgroup *cgrp = container_of(work, struct cgroup,
39 bpf.release_work);
31 enum bpf_cgroup_storage_type stype; 40 enum bpf_cgroup_storage_type stype;
32 unsigned int type; 41 unsigned int type;
33 42
@@ -47,6 +56,22 @@ void cgroup_bpf_put(struct cgroup *cgrp)
47 } 56 }
48 bpf_prog_array_free(cgrp->bpf.effective[type]); 57 bpf_prog_array_free(cgrp->bpf.effective[type]);
49 } 58 }
59
60 percpu_ref_exit(&cgrp->bpf.refcnt);
61 cgroup_put(cgrp);
62}
63
64/**
65 * cgroup_bpf_release_fn() - callback used to schedule releasing
66 * of bpf cgroup data
67 * @ref: percpu ref counter structure
68 */
69static void cgroup_bpf_release_fn(struct percpu_ref *ref)
70{
71 struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
72
73 INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
74 queue_work(system_wq, &cgrp->bpf.release_work);
50} 75}
51 76
52/* count number of elements in the list. 77/* count number of elements in the list.
@@ -167,7 +192,12 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
167 */ 192 */
168#define NR ARRAY_SIZE(cgrp->bpf.effective) 193#define NR ARRAY_SIZE(cgrp->bpf.effective)
169 struct bpf_prog_array __rcu *arrays[NR] = {}; 194 struct bpf_prog_array __rcu *arrays[NR] = {};
170 int i; 195 int ret, i;
196
197 ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
198 GFP_KERNEL);
199 if (ret)
200 return ret;
171 201
172 for (i = 0; i < NR; i++) 202 for (i = 0; i < NR; i++)
173 INIT_LIST_HEAD(&cgrp->bpf.progs[i]); 203 INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
@@ -183,6 +213,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
183cleanup: 213cleanup:
184 for (i = 0; i < NR; i++) 214 for (i = 0; i < NR; i++)
185 bpf_prog_array_free(arrays[i]); 215 bpf_prog_array_free(arrays[i]);
216
217 percpu_ref_exit(&cgrp->bpf.refcnt);
218
186 return -ENOMEM; 219 return -ENOMEM;
187} 220}
188 221
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 217cec4e22c6..ef9cfbfc82a9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4955,8 +4955,6 @@ static void css_release_work_fn(struct work_struct *work)
4955 if (cgrp->kn) 4955 if (cgrp->kn)
4956 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, 4956 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
4957 NULL); 4957 NULL);
4958
4959 cgroup_bpf_put(cgrp);
4960 } 4958 }
4961 4959
4962 mutex_unlock(&cgroup_mutex); 4960 mutex_unlock(&cgroup_mutex);
@@ -5482,6 +5480,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
5482 5480
5483 cgroup1_check_for_release(parent); 5481 cgroup1_check_for_release(parent);
5484 5482
5483 cgroup_bpf_offline(cgrp);
5484
5485 /* put the base reference */ 5485 /* put the base reference */
5486 percpu_ref_kill(&cgrp->self.refcnt); 5486 percpu_ref_kill(&cgrp->self.refcnt);
5487 5487
@@ -6221,6 +6221,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
6221 * Don't use cgroup_get_live(). 6221 * Don't use cgroup_get_live().
6222 */ 6222 */
6223 cgroup_get(sock_cgroup_ptr(skcd)); 6223 cgroup_get(sock_cgroup_ptr(skcd));
6224 cgroup_bpf_get(sock_cgroup_ptr(skcd));
6224 return; 6225 return;
6225 } 6226 }
6226 6227
@@ -6232,6 +6233,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
6232 cset = task_css_set(current); 6233 cset = task_css_set(current);
6233 if (likely(cgroup_tryget(cset->dfl_cgrp))) { 6234 if (likely(cgroup_tryget(cset->dfl_cgrp))) {
6234 skcd->val = (unsigned long)cset->dfl_cgrp; 6235 skcd->val = (unsigned long)cset->dfl_cgrp;
6236 cgroup_bpf_get(cset->dfl_cgrp);
6235 break; 6237 break;
6236 } 6238 }
6237 cpu_relax(); 6239 cpu_relax();
@@ -6242,7 +6244,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
6242 6244
6243void cgroup_sk_free(struct sock_cgroup_data *skcd) 6245void cgroup_sk_free(struct sock_cgroup_data *skcd)
6244{ 6246{
6245 cgroup_put(sock_cgroup_ptr(skcd)); 6247 struct cgroup *cgrp = sock_cgroup_ptr(skcd);
6248
6249 cgroup_bpf_put(cgrp);
6250 cgroup_put(cgrp);
6246} 6251}
6247 6252
6248#endif /* CONFIG_SOCK_CGROUP_DATA */ 6253#endif /* CONFIG_SOCK_CGROUP_DATA */