author		Mike Galbraith <efault@gmx.de>		2010-11-30 08:18:03 -0500
committer	Ingo Molnar <mingo@elte.hu>		2010-11-30 10:03:35 -0500
commit		5091faa449ee0b7d73bc296a93bca9540fc51d0a
tree		55f5e96e189af65c85c769fce48627b8a5abb86b /kernel
parent		822bc180a7f7a7bc5fcaaea195f41b487cc8cae8
sched: Add 'autogroup' scheduling feature: automated per session task groups

A recurring complaint from CFS users is that parallel kbuild has a negative
impact on desktop interactivity.  This patch implements an idea from Linus,
to automatically create task groups.  Currently, only per session autogroups
are implemented, but the patch leaves the way open for enhancement.

Implementation: each task's signal struct contains an inherited pointer to a
refcounted autogroup struct containing a task group pointer, the default for
all tasks pointing to the init_task_group.  When a task calls setsid(), a new
task group is created, the process is moved into the new task group, and a
reference to the previous task group is dropped.  Child processes inherit
this task group thereafter, and increase its refcount.  When the last thread
of a process exits, the process's reference is dropped, such that when the
last process referencing an autogroup exits, the autogroup is destroyed.

At runqueue selection time, IFF a task has no cgroup assignment, its current
autogroup is used.

Autogroup bandwidth is controllable via setting its nice level through the
proc filesystem:

  cat /proc/<pid>/autogroup

    Displays the task's group and the group's nice level.

  echo <nice level> > /proc/<pid>/autogroup

    Sets the task group's shares to the weight of a nice <level> task.
    Setting the nice level is rate limited for !admin users due to the
    abuse risk of task group locking.

The feature is enabled by default at boot if CONFIG_SCHED_AUTOGROUP=y is
selected, but can be disabled via the boot option noautogroup, and can also
be turned on/off on the fly via:

  echo [01] > /proc/sys/kernel/sched_autogroup_enabled

... which will automatically move tasks to/from the root task group.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Markus Trippelsdorf <markus@trippelsdorf.de>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Paul Turner <pjt@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
[ Removed the task_group_path() debug code, and fixed !EVENTFD build failure. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <1290281700.28711.9.camel@maggy.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
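For illustration, a minimal userspace sketch of the /proc interface described
above (a hypothetical demo, not part of this patch; assumes a kernel built
with CONFIG_SCHED_AUTOGROUP=y):

	/* Hypothetical demo, not part of this patch: print a task's autogroup
	 * assignment, as rendered by proc_sched_autogroup_show_task(). */
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		char path[64], line[128];
		int pid = argc > 1 ? atoi(argv[1]) : (int)getpid();
		FILE *f;

		snprintf(path, sizeof(path), "/proc/%d/autogroup", pid);
		f = fopen(path, "r");
		if (!f) {
			perror(path);
			return 1;
		}
		if (fgets(line, sizeof(line), f))
			fputs(line, stdout);	/* e.g. "/autogroup-123 nice 0" */
		fclose(f);
		return 0;
	}
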
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/fork.c               5
-rw-r--r--  kernel/sched.c             13
-rw-r--r--  kernel/sched_autogroup.c  229
-rw-r--r--  kernel/sched_autogroup.h   32
-rw-r--r--  kernel/sched_debug.c       47
-rw-r--r--  kernel/sys.c                4
-rw-r--r--  kernel/sysctl.c            11
7 files changed, 292 insertions(+), 49 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b159c5991b7..b6f2475f1e83 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -174,8 +174,10 @@ static inline void free_signal_struct(struct signal_struct *sig)
 
 static inline void put_signal_struct(struct signal_struct *sig)
 {
-	if (atomic_dec_and_test(&sig->sigcnt))
+	if (atomic_dec_and_test(&sig->sigcnt)) {
+		sched_autogroup_exit(sig);
 		free_signal_struct(sig);
+	}
 }
 
 void __put_task_struct(struct task_struct *tsk)
@@ -904,6 +906,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	posix_cpu_timers_init_group(sig);
 
 	tty_audit_fork(sig);
+	sched_autogroup_fork(sig);
 
 	sig->oom_adj = current->signal->oom_adj;
 	sig->oom_score_adj = current->signal->oom_score_adj;
diff --git a/kernel/sched.c b/kernel/sched.c
index 66ef5790d932..b646dad4a40e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -79,6 +79,7 @@
 
 #include "sched_cpupri.h"
 #include "workqueue_sched.h"
+#include "sched_autogroup.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
@@ -271,6 +272,10 @@ struct task_group {
 	struct task_group *parent;
 	struct list_head siblings;
 	struct list_head children;
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+	struct autogroup *autogroup;
+#endif
 };
 
 #define root_task_group init_task_group
@@ -603,11 +608,14 @@ static inline int cpu_of(struct rq *rq)
  */
 static inline struct task_group *task_group(struct task_struct *p)
 {
+	struct task_group *tg;
 	struct cgroup_subsys_state *css;
 
 	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
 			lockdep_is_held(&task_rq(p)->lock));
-	return container_of(css, struct task_group, css);
+	tg = container_of(css, struct task_group, css);
+
+	return autogroup_task_group(p, tg);
 }
 
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -1869,6 +1877,7 @@ static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
 #include "sched_idletask.c"
 #include "sched_fair.c"
 #include "sched_rt.c"
+#include "sched_autogroup.c"
 #include "sched_stoptask.c"
 #ifdef CONFIG_SCHED_DEBUG
 # include "sched_debug.c"
@@ -7750,7 +7759,7 @@ void __init sched_init(void)
 #ifdef CONFIG_CGROUP_SCHED
 	list_add(&init_task_group.list, &task_groups);
 	INIT_LIST_HEAD(&init_task_group.children);
-
+	autogroup_init(&init_task);
 #endif /* CONFIG_CGROUP_SCHED */
 
 	for_each_possible_cpu(i) {
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
new file mode 100644
index 000000000000..57a7ac286a02
--- /dev/null
+++ b/kernel/sched_autogroup.c
@@ -0,0 +1,229 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/utsname.h>
+
+unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
+static struct autogroup autogroup_default;
+static atomic_t autogroup_seq_nr;
+
+static void autogroup_init(struct task_struct *init_task)
+{
+	autogroup_default.tg = &init_task_group;
+	init_task_group.autogroup = &autogroup_default;
+	kref_init(&autogroup_default.kref);
+	init_rwsem(&autogroup_default.lock);
+	init_task->signal->autogroup = &autogroup_default;
+}
+
+static inline void autogroup_free(struct task_group *tg)
+{
+	kfree(tg->autogroup);
+}
+
+static inline void autogroup_destroy(struct kref *kref)
+{
+	struct autogroup *ag = container_of(kref, struct autogroup, kref);
+
+	sched_destroy_group(ag->tg);
+}
+
+static inline void autogroup_kref_put(struct autogroup *ag)
+{
+	kref_put(&ag->kref, autogroup_destroy);
+}
+
+static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
+{
+	kref_get(&ag->kref);
+	return ag;
+}
+
+static inline struct autogroup *autogroup_create(void)
+{
+	struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
+	struct task_group *tg;
+
+	if (!ag)
+		goto out_fail;
+
+	tg = sched_create_group(&init_task_group);
+
+	if (IS_ERR(tg))
+		goto out_free;
+
+	kref_init(&ag->kref);
+	init_rwsem(&ag->lock);
+	ag->id = atomic_inc_return(&autogroup_seq_nr);
+	ag->tg = tg;
+	tg->autogroup = ag;
+
+	return ag;
+
+out_free:
+	kfree(ag);
+out_fail:
+	if (printk_ratelimit()) {
+		printk(KERN_WARNING "autogroup_create: %s failure.\n",
+			ag ? "sched_create_group()" : "kmalloc()");
+	}
+
+	return autogroup_kref_get(&autogroup_default);
+}
+
+static inline bool
+task_wants_autogroup(struct task_struct *p, struct task_group *tg)
+{
+	if (tg != &root_task_group)
+		return false;
+
+	if (p->sched_class != &fair_sched_class)
+		return false;
+
+	/*
+	 * We can only assume the task group can't go away on us if
+	 * autogroup_move_group() can see us on ->thread_group list.
+	 */
+	if (p->flags & PF_EXITING)
+		return false;
+
+	return true;
+}
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+	int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+
+	if (enabled && task_wants_autogroup(p, tg))
+		return p->signal->autogroup->tg;
+
+	return tg;
+}
+
+static void
+autogroup_move_group(struct task_struct *p, struct autogroup *ag)
+{
+	struct autogroup *prev;
+	struct task_struct *t;
+	unsigned long flags;
+
+	BUG_ON(!lock_task_sighand(p, &flags));
+
+	prev = p->signal->autogroup;
+	if (prev == ag) {
+		unlock_task_sighand(p, &flags);
+		return;
+	}
+
+	p->signal->autogroup = autogroup_kref_get(ag);
+
+	t = p;
+	do {
+		sched_move_task(t);
+	} while_each_thread(p, t);
+
+	unlock_task_sighand(p, &flags);
+	autogroup_kref_put(prev);
+}
+
+/* Allocates GFP_KERNEL, cannot be called under any spinlock */
+void sched_autogroup_create_attach(struct task_struct *p)
+{
+	struct autogroup *ag = autogroup_create();
+
+	autogroup_move_group(p, ag);
+	/* drop extra reference added by autogroup_create() */
+	autogroup_kref_put(ag);
+}
+EXPORT_SYMBOL(sched_autogroup_create_attach);
+
+/* Cannot be called under siglock. Currently has no users */
+void sched_autogroup_detach(struct task_struct *p)
+{
+	autogroup_move_group(p, &autogroup_default);
+}
+EXPORT_SYMBOL(sched_autogroup_detach);
+
+void sched_autogroup_fork(struct signal_struct *sig)
+{
+	struct task_struct *p = current;
+
+	spin_lock_irq(&p->sighand->siglock);
+	sig->autogroup = autogroup_kref_get(p->signal->autogroup);
+	spin_unlock_irq(&p->sighand->siglock);
+}
+
+void sched_autogroup_exit(struct signal_struct *sig)
+{
+	autogroup_kref_put(sig->autogroup);
+}
+
+static int __init setup_autogroup(char *str)
+{
+	sysctl_sched_autogroup_enabled = 0;
+
+	return 1;
+}
+
+__setup("noautogroup", setup_autogroup);
+
+#ifdef CONFIG_PROC_FS
+
+/* Called with siglock held. */
+int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
+{
+	static unsigned long next = INITIAL_JIFFIES;
+	struct autogroup *ag;
+	int err;
+
+	if (*nice < -20 || *nice > 19)
+		return -EINVAL;
+
+	err = security_task_setnice(current, *nice);
+	if (err)
+		return err;
+
+	if (*nice < 0 && !can_nice(current, *nice))
+		return -EPERM;
+
+	/* this is a heavy operation taking global locks.. */
+	if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
+		return -EAGAIN;
+
+	next = HZ / 10 + jiffies;
+	ag = autogroup_kref_get(p->signal->autogroup);
+
+	down_write(&ag->lock);
+	err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
+	if (!err)
+		ag->nice = *nice;
+	up_write(&ag->lock);
+
+	autogroup_kref_put(ag);
+
+	return err;
+}
+
+void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
+{
+	struct autogroup *ag = autogroup_kref_get(p->signal->autogroup);
+
+	down_read(&ag->lock);
+	seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
+	up_read(&ag->lock);
+
+	autogroup_kref_put(ag);
+}
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_SCHED_DEBUG
+static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
+{
+	return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
+}
+#endif /* CONFIG_SCHED_DEBUG */
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
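
proc_sched_autogroup_set_nice() above rate-limits !CAP_SYS_ADMIN callers to
one update per HZ/10 window, returning -EAGAIN in between, so userspace
should be prepared to retry. A hedged sketch of such a caller (hypothetical
helper, not part of this patch; set_autogroup_nice() is an illustrative
name):

	/* Hypothetical helper, not part of this patch: write a nice level to
	 * /proc/<pid>/autogroup, retrying while the kernel's HZ/10 rate limit
	 * for non-admin callers returns EAGAIN. */
	#include <errno.h>
	#include <stdio.h>
	#include <unistd.h>

	static int set_autogroup_nice(int pid, int nice)
	{
		char path[64];
		int i;

		snprintf(path, sizeof(path), "/proc/%d/autogroup", pid);
		for (i = 0; i < 5; i++) {
			FILE *f = fopen(path, "w");

			if (!f)
				return -errno;
			fprintf(f, "%d\n", nice);
			if (fclose(f) == 0)	/* buffered write is flushed here */
				return 0;
			if (errno != EAGAIN)	/* e.g. EINVAL or EPERM from the kernel */
				return -errno;
			usleep(200000);		/* wait out the HZ/10 window */
		}
		return -EAGAIN;
	}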
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
new file mode 100644
index 000000000000..5358e241cb20
--- /dev/null
+++ b/kernel/sched_autogroup.h
@@ -0,0 +1,32 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+
+struct autogroup {
+	struct kref		kref;
+	struct task_group	*tg;
+	struct rw_semaphore	lock;
+	unsigned long		id;
+	int			nice;
+};
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg);
+
+#else /* !CONFIG_SCHED_AUTOGROUP */
+
+static inline void autogroup_init(struct task_struct *init_task) { }
+static inline void autogroup_free(struct task_group *tg) { }
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+	return tg;
+}
+
+#ifdef CONFIG_SCHED_DEBUG
+static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
+{
+	return 0;
+}
+#endif
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index e95b77414a99..1dfae3d014b5 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -54,8 +54,7 @@ static unsigned long nsec_low(unsigned long long nsec)
 #define SPLIT_NS(x) nsec_high(x), nsec_low(x)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void print_cfs_group_stats(struct seq_file *m, int cpu,
-		struct task_group *tg)
+static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
 {
 	struct sched_entity *se = tg->se[cpu];
 	if (!se)
@@ -110,16 +109,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 		0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
 #endif
 
-#ifdef CONFIG_CGROUP_SCHED
-	{
-		char path[64];
-
-		rcu_read_lock();
-		cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
-		rcu_read_unlock();
-		SEQ_printf(m, " %s", path);
-	}
-#endif
 	SEQ_printf(m, "\n");
 }
 
@@ -147,19 +136,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 	read_unlock_irqrestore(&tasklist_lock, flags);
 }
 
-#if defined(CONFIG_CGROUP_SCHED) && \
-	(defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
-static void task_group_path(struct task_group *tg, char *buf, int buflen)
-{
-	/* may be NULL if the underlying cgroup isn't fully-created yet */
-	if (!tg->css.cgroup) {
-		buf[0] = '\0';
-		return;
-	}
-	cgroup_path(tg->css.cgroup, buf, buflen);
-}
-#endif
-
 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 {
 	s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -168,16 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	struct sched_entity *last;
 	unsigned long flags;
 
-#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
-	char path[128];
-	struct task_group *tg = cfs_rq->tg;
-
-	task_group_path(tg, path, sizeof(path));
-
-	SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
-#else
 	SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
-#endif
 	SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
 			SPLIT_NS(cfs_rq->exec_clock));
 
@@ -215,7 +182,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
 			cfs_rq->load_contribution);
 	SEQ_printf(m, " .%-30s: %d\n", "load_tg",
-			atomic_read(&tg->load_weight));
+			atomic_read(&cfs_rq->tg->load_weight));
 #endif
 
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
@@ -224,17 +191,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 
 void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 {
-#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
-	char path[128];
-	struct task_group *tg = rt_rq->tg;
-
-	task_group_path(tg, path, sizeof(path));
-
-	SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
-#else
 	SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
-#endif
-
 
 #define P(x) \
 	SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
diff --git a/kernel/sys.c b/kernel/sys.c
index 7f5a0cd296a9..2745dcdb6c6c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1080,8 +1080,10 @@ SYSCALL_DEFINE0(setsid)
 	err = session;
 out:
 	write_unlock_irq(&tasklist_lock);
-	if (err > 0)
+	if (err > 0) {
 		proc_sid_connector(group_leader);
+		sched_autogroup_create_attach(group_leader);
+	}
 	return err;
 }
 
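The hunk above is what makes autogroups per session: a successful setsid()
now creates and attaches a new group. A small hypothetical test (not part of
this patch) that makes the effect visible:

	/* Hypothetical test, not part of this patch: after setsid() in a forked
	 * child (setsid() fails for a process group leader, hence the fork),
	 * /proc/self/autogroup should name a fresh /autogroup-<id>. */
	#include <stdio.h>
	#include <sys/wait.h>
	#include <unistd.h>

	static void show(const char *when)
	{
		char line[128];
		FILE *f = fopen("/proc/self/autogroup", "r");

		if (f) {
			if (fgets(line, sizeof(line), f))
				printf("%s: %s", when, line);
			fclose(f);
		}
	}

	int main(void)
	{
		show("before setsid");		/* e.g. "/autogroup-41 nice 0" */
		if (fork() == 0) {
			setsid();		/* sched_autogroup_create_attach() runs here */
			show("after setsid");	/* e.g. "/autogroup-42 nice 0" */
			_exit(0);
		}
		wait(NULL);
		return 0;
	}
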
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a00fdefd24ce..121e4fff03d1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -370,6 +370,17 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_SCHED_AUTOGROUP
+	{
+		.procname	= "sched_autogroup_enabled",
+		.data		= &sysctl_sched_autogroup_enabled,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
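
As noted in the changelog, the sysctl registered above can be flipped at
runtime to move tasks into or out of their autogroups. A hedged sketch of
the equivalent of "echo [01] > /proc/sys/kernel/sched_autogroup_enabled"
(hypothetical, not part of this patch; requires root):

	/* Hypothetical sketch, not part of this patch: enable or disable
	 * autogrouping at runtime via the sysctl added above. */
	#include <stdio.h>

	static int set_autogroup_enabled(int on)
	{
		FILE *f = fopen("/proc/sys/kernel/sched_autogroup_enabled", "w");

		if (!f)
			return -1;
		fprintf(f, "%d\n", on ? 1 : 0);
		return fclose(f);	/* 0 on success */
	}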