summaryrefslogtreecommitdiffstats
path: root/kernel/signal.c
diff options
context:
space:
mode:
authorRoman Gushchin <guro@fb.com>2019-04-19 13:03:04 -0400
committerTejun Heo <tj@kernel.org>2019-04-19 14:26:48 -0400
commit76f969e8948d82e78e1bc4beb6b9465908e74873 (patch)
tree1f5459d94820c5e5ea7293b103e8531d389c15c1 /kernel/signal.c
parent4dcabece4c3a9f9522127be12cc12cc120399b2f (diff)
cgroup: cgroup v2 freezer
Cgroup v1 implements the freezer controller, which provides the ability to stop the workload in a cgroup and temporarily free up some resources (cpu, io, network bandwidth and, potentially, memory) for some other tasks. Cgroup v2 lacks this functionality. This patch implements the freezer for cgroup v2. The cgroup v2 freezer tries to put tasks into a state similar to jobctl stop. This means that tasks can be killed, ptraced (using PTRACE_SEIZE*), and interrupted. It is possible to attach to a frozen task, get some information (e.g. read registers) and detach. It's also possible to migrate a frozen task to another cgroup. This distinguishes the cgroup v2 freezer from the cgroup v1 freezer, which mostly tried to imitate the system-wide freezer. While uninterruptible sleep is fine when all tasks are going to be frozen (hibernation case), it's not an acceptable state for some subset of the system. The cgroup v2 freezer does not support freezing kthreads. If a non-root cgroup contains a kthread, the cgroup can still be frozen, but the kthread will remain running, the cgroup will be shown as non-frozen, and the notification will not be delivered. * PTRACE_ATTACH does not work because non-fatal signal delivery is blocked in the frozen state. There are some interface differences between the cgroup v1 and cgroup v2 freezers too, which are required to conform to the cgroup v2 interface design principles: 1) There is no separate controller, which has to be turned on: the functionality is always available and is represented by the cgroup.freeze and cgroup.events cgroup control files. 2) The desired state is defined by the cgroup.freeze control file. Any hierarchical configuration is allowed. 3) The interface is asynchronous. The actual state is available using the cgroup.events control file ("frozen" field). There are no dedicated transitional states. 4) It's allowed to make any changes to the cgroup hierarchy (create new cgroups, remove old cgroups, move tasks between cgroups) no matter if some cgroups are frozen.
Signed-off-by: Roman Gushchin <guro@fb.com> Signed-off-by: Tejun Heo <tj@kernel.org> No-objection-from-me-by: Oleg Nesterov <oleg@redhat.com> Cc: kernel-team@fb.com
Diffstat (limited to 'kernel/signal.c')
-rw-r--r--kernel/signal.c70
1 files changed, 65 insertions, 5 deletions
diff --git a/kernel/signal.c b/kernel/signal.c
index f98448cf2def..095e0fc57b25 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -43,6 +43,7 @@
43#include <linux/compiler.h> 43#include <linux/compiler.h>
44#include <linux/posix-timers.h> 44#include <linux/posix-timers.h>
45#include <linux/livepatch.h> 45#include <linux/livepatch.h>
46#include <linux/cgroup.h>
46 47
47#define CREATE_TRACE_POINTS 48#define CREATE_TRACE_POINTS
48#include <trace/events/signal.h> 49#include <trace/events/signal.h>
@@ -146,9 +147,10 @@ static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked)
146 147
147static bool recalc_sigpending_tsk(struct task_struct *t) 148static bool recalc_sigpending_tsk(struct task_struct *t)
148{ 149{
149 if ((t->jobctl & JOBCTL_PENDING_MASK) || 150 if ((t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) ||
150 PENDING(&t->pending, &t->blocked) || 151 PENDING(&t->pending, &t->blocked) ||
151 PENDING(&t->signal->shared_pending, &t->blocked)) { 152 PENDING(&t->signal->shared_pending, &t->blocked) ||
153 cgroup_task_frozen(t)) {
152 set_tsk_thread_flag(t, TIF_SIGPENDING); 154 set_tsk_thread_flag(t, TIF_SIGPENDING);
153 return true; 155 return true;
154 } 156 }
@@ -2108,6 +2110,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
2108 preempt_disable(); 2110 preempt_disable();
2109 read_unlock(&tasklist_lock); 2111 read_unlock(&tasklist_lock);
2110 preempt_enable_no_resched(); 2112 preempt_enable_no_resched();
2113 cgroup_enter_frozen();
2111 freezable_schedule(); 2114 freezable_schedule();
2112 } else { 2115 } else {
2113 /* 2116 /*
@@ -2286,6 +2289,7 @@ static bool do_signal_stop(int signr)
2286 } 2289 }
2287 2290
2288 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 2291 /* Now we don't run again until woken by SIGCONT or SIGKILL */
2292 cgroup_enter_frozen();
2289 freezable_schedule(); 2293 freezable_schedule();
2290 return true; 2294 return true;
2291 } else { 2295 } else {
@@ -2332,6 +2336,43 @@ static void do_jobctl_trap(void)
2332 } 2336 }
2333} 2337}
2334 2338
2339/**
2340 * do_freezer_trap - handle the freezer jobctl trap
2341 *
2342 * Puts the task into frozen state, if only the task is not about to quit.
2343 * In this case it drops JOBCTL_TRAP_FREEZE.
2344 *
2345 * CONTEXT:
2346 * Must be called with @current->sighand->siglock held,
2347 * which is always released before returning.
2348 */
2349static void do_freezer_trap(void)
2350 __releases(&current->sighand->siglock)
2351{
2352 /*
2353 * If there are other trap bits pending except JOBCTL_TRAP_FREEZE,
2354 * let's make another loop to give it a chance to be handled.
2355 * In any case, we'll return back.
2356 */
2357 if ((current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) !=
2358 JOBCTL_TRAP_FREEZE) {
2359 spin_unlock_irq(&current->sighand->siglock);
2360 return;
2361 }
2362
2363 /*
2364 * Now we're sure that there is no pending fatal signal and no
2365 * pending traps. Clear TIF_SIGPENDING to not get out of schedule()
2366 * immediately (if there is a non-fatal signal pending), and
2367 * put the task into sleep.
2368 */
2369 __set_current_state(TASK_INTERRUPTIBLE);
2370 clear_thread_flag(TIF_SIGPENDING);
2371 spin_unlock_irq(&current->sighand->siglock);
2372 cgroup_enter_frozen();
2373 freezable_schedule();
2374}
2375
2335static int ptrace_signal(int signr, kernel_siginfo_t *info) 2376static int ptrace_signal(int signr, kernel_siginfo_t *info)
2336{ 2377{
2337 /* 2378 /*
@@ -2442,6 +2483,10 @@ relock:
2442 ksig->info.si_signo = signr = SIGKILL; 2483 ksig->info.si_signo = signr = SIGKILL;
2443 sigdelset(&current->pending.signal, SIGKILL); 2484 sigdelset(&current->pending.signal, SIGKILL);
2444 recalc_sigpending(); 2485 recalc_sigpending();
2486 current->jobctl &= ~JOBCTL_TRAP_FREEZE;
2487 spin_unlock_irq(&sighand->siglock);
2488 if (unlikely(cgroup_task_frozen(current)))
2489 cgroup_leave_frozen(true);
2445 goto fatal; 2490 goto fatal;
2446 } 2491 }
2447 2492
@@ -2452,9 +2497,24 @@ relock:
2452 do_signal_stop(0)) 2497 do_signal_stop(0))
2453 goto relock; 2498 goto relock;
2454 2499
2455 if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) { 2500 if (unlikely(current->jobctl &
2456 do_jobctl_trap(); 2501 (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) {
2502 if (current->jobctl & JOBCTL_TRAP_MASK) {
2503 do_jobctl_trap();
2504 spin_unlock_irq(&sighand->siglock);
2505 } else if (current->jobctl & JOBCTL_TRAP_FREEZE)
2506 do_freezer_trap();
2507
2508 goto relock;
2509 }
2510
2511 /*
2512 * If the task is leaving the frozen state, let's update
2513 * cgroup counters and reset the frozen bit.
2514 */
2515 if (unlikely(cgroup_task_frozen(current))) {
2457 spin_unlock_irq(&sighand->siglock); 2516 spin_unlock_irq(&sighand->siglock);
2517 cgroup_leave_frozen(true);
2458 goto relock; 2518 goto relock;
2459 } 2519 }
2460 2520
@@ -2548,8 +2608,8 @@ relock:
2548 continue; 2608 continue;
2549 } 2609 }
2550 2610
2551 fatal:
2552 spin_unlock_irq(&sighand->siglock); 2611 spin_unlock_irq(&sighand->siglock);
2612 fatal:
2553 2613
2554 /* 2614 /*
2555 * Anything else is fatal, maybe with a core dump. 2615 * Anything else is fatal, maybe with a core dump.