-rw-r--r--  arch/x86/include/asm/preempt.h |  1
-rw-r--r--  include/uapi/linux/sched.h     |  2
-rw-r--r--  kernel/context_tracking.c      | 40
-rw-r--r--  kernel/sched/core.c            | 47
-rw-r--r--  kernel/sched/deadline.c        | 41
-rw-r--r--  kernel/sched/fair.c            | 21
-rw-r--r--  kernel/sysctl.c                |  3
7 files changed, 99 insertions(+), 56 deletions(-)
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 7024c12f7bfe..400873450e33 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -105,6 +105,7 @@ static __always_inline bool should_resched(void)
 # ifdef CONFIG_CONTEXT_TRACKING
   extern asmlinkage void ___preempt_schedule_context(void);
 # define __preempt_schedule_context() asm ("call ___preempt_schedule_context")
+  extern asmlinkage void preempt_schedule_context(void);
 # endif
 #endif
 
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 34f9d7387d13..b932be9f5c5b 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -13,7 +13,7 @@
 #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
 #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
 #define CLONE_THREAD 0x00010000 /* Same thread group? */
-#define CLONE_NEWNS 0x00020000 /* New namespace group? */
+#define CLONE_NEWNS 0x00020000 /* New mount namespace group */
 #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */
 #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
 #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 5664985c46a0..937ecdfdf258 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -107,46 +107,6 @@ void context_tracking_user_enter(void)
 }
 NOKPROBE_SYMBOL(context_tracking_user_enter);
 
-#ifdef CONFIG_PREEMPT
-/**
- * preempt_schedule_context - preempt_schedule called by tracing
- *
- * The tracing infrastructure uses preempt_enable_notrace to prevent
- * recursion and tracing preempt enabling caused by the tracing
- * infrastructure itself. But as tracing can happen in areas coming
- * from userspace or just about to enter userspace, a preempt enable
- * can occur before user_exit() is called. This will cause the scheduler
- * to be called when the system is still in usermode.
- *
- * To prevent this, the preempt_enable_notrace will use this function
- * instead of preempt_schedule() to exit user context if needed before
- * calling the scheduler.
- */
-asmlinkage __visible void __sched notrace preempt_schedule_context(void)
-{
-        enum ctx_state prev_ctx;
-
-        if (likely(!preemptible()))
-                return;
-
-        /*
-         * Need to disable preemption in case user_exit() is traced
-         * and the tracer calls preempt_enable_notrace() causing
-         * an infinite recursion.
-         */
-        preempt_disable_notrace();
-        prev_ctx = exception_enter();
-        preempt_enable_no_resched_notrace();
-
-        preempt_schedule();
-
-        preempt_disable_notrace();
-        exception_exit(prev_ctx);
-        preempt_enable_notrace();
-}
-EXPORT_SYMBOL_GPL(preempt_schedule_context);
-#endif /* CONFIG_PREEMPT */
-
 /**
  * context_tracking_user_exit - Inform the context tracking that the CPU is
  *                              exiting userspace mode and entering the kernel.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 44999505e1bf..240157c13ddc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2951,6 +2951,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
 }
 NOKPROBE_SYMBOL(preempt_schedule);
 EXPORT_SYMBOL(preempt_schedule);
+
+#ifdef CONFIG_CONTEXT_TRACKING
+/**
+ * preempt_schedule_context - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+{
+        enum ctx_state prev_ctx;
+
+        if (likely(!preemptible()))
+                return;
+
+        do {
+                __preempt_count_add(PREEMPT_ACTIVE);
+                /*
+                 * Needs preempt disabled in case user_exit() is traced
+                 * and the tracer calls preempt_enable_notrace() causing
+                 * an infinite recursion.
+                 */
+                prev_ctx = exception_enter();
+                __schedule();
+                exception_exit(prev_ctx);
+
+                __preempt_count_sub(PREEMPT_ACTIVE);
+                barrier();
+        } while (need_resched());
+}
+EXPORT_SYMBOL_GPL(preempt_schedule_context);
+#endif /* CONFIG_CONTEXT_TRACKING */
+
 #endif /* CONFIG_PREEMPT */
 
 /*
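
The kernel-doc above explains the problem this function solves; purely as a hedged illustration of the kind of call site it protects, here is a hypothetical notrace probe. The probe name and body are invented for this note, and the actual dispatch from preempt_enable_notrace() to this function lives in include/linux/preempt.h and is not reproduced from source here.

/*
 * Hypothetical tracer-side callback, invented for this note. It may run on
 * the way out to user space, i.e. after user_enter(), so when its
 * preempt_enable_notrace() finds a reschedule pending it must go through
 * preempt_schedule_context(), which exception_enter()s back into kernel
 * context-tracking state before calling the scheduler.
 */
static void notrace demo_trace_probe(void)
{
        preempt_disable_notrace();
        /* ... record the event without recursing into the tracer ... */
        preempt_enable_notrace();       /* may end up in preempt_schedule_context() */
}
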
@@ -7833,6 +7874,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
         sched_offline_group(tg);
 }
 
+static void cpu_cgroup_fork(struct task_struct *task)
+{
+        sched_move_task(task);
+}
+
 static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
                                  struct cgroup_taskset *tset)
 {
@@ -8205,6 +8251,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
         .css_free       = cpu_cgroup_css_free,
         .css_online     = cpu_cgroup_css_online,
         .css_offline    = cpu_cgroup_css_offline,
+        .fork           = cpu_cgroup_fork,
         .can_attach     = cpu_cgroup_can_attach,
         .attach         = cpu_cgroup_attach,
         .exit           = cpu_cgroup_exit,
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 256e577faf1b..5285332392d5 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -518,12 +518,20 @@ again:
         }
 
         /*
-         * We need to take care of a possible races here. In fact, the
-         * task might have changed its scheduling policy to something
-         * different from SCHED_DEADLINE or changed its reservation
-         * parameters (through sched_setattr()).
+         * We need to take care of several possible races here:
+         *
+         *   - the task might have changed its scheduling policy
+         *     to something different than SCHED_DEADLINE
+         *   - the task might have changed its reservation parameters
+         *     (through sched_setattr())
+         *   - the task might have been boosted by someone else and
+         *     might be in the boosting/deboosting path
+         *
+         * In all this cases we bail out, as the task is already
+         * in the runqueue or is going to be enqueued back anyway.
          */
-        if (!dl_task(p) || dl_se->dl_new)
+        if (!dl_task(p) || dl_se->dl_new ||
+            dl_se->dl_boosted || !dl_se->dl_throttled)
                 goto unlock;
 
         sched_clock_tick();
@@ -532,7 +540,7 @@ again:
         dl_se->dl_yielded = 0;
         if (task_on_rq_queued(p)) {
                 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
-                if (task_has_dl_policy(rq->curr))
+                if (dl_task(rq->curr))
                         check_preempt_curr_dl(rq, p, 0);
                 else
                         resched_curr(rq);
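
Both this hunk and the switched_to_dl() hunk further down replace task_has_dl_policy(rq->curr) with dl_task(rq->curr). The distinction is that dl_task() tests the effective priority, which also covers a task that currently holds a deadline priority only through priority inheritance, while task_has_dl_policy() tests p->policy alone. Below is a simplified sketch of the helpers as they are defined around this kernel version, reproduced from memory rather than from this patch; see include/linux/sched/deadline.h and kernel/sched/sched.h for the authoritative definitions.

/* Simplified sketch of the helpers involved; not copied verbatim. */
static inline int dl_prio(int prio)
{
        return unlikely(prio < MAX_DL_PRIO);    /* deadline prios sit below the RT range */
}

static inline int dl_task(struct task_struct *p)
{
        return dl_prio(p->prio);                /* effective prio: true for PI-boosted tasks too */
}

static inline int task_has_dl_policy(struct task_struct *p)
{
        return dl_policy(p->policy);            /* true only for an actual SCHED_DEADLINE policy */
}
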
@@ -847,8 +855,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
          * smaller than our one... OTW we keep our runtime and
          * deadline.
          */
-        if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio))
+        if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
                 pi_se = &pi_task->dl;
+        } else if (!dl_prio(p->normal_prio)) {
+                /*
+                 * Special case in which we have a !SCHED_DEADLINE task
+                 * that is going to be deboosted, but exceedes its
+                 * runtime while doing so. No point in replenishing
+                 * it, as it's going to return back to its original
+                 * scheduling class after this.
+                 */
+                BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
+                return;
+        }
 
         /*
          * If p is throttled, we do nothing. In fact, if it exhausted
@@ -1607,8 +1626,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
                         /* Only reschedule if pushing failed */
                         check_resched = 0;
 #endif /* CONFIG_SMP */
-                if (check_resched && task_has_dl_policy(rq->curr))
-                        check_preempt_curr_dl(rq, p, 0);
+                if (check_resched) {
+                        if (dl_task(rq->curr))
+                                check_preempt_curr_dl(rq, p, 0);
+                        else
+                                resched_curr(rq);
+                }
         }
 }
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0b069bf3e708..34baa60f8a7b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -828,11 +828,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
 
 static unsigned int task_scan_min(struct task_struct *p)
 {
+        unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
         unsigned int scan, floor;
         unsigned int windows = 1;
 
-        if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
-                windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
+        if (scan_size < MAX_SCAN_WINDOW)
+                windows = MAX_SCAN_WINDOW / scan_size;
         floor = 1000 / windows;
 
         scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
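
Together with the kernel/sysctl.c change at the end of this patch, the snapshot above closes a divide-by-zero window: the sysctl is read exactly once into a local, so a concurrent write can no longer change the divisor between the range check and the division, and the new sysctl minimum keeps it from ever being zero. The stand-alone sketch below illustrates the same snapshot-then-use pattern; every name in it is invented for the demo.

#include <stdio.h>

/* Stand-ins for the kernel pieces; all names here are hypothetical. */
#define DEMO_READ_ONCE(x)       (*(volatile typeof(x) *)&(x))
#define DEMO_MAX_SCAN_WINDOW    2560U

static unsigned int demo_scan_size_mb = 256;    /* may be updated by another thread */

static unsigned int demo_task_scan_windows(void)
{
        /*
         * Snapshot the tunable once: every later use sees the same value,
         * so the divisor that was range-checked is the divisor actually used.
         * The sysctl change in this patch additionally guarantees the stored
         * value is at least 1; the explicit zero test is only for this demo.
         */
        unsigned int scan_size = DEMO_READ_ONCE(demo_scan_size_mb);
        unsigned int windows = 1;

        if (scan_size && scan_size < DEMO_MAX_SCAN_WINDOW)
                windows = DEMO_MAX_SCAN_WINDOW / scan_size;
        return windows;
}

int main(void)
{
        printf("scan windows = %u\n", demo_task_scan_windows());
        return 0;
}
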
@@ -1164,9 +1165,19 @@ static void task_numa_compare(struct task_numa_env *env,
         long moveimp = imp;
 
         rcu_read_lock();
-        cur = ACCESS_ONCE(dst_rq->curr);
-        if (cur->pid == 0) /* idle */
+
+        raw_spin_lock_irq(&dst_rq->lock);
+        cur = dst_rq->curr;
+        /*
+         * No need to move the exiting task, and this ensures that ->curr
+         * wasn't reaped and thus get_task_struct() in task_numa_assign()
+         * is safe under RCU read lock.
+         * Note that rcu_read_lock() itself can't protect from the final
+         * put_task_struct() after the last schedule().
+         */
+        if ((cur->flags & PF_EXITING) || is_idle_task(cur))
                 cur = NULL;
+        raw_spin_unlock_irq(&dst_rq->lock);
 
         /*
          * "imp" is the fault differential for the source task between the
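
The new comment is the heart of the fix: a remote runqueue's ->curr can reach its final put_task_struct() as soon as it finishes its last schedule(), and the RCU read lock alone does not pin it, whereas holding dst_rq->lock does. The patch itself only validates cur here and leaves get_task_struct() to task_numa_assign(); the hypothetical helper below is a hedged restatement of the same guarantee in one place, not code from this patch.

/*
 * Hypothetical helper, for illustration only: while dst_rq->lock is held,
 * dst_rq->curr cannot complete its final schedule(), so it cannot be
 * reaped and taking a reference on it is safe.
 */
static struct task_struct *demo_pin_dst_curr(struct rq *dst_rq)
{
        struct task_struct *cur;

        raw_spin_lock_irq(&dst_rq->lock);
        cur = dst_rq->curr;
        if ((cur->flags & PF_EXITING) || is_idle_task(cur)) {
                cur = NULL;             /* exiting or idle: nothing worth comparing against */
        } else {
                get_task_struct(cur);   /* reference taken while cur is provably alive */
        }
        raw_spin_unlock_irq(&dst_rq->lock);

        return cur;                     /* caller must put_task_struct() when done */
}
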
@@ -1520,7 +1531,7 @@ static void update_task_scan_period(struct task_struct *p,
                  * scanning faster if shared accesses dominate as it may
                  * simply bounce migrations uselessly
                  */
-                ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
+                ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
                 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
         }
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4aada6d9fe74..15f2511a1b7c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -387,7 +387,8 @@ static struct ctl_table kern_table[] = {
                 .data           = &sysctl_numa_balancing_scan_size,
                 .maxlen         = sizeof(unsigned int),
                 .mode           = 0644,
-                .proc_handler   = proc_dointvec,
+                .proc_handler   = proc_dointvec_minmax,
+                .extra1         = &one,
         },
         {
                 .procname       = "numa_balancing",
393 .procname = "numa_balancing", 394 .procname = "numa_balancing",