diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-10-31 17:05:35 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-10-31 17:05:35 -0400 |
commit | f5fa363026c3508735c6ab2f1029110d2c4966a2 (patch) | |
tree | 3fb2d1be0c1a6f49dc70a9aff2dd6e9a6710570b /kernel | |
parent | 5656b408ff2696551c0f572689edcad3113e3a32 (diff) | |
parent | f3a7e1a9c464a32ee186ab91388313c82e7ce018 (diff) |
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
"Various scheduler fixes all over the place: three SCHED_DL fixes,
three sched/numa fixes, two generic race fixes and a comment fix"
* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/dl: Fix preemption checks
sched: Update comments for CLONE_NEWNS
sched: stop the unbound recursion in preempt_schedule_context()
sched/fair: Fix division by zero sysctl_numa_balancing_scan_size
sched/fair: Care divide error in update_task_scan_period()
sched/numa: Fix unsafe get_task_struct() in task_numa_assign()
sched/deadline: Fix races between rt_mutex_setprio() and dl_task_timer()
sched/deadline: Don't replenish from a !SCHED_DEADLINE entity
sched: Fix race between task_group and sched_task_group
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/context_tracking.c | 40 | ||||
-rw-r--r-- | kernel/sched/core.c | 47 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 41 | ||||
-rw-r--r-- | kernel/sched/fair.c | 21 | ||||
-rw-r--r-- | kernel/sysctl.c | 3 |
5 files changed, 97 insertions, 55 deletions
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 5664985c46a0..937ecdfdf258 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c | |||
@@ -107,46 +107,6 @@ void context_tracking_user_enter(void) | |||
107 | } | 107 | } |
108 | NOKPROBE_SYMBOL(context_tracking_user_enter); | 108 | NOKPROBE_SYMBOL(context_tracking_user_enter); |
109 | 109 | ||
110 | #ifdef CONFIG_PREEMPT | ||
111 | /** | ||
112 | * preempt_schedule_context - preempt_schedule called by tracing | ||
113 | * | ||
114 | * The tracing infrastructure uses preempt_enable_notrace to prevent | ||
115 | * recursion and tracing preempt enabling caused by the tracing | ||
116 | * infrastructure itself. But as tracing can happen in areas coming | ||
117 | * from userspace or just about to enter userspace, a preempt enable | ||
118 | * can occur before user_exit() is called. This will cause the scheduler | ||
119 | * to be called when the system is still in usermode. | ||
120 | * | ||
121 | * To prevent this, the preempt_enable_notrace will use this function | ||
122 | * instead of preempt_schedule() to exit user context if needed before | ||
123 | * calling the scheduler. | ||
124 | */ | ||
125 | asmlinkage __visible void __sched notrace preempt_schedule_context(void) | ||
126 | { | ||
127 | enum ctx_state prev_ctx; | ||
128 | |||
129 | if (likely(!preemptible())) | ||
130 | return; | ||
131 | |||
132 | /* | ||
133 | * Need to disable preemption in case user_exit() is traced | ||
134 | * and the tracer calls preempt_enable_notrace() causing | ||
135 | * an infinite recursion. | ||
136 | */ | ||
137 | preempt_disable_notrace(); | ||
138 | prev_ctx = exception_enter(); | ||
139 | preempt_enable_no_resched_notrace(); | ||
140 | |||
141 | preempt_schedule(); | ||
142 | |||
143 | preempt_disable_notrace(); | ||
144 | exception_exit(prev_ctx); | ||
145 | preempt_enable_notrace(); | ||
146 | } | ||
147 | EXPORT_SYMBOL_GPL(preempt_schedule_context); | ||
148 | #endif /* CONFIG_PREEMPT */ | ||
149 | |||
150 | /** | 110 | /** |
151 | * context_tracking_user_exit - Inform the context tracking that the CPU is | 111 | * context_tracking_user_exit - Inform the context tracking that the CPU is |
152 | * exiting userspace mode and entering the kernel. | 112 | * exiting userspace mode and entering the kernel. |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 44999505e1bf..240157c13ddc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -2951,6 +2951,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) | |||
2951 | } | 2951 | } |
2952 | NOKPROBE_SYMBOL(preempt_schedule); | 2952 | NOKPROBE_SYMBOL(preempt_schedule); |
2953 | EXPORT_SYMBOL(preempt_schedule); | 2953 | EXPORT_SYMBOL(preempt_schedule); |
2954 | |||
2955 | #ifdef CONFIG_CONTEXT_TRACKING | ||
2956 | /** | ||
2957 | * preempt_schedule_context - preempt_schedule called by tracing | ||
2958 | * | ||
2959 | * The tracing infrastructure uses preempt_enable_notrace to prevent | ||
2960 | * recursion and tracing preempt enabling caused by the tracing | ||
2961 | * infrastructure itself. But as tracing can happen in areas coming | ||
2962 | * from userspace or just about to enter userspace, a preempt enable | ||
2963 | * can occur before user_exit() is called. This will cause the scheduler | ||
2964 | * to be called when the system is still in usermode. | ||
2965 | * | ||
2966 | * To prevent this, the preempt_enable_notrace will use this function | ||
2967 | * instead of preempt_schedule() to exit user context if needed before | ||
2968 | * calling the scheduler. | ||
2969 | */ | ||
2970 | asmlinkage __visible void __sched notrace preempt_schedule_context(void) | ||
2971 | { | ||
2972 | enum ctx_state prev_ctx; | ||
2973 | |||
2974 | if (likely(!preemptible())) | ||
2975 | return; | ||
2976 | |||
2977 | do { | ||
2978 | __preempt_count_add(PREEMPT_ACTIVE); | ||
2979 | /* | ||
2980 | * Needs preempt disabled in case user_exit() is traced | ||
2981 | * and the tracer calls preempt_enable_notrace() causing | ||
2982 | * an infinite recursion. | ||
2983 | */ | ||
2984 | prev_ctx = exception_enter(); | ||
2985 | __schedule(); | ||
2986 | exception_exit(prev_ctx); | ||
2987 | |||
2988 | __preempt_count_sub(PREEMPT_ACTIVE); | ||
2989 | barrier(); | ||
2990 | } while (need_resched()); | ||
2991 | } | ||
2992 | EXPORT_SYMBOL_GPL(preempt_schedule_context); | ||
2993 | #endif /* CONFIG_CONTEXT_TRACKING */ | ||
2994 | |||
2954 | #endif /* CONFIG_PREEMPT */ | 2995 | #endif /* CONFIG_PREEMPT */ |
2955 | 2996 | ||
2956 | /* | 2997 | /* |
@@ -7833,6 +7874,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
7833 | sched_offline_group(tg); | 7874 | sched_offline_group(tg); |
7834 | } | 7875 | } |
7835 | 7876 | ||
7877 | static void cpu_cgroup_fork(struct task_struct *task) | ||
7878 | { | ||
7879 | sched_move_task(task); | ||
7880 | } | ||
7881 | |||
7836 | static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, | 7882 | static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, |
7837 | struct cgroup_taskset *tset) | 7883 | struct cgroup_taskset *tset) |
7838 | { | 7884 | { |
@@ -8205,6 +8251,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { | |||
8205 | .css_free = cpu_cgroup_css_free, | 8251 | .css_free = cpu_cgroup_css_free, |
8206 | .css_online = cpu_cgroup_css_online, | 8252 | .css_online = cpu_cgroup_css_online, |
8207 | .css_offline = cpu_cgroup_css_offline, | 8253 | .css_offline = cpu_cgroup_css_offline, |
8254 | .fork = cpu_cgroup_fork, | ||
8208 | .can_attach = cpu_cgroup_can_attach, | 8255 | .can_attach = cpu_cgroup_can_attach, |
8209 | .attach = cpu_cgroup_attach, | 8256 | .attach = cpu_cgroup_attach, |
8210 | .exit = cpu_cgroup_exit, | 8257 | .exit = cpu_cgroup_exit, |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 256e577faf1b..5285332392d5 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -518,12 +518,20 @@ again: | |||
518 | } | 518 | } |
519 | 519 | ||
520 | /* | 520 | /* |
521 | * We need to take care of a possible races here. In fact, the | 521 | * We need to take care of several possible races here: |
522 | * task might have changed its scheduling policy to something | 522 | * |
523 | * different from SCHED_DEADLINE or changed its reservation | 523 | * - the task might have changed its scheduling policy |
524 | * parameters (through sched_setattr()). | 524 | * to something different than SCHED_DEADLINE |
525 | * - the task might have changed its reservation parameters | ||
526 | * (through sched_setattr()) | ||
527 | * - the task might have been boosted by someone else and | ||
528 | * might be in the boosting/deboosting path | ||
529 | * | ||
530 | * In all this cases we bail out, as the task is already | ||
531 | * in the runqueue or is going to be enqueued back anyway. | ||
525 | */ | 532 | */ |
526 | if (!dl_task(p) || dl_se->dl_new) | 533 | if (!dl_task(p) || dl_se->dl_new || |
534 | dl_se->dl_boosted || !dl_se->dl_throttled) | ||
527 | goto unlock; | 535 | goto unlock; |
528 | 536 | ||
529 | sched_clock_tick(); | 537 | sched_clock_tick(); |
@@ -532,7 +540,7 @@ again: | |||
532 | dl_se->dl_yielded = 0; | 540 | dl_se->dl_yielded = 0; |
533 | if (task_on_rq_queued(p)) { | 541 | if (task_on_rq_queued(p)) { |
534 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | 542 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); |
535 | if (task_has_dl_policy(rq->curr)) | 543 | if (dl_task(rq->curr)) |
536 | check_preempt_curr_dl(rq, p, 0); | 544 | check_preempt_curr_dl(rq, p, 0); |
537 | else | 545 | else |
538 | resched_curr(rq); | 546 | resched_curr(rq); |
@@ -847,8 +855,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) | |||
847 | * smaller than our one... OTW we keep our runtime and | 855 | * smaller than our one... OTW we keep our runtime and |
848 | * deadline. | 856 | * deadline. |
849 | */ | 857 | */ |
850 | if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) | 858 | if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { |
851 | pi_se = &pi_task->dl; | 859 | pi_se = &pi_task->dl; |
860 | } else if (!dl_prio(p->normal_prio)) { | ||
861 | /* | ||
862 | * Special case in which we have a !SCHED_DEADLINE task | ||
863 | * that is going to be deboosted, but exceedes its | ||
864 | * runtime while doing so. No point in replenishing | ||
865 | * it, as it's going to return back to its original | ||
866 | * scheduling class after this. | ||
867 | */ | ||
868 | BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); | ||
869 | return; | ||
870 | } | ||
852 | 871 | ||
853 | /* | 872 | /* |
854 | * If p is throttled, we do nothing. In fact, if it exhausted | 873 | * If p is throttled, we do nothing. In fact, if it exhausted |
@@ -1607,8 +1626,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
1607 | /* Only reschedule if pushing failed */ | 1626 | /* Only reschedule if pushing failed */ |
1608 | check_resched = 0; | 1627 | check_resched = 0; |
1609 | #endif /* CONFIG_SMP */ | 1628 | #endif /* CONFIG_SMP */ |
1610 | if (check_resched && task_has_dl_policy(rq->curr)) | 1629 | if (check_resched) { |
1611 | check_preempt_curr_dl(rq, p, 0); | 1630 | if (dl_task(rq->curr)) |
1631 | check_preempt_curr_dl(rq, p, 0); | ||
1632 | else | ||
1633 | resched_curr(rq); | ||
1634 | } | ||
1612 | } | 1635 | } |
1613 | } | 1636 | } |
1614 | 1637 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0b069bf3e708..34baa60f8a7b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -828,11 +828,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) | |||
828 | 828 | ||
829 | static unsigned int task_scan_min(struct task_struct *p) | 829 | static unsigned int task_scan_min(struct task_struct *p) |
830 | { | 830 | { |
831 | unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); | ||
831 | unsigned int scan, floor; | 832 | unsigned int scan, floor; |
832 | unsigned int windows = 1; | 833 | unsigned int windows = 1; |
833 | 834 | ||
834 | if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) | 835 | if (scan_size < MAX_SCAN_WINDOW) |
835 | windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; | 836 | windows = MAX_SCAN_WINDOW / scan_size; |
836 | floor = 1000 / windows; | 837 | floor = 1000 / windows; |
837 | 838 | ||
838 | scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); | 839 | scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); |
@@ -1164,9 +1165,19 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1164 | long moveimp = imp; | 1165 | long moveimp = imp; |
1165 | 1166 | ||
1166 | rcu_read_lock(); | 1167 | rcu_read_lock(); |
1167 | cur = ACCESS_ONCE(dst_rq->curr); | 1168 | |
1168 | if (cur->pid == 0) /* idle */ | 1169 | raw_spin_lock_irq(&dst_rq->lock); |
1170 | cur = dst_rq->curr; | ||
1171 | /* | ||
1172 | * No need to move the exiting task, and this ensures that ->curr | ||
1173 | * wasn't reaped and thus get_task_struct() in task_numa_assign() | ||
1174 | * is safe under RCU read lock. | ||
1175 | * Note that rcu_read_lock() itself can't protect from the final | ||
1176 | * put_task_struct() after the last schedule(). | ||
1177 | */ | ||
1178 | if ((cur->flags & PF_EXITING) || is_idle_task(cur)) | ||
1169 | cur = NULL; | 1179 | cur = NULL; |
1180 | raw_spin_unlock_irq(&dst_rq->lock); | ||
1170 | 1181 | ||
1171 | /* | 1182 | /* |
1172 | * "imp" is the fault differential for the source task between the | 1183 | * "imp" is the fault differential for the source task between the |
@@ -1520,7 +1531,7 @@ static void update_task_scan_period(struct task_struct *p, | |||
1520 | * scanning faster if shared accesses dominate as it may | 1531 | * scanning faster if shared accesses dominate as it may |
1521 | * simply bounce migrations uselessly | 1532 | * simply bounce migrations uselessly |
1522 | */ | 1533 | */ |
1523 | ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); | 1534 | ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1)); |
1524 | diff = (diff * ratio) / NUMA_PERIOD_SLOTS; | 1535 | diff = (diff * ratio) / NUMA_PERIOD_SLOTS; |
1525 | } | 1536 | } |
1526 | 1537 | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4aada6d9fe74..15f2511a1b7c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -387,7 +387,8 @@ static struct ctl_table kern_table[] = { | |||
387 | .data = &sysctl_numa_balancing_scan_size, | 387 | .data = &sysctl_numa_balancing_scan_size, |
388 | .maxlen = sizeof(unsigned int), | 388 | .maxlen = sizeof(unsigned int), |
389 | .mode = 0644, | 389 | .mode = 0644, |
390 | .proc_handler = proc_dointvec, | 390 | .proc_handler = proc_dointvec_minmax, |
391 | .extra1 = &one, | ||
391 | }, | 392 | }, |
392 | { | 393 | { |
393 | .procname = "numa_balancing", | 394 | .procname = "numa_balancing", |