diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-01-20 13:42:08 -0500 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-01-20 13:42:08 -0500 |
| commit | a0fa1dd3cdbccec9597fe53b6177a9aa6e20f2f8 (patch) | |
| tree | b249854573815eedf377e554f0ea516f86411841 /kernel | |
| parent | 9326657abe1a83ed4b4f396b923ca1217fd50cba (diff) | |
| parent | eaad45132c564ce377e6dce05e78e08e456d5315 (diff) | |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
- Add the initial implementation of SCHED_DEADLINE support: a real-time
scheduling policy where tasks that meet their deadlines and
periodically execute their instances in less than their runtime quota
see real-time scheduling and won't miss any of their deadlines.
Tasks that go over their quota get delayed (Available to privileged
users for now)
- Clean up and fix preempt_enable_no_resched() abuse all around the
tree
- Do sched_clock() performance optimizations on x86 and elsewhere
- Fix and improve auto-NUMA balancing
- Fix and clean up the idle loop
- Apply various cleanups and fixes
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (60 commits)
sched: Fix __sched_setscheduler() nice test
sched: Move SCHED_RESET_ON_FORK into attr::sched_flags
sched: Fix up attr::sched_priority warning
sched: Fix up scheduler syscall LTP fails
sched: Preserve the nice level over sched_setscheduler() and sched_setparam() calls
sched/core: Fix htmldocs warnings
sched/deadline: No need to check p if dl_se is valid
sched/deadline: Remove unused variables
sched/deadline: Fix sparse static warnings
m68k: Fix build warning in mac_via.h
sched, thermal: Clean up preempt_enable_no_resched() abuse
sched, net: Fixup busy_loop_us_clock()
sched, net: Clean up preempt_enable_no_resched() abuse
sched/preempt: Fix up missed PREEMPT_NEED_RESCHED folding
sched/preempt, locking: Rework local_bh_{dis,en}able()
sched/clock, x86: Avoid a runtime condition in native_sched_clock()
sched/clock: Fix up clear_sched_clock_stable()
sched/clock, x86: Use a static_key for sched_clock_stable
sched/clock: Remove local_irq_disable() from the clocks
sched/clock, x86: Rewrite cyc2ns() to avoid the need to disable IRQs
...
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/cpu/idle.c | 17 | ||||
| -rw-r--r-- | kernel/fork.c | 12 | ||||
| -rw-r--r-- | kernel/futex.c | 2 | ||||
| -rw-r--r-- | kernel/hrtimer.c | 3 | ||||
| -rw-r--r-- | kernel/locking/rtmutex-debug.c | 8 | ||||
| -rw-r--r-- | kernel/locking/rtmutex.c | 166 | ||||
| -rw-r--r-- | kernel/locking/rtmutex_common.h | 23 | ||||
| -rw-r--r-- | kernel/sched/Makefile | 5 | ||||
| -rw-r--r-- | kernel/sched/clock.c | 78 | ||||
| -rw-r--r-- | kernel/sched/core.c | 822 | ||||
| -rw-r--r-- | kernel/sched/cpudeadline.c | 216 | ||||
| -rw-r--r-- | kernel/sched/cpudeadline.h | 33 | ||||
| -rw-r--r-- | kernel/sched/deadline.c | 1640 | ||||
| -rw-r--r-- | kernel/sched/debug.c | 4 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 83 | ||||
| -rw-r--r-- | kernel/sched/rt.c | 2 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 146 | ||||
| -rw-r--r-- | kernel/sched/stop_task.c | 2 | ||||
| -rw-r--r-- | kernel/softirq.c | 39 | ||||
| -rw-r--r-- | kernel/sysctl.c | 7 | ||||
| -rw-r--r-- | kernel/time/tick-sched.c | 2 | ||||
| -rw-r--r-- | kernel/trace/ring_buffer.c | 2 | ||||
| -rw-r--r-- | kernel/trace/trace_sched_wakeup.c | 65 | ||||
| -rw-r--r-- | kernel/trace/trace_selftest.c | 33 |
24 files changed, 3087 insertions, 323 deletions
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index 988573a9a387..277f494c2a9a 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c | |||
| @@ -105,14 +105,17 @@ static void cpu_idle_loop(void) | |||
| 105 | __current_set_polling(); | 105 | __current_set_polling(); |
| 106 | } | 106 | } |
| 107 | arch_cpu_idle_exit(); | 107 | arch_cpu_idle_exit(); |
| 108 | /* | ||
| 109 | * We need to test and propagate the TIF_NEED_RESCHED | ||
| 110 | * bit here because we might not have send the | ||
| 111 | * reschedule IPI to idle tasks. | ||
| 112 | */ | ||
| 113 | if (tif_need_resched()) | ||
| 114 | set_preempt_need_resched(); | ||
| 115 | } | 108 | } |
| 109 | |||
| 110 | /* | ||
| 111 | * Since we fell out of the loop above, we know | ||
| 112 | * TIF_NEED_RESCHED must be set, propagate it into | ||
| 113 | * PREEMPT_NEED_RESCHED. | ||
| 114 | * | ||
| 115 | * This is required because for polling idle loops we will | ||
| 116 | * not have had an IPI to fold the state for us. | ||
| 117 | */ | ||
| 118 | preempt_set_need_resched(); | ||
| 116 | tick_nohz_idle_exit(); | 119 | tick_nohz_idle_exit(); |
| 117 | schedule_preempt_disabled(); | 120 | schedule_preempt_disabled(); |
| 118 | } | 121 | } |
diff --git a/kernel/fork.c b/kernel/fork.c index dfa736c98d17..294189fc7ac8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -1087,8 +1087,10 @@ static void rt_mutex_init_task(struct task_struct *p) | |||
| 1087 | { | 1087 | { |
| 1088 | raw_spin_lock_init(&p->pi_lock); | 1088 | raw_spin_lock_init(&p->pi_lock); |
| 1089 | #ifdef CONFIG_RT_MUTEXES | 1089 | #ifdef CONFIG_RT_MUTEXES |
| 1090 | plist_head_init(&p->pi_waiters); | 1090 | p->pi_waiters = RB_ROOT; |
| 1091 | p->pi_waiters_leftmost = NULL; | ||
| 1091 | p->pi_blocked_on = NULL; | 1092 | p->pi_blocked_on = NULL; |
| 1093 | p->pi_top_task = NULL; | ||
| 1092 | #endif | 1094 | #endif |
| 1093 | } | 1095 | } |
| 1094 | 1096 | ||
| @@ -1311,7 +1313,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1311 | #endif | 1313 | #endif |
| 1312 | 1314 | ||
| 1313 | /* Perform scheduler related setup. Assign this task to a CPU. */ | 1315 | /* Perform scheduler related setup. Assign this task to a CPU. */ |
| 1314 | sched_fork(clone_flags, p); | 1316 | retval = sched_fork(clone_flags, p); |
| 1317 | if (retval) | ||
| 1318 | goto bad_fork_cleanup_policy; | ||
| 1315 | 1319 | ||
| 1316 | retval = perf_event_init_task(p); | 1320 | retval = perf_event_init_task(p); |
| 1317 | if (retval) | 1321 | if (retval) |
| @@ -1403,13 +1407,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1403 | p->tgid = p->pid; | 1407 | p->tgid = p->pid; |
| 1404 | } | 1408 | } |
| 1405 | 1409 | ||
| 1406 | p->pdeath_signal = 0; | ||
| 1407 | p->exit_state = 0; | ||
| 1408 | |||
| 1409 | p->nr_dirtied = 0; | 1410 | p->nr_dirtied = 0; |
| 1410 | p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); | 1411 | p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); |
| 1411 | p->dirty_paused_when = 0; | 1412 | p->dirty_paused_when = 0; |
| 1412 | 1413 | ||
| 1414 | p->pdeath_signal = 0; | ||
| 1413 | INIT_LIST_HEAD(&p->thread_group); | 1415 | INIT_LIST_HEAD(&p->thread_group); |
| 1414 | p->task_works = NULL; | 1416 | p->task_works = NULL; |
| 1415 | 1417 | ||
diff --git a/kernel/futex.c b/kernel/futex.c index 1ddc4498f1e1..44a1261cb9ff 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -2426,6 +2426,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
| 2426 | * code while we sleep on uaddr. | 2426 | * code while we sleep on uaddr. |
| 2427 | */ | 2427 | */ |
| 2428 | debug_rt_mutex_init_waiter(&rt_waiter); | 2428 | debug_rt_mutex_init_waiter(&rt_waiter); |
| 2429 | RB_CLEAR_NODE(&rt_waiter.pi_tree_entry); | ||
| 2430 | RB_CLEAR_NODE(&rt_waiter.tree_entry); | ||
| 2429 | rt_waiter.task = NULL; | 2431 | rt_waiter.task = NULL; |
| 2430 | 2432 | ||
| 2431 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); | 2433 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 383319bae3f7..09094361dce5 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
| @@ -46,6 +46,7 @@ | |||
| 46 | #include <linux/sched.h> | 46 | #include <linux/sched.h> |
| 47 | #include <linux/sched/sysctl.h> | 47 | #include <linux/sched/sysctl.h> |
| 48 | #include <linux/sched/rt.h> | 48 | #include <linux/sched/rt.h> |
| 49 | #include <linux/sched/deadline.h> | ||
| 49 | #include <linux/timer.h> | 50 | #include <linux/timer.h> |
| 50 | #include <linux/freezer.h> | 51 | #include <linux/freezer.h> |
| 51 | 52 | ||
| @@ -1610,7 +1611,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, | |||
| 1610 | unsigned long slack; | 1611 | unsigned long slack; |
| 1611 | 1612 | ||
| 1612 | slack = current->timer_slack_ns; | 1613 | slack = current->timer_slack_ns; |
| 1613 | if (rt_task(current)) | 1614 | if (dl_task(current) || rt_task(current)) |
| 1614 | slack = 0; | 1615 | slack = 0; |
| 1615 | 1616 | ||
| 1616 | hrtimer_init_on_stack(&t.timer, clockid, mode); | 1617 | hrtimer_init_on_stack(&t.timer, clockid, mode); |
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 13b243a323fa..49b2ed3dced8 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c | |||
| @@ -24,7 +24,7 @@ | |||
| 24 | #include <linux/kallsyms.h> | 24 | #include <linux/kallsyms.h> |
| 25 | #include <linux/syscalls.h> | 25 | #include <linux/syscalls.h> |
| 26 | #include <linux/interrupt.h> | 26 | #include <linux/interrupt.h> |
| 27 | #include <linux/plist.h> | 27 | #include <linux/rbtree.h> |
| 28 | #include <linux/fs.h> | 28 | #include <linux/fs.h> |
| 29 | #include <linux/debug_locks.h> | 29 | #include <linux/debug_locks.h> |
| 30 | 30 | ||
| @@ -57,7 +57,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) | |||
| 57 | 57 | ||
| 58 | void rt_mutex_debug_task_free(struct task_struct *task) | 58 | void rt_mutex_debug_task_free(struct task_struct *task) |
| 59 | { | 59 | { |
| 60 | DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); | 60 | DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters)); |
| 61 | DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); | 61 | DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); |
| 62 | } | 62 | } |
| 63 | 63 | ||
| @@ -154,16 +154,12 @@ void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) | |||
| 154 | void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) | 154 | void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) |
| 155 | { | 155 | { |
| 156 | memset(waiter, 0x11, sizeof(*waiter)); | 156 | memset(waiter, 0x11, sizeof(*waiter)); |
| 157 | plist_node_init(&waiter->list_entry, MAX_PRIO); | ||
| 158 | plist_node_init(&waiter->pi_list_entry, MAX_PRIO); | ||
| 159 | waiter->deadlock_task_pid = NULL; | 157 | waiter->deadlock_task_pid = NULL; |
| 160 | } | 158 | } |
| 161 | 159 | ||
| 162 | void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) | 160 | void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) |
| 163 | { | 161 | { |
| 164 | put_pid(waiter->deadlock_task_pid); | 162 | put_pid(waiter->deadlock_task_pid); |
| 165 | DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); | ||
| 166 | DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); | ||
| 167 | memset(waiter, 0x22, sizeof(*waiter)); | 163 | memset(waiter, 0x22, sizeof(*waiter)); |
| 168 | } | 164 | } |
| 169 | 165 | ||
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 0dd6aec1cb6a..2e960a2bab81 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/export.h> | 14 | #include <linux/export.h> |
| 15 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
| 16 | #include <linux/sched/rt.h> | 16 | #include <linux/sched/rt.h> |
| 17 | #include <linux/sched/deadline.h> | ||
| 17 | #include <linux/timer.h> | 18 | #include <linux/timer.h> |
| 18 | 19 | ||
| 19 | #include "rtmutex_common.h" | 20 | #include "rtmutex_common.h" |
| @@ -91,10 +92,107 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) | |||
| 91 | } | 92 | } |
| 92 | #endif | 93 | #endif |
| 93 | 94 | ||
| 95 | static inline int | ||
| 96 | rt_mutex_waiter_less(struct rt_mutex_waiter *left, | ||
| 97 | struct rt_mutex_waiter *right) | ||
| 98 | { | ||
| 99 | if (left->prio < right->prio) | ||
| 100 | return 1; | ||
| 101 | |||
| 102 | /* | ||
| 103 | * If both waiters have dl_prio(), we check the deadlines of the | ||
| 104 | * associated tasks. | ||
| 105 | * If left waiter has a dl_prio(), and we didn't return 1 above, | ||
| 106 | * then right waiter has a dl_prio() too. | ||
| 107 | */ | ||
| 108 | if (dl_prio(left->prio)) | ||
| 109 | return (left->task->dl.deadline < right->task->dl.deadline); | ||
| 110 | |||
| 111 | return 0; | ||
| 112 | } | ||
| 113 | |||
| 114 | static void | ||
| 115 | rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) | ||
| 116 | { | ||
| 117 | struct rb_node **link = &lock->waiters.rb_node; | ||
| 118 | struct rb_node *parent = NULL; | ||
| 119 | struct rt_mutex_waiter *entry; | ||
| 120 | int leftmost = 1; | ||
| 121 | |||
| 122 | while (*link) { | ||
| 123 | parent = *link; | ||
| 124 | entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry); | ||
| 125 | if (rt_mutex_waiter_less(waiter, entry)) { | ||
| 126 | link = &parent->rb_left; | ||
| 127 | } else { | ||
| 128 | link = &parent->rb_right; | ||
| 129 | leftmost = 0; | ||
| 130 | } | ||
| 131 | } | ||
| 132 | |||
| 133 | if (leftmost) | ||
| 134 | lock->waiters_leftmost = &waiter->tree_entry; | ||
| 135 | |||
| 136 | rb_link_node(&waiter->tree_entry, parent, link); | ||
| 137 | rb_insert_color(&waiter->tree_entry, &lock->waiters); | ||
| 138 | } | ||
| 139 | |||
| 140 | static void | ||
| 141 | rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) | ||
| 142 | { | ||
| 143 | if (RB_EMPTY_NODE(&waiter->tree_entry)) | ||
| 144 | return; | ||
| 145 | |||
| 146 | if (lock->waiters_leftmost == &waiter->tree_entry) | ||
| 147 | lock->waiters_leftmost = rb_next(&waiter->tree_entry); | ||
| 148 | |||
| 149 | rb_erase(&waiter->tree_entry, &lock->waiters); | ||
| 150 | RB_CLEAR_NODE(&waiter->tree_entry); | ||
| 151 | } | ||
| 152 | |||
| 153 | static void | ||
| 154 | rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) | ||
| 155 | { | ||
| 156 | struct rb_node **link = &task->pi_waiters.rb_node; | ||
| 157 | struct rb_node *parent = NULL; | ||
| 158 | struct rt_mutex_waiter *entry; | ||
| 159 | int leftmost = 1; | ||
| 160 | |||
| 161 | while (*link) { | ||
| 162 | parent = *link; | ||
| 163 | entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry); | ||
| 164 | if (rt_mutex_waiter_less(waiter, entry)) { | ||
| 165 | link = &parent->rb_left; | ||
| 166 | } else { | ||
| 167 | link = &parent->rb_right; | ||
| 168 | leftmost = 0; | ||
| 169 | } | ||
| 170 | } | ||
| 171 | |||
| 172 | if (leftmost) | ||
| 173 | task->pi_waiters_leftmost = &waiter->pi_tree_entry; | ||
| 174 | |||
| 175 | rb_link_node(&waiter->pi_tree_entry, parent, link); | ||
| 176 | rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters); | ||
| 177 | } | ||
| 178 | |||
| 179 | static void | ||
| 180 | rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) | ||
| 181 | { | ||
| 182 | if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) | ||
| 183 | return; | ||
| 184 | |||
| 185 | if (task->pi_waiters_leftmost == &waiter->pi_tree_entry) | ||
| 186 | task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry); | ||
| 187 | |||
| 188 | rb_erase(&waiter->pi_tree_entry, &task->pi_waiters); | ||
| 189 | RB_CLEAR_NODE(&waiter->pi_tree_entry); | ||
| 190 | } | ||
| 191 | |||
| 94 | /* | 192 | /* |
| 95 | * Calculate task priority from the waiter list priority | 193 | * Calculate task priority from the waiter tree priority |
| 96 | * | 194 | * |
| 97 | * Return task->normal_prio when the waiter list is empty or when | 195 | * Return task->normal_prio when the waiter tree is empty or when |
| 98 | * the waiter is not allowed to do priority boosting | 196 | * the waiter is not allowed to do priority boosting |
| 99 | */ | 197 | */ |
| 100 | int rt_mutex_getprio(struct task_struct *task) | 198 | int rt_mutex_getprio(struct task_struct *task) |
| @@ -102,10 +200,18 @@ int rt_mutex_getprio(struct task_struct *task) | |||
| 102 | if (likely(!task_has_pi_waiters(task))) | 200 | if (likely(!task_has_pi_waiters(task))) |
| 103 | return task->normal_prio; | 201 | return task->normal_prio; |
| 104 | 202 | ||
| 105 | return min(task_top_pi_waiter(task)->pi_list_entry.prio, | 203 | return min(task_top_pi_waiter(task)->prio, |
| 106 | task->normal_prio); | 204 | task->normal_prio); |
| 107 | } | 205 | } |
| 108 | 206 | ||
| 207 | struct task_struct *rt_mutex_get_top_task(struct task_struct *task) | ||
| 208 | { | ||
| 209 | if (likely(!task_has_pi_waiters(task))) | ||
| 210 | return NULL; | ||
| 211 | |||
| 212 | return task_top_pi_waiter(task)->task; | ||
| 213 | } | ||
| 214 | |||
| 109 | /* | 215 | /* |
| 110 | * Adjust the priority of a task, after its pi_waiters got modified. | 216 | * Adjust the priority of a task, after its pi_waiters got modified. |
| 111 | * | 217 | * |
| @@ -115,7 +221,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task) | |||
| 115 | { | 221 | { |
| 116 | int prio = rt_mutex_getprio(task); | 222 | int prio = rt_mutex_getprio(task); |
| 117 | 223 | ||
| 118 | if (task->prio != prio) | 224 | if (task->prio != prio || dl_prio(prio)) |
| 119 | rt_mutex_setprio(task, prio); | 225 | rt_mutex_setprio(task, prio); |
| 120 | } | 226 | } |
| 121 | 227 | ||
| @@ -233,7 +339,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
| 233 | * When deadlock detection is off then we check, if further | 339 | * When deadlock detection is off then we check, if further |
| 234 | * priority adjustment is necessary. | 340 | * priority adjustment is necessary. |
| 235 | */ | 341 | */ |
| 236 | if (!detect_deadlock && waiter->list_entry.prio == task->prio) | 342 | if (!detect_deadlock && waiter->prio == task->prio) |
| 237 | goto out_unlock_pi; | 343 | goto out_unlock_pi; |
| 238 | 344 | ||
| 239 | lock = waiter->lock; | 345 | lock = waiter->lock; |
| @@ -254,9 +360,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
| 254 | top_waiter = rt_mutex_top_waiter(lock); | 360 | top_waiter = rt_mutex_top_waiter(lock); |
| 255 | 361 | ||
| 256 | /* Requeue the waiter */ | 362 | /* Requeue the waiter */ |
| 257 | plist_del(&waiter->list_entry, &lock->wait_list); | 363 | rt_mutex_dequeue(lock, waiter); |
| 258 | waiter->list_entry.prio = task->prio; | 364 | waiter->prio = task->prio; |
| 259 | plist_add(&waiter->list_entry, &lock->wait_list); | 365 | rt_mutex_enqueue(lock, waiter); |
| 260 | 366 | ||
| 261 | /* Release the task */ | 367 | /* Release the task */ |
| 262 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 368 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
| @@ -280,17 +386,15 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
| 280 | 386 | ||
| 281 | if (waiter == rt_mutex_top_waiter(lock)) { | 387 | if (waiter == rt_mutex_top_waiter(lock)) { |
| 282 | /* Boost the owner */ | 388 | /* Boost the owner */ |
| 283 | plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); | 389 | rt_mutex_dequeue_pi(task, top_waiter); |
| 284 | waiter->pi_list_entry.prio = waiter->list_entry.prio; | 390 | rt_mutex_enqueue_pi(task, waiter); |
| 285 | plist_add(&waiter->pi_list_entry, &task->pi_waiters); | ||
| 286 | __rt_mutex_adjust_prio(task); | 391 | __rt_mutex_adjust_prio(task); |
| 287 | 392 | ||
| 288 | } else if (top_waiter == waiter) { | 393 | } else if (top_waiter == waiter) { |
| 289 | /* Deboost the owner */ | 394 | /* Deboost the owner */ |
| 290 | plist_del(&waiter->pi_list_entry, &task->pi_waiters); | 395 | rt_mutex_dequeue_pi(task, waiter); |
| 291 | waiter = rt_mutex_top_waiter(lock); | 396 | waiter = rt_mutex_top_waiter(lock); |
| 292 | waiter->pi_list_entry.prio = waiter->list_entry.prio; | 397 | rt_mutex_enqueue_pi(task, waiter); |
| 293 | plist_add(&waiter->pi_list_entry, &task->pi_waiters); | ||
| 294 | __rt_mutex_adjust_prio(task); | 398 | __rt_mutex_adjust_prio(task); |
| 295 | } | 399 | } |
| 296 | 400 | ||
| @@ -355,7 +459,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, | |||
| 355 | * 3) it is top waiter | 459 | * 3) it is top waiter |
| 356 | */ | 460 | */ |
| 357 | if (rt_mutex_has_waiters(lock)) { | 461 | if (rt_mutex_has_waiters(lock)) { |
| 358 | if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { | 462 | if (task->prio >= rt_mutex_top_waiter(lock)->prio) { |
| 359 | if (!waiter || waiter != rt_mutex_top_waiter(lock)) | 463 | if (!waiter || waiter != rt_mutex_top_waiter(lock)) |
| 360 | return 0; | 464 | return 0; |
| 361 | } | 465 | } |
| @@ -369,7 +473,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, | |||
| 369 | 473 | ||
| 370 | /* remove the queued waiter. */ | 474 | /* remove the queued waiter. */ |
| 371 | if (waiter) { | 475 | if (waiter) { |
| 372 | plist_del(&waiter->list_entry, &lock->wait_list); | 476 | rt_mutex_dequeue(lock, waiter); |
| 373 | task->pi_blocked_on = NULL; | 477 | task->pi_blocked_on = NULL; |
| 374 | } | 478 | } |
| 375 | 479 | ||
| @@ -379,8 +483,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, | |||
| 379 | */ | 483 | */ |
| 380 | if (rt_mutex_has_waiters(lock)) { | 484 | if (rt_mutex_has_waiters(lock)) { |
| 381 | top = rt_mutex_top_waiter(lock); | 485 | top = rt_mutex_top_waiter(lock); |
| 382 | top->pi_list_entry.prio = top->list_entry.prio; | 486 | rt_mutex_enqueue_pi(task, top); |
| 383 | plist_add(&top->pi_list_entry, &task->pi_waiters); | ||
| 384 | } | 487 | } |
| 385 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 488 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
| 386 | } | 489 | } |
| @@ -416,13 +519,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
| 416 | __rt_mutex_adjust_prio(task); | 519 | __rt_mutex_adjust_prio(task); |
| 417 | waiter->task = task; | 520 | waiter->task = task; |
| 418 | waiter->lock = lock; | 521 | waiter->lock = lock; |
| 419 | plist_node_init(&waiter->list_entry, task->prio); | 522 | waiter->prio = task->prio; |
| 420 | plist_node_init(&waiter->pi_list_entry, task->prio); | ||
| 421 | 523 | ||
| 422 | /* Get the top priority waiter on the lock */ | 524 | /* Get the top priority waiter on the lock */ |
| 423 | if (rt_mutex_has_waiters(lock)) | 525 | if (rt_mutex_has_waiters(lock)) |
| 424 | top_waiter = rt_mutex_top_waiter(lock); | 526 | top_waiter = rt_mutex_top_waiter(lock); |
| 425 | plist_add(&waiter->list_entry, &lock->wait_list); | 527 | rt_mutex_enqueue(lock, waiter); |
| 426 | 528 | ||
| 427 | task->pi_blocked_on = waiter; | 529 | task->pi_blocked_on = waiter; |
| 428 | 530 | ||
| @@ -433,8 +535,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
| 433 | 535 | ||
| 434 | if (waiter == rt_mutex_top_waiter(lock)) { | 536 | if (waiter == rt_mutex_top_waiter(lock)) { |
| 435 | raw_spin_lock_irqsave(&owner->pi_lock, flags); | 537 | raw_spin_lock_irqsave(&owner->pi_lock, flags); |
| 436 | plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); | 538 | rt_mutex_dequeue_pi(owner, top_waiter); |
| 437 | plist_add(&waiter->pi_list_entry, &owner->pi_waiters); | 539 | rt_mutex_enqueue_pi(owner, waiter); |
| 438 | 540 | ||
| 439 | __rt_mutex_adjust_prio(owner); | 541 | __rt_mutex_adjust_prio(owner); |
| 440 | if (owner->pi_blocked_on) | 542 | if (owner->pi_blocked_on) |
| @@ -486,7 +588,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) | |||
| 486 | * boosted mode and go back to normal after releasing | 588 | * boosted mode and go back to normal after releasing |
| 487 | * lock->wait_lock. | 589 | * lock->wait_lock. |
| 488 | */ | 590 | */ |
| 489 | plist_del(&waiter->pi_list_entry, &current->pi_waiters); | 591 | rt_mutex_dequeue_pi(current, waiter); |
| 490 | 592 | ||
| 491 | rt_mutex_set_owner(lock, NULL); | 593 | rt_mutex_set_owner(lock, NULL); |
| 492 | 594 | ||
| @@ -510,7 +612,7 @@ static void remove_waiter(struct rt_mutex *lock, | |||
| 510 | int chain_walk = 0; | 612 | int chain_walk = 0; |
| 511 | 613 | ||
| 513 | raw_spin_lock_irqsave(&current->pi_lock, flags); | 615 | raw_spin_lock_irqsave(&current->pi_lock, flags); |
| 513 | plist_del(&waiter->list_entry, &lock->wait_list); | 615 | rt_mutex_dequeue(lock, waiter); |
| 514 | current->pi_blocked_on = NULL; | 616 | current->pi_blocked_on = NULL; |
| 515 | raw_spin_unlock_irqrestore(&current->pi_lock, flags); | 617 | raw_spin_unlock_irqrestore(&current->pi_lock, flags); |
| 516 | 618 | ||
| @@ -521,13 +623,13 @@ static void remove_waiter(struct rt_mutex *lock, | |||
| 521 | 623 | ||
| 522 | raw_spin_lock_irqsave(&owner->pi_lock, flags); | 624 | raw_spin_lock_irqsave(&owner->pi_lock, flags); |
| 523 | 625 | ||
| 524 | plist_del(&waiter->pi_list_entry, &owner->pi_waiters); | 626 | rt_mutex_dequeue_pi(owner, waiter); |
| 525 | 627 | ||
| 526 | if (rt_mutex_has_waiters(lock)) { | 628 | if (rt_mutex_has_waiters(lock)) { |
| 527 | struct rt_mutex_waiter *next; | 629 | struct rt_mutex_waiter *next; |
| 528 | 630 | ||
| 529 | next = rt_mutex_top_waiter(lock); | 631 | next = rt_mutex_top_waiter(lock); |
| 530 | plist_add(&next->pi_list_entry, &owner->pi_waiters); | 632 | rt_mutex_enqueue_pi(owner, next); |
| 531 | } | 633 | } |
| 532 | __rt_mutex_adjust_prio(owner); | 634 | __rt_mutex_adjust_prio(owner); |
| 533 | 635 | ||
| @@ -537,8 +639,6 @@ static void remove_waiter(struct rt_mutex *lock, | |||
| 537 | raw_spin_unlock_irqrestore(&owner->pi_lock, flags); | 639 | raw_spin_unlock_irqrestore(&owner->pi_lock, flags); |
| 538 | } | 640 | } |
| 539 | 641 | ||
| 540 | WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); | ||
| 541 | |||
| 542 | if (!chain_walk) | 642 | if (!chain_walk) |
| 543 | return; | 643 | return; |
| 544 | 644 | ||
| @@ -565,7 +665,8 @@ void rt_mutex_adjust_pi(struct task_struct *task) | |||
| 565 | raw_spin_lock_irqsave(&task->pi_lock, flags); | 665 | raw_spin_lock_irqsave(&task->pi_lock, flags); |
| 566 | 666 | ||
| 567 | waiter = task->pi_blocked_on; | 667 | waiter = task->pi_blocked_on; |
| 568 | if (!waiter || waiter->list_entry.prio == task->prio) { | 668 | if (!waiter || (waiter->prio == task->prio && |
| 669 | !dl_prio(task->prio))) { | ||
| 569 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 670 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
| 570 | return; | 671 | return; |
| 571 | } | 672 | } |
| @@ -638,6 +739,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
| 638 | int ret = 0; | 739 | int ret = 0; |
| 639 | 740 | ||
| 640 | debug_rt_mutex_init_waiter(&waiter); | 741 | debug_rt_mutex_init_waiter(&waiter); |
| 742 | RB_CLEAR_NODE(&waiter.pi_tree_entry); | ||
| 743 | RB_CLEAR_NODE(&waiter.tree_entry); | ||
| 641 | 744 | ||
| 642 | raw_spin_lock(&lock->wait_lock); | 745 | raw_spin_lock(&lock->wait_lock); |
| 643 | 746 | ||
| @@ -904,7 +1007,8 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name) | |||
| 904 | { | 1007 | { |
| 905 | lock->owner = NULL; | 1008 | lock->owner = NULL; |
| 906 | raw_spin_lock_init(&lock->wait_lock); | 1009 | raw_spin_lock_init(&lock->wait_lock); |
| 907 | plist_head_init(&lock->wait_list); | 1010 | lock->waiters = RB_ROOT; |
| 1011 | lock->waiters_leftmost = NULL; | ||
| 908 | 1012 | ||
| 909 | debug_rt_mutex_init(lock, name); | 1013 | debug_rt_mutex_init(lock, name); |
| 910 | } | 1014 | } |
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 53a66c85261b..7431a9c86f35 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h | |||
| @@ -40,13 +40,13 @@ extern void schedule_rt_mutex_test(struct rt_mutex *lock); | |||
| 40 | * This is the control structure for tasks blocked on a rt_mutex, | 40 | * This is the control structure for tasks blocked on a rt_mutex, |
| 41 | * which is allocated on the kernel stack on of the blocked task. | 41 | * which is allocated on the kernel stack on of the blocked task. |
| 42 | * | 42 | * |
| 43 | * @list_entry: pi node to enqueue into the mutex waiters list | 43 | * @tree_entry: pi node to enqueue into the mutex waiters tree |
| 44 | * @pi_list_entry: pi node to enqueue into the mutex owner waiters list | 44 | * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree |
| 45 | * @task: task reference to the blocked task | 45 | * @task: task reference to the blocked task |
| 46 | */ | 46 | */ |
| 47 | struct rt_mutex_waiter { | 47 | struct rt_mutex_waiter { |
| 48 | struct plist_node list_entry; | 48 | struct rb_node tree_entry; |
| 49 | struct plist_node pi_list_entry; | 49 | struct rb_node pi_tree_entry; |
| 50 | struct task_struct *task; | 50 | struct task_struct *task; |
| 51 | struct rt_mutex *lock; | 51 | struct rt_mutex *lock; |
| 52 | #ifdef CONFIG_DEBUG_RT_MUTEXES | 52 | #ifdef CONFIG_DEBUG_RT_MUTEXES |
| @@ -54,14 +54,15 @@ struct rt_mutex_waiter { | |||
| 54 | struct pid *deadlock_task_pid; | 54 | struct pid *deadlock_task_pid; |
| 55 | struct rt_mutex *deadlock_lock; | 55 | struct rt_mutex *deadlock_lock; |
| 56 | #endif | 56 | #endif |
| 57 | int prio; | ||
| 57 | }; | 58 | }; |
| 58 | 59 | ||
| 59 | /* | 60 | /* |
| 60 | * Various helpers to access the waiters-plist: | 61 | * Various helpers to access the waiters-tree: |
| 61 | */ | 62 | */ |
| 62 | static inline int rt_mutex_has_waiters(struct rt_mutex *lock) | 63 | static inline int rt_mutex_has_waiters(struct rt_mutex *lock) |
| 63 | { | 64 | { |
| 64 | return !plist_head_empty(&lock->wait_list); | 65 | return !RB_EMPTY_ROOT(&lock->waiters); |
| 65 | } | 66 | } |
| 66 | 67 | ||
| 67 | static inline struct rt_mutex_waiter * | 68 | static inline struct rt_mutex_waiter * |
| @@ -69,8 +70,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock) | |||
| 69 | { | 70 | { |
| 70 | struct rt_mutex_waiter *w; | 71 | struct rt_mutex_waiter *w; |
| 71 | 72 | ||
| 72 | w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, | 73 | w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter, |
| 73 | list_entry); | 74 | tree_entry); |
| 74 | BUG_ON(w->lock != lock); | 75 | BUG_ON(w->lock != lock); |
| 75 | 76 | ||
| 76 | return w; | 77 | return w; |
| @@ -78,14 +79,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock) | |||
| 78 | 79 | ||
| 79 | static inline int task_has_pi_waiters(struct task_struct *p) | 80 | static inline int task_has_pi_waiters(struct task_struct *p) |
| 80 | { | 81 | { |
| 81 | return !plist_head_empty(&p->pi_waiters); | 82 | return !RB_EMPTY_ROOT(&p->pi_waiters); |
| 82 | } | 83 | } |
| 83 | 84 | ||
| 84 | static inline struct rt_mutex_waiter * | 85 | static inline struct rt_mutex_waiter * |
| 85 | task_top_pi_waiter(struct task_struct *p) | 86 | task_top_pi_waiter(struct task_struct *p) |
| 86 | { | 87 | { |
| 87 | return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, | 88 | return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter, |
| 88 | pi_list_entry); | 89 | pi_tree_entry); |
| 89 | } | 90 | } |
| 90 | 91 | ||
| 91 | /* | 92 | /* |
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7b621409cf15..9a95c8c2af2a 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
| @@ -11,9 +11,10 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | |||
| 11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | 11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer |
| 12 | endif | 12 | endif |
| 13 | 13 | ||
| 14 | obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o | 14 | obj-y += core.o proc.o clock.o cputime.o |
| 15 | obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o | ||
| 15 | obj-y += wait.o completion.o | 16 | obj-y += wait.o completion.o |
| 16 | obj-$(CONFIG_SMP) += cpupri.o | 17 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o |
| 17 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 18 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
| 18 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 19 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
| 19 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | 20 | obj-$(CONFIG_SCHED_DEBUG) += debug.o |
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c3ae1446461c..6bd6a6731b21 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c | |||
| @@ -26,9 +26,10 @@ | |||
| 26 | * at 0 on boot (but people really shouldn't rely on that). | 26 | * at 0 on boot (but people really shouldn't rely on that). |
| 27 | * | 27 | * |
| 28 | * cpu_clock(i) -- can be used from any context, including NMI. | 28 | * cpu_clock(i) -- can be used from any context, including NMI. |
| 29 | * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) | ||
| 30 | * local_clock() -- is cpu_clock() on the current cpu. | 29 | * local_clock() -- is cpu_clock() on the current cpu. |
| 31 | * | 30 | * |
| 31 | * sched_clock_cpu(i) | ||
| 32 | * | ||
| 32 | * How: | 33 | * How: |
| 33 | * | 34 | * |
| 34 | * The implementation either uses sched_clock() when | 35 | * The implementation either uses sched_clock() when |
| @@ -50,15 +51,6 @@ | |||
| 50 | * Furthermore, explicit sleep and wakeup hooks allow us to account for time | 51 | * Furthermore, explicit sleep and wakeup hooks allow us to account for time |
| 51 | * that is otherwise invisible (TSC gets stopped). | 52 | * that is otherwise invisible (TSC gets stopped). |
| 52 | * | 53 | * |
| 53 | * | ||
| 54 | * Notes: | ||
| 55 | * | ||
| 56 | * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things | ||
| 57 | * like cpufreq interrupts that can change the base clock (TSC) multiplier | ||
| 58 | * and cause funny jumps in time -- although the filtering provided by | ||
| 59 | * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it | ||
| 60 | * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on | ||
| 61 | * sched_clock(). | ||
| 62 | */ | 54 | */ |
| 63 | #include <linux/spinlock.h> | 55 | #include <linux/spinlock.h> |
| 64 | #include <linux/hardirq.h> | 56 | #include <linux/hardirq.h> |
| @@ -66,6 +58,8 @@ | |||
| 66 | #include <linux/percpu.h> | 58 | #include <linux/percpu.h> |
| 67 | #include <linux/ktime.h> | 59 | #include <linux/ktime.h> |
| 68 | #include <linux/sched.h> | 60 | #include <linux/sched.h> |
| 61 | #include <linux/static_key.h> | ||
| 62 | #include <linux/workqueue.h> | ||
| 69 | 63 | ||
| 70 | /* | 64 | /* |
| 71 | * Scheduler clock - returns current time in nanosec units. | 65 | * Scheduler clock - returns current time in nanosec units. |
| @@ -82,7 +76,37 @@ EXPORT_SYMBOL_GPL(sched_clock); | |||
| 82 | __read_mostly int sched_clock_running; | 76 | __read_mostly int sched_clock_running; |
| 83 | 77 | ||
| 84 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 78 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
| 85 | __read_mostly int sched_clock_stable; | 79 | static struct static_key __sched_clock_stable = STATIC_KEY_INIT; |
| 80 | |||
| 81 | int sched_clock_stable(void) | ||
| 82 | { | ||
| 83 | if (static_key_false(&__sched_clock_stable)) | ||
| 84 | return false; | ||
| 85 | return true; | ||
| 86 | } | ||
| 87 | |||
| 88 | void set_sched_clock_stable(void) | ||
| 89 | { | ||
| 90 | if (!sched_clock_stable()) | ||
| 91 | static_key_slow_dec(&__sched_clock_stable); | ||
| 92 | } | ||
| 93 | |||
| 94 | static void __clear_sched_clock_stable(struct work_struct *work) | ||
| 95 | { | ||
| 96 | /* XXX worry about clock continuity */ | ||
| 97 | if (sched_clock_stable()) | ||
| 98 | static_key_slow_inc(&__sched_clock_stable); | ||
| 99 | } | ||
| 100 | |||
| 101 | static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable); | ||
| 102 | |||
| 103 | void clear_sched_clock_stable(void) | ||
| 104 | { | ||
| 105 | if (keventd_up()) | ||
| 106 | schedule_work(&sched_clock_work); | ||
| 107 | else | ||
| 108 | __clear_sched_clock_stable(&sched_clock_work); | ||
| 109 | } | ||
| 86 | 110 | ||
| 87 | struct sched_clock_data { | 111 | struct sched_clock_data { |
| 88 | u64 tick_raw; | 112 | u64 tick_raw; |
| @@ -242,20 +266,20 @@ u64 sched_clock_cpu(int cpu) | |||
| 242 | struct sched_clock_data *scd; | 266 | struct sched_clock_data *scd; |
| 243 | u64 clock; | 267 | u64 clock; |
| 244 | 268 | ||
| 245 | WARN_ON_ONCE(!irqs_disabled()); | 269 | if (sched_clock_stable()) |
| 246 | |||
| 247 | if (sched_clock_stable) | ||
| 248 | return sched_clock(); | 270 | return sched_clock(); |
| 249 | 271 | ||
| 250 | if (unlikely(!sched_clock_running)) | 272 | if (unlikely(!sched_clock_running)) |
| 251 | return 0ull; | 273 | return 0ull; |
| 252 | 274 | ||
| 275 | preempt_disable(); | ||
| 253 | scd = cpu_sdc(cpu); | 276 | scd = cpu_sdc(cpu); |
| 254 | 277 | ||
| 255 | if (cpu != smp_processor_id()) | 278 | if (cpu != smp_processor_id()) |
| 256 | clock = sched_clock_remote(scd); | 279 | clock = sched_clock_remote(scd); |
| 257 | else | 280 | else |
| 258 | clock = sched_clock_local(scd); | 281 | clock = sched_clock_local(scd); |
| 282 | preempt_enable(); | ||
| 259 | 283 | ||
| 260 | return clock; | 284 | return clock; |
| 261 | } | 285 | } |
| @@ -265,7 +289,7 @@ void sched_clock_tick(void) | |||
| 265 | struct sched_clock_data *scd; | 289 | struct sched_clock_data *scd; |
| 266 | u64 now, now_gtod; | 290 | u64 now, now_gtod; |
| 267 | 291 | ||
| 268 | if (sched_clock_stable) | 292 | if (sched_clock_stable()) |
| 269 | return; | 293 | return; |
| 270 | 294 | ||
| 271 | if (unlikely(!sched_clock_running)) | 295 | if (unlikely(!sched_clock_running)) |
| @@ -316,14 +340,10 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | |||
| 316 | */ | 340 | */ |
| 317 | u64 cpu_clock(int cpu) | 341 | u64 cpu_clock(int cpu) |
| 318 | { | 342 | { |
| 319 | u64 clock; | 343 | if (static_key_false(&__sched_clock_stable)) |
| 320 | unsigned long flags; | 344 | return sched_clock_cpu(cpu); |
| 321 | |||
| 322 | local_irq_save(flags); | ||
| 323 | clock = sched_clock_cpu(cpu); | ||
| 324 | local_irq_restore(flags); | ||
| 325 | 345 | ||
| 326 | return clock; | 346 | return sched_clock(); |
| 327 | } | 347 | } |
| 328 | 348 | ||
| 329 | /* | 349 | /* |
| @@ -335,14 +355,10 @@ u64 cpu_clock(int cpu) | |||
| 335 | */ | 355 | */ |
| 336 | u64 local_clock(void) | 356 | u64 local_clock(void) |
| 337 | { | 357 | { |
| 338 | u64 clock; | 358 | if (static_key_false(&__sched_clock_stable)) |
| 339 | unsigned long flags; | 359 | return sched_clock_cpu(raw_smp_processor_id()); |
| 340 | 360 | ||
| 341 | local_irq_save(flags); | 361 | return sched_clock(); |
| 342 | clock = sched_clock_cpu(smp_processor_id()); | ||
| 343 | local_irq_restore(flags); | ||
| 344 | |||
| 345 | return clock; | ||
| 346 | } | 362 | } |
| 347 | 363 | ||
| 348 | #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ | 364 | #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ |
| @@ -362,12 +378,12 @@ u64 sched_clock_cpu(int cpu) | |||
| 362 | 378 | ||
| 363 | u64 cpu_clock(int cpu) | 379 | u64 cpu_clock(int cpu) |
| 364 | { | 380 | { |
| 365 | return sched_clock_cpu(cpu); | 381 | return sched_clock(); |
| 366 | } | 382 | } |
| 367 | 383 | ||
| 368 | u64 local_clock(void) | 384 | u64 local_clock(void) |
| 369 | { | 385 | { |
| 370 | return sched_clock_cpu(0); | 386 | return sched_clock(); |
| 371 | } | 387 | } |
| 372 | 388 | ||
| 373 | #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ | 389 | #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a88f4a485c5e..36c951b7eef8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -296,8 +296,6 @@ __read_mostly int scheduler_running; | |||
| 296 | */ | 296 | */ |
| 297 | int sysctl_sched_rt_runtime = 950000; | 297 | int sysctl_sched_rt_runtime = 950000; |
| 298 | 298 | ||
| 299 | |||
| 300 | |||
| 301 | /* | 299 | /* |
| 302 | * __task_rq_lock - lock the rq @p resides on. | 300 | * __task_rq_lock - lock the rq @p resides on. |
| 303 | */ | 301 | */ |
| @@ -899,7 +897,9 @@ static inline int normal_prio(struct task_struct *p) | |||
| 899 | { | 897 | { |
| 900 | int prio; | 898 | int prio; |
| 901 | 899 | ||
| 902 | if (task_has_rt_policy(p)) | 900 | if (task_has_dl_policy(p)) |
| 901 | prio = MAX_DL_PRIO-1; | ||
| 902 | else if (task_has_rt_policy(p)) | ||
| 903 | prio = MAX_RT_PRIO-1 - p->rt_priority; | 903 | prio = MAX_RT_PRIO-1 - p->rt_priority; |
| 904 | else | 904 | else |
| 905 | prio = __normal_prio(p); | 905 | prio = __normal_prio(p); |
| @@ -945,7 +945,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
| 945 | if (prev_class->switched_from) | 945 | if (prev_class->switched_from) |
| 946 | prev_class->switched_from(rq, p); | 946 | prev_class->switched_from(rq, p); |
| 947 | p->sched_class->switched_to(rq, p); | 947 | p->sched_class->switched_to(rq, p); |
| 948 | } else if (oldprio != p->prio) | 948 | } else if (oldprio != p->prio || dl_task(p)) |
| 949 | p->sched_class->prio_changed(rq, p, oldprio); | 949 | p->sched_class->prio_changed(rq, p, oldprio); |
| 950 | } | 950 | } |
| 951 | 951 | ||
| @@ -1499,8 +1499,7 @@ void scheduler_ipi(void) | |||
| 1499 | * TIF_NEED_RESCHED remotely (for the first time) will also send | 1499 | * TIF_NEED_RESCHED remotely (for the first time) will also send |
| 1500 | * this IPI. | 1500 | * this IPI. |
| 1501 | */ | 1501 | */ |
| 1502 | if (tif_need_resched()) | 1502 | preempt_fold_need_resched(); |
| 1503 | set_preempt_need_resched(); | ||
| 1504 | 1503 | ||
| 1505 | if (llist_empty(&this_rq()->wake_list) | 1504 | if (llist_empty(&this_rq()->wake_list) |
| 1506 | && !tick_nohz_full_cpu(smp_processor_id()) | 1505 | && !tick_nohz_full_cpu(smp_processor_id()) |
| @@ -1717,6 +1716,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 1717 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 1716 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
| 1718 | #endif | 1717 | #endif |
| 1719 | 1718 | ||
| 1719 | RB_CLEAR_NODE(&p->dl.rb_node); | ||
| 1720 | hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 1721 | p->dl.dl_runtime = p->dl.runtime = 0; | ||
| 1722 | p->dl.dl_deadline = p->dl.deadline = 0; | ||
| 1723 | p->dl.dl_period = 0; | ||
| 1724 | p->dl.flags = 0; | ||
| 1725 | |||
| 1720 | INIT_LIST_HEAD(&p->rt.run_list); | 1726 | INIT_LIST_HEAD(&p->rt.run_list); |
| 1721 | 1727 | ||
| 1722 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 1728 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
| @@ -1768,7 +1774,7 @@ void set_numabalancing_state(bool enabled) | |||
| 1768 | /* | 1774 | /* |
| 1769 | * fork()/clone()-time setup: | 1775 | * fork()/clone()-time setup: |
| 1770 | */ | 1776 | */ |
| 1771 | void sched_fork(unsigned long clone_flags, struct task_struct *p) | 1777 | int sched_fork(unsigned long clone_flags, struct task_struct *p) |
| 1772 | { | 1778 | { |
| 1773 | unsigned long flags; | 1779 | unsigned long flags; |
| 1774 | int cpu = get_cpu(); | 1780 | int cpu = get_cpu(); |
| @@ -1790,7 +1796,7 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 1790 | * Revert to default priority/policy on fork if requested. | 1796 | * Revert to default priority/policy on fork if requested. |
| 1791 | */ | 1797 | */ |
| 1792 | if (unlikely(p->sched_reset_on_fork)) { | 1798 | if (unlikely(p->sched_reset_on_fork)) { |
| 1793 | if (task_has_rt_policy(p)) { | 1799 | if (task_has_dl_policy(p) || task_has_rt_policy(p)) { |
| 1794 | p->policy = SCHED_NORMAL; | 1800 | p->policy = SCHED_NORMAL; |
| 1795 | p->static_prio = NICE_TO_PRIO(0); | 1801 | p->static_prio = NICE_TO_PRIO(0); |
| 1796 | p->rt_priority = 0; | 1802 | p->rt_priority = 0; |
| @@ -1807,8 +1813,14 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 1807 | p->sched_reset_on_fork = 0; | 1813 | p->sched_reset_on_fork = 0; |
| 1808 | } | 1814 | } |
| 1809 | 1815 | ||
| 1810 | if (!rt_prio(p->prio)) | 1816 | if (dl_prio(p->prio)) { |
| 1817 | put_cpu(); | ||
| 1818 | return -EAGAIN; | ||
| 1819 | } else if (rt_prio(p->prio)) { | ||
| 1820 | p->sched_class = &rt_sched_class; | ||
| 1821 | } else { | ||
| 1811 | p->sched_class = &fair_sched_class; | 1822 | p->sched_class = &fair_sched_class; |
| 1823 | } | ||
| 1812 | 1824 | ||
| 1813 | if (p->sched_class->task_fork) | 1825 | if (p->sched_class->task_fork) |
| 1814 | p->sched_class->task_fork(p); | 1826 | p->sched_class->task_fork(p); |
| @@ -1834,11 +1846,124 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 1834 | init_task_preempt_count(p); | 1846 | init_task_preempt_count(p); |
| 1835 | #ifdef CONFIG_SMP | 1847 | #ifdef CONFIG_SMP |
| 1836 | plist_node_init(&p->pushable_tasks, MAX_PRIO); | 1848 | plist_node_init(&p->pushable_tasks, MAX_PRIO); |
| 1849 | RB_CLEAR_NODE(&p->pushable_dl_tasks); | ||
| 1837 | #endif | 1850 | #endif |
| 1838 | 1851 | ||
| 1839 | put_cpu(); | 1852 | put_cpu(); |
| 1853 | return 0; | ||
| 1854 | } | ||
| 1855 | |||
| 1856 | unsigned long to_ratio(u64 period, u64 runtime) | ||
| 1857 | { | ||
| 1858 | if (runtime == RUNTIME_INF) | ||
| 1859 | return 1ULL << 20; | ||
| 1860 | |||
| 1861 | /* | ||
| 1862 | * Doing this here saves a lot of checks in all | ||
| 1863 | * the calling paths, and returning zero seems | ||
| 1864 | * safe for them anyway. | ||
| 1865 | */ | ||
| 1866 | if (period == 0) | ||
| 1867 | return 0; | ||
| 1868 | |||
| 1869 | return div64_u64(runtime << 20, period); | ||
| 1870 | } | ||
| 1871 | |||
| 1872 | #ifdef CONFIG_SMP | ||
| 1873 | inline struct dl_bw *dl_bw_of(int i) | ||
| 1874 | { | ||
| 1875 | return &cpu_rq(i)->rd->dl_bw; | ||
| 1840 | } | 1876 | } |
| 1841 | 1877 | ||
| 1878 | static inline int dl_bw_cpus(int i) | ||
| 1879 | { | ||
| 1880 | struct root_domain *rd = cpu_rq(i)->rd; | ||
| 1881 | int cpus = 0; | ||
| 1882 | |||
| 1883 | for_each_cpu_and(i, rd->span, cpu_active_mask) | ||
| 1884 | cpus++; | ||
| 1885 | |||
| 1886 | return cpus; | ||
| 1887 | } | ||
| 1888 | #else | ||
| 1889 | inline struct dl_bw *dl_bw_of(int i) | ||
| 1890 | { | ||
| 1891 | return &cpu_rq(i)->dl.dl_bw; | ||
| 1892 | } | ||
| 1893 | |||
| 1894 | static inline int dl_bw_cpus(int i) | ||
| 1895 | { | ||
| 1896 | return 1; | ||
| 1897 | } | ||
| 1898 | #endif | ||
| 1899 | |||
| 1900 | static inline | ||
| 1901 | void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) | ||
| 1902 | { | ||
| 1903 | dl_b->total_bw -= tsk_bw; | ||
| 1904 | } | ||
| 1905 | |||
| 1906 | static inline | ||
| 1907 | void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) | ||
| 1908 | { | ||
| 1909 | dl_b->total_bw += tsk_bw; | ||
| 1910 | } | ||
| 1911 | |||
| 1912 | static inline | ||
| 1913 | bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) | ||
| 1914 | { | ||
| 1915 | return dl_b->bw != -1 && | ||
| 1916 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; | ||
| 1917 | } | ||
| 1918 | |||
| 1919 | /* | ||
| 1920 | * We must be sure that accepting a new task (or allowing changing the | ||
| 1921 | * parameters of an existing one) is consistent with the bandwidth | ||
| 1922 | * constraints. If yes, this function also accordingly updates the currently | ||
| 1923 | * allocated bandwidth to reflect the new situation. | ||
| 1924 | * | ||
| 1925 | * This function is called while holding p's rq->lock. | ||
| 1926 | */ | ||
| 1927 | static int dl_overflow(struct task_struct *p, int policy, | ||
| 1928 | const struct sched_attr *attr) | ||
| 1929 | { | ||
| 1930 | |||
| 1931 | struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); | ||
| 1932 | u64 period = attr->sched_period; | ||
| 1933 | u64 runtime = attr->sched_runtime; | ||
| 1934 | u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; | ||
| 1935 | int cpus, err = -1; | ||
| 1936 | |||
| 1937 | if (new_bw == p->dl.dl_bw) | ||
| 1938 | return 0; | ||
| 1939 | |||
| 1940 | /* | ||
| 1941 | * Whether a task enters, leaves, or stays -deadline but changes | ||
| 1942 | * its parameters, we may need to update the total allocated | ||
| 1943 | * bandwidth of the container accordingly. | ||
| 1944 | */ | ||
| 1945 | raw_spin_lock(&dl_b->lock); | ||
| 1946 | cpus = dl_bw_cpus(task_cpu(p)); | ||
| 1947 | if (dl_policy(policy) && !task_has_dl_policy(p) && | ||
| 1948 | !__dl_overflow(dl_b, cpus, 0, new_bw)) { | ||
| 1949 | __dl_add(dl_b, new_bw); | ||
| 1950 | err = 0; | ||
| 1951 | } else if (dl_policy(policy) && task_has_dl_policy(p) && | ||
| 1952 | !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { | ||
| 1953 | __dl_clear(dl_b, p->dl.dl_bw); | ||
| 1954 | __dl_add(dl_b, new_bw); | ||
| 1955 | err = 0; | ||
| 1956 | } else if (!dl_policy(policy) && task_has_dl_policy(p)) { | ||
| 1957 | __dl_clear(dl_b, p->dl.dl_bw); | ||
| 1958 | err = 0; | ||
| 1959 | } | ||
| 1960 | raw_spin_unlock(&dl_b->lock); | ||
| 1961 | |||
| 1962 | return err; | ||
| 1963 | } | ||
| 1964 | |||
| 1965 | extern void init_dl_bw(struct dl_bw *dl_b); | ||
| 1966 | |||
| 1842 | /* | 1967 | /* |
| 1843 | * wake_up_new_task - wake up a newly created task for the first time. | 1968 | * wake_up_new_task - wake up a newly created task for the first time. |
| 1844 | * | 1969 | * |
| @@ -2003,6 +2128,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
| 2003 | if (unlikely(prev_state == TASK_DEAD)) { | 2128 | if (unlikely(prev_state == TASK_DEAD)) { |
| 2004 | task_numa_free(prev); | 2129 | task_numa_free(prev); |
| 2005 | 2130 | ||
| 2131 | if (prev->sched_class->task_dead) | ||
| 2132 | prev->sched_class->task_dead(prev); | ||
| 2133 | |||
| 2006 | /* | 2134 | /* |
| 2007 | * Remove function-return probe instances associated with this | 2135 | * Remove function-return probe instances associated with this |
| 2008 | * task and put them back on the free list. | 2136 | * task and put them back on the free list. |
| @@ -2296,7 +2424,7 @@ void scheduler_tick(void) | |||
| 2296 | 2424 | ||
| 2297 | #ifdef CONFIG_SMP | 2425 | #ifdef CONFIG_SMP |
| 2298 | rq->idle_balance = idle_cpu(cpu); | 2426 | rq->idle_balance = idle_cpu(cpu); |
| 2299 | trigger_load_balance(rq, cpu); | 2427 | trigger_load_balance(rq); |
| 2300 | #endif | 2428 | #endif |
| 2301 | rq_last_tick_reset(rq); | 2429 | rq_last_tick_reset(rq); |
| 2302 | } | 2430 | } |
| @@ -2414,10 +2542,10 @@ static inline void schedule_debug(struct task_struct *prev) | |||
| 2414 | { | 2542 | { |
| 2415 | /* | 2543 | /* |
| 2416 | * Test if we are atomic. Since do_exit() needs to call into | 2544 | * Test if we are atomic. Since do_exit() needs to call into |
| 2417 | * schedule() atomically, we ignore that path for now. | 2545 | * schedule() atomically, we ignore that path. Otherwise whine |
| 2418 | * Otherwise, whine if we are scheduling when we should not be. | 2546 | * if we are scheduling when we should not. |
| 2419 | */ | 2547 | */ |
| 2420 | if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) | 2548 | if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) |
| 2421 | __schedule_bug(prev); | 2549 | __schedule_bug(prev); |
| 2422 | rcu_sleep_check(); | 2550 | rcu_sleep_check(); |
| 2423 | 2551 | ||
| @@ -2761,11 +2889,11 @@ EXPORT_SYMBOL(sleep_on_timeout); | |||
| 2761 | */ | 2889 | */ |
| 2762 | void rt_mutex_setprio(struct task_struct *p, int prio) | 2890 | void rt_mutex_setprio(struct task_struct *p, int prio) |
| 2763 | { | 2891 | { |
| 2764 | int oldprio, on_rq, running; | 2892 | int oldprio, on_rq, running, enqueue_flag = 0; |
| 2765 | struct rq *rq; | 2893 | struct rq *rq; |
| 2766 | const struct sched_class *prev_class; | 2894 | const struct sched_class *prev_class; |
| 2767 | 2895 | ||
| 2768 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 2896 | BUG_ON(prio > MAX_PRIO); |
| 2769 | 2897 | ||
| 2770 | rq = __task_rq_lock(p); | 2898 | rq = __task_rq_lock(p); |
| 2771 | 2899 | ||
| @@ -2788,6 +2916,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 2788 | } | 2916 | } |
| 2789 | 2917 | ||
| 2790 | trace_sched_pi_setprio(p, prio); | 2918 | trace_sched_pi_setprio(p, prio); |
| 2919 | p->pi_top_task = rt_mutex_get_top_task(p); | ||
| 2791 | oldprio = p->prio; | 2920 | oldprio = p->prio; |
| 2792 | prev_class = p->sched_class; | 2921 | prev_class = p->sched_class; |
| 2793 | on_rq = p->on_rq; | 2922 | on_rq = p->on_rq; |
| @@ -2797,23 +2926,49 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 2797 | if (running) | 2926 | if (running) |
| 2798 | p->sched_class->put_prev_task(rq, p); | 2927 | p->sched_class->put_prev_task(rq, p); |
| 2799 | 2928 | ||
| 2800 | if (rt_prio(prio)) | 2929 | /* |
| 2930 | * Boosting conditions are: | ||
| 2931 | * 1. -rt task is running and holds mutex A | ||
| 2932 | * --> -dl task blocks on mutex A | ||
| 2933 | * | ||
| 2934 | * 2. -dl task is running and holds mutex A | ||
| 2935 | * --> -dl task blocks on mutex A and could preempt the | ||
| 2936 | * running task | ||
| 2937 | */ | ||
| 2938 | if (dl_prio(prio)) { | ||
| 2939 | if (!dl_prio(p->normal_prio) || (p->pi_top_task && | ||
| 2940 | dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { | ||
| 2941 | p->dl.dl_boosted = 1; | ||
| 2942 | p->dl.dl_throttled = 0; | ||
| 2943 | enqueue_flag = ENQUEUE_REPLENISH; | ||
| 2944 | } else | ||
| 2945 | p->dl.dl_boosted = 0; | ||
| 2946 | p->sched_class = &dl_sched_class; | ||
| 2947 | } else if (rt_prio(prio)) { | ||
| 2948 | if (dl_prio(oldprio)) | ||
| 2949 | p->dl.dl_boosted = 0; | ||
| 2950 | if (oldprio < prio) | ||
| 2951 | enqueue_flag = ENQUEUE_HEAD; | ||
| 2801 | p->sched_class = &rt_sched_class; | 2952 | p->sched_class = &rt_sched_class; |
| 2802 | else | 2953 | } else { |
| 2954 | if (dl_prio(oldprio)) | ||
| 2955 | p->dl.dl_boosted = 0; | ||
| 2803 | p->sched_class = &fair_sched_class; | 2956 | p->sched_class = &fair_sched_class; |
| 2957 | } | ||
| 2804 | 2958 | ||
| 2805 | p->prio = prio; | 2959 | p->prio = prio; |
| 2806 | 2960 | ||
| 2807 | if (running) | 2961 | if (running) |
| 2808 | p->sched_class->set_curr_task(rq); | 2962 | p->sched_class->set_curr_task(rq); |
| 2809 | if (on_rq) | 2963 | if (on_rq) |
| 2810 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); | 2964 | enqueue_task(rq, p, enqueue_flag); |
| 2811 | 2965 | ||
| 2812 | check_class_changed(rq, p, prev_class, oldprio); | 2966 | check_class_changed(rq, p, prev_class, oldprio); |
| 2813 | out_unlock: | 2967 | out_unlock: |
| 2814 | __task_rq_unlock(rq); | 2968 | __task_rq_unlock(rq); |
| 2815 | } | 2969 | } |
| 2816 | #endif | 2970 | #endif |
| 2971 | |||
| 2817 | void set_user_nice(struct task_struct *p, long nice) | 2972 | void set_user_nice(struct task_struct *p, long nice) |
| 2818 | { | 2973 | { |
| 2819 | int old_prio, delta, on_rq; | 2974 | int old_prio, delta, on_rq; |
| @@ -2831,9 +2986,9 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 2831 | * The RT priorities are set via sched_setscheduler(), but we still | 2986 | * The RT priorities are set via sched_setscheduler(), but we still |
| 2832 | * allow the 'normal' nice value to be set - but as expected | 2987 | * allow the 'normal' nice value to be set - but as expected |
| 2833 | * it won't have any effect on scheduling until the task is | 2988 | * it won't have any effect on scheduling until the task is |
| 2834 | * SCHED_FIFO/SCHED_RR: | 2989 | * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: |
| 2835 | */ | 2990 | */ |
| 2836 | if (task_has_rt_policy(p)) { | 2991 | if (task_has_dl_policy(p) || task_has_rt_policy(p)) { |
| 2837 | p->static_prio = NICE_TO_PRIO(nice); | 2992 | p->static_prio = NICE_TO_PRIO(nice); |
| 2838 | goto out_unlock; | 2993 | goto out_unlock; |
| 2839 | } | 2994 | } |
| @@ -2988,22 +3143,95 @@ static struct task_struct *find_process_by_pid(pid_t pid) | |||
| 2988 | return pid ? find_task_by_vpid(pid) : current; | 3143 | return pid ? find_task_by_vpid(pid) : current; |
| 2989 | } | 3144 | } |
| 2990 | 3145 | ||
| 2991 | /* Actually do priority change: must hold rq lock. */ | 3146 | /* |
| 3147 | * This function initializes the sched_dl_entity of a newly becoming | ||
| 3148 | * SCHED_DEADLINE task. | ||
| 3149 | * | ||
| 3150 | * Only the static values are considered here, the actual runtime and the | ||
| 3151 | * absolute deadline will be properly calculated when the task is enqueued | ||
| 3152 | * for the first time with its new policy. | ||
| 3153 | */ | ||
| 2992 | static void | 3154 | static void |
| 2993 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | 3155 | __setparam_dl(struct task_struct *p, const struct sched_attr *attr) |
| 3156 | { | ||
| 3157 | struct sched_dl_entity *dl_se = &p->dl; | ||
| 3158 | |||
| 3159 | init_dl_task_timer(dl_se); | ||
| 3160 | dl_se->dl_runtime = attr->sched_runtime; | ||
| 3161 | dl_se->dl_deadline = attr->sched_deadline; | ||
| 3162 | dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; | ||
| 3163 | dl_se->flags = attr->sched_flags; | ||
| 3164 | dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); | ||
| 3165 | dl_se->dl_throttled = 0; | ||
| 3166 | dl_se->dl_new = 1; | ||
| 3167 | } | ||
| 3168 | |||
| 3169 | /* Actually do priority change: must hold pi & rq lock. */ | ||
| 3170 | static void __setscheduler(struct rq *rq, struct task_struct *p, | ||
| 3171 | const struct sched_attr *attr) | ||
| 2994 | { | 3172 | { |
| 3173 | int policy = attr->sched_policy; | ||
| 3174 | |||
| 3175 | if (policy == -1) /* setparam */ | ||
| 3176 | policy = p->policy; | ||
| 3177 | |||
| 2995 | p->policy = policy; | 3178 | p->policy = policy; |
| 2996 | p->rt_priority = prio; | 3179 | |
| 3180 | if (dl_policy(policy)) | ||
| 3181 | __setparam_dl(p, attr); | ||
| 3182 | else if (fair_policy(policy)) | ||
| 3183 | p->static_prio = NICE_TO_PRIO(attr->sched_nice); | ||
| 3184 | |||
| 3185 | /* | ||
| 3186 | * __sched_setscheduler() ensures attr->sched_priority == 0 when | ||
| 3187 | * !rt_policy. Always setting this ensures that things like | ||
| 3188 | * getparam()/getattr() don't report silly values for !rt tasks. | ||
| 3189 | */ | ||
| 3190 | p->rt_priority = attr->sched_priority; | ||
| 3191 | |||
| 2997 | p->normal_prio = normal_prio(p); | 3192 | p->normal_prio = normal_prio(p); |
| 2998 | /* we are holding p->pi_lock already */ | ||
| 2999 | p->prio = rt_mutex_getprio(p); | 3193 | p->prio = rt_mutex_getprio(p); |
| 3000 | if (rt_prio(p->prio)) | 3194 | |
| 3195 | if (dl_prio(p->prio)) | ||
| 3196 | p->sched_class = &dl_sched_class; | ||
| 3197 | else if (rt_prio(p->prio)) | ||
| 3001 | p->sched_class = &rt_sched_class; | 3198 | p->sched_class = &rt_sched_class; |
| 3002 | else | 3199 | else |
| 3003 | p->sched_class = &fair_sched_class; | 3200 | p->sched_class = &fair_sched_class; |
| 3201 | |||
| 3004 | set_load_weight(p); | 3202 | set_load_weight(p); |
| 3005 | } | 3203 | } |
| 3006 | 3204 | ||
| 3205 | static void | ||
| 3206 | __getparam_dl(struct task_struct *p, struct sched_attr *attr) | ||
| 3207 | { | ||
| 3208 | struct sched_dl_entity *dl_se = &p->dl; | ||
| 3209 | |||
| 3210 | attr->sched_priority = p->rt_priority; | ||
| 3211 | attr->sched_runtime = dl_se->dl_runtime; | ||
| 3212 | attr->sched_deadline = dl_se->dl_deadline; | ||
| 3213 | attr->sched_period = dl_se->dl_period; | ||
| 3214 | attr->sched_flags = dl_se->flags; | ||
| 3215 | } | ||
| 3216 | |||
| 3217 | /* | ||
| 3218 | * This function validates the new parameters of a -deadline task. | ||
| 3219 | * We require the deadline to be non-zero and greater than or equal | ||
| 3220 | * to the runtime, and the period to be either zero or greater than | ||
| 3221 | * or equal to the deadline. Furthermore, we have to be sure that | ||
| 3222 | * user parameters are above the internal resolution (1us); we | ||
| 3223 | * check sched_runtime only since it is always the smaller one. | ||
| 3224 | */ | ||
| 3225 | static bool | ||
| 3226 | __checkparam_dl(const struct sched_attr *attr) | ||
| 3227 | { | ||
| 3228 | return attr && attr->sched_deadline != 0 && | ||
| 3229 | (attr->sched_period == 0 || | ||
| 3230 | (s64)(attr->sched_period - attr->sched_deadline) >= 0) && | ||
| 3231 | (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && | ||
| 3232 | attr->sched_runtime >= (2 << (DL_SCALE - 1)); | ||
| 3233 | } | ||
| 3234 | |||
| 3007 | /* | 3235 | /* |
| 3008 | * check the target process has a UID that matches the current process's | 3236 | * check the target process has a UID that matches the current process's |
| 3009 | */ | 3237 | */ |
| @@ -3020,10 +3248,12 @@ static bool check_same_owner(struct task_struct *p) | |||
| 3020 | return match; | 3248 | return match; |
| 3021 | } | 3249 | } |
| 3022 | 3250 | ||
| 3023 | static int __sched_setscheduler(struct task_struct *p, int policy, | 3251 | static int __sched_setscheduler(struct task_struct *p, |
| 3024 | const struct sched_param *param, bool user) | 3252 | const struct sched_attr *attr, |
| 3253 | bool user) | ||
| 3025 | { | 3254 | { |
| 3026 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 3255 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
| 3256 | int policy = attr->sched_policy; | ||
| 3027 | unsigned long flags; | 3257 | unsigned long flags; |
| 3028 | const struct sched_class *prev_class; | 3258 | const struct sched_class *prev_class; |
| 3029 | struct rq *rq; | 3259 | struct rq *rq; |
| @@ -3037,31 +3267,40 @@ recheck: | |||
| 3037 | reset_on_fork = p->sched_reset_on_fork; | 3267 | reset_on_fork = p->sched_reset_on_fork; |
| 3038 | policy = oldpolicy = p->policy; | 3268 | policy = oldpolicy = p->policy; |
| 3039 | } else { | 3269 | } else { |
| 3040 | reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); | 3270 | reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); |
| 3041 | policy &= ~SCHED_RESET_ON_FORK; | ||
| 3042 | 3271 | ||
| 3043 | if (policy != SCHED_FIFO && policy != SCHED_RR && | 3272 | if (policy != SCHED_DEADLINE && |
| 3273 | policy != SCHED_FIFO && policy != SCHED_RR && | ||
| 3044 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | 3274 | policy != SCHED_NORMAL && policy != SCHED_BATCH && |
| 3045 | policy != SCHED_IDLE) | 3275 | policy != SCHED_IDLE) |
| 3046 | return -EINVAL; | 3276 | return -EINVAL; |
| 3047 | } | 3277 | } |
| 3048 | 3278 | ||
| 3279 | if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) | ||
| 3280 | return -EINVAL; | ||
| 3281 | |||
| 3049 | /* | 3282 | /* |
| 3050 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 3283 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
| 3051 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, | 3284 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, |
| 3052 | * SCHED_BATCH and SCHED_IDLE is 0. | 3285 | * SCHED_BATCH and SCHED_IDLE is 0. |
| 3053 | */ | 3286 | */ |
| 3054 | if (param->sched_priority < 0 || | 3287 | if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || |
| 3055 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || | 3288 | (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) |
| 3056 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) | ||
| 3057 | return -EINVAL; | 3289 | return -EINVAL; |
| 3058 | if (rt_policy(policy) != (param->sched_priority != 0)) | 3290 | if ((dl_policy(policy) && !__checkparam_dl(attr)) || |
| 3291 | (rt_policy(policy) != (attr->sched_priority != 0))) | ||
| 3059 | return -EINVAL; | 3292 | return -EINVAL; |
| 3060 | 3293 | ||
| 3061 | /* | 3294 | /* |
| 3062 | * Allow unprivileged RT tasks to decrease priority: | 3295 | * Allow unprivileged RT tasks to decrease priority: |
| 3063 | */ | 3296 | */ |
| 3064 | if (user && !capable(CAP_SYS_NICE)) { | 3297 | if (user && !capable(CAP_SYS_NICE)) { |
| 3298 | if (fair_policy(policy)) { | ||
| 3299 | if (attr->sched_nice < TASK_NICE(p) && | ||
| 3300 | !can_nice(p, attr->sched_nice)) | ||
| 3301 | return -EPERM; | ||
| 3302 | } | ||
| 3303 | |||
| 3065 | if (rt_policy(policy)) { | 3304 | if (rt_policy(policy)) { |
| 3066 | unsigned long rlim_rtprio = | 3305 | unsigned long rlim_rtprio = |
| 3067 | task_rlimit(p, RLIMIT_RTPRIO); | 3306 | task_rlimit(p, RLIMIT_RTPRIO); |
| @@ -3071,8 +3310,8 @@ recheck: | |||
| 3071 | return -EPERM; | 3310 | return -EPERM; |
| 3072 | 3311 | ||
| 3073 | /* can't increase priority */ | 3312 | /* can't increase priority */ |
| 3074 | if (param->sched_priority > p->rt_priority && | 3313 | if (attr->sched_priority > p->rt_priority && |
| 3075 | param->sched_priority > rlim_rtprio) | 3314 | attr->sched_priority > rlim_rtprio) |
| 3076 | return -EPERM; | 3315 | return -EPERM; |
| 3077 | } | 3316 | } |
| 3078 | 3317 | ||
| @@ -3120,14 +3359,21 @@ recheck: | |||
| 3120 | /* | 3359 | /* |
| 3121 | * If not changing anything there's no need to proceed further: | 3360 | * If not changing anything there's no need to proceed further: |
| 3122 | */ | 3361 | */ |
| 3123 | if (unlikely(policy == p->policy && (!rt_policy(policy) || | 3362 | if (unlikely(policy == p->policy)) { |
| 3124 | param->sched_priority == p->rt_priority))) { | 3363 | if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) |
| 3364 | goto change; | ||
| 3365 | if (rt_policy(policy) && attr->sched_priority != p->rt_priority) | ||
| 3366 | goto change; | ||
| 3367 | if (dl_policy(policy)) | ||
| 3368 | goto change; | ||
| 3369 | |||
| 3125 | task_rq_unlock(rq, p, &flags); | 3370 | task_rq_unlock(rq, p, &flags); |
| 3126 | return 0; | 3371 | return 0; |
| 3127 | } | 3372 | } |
| 3373 | change: | ||
| 3128 | 3374 | ||
| 3129 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 3130 | if (user) { | 3375 | if (user) { |
| 3376 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 3131 | /* | 3377 | /* |
| 3132 | * Do not allow realtime tasks into groups that have no runtime | 3378 | * Do not allow realtime tasks into groups that have no runtime |
| 3133 | * assigned. | 3379 | * assigned. |
| @@ -3138,8 +3384,24 @@ recheck: | |||
| 3138 | task_rq_unlock(rq, p, &flags); | 3384 | task_rq_unlock(rq, p, &flags); |
| 3139 | return -EPERM; | 3385 | return -EPERM; |
| 3140 | } | 3386 | } |
| 3141 | } | ||
| 3142 | #endif | 3387 | #endif |
| 3388 | #ifdef CONFIG_SMP | ||
| 3389 | if (dl_bandwidth_enabled() && dl_policy(policy)) { | ||
| 3390 | cpumask_t *span = rq->rd->span; | ||
| 3391 | |||
| 3392 | /* | ||
| 3393 | * Don't allow tasks with an affinity mask smaller than | ||
| 3394 | * the entire root_domain to become SCHED_DEADLINE. We | ||
| 3395 | * will also fail if there's no bandwidth available. | ||
| 3396 | */ | ||
| 3397 | if (!cpumask_subset(span, &p->cpus_allowed) || | ||
| 3398 | rq->rd->dl_bw.bw == 0) { | ||
| 3399 | task_rq_unlock(rq, p, &flags); | ||
| 3400 | return -EPERM; | ||
| 3401 | } | ||
| 3402 | } | ||
| 3403 | #endif | ||
| 3404 | } | ||
| 3143 | 3405 | ||
| 3144 | /* recheck policy now with rq lock held */ | 3406 | /* recheck policy now with rq lock held */ |
| 3145 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 3407 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
| @@ -3147,6 +3409,17 @@ recheck: | |||
| 3147 | task_rq_unlock(rq, p, &flags); | 3409 | task_rq_unlock(rq, p, &flags); |
| 3148 | goto recheck; | 3410 | goto recheck; |
| 3149 | } | 3411 | } |
| 3412 | |||
| 3413 | /* | ||
| 3414 | * If setscheduling to SCHED_DEADLINE (or changing the parameters | ||
| 3415 | * of a SCHED_DEADLINE task) we need to check if enough bandwidth | ||
| 3416 | * is available. | ||
| 3417 | */ | ||
| 3418 | if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { | ||
| 3419 | task_rq_unlock(rq, p, &flags); | ||
| 3420 | return -EBUSY; | ||
| 3421 | } | ||
| 3422 | |||
| 3150 | on_rq = p->on_rq; | 3423 | on_rq = p->on_rq; |
| 3151 | running = task_current(rq, p); | 3424 | running = task_current(rq, p); |
| 3152 | if (on_rq) | 3425 | if (on_rq) |
| @@ -3158,7 +3431,7 @@ recheck: | |||
| 3158 | 3431 | ||
| 3159 | oldprio = p->prio; | 3432 | oldprio = p->prio; |
| 3160 | prev_class = p->sched_class; | 3433 | prev_class = p->sched_class; |
| 3161 | __setscheduler(rq, p, policy, param->sched_priority); | 3434 | __setscheduler(rq, p, attr); |
| 3162 | 3435 | ||
| 3163 | if (running) | 3436 | if (running) |
| 3164 | p->sched_class->set_curr_task(rq); | 3437 | p->sched_class->set_curr_task(rq); |
| @@ -3173,6 +3446,26 @@ recheck: | |||
| 3173 | return 0; | 3446 | return 0; |
| 3174 | } | 3447 | } |
| 3175 | 3448 | ||
| 3449 | static int _sched_setscheduler(struct task_struct *p, int policy, | ||
| 3450 | const struct sched_param *param, bool check) | ||
| 3451 | { | ||
| 3452 | struct sched_attr attr = { | ||
| 3453 | .sched_policy = policy, | ||
| 3454 | .sched_priority = param->sched_priority, | ||
| 3455 | .sched_nice = PRIO_TO_NICE(p->static_prio), | ||
| 3456 | }; | ||
| 3457 | |||
| 3458 | /* | ||
| 3459 | * Fixup the legacy SCHED_RESET_ON_FORK hack | ||
| 3460 | */ | ||
| 3461 | if (policy & SCHED_RESET_ON_FORK) { | ||
| 3462 | attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; | ||
| 3463 | policy &= ~SCHED_RESET_ON_FORK; | ||
| 3464 | attr.sched_policy = policy; | ||
| 3465 | } | ||
| 3466 | |||
| 3467 | return __sched_setscheduler(p, &attr, check); | ||
| 3468 | } | ||
| 3176 | /** | 3469 | /** |
| 3177 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. | 3470 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. |
| 3178 | * @p: the task in question. | 3471 | * @p: the task in question. |
| @@ -3186,10 +3479,16 @@ recheck: | |||
| 3186 | int sched_setscheduler(struct task_struct *p, int policy, | 3479 | int sched_setscheduler(struct task_struct *p, int policy, |
| 3187 | const struct sched_param *param) | 3480 | const struct sched_param *param) |
| 3188 | { | 3481 | { |
| 3189 | return __sched_setscheduler(p, policy, param, true); | 3482 | return _sched_setscheduler(p, policy, param, true); |
| 3190 | } | 3483 | } |
| 3191 | EXPORT_SYMBOL_GPL(sched_setscheduler); | 3484 | EXPORT_SYMBOL_GPL(sched_setscheduler); |
| 3192 | 3485 | ||
| 3486 | int sched_setattr(struct task_struct *p, const struct sched_attr *attr) | ||
| 3487 | { | ||
| 3488 | return __sched_setscheduler(p, attr, true); | ||
| 3489 | } | ||
| 3490 | EXPORT_SYMBOL_GPL(sched_setattr); | ||
| 3491 | |||
| 3193 | /** | 3492 | /** |
| 3194 | * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. | 3493 | * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. |
| 3195 | * @p: the task in question. | 3494 | * @p: the task in question. |
| @@ -3206,7 +3505,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); | |||
| 3206 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, | 3505 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, |
| 3207 | const struct sched_param *param) | 3506 | const struct sched_param *param) |
| 3208 | { | 3507 | { |
| 3209 | return __sched_setscheduler(p, policy, param, false); | 3508 | return _sched_setscheduler(p, policy, param, false); |
| 3210 | } | 3509 | } |
| 3211 | 3510 | ||
| 3212 | static int | 3511 | static int |
| @@ -3231,6 +3530,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | |||
| 3231 | return retval; | 3530 | return retval; |
| 3232 | } | 3531 | } |
| 3233 | 3532 | ||
| 3533 | /* | ||
| 3534 | * Mimics kernel/events/core.c perf_copy_attr(). | ||
| 3535 | */ | ||
| 3536 | static int sched_copy_attr(struct sched_attr __user *uattr, | ||
| 3537 | struct sched_attr *attr) | ||
| 3538 | { | ||
| 3539 | u32 size; | ||
| 3540 | int ret; | ||
| 3541 | |||
| 3542 | if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) | ||
| 3543 | return -EFAULT; | ||
| 3544 | |||
| 3545 | /* | ||
| 3546 | * zero the full structure, so that a short copy will be nice. | ||
| 3547 | */ | ||
| 3548 | memset(attr, 0, sizeof(*attr)); | ||
| 3549 | |||
| 3550 | ret = get_user(size, &uattr->size); | ||
| 3551 | if (ret) | ||
| 3552 | return ret; | ||
| 3553 | |||
| 3554 | if (size > PAGE_SIZE) /* silly large */ | ||
| 3555 | goto err_size; | ||
| 3556 | |||
| 3557 | if (!size) /* abi compat */ | ||
| 3558 | size = SCHED_ATTR_SIZE_VER0; | ||
| 3559 | |||
| 3560 | if (size < SCHED_ATTR_SIZE_VER0) | ||
| 3561 | goto err_size; | ||
| 3562 | |||
| 3563 | /* | ||
| 3564 | * If we're handed a bigger struct than we know of, | ||
| 3565 | * ensure all the unknown bits are 0 - i.e. new | ||
| 3566 | * user-space does not rely on any kernel feature | ||
| 3568 | * extensions we don't know about yet. | ||
| 3568 | */ | ||
| 3569 | if (size > sizeof(*attr)) { | ||
| 3570 | unsigned char __user *addr; | ||
| 3571 | unsigned char __user *end; | ||
| 3572 | unsigned char val; | ||
| 3573 | |||
| 3574 | addr = (void __user *)uattr + sizeof(*attr); | ||
| 3575 | end = (void __user *)uattr + size; | ||
| 3576 | |||
| 3577 | for (; addr < end; addr++) { | ||
| 3578 | ret = get_user(val, addr); | ||
| 3579 | if (ret) | ||
| 3580 | return ret; | ||
| 3581 | if (val) | ||
| 3582 | goto err_size; | ||
| 3583 | } | ||
| 3584 | size = sizeof(*attr); | ||
| 3585 | } | ||
| 3586 | |||
| 3587 | ret = copy_from_user(attr, uattr, size); | ||
| 3588 | if (ret) | ||
| 3589 | return -EFAULT; | ||
| 3590 | |||
| 3591 | /* | ||
| 3592 | * XXX: do we want to be lenient like existing syscalls; or do we want | ||
| 3593 | * to be strict and return an error on out-of-bounds values? | ||
| 3594 | */ | ||
| 3595 | attr->sched_nice = clamp(attr->sched_nice, -20, 19); | ||
| 3596 | |||
| 3597 | out: | ||
| 3598 | return ret; | ||
| 3599 | |||
| 3600 | err_size: | ||
| 3601 | put_user(sizeof(*attr), &uattr->size); | ||
| 3602 | ret = -E2BIG; | ||
| 3603 | goto out; | ||
| 3604 | } | ||
| 3605 | |||
| 3234 | /** | 3606 | /** |
| 3235 | * sys_sched_setscheduler - set/change the scheduler policy and RT priority | 3607 | * sys_sched_setscheduler - set/change the scheduler policy and RT priority |
| 3236 | * @pid: the pid in question. | 3608 | * @pid: the pid in question. |
| @@ -3262,6 +3634,33 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) | |||
| 3262 | } | 3634 | } |
| 3263 | 3635 | ||
| 3264 | /** | 3636 | /** |
| 3637 | * sys_sched_setattr - same as above, but with extended sched_attr | ||
| 3638 | * @pid: the pid in question. | ||
| 3639 | * @uattr: structure containing the extended parameters. | ||
| 3640 | */ | ||
| 3641 | SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr) | ||
| 3642 | { | ||
| 3643 | struct sched_attr attr; | ||
| 3644 | struct task_struct *p; | ||
| 3645 | int retval; | ||
| 3646 | |||
| 3647 | if (!uattr || pid < 0) | ||
| 3648 | return -EINVAL; | ||
| 3649 | |||
| 3650 | if (sched_copy_attr(uattr, &attr)) | ||
| 3651 | return -EFAULT; | ||
| 3652 | |||
| 3653 | rcu_read_lock(); | ||
| 3654 | retval = -ESRCH; | ||
| 3655 | p = find_process_by_pid(pid); | ||
| 3656 | if (p != NULL) | ||
| 3657 | retval = sched_setattr(p, &attr); | ||
| 3658 | rcu_read_unlock(); | ||
| 3659 | |||
| 3660 | return retval; | ||
| 3661 | } | ||
| 3662 | |||
| 3663 | /** | ||
| 3265 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread | 3664 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread |
| 3266 | * @pid: the pid in question. | 3665 | * @pid: the pid in question. |
| 3267 | * | 3666 | * |
| @@ -3316,6 +3715,10 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) | |||
| 3316 | if (retval) | 3715 | if (retval) |
| 3317 | goto out_unlock; | 3716 | goto out_unlock; |
| 3318 | 3717 | ||
| 3718 | if (task_has_dl_policy(p)) { | ||
| 3719 | retval = -EINVAL; | ||
| 3720 | goto out_unlock; | ||
| 3721 | } | ||
| 3319 | lp.sched_priority = p->rt_priority; | 3722 | lp.sched_priority = p->rt_priority; |
| 3320 | rcu_read_unlock(); | 3723 | rcu_read_unlock(); |
| 3321 | 3724 | ||
| @@ -3331,6 +3734,96 @@ out_unlock: | |||
| 3331 | return retval; | 3734 | return retval; |
| 3332 | } | 3735 | } |
| 3333 | 3736 | ||
| 3737 | static int sched_read_attr(struct sched_attr __user *uattr, | ||
| 3738 | struct sched_attr *attr, | ||
| 3739 | unsigned int usize) | ||
| 3740 | { | ||
| 3741 | int ret; | ||
| 3742 | |||
| 3743 | if (!access_ok(VERIFY_WRITE, uattr, usize)) | ||
| 3744 | return -EFAULT; | ||
| 3745 | |||
| 3746 | /* | ||
| 3747 | * If we're handed a smaller struct than we know of, | ||
| 3748 | * ensure all the unknown bits are 0 - i.e. old | ||
| 3749 | * user-space does not get incomplete information. | ||
| 3750 | */ | ||
| 3751 | if (usize < sizeof(*attr)) { | ||
| 3752 | unsigned char *addr; | ||
| 3753 | unsigned char *end; | ||
| 3754 | |||
| 3755 | addr = (void *)attr + usize; | ||
| 3756 | end = (void *)attr + sizeof(*attr); | ||
| 3757 | |||
| 3758 | for (; addr < end; addr++) { | ||
| 3759 | if (*addr) | ||
| 3760 | goto err_size; | ||
| 3761 | } | ||
| 3762 | |||
| 3763 | attr->size = usize; | ||
| 3764 | } | ||
| 3765 | |||
| 3766 | ret = copy_to_user(uattr, attr, usize); | ||
| 3767 | if (ret) | ||
| 3768 | return -EFAULT; | ||
| 3769 | |||
| 3770 | out: | ||
| 3771 | return ret; | ||
| 3772 | |||
| 3773 | err_size: | ||
| 3774 | ret = -E2BIG; | ||
| 3775 | goto out; | ||
| 3776 | } | ||
| 3777 | |||
| 3778 | /** | ||
| 3779 | * sys_sched_getattr - similar to sched_getparam, but with sched_attr | ||
| 3780 | * @pid: the pid in question. | ||
| 3781 | * @uattr: structure containing the extended parameters. | ||
| 3782 | * @size: sizeof(attr) for fwd/bwd comp. | ||
| 3783 | */ | ||
| 3784 | SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, | ||
| 3785 | unsigned int, size) | ||
| 3786 | { | ||
| 3787 | struct sched_attr attr = { | ||
| 3788 | .size = sizeof(struct sched_attr), | ||
| 3789 | }; | ||
| 3790 | struct task_struct *p; | ||
| 3791 | int retval; | ||
| 3792 | |||
| 3793 | if (!uattr || pid < 0 || size > PAGE_SIZE || | ||
| 3794 | size < SCHED_ATTR_SIZE_VER0) | ||
| 3795 | return -EINVAL; | ||
| 3796 | |||
| 3797 | rcu_read_lock(); | ||
| 3798 | p = find_process_by_pid(pid); | ||
| 3799 | retval = -ESRCH; | ||
| 3800 | if (!p) | ||
| 3801 | goto out_unlock; | ||
| 3802 | |||
| 3803 | retval = security_task_getscheduler(p); | ||
| 3804 | if (retval) | ||
| 3805 | goto out_unlock; | ||
| 3806 | |||
| 3807 | attr.sched_policy = p->policy; | ||
| 3808 | if (p->sched_reset_on_fork) | ||
| 3809 | attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; | ||
| 3810 | if (task_has_dl_policy(p)) | ||
| 3811 | __getparam_dl(p, &attr); | ||
| 3812 | else if (task_has_rt_policy(p)) | ||
| 3813 | attr.sched_priority = p->rt_priority; | ||
| 3814 | else | ||
| 3815 | attr.sched_nice = TASK_NICE(p); | ||
| 3816 | |||
| 3817 | rcu_read_unlock(); | ||
| 3818 | |||
| 3819 | retval = sched_read_attr(uattr, &attr, size); | ||
| 3820 | return retval; | ||
| 3821 | |||
| 3822 | out_unlock: | ||
| 3823 | rcu_read_unlock(); | ||
| 3824 | return retval; | ||
| 3825 | } | ||
| 3826 | |||
| 3334 | long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | 3827 | long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) |
| 3335 | { | 3828 | { |
| 3336 | cpumask_var_t cpus_allowed, new_mask; | 3829 | cpumask_var_t cpus_allowed, new_mask; |
| @@ -3375,8 +3868,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
| 3375 | if (retval) | 3868 | if (retval) |
| 3376 | goto out_unlock; | 3869 | goto out_unlock; |
| 3377 | 3870 | ||
| 3871 | |||
| 3378 | cpuset_cpus_allowed(p, cpus_allowed); | 3872 | cpuset_cpus_allowed(p, cpus_allowed); |
| 3379 | cpumask_and(new_mask, in_mask, cpus_allowed); | 3873 | cpumask_and(new_mask, in_mask, cpus_allowed); |
| 3874 | |||
| 3875 | /* | ||
| 3876 | * Since bandwidth control happens on root_domain basis, | ||
| 3877 | * if admission test is enabled, we only admit -deadline | ||
| 3878 | * tasks allowed to run on all the CPUs in the task's | ||
| 3879 | * root_domain. | ||
| 3880 | */ | ||
| 3881 | #ifdef CONFIG_SMP | ||
| 3882 | if (task_has_dl_policy(p)) { | ||
| 3883 | const struct cpumask *span = task_rq(p)->rd->span; | ||
| 3884 | |||
| 3885 | if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { | ||
| 3886 | retval = -EBUSY; | ||
| 3887 | goto out_unlock; | ||
| 3888 | } | ||
| 3889 | } | ||
| 3890 | #endif | ||
| 3380 | again: | 3891 | again: |
| 3381 | retval = set_cpus_allowed_ptr(p, new_mask); | 3892 | retval = set_cpus_allowed_ptr(p, new_mask); |
| 3382 | 3893 | ||
| @@ -3653,7 +4164,7 @@ again: | |||
| 3653 | } | 4164 | } |
| 3654 | 4165 | ||
| 3655 | double_rq_lock(rq, p_rq); | 4166 | double_rq_lock(rq, p_rq); |
| 3656 | while (task_rq(p) != p_rq) { | 4167 | if (task_rq(p) != p_rq) { |
| 3657 | double_rq_unlock(rq, p_rq); | 4168 | double_rq_unlock(rq, p_rq); |
| 3658 | goto again; | 4169 | goto again; |
| 3659 | } | 4170 | } |
| @@ -3742,6 +4253,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) | |||
| 3742 | case SCHED_RR: | 4253 | case SCHED_RR: |
| 3743 | ret = MAX_USER_RT_PRIO-1; | 4254 | ret = MAX_USER_RT_PRIO-1; |
| 3744 | break; | 4255 | break; |
| 4256 | case SCHED_DEADLINE: | ||
| 3745 | case SCHED_NORMAL: | 4257 | case SCHED_NORMAL: |
| 3746 | case SCHED_BATCH: | 4258 | case SCHED_BATCH: |
| 3747 | case SCHED_IDLE: | 4259 | case SCHED_IDLE: |
| @@ -3768,6 +4280,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) | |||
| 3768 | case SCHED_RR: | 4280 | case SCHED_RR: |
| 3769 | ret = 1; | 4281 | ret = 1; |
| 3770 | break; | 4282 | break; |
| 4283 | case SCHED_DEADLINE: | ||
| 3771 | case SCHED_NORMAL: | 4284 | case SCHED_NORMAL: |
| 3772 | case SCHED_BATCH: | 4285 | case SCHED_BATCH: |
| 3773 | case SCHED_IDLE: | 4286 | case SCHED_IDLE: |
| @@ -4514,13 +5027,31 @@ static int sched_cpu_active(struct notifier_block *nfb, | |||
| 4514 | static int sched_cpu_inactive(struct notifier_block *nfb, | 5027 | static int sched_cpu_inactive(struct notifier_block *nfb, |
| 4515 | unsigned long action, void *hcpu) | 5028 | unsigned long action, void *hcpu) |
| 4516 | { | 5029 | { |
| 5030 | unsigned long flags; | ||
| 5031 | long cpu = (long)hcpu; | ||
| 5032 | |||
| 4517 | switch (action & ~CPU_TASKS_FROZEN) { | 5033 | switch (action & ~CPU_TASKS_FROZEN) { |
| 4518 | case CPU_DOWN_PREPARE: | 5034 | case CPU_DOWN_PREPARE: |
| 4519 | set_cpu_active((long)hcpu, false); | 5035 | set_cpu_active(cpu, false); |
| 5036 | |||
| 5037 | /* explicitly allow suspend */ | ||
| 5038 | if (!(action & CPU_TASKS_FROZEN)) { | ||
| 5039 | struct dl_bw *dl_b = dl_bw_of(cpu); | ||
| 5040 | bool overflow; | ||
| 5041 | int cpus; | ||
| 5042 | |||
| 5043 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
| 5044 | cpus = dl_bw_cpus(cpu); | ||
| 5045 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
| 5046 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
| 5047 | |||
| 5048 | if (overflow) | ||
| 5049 | return notifier_from_errno(-EBUSY); | ||
| 5050 | } | ||
| 4520 | return NOTIFY_OK; | 5051 | return NOTIFY_OK; |
| 4521 | default: | ||
| 4522 | return NOTIFY_DONE; | ||
| 4523 | } | 5052 | } |
| 5053 | |||
| 5054 | return NOTIFY_DONE; | ||
| 4524 | } | 5055 | } |
| 4525 | 5056 | ||
| 4526 | static int __init migration_init(void) | 5057 | static int __init migration_init(void) |
| @@ -4739,6 +5270,8 @@ static void free_rootdomain(struct rcu_head *rcu) | |||
| 4739 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); | 5270 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); |
| 4740 | 5271 | ||
| 4741 | cpupri_cleanup(&rd->cpupri); | 5272 | cpupri_cleanup(&rd->cpupri); |
| 5273 | cpudl_cleanup(&rd->cpudl); | ||
| 5274 | free_cpumask_var(rd->dlo_mask); | ||
| 4742 | free_cpumask_var(rd->rto_mask); | 5275 | free_cpumask_var(rd->rto_mask); |
| 4743 | free_cpumask_var(rd->online); | 5276 | free_cpumask_var(rd->online); |
| 4744 | free_cpumask_var(rd->span); | 5277 | free_cpumask_var(rd->span); |
| @@ -4790,8 +5323,14 @@ static int init_rootdomain(struct root_domain *rd) | |||
| 4790 | goto out; | 5323 | goto out; |
| 4791 | if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) | 5324 | if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) |
| 4792 | goto free_span; | 5325 | goto free_span; |
| 4793 | if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) | 5326 | if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) |
| 4794 | goto free_online; | 5327 | goto free_online; |
| 5328 | if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) | ||
| 5329 | goto free_dlo_mask; | ||
| 5330 | |||
| 5331 | init_dl_bw(&rd->dl_bw); | ||
| 5332 | if (cpudl_init(&rd->cpudl) != 0) | ||
| 5333 | goto free_dlo_mask; | ||
| 4795 | 5334 | ||
| 4796 | if (cpupri_init(&rd->cpupri) != 0) | 5335 | if (cpupri_init(&rd->cpupri) != 0) |
| 4797 | goto free_rto_mask; | 5336 | goto free_rto_mask; |
| @@ -4799,6 +5338,8 @@ static int init_rootdomain(struct root_domain *rd) | |||
| 4799 | 5338 | ||
| 4800 | free_rto_mask: | 5339 | free_rto_mask: |
| 4801 | free_cpumask_var(rd->rto_mask); | 5340 | free_cpumask_var(rd->rto_mask); |
| 5341 | free_dlo_mask: | ||
| 5342 | free_cpumask_var(rd->dlo_mask); | ||
| 4802 | free_online: | 5343 | free_online: |
| 4803 | free_cpumask_var(rd->online); | 5344 | free_cpumask_var(rd->online); |
| 4804 | free_span: | 5345 | free_span: |
| @@ -6150,6 +6691,7 @@ void __init sched_init_smp(void) | |||
| 6150 | free_cpumask_var(non_isolated_cpus); | 6691 | free_cpumask_var(non_isolated_cpus); |
| 6151 | 6692 | ||
| 6152 | init_sched_rt_class(); | 6693 | init_sched_rt_class(); |
| 6694 | init_sched_dl_class(); | ||
| 6153 | } | 6695 | } |
| 6154 | #else | 6696 | #else |
| 6155 | void __init sched_init_smp(void) | 6697 | void __init sched_init_smp(void) |
| @@ -6219,13 +6761,15 @@ void __init sched_init(void) | |||
| 6219 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | 6761 | #endif /* CONFIG_CPUMASK_OFFSTACK */ |
| 6220 | } | 6762 | } |
| 6221 | 6763 | ||
| 6764 | init_rt_bandwidth(&def_rt_bandwidth, | ||
| 6765 | global_rt_period(), global_rt_runtime()); | ||
| 6766 | init_dl_bandwidth(&def_dl_bandwidth, | ||
| 6767 | global_rt_period(), global_rt_runtime()); | ||
| 6768 | |||
| 6222 | #ifdef CONFIG_SMP | 6769 | #ifdef CONFIG_SMP |
| 6223 | init_defrootdomain(); | 6770 | init_defrootdomain(); |
| 6224 | #endif | 6771 | #endif |
| 6225 | 6772 | ||
| 6226 | init_rt_bandwidth(&def_rt_bandwidth, | ||
| 6227 | global_rt_period(), global_rt_runtime()); | ||
| 6228 | |||
| 6229 | #ifdef CONFIG_RT_GROUP_SCHED | 6773 | #ifdef CONFIG_RT_GROUP_SCHED |
| 6230 | init_rt_bandwidth(&root_task_group.rt_bandwidth, | 6774 | init_rt_bandwidth(&root_task_group.rt_bandwidth, |
| 6231 | global_rt_period(), global_rt_runtime()); | 6775 | global_rt_period(), global_rt_runtime()); |
| @@ -6249,6 +6793,7 @@ void __init sched_init(void) | |||
| 6249 | rq->calc_load_update = jiffies + LOAD_FREQ; | 6793 | rq->calc_load_update = jiffies + LOAD_FREQ; |
| 6250 | init_cfs_rq(&rq->cfs); | 6794 | init_cfs_rq(&rq->cfs); |
| 6251 | init_rt_rq(&rq->rt, rq); | 6795 | init_rt_rq(&rq->rt, rq); |
| 6796 | init_dl_rq(&rq->dl, rq); | ||
| 6252 | #ifdef CONFIG_FAIR_GROUP_SCHED | 6797 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 6253 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; | 6798 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; |
| 6254 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 6799 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
| @@ -6320,10 +6865,6 @@ void __init sched_init(void) | |||
| 6320 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); | 6865 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); |
| 6321 | #endif | 6866 | #endif |
| 6322 | 6867 | ||
| 6323 | #ifdef CONFIG_RT_MUTEXES | ||
| 6324 | plist_head_init(&init_task.pi_waiters); | ||
| 6325 | #endif | ||
| 6326 | |||
| 6327 | /* | 6868 | /* |
| 6328 | * The boot idle thread does lazy MMU switching as well: | 6869 | * The boot idle thread does lazy MMU switching as well: |
| 6329 | */ | 6870 | */ |
| @@ -6397,13 +6938,16 @@ EXPORT_SYMBOL(__might_sleep); | |||
| 6397 | static void normalize_task(struct rq *rq, struct task_struct *p) | 6938 | static void normalize_task(struct rq *rq, struct task_struct *p) |
| 6398 | { | 6939 | { |
| 6399 | const struct sched_class *prev_class = p->sched_class; | 6940 | const struct sched_class *prev_class = p->sched_class; |
| 6941 | struct sched_attr attr = { | ||
| 6942 | .sched_policy = SCHED_NORMAL, | ||
| 6943 | }; | ||
| 6400 | int old_prio = p->prio; | 6944 | int old_prio = p->prio; |
| 6401 | int on_rq; | 6945 | int on_rq; |
| 6402 | 6946 | ||
| 6403 | on_rq = p->on_rq; | 6947 | on_rq = p->on_rq; |
| 6404 | if (on_rq) | 6948 | if (on_rq) |
| 6405 | dequeue_task(rq, p, 0); | 6949 | dequeue_task(rq, p, 0); |
| 6406 | __setscheduler(rq, p, SCHED_NORMAL, 0); | 6950 | __setscheduler(rq, p, &attr); |
| 6407 | if (on_rq) { | 6951 | if (on_rq) { |
| 6408 | enqueue_task(rq, p, 0); | 6952 | enqueue_task(rq, p, 0); |
| 6409 | resched_task(rq->curr); | 6953 | resched_task(rq->curr); |
| @@ -6433,7 +6977,7 @@ void normalize_rt_tasks(void) | |||
| 6433 | p->se.statistics.block_start = 0; | 6977 | p->se.statistics.block_start = 0; |
| 6434 | #endif | 6978 | #endif |
| 6435 | 6979 | ||
| 6436 | if (!rt_task(p)) { | 6980 | if (!dl_task(p) && !rt_task(p)) { |
| 6437 | /* | 6981 | /* |
| 6438 | * Renice negative nice level userspace | 6982 | * Renice negative nice level userspace |
| 6439 | * tasks back to 0: | 6983 | * tasks back to 0: |
| @@ -6628,16 +7172,6 @@ void sched_move_task(struct task_struct *tsk) | |||
| 6628 | } | 7172 | } |
| 6629 | #endif /* CONFIG_CGROUP_SCHED */ | 7173 | #endif /* CONFIG_CGROUP_SCHED */ |
| 6630 | 7174 | ||
| 6631 | #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) | ||
| 6632 | static unsigned long to_ratio(u64 period, u64 runtime) | ||
| 6633 | { | ||
| 6634 | if (runtime == RUNTIME_INF) | ||
| 6635 | return 1ULL << 20; | ||
| 6636 | |||
| 6637 | return div64_u64(runtime << 20, period); | ||
| 6638 | } | ||
| 6639 | #endif | ||
| 6640 | |||
| 6641 | #ifdef CONFIG_RT_GROUP_SCHED | 7175 | #ifdef CONFIG_RT_GROUP_SCHED |
| 6642 | /* | 7176 | /* |
| 6643 | * Ensure that the real time constraints are schedulable. | 7177 | * Ensure that the real time constraints are schedulable. |
| @@ -6811,24 +7345,13 @@ static long sched_group_rt_period(struct task_group *tg) | |||
| 6811 | do_div(rt_period_us, NSEC_PER_USEC); | 7345 | do_div(rt_period_us, NSEC_PER_USEC); |
| 6812 | return rt_period_us; | 7346 | return rt_period_us; |
| 6813 | } | 7347 | } |
| 7348 | #endif /* CONFIG_RT_GROUP_SCHED */ | ||
| 6814 | 7349 | ||
| 7350 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 6815 | static int sched_rt_global_constraints(void) | 7351 | static int sched_rt_global_constraints(void) |
| 6816 | { | 7352 | { |
| 6817 | u64 runtime, period; | ||
| 6818 | int ret = 0; | 7353 | int ret = 0; |
| 6819 | 7354 | ||
| 6820 | if (sysctl_sched_rt_period <= 0) | ||
| 6821 | return -EINVAL; | ||
| 6822 | |||
| 6823 | runtime = global_rt_runtime(); | ||
| 6824 | period = global_rt_period(); | ||
| 6825 | |||
| 6826 | /* | ||
| 6827 | * Sanity check on the sysctl variables. | ||
| 6828 | */ | ||
| 6829 | if (runtime > period && runtime != RUNTIME_INF) | ||
| 6830 | return -EINVAL; | ||
| 6831 | |||
| 6832 | mutex_lock(&rt_constraints_mutex); | 7355 | mutex_lock(&rt_constraints_mutex); |
| 6833 | read_lock(&tasklist_lock); | 7356 | read_lock(&tasklist_lock); |
| 6834 | ret = __rt_schedulable(NULL, 0, 0); | 7357 | ret = __rt_schedulable(NULL, 0, 0); |
| @@ -6851,17 +7374,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) | |||
| 6851 | static int sched_rt_global_constraints(void) | 7374 | static int sched_rt_global_constraints(void) |
| 6852 | { | 7375 | { |
| 6853 | unsigned long flags; | 7376 | unsigned long flags; |
| 6854 | int i; | 7377 | int i, ret = 0; |
| 6855 | |||
| 6856 | if (sysctl_sched_rt_period <= 0) | ||
| 6857 | return -EINVAL; | ||
| 6858 | |||
| 6859 | /* | ||
| 6860 | * There's always some RT tasks in the root group | ||
| 6861 | * -- migration, kstopmachine etc.. | ||
| 6862 | */ | ||
| 6863 | if (sysctl_sched_rt_runtime == 0) | ||
| 6864 | return -EBUSY; | ||
| 6865 | 7378 | ||
| 6866 | raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); | 7379 | raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); |
| 6867 | for_each_possible_cpu(i) { | 7380 | for_each_possible_cpu(i) { |
| @@ -6873,36 +7386,88 @@ static int sched_rt_global_constraints(void) | |||
| 6873 | } | 7386 | } |
| 6874 | raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); | 7387 | raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); |
| 6875 | 7388 | ||
| 6876 | return 0; | 7389 | return ret; |
| 6877 | } | 7390 | } |
| 6878 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7391 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 6879 | 7392 | ||
| 6880 | int sched_rr_handler(struct ctl_table *table, int write, | 7393 | static int sched_dl_global_constraints(void) |
| 6881 | void __user *buffer, size_t *lenp, | ||
| 6882 | loff_t *ppos) | ||
| 6883 | { | 7394 | { |
| 6884 | int ret; | 7395 | u64 runtime = global_rt_runtime(); |
| 6885 | static DEFINE_MUTEX(mutex); | 7396 | u64 period = global_rt_period(); |
| 7397 | u64 new_bw = to_ratio(period, runtime); | ||
| 7398 | int cpu, ret = 0; | ||
| 6886 | 7399 | ||
| 6887 | mutex_lock(&mutex); | 7400 | /* |
| 6888 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | 7401 | * Here we want to check the bandwidth not being set to some |
| 6889 | /* make sure that internally we keep jiffies */ | 7402 | * value smaller than the currently allocated bandwidth in |
| 6890 | /* also, writing zero resets timeslice to default */ | 7403 | * any of the root_domains. |
| 6891 | if (!ret && write) { | 7404 | * |
| 6892 | sched_rr_timeslice = sched_rr_timeslice <= 0 ? | 7405 | * FIXME: Cycling on all the CPUs is overdoing it, but simpler than |
| 6893 | RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); | 7406 | * cycling on root_domains... Discussion on different/better |
| 7407 | * solutions is welcome! | ||
| 7408 | */ | ||
| 7409 | for_each_possible_cpu(cpu) { | ||
| 7410 | struct dl_bw *dl_b = dl_bw_of(cpu); | ||
| 7411 | |||
| 7412 | raw_spin_lock(&dl_b->lock); | ||
| 7413 | if (new_bw < dl_b->total_bw) | ||
| 7414 | ret = -EBUSY; | ||
| 7415 | raw_spin_unlock(&dl_b->lock); | ||
| 7416 | |||
| 7417 | if (ret) | ||
| 7418 | break; | ||
| 6894 | } | 7419 | } |
| 6895 | mutex_unlock(&mutex); | 7420 | |
| 6896 | return ret; | 7421 | return ret; |
| 6897 | } | 7422 | } |
| 6898 | 7423 | ||
| 7424 | static void sched_dl_do_global(void) | ||
| 7425 | { | ||
| 7426 | u64 new_bw = -1; | ||
| 7427 | int cpu; | ||
| 7428 | |||
| 7429 | def_dl_bandwidth.dl_period = global_rt_period(); | ||
| 7430 | def_dl_bandwidth.dl_runtime = global_rt_runtime(); | ||
| 7431 | |||
| 7432 | if (global_rt_runtime() != RUNTIME_INF) | ||
| 7433 | new_bw = to_ratio(global_rt_period(), global_rt_runtime()); | ||
| 7434 | |||
| 7435 | /* | ||
| 7436 | * FIXME: As above... | ||
| 7437 | */ | ||
| 7438 | for_each_possible_cpu(cpu) { | ||
| 7439 | struct dl_bw *dl_b = dl_bw_of(cpu); | ||
| 7440 | |||
| 7441 | raw_spin_lock(&dl_b->lock); | ||
| 7442 | dl_b->bw = new_bw; | ||
| 7443 | raw_spin_unlock(&dl_b->lock); | ||
| 7444 | } | ||
| 7445 | } | ||
| 7446 | |||
| 7447 | static int sched_rt_global_validate(void) | ||
| 7448 | { | ||
| 7449 | if (sysctl_sched_rt_period <= 0) | ||
| 7450 | return -EINVAL; | ||
| 7451 | |||
| 7452 | if (sysctl_sched_rt_runtime > sysctl_sched_rt_period) | ||
| 7453 | return -EINVAL; | ||
| 7454 | |||
| 7455 | return 0; | ||
| 7456 | } | ||
| 7457 | |||
| 7458 | static void sched_rt_do_global(void) | ||
| 7459 | { | ||
| 7460 | def_rt_bandwidth.rt_runtime = global_rt_runtime(); | ||
| 7461 | def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); | ||
| 7462 | } | ||
| 7463 | |||
| 6899 | int sched_rt_handler(struct ctl_table *table, int write, | 7464 | int sched_rt_handler(struct ctl_table *table, int write, |
| 6900 | void __user *buffer, size_t *lenp, | 7465 | void __user *buffer, size_t *lenp, |
| 6901 | loff_t *ppos) | 7466 | loff_t *ppos) |
| 6902 | { | 7467 | { |
| 6903 | int ret; | ||
| 6904 | int old_period, old_runtime; | 7468 | int old_period, old_runtime; |
| 6905 | static DEFINE_MUTEX(mutex); | 7469 | static DEFINE_MUTEX(mutex); |
| 7470 | int ret; | ||
| 6906 | 7471 | ||
| 6907 | mutex_lock(&mutex); | 7472 | mutex_lock(&mutex); |
| 6908 | old_period = sysctl_sched_rt_period; | 7473 | old_period = sysctl_sched_rt_period; |
| @@ -6911,21 +7476,50 @@ int sched_rt_handler(struct ctl_table *table, int write, | |||
| 6911 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | 7476 | ret = proc_dointvec(table, write, buffer, lenp, ppos); |
| 6912 | 7477 | ||
| 6913 | if (!ret && write) { | 7478 | if (!ret && write) { |
| 7479 | ret = sched_rt_global_validate(); | ||
| 7480 | if (ret) | ||
| 7481 | goto undo; | ||
| 7482 | |||
| 6914 | ret = sched_rt_global_constraints(); | 7483 | ret = sched_rt_global_constraints(); |
| 6915 | if (ret) { | 7484 | if (ret) |
| 6916 | sysctl_sched_rt_period = old_period; | 7485 | goto undo; |
| 6917 | sysctl_sched_rt_runtime = old_runtime; | 7486 | |
| 6918 | } else { | 7487 | ret = sched_dl_global_constraints(); |
| 6919 | def_rt_bandwidth.rt_runtime = global_rt_runtime(); | 7488 | if (ret) |
| 6920 | def_rt_bandwidth.rt_period = | 7489 | goto undo; |
| 6921 | ns_to_ktime(global_rt_period()); | 7490 | |
| 6922 | } | 7491 | sched_rt_do_global(); |
| 7492 | sched_dl_do_global(); | ||
| 7493 | } | ||
| 7494 | if (0) { | ||
| 7495 | undo: | ||
| 7496 | sysctl_sched_rt_period = old_period; | ||
| 7497 | sysctl_sched_rt_runtime = old_runtime; | ||
| 6923 | } | 7498 | } |
| 6924 | mutex_unlock(&mutex); | 7499 | mutex_unlock(&mutex); |
| 6925 | 7500 | ||
| 6926 | return ret; | 7501 | return ret; |
| 6927 | } | 7502 | } |
| 6928 | 7503 | ||
| 7504 | int sched_rr_handler(struct ctl_table *table, int write, | ||
| 7505 | void __user *buffer, size_t *lenp, | ||
| 7506 | loff_t *ppos) | ||
| 7507 | { | ||
| 7508 | int ret; | ||
| 7509 | static DEFINE_MUTEX(mutex); | ||
| 7510 | |||
| 7511 | mutex_lock(&mutex); | ||
| 7512 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
| 7513 | /* make sure that internally we keep jiffies */ | ||
| 7514 | /* also, writing zero resets timeslice to default */ | ||
| 7515 | if (!ret && write) { | ||
| 7516 | sched_rr_timeslice = sched_rr_timeslice <= 0 ? | ||
| 7517 | RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); | ||
| 7518 | } | ||
| 7519 | mutex_unlock(&mutex); | ||
| 7520 | return ret; | ||
| 7521 | } | ||
| 7522 | |||
| 6929 | #ifdef CONFIG_CGROUP_SCHED | 7523 | #ifdef CONFIG_CGROUP_SCHED |
| 6930 | 7524 | ||
| 6931 | static inline struct task_group *css_tg(struct cgroup_subsys_state *css) | 7525 | static inline struct task_group *css_tg(struct cgroup_subsys_state *css) |
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c new file mode 100644 index 000000000000..045fc74e3f09 --- /dev/null +++ b/kernel/sched/cpudeadline.c | |||
| @@ -0,0 +1,216 @@ | |||
| 1 | /* | ||
| 2 | * kernel/sched/cpudeadline.c | ||
| 3 | * | ||
| 4 | * Global CPU deadline management | ||
| 5 | * | ||
| 6 | * Author: Juri Lelli <j.lelli@sssup.it> | ||
| 7 | * | ||
| 8 | * This program is free software; you can redistribute it and/or | ||
| 9 | * modify it under the terms of the GNU General Public License | ||
| 10 | * as published by the Free Software Foundation; version 2 | ||
| 11 | * of the License. | ||
| 12 | */ | ||
| 13 | |||
| 14 | #include <linux/gfp.h> | ||
| 15 | #include <linux/kernel.h> | ||
| 16 | #include "cpudeadline.h" | ||
| 17 | |||
| 18 | static inline int parent(int i) | ||
| 19 | { | ||
| 20 | return (i - 1) >> 1; | ||
| 21 | } | ||
| 22 | |||
| 23 | static inline int left_child(int i) | ||
| 24 | { | ||
| 25 | return (i << 1) + 1; | ||
| 26 | } | ||
| 27 | |||
| 28 | static inline int right_child(int i) | ||
| 29 | { | ||
| 30 | return (i << 1) + 2; | ||
| 31 | } | ||
| 32 | |||
| 33 | static inline int dl_time_before(u64 a, u64 b) | ||
| 34 | { | ||
| 35 | return (s64)(a - b) < 0; | ||
| 36 | } | ||
| 37 | |||
| 38 | static void cpudl_exchange(struct cpudl *cp, int a, int b) | ||
| 39 | { | ||
| 40 | int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; | ||
| 41 | |||
| 42 | swap(cp->elements[a], cp->elements[b]); | ||
| 43 | swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); | ||
| 44 | } | ||
| 45 | |||
| 46 | static void cpudl_heapify(struct cpudl *cp, int idx) | ||
| 47 | { | ||
| 48 | int l, r, largest; | ||
| 49 | |||
| 50 | /* adapted from lib/prio_heap.c */ | ||
| 51 | while(1) { | ||
| 52 | l = left_child(idx); | ||
| 53 | r = right_child(idx); | ||
| 54 | largest = idx; | ||
| 55 | |||
| 56 | if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, | ||
| 57 | cp->elements[l].dl)) | ||
| 58 | largest = l; | ||
| 59 | if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, | ||
| 60 | cp->elements[r].dl)) | ||
| 61 | largest = r; | ||
| 62 | if (largest == idx) | ||
| 63 | break; | ||
| 64 | |||
| 65 | /* Push idx down the heap one level and bump one up */ | ||
| 66 | cpudl_exchange(cp, largest, idx); | ||
| 67 | idx = largest; | ||
| 68 | } | ||
| 69 | } | ||
| 70 | |||
| 71 | static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) | ||
| 72 | { | ||
| 73 | WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID); | ||
| 74 | |||
| 75 | if (dl_time_before(new_dl, cp->elements[idx].dl)) { | ||
| 76 | cp->elements[idx].dl = new_dl; | ||
| 77 | cpudl_heapify(cp, idx); | ||
| 78 | } else { | ||
| 79 | cp->elements[idx].dl = new_dl; | ||
| 80 | while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, | ||
| 81 | cp->elements[idx].dl)) { | ||
| 82 | cpudl_exchange(cp, idx, parent(idx)); | ||
| 83 | idx = parent(idx); | ||
| 84 | } | ||
| 85 | } | ||
| 86 | } | ||
| 87 | |||
| 88 | static inline int cpudl_maximum(struct cpudl *cp) | ||
| 89 | { | ||
| 90 | return cp->elements[0].cpu; | ||
| 91 | } | ||
| 92 | |||
| 93 | /* | ||
| 94 | * cpudl_find - find the best (later-dl) CPU in the system | ||
| 95 | * @cp: the cpudl max-heap context | ||
| 96 | * @p: the task | ||
| 97 | * @later_mask: a mask to fill in with the selected CPUs (or NULL) | ||
| 98 | * | ||
| 99 | * Returns: int - best CPU (heap maximum if suitable) | ||
| 100 | */ | ||
| 101 | int cpudl_find(struct cpudl *cp, struct task_struct *p, | ||
| 102 | struct cpumask *later_mask) | ||
| 103 | { | ||
| 104 | int best_cpu = -1; | ||
| 105 | const struct sched_dl_entity *dl_se = &p->dl; | ||
| 106 | |||
| 107 | if (later_mask && cpumask_and(later_mask, cp->free_cpus, | ||
| 108 | &p->cpus_allowed) && cpumask_and(later_mask, | ||
| 109 | later_mask, cpu_active_mask)) { | ||
| 110 | best_cpu = cpumask_any(later_mask); | ||
| 111 | goto out; | ||
| 112 | } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && | ||
| 113 | dl_time_before(dl_se->deadline, cp->elements[0].dl)) { | ||
| 114 | best_cpu = cpudl_maximum(cp); | ||
| 115 | if (later_mask) | ||
| 116 | cpumask_set_cpu(best_cpu, later_mask); | ||
| 117 | } | ||
| 118 | |||
| 119 | out: | ||
| 120 | WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1); | ||
| 121 | |||
| 122 | return best_cpu; | ||
| 123 | } | ||
| 124 | |||
| 125 | /* | ||
| 126 | * cpudl_set - update the cpudl max-heap | ||
| 127 | * @cp: the cpudl max-heap context | ||
| 128 | * @cpu: the target cpu | ||
| 129 | * @dl: the new earliest deadline for this cpu | ||
| 130 | * | ||
| 131 | * Notes: assumes cpu_rq(cpu)->lock is locked | ||
| 132 | * | ||
| 133 | * Returns: (void) | ||
| 134 | */ | ||
| 135 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) | ||
| 136 | { | ||
| 137 | int old_idx, new_cpu; | ||
| 138 | unsigned long flags; | ||
| 139 | |||
| 140 | WARN_ON(cpu > num_present_cpus()); | ||
| 141 | |||
| 142 | raw_spin_lock_irqsave(&cp->lock, flags); | ||
| 143 | old_idx = cp->cpu_to_idx[cpu]; | ||
| 144 | if (!is_valid) { | ||
| 145 | /* remove item */ | ||
| 146 | if (old_idx == IDX_INVALID) { | ||
| 147 | /* | ||
| 148 | * Nothing to remove if old_idx was invalid. | ||
| 149 | * This could happen if a rq_offline_dl is | ||
| 150 | * called for a CPU without -dl tasks running. | ||
| 151 | */ | ||
| 152 | goto out; | ||
| 153 | } | ||
| 154 | new_cpu = cp->elements[cp->size - 1].cpu; | ||
| 155 | cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; | ||
| 156 | cp->elements[old_idx].cpu = new_cpu; | ||
| 157 | cp->size--; | ||
| 158 | cp->cpu_to_idx[new_cpu] = old_idx; | ||
| 159 | cp->cpu_to_idx[cpu] = IDX_INVALID; | ||
| 160 | while (old_idx > 0 && dl_time_before( | ||
| 161 | cp->elements[parent(old_idx)].dl, | ||
| 162 | cp->elements[old_idx].dl)) { | ||
| 163 | cpudl_exchange(cp, old_idx, parent(old_idx)); | ||
| 164 | old_idx = parent(old_idx); | ||
| 165 | } | ||
| 166 | cpumask_set_cpu(cpu, cp->free_cpus); | ||
| 167 | cpudl_heapify(cp, old_idx); | ||
| 168 | |||
| 169 | goto out; | ||
| 170 | } | ||
| 171 | |||
| 172 | if (old_idx == IDX_INVALID) { | ||
| 173 | cp->size++; | ||
| 174 | cp->elements[cp->size - 1].dl = 0; | ||
| 175 | cp->elements[cp->size - 1].cpu = cpu; | ||
| 176 | cp->cpu_to_idx[cpu] = cp->size - 1; | ||
| 177 | cpudl_change_key(cp, cp->size - 1, dl); | ||
| 178 | cpumask_clear_cpu(cpu, cp->free_cpus); | ||
| 179 | } else { | ||
| 180 | cpudl_change_key(cp, old_idx, dl); | ||
| 181 | } | ||
| 182 | |||
| 183 | out: | ||
| 184 | raw_spin_unlock_irqrestore(&cp->lock, flags); | ||
| 185 | } | ||
| 186 | |||
| 187 | /* | ||
| 188 | * cpudl_init - initialize the cpudl structure | ||
| 189 | * @cp: the cpudl max-heap context | ||
| 190 | */ | ||
| 191 | int cpudl_init(struct cpudl *cp) | ||
| 192 | { | ||
| 193 | int i; | ||
| 194 | |||
| 195 | memset(cp, 0, sizeof(*cp)); | ||
| 196 | raw_spin_lock_init(&cp->lock); | ||
| 197 | cp->size = 0; | ||
| 198 | for (i = 0; i < NR_CPUS; i++) | ||
| 199 | cp->cpu_to_idx[i] = IDX_INVALID; | ||
| 200 | if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) | ||
| 201 | return -ENOMEM; | ||
| 202 | cpumask_setall(cp->free_cpus); | ||
| 203 | |||
| 204 | return 0; | ||
| 205 | } | ||
| 206 | |||
| 207 | /* | ||
| 208 | * cpudl_cleanup - clean up the cpudl structure | ||
| 209 | * @cp: the cpudl max-heap context | ||
| 210 | */ | ||
| 211 | void cpudl_cleanup(struct cpudl *cp) | ||
| 212 | { | ||
| 213 | /* | ||
| 214 | * nothing to do for the moment | ||
| 215 | */ | ||
| 216 | } | ||
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h new file mode 100644 index 000000000000..a202789a412c --- /dev/null +++ b/kernel/sched/cpudeadline.h | |||
| @@ -0,0 +1,33 @@ | |||
| 1 | #ifndef _LINUX_CPUDL_H | ||
| 2 | #define _LINUX_CPUDL_H | ||
| 3 | |||
| 4 | #include <linux/sched.h> | ||
| 5 | |||
| 6 | #define IDX_INVALID -1 | ||
| 7 | |||
| 8 | struct array_item { | ||
| 9 | u64 dl; | ||
| 10 | int cpu; | ||
| 11 | }; | ||
| 12 | |||
| 13 | struct cpudl { | ||
| 14 | raw_spinlock_t lock; | ||
| 15 | int size; | ||
| 16 | int cpu_to_idx[NR_CPUS]; | ||
| 17 | struct array_item elements[NR_CPUS]; | ||
| 18 | cpumask_var_t free_cpus; | ||
| 19 | }; | ||
| 20 | |||
| 21 | |||
| 22 | #ifdef CONFIG_SMP | ||
| 23 | int cpudl_find(struct cpudl *cp, struct task_struct *p, | ||
| 24 | struct cpumask *later_mask); | ||
| 25 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); | ||
| 26 | int cpudl_init(struct cpudl *cp); | ||
| 27 | void cpudl_cleanup(struct cpudl *cp); | ||
| 28 | #else | ||
| 29 | #define cpudl_set(cp, cpu, dl) do { } while (0) | ||
| 30 | #define cpudl_init() do { } while (0) | ||
| 31 | #endif /* CONFIG_SMP */ | ||
| 32 | |||
| 33 | #endif /* _LINUX_CPUDL_H */ | ||
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c new file mode 100644 index 000000000000..0de248202879 --- /dev/null +++ b/kernel/sched/deadline.c | |||
| @@ -0,0 +1,1640 @@ | |||
| 1 | /* | ||
| 2 | * Deadline Scheduling Class (SCHED_DEADLINE) | ||
| 3 | * | ||
| 4 | * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS). | ||
| 5 | * | ||
| 6 | * Tasks that periodically execute their instances for less than their | ||
| 7 | * runtime won't miss any of their deadlines. | ||
| 8 | * Tasks that are not periodic or sporadic or that try to execute more | ||
| 9 | * than their reserved bandwidth will be slowed down (and may potentially | ||
| 10 | * miss some of their deadlines), and won't affect any other task. | ||
| 11 | * | ||
| 12 | * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>, | ||
| 13 | * Juri Lelli <juri.lelli@gmail.com>, | ||
| 14 | * Michael Trimarchi <michael@amarulasolutions.com>, | ||
| 15 | * Fabio Checconi <fchecconi@gmail.com> | ||
| 16 | */ | ||
| 17 | #include "sched.h" | ||
| 18 | |||
| 19 | #include <linux/slab.h> | ||
| 20 | |||
| 21 | struct dl_bandwidth def_dl_bandwidth; | ||
| 22 | |||
| 23 | static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) | ||
| 24 | { | ||
| 25 | return container_of(dl_se, struct task_struct, dl); | ||
| 26 | } | ||
| 27 | |||
| 28 | static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq) | ||
| 29 | { | ||
| 30 | return container_of(dl_rq, struct rq, dl); | ||
| 31 | } | ||
| 32 | |||
| 33 | static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) | ||
| 34 | { | ||
| 35 | struct task_struct *p = dl_task_of(dl_se); | ||
| 36 | struct rq *rq = task_rq(p); | ||
| 37 | |||
| 38 | return &rq->dl; | ||
| 39 | } | ||
| 40 | |||
| 41 | static inline int on_dl_rq(struct sched_dl_entity *dl_se) | ||
| 42 | { | ||
| 43 | return !RB_EMPTY_NODE(&dl_se->rb_node); | ||
| 44 | } | ||
| 45 | |||
| 46 | static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) | ||
| 47 | { | ||
| 48 | struct sched_dl_entity *dl_se = &p->dl; | ||
| 49 | |||
| 50 | return dl_rq->rb_leftmost == &dl_se->rb_node; | ||
| 51 | } | ||
| 52 | |||
| 53 | void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) | ||
| 54 | { | ||
| 55 | raw_spin_lock_init(&dl_b->dl_runtime_lock); | ||
| 56 | dl_b->dl_period = period; | ||
| 57 | dl_b->dl_runtime = runtime; | ||
| 58 | } | ||
| 59 | |||
| 60 | extern unsigned long to_ratio(u64 period, u64 runtime); | ||
| 61 | |||
| 62 | void init_dl_bw(struct dl_bw *dl_b) | ||
| 63 | { | ||
| 64 | raw_spin_lock_init(&dl_b->lock); | ||
| 65 | raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock); | ||
| 66 | if (global_rt_runtime() == RUNTIME_INF) | ||
| 67 | dl_b->bw = -1; | ||
| 68 | else | ||
| 69 | dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime()); | ||
| 70 | raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock); | ||
| 71 | dl_b->total_bw = 0; | ||
| 72 | } | ||
| 73 | |||
| 74 | void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) | ||
| 75 | { | ||
| 76 | dl_rq->rb_root = RB_ROOT; | ||
| 77 | |||
| 78 | #ifdef CONFIG_SMP | ||
| 79 | /* zero means no -deadline tasks */ | ||
| 80 | dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; | ||
| 81 | |||
| 82 | dl_rq->dl_nr_migratory = 0; | ||
| 83 | dl_rq->overloaded = 0; | ||
| 84 | dl_rq->pushable_dl_tasks_root = RB_ROOT; | ||
| 85 | #else | ||
| 86 | init_dl_bw(&dl_rq->dl_bw); | ||
| 87 | #endif | ||
| 88 | } | ||
| 89 | |||
| 90 | #ifdef CONFIG_SMP | ||
| 91 | |||
| 92 | static inline int dl_overloaded(struct rq *rq) | ||
| 93 | { | ||
| 94 | return atomic_read(&rq->rd->dlo_count); | ||
| 95 | } | ||
| 96 | |||
| 97 | static inline void dl_set_overload(struct rq *rq) | ||
| 98 | { | ||
| 99 | if (!rq->online) | ||
| 100 | return; | ||
| 101 | |||
| 102 | cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask); | ||
| 103 | /* | ||
| 104 | * Must be visible before the overload count is | ||
| 105 | * set (as in sched_rt.c). | ||
| 106 | * | ||
| 107 | * Matched by the barrier in pull_dl_task(). | ||
| 108 | */ | ||
| 109 | smp_wmb(); | ||
| 110 | atomic_inc(&rq->rd->dlo_count); | ||
| 111 | } | ||
| 112 | |||
| 113 | static inline void dl_clear_overload(struct rq *rq) | ||
| 114 | { | ||
| 115 | if (!rq->online) | ||
| 116 | return; | ||
| 117 | |||
| 118 | atomic_dec(&rq->rd->dlo_count); | ||
| 119 | cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask); | ||
| 120 | } | ||
| 121 | |||
| 122 | static void update_dl_migration(struct dl_rq *dl_rq) | ||
| 123 | { | ||
| 124 | if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) { | ||
| 125 | if (!dl_rq->overloaded) { | ||
| 126 | dl_set_overload(rq_of_dl_rq(dl_rq)); | ||
| 127 | dl_rq->overloaded = 1; | ||
| 128 | } | ||
| 129 | } else if (dl_rq->overloaded) { | ||
| 130 | dl_clear_overload(rq_of_dl_rq(dl_rq)); | ||
| 131 | dl_rq->overloaded = 0; | ||
| 132 | } | ||
| 133 | } | ||
| 134 | |||
| 135 | static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
| 136 | { | ||
| 137 | struct task_struct *p = dl_task_of(dl_se); | ||
| 138 | dl_rq = &rq_of_dl_rq(dl_rq)->dl; | ||
| 139 | |||
| 140 | dl_rq->dl_nr_total++; | ||
| 141 | if (p->nr_cpus_allowed > 1) | ||
| 142 | dl_rq->dl_nr_migratory++; | ||
| 143 | |||
| 144 | update_dl_migration(dl_rq); | ||
| 145 | } | ||
| 146 | |||
| 147 | static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
| 148 | { | ||
| 149 | struct task_struct *p = dl_task_of(dl_se); | ||
| 150 | dl_rq = &rq_of_dl_rq(dl_rq)->dl; | ||
| 151 | |||
| 152 | dl_rq->dl_nr_total--; | ||
| 153 | if (p->nr_cpus_allowed > 1) | ||
| 154 | dl_rq->dl_nr_migratory--; | ||
| 155 | |||
| 156 | update_dl_migration(dl_rq); | ||
| 157 | } | ||
| 158 | |||
| 159 | /* | ||
| 160 | * The list of pushable -deadline tasks is not a plist, as in | ||
| 161 | * sched_rt.c; it is an rb-tree with tasks ordered by deadline. | ||
| 162 | */ | ||
| 163 | static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) | ||
| 164 | { | ||
| 165 | struct dl_rq *dl_rq = &rq->dl; | ||
| 166 | struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node; | ||
| 167 | struct rb_node *parent = NULL; | ||
| 168 | struct task_struct *entry; | ||
| 169 | int leftmost = 1; | ||
| 170 | |||
| 171 | BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); | ||
| 172 | |||
| 173 | while (*link) { | ||
| 174 | parent = *link; | ||
| 175 | entry = rb_entry(parent, struct task_struct, | ||
| 176 | pushable_dl_tasks); | ||
| 177 | if (dl_entity_preempt(&p->dl, &entry->dl)) | ||
| 178 | link = &parent->rb_left; | ||
| 179 | else { | ||
| 180 | link = &parent->rb_right; | ||
| 181 | leftmost = 0; | ||
| 182 | } | ||
| 183 | } | ||
| 184 | |||
| 185 | if (leftmost) | ||
| 186 | dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; | ||
| 187 | |||
| 188 | rb_link_node(&p->pushable_dl_tasks, parent, link); | ||
| 189 | rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); | ||
| 190 | } | ||
| 191 | |||
| 192 | static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) | ||
| 193 | { | ||
| 194 | struct dl_rq *dl_rq = &rq->dl; | ||
| 195 | |||
| 196 | if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) | ||
| 197 | return; | ||
| 198 | |||
| 199 | if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) { | ||
| 200 | struct rb_node *next_node; | ||
| 201 | |||
| 202 | next_node = rb_next(&p->pushable_dl_tasks); | ||
| 203 | dl_rq->pushable_dl_tasks_leftmost = next_node; | ||
| 204 | } | ||
| 205 | |||
| 206 | rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); | ||
| 207 | RB_CLEAR_NODE(&p->pushable_dl_tasks); | ||
| 208 | } | ||
| 209 | |||
| 210 | static inline int has_pushable_dl_tasks(struct rq *rq) | ||
| 211 | { | ||
| 212 | return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root); | ||
| 213 | } | ||
| 214 | |||
| 215 | static int push_dl_task(struct rq *rq); | ||
| 216 | |||
| 217 | #else | ||
| 218 | |||
| 219 | static inline | ||
| 220 | void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) | ||
| 221 | { | ||
| 222 | } | ||
| 223 | |||
| 224 | static inline | ||
| 225 | void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) | ||
| 226 | { | ||
| 227 | } | ||
| 228 | |||
| 229 | static inline | ||
| 230 | void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
| 231 | { | ||
| 232 | } | ||
| 233 | |||
| 234 | static inline | ||
| 235 | void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
| 236 | { | ||
| 237 | } | ||
| 238 | |||
| 239 | #endif /* CONFIG_SMP */ | ||
| 240 | |||
| 241 | static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); | ||
| 242 | static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); | ||
| 243 | static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | ||
| 244 | int flags); | ||
| 245 | |||
| 246 | /* | ||
| 247 | * We are being explicitly informed that a new instance is starting, | ||
| 248 | * and this means that: | ||
| 249 | * - the absolute deadline of the entity has to be placed at | ||
| 250 | * current time + relative deadline; | ||
| 251 | * - the runtime of the entity has to be set to the maximum value. | ||
| 252 | * | ||
| 253 | * The capability of specifying such event is useful whenever a -deadline | ||
| 254 | * entity wants to (try to!) synchronize its behaviour with the scheduler's | ||
| 255 | * one, and to (try to!) reconcile itself with its own scheduling | ||
| 256 | * parameters. | ||
| 257 | */ | ||
| 258 | static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, | ||
| 259 | struct sched_dl_entity *pi_se) | ||
| 260 | { | ||
| 261 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
| 262 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
| 263 | |||
| 264 | WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); | ||
| 265 | |||
| 266 | /* | ||
| 267 | * We use the regular wall clock time to set deadlines in the | ||
| 268 | * future; in fact, we must consider execution overheads (time | ||
| 269 | * spent on hardirq context, etc.). | ||
| 270 | */ | ||
| 271 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | ||
| 272 | dl_se->runtime = pi_se->dl_runtime; | ||
| 273 | dl_se->dl_new = 0; | ||
| 274 | } | ||
| 275 | |||
| 276 | /* | ||
| 277 | * Pure Earliest Deadline First (EDF) scheduling does not deal with the | ||
| 278 | * possibility of an entity lasting more than what it declared, and thus | ||
| 279 | * exhausting its runtime. | ||
| 280 | * | ||
| 281 | * Here we are interested in making runtime overrun possible, but we do | ||
| 282 | * not want an entity which is misbehaving to affect the scheduling of all | ||
| 283 | * other entities. | ||
| 284 | * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS) | ||
| 285 | * is used, in order to confine each entity within its own bandwidth. | ||
| 286 | * | ||
| 287 | * This function deals exactly with that, and ensures that when the runtime | ||
| 288 | * of an entity is replenished, its deadline is also postponed. That ensures | ||
| 289 | * the overrunning entity can't interfere with other entities in the system and | ||
| 290 | * can't make them miss their deadlines. Reasons why this kind of overrun | ||
| 291 | * could happen are, typically, an entity voluntarily trying to exceed its | ||
| 292 | * runtime, or it just underestimated it during sched_setscheduler_ex(). | ||
| 293 | */ | ||
| 294 | static void replenish_dl_entity(struct sched_dl_entity *dl_se, | ||
| 295 | struct sched_dl_entity *pi_se) | ||
| 296 | { | ||
| 297 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
| 298 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
| 299 | |||
| 300 | BUG_ON(pi_se->dl_runtime <= 0); | ||
| 301 | |||
| 302 | /* | ||
| 303 | * This could be the case for a !-dl task that is boosted. | ||
| 304 | * Just go with full inherited parameters. | ||
| 305 | */ | ||
| 306 | if (dl_se->dl_deadline == 0) { | ||
| 307 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | ||
| 308 | dl_se->runtime = pi_se->dl_runtime; | ||
| 309 | } | ||
| 310 | |||
| 311 | /* | ||
| 312 | * We keep moving the deadline away until we get some | ||
| 313 | * available runtime for the entity. This ensures correct | ||
| 314 | * handling of situations where the runtime overrun is | ||
| 315 | * arbitrarily large. | ||
| 316 | */ | ||
| 317 | while (dl_se->runtime <= 0) { | ||
| 318 | dl_se->deadline += pi_se->dl_period; | ||
| 319 | dl_se->runtime += pi_se->dl_runtime; | ||
| 320 | } | ||
| 321 | |||
| 322 | /* | ||
| 323 | * At this point, the deadline really should be "in | ||
| 324 | * the future" with respect to rq->clock. If it's | ||
| 325 | * not, we are, for some reason, lagging too much! | ||
| 326 | * Anyway, after having warned userspace about that, | ||
| 327 | * we still try to keep the things running by | ||
| 328 | * resetting the deadline and the budget of the | ||
| 329 | * entity. | ||
| 330 | */ | ||
| 331 | if (dl_time_before(dl_se->deadline, rq_clock(rq))) { | ||
| 332 | static bool lag_once = false; | ||
| 333 | |||
| 334 | if (!lag_once) { | ||
| 335 | lag_once = true; | ||
| 336 | printk_sched("sched: DL replenish lagged to much\n"); | ||
| 337 | } | ||
| 338 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | ||
| 339 | dl_se->runtime = pi_se->dl_runtime; | ||
| 340 | } | ||
| 341 | } | ||
| 342 | |||
| 343 | /* | ||
| 344 | * Here we check if --at time t-- an entity (which is probably being | ||
| 345 | * [re]activated or, in general, enqueued) can use its remaining runtime | ||
| 346 | * and its current deadline _without_ exceeding the bandwidth it is | ||
| 347 | * assigned (function returns true if it can't). We are in fact applying | ||
| 348 | * one of the CBS rules: when a task wakes up, if the residual runtime | ||
| 349 | * over residual deadline fits within the allocated bandwidth, then we | ||
| 350 | * can keep the current (absolute) deadline and residual budget without | ||
| 351 | * disrupting the schedulability of the system. Otherwise, we should | ||
| 352 | * refill the runtime and set the deadline a period in the future, | ||
| 353 | * because keeping the current (absolute) deadline of the task would | ||
| 354 | * result in breaking guarantees promised to other tasks. | ||
| 355 | * | ||
| 356 | * This function returns true if: | ||
| 357 | * | ||
| 358 | * runtime / (deadline - t) > dl_runtime / dl_period , | ||
| 359 | * | ||
| 360 | * IOW we can't recycle current parameters. | ||
| 361 | * | ||
| 362 | * Notice that the bandwidth check is done against the period. For | ||
| 363 | * tasks with deadline equal to period this is the same as using | ||
| 364 | * dl_deadline instead of dl_period in the equation above. | ||
| 365 | */ | ||
| 366 | static bool dl_entity_overflow(struct sched_dl_entity *dl_se, | ||
| 367 | struct sched_dl_entity *pi_se, u64 t) | ||
| 368 | { | ||
| 369 | u64 left, right; | ||
| 370 | |||
| 371 | /* | ||
| 372 | * left and right are the two sides of the equation above, | ||
| 373 | * after a bit of shuffling to use multiplications instead | ||
| 374 | * of divisions. | ||
| 375 | * | ||
| 376 | * Note that none of the time values involved in the two | ||
| 377 | * multiplications are absolute: dl_deadline and dl_runtime | ||
| 378 | * are the relative deadline and the maximum runtime of each | ||
| 379 | * instance, runtime is the runtime left for the last instance | ||
| 380 | * and (deadline - t), since t is rq->clock, is the time left | ||
| 381 | * to the (absolute) deadline. Even if overflowing the u64 type | ||
| 382 | * is very unlikely to occur in both cases, here we scale down | ||
| 383 | * as we want to avoid that risk at all. Scaling down by 10 | ||
| 384 | * means that we reduce granularity to 1us. We are fine with it, | ||
| 385 | * since this is only a true/false check and, anyway, thinking | ||
| 386 | * of anything below microseconds resolution is actually fiction | ||
| 387 | * (but still we want to give the user that illusion >;). | ||
| 388 | */ | ||
| 389 | left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); | ||
| 390 | right = ((dl_se->deadline - t) >> DL_SCALE) * | ||
| 391 | (pi_se->dl_runtime >> DL_SCALE); | ||
| 392 | |||
| 393 | return dl_time_before(right, left); | ||
| 394 | } | ||
| 395 | |||
| 396 | /* | ||
| 397 | * When a -deadline entity is queued back on the runqueue, its runtime and | ||
| 398 | * deadline might need updating. | ||
| 399 | * | ||
| 400 | * The policy here is that we update the deadline of the entity only if either: | ||
| 401 | * - the current deadline is in the past, or | ||
| 402 | * - using the remaining runtime with the current deadline would make | ||
| 403 | * the entity exceed its bandwidth. | ||
| 404 | */ | ||
| 405 | static void update_dl_entity(struct sched_dl_entity *dl_se, | ||
| 406 | struct sched_dl_entity *pi_se) | ||
| 407 | { | ||
| 408 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
| 409 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
| 410 | |||
| 411 | /* | ||
| 412 | * The arrival of a new instance needs special treatment, i.e., | ||
| 413 | * the actual scheduling parameters have to be "renewed". | ||
| 414 | */ | ||
| 415 | if (dl_se->dl_new) { | ||
| 416 | setup_new_dl_entity(dl_se, pi_se); | ||
| 417 | return; | ||
| 418 | } | ||
| 419 | |||
| 420 | if (dl_time_before(dl_se->deadline, rq_clock(rq)) || | ||
| 421 | dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { | ||
| 422 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | ||
| 423 | dl_se->runtime = pi_se->dl_runtime; | ||
| 424 | } | ||
| 425 | } | ||
| 426 | |||
| 427 | /* | ||
| 428 | * If the entity depleted all its runtime, and if we want it to sleep | ||
| 429 | * while waiting for some new execution time to become available, we | ||
| 430 | * set the bandwidth enforcement timer to the replenishment instant | ||
| 431 | * and try to activate it. | ||
| 432 | * | ||
| 433 | * Notice that it is important for the caller to know if the timer | ||
| 434 | * actually started or not (i.e., the replenishment instant is in | ||
| 435 | * the future or in the past). | ||
| 436 | */ | ||
| 437 | static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) | ||
| 438 | { | ||
| 439 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
| 440 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
| 441 | ktime_t now, act; | ||
| 442 | ktime_t soft, hard; | ||
| 443 | unsigned long range; | ||
| 444 | s64 delta; | ||
| 445 | |||
| 446 | if (boosted) | ||
| 447 | return 0; | ||
| 448 | /* | ||
| 449 | * We want the timer to fire at the deadline, but considering | ||
| 450 | * that it is actually coming from rq->clock and not from | ||
| 451 | * hrtimer's time base reading. | ||
| 452 | */ | ||
| 453 | act = ns_to_ktime(dl_se->deadline); | ||
| 454 | now = hrtimer_cb_get_time(&dl_se->dl_timer); | ||
| 455 | delta = ktime_to_ns(now) - rq_clock(rq); | ||
| 456 | act = ktime_add_ns(act, delta); | ||
| 457 | |||
| 458 | /* | ||
| 459 | * If the expiry time already passed, e.g., because the value | ||
| 460 | * chosen as the deadline is too small, don't even try to | ||
| 461 | * start the timer in the past! | ||
| 462 | */ | ||
| 463 | if (ktime_us_delta(act, now) < 0) | ||
| 464 | return 0; | ||
| 465 | |||
| 466 | hrtimer_set_expires(&dl_se->dl_timer, act); | ||
| 467 | |||
| 468 | soft = hrtimer_get_softexpires(&dl_se->dl_timer); | ||
| 469 | hard = hrtimer_get_expires(&dl_se->dl_timer); | ||
| 470 | range = ktime_to_ns(ktime_sub(hard, soft)); | ||
| 471 | __hrtimer_start_range_ns(&dl_se->dl_timer, soft, | ||
| 472 | range, HRTIMER_MODE_ABS, 0); | ||
| 473 | |||
| 474 | return hrtimer_active(&dl_se->dl_timer); | ||
| 475 | } | ||
| 476 | |||
| 477 | /* | ||
| 478 | * This is the bandwidth enforcement timer callback. If here, we know | ||
| 479 | * a task is not on its dl_rq, since the fact that the timer was running | ||
| 480 | * means the task is throttled and needs a runtime replenishment. | ||
| 481 | * | ||
| 482 | * However, what we actually do depends on whether the task is active | ||
| 483 | * (it is on its rq) or has been removed from there by a call to | ||
| 484 | * dequeue_task_dl(). In the former case we must issue the runtime | ||
| 485 | * replenishment and add the task back to the dl_rq; in the latter, we | ||
| 486 | * do nothing but clear dl_throttled, so that runtime and deadline | ||
| 487 | * updating (and the queueing back to dl_rq) will be done by the | ||
| 488 | * next call to enqueue_task_dl(). | ||
| 489 | */ | ||
| 490 | static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | ||
| 491 | { | ||
| 492 | struct sched_dl_entity *dl_se = container_of(timer, | ||
| 493 | struct sched_dl_entity, | ||
| 494 | dl_timer); | ||
| 495 | struct task_struct *p = dl_task_of(dl_se); | ||
| 496 | struct rq *rq = task_rq(p); | ||
| 497 | raw_spin_lock(&rq->lock); | ||
| 498 | |||
| 499 | /* | ||
| 500 | * We need to take care of possible races here. In fact, the | ||
| 501 | * task might have changed its scheduling policy to something | ||
| 502 | * different from SCHED_DEADLINE or changed its reservation | ||
| 503 | * parameters (through sched_setscheduler()). | ||
| 504 | */ | ||
| 505 | if (!dl_task(p) || dl_se->dl_new) | ||
| 506 | goto unlock; | ||
| 507 | |||
| 508 | sched_clock_tick(); | ||
| 509 | update_rq_clock(rq); | ||
| 510 | dl_se->dl_throttled = 0; | ||
| 511 | if (p->on_rq) { | ||
| 512 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | ||
| 513 | if (task_has_dl_policy(rq->curr)) | ||
| 514 | check_preempt_curr_dl(rq, p, 0); | ||
| 515 | else | ||
| 516 | resched_task(rq->curr); | ||
| 517 | #ifdef CONFIG_SMP | ||
| 518 | /* | ||
| 519 | * Queueing this task back might have overloaded rq, | ||
| 520 | * check if we need to kick someone away. | ||
| 521 | */ | ||
| 522 | if (has_pushable_dl_tasks(rq)) | ||
| 523 | push_dl_task(rq); | ||
| 524 | #endif | ||
| 525 | } | ||
| 526 | unlock: | ||
| 527 | raw_spin_unlock(&rq->lock); | ||
| 528 | |||
| 529 | return HRTIMER_NORESTART; | ||
| 530 | } | ||
| 531 | |||
| 532 | void init_dl_task_timer(struct sched_dl_entity *dl_se) | ||
| 533 | { | ||
| 534 | struct hrtimer *timer = &dl_se->dl_timer; | ||
| 535 | |||
| 536 | if (hrtimer_active(timer)) { | ||
| 537 | hrtimer_try_to_cancel(timer); | ||
| 538 | return; | ||
| 539 | } | ||
| 540 | |||
| 541 | hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 542 | timer->function = dl_task_timer; | ||
| 543 | } | ||
| 544 | |||
| 545 | static | ||
| 546 | int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) | ||
| 547 | { | ||
| 548 | int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); | ||
| 549 | int rorun = dl_se->runtime <= 0; | ||
| 550 | |||
| 551 | if (!rorun && !dmiss) | ||
| 552 | return 0; | ||
| 553 | |||
| 554 | /* | ||
| 555 | * If we are beyond our current deadline and we are still | ||
| 556 | * executing, then we have already used some of the runtime of | ||
| 557 | * the next instance. Thus, if we do not account that, we are | ||
| 558 | * stealing bandwidth from the system at each deadline miss! | ||
| 559 | */ | ||
| 560 | if (dmiss) { | ||
| 561 | dl_se->runtime = rorun ? dl_se->runtime : 0; | ||
| 562 | dl_se->runtime -= rq_clock(rq) - dl_se->deadline; | ||
| 563 | } | ||
| 564 | |||
| 565 | return 1; | ||
| 566 | } | ||
| 567 | |||
| 568 | /* | ||
| 569 | * Update the current task's runtime statistics (provided it is still | ||
| 570 | * a -deadline task and has not been removed from the dl_rq). | ||
| 571 | */ | ||
| 572 | static void update_curr_dl(struct rq *rq) | ||
| 573 | { | ||
| 574 | struct task_struct *curr = rq->curr; | ||
| 575 | struct sched_dl_entity *dl_se = &curr->dl; | ||
| 576 | u64 delta_exec; | ||
| 577 | |||
| 578 | if (!dl_task(curr) || !on_dl_rq(dl_se)) | ||
| 579 | return; | ||
| 580 | |||
| 581 | /* | ||
| 582 | * Consumed budget is computed considering the time as | ||
| 583 | * observed by schedulable tasks (excluding time spent | ||
| 584 | * in hardirq context, etc.). Deadlines are instead | ||
| 585 | * computed using hard walltime. This seems to be the more | ||
| 586 | * natural solution, but the full ramifications of this | ||
| 587 | * approach need further study. | ||
| 588 | */ | ||
| 589 | delta_exec = rq_clock_task(rq) - curr->se.exec_start; | ||
| 590 | if (unlikely((s64)delta_exec < 0)) | ||
| 591 | delta_exec = 0; | ||
| 592 | |||
| 593 | schedstat_set(curr->se.statistics.exec_max, | ||
| 594 | max(curr->se.statistics.exec_max, delta_exec)); | ||
| 595 | |||
| 596 | curr->se.sum_exec_runtime += delta_exec; | ||
| 597 | account_group_exec_runtime(curr, delta_exec); | ||
| 598 | |||
| 599 | curr->se.exec_start = rq_clock_task(rq); | ||
| 600 | cpuacct_charge(curr, delta_exec); | ||
| 601 | |||
| 602 | sched_rt_avg_update(rq, delta_exec); | ||
| 603 | |||
| 604 | dl_se->runtime -= delta_exec; | ||
| 605 | if (dl_runtime_exceeded(rq, dl_se)) { | ||
| 606 | __dequeue_task_dl(rq, curr, 0); | ||
| 607 | if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) | ||
| 608 | dl_se->dl_throttled = 1; | ||
| 609 | else | ||
| 610 | enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); | ||
| 611 | |||
| 612 | if (!is_leftmost(curr, &rq->dl)) | ||
| 613 | resched_task(curr); | ||
| 614 | } | ||
| 615 | |||
| 616 | /* | ||
| 617 | * Because -- for now -- we share the rt bandwidth, we need to | ||
| 618 | * account our runtime there too, otherwise actual rt tasks | ||
| 619 | * would be able to exceed the shared quota. | ||
| 620 | * | ||
| 621 | * Account to the root rt group for now. | ||
| 622 | * | ||
| 623 | * The solution we're working towards is having the RT groups scheduled | ||
| 624 | * using deadline servers -- however there's a few nasties to figure | ||
| 625 | * out before that can happen. | ||
| 626 | */ | ||
| 627 | if (rt_bandwidth_enabled()) { | ||
| 628 | struct rt_rq *rt_rq = &rq->rt; | ||
| 629 | |||
| 630 | raw_spin_lock(&rt_rq->rt_runtime_lock); | ||
| 631 | rt_rq->rt_time += delta_exec; | ||
| 632 | /* | ||
| 633 | * We'll let actual RT tasks worry about the overflow here, we | ||
| 634 | * have our own CBS to keep us inline -- see above. | ||
| 635 | */ | ||
| 636 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 637 | } | ||
| 638 | } | ||
| 639 | |||
| 640 | #ifdef CONFIG_SMP | ||
| 641 | |||
| 642 | static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu); | ||
| 643 | |||
| 644 | static inline u64 next_deadline(struct rq *rq) | ||
| 645 | { | ||
| 646 | struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu); | ||
| 647 | |||
| 648 | if (next && dl_prio(next->prio)) | ||
| 649 | return next->dl.deadline; | ||
| 650 | else | ||
| 651 | return 0; | ||
| 652 | } | ||
| 653 | |||
| 654 | static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) | ||
| 655 | { | ||
| 656 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
| 657 | |||
| 658 | if (dl_rq->earliest_dl.curr == 0 || | ||
| 659 | dl_time_before(deadline, dl_rq->earliest_dl.curr)) { | ||
| 660 | /* | ||
| 661 | * If the dl_rq had no -deadline tasks, or if the new task | ||
| 662 | * has shorter deadline than the current one on dl_rq, we | ||
| 663 | * know that the previous earliest becomes our next earliest, | ||
| 664 | * as the new task becomes the earliest itself. | ||
| 665 | */ | ||
| 666 | dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr; | ||
| 667 | dl_rq->earliest_dl.curr = deadline; | ||
| 668 | cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); | ||
| 669 | } else if (dl_rq->earliest_dl.next == 0 || | ||
| 670 | dl_time_before(deadline, dl_rq->earliest_dl.next)) { | ||
| 671 | /* | ||
| 672 | * On the other hand, if the new -deadline task has a | ||
| 673 | * a later deadline than the earliest one on dl_rq, but | ||
| 674 | * it is earlier than the next (if any), we must | ||
| 675 | * recompute the next-earliest. | ||
| 676 | */ | ||
| 677 | dl_rq->earliest_dl.next = next_deadline(rq); | ||
| 678 | } | ||
| 679 | } | ||
| 680 | |||
| 681 | static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) | ||
| 682 | { | ||
| 683 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
| 684 | |||
| 685 | /* | ||
| 686 | * Since we may have removed our earliest (and/or next earliest) | ||
| 687 | * task we must recompute them. | ||
| 688 | */ | ||
| 689 | if (!dl_rq->dl_nr_running) { | ||
| 690 | dl_rq->earliest_dl.curr = 0; | ||
| 691 | dl_rq->earliest_dl.next = 0; | ||
| 692 | cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); | ||
| 693 | } else { | ||
| 694 | struct rb_node *leftmost = dl_rq->rb_leftmost; | ||
| 695 | struct sched_dl_entity *entry; | ||
| 696 | |||
| 697 | entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); | ||
| 698 | dl_rq->earliest_dl.curr = entry->deadline; | ||
| 699 | dl_rq->earliest_dl.next = next_deadline(rq); | ||
| 700 | cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); | ||
| 701 | } | ||
| 702 | } | ||
| 703 | |||
| 704 | #else | ||
| 705 | |||
| 706 | static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} | ||
| 707 | static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} | ||
| 708 | |||
| 709 | #endif /* CONFIG_SMP */ | ||
| 710 | |||
| 711 | static inline | ||
| 712 | void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
| 713 | { | ||
| 714 | int prio = dl_task_of(dl_se)->prio; | ||
| 715 | u64 deadline = dl_se->deadline; | ||
| 716 | |||
| 717 | WARN_ON(!dl_prio(prio)); | ||
| 718 | dl_rq->dl_nr_running++; | ||
| 719 | |||
| 720 | inc_dl_deadline(dl_rq, deadline); | ||
| 721 | inc_dl_migration(dl_se, dl_rq); | ||
| 722 | } | ||
| 723 | |||
| 724 | static inline | ||
| 725 | void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
| 726 | { | ||
| 727 | int prio = dl_task_of(dl_se)->prio; | ||
| 728 | |||
| 729 | WARN_ON(!dl_prio(prio)); | ||
| 730 | WARN_ON(!dl_rq->dl_nr_running); | ||
| 731 | dl_rq->dl_nr_running--; | ||
| 732 | |||
| 733 | dec_dl_deadline(dl_rq, dl_se->deadline); | ||
| 734 | dec_dl_migration(dl_se, dl_rq); | ||
| 735 | } | ||
| 736 | |||
| 737 | static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) | ||
| 738 | { | ||
| 739 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
| 740 | struct rb_node **link = &dl_rq->rb_root.rb_node; | ||
| 741 | struct rb_node *parent = NULL; | ||
| 742 | struct sched_dl_entity *entry; | ||
| 743 | int leftmost = 1; | ||
| 744 | |||
| 745 | BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node)); | ||
| 746 | |||
| 747 | while (*link) { | ||
| 748 | parent = *link; | ||
| 749 | entry = rb_entry(parent, struct sched_dl_entity, rb_node); | ||
| 750 | if (dl_time_before(dl_se->deadline, entry->deadline)) | ||
| 751 | link = &parent->rb_left; | ||
| 752 | else { | ||
| 753 | link = &parent->rb_right; | ||
| 754 | leftmost = 0; | ||
| 755 | } | ||
| 756 | } | ||
| 757 | |||
| 758 | if (leftmost) | ||
| 759 | dl_rq->rb_leftmost = &dl_se->rb_node; | ||
| 760 | |||
| 761 | rb_link_node(&dl_se->rb_node, parent, link); | ||
| 762 | rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root); | ||
| 763 | |||
| 764 | inc_dl_tasks(dl_se, dl_rq); | ||
| 765 | } | ||
| 766 | |||
| 767 | static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) | ||
| 768 | { | ||
| 769 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
| 770 | |||
| 771 | if (RB_EMPTY_NODE(&dl_se->rb_node)) | ||
| 772 | return; | ||
| 773 | |||
| 774 | if (dl_rq->rb_leftmost == &dl_se->rb_node) { | ||
| 775 | struct rb_node *next_node; | ||
| 776 | |||
| 777 | next_node = rb_next(&dl_se->rb_node); | ||
| 778 | dl_rq->rb_leftmost = next_node; | ||
| 779 | } | ||
| 780 | |||
| 781 | rb_erase(&dl_se->rb_node, &dl_rq->rb_root); | ||
| 782 | RB_CLEAR_NODE(&dl_se->rb_node); | ||
| 783 | |||
| 784 | dec_dl_tasks(dl_se, dl_rq); | ||
| 785 | } | ||
| 786 | |||
| 787 | static void | ||
| 788 | enqueue_dl_entity(struct sched_dl_entity *dl_se, | ||
| 789 | struct sched_dl_entity *pi_se, int flags) | ||
| 790 | { | ||
| 791 | BUG_ON(on_dl_rq(dl_se)); | ||
| 792 | |||
| 793 | /* | ||
| 794 | * If this is a wakeup or a new instance, the scheduling | ||
| 795 | * parameters of the task might need updating. Otherwise, | ||
| 796 | * we want a replenishment of its runtime. | ||
| 797 | */ | ||
| 798 | if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) | ||
| 799 | replenish_dl_entity(dl_se, pi_se); | ||
| 800 | else | ||
| 801 | update_dl_entity(dl_se, pi_se); | ||
| 802 | |||
| 803 | __enqueue_dl_entity(dl_se); | ||
| 804 | } | ||
| 805 | |||
/* Counterpart of enqueue_dl_entity(): just takes @dl_se off its dl_rq. */
static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
{
	__dequeue_dl_entity(dl_se);
}
| 810 | |||
| 811 | static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) | ||
| 812 | { | ||
| 813 | struct task_struct *pi_task = rt_mutex_get_top_task(p); | ||
| 814 | struct sched_dl_entity *pi_se = &p->dl; | ||
| 815 | |||
| 816 | /* | ||
| 817 | * Use the scheduling parameters of the top pi-waiter | ||
| 818 | * task if we have one and its (relative) deadline is | ||
| 819 | * smaller than our one... OTW we keep our runtime and | ||
| 820 | * deadline. | ||
| 821 | */ | ||
| 822 | if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) | ||
| 823 | pi_se = &pi_task->dl; | ||
| 824 | |||
| 825 | /* | ||
| 826 | * If p is throttled, we do nothing. In fact, if it exhausted | ||
| 827 | * its budget it needs a replenishment and, since it now is on | ||
| 828 | * its rq, the bandwidth timer callback (which clearly has not | ||
| 829 | * run yet) will take care of this. | ||
| 830 | */ | ||
| 831 | if (p->dl.dl_throttled) | ||
| 832 | return; | ||
| 833 | |||
| 834 | enqueue_dl_entity(&p->dl, pi_se, flags); | ||
| 835 | |||
| 836 | if (!task_current(rq, p) && p->nr_cpus_allowed > 1) | ||
| 837 | enqueue_pushable_dl_task(rq, p); | ||
| 838 | |||
| 839 | inc_nr_running(rq); | ||
| 840 | } | ||
| 841 | |||
| 842 | static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) | ||
| 843 | { | ||
| 844 | dequeue_dl_entity(&p->dl); | ||
| 845 | dequeue_pushable_dl_task(rq, p); | ||
| 846 | } | ||
| 847 | |||
/*
 * Remove -deadline task @p from @rq: bring the runtime accounting of the
 * current task up to date first, then dequeue @p and drop the rq's
 * running count.
 */
static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
	update_curr_dl(rq);

	__dequeue_task_dl(rq, p, flags);
	dec_nr_running(rq);
}
| 855 | |||
| 856 | /* | ||
| 857 | * Yield task semantic for -deadline tasks is: | ||
| 858 | * | ||
| 859 | * get off from the CPU until our next instance, with | ||
| 860 | * a new runtime. This is of little use now, since we | ||
| 861 | * don't have a bandwidth reclaiming mechanism. Anyway, | ||
| 862 | * bandwidth reclaiming is planned for the future, and | ||
| 863 | * yield_task_dl will indicate that some spare budget | ||
| 864 | * is available for other task instances to use it. | ||
| 865 | */ | ||
| 866 | static void yield_task_dl(struct rq *rq) | ||
| 867 | { | ||
| 868 | struct task_struct *p = rq->curr; | ||
| 869 | |||
| 870 | /* | ||
| 871 | * We make the task go to sleep until its current deadline by | ||
| 872 | * forcing its runtime to zero. This way, update_curr_dl() stops | ||
| 873 | * it and the bandwidth timer will wake it up and will give it | ||
| 874 | * new scheduling parameters (thanks to dl_new=1). | ||
| 875 | */ | ||
| 876 | if (p->dl.runtime > 0) { | ||
| 877 | rq->curr->dl.dl_new = 1; | ||
| 878 | p->dl.runtime = 0; | ||
| 879 | } | ||
| 880 | update_curr_dl(rq); | ||
| 881 | } | ||
| 882 | |||
| 883 | #ifdef CONFIG_SMP | ||
| 884 | |||
| 885 | static int find_later_rq(struct task_struct *task); | ||
| 886 | |||
| 887 | static int | ||
| 888 | select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) | ||
| 889 | { | ||
| 890 | struct task_struct *curr; | ||
| 891 | struct rq *rq; | ||
| 892 | |||
| 893 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) | ||
| 894 | goto out; | ||
| 895 | |||
| 896 | rq = cpu_rq(cpu); | ||
| 897 | |||
| 898 | rcu_read_lock(); | ||
| 899 | curr = ACCESS_ONCE(rq->curr); /* unlocked access */ | ||
| 900 | |||
| 901 | /* | ||
| 902 | * If we are dealing with a -deadline task, we must | ||
| 903 | * decide where to wake it up. | ||
| 904 | * If it has a later deadline and the current task | ||
| 905 | * on this rq can't move (provided the waking task | ||
| 906 | * can!) we prefer to send it somewhere else. On the | ||
| 907 | * other hand, if it has a shorter deadline, we | ||
| 908 | * try to make it stay here, it might be important. | ||
| 909 | */ | ||
| 910 | if (unlikely(dl_task(curr)) && | ||
| 911 | (curr->nr_cpus_allowed < 2 || | ||
| 912 | !dl_entity_preempt(&p->dl, &curr->dl)) && | ||
| 913 | (p->nr_cpus_allowed > 1)) { | ||
| 914 | int target = find_later_rq(p); | ||
| 915 | |||
| 916 | if (target != -1) | ||
| 917 | cpu = target; | ||
| 918 | } | ||
| 919 | rcu_read_unlock(); | ||
| 920 | |||
| 921 | out: | ||
| 922 | return cpu; | ||
| 923 | } | ||
| 924 | |||
| 925 | static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) | ||
| 926 | { | ||
| 927 | /* | ||
| 928 | * Current can't be migrated, useless to reschedule, | ||
| 929 | * let's hope p can move out. | ||
| 930 | */ | ||
| 931 | if (rq->curr->nr_cpus_allowed == 1 || | ||
| 932 | cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) | ||
| 933 | return; | ||
| 934 | |||
| 935 | /* | ||
| 936 | * p is migratable, so let's not schedule it and | ||
| 937 | * see if it is pushed or pulled somewhere else. | ||
| 938 | */ | ||
| 939 | if (p->nr_cpus_allowed != 1 && | ||
| 940 | cpudl_find(&rq->rd->cpudl, p, NULL) != -1) | ||
| 941 | return; | ||
| 942 | |||
| 943 | resched_task(rq->curr); | ||
| 944 | } | ||
| 945 | |||
| 946 | #endif /* CONFIG_SMP */ | ||
| 947 | |||
| 948 | /* | ||
| 949 | * Only called when both the current and waking task are -deadline | ||
| 950 | * tasks. | ||
| 951 | */ | ||
| 952 | static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | ||
| 953 | int flags) | ||
| 954 | { | ||
| 955 | if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { | ||
| 956 | resched_task(rq->curr); | ||
| 957 | return; | ||
| 958 | } | ||
| 959 | |||
| 960 | #ifdef CONFIG_SMP | ||
| 961 | /* | ||
| 962 | * In the unlikely case current and p have the same deadline | ||
| 963 | * let us try to decide what's the best thing to do... | ||
| 964 | */ | ||
| 965 | if ((p->dl.deadline == rq->curr->dl.deadline) && | ||
| 966 | !test_tsk_need_resched(rq->curr)) | ||
| 967 | check_preempt_equal_dl(rq, p); | ||
| 968 | #endif /* CONFIG_SMP */ | ||
| 969 | } | ||
| 970 | |||
| 971 | #ifdef CONFIG_SCHED_HRTICK | ||
| 972 | static void start_hrtick_dl(struct rq *rq, struct task_struct *p) | ||
| 973 | { | ||
| 974 | s64 delta = p->dl.dl_runtime - p->dl.runtime; | ||
| 975 | |||
| 976 | if (delta > 10000) | ||
| 977 | hrtick_start(rq, p->dl.runtime); | ||
| 978 | } | ||
| 979 | #endif | ||
| 980 | |||
| 981 | static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, | ||
| 982 | struct dl_rq *dl_rq) | ||
| 983 | { | ||
| 984 | struct rb_node *left = dl_rq->rb_leftmost; | ||
| 985 | |||
| 986 | if (!left) | ||
| 987 | return NULL; | ||
| 988 | |||
| 989 | return rb_entry(left, struct sched_dl_entity, rb_node); | ||
| 990 | } | ||
| 991 | |||
| 992 | struct task_struct *pick_next_task_dl(struct rq *rq) | ||
| 993 | { | ||
| 994 | struct sched_dl_entity *dl_se; | ||
| 995 | struct task_struct *p; | ||
| 996 | struct dl_rq *dl_rq; | ||
| 997 | |||
| 998 | dl_rq = &rq->dl; | ||
| 999 | |||
| 1000 | if (unlikely(!dl_rq->dl_nr_running)) | ||
| 1001 | return NULL; | ||
| 1002 | |||
| 1003 | dl_se = pick_next_dl_entity(rq, dl_rq); | ||
| 1004 | BUG_ON(!dl_se); | ||
| 1005 | |||
| 1006 | p = dl_task_of(dl_se); | ||
| 1007 | p->se.exec_start = rq_clock_task(rq); | ||
| 1008 | |||
| 1009 | /* Running task will never be pushed. */ | ||
| 1010 | dequeue_pushable_dl_task(rq, p); | ||
| 1011 | |||
| 1012 | #ifdef CONFIG_SCHED_HRTICK | ||
| 1013 | if (hrtick_enabled(rq)) | ||
| 1014 | start_hrtick_dl(rq, p); | ||
| 1015 | #endif | ||
| 1016 | |||
| 1017 | #ifdef CONFIG_SMP | ||
| 1018 | rq->post_schedule = has_pushable_dl_tasks(rq); | ||
| 1019 | #endif /* CONFIG_SMP */ | ||
| 1020 | |||
| 1021 | return p; | ||
| 1022 | } | ||
| 1023 | |||
| 1024 | static void put_prev_task_dl(struct rq *rq, struct task_struct *p) | ||
| 1025 | { | ||
| 1026 | update_curr_dl(rq); | ||
| 1027 | |||
| 1028 | if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) | ||
| 1029 | enqueue_pushable_dl_task(rq, p); | ||
| 1030 | } | ||
| 1031 | |||
/*
 * Scheduler tick for a -deadline task: update the runtime accounting
 * and, for a queued tick with budget left, re-arm the hrtick when
 * hrtick is enabled.
 */
static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
	update_curr_dl(rq);

#ifdef CONFIG_SCHED_HRTICK
	if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
		start_hrtick_dl(rq, p);
#endif
}
| 1041 | |||
/*
 * Intentionally empty: SCHED_DEADLINE tasks cannot fork, which is
 * enforced in sched_fork(), so there is nothing to set up here.
 */
static void task_fork_dl(struct task_struct *p)
{
}
| 1049 | |||
| 1050 | static void task_dead_dl(struct task_struct *p) | ||
| 1051 | { | ||
| 1052 | struct hrtimer *timer = &p->dl.dl_timer; | ||
| 1053 | struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); | ||
| 1054 | |||
| 1055 | /* | ||
| 1056 | * Since we are TASK_DEAD we won't slip out of the domain! | ||
| 1057 | */ | ||
| 1058 | raw_spin_lock_irq(&dl_b->lock); | ||
| 1059 | dl_b->total_bw -= p->dl.dl_bw; | ||
| 1060 | raw_spin_unlock_irq(&dl_b->lock); | ||
| 1061 | |||
| 1062 | hrtimer_cancel(timer); | ||
| 1063 | } | ||
| 1064 | |||
| 1065 | static void set_curr_task_dl(struct rq *rq) | ||
| 1066 | { | ||
| 1067 | struct task_struct *p = rq->curr; | ||
| 1068 | |||
| 1069 | p->se.exec_start = rq_clock_task(rq); | ||
| 1070 | |||
| 1071 | /* You can't push away the running task */ | ||
| 1072 | dequeue_pushable_dl_task(rq, p); | ||
| 1073 | } | ||
| 1074 | |||
| 1075 | #ifdef CONFIG_SMP | ||
| 1076 | |||
| 1077 | /* Only try algorithms three times */ | ||
| 1078 | #define DL_MAX_TRIES 3 | ||
| 1079 | |||
| 1080 | static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) | ||
| 1081 | { | ||
| 1082 | if (!task_running(rq, p) && | ||
| 1083 | (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && | ||
| 1084 | (p->nr_cpus_allowed > 1)) | ||
| 1085 | return 1; | ||
| 1086 | |||
| 1087 | return 0; | ||
| 1088 | } | ||
| 1089 | |||
| 1090 | /* Returns the second earliest -deadline task, NULL otherwise */ | ||
| 1091 | static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu) | ||
| 1092 | { | ||
| 1093 | struct rb_node *next_node = rq->dl.rb_leftmost; | ||
| 1094 | struct sched_dl_entity *dl_se; | ||
| 1095 | struct task_struct *p = NULL; | ||
| 1096 | |||
| 1097 | next_node: | ||
| 1098 | next_node = rb_next(next_node); | ||
| 1099 | if (next_node) { | ||
| 1100 | dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node); | ||
| 1101 | p = dl_task_of(dl_se); | ||
| 1102 | |||
| 1103 | if (pick_dl_task(rq, p, cpu)) | ||
| 1104 | return p; | ||
| 1105 | |||
| 1106 | goto next_node; | ||
| 1107 | } | ||
| 1108 | |||
| 1109 | return NULL; | ||
| 1110 | } | ||
| 1111 | |||
| 1112 | static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); | ||
| 1113 | |||
| 1114 | static int find_later_rq(struct task_struct *task) | ||
| 1115 | { | ||
| 1116 | struct sched_domain *sd; | ||
| 1117 | struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl); | ||
| 1118 | int this_cpu = smp_processor_id(); | ||
| 1119 | int best_cpu, cpu = task_cpu(task); | ||
| 1120 | |||
| 1121 | /* Make sure the mask is initialized first */ | ||
| 1122 | if (unlikely(!later_mask)) | ||
| 1123 | return -1; | ||
| 1124 | |||
| 1125 | if (task->nr_cpus_allowed == 1) | ||
| 1126 | return -1; | ||
| 1127 | |||
| 1128 | best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, | ||
| 1129 | task, later_mask); | ||
| 1130 | if (best_cpu == -1) | ||
| 1131 | return -1; | ||
| 1132 | |||
| 1133 | /* | ||
| 1134 | * If we are here, some target has been found, | ||
| 1135 | * the most suitable of which is cached in best_cpu. | ||
| 1136 | * This is, among the runqueues where the current tasks | ||
| 1137 | * have later deadlines than the task's one, the rq | ||
| 1138 | * with the latest possible one. | ||
| 1139 | * | ||
| 1140 | * Now we check how well this matches with task's | ||
| 1141 | * affinity and system topology. | ||
| 1142 | * | ||
| 1143 | * The last cpu where the task run is our first | ||
| 1144 | * guess, since it is most likely cache-hot there. | ||
| 1145 | */ | ||
| 1146 | if (cpumask_test_cpu(cpu, later_mask)) | ||
| 1147 | return cpu; | ||
| 1148 | /* | ||
| 1149 | * Check if this_cpu is to be skipped (i.e., it is | ||
| 1150 | * not in the mask) or not. | ||
| 1151 | */ | ||
| 1152 | if (!cpumask_test_cpu(this_cpu, later_mask)) | ||
| 1153 | this_cpu = -1; | ||
| 1154 | |||
| 1155 | rcu_read_lock(); | ||
| 1156 | for_each_domain(cpu, sd) { | ||
| 1157 | if (sd->flags & SD_WAKE_AFFINE) { | ||
| 1158 | |||
| 1159 | /* | ||
| 1160 | * If possible, preempting this_cpu is | ||
| 1161 | * cheaper than migrating. | ||
| 1162 | */ | ||
| 1163 | if (this_cpu != -1 && | ||
| 1164 | cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { | ||
| 1165 | rcu_read_unlock(); | ||
| 1166 | return this_cpu; | ||
| 1167 | } | ||
| 1168 | |||
| 1169 | /* | ||
| 1170 | * Last chance: if best_cpu is valid and is | ||
| 1171 | * in the mask, that becomes our choice. | ||
| 1172 | */ | ||
| 1173 | if (best_cpu < nr_cpu_ids && | ||
| 1174 | cpumask_test_cpu(best_cpu, sched_domain_span(sd))) { | ||
| 1175 | rcu_read_unlock(); | ||
| 1176 | return best_cpu; | ||
| 1177 | } | ||
| 1178 | } | ||
| 1179 | } | ||
| 1180 | rcu_read_unlock(); | ||
| 1181 | |||
| 1182 | /* | ||
| 1183 | * At this point, all our guesses failed, we just return | ||
| 1184 | * 'something', and let the caller sort the things out. | ||
| 1185 | */ | ||
| 1186 | if (this_cpu != -1) | ||
| 1187 | return this_cpu; | ||
| 1188 | |||
| 1189 | cpu = cpumask_any(later_mask); | ||
| 1190 | if (cpu < nr_cpu_ids) | ||
| 1191 | return cpu; | ||
| 1192 | |||
| 1193 | return -1; | ||
| 1194 | } | ||
| 1195 | |||
| 1196 | /* Locks the rq it finds */ | ||
| 1197 | static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) | ||
| 1198 | { | ||
| 1199 | struct rq *later_rq = NULL; | ||
| 1200 | int tries; | ||
| 1201 | int cpu; | ||
| 1202 | |||
| 1203 | for (tries = 0; tries < DL_MAX_TRIES; tries++) { | ||
| 1204 | cpu = find_later_rq(task); | ||
| 1205 | |||
| 1206 | if ((cpu == -1) || (cpu == rq->cpu)) | ||
| 1207 | break; | ||
| 1208 | |||
| 1209 | later_rq = cpu_rq(cpu); | ||
| 1210 | |||
| 1211 | /* Retry if something changed. */ | ||
| 1212 | if (double_lock_balance(rq, later_rq)) { | ||
| 1213 | if (unlikely(task_rq(task) != rq || | ||
| 1214 | !cpumask_test_cpu(later_rq->cpu, | ||
| 1215 | &task->cpus_allowed) || | ||
| 1216 | task_running(rq, task) || !task->on_rq)) { | ||
| 1217 | double_unlock_balance(rq, later_rq); | ||
| 1218 | later_rq = NULL; | ||
| 1219 | break; | ||
| 1220 | } | ||
| 1221 | } | ||
| 1222 | |||
| 1223 | /* | ||
| 1224 | * If the rq we found has no -deadline task, or | ||
| 1225 | * its earliest one has a later deadline than our | ||
| 1226 | * task, the rq is a good one. | ||
| 1227 | */ | ||
| 1228 | if (!later_rq->dl.dl_nr_running || | ||
| 1229 | dl_time_before(task->dl.deadline, | ||
| 1230 | later_rq->dl.earliest_dl.curr)) | ||
| 1231 | break; | ||
| 1232 | |||
| 1233 | /* Otherwise we try again. */ | ||
| 1234 | double_unlock_balance(rq, later_rq); | ||
| 1235 | later_rq = NULL; | ||
| 1236 | } | ||
| 1237 | |||
| 1238 | return later_rq; | ||
| 1239 | } | ||
| 1240 | |||
| 1241 | static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) | ||
| 1242 | { | ||
| 1243 | struct task_struct *p; | ||
| 1244 | |||
| 1245 | if (!has_pushable_dl_tasks(rq)) | ||
| 1246 | return NULL; | ||
| 1247 | |||
| 1248 | p = rb_entry(rq->dl.pushable_dl_tasks_leftmost, | ||
| 1249 | struct task_struct, pushable_dl_tasks); | ||
| 1250 | |||
| 1251 | BUG_ON(rq->cpu != task_cpu(p)); | ||
| 1252 | BUG_ON(task_current(rq, p)); | ||
| 1253 | BUG_ON(p->nr_cpus_allowed <= 1); | ||
| 1254 | |||
| 1255 | BUG_ON(!p->on_rq); | ||
| 1256 | BUG_ON(!dl_task(p)); | ||
| 1257 | |||
| 1258 | return p; | ||
| 1259 | } | ||
| 1260 | |||
| 1261 | /* | ||
| 1262 | * See if the non running -deadline tasks on this rq | ||
| 1263 | * can be sent to some other CPU where they can preempt | ||
| 1264 | * and start executing. | ||
| 1265 | */ | ||
| 1266 | static int push_dl_task(struct rq *rq) | ||
| 1267 | { | ||
| 1268 | struct task_struct *next_task; | ||
| 1269 | struct rq *later_rq; | ||
| 1270 | |||
| 1271 | if (!rq->dl.overloaded) | ||
| 1272 | return 0; | ||
| 1273 | |||
| 1274 | next_task = pick_next_pushable_dl_task(rq); | ||
| 1275 | if (!next_task) | ||
| 1276 | return 0; | ||
| 1277 | |||
| 1278 | retry: | ||
| 1279 | if (unlikely(next_task == rq->curr)) { | ||
| 1280 | WARN_ON(1); | ||
| 1281 | return 0; | ||
| 1282 | } | ||
| 1283 | |||
| 1284 | /* | ||
| 1285 | * If next_task preempts rq->curr, and rq->curr | ||
| 1286 | * can move away, it makes sense to just reschedule | ||
| 1287 | * without going further in pushing next_task. | ||
| 1288 | */ | ||
| 1289 | if (dl_task(rq->curr) && | ||
| 1290 | dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && | ||
| 1291 | rq->curr->nr_cpus_allowed > 1) { | ||
| 1292 | resched_task(rq->curr); | ||
| 1293 | return 0; | ||
| 1294 | } | ||
| 1295 | |||
| 1296 | /* We might release rq lock */ | ||
| 1297 | get_task_struct(next_task); | ||
| 1298 | |||
| 1299 | /* Will lock the rq it'll find */ | ||
| 1300 | later_rq = find_lock_later_rq(next_task, rq); | ||
| 1301 | if (!later_rq) { | ||
| 1302 | struct task_struct *task; | ||
| 1303 | |||
| 1304 | /* | ||
| 1305 | * We must check all this again, since | ||
| 1306 | * find_lock_later_rq releases rq->lock and it is | ||
| 1307 | * then possible that next_task has migrated. | ||
| 1308 | */ | ||
| 1309 | task = pick_next_pushable_dl_task(rq); | ||
| 1310 | if (task_cpu(next_task) == rq->cpu && task == next_task) { | ||
| 1311 | /* | ||
| 1312 | * The task is still there. We don't try | ||
| 1313 | * again, some other cpu will pull it when ready. | ||
| 1314 | */ | ||
| 1315 | dequeue_pushable_dl_task(rq, next_task); | ||
| 1316 | goto out; | ||
| 1317 | } | ||
| 1318 | |||
| 1319 | if (!task) | ||
| 1320 | /* No more tasks */ | ||
| 1321 | goto out; | ||
| 1322 | |||
| 1323 | put_task_struct(next_task); | ||
| 1324 | next_task = task; | ||
| 1325 | goto retry; | ||
| 1326 | } | ||
| 1327 | |||
| 1328 | deactivate_task(rq, next_task, 0); | ||
| 1329 | set_task_cpu(next_task, later_rq->cpu); | ||
| 1330 | activate_task(later_rq, next_task, 0); | ||
| 1331 | |||
| 1332 | resched_task(later_rq->curr); | ||
| 1333 | |||
| 1334 | double_unlock_balance(rq, later_rq); | ||
| 1335 | |||
| 1336 | out: | ||
| 1337 | put_task_struct(next_task); | ||
| 1338 | |||
| 1339 | return 1; | ||
| 1340 | } | ||
| 1341 | |||
/* Keep calling push_dl_task() on @rq until it reports nothing to push. */
static void push_dl_tasks(struct rq *rq)
{
	int pushed;

	/* Terminates as it moves a -deadline task */
	do {
		pushed = push_dl_task(rq);
	} while (pushed);
}
| 1348 | |||
| 1349 | static int pull_dl_task(struct rq *this_rq) | ||
| 1350 | { | ||
| 1351 | int this_cpu = this_rq->cpu, ret = 0, cpu; | ||
| 1352 | struct task_struct *p; | ||
| 1353 | struct rq *src_rq; | ||
| 1354 | u64 dmin = LONG_MAX; | ||
| 1355 | |||
| 1356 | if (likely(!dl_overloaded(this_rq))) | ||
| 1357 | return 0; | ||
| 1358 | |||
| 1359 | /* | ||
| 1360 | * Match the barrier from dl_set_overloaded; this guarantees that if we | ||
| 1361 | * see overloaded we must also see the dlo_mask bit. | ||
| 1362 | */ | ||
| 1363 | smp_rmb(); | ||
| 1364 | |||
| 1365 | for_each_cpu(cpu, this_rq->rd->dlo_mask) { | ||
| 1366 | if (this_cpu == cpu) | ||
| 1367 | continue; | ||
| 1368 | |||
| 1369 | src_rq = cpu_rq(cpu); | ||
| 1370 | |||
| 1371 | /* | ||
| 1372 | * It looks racy, abd it is! However, as in sched_rt.c, | ||
| 1373 | * we are fine with this. | ||
| 1374 | */ | ||
| 1375 | if (this_rq->dl.dl_nr_running && | ||
| 1376 | dl_time_before(this_rq->dl.earliest_dl.curr, | ||
| 1377 | src_rq->dl.earliest_dl.next)) | ||
| 1378 | continue; | ||
| 1379 | |||
| 1380 | /* Might drop this_rq->lock */ | ||
| 1381 | double_lock_balance(this_rq, src_rq); | ||
| 1382 | |||
| 1383 | /* | ||
| 1384 | * If there are no more pullable tasks on the | ||
| 1385 | * rq, we're done with it. | ||
| 1386 | */ | ||
| 1387 | if (src_rq->dl.dl_nr_running <= 1) | ||
| 1388 | goto skip; | ||
| 1389 | |||
| 1390 | p = pick_next_earliest_dl_task(src_rq, this_cpu); | ||
| 1391 | |||
| 1392 | /* | ||
| 1393 | * We found a task to be pulled if: | ||
| 1394 | * - it preempts our current (if there's one), | ||
| 1395 | * - it will preempt the last one we pulled (if any). | ||
| 1396 | */ | ||
| 1397 | if (p && dl_time_before(p->dl.deadline, dmin) && | ||
| 1398 | (!this_rq->dl.dl_nr_running || | ||
| 1399 | dl_time_before(p->dl.deadline, | ||
| 1400 | this_rq->dl.earliest_dl.curr))) { | ||
| 1401 | WARN_ON(p == src_rq->curr); | ||
| 1402 | WARN_ON(!p->on_rq); | ||
| 1403 | |||
| 1404 | /* | ||
| 1405 | * Then we pull iff p does not have an earlier | ||
| 1406 | * deadline than the current task of its runqueue. | ||
| 1407 | */ | ||
| 1408 | if (dl_time_before(p->dl.deadline, | ||
| 1409 | src_rq->curr->dl.deadline)) | ||
| 1410 | goto skip; | ||
| 1411 | |||
| 1412 | ret = 1; | ||
| 1413 | |||
| 1414 | deactivate_task(src_rq, p, 0); | ||
| 1415 | set_task_cpu(p, this_cpu); | ||
| 1416 | activate_task(this_rq, p, 0); | ||
| 1417 | dmin = p->dl.deadline; | ||
| 1418 | |||
| 1419 | /* Is there any other task even earlier? */ | ||
| 1420 | } | ||
| 1421 | skip: | ||
| 1422 | double_unlock_balance(this_rq, src_rq); | ||
| 1423 | } | ||
| 1424 | |||
| 1425 | return ret; | ||
| 1426 | } | ||
| 1427 | |||
| 1428 | static void pre_schedule_dl(struct rq *rq, struct task_struct *prev) | ||
| 1429 | { | ||
| 1430 | /* Try to pull other tasks here */ | ||
| 1431 | if (dl_task(prev)) | ||
| 1432 | pull_dl_task(rq); | ||
| 1433 | } | ||
| 1434 | |||
| 1435 | static void post_schedule_dl(struct rq *rq) | ||
| 1436 | { | ||
| 1437 | push_dl_tasks(rq); | ||
| 1438 | } | ||
| 1439 | |||
| 1440 | /* | ||
| 1441 | * Since the task is not running and a reschedule is not going to happen | ||
| 1442 | * anytime soon on its runqueue, we try pushing it away now. | ||
| 1443 | */ | ||
| 1444 | static void task_woken_dl(struct rq *rq, struct task_struct *p) | ||
| 1445 | { | ||
| 1446 | if (!task_running(rq, p) && | ||
| 1447 | !test_tsk_need_resched(rq->curr) && | ||
| 1448 | has_pushable_dl_tasks(rq) && | ||
| 1449 | p->nr_cpus_allowed > 1 && | ||
| 1450 | dl_task(rq->curr) && | ||
| 1451 | (rq->curr->nr_cpus_allowed < 2 || | ||
| 1452 | dl_entity_preempt(&rq->curr->dl, &p->dl))) { | ||
| 1453 | push_dl_tasks(rq); | ||
| 1454 | } | ||
| 1455 | } | ||
| 1456 | |||
| 1457 | static void set_cpus_allowed_dl(struct task_struct *p, | ||
| 1458 | const struct cpumask *new_mask) | ||
| 1459 | { | ||
| 1460 | struct rq *rq; | ||
| 1461 | int weight; | ||
| 1462 | |||
| 1463 | BUG_ON(!dl_task(p)); | ||
| 1464 | |||
| 1465 | /* | ||
| 1466 | * Update only if the task is actually running (i.e., | ||
| 1467 | * it is on the rq AND it is not throttled). | ||
| 1468 | */ | ||
| 1469 | if (!on_dl_rq(&p->dl)) | ||
| 1470 | return; | ||
| 1471 | |||
| 1472 | weight = cpumask_weight(new_mask); | ||
| 1473 | |||
| 1474 | /* | ||
| 1475 | * Only update if the process changes its state from whether it | ||
| 1476 | * can migrate or not. | ||
| 1477 | */ | ||
| 1478 | if ((p->nr_cpus_allowed > 1) == (weight > 1)) | ||
| 1479 | return; | ||
| 1480 | |||
| 1481 | rq = task_rq(p); | ||
| 1482 | |||
| 1483 | /* | ||
| 1484 | * The process used to be able to migrate OR it can now migrate | ||
| 1485 | */ | ||
| 1486 | if (weight <= 1) { | ||
| 1487 | if (!task_current(rq, p)) | ||
| 1488 | dequeue_pushable_dl_task(rq, p); | ||
| 1489 | BUG_ON(!rq->dl.dl_nr_migratory); | ||
| 1490 | rq->dl.dl_nr_migratory--; | ||
| 1491 | } else { | ||
| 1492 | if (!task_current(rq, p)) | ||
| 1493 | enqueue_pushable_dl_task(rq, p); | ||
| 1494 | rq->dl.dl_nr_migratory++; | ||
| 1495 | } | ||
| 1496 | |||
| 1497 | update_dl_migration(&rq->dl); | ||
| 1498 | } | ||
| 1499 | |||
| 1500 | /* Assumes rq->lock is held */ | ||
| 1501 | static void rq_online_dl(struct rq *rq) | ||
| 1502 | { | ||
| 1503 | if (rq->dl.overloaded) | ||
| 1504 | dl_set_overload(rq); | ||
| 1505 | |||
| 1506 | if (rq->dl.dl_nr_running > 0) | ||
| 1507 | cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); | ||
| 1508 | } | ||
| 1509 | |||
| 1510 | /* Assumes rq->lock is held */ | ||
| 1511 | static void rq_offline_dl(struct rq *rq) | ||
| 1512 | { | ||
| 1513 | if (rq->dl.overloaded) | ||
| 1514 | dl_clear_overload(rq); | ||
| 1515 | |||
| 1516 | cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); | ||
| 1517 | } | ||
| 1518 | |||
| 1519 | void init_sched_dl_class(void) | ||
| 1520 | { | ||
| 1521 | unsigned int i; | ||
| 1522 | |||
| 1523 | for_each_possible_cpu(i) | ||
| 1524 | zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i), | ||
| 1525 | GFP_KERNEL, cpu_to_node(i)); | ||
| 1526 | } | ||
| 1527 | |||
| 1528 | #endif /* CONFIG_SMP */ | ||
| 1529 | |||
| 1530 | static void switched_from_dl(struct rq *rq, struct task_struct *p) | ||
| 1531 | { | ||
| 1532 | if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) | ||
| 1533 | hrtimer_try_to_cancel(&p->dl.dl_timer); | ||
| 1534 | |||
| 1535 | #ifdef CONFIG_SMP | ||
| 1536 | /* | ||
| 1537 | * Since this might be the only -deadline task on the rq, | ||
| 1538 | * this is the right place to try to pull some other one | ||
| 1539 | * from an overloaded cpu, if any. | ||
| 1540 | */ | ||
| 1541 | if (!rq->dl.dl_nr_running) | ||
| 1542 | pull_dl_task(rq); | ||
| 1543 | #endif | ||
| 1544 | } | ||
| 1545 | |||
| 1546 | /* | ||
| 1547 | * When switching to -deadline, we may overload the rq, then | ||
| 1548 | * we try to push someone off, if possible. | ||
| 1549 | */ | ||
| 1550 | static void switched_to_dl(struct rq *rq, struct task_struct *p) | ||
| 1551 | { | ||
| 1552 | int check_resched = 1; | ||
| 1553 | |||
| 1554 | /* | ||
| 1555 | * If p is throttled, don't consider the possibility | ||
| 1556 | * of preempting rq->curr, the check will be done right | ||
| 1557 | * after its runtime will get replenished. | ||
| 1558 | */ | ||
| 1559 | if (unlikely(p->dl.dl_throttled)) | ||
| 1560 | return; | ||
| 1561 | |||
| 1562 | if (p->on_rq || rq->curr != p) { | ||
| 1563 | #ifdef CONFIG_SMP | ||
| 1564 | if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) | ||
| 1565 | /* Only reschedule if pushing failed */ | ||
| 1566 | check_resched = 0; | ||
| 1567 | #endif /* CONFIG_SMP */ | ||
| 1568 | if (check_resched && task_has_dl_policy(rq->curr)) | ||
| 1569 | check_preempt_curr_dl(rq, p, 0); | ||
| 1570 | } | ||
| 1571 | } | ||
| 1572 | |||
| 1573 | /* | ||
| 1574 | * If the scheduling parameters of a -deadline task changed, | ||
| 1575 | * a push or pull operation might be needed. | ||
| 1576 | */ | ||
| 1577 | static void prio_changed_dl(struct rq *rq, struct task_struct *p, | ||
| 1578 | int oldprio) | ||
| 1579 | { | ||
| 1580 | if (p->on_rq || rq->curr == p) { | ||
| 1581 | #ifdef CONFIG_SMP | ||
| 1582 | /* | ||
| 1583 | * This might be too much, but unfortunately | ||
| 1584 | * we don't have the old deadline value, and | ||
| 1585 | * we can't argue if the task is increasing | ||
| 1586 | * or lowering its prio, so... | ||
| 1587 | */ | ||
| 1588 | if (!rq->dl.overloaded) | ||
| 1589 | pull_dl_task(rq); | ||
| 1590 | |||
| 1591 | /* | ||
| 1592 | * If we now have an earlier deadline task than p, | ||
| 1593 | * then reschedule, provided p is still on this | ||
| 1594 | * runqueue. | ||
| 1595 | */ | ||
| 1596 | if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && | ||
| 1597 | rq->curr == p) | ||
| 1598 | resched_task(p); | ||
| 1599 | #else | ||
| 1600 | /* | ||
| 1601 | * Again, we don't know if p has an earlier | ||
| 1602 | * or later deadline, so let's blindly set a | ||
| 1603 | * (maybe not needed) rescheduling point. | ||
| 1604 | */ | ||
| 1605 | resched_task(p); | ||
| 1606 | #endif /* CONFIG_SMP */ | ||
| 1607 | } else | ||
| 1608 | switched_to_dl(rq, p); | ||
| 1609 | } | ||
| 1610 | |||
| 1611 | const struct sched_class dl_sched_class = { | ||
| 1612 | .next = &rt_sched_class, | ||
| 1613 | .enqueue_task = enqueue_task_dl, | ||
| 1614 | .dequeue_task = dequeue_task_dl, | ||
| 1615 | .yield_task = yield_task_dl, | ||
| 1616 | |||
| 1617 | .check_preempt_curr = check_preempt_curr_dl, | ||
| 1618 | |||
| 1619 | .pick_next_task = pick_next_task_dl, | ||
| 1620 | .put_prev_task = put_prev_task_dl, | ||
| 1621 | |||
| 1622 | #ifdef CONFIG_SMP | ||
| 1623 | .select_task_rq = select_task_rq_dl, | ||
| 1624 | .set_cpus_allowed = set_cpus_allowed_dl, | ||
| 1625 | .rq_online = rq_online_dl, | ||
| 1626 | .rq_offline = rq_offline_dl, | ||
| 1627 | .pre_schedule = pre_schedule_dl, | ||
| 1628 | .post_schedule = post_schedule_dl, | ||
| 1629 | .task_woken = task_woken_dl, | ||
| 1630 | #endif | ||
| 1631 | |||
| 1632 | .set_curr_task = set_curr_task_dl, | ||
| 1633 | .task_tick = task_tick_dl, | ||
| 1634 | .task_fork = task_fork_dl, | ||
| 1635 | .task_dead = task_dead_dl, | ||
| 1636 | |||
| 1637 | .prio_changed = prio_changed_dl, | ||
| 1638 | .switched_from = switched_from_dl, | ||
| 1639 | .switched_to = switched_to_dl, | ||
| 1640 | }; | ||
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 5c34d1817e8f..dd52e7ffb10e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
| @@ -139,7 +139,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
| 139 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); | 139 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); |
| 140 | #endif | 140 | #endif |
| 141 | #ifdef CONFIG_NUMA_BALANCING | 141 | #ifdef CONFIG_NUMA_BALANCING |
| 142 | SEQ_printf(m, " %d", cpu_to_node(task_cpu(p))); | 142 | SEQ_printf(m, " %d", task_node(p)); |
| 143 | #endif | 143 | #endif |
| 144 | #ifdef CONFIG_CGROUP_SCHED | 144 | #ifdef CONFIG_CGROUP_SCHED |
| 145 | SEQ_printf(m, " %s", task_group_path(task_group(p))); | 145 | SEQ_printf(m, " %s", task_group_path(task_group(p))); |
| @@ -371,7 +371,7 @@ static void sched_debug_header(struct seq_file *m) | |||
| 371 | PN(cpu_clk); | 371 | PN(cpu_clk); |
| 372 | P(jiffies); | 372 | P(jiffies); |
| 373 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 373 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
| 374 | P(sched_clock_stable); | 374 | P(sched_clock_stable()); |
| 375 | #endif | 375 | #endif |
| 376 | #undef PN | 376 | #undef PN |
| 377 | #undef P | 377 | #undef P |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e64b0794060e..b24b6cfde9aa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -872,15 +872,6 @@ static unsigned int task_scan_max(struct task_struct *p) | |||
| 872 | return max(smin, smax); | 872 | return max(smin, smax); |
| 873 | } | 873 | } |
| 874 | 874 | ||
| 875 | /* | ||
| 876 | * Once a preferred node is selected the scheduler balancer will prefer moving | ||
| 877 | * a task to that node for sysctl_numa_balancing_settle_count number of PTE | ||
| 878 | * scans. This will give the process the chance to accumulate more faults on | ||
| 879 | * the preferred node but still allow the scheduler to move the task again if | ||
| 880 | * the nodes CPUs are overloaded. | ||
| 881 | */ | ||
| 882 | unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; | ||
| 883 | |||
| 884 | static void account_numa_enqueue(struct rq *rq, struct task_struct *p) | 875 | static void account_numa_enqueue(struct rq *rq, struct task_struct *p) |
| 885 | { | 876 | { |
| 886 | rq->nr_numa_running += (p->numa_preferred_nid != -1); | 877 | rq->nr_numa_running += (p->numa_preferred_nid != -1); |
| @@ -930,7 +921,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) | |||
| 930 | if (!p->numa_group) | 921 | if (!p->numa_group) |
| 931 | return 0; | 922 | return 0; |
| 932 | 923 | ||
| 933 | return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; | 924 | return p->numa_group->faults[task_faults_idx(nid, 0)] + |
| 925 | p->numa_group->faults[task_faults_idx(nid, 1)]; | ||
| 934 | } | 926 | } |
| 935 | 927 | ||
| 936 | /* | 928 | /* |
| @@ -1023,7 +1015,7 @@ struct task_numa_env { | |||
| 1023 | 1015 | ||
| 1024 | struct numa_stats src_stats, dst_stats; | 1016 | struct numa_stats src_stats, dst_stats; |
| 1025 | 1017 | ||
| 1026 | int imbalance_pct, idx; | 1018 | int imbalance_pct; |
| 1027 | 1019 | ||
| 1028 | struct task_struct *best_task; | 1020 | struct task_struct *best_task; |
| 1029 | long best_imp; | 1021 | long best_imp; |
| @@ -1211,7 +1203,7 @@ static int task_numa_migrate(struct task_struct *p) | |||
| 1211 | * elsewhere, so there is no point in (re)trying. | 1203 | * elsewhere, so there is no point in (re)trying. |
| 1212 | */ | 1204 | */ |
| 1213 | if (unlikely(!sd)) { | 1205 | if (unlikely(!sd)) { |
| 1214 | p->numa_preferred_nid = cpu_to_node(task_cpu(p)); | 1206 | p->numa_preferred_nid = task_node(p); |
| 1215 | return -EINVAL; | 1207 | return -EINVAL; |
| 1216 | } | 1208 | } |
| 1217 | 1209 | ||
| @@ -1278,7 +1270,7 @@ static void numa_migrate_preferred(struct task_struct *p) | |||
| 1278 | p->numa_migrate_retry = jiffies + HZ; | 1270 | p->numa_migrate_retry = jiffies + HZ; |
| 1279 | 1271 | ||
| 1280 | /* Success if task is already running on preferred CPU */ | 1272 | /* Success if task is already running on preferred CPU */ |
| 1281 | if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) | 1273 | if (task_node(p) == p->numa_preferred_nid) |
| 1282 | return; | 1274 | return; |
| 1283 | 1275 | ||
| 1284 | /* Otherwise, try migrate to a CPU on the preferred node */ | 1276 | /* Otherwise, try migrate to a CPU on the preferred node */ |
| @@ -1350,7 +1342,6 @@ static void update_task_scan_period(struct task_struct *p, | |||
| 1350 | * scanning faster if shared accesses dominate as it may | 1342 | * scanning faster if shared accesses dominate as it may |
| 1351 | * simply bounce migrations uselessly | 1343 | * simply bounce migrations uselessly |
| 1352 | */ | 1344 | */ |
| 1353 | period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS); | ||
| 1354 | ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); | 1345 | ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); |
| 1355 | diff = (diff * ratio) / NUMA_PERIOD_SLOTS; | 1346 | diff = (diff * ratio) / NUMA_PERIOD_SLOTS; |
| 1356 | } | 1347 | } |
| @@ -4101,12 +4092,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
| 4101 | */ | 4092 | */ |
| 4102 | static struct sched_group * | 4093 | static struct sched_group * |
| 4103 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, | 4094 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, |
| 4104 | int this_cpu, int load_idx) | 4095 | int this_cpu, int sd_flag) |
| 4105 | { | 4096 | { |
| 4106 | struct sched_group *idlest = NULL, *group = sd->groups; | 4097 | struct sched_group *idlest = NULL, *group = sd->groups; |
| 4107 | unsigned long min_load = ULONG_MAX, this_load = 0; | 4098 | unsigned long min_load = ULONG_MAX, this_load = 0; |
| 4099 | int load_idx = sd->forkexec_idx; | ||
| 4108 | int imbalance = 100 + (sd->imbalance_pct-100)/2; | 4100 | int imbalance = 100 + (sd->imbalance_pct-100)/2; |
| 4109 | 4101 | ||
| 4102 | if (sd_flag & SD_BALANCE_WAKE) | ||
| 4103 | load_idx = sd->wake_idx; | ||
| 4104 | |||
| 4110 | do { | 4105 | do { |
| 4111 | unsigned long load, avg_load; | 4106 | unsigned long load, avg_load; |
| 4112 | int local_group; | 4107 | int local_group; |
| @@ -4274,7 +4269,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
| 4274 | } | 4269 | } |
| 4275 | 4270 | ||
| 4276 | while (sd) { | 4271 | while (sd) { |
| 4277 | int load_idx = sd->forkexec_idx; | ||
| 4278 | struct sched_group *group; | 4272 | struct sched_group *group; |
| 4279 | int weight; | 4273 | int weight; |
| 4280 | 4274 | ||
| @@ -4283,10 +4277,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
| 4283 | continue; | 4277 | continue; |
| 4284 | } | 4278 | } |
| 4285 | 4279 | ||
| 4286 | if (sd_flag & SD_BALANCE_WAKE) | 4280 | group = find_idlest_group(sd, p, cpu, sd_flag); |
| 4287 | load_idx = sd->wake_idx; | ||
| 4288 | |||
| 4289 | group = find_idlest_group(sd, p, cpu, load_idx); | ||
| 4290 | if (!group) { | 4281 | if (!group) { |
| 4291 | sd = sd->child; | 4282 | sd = sd->child; |
| 4292 | continue; | 4283 | continue; |
| @@ -5512,7 +5503,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 5512 | struct sched_group *group, int load_idx, | 5503 | struct sched_group *group, int load_idx, |
| 5513 | int local_group, struct sg_lb_stats *sgs) | 5504 | int local_group, struct sg_lb_stats *sgs) |
| 5514 | { | 5505 | { |
| 5515 | unsigned long nr_running; | ||
| 5516 | unsigned long load; | 5506 | unsigned long load; |
| 5517 | int i; | 5507 | int i; |
| 5518 | 5508 | ||
| @@ -5521,8 +5511,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 5521 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 5511 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
| 5522 | struct rq *rq = cpu_rq(i); | 5512 | struct rq *rq = cpu_rq(i); |
| 5523 | 5513 | ||
| 5524 | nr_running = rq->nr_running; | ||
| 5525 | |||
| 5526 | /* Bias balancing toward cpus of our domain */ | 5514 | /* Bias balancing toward cpus of our domain */ |
| 5527 | if (local_group) | 5515 | if (local_group) |
| 5528 | load = target_load(i, load_idx); | 5516 | load = target_load(i, load_idx); |
| @@ -5530,7 +5518,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 5530 | load = source_load(i, load_idx); | 5518 | load = source_load(i, load_idx); |
| 5531 | 5519 | ||
| 5532 | sgs->group_load += load; | 5520 | sgs->group_load += load; |
| 5533 | sgs->sum_nr_running += nr_running; | 5521 | sgs->sum_nr_running += rq->nr_running; |
| 5534 | #ifdef CONFIG_NUMA_BALANCING | 5522 | #ifdef CONFIG_NUMA_BALANCING |
| 5535 | sgs->nr_numa_running += rq->nr_numa_running; | 5523 | sgs->nr_numa_running += rq->nr_numa_running; |
| 5536 | sgs->nr_preferred_running += rq->nr_preferred_running; | 5524 | sgs->nr_preferred_running += rq->nr_preferred_running; |
| @@ -6521,7 +6509,7 @@ static struct { | |||
| 6521 | unsigned long next_balance; /* in jiffy units */ | 6509 | unsigned long next_balance; /* in jiffy units */ |
| 6522 | } nohz ____cacheline_aligned; | 6510 | } nohz ____cacheline_aligned; |
| 6523 | 6511 | ||
| 6524 | static inline int find_new_ilb(int call_cpu) | 6512 | static inline int find_new_ilb(void) |
| 6525 | { | 6513 | { |
| 6526 | int ilb = cpumask_first(nohz.idle_cpus_mask); | 6514 | int ilb = cpumask_first(nohz.idle_cpus_mask); |
| 6527 | 6515 | ||
| @@ -6536,13 +6524,13 @@ static inline int find_new_ilb(int call_cpu) | |||
| 6536 | * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle | 6524 | * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle |
| 6537 | * CPU (if there is one). | 6525 | * CPU (if there is one). |
| 6538 | */ | 6526 | */ |
| 6539 | static void nohz_balancer_kick(int cpu) | 6527 | static void nohz_balancer_kick(void) |
| 6540 | { | 6528 | { |
| 6541 | int ilb_cpu; | 6529 | int ilb_cpu; |
| 6542 | 6530 | ||
| 6543 | nohz.next_balance++; | 6531 | nohz.next_balance++; |
| 6544 | 6532 | ||
| 6545 | ilb_cpu = find_new_ilb(cpu); | 6533 | ilb_cpu = find_new_ilb(); |
| 6546 | 6534 | ||
| 6547 | if (ilb_cpu >= nr_cpu_ids) | 6535 | if (ilb_cpu >= nr_cpu_ids) |
| 6548 | return; | 6536 | return; |
| @@ -6652,10 +6640,10 @@ void update_max_interval(void) | |||
| 6652 | * | 6640 | * |
| 6653 | * Balancing parameters are set up in init_sched_domains. | 6641 | * Balancing parameters are set up in init_sched_domains. |
| 6654 | */ | 6642 | */ |
| 6655 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | 6643 | static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) |
| 6656 | { | 6644 | { |
| 6657 | int continue_balancing = 1; | 6645 | int continue_balancing = 1; |
| 6658 | struct rq *rq = cpu_rq(cpu); | 6646 | int cpu = rq->cpu; |
| 6659 | unsigned long interval; | 6647 | unsigned long interval; |
| 6660 | struct sched_domain *sd; | 6648 | struct sched_domain *sd; |
| 6661 | /* Earliest time when we have to do rebalance again */ | 6649 | /* Earliest time when we have to do rebalance again */ |
| @@ -6752,9 +6740,9 @@ out: | |||
| 6752 | * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the | 6740 | * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the |
| 6753 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | 6741 | * rebalancing for all the cpus for whom scheduler ticks are stopped. |
| 6754 | */ | 6742 | */ |
| 6755 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | 6743 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) |
| 6756 | { | 6744 | { |
| 6757 | struct rq *this_rq = cpu_rq(this_cpu); | 6745 | int this_cpu = this_rq->cpu; |
| 6758 | struct rq *rq; | 6746 | struct rq *rq; |
| 6759 | int balance_cpu; | 6747 | int balance_cpu; |
| 6760 | 6748 | ||
| @@ -6781,7 +6769,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
| 6781 | update_idle_cpu_load(rq); | 6769 | update_idle_cpu_load(rq); |
| 6782 | raw_spin_unlock_irq(&rq->lock); | 6770 | raw_spin_unlock_irq(&rq->lock); |
| 6783 | 6771 | ||
| 6784 | rebalance_domains(balance_cpu, CPU_IDLE); | 6772 | rebalance_domains(rq, CPU_IDLE); |
| 6785 | 6773 | ||
| 6786 | if (time_after(this_rq->next_balance, rq->next_balance)) | 6774 | if (time_after(this_rq->next_balance, rq->next_balance)) |
| 6787 | this_rq->next_balance = rq->next_balance; | 6775 | this_rq->next_balance = rq->next_balance; |
| @@ -6800,14 +6788,14 @@ end: | |||
| 6800 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler | 6788 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler |
| 6801 | * domain span are idle. | 6789 | * domain span are idle. |
| 6802 | */ | 6790 | */ |
| 6803 | static inline int nohz_kick_needed(struct rq *rq, int cpu) | 6791 | static inline int nohz_kick_needed(struct rq *rq) |
| 6804 | { | 6792 | { |
| 6805 | unsigned long now = jiffies; | 6793 | unsigned long now = jiffies; |
| 6806 | struct sched_domain *sd; | 6794 | struct sched_domain *sd; |
| 6807 | struct sched_group_power *sgp; | 6795 | struct sched_group_power *sgp; |
| 6808 | int nr_busy; | 6796 | int nr_busy, cpu = rq->cpu; |
| 6809 | 6797 | ||
| 6810 | if (unlikely(idle_cpu(cpu))) | 6798 | if (unlikely(rq->idle_balance)) |
| 6811 | return 0; | 6799 | return 0; |
| 6812 | 6800 | ||
| 6813 | /* | 6801 | /* |
| @@ -6856,7 +6844,7 @@ need_kick: | |||
| 6856 | return 1; | 6844 | return 1; |
| 6857 | } | 6845 | } |
| 6858 | #else | 6846 | #else |
| 6859 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } | 6847 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } |
| 6860 | #endif | 6848 | #endif |
| 6861 | 6849 | ||
| 6862 | /* | 6850 | /* |
| @@ -6865,38 +6853,39 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } | |||
| 6865 | */ | 6853 | */ |
| 6866 | static void run_rebalance_domains(struct softirq_action *h) | 6854 | static void run_rebalance_domains(struct softirq_action *h) |
| 6867 | { | 6855 | { |
| 6868 | int this_cpu = smp_processor_id(); | 6856 | struct rq *this_rq = this_rq(); |
| 6869 | struct rq *this_rq = cpu_rq(this_cpu); | ||
| 6870 | enum cpu_idle_type idle = this_rq->idle_balance ? | 6857 | enum cpu_idle_type idle = this_rq->idle_balance ? |
| 6871 | CPU_IDLE : CPU_NOT_IDLE; | 6858 | CPU_IDLE : CPU_NOT_IDLE; |
| 6872 | 6859 | ||
| 6873 | rebalance_domains(this_cpu, idle); | 6860 | rebalance_domains(this_rq, idle); |
| 6874 | 6861 | ||
| 6875 | /* | 6862 | /* |
| 6876 | * If this cpu has a pending nohz_balance_kick, then do the | 6863 | * If this cpu has a pending nohz_balance_kick, then do the |
| 6877 | * balancing on behalf of the other idle cpus whose ticks are | 6864 | * balancing on behalf of the other idle cpus whose ticks are |
| 6878 | * stopped. | 6865 | * stopped. |
| 6879 | */ | 6866 | */ |
| 6880 | nohz_idle_balance(this_cpu, idle); | 6867 | nohz_idle_balance(this_rq, idle); |
| 6881 | } | 6868 | } |
| 6882 | 6869 | ||
| 6883 | static inline int on_null_domain(int cpu) | 6870 | static inline int on_null_domain(struct rq *rq) |
| 6884 | { | 6871 | { |
| 6885 | return !rcu_dereference_sched(cpu_rq(cpu)->sd); | 6872 | return !rcu_dereference_sched(rq->sd); |
| 6886 | } | 6873 | } |
| 6887 | 6874 | ||
| 6888 | /* | 6875 | /* |
| 6889 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | 6876 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. |
| 6890 | */ | 6877 | */ |
| 6891 | void trigger_load_balance(struct rq *rq, int cpu) | 6878 | void trigger_load_balance(struct rq *rq) |
| 6892 | { | 6879 | { |
| 6893 | /* Don't need to rebalance while attached to NULL domain */ | 6880 | /* Don't need to rebalance while attached to NULL domain */ |
| 6894 | if (time_after_eq(jiffies, rq->next_balance) && | 6881 | if (unlikely(on_null_domain(rq))) |
| 6895 | likely(!on_null_domain(cpu))) | 6882 | return; |
| 6883 | |||
| 6884 | if (time_after_eq(jiffies, rq->next_balance)) | ||
| 6896 | raise_softirq(SCHED_SOFTIRQ); | 6885 | raise_softirq(SCHED_SOFTIRQ); |
| 6897 | #ifdef CONFIG_NO_HZ_COMMON | 6886 | #ifdef CONFIG_NO_HZ_COMMON |
| 6898 | if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) | 6887 | if (nohz_kick_needed(rq)) |
| 6899 | nohz_balancer_kick(cpu); | 6888 | nohz_balancer_kick(); |
| 6900 | #endif | 6889 | #endif |
| 6901 | } | 6890 | } |
| 6902 | 6891 | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 1c4065575fa2..a2740b775b45 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -1738,7 +1738,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
| 1738 | !test_tsk_need_resched(rq->curr) && | 1738 | !test_tsk_need_resched(rq->curr) && |
| 1739 | has_pushable_tasks(rq) && | 1739 | has_pushable_tasks(rq) && |
| 1740 | p->nr_cpus_allowed > 1 && | 1740 | p->nr_cpus_allowed > 1 && |
| 1741 | rt_task(rq->curr) && | 1741 | (dl_task(rq->curr) || rt_task(rq->curr)) && |
| 1742 | (rq->curr->nr_cpus_allowed < 2 || | 1742 | (rq->curr->nr_cpus_allowed < 2 || |
| 1743 | rq->curr->prio <= p->prio)) | 1743 | rq->curr->prio <= p->prio)) |
| 1744 | push_rt_tasks(rq); | 1744 | push_rt_tasks(rq); |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 88c85b21d633..c2119fd20f8b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -2,6 +2,7 @@ | |||
| 2 | #include <linux/sched.h> | 2 | #include <linux/sched.h> |
| 3 | #include <linux/sched/sysctl.h> | 3 | #include <linux/sched/sysctl.h> |
| 4 | #include <linux/sched/rt.h> | 4 | #include <linux/sched/rt.h> |
| 5 | #include <linux/sched/deadline.h> | ||
| 5 | #include <linux/mutex.h> | 6 | #include <linux/mutex.h> |
| 6 | #include <linux/spinlock.h> | 7 | #include <linux/spinlock.h> |
| 7 | #include <linux/stop_machine.h> | 8 | #include <linux/stop_machine.h> |
| @@ -9,6 +10,7 @@ | |||
| 9 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
| 10 | 11 | ||
| 11 | #include "cpupri.h" | 12 | #include "cpupri.h" |
| 13 | #include "cpudeadline.h" | ||
| 12 | #include "cpuacct.h" | 14 | #include "cpuacct.h" |
| 13 | 15 | ||
| 14 | struct rq; | 16 | struct rq; |
| @@ -73,6 +75,13 @@ extern void update_cpu_load_active(struct rq *this_rq); | |||
| 73 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | 75 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT |
| 74 | 76 | ||
| 75 | /* | 77 | /* |
| 78 | * Single value that decides SCHED_DEADLINE internal math precision. | ||
| 79 | * 10 -> just above 1us | ||
| 80 | * 9 -> just above 0.5us | ||
| 81 | */ | ||
| 82 | #define DL_SCALE (10) | ||
| 83 | |||
| 84 | /* | ||
| 76 | * These are the 'tuning knobs' of the scheduler: | 85 | * These are the 'tuning knobs' of the scheduler: |
| 77 | */ | 86 | */ |
| 78 | 87 | ||
| @@ -81,11 +90,19 @@ extern void update_cpu_load_active(struct rq *this_rq); | |||
| 81 | */ | 90 | */ |
| 82 | #define RUNTIME_INF ((u64)~0ULL) | 91 | #define RUNTIME_INF ((u64)~0ULL) |
| 83 | 92 | ||
| 93 | static inline int fair_policy(int policy) | ||
| 94 | { | ||
| 95 | return policy == SCHED_NORMAL || policy == SCHED_BATCH; | ||
| 96 | } | ||
| 97 | |||
| 84 | static inline int rt_policy(int policy) | 98 | static inline int rt_policy(int policy) |
| 85 | { | 99 | { |
| 86 | if (policy == SCHED_FIFO || policy == SCHED_RR) | 100 | return policy == SCHED_FIFO || policy == SCHED_RR; |
| 87 | return 1; | 101 | } |
| 88 | return 0; | 102 | |
| 103 | static inline int dl_policy(int policy) | ||
| 104 | { | ||
| 105 | return policy == SCHED_DEADLINE; | ||
| 89 | } | 106 | } |
| 90 | 107 | ||
| 91 | static inline int task_has_rt_policy(struct task_struct *p) | 108 | static inline int task_has_rt_policy(struct task_struct *p) |
| @@ -93,6 +110,25 @@ static inline int task_has_rt_policy(struct task_struct *p) | |||
| 93 | return rt_policy(p->policy); | 110 | return rt_policy(p->policy); |
| 94 | } | 111 | } |
| 95 | 112 | ||
| 113 | static inline int task_has_dl_policy(struct task_struct *p) | ||
| 114 | { | ||
| 115 | return dl_policy(p->policy); | ||
| 116 | } | ||
| 117 | |||
| 118 | static inline bool dl_time_before(u64 a, u64 b) | ||
| 119 | { | ||
| 120 | return (s64)(a - b) < 0; | ||
| 121 | } | ||
| 122 | |||
| 123 | /* | ||
| 124 | * Tells if entity @a should preempt entity @b. | ||
| 125 | */ | ||
| 126 | static inline bool | ||
| 127 | dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) | ||
| 128 | { | ||
| 129 | return dl_time_before(a->deadline, b->deadline); | ||
| 130 | } | ||
| 131 | |||
| 96 | /* | 132 | /* |
| 97 | * This is the priority-queue data structure of the RT scheduling class: | 133 | * This is the priority-queue data structure of the RT scheduling class: |
| 98 | */ | 134 | */ |
| @@ -108,6 +144,47 @@ struct rt_bandwidth { | |||
| 108 | u64 rt_runtime; | 144 | u64 rt_runtime; |
| 109 | struct hrtimer rt_period_timer; | 145 | struct hrtimer rt_period_timer; |
| 110 | }; | 146 | }; |
| 147 | /* | ||
| 148 | * To keep the bandwidth of -deadline tasks and groups under control | ||
| 149 | * we need some place where: | ||
| 150 | * - store the maximum -deadline bandwidth of the system (the group); | ||
| 151 | * - cache the fraction of that bandwidth that is currently allocated. | ||
| 152 | * | ||
| 153 | * This is all done in the data structure below. It is similar to the | ||
| 154 | * one used for RT-throttling (rt_bandwidth), with the main difference | ||
| 155 | * that, since here we are only interested in admission control, we | ||
| 156 | * do not decrease any runtime while the group "executes", neither we | ||
| 157 | * need a timer to replenish it. | ||
| 158 | * | ||
| 159 | * With respect to SMP, the bandwidth is given on a per-CPU basis, | ||
| 160 | * meaning that: | ||
| 161 | * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; | ||
| 162 | * - dl_total_bw array contains, in the i-th element, the currently | ||
| 163 | * allocated bandwidth on the i-th CPU. | ||
| 164 | * Moreover, groups consume bandwidth on each CPU, while tasks only | ||
| 165 | * consume bandwidth on the CPU they're running on. | ||
| 166 | * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw | ||
| 167 | * that will be shown the next time the proc or cgroup controls will | ||
| 168 | * be read. It in turn can be changed by writing to its own | ||
| 169 | * control. | ||
| 170 | */ | ||
| 171 | struct dl_bandwidth { | ||
| 172 | raw_spinlock_t dl_runtime_lock; | ||
| 173 | u64 dl_runtime; | ||
| 174 | u64 dl_period; | ||
| 175 | }; | ||
| 176 | |||
| 177 | static inline int dl_bandwidth_enabled(void) | ||
| 178 | { | ||
| 179 | return sysctl_sched_rt_runtime >= 0; | ||
| 180 | } | ||
| 181 | |||
| 182 | extern struct dl_bw *dl_bw_of(int i); | ||
| 183 | |||
| 184 | struct dl_bw { | ||
| 185 | raw_spinlock_t lock; | ||
| 186 | u64 bw, total_bw; | ||
| 187 | }; | ||
| 111 | 188 | ||
| 112 | extern struct mutex sched_domains_mutex; | 189 | extern struct mutex sched_domains_mutex; |
| 113 | 190 | ||
| @@ -364,6 +441,42 @@ struct rt_rq { | |||
| 364 | #endif | 441 | #endif |
| 365 | }; | 442 | }; |
| 366 | 443 | ||
| 444 | /* Deadline class' related fields in a runqueue */ | ||
| 445 | struct dl_rq { | ||
| 446 | /* runqueue is an rbtree, ordered by deadline */ | ||
| 447 | struct rb_root rb_root; | ||
| 448 | struct rb_node *rb_leftmost; | ||
| 449 | |||
| 450 | unsigned long dl_nr_running; | ||
| 451 | |||
| 452 | #ifdef CONFIG_SMP | ||
| 453 | /* | ||
| 454 | * Deadline values of the currently executing and the | ||
| 455 | * earliest ready task on this rq. Caching these facilitates | ||
| 456 | * the decision whether or not a ready but not running task | ||
| 457 | * should migrate somewhere else. | ||
| 458 | */ | ||
| 459 | struct { | ||
| 460 | u64 curr; | ||
| 461 | u64 next; | ||
| 462 | } earliest_dl; | ||
| 463 | |||
| 464 | unsigned long dl_nr_migratory; | ||
| 465 | unsigned long dl_nr_total; | ||
| 466 | int overloaded; | ||
| 467 | |||
| 468 | /* | ||
| 469 | * Tasks on this rq that can be pushed away. They are kept in | ||
| 470 | * an rb-tree, ordered by tasks' deadlines, with caching | ||
| 471 | * of the leftmost (earliest deadline) element. | ||
| 472 | */ | ||
| 473 | struct rb_root pushable_dl_tasks_root; | ||
| 474 | struct rb_node *pushable_dl_tasks_leftmost; | ||
| 475 | #else | ||
| 476 | struct dl_bw dl_bw; | ||
| 477 | #endif | ||
| 478 | }; | ||
| 479 | |||
| 367 | #ifdef CONFIG_SMP | 480 | #ifdef CONFIG_SMP |
| 368 | 481 | ||
| 369 | /* | 482 | /* |
| @@ -382,6 +495,15 @@ struct root_domain { | |||
| 382 | cpumask_var_t online; | 495 | cpumask_var_t online; |
| 383 | 496 | ||
| 384 | /* | 497 | /* |
| 498 | * The bit corresponding to a CPU gets set here if such CPU has more | ||
| 499 | * than one runnable -deadline task (as it is below for RT tasks). | ||
| 500 | */ | ||
| 501 | cpumask_var_t dlo_mask; | ||
| 502 | atomic_t dlo_count; | ||
| 503 | struct dl_bw dl_bw; | ||
| 504 | struct cpudl cpudl; | ||
| 505 | |||
| 506 | /* | ||
| 385 | * The "RT overload" flag: it gets set if a CPU has more than | 507 | * The "RT overload" flag: it gets set if a CPU has more than |
| 386 | * one runnable RT task. | 508 | * one runnable RT task. |
| 387 | */ | 509 | */ |
| @@ -432,6 +554,7 @@ struct rq { | |||
| 432 | 554 | ||
| 433 | struct cfs_rq cfs; | 555 | struct cfs_rq cfs; |
| 434 | struct rt_rq rt; | 556 | struct rt_rq rt; |
| 557 | struct dl_rq dl; | ||
| 435 | 558 | ||
| 436 | #ifdef CONFIG_FAIR_GROUP_SCHED | 559 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 437 | /* list of leaf cfs_rq on this cpu: */ | 560 | /* list of leaf cfs_rq on this cpu: */ |
| @@ -827,8 +950,6 @@ static inline u64 global_rt_runtime(void) | |||
| 827 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | 950 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; |
| 828 | } | 951 | } |
| 829 | 952 | ||
| 830 | |||
| 831 | |||
| 832 | static inline int task_current(struct rq *rq, struct task_struct *p) | 953 | static inline int task_current(struct rq *rq, struct task_struct *p) |
| 833 | { | 954 | { |
| 834 | return rq->curr == p; | 955 | return rq->curr == p; |
| @@ -988,6 +1109,7 @@ static const u32 prio_to_wmult[40] = { | |||
| 988 | #else | 1109 | #else |
| 989 | #define ENQUEUE_WAKING 0 | 1110 | #define ENQUEUE_WAKING 0 |
| 990 | #endif | 1111 | #endif |
| 1112 | #define ENQUEUE_REPLENISH 8 | ||
| 991 | 1113 | ||
| 992 | #define DEQUEUE_SLEEP 1 | 1114 | #define DEQUEUE_SLEEP 1 |
| 993 | 1115 | ||
| @@ -1023,6 +1145,7 @@ struct sched_class { | |||
| 1023 | void (*set_curr_task) (struct rq *rq); | 1145 | void (*set_curr_task) (struct rq *rq); |
| 1024 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); | 1146 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); |
| 1025 | void (*task_fork) (struct task_struct *p); | 1147 | void (*task_fork) (struct task_struct *p); |
| 1148 | void (*task_dead) (struct task_struct *p); | ||
| 1026 | 1149 | ||
| 1027 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); | 1150 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); |
| 1028 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); | 1151 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); |
| @@ -1042,6 +1165,7 @@ struct sched_class { | |||
| 1042 | for (class = sched_class_highest; class; class = class->next) | 1165 | for (class = sched_class_highest; class; class = class->next) |
| 1043 | 1166 | ||
| 1044 | extern const struct sched_class stop_sched_class; | 1167 | extern const struct sched_class stop_sched_class; |
| 1168 | extern const struct sched_class dl_sched_class; | ||
| 1045 | extern const struct sched_class rt_sched_class; | 1169 | extern const struct sched_class rt_sched_class; |
| 1046 | extern const struct sched_class fair_sched_class; | 1170 | extern const struct sched_class fair_sched_class; |
| 1047 | extern const struct sched_class idle_sched_class; | 1171 | extern const struct sched_class idle_sched_class; |
| @@ -1051,7 +1175,7 @@ extern const struct sched_class idle_sched_class; | |||
| 1051 | 1175 | ||
| 1052 | extern void update_group_power(struct sched_domain *sd, int cpu); | 1176 | extern void update_group_power(struct sched_domain *sd, int cpu); |
| 1053 | 1177 | ||
| 1054 | extern void trigger_load_balance(struct rq *rq, int cpu); | 1178 | extern void trigger_load_balance(struct rq *rq); |
| 1055 | extern void idle_balance(int this_cpu, struct rq *this_rq); | 1179 | extern void idle_balance(int this_cpu, struct rq *this_rq); |
| 1056 | 1180 | ||
| 1057 | extern void idle_enter_fair(struct rq *this_rq); | 1181 | extern void idle_enter_fair(struct rq *this_rq); |
| @@ -1068,8 +1192,11 @@ static inline void idle_balance(int cpu, struct rq *rq) | |||
| 1068 | extern void sysrq_sched_debug_show(void); | 1192 | extern void sysrq_sched_debug_show(void); |
| 1069 | extern void sched_init_granularity(void); | 1193 | extern void sched_init_granularity(void); |
| 1070 | extern void update_max_interval(void); | 1194 | extern void update_max_interval(void); |
| 1195 | |||
| 1196 | extern void init_sched_dl_class(void); | ||
| 1071 | extern void init_sched_rt_class(void); | 1197 | extern void init_sched_rt_class(void); |
| 1072 | extern void init_sched_fair_class(void); | 1198 | extern void init_sched_fair_class(void); |
| 1199 | extern void init_sched_dl_class(void); | ||
| 1073 | 1200 | ||
| 1074 | extern void resched_task(struct task_struct *p); | 1201 | extern void resched_task(struct task_struct *p); |
| 1075 | extern void resched_cpu(int cpu); | 1202 | extern void resched_cpu(int cpu); |
| @@ -1077,6 +1204,12 @@ extern void resched_cpu(int cpu); | |||
| 1077 | extern struct rt_bandwidth def_rt_bandwidth; | 1204 | extern struct rt_bandwidth def_rt_bandwidth; |
| 1078 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); | 1205 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); |
| 1079 | 1206 | ||
| 1207 | extern struct dl_bandwidth def_dl_bandwidth; | ||
| 1208 | extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); | ||
| 1209 | extern void init_dl_task_timer(struct sched_dl_entity *dl_se); | ||
| 1210 | |||
| 1211 | unsigned long to_ratio(u64 period, u64 runtime); | ||
| 1212 | |||
| 1080 | extern void update_idle_cpu_load(struct rq *this_rq); | 1213 | extern void update_idle_cpu_load(struct rq *this_rq); |
| 1081 | 1214 | ||
| 1082 | extern void init_task_runnable_average(struct task_struct *p); | 1215 | extern void init_task_runnable_average(struct task_struct *p); |
| @@ -1353,6 +1486,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu); | |||
| 1353 | 1486 | ||
| 1354 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 1487 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); |
| 1355 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | 1488 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); |
| 1489 | extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); | ||
| 1356 | 1490 | ||
| 1357 | extern void cfs_bandwidth_usage_inc(void); | 1491 | extern void cfs_bandwidth_usage_inc(void); |
| 1358 | extern void cfs_bandwidth_usage_dec(void); | 1492 | extern void cfs_bandwidth_usage_dec(void); |
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 47197de8abd9..fdb6bb0b3356 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
| @@ -103,7 +103,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task) | |||
| 103 | * Simple, special scheduling class for the per-CPU stop tasks: | 103 | * Simple, special scheduling class for the per-CPU stop tasks: |
| 104 | */ | 104 | */ |
| 105 | const struct sched_class stop_sched_class = { | 105 | const struct sched_class stop_sched_class = { |
| 106 | .next = &rt_sched_class, | 106 | .next = &dl_sched_class, |
| 107 | 107 | ||
| 108 | .enqueue_task = enqueue_task_stop, | 108 | .enqueue_task = enqueue_task_stop, |
| 109 | .dequeue_task = dequeue_task_stop, | 109 | .dequeue_task = dequeue_task_stop, |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 9a4500e4c189..8b93b3770f85 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -89,7 +89,7 @@ static void wakeup_softirqd(void) | |||
| 89 | * where hardirqs are disabled legitimately: | 89 | * where hardirqs are disabled legitimately: |
| 90 | */ | 90 | */ |
| 91 | #ifdef CONFIG_TRACE_IRQFLAGS | 91 | #ifdef CONFIG_TRACE_IRQFLAGS |
| 92 | static void __local_bh_disable(unsigned long ip, unsigned int cnt) | 92 | void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) |
| 93 | { | 93 | { |
| 94 | unsigned long flags; | 94 | unsigned long flags; |
| 95 | 95 | ||
| @@ -107,33 +107,21 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) | |||
| 107 | /* | 107 | /* |
| 108 | * Were softirqs turned off above: | 108 | * Were softirqs turned off above: |
| 109 | */ | 109 | */ |
| 110 | if (softirq_count() == cnt) | 110 | if (softirq_count() == (cnt & SOFTIRQ_MASK)) |
| 111 | trace_softirqs_off(ip); | 111 | trace_softirqs_off(ip); |
| 112 | raw_local_irq_restore(flags); | 112 | raw_local_irq_restore(flags); |
| 113 | 113 | ||
| 114 | if (preempt_count() == cnt) | 114 | if (preempt_count() == cnt) |
| 115 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 115 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); |
| 116 | } | 116 | } |
| 117 | #else /* !CONFIG_TRACE_IRQFLAGS */ | 117 | EXPORT_SYMBOL(__local_bh_disable_ip); |
| 118 | static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) | ||
| 119 | { | ||
| 120 | preempt_count_add(cnt); | ||
| 121 | barrier(); | ||
| 122 | } | ||
| 123 | #endif /* CONFIG_TRACE_IRQFLAGS */ | 118 | #endif /* CONFIG_TRACE_IRQFLAGS */ |
| 124 | 119 | ||
| 125 | void local_bh_disable(void) | ||
| 126 | { | ||
| 127 | __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET); | ||
| 128 | } | ||
| 129 | |||
| 130 | EXPORT_SYMBOL(local_bh_disable); | ||
| 131 | |||
| 132 | static void __local_bh_enable(unsigned int cnt) | 120 | static void __local_bh_enable(unsigned int cnt) |
| 133 | { | 121 | { |
| 134 | WARN_ON_ONCE(!irqs_disabled()); | 122 | WARN_ON_ONCE(!irqs_disabled()); |
| 135 | 123 | ||
| 136 | if (softirq_count() == cnt) | 124 | if (softirq_count() == (cnt & SOFTIRQ_MASK)) |
| 137 | trace_softirqs_on(_RET_IP_); | 125 | trace_softirqs_on(_RET_IP_); |
| 138 | preempt_count_sub(cnt); | 126 | preempt_count_sub(cnt); |
| 139 | } | 127 | } |
| @@ -151,7 +139,7 @@ void _local_bh_enable(void) | |||
| 151 | 139 | ||
| 152 | EXPORT_SYMBOL(_local_bh_enable); | 140 | EXPORT_SYMBOL(_local_bh_enable); |
| 153 | 141 | ||
| 154 | static inline void _local_bh_enable_ip(unsigned long ip) | 142 | void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) |
| 155 | { | 143 | { |
| 156 | WARN_ON_ONCE(in_irq() || irqs_disabled()); | 144 | WARN_ON_ONCE(in_irq() || irqs_disabled()); |
| 157 | #ifdef CONFIG_TRACE_IRQFLAGS | 145 | #ifdef CONFIG_TRACE_IRQFLAGS |
| @@ -166,7 +154,7 @@ static inline void _local_bh_enable_ip(unsigned long ip) | |||
| 166 | * Keep preemption disabled until we are done with | 154 | * Keep preemption disabled until we are done with |
| 167 | * softirq processing: | 155 | * softirq processing: |
| 168 | */ | 156 | */ |
| 169 | preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1); | 157 | preempt_count_sub(cnt - 1); |
| 170 | 158 | ||
| 171 | if (unlikely(!in_interrupt() && local_softirq_pending())) { | 159 | if (unlikely(!in_interrupt() && local_softirq_pending())) { |
| 172 | /* | 160 | /* |
| @@ -182,18 +170,7 @@ static inline void _local_bh_enable_ip(unsigned long ip) | |||
| 182 | #endif | 170 | #endif |
| 183 | preempt_check_resched(); | 171 | preempt_check_resched(); |
| 184 | } | 172 | } |
| 185 | 173 | EXPORT_SYMBOL(__local_bh_enable_ip); | |
| 186 | void local_bh_enable(void) | ||
| 187 | { | ||
| 188 | _local_bh_enable_ip(_RET_IP_); | ||
| 189 | } | ||
| 190 | EXPORT_SYMBOL(local_bh_enable); | ||
| 191 | |||
| 192 | void local_bh_enable_ip(unsigned long ip) | ||
| 193 | { | ||
| 194 | _local_bh_enable_ip(ip); | ||
| 195 | } | ||
| 196 | EXPORT_SYMBOL(local_bh_enable_ip); | ||
| 197 | 174 | ||
| 198 | /* | 175 | /* |
| 199 | * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, | 176 | * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, |
| @@ -264,7 +241,7 @@ asmlinkage void __do_softirq(void) | |||
| 264 | pending = local_softirq_pending(); | 241 | pending = local_softirq_pending(); |
| 265 | account_irq_enter_time(current); | 242 | account_irq_enter_time(current); |
| 266 | 243 | ||
| 267 | __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); | 244 | __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); |
| 268 | in_hardirq = lockdep_softirq_start(); | 245 | in_hardirq = lockdep_softirq_start(); |
| 269 | 246 | ||
| 270 | cpu = smp_processor_id(); | 247 | cpu = smp_processor_id(); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 34a604726d0b..c8da99f905cf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -385,13 +385,6 @@ static struct ctl_table kern_table[] = { | |||
| 385 | .proc_handler = proc_dointvec, | 385 | .proc_handler = proc_dointvec, |
| 386 | }, | 386 | }, |
| 387 | { | 387 | { |
| 388 | .procname = "numa_balancing_settle_count", | ||
| 389 | .data = &sysctl_numa_balancing_settle_count, | ||
| 390 | .maxlen = sizeof(unsigned int), | ||
| 391 | .mode = 0644, | ||
| 392 | .proc_handler = proc_dointvec, | ||
| 393 | }, | ||
| 394 | { | ||
| 395 | .procname = "numa_balancing_migrate_deferred", | 388 | .procname = "numa_balancing_migrate_deferred", |
| 396 | .data = &sysctl_numa_balancing_migrate_deferred, | 389 | .data = &sysctl_numa_balancing_migrate_deferred, |
| 397 | .maxlen = sizeof(unsigned int), | 390 | .maxlen = sizeof(unsigned int), |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index ea20f7d1ac2c..c833249ab0fb 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -177,7 +177,7 @@ static bool can_stop_full_tick(void) | |||
| 177 | * TODO: kick full dynticks CPUs when | 177 | * TODO: kick full dynticks CPUs when |
| 178 | * sched_clock_stable is set. | 178 | * sched_clock_stable is set. |
| 179 | */ | 179 | */ |
| 180 | if (!sched_clock_stable) { | 180 | if (!sched_clock_stable()) { |
| 181 | trace_tick_stop(0, "unstable sched clock\n"); | 181 | trace_tick_stop(0, "unstable sched clock\n"); |
| 182 | /* | 182 | /* |
| 183 | * Don't allow the user to think they can get | 183 | * Don't allow the user to think they can get |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cc2f66f68dc5..294b8a271a04 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -2558,7 +2558,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, | |||
| 2558 | if (unlikely(test_time_stamp(delta))) { | 2558 | if (unlikely(test_time_stamp(delta))) { |
| 2559 | int local_clock_stable = 1; | 2559 | int local_clock_stable = 1; |
| 2560 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 2560 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
| 2561 | local_clock_stable = sched_clock_stable; | 2561 | local_clock_stable = sched_clock_stable(); |
| 2562 | #endif | 2562 | #endif |
| 2563 | WARN_ONCE(delta > (1ULL << 59), | 2563 | WARN_ONCE(delta > (1ULL << 59), |
| 2564 | KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", | 2564 | KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", |
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index fee77e15d815..6e32635e5e57 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/uaccess.h> | 16 | #include <linux/uaccess.h> |
| 17 | #include <linux/ftrace.h> | 17 | #include <linux/ftrace.h> |
| 18 | #include <linux/sched/rt.h> | 18 | #include <linux/sched/rt.h> |
| 19 | #include <linux/sched/deadline.h> | ||
| 19 | #include <trace/events/sched.h> | 20 | #include <trace/events/sched.h> |
| 20 | #include "trace.h" | 21 | #include "trace.h" |
| 21 | 22 | ||
| @@ -27,6 +28,8 @@ static int wakeup_cpu; | |||
| 27 | static int wakeup_current_cpu; | 28 | static int wakeup_current_cpu; |
| 28 | static unsigned wakeup_prio = -1; | 29 | static unsigned wakeup_prio = -1; |
| 29 | static int wakeup_rt; | 30 | static int wakeup_rt; |
| 31 | static int wakeup_dl; | ||
| 32 | static int tracing_dl = 0; | ||
| 30 | 33 | ||
| 31 | static arch_spinlock_t wakeup_lock = | 34 | static arch_spinlock_t wakeup_lock = |
| 32 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | 35 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; |
| @@ -437,6 +440,7 @@ static void __wakeup_reset(struct trace_array *tr) | |||
| 437 | { | 440 | { |
| 438 | wakeup_cpu = -1; | 441 | wakeup_cpu = -1; |
| 439 | wakeup_prio = -1; | 442 | wakeup_prio = -1; |
| 443 | tracing_dl = 0; | ||
| 440 | 444 | ||
| 441 | if (wakeup_task) | 445 | if (wakeup_task) |
| 442 | put_task_struct(wakeup_task); | 446 | put_task_struct(wakeup_task); |
| @@ -472,9 +476,17 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) | |||
| 472 | tracing_record_cmdline(p); | 476 | tracing_record_cmdline(p); |
| 473 | tracing_record_cmdline(current); | 477 | tracing_record_cmdline(current); |
| 474 | 478 | ||
| 475 | if ((wakeup_rt && !rt_task(p)) || | 479 | /* |
| 476 | p->prio >= wakeup_prio || | 480 | * Semantic is like this: |
| 477 | p->prio >= current->prio) | 481 | * - wakeup tracer handles all tasks in the system, independently |
| 482 | * from their scheduling class; | ||
| 483 | * - wakeup_rt tracer handles tasks belonging to sched_dl and | ||
| 484 | * sched_rt class; | ||
| 485 | * - wakeup_dl handles tasks belonging to sched_dl class only. | ||
| 486 | */ | ||
| 487 | if (tracing_dl || (wakeup_dl && !dl_task(p)) || | ||
| 488 | (wakeup_rt && !dl_task(p) && !rt_task(p)) || | ||
| 489 | (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) | ||
| 478 | return; | 490 | return; |
| 479 | 491 | ||
| 480 | pc = preempt_count(); | 492 | pc = preempt_count(); |
| @@ -486,7 +498,8 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) | |||
| 486 | arch_spin_lock(&wakeup_lock); | 498 | arch_spin_lock(&wakeup_lock); |
| 487 | 499 | ||
| 488 | /* check for races. */ | 500 | /* check for races. */ |
| 489 | if (!tracer_enabled || p->prio >= wakeup_prio) | 501 | if (!tracer_enabled || tracing_dl || |
| 502 | (!dl_task(p) && p->prio >= wakeup_prio)) | ||
| 490 | goto out_locked; | 503 | goto out_locked; |
| 491 | 504 | ||
| 492 | /* reset the trace */ | 505 | /* reset the trace */ |
| @@ -496,6 +509,15 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) | |||
| 496 | wakeup_current_cpu = wakeup_cpu; | 509 | wakeup_current_cpu = wakeup_cpu; |
| 497 | wakeup_prio = p->prio; | 510 | wakeup_prio = p->prio; |
| 498 | 511 | ||
| 512 | /* | ||
| 513 | * Once you start tracing a -deadline task, don't bother tracing | ||
| 514 | * another task until the first one wakes up. | ||
| 515 | */ | ||
| 516 | if (dl_task(p)) | ||
| 517 | tracing_dl = 1; | ||
| 518 | else | ||
| 519 | tracing_dl = 0; | ||
| 520 | |||
| 499 | wakeup_task = p; | 521 | wakeup_task = p; |
| 500 | get_task_struct(wakeup_task); | 522 | get_task_struct(wakeup_task); |
| 501 | 523 | ||
| @@ -597,16 +619,25 @@ static int __wakeup_tracer_init(struct trace_array *tr) | |||
| 597 | 619 | ||
| 598 | static int wakeup_tracer_init(struct trace_array *tr) | 620 | static int wakeup_tracer_init(struct trace_array *tr) |
| 599 | { | 621 | { |
| 622 | wakeup_dl = 0; | ||
| 600 | wakeup_rt = 0; | 623 | wakeup_rt = 0; |
| 601 | return __wakeup_tracer_init(tr); | 624 | return __wakeup_tracer_init(tr); |
| 602 | } | 625 | } |
| 603 | 626 | ||
| 604 | static int wakeup_rt_tracer_init(struct trace_array *tr) | 627 | static int wakeup_rt_tracer_init(struct trace_array *tr) |
| 605 | { | 628 | { |
| 629 | wakeup_dl = 0; | ||
| 606 | wakeup_rt = 1; | 630 | wakeup_rt = 1; |
| 607 | return __wakeup_tracer_init(tr); | 631 | return __wakeup_tracer_init(tr); |
| 608 | } | 632 | } |
| 609 | 633 | ||
| 634 | static int wakeup_dl_tracer_init(struct trace_array *tr) | ||
| 635 | { | ||
| 636 | wakeup_dl = 1; | ||
| 637 | wakeup_rt = 0; | ||
| 638 | return __wakeup_tracer_init(tr); | ||
| 639 | } | ||
| 640 | |||
| 610 | static void wakeup_tracer_reset(struct trace_array *tr) | 641 | static void wakeup_tracer_reset(struct trace_array *tr) |
| 611 | { | 642 | { |
| 612 | int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; | 643 | int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; |
| @@ -674,6 +705,28 @@ static struct tracer wakeup_rt_tracer __read_mostly = | |||
| 674 | .use_max_tr = true, | 705 | .use_max_tr = true, |
| 675 | }; | 706 | }; |
| 676 | 707 | ||
| 708 | static struct tracer wakeup_dl_tracer __read_mostly = | ||
| 709 | { | ||
| 710 | .name = "wakeup_dl", | ||
| 711 | .init = wakeup_dl_tracer_init, | ||
| 712 | .reset = wakeup_tracer_reset, | ||
| 713 | .start = wakeup_tracer_start, | ||
| 714 | .stop = wakeup_tracer_stop, | ||
| 715 | .wait_pipe = poll_wait_pipe, | ||
| 716 | .print_max = true, | ||
| 717 | .print_header = wakeup_print_header, | ||
| 718 | .print_line = wakeup_print_line, | ||
| 719 | .flags = &tracer_flags, | ||
| 720 | .set_flag = wakeup_set_flag, | ||
| 721 | .flag_changed = wakeup_flag_changed, | ||
| 722 | #ifdef CONFIG_FTRACE_SELFTEST | ||
| 723 | .selftest = trace_selftest_startup_wakeup, | ||
| 724 | #endif | ||
| 725 | .open = wakeup_trace_open, | ||
| 726 | .close = wakeup_trace_close, | ||
| 727 | .use_max_tr = true, | ||
| 728 | }; | ||
| 729 | |||
| 677 | __init static int init_wakeup_tracer(void) | 730 | __init static int init_wakeup_tracer(void) |
| 678 | { | 731 | { |
| 679 | int ret; | 732 | int ret; |
| @@ -686,6 +739,10 @@ __init static int init_wakeup_tracer(void) | |||
| 686 | if (ret) | 739 | if (ret) |
| 687 | return ret; | 740 | return ret; |
| 688 | 741 | ||
| 742 | ret = register_tracer(&wakeup_dl_tracer); | ||
| 743 | if (ret) | ||
| 744 | return ret; | ||
| 745 | |||
| 689 | return 0; | 746 | return 0; |
| 690 | } | 747 | } |
| 691 | core_initcall(init_wakeup_tracer); | 748 | core_initcall(init_wakeup_tracer); |
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a7329b7902f8..e98fca60974f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
| @@ -1022,11 +1022,16 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) | |||
| 1022 | #ifdef CONFIG_SCHED_TRACER | 1022 | #ifdef CONFIG_SCHED_TRACER |
| 1023 | static int trace_wakeup_test_thread(void *data) | 1023 | static int trace_wakeup_test_thread(void *data) |
| 1024 | { | 1024 | { |
| 1025 | /* Make this a RT thread, doesn't need to be too high */ | 1025 | /* Make this a -deadline thread */ |
| 1026 | static const struct sched_param param = { .sched_priority = 5 }; | 1026 | static const struct sched_attr attr = { |
| 1027 | .sched_policy = SCHED_DEADLINE, | ||
| 1028 | .sched_runtime = 100000ULL, | ||
| 1029 | .sched_deadline = 10000000ULL, | ||
| 1030 | .sched_period = 10000000ULL | ||
| 1031 | }; | ||
| 1027 | struct completion *x = data; | 1032 | struct completion *x = data; |
| 1028 | 1033 | ||
| 1029 | sched_setscheduler(current, SCHED_FIFO, ¶m); | 1034 | sched_setattr(current, &attr); |
| 1030 | 1035 | ||
| 1031 | /* Make it know we have a new prio */ | 1036 | /* Make it know we have a new prio */ |
| 1032 | complete(x); | 1037 | complete(x); |
| @@ -1040,8 +1045,8 @@ static int trace_wakeup_test_thread(void *data) | |||
| 1040 | /* we are awake, now wait to disappear */ | 1045 | /* we are awake, now wait to disappear */ |
| 1041 | while (!kthread_should_stop()) { | 1046 | while (!kthread_should_stop()) { |
| 1042 | /* | 1047 | /* |
| 1043 | * This is an RT task, do short sleeps to let | 1048 | * This will likely be the system top priority |
| 1044 | * others run. | 1049 | * task, do short sleeps to let others run. |
| 1045 | */ | 1050 | */ |
| 1046 | msleep(100); | 1051 | msleep(100); |
| 1047 | } | 1052 | } |
| @@ -1054,21 +1059,21 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) | |||
| 1054 | { | 1059 | { |
| 1055 | unsigned long save_max = tracing_max_latency; | 1060 | unsigned long save_max = tracing_max_latency; |
| 1056 | struct task_struct *p; | 1061 | struct task_struct *p; |
| 1057 | struct completion isrt; | 1062 | struct completion is_ready; |
| 1058 | unsigned long count; | 1063 | unsigned long count; |
| 1059 | int ret; | 1064 | int ret; |
| 1060 | 1065 | ||
| 1061 | init_completion(&isrt); | 1066 | init_completion(&is_ready); |
| 1062 | 1067 | ||
| 1063 | /* create a high prio thread */ | 1068 | /* create a -deadline thread */ |
| 1064 | p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); | 1069 | p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test"); |
| 1065 | if (IS_ERR(p)) { | 1070 | if (IS_ERR(p)) { |
| 1066 | printk(KERN_CONT "Failed to create ftrace wakeup test thread "); | 1071 | printk(KERN_CONT "Failed to create ftrace wakeup test thread "); |
| 1067 | return -1; | 1072 | return -1; |
| 1068 | } | 1073 | } |
| 1069 | 1074 | ||
| 1070 | /* make sure the thread is running at an RT prio */ | 1075 | /* make sure the thread is running at -deadline policy */ |
| 1071 | wait_for_completion(&isrt); | 1076 | wait_for_completion(&is_ready); |
| 1072 | 1077 | ||
| 1073 | /* start the tracing */ | 1078 | /* start the tracing */ |
| 1074 | ret = tracer_init(trace, tr); | 1079 | ret = tracer_init(trace, tr); |
| @@ -1082,19 +1087,19 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) | |||
| 1082 | 1087 | ||
| 1083 | while (p->on_rq) { | 1088 | while (p->on_rq) { |
| 1084 | /* | 1089 | /* |
| 1085 | * Sleep to make sure the RT thread is asleep too. | 1090 | * Sleep to make sure the -deadline thread is asleep too. |
| 1086 | * On virtual machines we can't rely on timings, | 1091 | * On virtual machines we can't rely on timings, |
| 1087 | * but we want to make sure this test still works. | 1092 | * but we want to make sure this test still works. |
| 1088 | */ | 1093 | */ |
| 1089 | msleep(100); | 1094 | msleep(100); |
| 1090 | } | 1095 | } |
| 1091 | 1096 | ||
| 1092 | init_completion(&isrt); | 1097 | init_completion(&is_ready); |
| 1093 | 1098 | ||
| 1094 | wake_up_process(p); | 1099 | wake_up_process(p); |
| 1095 | 1100 | ||
| 1096 | /* Wait for the task to wake up */ | 1101 | /* Wait for the task to wake up */ |
| 1097 | wait_for_completion(&isrt); | 1102 | wait_for_completion(&is_ready); |
| 1098 | 1103 | ||
| 1099 | /* stop the tracing. */ | 1104 | /* stop the tracing. */ |
| 1100 | tracing_stop(); | 1105 | tracing_stop(); |
