path: root/kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 163
1 file changed, 128 insertions(+), 35 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 86f5a063f0b9..1dae85a1221a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -467,11 +467,17 @@ struct rt_rq {
 	struct rt_prio_array active;
 	unsigned long rt_nr_running;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	int highest_prio; /* highest queued rt task prio */
+	struct {
+		int curr; /* highest queued rt task prio */
+#ifdef CONFIG_SMP
+		int next; /* next highest */
+#endif
+	} highest_prio;
 #endif
 #ifdef CONFIG_SMP
 	unsigned long rt_nr_migratory;
 	int overloaded;
+	struct plist_head pushable_tasks;
 #endif
 	int rt_throttled;
 	u64 rt_time;
@@ -1323,8 +1329,8 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
  * slice expiry etc.
  */

-#define WEIGHT_IDLEPRIO		2
-#define WMULT_IDLEPRIO		(1 << 31)
+#define WEIGHT_IDLEPRIO		3
+#define WMULT_IDLEPRIO		1431655765

 /*
  * Nice levels are multiplicative, with a gentle 10% change for every
@@ -1610,21 +1616,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)

 #endif

+#ifdef CONFIG_PREEMPT
+
 /*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ * way at the expense of forcing extra atomic operations in all
+ * invocations. This assures that the double_lock is acquired using the
+ * same underlying policy as the spinlock_t on this architecture, which
+ * reduces latency compared to the unfair variant below. However, it
+ * also adds more overhead and therefore may reduce throughput.
  */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(this_rq->lock)
+	__acquires(busiest->lock)
+	__acquires(this_rq->lock)
+{
+	spin_unlock(&this_rq->lock);
+	double_rq_lock(this_rq, busiest);
+
+	return 1;
+}
+
+#else
+/*
+ * Unfair double_lock_balance: Optimizes throughput at the expense of
+ * latency by eliminating extra atomic operations when the locks are
+ * already in proper order on entry. This favors lower cpu-ids and will
+ * grant the double lock to lower cpus over higher ids under contention,
+ * regardless of entry order into the function.
+ */
+static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
 	__releases(this_rq->lock)
 	__acquires(busiest->lock)
 	__acquires(this_rq->lock)
 {
 	int ret = 0;

-	if (unlikely(!irqs_disabled())) {
-		/* printk() doesn't work good under rq->lock */
-		spin_unlock(&this_rq->lock);
-		BUG_ON(1);
-	}
 	if (unlikely(!spin_trylock(&busiest->lock))) {
 		if (busiest < this_rq) {
 			spin_unlock(&this_rq->lock);
@@ -1637,6 +1664,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 	return ret;
 }

+#endif /* CONFIG_PREEMPT */
+
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+{
+	if (unlikely(!irqs_disabled())) {
+		/* printk() doesn't work good under rq->lock */
+		spin_unlock(&this_rq->lock);
+		BUG_ON(1);
+	}
+
+	return _double_lock_balance(this_rq, busiest);
+}
+
 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
 	__releases(busiest->lock)
 {
@@ -2274,6 +2317,16 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	if (!sched_feat(SYNC_WAKEUPS))
 		sync = 0;

+	if (!sync) {
+		if (current->se.avg_overlap < sysctl_sched_migration_cost &&
+			  p->se.avg_overlap < sysctl_sched_migration_cost)
+			sync = 1;
+	} else {
+		if (current->se.avg_overlap >= sysctl_sched_migration_cost ||
+			  p->se.avg_overlap >= sysctl_sched_migration_cost)
+			sync = 0;
+	}
+
 #ifdef CONFIG_SMP
 	if (sched_feat(LB_WAKEUP_UPDATE)) {
 		struct sched_domain *sd;
@@ -2472,6 +2525,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
 #endif
+	plist_node_init(&p->pushable_tasks, MAX_PRIO);
+
 	put_cpu();
 }

@@ -2612,6 +2667,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 {
 	struct mm_struct *mm = rq->prev_mm;
 	long prev_state;
+#ifdef CONFIG_SMP
+	int post_schedule = 0;
+
+	if (current->sched_class->needs_post_schedule)
+		post_schedule = current->sched_class->needs_post_schedule(rq);
+#endif

 	rq->prev_mm = NULL;

@@ -2630,7 +2691,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	finish_arch_switch(prev);
 	finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
-	if (current->sched_class->post_schedule)
+	if (post_schedule)
 		current->sched_class->post_schedule(rq);
 #endif

@@ -3011,6 +3072,16 @@ next:
 	pulled++;
 	rem_load_move -= p->se.load.weight;

+#ifdef CONFIG_PREEMPT
+	/*
+	 * NEWIDLE balancing is a source of latency, so preemptible kernels
+	 * will stop after the first task is pulled to minimize the critical
+	 * section.
+	 */
+	if (idle == CPU_NEWLY_IDLE)
+		goto out;
+#endif
+
 	/*
 	 * We only want to steal up to the prescribed amount of weighted load.
 	 */
@@ -3057,9 +3128,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 					sd, idle, all_pinned, &this_best_prio);
 		class = class->next;

+#ifdef CONFIG_PREEMPT
+		/*
+		 * NEWIDLE balancing is a source of latency, so preemptible
+		 * kernels will stop after the first task is pulled to minimize
+		 * the critical section.
+		 */
 		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
 			break;
-
+#endif
 	} while (class && max_load_move > total_load_moved);

 	return total_load_moved > 0;
@@ -3904,19 +3981,24 @@ int select_nohz_load_balancer(int stop_tick)
 	int cpu = smp_processor_id();

 	if (stop_tick) {
-		cpumask_set_cpu(cpu, nohz.cpu_mask);
 		cpu_rq(cpu)->in_nohz_recently = 1;

-		/*
-		 * If we are going offline and still the leader, give up!
-		 */
-		if (!cpu_active(cpu) &&
-		    atomic_read(&nohz.load_balancer) == cpu) {
+		if (!cpu_active(cpu)) {
+			if (atomic_read(&nohz.load_balancer) != cpu)
+				return 0;
+
+			/*
+			 * If we are going offline and still the leader,
+			 * give up!
+			 */
 			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
 				BUG();
+
 			return 0;
 		}

+		cpumask_set_cpu(cpu, nohz.cpu_mask);
+
 		/* time for ilb owner also to sleep */
 		if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
 			if (atomic_read(&nohz.load_balancer) == cpu)
@@ -4464,7 +4546,7 @@ void __kprobes sub_preempt_count(int val)
 	/*
 	 * Underflow?
 	 */
-	if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
+	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
 		return;
 	/*
 	 * Is the spinlock portion underflowing?
@@ -5150,7 +5232,7 @@ int can_nice(const struct task_struct *p, const int nice)
  * sys_setpriority is a more generic, but much slower function that
  * does similar things.
  */
-asmlinkage long sys_nice(int increment)
+SYSCALL_DEFINE1(nice, int, increment)
 {
 	long nice, retval;

@@ -5457,8 +5539,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
  * @policy: new policy.
  * @param: structure containing the new RT priority.
  */
-asmlinkage long
-sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
+		struct sched_param __user *, param)
 {
 	/* negative values for policy are not valid */
 	if (policy < 0)
@@ -5472,7 +5554,7 @@ sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
  * @pid: the pid in question.
  * @param: structure containing the new RT priority.
  */
-asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
 {
 	return do_sched_setscheduler(pid, -1, param);
 }
@@ -5481,7 +5563,7 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
  * @pid: the pid in question.
  */
-asmlinkage long sys_sched_getscheduler(pid_t pid)
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
 {
 	struct task_struct *p;
 	int retval;
@@ -5506,7 +5588,7 @@ asmlinkage long sys_sched_getscheduler(pid_t pid)
  * @pid: the pid in question.
  * @param: structure containing the RT priority.
  */
-asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
 {
 	struct sched_param lp;
 	struct task_struct *p;
@@ -5624,8 +5706,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
  * @user_mask_ptr: user-space pointer to the new cpu mask
  */
-asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
-				      unsigned long __user *user_mask_ptr)
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
 {
 	cpumask_var_t new_mask;
 	int retval;
@@ -5672,8 +5754,8 @@ out_unlock:
  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
  * @user_mask_ptr: user-space pointer to hold the current cpu mask
  */
-asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
-				      unsigned long __user *user_mask_ptr)
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
 {
 	int ret;
 	cpumask_var_t mask;
@@ -5702,7 +5784,7 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 */
-asmlinkage long sys_sched_yield(void)
+SYSCALL_DEFINE0(sched_yield)
 {
	struct rq *rq = this_rq_lock();

@@ -5843,7 +5925,7 @@ long __sched io_schedule_timeout(long timeout)
 * this syscall returns the maximum rt_priority that can be used
 * by a given scheduling class.
 */
-asmlinkage long sys_sched_get_priority_max(int policy)
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
 {
	int ret = -EINVAL;

@@ -5868,7 +5950,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
 * this syscall returns the minimum rt_priority that can be used
 * by a given scheduling class.
 */
-asmlinkage long sys_sched_get_priority_min(int policy)
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
 {
	int ret = -EINVAL;

@@ -5893,8 +5975,8 @@ asmlinkage long sys_sched_get_priority_min(int policy)
 * this syscall writes the default timeslice value of a given process
 * into the user-space timespec buffer. A value of '0' means infinity.
 */
-asmlinkage
-long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+		struct timespec __user *, interval)
 {
	struct task_struct *p;
	unsigned int time_slice;
@@ -8228,11 +8310,15 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 	__set_bit(MAX_RT_PRIO, array->bitmap);

 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	rt_rq->highest_prio = MAX_RT_PRIO;
+	rt_rq->highest_prio.curr = MAX_RT_PRIO;
+#ifdef CONFIG_SMP
+	rt_rq->highest_prio.next = MAX_RT_PRIO;
+#endif
 #endif
 #ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
 	rt_rq->overloaded = 0;
+	plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
 #endif

 	rt_rq->rt_time = 0;
@@ -9074,6 +9160,13 @@ static int tg_schedulable(struct task_group *tg, void *data)
 		runtime = d->rt_runtime;
 	}

+#ifdef CONFIG_USER_SCHED
+	if (tg == &root_task_group) {
+		period = global_rt_period();
+		runtime = global_rt_runtime();
+	}
+#endif
+
 	/*
 	 * Cannot have more runtime than the period.
 	 */