Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c  822
1 file changed, 708 insertions(+), 114 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a88f4a485c5e..36c951b7eef8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -296,8 +296,6 @@ __read_mostly int scheduler_running;
296 */ 296 */
297int sysctl_sched_rt_runtime = 950000; 297int sysctl_sched_rt_runtime = 950000;
298 298
299
300
301/* 299/*
302 * __task_rq_lock - lock the rq @p resides on. 300 * __task_rq_lock - lock the rq @p resides on.
303 */ 301 */
@@ -899,7 +897,9 @@ static inline int normal_prio(struct task_struct *p)
899{ 897{
900 int prio; 898 int prio;
901 899
902 if (task_has_rt_policy(p)) 900 if (task_has_dl_policy(p))
901 prio = MAX_DL_PRIO-1;
902 else if (task_has_rt_policy(p))
903 prio = MAX_RT_PRIO-1 - p->rt_priority; 903 prio = MAX_RT_PRIO-1 - p->rt_priority;
904 else 904 else
905 prio = __normal_prio(p); 905 prio = __normal_prio(p);
@@ -945,7 +945,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
945 if (prev_class->switched_from) 945 if (prev_class->switched_from)
946 prev_class->switched_from(rq, p); 946 prev_class->switched_from(rq, p);
947 p->sched_class->switched_to(rq, p); 947 p->sched_class->switched_to(rq, p);
948 } else if (oldprio != p->prio) 948 } else if (oldprio != p->prio || dl_task(p))
949 p->sched_class->prio_changed(rq, p, oldprio); 949 p->sched_class->prio_changed(rq, p, oldprio);
950} 950}
951 951
@@ -1499,8 +1499,7 @@ void scheduler_ipi(void)
1499 * TIF_NEED_RESCHED remotely (for the first time) will also send 1499 * TIF_NEED_RESCHED remotely (for the first time) will also send
1500 * this IPI. 1500 * this IPI.
1501 */ 1501 */
1502 if (tif_need_resched()) 1502 preempt_fold_need_resched();
1503 set_preempt_need_resched();
1504 1503
1505 if (llist_empty(&this_rq()->wake_list) 1504 if (llist_empty(&this_rq()->wake_list)
1506 && !tick_nohz_full_cpu(smp_processor_id()) 1505 && !tick_nohz_full_cpu(smp_processor_id())
@@ -1717,6 +1716,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1717 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1716 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1718#endif 1717#endif
1719 1718
1719 RB_CLEAR_NODE(&p->dl.rb_node);
1720 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1721 p->dl.dl_runtime = p->dl.runtime = 0;
1722 p->dl.dl_deadline = p->dl.deadline = 0;
1723 p->dl.dl_period = 0;
1724 p->dl.flags = 0;
1725
1720 INIT_LIST_HEAD(&p->rt.run_list); 1726 INIT_LIST_HEAD(&p->rt.run_list);
1721 1727
1722#ifdef CONFIG_PREEMPT_NOTIFIERS 1728#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1768,7 +1774,7 @@ void set_numabalancing_state(bool enabled)
1768/* 1774/*
1769 * fork()/clone()-time setup: 1775 * fork()/clone()-time setup:
1770 */ 1776 */
1771void sched_fork(unsigned long clone_flags, struct task_struct *p) 1777int sched_fork(unsigned long clone_flags, struct task_struct *p)
1772{ 1778{
1773 unsigned long flags; 1779 unsigned long flags;
1774 int cpu = get_cpu(); 1780 int cpu = get_cpu();
@@ -1790,7 +1796,7 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
1790 * Revert to default priority/policy on fork if requested. 1796 * Revert to default priority/policy on fork if requested.
1791 */ 1797 */
1792 if (unlikely(p->sched_reset_on_fork)) { 1798 if (unlikely(p->sched_reset_on_fork)) {
1793 if (task_has_rt_policy(p)) { 1799 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
1794 p->policy = SCHED_NORMAL; 1800 p->policy = SCHED_NORMAL;
1795 p->static_prio = NICE_TO_PRIO(0); 1801 p->static_prio = NICE_TO_PRIO(0);
1796 p->rt_priority = 0; 1802 p->rt_priority = 0;
@@ -1807,8 +1813,14 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
1807 p->sched_reset_on_fork = 0; 1813 p->sched_reset_on_fork = 0;
1808 } 1814 }
1809 1815
1810 if (!rt_prio(p->prio)) 1816 if (dl_prio(p->prio)) {
1817 put_cpu();
1818 return -EAGAIN;
1819 } else if (rt_prio(p->prio)) {
1820 p->sched_class = &rt_sched_class;
1821 } else {
1811 p->sched_class = &fair_sched_class; 1822 p->sched_class = &fair_sched_class;
1823 }
1812 1824
1813 if (p->sched_class->task_fork) 1825 if (p->sched_class->task_fork)
1814 p->sched_class->task_fork(p); 1826 p->sched_class->task_fork(p);
@@ -1834,11 +1846,124 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
1834 init_task_preempt_count(p); 1846 init_task_preempt_count(p);
1835#ifdef CONFIG_SMP 1847#ifdef CONFIG_SMP
1836 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1848 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1849 RB_CLEAR_NODE(&p->pushable_dl_tasks);
1837#endif 1850#endif
1838 1851
1839 put_cpu(); 1852 put_cpu();
1853 return 0;
1854}
1855
1856unsigned long to_ratio(u64 period, u64 runtime)
1857{
1858 if (runtime == RUNTIME_INF)
1859 return 1ULL << 20;
1860
1861 /*
1862 * Doing this here saves a lot of checks in all
1863 * the calling paths, and returning zero seems
1864 * safe for them anyway.
1865 */
1866 if (period == 0)
1867 return 0;
1868
1869 return div64_u64(runtime << 20, period);
1870}
1871
1872#ifdef CONFIG_SMP
1873inline struct dl_bw *dl_bw_of(int i)
1874{
1875 return &cpu_rq(i)->rd->dl_bw;
1840} 1876}
1841 1877
1878static inline int dl_bw_cpus(int i)
1879{
1880 struct root_domain *rd = cpu_rq(i)->rd;
1881 int cpus = 0;
1882
1883 for_each_cpu_and(i, rd->span, cpu_active_mask)
1884 cpus++;
1885
1886 return cpus;
1887}
1888#else
1889inline struct dl_bw *dl_bw_of(int i)
1890{
1891 return &cpu_rq(i)->dl.dl_bw;
1892}
1893
1894static inline int dl_bw_cpus(int i)
1895{
1896 return 1;
1897}
1898#endif
1899
1900static inline
1901void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
1902{
1903 dl_b->total_bw -= tsk_bw;
1904}
1905
1906static inline
1907void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
1908{
1909 dl_b->total_bw += tsk_bw;
1910}
1911
1912static inline
1913bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
1914{
1915 return dl_b->bw != -1 &&
1916 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
1917}
1918
1919/*
1920 * We must be sure that accepting a new task (or allowing changing the
1921 * parameters of an existing one) is consistent with the bandwidth
1922 * constraints. If yes, this function also accordingly updates the currently
1923 * allocated bandwidth to reflect the new situation.
1924 *
1925 * This function is called while holding p's rq->lock.
1926 */
1927static int dl_overflow(struct task_struct *p, int policy,
1928 const struct sched_attr *attr)
1929{
1930
1931 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
1932 u64 period = attr->sched_period;
1933 u64 runtime = attr->sched_runtime;
1934 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
1935 int cpus, err = -1;
1936
1937 if (new_bw == p->dl.dl_bw)
1938 return 0;
1939
1940 /*
1941 * Whether a task enters, leaves, or stays -deadline but changes
1942 * its parameters, we may need to update the total allocated
1943 * bandwidth of the container accordingly.
1944 */
1945 raw_spin_lock(&dl_b->lock);
1946 cpus = dl_bw_cpus(task_cpu(p));
1947 if (dl_policy(policy) && !task_has_dl_policy(p) &&
1948 !__dl_overflow(dl_b, cpus, 0, new_bw)) {
1949 __dl_add(dl_b, new_bw);
1950 err = 0;
1951 } else if (dl_policy(policy) && task_has_dl_policy(p) &&
1952 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
1953 __dl_clear(dl_b, p->dl.dl_bw);
1954 __dl_add(dl_b, new_bw);
1955 err = 0;
1956 } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
1957 __dl_clear(dl_b, p->dl.dl_bw);
1958 err = 0;
1959 }
1960 raw_spin_unlock(&dl_b->lock);
1961
1962 return err;
1963}
1964
1965extern void init_dl_bw(struct dl_bw *dl_b);
1966
1842/* 1967/*
1843 * wake_up_new_task - wake up a newly created task for the first time. 1968 * wake_up_new_task - wake up a newly created task for the first time.
1844 * 1969 *
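
To make the fixed-point arithmetic in the hunk above concrete, here is a small userspace sketch (not kernel code) of how to_ratio() and __dl_overflow() fit together: utilizations are stored in 1/2^20 units, and a new -deadline task is admitted only while the summed utilization stays below the root domain's bandwidth times its CPU count. The numbers are illustrative (the 95% figure is the sysctl_sched_rt_runtime/_period default); the RUNTIME_INF and period == 0 special cases are left out for brevity.

#include <stdint.h>
#include <stdio.h>

/* Same <<20 fixed-point utilization used by to_ratio() above. */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return (runtime << 20) / period;
}

int main(void)
{
	uint64_t task_bw  = to_ratio(100000000ULL, 10000000ULL); /* 10ms every 100ms   */
	uint64_t rd_bw    = to_ratio(1000000ULL, 950000ULL);	  /* 95% default cap    */
	uint64_t total_bw = 0;					  /* nothing admitted yet */
	int cpus = 4;

	/* Same inequality as __dl_overflow(): reject if the new total
	 * utilization would exceed the root domain's bandwidth. */
	int overflow = rd_bw * (uint64_t)cpus < total_bw + task_bw;

	printf("task bw = %llu/1048576, admitted: %s\n",
	       (unsigned long long)task_bw, overflow ? "no" : "yes");
	return 0;
}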
@@ -2003,6 +2128,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2003 if (unlikely(prev_state == TASK_DEAD)) { 2128 if (unlikely(prev_state == TASK_DEAD)) {
2004 task_numa_free(prev); 2129 task_numa_free(prev);
2005 2130
2131 if (prev->sched_class->task_dead)
2132 prev->sched_class->task_dead(prev);
2133
2006 /* 2134 /*
2007 * Remove function-return probe instances associated with this 2135 * Remove function-return probe instances associated with this
2008 * task and put them back on the free list. 2136 * task and put them back on the free list.
@@ -2296,7 +2424,7 @@ void scheduler_tick(void)
2296 2424
2297#ifdef CONFIG_SMP 2425#ifdef CONFIG_SMP
2298 rq->idle_balance = idle_cpu(cpu); 2426 rq->idle_balance = idle_cpu(cpu);
2299 trigger_load_balance(rq, cpu); 2427 trigger_load_balance(rq);
2300#endif 2428#endif
2301 rq_last_tick_reset(rq); 2429 rq_last_tick_reset(rq);
2302} 2430}
@@ -2414,10 +2542,10 @@ static inline void schedule_debug(struct task_struct *prev)
2414{ 2542{
2415 /* 2543 /*
2416 * Test if we are atomic. Since do_exit() needs to call into 2544 * Test if we are atomic. Since do_exit() needs to call into
2417 * schedule() atomically, we ignore that path for now. 2545 * schedule() atomically, we ignore that path. Otherwise whine
2418 * Otherwise, whine if we are scheduling when we should not be. 2546 * if we are scheduling when we should not.
2419 */ 2547 */
2420 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 2548 if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
2421 __schedule_bug(prev); 2549 __schedule_bug(prev);
2422 rcu_sleep_check(); 2550 rcu_sleep_check();
2423 2551
@@ -2761,11 +2889,11 @@ EXPORT_SYMBOL(sleep_on_timeout);
2761 */ 2889 */
2762void rt_mutex_setprio(struct task_struct *p, int prio) 2890void rt_mutex_setprio(struct task_struct *p, int prio)
2763{ 2891{
2764 int oldprio, on_rq, running; 2892 int oldprio, on_rq, running, enqueue_flag = 0;
2765 struct rq *rq; 2893 struct rq *rq;
2766 const struct sched_class *prev_class; 2894 const struct sched_class *prev_class;
2767 2895
2768 BUG_ON(prio < 0 || prio > MAX_PRIO); 2896 BUG_ON(prio > MAX_PRIO);
2769 2897
2770 rq = __task_rq_lock(p); 2898 rq = __task_rq_lock(p);
2771 2899
@@ -2788,6 +2916,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2788 } 2916 }
2789 2917
2790 trace_sched_pi_setprio(p, prio); 2918 trace_sched_pi_setprio(p, prio);
2919 p->pi_top_task = rt_mutex_get_top_task(p);
2791 oldprio = p->prio; 2920 oldprio = p->prio;
2792 prev_class = p->sched_class; 2921 prev_class = p->sched_class;
2793 on_rq = p->on_rq; 2922 on_rq = p->on_rq;
@@ -2797,23 +2926,49 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2797 if (running) 2926 if (running)
2798 p->sched_class->put_prev_task(rq, p); 2927 p->sched_class->put_prev_task(rq, p);
2799 2928
2800 if (rt_prio(prio)) 2929 /*
 2930 * Boosting conditions are:
2931 * 1. -rt task is running and holds mutex A
2932 * --> -dl task blocks on mutex A
2933 *
2934 * 2. -dl task is running and holds mutex A
2935 * --> -dl task blocks on mutex A and could preempt the
2936 * running task
2937 */
2938 if (dl_prio(prio)) {
2939 if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
2940 dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
2941 p->dl.dl_boosted = 1;
2942 p->dl.dl_throttled = 0;
2943 enqueue_flag = ENQUEUE_REPLENISH;
2944 } else
2945 p->dl.dl_boosted = 0;
2946 p->sched_class = &dl_sched_class;
2947 } else if (rt_prio(prio)) {
2948 if (dl_prio(oldprio))
2949 p->dl.dl_boosted = 0;
2950 if (oldprio < prio)
2951 enqueue_flag = ENQUEUE_HEAD;
2801 p->sched_class = &rt_sched_class; 2952 p->sched_class = &rt_sched_class;
2802 else 2953 } else {
2954 if (dl_prio(oldprio))
2955 p->dl.dl_boosted = 0;
2803 p->sched_class = &fair_sched_class; 2956 p->sched_class = &fair_sched_class;
2957 }
2804 2958
2805 p->prio = prio; 2959 p->prio = prio;
2806 2960
2807 if (running) 2961 if (running)
2808 p->sched_class->set_curr_task(rq); 2962 p->sched_class->set_curr_task(rq);
2809 if (on_rq) 2963 if (on_rq)
2810 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 2964 enqueue_task(rq, p, enqueue_flag);
2811 2965
2812 check_class_changed(rq, p, prev_class, oldprio); 2966 check_class_changed(rq, p, prev_class, oldprio);
2813out_unlock: 2967out_unlock:
2814 __task_rq_unlock(rq); 2968 __task_rq_unlock(rq);
2815} 2969}
2816#endif 2970#endif
2971
2817void set_user_nice(struct task_struct *p, long nice) 2972void set_user_nice(struct task_struct *p, long nice)
2818{ 2973{
2819 int old_prio, delta, on_rq; 2974 int old_prio, delta, on_rq;
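
The boosting decision in the rt_mutex_setprio() hunk above hinges on comparing absolute deadlines via dl_entity_preempt(). A minimal userspace sketch of the wraparound-safe "earlier deadline wins" test that comparison is built on, assuming the usual signed-difference idiom used elsewhere in the scheduler:

#include <stdint.h>
#include <stdio.h>

/* Earlier absolute deadline wins; the signed difference keeps the
 * comparison correct even if the u64 clock wraps. */
static int dl_time_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	printf("%d\n", dl_time_before(1000, 2000));		/* 1: plainly earlier        */
	printf("%d\n", dl_time_before(UINT64_MAX - 5, 10));	/* 1: earlier across the wrap */
	return 0;
}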
@@ -2831,9 +2986,9 @@ void set_user_nice(struct task_struct *p, long nice)
2831 * The RT priorities are set via sched_setscheduler(), but we still 2986 * The RT priorities are set via sched_setscheduler(), but we still
2832 * allow the 'normal' nice value to be set - but as expected 2987 * allow the 'normal' nice value to be set - but as expected
2833 * it wont have any effect on scheduling until the task is 2988 * it wont have any effect on scheduling until the task is
2834 * SCHED_FIFO/SCHED_RR: 2989 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
2835 */ 2990 */
2836 if (task_has_rt_policy(p)) { 2991 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
2837 p->static_prio = NICE_TO_PRIO(nice); 2992 p->static_prio = NICE_TO_PRIO(nice);
2838 goto out_unlock; 2993 goto out_unlock;
2839 } 2994 }
@@ -2988,22 +3143,95 @@ static struct task_struct *find_process_by_pid(pid_t pid)
2988 return pid ? find_task_by_vpid(pid) : current; 3143 return pid ? find_task_by_vpid(pid) : current;
2989} 3144}
2990 3145
2991/* Actually do priority change: must hold rq lock. */ 3146/*
3147 * This function initializes the sched_dl_entity of a newly becoming
3148 * SCHED_DEADLINE task.
3149 *
3150 * Only the static values are considered here, the actual runtime and the
3151 * absolute deadline will be properly calculated when the task is enqueued
3152 * for the first time with its new policy.
3153 */
2992static void 3154static void
2993__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 3155__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3156{
3157 struct sched_dl_entity *dl_se = &p->dl;
3158
3159 init_dl_task_timer(dl_se);
3160 dl_se->dl_runtime = attr->sched_runtime;
3161 dl_se->dl_deadline = attr->sched_deadline;
3162 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
3163 dl_se->flags = attr->sched_flags;
3164 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3165 dl_se->dl_throttled = 0;
3166 dl_se->dl_new = 1;
3167}
3168
3169/* Actually do priority change: must hold pi & rq lock. */
3170static void __setscheduler(struct rq *rq, struct task_struct *p,
3171 const struct sched_attr *attr)
2994{ 3172{
3173 int policy = attr->sched_policy;
3174
3175 if (policy == -1) /* setparam */
3176 policy = p->policy;
3177
2995 p->policy = policy; 3178 p->policy = policy;
2996 p->rt_priority = prio; 3179
3180 if (dl_policy(policy))
3181 __setparam_dl(p, attr);
3182 else if (fair_policy(policy))
3183 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
3184
3185 /*
3186 * __sched_setscheduler() ensures attr->sched_priority == 0 when
3187 * !rt_policy. Always setting this ensures that things like
3188 * getparam()/getattr() don't report silly values for !rt tasks.
3189 */
3190 p->rt_priority = attr->sched_priority;
3191
2997 p->normal_prio = normal_prio(p); 3192 p->normal_prio = normal_prio(p);
2998 /* we are holding p->pi_lock already */
2999 p->prio = rt_mutex_getprio(p); 3193 p->prio = rt_mutex_getprio(p);
3000 if (rt_prio(p->prio)) 3194
3195 if (dl_prio(p->prio))
3196 p->sched_class = &dl_sched_class;
3197 else if (rt_prio(p->prio))
3001 p->sched_class = &rt_sched_class; 3198 p->sched_class = &rt_sched_class;
3002 else 3199 else
3003 p->sched_class = &fair_sched_class; 3200 p->sched_class = &fair_sched_class;
3201
3004 set_load_weight(p); 3202 set_load_weight(p);
3005} 3203}
3006 3204
3205static void
3206__getparam_dl(struct task_struct *p, struct sched_attr *attr)
3207{
3208 struct sched_dl_entity *dl_se = &p->dl;
3209
3210 attr->sched_priority = p->rt_priority;
3211 attr->sched_runtime = dl_se->dl_runtime;
3212 attr->sched_deadline = dl_se->dl_deadline;
3213 attr->sched_period = dl_se->dl_period;
3214 attr->sched_flags = dl_se->flags;
3215}
3216
3217/*
3218 * This function validates the new parameters of a -deadline task.
 3219 * We require the deadline to be non-zero and greater than or equal
 3220 * to the runtime, and the period to be either zero or greater than
 3221 * or equal to the deadline. Furthermore, we have to be sure that
3222 * user parameters are above the internal resolution (1us); we
3223 * check sched_runtime only since it is always the smaller one.
3224 */
3225static bool
3226__checkparam_dl(const struct sched_attr *attr)
3227{
3228 return attr && attr->sched_deadline != 0 &&
3229 (attr->sched_period == 0 ||
3230 (s64)(attr->sched_period - attr->sched_deadline) >= 0) &&
3231 (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 &&
3232 attr->sched_runtime >= (2 << (DL_SCALE - 1));
3233}
3234
3007/* 3235/*
3008 * check the target process has a UID that matches the current process's 3236 * check the target process has a UID that matches the current process's
3009 */ 3237 */
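
As a quick sanity check of the constraints __checkparam_dl() enforces above (runtime <= deadline <= period, with a floor of roughly 1us on the runtime, assuming DL_SCALE is 10 as elsewhere in this series), here is a userspace sketch with illustrative values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool checkparam_dl(uint64_t runtime, uint64_t deadline, uint64_t period)
{
	return deadline != 0 &&
	       (period == 0 || period >= deadline) &&
	       deadline >= runtime &&
	       runtime >= 1024;	/* ~1us, i.e. 2 << (DL_SCALE - 1) assuming DL_SCALE == 10 */
}

int main(void)
{
	/* 10ms of runtime every 100ms, relative deadline 30ms: accepted. */
	printf("%d\n", checkparam_dl(10000000, 30000000, 100000000));
	/* Deadline shorter than the runtime: rejected. */
	printf("%d\n", checkparam_dl(10000000, 5000000, 100000000));
	return 0;
}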
@@ -3020,10 +3248,12 @@ static bool check_same_owner(struct task_struct *p)
3020 return match; 3248 return match;
3021} 3249}
3022 3250
3023static int __sched_setscheduler(struct task_struct *p, int policy, 3251static int __sched_setscheduler(struct task_struct *p,
3024 const struct sched_param *param, bool user) 3252 const struct sched_attr *attr,
3253 bool user)
3025{ 3254{
3026 int retval, oldprio, oldpolicy = -1, on_rq, running; 3255 int retval, oldprio, oldpolicy = -1, on_rq, running;
3256 int policy = attr->sched_policy;
3027 unsigned long flags; 3257 unsigned long flags;
3028 const struct sched_class *prev_class; 3258 const struct sched_class *prev_class;
3029 struct rq *rq; 3259 struct rq *rq;
@@ -3037,31 +3267,40 @@ recheck:
3037 reset_on_fork = p->sched_reset_on_fork; 3267 reset_on_fork = p->sched_reset_on_fork;
3038 policy = oldpolicy = p->policy; 3268 policy = oldpolicy = p->policy;
3039 } else { 3269 } else {
3040 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 3270 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
3041 policy &= ~SCHED_RESET_ON_FORK;
3042 3271
3043 if (policy != SCHED_FIFO && policy != SCHED_RR && 3272 if (policy != SCHED_DEADLINE &&
3273 policy != SCHED_FIFO && policy != SCHED_RR &&
3044 policy != SCHED_NORMAL && policy != SCHED_BATCH && 3274 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3045 policy != SCHED_IDLE) 3275 policy != SCHED_IDLE)
3046 return -EINVAL; 3276 return -EINVAL;
3047 } 3277 }
3048 3278
3279 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
3280 return -EINVAL;
3281
3049 /* 3282 /*
3050 * Valid priorities for SCHED_FIFO and SCHED_RR are 3283 * Valid priorities for SCHED_FIFO and SCHED_RR are
3051 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 3284 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
3052 * SCHED_BATCH and SCHED_IDLE is 0. 3285 * SCHED_BATCH and SCHED_IDLE is 0.
3053 */ 3286 */
3054 if (param->sched_priority < 0 || 3287 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
3055 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3288 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
3056 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3057 return -EINVAL; 3289 return -EINVAL;
3058 if (rt_policy(policy) != (param->sched_priority != 0)) 3290 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
3291 (rt_policy(policy) != (attr->sched_priority != 0)))
3059 return -EINVAL; 3292 return -EINVAL;
3060 3293
3061 /* 3294 /*
3062 * Allow unprivileged RT tasks to decrease priority: 3295 * Allow unprivileged RT tasks to decrease priority:
3063 */ 3296 */
3064 if (user && !capable(CAP_SYS_NICE)) { 3297 if (user && !capable(CAP_SYS_NICE)) {
3298 if (fair_policy(policy)) {
3299 if (attr->sched_nice < TASK_NICE(p) &&
3300 !can_nice(p, attr->sched_nice))
3301 return -EPERM;
3302 }
3303
3065 if (rt_policy(policy)) { 3304 if (rt_policy(policy)) {
3066 unsigned long rlim_rtprio = 3305 unsigned long rlim_rtprio =
3067 task_rlimit(p, RLIMIT_RTPRIO); 3306 task_rlimit(p, RLIMIT_RTPRIO);
@@ -3071,8 +3310,8 @@ recheck:
3071 return -EPERM; 3310 return -EPERM;
3072 3311
3073 /* can't increase priority */ 3312 /* can't increase priority */
3074 if (param->sched_priority > p->rt_priority && 3313 if (attr->sched_priority > p->rt_priority &&
3075 param->sched_priority > rlim_rtprio) 3314 attr->sched_priority > rlim_rtprio)
3076 return -EPERM; 3315 return -EPERM;
3077 } 3316 }
3078 3317
@@ -3120,14 +3359,21 @@ recheck:
3120 /* 3359 /*
3121 * If not changing anything there's no need to proceed further: 3360 * If not changing anything there's no need to proceed further:
3122 */ 3361 */
3123 if (unlikely(policy == p->policy && (!rt_policy(policy) || 3362 if (unlikely(policy == p->policy)) {
3124 param->sched_priority == p->rt_priority))) { 3363 if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
3364 goto change;
3365 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3366 goto change;
3367 if (dl_policy(policy))
3368 goto change;
3369
3125 task_rq_unlock(rq, p, &flags); 3370 task_rq_unlock(rq, p, &flags);
3126 return 0; 3371 return 0;
3127 } 3372 }
3373change:
3128 3374
3129#ifdef CONFIG_RT_GROUP_SCHED
3130 if (user) { 3375 if (user) {
3376#ifdef CONFIG_RT_GROUP_SCHED
3131 /* 3377 /*
3132 * Do not allow realtime tasks into groups that have no runtime 3378 * Do not allow realtime tasks into groups that have no runtime
3133 * assigned. 3379 * assigned.
@@ -3138,8 +3384,24 @@ recheck:
3138 task_rq_unlock(rq, p, &flags); 3384 task_rq_unlock(rq, p, &flags);
3139 return -EPERM; 3385 return -EPERM;
3140 } 3386 }
3141 }
3142#endif 3387#endif
3388#ifdef CONFIG_SMP
3389 if (dl_bandwidth_enabled() && dl_policy(policy)) {
3390 cpumask_t *span = rq->rd->span;
3391
3392 /*
3393 * Don't allow tasks with an affinity mask smaller than
3394 * the entire root_domain to become SCHED_DEADLINE. We
3395 * will also fail if there's no bandwidth available.
3396 */
3397 if (!cpumask_subset(span, &p->cpus_allowed) ||
3398 rq->rd->dl_bw.bw == 0) {
3399 task_rq_unlock(rq, p, &flags);
3400 return -EPERM;
3401 }
3402 }
3403#endif
3404 }
3143 3405
3144 /* recheck policy now with rq lock held */ 3406 /* recheck policy now with rq lock held */
3145 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 3407 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
@@ -3147,6 +3409,17 @@ recheck:
3147 task_rq_unlock(rq, p, &flags); 3409 task_rq_unlock(rq, p, &flags);
3148 goto recheck; 3410 goto recheck;
3149 } 3411 }
3412
3413 /*
3414 * If setscheduling to SCHED_DEADLINE (or changing the parameters
3415 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
3416 * is available.
3417 */
3418 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
3419 task_rq_unlock(rq, p, &flags);
3420 return -EBUSY;
3421 }
3422
3150 on_rq = p->on_rq; 3423 on_rq = p->on_rq;
3151 running = task_current(rq, p); 3424 running = task_current(rq, p);
3152 if (on_rq) 3425 if (on_rq)
@@ -3158,7 +3431,7 @@ recheck:
3158 3431
3159 oldprio = p->prio; 3432 oldprio = p->prio;
3160 prev_class = p->sched_class; 3433 prev_class = p->sched_class;
3161 __setscheduler(rq, p, policy, param->sched_priority); 3434 __setscheduler(rq, p, attr);
3162 3435
3163 if (running) 3436 if (running)
3164 p->sched_class->set_curr_task(rq); 3437 p->sched_class->set_curr_task(rq);
@@ -3173,6 +3446,26 @@ recheck:
3173 return 0; 3446 return 0;
3174} 3447}
3175 3448
3449static int _sched_setscheduler(struct task_struct *p, int policy,
3450 const struct sched_param *param, bool check)
3451{
3452 struct sched_attr attr = {
3453 .sched_policy = policy,
3454 .sched_priority = param->sched_priority,
3455 .sched_nice = PRIO_TO_NICE(p->static_prio),
3456 };
3457
3458 /*
3459 * Fixup the legacy SCHED_RESET_ON_FORK hack
3460 */
3461 if (policy & SCHED_RESET_ON_FORK) {
3462 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3463 policy &= ~SCHED_RESET_ON_FORK;
3464 attr.sched_policy = policy;
3465 }
3466
3467 return __sched_setscheduler(p, &attr, check);
3468}
3176/** 3469/**
3177 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 3470 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
3178 * @p: the task in question. 3471 * @p: the task in question.
@@ -3186,10 +3479,16 @@ recheck:
3186int sched_setscheduler(struct task_struct *p, int policy, 3479int sched_setscheduler(struct task_struct *p, int policy,
3187 const struct sched_param *param) 3480 const struct sched_param *param)
3188{ 3481{
3189 return __sched_setscheduler(p, policy, param, true); 3482 return _sched_setscheduler(p, policy, param, true);
3190} 3483}
3191EXPORT_SYMBOL_GPL(sched_setscheduler); 3484EXPORT_SYMBOL_GPL(sched_setscheduler);
3192 3485
3486int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
3487{
3488 return __sched_setscheduler(p, attr, true);
3489}
3490EXPORT_SYMBOL_GPL(sched_setattr);
3491
3193/** 3492/**
3194 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 3493 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
3195 * @p: the task in question. 3494 * @p: the task in question.
@@ -3206,7 +3505,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
3206int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3505int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3207 const struct sched_param *param) 3506 const struct sched_param *param)
3208{ 3507{
3209 return __sched_setscheduler(p, policy, param, false); 3508 return _sched_setscheduler(p, policy, param, false);
3210} 3509}
3211 3510
3212static int 3511static int
@@ -3231,6 +3530,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3231 return retval; 3530 return retval;
3232} 3531}
3233 3532
3533/*
3534 * Mimics kernel/events/core.c perf_copy_attr().
3535 */
3536static int sched_copy_attr(struct sched_attr __user *uattr,
3537 struct sched_attr *attr)
3538{
3539 u32 size;
3540 int ret;
3541
3542 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
3543 return -EFAULT;
3544
3545 /*
3546 * zero the full structure, so that a short copy will be nice.
3547 */
3548 memset(attr, 0, sizeof(*attr));
3549
3550 ret = get_user(size, &uattr->size);
3551 if (ret)
3552 return ret;
3553
3554 if (size > PAGE_SIZE) /* silly large */
3555 goto err_size;
3556
3557 if (!size) /* abi compat */
3558 size = SCHED_ATTR_SIZE_VER0;
3559
3560 if (size < SCHED_ATTR_SIZE_VER0)
3561 goto err_size;
3562
3563 /*
3564 * If we're handed a bigger struct than we know of,
3565 * ensure all the unknown bits are 0 - i.e. new
3566 * user-space does not rely on any kernel feature
3567 * extensions we dont know about yet.
3568 */
3569 if (size > sizeof(*attr)) {
3570 unsigned char __user *addr;
3571 unsigned char __user *end;
3572 unsigned char val;
3573
3574 addr = (void __user *)uattr + sizeof(*attr);
3575 end = (void __user *)uattr + size;
3576
3577 for (; addr < end; addr++) {
3578 ret = get_user(val, addr);
3579 if (ret)
3580 return ret;
3581 if (val)
3582 goto err_size;
3583 }
3584 size = sizeof(*attr);
3585 }
3586
3587 ret = copy_from_user(attr, uattr, size);
3588 if (ret)
3589 return -EFAULT;
3590
3591 /*
3592 * XXX: do we want to be lenient like existing syscalls; or do we want
3593 * to be strict and return an error on out-of-bounds values?
3594 */
3595 attr->sched_nice = clamp(attr->sched_nice, -20, 19);
3596
3597out:
3598 return ret;
3599
3600err_size:
3601 put_user(sizeof(*attr), &uattr->size);
3602 ret = -E2BIG;
3603 goto out;
3604}
3605
3234/** 3606/**
3235 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 3607 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3236 * @pid: the pid in question. 3608 * @pid: the pid in question.
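
sched_copy_attr() above follows the perf_copy_attr() convention for extensible ABIs: user space may pass a larger struct than the kernel knows about, but only if every byte past the known size is zero; otherwise it gets -E2BIG. A userspace sketch of that rule (buffer and sizes are made up for illustration; 48 happens to be SCHED_ATTR_SIZE_VER0):

#include <stdio.h>
#include <string.h>

#define E2BIG 7

/* Accept a larger-than-known object only if its unknown tail is zero. */
static int check_unknown_tail(const unsigned char *buf, size_t known, size_t given)
{
	for (size_t i = known; i < given; i++)
		if (buf[i])
			return -E2BIG;
	return 0;
}

int main(void)
{
	unsigned char newer_abi[64];

	memset(newer_abi, 0, sizeof(newer_abi));
	printf("%d\n", check_unknown_tail(newer_abi, 48, sizeof(newer_abi)));	/*  0: accepted */

	newer_abi[60] = 1;	/* pretend user space set a feature we don't know about */
	printf("%d\n", check_unknown_tail(newer_abi, 48, sizeof(newer_abi)));	/* -7: -E2BIG   */
	return 0;
}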
@@ -3262,6 +3634,33 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3262} 3634}
3263 3635
3264/** 3636/**
3637 * sys_sched_setattr - same as above, but with extended sched_attr
3638 * @pid: the pid in question.
3639 * @uattr: structure containing the extended parameters.
3640 */
3641SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr)
3642{
3643 struct sched_attr attr;
3644 struct task_struct *p;
3645 int retval;
3646
3647 if (!uattr || pid < 0)
3648 return -EINVAL;
3649
3650 if (sched_copy_attr(uattr, &attr))
3651 return -EFAULT;
3652
3653 rcu_read_lock();
3654 retval = -ESRCH;
3655 p = find_process_by_pid(pid);
3656 if (p != NULL)
3657 retval = sched_setattr(p, &attr);
3658 rcu_read_unlock();
3659
3660 return retval;
3661}
3662
3663/**
3265 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3664 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3266 * @pid: the pid in question. 3665 * @pid: the pid in question.
3267 * 3666 *
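
For completeness, a hedged userspace sketch of driving the new syscall. glibc provides no wrapper at this point, so the raw syscall is used; __NR_sched_setattr is architecture dependent (314 on x86-64 once the companion syscall-table patch is applied), SCHED_DEADLINE is 6, and the struct layout mirrors the sched_attr added to the uapi headers alongside this change. Note the two-argument call matching SYSCALL_DEFINE2 above; setting SCHED_DEADLINE typically requires CAP_SYS_NICE.

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_sched_setattr
#define __NR_sched_setattr 314		/* x86-64; adjust for other architectures */
#endif
#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE 6
#endif

struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size           = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  =  10 * 1000 * 1000;	/*  10ms budget */
	attr.sched_deadline =  30 * 1000 * 1000;	/*  within 30ms */
	attr.sched_period   = 100 * 1000 * 1000;	/*  every 100ms */

	/* pid 0 means the calling thread. */
	if (syscall(__NR_sched_setattr, 0, &attr))
		perror("sched_setattr");
	return 0;
}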
@@ -3316,6 +3715,10 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3316 if (retval) 3715 if (retval)
3317 goto out_unlock; 3716 goto out_unlock;
3318 3717
3718 if (task_has_dl_policy(p)) {
3719 retval = -EINVAL;
3720 goto out_unlock;
3721 }
3319 lp.sched_priority = p->rt_priority; 3722 lp.sched_priority = p->rt_priority;
3320 rcu_read_unlock(); 3723 rcu_read_unlock();
3321 3724
@@ -3331,6 +3734,96 @@ out_unlock:
3331 return retval; 3734 return retval;
3332} 3735}
3333 3736
3737static int sched_read_attr(struct sched_attr __user *uattr,
3738 struct sched_attr *attr,
3739 unsigned int usize)
3740{
3741 int ret;
3742
3743 if (!access_ok(VERIFY_WRITE, uattr, usize))
3744 return -EFAULT;
3745
3746 /*
3747 * If we're handed a smaller struct than we know of,
3748 * ensure all the unknown bits are 0 - i.e. old
 3749 * user-space does not get incomplete information.
3750 */
3751 if (usize < sizeof(*attr)) {
3752 unsigned char *addr;
3753 unsigned char *end;
3754
3755 addr = (void *)attr + usize;
3756 end = (void *)attr + sizeof(*attr);
3757
3758 for (; addr < end; addr++) {
3759 if (*addr)
3760 goto err_size;
3761 }
3762
3763 attr->size = usize;
3764 }
3765
3766 ret = copy_to_user(uattr, attr, usize);
3767 if (ret)
3768 return -EFAULT;
3769
3770out:
3771 return ret;
3772
3773err_size:
3774 ret = -E2BIG;
3775 goto out;
3776}
3777
3778/**
3779 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
3780 * @pid: the pid in question.
3781 * @uattr: structure containing the extended parameters.
3782 * @size: sizeof(attr) for fwd/bwd comp.
3783 */
3784SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3785 unsigned int, size)
3786{
3787 struct sched_attr attr = {
3788 .size = sizeof(struct sched_attr),
3789 };
3790 struct task_struct *p;
3791 int retval;
3792
3793 if (!uattr || pid < 0 || size > PAGE_SIZE ||
3794 size < SCHED_ATTR_SIZE_VER0)
3795 return -EINVAL;
3796
3797 rcu_read_lock();
3798 p = find_process_by_pid(pid);
3799 retval = -ESRCH;
3800 if (!p)
3801 goto out_unlock;
3802
3803 retval = security_task_getscheduler(p);
3804 if (retval)
3805 goto out_unlock;
3806
3807 attr.sched_policy = p->policy;
3808 if (p->sched_reset_on_fork)
3809 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3810 if (task_has_dl_policy(p))
3811 __getparam_dl(p, &attr);
3812 else if (task_has_rt_policy(p))
3813 attr.sched_priority = p->rt_priority;
3814 else
3815 attr.sched_nice = TASK_NICE(p);
3816
3817 rcu_read_unlock();
3818
3819 retval = sched_read_attr(uattr, &attr, size);
3820 return retval;
3821
3822out_unlock:
3823 rcu_read_unlock();
3824 return retval;
3825}
3826
3334long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 3827long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3335{ 3828{
3336 cpumask_var_t cpus_allowed, new_mask; 3829 cpumask_var_t cpus_allowed, new_mask;
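
The readback side is symmetric: sys_sched_getattr() above takes the pid, a user buffer and its size (SYSCALL_DEFINE3, no flags argument at this stage), and sched_read_attr() truncates to what old user space knows about. A fragment meant to be appended to the previous sched_setattr sketch's main(), reusing its struct sched_attr and assuming __NR_sched_getattr is 315 on x86-64:

	/* Appended after the sched_setattr call in the previous sketch. */
	struct sched_attr out;

	memset(&out, 0, sizeof(out));
	if (syscall(315 /* __NR_sched_getattr, x86-64 */, 0, &out, sizeof(out)))
		perror("sched_getattr");
	else
		printf("policy %u runtime %llu deadline %llu period %llu\n",
		       out.sched_policy,
		       (unsigned long long)out.sched_runtime,
		       (unsigned long long)out.sched_deadline,
		       (unsigned long long)out.sched_period);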
@@ -3375,8 +3868,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3375 if (retval) 3868 if (retval)
3376 goto out_unlock; 3869 goto out_unlock;
3377 3870
3871
3378 cpuset_cpus_allowed(p, cpus_allowed); 3872 cpuset_cpus_allowed(p, cpus_allowed);
3379 cpumask_and(new_mask, in_mask, cpus_allowed); 3873 cpumask_and(new_mask, in_mask, cpus_allowed);
3874
3875 /*
3876 * Since bandwidth control happens on root_domain basis,
3877 * if admission test is enabled, we only admit -deadline
3878 * tasks allowed to run on all the CPUs in the task's
3879 * root_domain.
3880 */
3881#ifdef CONFIG_SMP
3882 if (task_has_dl_policy(p)) {
3883 const struct cpumask *span = task_rq(p)->rd->span;
3884
3885 if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
3886 retval = -EBUSY;
3887 goto out_unlock;
3888 }
3889 }
3890#endif
3380again: 3891again:
3381 retval = set_cpus_allowed_ptr(p, new_mask); 3892 retval = set_cpus_allowed_ptr(p, new_mask);
3382 3893
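
The affinity restriction added above boils down to a subset test: because bandwidth is accounted per root domain, a -deadline task may only be given a new mask that still covers every CPU of its root domain span. A toy userspace sketch of that test with made-up masks:

#include <stdint.h>
#include <stdio.h>

/* Every bit set in span must also be set in mask. */
static int subset(uint64_t span, uint64_t mask)
{
	return (span & ~mask) == 0;
}

int main(void)
{
	uint64_t span = 0xf;		/* root domain: CPUs 0-3 */

	printf("%d\n", subset(span, 0xf));	/* 1: full span, allowed        */
	printf("%d\n", subset(span, 0x3));	/* 0: CPUs 0-1 only, -EBUSY     */
	return 0;
}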
@@ -3653,7 +4164,7 @@ again:
3653 } 4164 }
3654 4165
3655 double_rq_lock(rq, p_rq); 4166 double_rq_lock(rq, p_rq);
3656 while (task_rq(p) != p_rq) { 4167 if (task_rq(p) != p_rq) {
3657 double_rq_unlock(rq, p_rq); 4168 double_rq_unlock(rq, p_rq);
3658 goto again; 4169 goto again;
3659 } 4170 }
@@ -3742,6 +4253,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
3742 case SCHED_RR: 4253 case SCHED_RR:
3743 ret = MAX_USER_RT_PRIO-1; 4254 ret = MAX_USER_RT_PRIO-1;
3744 break; 4255 break;
4256 case SCHED_DEADLINE:
3745 case SCHED_NORMAL: 4257 case SCHED_NORMAL:
3746 case SCHED_BATCH: 4258 case SCHED_BATCH:
3747 case SCHED_IDLE: 4259 case SCHED_IDLE:
@@ -3768,6 +4280,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
3768 case SCHED_RR: 4280 case SCHED_RR:
3769 ret = 1; 4281 ret = 1;
3770 break; 4282 break;
4283 case SCHED_DEADLINE:
3771 case SCHED_NORMAL: 4284 case SCHED_NORMAL:
3772 case SCHED_BATCH: 4285 case SCHED_BATCH:
3773 case SCHED_IDLE: 4286 case SCHED_IDLE:
@@ -4514,13 +5027,31 @@ static int sched_cpu_active(struct notifier_block *nfb,
4514static int sched_cpu_inactive(struct notifier_block *nfb, 5027static int sched_cpu_inactive(struct notifier_block *nfb,
4515 unsigned long action, void *hcpu) 5028 unsigned long action, void *hcpu)
4516{ 5029{
5030 unsigned long flags;
5031 long cpu = (long)hcpu;
5032
4517 switch (action & ~CPU_TASKS_FROZEN) { 5033 switch (action & ~CPU_TASKS_FROZEN) {
4518 case CPU_DOWN_PREPARE: 5034 case CPU_DOWN_PREPARE:
4519 set_cpu_active((long)hcpu, false); 5035 set_cpu_active(cpu, false);
5036
5037 /* explicitly allow suspend */
5038 if (!(action & CPU_TASKS_FROZEN)) {
5039 struct dl_bw *dl_b = dl_bw_of(cpu);
5040 bool overflow;
5041 int cpus;
5042
5043 raw_spin_lock_irqsave(&dl_b->lock, flags);
5044 cpus = dl_bw_cpus(cpu);
5045 overflow = __dl_overflow(dl_b, cpus, 0, 0);
5046 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5047
5048 if (overflow)
5049 return notifier_from_errno(-EBUSY);
5050 }
4520 return NOTIFY_OK; 5051 return NOTIFY_OK;
4521 default:
4522 return NOTIFY_DONE;
4523 } 5052 }
5053
5054 return NOTIFY_DONE;
4524} 5055}
4525 5056
4526static int __init migration_init(void) 5057static int __init migration_init(void)
@@ -4739,6 +5270,8 @@ static void free_rootdomain(struct rcu_head *rcu)
4739 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 5270 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
4740 5271
4741 cpupri_cleanup(&rd->cpupri); 5272 cpupri_cleanup(&rd->cpupri);
5273 cpudl_cleanup(&rd->cpudl);
5274 free_cpumask_var(rd->dlo_mask);
4742 free_cpumask_var(rd->rto_mask); 5275 free_cpumask_var(rd->rto_mask);
4743 free_cpumask_var(rd->online); 5276 free_cpumask_var(rd->online);
4744 free_cpumask_var(rd->span); 5277 free_cpumask_var(rd->span);
@@ -4790,8 +5323,14 @@ static int init_rootdomain(struct root_domain *rd)
4790 goto out; 5323 goto out;
4791 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 5324 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
4792 goto free_span; 5325 goto free_span;
4793 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 5326 if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
4794 goto free_online; 5327 goto free_online;
5328 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5329 goto free_dlo_mask;
5330
5331 init_dl_bw(&rd->dl_bw);
5332 if (cpudl_init(&rd->cpudl) != 0)
5333 goto free_dlo_mask;
4795 5334
4796 if (cpupri_init(&rd->cpupri) != 0) 5335 if (cpupri_init(&rd->cpupri) != 0)
4797 goto free_rto_mask; 5336 goto free_rto_mask;
@@ -4799,6 +5338,8 @@ static int init_rootdomain(struct root_domain *rd)
4799 5338
4800free_rto_mask: 5339free_rto_mask:
4801 free_cpumask_var(rd->rto_mask); 5340 free_cpumask_var(rd->rto_mask);
5341free_dlo_mask:
5342 free_cpumask_var(rd->dlo_mask);
4802free_online: 5343free_online:
4803 free_cpumask_var(rd->online); 5344 free_cpumask_var(rd->online);
4804free_span: 5345free_span:
@@ -6150,6 +6691,7 @@ void __init sched_init_smp(void)
6150 free_cpumask_var(non_isolated_cpus); 6691 free_cpumask_var(non_isolated_cpus);
6151 6692
6152 init_sched_rt_class(); 6693 init_sched_rt_class();
6694 init_sched_dl_class();
6153} 6695}
6154#else 6696#else
6155void __init sched_init_smp(void) 6697void __init sched_init_smp(void)
@@ -6219,13 +6761,15 @@ void __init sched_init(void)
6219#endif /* CONFIG_CPUMASK_OFFSTACK */ 6761#endif /* CONFIG_CPUMASK_OFFSTACK */
6220 } 6762 }
6221 6763
6764 init_rt_bandwidth(&def_rt_bandwidth,
6765 global_rt_period(), global_rt_runtime());
6766 init_dl_bandwidth(&def_dl_bandwidth,
6767 global_rt_period(), global_rt_runtime());
6768
6222#ifdef CONFIG_SMP 6769#ifdef CONFIG_SMP
6223 init_defrootdomain(); 6770 init_defrootdomain();
6224#endif 6771#endif
6225 6772
6226 init_rt_bandwidth(&def_rt_bandwidth,
6227 global_rt_period(), global_rt_runtime());
6228
6229#ifdef CONFIG_RT_GROUP_SCHED 6773#ifdef CONFIG_RT_GROUP_SCHED
6230 init_rt_bandwidth(&root_task_group.rt_bandwidth, 6774 init_rt_bandwidth(&root_task_group.rt_bandwidth,
6231 global_rt_period(), global_rt_runtime()); 6775 global_rt_period(), global_rt_runtime());
@@ -6249,6 +6793,7 @@ void __init sched_init(void)
6249 rq->calc_load_update = jiffies + LOAD_FREQ; 6793 rq->calc_load_update = jiffies + LOAD_FREQ;
6250 init_cfs_rq(&rq->cfs); 6794 init_cfs_rq(&rq->cfs);
6251 init_rt_rq(&rq->rt, rq); 6795 init_rt_rq(&rq->rt, rq);
6796 init_dl_rq(&rq->dl, rq);
6252#ifdef CONFIG_FAIR_GROUP_SCHED 6797#ifdef CONFIG_FAIR_GROUP_SCHED
6253 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 6798 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6254 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6799 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -6320,10 +6865,6 @@ void __init sched_init(void)
6320 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6865 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6321#endif 6866#endif
6322 6867
6323#ifdef CONFIG_RT_MUTEXES
6324 plist_head_init(&init_task.pi_waiters);
6325#endif
6326
6327 /* 6868 /*
6328 * The boot idle thread does lazy MMU switching as well: 6869 * The boot idle thread does lazy MMU switching as well:
6329 */ 6870 */
@@ -6397,13 +6938,16 @@ EXPORT_SYMBOL(__might_sleep);
6397static void normalize_task(struct rq *rq, struct task_struct *p) 6938static void normalize_task(struct rq *rq, struct task_struct *p)
6398{ 6939{
6399 const struct sched_class *prev_class = p->sched_class; 6940 const struct sched_class *prev_class = p->sched_class;
6941 struct sched_attr attr = {
6942 .sched_policy = SCHED_NORMAL,
6943 };
6400 int old_prio = p->prio; 6944 int old_prio = p->prio;
6401 int on_rq; 6945 int on_rq;
6402 6946
6403 on_rq = p->on_rq; 6947 on_rq = p->on_rq;
6404 if (on_rq) 6948 if (on_rq)
6405 dequeue_task(rq, p, 0); 6949 dequeue_task(rq, p, 0);
6406 __setscheduler(rq, p, SCHED_NORMAL, 0); 6950 __setscheduler(rq, p, &attr);
6407 if (on_rq) { 6951 if (on_rq) {
6408 enqueue_task(rq, p, 0); 6952 enqueue_task(rq, p, 0);
6409 resched_task(rq->curr); 6953 resched_task(rq->curr);
@@ -6433,7 +6977,7 @@ void normalize_rt_tasks(void)
6433 p->se.statistics.block_start = 0; 6977 p->se.statistics.block_start = 0;
6434#endif 6978#endif
6435 6979
6436 if (!rt_task(p)) { 6980 if (!dl_task(p) && !rt_task(p)) {
6437 /* 6981 /*
6438 * Renice negative nice level userspace 6982 * Renice negative nice level userspace
6439 * tasks back to 0: 6983 * tasks back to 0:
@@ -6628,16 +7172,6 @@ void sched_move_task(struct task_struct *tsk)
6628} 7172}
6629#endif /* CONFIG_CGROUP_SCHED */ 7173#endif /* CONFIG_CGROUP_SCHED */
6630 7174
6631#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
6632static unsigned long to_ratio(u64 period, u64 runtime)
6633{
6634 if (runtime == RUNTIME_INF)
6635 return 1ULL << 20;
6636
6637 return div64_u64(runtime << 20, period);
6638}
6639#endif
6640
6641#ifdef CONFIG_RT_GROUP_SCHED 7175#ifdef CONFIG_RT_GROUP_SCHED
6642/* 7176/*
6643 * Ensure that the real time constraints are schedulable. 7177 * Ensure that the real time constraints are schedulable.
@@ -6811,24 +7345,13 @@ static long sched_group_rt_period(struct task_group *tg)
6811 do_div(rt_period_us, NSEC_PER_USEC); 7345 do_div(rt_period_us, NSEC_PER_USEC);
6812 return rt_period_us; 7346 return rt_period_us;
6813} 7347}
7348#endif /* CONFIG_RT_GROUP_SCHED */
6814 7349
7350#ifdef CONFIG_RT_GROUP_SCHED
6815static int sched_rt_global_constraints(void) 7351static int sched_rt_global_constraints(void)
6816{ 7352{
6817 u64 runtime, period;
6818 int ret = 0; 7353 int ret = 0;
6819 7354
6820 if (sysctl_sched_rt_period <= 0)
6821 return -EINVAL;
6822
6823 runtime = global_rt_runtime();
6824 period = global_rt_period();
6825
6826 /*
6827 * Sanity check on the sysctl variables.
6828 */
6829 if (runtime > period && runtime != RUNTIME_INF)
6830 return -EINVAL;
6831
6832 mutex_lock(&rt_constraints_mutex); 7355 mutex_lock(&rt_constraints_mutex);
6833 read_lock(&tasklist_lock); 7356 read_lock(&tasklist_lock);
6834 ret = __rt_schedulable(NULL, 0, 0); 7357 ret = __rt_schedulable(NULL, 0, 0);
@@ -6851,17 +7374,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
6851static int sched_rt_global_constraints(void) 7374static int sched_rt_global_constraints(void)
6852{ 7375{
6853 unsigned long flags; 7376 unsigned long flags;
6854 int i; 7377 int i, ret = 0;
6855
6856 if (sysctl_sched_rt_period <= 0)
6857 return -EINVAL;
6858
6859 /*
6860 * There's always some RT tasks in the root group
6861 * -- migration, kstopmachine etc..
6862 */
6863 if (sysctl_sched_rt_runtime == 0)
6864 return -EBUSY;
6865 7378
6866 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 7379 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
6867 for_each_possible_cpu(i) { 7380 for_each_possible_cpu(i) {
@@ -6873,36 +7386,88 @@ static int sched_rt_global_constraints(void)
6873 } 7386 }
6874 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 7387 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
6875 7388
6876 return 0; 7389 return ret;
6877} 7390}
6878#endif /* CONFIG_RT_GROUP_SCHED */ 7391#endif /* CONFIG_RT_GROUP_SCHED */
6879 7392
6880int sched_rr_handler(struct ctl_table *table, int write, 7393static int sched_dl_global_constraints(void)
6881 void __user *buffer, size_t *lenp,
6882 loff_t *ppos)
6883{ 7394{
6884 int ret; 7395 u64 runtime = global_rt_runtime();
6885 static DEFINE_MUTEX(mutex); 7396 u64 period = global_rt_period();
7397 u64 new_bw = to_ratio(period, runtime);
7398 int cpu, ret = 0;
6886 7399
6887 mutex_lock(&mutex); 7400 /*
6888 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7401 * Here we want to check the bandwidth not being set to some
6889 /* make sure that internally we keep jiffies */ 7402 * value smaller than the currently allocated bandwidth in
6890 /* also, writing zero resets timeslice to default */ 7403 * any of the root_domains.
6891 if (!ret && write) { 7404 *
6892 sched_rr_timeslice = sched_rr_timeslice <= 0 ? 7405 * FIXME: Cycling on all the CPUs is overdoing, but simpler than
6893 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); 7406 * cycling on root_domains... Discussion on different/better
7407 * solutions is welcome!
7408 */
7409 for_each_possible_cpu(cpu) {
7410 struct dl_bw *dl_b = dl_bw_of(cpu);
7411
7412 raw_spin_lock(&dl_b->lock);
7413 if (new_bw < dl_b->total_bw)
7414 ret = -EBUSY;
7415 raw_spin_unlock(&dl_b->lock);
7416
7417 if (ret)
7418 break;
6894 } 7419 }
6895 mutex_unlock(&mutex); 7420
6896 return ret; 7421 return ret;
6897} 7422}
6898 7423
7424static void sched_dl_do_global(void)
7425{
7426 u64 new_bw = -1;
7427 int cpu;
7428
7429 def_dl_bandwidth.dl_period = global_rt_period();
7430 def_dl_bandwidth.dl_runtime = global_rt_runtime();
7431
7432 if (global_rt_runtime() != RUNTIME_INF)
7433 new_bw = to_ratio(global_rt_period(), global_rt_runtime());
7434
7435 /*
7436 * FIXME: As above...
7437 */
7438 for_each_possible_cpu(cpu) {
7439 struct dl_bw *dl_b = dl_bw_of(cpu);
7440
7441 raw_spin_lock(&dl_b->lock);
7442 dl_b->bw = new_bw;
7443 raw_spin_unlock(&dl_b->lock);
7444 }
7445}
7446
7447static int sched_rt_global_validate(void)
7448{
7449 if (sysctl_sched_rt_period <= 0)
7450 return -EINVAL;
7451
7452 if (sysctl_sched_rt_runtime > sysctl_sched_rt_period)
7453 return -EINVAL;
7454
7455 return 0;
7456}
7457
7458static void sched_rt_do_global(void)
7459{
7460 def_rt_bandwidth.rt_runtime = global_rt_runtime();
7461 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
7462}
7463
6899int sched_rt_handler(struct ctl_table *table, int write, 7464int sched_rt_handler(struct ctl_table *table, int write,
6900 void __user *buffer, size_t *lenp, 7465 void __user *buffer, size_t *lenp,
6901 loff_t *ppos) 7466 loff_t *ppos)
6902{ 7467{
6903 int ret;
6904 int old_period, old_runtime; 7468 int old_period, old_runtime;
6905 static DEFINE_MUTEX(mutex); 7469 static DEFINE_MUTEX(mutex);
7470 int ret;
6906 7471
6907 mutex_lock(&mutex); 7472 mutex_lock(&mutex);
6908 old_period = sysctl_sched_rt_period; 7473 old_period = sysctl_sched_rt_period;
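
sched_dl_global_constraints() above compares the would-be global -deadline cap, derived from the sched_rt_period_us/sched_rt_runtime_us sysctls via the same to_ratio() arithmetic, against the bandwidth already allocated in each root domain, and refuses the write with -EBUSY if the new cap no longer covers it. A userspace sketch of that comparison with illustrative numbers:

#include <stdint.h>
#include <stdio.h>

#define EBUSY 16

static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return (runtime << 20) / period;
}

int main(void)
{
	/* Bandwidth already handed out to -deadline tasks in some
	 * root domain: 40% of a CPU, in 1/2^20 units. */
	uint64_t total_bw = to_ratio(100, 40);

	/* Admin tries to shrink the global cap to 30%: refused. */
	uint64_t new_bw = to_ratio(1000000, 300000);
	printf("shrink to 30%%: %d\n", new_bw < total_bw ? -EBUSY : 0);	/* -16 */

	/* Shrinking only to 50% still covers what is allocated. */
	new_bw = to_ratio(1000000, 500000);
	printf("shrink to 50%%: %d\n", new_bw < total_bw ? -EBUSY : 0);	/*   0 */
	return 0;
}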
@@ -6911,21 +7476,50 @@ int sched_rt_handler(struct ctl_table *table, int write,
6911 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7476 ret = proc_dointvec(table, write, buffer, lenp, ppos);
6912 7477
6913 if (!ret && write) { 7478 if (!ret && write) {
7479 ret = sched_rt_global_validate();
7480 if (ret)
7481 goto undo;
7482
6914 ret = sched_rt_global_constraints(); 7483 ret = sched_rt_global_constraints();
6915 if (ret) { 7484 if (ret)
6916 sysctl_sched_rt_period = old_period; 7485 goto undo;
6917 sysctl_sched_rt_runtime = old_runtime; 7486
6918 } else { 7487 ret = sched_dl_global_constraints();
6919 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7488 if (ret)
6920 def_rt_bandwidth.rt_period = 7489 goto undo;
6921 ns_to_ktime(global_rt_period()); 7490
6922 } 7491 sched_rt_do_global();
7492 sched_dl_do_global();
7493 }
7494 if (0) {
7495undo:
7496 sysctl_sched_rt_period = old_period;
7497 sysctl_sched_rt_runtime = old_runtime;
6923 } 7498 }
6924 mutex_unlock(&mutex); 7499 mutex_unlock(&mutex);
6925 7500
6926 return ret; 7501 return ret;
6927} 7502}
6928 7503
7504int sched_rr_handler(struct ctl_table *table, int write,
7505 void __user *buffer, size_t *lenp,
7506 loff_t *ppos)
7507{
7508 int ret;
7509 static DEFINE_MUTEX(mutex);
7510
7511 mutex_lock(&mutex);
7512 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7513 /* make sure that internally we keep jiffies */
7514 /* also, writing zero resets timeslice to default */
7515 if (!ret && write) {
7516 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7517 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
7518 }
7519 mutex_unlock(&mutex);
7520 return ret;
7521}
7522
6929#ifdef CONFIG_CGROUP_SCHED 7523#ifdef CONFIG_CGROUP_SCHED
6930 7524
6931static inline struct task_group *css_tg(struct cgroup_subsys_state *css) 7525static inline struct task_group *css_tg(struct cgroup_subsys_state *css)