Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--	kernel/sched/core.c	880
1 file changed, 756 insertions(+), 124 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a88f4a485c5e..f5c6635b806c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -296,8 +296,6 @@ __read_mostly int scheduler_running;
296 */ 296 */
297int sysctl_sched_rt_runtime = 950000; 297int sysctl_sched_rt_runtime = 950000;
298 298
299
300
301/* 299/*
302 * __task_rq_lock - lock the rq @p resides on. 300 * __task_rq_lock - lock the rq @p resides on.
303 */ 301 */
@@ -899,7 +897,9 @@ static inline int normal_prio(struct task_struct *p)
899{ 897{
900 int prio; 898 int prio;
901 899
902 if (task_has_rt_policy(p)) 900 if (task_has_dl_policy(p))
901 prio = MAX_DL_PRIO-1;
902 else if (task_has_rt_policy(p))
903 prio = MAX_RT_PRIO-1 - p->rt_priority; 903 prio = MAX_RT_PRIO-1 - p->rt_priority;
904 else 904 else
905 prio = __normal_prio(p); 905 prio = __normal_prio(p);
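The new branch above slots SCHED_DEADLINE tasks above every realtime priority: numerically they get prio MAX_DL_PRIO-1, i.e. -1, below all RT values. A hypothetical userspace mirror of that mapping (not the kernel function itself), assuming the usual constants MAX_DL_PRIO == 0, MAX_RT_PRIO == 100 and DEFAULT_PRIO == 120:

#include <stdio.h>

/* Assumed kernel constants; not taken from this patch. */
#define MAX_DL_PRIO   0
#define MAX_RT_PRIO   100
#define DEFAULT_PRIO  120                          /* nice 0 */

/* Hypothetical mirror of normal_prio(): lower value == higher priority. */
static int effective_prio(int is_dl, int is_rt, int rt_priority, int nice)
{
    if (is_dl)
        return MAX_DL_PRIO - 1;                    /* -1: above every RT priority */
    if (is_rt)
        return MAX_RT_PRIO - 1 - rt_priority;      /* 0..98 for rt_priority 99..1 */
    return DEFAULT_PRIO + nice;                    /* 100..139 for nice -20..19 */
}

int main(void)
{
    printf("deadline task      : %d\n", effective_prio(1, 0, 0, 0));   /* -1  */
    printf("SCHED_FIFO prio 50 : %d\n", effective_prio(0, 1, 50, 0));  /* 49  */
    printf("nice 0 CFS task    : %d\n", effective_prio(0, 0, 0, 0));   /* 120 */
    return 0;
}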
@@ -945,7 +945,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
945 if (prev_class->switched_from) 945 if (prev_class->switched_from)
946 prev_class->switched_from(rq, p); 946 prev_class->switched_from(rq, p);
947 p->sched_class->switched_to(rq, p); 947 p->sched_class->switched_to(rq, p);
948 } else if (oldprio != p->prio) 948 } else if (oldprio != p->prio || dl_task(p))
949 p->sched_class->prio_changed(rq, p, oldprio); 949 p->sched_class->prio_changed(rq, p, oldprio);
950} 950}
951 951
@@ -1108,6 +1108,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
1108 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) 1108 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1109 goto out; 1109 goto out;
1110 1110
1111 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
1111 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); 1112 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1112 1113
1113out: 1114out:
@@ -1499,8 +1500,7 @@ void scheduler_ipi(void)
1499 * TIF_NEED_RESCHED remotely (for the first time) will also send 1500 * TIF_NEED_RESCHED remotely (for the first time) will also send
1500 * this IPI. 1501 * this IPI.
1501 */ 1502 */
1502 if (tif_need_resched()) 1503 preempt_fold_need_resched();
1503 set_preempt_need_resched();
1504 1504
1505 if (llist_empty(&this_rq()->wake_list) 1505 if (llist_empty(&this_rq()->wake_list)
1506 && !tick_nohz_full_cpu(smp_processor_id()) 1506 && !tick_nohz_full_cpu(smp_processor_id())
@@ -1717,6 +1717,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1717 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1717 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1718#endif 1718#endif
1719 1719
1720 RB_CLEAR_NODE(&p->dl.rb_node);
1721 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1722 p->dl.dl_runtime = p->dl.runtime = 0;
1723 p->dl.dl_deadline = p->dl.deadline = 0;
1724 p->dl.dl_period = 0;
1725 p->dl.flags = 0;
1726
1720 INIT_LIST_HEAD(&p->rt.run_list); 1727 INIT_LIST_HEAD(&p->rt.run_list);
1721 1728
1722#ifdef CONFIG_PREEMPT_NOTIFIERS 1729#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1763,12 +1770,34 @@ void set_numabalancing_state(bool enabled)
1763 numabalancing_enabled = enabled; 1770 numabalancing_enabled = enabled;
1764} 1771}
1765#endif /* CONFIG_SCHED_DEBUG */ 1772#endif /* CONFIG_SCHED_DEBUG */
1766#endif /* CONFIG_NUMA_BALANCING */ 1773
1774#ifdef CONFIG_PROC_SYSCTL
1775int sysctl_numa_balancing(struct ctl_table *table, int write,
1776 void __user *buffer, size_t *lenp, loff_t *ppos)
1777{
1778 struct ctl_table t;
1779 int err;
1780 int state = numabalancing_enabled;
1781
1782 if (write && !capable(CAP_SYS_ADMIN))
1783 return -EPERM;
1784
1785 t = *table;
1786 t.data = &state;
1787 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
1788 if (err < 0)
1789 return err;
1790 if (write)
1791 set_numabalancing_state(state);
1792 return err;
1793}
1794#endif
1795#endif
1767 1796
1768/* 1797/*
1769 * fork()/clone()-time setup: 1798 * fork()/clone()-time setup:
1770 */ 1799 */
1771void sched_fork(unsigned long clone_flags, struct task_struct *p) 1800int sched_fork(unsigned long clone_flags, struct task_struct *p)
1772{ 1801{
1773 unsigned long flags; 1802 unsigned long flags;
1774 int cpu = get_cpu(); 1803 int cpu = get_cpu();
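The sysctl_numa_balancing() handler added above makes the NUMA-balancing state switchable at runtime, guarded by capable(CAP_SYS_ADMIN). A minimal sketch of flipping it from user space, assuming the knob is wired up at the conventional /proc/sys/kernel/numa_balancing path by a matching sysctl table entry (not part of this hunk):

#include <stdio.h>
#include <stdlib.h>

/* Enable/disable automatic NUMA balancing via procfs.  Needs CAP_SYS_ADMIN,
 * matching the capable() check in sysctl_numa_balancing() above. */
int main(int argc, char **argv)
{
    const char *path = "/proc/sys/kernel/numa_balancing";
    const char *val = (argc > 1) ? argv[1] : "1";   /* "1" on, "0" off */
    FILE *f = fopen(path, "w");

    if (!f) {
        perror(path);
        return EXIT_FAILURE;
    }
    fprintf(f, "%s\n", val);
    if (fclose(f) != 0) {                           /* the write happens on flush */
        perror(path);
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}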
@@ -1790,7 +1819,7 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
1790 * Revert to default priority/policy on fork if requested. 1819 * Revert to default priority/policy on fork if requested.
1791 */ 1820 */
1792 if (unlikely(p->sched_reset_on_fork)) { 1821 if (unlikely(p->sched_reset_on_fork)) {
1793 if (task_has_rt_policy(p)) { 1822 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
1794 p->policy = SCHED_NORMAL; 1823 p->policy = SCHED_NORMAL;
1795 p->static_prio = NICE_TO_PRIO(0); 1824 p->static_prio = NICE_TO_PRIO(0);
1796 p->rt_priority = 0; 1825 p->rt_priority = 0;
@@ -1807,8 +1836,14 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
1807 p->sched_reset_on_fork = 0; 1836 p->sched_reset_on_fork = 0;
1808 } 1837 }
1809 1838
1810 if (!rt_prio(p->prio)) 1839 if (dl_prio(p->prio)) {
1840 put_cpu();
1841 return -EAGAIN;
1842 } else if (rt_prio(p->prio)) {
1843 p->sched_class = &rt_sched_class;
1844 } else {
1811 p->sched_class = &fair_sched_class; 1845 p->sched_class = &fair_sched_class;
1846 }
1812 1847
1813 if (p->sched_class->task_fork) 1848 if (p->sched_class->task_fork)
1814 p->sched_class->task_fork(p); 1849 p->sched_class->task_fork(p);
@@ -1834,11 +1869,124 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
1834 init_task_preempt_count(p); 1869 init_task_preempt_count(p);
1835#ifdef CONFIG_SMP 1870#ifdef CONFIG_SMP
1836 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1871 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1872 RB_CLEAR_NODE(&p->pushable_dl_tasks);
1837#endif 1873#endif
1838 1874
1839 put_cpu(); 1875 put_cpu();
1876 return 0;
1877}
1878
1879unsigned long to_ratio(u64 period, u64 runtime)
1880{
1881 if (runtime == RUNTIME_INF)
1882 return 1ULL << 20;
1883
1884 /*
1885 * Doing this here saves a lot of checks in all
1886 * the calling paths, and returning zero seems
1887 * safe for them anyway.
1888 */
1889 if (period == 0)
1890 return 0;
1891
1892 return div64_u64(runtime << 20, period);
1840} 1893}
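to_ratio() encodes runtime/period as a Q20 fixed-point fraction: RUNTIME_INF maps to a full CPU (1 << 20) and a zero period maps to zero. A standalone arithmetic sketch of the same computation:

#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT    20                      /* same scale as the kernel's << 20 */
#define RUNTIME_INF ((uint64_t)~0ULL)

static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
    if (runtime == RUNTIME_INF)
        return 1ULL << BW_SHIFT;            /* utilization == 1.0 */
    if (period == 0)
        return 0;
    return (runtime << BW_SHIFT) / period;
}

int main(void)
{
    /* 10 ms of runtime every 100 ms period -> ~0.1 of one CPU. */
    uint64_t bw = to_ratio(100000000ULL, 10000000ULL);

    printf("bw = %llu (%.3f of one CPU)\n",
           (unsigned long long)bw, (double)bw / (1 << BW_SHIFT));
    return 0;
}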
1841 1894
1895#ifdef CONFIG_SMP
1896inline struct dl_bw *dl_bw_of(int i)
1897{
1898 return &cpu_rq(i)->rd->dl_bw;
1899}
1900
1901static inline int dl_bw_cpus(int i)
1902{
1903 struct root_domain *rd = cpu_rq(i)->rd;
1904 int cpus = 0;
1905
1906 for_each_cpu_and(i, rd->span, cpu_active_mask)
1907 cpus++;
1908
1909 return cpus;
1910}
1911#else
1912inline struct dl_bw *dl_bw_of(int i)
1913{
1914 return &cpu_rq(i)->dl.dl_bw;
1915}
1916
1917static inline int dl_bw_cpus(int i)
1918{
1919 return 1;
1920}
1921#endif
1922
1923static inline
1924void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
1925{
1926 dl_b->total_bw -= tsk_bw;
1927}
1928
1929static inline
1930void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
1931{
1932 dl_b->total_bw += tsk_bw;
1933}
1934
1935static inline
1936bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
1937{
1938 return dl_b->bw != -1 &&
1939 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
1940}
1941
1942/*
1943 * We must be sure that accepting a new task (or allowing changing the
1944 * parameters of an existing one) is consistent with the bandwidth
1945 * constraints. If yes, this function also accordingly updates the currently
1946 * allocated bandwidth to reflect the new situation.
1947 *
1948 * This function is called while holding p's rq->lock.
1949 */
1950static int dl_overflow(struct task_struct *p, int policy,
1951 const struct sched_attr *attr)
1952{
1953
1954 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
1955 u64 period = attr->sched_period ?: attr->sched_deadline;
1956 u64 runtime = attr->sched_runtime;
1957 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
1958 int cpus, err = -1;
1959
1960 if (new_bw == p->dl.dl_bw)
1961 return 0;
1962
1963 /*
1964	 * Whether a task enters, leaves, or stays -deadline but changes
1965	 * its parameters, we may need to update the total allocated
1966	 * bandwidth of the container accordingly.
1967 */
1968 raw_spin_lock(&dl_b->lock);
1969 cpus = dl_bw_cpus(task_cpu(p));
1970 if (dl_policy(policy) && !task_has_dl_policy(p) &&
1971 !__dl_overflow(dl_b, cpus, 0, new_bw)) {
1972 __dl_add(dl_b, new_bw);
1973 err = 0;
1974 } else if (dl_policy(policy) && task_has_dl_policy(p) &&
1975 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
1976 __dl_clear(dl_b, p->dl.dl_bw);
1977 __dl_add(dl_b, new_bw);
1978 err = 0;
1979 } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
1980 __dl_clear(dl_b, p->dl.dl_bw);
1981 err = 0;
1982 }
1983 raw_spin_unlock(&dl_b->lock);
1984
1985 return err;
1986}
1987
1988extern void init_dl_bw(struct dl_bw *dl_b);
1989
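dl_overflow() is the admission test itself: under dl_b->lock it checks whether the requested bandwidth still fits into bw * cpus for the task's root_domain and adjusts total_bw when a task enters, changes, or leaves SCHED_DEADLINE. A minimal userspace sketch of that arithmetic, with simplified types and no locking:

#include <stdio.h>
#include <stdint.h>

struct dl_bw {
    int64_t  bw;        /* per-CPU limit in Q20, or -1 for "no limit" */
    uint64_t total_bw;  /* sum of admitted task bandwidths (Q20) */
};

/* Mirrors __dl_overflow(): would admitting new_bw (replacing old_bw)
 * push the root_domain above cpus * bw? */
static int dl_overflow(struct dl_bw *dl_b, int cpus,
                       uint64_t old_bw, uint64_t new_bw)
{
    return dl_b->bw != -1 &&
           (uint64_t)dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}

int main(void)
{
    /* Default 95% limit (950000us/1000000us) on a 4-CPU root_domain. */
    struct dl_bw dl_b = { .bw = (950000ULL << 20) / 1000000, .total_bw = 0 };
    uint64_t task_bw = (10ULL << 20) / 100;     /* 10ms/100ms -> 0.1 CPU */
    int cpus = 4, admitted = 0;

    while (!dl_overflow(&dl_b, cpus, 0, task_bw)) {
        dl_b.total_bw += task_bw;               /* __dl_add() */
        admitted++;
    }
    printf("admitted %d tasks of 0.1 CPU each (limit %.2f CPUs)\n",
           admitted, cpus * 0.95);
    return 0;
}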
1842/* 1990/*
1843 * wake_up_new_task - wake up a newly created task for the first time. 1991 * wake_up_new_task - wake up a newly created task for the first time.
1844 * 1992 *
@@ -2003,6 +2151,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2003 if (unlikely(prev_state == TASK_DEAD)) { 2151 if (unlikely(prev_state == TASK_DEAD)) {
2004 task_numa_free(prev); 2152 task_numa_free(prev);
2005 2153
2154 if (prev->sched_class->task_dead)
2155 prev->sched_class->task_dead(prev);
2156
2006 /* 2157 /*
2007 * Remove function-return probe instances associated with this 2158 * Remove function-return probe instances associated with this
2008 * task and put them back on the free list. 2159 * task and put them back on the free list.
@@ -2296,7 +2447,7 @@ void scheduler_tick(void)
2296 2447
2297#ifdef CONFIG_SMP 2448#ifdef CONFIG_SMP
2298 rq->idle_balance = idle_cpu(cpu); 2449 rq->idle_balance = idle_cpu(cpu);
2299 trigger_load_balance(rq, cpu); 2450 trigger_load_balance(rq);
2300#endif 2451#endif
2301 rq_last_tick_reset(rq); 2452 rq_last_tick_reset(rq);
2302} 2453}
@@ -2325,7 +2476,7 @@ u64 scheduler_tick_max_deferment(void)
2325 if (time_before_eq(next, now)) 2476 if (time_before_eq(next, now))
2326 return 0; 2477 return 0;
2327 2478
2328 return jiffies_to_usecs(next - now) * NSEC_PER_USEC; 2479 return jiffies_to_nsecs(next - now);
2329} 2480}
2330#endif 2481#endif
2331 2482
@@ -2414,10 +2565,10 @@ static inline void schedule_debug(struct task_struct *prev)
2414{ 2565{
2415 /* 2566 /*
2416 * Test if we are atomic. Since do_exit() needs to call into 2567 * Test if we are atomic. Since do_exit() needs to call into
2417 * schedule() atomically, we ignore that path for now. 2568 * schedule() atomically, we ignore that path. Otherwise whine
2418 * Otherwise, whine if we are scheduling when we should not be. 2569 * if we are scheduling when we should not.
2419 */ 2570 */
2420 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 2571 if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
2421 __schedule_bug(prev); 2572 __schedule_bug(prev);
2422 rcu_sleep_check(); 2573 rcu_sleep_check();
2423 2574
@@ -2761,11 +2912,11 @@ EXPORT_SYMBOL(sleep_on_timeout);
2761 */ 2912 */
2762void rt_mutex_setprio(struct task_struct *p, int prio) 2913void rt_mutex_setprio(struct task_struct *p, int prio)
2763{ 2914{
2764 int oldprio, on_rq, running; 2915 int oldprio, on_rq, running, enqueue_flag = 0;
2765 struct rq *rq; 2916 struct rq *rq;
2766 const struct sched_class *prev_class; 2917 const struct sched_class *prev_class;
2767 2918
2768 BUG_ON(prio < 0 || prio > MAX_PRIO); 2919 BUG_ON(prio > MAX_PRIO);
2769 2920
2770 rq = __task_rq_lock(p); 2921 rq = __task_rq_lock(p);
2771 2922
@@ -2788,6 +2939,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2788 } 2939 }
2789 2940
2790 trace_sched_pi_setprio(p, prio); 2941 trace_sched_pi_setprio(p, prio);
2942 p->pi_top_task = rt_mutex_get_top_task(p);
2791 oldprio = p->prio; 2943 oldprio = p->prio;
2792 prev_class = p->sched_class; 2944 prev_class = p->sched_class;
2793 on_rq = p->on_rq; 2945 on_rq = p->on_rq;
@@ -2797,23 +2949,49 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2797 if (running) 2949 if (running)
2798 p->sched_class->put_prev_task(rq, p); 2950 p->sched_class->put_prev_task(rq, p);
2799 2951
2800 if (rt_prio(prio)) 2952 /*
2953	 * Boosting conditions are:
2954 * 1. -rt task is running and holds mutex A
2955 * --> -dl task blocks on mutex A
2956 *
2957 * 2. -dl task is running and holds mutex A
2958 * --> -dl task blocks on mutex A and could preempt the
2959 * running task
2960 */
2961 if (dl_prio(prio)) {
2962 if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
2963 dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
2964 p->dl.dl_boosted = 1;
2965 p->dl.dl_throttled = 0;
2966 enqueue_flag = ENQUEUE_REPLENISH;
2967 } else
2968 p->dl.dl_boosted = 0;
2969 p->sched_class = &dl_sched_class;
2970 } else if (rt_prio(prio)) {
2971 if (dl_prio(oldprio))
2972 p->dl.dl_boosted = 0;
2973 if (oldprio < prio)
2974 enqueue_flag = ENQUEUE_HEAD;
2801 p->sched_class = &rt_sched_class; 2975 p->sched_class = &rt_sched_class;
2802 else 2976 } else {
2977 if (dl_prio(oldprio))
2978 p->dl.dl_boosted = 0;
2803 p->sched_class = &fair_sched_class; 2979 p->sched_class = &fair_sched_class;
2980 }
2804 2981
2805 p->prio = prio; 2982 p->prio = prio;
2806 2983
2807 if (running) 2984 if (running)
2808 p->sched_class->set_curr_task(rq); 2985 p->sched_class->set_curr_task(rq);
2809 if (on_rq) 2986 if (on_rq)
2810 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 2987 enqueue_task(rq, p, enqueue_flag);
2811 2988
2812 check_class_changed(rq, p, prev_class, oldprio); 2989 check_class_changed(rq, p, prev_class, oldprio);
2813out_unlock: 2990out_unlock:
2814 __task_rq_unlock(rq); 2991 __task_rq_unlock(rq);
2815} 2992}
2816#endif 2993#endif
2994
2817void set_user_nice(struct task_struct *p, long nice) 2995void set_user_nice(struct task_struct *p, long nice)
2818{ 2996{
2819 int old_prio, delta, on_rq; 2997 int old_prio, delta, on_rq;
@@ -2831,9 +3009,9 @@ void set_user_nice(struct task_struct *p, long nice)
2831 * The RT priorities are set via sched_setscheduler(), but we still 3009 * The RT priorities are set via sched_setscheduler(), but we still
2832 * allow the 'normal' nice value to be set - but as expected 3010 * allow the 'normal' nice value to be set - but as expected
2833 * it wont have any effect on scheduling until the task is 3011 * it wont have any effect on scheduling until the task is
2834 * SCHED_FIFO/SCHED_RR: 3012 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
2835 */ 3013 */
2836 if (task_has_rt_policy(p)) { 3014 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
2837 p->static_prio = NICE_TO_PRIO(nice); 3015 p->static_prio = NICE_TO_PRIO(nice);
2838 goto out_unlock; 3016 goto out_unlock;
2839 } 3017 }
@@ -2988,22 +3166,95 @@ static struct task_struct *find_process_by_pid(pid_t pid)
2988 return pid ? find_task_by_vpid(pid) : current; 3166 return pid ? find_task_by_vpid(pid) : current;
2989} 3167}
2990 3168
2991/* Actually do priority change: must hold rq lock. */ 3169/*
3170 * This function initializes the sched_dl_entity of a newly becoming
3171 * SCHED_DEADLINE task.
3172 *
3173 * Only the static values are considered here, the actual runtime and the
3174 * absolute deadline will be properly calculated when the task is enqueued
3175 * for the first time with its new policy.
3176 */
2992static void 3177static void
2993__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 3178__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
2994{ 3179{
3180 struct sched_dl_entity *dl_se = &p->dl;
3181
3182 init_dl_task_timer(dl_se);
3183 dl_se->dl_runtime = attr->sched_runtime;
3184 dl_se->dl_deadline = attr->sched_deadline;
3185 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
3186 dl_se->flags = attr->sched_flags;
3187 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3188 dl_se->dl_throttled = 0;
3189 dl_se->dl_new = 1;
3190}
3191
3192/* Actually do priority change: must hold pi & rq lock. */
3193static void __setscheduler(struct rq *rq, struct task_struct *p,
3194 const struct sched_attr *attr)
3195{
3196 int policy = attr->sched_policy;
3197
3198 if (policy == -1) /* setparam */
3199 policy = p->policy;
3200
2995 p->policy = policy; 3201 p->policy = policy;
2996 p->rt_priority = prio; 3202
3203 if (dl_policy(policy))
3204 __setparam_dl(p, attr);
3205 else if (fair_policy(policy))
3206 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
3207
3208 /*
3209 * __sched_setscheduler() ensures attr->sched_priority == 0 when
3210 * !rt_policy. Always setting this ensures that things like
3211 * getparam()/getattr() don't report silly values for !rt tasks.
3212 */
3213 p->rt_priority = attr->sched_priority;
3214
2997 p->normal_prio = normal_prio(p); 3215 p->normal_prio = normal_prio(p);
2998 /* we are holding p->pi_lock already */
2999 p->prio = rt_mutex_getprio(p); 3216 p->prio = rt_mutex_getprio(p);
3000 if (rt_prio(p->prio)) 3217
3218 if (dl_prio(p->prio))
3219 p->sched_class = &dl_sched_class;
3220 else if (rt_prio(p->prio))
3001 p->sched_class = &rt_sched_class; 3221 p->sched_class = &rt_sched_class;
3002 else 3222 else
3003 p->sched_class = &fair_sched_class; 3223 p->sched_class = &fair_sched_class;
3224
3004 set_load_weight(p); 3225 set_load_weight(p);
3005} 3226}
3006 3227
3228static void
3229__getparam_dl(struct task_struct *p, struct sched_attr *attr)
3230{
3231 struct sched_dl_entity *dl_se = &p->dl;
3232
3233 attr->sched_priority = p->rt_priority;
3234 attr->sched_runtime = dl_se->dl_runtime;
3235 attr->sched_deadline = dl_se->dl_deadline;
3236 attr->sched_period = dl_se->dl_period;
3237 attr->sched_flags = dl_se->flags;
3238}
3239
3240/*
3241 * This function validates the new parameters of a -deadline task.
3242 * We ask for the deadline not to be zero and to be greater than or
3243 * equal to the runtime, and for the period to be either zero or
3244 * greater than or equal to the deadline. Furthermore, we have to be sure that
3245 * user parameters are above the internal resolution (1us); we
3246 * check sched_runtime only since it is always the smaller one.
3247 */
3248static bool
3249__checkparam_dl(const struct sched_attr *attr)
3250{
3251 return attr && attr->sched_deadline != 0 &&
3252 (attr->sched_period == 0 ||
3253 (s64)(attr->sched_period - attr->sched_deadline) >= 0) &&
3254 (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 &&
3255 attr->sched_runtime >= (2 << (DL_SCALE - 1));
3256}
3257
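__checkparam_dl() enforces runtime <= deadline <= period (when a period is given), a non-zero deadline, and a runtime of at least 2^DL_SCALE ns (DL_SCALE is 10, so roughly 1us). A hedged userspace replica of the same checks, useful for validating parameters before issuing sched_setattr():

#include <stdio.h>
#include <stdint.h>

#define DL_SCALE 10   /* kernel's internal resolution: 2^10 ns ~= 1us */

/* Illustrative parameter holder, not the UAPI struct. */
struct dl_params {
    uint64_t runtime;   /* ns */
    uint64_t deadline;  /* ns */
    uint64_t period;    /* ns, 0 means "same as deadline" */
};

/* Mirror of the kernel check: deadline non-zero, runtime <= deadline,
 * deadline <= period (if a period is set), runtime at least ~1us. */
static int dl_params_valid(const struct dl_params *p)
{
    return p->deadline != 0 &&
           (p->period == 0 || p->period >= p->deadline) &&
           p->deadline >= p->runtime &&
           p->runtime >= (2u << (DL_SCALE - 1));
}

int main(void)
{
    struct dl_params ok  = { 10000000, 30000000, 100000000 };
    struct dl_params bad = { 500, 30000000, 100000000 };  /* runtime < 1us */

    printf("ok:  %s\n", dl_params_valid(&ok)  ? "valid" : "invalid");
    printf("bad: %s\n", dl_params_valid(&bad) ? "valid" : "invalid");
    return 0;
}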
3007/* 3258/*
3008 * check the target process has a UID that matches the current process's 3259 * check the target process has a UID that matches the current process's
3009 */ 3260 */
@@ -3020,10 +3271,12 @@ static bool check_same_owner(struct task_struct *p)
3020 return match; 3271 return match;
3021} 3272}
3022 3273
3023static int __sched_setscheduler(struct task_struct *p, int policy, 3274static int __sched_setscheduler(struct task_struct *p,
3024 const struct sched_param *param, bool user) 3275 const struct sched_attr *attr,
3276 bool user)
3025{ 3277{
3026 int retval, oldprio, oldpolicy = -1, on_rq, running; 3278 int retval, oldprio, oldpolicy = -1, on_rq, running;
3279 int policy = attr->sched_policy;
3027 unsigned long flags; 3280 unsigned long flags;
3028 const struct sched_class *prev_class; 3281 const struct sched_class *prev_class;
3029 struct rq *rq; 3282 struct rq *rq;
@@ -3037,31 +3290,40 @@ recheck:
3037 reset_on_fork = p->sched_reset_on_fork; 3290 reset_on_fork = p->sched_reset_on_fork;
3038 policy = oldpolicy = p->policy; 3291 policy = oldpolicy = p->policy;
3039 } else { 3292 } else {
3040 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 3293 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
3041 policy &= ~SCHED_RESET_ON_FORK;
3042 3294
3043 if (policy != SCHED_FIFO && policy != SCHED_RR && 3295 if (policy != SCHED_DEADLINE &&
3296 policy != SCHED_FIFO && policy != SCHED_RR &&
3044 policy != SCHED_NORMAL && policy != SCHED_BATCH && 3297 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3045 policy != SCHED_IDLE) 3298 policy != SCHED_IDLE)
3046 return -EINVAL; 3299 return -EINVAL;
3047 } 3300 }
3048 3301
3302 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
3303 return -EINVAL;
3304
3049 /* 3305 /*
3050 * Valid priorities for SCHED_FIFO and SCHED_RR are 3306 * Valid priorities for SCHED_FIFO and SCHED_RR are
3051 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 3307 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
3052 * SCHED_BATCH and SCHED_IDLE is 0. 3308 * SCHED_BATCH and SCHED_IDLE is 0.
3053 */ 3309 */
3054 if (param->sched_priority < 0 || 3310 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
3055 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3311 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
3056 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3057 return -EINVAL; 3312 return -EINVAL;
3058 if (rt_policy(policy) != (param->sched_priority != 0)) 3313 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
3314 (rt_policy(policy) != (attr->sched_priority != 0)))
3059 return -EINVAL; 3315 return -EINVAL;
3060 3316
3061 /* 3317 /*
3062 * Allow unprivileged RT tasks to decrease priority: 3318 * Allow unprivileged RT tasks to decrease priority:
3063 */ 3319 */
3064 if (user && !capable(CAP_SYS_NICE)) { 3320 if (user && !capable(CAP_SYS_NICE)) {
3321 if (fair_policy(policy)) {
3322 if (attr->sched_nice < TASK_NICE(p) &&
3323 !can_nice(p, attr->sched_nice))
3324 return -EPERM;
3325 }
3326
3065 if (rt_policy(policy)) { 3327 if (rt_policy(policy)) {
3066 unsigned long rlim_rtprio = 3328 unsigned long rlim_rtprio =
3067 task_rlimit(p, RLIMIT_RTPRIO); 3329 task_rlimit(p, RLIMIT_RTPRIO);
@@ -3071,11 +3333,20 @@ recheck:
3071 return -EPERM; 3333 return -EPERM;
3072 3334
3073 /* can't increase priority */ 3335 /* can't increase priority */
3074 if (param->sched_priority > p->rt_priority && 3336 if (attr->sched_priority > p->rt_priority &&
3075 param->sched_priority > rlim_rtprio) 3337 attr->sched_priority > rlim_rtprio)
3076 return -EPERM; 3338 return -EPERM;
3077 } 3339 }
3078 3340
3341 /*
3342 * Can't set/change SCHED_DEADLINE policy at all for now
3343 * (safest behavior); in the future we would like to allow
3344 * unprivileged DL tasks to increase their relative deadline
3345 * or reduce their runtime (both ways reducing utilization)
3346 */
3347 if (dl_policy(policy))
3348 return -EPERM;
3349
3079 /* 3350 /*
3080 * Treat SCHED_IDLE as nice 20. Only allow a switch to 3351 * Treat SCHED_IDLE as nice 20. Only allow a switch to
3081 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3352 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
@@ -3120,14 +3391,21 @@ recheck:
3120 /* 3391 /*
3121 * If not changing anything there's no need to proceed further: 3392 * If not changing anything there's no need to proceed further:
3122 */ 3393 */
3123 if (unlikely(policy == p->policy && (!rt_policy(policy) || 3394 if (unlikely(policy == p->policy)) {
3124 param->sched_priority == p->rt_priority))) { 3395 if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
3396 goto change;
3397 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3398 goto change;
3399 if (dl_policy(policy))
3400 goto change;
3401
3125 task_rq_unlock(rq, p, &flags); 3402 task_rq_unlock(rq, p, &flags);
3126 return 0; 3403 return 0;
3127 } 3404 }
3405change:
3128 3406
3129#ifdef CONFIG_RT_GROUP_SCHED
3130 if (user) { 3407 if (user) {
3408#ifdef CONFIG_RT_GROUP_SCHED
3131 /* 3409 /*
3132 * Do not allow realtime tasks into groups that have no runtime 3410 * Do not allow realtime tasks into groups that have no runtime
3133 * assigned. 3411 * assigned.
@@ -3138,8 +3416,24 @@ recheck:
3138 task_rq_unlock(rq, p, &flags); 3416 task_rq_unlock(rq, p, &flags);
3139 return -EPERM; 3417 return -EPERM;
3140 } 3418 }
3141 }
3142#endif 3419#endif
3420#ifdef CONFIG_SMP
3421 if (dl_bandwidth_enabled() && dl_policy(policy)) {
3422 cpumask_t *span = rq->rd->span;
3423
3424 /*
3425 * Don't allow tasks with an affinity mask smaller than
3426 * the entire root_domain to become SCHED_DEADLINE. We
3427 * will also fail if there's no bandwidth available.
3428 */
3429 if (!cpumask_subset(span, &p->cpus_allowed) ||
3430 rq->rd->dl_bw.bw == 0) {
3431 task_rq_unlock(rq, p, &flags);
3432 return -EPERM;
3433 }
3434 }
3435#endif
3436 }
3143 3437
3144 /* recheck policy now with rq lock held */ 3438 /* recheck policy now with rq lock held */
3145 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 3439 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
@@ -3147,6 +3441,17 @@ recheck:
3147 task_rq_unlock(rq, p, &flags); 3441 task_rq_unlock(rq, p, &flags);
3148 goto recheck; 3442 goto recheck;
3149 } 3443 }
3444
3445 /*
3446 * If setscheduling to SCHED_DEADLINE (or changing the parameters
3447 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
3448 * is available.
3449 */
3450 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
3451 task_rq_unlock(rq, p, &flags);
3452 return -EBUSY;
3453 }
3454
3150 on_rq = p->on_rq; 3455 on_rq = p->on_rq;
3151 running = task_current(rq, p); 3456 running = task_current(rq, p);
3152 if (on_rq) 3457 if (on_rq)
@@ -3158,7 +3463,7 @@ recheck:
3158 3463
3159 oldprio = p->prio; 3464 oldprio = p->prio;
3160 prev_class = p->sched_class; 3465 prev_class = p->sched_class;
3161 __setscheduler(rq, p, policy, param->sched_priority); 3466 __setscheduler(rq, p, attr);
3162 3467
3163 if (running) 3468 if (running)
3164 p->sched_class->set_curr_task(rq); 3469 p->sched_class->set_curr_task(rq);
@@ -3173,6 +3478,26 @@ recheck:
3173 return 0; 3478 return 0;
3174} 3479}
3175 3480
3481static int _sched_setscheduler(struct task_struct *p, int policy,
3482 const struct sched_param *param, bool check)
3483{
3484 struct sched_attr attr = {
3485 .sched_policy = policy,
3486 .sched_priority = param->sched_priority,
3487 .sched_nice = PRIO_TO_NICE(p->static_prio),
3488 };
3489
3490 /*
3491 * Fixup the legacy SCHED_RESET_ON_FORK hack
3492 */
3493 if (policy & SCHED_RESET_ON_FORK) {
3494 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3495 policy &= ~SCHED_RESET_ON_FORK;
3496 attr.sched_policy = policy;
3497 }
3498
3499 return __sched_setscheduler(p, &attr, check);
3500}
3176/** 3501/**
3177 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 3502 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
3178 * @p: the task in question. 3503 * @p: the task in question.
@@ -3186,10 +3511,16 @@ recheck:
3186int sched_setscheduler(struct task_struct *p, int policy, 3511int sched_setscheduler(struct task_struct *p, int policy,
3187 const struct sched_param *param) 3512 const struct sched_param *param)
3188{ 3513{
3189 return __sched_setscheduler(p, policy, param, true); 3514 return _sched_setscheduler(p, policy, param, true);
3190} 3515}
3191EXPORT_SYMBOL_GPL(sched_setscheduler); 3516EXPORT_SYMBOL_GPL(sched_setscheduler);
3192 3517
3518int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
3519{
3520 return __sched_setscheduler(p, attr, true);
3521}
3522EXPORT_SYMBOL_GPL(sched_setattr);
3523
3193/** 3524/**
3194 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 3525 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
3195 * @p: the task in question. 3526 * @p: the task in question.
@@ -3206,7 +3537,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
3206int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3537int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3207 const struct sched_param *param) 3538 const struct sched_param *param)
3208{ 3539{
3209 return __sched_setscheduler(p, policy, param, false); 3540 return _sched_setscheduler(p, policy, param, false);
3210} 3541}
3211 3542
3212static int 3543static int
@@ -3231,6 +3562,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3231 return retval; 3562 return retval;
3232} 3563}
3233 3564
3565/*
3566 * Mimics kernel/events/core.c perf_copy_attr().
3567 */
3568static int sched_copy_attr(struct sched_attr __user *uattr,
3569 struct sched_attr *attr)
3570{
3571 u32 size;
3572 int ret;
3573
3574 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
3575 return -EFAULT;
3576
3577 /*
3578 * zero the full structure, so that a short copy will be nice.
3579 */
3580 memset(attr, 0, sizeof(*attr));
3581
3582 ret = get_user(size, &uattr->size);
3583 if (ret)
3584 return ret;
3585
3586 if (size > PAGE_SIZE) /* silly large */
3587 goto err_size;
3588
3589 if (!size) /* abi compat */
3590 size = SCHED_ATTR_SIZE_VER0;
3591
3592 if (size < SCHED_ATTR_SIZE_VER0)
3593 goto err_size;
3594
3595 /*
3596 * If we're handed a bigger struct than we know of,
3597 * ensure all the unknown bits are 0 - i.e. new
3598 * user-space does not rely on any kernel feature
3599 * extensions we dont know about yet.
3600 */
3601 if (size > sizeof(*attr)) {
3602 unsigned char __user *addr;
3603 unsigned char __user *end;
3604 unsigned char val;
3605
3606 addr = (void __user *)uattr + sizeof(*attr);
3607 end = (void __user *)uattr + size;
3608
3609 for (; addr < end; addr++) {
3610 ret = get_user(val, addr);
3611 if (ret)
3612 return ret;
3613 if (val)
3614 goto err_size;
3615 }
3616 size = sizeof(*attr);
3617 }
3618
3619 ret = copy_from_user(attr, uattr, size);
3620 if (ret)
3621 return -EFAULT;
3622
3623 /*
3624 * XXX: do we want to be lenient like existing syscalls; or do we want
3625 * to be strict and return an error on out-of-bounds values?
3626 */
3627 attr->sched_nice = clamp(attr->sched_nice, -20, 19);
3628
3629out:
3630 return ret;
3631
3632err_size:
3633 put_user(sizeof(*attr), &uattr->size);
3634 ret = -E2BIG;
3635 goto out;
3636}
3637
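sched_copy_attr() follows the perf_copy_attr() convention for an extensible ABI: user space passes its own sizeof(struct sched_attr), and any trailing bytes the kernel does not know about must be zero, otherwise the call fails with -E2BIG and the kernel writes back the size it does understand. A small userspace sketch of that "unknown tail must be zero" rule, built on a hypothetical pair of struct versions:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Hypothetical "old kernel" view of an extensible struct. */
struct attr_v0 { uint32_t size; uint32_t flags; };
/* Hypothetical "newer user space" view with an extra field appended. */
struct attr_v1 { uint32_t size; uint32_t flags; uint64_t new_field; };

/* Accept a larger struct only if all unknown trailing bytes are zero,
 * mirroring the loop in sched_copy_attr(). */
static int copy_attr(const void *uattr, uint32_t usize, struct attr_v0 *out)
{
    const unsigned char *p = (const unsigned char *)uattr + sizeof(*out);
    const unsigned char *end = (const unsigned char *)uattr + usize;

    if (usize > sizeof(*out)) {
        for (; p < end; p++)
            if (*p)
                return -7;          /* -E2BIG */
        usize = sizeof(*out);
    }
    memcpy(out, uattr, usize);
    return 0;
}

int main(void)
{
    struct attr_v1 a = { .size = sizeof(a), .flags = 1, .new_field = 0 };
    struct attr_v0 k;

    printf("zeroed extension : %d\n", copy_attr(&a, a.size, &k));  /* 0  */
    a.new_field = 42;
    printf("nonzero extension: %d\n", copy_attr(&a, a.size, &k));  /* -7 */
    return 0;
}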
3234/** 3638/**
3235 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 3639 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3236 * @pid: the pid in question. 3640 * @pid: the pid in question.
@@ -3262,6 +3666,34 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3262} 3666}
3263 3667
3264/** 3668/**
3669 * sys_sched_setattr - same as above, but with extended sched_attr
3670 * @pid: the pid in question.
3671 * @uattr: structure containing the extended parameters.
3672 */
3673SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
3674 unsigned int, flags)
3675{
3676 struct sched_attr attr;
3677 struct task_struct *p;
3678 int retval;
3679
3680 if (!uattr || pid < 0 || flags)
3681 return -EINVAL;
3682
3683 if (sched_copy_attr(uattr, &attr))
3684 return -EFAULT;
3685
3686 rcu_read_lock();
3687 retval = -ESRCH;
3688 p = find_process_by_pid(pid);
3689 if (p != NULL)
3690 retval = sched_setattr(p, &attr);
3691 rcu_read_unlock();
3692
3693 return retval;
3694}
3695
3696/**
3265 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3697 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3266 * @pid: the pid in question. 3698 * @pid: the pid in question.
3267 * 3699 *
@@ -3316,6 +3748,10 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3316 if (retval) 3748 if (retval)
3317 goto out_unlock; 3749 goto out_unlock;
3318 3750
3751 if (task_has_dl_policy(p)) {
3752 retval = -EINVAL;
3753 goto out_unlock;
3754 }
3319 lp.sched_priority = p->rt_priority; 3755 lp.sched_priority = p->rt_priority;
3320 rcu_read_unlock(); 3756 rcu_read_unlock();
3321 3757
@@ -3331,6 +3767,96 @@ out_unlock:
3331 return retval; 3767 return retval;
3332} 3768}
3333 3769
3770static int sched_read_attr(struct sched_attr __user *uattr,
3771 struct sched_attr *attr,
3772 unsigned int usize)
3773{
3774 int ret;
3775
3776 if (!access_ok(VERIFY_WRITE, uattr, usize))
3777 return -EFAULT;
3778
3779 /*
3780 * If we're handed a smaller struct than we know of,
3781 * ensure all the unknown bits are 0 - i.e. old
3782	 * user-space does not get incomplete information.
3783 */
3784 if (usize < sizeof(*attr)) {
3785 unsigned char *addr;
3786 unsigned char *end;
3787
3788 addr = (void *)attr + usize;
3789 end = (void *)attr + sizeof(*attr);
3790
3791 for (; addr < end; addr++) {
3792 if (*addr)
3793 goto err_size;
3794 }
3795
3796 attr->size = usize;
3797 }
3798
3799 ret = copy_to_user(uattr, attr, attr->size);
3800 if (ret)
3801 return -EFAULT;
3802
3803out:
3804 return ret;
3805
3806err_size:
3807 ret = -E2BIG;
3808 goto out;
3809}
3810
3811/**
3812 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
3813 * @pid: the pid in question.
3814 * @uattr: structure containing the extended parameters.
3815 * @size: sizeof(attr) for fwd/bwd comp.
3816 */
3817SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3818 unsigned int, size, unsigned int, flags)
3819{
3820 struct sched_attr attr = {
3821 .size = sizeof(struct sched_attr),
3822 };
3823 struct task_struct *p;
3824 int retval;
3825
3826 if (!uattr || pid < 0 || size > PAGE_SIZE ||
3827 size < SCHED_ATTR_SIZE_VER0 || flags)
3828 return -EINVAL;
3829
3830 rcu_read_lock();
3831 p = find_process_by_pid(pid);
3832 retval = -ESRCH;
3833 if (!p)
3834 goto out_unlock;
3835
3836 retval = security_task_getscheduler(p);
3837 if (retval)
3838 goto out_unlock;
3839
3840 attr.sched_policy = p->policy;
3841 if (p->sched_reset_on_fork)
3842 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3843 if (task_has_dl_policy(p))
3844 __getparam_dl(p, &attr);
3845 else if (task_has_rt_policy(p))
3846 attr.sched_priority = p->rt_priority;
3847 else
3848 attr.sched_nice = TASK_NICE(p);
3849
3850 rcu_read_unlock();
3851
3852 retval = sched_read_attr(uattr, &attr, size);
3853 return retval;
3854
3855out_unlock:
3856 rcu_read_unlock();
3857 return retval;
3858}
3859
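sys_sched_getattr() is the read side: it reports the policy plus whichever parameter set applies (the deadline triple, the RT priority, or the nice value), copying back at most the size user space says it can hold and failing with -E2BIG if that would drop nonzero fields. A short companion sketch to the sched_setattr example above, again via syscall(2), with SYS_sched_getattr assumed to be available from <sys/syscall.h>:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Same assumed VER0 UAPI layout as in the sched_setattr sketch. */
struct sched_attr {
    uint32_t size, sched_policy;
    uint64_t sched_flags;
    int32_t  sched_nice;
    uint32_t sched_priority;
    uint64_t sched_runtime, sched_deadline, sched_period;
};

int main(void)
{
    struct sched_attr attr;

    /* pid 0 == current task; the size argument tells the kernel how much
     * we can hold, flags must be 0. */
    if (syscall(SYS_sched_getattr, 0, &attr, sizeof(attr), 0) != 0) {
        perror("sched_getattr");
        return 1;
    }
    printf("policy=%u nice=%d rt_prio=%u runtime=%llu deadline=%llu period=%llu\n",
           attr.sched_policy, attr.sched_nice, attr.sched_priority,
           (unsigned long long)attr.sched_runtime,
           (unsigned long long)attr.sched_deadline,
           (unsigned long long)attr.sched_period);
    return 0;
}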
3334long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 3860long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3335{ 3861{
3336 cpumask_var_t cpus_allowed, new_mask; 3862 cpumask_var_t cpus_allowed, new_mask;
@@ -3375,8 +3901,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3375 if (retval) 3901 if (retval)
3376 goto out_unlock; 3902 goto out_unlock;
3377 3903
3904
3378 cpuset_cpus_allowed(p, cpus_allowed); 3905 cpuset_cpus_allowed(p, cpus_allowed);
3379 cpumask_and(new_mask, in_mask, cpus_allowed); 3906 cpumask_and(new_mask, in_mask, cpus_allowed);
3907
3908 /*
3909	 * Since bandwidth control happens on a per-root_domain basis,
3910	 * if the admission test is enabled, we only admit -deadline
3911 * tasks allowed to run on all the CPUs in the task's
3912 * root_domain.
3913 */
3914#ifdef CONFIG_SMP
3915 if (task_has_dl_policy(p)) {
3916 const struct cpumask *span = task_rq(p)->rd->span;
3917
3918 if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
3919 retval = -EBUSY;
3920 goto out_unlock;
3921 }
3922 }
3923#endif
3380again: 3924again:
3381 retval = set_cpus_allowed_ptr(p, new_mask); 3925 retval = set_cpus_allowed_ptr(p, new_mask);
3382 3926
@@ -3653,7 +4197,7 @@ again:
3653 } 4197 }
3654 4198
3655 double_rq_lock(rq, p_rq); 4199 double_rq_lock(rq, p_rq);
3656 while (task_rq(p) != p_rq) { 4200 if (task_rq(p) != p_rq) {
3657 double_rq_unlock(rq, p_rq); 4201 double_rq_unlock(rq, p_rq);
3658 goto again; 4202 goto again;
3659 } 4203 }
@@ -3742,6 +4286,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
3742 case SCHED_RR: 4286 case SCHED_RR:
3743 ret = MAX_USER_RT_PRIO-1; 4287 ret = MAX_USER_RT_PRIO-1;
3744 break; 4288 break;
4289 case SCHED_DEADLINE:
3745 case SCHED_NORMAL: 4290 case SCHED_NORMAL:
3746 case SCHED_BATCH: 4291 case SCHED_BATCH:
3747 case SCHED_IDLE: 4292 case SCHED_IDLE:
@@ -3768,6 +4313,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
3768 case SCHED_RR: 4313 case SCHED_RR:
3769 ret = 1; 4314 ret = 1;
3770 break; 4315 break;
4316 case SCHED_DEADLINE:
3771 case SCHED_NORMAL: 4317 case SCHED_NORMAL:
3772 case SCHED_BATCH: 4318 case SCHED_BATCH:
3773 case SCHED_IDLE: 4319 case SCHED_IDLE:
@@ -3811,7 +4357,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
3811 goto out_unlock; 4357 goto out_unlock;
3812 4358
3813 rq = task_rq_lock(p, &flags); 4359 rq = task_rq_lock(p, &flags);
3814 time_slice = p->sched_class->get_rr_interval(rq, p); 4360 time_slice = 0;
4361 if (p->sched_class->get_rr_interval)
4362 time_slice = p->sched_class->get_rr_interval(rq, p);
3815 task_rq_unlock(rq, p, &flags); 4363 task_rq_unlock(rq, p, &flags);
3816 4364
3817 rcu_read_unlock(); 4365 rcu_read_unlock();
@@ -4090,6 +4638,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
4090 4638
4091 /* TODO: This is not properly updating schedstats */ 4639 /* TODO: This is not properly updating schedstats */
4092 4640
4641 trace_sched_move_numa(p, curr_cpu, target_cpu);
4093 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); 4642 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
4094} 4643}
4095 4644
@@ -4514,13 +5063,31 @@ static int sched_cpu_active(struct notifier_block *nfb,
4514static int sched_cpu_inactive(struct notifier_block *nfb, 5063static int sched_cpu_inactive(struct notifier_block *nfb,
4515 unsigned long action, void *hcpu) 5064 unsigned long action, void *hcpu)
4516{ 5065{
5066 unsigned long flags;
5067 long cpu = (long)hcpu;
5068
4517 switch (action & ~CPU_TASKS_FROZEN) { 5069 switch (action & ~CPU_TASKS_FROZEN) {
4518 case CPU_DOWN_PREPARE: 5070 case CPU_DOWN_PREPARE:
4519 set_cpu_active((long)hcpu, false); 5071 set_cpu_active(cpu, false);
5072
5073 /* explicitly allow suspend */
5074 if (!(action & CPU_TASKS_FROZEN)) {
5075 struct dl_bw *dl_b = dl_bw_of(cpu);
5076 bool overflow;
5077 int cpus;
5078
5079 raw_spin_lock_irqsave(&dl_b->lock, flags);
5080 cpus = dl_bw_cpus(cpu);
5081 overflow = __dl_overflow(dl_b, cpus, 0, 0);
5082 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5083
5084 if (overflow)
5085 return notifier_from_errno(-EBUSY);
5086 }
4520 return NOTIFY_OK; 5087 return NOTIFY_OK;
4521 default:
4522 return NOTIFY_DONE;
4523 } 5088 }
5089
5090 return NOTIFY_DONE;
4524} 5091}
4525 5092
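With the change above, CPU_DOWN_PREPARE is vetoed via notifier_from_errno(-EBUSY) when removing the CPU would leave the root_domain with less capacity than its -deadline tasks have already reserved, and the error propagates back to whoever initiated the unplug. A quick probe of that from user space, assuming the standard sysfs hotplug file /sys/devices/system/cpu/cpuN/online and root privileges:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    char path[64];
    int fd;

    /* CPU index is a hypothetical argument; defaults to cpu1. */
    snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%s/online",
             argc > 1 ? argv[1] : "1");

    fd = open(path, O_WRONLY);
    if (fd < 0) {
        perror(path);
        return 1;
    }
    if (write(fd, "0", 1) != 1)
        /* EBUSY here can come from the DL admission test in the hunk above. */
        fprintf(stderr, "offline refused: %s\n", strerror(errno));
    else
        printf("cpu offlined\n");
    close(fd);
    return 0;
}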
4526static int __init migration_init(void) 5093static int __init migration_init(void)
@@ -4739,6 +5306,8 @@ static void free_rootdomain(struct rcu_head *rcu)
4739 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 5306 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
4740 5307
4741 cpupri_cleanup(&rd->cpupri); 5308 cpupri_cleanup(&rd->cpupri);
5309 cpudl_cleanup(&rd->cpudl);
5310 free_cpumask_var(rd->dlo_mask);
4742 free_cpumask_var(rd->rto_mask); 5311 free_cpumask_var(rd->rto_mask);
4743 free_cpumask_var(rd->online); 5312 free_cpumask_var(rd->online);
4744 free_cpumask_var(rd->span); 5313 free_cpumask_var(rd->span);
@@ -4790,8 +5359,14 @@ static int init_rootdomain(struct root_domain *rd)
4790 goto out; 5359 goto out;
4791 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 5360 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
4792 goto free_span; 5361 goto free_span;
4793 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 5362 if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
4794 goto free_online; 5363 goto free_online;
5364 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5365 goto free_dlo_mask;
5366
5367 init_dl_bw(&rd->dl_bw);
5368 if (cpudl_init(&rd->cpudl) != 0)
5369 goto free_dlo_mask;
4795 5370
4796 if (cpupri_init(&rd->cpupri) != 0) 5371 if (cpupri_init(&rd->cpupri) != 0)
4797 goto free_rto_mask; 5372 goto free_rto_mask;
@@ -4799,6 +5374,8 @@ static int init_rootdomain(struct root_domain *rd)
4799 5374
4800free_rto_mask: 5375free_rto_mask:
4801 free_cpumask_var(rd->rto_mask); 5376 free_cpumask_var(rd->rto_mask);
5377free_dlo_mask:
5378 free_cpumask_var(rd->dlo_mask);
4802free_online: 5379free_online:
4803 free_cpumask_var(rd->online); 5380 free_cpumask_var(rd->online);
4804free_span: 5381free_span:
@@ -6150,6 +6727,7 @@ void __init sched_init_smp(void)
6150 free_cpumask_var(non_isolated_cpus); 6727 free_cpumask_var(non_isolated_cpus);
6151 6728
6152 init_sched_rt_class(); 6729 init_sched_rt_class();
6730 init_sched_dl_class();
6153} 6731}
6154#else 6732#else
6155void __init sched_init_smp(void) 6733void __init sched_init_smp(void)
@@ -6219,13 +6797,15 @@ void __init sched_init(void)
6219#endif /* CONFIG_CPUMASK_OFFSTACK */ 6797#endif /* CONFIG_CPUMASK_OFFSTACK */
6220 } 6798 }
6221 6799
6800 init_rt_bandwidth(&def_rt_bandwidth,
6801 global_rt_period(), global_rt_runtime());
6802 init_dl_bandwidth(&def_dl_bandwidth,
6803 global_rt_period(), global_rt_runtime());
6804
6222#ifdef CONFIG_SMP 6805#ifdef CONFIG_SMP
6223 init_defrootdomain(); 6806 init_defrootdomain();
6224#endif 6807#endif
6225 6808
6226 init_rt_bandwidth(&def_rt_bandwidth,
6227 global_rt_period(), global_rt_runtime());
6228
6229#ifdef CONFIG_RT_GROUP_SCHED 6809#ifdef CONFIG_RT_GROUP_SCHED
6230 init_rt_bandwidth(&root_task_group.rt_bandwidth, 6810 init_rt_bandwidth(&root_task_group.rt_bandwidth,
6231 global_rt_period(), global_rt_runtime()); 6811 global_rt_period(), global_rt_runtime());
@@ -6249,6 +6829,7 @@ void __init sched_init(void)
6249 rq->calc_load_update = jiffies + LOAD_FREQ; 6829 rq->calc_load_update = jiffies + LOAD_FREQ;
6250 init_cfs_rq(&rq->cfs); 6830 init_cfs_rq(&rq->cfs);
6251 init_rt_rq(&rq->rt, rq); 6831 init_rt_rq(&rq->rt, rq);
6832 init_dl_rq(&rq->dl, rq);
6252#ifdef CONFIG_FAIR_GROUP_SCHED 6833#ifdef CONFIG_FAIR_GROUP_SCHED
6253 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 6834 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6254 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6835 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -6320,10 +6901,6 @@ void __init sched_init(void)
6320 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6901 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6321#endif 6902#endif
6322 6903
6323#ifdef CONFIG_RT_MUTEXES
6324 plist_head_init(&init_task.pi_waiters);
6325#endif
6326
6327 /* 6904 /*
6328 * The boot idle thread does lazy MMU switching as well: 6905 * The boot idle thread does lazy MMU switching as well:
6329 */ 6906 */
@@ -6397,13 +6974,16 @@ EXPORT_SYMBOL(__might_sleep);
6397static void normalize_task(struct rq *rq, struct task_struct *p) 6974static void normalize_task(struct rq *rq, struct task_struct *p)
6398{ 6975{
6399 const struct sched_class *prev_class = p->sched_class; 6976 const struct sched_class *prev_class = p->sched_class;
6977 struct sched_attr attr = {
6978 .sched_policy = SCHED_NORMAL,
6979 };
6400 int old_prio = p->prio; 6980 int old_prio = p->prio;
6401 int on_rq; 6981 int on_rq;
6402 6982
6403 on_rq = p->on_rq; 6983 on_rq = p->on_rq;
6404 if (on_rq) 6984 if (on_rq)
6405 dequeue_task(rq, p, 0); 6985 dequeue_task(rq, p, 0);
6406 __setscheduler(rq, p, SCHED_NORMAL, 0); 6986 __setscheduler(rq, p, &attr);
6407 if (on_rq) { 6987 if (on_rq) {
6408 enqueue_task(rq, p, 0); 6988 enqueue_task(rq, p, 0);
6409 resched_task(rq->curr); 6989 resched_task(rq->curr);
@@ -6433,7 +7013,7 @@ void normalize_rt_tasks(void)
6433 p->se.statistics.block_start = 0; 7013 p->se.statistics.block_start = 0;
6434#endif 7014#endif
6435 7015
6436 if (!rt_task(p)) { 7016 if (!dl_task(p) && !rt_task(p)) {
6437 /* 7017 /*
6438 * Renice negative nice level userspace 7018 * Renice negative nice level userspace
6439 * tasks back to 0: 7019 * tasks back to 0:
@@ -6628,16 +7208,6 @@ void sched_move_task(struct task_struct *tsk)
6628} 7208}
6629#endif /* CONFIG_CGROUP_SCHED */ 7209#endif /* CONFIG_CGROUP_SCHED */
6630 7210
6631#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
6632static unsigned long to_ratio(u64 period, u64 runtime)
6633{
6634 if (runtime == RUNTIME_INF)
6635 return 1ULL << 20;
6636
6637 return div64_u64(runtime << 20, period);
6638}
6639#endif
6640
6641#ifdef CONFIG_RT_GROUP_SCHED 7211#ifdef CONFIG_RT_GROUP_SCHED
6642/* 7212/*
6643 * Ensure that the real time constraints are schedulable. 7213 * Ensure that the real time constraints are schedulable.
@@ -6811,24 +7381,13 @@ static long sched_group_rt_period(struct task_group *tg)
6811 do_div(rt_period_us, NSEC_PER_USEC); 7381 do_div(rt_period_us, NSEC_PER_USEC);
6812 return rt_period_us; 7382 return rt_period_us;
6813} 7383}
7384#endif /* CONFIG_RT_GROUP_SCHED */
6814 7385
7386#ifdef CONFIG_RT_GROUP_SCHED
6815static int sched_rt_global_constraints(void) 7387static int sched_rt_global_constraints(void)
6816{ 7388{
6817 u64 runtime, period;
6818 int ret = 0; 7389 int ret = 0;
6819 7390
6820 if (sysctl_sched_rt_period <= 0)
6821 return -EINVAL;
6822
6823 runtime = global_rt_runtime();
6824 period = global_rt_period();
6825
6826 /*
6827 * Sanity check on the sysctl variables.
6828 */
6829 if (runtime > period && runtime != RUNTIME_INF)
6830 return -EINVAL;
6831
6832 mutex_lock(&rt_constraints_mutex); 7391 mutex_lock(&rt_constraints_mutex);
6833 read_lock(&tasklist_lock); 7392 read_lock(&tasklist_lock);
6834 ret = __rt_schedulable(NULL, 0, 0); 7393 ret = __rt_schedulable(NULL, 0, 0);
@@ -6851,17 +7410,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
6851static int sched_rt_global_constraints(void) 7410static int sched_rt_global_constraints(void)
6852{ 7411{
6853 unsigned long flags; 7412 unsigned long flags;
6854 int i; 7413 int i, ret = 0;
6855
6856 if (sysctl_sched_rt_period <= 0)
6857 return -EINVAL;
6858
6859 /*
6860 * There's always some RT tasks in the root group
6861 * -- migration, kstopmachine etc..
6862 */
6863 if (sysctl_sched_rt_runtime == 0)
6864 return -EBUSY;
6865 7414
6866 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 7415 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
6867 for_each_possible_cpu(i) { 7416 for_each_possible_cpu(i) {
@@ -6873,36 +7422,91 @@ static int sched_rt_global_constraints(void)
6873 } 7422 }
6874 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 7423 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
6875 7424
6876 return 0; 7425 return ret;
6877} 7426}
6878#endif /* CONFIG_RT_GROUP_SCHED */ 7427#endif /* CONFIG_RT_GROUP_SCHED */
6879 7428
6880int sched_rr_handler(struct ctl_table *table, int write, 7429static int sched_dl_global_constraints(void)
6881 void __user *buffer, size_t *lenp,
6882 loff_t *ppos)
6883{ 7430{
6884 int ret; 7431 u64 runtime = global_rt_runtime();
6885 static DEFINE_MUTEX(mutex); 7432 u64 period = global_rt_period();
7433 u64 new_bw = to_ratio(period, runtime);
7434 int cpu, ret = 0;
7435 unsigned long flags;
6886 7436
6887 mutex_lock(&mutex); 7437 /*
6888 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7438 * Here we want to check the bandwidth not being set to some
6889 /* make sure that internally we keep jiffies */ 7439 * value smaller than the currently allocated bandwidth in
6890 /* also, writing zero resets timeslice to default */ 7440 * any of the root_domains.
6891 if (!ret && write) { 7441 *
6892 sched_rr_timeslice = sched_rr_timeslice <= 0 ? 7442 * FIXME: Cycling on all the CPUs is overdoing, but simpler than
6893 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); 7443 * cycling on root_domains... Discussion on different/better
7444 * solutions is welcome!
7445 */
7446 for_each_possible_cpu(cpu) {
7447 struct dl_bw *dl_b = dl_bw_of(cpu);
7448
7449 raw_spin_lock_irqsave(&dl_b->lock, flags);
7450 if (new_bw < dl_b->total_bw)
7451 ret = -EBUSY;
7452 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7453
7454 if (ret)
7455 break;
6894 } 7456 }
6895 mutex_unlock(&mutex); 7457
6896 return ret; 7458 return ret;
6897} 7459}
6898 7460
7461static void sched_dl_do_global(void)
7462{
7463 u64 new_bw = -1;
7464 int cpu;
7465 unsigned long flags;
7466
7467 def_dl_bandwidth.dl_period = global_rt_period();
7468 def_dl_bandwidth.dl_runtime = global_rt_runtime();
7469
7470 if (global_rt_runtime() != RUNTIME_INF)
7471 new_bw = to_ratio(global_rt_period(), global_rt_runtime());
7472
7473 /*
7474 * FIXME: As above...
7475 */
7476 for_each_possible_cpu(cpu) {
7477 struct dl_bw *dl_b = dl_bw_of(cpu);
7478
7479 raw_spin_lock_irqsave(&dl_b->lock, flags);
7480 dl_b->bw = new_bw;
7481 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7482 }
7483}
7484
7485static int sched_rt_global_validate(void)
7486{
7487 if (sysctl_sched_rt_period <= 0)
7488 return -EINVAL;
7489
7490 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
7491 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
7492 return -EINVAL;
7493
7494 return 0;
7495}
7496
7497static void sched_rt_do_global(void)
7498{
7499 def_rt_bandwidth.rt_runtime = global_rt_runtime();
7500 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
7501}
7502
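sched_dl_global_constraints() and sched_dl_do_global() reuse the existing sched_rt_period_us / sched_rt_runtime_us sysctls as the global -deadline bandwidth limit: each root_domain's dl_bw.bw becomes to_ratio(period, runtime), or -1 when the runtime is RUNTIME_INF. A small sketch reading those knobs from procfs and deriving the same Q20 ratio (the microsecond/nanosecond scaling cancels out):

#include <stdio.h>

/* Read one sysctl knob; returns 1 on success. */
static int read_knob(const char *path, long *val)
{
    FILE *f = fopen(path, "r");
    int ok = 0;

    if (f) {
        ok = (fscanf(f, "%ld", val) == 1);
        fclose(f);
    }
    return ok;
}

int main(void)
{
    long period_us, runtime_us;

    if (!read_knob("/proc/sys/kernel/sched_rt_period_us", &period_us) ||
        !read_knob("/proc/sys/kernel/sched_rt_runtime_us", &runtime_us) ||
        period_us <= 0)
        return 1;

    if (runtime_us < 0) {
        printf("runtime == -1: no admission limit (dl_bw.bw = -1)\n");
    } else {
        /* Same ratio sched_dl_do_global() stores into dl_bw.bw. */
        unsigned long long bw =
            ((unsigned long long)runtime_us << 20) / period_us;

        printf("per-CPU DL bandwidth cap: %llu / %u (~%.1f%%)\n",
               bw, 1u << 20, 100.0 * runtime_us / period_us);
    }
    return 0;
}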
6899int sched_rt_handler(struct ctl_table *table, int write, 7503int sched_rt_handler(struct ctl_table *table, int write,
6900 void __user *buffer, size_t *lenp, 7504 void __user *buffer, size_t *lenp,
6901 loff_t *ppos) 7505 loff_t *ppos)
6902{ 7506{
6903 int ret;
6904 int old_period, old_runtime; 7507 int old_period, old_runtime;
6905 static DEFINE_MUTEX(mutex); 7508 static DEFINE_MUTEX(mutex);
7509 int ret;
6906 7510
6907 mutex_lock(&mutex); 7511 mutex_lock(&mutex);
6908 old_period = sysctl_sched_rt_period; 7512 old_period = sysctl_sched_rt_period;
@@ -6911,21 +7515,50 @@ int sched_rt_handler(struct ctl_table *table, int write,
6911 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7515 ret = proc_dointvec(table, write, buffer, lenp, ppos);
6912 7516
6913 if (!ret && write) { 7517 if (!ret && write) {
7518 ret = sched_rt_global_validate();
7519 if (ret)
7520 goto undo;
7521
6914 ret = sched_rt_global_constraints(); 7522 ret = sched_rt_global_constraints();
6915 if (ret) { 7523 if (ret)
6916 sysctl_sched_rt_period = old_period; 7524 goto undo;
6917 sysctl_sched_rt_runtime = old_runtime; 7525
6918 } else { 7526 ret = sched_dl_global_constraints();
6919 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7527 if (ret)
6920 def_rt_bandwidth.rt_period = 7528 goto undo;
6921 ns_to_ktime(global_rt_period()); 7529
6922 } 7530 sched_rt_do_global();
7531 sched_dl_do_global();
7532 }
7533 if (0) {
7534undo:
7535 sysctl_sched_rt_period = old_period;
7536 sysctl_sched_rt_runtime = old_runtime;
6923 } 7537 }
6924 mutex_unlock(&mutex); 7538 mutex_unlock(&mutex);
6925 7539
6926 return ret; 7540 return ret;
6927} 7541}
6928 7542
7543int sched_rr_handler(struct ctl_table *table, int write,
7544 void __user *buffer, size_t *lenp,
7545 loff_t *ppos)
7546{
7547 int ret;
7548 static DEFINE_MUTEX(mutex);
7549
7550 mutex_lock(&mutex);
7551 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7552 /* make sure that internally we keep jiffies */
7553 /* also, writing zero resets timeslice to default */
7554 if (!ret && write) {
7555 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7556 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
7557 }
7558 mutex_unlock(&mutex);
7559 return ret;
7560}
7561
6929#ifdef CONFIG_CGROUP_SCHED 7562#ifdef CONFIG_CGROUP_SCHED
6930 7563
6931static inline struct task_group *css_tg(struct cgroup_subsys_state *css) 7564static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
@@ -7258,15 +7891,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7258 return ret; 7891 return ret;
7259} 7892}
7260 7893
7261static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, 7894static int cpu_stats_show(struct seq_file *sf, void *v)
7262 struct cgroup_map_cb *cb)
7263{ 7895{
7264 struct task_group *tg = css_tg(css); 7896 struct task_group *tg = css_tg(seq_css(sf));
7265 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7897 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7266 7898
7267 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7899 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
7268 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7900 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
7269 cb->fill(cb, "throttled_time", cfs_b->throttled_time); 7901 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
7270 7902
7271 return 0; 7903 return 0;
7272} 7904}
@@ -7320,7 +7952,7 @@ static struct cftype cpu_files[] = {
7320 }, 7952 },
7321 { 7953 {
7322 .name = "stat", 7954 .name = "stat",
7323 .read_map = cpu_stats_show, 7955 .seq_show = cpu_stats_show,
7324 }, 7956 },
7325#endif 7957#endif
7326#ifdef CONFIG_RT_GROUP_SCHED 7958#ifdef CONFIG_RT_GROUP_SCHED