author     Dario Faggioli <raistlin@linux.it>   2013-11-28 05:14:43 -0500
committer  Ingo Molnar <mingo@kernel.org>       2014-01-13 07:41:06 -0500
commit     aab03e05e8f7e26f51dee792beddcb5cca9215a5
tree       bae7f6033c849e7ca77a98783c732caea412ae75
parent     d50dde5a10f305253cbc3855307f608f8a3c5f73
sched/deadline: Add SCHED_DEADLINE structures & implementation
Introduces the data structures, constants and symbols needed for the SCHED_DEADLINE implementation.

Core data structures of SCHED_DEADLINE are defined, along with their initializers. Hooks for checking whether a task belongs to the new policy are also added where they are needed.

Adds a scheduling class, in kernel/sched/deadline.c, and a new policy called SCHED_DEADLINE. It is an implementation of the Earliest Deadline First (EDF) scheduling algorithm, augmented with a mechanism (called Constant Bandwidth Server, CBS) that makes it possible to isolate the behaviour of tasks from each other.

The typical -deadline task is made up of a computation phase (instance) which is activated in a periodic or sporadic fashion. The expected (maximum) duration of such a computation is called the task's runtime; the time interval within which each instance must complete is called the task's relative deadline. The task's absolute deadline is dynamically calculated as the time instant at which a task (or, better, an instance) activates plus the relative deadline.

The EDF algorithm selects the task with the smallest absolute deadline to be executed first, while the CBS ensures that each task runs for at most its runtime in every interval of (relative) deadline length, avoiding any interference between different tasks (bandwidth isolation). Thanks to this feature, tasks that do not strictly comply with the computational model sketched above can also effectively use the new policy.

To summarize, this patch:
 - introduces the data structures, constants and symbols needed;
 - implements the core logic of the scheduling algorithm in the new scheduling class file;
 - provides all the glue code between the new scheduling class and the core scheduler, and refines the interactions between sched/dl and the other existing scheduling classes.

Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Michael Trimarchi <michael@amarulasolutions.com>
Signed-off-by: Fabio Checconi <fchecconi@gmail.com>
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-4-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
 include/linux/sched.h          |  46
 include/linux/sched/deadline.h |  24
 include/uapi/linux/sched.h     |   1
 kernel/fork.c                  |   4
 kernel/hrtimer.c               |   3
 kernel/sched/Makefile          |   3
 kernel/sched/core.c            | 109
 kernel/sched/deadline.c        | 684
 kernel/sched/sched.h           |  26
 kernel/sched/stop_task.c       |   2
 10 files changed, 882 insertions(+), 20 deletions(-)
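Before the diff itself, a small numeric illustration of the task model described in the changelog may help. This is an editorial sketch with invented values, not material from the patch: each instance is given a runtime Q, must finish within a relative deadline D of its activation, and reserves bandwidth Q/D.

/*
 * Illustrative only: how a 10 ms runtime / 100 ms relative deadline maps to
 * an absolute deadline and a reserved bandwidth.  Values are assumptions.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t runtime_ns  = 10 * 1000 * 1000;	/* Q = 10 ms per instance */
	uint64_t deadline_ns = 100 * 1000 * 1000;	/* D = 100 ms relative deadline */
	uint64_t activation  = 250 * 1000 * 1000;	/* instance activates at t = 250 ms */

	/* EDF always runs the task with the smallest absolute deadline. */
	uint64_t abs_deadline = activation + deadline_ns;

	printf("absolute deadline: %llu ns, reserved bandwidth: %.2f%%\n",
	       (unsigned long long)abs_deadline,
	       100.0 * (double)runtime_ns / (double)deadline_ns);
	return 0;
}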
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 86025b6c6387..6c196794fc12 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -97,6 +97,10 @@ struct sched_param {
  * Given this task model, there are a multiplicity of scheduling algorithms
  * and policies, that can be used to ensure all the tasks will make their
  * timing constraints.
+ *
+ * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the
+ * only user of this new interface. More information about the algorithm
+ * available in the scheduling class file or in Documentation/.
  */
 struct sched_attr {
 	u32 size;
@@ -1088,6 +1092,45 @@ struct sched_rt_entity {
 #endif
 };

+struct sched_dl_entity {
+	struct rb_node	rb_node;
+
+	/*
+	 * Original scheduling parameters. Copied here from sched_attr
+	 * during sched_setscheduler2(), they will remain the same until
+	 * the next sched_setscheduler2().
+	 */
+	u64 dl_runtime;		/* maximum runtime for each instance	*/
+	u64 dl_deadline;	/* relative deadline of each instance	*/
+
+	/*
+	 * Actual scheduling parameters. Initialized with the values above,
+	 * they are continuously updated during task execution. Note that
+	 * the remaining runtime could be < 0 in case we are in overrun.
+	 */
+	s64 runtime;		/* remaining runtime for this instance	*/
+	u64 deadline;		/* absolute deadline for this instance	*/
+	unsigned int flags;	/* specifying the scheduler behaviour	*/
+
+	/*
+	 * Some bool flags:
+	 *
+	 * @dl_throttled tells if we exhausted the runtime. If so, the
+	 * task has to wait for a replenishment to be performed at the
+	 * next firing of dl_timer.
+	 *
+	 * @dl_new tells if a new instance arrived. If so we must
+	 * start executing it with full runtime and reset its absolute
+	 * deadline;
+	 */
+	int dl_throttled, dl_new;
+
+	/*
+	 * Bandwidth enforcement timer. Each -deadline task has its
+	 * own bandwidth to be enforced, thus we need one timer per task.
+	 */
+	struct hrtimer dl_timer;
+};

 struct rcu_node;

@@ -1124,6 +1167,7 @@ struct task_struct {
 #ifdef CONFIG_CGROUP_SCHED
 	struct task_group *sched_task_group;
 #endif
+	struct sched_dl_entity dl;

 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	/* list of struct preempt_notifier: */
@@ -2099,7 +2143,7 @@ extern void wake_up_new_task(struct task_struct *tsk);
 #else
 static inline void kick_process(struct task_struct *tsk) { }
 #endif
-extern void sched_fork(unsigned long clone_flags, struct task_struct *p);
+extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
 extern void sched_dead(struct task_struct *p);

 extern void proc_caches_init(void);
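The interplay between the static half (dl_runtime/dl_deadline) and the dynamic half (runtime/deadline, dl_throttled) of sched_dl_entity can be summarised with a standalone sketch. This is an illustration under the stated assumptions, not kernel code; the real refresh and throttle logic appears later in this patch in setup_new_dl_entity(), replenish_dl_entity() and update_curr_dl().

/* Hypothetical stand-in for sched_dl_entity, for illustration only. */
#include <stdint.h>
#include <stdio.h>

struct dl_entity {
	uint64_t dl_runtime;	/* static: maximum runtime per instance */
	uint64_t dl_deadline;	/* static: relative deadline per instance */
	int64_t  runtime;	/* dynamic: remaining runtime, may go negative */
	uint64_t deadline;	/* dynamic: absolute deadline */
	int	 dl_throttled;
};

static void new_instance(struct dl_entity *dl, uint64_t now)
{
	dl->deadline = now + dl->dl_deadline;	/* absolute deadline in wall time */
	dl->runtime  = (int64_t)dl->dl_runtime;	/* start with a full budget */
	dl->dl_throttled = 0;
}

static void run_for(struct dl_entity *dl, uint64_t delta_exec)
{
	dl->runtime -= (int64_t)delta_exec;
	if (dl->runtime <= 0)
		dl->dl_throttled = 1;	/* must wait for a replenishment */
}

int main(void)
{
	struct dl_entity dl = { .dl_runtime = 10000000, .dl_deadline = 100000000 };

	new_instance(&dl, 0);
	run_for(&dl, 12000000);	/* overran the 10 ms budget by 2 ms */
	printf("runtime=%lld throttled=%d deadline=%llu\n",
	       (long long)dl.runtime, dl.dl_throttled,
	       (unsigned long long)dl.deadline);
	return 0;
}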
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
new file mode 100644
index 000000000000..9d303b8847df
--- /dev/null
+++ b/include/linux/sched/deadline.h
@@ -0,0 +1,24 @@
1#ifndef _SCHED_DEADLINE_H
2#define _SCHED_DEADLINE_H
3
4/*
5 * SCHED_DEADLINE tasks have negative priorities, reflecting
6 * the fact that any of them has higher prio than RT and
7 * NORMAL/BATCH tasks.
8 */
9
10#define MAX_DL_PRIO 0
11
12static inline int dl_prio(int prio)
13{
14 if (unlikely(prio < MAX_DL_PRIO))
15 return 1;
16 return 0;
17}
18
19static inline int dl_task(struct task_struct *p)
20{
21 return dl_prio(p->prio);
22}
23
24#endif /* _SCHED_DEADLINE_H */
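To make the priority convention above concrete, here is a hedged userspace illustration of how dl_prio() and rt_prio() partition the prio space once normal_prio() (changed later in this patch) assigns MAX_DL_PRIO-1 = -1 to deadline tasks. The helpers below are simplified copies for demonstration, not the kernel ones.

#include <stdio.h>

#define MAX_DL_PRIO	0
#define MAX_RT_PRIO	100

static int dl_prio(int prio) { return prio < MAX_DL_PRIO; }
static int rt_prio(int prio) { return prio < MAX_RT_PRIO; }

static const char *class_of(int prio)
{
	/* Order matters: a -deadline prio (-1) also satisfies rt_prio(). */
	if (dl_prio(prio))
		return "dl";
	else if (rt_prio(prio))
		return "rt";
	return "fair";
}

int main(void)
{
	int prios[] = { MAX_DL_PRIO - 1, 0, 50, 99, 120 };

	for (size_t i = 0; i < sizeof(prios) / sizeof(prios[0]); i++)
		printf("prio %4d -> %s\n", prios[i], class_of(prios[i]));
	return 0;
}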
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 5a0f945927ac..2d5e49a2a6d2 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -39,6 +39,7 @@
 #define SCHED_BATCH		3
 /* SCHED_ISO: reserved but not implemented yet */
 #define SCHED_IDLE		5
+#define SCHED_DEADLINE		6
 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
 #define SCHED_RESET_ON_FORK	0x40000000

diff --git a/kernel/fork.c b/kernel/fork.c
index 6023d150a305..e6c0f1a22914 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1311,7 +1311,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #endif

 	/* Perform scheduler related setup. Assign this task to a CPU. */
-	sched_fork(clone_flags, p);
+	retval = sched_fork(clone_flags, p);
+	if (retval)
+		goto bad_fork_cleanup_policy;

 	retval = perf_event_init_task(p);
 	if (retval)
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 383319bae3f7..09094361dce5 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -46,6 +46,7 @@
 #include <linux/sched.h>
 #include <linux/sched/sysctl.h>
 #include <linux/sched/rt.h>
+#include <linux/sched/deadline.h>
 #include <linux/timer.h>
 #include <linux/freezer.h>

@@ -1610,7 +1611,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 	unsigned long slack;

 	slack = current->timer_slack_ns;
-	if (rt_task(current))
+	if (dl_task(current) || rt_task(current))
 		slack = 0;

 	hrtimer_init_on_stack(&t.timer, clockid, mode);
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7b621409cf15..b039035a9376 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,8 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif

-obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
+obj-y += core.o proc.o clock.o cputime.o
+obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
 obj-y += wait.o completion.o
 obj-$(CONFIG_SMP) += cpupri.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8174f889076c..203aecdcfccb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -899,7 +899,9 @@ static inline int normal_prio(struct task_struct *p)
 {
 	int prio;

-	if (task_has_rt_policy(p))
+	if (task_has_dl_policy(p))
+		prio = MAX_DL_PRIO-1;
+	else if (task_has_rt_policy(p))
 		prio = MAX_RT_PRIO-1 - p->rt_priority;
 	else
 		prio = __normal_prio(p);
@@ -1717,6 +1719,12 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif

+	RB_CLEAR_NODE(&p->dl.rb_node);
+	hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	p->dl.dl_runtime = p->dl.runtime = 0;
+	p->dl.dl_deadline = p->dl.deadline = 0;
+	p->dl.flags = 0;
+
 	INIT_LIST_HEAD(&p->rt.run_list);

 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1768,7 +1776,7 @@ void set_numabalancing_state(bool enabled)
 /*
  * fork()/clone()-time setup:
  */
-void sched_fork(unsigned long clone_flags, struct task_struct *p)
+int sched_fork(unsigned long clone_flags, struct task_struct *p)
 {
 	unsigned long flags;
 	int cpu = get_cpu();
@@ -1790,7 +1798,7 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
 	 * Revert to default priority/policy on fork if requested.
 	 */
 	if (unlikely(p->sched_reset_on_fork)) {
-		if (task_has_rt_policy(p)) {
+		if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
 			p->policy = SCHED_NORMAL;
 			p->static_prio = NICE_TO_PRIO(0);
 			p->rt_priority = 0;
@@ -1807,8 +1815,14 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
 		p->sched_reset_on_fork = 0;
 	}

-	if (!rt_prio(p->prio))
+	if (dl_prio(p->prio)) {
+		put_cpu();
+		return -EAGAIN;
+	} else if (rt_prio(p->prio)) {
+		p->sched_class = &rt_sched_class;
+	} else {
 		p->sched_class = &fair_sched_class;
+	}

 	if (p->sched_class->task_fork)
 		p->sched_class->task_fork(p);
@@ -1837,6 +1851,7 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
 #endif

 	put_cpu();
+	return 0;
 }

 /*
@@ -2768,7 +2783,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	struct rq *rq;
 	const struct sched_class *prev_class;

-	BUG_ON(prio < 0 || prio > MAX_PRIO);
+	BUG_ON(prio > MAX_PRIO);

 	rq = __task_rq_lock(p);

@@ -2800,7 +2815,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	if (running)
 		p->sched_class->put_prev_task(rq, p);

-	if (rt_prio(prio))
+	if (dl_prio(prio))
+		p->sched_class = &dl_sched_class;
+	else if (rt_prio(prio))
 		p->sched_class = &rt_sched_class;
 	else
 		p->sched_class = &fair_sched_class;
@@ -2835,9 +2852,9 @@ void set_user_nice(struct task_struct *p, long nice)
 	 * The RT priorities are set via sched_setscheduler(), but we still
 	 * allow the 'normal' nice value to be set - but as expected
 	 * it wont have any effect on scheduling until the task is
-	 * SCHED_FIFO/SCHED_RR:
+	 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
 	 */
-	if (task_has_rt_policy(p)) {
+	if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
@@ -2992,6 +3009,27 @@ static struct task_struct *find_process_by_pid(pid_t pid)
 	return pid ? find_task_by_vpid(pid) : current;
 }

+/*
+ * This function initializes the sched_dl_entity of a newly becoming
+ * SCHED_DEADLINE task.
+ *
+ * Only the static values are considered here, the actual runtime and the
+ * absolute deadline will be properly calculated when the task is enqueued
+ * for the first time with its new policy.
+ */
+static void
+__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
+{
+	struct sched_dl_entity *dl_se = &p->dl;
+
+	init_dl_task_timer(dl_se);
+	dl_se->dl_runtime = attr->sched_runtime;
+	dl_se->dl_deadline = attr->sched_deadline;
+	dl_se->flags = attr->sched_flags;
+	dl_se->dl_throttled = 0;
+	dl_se->dl_new = 1;
+}
+
 /* Actually do priority change: must hold pi & rq lock. */
 static void __setscheduler(struct rq *rq, struct task_struct *p,
 			   const struct sched_attr *attr)
@@ -3000,7 +3038,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,

 	p->policy = policy;

-	if (rt_policy(policy))
+	if (dl_policy(policy))
+		__setparam_dl(p, attr);
+	else if (rt_policy(policy))
 		p->rt_priority = attr->sched_priority;
 	else
 		p->static_prio = NICE_TO_PRIO(attr->sched_nice);
@@ -3008,13 +3048,39 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
 	p->normal_prio = normal_prio(p);
 	p->prio = rt_mutex_getprio(p);

-	if (rt_prio(p->prio))
+	if (dl_prio(p->prio))
+		p->sched_class = &dl_sched_class;
+	else if (rt_prio(p->prio))
 		p->sched_class = &rt_sched_class;
 	else
 		p->sched_class = &fair_sched_class;

 	set_load_weight(p);
 }
+
+static void
+__getparam_dl(struct task_struct *p, struct sched_attr *attr)
+{
+	struct sched_dl_entity *dl_se = &p->dl;
+
+	attr->sched_priority = p->rt_priority;
+	attr->sched_runtime = dl_se->dl_runtime;
+	attr->sched_deadline = dl_se->dl_deadline;
+	attr->sched_flags = dl_se->flags;
+}
+
+/*
+ * This function validates the new parameters of a -deadline task.
+ * We ask for the deadline not to be zero, and to be greater than
+ * or equal to the runtime.
+ */
+static bool
+__checkparam_dl(const struct sched_attr *attr)
+{
+	return attr && attr->sched_deadline != 0 &&
+		(s64)(attr->sched_deadline - attr->sched_runtime) >= 0;
+}
+
 /*
  * check the target process has a UID that matches the current process's
  */
@@ -3053,7 +3119,8 @@ recheck:
 		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
 		policy &= ~SCHED_RESET_ON_FORK;

-		if (policy != SCHED_FIFO && policy != SCHED_RR &&
+		if (policy != SCHED_DEADLINE &&
+		    policy != SCHED_FIFO && policy != SCHED_RR &&
 		    policy != SCHED_NORMAL && policy != SCHED_BATCH &&
 		    policy != SCHED_IDLE)
 			return -EINVAL;
@@ -3068,7 +3135,8 @@ recheck:
 	    (p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
 	    (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
 		return -EINVAL;
-	if (rt_policy(policy) != (attr->sched_priority != 0))
+	if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
+	    (rt_policy(policy) != (attr->sched_priority != 0)))
 		return -EINVAL;

 	/*
@@ -3143,6 +3211,8 @@ recheck:
 			goto change;
 		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
 			goto change;
+		if (dl_policy(policy))
+			goto change;

 		task_rq_unlock(rq, p, &flags);
 		return 0;
@@ -3453,6 +3523,10 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
 	if (retval)
 		goto out_unlock;

+	if (task_has_dl_policy(p)) {
+		retval = -EINVAL;
+		goto out_unlock;
+	}
 	lp.sched_priority = p->rt_priority;
 	rcu_read_unlock();

@@ -3510,7 +3584,7 @@ err_size:
 }

 /**
- * sys_sched_getattr - same as above, but with extended "sched_param"
+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
  * @pid: the pid in question.
  * @attr: structure containing the extended parameters.
  * @size: sizeof(attr) for fwd/bwd comp.
@@ -3539,7 +3613,9 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 		goto out_unlock;

 	attr.sched_policy = p->policy;
-	if (task_has_rt_policy(p))
+	if (task_has_dl_policy(p))
+		__getparam_dl(p, &attr);
+	else if (task_has_rt_policy(p))
 		attr.sched_priority = p->rt_priority;
 	else
 		attr.sched_nice = TASK_NICE(p);
@@ -3965,6 +4041,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
 	case SCHED_RR:
 		ret = MAX_USER_RT_PRIO-1;
 		break;
+	case SCHED_DEADLINE:
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
 	case SCHED_IDLE:
@@ -3991,6 +4068,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
 	case SCHED_RR:
 		ret = 1;
 		break;
+	case SCHED_DEADLINE:
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
 	case SCHED_IDLE:
@@ -6472,6 +6550,7 @@ void __init sched_init(void)
 		rq->calc_load_update = jiffies + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs);
 		init_rt_rq(&rq->rt, rq);
+		init_dl_rq(&rq->dl, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
 		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -6659,7 +6738,7 @@ void normalize_rt_tasks(void)
 		p->se.statistics.block_start = 0;
 #endif

-		if (!rt_task(p)) {
+		if (!dl_task(p) && !rt_task(p)) {
 			/*
 			 * Renice negative nice level userspace
 			 * tasks back to 0:
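The admission check introduced above (__checkparam_dl()) is small enough to restate as a self-contained sketch: a -deadline task is accepted only if its relative deadline is non-zero and not smaller than its runtime. The struct below is a stand-in carrying only the two fields the check uses; the numbers are illustrative.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct sched_attr_dl {
	uint64_t sched_runtime;
	uint64_t sched_deadline;
};

static bool checkparam_dl(const struct sched_attr_dl *attr)
{
	return attr && attr->sched_deadline != 0 &&
	       (int64_t)(attr->sched_deadline - attr->sched_runtime) >= 0;
}

int main(void)
{
	struct sched_attr_dl ok   = { 10000000, 100000000 };	/* 10 ms / 100 ms */
	struct sched_attr_dl zero = { 10000000, 0 };		/* no deadline */
	struct sched_attr_dl over = { 200000000, 100000000 };	/* runtime > deadline */

	printf("%d %d %d\n", checkparam_dl(&ok),
	       checkparam_dl(&zero), checkparam_dl(&over));	/* prints: 1 0 0 */
	return 0;
}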
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
new file mode 100644
index 000000000000..93d82b2a88bd
--- /dev/null
+++ b/kernel/sched/deadline.c
@@ -0,0 +1,684 @@
1/*
2 * Deadline Scheduling Class (SCHED_DEADLINE)
3 *
4 * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS).
5 *
6 * Tasks that periodically execute their instances for less than their
7 * runtime won't miss any of their deadlines.
8 * Tasks that are not periodic or sporadic or that try to execute more
9 * than their reserved bandwidth will be slowed down (and may potentially
10 * miss some of their deadlines), and won't affect any other task.
11 *
12 * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>,
13 * Michael Trimarchi <michael@amarulasolutions.com>,
14 * Fabio Checconi <fchecconi@gmail.com>
15 */
16#include "sched.h"
17
18static inline int dl_time_before(u64 a, u64 b)
19{
20 return (s64)(a - b) < 0;
21}
22
23static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
24{
25 return container_of(dl_se, struct task_struct, dl);
26}
27
28static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
29{
30 return container_of(dl_rq, struct rq, dl);
31}
32
33static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
34{
35 struct task_struct *p = dl_task_of(dl_se);
36 struct rq *rq = task_rq(p);
37
38 return &rq->dl;
39}
40
41static inline int on_dl_rq(struct sched_dl_entity *dl_se)
42{
43 return !RB_EMPTY_NODE(&dl_se->rb_node);
44}
45
46static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
47{
48 struct sched_dl_entity *dl_se = &p->dl;
49
50 return dl_rq->rb_leftmost == &dl_se->rb_node;
51}
52
53void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq)
54{
55 dl_rq->rb_root = RB_ROOT;
56}
57
58static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
59static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
60static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
61 int flags);
62
63/*
64 * We are being explicitly informed that a new instance is starting,
65 * and this means that:
66 * - the absolute deadline of the entity has to be placed at
67 * current time + relative deadline;
68 * - the runtime of the entity has to be set to the maximum value.
69 *
70 * The capability of specifying such event is useful whenever a -deadline
71 * entity wants to (try to!) synchronize its behaviour with the scheduler's
72 * one, and to (try to!) reconcile itself with its own scheduling
73 * parameters.
74 */
75static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
76{
77 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
78 struct rq *rq = rq_of_dl_rq(dl_rq);
79
80 WARN_ON(!dl_se->dl_new || dl_se->dl_throttled);
81
82 /*
83 * We use the regular wall clock time to set deadlines in the
84 * future; in fact, we must consider execution overheads (time
85 * spent on hardirq context, etc.).
86 */
87 dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
88 dl_se->runtime = dl_se->dl_runtime;
89 dl_se->dl_new = 0;
90}
91
92/*
93 * Pure Earliest Deadline First (EDF) scheduling does not deal with the
94 * possibility of an entity lasting more than what it declared, and thus
95 * exhausting its runtime.
96 *
97 * Here we are interested in making runtime overrun possible, but we do
98 * not want an entity which is misbehaving to affect the scheduling of all
99 * other entities.
100 * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS)
101 * is used, in order to confine each entity within its own bandwidth.
102 *
103 * This function deals exactly with that, and ensures that when the runtime
104 * of an entity is replenished, its deadline is also postponed. That ensures
105 * the overrunning entity can't interfere with other entities in the system and
106 * can't make them miss their deadlines. Reasons why this kind of overrun
107 * could happen are, typically, an entity voluntarily trying to exceed its
108 * runtime, or it just underestimated it during sched_setscheduler_ex().
109 */
110static void replenish_dl_entity(struct sched_dl_entity *dl_se)
111{
112 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
113 struct rq *rq = rq_of_dl_rq(dl_rq);
114
115 /*
116 * We keep moving the deadline away until we get some
117 * available runtime for the entity. This ensures correct
118 * handling of situations where the runtime overrun is
119 * arbitrarily large.
120 */
121 while (dl_se->runtime <= 0) {
122 dl_se->deadline += dl_se->dl_deadline;
123 dl_se->runtime += dl_se->dl_runtime;
124 }
125
126 /*
127 * At this point, the deadline really should be "in
128 * the future" with respect to rq->clock. If it's
129 * not, we are, for some reason, lagging too much!
130 * Anyway, after having warned userspace about that,
131 * we still try to keep the things running by
132 * resetting the deadline and the budget of the
133 * entity.
134 */
135 if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
136 static bool lag_once = false;
137
138 if (!lag_once) {
139 lag_once = true;
140 printk_sched("sched: DL replenish lagged too much\n");
141 }
142 dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
143 dl_se->runtime = dl_se->dl_runtime;
144 }
145}
146
147/*
148 * Here we check if --at time t-- an entity (which is probably being
149 * [re]activated or, in general, enqueued) can use its remaining runtime
150 * and its current deadline _without_ exceeding the bandwidth it is
151 * assigned (function returns true if it can't). We are in fact applying
152 * one of the CBS rules: when a task wakes up, if the residual runtime
153 * over residual deadline fits within the allocated bandwidth, then we
154 * can keep the current (absolute) deadline and residual budget without
155 * disrupting the schedulability of the system. Otherwise, we should
156 * refill the runtime and set the deadline a period in the future,
157 * because keeping the current (absolute) deadline of the task would
158 * result in breaking guarantees promised to other tasks.
159 *
160 * This function returns true if:
161 *
162 * runtime / (deadline - t) > dl_runtime / dl_deadline ,
163 *
164 * IOW we can't recycle current parameters.
165 */
166static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t)
167{
168 u64 left, right;
169
170 /*
171 * left and right are the two sides of the equation above,
172 * after a bit of shuffling to use multiplications instead
173 * of divisions.
174 *
175 * Note that none of the time values involved in the two
176 * multiplications are absolute: dl_deadline and dl_runtime
177 * are the relative deadline and the maximum runtime of each
178 * instance, runtime is the runtime left for the last instance
179 * and (deadline - t), since t is rq->clock, is the time left
180 * to the (absolute) deadline. Even if overflowing the u64 type
181 * is very unlikely to occur in both cases, here we scale down
182 * as we want to avoid that risk at all. Scaling down by 10
183 * means that we reduce granularity to 1us. We are fine with it,
184 * since this is only a true/false check and, anyway, thinking
185 * of anything below microseconds resolution is actually fiction
186 * (but still we want to give the user that illusion >;).
187 */
188 left = (dl_se->dl_deadline >> 10) * (dl_se->runtime >> 10);
189 right = ((dl_se->deadline - t) >> 10) * (dl_se->dl_runtime >> 10);
190
191 return dl_time_before(right, left);
192}
193
194/*
195 * When a -deadline entity is queued back on the runqueue, its runtime and
196 * deadline might need updating.
197 *
198 * The policy here is that we update the deadline of the entity only if:
199 * - the current deadline is in the past,
200 * - using the remaining runtime with the current deadline would make
201 * the entity exceed its bandwidth.
202 */
203static void update_dl_entity(struct sched_dl_entity *dl_se)
204{
205 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
206 struct rq *rq = rq_of_dl_rq(dl_rq);
207
208 /*
209 * The arrival of a new instance needs special treatment, i.e.,
210 * the actual scheduling parameters have to be "renewed".
211 */
212 if (dl_se->dl_new) {
213 setup_new_dl_entity(dl_se);
214 return;
215 }
216
217 if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
218 dl_entity_overflow(dl_se, rq_clock(rq))) {
219 dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
220 dl_se->runtime = dl_se->dl_runtime;
221 }
222}
223
224/*
225 * If the entity depleted all its runtime, and if we want it to sleep
226 * while waiting for some new execution time to become available, we
227 * set the bandwidth enforcement timer to the replenishment instant
228 * and try to activate it.
229 *
230 * Notice that it is important for the caller to know if the timer
231 * actually started or not (i.e., the replenishment instant is in
232 * the future or in the past).
233 */
234static int start_dl_timer(struct sched_dl_entity *dl_se)
235{
236 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
237 struct rq *rq = rq_of_dl_rq(dl_rq);
238 ktime_t now, act;
239 ktime_t soft, hard;
240 unsigned long range;
241 s64 delta;
242
243 /*
244 * We want the timer to fire at the deadline, but considering
245 * that it is actually coming from rq->clock and not from
246 * hrtimer's time base reading.
247 */
248 act = ns_to_ktime(dl_se->deadline);
249 now = hrtimer_cb_get_time(&dl_se->dl_timer);
250 delta = ktime_to_ns(now) - rq_clock(rq);
251 act = ktime_add_ns(act, delta);
252
253 /*
254 * If the expiry time already passed, e.g., because the value
255 * chosen as the deadline is too small, don't even try to
256 * start the timer in the past!
257 */
258 if (ktime_us_delta(act, now) < 0)
259 return 0;
260
261 hrtimer_set_expires(&dl_se->dl_timer, act);
262
263 soft = hrtimer_get_softexpires(&dl_se->dl_timer);
264 hard = hrtimer_get_expires(&dl_se->dl_timer);
265 range = ktime_to_ns(ktime_sub(hard, soft));
266 __hrtimer_start_range_ns(&dl_se->dl_timer, soft,
267 range, HRTIMER_MODE_ABS, 0);
268
269 return hrtimer_active(&dl_se->dl_timer);
270}
271
272/*
273 * This is the bandwidth enforcement timer callback. If here, we know
274 * a task is not on its dl_rq, since the fact that the timer was running
275 * means the task is throttled and needs a runtime replenishment.
276 *
277 * However, what we actually do depends on the fact the task is active,
278 * (it is on its rq) or has been removed from there by a call to
279 * dequeue_task_dl(). In the former case we must issue the runtime
280 * replenishment and add the task back to the dl_rq; in the latter, we just
281 * do nothing but clearing dl_throttled, so that runtime and deadline
282 * updating (and the queueing back to dl_rq) will be done by the
283 * next call to enqueue_task_dl().
284 */
285static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
286{
287 struct sched_dl_entity *dl_se = container_of(timer,
288 struct sched_dl_entity,
289 dl_timer);
290 struct task_struct *p = dl_task_of(dl_se);
291 struct rq *rq = task_rq(p);
292 raw_spin_lock(&rq->lock);
293
294 /*
295 * We need to take care of possible races here. In fact, the
296 * task might have changed its scheduling policy to something
297 * different from SCHED_DEADLINE or changed its reservation
298 * parameters (through sched_setscheduler()).
299 */
300 if (!dl_task(p) || dl_se->dl_new)
301 goto unlock;
302
303 sched_clock_tick();
304 update_rq_clock(rq);
305 dl_se->dl_throttled = 0;
306 if (p->on_rq) {
307 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
308 if (task_has_dl_policy(rq->curr))
309 check_preempt_curr_dl(rq, p, 0);
310 else
311 resched_task(rq->curr);
312 }
313unlock:
314 raw_spin_unlock(&rq->lock);
315
316 return HRTIMER_NORESTART;
317}
318
319void init_dl_task_timer(struct sched_dl_entity *dl_se)
320{
321 struct hrtimer *timer = &dl_se->dl_timer;
322
323 if (hrtimer_active(timer)) {
324 hrtimer_try_to_cancel(timer);
325 return;
326 }
327
328 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
329 timer->function = dl_task_timer;
330}
331
332static
333int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
334{
335 int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq));
336 int rorun = dl_se->runtime <= 0;
337
338 if (!rorun && !dmiss)
339 return 0;
340
341 /*
342 * If we are beyond our current deadline and we are still
343 * executing, then we have already used some of the runtime of
344 * the next instance. Thus, if we do not account that, we are
345 * stealing bandwidth from the system at each deadline miss!
346 */
347 if (dmiss) {
348 dl_se->runtime = rorun ? dl_se->runtime : 0;
349 dl_se->runtime -= rq_clock(rq) - dl_se->deadline;
350 }
351
352 return 1;
353}
354
355/*
356 * Update the current task's runtime statistics (provided it is still
357 * a -deadline task and has not been removed from the dl_rq).
358 */
359static void update_curr_dl(struct rq *rq)
360{
361 struct task_struct *curr = rq->curr;
362 struct sched_dl_entity *dl_se = &curr->dl;
363 u64 delta_exec;
364
365 if (!dl_task(curr) || !on_dl_rq(dl_se))
366 return;
367
368 /*
369 * Consumed budget is computed considering the time as
370 * observed by schedulable tasks (excluding time spent
371 * in hardirq context, etc.). Deadlines are instead
372 * computed using hard walltime. This seems to be the more
373 * natural solution, but the full ramifications of this
374 * approach need further study.
375 */
376 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
377 if (unlikely((s64)delta_exec < 0))
378 delta_exec = 0;
379
380 schedstat_set(curr->se.statistics.exec_max,
381 max(curr->se.statistics.exec_max, delta_exec));
382
383 curr->se.sum_exec_runtime += delta_exec;
384 account_group_exec_runtime(curr, delta_exec);
385
386 curr->se.exec_start = rq_clock_task(rq);
387 cpuacct_charge(curr, delta_exec);
388
389 dl_se->runtime -= delta_exec;
390 if (dl_runtime_exceeded(rq, dl_se)) {
391 __dequeue_task_dl(rq, curr, 0);
392 if (likely(start_dl_timer(dl_se)))
393 dl_se->dl_throttled = 1;
394 else
395 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
396
397 if (!is_leftmost(curr, &rq->dl))
398 resched_task(curr);
399 }
400}
401
402static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
403{
404 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
405 struct rb_node **link = &dl_rq->rb_root.rb_node;
406 struct rb_node *parent = NULL;
407 struct sched_dl_entity *entry;
408 int leftmost = 1;
409
410 BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
411
412 while (*link) {
413 parent = *link;
414 entry = rb_entry(parent, struct sched_dl_entity, rb_node);
415 if (dl_time_before(dl_se->deadline, entry->deadline))
416 link = &parent->rb_left;
417 else {
418 link = &parent->rb_right;
419 leftmost = 0;
420 }
421 }
422
423 if (leftmost)
424 dl_rq->rb_leftmost = &dl_se->rb_node;
425
426 rb_link_node(&dl_se->rb_node, parent, link);
427 rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root);
428
429 dl_rq->dl_nr_running++;
430}
431
432static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
433{
434 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
435
436 if (RB_EMPTY_NODE(&dl_se->rb_node))
437 return;
438
439 if (dl_rq->rb_leftmost == &dl_se->rb_node) {
440 struct rb_node *next_node;
441
442 next_node = rb_next(&dl_se->rb_node);
443 dl_rq->rb_leftmost = next_node;
444 }
445
446 rb_erase(&dl_se->rb_node, &dl_rq->rb_root);
447 RB_CLEAR_NODE(&dl_se->rb_node);
448
449 dl_rq->dl_nr_running--;
450}
451
452static void
453enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
454{
455 BUG_ON(on_dl_rq(dl_se));
456
457 /*
458 * If this is a wakeup or a new instance, the scheduling
459 * parameters of the task might need updating. Otherwise,
460 * we want a replenishment of its runtime.
461 */
462 if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH)
463 replenish_dl_entity(dl_se);
464 else
465 update_dl_entity(dl_se);
466
467 __enqueue_dl_entity(dl_se);
468}
469
470static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
471{
472 __dequeue_dl_entity(dl_se);
473}
474
475static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
476{
477 /*
478 * If p is throttled, we do nothing. In fact, if it exhausted
479 * its budget it needs a replenishment and, since it now is on
480 * its rq, the bandwidth timer callback (which clearly has not
481 * run yet) will take care of this.
482 */
483 if (p->dl.dl_throttled)
484 return;
485
486 enqueue_dl_entity(&p->dl, flags);
487 inc_nr_running(rq);
488}
489
490static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
491{
492 dequeue_dl_entity(&p->dl);
493}
494
495static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
496{
497 update_curr_dl(rq);
498 __dequeue_task_dl(rq, p, flags);
499
500 dec_nr_running(rq);
501}
502
503/*
504 * Yield task semantic for -deadline tasks is:
505 *
506 * get off from the CPU until our next instance, with
507 * a new runtime. This is of little use now, since we
508 * don't have a bandwidth reclaiming mechanism. Anyway,
509 * bandwidth reclaiming is planned for the future, and
510 * yield_task_dl will indicate that some spare budget
511 * is available for other task instances to use it.
512 */
513static void yield_task_dl(struct rq *rq)
514{
515 struct task_struct *p = rq->curr;
516
517 /*
518 * We make the task go to sleep until its current deadline by
519 * forcing its runtime to zero. This way, update_curr_dl() stops
520 * it and the bandwidth timer will wake it up and will give it
521 * new scheduling parameters (thanks to dl_new=1).
522 */
523 if (p->dl.runtime > 0) {
524 rq->curr->dl.dl_new = 1;
525 p->dl.runtime = 0;
526 }
527 update_curr_dl(rq);
528}
529
530/*
531 * Only called when both the current and waking task are -deadline
532 * tasks.
533 */
534static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
535 int flags)
536{
537 if (dl_time_before(p->dl.deadline, rq->curr->dl.deadline))
538 resched_task(rq->curr);
539}
540
541#ifdef CONFIG_SCHED_HRTICK
542static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
543{
544 s64 delta = p->dl.dl_runtime - p->dl.runtime;
545
546 if (delta > 10000)
547 hrtick_start(rq, p->dl.runtime);
548}
549#endif
550
551static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
552 struct dl_rq *dl_rq)
553{
554 struct rb_node *left = dl_rq->rb_leftmost;
555
556 if (!left)
557 return NULL;
558
559 return rb_entry(left, struct sched_dl_entity, rb_node);
560}
561
562struct task_struct *pick_next_task_dl(struct rq *rq)
563{
564 struct sched_dl_entity *dl_se;
565 struct task_struct *p;
566 struct dl_rq *dl_rq;
567
568 dl_rq = &rq->dl;
569
570 if (unlikely(!dl_rq->dl_nr_running))
571 return NULL;
572
573 dl_se = pick_next_dl_entity(rq, dl_rq);
574 BUG_ON(!dl_se);
575
576 p = dl_task_of(dl_se);
577 p->se.exec_start = rq_clock_task(rq);
578#ifdef CONFIG_SCHED_HRTICK
579 if (hrtick_enabled(rq))
580 start_hrtick_dl(rq, p);
581#endif
582 return p;
583}
584
585static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
586{
587 update_curr_dl(rq);
588}
589
590static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
591{
592 update_curr_dl(rq);
593
594#ifdef CONFIG_SCHED_HRTICK
595 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
596 start_hrtick_dl(rq, p);
597#endif
598}
599
600static void task_fork_dl(struct task_struct *p)
601{
602 /*
603 * SCHED_DEADLINE tasks cannot fork and this is achieved through
604 * sched_fork()
605 */
606}
607
608static void task_dead_dl(struct task_struct *p)
609{
610 struct hrtimer *timer = &p->dl.dl_timer;
611
612 if (hrtimer_active(timer))
613 hrtimer_try_to_cancel(timer);
614}
615
616static void set_curr_task_dl(struct rq *rq)
617{
618 struct task_struct *p = rq->curr;
619
620 p->se.exec_start = rq_clock_task(rq);
621}
622
623static void switched_from_dl(struct rq *rq, struct task_struct *p)
624{
625 if (hrtimer_active(&p->dl.dl_timer))
626 hrtimer_try_to_cancel(&p->dl.dl_timer);
627}
628
629static void switched_to_dl(struct rq *rq, struct task_struct *p)
630{
631 /*
632 * If p is throttled, don't consider the possibility
633 * of preempting rq->curr, the check will be done right
634 * after its runtime will get replenished.
635 */
636 if (unlikely(p->dl.dl_throttled))
637 return;
638
639 if (p->on_rq || rq->curr != p) {
640 if (task_has_dl_policy(rq->curr))
641 check_preempt_curr_dl(rq, p, 0);
642 else
643 resched_task(rq->curr);
644 }
645}
646
647static void prio_changed_dl(struct rq *rq, struct task_struct *p,
648 int oldprio)
649{
650 switched_to_dl(rq, p);
651}
652
653#ifdef CONFIG_SMP
654static int
655select_task_rq_dl(struct task_struct *p, int prev_cpu, int sd_flag, int flags)
656{
657 return task_cpu(p);
658}
659#endif
660
661const struct sched_class dl_sched_class = {
662 .next = &rt_sched_class,
663 .enqueue_task = enqueue_task_dl,
664 .dequeue_task = dequeue_task_dl,
665 .yield_task = yield_task_dl,
666
667 .check_preempt_curr = check_preempt_curr_dl,
668
669 .pick_next_task = pick_next_task_dl,
670 .put_prev_task = put_prev_task_dl,
671
672#ifdef CONFIG_SMP
673 .select_task_rq = select_task_rq_dl,
674#endif
675
676 .set_curr_task = set_curr_task_dl,
677 .task_tick = task_tick_dl,
678 .task_fork = task_fork_dl,
679 .task_dead = task_dead_dl,
680
681 .prio_changed = prio_changed_dl,
682 .switched_from = switched_from_dl,
683 .switched_to = switched_to_dl,
684};
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index df023db7721c..83eb5390f753 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2,6 +2,7 @@
 #include <linux/sched.h>
 #include <linux/sched/sysctl.h>
 #include <linux/sched/rt.h>
+#include <linux/sched/deadline.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
 #include <linux/stop_machine.h>
@@ -91,11 +92,21 @@ static inline int rt_policy(int policy)
 	return policy == SCHED_FIFO || policy == SCHED_RR;
 }

+static inline int dl_policy(int policy)
+{
+	return policy == SCHED_DEADLINE;
+}
+
 static inline int task_has_rt_policy(struct task_struct *p)
 {
 	return rt_policy(p->policy);
 }

+static inline int task_has_dl_policy(struct task_struct *p)
+{
+	return dl_policy(p->policy);
+}
+
 /*
  * This is the priority-queue data structure of the RT scheduling class:
  */
@@ -367,6 +378,15 @@ struct rt_rq {
 #endif
 };

+/* Deadline class' related fields in a runqueue */
+struct dl_rq {
+	/* runqueue is an rbtree, ordered by deadline */
+	struct rb_root rb_root;
+	struct rb_node *rb_leftmost;
+
+	unsigned long dl_nr_running;
+};
+
 #ifdef CONFIG_SMP

 /*
@@ -435,6 +455,7 @@ struct rq {

 	struct cfs_rq cfs;
 	struct rt_rq rt;
+	struct dl_rq dl;

 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
@@ -991,6 +1012,7 @@ static const u32 prio_to_wmult[40] = {
 #else
 #define ENQUEUE_WAKING		0
 #endif
+#define ENQUEUE_REPLENISH	8

 #define DEQUEUE_SLEEP		1

@@ -1046,6 +1068,7 @@ struct sched_class {
 	for (class = sched_class_highest; class; class = class->next)

 extern const struct sched_class stop_sched_class;
+extern const struct sched_class dl_sched_class;
 extern const struct sched_class rt_sched_class;
 extern const struct sched_class fair_sched_class;
 extern const struct sched_class idle_sched_class;
@@ -1081,6 +1104,8 @@ extern void resched_cpu(int cpu);
 extern struct rt_bandwidth def_rt_bandwidth;
 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);

+extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
+
 extern void update_idle_cpu_load(struct rq *this_rq);

 extern void init_task_runnable_average(struct task_struct *p);
@@ -1357,6 +1382,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu);

 extern void init_cfs_rq(struct cfs_rq *cfs_rq);
 extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
+extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);

 extern void cfs_bandwidth_usage_inc(void);
 extern void cfs_bandwidth_usage_dec(void);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 47197de8abd9..fdb6bb0b3356 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -103,7 +103,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
  * Simple, special scheduling class for the per-CPU stop tasks:
  */
 const struct sched_class stop_sched_class = {
-	.next			= &rt_sched_class,
+	.next			= &dl_sched_class,

 	.enqueue_task		= enqueue_task_stop,
 	.dequeue_task		= dequeue_task_stop,
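Taken together with the dl_sched_class definition in kernel/sched/deadline.c (.next = &rt_sched_class), this last change makes the class picking order stop -> dl -> rt -> fair -> idle. A simplified sketch of that chain, using stand-in structs rather than the kernel definitions:

#include <stdio.h>

struct sched_class {
	const char *name;
	const struct sched_class *next;
};

static const struct sched_class idle_sched_class = { "idle", NULL };
static const struct sched_class fair_sched_class = { "fair", &idle_sched_class };
static const struct sched_class rt_sched_class   = { "rt",   &fair_sched_class };
static const struct sched_class dl_sched_class   = { "dl",   &rt_sched_class };
static const struct sched_class stop_sched_class = { "stop", &dl_sched_class };

int main(void)
{
	/* Walk the chain the way the scheduler iterates its classes. */
	for (const struct sched_class *c = &stop_sched_class; c; c = c->next)
		printf("%s%s", c->name, c->next ? " -> " : "\n");
	return 0;
}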