author		Dario Faggioli <raistlin@linux.it>	2013-11-07 08:43:36 -0500
committer	Ingo Molnar <mingo@kernel.org>	2014-01-13 07:41:04 -0500
commit		d50dde5a10f305253cbc3855307f608f8a3c5f73 (patch)
tree		940022e0216611f198d9a00f1cb3bfc59b2014d8 /kernel/sched/core.c
parent		56b4811039174bba9cbd68318d0d8b1585b9eded (diff)
sched: Add new scheduler syscalls to support an extended scheduling parameters ABI
Add the syscalls needed to support scheduling algorithms
with extended scheduling parameters (e.g., SCHED_DEADLINE).
In general, this makes it possible to specify a periodic/sporadic task
that executes for a given amount of runtime at each instance and is
scheduled according to the urgency of its own timing constraints,
i.e.:
- a (maximum/typical) instance execution time,
- a minimum interval between consecutive instances,
- a time constraint by which each instance must be completed.
Thus, both the data structure that holds the scheduling parameters of
the tasks and the system calls dealing with it must be extended.
Unfortunately, modifying the existing struct sched_param would break
the ABI and result in potentially serious compatibility issues with
legacy binaries.
For these reasons, this patch:
- defines the new struct sched_attr, containing all the fields
that are necessary for specifying a task in the computational
model described above;
- defines and implements the new scheduling-related syscalls that
manipulate it, i.e., sched_setattr() and sched_getattr().
The syscalls are introduced for x86 (32 and 64 bit) and ARM only, as a
proof of concept and for development and testing purposes. Making them
available on other architectures is straightforward.
Since no "user" for these new parameters is introduced in this patch,
the implementation of the new system calls is just identical to their
already existing counterpart. Future patches that implement scheduling
policies able to exploit the new data structure must also take care of
modifying the sched_*attr() calls accordingly with their own purposes.
Signed-off-by: Dario Faggioli <raistlin@linux.it>
[ Rewrote to use sched_attr. ]
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
[ Removed sched_setscheduler2() for now. ]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-3-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--	kernel/sched/core.c	263
1 file changed, 243 insertions, 20 deletions
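The hunks below convert __sched_setscheduler() to take the new parameter block and export a kernel-side sched_setattr() next to sched_setscheduler(). As a rough, illustrative sketch of how in-kernel callers are expected to use the new helper (the wrapper name, the task pointer and the SCHED_FIFO/50 values are made up for illustration and are not part of this patch):

/*
 * Illustrative only: shows the calling convention of the sched_setattr()
 * helper exported by this patch. The policy/priority values are arbitrary.
 */
static int make_task_fifo(struct task_struct *p)
{
	struct sched_attr attr = {
		.sched_policy   = SCHED_FIFO,
		.sched_priority = 50,
	};

	return sched_setattr(p, &attr);
}

sched_setscheduler() itself becomes exactly this kind of thin wrapper in the diff below, packing its legacy policy/sched_param arguments into an on-stack struct sched_attr before calling __sched_setscheduler().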
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b21a63ed5d62..8174f889076c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2817,6 +2817,7 @@ out_unlock:
 	__task_rq_unlock(rq);
 }
 #endif
+
 void set_user_nice(struct task_struct *p, long nice)
 {
 	int old_prio, delta, on_rq;
@@ -2991,22 +2992,29 @@ static struct task_struct *find_process_by_pid(pid_t pid)
 	return pid ? find_task_by_vpid(pid) : current;
 }
 
-/* Actually do priority change: must hold rq lock. */
-static void
-__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
+/* Actually do priority change: must hold pi & rq lock. */
+static void __setscheduler(struct rq *rq, struct task_struct *p,
+			   const struct sched_attr *attr)
 {
+	int policy = attr->sched_policy;
+
 	p->policy = policy;
-	p->rt_priority = prio;
+
+	if (rt_policy(policy))
+		p->rt_priority = attr->sched_priority;
+	else
+		p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+
 	p->normal_prio = normal_prio(p);
-	/* we are holding p->pi_lock already */
 	p->prio = rt_mutex_getprio(p);
+
 	if (rt_prio(p->prio))
 		p->sched_class = &rt_sched_class;
 	else
 		p->sched_class = &fair_sched_class;
+
 	set_load_weight(p);
 }
-
 /*
  * check the target process has a UID that matches the current process's
  */
@@ -3023,10 +3031,12 @@ static bool check_same_owner(struct task_struct *p)
 	return match;
 }
 
-static int __sched_setscheduler(struct task_struct *p, int policy,
-				const struct sched_param *param, bool user)
+static int __sched_setscheduler(struct task_struct *p,
+				const struct sched_attr *attr,
+				bool user)
 {
 	int retval, oldprio, oldpolicy = -1, on_rq, running;
+	int policy = attr->sched_policy;
 	unsigned long flags;
 	const struct sched_class *prev_class;
 	struct rq *rq;
@@ -3054,17 +3064,22 @@ recheck:
 	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
 	 * SCHED_BATCH and SCHED_IDLE is 0.
 	 */
-	if (param->sched_priority < 0 ||
-	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
-	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
+	if (attr->sched_priority < 0 ||
+	    (p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
+	    (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
 		return -EINVAL;
-	if (rt_policy(policy) != (param->sched_priority != 0))
+	if (rt_policy(policy) != (attr->sched_priority != 0))
 		return -EINVAL;
 
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
 	if (user && !capable(CAP_SYS_NICE)) {
+		if (fair_policy(policy)) {
+			if (!can_nice(p, attr->sched_nice))
+				return -EPERM;
+		}
+
 		if (rt_policy(policy)) {
 			unsigned long rlim_rtprio =
 					task_rlimit(p, RLIMIT_RTPRIO);
@@ -3074,8 +3089,8 @@ recheck:
 				return -EPERM;
 
 			/* can't increase priority */
-			if (param->sched_priority > p->rt_priority &&
-			    param->sched_priority > rlim_rtprio)
+			if (attr->sched_priority > p->rt_priority &&
+			    attr->sched_priority > rlim_rtprio)
 				return -EPERM;
 		}
 
@@ -3123,11 +3138,16 @@ recheck:
 	/*
 	 * If not changing anything there's no need to proceed further:
 	 */
-	if (unlikely(policy == p->policy && (!rt_policy(policy) ||
-			param->sched_priority == p->rt_priority))) {
+	if (unlikely(policy == p->policy)) {
+		if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
+			goto change;
+		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
+			goto change;
+
 		task_rq_unlock(rq, p, &flags);
 		return 0;
 	}
+change:
 
 #ifdef CONFIG_RT_GROUP_SCHED
 	if (user) {
@@ -3161,7 +3181,7 @@ recheck:
 
 	oldprio = p->prio;
 	prev_class = p->sched_class;
-	__setscheduler(rq, p, policy, param->sched_priority);
+	__setscheduler(rq, p, attr);
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
@@ -3189,10 +3209,20 @@ recheck:
 int sched_setscheduler(struct task_struct *p, int policy,
 		       const struct sched_param *param)
 {
-	return __sched_setscheduler(p, policy, param, true);
+	struct sched_attr attr = {
+		.sched_policy = policy,
+		.sched_priority = param->sched_priority
+	};
+	return __sched_setscheduler(p, &attr, true);
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
 
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+	return __sched_setscheduler(p, attr, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr);
+
 /**
  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
  * @p: the task in question.
@@ -3209,7 +3239,11 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
 			       const struct sched_param *param)
 {
-	return __sched_setscheduler(p, policy, param, false);
+	struct sched_attr attr = {
+		.sched_policy = policy,
+		.sched_priority = param->sched_priority
+	};
+	return __sched_setscheduler(p, &attr, false);
 }
 
 static int
@@ -3234,6 +3268,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 	return retval;
 }
 
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr,
+			   struct sched_attr *attr)
+{
+	u32 size;
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+		return -EFAULT;
+
+	/*
+	 * zero the full structure, so that a short copy will be nice.
+	 */
+	memset(attr, 0, sizeof(*attr));
+
+	ret = get_user(size, &uattr->size);
+	if (ret)
+		return ret;
+
+	if (size > PAGE_SIZE)	/* silly large */
+		goto err_size;
+
+	if (!size)		/* abi compat */
+		size = SCHED_ATTR_SIZE_VER0;
+
+	if (size < SCHED_ATTR_SIZE_VER0)
+		goto err_size;
+
+	/*
+	 * If we're handed a bigger struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. new
+	 * user-space does not rely on any kernel feature
+	 * extensions we dont know about yet.
+	 */
+	if (size > sizeof(*attr)) {
+		unsigned char __user *addr;
+		unsigned char __user *end;
+		unsigned char val;
+
+		addr = (void __user *)uattr + sizeof(*attr);
+		end  = (void __user *)uattr + size;
+
+		for (; addr < end; addr++) {
+			ret = get_user(val, addr);
+			if (ret)
+				return ret;
+			if (val)
+				goto err_size;
+		}
+		size = sizeof(*attr);
+	}
+
+	ret = copy_from_user(attr, uattr, size);
+	if (ret)
+		return -EFAULT;
+
+	/*
+	 * XXX: do we want to be lenient like existing syscalls; or do we want
+	 * to be strict and return an error on out-of-bounds values?
+	 */
+	attr->sched_nice = clamp(attr->sched_nice, -20, 19);
+
+out:
+	return ret;
+
+err_size:
+	put_user(sizeof(*attr), &uattr->size);
+	ret = -E2BIG;
+	goto out;
+}
+
 /**
  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
  * @pid: the pid in question.
@@ -3265,6 +3372,33 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
 }
 
 /**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @attr: structure containing the extended parameters.
+ */
+SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr)
+{
+	struct sched_attr attr;
+	struct task_struct *p;
+	int retval;
+
+	if (!uattr || pid < 0)
+		return -EINVAL;
+
+	if (sched_copy_attr(uattr, &attr))
+		return -EFAULT;
+
+	rcu_read_lock();
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (p != NULL)
+		retval = sched_setattr(p, &attr);
+	rcu_read_unlock();
+
+	return retval;
+}
+
+/**
  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
  * @pid: the pid in question.
  *
@@ -3334,6 +3468,92 @@ out_unlock:
 	return retval;
 }
 
+static int sched_read_attr(struct sched_attr __user *uattr,
+			   struct sched_attr *attr,
+			   unsigned int usize)
+{
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, uattr, usize))
+		return -EFAULT;
+
+	/*
+	 * If we're handed a smaller struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. old
+	 * user-space does not get uncomplete information.
+	 */
+	if (usize < sizeof(*attr)) {
+		unsigned char *addr;
+		unsigned char *end;
+
+		addr = (void *)attr + usize;
+		end  = (void *)attr + sizeof(*attr);
+
+		for (; addr < end; addr++) {
+			if (*addr)
+				goto err_size;
+		}
+
+		attr->size = usize;
+	}
+
+	ret = copy_to_user(uattr, attr, usize);
+	if (ret)
+		return -EFAULT;
+
+out:
+	return ret;
+
+err_size:
+	ret = -E2BIG;
+	goto out;
+}
+
+/**
+ * sys_sched_getattr - same as above, but with extended "sched_param"
+ * @pid: the pid in question.
+ * @attr: structure containing the extended parameters.
+ * @size: sizeof(attr) for fwd/bwd comp.
+ */
+SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+		unsigned int, size)
+{
+	struct sched_attr attr = {
+		.size = sizeof(struct sched_attr),
+	};
+	struct task_struct *p;
+	int retval;
+
+	if (!uattr || pid < 0 || size > PAGE_SIZE ||
+	    size < SCHED_ATTR_SIZE_VER0)
+		return -EINVAL;
+
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	retval = -ESRCH;
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	attr.sched_policy = p->policy;
+	if (task_has_rt_policy(p))
+		attr.sched_priority = p->rt_priority;
+	else
+		attr.sched_nice = TASK_NICE(p);
+
+	rcu_read_unlock();
+
+	retval = sched_read_attr(uattr, &attr, size);
+	return retval;
+
+out_unlock:
+	rcu_read_unlock();
+	return retval;
+}
+
 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 {
 	cpumask_var_t cpus_allowed, new_mask;
@@ -6400,13 +6620,16 @@ EXPORT_SYMBOL(__might_sleep);
 static void normalize_task(struct rq *rq, struct task_struct *p)
 {
 	const struct sched_class *prev_class = p->sched_class;
+	struct sched_attr attr = {
+		.sched_policy = SCHED_NORMAL,
+	};
 	int old_prio = p->prio;
 	int on_rq;
 
 	on_rq = p->on_rq;
 	if (on_rq)
 		dequeue_task(rq, p, 0);
-	__setscheduler(rq, p, SCHED_NORMAL, 0);
+	__setscheduler(rq, p, &attr);
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
 		resched_task(rq->curr);
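For illustration, userspace would reach the two new syscalls roughly as follows. This sketch is not part of the patch: there is no glibc wrapper, the __NR_sched_setattr/__NR_sched_getattr numbers come from the arch changes elsewhere in this series, and the struct sched_attr layout below is an assumption; only the size, sched_policy, sched_nice and sched_priority fields are visible in kernel/sched/core.c.

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Assumed layout; the authoritative definition lives in the header changes
 * of this series. The size field drives the fwd/bwd compatibility checks. */
struct sched_attr {
	uint32_t size;			/* must be set by the caller */
	uint32_t sched_policy;
	int32_t  sched_nice;		/* used by the fair policies */
	uint32_t sched_priority;	/* used by the RT policies */
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);

	/* pid 0 means the calling task (see find_process_by_pid() above). */
	if (syscall(__NR_sched_getattr, 0, &attr, sizeof(attr)) == -1) {
		perror("sched_getattr");
		return 1;
	}
	printf("policy=%u nice=%d rt_priority=%u\n",
	       attr.sched_policy, attr.sched_nice, attr.sched_priority);

	attr.sched_policy = 0;	/* SCHED_NORMAL */
	attr.sched_nice = 5;
	if (syscall(__NR_sched_setattr, 0, &attr) == -1) {
		perror("sched_setattr");
		return 1;
	}
	return 0;
}

The size handshake is the interesting part: the caller always fills attr.size; sched_read_attr() returns -E2BIG rather than silently truncating when an older (smaller) userspace structure would lose non-zero fields, and sched_copy_attr() likewise rejects a larger-than-known structure whose extra bytes are not all zero.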