author     Dario Faggioli <raistlin@linux.it>    2013-11-07 08:43:36 -0500
committer  Ingo Molnar <mingo@kernel.org>        2014-01-13 07:41:04 -0500
commit     d50dde5a10f305253cbc3855307f608f8a3c5f73
tree       940022e0216611f198d9a00f1cb3bfc59b2014d8
parent     56b4811039174bba9cbd68318d0d8b1585b9eded
sched: Add new scheduler syscalls to support an extended scheduling parameters ABI
Add the syscalls needed for supporting scheduling algorithms with
extended scheduling parameters (e.g., SCHED_DEADLINE).

In general, these make it possible to specify a periodic/sporadic task
that executes for a given amount of runtime at each instance and is
scheduled according to the urgency of its own timing constraints, i.e.:

 - a (maximum/typical) instance execution time,
 - a minimum interval between consecutive instances,
 - a time constraint by which each instance must be completed.

Thus, both the data structure that holds the scheduling parameters of
such tasks and the system calls that deal with it must be extended.
Unfortunately, modifying the existing struct sched_param would break
the ABI and result in potentially serious compatibility issues with
legacy binaries.

For these reasons, this patch:

 - defines the new struct sched_attr, containing all the fields
   that are necessary for specifying a task in the computational
   model described above;

 - defines and implements the new scheduling-related syscalls that
   manipulate it, i.e., sched_setattr() and sched_getattr().

Syscalls are introduced for x86 (32 and 64 bits) and ARM only, as a
proof of concept and for development and testing purposes. Making them
available on other architectures is straightforward.

Since no "user" of these new parameters is introduced in this patch,
the implementation of the new system calls is identical to that of
their already existing counterparts. Future patches that implement
scheduling policies able to exploit the new data structure must also
take care of modifying the sched_*attr() calls according to their own
purposes.

Signed-off-by: Dario Faggioli <raistlin@linux.it>
[ Rewrote to use sched_attr. ]
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
[ Removed sched_setscheduler2() for now. ]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-3-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
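For orientation, a minimal user-space sketch of the new ABI follows.
It is illustrative only and not part of this patch: it assumes the
x86-64 syscall number (314) added by the table below, and declares a
local mirror of struct sched_attr since libc ships no wrapper. Raising
a task to SCHED_FIFO also needs CAP_SYS_NICE or a suitable
RLIMIT_RTPRIO.

    /* Illustrative sketch only -- not part of this patch. */
    #include <stdint.h>
    #include <stdio.h>
    #include <sched.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    /* local mirror of the kernel's struct sched_attr (48 bytes, VER0) */
    struct sched_attr {
            uint32_t size, sched_policy;
            uint64_t sched_flags;
            int32_t  sched_nice;            /* SCHED_NORMAL, SCHED_BATCH */
            uint32_t sched_priority;        /* SCHED_FIFO, SCHED_RR */
            uint64_t sched_runtime, sched_deadline, sched_period;
    };

    int main(void)
    {
            struct sched_attr attr = {
                    .size           = sizeof(attr), /* == SCHED_ATTR_SIZE_VER0 */
                    .sched_policy   = SCHED_FIFO,
                    .sched_priority = 10,
            };

            /* 314 == __NR_sched_setattr on x86-64; pid 0 means the caller */
            if (syscall(314, 0, &attr)) {
                    perror("sched_setattr");
                    return 1;
            }
            puts("running as SCHED_FIFO, priority 10");
            return 0;
    }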
-rw-r--r--  arch/arm/include/asm/unistd.h        2
-rw-r--r--  arch/arm/include/uapi/asm/unistd.h   2
-rw-r--r--  arch/arm/kernel/calls.S              2
-rw-r--r--  arch/x86/syscalls/syscall_32.tbl     2
-rw-r--r--  arch/x86/syscalls/syscall_64.tbl     2
-rw-r--r--  include/linux/sched.h               62
-rw-r--r--  include/linux/syscalls.h             6
-rw-r--r--  kernel/sched/core.c                263
-rw-r--r--  kernel/sched/sched.h                 9
9 files changed, 326 insertions(+), 24 deletions(-)
diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h
index 141baa3f9a72..acabef1a75df 100644
--- a/arch/arm/include/asm/unistd.h
+++ b/arch/arm/include/asm/unistd.h
@@ -15,7 +15,7 @@
 
 #include <uapi/asm/unistd.h>
 
-#define __NR_syscalls		(380)
+#define __NR_syscalls		(384)
 #define __ARM_NR_cmpxchg	(__ARM_NR_BASE+0x00fff0)
 
 #define __ARCH_WANT_STAT64
diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h
index af33b44990ed..fb5584d0cc05 100644
--- a/arch/arm/include/uapi/asm/unistd.h
+++ b/arch/arm/include/uapi/asm/unistd.h
@@ -406,6 +406,8 @@
 #define __NR_process_vm_writev		(__NR_SYSCALL_BASE+377)
 #define __NR_kcmp			(__NR_SYSCALL_BASE+378)
 #define __NR_finit_module		(__NR_SYSCALL_BASE+379)
+#define __NR_sched_setattr		(__NR_SYSCALL_BASE+380)
+#define __NR_sched_getattr		(__NR_SYSCALL_BASE+381)
 
 /*
  * This may need to be greater than __NR_last_syscall+1 in order to
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
index c6ca7e376773..166e945de832 100644
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -389,6 +389,8 @@
 		CALL(sys_process_vm_writev)
 		CALL(sys_kcmp)
 		CALL(sys_finit_module)
+/* 380 */	CALL(sys_sched_setattr)
+		CALL(sys_sched_getattr)
 #ifndef syscalls_counted
 .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
 #define syscalls_counted
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index aabfb8380a1c..96bc506ac6de 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -357,3 +357,5 @@
 348	i386	process_vm_writev	sys_process_vm_writev	compat_sys_process_vm_writev
 349	i386	kcmp			sys_kcmp
 350	i386	finit_module		sys_finit_module
+351	i386	sched_setattr		sys_sched_setattr
+352	i386	sched_getattr		sys_sched_getattr
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 38ae65dfd14f..a12bddc7ccea 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -320,6 +320,8 @@
 311	64	process_vm_writev	sys_process_vm_writev
 312	common	kcmp			sys_kcmp
 313	common	finit_module		sys_finit_module
+314	common	sched_setattr		sys_sched_setattr
+315	common	sched_getattr		sys_sched_getattr
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3a1e9857b393..86025b6c6387 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -56,6 +56,66 @@ struct sched_param {
 
 #include <asm/processor.h>
 
+#define SCHED_ATTR_SIZE_VER0	48	/* sizeof first published struct */
+
+/*
+ * Extended scheduling parameters data structure.
+ *
+ * This is needed because the original struct sched_param cannot be
+ * altered without introducing ABI issues with legacy applications
+ * (e.g., in sched_getparam()).
+ *
+ * However, the possibility of specifying more than just a priority for
+ * the tasks may be useful for a wide variety of application fields, e.g.,
+ * multimedia, streaming, automation and control, and many others.
+ *
+ * This variant (sched_attr) is meant to describe a so-called
+ * sporadic time-constrained task. In such a model a task is specified by:
+ *  - the activation period or minimum instance inter-arrival time;
+ *  - the maximum (or average, depending on the actual scheduling
+ *    discipline) computation time of all instances, a.k.a. runtime;
+ *  - the deadline (relative to the actual activation time) of each
+ *    instance.
+ * Very briefly, a periodic (sporadic) task asks for the execution of
+ * some specific computation --which is typically called an instance--
+ * (at most) every period. Moreover, each instance typically lasts no more
+ * than the runtime and must be completed by time instant t equal to
+ * the instance activation time + the deadline.
+ *
+ * This is reflected by the actual fields of the sched_attr structure:
+ *
+ *  @size		size of the structure, for fwd/bwd compat.
+ *
+ *  @sched_policy	task's scheduling policy
+ *  @sched_flags	for customizing the scheduler behaviour
+ *  @sched_nice		task's nice value      (SCHED_NORMAL/BATCH)
+ *  @sched_priority	task's static priority (SCHED_FIFO/RR)
+ *  @sched_deadline	representative of the task's deadline
+ *  @sched_runtime	representative of the task's runtime
+ *  @sched_period	representative of the task's period
+ *
+ * Given this task model, there is a multiplicity of scheduling algorithms
+ * and policies that can be used to ensure that all the tasks meet their
+ * timing constraints.
+ */
+struct sched_attr {
+	u32 size;
+
+	u32 sched_policy;
+	u64 sched_flags;
+
+	/* SCHED_NORMAL, SCHED_BATCH */
+	s32 sched_nice;
+
+	/* SCHED_FIFO, SCHED_RR */
+	u32 sched_priority;
+
+	/* SCHED_DEADLINE */
+	u64 sched_runtime;
+	u64 sched_deadline;
+	u64 sched_period;
+};
+
 struct exec_domain;
 struct futex_pi_state;
 struct robust_list_head;
@@ -1958,6 +2018,8 @@ extern int sched_setscheduler(struct task_struct *, int,
 			      const struct sched_param *);
 extern int sched_setscheduler_nocheck(struct task_struct *, int,
 				      const struct sched_param *);
+extern int sched_setattr(struct task_struct *,
+			 const struct sched_attr *);
 extern struct task_struct *idle_task(int cpu);
 /**
  * is_idle_task - is the specified task an idle task?
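One property of the sched.h hunk above is worth spelling out:
SCHED_ATTR_SIZE_VER0 (48) is exactly the byte count of the declared
fields, 4+4+8+4+4+8+8+8 = 48, and every u64 member already falls on an
8-byte boundary, so there is no implicit padding. A compile-time check
of that claim could look like the following sketch (illustrative only;
the patch itself adds no such assertion, and the helper name is
hypothetical):

    /* Illustrative sketch; not part of the patch. */
    #include <linux/bug.h>
    #include <linux/sched.h>

    static inline void sched_attr_abi_check(void)
    {
            /* 4+4+8+4+4+8+8+8 == 48 == SCHED_ATTR_SIZE_VER0, no padding */
            BUILD_BUG_ON(sizeof(struct sched_attr) != SCHED_ATTR_SIZE_VER0);
    }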
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 94273bbe6050..40ed9e9a77e5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -38,6 +38,7 @@ struct rlimit;
 struct rlimit64;
 struct rusage;
 struct sched_param;
+struct sched_attr;
 struct sel_arg_struct;
 struct semaphore;
 struct sembuf;
@@ -279,9 +280,14 @@ asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
 					struct sched_param __user *param);
 asmlinkage long sys_sched_setparam(pid_t pid,
 					struct sched_param __user *param);
+asmlinkage long sys_sched_setattr(pid_t pid,
+					struct sched_attr __user *attr);
 asmlinkage long sys_sched_getscheduler(pid_t pid);
 asmlinkage long sys_sched_getparam(pid_t pid,
 					struct sched_param __user *param);
+asmlinkage long sys_sched_getattr(pid_t pid,
+					struct sched_attr __user *attr,
+					unsigned int size);
 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
 					unsigned long __user *user_mask_ptr);
 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b21a63ed5d62..8174f889076c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2817,6 +2817,7 @@ out_unlock:
 	__task_rq_unlock(rq);
 }
 #endif
+
 void set_user_nice(struct task_struct *p, long nice)
 {
 	int old_prio, delta, on_rq;
@@ -2991,22 +2992,29 @@ static struct task_struct *find_process_by_pid(pid_t pid)
 	return pid ? find_task_by_vpid(pid) : current;
 }
 
-/* Actually do priority change: must hold rq lock. */
-static void
-__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
+/* Actually do priority change: must hold pi & rq lock. */
+static void __setscheduler(struct rq *rq, struct task_struct *p,
+			   const struct sched_attr *attr)
 {
+	int policy = attr->sched_policy;
+
 	p->policy = policy;
-	p->rt_priority = prio;
+
+	if (rt_policy(policy))
+		p->rt_priority = attr->sched_priority;
+	else
+		p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+
 	p->normal_prio = normal_prio(p);
-	/* we are holding p->pi_lock already */
 	p->prio = rt_mutex_getprio(p);
+
 	if (rt_prio(p->prio))
 		p->sched_class = &rt_sched_class;
 	else
 		p->sched_class = &fair_sched_class;
+
 	set_load_weight(p);
 }
-
 /*
  * check the target process has a UID that matches the current process's
  */
@@ -3023,10 +3031,12 @@ static bool check_same_owner(struct task_struct *p)
 	return match;
 }
 
-static int __sched_setscheduler(struct task_struct *p, int policy,
-				const struct sched_param *param, bool user)
+static int __sched_setscheduler(struct task_struct *p,
+				const struct sched_attr *attr,
+				bool user)
 {
 	int retval, oldprio, oldpolicy = -1, on_rq, running;
+	int policy = attr->sched_policy;
 	unsigned long flags;
 	const struct sched_class *prev_class;
 	struct rq *rq;
@@ -3054,17 +3064,22 @@ recheck:
 	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
 	 * SCHED_BATCH and SCHED_IDLE is 0.
 	 */
-	if (param->sched_priority < 0 ||
-	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
-	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
+	if (attr->sched_priority < 0 ||
+	    (p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
+	    (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
 		return -EINVAL;
-	if (rt_policy(policy) != (param->sched_priority != 0))
+	if (rt_policy(policy) != (attr->sched_priority != 0))
 		return -EINVAL;
 
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
 	if (user && !capable(CAP_SYS_NICE)) {
+		if (fair_policy(policy)) {
+			if (!can_nice(p, attr->sched_nice))
+				return -EPERM;
+		}
+
 		if (rt_policy(policy)) {
 			unsigned long rlim_rtprio =
 					task_rlimit(p, RLIMIT_RTPRIO);
@@ -3074,8 +3089,8 @@ recheck:
 				return -EPERM;
 
 			/* can't increase priority */
-			if (param->sched_priority > p->rt_priority &&
-			    param->sched_priority > rlim_rtprio)
+			if (attr->sched_priority > p->rt_priority &&
+			    attr->sched_priority > rlim_rtprio)
 				return -EPERM;
 		}
 
@@ -3123,11 +3138,16 @@ recheck:
 	/*
 	 * If not changing anything there's no need to proceed further:
 	 */
-	if (unlikely(policy == p->policy && (!rt_policy(policy) ||
-			param->sched_priority == p->rt_priority))) {
+	if (unlikely(policy == p->policy)) {
+		if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
+			goto change;
+		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
+			goto change;
+
 		task_rq_unlock(rq, p, &flags);
 		return 0;
 	}
+change:
 
 #ifdef CONFIG_RT_GROUP_SCHED
 	if (user) {
@@ -3161,7 +3181,7 @@ recheck:
 
 	oldprio = p->prio;
 	prev_class = p->sched_class;
-	__setscheduler(rq, p, policy, param->sched_priority);
+	__setscheduler(rq, p, attr);
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
@@ -3189,10 +3209,20 @@ recheck:
 int sched_setscheduler(struct task_struct *p, int policy,
 		       const struct sched_param *param)
 {
-	return __sched_setscheduler(p, policy, param, true);
+	struct sched_attr attr = {
+		.sched_policy   = policy,
+		.sched_priority = param->sched_priority
+	};
+	return __sched_setscheduler(p, &attr, true);
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
 
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+	return __sched_setscheduler(p, attr, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr);
+
 /**
  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
  * @p: the task in question.
@@ -3209,7 +3239,11 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
 			       const struct sched_param *param)
 {
-	return __sched_setscheduler(p, policy, param, false);
+	struct sched_attr attr = {
+		.sched_policy   = policy,
+		.sched_priority = param->sched_priority
+	};
+	return __sched_setscheduler(p, &attr, false);
 }
 
 static int
@@ -3234,6 +3268,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 	return retval;
 }
 
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr,
+			   struct sched_attr *attr)
+{
+	u32 size;
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+		return -EFAULT;
+
+	/*
+	 * zero the full structure, so that a short copy will be nice.
+	 */
+	memset(attr, 0, sizeof(*attr));
+
+	ret = get_user(size, &uattr->size);
+	if (ret)
+		return ret;
+
+	if (size > PAGE_SIZE)	/* silly large */
+		goto err_size;
+
+	if (!size)		/* abi compat */
+		size = SCHED_ATTR_SIZE_VER0;
+
+	if (size < SCHED_ATTR_SIZE_VER0)
+		goto err_size;
+
+	/*
+	 * If we're handed a bigger struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. new
+	 * user-space does not rely on any kernel feature
+	 * extensions we don't know about yet.
+	 */
+	if (size > sizeof(*attr)) {
+		unsigned char __user *addr;
+		unsigned char __user *end;
+		unsigned char val;
+
+		addr = (void __user *)uattr + sizeof(*attr);
+		end  = (void __user *)uattr + size;
+
+		for (; addr < end; addr++) {
+			ret = get_user(val, addr);
+			if (ret)
+				return ret;
+			if (val)
+				goto err_size;
+		}
+		size = sizeof(*attr);
+	}
+
+	ret = copy_from_user(attr, uattr, size);
+	if (ret)
+		return -EFAULT;
+
+	/*
+	 * XXX: do we want to be lenient like existing syscalls; or do we want
+	 * to be strict and return an error on out-of-bounds values?
+	 */
+	attr->sched_nice = clamp(attr->sched_nice, -20, 19);
+
+out:
+	return ret;
+
+err_size:
+	put_user(sizeof(*attr), &uattr->size);
+	ret = -E2BIG;
+	goto out;
+}
+
 /**
  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
  * @pid: the pid in question.
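The copy-in helper above lets the ABI grow: a caller handing in a
struct bigger than the kernel knows is accepted as long as every
unknown byte is zero, and rejected otherwise, with the kernel's own
size written back into uattr->size. A user-space sketch of that
contract follows (illustrative only, not part of this patch;
"sched_attr_vnext" and its trailing field are hypothetical, and 314 is
__NR_sched_setattr on x86-64 per the table above):

    /* Illustrative sketch only -- not part of this patch. */
    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sched.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    struct sched_attr_vnext {		/* hypothetical future layout */
            uint32_t size, sched_policy;
            uint64_t sched_flags;
            int32_t  sched_nice;
            uint32_t sched_priority;
            uint64_t sched_runtime, sched_deadline, sched_period;
            uint64_t sched_future_field;	/* unknown to this kernel */
    };

    int main(void)
    {
            struct sched_attr_vnext attr = {
                    .size         = sizeof(attr), /* 56 > SCHED_ATTR_SIZE_VER0 */
                    .sched_policy = SCHED_OTHER,  /* == kernel SCHED_NORMAL */
            };

            /* zeroed tail: the kernel treats this as a VER0 struct */
            if (syscall(314, 0, &attr) == 0)
                    puts("oversized-but-zero struct accepted");

            /*
             * Non-zero tail: sched_copy_attr() fails with -E2BIG internally;
             * note that sys_sched_setattr() as written flattens that to
             * -EFAULT, but the kernel's size (48) is still written back.
             */
            attr.sched_future_field = 1;
            if (syscall(314, 0, &attr) == -1)
                    printf("rejected (errno=%d), kernel size is %u\n",
                           errno, attr.size);
            return 0;
    }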
@@ -3265,6 +3372,33 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
 }
 
 /**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @attr: structure containing the extended parameters.
+ */
+SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr)
+{
+	struct sched_attr attr;
+	struct task_struct *p;
+	int retval;
+
+	if (!uattr || pid < 0)
+		return -EINVAL;
+
+	if (sched_copy_attr(uattr, &attr))
+		return -EFAULT;
+
+	rcu_read_lock();
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (p != NULL)
+		retval = sched_setattr(p, &attr);
+	rcu_read_unlock();
+
+	return retval;
+}
+
+/**
  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
  * @pid: the pid in question.
  *
@@ -3334,6 +3468,92 @@ out_unlock:
 	return retval;
 }
 
+static int sched_read_attr(struct sched_attr __user *uattr,
+			   struct sched_attr *attr,
+			   unsigned int usize)
+{
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, uattr, usize))
+		return -EFAULT;
+
+	/*
+	 * If we're handed a smaller struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. old
+	 * user-space does not get incomplete information.
+	 */
+	if (usize < sizeof(*attr)) {
+		unsigned char *addr;
+		unsigned char *end;
+
+		addr = (void *)attr + usize;
+		end  = (void *)attr + sizeof(*attr);
+
+		for (; addr < end; addr++) {
+			if (*addr)
+				goto err_size;
+		}
+
+		attr->size = usize;
+	}
+
+	ret = copy_to_user(uattr, attr, usize);
+	if (ret)
+		return -EFAULT;
+
+out:
+	return ret;
+
+err_size:
+	ret = -E2BIG;
+	goto out;
+}
+
+/**
+ * sys_sched_getattr - same as above, but with extended "sched_param"
+ * @pid: the pid in question.
+ * @attr: structure containing the extended parameters.
+ * @size: sizeof(attr) for fwd/bwd comp.
+ */
+SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+		unsigned int, size)
+{
+	struct sched_attr attr = {
+		.size = sizeof(struct sched_attr),
+	};
+	struct task_struct *p;
+	int retval;
+
+	if (!uattr || pid < 0 || size > PAGE_SIZE ||
+	    size < SCHED_ATTR_SIZE_VER0)
+		return -EINVAL;
+
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	retval = -ESRCH;
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	attr.sched_policy = p->policy;
+	if (task_has_rt_policy(p))
+		attr.sched_priority = p->rt_priority;
+	else
+		attr.sched_nice = TASK_NICE(p);
+
+	rcu_read_unlock();
+
+	retval = sched_read_attr(uattr, &attr, size);
+	return retval;
+
+out_unlock:
+	rcu_read_unlock();
+	return retval;
+}
+
 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 {
 	cpumask_var_t cpus_allowed, new_mask;
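Closing the loop, a read-back sketch for the getter side (illustrative
only, not part of this patch; 315 is __NR_sched_getattr on x86-64 per
the table above, and the struct mirrors the kernel's VER0 layout):

    /* Illustrative sketch only -- not part of this patch. */
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    struct sched_attr {			/* VER0 layout, 48 bytes */
            uint32_t size, sched_policy;
            uint64_t sched_flags;
            int32_t  sched_nice;
            uint32_t sched_priority;
            uint64_t sched_runtime, sched_deadline, sched_period;
    };

    int main(void)
    {
            struct sched_attr attr;

            /* pid 0 means the caller; size must be >= VER0 (48) */
            if (syscall(315, 0, &attr, sizeof(attr))) {
                    perror("sched_getattr");
                    return 1;
            }
            printf("policy=%u nice=%d rt_priority=%u (size=%u)\n",
                   attr.sched_policy, attr.sched_nice,
                   attr.sched_priority, attr.size);
            return 0;
    }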
@@ -6400,13 +6620,16 @@ EXPORT_SYMBOL(__might_sleep);
 static void normalize_task(struct rq *rq, struct task_struct *p)
 {
 	const struct sched_class *prev_class = p->sched_class;
+	struct sched_attr attr = {
+		.sched_policy = SCHED_NORMAL,
+	};
 	int old_prio = p->prio;
 	int on_rq;
 
 	on_rq = p->on_rq;
 	if (on_rq)
 		dequeue_task(rq, p, 0);
-	__setscheduler(rq, p, SCHED_NORMAL, 0);
+	__setscheduler(rq, p, &attr);
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
 		resched_task(rq->curr);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b3b4a4953efc..df023db7721c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -81,11 +81,14 @@ extern void update_cpu_load_active(struct rq *this_rq);
  */
 #define RUNTIME_INF	((u64)~0ULL)
 
+static inline int fair_policy(int policy)
+{
+	return policy == SCHED_NORMAL || policy == SCHED_BATCH;
+}
+
 static inline int rt_policy(int policy)
 {
-	if (policy == SCHED_FIFO || policy == SCHED_RR)
-		return 1;
-	return 0;
+	return policy == SCHED_FIFO || policy == SCHED_RR;
 }
 
 static inline int task_has_rt_policy(struct task_struct *p)