author		Dario Faggioli <raistlin@linux.it>	2013-11-07 08:43:36 -0500
committer	Ingo Molnar <mingo@kernel.org>	2014-01-13 07:41:04 -0500
commit		d50dde5a10f305253cbc3855307f608f8a3c5f73 (patch)
tree		940022e0216611f198d9a00f1cb3bfc59b2014d8 /kernel/sched/core.c
parent		56b4811039174bba9cbd68318d0d8b1585b9eded (diff)
sched: Add new scheduler syscalls to support an extended scheduling parameters ABI
Add the syscalls needed to support scheduling algorithms
with extended scheduling parameters (e.g., SCHED_DEADLINE).
In general, this makes it possible to specify a periodic/sporadic task
that executes for a given amount of runtime at each instance and is
scheduled according to the urgency of its own timing constraints,
i.e.:
- a (maximum/typical) instance execution time,
- a minimum interval between consecutive instances,
- a time constraint by which each instance must be completed.
Thus, both the data structure that holds the scheduling parameters of
the tasks and the system calls dealing with it must be extended.
Unfortunately, modifying the existing struct sched_param would break
the ABI and result in potentially serious compatibility issues with
legacy binaries.
For these reasons, this patch:
- defines the new struct sched_attr, containing all the fields
that are necessary for specifying a task in the computational
model described above;
- defines and implements the new scheduling-related syscalls that
manipulate it, i.e., sched_setattr() and sched_getattr().
The syscalls are introduced for x86 (32 and 64 bit) and ARM only, as a
proof of concept and for development and testing purposes. Making them
available on other architectures is straightforward.
Since no "user" for these new parameters is introduced in this patch,
the implementation of the new system calls is just identical to their
already existing counterpart. Future patches that implement scheduling
policies able to exploit the new data structure must also take care of
modifying the sched_*attr() calls accordingly with their own purposes.
Signed-off-by: Dario Faggioli <raistlin@linux.it>
[ Rewrote to use sched_attr. ]
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
[ Removed sched_setscheduler2() for now. ]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-3-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--	kernel/sched/core.c	263
1 file changed, 243 insertions, 20 deletions
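The hunks below convert __sched_setscheduler() to take the new parameter block and export a kernel-side sched_setattr() next to sched_setscheduler(). As a rough, illustrative sketch of how in-kernel callers are expected to use the new helper (the wrapper name, the task pointer and the SCHED_FIFO/50 values are made up for illustration and are not part of this patch):

/*
 * Illustrative only: shows the calling convention of the sched_setattr()
 * helper exported by this patch. The policy/priority values are arbitrary.
 */
static int make_task_fifo(struct task_struct *p)
{
	struct sched_attr attr = {
		.sched_policy   = SCHED_FIFO,
		.sched_priority = 50,
	};

	return sched_setattr(p, &attr);
}

sched_setscheduler() itself becomes exactly this kind of thin wrapper in the diff below, packing its legacy policy/sched_param arguments into an on-stack struct sched_attr before calling __sched_setscheduler().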
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b21a63ed5d62..8174f889076c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2817,6 +2817,7 @@ out_unlock:
 	__task_rq_unlock(rq);
 }
 #endif
+
 void set_user_nice(struct task_struct *p, long nice)
 {
 	int old_prio, delta, on_rq;
@@ -2991,22 +2992,29 @@ static struct task_struct *find_process_by_pid(pid_t pid)
 	return pid ? find_task_by_vpid(pid) : current;
 }
 
-/* Actually do priority change: must hold rq lock. */
-static void
-__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
+/* Actually do priority change: must hold pi & rq lock. */
+static void __setscheduler(struct rq *rq, struct task_struct *p,
+			   const struct sched_attr *attr)
 {
+	int policy = attr->sched_policy;
+
 	p->policy = policy;
-	p->rt_priority = prio;
+
+	if (rt_policy(policy))
+		p->rt_priority = attr->sched_priority;
+	else
+		p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+
 	p->normal_prio = normal_prio(p);
-	/* we are holding p->pi_lock already */
 	p->prio = rt_mutex_getprio(p);
+
 	if (rt_prio(p->prio))
 		p->sched_class = &rt_sched_class;
 	else
 		p->sched_class = &fair_sched_class;
+
 	set_load_weight(p);
 }
-
 /*
  * check the target process has a UID that matches the current process's
  */
@@ -3023,10 +3031,12 @@ static bool check_same_owner(struct task_struct *p)
 	return match;
 }
 
-static int __sched_setscheduler(struct task_struct *p, int policy,
-				const struct sched_param *param, bool user)
+static int __sched_setscheduler(struct task_struct *p,
+				const struct sched_attr *attr,
+				bool user)
 {
 	int retval, oldprio, oldpolicy = -1, on_rq, running;
+	int policy = attr->sched_policy;
 	unsigned long flags;
 	const struct sched_class *prev_class;
 	struct rq *rq;
@@ -3054,17 +3064,22 @@ recheck:
 	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
 	 * SCHED_BATCH and SCHED_IDLE is 0.
 	 */
-	if (param->sched_priority < 0 ||
-	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
-	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
+	if (attr->sched_priority < 0 ||
+	    (p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
+	    (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
 		return -EINVAL;
-	if (rt_policy(policy) != (param->sched_priority != 0))
+	if (rt_policy(policy) != (attr->sched_priority != 0))
 		return -EINVAL;
 
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
 	if (user && !capable(CAP_SYS_NICE)) {
+		if (fair_policy(policy)) {
+			if (!can_nice(p, attr->sched_nice))
+				return -EPERM;
+		}
+
 		if (rt_policy(policy)) {
 			unsigned long rlim_rtprio =
 					task_rlimit(p, RLIMIT_RTPRIO);
@@ -3074,8 +3089,8 @@ recheck:
 				return -EPERM;
 
 			/* can't increase priority */
-			if (param->sched_priority > p->rt_priority &&
-			    param->sched_priority > rlim_rtprio)
+			if (attr->sched_priority > p->rt_priority &&
+			    attr->sched_priority > rlim_rtprio)
 				return -EPERM;
 		}
 
@@ -3123,11 +3138,16 @@ recheck:
 	/*
 	 * If not changing anything there's no need to proceed further:
 	 */
-	if (unlikely(policy == p->policy && (!rt_policy(policy) ||
-			param->sched_priority == p->rt_priority))) {
+	if (unlikely(policy == p->policy)) {
+		if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
+			goto change;
+		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
+			goto change;
+
 		task_rq_unlock(rq, p, &flags);
 		return 0;
 	}
+change:
 
 #ifdef CONFIG_RT_GROUP_SCHED
 	if (user) {
@@ -3161,7 +3181,7 @@ recheck:
 
 	oldprio = p->prio;
 	prev_class = p->sched_class;
-	__setscheduler(rq, p, policy, param->sched_priority);
+	__setscheduler(rq, p, attr);
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
@@ -3189,10 +3209,20 @@ recheck:
 int sched_setscheduler(struct task_struct *p, int policy,
 		       const struct sched_param *param)
 {
-	return __sched_setscheduler(p, policy, param, true);
+	struct sched_attr attr = {
+		.sched_policy = policy,
+		.sched_priority = param->sched_priority
+	};
+	return __sched_setscheduler(p, &attr, true);
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
 
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+	return __sched_setscheduler(p, attr, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr);
+
 /**
  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
  * @p: the task in question.
@@ -3209,7 +3239,11 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
 			       const struct sched_param *param)
 {
-	return __sched_setscheduler(p, policy, param, false);
+	struct sched_attr attr = {
+		.sched_policy = policy,
+		.sched_priority = param->sched_priority
+	};
+	return __sched_setscheduler(p, &attr, false);
 }
 
 static int
@@ -3234,6 +3268,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 	return retval;
 }
 
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr,
+			   struct sched_attr *attr)
+{
+	u32 size;
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+		return -EFAULT;
+
+	/*
+	 * zero the full structure, so that a short copy will be nice.
+	 */
+	memset(attr, 0, sizeof(*attr));
+
+	ret = get_user(size, &uattr->size);
+	if (ret)
+		return ret;
+
+	if (size > PAGE_SIZE)	/* silly large */
+		goto err_size;
+
+	if (!size)		/* abi compat */
+		size = SCHED_ATTR_SIZE_VER0;
+
+	if (size < SCHED_ATTR_SIZE_VER0)
+		goto err_size;
+
+	/*
+	 * If we're handed a bigger struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. new
+	 * user-space does not rely on any kernel feature
+	 * extensions we dont know about yet.
+	 */
+	if (size > sizeof(*attr)) {
+		unsigned char __user *addr;
+		unsigned char __user *end;
+		unsigned char val;
+
+		addr = (void __user *)uattr + sizeof(*attr);
+		end  = (void __user *)uattr + size;
+
+		for (; addr < end; addr++) {
+			ret = get_user(val, addr);
+			if (ret)
+				return ret;
+			if (val)
+				goto err_size;
+		}
+		size = sizeof(*attr);
+	}
+
+	ret = copy_from_user(attr, uattr, size);
+	if (ret)
+		return -EFAULT;
+
+	/*
+	 * XXX: do we want to be lenient like existing syscalls; or do we want
+	 * to be strict and return an error on out-of-bounds values?
+	 */
+	attr->sched_nice = clamp(attr->sched_nice, -20, 19);
+
+out:
+	return ret;
+
+err_size:
+	put_user(sizeof(*attr), &uattr->size);
+	ret = -E2BIG;
+	goto out;
+}
+
 /**
  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
  * @pid: the pid in question.
@@ -3265,6 +3372,33 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
 }
 
 /**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @attr: structure containing the extended parameters.
+ */
+SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr)
+{
+	struct sched_attr attr;
+	struct task_struct *p;
+	int retval;
+
+	if (!uattr || pid < 0)
+		return -EINVAL;
+
+	if (sched_copy_attr(uattr, &attr))
+		return -EFAULT;
+
+	rcu_read_lock();
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (p != NULL)
+		retval = sched_setattr(p, &attr);
+	rcu_read_unlock();
+
+	return retval;
+}
+
+/**
  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
  * @pid: the pid in question.
  *
@@ -3334,6 +3468,92 @@ out_unlock:
 	return retval;
 }
 
+static int sched_read_attr(struct sched_attr __user *uattr,
+			   struct sched_attr *attr,
+			   unsigned int usize)
+{
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, uattr, usize))
+		return -EFAULT;
+
+	/*
+	 * If we're handed a smaller struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. old
+	 * user-space does not get uncomplete information.
+	 */
+	if (usize < sizeof(*attr)) {
+		unsigned char *addr;
+		unsigned char *end;
+
+		addr = (void *)attr + usize;
+		end  = (void *)attr + sizeof(*attr);
+
+		for (; addr < end; addr++) {
+			if (*addr)
+				goto err_size;
+		}
+
+		attr->size = usize;
+	}
+
+	ret = copy_to_user(uattr, attr, usize);
+	if (ret)
+		return -EFAULT;
+
+out:
+	return ret;
+
+err_size:
+	ret = -E2BIG;
+	goto out;
+}
+
+/**
+ * sys_sched_getattr - same as above, but with extended "sched_param"
+ * @pid: the pid in question.
+ * @attr: structure containing the extended parameters.
+ * @size: sizeof(attr) for fwd/bwd comp.
+ */
+SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+		unsigned int, size)
+{
+	struct sched_attr attr = {
+		.size = sizeof(struct sched_attr),
+	};
+	struct task_struct *p;
+	int retval;
+
+	if (!uattr || pid < 0 || size > PAGE_SIZE ||
+	    size < SCHED_ATTR_SIZE_VER0)
+		return -EINVAL;
+
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	retval = -ESRCH;
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	attr.sched_policy = p->policy;
+	if (task_has_rt_policy(p))
+		attr.sched_priority = p->rt_priority;
+	else
+		attr.sched_nice = TASK_NICE(p);
+
+	rcu_read_unlock();
+
+	retval = sched_read_attr(uattr, &attr, size);
+	return retval;
+
+out_unlock:
+	rcu_read_unlock();
+	return retval;
+}
+
 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 {
 	cpumask_var_t cpus_allowed, new_mask;
@@ -6400,13 +6620,16 @@ EXPORT_SYMBOL(__might_sleep);
 static void normalize_task(struct rq *rq, struct task_struct *p)
 {
 	const struct sched_class *prev_class = p->sched_class;
+	struct sched_attr attr = {
+		.sched_policy = SCHED_NORMAL,
+	};
 	int old_prio = p->prio;
 	int on_rq;
 
 	on_rq = p->on_rq;
 	if (on_rq)
 		dequeue_task(rq, p, 0);
-	__setscheduler(rq, p, SCHED_NORMAL, 0);
+	__setscheduler(rq, p, &attr);
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
 		resched_task(rq->curr);
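For illustration, userspace would reach the two new syscalls roughly as follows. This sketch is not part of the patch: there is no glibc wrapper, the __NR_sched_setattr/__NR_sched_getattr numbers come from the arch changes elsewhere in this series, and the struct sched_attr layout below is an assumption; only the size, sched_policy, sched_nice and sched_priority fields are visible in kernel/sched/core.c.

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Assumed layout; the authoritative definition lives in the header changes
 * of this series. The size field drives the fwd/bwd compatibility checks. */
struct sched_attr {
	uint32_t size;			/* must be set by the caller */
	uint32_t sched_policy;
	int32_t  sched_nice;		/* used by the fair policies */
	uint32_t sched_priority;	/* used by the RT policies */
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);

	/* pid 0 means the calling task (see find_process_by_pid() above). */
	if (syscall(__NR_sched_getattr, 0, &attr, sizeof(attr)) == -1) {
		perror("sched_getattr");
		return 1;
	}
	printf("policy=%u nice=%d rt_priority=%u\n",
	       attr.sched_policy, attr.sched_nice, attr.sched_priority);

	attr.sched_policy = 0;	/* SCHED_NORMAL */
	attr.sched_nice = 5;
	if (syscall(__NR_sched_setattr, 0, &attr) == -1) {
		perror("sched_setattr");
		return 1;
	}
	return 0;
}

The size handshake is the interesting part: the caller always fills attr.size; sched_read_attr() returns -E2BIG rather than silently truncating when an older (smaller) userspace structure would lose non-zero fields, and sched_copy_attr() likewise rejects a larger-than-known structure whose extra bytes are not all zero.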