Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/clock.c         4
-rw-r--r--  kernel/sched/core.c         37
-rw-r--r--  kernel/sched/cpudeadline.c   6
-rw-r--r--  kernel/sched/deadline.c     20
-rw-r--r--  kernel/sched/fair.c         10
-rw-r--r--  kernel/sched/rt.c            8
-rw-r--r--  kernel/sched/sched.h         1
7 files changed, 53 insertions, 33 deletions
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 43c2bcc35761..b30a2924ef14 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -301,14 +301,14 @@ u64 sched_clock_cpu(int cpu)
 	if (unlikely(!sched_clock_running))
 		return 0ull;
 
-	preempt_disable();
+	preempt_disable_notrace();
 	scd = cpu_sdc(cpu);
 
 	if (cpu != smp_processor_id())
 		clock = sched_clock_remote(scd);
 	else
 		clock = sched_clock_local(scd);
-	preempt_enable();
+	preempt_enable_notrace();
 
 	return clock;
 }
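
Why the _notrace variants: sched_clock_cpu() is called from the tracing machinery itself, and the plain preempt_disable()/preempt_enable() pair is instrumented by the preempt tracer, so tracing them re-enters sched_clock_cpu() and recurses. Below is a minimal userspace sketch of that reentrancy hazard and the _notrace escape hatch; every name in it is illustrative, not kernel API.

#include <stdio.h>

static unsigned long long clock_cpu(void);
static int depth;	/* recursion depth, to stop the demo safely */

/* Hypothetical tracer hook: instrumented calls land here, and the
 * tracer itself wants a timestamp -- which calls clock_cpu() again. */
static void trace_hook(const char *what)
{
	if (depth > 3) {	/* in the kernel this would recurse forever */
		printf("recursion! %s\n", what);
		return;
	}
	depth++;
	printf("trace: %s @ %llu\n", what, clock_cpu());
	depth--;
}

/* Instrumented variants, like the plain preempt_disable()/preempt_enable(). */
static void preempt_disable_traced(void) { trace_hook("preempt_disable"); }
static void preempt_enable_traced(void)  { trace_hook("preempt_enable"); }

/* Uninstrumented variants, like the _notrace pair the patch switches to. */
static void preempt_disable_notrace(void) { }
static void preempt_enable_notrace(void)  { }

static unsigned long long clock_cpu(void)
{
	static unsigned long long t;

	/* Swap these for the *_traced() pair and trace_hook() re-enters
	 * clock_cpu() on every call -- exactly the hazard being fixed. */
	preempt_disable_notrace();
	t++;
	preempt_enable_notrace();
	return t;
}

int main(void)
{
	(void)clock_cpu();		/* safe: nothing instrumented inside */
	preempt_disable_traced();	/* tracer samples the clock once */
	preempt_enable_traced();
	return 0;
}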
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b46131ef6aab..f5c6635b806c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1952,7 +1952,7 @@ static int dl_overflow(struct task_struct *p, int policy,
 {
 
 	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
-	u64 period = attr->sched_period;
+	u64 period = attr->sched_period ?: attr->sched_deadline;
 	u64 runtime = attr->sched_runtime;
 	u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
 	int cpus, err = -1;
@@ -3338,6 +3338,15 @@ recheck:
 			return -EPERM;
 		}
 
+		/*
+		 * Can't set/change SCHED_DEADLINE policy at all for now
+		 * (safest behavior); in the future we would like to allow
+		 * unprivileged DL tasks to increase their relative deadline
+		 * or reduce their runtime (both ways reducing utilization)
+		 */
+		if (dl_policy(policy))
+			return -EPERM;
+
 		/*
 		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
 		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
@@ -3661,13 +3670,14 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
  * @pid: the pid in question.
  * @uattr: structure containing the extended parameters.
  */
-SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr)
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+			       unsigned int, flags)
 {
 	struct sched_attr attr;
 	struct task_struct *p;
 	int retval;
 
-	if (!uattr || pid < 0)
+	if (!uattr || pid < 0 || flags)
 		return -EINVAL;
 
 	if (sched_copy_attr(uattr, &attr))
@@ -3786,7 +3796,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
 		attr->size = usize;
 	}
 
-	ret = copy_to_user(uattr, attr, usize);
+	ret = copy_to_user(uattr, attr, attr->size);
 	if (ret)
 		return -EFAULT;
 
@@ -3804,8 +3814,8 @@ err_size:
  * @uattr: structure containing the extended parameters.
  * @size: sizeof(attr) for fwd/bwd comp.
  */
-SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
-		unsigned int, size)
+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+		unsigned int, size, unsigned int, flags)
 {
 	struct sched_attr attr = {
 		.size = sizeof(struct sched_attr),
@@ -3814,7 +3824,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 	int retval;
 
 	if (!uattr || pid < 0 || size > PAGE_SIZE ||
-	    size < SCHED_ATTR_SIZE_VER0)
+	    size < SCHED_ATTR_SIZE_VER0 || flags)
 		return -EINVAL;
 
 	rcu_read_lock();
@@ -7422,6 +7432,7 @@ static int sched_dl_global_constraints(void)
 	u64 period = global_rt_period();
 	u64 new_bw = to_ratio(period, runtime);
 	int cpu, ret = 0;
+	unsigned long flags;
 
 	/*
 	 * Here we want to check the bandwidth not being set to some
@@ -7435,10 +7446,10 @@ static int sched_dl_global_constraints(void)
 	for_each_possible_cpu(cpu) {
 		struct dl_bw *dl_b = dl_bw_of(cpu);
 
-		raw_spin_lock(&dl_b->lock);
+		raw_spin_lock_irqsave(&dl_b->lock, flags);
 		if (new_bw < dl_b->total_bw)
 			ret = -EBUSY;
-		raw_spin_unlock(&dl_b->lock);
+		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
 
 		if (ret)
 			break;
@@ -7451,6 +7462,7 @@ static void sched_dl_do_global(void)
 {
 	u64 new_bw = -1;
 	int cpu;
+	unsigned long flags;
 
 	def_dl_bandwidth.dl_period = global_rt_period();
 	def_dl_bandwidth.dl_runtime = global_rt_runtime();
@@ -7464,9 +7476,9 @@ static void sched_dl_do_global(void)
 	for_each_possible_cpu(cpu) {
 		struct dl_bw *dl_b = dl_bw_of(cpu);
 
-		raw_spin_lock(&dl_b->lock);
+		raw_spin_lock_irqsave(&dl_b->lock, flags);
 		dl_b->bw = new_bw;
-		raw_spin_unlock(&dl_b->lock);
+		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
 	}
 }
 
@@ -7475,7 +7487,8 @@ static int sched_rt_global_validate(void)
 	if (sysctl_sched_rt_period <= 0)
 		return -EINVAL;
 
-	if (sysctl_sched_rt_runtime > sysctl_sched_rt_period)
+	if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
+		(sysctl_sched_rt_runtime > sysctl_sched_rt_period))
 		return -EINVAL;
 
 	return 0;
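
Taken together, the core.c changes tighten the new SCHED_DEADLINE ABI before it ships: sched_setattr()/sched_getattr() grow a reserved 'flags' word that must be zero, sched_getattr() stops copying more bytes than it initialized, unprivileged callers can no longer select SCHED_DEADLINE, and a missing sched_period now falls back to sched_deadline in the admission test. Below is a minimal userspace sketch of calling the (unwrapped) syscall with the new final argument; it assumes your <sys/syscall.h> defines SYS_sched_setattr and hand-rolls struct sched_attr to match the v3.14 uapi layout -- verify both against your headers.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#define SCHED_DEADLINE	6	/* uapi value since v3.14 */

/* Hand-rolled to match include/uapi/linux/sched.h of this era. */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  = 10 * 1000 * 1000;		/* 10 ms */
	attr.sched_deadline = 30 * 1000 * 1000;		/* 30 ms */
	/* sched_period left 0: with the dl_overflow() fix above, the
	 * admission test falls back to the deadline via "?:". */

	/* pid 0 = current task; the last argument is the new 'flags'
	 * word, which must be 0 or the call fails with EINVAL. */
	if (syscall(SYS_sched_setattr, 0, &attr, 0) != 0) {
		perror("sched_setattr");	/* EPERM unless privileged */
		return 1;
	}
	return 0;
}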
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 045fc74e3f09..5b9bb42b2d47 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -70,7 +70,7 @@ static void cpudl_heapify(struct cpudl *cp, int idx)
 
 static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
 {
-	WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID);
+	WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
 
 	if (dl_time_before(new_dl, cp->elements[idx].dl)) {
 		cp->elements[idx].dl = new_dl;
@@ -117,7 +117,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
 	}
 
 out:
-	WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1);
+	WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
 
 	return best_cpu;
 }
@@ -137,7 +137,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
 	int old_idx, new_cpu;
 	unsigned long flags;
 
-	WARN_ON(cpu > num_present_cpus());
+	WARN_ON(!cpu_present(cpu));
 
 	raw_spin_lock_irqsave(&cp->lock, flags);
 	old_idx = cp->cpu_to_idx[cpu];
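
The cpudeadline.c fixes replace a count with a membership test: num_present_cpus() counts present CPUs, it does not bound their IDs, so with a sparse present mask a perfectly valid high CPU ID tripped the WARN_ON while an ID falling in a hole sailed past it. cpu_present() asks the mask directly. A small userspace sketch of the difference, with a plain bitmask standing in for cpu_present_mask:

#include <stdio.h>
#include <stdbool.h>

/* Toy "present mask": CPUs 0, 2, 4 and 6 are present (e.g. SMT siblings
 * offline), so IDs are sparse while the count is only 4. */
static const unsigned long present_mask = 0x55;

static int num_present_cpus(void)
{
	return __builtin_popcountl(present_mask);	/* == 4 */
}

static bool cpu_present(int cpu)
{
	return cpu >= 0 && cpu < 64 && ((present_mask >> cpu) & 1);
}

int main(void)
{
	/* Old check, "cpu > num_present_cpus()": wrong on sparse masks. */
	printf("old check fires for valid CPU 6: %d\n",
	       6 > num_present_cpus());			/* 1: false alarm */
	printf("old check fires for hole CPU 3:  %d\n",
	       3 > num_present_cpus());			/* 0: missed */

	/* New check, "!cpu_present(cpu)": matches the mask exactly. */
	printf("new check fires for 6: %d, for 3: %d\n",
	       !cpu_present(6), !cpu_present(3));	/* 0 and 1 */
	return 0;
}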
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0dd5e0971a07..6e79b3faa4cd 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -121,7 +121,7 @@ static inline void dl_clear_overload(struct rq *rq)
 
 static void update_dl_migration(struct dl_rq *dl_rq)
 {
-	if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) {
+	if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
 		if (!dl_rq->overloaded) {
 			dl_set_overload(rq_of_dl_rq(dl_rq));
 			dl_rq->overloaded = 1;
@@ -135,9 +135,7 @@ static void update_dl_migration(struct dl_rq *dl_rq)
 static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
 	struct task_struct *p = dl_task_of(dl_se);
-	dl_rq = &rq_of_dl_rq(dl_rq)->dl;
 
-	dl_rq->dl_nr_total++;
 	if (p->nr_cpus_allowed > 1)
 		dl_rq->dl_nr_migratory++;
 
@@ -147,9 +145,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
 	struct task_struct *p = dl_task_of(dl_se);
-	dl_rq = &rq_of_dl_rq(dl_rq)->dl;
 
-	dl_rq->dl_nr_total--;
 	if (p->nr_cpus_allowed > 1)
 		dl_rq->dl_nr_migratory--;
 
@@ -566,6 +562,8 @@ int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
 	return 1;
 }
 
+extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
+
 /*
  * Update the current task's runtime statistics (provided it is still
  * a -deadline task and has not been removed from the dl_rq).
@@ -629,11 +627,13 @@ static void update_curr_dl(struct rq *rq)
 		struct rt_rq *rt_rq = &rq->rt;
 
 		raw_spin_lock(&rt_rq->rt_runtime_lock);
-		rt_rq->rt_time += delta_exec;
 		/*
 		 * We'll let actual RT tasks worry about the overflow here, we
-		 * have our own CBS to keep us inline -- see above.
+		 * have our own CBS to keep us inline; only account when RT
+		 * bandwidth is relevant.
 		 */
+		if (sched_rt_bandwidth_account(rt_rq))
+			rt_rq->rt_time += delta_exec;
 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
 	}
 }
@@ -717,6 +717,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 
 	WARN_ON(!dl_prio(prio));
 	dl_rq->dl_nr_running++;
+	inc_nr_running(rq_of_dl_rq(dl_rq));
 
 	inc_dl_deadline(dl_rq, deadline);
 	inc_dl_migration(dl_se, dl_rq);
@@ -730,6 +731,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 	WARN_ON(!dl_prio(prio));
 	WARN_ON(!dl_rq->dl_nr_running);
 	dl_rq->dl_nr_running--;
+	dec_nr_running(rq_of_dl_rq(dl_rq));
 
 	dec_dl_deadline(dl_rq, dl_se->deadline);
 	dec_dl_migration(dl_se, dl_rq);
@@ -836,8 +838,6 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 
 	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
 		enqueue_pushable_dl_task(rq, p);
-
-	inc_nr_running(rq);
 }
 
 static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -850,8 +850,6 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_curr_dl(rq);
 	__dequeue_task_dl(rq, p, flags);
-
-	dec_nr_running(rq);
 }
 
 /*
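
The deadline.c accounting changes move the rq->nr_running bookkeeping from enqueue_task_dl()/dequeue_task_dl() down into inc_dl_tasks()/dec_dl_tasks(), next to dl_nr_running, and drop the now-redundant dl_nr_total counter (update_dl_migration() uses dl_nr_running instead, and sched.h loses the field below). Counting at the point of actual insertion means caller paths that bail out before queueing, such as a throttled task waiting on its replenishment timer, can no longer skew the counter. A toy model of that invariant follows; the throttled early-return is an illustrative stand-in for the real bail-out paths, not a quote of them.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy runqueue: nr_running must track how many tasks are truly queued. */
struct rq { int nr_running; int dl_nr_running; };

/* The actual insertion point: counting here (as inc_dl_tasks() now does)
 * keeps nr_running correct no matter which caller path reaches it. */
static void enqueue_dl_entity(struct rq *rq)
{
	rq->dl_nr_running++;
	rq->nr_running++;	/* previously done by the outer enqueue */
}

static void dequeue_dl_entity(struct rq *rq)
{
	rq->dl_nr_running--;
	rq->nr_running--;
}

/* Outer class hook: may bail out early, e.g. a throttled task whose
 * replenishment timer will do the real enqueue later. */
static void enqueue_task_dl(struct rq *rq, bool throttled)
{
	if (throttled)
		return;		/* no insertion, and now also no count */
	enqueue_dl_entity(rq);
}

int main(void)
{
	struct rq rq = { 0, 0 };

	enqueue_task_dl(&rq, true);	/* early-return path */
	enqueue_task_dl(&rq, false);
	assert(rq.nr_running == rq.dl_nr_running);	/* stays in sync */
	printf("nr_running=%d dl_nr_running=%d\n",
	       rq.nr_running, rq.dl_nr_running);

	dequeue_dl_entity(&rq);
	assert(rq.nr_running == 0);
	return 0;
}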
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 966cc2bfcb77..9b4c4f320130 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1757,6 +1757,8 @@ void task_numa_work(struct callback_head *work)
 		start = end;
 		if (pages <= 0)
 			goto out;
+
+		cond_resched();
 	} while (end != vma->vm_end);
 }
 
@@ -6999,15 +7001,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 	/*
-	 * Ensure the task's vruntime is normalized, so that when its
+	 * Ensure the task's vruntime is normalized, so that when it's
 	 * switched back to the fair class the enqueue_entity(.flags=0) will
 	 * do the right thing.
 	 *
-	 * If it was on_rq, then the dequeue_entity(.flags=0) will already
-	 * have normalized the vruntime, if it was !on_rq, then only when
+	 * If it's on_rq, then the dequeue_entity(.flags=0) will already
+	 * have normalized the vruntime, if it's !on_rq, then only when
 	 * the task is sleeping will it still have non-normalized vruntime.
 	 */
-	if (!se->on_rq && p->state != TASK_RUNNING) {
+	if (!p->on_rq && p->state != TASK_RUNNING) {
 		/*
 		 * Fix up our vruntime so that the current sleep doesn't
 		 * cause 'unlimited' sleep bonus.
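
Two separate fair.c fixes: task_numa_work() now calls cond_resched() between chunks of its VMA scan, so a long NUMA scan yields the CPU instead of hogging it, and switched_from_fair() tests p->on_rq, the task's own queued state, rather than the entity-local se->on_rq (the surrounding comment's its/it's slips are fixed in passing). A userspace sketch of the cond_resched() pattern in a chunked loop, with sched_yield() standing in for the kernel call and all sizes chosen only for the demo:

#include <sched.h>
#include <stdio.h>

/* Userspace stand-in for cond_resched(): give the scheduler a chance
 * to run someone else between batches of work. In the kernel patch the
 * same call breaks up task_numa_work()'s long walk over a VMA. */
static void cond_resched_here(void)
{
	sched_yield();
}

int main(void)
{
	long pages = 256;		/* scan budget, like 'pages' above */
	unsigned long start = 0, end = 0;
	const unsigned long vm_end = 1UL << 20;	/* toy 1 MiB "VMA" */

	do {
		end = start + (64 << 12);	/* one 64-page chunk */
		if (end > vm_end)
			end = vm_end;
		pages -= (end - start) >> 12;
		start = end;
		if (pages <= 0)
			break;

		cond_resched_here();	/* the line the patch adds */
	} while (end != vm_end);

	printf("scanned up to %lu, budget left %ld\n", end, pages);
	return 0;
}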
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a2740b775b45..1999021042c7 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -538,6 +538,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
+{
+	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+	return (hrtimer_active(&rt_b->rt_period_timer) ||
+		rt_rq->rt_time < rt_b->rt_runtime);
+}
+
 #ifdef CONFIG_SMP
 /*
  * We ran out of runtime, see if we can borrow some from our neighbours.
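
sched_rt_bandwidth_account() exists because -deadline tasks charge their runtime to the rt_rq they share bandwidth with, and update_curr_dl() (above) used to add to rt_time unconditionally: with no RT tasks running, the period timer stays inactive, nothing ever decays rt_time, and it grows until RT tasks get spuriously throttled. The new helper says "account only if the timer will replenish, or budget remains". A self-contained model of the predicate with stubbed-down types (the kernel structures are richer; only the logic is mirrored here):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stubbed-down versions of the kernel structures involved. */
struct rt_bandwidth {
	bool     timer_active;	/* hrtimer_active(&rt_b->rt_period_timer) */
	uint64_t rt_runtime;	/* budget per period, ns */
};

struct rt_rq {
	uint64_t rt_time;	/* runtime consumed this period, ns */
	struct rt_bandwidth *rt_b;
};

/* Same shape as the sched_rt_bandwidth_account() the patch adds:
 * account only if someone will replenish (timer active) or there is
 * still budget left -- otherwise rt_time would grow without bound. */
static bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = rt_rq->rt_b;

	return rt_b->timer_active || rt_rq->rt_time < rt_b->rt_runtime;
}

int main(void)
{
	struct rt_bandwidth rt_b = { .timer_active = false,
				     .rt_runtime = 950000000ULL };
	struct rt_rq rt_rq = { .rt_time = 0, .rt_b = &rt_b };
	uint64_t delta_exec = 100000000ULL;	/* 100 ms of dl runtime */

	/* Timer idle: once the budget is consumed, accounting stops
	 * instead of growing rt_time forever. */
	for (int i = 0; i < 20; i++)
		if (sched_rt_bandwidth_account(&rt_rq))
			rt_rq.rt_time += delta_exec;

	printf("rt_time capped at %llu ns (runtime %llu ns)\n",
	       (unsigned long long)rt_rq.rt_time,
	       (unsigned long long)rt_b.rt_runtime);
	return 0;
}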
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c2119fd20f8b..f964add50f38 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -462,7 +462,6 @@ struct dl_rq {
 	} earliest_dl;
 
 	unsigned long dl_nr_migratory;
-	unsigned long dl_nr_total;
 	int overloaded;
 
 	/*