/*
* litmus/sched_pfp.c
*
* Implementation of partitioned fixed-priority scheduling.
* Based on PSN-EDF.
*/
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <litmus/litmus.h>
#include <litmus/wait.h>
#include <litmus/jobs.h>
#include <litmus/preempt.h>
#include <litmus/fp_common.h>
#include <litmus/sched_plugin.h>
#include <litmus/sched_trace.h>
#include <litmus/trace.h>
#include <litmus/budget.h>
#include <linux/uaccess.h>
/*
 * Per-CPU state of the partitioned fixed-priority (P-FP) plugin.
 * One instance exists per CPU (see the pfp_domains per-CPU variable).
 */
typedef struct {
/* embedded real-time domain; its ready_lock doubles as the scheduling lock */
rt_domain_t domain;
/* fixed-priority queue of the jobs that are ready to run in this partition */
struct fp_prio_queue ready_queue;
/* CPU this domain is bound to (assigned in pfp_domain_init()) */
int cpu;
struct task_struct* scheduled; /* only RT tasks */
/*
* scheduling lock slock
* protects the domain and serializes scheduling decisions
*
* Note: slock is a macro alias, so "pfp->slock" expands to
* "pfp->domain.ready_lock".
*/
#define slock domain.ready_lock
} pfp_domain_t;
/* One P-FP domain per CPU. */
DEFINE_PER_CPU(pfp_domain_t, pfp_domains);
/* Table of domain pointers indexed by CPU.
 * NOTE(review): not referenced in the part of the file visible here;
 * confirm where it is populated and used. */
pfp_domain_t* pfp_doms[NR_CPUS];
/* P-FP domain of the CPU executing the code */
#define local_pfp (&__get_cpu_var(pfp_domains))
/* rt_domain_t embedded in the P-FP domain of the given CPU */
#define remote_dom(cpu) (&per_cpu(pfp_domains, cpu).domain)
/* P-FP domain of the given CPU */
#define remote_pfp(cpu) (&per_cpu(pfp_domains, cpu))
/* rt_domain_t of the partition the task is assigned to */
#define task_dom(task) remote_dom(get_partition(task))
/* P-FP domain of the partition the task is assigned to */
#define task_pfp(task) remote_pfp(get_partition(task))
/*
 * preempt - ask the CPU of @pfp to re-run its scheduler
 *
 * Forwards the currently scheduled task and the domain's CPU to
 * preempt_if_preemptable(). The caller is assumed to hold pfp->slock.
 */
static void preempt(pfp_domain_t *pfp)
{
	struct task_struct *running = pfp->scheduled;
	int target_cpu = pfp->cpu;

	preempt_if_preemptable(running, target_cpu);
}
/*
 * priority_index - ready-queue index under which @t must be queued
 *
 * With locking support compiled in, the effective priority is used: a task
 * that currently inherits a priority is indexed by the task it inherits
 * from, and a priority-boosted task always maps to index 0.
 * Without locking support the index is simply the task's own priority.
 *
 * The guard must be CONFIG_LITMUS_LOCKING, the symbol that guards all other
 * locking code in this file. With any other symbol the inheritance and
 * boosting branches are never compiled and fp_set_prio_inh() has no effect
 * on the queue position.
 */
static unsigned int priority_index(struct task_struct* t)
{
#ifdef CONFIG_LITMUS_LOCKING
	if (unlikely(t->rt_param.inh_task))
		/* use effective priority */
		t = t->rt_param.inh_task;

	if (is_priority_boosted(t))
		/* zero is reserved for priority-boosted tasks */
		return 0;
#endif
	return get_priority(t);
}
/*
 * pfp_release_jobs - release callback installed by pfp_domain_init()
 * @rt:    the rt_domain_t embedded in a pfp_domain_t
 * @tasks: heap of tasks whose jobs are being released
 *
 * Moves every task from @tasks into the fixed-priority ready queue of the
 * owning P-FP domain and triggers a preemption if the head of the ready
 * queue now has higher priority than the scheduled task. All of this runs
 * under the domain's scheduling lock with interrupts disabled.
 */
static void pfp_release_jobs(rt_domain_t* rt, struct bheap* tasks)
{
	pfp_domain_t *dom = container_of(rt, pfp_domain_t, domain);
	unsigned long irq_flags;

	raw_spin_lock_irqsave(&dom->slock, irq_flags);

	/* drain the release heap into the ready queue */
	for (;;) {
		struct bheap_node *node;
		struct task_struct *job;

		if (bheap_empty(tasks))
			break;

		node = bheap_take(fp_ready_order, tasks);
		job = bheap2task(node);
		TRACE_TASK(job, "released (part:%d prio:%d)\n",
			   get_partition(job), get_priority(job));
		fp_prio_add(&dom->ready_queue, job, priority_index(job));
	}

	/* do we need to preempt? */
	if (fp_higher_prio(fp_prio_peek(&dom->ready_queue), dom->scheduled)) {
		TRACE_CUR("preempted by new release\n");
		preempt(dom);
	}

	raw_spin_unlock_irqrestore(&dom->slock, irq_flags);
}
/*
 * pfp_preempt_check - preempt @pfp's CPU if a queued job has higher priority
 *
 * Compares the head of the ready queue against the scheduled task and
 * requests a preemption only when the queued job wins. Callers in this file
 * invoke it with pfp->slock held.
 */
static void pfp_preempt_check(pfp_domain_t *pfp)
{
	struct task_struct *candidate = fp_prio_peek(&pfp->ready_queue);

	if (!fp_higher_prio(candidate, pfp->scheduled))
		return;

	preempt(pfp);
}
/*
 * pfp_domain_init - bring a P-FP domain into its initial, idle state
 * @pfp: domain to initialise
 * @cpu: CPU the domain is bound to
 *
 * Sets up the embedded rt_domain_t with pfp_release_jobs() as release
 * callback (and a NULL second argument), empties the ready queue, and
 * records that nothing is scheduled.
 */
static void pfp_domain_init(pfp_domain_t* pfp,
			    int cpu)
{
	fp_domain_init(&pfp->domain, NULL, pfp_release_jobs);
	fp_prio_queue_init(&pfp->ready_queue);

	pfp->scheduled = NULL;
	pfp->cpu = cpu;
}
/*
 * requeue - put @t back into the appropriate queue of @pfp
 *
 * Marks the task RT_F_RUNNING and then either adds it to the ready queue
 * (its current job is already released) or hands it to the release queue
 * of the domain (it still has to wait for its release). A task that is not
 * in TASK_RUNNING state is traced but queued all the same.
 */
static void requeue(struct task_struct* t, pfp_domain_t *pfp)
{
	if (t->state != TASK_RUNNING) {
		TRACE_TASK(t, "requeue: !TASK_RUNNING\n");
	}

	set_rt_flags(t, RT_F_RUNNING);

	if (!is_released(t, litmus_clock())) {
		/* it has got to wait */
		add_release(&pfp->domain, t);
		return;
	}

	fp_prio_add(&pfp->ready_queue, t, priority_index(t));
}
/*
 * job_completion - finish the current job of @t
 * @t:      task whose job completes
 * @forced: passed through to the completion trace record; pfp_schedule()
 *          passes !sleep, i.e. non-zero when the job did not complete
 *          voluntarily
 *
 * Records the completion, marks the task RT_F_SLEEP and sets up the
 * parameters of the next job via prepare_for_next_period().
 */
static void job_completion(struct task_struct* t, int forced)
{
sched_trace_task_completion(t,forced);
TRACE_TASK(t, "job_completion().\n");
/* RT_F_SLEEP is what pfp_schedule() tests to detect a completed job */
set_rt_flags(t, RT_F_SLEEP);
prepare_for_next_period(t);
}
/*
 * pfp_tick - budget enforcement hook
 * @t: task for which the tick is processed
 *
 * If @t is a real-time task with budget enforcement that has exhausted its
 * budget, a local reschedule is triggered when the task is preemptable.
 * If it is inside a user-space non-preemptive section, it is asked to leave
 * it instead. Other non-preemptive tasks are left alone.
 */
static void pfp_tick(struct task_struct *t)
{
	pfp_domain_t *dom = local_pfp;

	/* Check for inconsistency. We don't need the lock for this since
	 * ->scheduled is only changed in schedule, which obviously is not
	 * executing in parallel on this CPU
	 */
	BUG_ON(is_realtime(t) && t != dom->scheduled);

	/* nothing to enforce */
	if (!is_realtime(t) || !budget_enforced(t) || !budget_exhausted(t))
		return;

	if (!is_np(t)) {
		litmus_reschedule_local();
		TRACE("pfp_scheduler_tick: "
		      "%d is preemptable "
		      " => FORCE_RESCHED\n", t->pid);
		return;
	}

	if (is_user_np(t)) {
		TRACE("pfp_scheduler_tick: "
		      "%d is non-preemptable, "
		      "preemption delayed.\n", t->pid);
		request_exit_np(t);
	}
}
/*
 * pfp_schedule - pick the next real-time task for the local CPU
 * @prev: task that was running on this CPU so far
 *
 * Classifies the state of the currently scheduled real-time task (if any),
 * completes its job where appropriate, requeues it if it is preempted
 * without blocking or migrating, and takes the highest-priority job from
 * the ready queue when a switch is required.
 *
 * Returns the task to run next, or NULL if this plugin has nothing to run.
 *
 * The whole decision is made under pfp->slock. The lock is taken without
 * saving the interrupt state.
 * NOTE(review): presumably the caller runs with interrupts disabled; confirm.
 */
static struct task_struct* pfp_schedule(struct task_struct * prev)
{
pfp_domain_t* pfp = local_pfp;
struct task_struct* next;
/* note: the local flag "preempt" shadows the file-level preempt() helper */
int out_of_time, sleep, preempt, np, exists, blocks, resched, migrate;
raw_spin_lock(&pfp->slock);
/* sanity checking
* differently from gedf, when a task exits (dead)
* pfp->schedule may be null and prev _is_ realtime
*/
BUG_ON(pfp->scheduled && pfp->scheduled != prev);
BUG_ON(pfp->scheduled && !is_realtime(prev));
/* (0) Determine state */
/* exists:      a real-time task is currently scheduled here
 * blocks:      ... and it is no longer runnable
 * out_of_time: ... and it exhausted an enforced budget
 * np:          ... and it is in a non-preemptive section
 * sleep:       ... and it flagged job completion (RT_F_SLEEP)
 * migrate:     ... and its partition is no longer this CPU
 * preempt:     it must migrate or a queued job has higher priority
 */
exists = pfp->scheduled != NULL;
blocks = exists && !is_running(pfp->scheduled);
out_of_time = exists &&
budget_enforced(pfp->scheduled) &&
budget_exhausted(pfp->scheduled);
np = exists && is_np(pfp->scheduled);
sleep = exists && get_rt_flags(pfp->scheduled) == RT_F_SLEEP;
migrate = exists && get_partition(pfp->scheduled) != pfp->cpu;
preempt = migrate || fp_preemption_needed(&pfp->ready_queue, prev);
/* If we need to preempt do so.
* The following checks set resched to 1 in case of special
* circumstances.
*/
resched = preempt;
/* If a task blocks we have no choice but to reschedule.
*/
if (blocks)
resched = 1;
/* Request a sys_exit_np() call if we would like to preempt but cannot.
* Multiple calls to request_exit_np() don't hurt.
*/
if (np && (out_of_time || preempt || sleep))
request_exit_np(pfp->scheduled);
/* Any task that is preemptable and either exhausts its execution
* budget or wants to sleep completes. We may have to reschedule after
* this.
*/
if (!np && (out_of_time || sleep) && !blocks && !migrate) {
/* second argument is "forced": set when the job did not ask to sleep */
job_completion(pfp->scheduled, !sleep);
resched = 1;
}
/* The final scheduling decision. Do we need to switch for some reason?
* Switch if we are in RT mode and have no task or if we need to
* resched.
*/
next = NULL;
if ((!np || blocks) && (resched || !exists)) {
/* When preempting a task that does not block, then
* re-insert it into either the ready queue or the
* release queue (if it completed). requeue() picks
* the appropriate queue.
*/
/* a migrating task is not requeued here; pfp_finish_switch() adds it
 * to the queue of its new partition */
if (pfp->scheduled && !blocks && !migrate)
requeue(pfp->scheduled, pfp);
next = fp_prio_take(&pfp->ready_queue);
} else
/* Only override Linux scheduler if we have a real-time task
* scheduled that needs to continue.
*/
if (exists)
next = prev;
if (next) {
TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
set_rt_flags(next, RT_F_RUNNING);
} else {
TRACE("becoming idle at %llu\n", litmus_clock());
}
pfp->scheduled = next;
sched_state_task_picked();
raw_spin_unlock(&pfp->slock);
return next;
}
#ifdef CONFIG_LITMUS_LOCKING
/*
 * pfp_finish_switch - complete a pending migration of @prev
 *
 * prev is no longer scheduled --- see if it needs to migrate.
 *
 * If @prev is a runnable real-time task whose partition differs from the
 * CPU executing this code, it is inserted into the queues of its target
 * partition under that partition's scheduling lock, and the target CPU is
 * preempted if the newly queued job requires it.
 */
static void pfp_finish_switch(struct task_struct *prev)
{
pfp_domain_t *to;
if (is_realtime(prev) &&
is_running(prev) &&
get_partition(prev) != smp_processor_id()) {
TRACE_TASK(prev, "needs to migrate from P%d to P%d\n",
smp_processor_id(), get_partition(prev));
/* domain of the partition the task was switched to */
to = task_pfp(prev);
/* NOTE(review): taken without irqsave; presumably interrupts are
 * already disabled in this hook -- confirm */
raw_spin_lock(&to->slock);
TRACE_TASK(prev, "adding to queue on P%d\n", to->cpu);
requeue(prev, to);
if (fp_preemption_needed(&to->ready_queue, to->scheduled))
preempt(to);
raw_spin_unlock(&to->slock);
}
}
#endif
/*
 * pfp_task_new - prepare a task for running in RT mode
 * @t:       the new real-time task
 * @on_rq:   not used by this function
 * @running: non-zero if @t is currently executing on its partition's CPU
 *
 * Releases the first job at the current time and either records @t as the
 * scheduled task of its partition (if it is running) or queues it and
 * checks whether it should preempt.
 */
static void pfp_task_new(struct task_struct * t, int on_rq, int running)
{
pfp_domain_t* pfp = task_pfp(t);
unsigned long flags;
TRACE_TASK(t, "P-FP: task new, cpu = %d\n",
t->rt_param.task_params.cpu);
/* setup job parameters */
release_at(t, litmus_clock());
/* The task should be running in the queue, otherwise signal
* code will try to wake it up with fatal consequences.
*/
raw_spin_lock_irqsave(&pfp->slock, flags);
if (running) {
/* there shouldn't be anything else running at the time */
BUG_ON(pfp->scheduled);
pfp->scheduled = t;
} else {
requeue(t, pfp);
/* maybe we have to reschedule */
pfp_preempt_check(pfp);
}
raw_spin_unlock_irqrestore(&pfp->slock, flags);
}
/*
 * pfp_task_wake_up - handle the wake-up of a real-time task
 * @task: task that resumes
 *
 * Under the scheduling lock of the task's partition: starts a new sporadic
 * job if the task is tardy (and, with locking support, not priority-boosted),
 * then requeues the task unless it is still recorded as the scheduled task,
 * and finally checks whether a preemption is needed.
 */
static void pfp_task_wake_up(struct task_struct *task)
{
unsigned long flags;
pfp_domain_t* pfp = task_pfp(task);
lt_t now;
TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
raw_spin_lock_irqsave(&pfp->slock, flags);
#ifdef CONFIG_LITMUS_LOCKING
/* Should only be queued when processing a fake-wake up due to a
* migration-related state change. */
if (unlikely(is_queued(task))) {
TRACE_TASK(task, "WARNING: waking task still queued. Is this right?\n");
goto out_unlock;
}
#else
BUG_ON(is_queued(task));
#endif
now = litmus_clock();
if (is_tardy(task, now)
#ifdef CONFIG_LITMUS_LOCKING
/* We need to take suspensions because of semaphores into
* account! If a job resumes after being suspended due to acquiring
* a semaphore, it should never be treated as a new job release.
*/
&& !is_priority_boosted(task)
#endif
) {
/* new sporadic release */
release_at(task, now);
sched_trace_task_release(task);
}
/* Only add to ready queue if it is not the currently-scheduled
* task. This could be the case if a task was woken up concurrently
* on a remote CPU before the executing CPU got around to actually
* de-scheduling the task, i.e., wake_up() raced with schedule()
* and won. Also, don't requeue if it is still queued, which can
* happen under the DPCP due to wake-ups racing with migrations.
*/
if (pfp->scheduled != task) {
requeue(task, pfp);
pfp_preempt_check(pfp);
}
/* note: this label is only jumped to when CONFIG_LITMUS_LOCKING is set */
out_unlock:
raw_spin_unlock_irqrestore(&pfp->slock, flags);
TRACE_TASK(task, "wake up done\n");
}
/*
 * pfp_task_block - hook invoked when real-time task @t blocks
 *
 * Performs consistency checks and tracing only; no queue is modified here.
 */
static void pfp_task_block(struct task_struct *t)
{
/* only running tasks can block, thus t is in no queue */
TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state);
BUG_ON(!is_realtime(t));
/* If this task blocked normally, it shouldn't be queued. The exception is
* if this is a simulated block()/wakeup() pair from the pull-migration code path.
* This should only happen if the DPCP is being used.
*/
#ifdef CONFIG_LITMUS_LOCKING
/* with locking support a queued task is tolerated, but traced */
if (unlikely(is_queued(t)))
TRACE_TASK(t, "WARNING: blocking task still queued. Is this right?\n");
#else
BUG_ON(is_queued(t));
#endif
}
/*
 * pfp_task_exit - remove exiting task @t from its partition
 *
 * Under the scheduling lock of the task's partition: if @t is the scheduled
 * task, the slot is cleared and the CPU is asked to reschedule.
 *
 * A task that is still queued at exit is not supported: the function hits
 * BUG() in that case, so the dequeue code that follows BUG() is effectively
 * unreachable.
 */
static void pfp_task_exit(struct task_struct * t)
{
unsigned long flags;
pfp_domain_t* pfp = task_pfp(t);
rt_domain_t* dom;
raw_spin_lock_irqsave(&pfp->slock, flags);
if (is_queued(t)) {
BUG(); /* This currently doesn't work. */
/* dequeue */
dom = task_dom(t);
remove(dom, t);
}
if (pfp->scheduled == t) {
pfp->scheduled = NULL;
/* pfp->scheduled is NULL at this point, so preempt() passes NULL on */
preempt(pfp);
}
TRACE_TASK(t, "RIP, now reschedule\n");
raw_spin_unlock_irqrestore(&pfp->slock, flags);
}
#ifdef CONFIG_LITMUS_LOCKING
#include <litmus/fdso.h>
#include <litmus/srp.h>
/*
 * fp_dequeue - take @t out of @pfp's ready queue if it is queued
 *
 * A task must never be both scheduled and queued; that combination is
 * treated as a fatal inconsistency. The task is removed from the priority
 * level given by its current priority_index().
 */
static void fp_dequeue(pfp_domain_t* pfp, struct task_struct* t)
{
	const int queued = is_queued(t);

	BUG_ON(pfp->scheduled == t && queued);

	if (!queued)
		return;

	fp_prio_remove(&pfp->ready_queue, t, priority_index(t));
}
/*
 * fp_set_prio_inh - change the task from which @t inherits its priority
 * @pfp:      domain whose ready queue may contain @t
 * @t:        task to update; NULL is accepted and ignored
 * @prio_inh: new inheritance source, or NULL to stop inheriting
 *
 * Because the ready-queue position depends on the effective priority, a
 * queued task is removed before the update and re-added afterwards so that
 * it ends up at the index matching its new priority.
 */
static void fp_set_prio_inh(pfp_domain_t* pfp, struct task_struct* t,
			    struct task_struct* prio_inh)
{
	int was_queued;

	if (!t)
		return;

	if (t->rt_param.inh_task == prio_inh) {
		/* no update required */
		TRACE_TASK(t, "no prio-inh update required\n");
		return;
	}

	was_queued = is_queued(t);
	TRACE_TASK(t, "prio-inh: is_queued:%d\n", was_queued);

	/* first remove ... */
	if (was_queued) {
		fp_dequeue(pfp, t);
	}

	t->rt_param.inh_task = prio_inh;

	/* ... then add again to the right queue */
	if (was_queued) {
		fp_prio_add(&pfp->ready_queue, t, priority_index(t));
	}
}
/*
 * effective_agent_priority - priority value used while acting as an agent
 *
 * Shifts @prio down by LITMUS_MAX_PRIORITY so that agents end up with
 * numerically smaller (i.e. higher) priority values than regular jobs.
 */
static int effective_agent_priority(int prio)
{
	int shifted = prio - LITMUS_MAX_PRIORITY;

	return shifted;
}
/*
 * prio_point - map an effective priority to a wait-queue priority point
 *
 * Adds LITMUS_MAX_PRIORITY back so that the (possibly negative) effective
 * priority becomes a non-negative value. Inverse of prio_from_point().
 */
static lt_t prio_point(int eprio)
{
	lt_t point = eprio + LITMUS_MAX_PRIORITY;

	return point;
}
/*
 * prio_from_point - recover the effective priority from a priority point
 *
 * Inverse of prio_point(): truncates the point to int and subtracts
 * LITMUS_MAX_PRIORITY.
 */
static int prio_from_point(lt_t prio_point)
{
	int as_int = (int) prio_point;

	return as_int - LITMUS_MAX_PRIORITY;
}
/*
 * boost_priority - mark @t as priority-boosted
 * @t:              task to boost
 * @priority_point: protocol-specific tie-break value, stored in
 *                  boost_start_time
 *
 * Runs under the scheduling lock of the task's partition. If @t is not the
 * scheduled task and is queued, its queue position is updated and a
 * preemption is requested when it reaches the top.
 *
 * NOTE(review): elsewhere in this file ready jobs are inserted with
 * fp_prio_add() into pfp->ready_queue, whereas this path reorders via
 * bheap_decrease() on the task's heap_node. Confirm that heap_node is
 * really linked into a bheap whenever is_queued(t) holds here, and that the
 * job ends up at the index priority_index() now reports (0 when boosted).
 */
static void boost_priority(struct task_struct* t, lt_t priority_point)
{
unsigned long flags;
pfp_domain_t* pfp = task_pfp(t);
raw_spin_lock_irqsave(&pfp->slock, flags);
TRACE_TASK(t, "priority boosted at %llu\n", litmus_clock());
tsk_rt(t)->priority_boosted = 1;
/* tie-break by protocol-specific priority point */
tsk_rt(t)->boost_start_time = priority_point;
if (pfp->scheduled != t) {
/* holder may be queued: first stop queue changes */
raw_spin_lock(&pfp->domain.release_lock);
if (is_queued(t) &&
/* If it is queued, then we need to re-order. */
bheap_decrease(fp_ready_order, tsk_rt(t)->heap_node) &&
/* If we bubbled to the top, then we need to check for preemptions. */
fp_preemption_needed(&pfp->ready_queue, pfp->scheduled))
preempt(pfp);
raw_spin_unlock(&pfp->domain.release_lock);
} /* else: nothing to do since the job is not queued while scheduled */
raw_spin_unlock_irqrestore(&pfp->slock, flags);
}
/*
 * unboost_priority - end the priority boost of @t
 *
 * Clears the boost flag and its tie-break value under the scheduling lock
 * of the task's partition, then checks whether a queued job should now
 * preempt @t.
 *
 * Only the job that is currently scheduled on its partition may be
 * unboosted; anything else is treated as a fatal inconsistency. This
 * invariant is asserted once: the lock is held, so a second identical
 * check could never observe a different result.
 */
static void unboost_priority(struct task_struct* t)
{
	unsigned long flags;
	pfp_domain_t* pfp = task_pfp(t);
	lt_t now;

	raw_spin_lock_irqsave(&pfp->slock, flags);
	now = litmus_clock();

	/* assumption: this only happens when the job is scheduled,
	 * since priority-boosted jobs must be scheduled */
	BUG_ON(pfp->scheduled != t);

	TRACE_TASK(t, "priority restored at %llu\n", now);

	tsk_rt(t)->priority_boosted = 0;
	tsk_rt(t)->boost_start_time = 0;

	/* check if this changes anything */
	if (fp_preemption_needed(&pfp->ready_queue, pfp->scheduled))
		preempt(pfp);

	raw_spin_unlock_irqrestore(&pfp->slock, flags);
}
/* ******************** SRP support ************************ */
/*
 * pfp_get_srp_prio - priority of @t as seen by the SRP
 *
 * Under P-FP this is just the task's fixed priority.
 */
static unsigned int pfp_get_srp_prio(struct task_struct* t)
{
	unsigned int prio = get_priority(t);

	return prio;
}
/* ******************** FMLP support ********************** */
/*
 * FMLP semaphore: a mutex with a FIFO wait queue. The embedded litmus_lock
 * is what generic code sees; fmlp_from_lock() converts back.
 * owner and the wait queue are accessed under wait.lock.
 */
struct fmlp_semaphore {
struct litmus_lock litmus_lock;
/* current resource holder */
struct task_struct *owner;
/* FIFO queue of waiting tasks */
wait_queue_head_t wait;
};
/* Convert a generic litmus_lock pointer back to its enclosing FMLP semaphore. */
static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock)
{
	struct fmlp_semaphore *sem;

	sem = container_of(lock, struct fmlp_semaphore, litmus_lock);
	return sem;
}
/*
 * pfp_fmlp_lock - acquire an FMLP semaphore
 * @l: the litmus_lock embedded in a struct fmlp_semaphore
 *
 * The caller is priority-boosted (tie-broken by the time of the request)
 * and then either takes the free semaphore immediately or suspends at the
 * tail of the wait queue until pfp_fmlp_unlock() hands ownership over.
 *
 * Returns 0 on success, -EPERM if the caller is not a real-time task.
 */
int pfp_fmlp_lock(struct litmus_lock* l)
{
struct task_struct* t = current;
struct fmlp_semaphore *sem = fmlp_from_lock(l);
/* wait-queue entry lives on this stack frame while we are suspended */
wait_queue_t wait;
unsigned long flags;
lt_t time_of_request;
if (!is_realtime(t))
return -EPERM;
spin_lock_irqsave(&sem->wait.lock, flags);
/* tie-break by this point in time */
time_of_request = litmus_clock();
/* Priority-boost ourself *before* we suspend so that
* our priority is boosted when we resume. */
boost_priority(t, time_of_request);
if (sem->owner) {
/* resource is not free => must suspend and wait */
init_waitqueue_entry(&wait, t);
/* FIXME: interruptible would be nice some day */
set_task_state(t, TASK_UNINTERRUPTIBLE);
__add_wait_queue_tail_exclusive(&sem->wait, &wait);
TS_LOCK_SUSPEND;
/* release lock before sleeping */
spin_unlock_irqrestore(&sem->wait.lock, flags);
/* We depend on the FIFO order. Thus, we don't need to recheck
* when we wake up; we are guaranteed to have the lock since
* there is only one wake up per release.
*/
schedule();
TS_LOCK_RESUME;
/* Since we hold the lock, no other task will change
* ->owner. We can thus check it without acquiring the spin
* lock. */
BUG_ON(sem->owner != t);
} else {
/* it's ours now */
sem->owner = t;
spin_unlock_irqrestore(&sem->wait.lock, flags);
}
return 0;
}
/*
 * pfp_fmlp_unlock - release an FMLP semaphore
 * @l: the litmus_lock embedded in a struct fmlp_semaphore
 *
 * Drops the caller's priority boost and passes ownership directly to the
 * first waiter, if any; otherwise the semaphore becomes free.
 *
 * Returns 0 on success, -EINVAL if the caller does not own the semaphore.
 */
int pfp_fmlp_unlock(struct litmus_lock* l)
{
struct task_struct *t = current, *next;
struct fmlp_semaphore *sem = fmlp_from_lock(l);
unsigned long flags;
int err = 0;
spin_lock_irqsave(&sem->wait.lock, flags);
if (sem->owner != t) {
err = -EINVAL;
goto out;
}
/* we lose the benefit of priority boosting */
unboost_priority(t);
/* check if there are jobs waiting for this resource */
next = __waitqueue_remove_first(&sem->wait);
if (next) {
/* next becomes the resource holder */
sem->owner = next;
/* Wake up next. The waiting job is already priority-boosted. */
wake_up_process(next);
} else
/* resource becomes available */
sem->owner = NULL;
out:
spin_unlock_irqrestore(&sem->wait.lock, flags);
return err;
}
/*
 * pfp_fmlp_close - close hook of the FMLP semaphore
 *
 * If the calling task still owns the semaphore, it is released on the
 * task's behalf. Ownership is sampled under the wait-queue lock; the
 * release itself happens after that lock has been dropped.
 *
 * Always returns 0.
 */
int pfp_fmlp_close(struct litmus_lock* l)
{
	struct fmlp_semaphore *sem = fmlp_from_lock(l);
	struct task_struct *self = current;
	unsigned long irq_flags;
	int held_by_caller;

	spin_lock_irqsave(&sem->wait.lock, irq_flags);
	held_by_caller = (sem->owner == self);
	spin_unlock_irqrestore(&sem->wait.lock, irq_flags);

	if (!held_by_caller)
		return 0;

	pfp_fmlp_unlock(l);
	return 0;
}
/* Deallocate hook: frees the FMLP semaphore that embeds @lock. */
void pfp_fmlp_free(struct litmus_lock* lock)
{
	struct fmlp_semaphore *sem = fmlp_from_lock(lock);

	kfree(sem);
}
/* Operations table installed on every FMLP semaphore by pfp_new_fmlp().
 * No .open hook is provided for this lock type. */
static struct litmus_lock_ops pfp_fmlp_lock_ops = {
.close = pfp_fmlp_close,
.lock = pfp_fmlp_lock,
.unlock = pfp_fmlp_unlock,
.deallocate = pfp_fmlp_free,
};
/*
 * pfp_new_fmlp - allocate and initialise an FMLP semaphore
 *
 * Returns the embedded litmus_lock of a free semaphore with an empty wait
 * queue, or NULL if the allocation fails. The memory is released again by
 * pfp_fmlp_free().
 */
static struct litmus_lock* pfp_new_fmlp(void)
{
	struct fmlp_semaphore *fresh = kmalloc(sizeof(*fresh), GFP_KERNEL);

	if (fresh == NULL)
		return NULL;

	fresh->litmus_lock.ops = &pfp_fmlp_lock_ops;
	init_waitqueue_head(&fresh->wait);
	fresh->owner = NULL;

	return &fresh->litmus_lock;
}
/* ******************** MPCP support ********************** */
/*
 * MPCP semaphore. The embedded litmus_lock is what generic code sees;
 * mpcp_from_lock() converts back. owner and the wait queue are accessed
 * under wait.lock.
 */
struct mpcp_semaphore {
struct litmus_lock litmus_lock;
/* current resource holder */
struct task_struct *owner;
/* priority queue of waiting tasks */
wait_queue_head_t wait;
/* priority ceiling per cpu; starts at OMEGA_CEILING and is lowered
 * by pfp_mpcp_open() */
unsigned int prio_ceiling[NR_CPUS];
/* should jobs spin "virtually" for this resource? */
int vspin;
};
/* initial (numerically largest) ceiling value of a new MPCP semaphore */
#define OMEGA_CEILING UINT_MAX
/* Since jobs spin "virtually" while waiting to acquire a lock,
* they first must acquire a local per-cpu resource.
*/
/* per-CPU priority-ordered queue of tasks waiting to become the "spinner" */
static DEFINE_PER_CPU(wait_queue_head_t, mpcpvs_vspin_wait);
/* per-CPU task that currently "spins" virtually, or NULL if there is none */
static DEFINE_PER_CPU(struct task_struct*, mpcpvs_vspin);
/*
 * mpcp_vspin_enter - become the local "virtually spinning" task
 *
 * called with preemptions off <=> no local modifications
 *
 * Loops until the per-CPU mpcpvs_vspin slot is free and has been claimed by
 * the caller. While another task holds the slot, the caller enqueues itself
 * (ordered by its regular priority) on the per-CPU wait queue and suspends;
 * preemptions are re-enabled only around the call to schedule().
 */
static void mpcp_vspin_enter(void)
{
struct task_struct* t = current;
while (1) {
if (__get_cpu_var(mpcpvs_vspin) == NULL) {
/* good, we get to issue our request */
__get_cpu_var(mpcpvs_vspin) = t;
break;
} else {
/* some job is spinning => enqueue in request queue */
prio_wait_queue_t wait;
wait_queue_head_t* vspin = &__get_cpu_var(mpcpvs_vspin_wait);
unsigned long flags;
/* ordered by regular priority */
init_prio_waitqueue_entry(&wait, t, prio_point(get_priority(t)));
spin_lock_irqsave(&vspin->lock, flags);
set_task_state(t, TASK_UNINTERRUPTIBLE);
__add_wait_queue_prio_exclusive(vspin, &wait);
spin_unlock_irqrestore(&vspin->lock, flags);
TS_LOCK_SUSPEND;
preempt_enable_no_resched();
schedule();
preempt_disable();
TS_LOCK_RESUME;
/* Recheck if we got it --- some higher-priority process might
* have swooped in. */
}
}
/* ok, now it is ours */
}
/*
 * mpcp_vspin_exit - give up the local "virtually spinning" slot
 *
 * called with preemptions off
 *
 * The caller must be the task recorded in the per-CPU mpcpvs_vspin slot.
 * The slot is cleared and the first task waiting for it, if any, is woken.
 * The woken task re-checks the slot itself in mpcp_vspin_enter().
 */
static void mpcp_vspin_exit(void)
{
struct task_struct* t = current, *next;
unsigned long flags;
wait_queue_head_t* vspin = &__get_cpu_var(mpcpvs_vspin_wait);
BUG_ON(__get_cpu_var(mpcpvs_vspin) != t);
/* no spinning job */
__get_cpu_var(mpcpvs_vspin) = NULL;
/* see if anyone is waiting for us to stop "spinning" */
spin_lock_irqsave(&vspin->lock, flags);
next = __waitqueue_remove_first(vspin);
if (next)
wake_up_process(next);
spin_unlock_irqrestore(&vspin->lock, flags);
}
/* Convert a generic litmus_lock pointer back to its enclosing MPCP semaphore. */
static inline struct mpcp_semaphore* mpcp_from_lock(struct litmus_lock* lock)
{
	struct mpcp_semaphore *sem;

	sem = container_of(lock, struct mpcp_semaphore, litmus_lock);
	return sem;
}
/*
 * pfp_mpcp_lock - acquire an MPCP semaphore
 * @l: the litmus_lock embedded in a struct mpcp_semaphore
 *
 * For "virtual spinning" semaphores the caller first claims the per-CPU
 * spin slot. It is then priority-boosted using the semaphore's ceiling for
 * the caller's partition as tie-break value, and either takes the free
 * semaphore or suspends in the priority-ordered wait queue until
 * pfp_mpcp_unlock() hands ownership over.
 *
 * Returns 0 on success, -EPERM if the caller is not a real-time task.
 */
int pfp_mpcp_lock(struct litmus_lock* l)
{
struct task_struct* t = current;
struct mpcp_semaphore *sem = mpcp_from_lock(l);
/* wait-queue entry lives on this stack frame while we are suspended */
prio_wait_queue_t wait;
unsigned long flags;
if (!is_realtime(t))
return -EPERM;
preempt_disable();
if (sem->vspin)
mpcp_vspin_enter();
/* Priority-boost ourself *before* we suspend so that
* our priority is boosted when we resume. Use the priority
* ceiling for the local partition. */
boost_priority(t, sem->prio_ceiling[get_partition(t)]);
spin_lock_irqsave(&sem->wait.lock, flags);
/* interrupts are off from here on, so preemption can be re-enabled */
preempt_enable_no_resched();
if (sem->owner) {
/* resource is not free => must suspend and wait */
/* ordered by regular priority */
init_prio_waitqueue_entry(&wait, t, prio_point(get_priority(t)));
/* FIXME: interruptible would be nice some day */
set_task_state(t, TASK_UNINTERRUPTIBLE);
__add_wait_queue_prio_exclusive(&sem->wait, &wait);
TS_LOCK_SUSPEND;
/* release lock before sleeping */
spin_unlock_irqrestore(&sem->wait.lock, flags);
/* The wait queue is ordered by priority (not FIFO). We still
* don't need to recheck when we wake up: pfp_mpcp_unlock()
* issues exactly one wake up per release and assigns ->owner
* to the woken task before waking it.
*/
schedule();
TS_LOCK_RESUME;
/* Since we hold the lock, no other task will change
* ->owner. We can thus check it without acquiring the spin
* lock. */
BUG_ON(sem->owner != t);
} else {
/* it's ours now */
sem->owner = t;
spin_unlock_irqrestore(&sem->wait.lock, flags);
}
return 0;
}
/*
 * pfp_mpcp_unlock - release an MPCP semaphore
 * @l: the litmus_lock embedded in a struct mpcp_semaphore
 *
 * Drops the caller's priority boost and passes ownership directly to the
 * first (highest-priority) waiter, if any. For "virtual spinning"
 * semaphores the per-CPU spin slot is released afterwards.
 *
 * Returns 0 on success, -EINVAL if the caller does not own the semaphore.
 */
int pfp_mpcp_unlock(struct litmus_lock* l)
{
struct task_struct *t = current, *next;
struct mpcp_semaphore *sem = mpcp_from_lock(l);
unsigned long flags;
int err = 0;
spin_lock_irqsave(&sem->wait.lock, flags);
if (sem->owner != t) {
err = -EINVAL;
goto out;
}
/* we lose the benefit of priority boosting */
unboost_priority(t);
/* check if there are jobs waiting for this resource */
next = __waitqueue_remove_first(&sem->wait);
if (next) {
/* next becomes the resource holder */
sem->owner = next;
/* Wake up next. The waiting job is already priority-boosted. */
wake_up_process(next);
} else
/* resource becomes available */
sem->owner = NULL;
out:
spin_unlock_irqrestore(&sem->wait.lock, flags);
/* only leave the spin slot if the semaphore was actually released */
if (sem->vspin && err == 0) {
preempt_disable();
mpcp_vspin_exit();
preempt_enable();
}
return err;
}
/*
 * pfp_mpcp_open - register the calling task as a user of an MPCP semaphore
 * @l:      the litmus_lock embedded in a struct mpcp_semaphore
 * @config: not used by this function
 *
 * Lowers the semaphore's priority ceiling on every CPU other than the
 * caller's own partition to the caller's priority, if that is numerically
 * smaller than the current ceiling. The ceiling of the caller's partition
 * is left untouched.
 *
 * Returns 0 on success, -EPERM if the caller is not a real-time task.
 */
int pfp_mpcp_open(struct litmus_lock* l, void* config)
{
struct task_struct *t = current;
struct mpcp_semaphore *sem = mpcp_from_lock(l);
int cpu, local_cpu;
unsigned long flags;
if (!is_realtime(t))
/* we need to know the real-time priority */
return -EPERM;
local_cpu = get_partition(t);
/* the wait-queue lock also serializes ceiling updates */
spin_lock_irqsave(&sem->wait.lock, flags);
for (cpu = 0; cpu < NR_CPUS; cpu++)
if (cpu != local_cpu)
{
sem->prio_ceiling[cpu] = min(sem->prio_ceiling[cpu],
get_priority(t));
TRACE_CUR("priority ceiling for sem %p is now %d on cpu %d\n",
sem, sem->prio_ceiling[cpu], cpu);
}
spin_unlock_irqrestore(&sem->wait.lock, flags);
return 0;
}
/*
 * pfp_mpcp_close - close hook of the MPCP semaphore
 *
 * If the calling task still owns the semaphore, it is released on the
 * task's behalf. Ownership is sampled under the wait-queue lock; the
 * release itself happens after that lock has been dropped.
 *
 * Always returns 0.
 */
int pfp_mpcp_close(struct litmus_lock* l)
{
	struct mpcp_semaphore *sem = mpcp_from_lock(l);
	struct task_struct *self = current;
	unsigned long irq_flags;
	int held_by_caller;

	spin_lock_irqsave(&sem->wait.lock, irq_flags);
	held_by_caller = (sem->owner == self);
	spin_unlock_irqrestore(&sem->wait.lock, irq_flags);

	if (!held_by_caller)
		return 0;

	pfp_mpcp_unlock(l);
	return 0;
}
/* Deallocate hook: frees the MPCP semaphore that embeds @lock. */
void pfp_mpcp_free(struct litmus_lock* lock)
{
	struct mpcp_semaphore *sem = mpcp_from_lock(lock);

	kfree(sem);
}
/* Operations table installed on every MPCP semaphore by pfp_new_mpcp(). */
static struct litmus_lock_ops pfp_mpcp_lock_ops = {
.close = pfp_mpcp_close,
.lock = pfp_mpcp_lock,
/* .open lowers the per-CPU priority ceilings */
.open = pfp_mpcp_open,
.unlock = pfp_mpcp_unlock,
.deallocate = pfp_mpcp_free,
};
/*
 * pfp_new_mpcp - allocate and initialise an MPCP semaphore
 * @vspin: non-zero if jobs should "spin virtually" for this resource
 *
 * Returns the embedded litmus_lock of a free semaphore whose per-CPU
 * ceilings all start at OMEGA_CEILING, or NULL if the allocation fails.
 * The memory is released again by pfp_mpcp_free().
 */
static struct litmus_lock* pfp_new_mpcp(int vspin)
{
	struct mpcp_semaphore *fresh = kmalloc(sizeof(*fresh), GFP_KERNEL);
	int idx;

	if (fresh == NULL)
		return NULL;

	fresh->litmus_lock.ops = &pfp_mpcp_lock_ops;
	fresh->owner = NULL;
	/* mark as virtual spinning */
	fresh->vspin = vspin;
	init_waitqueue_head(&fresh->wait);

	idx = 0;
	while (idx < NR_CPUS) {
		fresh->prio_ceiling[idx] = OMEGA_CEILING;
		idx++;
	}

	return &fresh->litmus_lock;
}
/* ******************** PCP support ********************** */
/*
 * PCP semaphore, bound to one processor. Also embedded in struct
 * dpcp_semaphore. pcp_from_lock() converts from the embedded litmus_lock.
 */
struct pcp_semaphore {
struct litmus_lock litmus_lock;
/* link in the per-CPU system_ceiling list while the semaphore is held */
struct list_head ceiling;
/* current resource holder */
struct task_struct *owner;
/* priority ceiling --- can be negative due to DPCP support */
int prio_ceiling;
/* on which processor is this PCP semaphore allocated? */
int on_cpu;
};
/* Convert a generic litmus_lock pointer back to its enclosing PCP semaphore. */
static inline struct pcp_semaphore* pcp_from_lock(struct litmus_lock* lock)
{
	struct pcp_semaphore *sem;

	sem = container_of(lock, struct pcp_semaphore, litmus_lock);
	return sem;
}
/* Per-CPU bookkeeping of the PCP: held semaphores and ceiling-blocked jobs. */
struct pcp_state {
/* semaphores currently held on this CPU, linked via pcp_semaphore.ceiling */
struct list_head system_ceiling;
/* highest-priority waiting task */
struct task_struct* hp_waiter;
/* list of jobs waiting to get past the system ceiling */
wait_queue_head_t ceiling_blocked;
};
/*
 * pcp_init_state - reset a per-CPU PCP state
 *
 * Afterwards no semaphore is held, nobody is ceiling-blocked, and there is
 * no highest-priority waiter.
 */
static void pcp_init_state(struct pcp_state* s)
{
	s->hp_waiter = NULL;
	init_waitqueue_head(&s->ceiling_blocked);
	INIT_LIST_HEAD(&s->system_ceiling);
}
/* One PCP state per CPU; set up with pcp_init_state().
 * NOTE(review): the call site of pcp_init_state() is outside the visible
 * part of the file. */
static DEFINE_PER_CPU(struct pcp_state, pcp_state);
/* assumes preemptions are off */
static struct pcp_semaphore* pcp_get_ceiling(void)
{
struct list_head* top = __get_cpu_var(pcp_state).system_ceiling.next;
if (top)
return list_entry(top, struct pcp_semaphore, ceiling);
else
return NULL;
}
/*
 * pcp_add_ceiling - record @sem as held on the local CPU
 *
 * Inserts @sem into the per-CPU system_ceiling list in front of the first
 * entry whose prio_ceiling is greater than or equal to its own, or at the
 * tail if there is no such entry. @sem must belong to the local CPU and
 * must not be on the list already.
 *
 * assumes preempt off
 */
static void pcp_add_ceiling(struct pcp_semaphore* sem)
{
	struct list_head *in_use = &__get_cpu_var(pcp_state).system_ceiling;
	/* default insertion point: the list head, i.e. append at the tail */
	struct list_head *insert_before = in_use;
	struct list_head *cursor;

	BUG_ON(sem->on_cpu != smp_processor_id());
	BUG_ON(in_list(&sem->ceiling));

	list_for_each(cursor, in_use) {
		struct pcp_semaphore *held =
			list_entry(cursor, struct pcp_semaphore, ceiling);

		if (held->prio_ceiling >= sem->prio_ceiling) {
			insert_before = cursor;
			break;
		}
	}

	/* list_add_tail() links the new entry right before the given node */
	list_add_tail(&sem->ceiling, insert_before);
}
/*
 * pcp_exceeds_ceiling - may @task proceed past the system ceiling?
 * @ceiling:        current system ceiling, or NULL if there is none
 * @task:           task that wants to proceed
 * @effective_prio: priority value to compare against the ceiling
 *
 * Returns 1 if there is no ceiling, if @effective_prio is numerically
 * smaller than the ceiling value, or if @task itself owns the ceiling
 * semaphore; 0 otherwise.
 *
 * assumes preempt off
 */
static int pcp_exceeds_ceiling(struct pcp_semaphore* ceiling,
			       struct task_struct* task,
			       int effective_prio)
{
	if (!ceiling)
		return 1;

	if (ceiling->prio_ceiling > effective_prio)
		return 1;

	return ceiling->owner == task;
}
/*
 * pcp_priority_inheritance - recompute PCP priority inheritance locally
 *
 * Clears the inheritance of the current task, of the highest-priority
 * waiter and of the owner of the ceiling semaphore, then lets the owner
 * inherit from the waiter if the waiter has the higher priority. Finally
 * checks whether the changes require a preemption.
 *
 * assumes preempt off
 */
static void pcp_priority_inheritance(void)
{
unsigned long flags;
pfp_domain_t* pfp = local_pfp;
struct pcp_semaphore* ceiling = pcp_get_ceiling();
struct task_struct *blocker, *blocked;
/* blocker: owner of the ceiling semaphore; blocked: top waiter (may be NULL) */
blocker = ceiling ? ceiling->owner : NULL;
blocked = __get_cpu_var(pcp_state).hp_waiter;
raw_spin_lock_irqsave(&pfp->slock, flags);
/* Current is no longer inheriting anything by default. This should be
* the currently scheduled job, and hence not currently queued. */
BUG_ON(current != pfp->scheduled);
/* fp_set_prio_inh() ignores NULL tasks, so no checks are needed here */
fp_set_prio_inh(pfp, current, NULL);
fp_set_prio_inh(pfp, blocked, NULL);
fp_set_prio_inh(pfp, blocker, NULL);
/* Let blocking job inherit priority of blocked job, if required. */
if (blocker && blocked &&
fp_higher_prio(blocked, blocker)) {
TRACE_TASK(blocker, "PCP inherits from %s/%d (prio %u -> %u) \n",
blocked->comm, blocked->pid,
get_priority(blocker), get_priority(blocked));
fp_set_prio_inh(pfp, blocker, blocked);
}
/* check if anything changed */
if (fp_higher_prio(fp_prio_peek(&pfp->ready_queue), pfp->scheduled))
preempt(pfp);
raw_spin_unlock_irqrestore(&pfp->slock, flags);
}
/*
 * pcp_raise_ceiling - acquire @sem under the PCP on the local CPU
 * @sem:            semaphore to acquire
 * @effective_prio: priority with which the request is issued
 *
 * Suspends the caller for as long as it does not exceed the system ceiling.
 * While suspended it sits in the per-CPU ceiling_blocked queue in priority
 * order; if it becomes the highest-priority waiter, inheritance is updated.
 * Once past the ceiling, the caller becomes the owner and @sem is added to
 * the system ceiling list.
 *
 * called with preemptions off; preemptions are re-enabled only around the
 * call to schedule().
 */
static void pcp_raise_ceiling(struct pcp_semaphore* sem,
int effective_prio)
{
struct task_struct* t = current;
struct pcp_semaphore* ceiling;
/* wait-queue entry lives on this stack frame while we are suspended */
prio_wait_queue_t wait;
unsigned int waiting_higher_prio;
do {
ceiling = pcp_get_ceiling();
if (pcp_exceeds_ceiling(ceiling, t, effective_prio))
break;
TRACE_CUR("PCP ceiling-blocked, wanted sem %p, but %s/%d has the ceiling \n",
sem, ceiling->owner->comm, ceiling->owner->pid);
/* we need to wait until the ceiling is lowered */
/* enqueue in priority order */
init_prio_waitqueue_entry(&wait, t, prio_point(effective_prio));
set_task_state(t, TASK_UNINTERRUPTIBLE);
/* zero is taken to mean that no queued waiter precedes us (see below) */
waiting_higher_prio = add_wait_queue_prio_exclusive(
&__get_cpu_var(pcp_state).ceiling_blocked, &wait);
if (waiting_higher_prio == 0) {
TRACE_CUR("PCP new highest-prio waiter => prio inheritance\n");
/* we are the new highest-priority waiting job
* => update inheritance */
__get_cpu_var(pcp_state).hp_waiter = t;
pcp_priority_inheritance();
}
TS_LOCK_SUSPEND;
preempt_enable_no_resched();
schedule();
preempt_disable();
/* pcp_resume_unblocked() removed us from wait queue */
TS_LOCK_RESUME;
} while(1);
TRACE_CUR("PCP got the ceiling and sem %p\n", sem);
/* We are good to go. The semaphore should be available. */
BUG_ON(sem->owner != NULL);
sem->owner = t;
pcp_add_ceiling(sem);
}
/*
 * pcp_resume_unblocked - wake ceiling-blocked jobs that may now proceed
 *
 * Walks the local ceiling_blocked queue from the front (highest priority
 * first), removing and waking every job that exceeds the current system
 * ceiling. The first job that is still blocked becomes the new
 * highest-priority waiter; if the queue runs empty, there is none.
 *
 * The ceiling is sampled once, before the queue lock is taken.
 */
static void pcp_resume_unblocked(void)
{
wait_queue_head_t *blocked = &__get_cpu_var(pcp_state).ceiling_blocked;
unsigned long flags;
prio_wait_queue_t* q;
struct task_struct* t = NULL;
struct pcp_semaphore* ceiling = pcp_get_ceiling();
spin_lock_irqsave(&blocked->lock, flags);
while (waitqueue_active(blocked)) {
/* check first == highest-priority waiting job */
q = list_entry(blocked->task_list.next,
prio_wait_queue_t, wq.task_list);
t = (struct task_struct*) q->wq.private;
/* can it proceed now? => let it go */
/* q->priority holds a prio_point(); convert back before comparing */
if (pcp_exceeds_ceiling(ceiling, t,
prio_from_point(q->priority))) {
__remove_wait_queue(blocked, &q->wq);
wake_up_process(t);
} else {
/* We are done. Update highest-priority waiter. */
__get_cpu_var(pcp_state).hp_waiter = t;
goto out;
}
}
/* If we get here, then there are no more waiting
* jobs. */
__get_cpu_var(pcp_state).hp_waiter = NULL;
out:
spin_unlock_irqrestore(&blocked->lock, flags);
}
/*
 * pcp_lower_ceiling - release @sem under the PCP on the local CPU
 *
 * @sem must be held by the current task, be on the system ceiling list and
 * belong to the local CPU. It is removed from the list and marked free;
 * then jobs that are no longer ceiling-blocked are woken and priority
 * inheritance is recomputed.
 *
 * assumes preempt off
 */
static void pcp_lower_ceiling(struct pcp_semaphore* sem)
{
BUG_ON(!in_list(&sem->ceiling));
BUG_ON(sem->owner != current);
BUG_ON(sem->on_cpu != smp_processor_id());
/* remove from ceiling list */
list_del(&sem->ceiling);
/* release */
sem->owner = NULL;
TRACE_CUR("PCP released sem %p\n", sem);
/* Wake up all ceiling-blocked jobs that now pass the ceiling. */
pcp_resume_unblocked();
/* must follow pcp_resume_unblocked(), which updates hp_waiter */
pcp_priority_inheritance();
}
/*
 * pcp_update_prio_ceiling - lower the ceiling of @sem to @effective_prio
 *
 * The ceiling only ever decreases numerically: it is replaced when
 * @effective_prio is smaller than the current value and left alone
 * otherwise.
 *
 * This needs to be synchronized on something, so the wait-queue lock of the
 * semaphore's processor is used. The update is assumed to happen only
 * before the task set starts execution (i.e., during initialization), but
 * it may happen on multiple processors at the same time.
 */
static void pcp_update_prio_ceiling(struct pcp_semaphore* sem,
				    int effective_prio)
{
	struct pcp_state *state = &per_cpu(pcp_state, sem->on_cpu);
	unsigned long irq_flags;

	spin_lock_irqsave(&state->ceiling_blocked.lock, irq_flags);

	if (effective_prio < sem->prio_ceiling)
		sem->prio_ceiling = effective_prio;

	spin_unlock_irqrestore(&state->ceiling_blocked.lock, irq_flags);
}
/*
 * pcp_init_semaphore - initialise the PCP part of a semaphore
 * @sem: semaphore to set up
 * @cpu: processor the semaphore is bound to
 *
 * The semaphore starts out free, off the system ceiling list, and with the
 * numerically largest ceiling (INT_MAX), which pcp_update_prio_ceiling()
 * can only lower.
 */
static void pcp_init_semaphore(struct pcp_semaphore* sem, int cpu)
{
	sem->on_cpu = cpu;
	sem->prio_ceiling = INT_MAX;
	sem->owner = NULL;
	INIT_LIST_HEAD(&sem->ceiling);
}
/*
 * pfp_pcp_lock - acquire a PCP semaphore
 * @l: the litmus_lock embedded in a struct pcp_semaphore
 *
 * The request is issued with the caller's agent priority. The semaphore
 * must be bound to the caller's own partition.
 *
 * Returns 0 on success, -EPERM if the caller is not a real-time task or
 * belongs to a different partition than the semaphore.
 */
int pfp_pcp_lock(struct litmus_lock* l)
{
	struct task_struct *self = current;
	struct pcp_semaphore *sem = pcp_from_lock(l);
	int eprio = effective_agent_priority(get_priority(self));

	if (!is_realtime(self))
		return -EPERM;

	/* local resource only */
	if (get_partition(self) != sem->on_cpu)
		return -EPERM;

	preempt_disable();
	pcp_raise_ceiling(sem, eprio);
	preempt_enable();

	return 0;
}
/*
 * pfp_pcp_unlock - release a PCP semaphore
 * @l: the litmus_lock embedded in a struct pcp_semaphore
 *
 * Returns 0 on success, -EINVAL if the semaphore is bound to another
 * processor or is not held by the caller.
 */
int pfp_pcp_unlock(struct litmus_lock* l)
{
	struct pcp_semaphore *sem = pcp_from_lock(l);
	struct task_struct *self = current;
	int err = 0;

	preempt_disable();

	if (sem->on_cpu == smp_processor_id() && sem->owner == self) {
		/* give it back */
		pcp_lower_ceiling(sem);
	} else {
		err = -EINVAL;
	}

	preempt_enable();

	return err;
}
/*
 * pfp_pcp_open - register the calling task as a user of a PCP semaphore
 * @l:      the litmus_lock embedded in a struct pcp_semaphore
 * @config: user-space pointer to an int holding the expected processor
 *
 * Verifies that the processor given by user space matches the one the
 * semaphore is bound to and lowers the semaphore's ceiling to the caller's
 * agent priority if necessary.
 *
 * Returns 0 on success, -EPERM if the caller is not a real-time task,
 * -EFAULT if @config cannot be read, -EINVAL on a processor mismatch.
 */
int pfp_pcp_open(struct litmus_lock* l, void* __user config)
{
struct task_struct *t = current;
struct pcp_semaphore *sem = pcp_from_lock(l);
int cpu, eprio;
if (!is_realtime(t))
/* we need to know the real-time priority */
return -EPERM;
/* copy the requested processor id in from user space */
if (get_user(cpu, (int*) config))
return -EFAULT;
/* make sure the resource location matches */
if (cpu != sem->on_cpu)
return -EINVAL;
eprio = effective_agent_priority(get_priority(t));
pcp_update_prio_ceiling(sem, eprio);
return 0;
}
/*
 * pfp_pcp_close - close hook of the PCP semaphore
 *
 * If the caller runs on the semaphore's processor and still owns the
 * semaphore, it is released on the caller's behalf. Ownership is sampled
 * with preemptions disabled.
 *
 * Always returns 0.
 */
int pfp_pcp_close(struct litmus_lock* l)
{
	struct pcp_semaphore *sem = pcp_from_lock(l);
	struct task_struct *self = current;
	int held_by_caller;

	preempt_disable();
	held_by_caller = sem->on_cpu == smp_processor_id() &&
			 sem->owner == self;
	preempt_enable();

	if (!held_by_caller)
		return 0;

	pfp_pcp_unlock(l);
	return 0;
}
/* Deallocate hook: frees the PCP semaphore that embeds @lock. */
void pfp_pcp_free(struct litmus_lock* lock)
{
	struct pcp_semaphore *sem = pcp_from_lock(lock);

	kfree(sem);
}
/* Operations table installed on every PCP semaphore by pfp_new_pcp(). */
static struct litmus_lock_ops pfp_pcp_lock_ops = {
.close = pfp_pcp_close,
.lock = pfp_pcp_lock,
/* .open validates the processor and lowers the priority ceiling */
.open = pfp_pcp_open,
.unlock = pfp_pcp_unlock,
.deallocate = pfp_pcp_free,
};
/*
 * pfp_new_pcp - allocate and initialise a PCP semaphore
 * @on_cpu: processor the semaphore is bound to
 *
 * Returns the embedded litmus_lock of the new semaphore, or NULL if the
 * allocation fails. The memory is released again by pfp_pcp_free().
 */
static struct litmus_lock* pfp_new_pcp(int on_cpu)
{
	struct pcp_semaphore *fresh = kmalloc(sizeof(*fresh), GFP_KERNEL);

	if (fresh == NULL)
		return NULL;

	fresh->litmus_lock.ops = &pfp_pcp_lock_ops;
	pcp_init_semaphore(fresh, on_cpu);

	return &fresh->litmus_lock;
}
/* ******************** DPCP support ********************** */
/*
 * DPCP semaphore: a PCP semaphore on a fixed processor to which requesting
 * tasks migrate. dpcp_from_lock() converts from the outer litmus_lock.
 */
struct dpcp_semaphore {
/* lock object seen by generic code */
struct litmus_lock litmus_lock;
/* the PCP semaphore that is acquired on the resource's processor */
struct pcp_semaphore pcp;
/* partition the current holder came from; it migrates back on unlock */
int owner_cpu;
};
/* Convert a generic litmus_lock pointer back to its enclosing DPCP semaphore. */
static inline struct dpcp_semaphore* dpcp_from_lock(struct litmus_lock* lock)
{
	struct dpcp_semaphore *sem;

	sem = container_of(lock, struct dpcp_semaphore, litmus_lock);
	return sem;
}
/*
 * pfp_migrate_to - move the current task to partition @target_cpu
 *
 * called with preemptions disabled
 *
 * Does nothing if the task already belongs to @target_cpu. Otherwise the
 * task's partition is switched under the scheduling lock of its current
 * domain and the task deschedules itself; pfp_schedule() then sees the
 * partition mismatch and pfp_finish_switch() queues the task on the target
 * processor. On return the task runs on @target_cpu with preemptions
 * disabled again.
 */
static void pfp_migrate_to(int target_cpu)
{
struct task_struct* t = current;
pfp_domain_t *from;
if (get_partition(t) == target_cpu)
return;
/* make sure target_cpu makes sense */
BUG_ON(!cpu_online(target_cpu));
local_irq_disable();
/* scheduled task should not be in any ready or release queue */
BUG_ON(is_queued(t));
/* only the source domain needs to be locked to switch partitions */
from = task_pfp(t);
raw_spin_lock(&from->slock);
/* switch partitions */
tsk_rt(t)->task_params.cpu = target_cpu;
raw_spin_unlock(&from->slock);
/* Don't trace scheduler costs as part of
* locking overhead. Scheduling costs are accounted for
* explicitly. */
TS_LOCK_SUSPEND;
local_irq_enable();
preempt_enable_no_resched();
/* deschedule to be migrated */
schedule();
/* we are now on the target processor */
preempt_disable();
/* start recording costs again */
TS_LOCK_RESUME;
BUG_ON(smp_processor_id() != target_cpu);
}
/*
 * pfp_dpcp_lock - acquire a DPCP semaphore
 * @l: the litmus_lock embedded in a struct dpcp_semaphore
 *
 * The caller is priority-boosted, migrates to the processor the resource
 * is bound to, and acquires the embedded PCP semaphore there with its agent
 * priority. The partition it came from is remembered in owner_cpu so that
 * pfp_dpcp_unlock() can send it back.
 *
 * Returns 0 on success, -EPERM if the caller is not a real-time task.
 */
int pfp_dpcp_lock(struct litmus_lock* l)
{
struct task_struct* t = current;
struct dpcp_semaphore *sem = dpcp_from_lock(l);
int eprio = effective_agent_priority(get_priority(t));
/* home partition, sampled before any migration takes place */
int from = get_partition(t);
int to = sem->pcp.on_cpu;
if (!is_realtime(t))
return -EPERM;
preempt_disable();
/* Priority-boost ourself *before* we suspend so that
* our priority is boosted when we resume. */
boost_priority(t, get_priority(t));
pfp_migrate_to(to);
pcp_raise_ceiling(&sem->pcp, eprio);
/* yep, we got it => execute request */
sem->owner_cpu = from;
preempt_enable();
return 0;
}
/*
 * pfp_dpcp_unlock - ->unlock hook for DPCP semaphores.
 * @l: generic lock embedded in the DPCP semaphore
 *
 * Only valid when called by the current owner on the CPU the semaphore is
 * bound to. Lowers the PCP ceiling, drops the priority boost and migrates
 * the caller back to the partition recorded in owner_cpu.
 *
 * Returns 0 on success or -EINVAL if the caller is on a different CPU or
 * does not own the semaphore.
 */
int pfp_dpcp_unlock(struct litmus_lock* l)
{
	struct task_struct *self = current;
	struct dpcp_semaphore *dpcp = dpcp_from_lock(l);
	int ret = 0;
	int return_cpu;

	preempt_disable();

	if (dpcp->pcp.on_cpu == smp_processor_id() && dpcp->pcp.owner == self) {
		/* read the home partition before the resource is released */
		return_cpu = dpcp->owner_cpu;

		/* release the resource */
		pcp_lower_ceiling(&dpcp->pcp);

		/* the boost only lasts for the duration of the request */
		unboost_priority(self);

		pfp_migrate_to(return_cpu);
	} else {
		ret = -EINVAL;
	}

	preempt_enable();
	return ret;
}
/*
 * pfp_dpcp_open - ->open hook for DPCP semaphores.
 * @l:      generic lock embedded in the DPCP semaphore
 * @config: user-space pointer to an int naming the CPU the caller
 *          believes the resource is located on
 *
 * Folds the caller's agent priority into the semaphore's priority ceiling.
 *
 * Returns 0 on success, -EPERM if the caller is not a real-time task,
 * -EFAULT if @config cannot be read, or -EINVAL if the CPU read from
 * @config differs from the semaphore's pcp.on_cpu.
 */
int pfp_dpcp_open(struct litmus_lock* l, void* __user config)
{
	struct task_struct *self = current;
	struct dpcp_semaphore *dpcp = dpcp_from_lock(l);
	int requested_cpu;

	/* the ceiling is derived from the caller's real-time priority */
	if (!is_realtime(self))
		return -EPERM;

	if (get_user(requested_cpu, (int*) config))
		return -EFAULT;

	/* the caller must agree with the semaphore on the resource location */
	if (requested_cpu != dpcp->pcp.on_cpu)
		return -EINVAL;

	pcp_update_prio_ceiling(&dpcp->pcp,
				effective_agent_priority(get_priority(self)));
	return 0;
}
/*
 * pfp_dpcp_close - ->close hook for DPCP semaphores.
 * @l: generic lock embedded in the DPCP semaphore
 *
 * If the caller still owns the semaphore and is running on the CPU the
 * semaphore is bound to, the semaphore is released via pfp_dpcp_unlock().
 *
 * Always returns 0.
 */
int pfp_dpcp_close(struct litmus_lock* l)
{
	struct task_struct *self = current;
	struct dpcp_semaphore *dpcp = dpcp_from_lock(l);
	int still_held;

	/* sample CPU and ownership with preemption off */
	preempt_disable();
	still_held = dpcp->pcp.on_cpu == smp_processor_id() &&
		     dpcp->pcp.owner == self;
	preempt_enable();

	if (still_held)
		pfp_dpcp_unlock(l);

	return 0;
}
/*
 * pfp_dpcp_free - ->deallocate hook for DPCP semaphores.
 * @lock: generic lock embedded in the semaphore to release
 *
 * Frees the enclosing struct dpcp_semaphore with kfree().
 */
void pfp_dpcp_free(struct litmus_lock* lock)
{
	struct dpcp_semaphore *dpcp = dpcp_from_lock(lock);

	kfree(dpcp);
}
/*
 * Operations table for DPCP semaphores; pfp_new_dpcp() installs it as
 * litmus_lock.ops on every semaphore it creates.
 */
static struct litmus_lock_ops pfp_dpcp_lock_ops = {
	.open       = pfp_dpcp_open,
	.lock       = pfp_dpcp_lock,
	.unlock     = pfp_dpcp_unlock,
	.close      = pfp_dpcp_close,
	.deallocate = pfp_dpcp_free,
};
/*
 * pfp_new_dpcp - allocate and set up a DPCP semaphore.
 * @on_cpu: CPU passed through to pcp_init_semaphore() for the embedded
 *          PCP semaphore
 *
 * The new semaphore starts without a recorded home partition
 * (owner_cpu == NO_CPU).
 *
 * Returns the generic lock embedded in the new semaphore, or NULL if
 * kmalloc() fails.
 */
static struct litmus_lock* pfp_new_dpcp(int on_cpu)
{
	struct dpcp_semaphore *dpcp = kmalloc(sizeof(*dpcp), GFP_KERNEL);

	if (dpcp == NULL)
		return NULL;

	dpcp->litmus_lock.ops = &pfp_dpcp_lock_ops;
	dpcp->owner_cpu = NO_CPU;
	pcp_init_semaphore(&dpcp->pcp, on_cpu);

	return &dpcp->litmus_lock;
}
/* **** lock constructor **** */
/*
 * pfp_read_lock_cpu - fetch and validate the CPU argument of a lock request.
 * @config: user-space pointer to an int holding the CPU number
 *
 * The value comes straight from user space, so it is range-checked before
 * it is passed to cpu_online(); an out-of-range value must never be used
 * to test a bit of the online mask.
 *
 * Returns the validated CPU number (>= 0) on success, -EFAULT if @config
 * cannot be read, or -EINVAL if the value is outside [0, NR_CPUS) or names
 * a CPU that is not online.
 */
static int pfp_read_lock_cpu(void* __user config)
{
	int cpu;

	if (get_user(cpu, (int __user *) config))
		return -EFAULT;

	if (cpu < 0 || cpu >= NR_CPUS || !cpu_online(cpu))
		return -EINVAL;

	return cpu;
}

/*
 * pfp_allocate_lock - construct a lock object for the requested protocol.
 * @lock:   out parameter; holds the new lock when 0 is returned
 * @type:   protocol identifier (FMLP_SEM, MPCP_SEM, MPCP_VS_SEM, DPCP_SEM,
 *          SRP_SEM or PCP_SEM)
 * @config: user-space pointer; for DPCP_SEM and PCP_SEM it must point to
 *          an int naming the CPU the resource is bound to
 *
 * Returns 0 on success, -ENOMEM if the lock cannot be allocated, -EFAULT
 * if @config cannot be read, -EINVAL if the CPU read from @config is out
 * of range or offline, or -ENXIO if @type is not handled here.
 */
static long pfp_allocate_lock(struct litmus_lock **lock, int type,
			      void* __user config)
{
	int err = -ENXIO, cpu;
	struct srp_semaphore* srp;

	switch (type) {
	case FMLP_SEM:
		/* FIFO Mutex Locking Protocol */
		*lock = pfp_new_fmlp();
		err = *lock ? 0 : -ENOMEM;
		break;

	case MPCP_SEM:
		/* Multiprocessor Priority Ceiling Protocol */
		*lock = pfp_new_mpcp(0);
		err = *lock ? 0 : -ENOMEM;
		break;

	case MPCP_VS_SEM:
		/* Multiprocessor Priority Ceiling Protocol with virtual spinning */
		*lock = pfp_new_mpcp(1);
		err = *lock ? 0 : -ENOMEM;
		break;

	case DPCP_SEM:
		/* Distributed Priority Ceiling Protocol */
		cpu = pfp_read_lock_cpu(config);
		if (cpu < 0)
			return cpu;
		*lock = pfp_new_dpcp(cpu);
		err = *lock ? 0 : -ENOMEM;
		break;

	case SRP_SEM:
		/* Baker's Stack Resource Policy */
		srp = allocate_srp_semaphore();
		if (srp) {
			*lock = &srp->litmus_lock;
			err = 0;
		} else {
			err = -ENOMEM;
		}
		break;

	case PCP_SEM:
		/* Priority Ceiling Protocol */
		cpu = pfp_read_lock_cpu(config);
		if (cpu < 0)
			return cpu;
		*lock = pfp_new_pcp(cpu);
		err = *lock ? 0 : -ENOMEM;
		break;

	default:
		/* unsupported protocol: err is still -ENXIO */
		break;
	}

	return err;
}
#endif
/*
 * pfp_admit_task - decide whether @tsk may become a P-FP real-time task.
 * @tsk: candidate task
 *
 * The task is admitted only if it currently runs on the CPU named in its
 * task parameters, that CPU is not the release master (when
 * CONFIG_RELEASE_MASTER is set), and its priority is accepted by
 * litmus_is_valid_fixed_prio().
 *
 * Returns 0 if the task is admitted, -EINVAL otherwise.
 */
static long pfp_admit_task(struct task_struct* tsk)
{
	/* the task must already be on its assigned partition */
	if (task_cpu(tsk) != tsk->rt_param.task_params.cpu)
		return -EINVAL;

#ifdef CONFIG_RELEASE_MASTER
	/* don't allow tasks on release master CPU */
	if (task_cpu(tsk) == remote_dom(task_cpu(tsk))->release_master)
		return -EINVAL;
#endif

	if (!litmus_is_valid_fixed_prio(get_priority(tsk)))
		return -EINVAL;

	return 0;
}
/*
 * pfp_activate_plugin - ->activate_plugin hook of the P-FP plugin.
 *
 * With CONFIG_RELEASE_MASTER, copies the current release master CPU into
 * the domain of every online CPU. With CONFIG_LITMUS_LOCKING, installs
 * pfp_get_srp_prio as the SRP priority hook and resets the per-CPU
 * locking state (MPCP-VS wait queue and spinner, PCP state, pfp_doms[]
 * entry) of every online CPU.
 *
 * Always returns 0.
 */
static long pfp_activate_plugin(void)
{
#if defined(CONFIG_RELEASE_MASTER) || defined(CONFIG_LITMUS_LOCKING)
	int i;
#endif

#ifdef CONFIG_RELEASE_MASTER
	for_each_online_cpu(i)
		remote_dom(i)->release_master = atomic_read(&release_master_cpu);
#endif

#ifdef CONFIG_LITMUS_LOCKING
	get_srp_prio = pfp_get_srp_prio;

	for_each_online_cpu(i) {
		/* MPCP with virtual spinning: empty wait queue, no spinner */
		init_waitqueue_head(&per_cpu(mpcpvs_vspin_wait, i));
		per_cpu(mpcpvs_vspin, i) = NULL;

		/* PCP/DPCP: fresh ceiling state and domain lookup entry */
		pcp_init_state(&per_cpu(pcp_state, i));
		pfp_doms[i] = remote_pfp(i);
	}
#endif

	return 0;
}
/* P-FP plugin descriptor; init_pfp() passes it to register_sched_plugin(). */
static struct sched_plugin pfp_plugin __cacheline_aligned_in_smp = {
	.plugin_name		= "P-FP",

	/* plugin activation */
	.activate_plugin	= pfp_activate_plugin,

	/* task admission and life cycle */
	.admit_task		= pfp_admit_task,
	.task_new		= pfp_task_new,
	.task_exit		= pfp_task_exit,

	/* scheduling callbacks */
	.schedule		= pfp_schedule,
	.tick			= pfp_tick,
	.task_wake_up		= pfp_task_wake_up,
	.task_block		= pfp_task_block,
	.complete_job		= complete_job,

#ifdef CONFIG_LITMUS_LOCKING
	/* locking protocol support */
	.finish_switch		= pfp_finish_switch,
	.allocate_lock		= pfp_allocate_lock,
#endif
};
/*
 * init_pfp - module initializer of the P-FP plugin.
 *
 * Initializes the P-FP domain of every CPU that is online at this point
 * and registers the plugin via register_sched_plugin().
 *
 * CPU hotplug is not really supported: the domain of a CPU that comes
 * online later is never initialized here.
 *
 * Returns the result of register_sched_plugin().
 */
static int __init init_pfp(void)
{
	int cpu;

	/* Walk the online mask rather than counting from 0 up to
	 * num_online_cpus(): online CPU ids are not guaranteed to be
	 * contiguous, so counting could initialize the domain of an
	 * offline CPU and skip an online one.
	 */
	for_each_online_cpu(cpu)
		pfp_domain_init(remote_pfp(cpu), cpu);

	return register_sched_plugin(&pfp_plugin);
}
module_init(init_pfp);