/*
 * PI semaphores and SRP implementations.
 * Much of the code here is borrowed from include/asm-i386/semaphore.h.
 *
 * NOTE: This implementation is very much a prototype and horribly insecure. It
 *       is intended to be a proof of concept, not a feature-complete solution.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/* ************************************************************************** */
/*                           PRIORITY INHERITANCE                             */
/* ************************************************************************** */

static void* create_pi_semaphore(void)
{
	struct pi_semaphore* sem;
	int i;

	sem = kmalloc(sizeof(struct pi_semaphore), GFP_KERNEL);
	if (!sem)
		return NULL;
	atomic_set(&sem->count, 1);
	sem->sleepers = 0;
	init_waitqueue_head(&sem->wait);
	sem->hp.task = NULL;
	sem->holder  = NULL;
	for (i = 0; i < NR_CPUS; i++)
		sem->hp.cpu_task[i] = NULL;
	return sem;
}

static void destroy_pi_semaphore(void* sem)
{
	/* XXX assert invariants */
	kfree(sem);
}

struct fdso_ops pi_sem_ops = {
	.create  = create_pi_semaphore,
	.destroy = destroy_pi_semaphore
};

struct wq_pair {
	struct task_struct*  tsk;
	struct pi_semaphore* sem;
};

static int rt_pi_wake_up(wait_queue_t *wait, unsigned mode, int sync,
			 void *key)
{
	struct wq_pair* wqp = (struct wq_pair*) wait->private;

	set_rt_flags(wqp->tsk, RT_F_EXIT_SEM);
	curr_sched_plugin->inherit_priority(wqp->sem, wqp->tsk);
	TRACE_TASK(wqp->tsk,
		   "woken up by rt_pi_wake_up() (RT_F_EXIT_SEM, PI)\n");
	/* point to task for default_wake_function() */
	wait->private = wqp->tsk;
	default_wake_function(wait, mode, sync, key);

	/* Always return true: if we encountered a task that was already
	 * running, then this wake-up raced with the schedule() in
	 * do_pi_down(). In that case the task in do_pi_down() will be
	 * scheduled immediately and will own the lock. We must not wake up
	 * another task in any case.
	 */
	return 1;
}

/* caller is responsible for locking */
int edf_set_hp_task(struct pi_semaphore *sem)
{
	struct list_head   *tmp, *next;
	struct task_struct *queued;
	int ret = 0;

	sem->hp.task = NULL;
	list_for_each_safe(tmp, next, &sem->wait.task_list) {
		queued = ((struct wq_pair*)
			  list_entry(tmp, wait_queue_t,
				     task_list)->private)->tsk;

		/* Compare task prios, find high prio task. */
		if (edf_higher_prio(queued, sem->hp.task)) {
			sem->hp.task = queued;
			ret = 1;
		}
	}
	return ret;
}

/* caller is responsible for locking */
int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu)
{
	struct list_head   *tmp, *next;
	struct task_struct *queued;
	int ret = 0;

	sem->hp.cpu_task[cpu] = NULL;
	list_for_each_safe(tmp, next, &sem->wait.task_list) {
		queued = ((struct wq_pair*)
			  list_entry(tmp, wait_queue_t,
				     task_list)->private)->tsk;

		/* Compare task prios, find high prio task. */
		if (get_partition(queued) == cpu &&
		    edf_higher_prio(queued, sem->hp.cpu_task[cpu])) {
			sem->hp.cpu_task[cpu] = queued;
			ret = 1;
		}
	}
	return ret;
}
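/* Illustration only: one way a scheduler plugin could use the helpers above.
 * A return_priority()-style hook might recompute the highest-priority waiter
 * with edf_set_hp_task() when the lock is handed over. The function below is
 * a sketch under that assumption, not part of the plugin interface; real
 * implementations live in the scheduler plugins, not in this file.
 */
#if 0
static void example_return_priority(struct pi_semaphore *sem)
{
	/* Caller must hold sem->wait.lock, as required by edf_set_hp_task(). */
	if (edf_set_hp_task(sem))
		/* sem->hp.task now points at the highest-priority waiter; a
		 * real plugin would adjust the holder's effective priority
		 * here before it passes the lock on.
		 */
		TRACE_TASK(sem->hp.task, "is the new hp waiter\n");
}
#endif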
int do_pi_down(struct pi_semaphore* sem)
{
	unsigned long flags;
	struct task_struct *tsk = current;
	struct wq_pair pair;
	int suspended = 1;
	wait_queue_t wait = {
		.private   = &pair,
		.func      = rt_pi_wake_up,
		.task_list = {NULL, NULL}
	};

	pair.tsk = tsk;
	pair.sem = sem;
	spin_lock_irqsave(&sem->wait.lock, flags);

	if (atomic_dec_return(&sem->count) < 0 ||
	    waitqueue_active(&sem->wait)) {
		/* we need to suspend */
		tsk->state = TASK_UNINTERRUPTIBLE;
		add_wait_queue_exclusive_locked(&sem->wait, &wait);

		TRACE_CUR("suspends on PI lock %p\n", sem);
		curr_sched_plugin->pi_block(sem, tsk);

		/* release lock before sleeping */
		spin_unlock_irqrestore(&sem->wait.lock, flags);

		TS_PI_DOWN_END;
		preempt_enable_no_resched();

		/* We rely on the FIFO order of the wait queue: since there is
		 * exactly one wake-up per release, we do not need to recheck
		 * the semaphore state after waking up; we are guaranteed to
		 * own the lock.
		 */
		schedule();

		TRACE_CUR("woke up, now owns PI lock %p\n", sem);

		/* try_to_wake_up() set our state to TASK_RUNNING,
		 * all we need to do is to remove our wait queue entry
		 */
		remove_wait_queue(&sem->wait, &wait);
	} else {
		/* No priority inheritance necessary, since there are no
		 * queued tasks.
		 */
		suspended = 0;
		TRACE_CUR("acquired PI lock %p, no contention\n", sem);
		sem->holder  = tsk;
		sem->hp.task = tsk;
		curr_sched_plugin->inherit_priority(sem, tsk);
		spin_unlock_irqrestore(&sem->wait.lock, flags);
	}
	return suspended;
}

void do_pi_up(struct pi_semaphore* sem)
{
	unsigned long flags;

	spin_lock_irqsave(&sem->wait.lock, flags);

	TRACE_CUR("releases PI lock %p\n", sem);
	curr_sched_plugin->return_priority(sem);
	sem->holder = NULL;
	if (atomic_inc_return(&sem->count) < 1)
		/* there is a task queued */
		wake_up_locked(&sem->wait);

	spin_unlock_irqrestore(&sem->wait.lock, flags);
}

asmlinkage long sys_pi_down(int sem_od)
{
	long ret = 0;
	struct pi_semaphore* sem;
	int suspended = 0;

	preempt_disable();
	TS_PI_DOWN_START;

	sem = lookup_pi_sem(sem_od);
	if (sem)
		suspended = do_pi_down(sem);
	else
		ret = -EINVAL;

	if (!suspended) {
		TS_PI_DOWN_END;
		preempt_enable();
	}

	return ret;
}

asmlinkage long sys_pi_up(int sem_od)
{
	long ret = 0;
	struct pi_semaphore* sem;

	preempt_disable();
	TS_PI_UP_START;

	sem = lookup_pi_sem(sem_od);
	if (sem)
		do_pi_up(sem);
	else
		ret = -EINVAL;

	TS_PI_UP_END;
	preempt_enable();

	return ret;
}

/* Clear wait queue and wakeup waiting tasks, and free semaphore. */
/*
asmlinkage long sys_pi_sema_free(int sem_id)
{
	struct list_head *tmp, *next;
	unsigned long flags;

	if (sem_id < 0 || sem_id >= MAX_PI_SEMAPHORES)
		return -EINVAL;

	if (!pi_sems[sem_id].used)
		return -EINVAL;

	spin_lock_irqsave(&pi_sems[sem_id].wait.lock, flags);
	if (waitqueue_active(&pi_sems[sem_id].wait)) {
		list_for_each_safe(tmp, next,
				   &pi_sems[sem_id].wait.task_list) {
			wait_queue_t *curr = list_entry(tmp, wait_queue_t,
							task_list);
			list_del(tmp);
			set_rt_flags((struct task_struct*)curr->private,
				     RT_F_EXIT_SEM);
			curr->func(curr,
				   TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
				   0, NULL);
		}
	}

	spin_unlock_irqrestore(&pi_sems[sem_id].wait.lock, flags);
	pi_sems[sem_id].used = 0;

	return 0;
}
*/
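/* Typical PI semaphore call sequence from a real-time task's perspective,
 * for illustration only. The od_open()/od_close() object-descriptor helpers,
 * the PI_SEM constant, and the pi_down()/pi_up() wrappers around
 * sys_pi_down()/sys_pi_up() are placeholders; the userspace API is not
 * defined in this file.
 *
 *	int od = od_open(fd, PI_SEM, 0);  // attach shared semaphore object
 *	pi_down(od);                      // may suspend; the holder inherits
 *	                                  // this task's priority while we wait
 *	// ... critical section ...
 *	pi_up(od);                        // wakes at most one waiter (FIFO)
 *	od_close(od);
 */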
/* ************************************************************************** */
/*                           STACK RESOURCE POLICY                            */
/* ************************************************************************** */

struct srp_priority {
	struct list_head list;
	unsigned int     period;
	pid_t            pid;
};

#define list2prio(l) list_entry(l, struct srp_priority, list)

/* SRP task priority comparison function. Smaller periods have higher
 * priority, ties are broken by PID. Special case: period == 0 <=> no priority.
 */
static int srp_higher_prio(struct srp_priority* first,
			   struct srp_priority* second)
{
	if (!first->period)
		return 0;
	else
		return !second->period ||
			first->period < second->period ||
			(first->period == second->period &&
			 first->pid < second->pid);
}

struct srp {
	struct list_head  ceiling;
	wait_queue_head_t ceiling_blocked;
};

DEFINE_PER_CPU(struct srp, srp);

#define system_ceiling(srp) list2prio(srp->ceiling.next)

static int srp_exceeds_ceiling(struct task_struct* first, struct srp* srp)
{
	return list_empty(&srp->ceiling) ||
	       get_rt_period(first) < system_ceiling(srp)->period ||
	       (get_rt_period(first) == system_ceiling(srp)->period &&
		first->pid < system_ceiling(srp)->pid);
}

static void srp_add_prio(struct srp* srp, struct srp_priority* prio)
{
	struct list_head *pos;

	if (in_list(&prio->list)) {
		TRACE_CUR("WARNING: SRP violation detected, prio is already in "
			  "ceiling list!\n");
		return;
	}
	list_for_each(pos, &srp->ceiling)
		if (unlikely(srp_higher_prio(prio, list2prio(pos)))) {
			__list_add(&prio->list, pos->prev, pos);
			return;
		}

	list_add_tail(&prio->list, &srp->ceiling);
}

/* struct for uniprocessor SRP "semaphore" */
struct srp_semaphore {
	struct srp_priority ceiling;
	int cpu;     /* cpu associated with this "semaphore" and resource */
	int claimed; /* is the resource claimed (ceiling should be used)? */
};

static void* create_srp_semaphore(void)
{
	struct srp_semaphore* sem;

	if (!is_realtime(current))
		/* XXX log error */
		return NULL;

	sem = kmalloc(sizeof(*sem), GFP_KERNEL);
	if (!sem)
		return NULL;

	INIT_LIST_HEAD(&sem->ceiling.list);
	sem->ceiling.period = 0;
	sem->claimed = 0;
	sem->cpu = get_partition(current);
	return sem;
}

static void destroy_srp_semaphore(void* sem)
{
	/* XXX invariants */
	kfree(sem);
}

struct fdso_ops srp_sem_ops = {
	.create  = create_srp_semaphore,
	.destroy = destroy_srp_semaphore
};

/* Initialize SRP semaphores at boot time. */
static int __init srp_sema_boot_init(void)
{
	int i;

	printk("Initializing SRP per-CPU ceilings...");
	for (i = 0; i < NR_CPUS; i++) {
		init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked);
		INIT_LIST_HEAD(&per_cpu(srp, i).ceiling);
	}
	printk(" done!\n");

	return 0;
}
__initcall(srp_sema_boot_init);

void do_srp_down(struct srp_semaphore* sem)
{
	/* claim... */
	sem->claimed = 1;
	/* ...and update the ceiling */
	srp_add_prio(&__get_cpu_var(srp), &sem->ceiling);
}

void do_srp_up(struct srp_semaphore* sem)
{
	sem->claimed = 0;

	/* Determine new system priority ceiling for this CPU. */
	if (in_list(&sem->ceiling.list))
		list_del(&sem->ceiling.list);
	else
		TRACE_CUR("WARNING: SRP violation detected, prio not in ceiling"
			  " list!\n");

	/* Wake tasks on this CPU, if they now exceed the current ceiling. */
	wake_up_all(&__get_cpu_var(srp).ceiling_blocked);
}

/* Adjust the system-wide priority ceiling if the resource is claimed. */
asmlinkage long sys_srp_down(int sem_od)
{
	int cpu;
	int ret = -EINVAL;
	struct srp_semaphore* sem;

	/* Disabling preemptions is sufficient protection since SRP is
	 * strictly per CPU and we don't interfere with any interrupt
	 * handlers.
	 */
	preempt_disable();
	TS_SRP_DOWN_START;

	cpu = smp_processor_id();
	sem = lookup_srp_sem(sem_od);
	if (sem && sem->cpu == cpu) {
		do_srp_down(sem);
		ret = 0;
	}

	TS_SRP_DOWN_END;
	preempt_enable();
	return ret;
}
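/* Worked ordering example, for illustration only: suppose a period-10 task
 * (pid 100) and a period-20 task (pid 200) on the same CPU both register for
 * a semaphore, so its static ceiling becomes (period 10, pid 100). While the
 * period-20 task holds the semaphore, sys_srp_down() keeps that ceiling on
 * the per-CPU list, and a newly released job of the period-10 task does not
 * pass srp_exceeds_ceiling() (its period only equals the ceiling and the PID
 * tie-break fails), so it waits in srp_ceiling_block() until sys_srp_up() is
 * called. The values below are made up for this sketch.
 */
#if 0
static void srp_ordering_example(void)
{
	struct srp_priority ceiling = { .period = 10, .pid = 100 };
	struct srp_priority low     = { .period = 20, .pid = 200 };

	BUG_ON(!srp_higher_prio(&ceiling, &low)); /* shorter period wins */
	BUG_ON(srp_higher_prio(&low, &ceiling));  /* ...and not vice versa */
}
#endif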
/* Adjust the system-wide priority ceiling if the resource is freed. */
asmlinkage long sys_srp_up(int sem_od)
{
	int cpu;
	int ret = -EINVAL;
	struct srp_semaphore* sem;

	preempt_disable();
	TS_SRP_UP_START;

	cpu = smp_processor_id();
	sem = lookup_srp_sem(sem_od);

	if (sem && sem->cpu == cpu) {
		do_srp_up(sem);
		ret = 0;
	}

	TS_SRP_UP_END;
	preempt_enable();
	return ret;
}

/* Indicate that a task will use a resource associated with a given
 * semaphore. This must be done *a priori*, before the real-time task system
 * is executed, so it updates only the semaphore's ceiling and does *not*
 * update the current system priority ceiling! (That ceiling would be
 * meaningless anyway, as the SRP breaks without this a priori knowledge.)
 */
asmlinkage long sys_reg_task_srp_sem(int sem_od)
{
	/* FIXME: This whole concept is rather brittle!
	 *        There must be a better solution. Maybe register on
	 *        first reference?
	 */
	struct task_struct *t = current;
	struct srp_priority t_prio;
	struct srp_semaphore* sem;

	sem = lookup_srp_sem(sem_od);

	if (!sem)
		return -EINVAL;

	if (!is_realtime(t))
		return -EPERM;

	if (sem->cpu != get_partition(t))
		return -EINVAL;

	preempt_disable();
	t->rt_param.subject_to_srp = 1;
	t_prio.period = get_rt_period(t);
	t_prio.pid    = t->pid;
	if (srp_higher_prio(&t_prio, &sem->ceiling)) {
		sem->ceiling.period = t_prio.period;
		sem->ceiling.pid    = t_prio.pid;
	}

	preempt_enable();

	return 0;
}

static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync,
		       void *key)
{
	int cpu = smp_processor_id();
	struct task_struct *tsk = wait->private;

	if (cpu != get_partition(tsk))
		TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\n",
			   get_partition(tsk));
	else if (srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
		return default_wake_function(wait, mode, sync, key);
	return 0;
}

/* Wait for the current task's priority to exceed the system-wide priority
 * ceiling. Can be used to determine when it is safe to run a job after its
 * release.
 */
void srp_ceiling_block(void)
{
	struct task_struct *tsk = current;
	wait_queue_t wait = {
		.private   = tsk,
		.func      = srp_wake_up,
		.task_list = {NULL, NULL}
	};

	preempt_disable();
	if (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) {
		tsk->state = TASK_UNINTERRUPTIBLE;
		add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
		TRACE_CUR("is priority ceiling blocked.\n");
		preempt_enable_no_resched();
		schedule();
		/* Access to the CPU var must occur with preemptions disabled,
		 * otherwise Linux debug code complains loudly, even if it is
		 * ok here.
		 */
		preempt_disable();
		TRACE_CUR("finally exceeds system ceiling.\n");
		remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
		preempt_enable();
	} else {
		TRACE_CUR("is not priority ceiling blocked\n");
		preempt_enable();
	}
}

/* ************************************************************************** */
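/* End-to-end SRP usage sketch from a partitioned task's perspective, for
 * illustration only. All helper names shown (od_open(), reg_task_srp_sem(),
 * srp_down(), srp_up(), job_done(), sleep_next_period()) and the SRP_SEM
 * constant are placeholders for userspace wrappers around the syscalls
 * above; the actual userspace API is not defined in this file.
 *
 *	int od = od_open(fd, SRP_SEM, 0); // semaphore on this task's partition
 *	reg_task_srp_sem(od);             // a priori, before the real-time
 *	                                  // task system is released
 *	while (!job_done()) {
 *		// the job only starts once its priority exceeds the per-CPU
 *		// ceiling (see srp_ceiling_block() above)
 *		srp_down(od);             // raises this CPU's ceiling
 *		// ... access the shared resource ...
 *		srp_up(od);               // lowers the ceiling and wakes any
 *		                          // ceiling-blocked tasks
 *		sleep_next_period();
 *	}
 */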