#include #include #include #include #include #if defined(CONFIG_LITMUS_AFFINITY_LOCKING) && defined(CONFIG_LITMUS_NVIDIA) #include #include #endif #include // big signed value. #define IKGLP_INVAL_DISTANCE 0x7FFFFFFF int ikglp_max_heap_base_priority_order(struct binheap_node *a, struct binheap_node *b) { ikglp_heap_node_t *d_a = binheap_entry(a, ikglp_heap_node_t, node); ikglp_heap_node_t *d_b = binheap_entry(b, ikglp_heap_node_t, node); BUG_ON(!d_a); BUG_ON(!d_b); return litmus->__compare(d_a->task, BASE, d_b->task, BASE); } int ikglp_min_heap_base_priority_order(struct binheap_node *a, struct binheap_node *b) { ikglp_heap_node_t *d_a = binheap_entry(a, ikglp_heap_node_t, node); ikglp_heap_node_t *d_b = binheap_entry(b, ikglp_heap_node_t, node); return litmus->__compare(d_b->task, BASE, d_a->task, BASE); } int ikglp_donor_max_heap_base_priority_order(struct binheap_node *a, struct binheap_node *b) { ikglp_wait_state_t *d_a = binheap_entry(a, ikglp_wait_state_t, node); ikglp_wait_state_t *d_b = binheap_entry(b, ikglp_wait_state_t, node); return litmus->__compare(d_a->task, BASE, d_b->task, BASE); } int ikglp_min_heap_donee_order(struct binheap_node *a, struct binheap_node *b) { struct task_struct *prio_a, *prio_b; ikglp_donee_heap_node_t *d_a = binheap_entry(a, ikglp_donee_heap_node_t, node); ikglp_donee_heap_node_t *d_b = binheap_entry(b, ikglp_donee_heap_node_t, node); if(!d_a->donor_info) { prio_a = d_a->task; } else { prio_a = d_a->donor_info->task; BUG_ON(d_a->task != d_a->donor_info->donee_info->task); } if(!d_b->donor_info) { prio_b = d_b->task; } else { prio_b = d_b->donor_info->task; BUG_ON(d_b->task != d_b->donor_info->donee_info->task); } // note reversed order return litmus->__compare(prio_b, BASE, prio_a, BASE); } static inline int ikglp_get_idx(struct ikglp_semaphore *sem, struct fifo_queue *queue) { return (queue - &sem->fifo_queues[0]); } static inline struct fifo_queue* ikglp_get_queue(struct ikglp_semaphore *sem, struct task_struct *holder) { int 
i; for(i = 0; i < sem->nr_replicas; ++i) if(sem->fifo_queues[i].owner == holder) return(&sem->fifo_queues[i]); return(NULL); } static struct task_struct* ikglp_find_hp_waiter(struct fifo_queue *kqueue, struct task_struct *skip) { struct list_head *pos; struct task_struct *queued, *found = NULL; list_for_each(pos, &kqueue->wait.task_list) { queued = (struct task_struct*) list_entry(pos, wait_queue_t, task_list)->private; /* Compare task prios, find high prio task. */ if(queued != skip && litmus->compare(queued, found)) found = queued; } return found; } static struct fifo_queue* ikglp_find_shortest(struct ikglp_semaphore *sem, struct fifo_queue *search_start) { // we start our search at search_start instead of at the beginning of the // queue list to load-balance across all resources. struct fifo_queue* step = search_start; struct fifo_queue* shortest = sem->shortest_fifo_queue; do { step = (step+1 != &sem->fifo_queues[sem->nr_replicas]) ? step+1 : &sem->fifo_queues[0]; if(step->count < shortest->count) { shortest = step; if(step->count == 0) break; /* can't get any shorter */ } }while(step != search_start); return(shortest); } static inline struct task_struct* ikglp_mth_highest(struct ikglp_semaphore *sem) { return binheap_top_entry(&sem->top_m, ikglp_heap_node_t, node)->task; } #if 0 static void print_global_list(struct binheap_node* n, int depth) { ikglp_heap_node_t *global_heap_node; char padding[81] = " "; if(n == NULL) { TRACE_CUR("+-> %p\n", NULL); return; } global_heap_node = binheap_entry(n, ikglp_heap_node_t, node); if(depth*2 <= 80) padding[depth*2] = '\0'; TRACE_CUR("%s+-> %s/%d\n", padding, global_heap_node->task->comm, global_heap_node->task->pid); if(n->left) print_global_list(n->left, depth+1); if(n->right) print_global_list(n->right, depth+1); } static void print_donees(struct ikglp_semaphore *sem, struct binheap_node *n, int depth) { ikglp_donee_heap_node_t *donee_node; char padding[81] = " "; struct task_struct* donor = NULL; if(n == NULL) { 
TRACE_CUR("+-> %p\n", NULL); return; } donee_node = binheap_entry(n, ikglp_donee_heap_node_t, node); if(depth*2 <= 80) padding[depth*2] = '\0'; if(donee_node->donor_info) { donor = donee_node->donor_info->task; } TRACE_CUR("%s+-> %s/%d (d: %s/%d) (fq: %d)\n", padding, donee_node->task->comm, donee_node->task->pid, (donor) ? donor->comm : "nil", (donor) ? donor->pid : -1, ikglp_get_idx(sem, donee_node->fq)); if(n->left) print_donees(sem, n->left, depth+1); if(n->right) print_donees(sem, n->right, depth+1); } static void print_donors(struct binheap_node *n, int depth) { ikglp_wait_state_t *donor_node; char padding[81] = " "; if(n == NULL) { TRACE_CUR("+-> %p\n", NULL); return; } donor_node = binheap_entry(n, ikglp_wait_state_t, node); if(depth*2 <= 80) padding[depth*2] = '\0'; TRACE_CUR("%s+-> %s/%d (donee: %s/%d)\n", padding, donor_node->task->comm, donor_node->task->pid, donor_node->donee_info->task->comm, donor_node->donee_info->task->pid); if(n->left) print_donors(n->left, depth+1); if(n->right) print_donors(n->right, depth+1); } #endif static void ikglp_add_global_list(struct ikglp_semaphore *sem, struct task_struct *t, ikglp_heap_node_t *node) { node->task = t; INIT_BINHEAP_NODE(&node->node); if(sem->top_m_size < sem->m) { TRACE_CUR("Trivially adding %s/%d to top-m global list.\n", t->comm, t->pid); // TRACE_CUR("Top-M Before (size = %d):\n", sem->top_m_size); // print_global_list(sem->top_m.root, 1); binheap_add(&node->node, &sem->top_m, ikglp_heap_node_t, node); ++(sem->top_m_size); // TRACE_CUR("Top-M After (size = %d):\n", sem->top_m_size); // print_global_list(sem->top_m.root, 1); } else if(litmus->__compare(t, BASE, ikglp_mth_highest(sem), BASE)) { ikglp_heap_node_t *evicted = binheap_top_entry(&sem->top_m, ikglp_heap_node_t, node); TRACE_CUR("Adding %s/%d to top-m and evicting %s/%d.\n", t->comm, t->pid, evicted->task->comm, evicted->task->pid); // TRACE_CUR("Not-Top-M Before:\n"); // print_global_list(sem->not_top_m.root, 1); // TRACE_CUR("Top-M Before 
(size = %d):\n", sem->top_m_size); // print_global_list(sem->top_m.root, 1); binheap_delete_root(&sem->top_m, ikglp_heap_node_t, node); INIT_BINHEAP_NODE(&evicted->node); binheap_add(&evicted->node, &sem->not_top_m, ikglp_heap_node_t, node); binheap_add(&node->node, &sem->top_m, ikglp_heap_node_t, node); // TRACE_CUR("Top-M After (size = %d):\n", sem->top_m_size); // print_global_list(sem->top_m.root, 1); // TRACE_CUR("Not-Top-M After:\n"); // print_global_list(sem->not_top_m.root, 1); } else { TRACE_CUR("Trivially adding %s/%d to not-top-m global list.\n", t->comm, t->pid); // TRACE_CUR("Not-Top-M Before:\n"); // print_global_list(sem->not_top_m.root, 1); binheap_add(&node->node, &sem->not_top_m, ikglp_heap_node_t, node); // TRACE_CUR("Not-Top-M After:\n"); // print_global_list(sem->not_top_m.root, 1); } } static void ikglp_del_global_list(struct ikglp_semaphore *sem, struct task_struct *t, ikglp_heap_node_t *node) { BUG_ON(!binheap_is_in_heap(&node->node)); TRACE_CUR("Removing %s/%d from global list.\n", t->comm, t->pid); if(binheap_is_in_this_heap(&node->node, &sem->top_m)) { TRACE_CUR("%s/%d is in top-m\n", t->comm, t->pid); // TRACE_CUR("Not-Top-M Before:\n"); // print_global_list(sem->not_top_m.root, 1); // TRACE_CUR("Top-M Before (size = %d):\n", sem->top_m_size); // print_global_list(sem->top_m.root, 1); binheap_delete(&node->node, &sem->top_m); if(!binheap_empty(&sem->not_top_m)) { ikglp_heap_node_t *promoted = binheap_top_entry(&sem->not_top_m, ikglp_heap_node_t, node); TRACE_CUR("Promoting %s/%d to top-m\n", promoted->task->comm, promoted->task->pid); binheap_delete_root(&sem->not_top_m, ikglp_heap_node_t, node); INIT_BINHEAP_NODE(&promoted->node); binheap_add(&promoted->node, &sem->top_m, ikglp_heap_node_t, node); } else { TRACE_CUR("No one to promote to top-m.\n"); --(sem->top_m_size); } // TRACE_CUR("Top-M After (size = %d):\n", sem->top_m_size); // print_global_list(sem->top_m.root, 1); // TRACE_CUR("Not-Top-M After:\n"); // 
print_global_list(sem->not_top_m.root, 1); } else { TRACE_CUR("%s/%d is in not-top-m\n", t->comm, t->pid); // TRACE_CUR("Not-Top-M Before:\n"); // print_global_list(sem->not_top_m.root, 1); binheap_delete(&node->node, &sem->not_top_m); // TRACE_CUR("Not-Top-M After:\n"); // print_global_list(sem->not_top_m.root, 1); } } static void ikglp_add_donees(struct ikglp_semaphore *sem, struct fifo_queue *fq, struct task_struct *t, ikglp_donee_heap_node_t* node) { // TRACE_CUR("Adding %s/%d to donee list.\n", t->comm, t->pid); // TRACE_CUR("donees Before:\n"); // print_donees(sem, sem->donees.root, 1); node->task = t; node->donor_info = NULL; node->fq = fq; INIT_BINHEAP_NODE(&node->node); binheap_add(&node->node, &sem->donees, ikglp_donee_heap_node_t, node); // TRACE_CUR("donees After:\n"); // print_donees(sem, sem->donees.root, 1); } static void ikglp_refresh_owners_prio_increase(struct task_struct *t, struct fifo_queue *fq, struct ikglp_semaphore *sem, unsigned long flags) { // priority of 't' has increased (note: 't' might already be hp_waiter). 
if ((t == fq->hp_waiter) || litmus->compare(t, fq->hp_waiter)) { struct task_struct *old_max_eff_prio; struct task_struct *new_max_eff_prio; struct task_struct *new_prio = NULL; struct task_struct *owner = fq->owner; if(fq->hp_waiter) TRACE_TASK(t, "has higher prio than hp_waiter (%s/%d).\n", fq->hp_waiter->comm, fq->hp_waiter->pid); else TRACE_TASK(t, "has higher prio than hp_waiter (NIL).\n"); if(owner) { raw_spin_lock(&tsk_rt(owner)->hp_blocked_tasks_lock); // TRACE_TASK(owner, "Heap Before:\n"); // print_hp_waiters(tsk_rt(owner)->hp_blocked_tasks.root, 0); old_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks); fq->hp_waiter = t; fq->nest.hp_waiter_eff_prio = effective_priority(fq->hp_waiter); binheap_decrease(&fq->nest.hp_binheap_node, &tsk_rt(owner)->hp_blocked_tasks); // TRACE_TASK(owner, "Heap After:\n"); // print_hp_waiters(tsk_rt(owner)->hp_blocked_tasks.root, 0); new_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks); if(new_max_eff_prio != old_max_eff_prio) { TRACE_TASK(t, "is new hp_waiter.\n"); if ((effective_priority(owner) == old_max_eff_prio) || (litmus->__compare(new_max_eff_prio, BASE, owner, EFFECTIVE))){ new_prio = new_max_eff_prio; } } else { TRACE_TASK(t, "no change in max_eff_prio of heap.\n"); } if(new_prio) { // set new inheritance and propagate TRACE_TASK(t, "Effective priority changed for owner %s/%d to %s/%d\n", owner->comm, owner->pid, new_prio->comm, new_prio->pid); litmus->nested_increase_prio(owner, new_prio, &sem->lock, flags); // unlocks lock. } else { TRACE_TASK(t, "No change in effective priority (is %s/%d). 
Propagation halted.\n", new_max_eff_prio->comm, new_max_eff_prio->pid); raw_spin_unlock(&tsk_rt(owner)->hp_blocked_tasks_lock); unlock_fine_irqrestore(&sem->lock, flags); } } else { fq->hp_waiter = t; fq->nest.hp_waiter_eff_prio = effective_priority(fq->hp_waiter); TRACE_TASK(t, "no owner.\n"); unlock_fine_irqrestore(&sem->lock, flags); } } else { TRACE_TASK(t, "hp_waiter is unaffected.\n"); unlock_fine_irqrestore(&sem->lock, flags); } } // hp_waiter has decreased static void ikglp_refresh_owners_prio_decrease(struct fifo_queue *fq, struct ikglp_semaphore *sem, unsigned long flags) { struct task_struct *owner = fq->owner; struct task_struct *old_max_eff_prio; struct task_struct *new_max_eff_prio; if(!owner) { TRACE_CUR("No owner. Returning.\n"); unlock_fine_irqrestore(&sem->lock, flags); return; } TRACE_CUR("ikglp_refresh_owners_prio_decrease\n"); raw_spin_lock(&tsk_rt(owner)->hp_blocked_tasks_lock); old_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks); binheap_delete(&fq->nest.hp_binheap_node, &tsk_rt(owner)->hp_blocked_tasks); fq->nest.hp_waiter_eff_prio = fq->hp_waiter; binheap_add(&fq->nest.hp_binheap_node, &tsk_rt(owner)->hp_blocked_tasks, struct nested_info, hp_binheap_node); new_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks); if((old_max_eff_prio != new_max_eff_prio) && (effective_priority(owner) == old_max_eff_prio)) { // Need to set new effective_priority for owner struct task_struct *decreased_prio; TRACE_CUR("Propagating decreased inheritance to holder of fq %d.\n", ikglp_get_idx(sem, fq)); if(litmus->__compare(new_max_eff_prio, BASE, owner, BASE)) { TRACE_CUR("%s/%d has greater base priority than base priority of owner (%s/%d) of fq %d.\n", (new_max_eff_prio) ? new_max_eff_prio->comm : "nil", (new_max_eff_prio) ? 
new_max_eff_prio->pid : -1, owner->comm, owner->pid, ikglp_get_idx(sem, fq)); decreased_prio = new_max_eff_prio; } else { TRACE_CUR("%s/%d has lesser base priority than base priority of owner (%s/%d) of fq %d.\n", (new_max_eff_prio) ? new_max_eff_prio->comm : "nil", (new_max_eff_prio) ? new_max_eff_prio->pid : -1, owner->comm, owner->pid, ikglp_get_idx(sem, fq)); decreased_prio = NULL; } // beware: recursion litmus->nested_decrease_prio(owner, decreased_prio, &sem->lock, flags); // will unlock mutex->lock } else { TRACE_TASK(owner, "No need to propagate priority decrease forward.\n"); raw_spin_unlock(&tsk_rt(owner)->hp_blocked_tasks_lock); unlock_fine_irqrestore(&sem->lock, flags); } } static void ikglp_remove_donation_from_owner(struct binheap_node *n, struct fifo_queue *fq, struct ikglp_semaphore *sem, unsigned long flags) { struct task_struct *owner = fq->owner; struct task_struct *old_max_eff_prio; struct task_struct *new_max_eff_prio; BUG_ON(!owner); raw_spin_lock(&tsk_rt(owner)->hp_blocked_tasks_lock); old_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks); binheap_delete(n, &tsk_rt(owner)->hp_blocked_tasks); new_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks); if((old_max_eff_prio != new_max_eff_prio) && (effective_priority(owner) == old_max_eff_prio)) { // Need to set new effective_priority for owner struct task_struct *decreased_prio; TRACE_CUR("Propagating decreased inheritance to holder of fq %d.\n", ikglp_get_idx(sem, fq)); if(litmus->__compare(new_max_eff_prio, BASE, owner, BASE)) { TRACE_CUR("has greater base priority than base priority of owner of fq %d.\n", ikglp_get_idx(sem, fq)); decreased_prio = new_max_eff_prio; } else { TRACE_CUR("has lesser base priority than base priority of owner of fq %d.\n", ikglp_get_idx(sem, fq)); decreased_prio = NULL; } // beware: recursion litmus->nested_decrease_prio(owner, decreased_prio, &sem->lock, flags); // will unlock mutex->lock } else { TRACE_TASK(owner, "No need to propagate priority 
decrease forward.\n"); raw_spin_unlock(&tsk_rt(owner)->hp_blocked_tasks_lock); unlock_fine_irqrestore(&sem->lock, flags); } } static void ikglp_remove_donation_from_fq_waiter(struct task_struct *t, struct binheap_node *n) { struct task_struct *old_max_eff_prio; struct task_struct *new_max_eff_prio; raw_spin_lock(&tsk_rt(t)->hp_blocked_tasks_lock); old_max_eff_prio = top_priority(&tsk_rt(t)->hp_blocked_tasks); binheap_delete(n, &tsk_rt(t)->hp_blocked_tasks); new_max_eff_prio = top_priority(&tsk_rt(t)->hp_blocked_tasks); if((old_max_eff_prio != new_max_eff_prio) && (effective_priority(t) == old_max_eff_prio)) { // Need to set new effective_priority for owner struct task_struct *decreased_prio; if(litmus->__compare(new_max_eff_prio, BASE, t, BASE)) { decreased_prio = new_max_eff_prio; } else { decreased_prio = NULL; } tsk_rt(t)->inh_task = decreased_prio; } raw_spin_unlock(&tsk_rt(t)->hp_blocked_tasks_lock); } static void ikglp_get_immediate(struct task_struct* t, struct fifo_queue *fq, struct ikglp_semaphore *sem, unsigned long flags) { // resource available now TRACE_CUR("queue %d: acquired immediately\n", ikglp_get_idx(sem, fq)); fq->owner = t; raw_spin_lock(&tsk_rt(t)->hp_blocked_tasks_lock); binheap_add(&fq->nest.hp_binheap_node, &tsk_rt(t)->hp_blocked_tasks, struct nested_info, hp_binheap_node); raw_spin_unlock(&tsk_rt(t)->hp_blocked_tasks_lock); ++(fq->count); ikglp_add_global_list(sem, t, &fq->global_heap_node); ikglp_add_donees(sem, fq, t, &fq->donee_heap_node); sem->shortest_fifo_queue = ikglp_find_shortest(sem, sem->shortest_fifo_queue); #ifdef CONFIG_LITMUS_AFFINITY_LOCKING if(sem->aff_obs) { sem->aff_obs->ops->notify_enqueue(sem->aff_obs, fq, t); sem->aff_obs->ops->notify_acquired(sem->aff_obs, fq, t); } #endif unlock_fine_irqrestore(&sem->lock, flags); } static void __ikglp_enqueue_on_fq(struct ikglp_semaphore *sem, struct fifo_queue* fq, struct task_struct* t, wait_queue_t *wait, ikglp_heap_node_t *global_heap_node, ikglp_donee_heap_node_t 
*donee_heap_node) { /* resource is not free => must suspend and wait */ TRACE_TASK(t, "Enqueuing on fq %d.\n", ikglp_get_idx(sem, fq)); init_waitqueue_entry(wait, t); __add_wait_queue_tail_exclusive(&fq->wait, wait); ++(fq->count); ++(sem->nr_in_fifos); // update global list. if(likely(global_heap_node)) { if(binheap_is_in_heap(&global_heap_node->node)) { WARN_ON(1); ikglp_del_global_list(sem, t, global_heap_node); } ikglp_add_global_list(sem, t, global_heap_node); } // update donor eligiblity list. if(likely(donee_heap_node)) { // if(binheap_is_in_heap(&donee_heap_node->node)) { // WARN_ON(1); // } ikglp_add_donees(sem, fq, t, donee_heap_node); } if(sem->shortest_fifo_queue == fq) { sem->shortest_fifo_queue = ikglp_find_shortest(sem, fq); } #ifdef CONFIG_LITMUS_AFFINITY_LOCKING if(sem->aff_obs) { sem->aff_obs->ops->notify_enqueue(sem->aff_obs, fq, t); } #endif TRACE_TASK(t, "shortest queue is now %d\n", ikglp_get_idx(sem, fq)); } static void ikglp_enqueue_on_fq( struct ikglp_semaphore *sem, struct fifo_queue *fq, ikglp_wait_state_t *wait, unsigned long flags) { /* resource is not free => must suspend and wait */ TRACE_TASK(wait->task, "queue %d: Resource is not free => must suspend and wait.\n", ikglp_get_idx(sem, fq)); INIT_BINHEAP_NODE(&wait->global_heap_node.node); INIT_BINHEAP_NODE(&wait->donee_heap_node.node); __ikglp_enqueue_on_fq(sem, fq, wait->task, &wait->fq_node, &wait->global_heap_node, &wait->donee_heap_node); ikglp_refresh_owners_prio_increase(wait->task, fq, sem, flags); // unlocks sem->lock } static void __ikglp_enqueue_on_pq(struct ikglp_semaphore *sem, ikglp_wait_state_t *wait) { TRACE_TASK(wait->task, "goes to PQ.\n"); wait->pq_node.task = wait->task; // copy over task (little redundant...) 
binheap_add(&wait->pq_node.node, &sem->priority_queue, ikglp_heap_node_t, node); } static void ikglp_enqueue_on_pq(struct ikglp_semaphore *sem, ikglp_wait_state_t *wait) { INIT_BINHEAP_NODE(&wait->global_heap_node.node); INIT_BINHEAP_NODE(&wait->donee_heap_node.node); INIT_BINHEAP_NODE(&wait->pq_node.node); __ikglp_enqueue_on_pq(sem, wait); } static void ikglp_enqueue_on_donor(struct ikglp_semaphore *sem, ikglp_wait_state_t* wait, unsigned long flags) { struct task_struct *t = wait->task; ikglp_donee_heap_node_t *donee_node = NULL; struct task_struct *donee; struct task_struct *old_max_eff_prio; struct task_struct *new_max_eff_prio; struct task_struct *new_prio = NULL; INIT_BINHEAP_NODE(&wait->global_heap_node.node); INIT_BINHEAP_NODE(&wait->donee_heap_node.node); INIT_BINHEAP_NODE(&wait->pq_node.node); INIT_BINHEAP_NODE(&wait->node); // TRACE_CUR("Adding %s/%d as donor.\n", t->comm, t->pid); // TRACE_CUR("donors Before:\n"); // print_donors(sem->donors.root, 1); // Add donor to the global list. ikglp_add_global_list(sem, t, &wait->global_heap_node); // Select a donee #ifdef CONFIG_LITMUS_AFFINITY_LOCKING donee_node = (sem->aff_obs) ? sem->aff_obs->ops->advise_donee_selection(sem->aff_obs, t) : binheap_top_entry(&sem->donees, ikglp_donee_heap_node_t, node); #else donee_node = binheap_top_entry(&sem->donees, ikglp_donee_heap_node_t, node); #endif donee = donee_node->task; TRACE_TASK(t, "Donee selected: %s/%d\n", donee->comm, donee->pid); TRACE_CUR("Temporarily removing %s/%d to donee list.\n", donee->comm, donee->pid); // TRACE_CUR("donees Before:\n"); // print_donees(sem, sem->donees.root, 1); //binheap_delete_root(&sem->donees, ikglp_donee_heap_node_t, node); // will re-add it shortly binheap_delete(&donee_node->node, &sem->donees); // TRACE_CUR("donees After:\n"); // print_donees(sem, sem->donees.root, 1); wait->donee_info = donee_node; // Add t to donor heap. binheap_add(&wait->node, &sem->donors, ikglp_wait_state_t, node); // Now adjust the donee's priority. 
// Lock the donee's inheritance heap. raw_spin_lock(&tsk_rt(donee)->hp_blocked_tasks_lock); old_max_eff_prio = top_priority(&tsk_rt(donee)->hp_blocked_tasks); if(donee_node->donor_info) { // Steal donation relation. Evict old donor to PQ. // Remove old donor from donor heap ikglp_wait_state_t *old_wait = donee_node->donor_info; struct task_struct *old_donor = old_wait->task; TRACE_TASK(t, "Donee (%s/%d) had donor %s/%d. Moving old donor to PQ.\n", donee->comm, donee->pid, old_donor->comm, old_donor->pid); binheap_delete(&old_wait->node, &sem->donors); // Remove donation from donee's inheritance heap. binheap_delete(&old_wait->prio_donation.hp_binheap_node, &tsk_rt(donee)->hp_blocked_tasks); // WARNING: have not updated inh_prio! // Add old donor to PQ. __ikglp_enqueue_on_pq(sem, old_wait); // Remove old donor from the global heap. ikglp_del_global_list(sem, old_donor, &old_wait->global_heap_node); } // Add back donee's node to the donees heap with increased prio donee_node->donor_info = wait; INIT_BINHEAP_NODE(&donee_node->node); TRACE_CUR("Adding %s/%d back to donee list.\n", donee->comm, donee->pid); // TRACE_CUR("donees Before:\n"); // print_donees(sem, sem->donees.root, 1); binheap_add(&donee_node->node, &sem->donees, ikglp_donee_heap_node_t, node); // TRACE_CUR("donees After:\n"); // print_donees(sem, sem->donees.root, 1); // Add an inheritance/donation to the donee's inheritance heap. 
wait->prio_donation.lock = (struct litmus_lock*)sem; wait->prio_donation.hp_waiter_eff_prio = t; wait->prio_donation.hp_waiter_ptr = NULL; INIT_BINHEAP_NODE(&wait->prio_donation.hp_binheap_node); binheap_add(&wait->prio_donation.hp_binheap_node, &tsk_rt(donee)->hp_blocked_tasks, struct nested_info, hp_binheap_node); new_max_eff_prio = top_priority(&tsk_rt(donee)->hp_blocked_tasks); if(new_max_eff_prio != old_max_eff_prio) { if ((effective_priority(donee) == old_max_eff_prio) || (litmus->__compare(new_max_eff_prio, BASE, donee, EFFECTIVE))){ TRACE_TASK(t, "Donation increases %s/%d's effective priority\n", donee->comm, donee->pid); new_prio = new_max_eff_prio; } // else { // // should be bug. donor would not be in top-m. // TRACE_TASK(t, "Donation is not greater than base prio of %s/%d?\n", donee->comm, donee->pid); // WARN_ON(1); // } // } // else { // // should be bug. donor would not be in top-m. // TRACE_TASK(t, "No change in %s/%d's inheritance heap?\n", donee->comm, donee->pid); // WARN_ON(1); } if(new_prio) { struct fifo_queue *donee_fq = donee_node->fq; if(donee != donee_fq->owner) { TRACE_TASK(t, "%s/%d is not the owner. Propagating priority to owner %s/%d.\n", donee->comm, donee->pid, donee_fq->owner->comm, donee_fq->owner->pid); raw_spin_unlock(&tsk_rt(donee)->hp_blocked_tasks_lock); ikglp_refresh_owners_prio_increase(donee, donee_fq, sem, flags); // unlocks sem->lock } else { TRACE_TASK(t, "%s/%d is the owner. Progatating priority immediatly.\n", donee->comm, donee->pid); litmus->nested_increase_prio(donee, new_prio, &sem->lock, flags); // unlocks sem->lock and donee's heap lock } } else { TRACE_TASK(t, "No change in effective priority (it is %d/%s). 
BUG?\n", new_max_eff_prio->comm, new_max_eff_prio->pid); raw_spin_unlock(&tsk_rt(donee)->hp_blocked_tasks_lock); unlock_fine_irqrestore(&sem->lock, flags); } // TRACE_CUR("donors After:\n"); // print_donors(sem->donors.root, 1); } int ikglp_lock(struct litmus_lock* l) { struct task_struct* t = current; struct ikglp_semaphore *sem = ikglp_from_lock(l); unsigned long flags = 0, real_flags; struct fifo_queue *fq = NULL; int replica = -EINVAL; #ifdef CONFIG_LITMUS_DGL_SUPPORT raw_spinlock_t *dgl_lock; #endif ikglp_wait_state_t wait; if (!is_realtime(t)) return -EPERM; #ifdef CONFIG_LITMUS_DGL_SUPPORT dgl_lock = litmus->get_dgl_spinlock(t); #endif raw_spin_lock_irqsave(&sem->real_lock, real_flags); lock_global_irqsave(dgl_lock, flags); lock_fine_irqsave(&sem->lock, flags); if(sem->nr_in_fifos < sem->m) { // enqueue somwhere #ifdef CONFIG_LITMUS_AFFINITY_LOCKING fq = (sem->aff_obs) ? sem->aff_obs->ops->advise_enqueue(sem->aff_obs, t) : sem->shortest_fifo_queue; #else fq = sem->shortest_fifo_queue; #endif if(fq->count == 0) { // take available resource replica = ikglp_get_idx(sem, fq); ikglp_get_immediate(t, fq, sem, flags); // unlocks sem->lock unlock_global_irqrestore(dgl_lock, flags); raw_spin_unlock_irqrestore(&sem->real_lock, real_flags); goto acquired; } else { wait.task = t; // THIS IS CRITICALLY IMPORTANT!!! tsk_rt(t)->blocked_lock = (struct litmus_lock*)sem; // record where we are blocked mb(); /* FIXME: interruptible would be nice some day */ set_task_state(t, TASK_UNINTERRUPTIBLE); ikglp_enqueue_on_fq(sem, fq, &wait, flags); // unlocks sem->lock } } else { // donor! wait.task = t; // THIS IS CRITICALLY IMPORTANT!!! 
tsk_rt(t)->blocked_lock = (struct litmus_lock*)sem; // record where we are blocked mb(); /* FIXME: interruptible would be nice some day */ set_task_state(t, TASK_UNINTERRUPTIBLE); if(litmus->__compare(ikglp_mth_highest(sem), BASE, t, BASE)) { // enqueue on PQ ikglp_enqueue_on_pq(sem, &wait); unlock_fine_irqrestore(&sem->lock, flags); } else { // enqueue as donor ikglp_enqueue_on_donor(sem, &wait, flags); // unlocks sem->lock } } unlock_global_irqrestore(dgl_lock, flags); raw_spin_unlock_irqrestore(&sem->real_lock, real_flags); TS_LOCK_SUSPEND; suspend_for_lock(); TS_LOCK_RESUME; fq = ikglp_get_queue(sem, t); BUG_ON(!fq); replica = ikglp_get_idx(sem, fq); acquired: TRACE_CUR("Acquired lock %d, queue %d\n", l->ident, replica); #ifdef CONFIG_LITMUS_AFFINITY_LOCKING if(sem->aff_obs) { return sem->aff_obs->ops->replica_to_resource(sem->aff_obs, fq); } #endif return replica; } //int ikglp_lock(struct litmus_lock* l) //{ // struct task_struct* t = current; // struct ikglp_semaphore *sem = ikglp_from_lock(l); // unsigned long flags = 0, real_flags; // struct fifo_queue *fq = NULL; // int replica = -EINVAL; // //#ifdef CONFIG_LITMUS_DGL_SUPPORT // raw_spinlock_t *dgl_lock; //#endif // // ikglp_wait_state_t wait; // // if (!is_realtime(t)) // return -EPERM; // //#ifdef CONFIG_LITMUS_DGL_SUPPORT // dgl_lock = litmus->get_dgl_spinlock(t); //#endif // // raw_spin_lock_irqsave(&sem->real_lock, real_flags); // // lock_global_irqsave(dgl_lock, flags); // lock_fine_irqsave(&sem->lock, flags); // // //#ifdef CONFIG_LITMUS_AFFINITY_LOCKING // fq = (sem->aff_obs) ? 
// sem->aff_obs->ops->advise_enqueue(sem->aff_obs, t) : // sem->shortest_fifo_queue; //#else // fq = sem->shortest_fifo_queue; //#endif // // if(fq->count == 0) { // // take available resource // replica = ikglp_get_idx(sem, fq); // // ikglp_get_immediate(t, fq, sem, flags); // unlocks sem->lock // // unlock_global_irqrestore(dgl_lock, flags); // raw_spin_unlock_irqrestore(&sem->real_lock, real_flags); // } // else // { // // we have to suspend. // // wait.task = t; // THIS IS CRITICALLY IMPORTANT!!! // // tsk_rt(t)->blocked_lock = (struct litmus_lock*)sem; // record where we are blocked // mb(); // // /* FIXME: interruptible would be nice some day */ // set_task_state(t, TASK_UNINTERRUPTIBLE); // // if(fq->count < sem->max_fifo_len) { // // enqueue on fq // ikglp_enqueue_on_fq(sem, fq, &wait, flags); // unlocks sem->lock // } // else { // // TRACE_CUR("IKGLP fifo queues are full (at least they better be).\n"); // // // no room in fifos. Go to PQ or donors. // // if(litmus->__compare(ikglp_mth_highest(sem), BASE, t, BASE)) { // // enqueue on PQ // ikglp_enqueue_on_pq(sem, &wait); // unlock_fine_irqrestore(&sem->lock, flags); // } // else { // // enqueue as donor // ikglp_enqueue_on_donor(sem, &wait, flags); // unlocks sem->lock // } // } // // unlock_global_irqrestore(dgl_lock, flags); // raw_spin_unlock_irqrestore(&sem->real_lock, real_flags); // // TS_LOCK_SUSPEND; // // schedule(); // // TS_LOCK_RESUME; // // fq = ikglp_get_queue(sem, t); // BUG_ON(!fq); // // replica = ikglp_get_idx(sem, fq); // } // // TRACE_CUR("Acquired lock %d, queue %d\n", // l->ident, replica); // //#ifdef CONFIG_LITMUS_AFFINITY_LOCKING // if(sem->aff_obs) { // return sem->aff_obs->ops->replica_to_resource(sem->aff_obs, fq); // } //#endif // // return replica; //} static void ikglp_move_donor_to_fq(struct ikglp_semaphore *sem, struct fifo_queue *fq, ikglp_wait_state_t *donor_info) { struct task_struct *t = donor_info->task; TRACE_CUR("Donor %s/%d being moved to fq %d\n", t->comm, t->pid, 
ikglp_get_idx(sem, fq)); binheap_delete(&donor_info->node, &sem->donors); __ikglp_enqueue_on_fq(sem, fq, t, &donor_info->fq_node, NULL, // already in global_list, so pass null to prevent adding 2nd time. &donor_info->donee_heap_node); // warning: // ikglp_update_owners_prio(t, fq, sem, flags) has not been called. } static void ikglp_move_pq_to_fq(struct ikglp_semaphore *sem, struct fifo_queue *fq, ikglp_wait_state_t *wait) { struct task_struct *t = wait->task; TRACE_CUR("PQ request %s/%d being moved to fq %d\n", t->comm, t->pid, ikglp_get_idx(sem, fq)); binheap_delete(&wait->pq_node.node, &sem->priority_queue); __ikglp_enqueue_on_fq(sem, fq, t, &wait->fq_node, &wait->global_heap_node, &wait->donee_heap_node); // warning: // ikglp_update_owners_prio(t, fq, sem, flags) has not been called. } static ikglp_wait_state_t* ikglp_find_hp_waiter_to_steal( struct ikglp_semaphore* sem) { /* must hold sem->lock */ struct fifo_queue *fq = NULL; struct list_head *pos; struct task_struct *queued; int i; for(i = 0; i < sem->nr_replicas; ++i) { if( (sem->fifo_queues[i].count > 1) && (!fq || litmus->compare(sem->fifo_queues[i].hp_waiter, fq->hp_waiter)) ) { TRACE_CUR("hp_waiter on fq %d (%s/%d) has higher prio than hp_waiter on fq %d (%s/%d)\n", ikglp_get_idx(sem, &sem->fifo_queues[i]), sem->fifo_queues[i].hp_waiter->comm, sem->fifo_queues[i].hp_waiter->pid, (fq) ? ikglp_get_idx(sem, fq) : -1, (fq) ? ((fq->hp_waiter) ? fq->hp_waiter->comm : "nil") : "nilXX", (fq) ? ((fq->hp_waiter) ? 
fq->hp_waiter->pid : -1) : -2); fq = &sem->fifo_queues[i]; WARN_ON(!(fq->hp_waiter)); } } if(fq) { struct task_struct *max_hp = fq->hp_waiter; ikglp_wait_state_t* ret = NULL; TRACE_CUR("Searching for %s/%d on fq %d\n", max_hp->comm, max_hp->pid, ikglp_get_idx(sem, fq)); BUG_ON(!max_hp); list_for_each(pos, &fq->wait.task_list) { wait_queue_t *wait = list_entry(pos, wait_queue_t, task_list); queued = (struct task_struct*) wait->private; TRACE_CUR("fq %d entry: %s/%d\n", ikglp_get_idx(sem, fq), queued->comm, queued->pid); /* Compare task prios, find high prio task. */ if (queued == max_hp) { TRACE_CUR("Found it!\n"); ret = container_of(wait, ikglp_wait_state_t, fq_node); } } WARN_ON(!ret); return ret; } return(NULL); } static void ikglp_steal_to_fq(struct ikglp_semaphore *sem, struct fifo_queue *fq, ikglp_wait_state_t *fq_wait) { struct task_struct *t = fq_wait->task; struct fifo_queue *fq_steal = fq_wait->donee_heap_node.fq; TRACE_CUR("FQ request %s/%d being moved to fq %d\n", t->comm, t->pid, ikglp_get_idx(sem, fq)); fq_wait->donee_heap_node.fq = fq; // just to be safe __remove_wait_queue(&fq_steal->wait, &fq_wait->fq_node); --(fq_steal->count); #ifdef CONFIG_LITMUS_AFFINITY_LOCKING if(sem->aff_obs) { sem->aff_obs->ops->notify_dequeue(sem->aff_obs, fq_steal, t); } #endif if(t == fq_steal->hp_waiter) { fq_steal->hp_waiter = ikglp_find_hp_waiter(fq_steal, NULL); TRACE_TASK(t, "New hp_waiter for fq %d is %s/%d!\n", ikglp_get_idx(sem, fq_steal), (fq_steal->hp_waiter) ? fq_steal->hp_waiter->comm : "nil", (fq_steal->hp_waiter) ? fq_steal->hp_waiter->pid : -1); } // Update shortest. if(fq_steal->count < sem->shortest_fifo_queue->count) { sem->shortest_fifo_queue = fq_steal; } __ikglp_enqueue_on_fq(sem, fq, t, &fq_wait->fq_node, NULL, NULL); // warning: We have not checked the priority inheritance of fq's owner yet. 
} static void ikglp_migrate_fq_to_owner_heap_nodes(struct ikglp_semaphore *sem, struct fifo_queue *fq, ikglp_wait_state_t *old_wait) { struct task_struct *t = old_wait->task; BUG_ON(old_wait->donee_heap_node.fq != fq); TRACE_TASK(t, "Migrating wait_state to memory of queue %d.\n", ikglp_get_idx(sem, fq)); // need to migrate global_heap_node and donee_heap_node off of the stack // to the nodes allocated for the owner of this fq. // TODO: Enhance binheap() to perform this operation in place. ikglp_del_global_list(sem, t, &old_wait->global_heap_node); // remove fq->global_heap_node = old_wait->global_heap_node; // copy ikglp_add_global_list(sem, t, &fq->global_heap_node); // re-add binheap_delete(&old_wait->donee_heap_node.node, &sem->donees); // remove fq->donee_heap_node = old_wait->donee_heap_node; // copy if(fq->donee_heap_node.donor_info) { // let donor know that our location has changed BUG_ON(fq->donee_heap_node.donor_info->donee_info->task != t); // validate cross-link fq->donee_heap_node.donor_info->donee_info = &fq->donee_heap_node; } INIT_BINHEAP_NODE(&fq->donee_heap_node.node); binheap_add(&fq->donee_heap_node.node, &sem->donees, ikglp_donee_heap_node_t, node); // re-add } int ikglp_unlock(struct litmus_lock* l) { struct ikglp_semaphore *sem = ikglp_from_lock(l); struct task_struct *t = current; struct task_struct *donee = NULL; struct task_struct *next = NULL; struct task_struct *new_on_fq = NULL; struct fifo_queue *fq_of_new_on_fq = NULL; ikglp_wait_state_t *other_donor_info = NULL; struct fifo_queue *to_steal = NULL; int need_steal_prio_reeval = 0; struct fifo_queue *fq; #ifdef CONFIG_LITMUS_DGL_SUPPORT raw_spinlock_t *dgl_lock; #endif unsigned long flags = 0, real_flags; int err = 0; fq = ikglp_get_queue(sem, t); // returns NULL if 't' is not owner. 
if (!fq) { err = -EINVAL; goto out; } #ifdef CONFIG_LITMUS_DGL_SUPPORT dgl_lock = litmus->get_dgl_spinlock(t); #endif raw_spin_lock_irqsave(&sem->real_lock, real_flags); lock_global_irqsave(dgl_lock, flags); // TODO: Push this deeper lock_fine_irqsave(&sem->lock, flags); TRACE_TASK(t, "Freeing replica %d.\n", ikglp_get_idx(sem, fq)); // Remove 't' from the heaps, but data in nodes will still be good. ikglp_del_global_list(sem, t, &fq->global_heap_node); binheap_delete(&fq->donee_heap_node.node, &sem->donees); fq->owner = NULL; // no longer owned!! --(fq->count); if(fq->count < sem->shortest_fifo_queue->count) { sem->shortest_fifo_queue = fq; } --(sem->nr_in_fifos); #ifdef CONFIG_LITMUS_AFFINITY_LOCKING if(sem->aff_obs) { sem->aff_obs->ops->notify_dequeue(sem->aff_obs, fq, t); sem->aff_obs->ops->notify_freed(sem->aff_obs, fq, t); } #endif // Move the next request into the FQ and update heaps as needed. // We defer re-evaluation of priorities to later in the function. if(fq->donee_heap_node.donor_info) { // move my donor to FQ ikglp_wait_state_t *donor_info = fq->donee_heap_node.donor_info; new_on_fq = donor_info->task; // donor moved to FQ donee = t; #ifdef CONFIG_LITMUS_AFFINITY_LOCKING if(sem->aff_obs && sem->aff_obs->relax_max_fifo_len) { fq_of_new_on_fq = sem->aff_obs->ops->advise_enqueue(sem->aff_obs, new_on_fq); if(fq_of_new_on_fq->count == 0) { // ignore it? // fq_of_new_on_fq = fq; } } else { fq_of_new_on_fq = fq; } #else fq_of_new_on_fq = fq; #endif TRACE_TASK(t, "Moving MY donor (%s/%d) to fq %d (non-aff wanted fq %d).\n", new_on_fq->comm, new_on_fq->pid, ikglp_get_idx(sem, fq_of_new_on_fq), ikglp_get_idx(sem, fq)); ikglp_move_donor_to_fq(sem, fq_of_new_on_fq, donor_info); } else if(!binheap_empty(&sem->donors)) { // No donor, so move any donor to FQ // move other donor to FQ // Select a donor #ifdef CONFIG_LITMUS_AFFINITY_LOCKING other_donor_info = (sem->aff_obs) ? 
sem->aff_obs->ops->advise_donor_to_fq(sem->aff_obs, fq) : binheap_top_entry(&sem->donors, ikglp_wait_state_t, node); #else other_donor_info = binheap_top_entry(&sem->donors, ikglp_wait_state_t, node); #endif new_on_fq = other_donor_info->task; donee = other_donor_info->donee_info->task; // update the donee's heap position. other_donor_info->donee_info->donor_info = NULL; // clear the cross-link binheap_decrease(&other_donor_info->donee_info->node, &sem->donees); #ifdef CONFIG_LITMUS_AFFINITY_LOCKING if(sem->aff_obs && sem->aff_obs->relax_max_fifo_len) { fq_of_new_on_fq = sem->aff_obs->ops->advise_enqueue(sem->aff_obs, new_on_fq); if(fq_of_new_on_fq->count == 0) { // ignore it? // fq_of_new_on_fq = fq; } } else { fq_of_new_on_fq = fq; } #else fq_of_new_on_fq = fq; #endif TRACE_TASK(t, "Moving a donor (%s/%d) to fq %d (non-aff wanted fq %d).\n", new_on_fq->comm, new_on_fq->pid, ikglp_get_idx(sem, fq_of_new_on_fq), ikglp_get_idx(sem, fq)); ikglp_move_donor_to_fq(sem, fq_of_new_on_fq, other_donor_info); } else if(!binheap_empty(&sem->priority_queue)) { // No donors, so move PQ ikglp_heap_node_t *pq_node = binheap_top_entry(&sem->priority_queue, ikglp_heap_node_t, node); ikglp_wait_state_t *pq_wait = container_of(pq_node, ikglp_wait_state_t, pq_node); new_on_fq = pq_wait->task; #ifdef CONFIG_LITMUS_AFFINITY_LOCKING if(sem->aff_obs && sem->aff_obs->relax_max_fifo_len) { fq_of_new_on_fq = sem->aff_obs->ops->advise_enqueue(sem->aff_obs, new_on_fq); if(fq_of_new_on_fq->count == 0) { // ignore it? // fq_of_new_on_fq = fq; } } else { fq_of_new_on_fq = fq; } #else fq_of_new_on_fq = fq; #endif TRACE_TASK(t, "Moving a pq waiter (%s/%d) to fq %d (non-aff wanted fq %d).\n", new_on_fq->comm, new_on_fq->pid, ikglp_get_idx(sem, fq_of_new_on_fq), ikglp_get_idx(sem, fq)); ikglp_move_pq_to_fq(sem, fq_of_new_on_fq, pq_wait); } else if(fq->count == 0) { // No PQ and this queue is empty, so steal. 
ikglp_wait_state_t *fq_wait; TRACE_TASK(t, "Looking to steal a request for fq %d...\n", ikglp_get_idx(sem, fq)); #ifdef CONFIG_LITMUS_AFFINITY_LOCKING fq_wait = (sem->aff_obs) ? sem->aff_obs->ops->advise_steal(sem->aff_obs, fq) : ikglp_find_hp_waiter_to_steal(sem); #else fq_wait = ikglp_find_hp_waiter_to_steal(sem); #endif if(fq_wait) { to_steal = fq_wait->donee_heap_node.fq; new_on_fq = fq_wait->task; fq_of_new_on_fq = fq; need_steal_prio_reeval = (new_on_fq == to_steal->hp_waiter); TRACE_TASK(t, "Found %s/%d of fq %d to steal for fq %d...\n", new_on_fq->comm, new_on_fq->pid, ikglp_get_idx(sem, to_steal), ikglp_get_idx(sem, fq)); ikglp_steal_to_fq(sem, fq, fq_wait); } else { TRACE_TASK(t, "Found nothing to steal for fq %d.\n", ikglp_get_idx(sem, fq)); } } else { // move no one } // 't' must drop all priority and clean up data structures before hand-off. // DROP ALL INHERITANCE. IKGLP MUST BE OUTER-MOST raw_spin_lock(&tsk_rt(t)->hp_blocked_tasks_lock); { int count = 0; while(!binheap_empty(&tsk_rt(t)->hp_blocked_tasks)) { binheap_delete_root(&tsk_rt(t)->hp_blocked_tasks, struct nested_info, hp_binheap_node); ++count; } litmus->decrease_prio(t, NULL); WARN_ON(count > 2); // should not be greater than 2. only local fq inh and donation can be possible. } raw_spin_unlock(&tsk_rt(t)->hp_blocked_tasks_lock); // Now patch up other priorities. // // At most one of the following: // if(donee && donee != t), decrease prio, propagate to owner, or onward // if(to_steal), update owner's prio (hp_waiter has already been set) // BUG_ON((other_donor_info != NULL) && (to_steal != NULL)); if(other_donor_info) { struct fifo_queue *other_fq = other_donor_info->donee_info->fq; BUG_ON(!donee); BUG_ON(donee == t); TRACE_TASK(t, "Terminating donation relation of donor %s/%d to donee %s/%d!\n", other_donor_info->task->comm, other_donor_info->task->pid, donee->comm, donee->pid); // need to terminate donation relation. 
if(donee == other_fq->owner) { TRACE_TASK(t, "Donee %s/%d is an owner of fq %d.\n", donee->comm, donee->pid, ikglp_get_idx(sem, other_fq)); ikglp_remove_donation_from_owner(&other_donor_info->prio_donation.hp_binheap_node, other_fq, sem, flags); lock_fine_irqsave(&sem->lock, flags); // there should be no contention!!!! } else { TRACE_TASK(t, "Donee %s/%d is an blocked in of fq %d.\n", donee->comm, donee->pid, ikglp_get_idx(sem, other_fq)); ikglp_remove_donation_from_fq_waiter(donee, &other_donor_info->prio_donation.hp_binheap_node); if(donee == other_fq->hp_waiter) { TRACE_TASK(t, "Donee %s/%d was an hp_waiter of fq %d. Rechecking hp_waiter.\n", donee->comm, donee->pid, ikglp_get_idx(sem, other_fq)); other_fq->hp_waiter = ikglp_find_hp_waiter(other_fq, NULL); TRACE_TASK(t, "New hp_waiter for fq %d is %s/%d!\n", ikglp_get_idx(sem, other_fq), (other_fq->hp_waiter) ? other_fq->hp_waiter->comm : "nil", (other_fq->hp_waiter) ? other_fq->hp_waiter->pid : -1); ikglp_refresh_owners_prio_decrease(other_fq, sem, flags); // unlocks sem->lock. reacquire it. lock_fine_irqsave(&sem->lock, flags); // there should be no contention!!!! } } } else if(to_steal) { TRACE_TASK(t, "Rechecking priority inheritance of fq %d, triggered by stealing.\n", ikglp_get_idx(sem, to_steal)); if(need_steal_prio_reeval) { ikglp_refresh_owners_prio_decrease(to_steal, sem, flags); // unlocks sem->lock. reacquire it. lock_fine_irqsave(&sem->lock, flags); // there should be no contention!!!! } } // check for new HP waiter. if(new_on_fq) { if(fq == fq_of_new_on_fq) { // fq->owner is null, so just update the hp_waiter without locking. if(new_on_fq == fq->hp_waiter) { TRACE_TASK(t, "new_on_fq is already hp_waiter.\n", fq->hp_waiter->comm, fq->hp_waiter->pid); fq->nest.hp_waiter_eff_prio = effective_priority(fq->hp_waiter); // set this just to be sure... 
} else if(litmus->compare(new_on_fq, fq->hp_waiter)) { if(fq->hp_waiter) TRACE_TASK(t, "has higher prio than hp_waiter (%s/%d).\n", fq->hp_waiter->comm, fq->hp_waiter->pid); else TRACE_TASK(t, "has higher prio than hp_waiter (NIL).\n"); fq->hp_waiter = new_on_fq; fq->nest.hp_waiter_eff_prio = effective_priority(fq->hp_waiter); TRACE_TASK(t, "New hp_waiter for fq %d is %s/%d!\n", ikglp_get_idx(sem, fq), (fq->hp_waiter) ? fq->hp_waiter->comm : "nil", (fq->hp_waiter) ? fq->hp_waiter->pid : -1); } } else { ikglp_refresh_owners_prio_increase(new_on_fq, fq_of_new_on_fq, sem, flags); // unlocks sem->lock. reacquire it. lock_fine_irqsave(&sem->lock, flags); // there should be no contention!!!! } } wake_kludge: if(waitqueue_active(&fq->wait)) { wait_queue_t *wait = list_entry(fq->wait.task_list.next, wait_queue_t, task_list); ikglp_wait_state_t *fq_wait = container_of(wait, ikglp_wait_state_t, fq_node); next = (struct task_struct*) wait->private; __remove_wait_queue(&fq->wait, wait); TRACE_CUR("queue %d: ASSIGNING %s/%d as owner - next\n", ikglp_get_idx(sem, fq), next->comm, next->pid); // migrate wait-state to fifo-memory. ikglp_migrate_fq_to_owner_heap_nodes(sem, fq, fq_wait); /* next becomes the resouce holder */ fq->owner = next; tsk_rt(next)->blocked_lock = NULL; #ifdef CONFIG_LITMUS_AFFINITY_LOCKING if(sem->aff_obs) { sem->aff_obs->ops->notify_acquired(sem->aff_obs, fq, next); } #endif /* determine new hp_waiter if necessary */ if (next == fq->hp_waiter) { TRACE_TASK(next, "was highest-prio waiter\n"); /* next has the highest priority --- it doesn't need to * inherit. However, we need to make sure that the * next-highest priority in the queue is reflected in * hp_waiter. */ fq->hp_waiter = ikglp_find_hp_waiter(fq, NULL); TRACE_TASK(next, "New hp_waiter for fq %d is %s/%d!\n", ikglp_get_idx(sem, fq), (fq->hp_waiter) ? fq->hp_waiter->comm : "nil", (fq->hp_waiter) ? fq->hp_waiter->pid : -1); fq->nest.hp_waiter_eff_prio = (fq->hp_waiter) ? 
effective_priority(fq->hp_waiter) : NULL; if (fq->hp_waiter) TRACE_TASK(fq->hp_waiter, "is new highest-prio waiter\n"); else TRACE("no further waiters\n"); raw_spin_lock(&tsk_rt(next)->hp_blocked_tasks_lock); // TRACE_TASK(next, "Heap Before:\n"); // print_hp_waiters(tsk_rt(next)->hp_blocked_tasks.root, 0); binheap_add(&fq->nest.hp_binheap_node, &tsk_rt(next)->hp_blocked_tasks, struct nested_info, hp_binheap_node); // TRACE_TASK(next, "Heap After:\n"); // print_hp_waiters(tsk_rt(next)->hp_blocked_tasks.root, 0); raw_spin_unlock(&tsk_rt(next)->hp_blocked_tasks_lock); } else { /* Well, if 'next' is not the highest-priority waiter, * then it (probably) ought to inherit the highest-priority * waiter's priority. */ TRACE_TASK(next, "is not hp_waiter of replica %d. hp_waiter is %s/%d\n", ikglp_get_idx(sem, fq), (fq->hp_waiter) ? fq->hp_waiter->comm : "nil", (fq->hp_waiter) ? fq->hp_waiter->pid : -1); raw_spin_lock(&tsk_rt(next)->hp_blocked_tasks_lock); binheap_add(&fq->nest.hp_binheap_node, &tsk_rt(next)->hp_blocked_tasks, struct nested_info, hp_binheap_node); /* It is possible that 'next' *should* be the hp_waiter, but isn't * because that update hasn't yet executed (update operation is * probably blocked on mutex->lock). So only inherit if the top of * 'next's top heap node is indeed the effective prio. of hp_waiter. * (We use fq->hp_waiter_eff_prio instead of effective_priority(hp_waiter) * since the effective priority of hp_waiter can change (and the * update has not made it to this lock).) */ if(likely(top_priority(&tsk_rt(next)->hp_blocked_tasks) == fq->nest.hp_waiter_eff_prio)) { if(fq->nest.hp_waiter_eff_prio) litmus->increase_prio(next, fq->nest.hp_waiter_eff_prio); else WARN_ON(1); } raw_spin_unlock(&tsk_rt(next)->hp_blocked_tasks_lock); } // wake up the new resource holder! wake_up_process(next); } if(fq_of_new_on_fq && fq_of_new_on_fq != fq && fq_of_new_on_fq->count == 1) { // The guy we promoted when to an empty FQ. (Why didn't stealing pick this up?) 
// Wake up the new guy too. BUG_ON(fq_of_new_on_fq->owner != NULL); fq = fq_of_new_on_fq; fq_of_new_on_fq = NULL; goto wake_kludge; } unlock_fine_irqrestore(&sem->lock, flags); unlock_global_irqrestore(dgl_lock, flags); raw_spin_unlock_irqrestore(&sem->real_lock, real_flags); out: return err; } int ikglp_close(struct litmus_lock* l) { struct task_struct *t = current; struct ikglp_semaphore *sem = ikglp_from_lock(l); unsigned long flags; int owner = 0; int i; raw_spin_lock_irqsave(&sem->real_lock, flags); for(i = 0; i < sem->nr_replicas; ++i) { if(sem->fifo_queues[i].owner == t) { owner = 1; break; } } raw_spin_unlock_irqrestore(&sem->real_lock, flags); if (owner) ikglp_unlock(l); return 0; } void ikglp_free(struct litmus_lock* l) { struct ikglp_semaphore *sem = ikglp_from_lock(l); kfree(sem->fifo_queues); kfree(sem); } struct litmus_lock* ikglp_new(int m, struct litmus_lock_ops* ops, void* __user arg) { struct ikglp_semaphore* sem; int nr_replicas = 0; int i; if(!access_ok(VERIFY_READ, arg, sizeof(nr_replicas))) { return(NULL); } if(__copy_from_user(&nr_replicas, arg, sizeof(nr_replicas))) { return(NULL); } if(nr_replicas < 1) { return(NULL); } sem = kmalloc(sizeof(*sem), GFP_KERNEL); if(!sem) { return NULL; } sem->fifo_queues = kmalloc(sizeof(struct fifo_queue)*nr_replicas, GFP_KERNEL); if(!sem->fifo_queues) { kfree(sem); return NULL; } sem->litmus_lock.ops = ops; #ifdef CONFIG_DEBUG_SPINLOCK { __raw_spin_lock_init(&sem->lock, ((struct litmus_lock*)sem)->cheat_lockdep, &((struct litmus_lock*)sem)->key); } #else raw_spin_lock_init(&sem->lock); #endif raw_spin_lock_init(&sem->real_lock); sem->nr_replicas = nr_replicas; sem->m = m; sem->max_fifo_len = (sem->m/nr_replicas) + ((sem->m%nr_replicas) != 0); sem->nr_in_fifos = 0; TRACE("New IKGLP Sem: m = %d, k = %d, max fifo_len = %d\n", sem->m, sem->nr_replicas, sem->max_fifo_len); for(i = 0; i < nr_replicas; ++i) { struct fifo_queue* q = &(sem->fifo_queues[i]); q->owner = NULL; q->hp_waiter = NULL; 
init_waitqueue_head(&q->wait); q->count = 0; q->global_heap_node.task = NULL; INIT_BINHEAP_NODE(&q->global_heap_node.node); q->donee_heap_node.task = NULL; q->donee_heap_node.donor_info = NULL; q->donee_heap_node.fq = NULL; INIT_BINHEAP_NODE(&q->donee_heap_node.node); q->nest.lock = (struct litmus_lock*)sem; q->nest.hp_waiter_eff_prio = NULL; q->nest.hp_waiter_ptr = &q->hp_waiter; INIT_BINHEAP_NODE(&q->nest.hp_binheap_node); } sem->shortest_fifo_queue = &sem->fifo_queues[0]; sem->top_m_size = 0; // init heaps INIT_BINHEAP_HANDLE(&sem->top_m, ikglp_min_heap_base_priority_order); INIT_BINHEAP_HANDLE(&sem->not_top_m, ikglp_max_heap_base_priority_order); INIT_BINHEAP_HANDLE(&sem->donees, ikglp_min_heap_donee_order); INIT_BINHEAP_HANDLE(&sem->priority_queue, ikglp_max_heap_base_priority_order); INIT_BINHEAP_HANDLE(&sem->donors, ikglp_donor_max_heap_base_priority_order); #ifdef CONFIG_LITMUS_AFFINITY_LOCKING sem->aff_obs = NULL; #endif return &sem->litmus_lock; } #if defined(CONFIG_LITMUS_AFFINITY_LOCKING) && defined(CONFIG_LITMUS_NVIDIA) static inline int __replica_to_gpu(struct ikglp_affinity* aff, int replica) { int gpu = replica % aff->nr_rsrc; return gpu; } static inline int replica_to_gpu(struct ikglp_affinity* aff, int replica) { int gpu = __replica_to_gpu(aff, replica) + aff->offset; return gpu; } static inline int gpu_to_base_replica(struct ikglp_affinity* aff, int gpu) { int replica = gpu - aff->offset; return replica; } int ikglp_aff_obs_close(struct affinity_observer* obs) { return 0; } void ikglp_aff_obs_free(struct affinity_observer* obs) { struct ikglp_affinity *ikglp_aff = ikglp_aff_obs_from_aff_obs(obs); // make sure the thread destroying this semaphore will not // call the exit callback on a destroyed lock. 
struct task_struct *t = current; if (is_realtime(t) && tsk_rt(t)->rsrc_exit_cb_args == ikglp_aff) { tsk_rt(t)->rsrc_exit_cb = NULL; tsk_rt(t)->rsrc_exit_cb_args = NULL; } kfree(ikglp_aff->nr_cur_users_on_rsrc); kfree(ikglp_aff->nr_aff_on_rsrc); kfree(ikglp_aff->q_info); kfree(ikglp_aff); } static struct affinity_observer* ikglp_aff_obs_new(struct affinity_observer_ops* ops, struct ikglp_affinity_ops* ikglp_ops, void* __user args) { struct ikglp_affinity* ikglp_aff; struct gpu_affinity_observer_args aff_args; struct ikglp_semaphore* sem; int i; unsigned long flags; if(!access_ok(VERIFY_READ, args, sizeof(aff_args))) { return(NULL); } if(__copy_from_user(&aff_args, args, sizeof(aff_args))) { return(NULL); } sem = (struct ikglp_semaphore*) get_lock_from_od(aff_args.obs.lock_od); if(sem->litmus_lock.type != IKGLP_SEM) { TRACE_CUR("Lock type not supported. Type = %d\n", sem->litmus_lock.type); return(NULL); } if((aff_args.nr_simult_users <= 0) || (sem->nr_replicas%aff_args.nr_simult_users != 0)) { TRACE_CUR("Lock %d does not support #replicas (%d) for #simult_users " "(%d) per replica. #replicas should be evenly divisible " "by #simult_users.\n", sem->litmus_lock.ident, sem->nr_replicas, aff_args.nr_simult_users); return(NULL); } if(aff_args.nr_simult_users > NV_MAX_SIMULT_USERS) { TRACE_CUR("System does not support #simult_users > %d. 
%d requested.\n", NV_MAX_SIMULT_USERS, aff_args.nr_simult_users); // return(NULL); } ikglp_aff = kmalloc(sizeof(*ikglp_aff), GFP_KERNEL); if(!ikglp_aff) { return(NULL); } ikglp_aff->q_info = kmalloc(sizeof(struct ikglp_queue_info)*sem->nr_replicas, GFP_KERNEL); if(!ikglp_aff->q_info) { kfree(ikglp_aff); return(NULL); } ikglp_aff->nr_cur_users_on_rsrc = kmalloc(sizeof(int)*(sem->nr_replicas / aff_args.nr_simult_users), GFP_KERNEL); if(!ikglp_aff->nr_cur_users_on_rsrc) { kfree(ikglp_aff->q_info); kfree(ikglp_aff); return(NULL); } ikglp_aff->nr_aff_on_rsrc = kmalloc(sizeof(int)*(sem->nr_replicas / aff_args.nr_simult_users), GFP_KERNEL); if(!ikglp_aff->nr_aff_on_rsrc) { kfree(ikglp_aff->nr_cur_users_on_rsrc); kfree(ikglp_aff->q_info); kfree(ikglp_aff); return(NULL); } affinity_observer_new(&ikglp_aff->obs, ops, &aff_args.obs); ikglp_aff->ops = ikglp_ops; ikglp_aff->offset = aff_args.replica_to_gpu_offset; ikglp_aff->nr_simult = aff_args.nr_simult_users; ikglp_aff->nr_rsrc = sem->nr_replicas / ikglp_aff->nr_simult; ikglp_aff->relax_max_fifo_len = (aff_args.relaxed_rules) ? 
1 : 0; TRACE_CUR("GPU affinity_observer: offset = %d, nr_simult = %d, " "nr_rsrc = %d, relaxed_fifo_len = %d\n", ikglp_aff->offset, ikglp_aff->nr_simult, ikglp_aff->nr_rsrc, ikglp_aff->relax_max_fifo_len); memset(ikglp_aff->nr_cur_users_on_rsrc, 0, sizeof(int)*(ikglp_aff->nr_rsrc)); memset(ikglp_aff->nr_aff_on_rsrc, 0, sizeof(int)*(ikglp_aff->nr_rsrc)); for(i = 0; i < sem->nr_replicas; ++i) { ikglp_aff->q_info[i].q = &sem->fifo_queues[i]; ikglp_aff->q_info[i].estimated_len = 0; // multiple q_info's will point to the same resource (aka GPU) if // aff_args.nr_simult_users > 1 ikglp_aff->q_info[i].nr_cur_users = &ikglp_aff->nr_cur_users_on_rsrc[__replica_to_gpu(ikglp_aff,i)]; ikglp_aff->q_info[i].nr_aff_users = &ikglp_aff->nr_aff_on_rsrc[__replica_to_gpu(ikglp_aff,i)]; } // attach observer to the lock raw_spin_lock_irqsave(&sem->real_lock, flags); sem->aff_obs = ikglp_aff; raw_spin_unlock_irqrestore(&sem->real_lock, flags); return &ikglp_aff->obs; } static int gpu_replica_to_resource(struct ikglp_affinity* aff, struct fifo_queue* fq) { struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock); return(replica_to_gpu(aff, ikglp_get_idx(sem, fq))); } // Smart IKGLP Affinity //static inline struct ikglp_queue_info* ikglp_aff_find_shortest(struct ikglp_affinity* aff) //{ // struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock); // struct ikglp_queue_info *shortest = &aff->q_info[0]; // int i; // // for(i = 1; i < sem->nr_replicas; ++i) { // if(aff->q_info[i].estimated_len < shortest->estimated_len) { // shortest = &aff->q_info[i]; // } // } // // return(shortest); //} struct fifo_queue* gpu_ikglp_advise_enqueue(struct ikglp_affinity* aff, struct task_struct* t) { // advise_enqueue must be smart as not not break IKGLP rules: // * No queue can be greater than ceil(m/k) in length. We may return // such a queue, but IKGLP will be smart enough as to send requests // to donors or PQ. 
// * Cannot let a queue idle if there exist waiting PQ/donors // -- needed to guarantee parallel progress of waiters. // // We may be able to relax some of these constraints, but this will have to // be carefully evaluated. // // Huristic strategy: Find the shortest queue that is not full. struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock); lt_t min_len; int min_nr_users, min_nr_aff_users; struct ikglp_queue_info *shortest; struct fifo_queue *to_enqueue; int i; int affinity_gpu; int max_fifo_len = (aff->relax_max_fifo_len) ? sem->m : sem->max_fifo_len; // if we have no affinity, find the GPU with the least number of users // with active affinity if(unlikely(tsk_rt(t)->last_gpu < 0)) { int temp_min = aff->nr_aff_on_rsrc[0]; affinity_gpu = aff->offset; for(i = 1; i < aff->nr_rsrc; ++i) { if(aff->nr_aff_on_rsrc[i] < temp_min) { affinity_gpu = aff->offset + i; } } TRACE_CUR("no affinity. defaulting to %d with %d aff users.\n", affinity_gpu, temp_min); } else { affinity_gpu = tsk_rt(t)->last_gpu; } // all things being equal, let's start with the queue with which we have // affinity. 
this helps us maintain affinity even when we don't have // an estiamte for local-affinity execution time (i.e., 2nd time on GPU) shortest = &aff->q_info[gpu_to_base_replica(aff, affinity_gpu)]; // if(shortest == aff->shortest_queue) { // TRACE_CUR("special case: have affinity with shortest queue\n"); // goto out; // } min_len = shortest->estimated_len + get_gpu_estimate(t, MIG_LOCAL); min_nr_users = *(shortest->nr_cur_users); min_nr_aff_users = *(shortest->nr_aff_users); TRACE_CUR("cs is %llu on queue %d (count = %d): est len = %llu\n", get_gpu_estimate(t, MIG_LOCAL), ikglp_get_idx(sem, shortest->q), shortest->q->count, min_len); for(i = 0; i < sem->nr_replicas; ++i) { if(&aff->q_info[i] != shortest) { if(aff->q_info[i].q->count < max_fifo_len) { lt_t est_len = aff->q_info[i].estimated_len + get_gpu_estimate(t, gpu_migration_distance(tsk_rt(t)->last_gpu, replica_to_gpu(aff, i))); // queue is smaller, or they're equal and the other has a smaller number // of total users. // // tie-break on the shortest number of simult users. this only kicks in // when there are more than 1 empty queues. // TODO: Make "est_len < min_len" a fuzzy function that allows // queues "close enough" in length to be considered equal. 
if((shortest->q->count >= max_fifo_len) || /* 'shortest' is full and i-th queue is not */ (est_len < min_len) || /* i-th queue has shortest length */ ((est_len == min_len) && /* equal lengths, but one has fewer over-all users */ ((*(aff->q_info[i].nr_aff_users) < min_nr_aff_users) || ((*(aff->q_info[i].nr_aff_users) == min_nr_aff_users) && (*(aff->q_info[i].nr_cur_users) < min_nr_users))))) { shortest = &aff->q_info[i]; min_len = est_len; min_nr_users = *(aff->q_info[i].nr_cur_users); min_nr_aff_users = *(aff->q_info[i].nr_aff_users); } TRACE_CUR("cs is %llu on queue %d (count = %d): est len = %llu\n", get_gpu_estimate(t, gpu_migration_distance(tsk_rt(t)->last_gpu, replica_to_gpu(aff, i))), ikglp_get_idx(sem, aff->q_info[i].q), aff->q_info[i].q->count, est_len); } else { TRACE_CUR("queue %d is too long. ineligible for enqueue.\n", ikglp_get_idx(sem, aff->q_info[i].q)); } } } if(shortest->q->count >= max_fifo_len) { TRACE_CUR("selected fq %d is too long, but returning it anyway.\n", ikglp_get_idx(sem, shortest->q)); } to_enqueue = shortest->q; TRACE_CUR("enqueue on fq %d (count = %d) (non-aff wanted fq %d)\n", ikglp_get_idx(sem, to_enqueue), to_enqueue->count, ikglp_get_idx(sem, sem->shortest_fifo_queue)); return to_enqueue; //return(sem->shortest_fifo_queue); } static ikglp_wait_state_t* pick_steal(struct ikglp_affinity* aff, int dest_gpu, struct fifo_queue* fq) { struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock); ikglp_wait_state_t *wait = NULL; int max_improvement = -(MIG_NONE+1); int replica = ikglp_get_idx(sem, fq); if(waitqueue_active(&fq->wait)) { int this_gpu = replica_to_gpu(aff, replica); struct list_head *pos; list_for_each(pos, &fq->wait.task_list) { wait_queue_t *fq_wait = list_entry(pos, wait_queue_t, task_list); ikglp_wait_state_t *tmp_wait = container_of(fq_wait, ikglp_wait_state_t, fq_node); int tmp_improvement = gpu_migration_distance(this_gpu, tsk_rt(tmp_wait->task)->last_gpu) - gpu_migration_distance(dest_gpu, 
tsk_rt(tmp_wait->task)->last_gpu);

			if(tmp_improvement > max_improvement) {
				wait = tmp_wait;
				max_improvement = tmp_improvement;

				// Stop early once the improvement reaches MIG_NONE-1.
				if(max_improvement >= (MIG_NONE-1)) {
					goto out;
				}
			}
		}

		// The wait queue was non-empty, so a candidate must have been chosen.
		BUG_ON(!wait);
	}
	else {
		TRACE_CUR("fq %d is empty!\n", replica);
	}

out:

	TRACE_CUR("Candidate victim from fq %d is %s/%d.  aff improvement = %d.\n",
			  replica,
			  (wait) ? wait->task->comm : "nil",
			  (wait) ? wait->task->pid  : -1,
			  max_improvement);

	return wait;
}

/*
 * Advise which waiting request should be stolen for the idle queue 'dst'.
 *
 * Every FIFO queue of the lock contributes its best candidate (pick_steal);
 * the candidate with the greatest affinity improvement wins. The scan stops
 * early once an improvement of at least MIG_NONE-1 is found.
 *
 * Returns NULL if no queue has a waiting request.
 */
ikglp_wait_state_t* gpu_ikglp_advise_steal(struct ikglp_affinity* aff,
										   struct fifo_queue* dst)
{
	// Heuristic strategy: Find task with greatest improvement in affinity.
	//
	struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
	ikglp_wait_state_t *to_steal_state = NULL;
//	ikglp_wait_state_t *default_to_steal_state = ikglp_find_hp_waiter_to_steal(sem);
	int max_improvement = -(MIG_NONE+1);
	int replica, i;
	int dest_gpu;

	replica = ikglp_get_idx(sem, dst);
	dest_gpu = replica_to_gpu(aff, replica);

	for(i = 0; i < sem->nr_replicas; ++i) {
		ikglp_wait_state_t *tmp_to_steal_state =
			pick_steal(aff, dest_gpu, &sem->fifo_queues[i]);

		if(tmp_to_steal_state) {
			// Same improvement metric as pick_steal(), recomputed for the
			// candidate it returned.
			int tmp_improvement =
				gpu_migration_distance(replica_to_gpu(aff, i), tsk_rt(tmp_to_steal_state->task)->last_gpu) -
				gpu_migration_distance(dest_gpu, tsk_rt(tmp_to_steal_state->task)->last_gpu);

			if(tmp_improvement > max_improvement) {
				to_steal_state = tmp_to_steal_state;
				max_improvement = tmp_improvement;

				if(max_improvement >= (MIG_NONE-1)) {
					goto out;
				}
			}
		}
	}

out:
	if(!to_steal_state) {
		TRACE_CUR("Could not find anyone to steal.\n");
	}
	else {
		TRACE_CUR("Selected victim %s/%d on fq %d (GPU %d) for fq %d (GPU %d): improvement = %d\n",
				  to_steal_state->task->comm, to_steal_state->task->pid,
				  ikglp_get_idx(sem, to_steal_state->donee_heap_node.fq),
				  replica_to_gpu(aff, ikglp_get_idx(sem, to_steal_state->donee_heap_node.fq)),
				  ikglp_get_idx(sem, dst),
				  dest_gpu,
				  max_improvement);

//		TRACE_CUR("Non-aff wanted to select victim %s/%d on fq %d (GPU %d) for fq %d (GPU %d): improvement = %d\n",
//				  default_to_steal_state->task->comm, default_to_steal_state->task->pid,
//				  ikglp_get_idx(sem, default_to_steal_state->donee_heap_node.fq),
//				  replica_to_gpu(aff, ikglp_get_idx(sem, default_to_steal_state->donee_heap_node.fq)),
//				  ikglp_get_idx(sem, dst),
//				  replica_to_gpu(aff, ikglp_get_idx(sem, dst)),
//
//				  gpu_migration_distance(
//					  replica_to_gpu(aff, ikglp_get_idx(sem, default_to_steal_state->donee_heap_node.fq)),
//					  tsk_rt(default_to_steal_state->task)->last_gpu) -
//				  gpu_migration_distance(dest_gpu, tsk_rt(default_to_steal_state->task)->last_gpu));
	}

	return(to_steal_state);
}

/* Non-zero if the request behind wait-queue entry 'fq_wait' already has a donor. */
static inline int has_donor(wait_queue_t* fq_wait)
{
	ikglp_wait_state_t *wait = container_of(fq_wait, ikglp_wait_state_t, fq_node);
	return(wait->donee_heap_node.donor_info != NULL);
}

/*
 * Find a donee candidate in FIFO queue 'fq'.
 *
 * A task qualifies if it has no donor yet, is not the task returned by
 * ikglp_mth_highest(), and litmus->__compare() ranks that task above it on
 * BASE priority. The owner is considered first, then the waiters from the
 * head of the queue onwards.
 *
 * @dist_from_head: out parameter. 0 for the owner, 1 for the first waiter
 *                  and so on; IKGLP_INVAL_DISTANCE when nobody qualifies.
 *
 * Returns the donee's heap node, or NULL if nobody qualifies.
 */
static ikglp_donee_heap_node_t* pick_donee(struct ikglp_affinity* aff,
										   struct fifo_queue* fq,
										   int* dist_from_head)
{
	struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
	struct task_struct *donee;
	ikglp_donee_heap_node_t *donee_node;
	struct task_struct *mth_highest = ikglp_mth_highest(sem);

//	lt_t now = litmus_clock();
//
//	TRACE_CUR("fq %d: mth_highest: %s/%d, deadline = %d: (donor) = ??? ",
//			  ikglp_get_idx(sem, fq),
//			  mth_highest->comm, mth_highest->pid,
//			  (int)get_deadline(mth_highest) - now);

	if(fq->owner &&
	   fq->donee_heap_node.donor_info == NULL &&
	   mth_highest != fq->owner &&
	   litmus->__compare(mth_highest, BASE, fq->owner, BASE)) {
		donee = fq->owner;
		donee_node = &(fq->donee_heap_node);
		*dist_from_head = 0;

		BUG_ON(donee != donee_node->task);

		TRACE_CUR("picked owner of fq %d as donee\n",
				  ikglp_get_idx(sem, fq));

		goto out;
	}
	else if(waitqueue_active(&fq->wait)) {
		struct list_head *pos;

//		TRACE_CUR("fq %d: owner: %s/%d, deadline = %d: (donor) = %s/%d "
//				  "(mth_highest != fq->owner) = %d "
//				  "(mth_highest > fq->owner) = %d\n",
//				  ikglp_get_idx(sem, fq),
//				  (fq->owner) ? fq->owner->comm : "nil",
//				  (fq->owner) ? fq->owner->pid : -1,
//				  (fq->owner) ? (int)get_deadline(fq->owner) - now : -999,
//				  (fq->donee_heap_node.donor_info) ? fq->donee_heap_node.donor_info->task->comm : "nil",
//				  (fq->donee_heap_node.donor_info) ? fq->donee_heap_node.donor_info->task->pid : -1,
//				  (mth_highest != fq->owner),
//				  (litmus->__compare(mth_highest, BASE, fq->owner, BASE)));

		*dist_from_head = 1;

		// iterating from the start of the queue is nice since this means
		// the donee will be closer to obtaining a resource.
		list_for_each(pos, &fq->wait.task_list) {
			wait_queue_t *fq_wait = list_entry(pos, wait_queue_t, task_list);
			ikglp_wait_state_t *wait = container_of(fq_wait, ikglp_wait_state_t, fq_node);

//			TRACE_CUR("fq %d: waiter %d: %s/%d, deadline = %d (donor) = %s/%d "
//					  "(mth_highest != wait->task) = %d "
//					  "(mth_highest > wait->task) = %d\n",
//					  ikglp_get_idx(sem, fq),
//					  dist_from_head,
//					  wait->task->comm, wait->task->pid,
//					  (int)get_deadline(wait->task) - now,
//					  (wait->donee_heap_node.donor_info) ? wait->donee_heap_node.donor_info->task->comm : "nil",
//					  (wait->donee_heap_node.donor_info) ? wait->donee_heap_node.donor_info->task->pid : -1,
//					  (mth_highest != wait->task),
//					  (litmus->__compare(mth_highest, BASE, wait->task, BASE)));

			if(!has_donor(fq_wait) &&
			   mth_highest != wait->task &&
			   litmus->__compare(mth_highest, BASE, wait->task, BASE)) {
				donee = (struct task_struct*) fq_wait->private;
				donee_node = &wait->donee_heap_node;

				BUG_ON(donee != donee_node->task);

				TRACE_CUR("picked waiter in fq %d as donee\n",
						  ikglp_get_idx(sem, fq));

				goto out;
			}
			++(*dist_from_head);
		}
	}

	// Nobody in this queue is eligible.
	donee = NULL;
	donee_node = NULL;
	//*dist_from_head = sem->max_fifo_len + 1;
	*dist_from_head = IKGLP_INVAL_DISTANCE;

	TRACE_CUR("Found no one to be donee in fq %d!\n", ikglp_get_idx(sem, fq));

out:

	TRACE_CUR("Candidate donee for fq %d is %s/%d (dist_from_head = %d)\n",
			  ikglp_get_idx(sem, fq),
			  (donee) ? (donee)->comm : "nil",
			  (donee) ? (donee)->pid  : -1,
			  *dist_from_head);

	return donee_node;
}

/*
 * Advise which donee the task 'donor' should donate its priority to.
 *
 * A donor without GPU affinity (last_gpu < 0) gets the top entry of the
 * donee heap. Otherwise the FIFO queues are searched GPU by GPU, starting at
 * the donor's last GPU, using pick_donee(); if that finds nobody, the top
 * entry of the donee heap is returned as a fallback.
 */
ikglp_donee_heap_node_t* gpu_ikglp_advise_donee_selection(
											struct ikglp_affinity* aff,
											struct task_struct* donor)
{
	// Heuristic strategy: Find the highest-priority donee that is waiting on
	// a queue closest to our affinity.  (1) The donee CANNOT already have a
	// donor (exception: donee is the lowest-prio task in the donee heap).
	// (2) Requests in 'top_m' heap are ineligible.
	//
	// Further strategy: amongst eligible donees waiting for the same GPU, pick
	// the one closest to the head of the FIFO queue (including owners).
	//
	struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
	ikglp_donee_heap_node_t *donee_node;
	gpu_migration_dist_t distance;
	int start, i, j;

	ikglp_donee_heap_node_t *default_donee;
	ikglp_wait_state_t *default_donee_donor_info;

	if(tsk_rt(donor)->last_gpu < 0) {
		// no affinity.  just return the min prio, like standard IKGLP
		// TODO: Find something closer to the head of the queue??
		donee_node = binheap_top_entry(&sem->donees,
									   ikglp_donee_heap_node_t,
									   node);
		goto out;
	}


	// Temporarily break any donation relation the default donee (the lowest
	// prio task in the FIFO queues) to make it eligible for selection below.
	//
	// NOTE: The original donor relation *must* be restored, even if we select
	// the default donee through affinity-aware selection, before returning
	// from this function so we don't screw up our heap ordering.
	// The standard IKGLP algorithm will steal the donor relationship if needed.
	default_donee = binheap_top_entry(&sem->donees, ikglp_donee_heap_node_t, node);
	default_donee_donor_info = default_donee->donor_info;  // back-up donor relation
	default_donee->donor_info = NULL;  // temporarily break any donor relation.

	// initialize our search
	donee_node = NULL;
	distance = MIG_NONE;

	// TODO: The below search logic may work well for locating nodes to steal
	// when an FQ goes idle.  Validate this code and apply it to stealing.

	// begin search with affinity GPU.
start = gpu_to_base_replica(aff, tsk_rt(donor)->last_gpu); i = start; do { // "for each gpu" / "for each aff->nr_rsrc" gpu_migration_dist_t temp_distance = gpu_migration_distance(start, i); // only interested in queues that will improve our distance if(temp_distance < distance || donee_node == NULL) { int dist_from_head = IKGLP_INVAL_DISTANCE; TRACE_CUR("searching for donor on GPU %d", i); // visit each queue and pick a donee. bail as soon as we find // one for this class. for(j = 0; j < aff->nr_simult; ++j) { int temp_dist_from_head; ikglp_donee_heap_node_t *temp_donee_node; struct fifo_queue *fq; fq = &(sem->fifo_queues[i + j*aff->nr_rsrc]); temp_donee_node = pick_donee(aff, fq, &temp_dist_from_head); if(temp_dist_from_head < dist_from_head) { // we check all the FQs for this GPU to spread priorities // out across the queues. does this decrease jitter? donee_node = temp_donee_node; dist_from_head = temp_dist_from_head; } } if(dist_from_head != IKGLP_INVAL_DISTANCE) { TRACE_CUR("found donee %s/%d and is the %d-th waiter.\n", donee_node->task->comm, donee_node->task->pid, dist_from_head); } else { TRACE_CUR("found no eligible donors from GPU %d\n", i); } } else { TRACE_CUR("skipping GPU %d (distance = %d, best donor " "distance = %d)\n", i, temp_distance, distance); } i = (i+1 < aff->nr_rsrc) ? i+1 : 0; // increment with wrap-around } while (i != start); // restore old donor info state. default_donee->donor_info = default_donee_donor_info; if(!donee_node) { donee_node = default_donee; TRACE_CUR("Could not find a donee. 
We have to steal one.\n"); WARN_ON(default_donee->donor_info == NULL); } out: TRACE_CUR("Selected donee %s/%d on fq %d (GPU %d) for %s/%d with affinity for GPU %d\n", donee_node->task->comm, donee_node->task->pid, ikglp_get_idx(sem, donee_node->fq), replica_to_gpu(aff, ikglp_get_idx(sem, donee_node->fq)), donor->comm, donor->pid, tsk_rt(donor)->last_gpu); return(donee_node); } static void __find_closest_donor(int target_gpu, struct binheap_node* donor_node, ikglp_wait_state_t** cur_closest, int* cur_dist) { ikglp_wait_state_t *this_donor = binheap_entry(donor_node, ikglp_wait_state_t, node); int this_dist = gpu_migration_distance(target_gpu, tsk_rt(this_donor->task)->last_gpu); // TRACE_CUR("%s/%d: dist from target = %d\n", // this_donor->task->comm, // this_donor->task->pid, // this_dist); if(this_dist < *cur_dist) { // take this donor *cur_dist = this_dist; *cur_closest = this_donor; } else if(this_dist == *cur_dist) { // priority tie-break. Even though this is a pre-order traversal, // this is a heap, not a binary tree, so we still need to do a priority // comparision. if(!(*cur_closest) || litmus->compare(this_donor->task, (*cur_closest)->task)) { *cur_dist = this_dist; *cur_closest = this_donor; } } if(donor_node->left) __find_closest_donor(target_gpu, donor_node->left, cur_closest, cur_dist); if(donor_node->right) __find_closest_donor(target_gpu, donor_node->right, cur_closest, cur_dist); } ikglp_wait_state_t* gpu_ikglp_advise_donor_to_fq(struct ikglp_affinity* aff, struct fifo_queue* fq) { // Huristic strategy: Find donor with the closest affinity to fq. // Tie-break on priority. // We need to iterate over all the donors to do this. Unfortunatly, // our donors are organized in a heap. We'll visit each node with a // recurisve call. This is realitively safe since there are only sem->m // donors, at most. We won't recurse too deeply to have to worry about // our stack. (even with 128 CPUs, our nest depth is at most 7 deep). 
struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock); ikglp_wait_state_t *donor = NULL; int distance = MIG_NONE; int gpu = replica_to_gpu(aff, ikglp_get_idx(sem, fq)); ikglp_wait_state_t* default_donor = binheap_top_entry(&sem->donors, ikglp_wait_state_t, node); __find_closest_donor(gpu, sem->donors.root, &donor, &distance); TRACE_CUR("Selected donor %s/%d (distance = %d) to move to fq %d " "(non-aff wanted %s/%d). differs = %d\n", donor->task->comm, donor->task->pid, distance, ikglp_get_idx(sem, fq), default_donor->task->comm, default_donor->task->pid, (donor->task != default_donor->task) ); return(donor); } void gpu_ikglp_notify_enqueue(struct ikglp_affinity* aff, struct fifo_queue* fq, struct task_struct* t) { struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock); int replica = ikglp_get_idx(sem, fq); int gpu = replica_to_gpu(aff, replica); struct ikglp_queue_info *info = &aff->q_info[replica]; lt_t est_time; lt_t est_len_before; if(current == t) { tsk_rt(t)->suspend_gpu_tracker_on_block = 1; } est_len_before = info->estimated_len; est_time = get_gpu_estimate(t, gpu_migration_distance(tsk_rt(t)->last_gpu, gpu)); info->estimated_len += est_time; TRACE_CUR("fq %d: q_len (%llu) + est_cs (%llu) = %llu\n", ikglp_get_idx(sem, info->q), est_len_before, est_time, info->estimated_len); // if(aff->shortest_queue == info) { // // we may no longer be the shortest // aff->shortest_queue = ikglp_aff_find_shortest(aff); // // TRACE_CUR("shortest queue is fq %d (with %d in queue) has est len %llu\n", // ikglp_get_idx(sem, aff->shortest_queue->q), // aff->shortest_queue->q->count, // aff->shortest_queue->estimated_len); // } } void gpu_ikglp_notify_dequeue(struct ikglp_affinity* aff, struct fifo_queue* fq, struct task_struct* t) { struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock); int replica = ikglp_get_idx(sem, fq); int gpu = replica_to_gpu(aff, replica); struct ikglp_queue_info *info = &aff->q_info[replica]; lt_t est_time = get_gpu_estimate(t, 
gpu_migration_distance(tsk_rt(t)->last_gpu, gpu)); if(est_time > info->estimated_len) { WARN_ON(1); info->estimated_len = 0; } else { info->estimated_len -= est_time; } TRACE_CUR("fq %d est len is now %llu\n", ikglp_get_idx(sem, info->q), info->estimated_len); // check to see if we're the shortest queue now. // if((aff->shortest_queue != info) && // (aff->shortest_queue->estimated_len > info->estimated_len)) { // // aff->shortest_queue = info; // // TRACE_CUR("shortest queue is fq %d (with %d in queue) has est len %llu\n", // ikglp_get_idx(sem, info->q), // info->q->count, // info->estimated_len); // } } int gpu_ikglp_notify_exit(struct ikglp_affinity* aff, struct task_struct* t) { struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock); unsigned long flags = 0, real_flags; int aff_rsrc; #ifdef CONFIG_LITMUS_DGL_SUPPORT raw_spinlock_t *dgl_lock; dgl_lock = litmus->get_dgl_spinlock(t); #endif if (tsk_rt(t)->last_gpu < 0) return 0; raw_spin_lock_irqsave(&sem->real_lock, real_flags); lock_global_irqsave(dgl_lock, flags); lock_fine_irqsave(&sem->lock, flags); // decrement affinity count on old GPU aff_rsrc = tsk_rt(t)->last_gpu - aff->offset; --(aff->nr_aff_on_rsrc[aff_rsrc]); if(unlikely(aff->nr_aff_on_rsrc[aff_rsrc] < 0)) { WARN_ON(aff->nr_aff_on_rsrc[aff_rsrc] < 0); aff->nr_aff_on_rsrc[aff_rsrc] = 0; } unlock_fine_irqrestore(&sem->lock, flags); unlock_global_irqrestore(dgl_lock, flags); raw_spin_unlock_irqrestore(&sem->real_lock, real_flags); return 0; } int gpu_ikglp_notify_exit_trampoline(struct task_struct* t) { struct ikglp_affinity* aff = (struct ikglp_affinity*)tsk_rt(t)->rsrc_exit_cb_args; if(likely(aff)) { return gpu_ikglp_notify_exit(aff, t); } else { return -1; } } void gpu_ikglp_notify_acquired(struct ikglp_affinity* aff, struct fifo_queue* fq, struct task_struct* t) { struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock); int replica = ikglp_get_idx(sem, fq); int gpu = replica_to_gpu(aff, replica); int last_gpu = tsk_rt(t)->last_gpu; 
tsk_rt(t)->gpu_migration = gpu_migration_distance(last_gpu, gpu); // record the type of migration TRACE_CUR("%s/%d acquired gpu %d (prev = %d). migration type = %d\n", t->comm, t->pid, gpu, last_gpu, tsk_rt(t)->gpu_migration); // count the number or resource holders ++(*(aff->q_info[replica].nr_cur_users)); if(gpu != last_gpu) { if(last_gpu >= 0) { int old_rsrc = last_gpu - aff->offset; --(aff->nr_aff_on_rsrc[old_rsrc]); } // increment affinity count on new GPU ++(aff->nr_aff_on_rsrc[gpu - aff->offset]); tsk_rt(t)->rsrc_exit_cb_args = aff; tsk_rt(t)->rsrc_exit_cb = gpu_ikglp_notify_exit_trampoline; } reg_nv_device(gpu, 1, t); // register tsk_rt(t)->suspend_gpu_tracker_on_block = 0; reset_gpu_tracker(t); start_gpu_tracker(t); } void gpu_ikglp_notify_freed(struct ikglp_affinity* aff, struct fifo_queue* fq, struct task_struct* t) { struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock); int replica = ikglp_get_idx(sem, fq); int gpu = replica_to_gpu(aff, replica); lt_t est_time; stop_gpu_tracker(t); // stop the tracker before we do anything else. est_time = get_gpu_estimate(t, gpu_migration_distance(tsk_rt(t)->last_gpu, gpu)); // count the number or resource holders --(*(aff->q_info[replica].nr_cur_users)); reg_nv_device(gpu, 0, t); // unregister // update estimates update_gpu_estimate(t, get_gpu_time(t)); TRACE_CUR("%s/%d freed gpu %d (prev = %d). mig type = %d. actual time was %llu. " "estimated was %llu. 
diff is %d\n", t->comm, t->pid, gpu, tsk_rt(t)->last_gpu, tsk_rt(t)->gpu_migration, get_gpu_time(t), est_time, (long long)get_gpu_time(t) - (long long)est_time); tsk_rt(t)->last_gpu = gpu; } struct ikglp_affinity_ops gpu_ikglp_affinity = { .advise_enqueue = gpu_ikglp_advise_enqueue, .advise_steal = gpu_ikglp_advise_steal, .advise_donee_selection = gpu_ikglp_advise_donee_selection, .advise_donor_to_fq = gpu_ikglp_advise_donor_to_fq, .notify_enqueue = gpu_ikglp_notify_enqueue, .notify_dequeue = gpu_ikglp_notify_dequeue, .notify_acquired = gpu_ikglp_notify_acquired, .notify_freed = gpu_ikglp_notify_freed, .notify_exit = gpu_ikglp_notify_exit, .replica_to_resource = gpu_replica_to_resource, }; struct affinity_observer* ikglp_gpu_aff_obs_new(struct affinity_observer_ops* ops, void* __user args) { return ikglp_aff_obs_new(ops, &gpu_ikglp_affinity, args); } // Simple ikglp Affinity (standard ikglp with auto-gpu registration) struct fifo_queue* simple_gpu_ikglp_advise_enqueue(struct ikglp_affinity* aff, struct task_struct* t) { struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock); int min_count; int min_nr_users; struct ikglp_queue_info *shortest; struct fifo_queue *to_enqueue; int i; // TRACE_CUR("Simple GPU ikglp advise_enqueue invoked\n"); shortest = &aff->q_info[0]; min_count = shortest->q->count; min_nr_users = *(shortest->nr_cur_users); TRACE_CUR("queue %d: waiters = %d, total holders = %d\n", ikglp_get_idx(sem, shortest->q), shortest->q->count, min_nr_users); for(i = 1; i < sem->nr_replicas; ++i) { int len = aff->q_info[i].q->count; // queue is smaller, or they're equal and the other has a smaller number // of total users. // // tie-break on the shortest number of simult users. this only kicks in // when there are more than 1 empty queues. 
if((len < min_count) || ((len == min_count) && (*(aff->q_info[i].nr_cur_users) < min_nr_users))) { shortest = &aff->q_info[i]; min_count = shortest->q->count; min_nr_users = *(aff->q_info[i].nr_cur_users); } TRACE_CUR("queue %d: waiters = %d, total holders = %d\n", ikglp_get_idx(sem, aff->q_info[i].q), aff->q_info[i].q->count, *(aff->q_info[i].nr_cur_users)); } to_enqueue = shortest->q; TRACE_CUR("enqueue on fq %d (non-aff wanted fq %d)\n", ikglp_get_idx(sem, to_enqueue), ikglp_get_idx(sem, sem->shortest_fifo_queue)); return to_enqueue; } ikglp_wait_state_t* simple_gpu_ikglp_advise_steal(struct ikglp_affinity* aff, struct fifo_queue* dst) { struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock); // TRACE_CUR("Simple GPU ikglp advise_steal invoked\n"); return ikglp_find_hp_waiter_to_steal(sem); } ikglp_donee_heap_node_t* simple_gpu_ikglp_advise_donee_selection(struct ikglp_affinity* aff, struct task_struct* donor) { struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock); ikglp_donee_heap_node_t *donee = binheap_top_entry(&sem->donees, ikglp_donee_heap_node_t, node); return(donee); } ikglp_wait_state_t* simple_gpu_ikglp_advise_donor_to_fq(struct ikglp_affinity* aff, struct fifo_queue* fq) { struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock); ikglp_wait_state_t* donor = binheap_top_entry(&sem->donors, ikglp_wait_state_t, node); return(donor); } void simple_gpu_ikglp_notify_enqueue(struct ikglp_affinity* aff, struct fifo_queue* fq, struct task_struct* t) { // TRACE_CUR("Simple GPU ikglp notify_enqueue invoked\n"); } void simple_gpu_ikglp_notify_dequeue(struct ikglp_affinity* aff, struct fifo_queue* fq, struct task_struct* t) { // TRACE_CUR("Simple GPU ikglp notify_dequeue invoked\n"); } void simple_gpu_ikglp_notify_acquired(struct ikglp_affinity* aff, struct fifo_queue* fq, struct task_struct* t) { struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock); int replica = ikglp_get_idx(sem, fq); int gpu = replica_to_gpu(aff, replica); // 
TRACE_CUR("Simple GPU ikglp notify_acquired invoked\n"); // count the number or resource holders ++(*(aff->q_info[replica].nr_cur_users)); reg_nv_device(gpu, 1, t); // register } void simple_gpu_ikglp_notify_freed(struct ikglp_affinity* aff, struct fifo_queue* fq, struct task_struct* t) { struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock); int replica = ikglp_get_idx(sem, fq); int gpu = replica_to_gpu(aff, replica); // TRACE_CUR("Simple GPU ikglp notify_freed invoked\n"); // count the number or resource holders --(*(aff->q_info[replica].nr_cur_users)); reg_nv_device(gpu, 0, t); // unregister } struct ikglp_affinity_ops simple_gpu_ikglp_affinity = { .advise_enqueue = simple_gpu_ikglp_advise_enqueue, .advise_steal = simple_gpu_ikglp_advise_steal, .advise_donee_selection = simple_gpu_ikglp_advise_donee_selection, .advise_donor_to_fq = simple_gpu_ikglp_advise_donor_to_fq, .notify_enqueue = simple_gpu_ikglp_notify_enqueue, .notify_dequeue = simple_gpu_ikglp_notify_dequeue, .notify_acquired = simple_gpu_ikglp_notify_acquired, .notify_freed = simple_gpu_ikglp_notify_freed, .notify_exit = NULL, .replica_to_resource = gpu_replica_to_resource, }; struct affinity_observer* ikglp_simple_gpu_aff_obs_new(struct affinity_observer_ops* ops, void* __user args) { return ikglp_aff_obs_new(ops, &simple_gpu_ikglp_affinity, args); } #endif