/* litmus/ikglp_lock.c -- IKGLP real-time locking protocol (LITMUS^RT) */

#include <linux/slab.h>
#include <linux/uaccess.h>

#include <litmus/trace.h>
#include <litmus/sched_plugin.h>
#include <litmus/fdso.h>

#if defined(CONFIG_LITMUS_AFFINITY_LOCKING) && defined(CONFIG_LITMUS_NVIDIA)
#include <litmus/gpu_affinity.h>
#include <litmus/nvidia_info.h>
#endif

#include <litmus/ikglp_lock.h>

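/*
 * binheap comparator callbacks used by the IKGLP heaps.  Each extracts the
 * tasks embedded in the two heap nodes and orders them by base priority via
 * litmus->__compare().  The "max" variants put the highest-priority task at
 * the root; the "min" variants reverse the operands so the lowest-priority
 * task ends up at the root.
 */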
int ikglp_max_heap_base_priority_order(struct binheap_node *a,
										   struct binheap_node *b)
{
	ikglp_heap_node_t *d_a = binheap_entry(a, ikglp_heap_node_t, node);
	ikglp_heap_node_t *d_b = binheap_entry(b, ikglp_heap_node_t, node);

	BUG_ON(!d_a);
	BUG_ON(!d_b);

	return litmus->__compare(d_a->task, BASE, d_b->task, BASE);
}

int ikglp_min_heap_base_priority_order(struct binheap_node *a,
										   struct binheap_node *b)
{
	ikglp_heap_node_t *d_a = binheap_entry(a, ikglp_heap_node_t, node);
	ikglp_heap_node_t *d_b = binheap_entry(b, ikglp_heap_node_t, node);

	return litmus->__compare(d_b->task, BASE, d_a->task, BASE);
}

int ikglp_donor_max_heap_base_priority_order(struct binheap_node *a,
												 struct binheap_node *b)
{
	ikglp_wait_state_t *d_a = binheap_entry(a, ikglp_wait_state_t, node);
	ikglp_wait_state_t *d_b = binheap_entry(b, ikglp_wait_state_t, node);

	return litmus->__compare(d_a->task, BASE, d_b->task, BASE);
}


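/*
 * Ordering for the donee heap (sem->donees).  A donee is keyed by the
 * priority it currently benefits from: its donor's priority if it already
 * has a donor, otherwise its own.  The reversed __compare() puts the donee
 * receiving the lowest such priority at the root, which is the first
 * candidate examined when a new donor selects a donee (see
 * ikglp_enqueue_on_donor()).
 */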
int ikglp_min_heap_donee_order(struct binheap_node *a,
								   struct binheap_node *b)
{
	struct task_struct *prio_a, *prio_b;

	ikglp_donee_heap_node_t *d_a =
		binheap_entry(a, ikglp_donee_heap_node_t, node);
	ikglp_donee_heap_node_t *d_b =
		binheap_entry(b, ikglp_donee_heap_node_t, node);

	if(!d_a->donor_info) {
		prio_a = d_a->task;
	}
	else {
		prio_a = d_a->donor_info->task;
		BUG_ON(d_a->task != d_a->donor_info->donee_info->task);
	}

	if(!d_b->donor_info) {
		prio_b = d_b->task;
	}
	else {
		prio_b = d_b->donor_info->task;
		BUG_ON(d_b->task != d_b->donor_info->donee_info->task);
	}

	// note reversed order
	return litmus->__compare(prio_b, BASE, prio_a, BASE);
}



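/*
 * Small helpers: ikglp_get_idx() maps a FIFO queue pointer back to its
 * replica index within sem->fifo_queues[], and ikglp_get_queue() returns
 * the queue currently owned by 'holder' (or NULL if it owns none).
 */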
static inline int ikglp_get_idx(struct ikglp_semaphore *sem,
								struct fifo_queue *queue)
{
	return (queue - &sem->fifo_queues[0]);
}

static inline struct fifo_queue* ikglp_get_queue(struct ikglp_semaphore *sem,
												 struct task_struct *holder)
{
	int i;
	for(i = 0; i < sem->nr_replicas; ++i)
		if(sem->fifo_queues[i].owner == holder)
			return(&sem->fifo_queues[i]);
	return(NULL);
}



static struct task_struct* ikglp_find_hp_waiter(struct fifo_queue *kqueue,
												struct task_struct *skip)
{
	struct list_head *pos;
	struct task_struct *queued, *found = NULL;

	list_for_each(pos, &kqueue->wait.task_list) {
		queued  = (struct task_struct*) list_entry(pos,
											wait_queue_t, task_list)->private;

		/* Compare task prios, find high prio task. */
		if(queued != skip && litmus->compare(queued, found))
			found = queued;
	}
	return found;
}

static struct fifo_queue* ikglp_find_shortest(struct ikglp_semaphore *sem,
											  struct fifo_queue *search_start)
{
	// we start our search at search_start instead of at the beginning of the
	// queue list to load-balance across all resources.
	struct fifo_queue* step = search_start;
	struct fifo_queue* shortest = sem->shortest_fifo_queue;

	do {
		step = (step+1 != &sem->fifo_queues[sem->nr_replicas]) ?
		step+1 : &sem->fifo_queues[0];

		if(step->count < shortest->count) {
			shortest = step;
			if(step->count == 0)
				break; /* can't get any shorter */
		}

	}while(step != search_start);

	return(shortest);
}

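/*
 * The root of sem->top_m is the lowest-priority task among the m
 * highest-priority tracked requests, i.e. the "m-th highest".  New
 * arrivals are compared against this task to decide whether they belong
 * in the top-m set.
 */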
static inline struct task_struct* ikglp_mth_highest(struct ikglp_semaphore *sem)
{
	return binheap_top_entry(&sem->top_m, ikglp_heap_node_t, node)->task;
}



#if 0
static void print_global_list(struct binheap_node* n, int depth)
{
	ikglp_heap_node_t *global_heap_node;
	char padding[81] = "                                                                                ";

	if(n == NULL) {
		TRACE_CUR("+-> %p\n", NULL);
		return;
	}

	global_heap_node = binheap_entry(n, ikglp_heap_node_t, node);

	if(depth*2 <= 80)
		padding[depth*2] = '\0';

	TRACE_CUR("%s+-> %s/%d\n",
			  padding,
			  global_heap_node->task->comm,
			  global_heap_node->task->pid);

	if(n->left) print_global_list(n->left, depth+1);
	if(n->right) print_global_list(n->right, depth+1);
}

static void print_donees(struct ikglp_semaphore *sem, struct binheap_node *n, int depth)
{
	ikglp_donee_heap_node_t *donee_node;
	char padding[81] = "                                                                                ";
	struct task_struct* donor = NULL;

	if(n == NULL) {
		TRACE_CUR("+-> %p\n", NULL);
		return;
	}

	donee_node = binheap_entry(n, ikglp_donee_heap_node_t, node);

	if(depth*2 <= 80)
		padding[depth*2] = '\0';

	if(donee_node->donor_info) {
		donor = donee_node->donor_info->task;
	}

	TRACE_CUR("%s+-> %s/%d (d: %s/%d) (fq: %d)\n",
			  padding,
			  donee_node->task->comm,
			  donee_node->task->pid,
			  (donor) ? donor->comm : "nil",
			  (donor) ? donor->pid : -1,
			  ikglp_get_idx(sem, donee_node->fq));

	if(n->left) print_donees(sem, n->left, depth+1);
	if(n->right) print_donees(sem, n->right, depth+1);
}

static void print_donors(struct binheap_node *n, int depth)
{
	ikglp_wait_state_t *donor_node;
	char padding[81] = "                                                                                ";

	if(n == NULL) {
		TRACE_CUR("+-> %p\n", NULL);
		return;
	}

	donor_node = binheap_entry(n, ikglp_wait_state_t, node);

	if(depth*2 <= 80)
		padding[depth*2] = '\0';


	TRACE_CUR("%s+-> %s/%d (donee: %s/%d)\n",
			  padding,
			  donor_node->task->comm,
			  donor_node->task->pid,
			  donor_node->donee_info->task->comm,
			  donor_node->donee_info->task->pid);

	if(n->left) print_donors(n->left, depth+1);
	if(n->right) print_donors(n->right, depth+1);
}
#endif

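/*
 * Add 't' to the global list of outstanding requests.  The list is split
 * across two heaps: sem->top_m holds the m highest-priority requests (kept
 * min-ordered -- the eviction logic below relies on the root being the
 * lowest-priority member) and sem->not_top_m holds everything else.  If 't'
 * out-prioritizes the current m-th highest request, that request is evicted
 * into not_top_m to make room.
 */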
static void ikglp_add_global_list(struct ikglp_semaphore *sem,
								  struct task_struct *t,
								  ikglp_heap_node_t *node)
{


	node->task = t;
	INIT_BINHEAP_NODE(&node->node);

	if(sem->top_m_size < sem->m) {
		TRACE_CUR("Trivially adding %s/%d to top-m global list.\n",
				  t->comm, t->pid);
//		TRACE_CUR("Top-M Before (size = %d):\n", sem->top_m_size);
//		print_global_list(sem->top_m.root, 1);

		binheap_add(&node->node, &sem->top_m, ikglp_heap_node_t, node);
		++(sem->top_m_size);

//		TRACE_CUR("Top-M After (size = %d):\n", sem->top_m_size);
//		print_global_list(sem->top_m.root, 1);
	}
	else if(litmus->__compare(t, BASE, ikglp_mth_highest(sem), BASE)) {
		ikglp_heap_node_t *evicted =
			binheap_top_entry(&sem->top_m, ikglp_heap_node_t, node);

		TRACE_CUR("Adding %s/%d to top-m and evicting %s/%d.\n",
				  t->comm, t->pid,
				  evicted->task->comm, evicted->task->pid);

//		TRACE_CUR("Not-Top-M Before:\n");
//		print_global_list(sem->not_top_m.root, 1);
//		TRACE_CUR("Top-M Before (size = %d):\n", sem->top_m_size);
//		print_global_list(sem->top_m.root, 1);


		binheap_delete_root(&sem->top_m, ikglp_heap_node_t, node);
		INIT_BINHEAP_NODE(&evicted->node);
		binheap_add(&evicted->node, &sem->not_top_m, ikglp_heap_node_t, node);

		binheap_add(&node->node, &sem->top_m, ikglp_heap_node_t, node);

//		TRACE_CUR("Top-M After (size = %d):\n", sem->top_m_size);
//		print_global_list(sem->top_m.root, 1);
//		TRACE_CUR("Not-Top-M After:\n");
//		print_global_list(sem->not_top_m.root, 1);
	}
	else {
		TRACE_CUR("Trivially adding %s/%d to not-top-m global list.\n",
				  t->comm, t->pid);
//		TRACE_CUR("Not-Top-M Before:\n");
//		print_global_list(sem->not_top_m.root, 1);

		binheap_add(&node->node, &sem->not_top_m, ikglp_heap_node_t, node);

//		TRACE_CUR("Not-Top-M After:\n");
//		print_global_list(sem->not_top_m.root, 1);
	}
}


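/*
 * Remove 't' from whichever global heap currently holds its node.  If the
 * node was in top_m, the request at the root of not_top_m is promoted to
 * take its place; if not_top_m is empty, top_m simply shrinks.
 */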
static void ikglp_del_global_list(struct ikglp_semaphore *sem,
								  struct task_struct *t,
								  ikglp_heap_node_t *node)
{
	BUG_ON(!binheap_is_in_heap(&node->node));

	TRACE_CUR("Removing %s/%d from global list.\n", t->comm, t->pid);

	if(binheap_is_in_this_heap(&node->node, &sem->top_m)) {
		TRACE_CUR("%s/%d is in top-m\n", t->comm, t->pid);

//		TRACE_CUR("Not-Top-M Before:\n");
//		print_global_list(sem->not_top_m.root, 1);
//		TRACE_CUR("Top-M Before (size = %d):\n", sem->top_m_size);
//		print_global_list(sem->top_m.root, 1);


		binheap_delete(&node->node, &sem->top_m);

		if(!binheap_empty(&sem->not_top_m)) {
			ikglp_heap_node_t *promoted =
				binheap_top_entry(&sem->not_top_m, ikglp_heap_node_t, node);

			TRACE_CUR("Promoting %s/%d to top-m\n",
					  promoted->task->comm, promoted->task->pid);

			binheap_delete_root(&sem->not_top_m, ikglp_heap_node_t, node);
			INIT_BINHEAP_NODE(&promoted->node);

			binheap_add(&promoted->node, &sem->top_m, ikglp_heap_node_t, node);
		}
		else {
			TRACE_CUR("No one to promote to top-m.\n");
			--(sem->top_m_size);
		}

//		TRACE_CUR("Top-M After (size = %d):\n", sem->top_m_size);
//		print_global_list(sem->top_m.root, 1);
//		TRACE_CUR("Not-Top-M After:\n");
//		print_global_list(sem->not_top_m.root, 1);
	}
	else {
//		TRACE_CUR("%s/%d is in not-top-m\n", t->comm, t->pid);
//		TRACE_CUR("Not-Top-M Before:\n");
//		print_global_list(sem->not_top_m.root, 1);

		binheap_delete(&node->node, &sem->not_top_m);

//		TRACE_CUR("Not-Top-M After:\n");
//		print_global_list(sem->not_top_m.root, 1);
	}
}


static void ikglp_add_donees(struct ikglp_semaphore *sem,
							 struct fifo_queue *fq,
							 struct task_struct *t,
							 ikglp_donee_heap_node_t* node)
{
//	TRACE_CUR("Adding %s/%d to donee list.\n", t->comm, t->pid);
//	TRACE_CUR("donees Before:\n");
//	print_donees(sem, sem->donees.root, 1);

	node->task = t;
	node->donor_info = NULL;
	node->fq = fq;
	INIT_BINHEAP_NODE(&node->node);

	binheap_add(&node->node, &sem->donees, ikglp_donee_heap_node_t, node);

//	TRACE_CUR("donees After:\n");
//	print_donees(sem, sem->donees.root, 1);
}


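/*
 * Called when 't', blocked on FIFO queue 'fq', has gained priority.  If 't'
 * now out-prioritizes fq->hp_waiter, it becomes the new hp_waiter and the
 * increase is pushed into the queue owner's hp_blocked_tasks heap; if that
 * raises the owner's effective priority, the change is propagated via
 * litmus->nested_increase_prio().  Every path releases sem->lock before
 * returning (either directly or inside nested_increase_prio()).
 */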
static void ikglp_refresh_owners_prio_increase(struct task_struct *t,
											   struct fifo_queue *fq,
											   struct ikglp_semaphore *sem,
											   unsigned long flags)
{
	// priority of 't' has increased (note: 't' might already be hp_waiter).
	if ((t == fq->hp_waiter) || litmus->compare(t, fq->hp_waiter)) {
		struct task_struct *old_max_eff_prio;
		struct task_struct *new_max_eff_prio;
		struct task_struct *new_prio = NULL;
		struct task_struct *owner = fq->owner;

		if(fq->hp_waiter)
			TRACE_TASK(t, "has higher prio than hp_waiter (%s/%d).\n",
					   fq->hp_waiter->comm, fq->hp_waiter->pid);
		else
			TRACE_TASK(t, "has higher prio than hp_waiter (NIL).\n");

		if(owner)
		{
			raw_spin_lock(&tsk_rt(owner)->hp_blocked_tasks_lock);

//			TRACE_TASK(owner, "Heap Before:\n");
//			print_hp_waiters(tsk_rt(owner)->hp_blocked_tasks.root, 0);

			old_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks);

			fq->hp_waiter = t;
			fq->nest.hp_waiter_eff_prio = effective_priority(fq->hp_waiter);

			binheap_decrease(&fq->nest.hp_binheap_node,
							 &tsk_rt(owner)->hp_blocked_tasks);

//			TRACE_TASK(owner, "Heap After:\n");
//			print_hp_waiters(tsk_rt(owner)->hp_blocked_tasks.root, 0);

			new_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks);

			if(new_max_eff_prio != old_max_eff_prio) {
				TRACE_TASK(t, "is new hp_waiter.\n");

				if ((effective_priority(owner) == old_max_eff_prio) ||
					(litmus->__compare(new_max_eff_prio, BASE,
									   owner, EFFECTIVE))){
					new_prio = new_max_eff_prio;
				}
			}
			else {
				TRACE_TASK(t, "no change in max_eff_prio of heap.\n");
			}

			if(new_prio) {
				// set new inheritance and propagate
				TRACE_TASK(t, "Effective priority changed for owner %s/%d to %s/%d\n",
						   owner->comm, owner->pid,
						   new_prio->comm, new_prio->pid);
				litmus->nested_increase_prio(owner, new_prio, &sem->lock,
											 flags);  // unlocks lock.
			}
			else {
				TRACE_TASK(t, "No change in effective priority (is %s/%d).  Propagation halted.\n",
						   new_max_eff_prio->comm, new_max_eff_prio->pid);
				raw_spin_unlock(&tsk_rt(owner)->hp_blocked_tasks_lock);
				unlock_fine_irqrestore(&sem->lock, flags);
			}
		}
		else {
			fq->hp_waiter = t;
			fq->nest.hp_waiter_eff_prio = effective_priority(fq->hp_waiter);

			TRACE_TASK(t, "no owner??\n");
			unlock_fine_irqrestore(&sem->lock, flags);
		}
	}
	else {
		TRACE_TASK(t, "hp_waiter is unaffected.\n");
		unlock_fine_irqrestore(&sem->lock, flags);
	}
}

// hp_waiter has decreased
static void ikglp_refresh_owners_prio_decrease(struct fifo_queue *fq,
											   struct ikglp_semaphore *sem,
											   unsigned long flags)
{
	struct task_struct *owner = fq->owner;

	struct task_struct *old_max_eff_prio;
	struct task_struct *new_max_eff_prio;

	if(!owner) {
		TRACE_CUR("No owner.  Returning.\n");
		unlock_fine_irqrestore(&sem->lock, flags);
		return;
	}

	raw_spin_lock(&tsk_rt(owner)->hp_blocked_tasks_lock);

	old_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks);

	binheap_delete(&fq->nest.hp_binheap_node, &tsk_rt(owner)->hp_blocked_tasks);
	fq->nest.hp_waiter_eff_prio = fq->hp_waiter;
	binheap_add(&fq->nest.hp_binheap_node, &tsk_rt(owner)->hp_blocked_tasks,
				struct nested_info, hp_binheap_node);

	new_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks);

	if((old_max_eff_prio != new_max_eff_prio) &&
	   (effective_priority(owner) == old_max_eff_prio))
	{
		// Need to set new effective_priority for owner
		struct task_struct *decreased_prio;

		TRACE_CUR("Propagating decreased inheritance to holder of fq %d.\n",
				  ikglp_get_idx(sem, fq));

		if(litmus->__compare(new_max_eff_prio, BASE, owner, BASE)) {
			TRACE_CUR("%s/%d has greater base priority than base priority of owner (%s/%d) of fq %d.\n",
					  (new_max_eff_prio) ? new_max_eff_prio->comm : "nil",
					  (new_max_eff_prio) ? new_max_eff_prio->pid : -1,
					  owner->comm,
					  owner->pid,
					  ikglp_get_idx(sem, fq));

			decreased_prio = new_max_eff_prio;
		}
		else {
			TRACE_CUR("%s/%d has lesser base priority than base priority of owner (%s/%d) of fq %d.\n",
					  (new_max_eff_prio) ? new_max_eff_prio->comm : "nil",
					  (new_max_eff_prio) ? new_max_eff_prio->pid : -1,
					  owner->comm,
					  owner->pid,
					  ikglp_get_idx(sem, fq));

			decreased_prio = NULL;
		}

		// beware: recursion
		litmus->nested_decrease_prio(owner, decreased_prio, &sem->lock, flags);	// will unlock mutex->lock
	}
	else {
		TRACE_TASK(owner, "No need to propagate priority decrease forward.\n");
		raw_spin_unlock(&tsk_rt(owner)->hp_blocked_tasks_lock);
		unlock_fine_irqrestore(&sem->lock, flags);
	}
}


static void ikglp_remove_donation_from_owner(struct binheap_node *n,
											 struct fifo_queue *fq,
											 struct ikglp_semaphore *sem,
											 unsigned long flags)
{
	struct task_struct *owner = fq->owner;

	struct task_struct *old_max_eff_prio;
	struct task_struct *new_max_eff_prio;

	BUG_ON(!owner);

	raw_spin_lock(&tsk_rt(owner)->hp_blocked_tasks_lock);

	old_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks);

	binheap_delete(n, &tsk_rt(owner)->hp_blocked_tasks);

	new_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks);

	if((old_max_eff_prio != new_max_eff_prio) &&
	   (effective_priority(owner) == old_max_eff_prio))
	{
		// Need to set new effective_priority for owner
		struct task_struct *decreased_prio;

		TRACE_CUR("Propagating decreased inheritance to holder of fq %d.\n",
				  ikglp_get_idx(sem, fq));

		if(litmus->__compare(new_max_eff_prio, BASE, owner, BASE)) {
			TRACE_CUR("has greater base priority than base priority of owner of fq %d.\n",
					  ikglp_get_idx(sem, fq));
			decreased_prio = new_max_eff_prio;
		}
		else {
			TRACE_CUR("has lesser base priority than base priority of owner of fq %d.\n",
					  ikglp_get_idx(sem, fq));
			decreased_prio = NULL;
		}

		// beware: recursion
		litmus->nested_decrease_prio(owner, decreased_prio, &sem->lock, flags);	// will unlock mutex->lock
	}
	else {
		TRACE_TASK(owner, "No need to propagate priority decrease forward.\n");
		raw_spin_unlock(&tsk_rt(owner)->hp_blocked_tasks_lock);
		unlock_fine_irqrestore(&sem->lock, flags);
	}
}

static void ikglp_remove_donation_from_fq_waiter(struct task_struct *t,
												 struct binheap_node *n)
{
	struct task_struct *old_max_eff_prio;
	struct task_struct *new_max_eff_prio;

	raw_spin_lock(&tsk_rt(t)->hp_blocked_tasks_lock);

	old_max_eff_prio = top_priority(&tsk_rt(t)->hp_blocked_tasks);

	binheap_delete(n, &tsk_rt(t)->hp_blocked_tasks);

	new_max_eff_prio = top_priority(&tsk_rt(t)->hp_blocked_tasks);

	if((old_max_eff_prio != new_max_eff_prio) &&
	   (effective_priority(t) == old_max_eff_prio))
	{
		// Need to set new effective_priority for owner
		struct task_struct *decreased_prio;

		if(litmus->__compare(new_max_eff_prio, BASE, t, BASE)) {
			decreased_prio = new_max_eff_prio;
		}
		else {
			decreased_prio = NULL;
		}

		tsk_rt(t)->inh_task = decreased_prio;
	}

	raw_spin_unlock(&tsk_rt(t)->hp_blocked_tasks_lock);
}

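/*
 * Fast path taken by ikglp_lock() when 'fq' has an idle replica: 't'
 * becomes the owner without suspending.  The queue's nested_info is hooked
 * into t's hp_blocked_tasks heap, 't' is added to the global and donee
 * heaps, and the shortest-queue pointer is refreshed.  Releases sem->lock.
 */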
static void ikglp_get_immediate(struct task_struct* t,
								struct fifo_queue *fq,
								struct ikglp_semaphore *sem,
								unsigned long flags)
{
	// resource available now
	TRACE_CUR("queue %d: acquired immediately\n", ikglp_get_idx(sem, fq));

	fq->owner = t;

	raw_spin_lock(&tsk_rt(t)->hp_blocked_tasks_lock);
	binheap_add(&fq->nest.hp_binheap_node, &tsk_rt(t)->hp_blocked_tasks,
				struct nested_info, hp_binheap_node);
	raw_spin_unlock(&tsk_rt(t)->hp_blocked_tasks_lock);

	++(fq->count);

	ikglp_add_global_list(sem, t, &fq->global_heap_node);
	ikglp_add_donees(sem, fq, t, &fq->donee_heap_node);

	sem->shortest_fifo_queue = ikglp_find_shortest(sem, sem->shortest_fifo_queue);

#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
	if(sem->aff_obs) {
		sem->aff_obs->ops->notify_enqueue(sem->aff_obs, fq, t);
		sem->aff_obs->ops->notify_acquired(sem->aff_obs, fq, t);
	}
#endif

	unlock_fine_irqrestore(&sem->lock, flags);
}





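/*
 * Append 't' to FIFO queue 'fq' and update the bookkeeping heaps.  The
 * global_heap_node/donee_heap_node arguments may be NULL when the caller
 * already has 't' in the corresponding heap (e.g. when relocating a donor),
 * in which case the re-insertion is skipped.  sem->lock is not released
 * here; priority propagation is left to the caller.
 */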
static void __ikglp_enqueue_on_fq(struct ikglp_semaphore *sem,
								  struct fifo_queue* fq,
								  struct task_struct* t,
								  wait_queue_t *wait,
								  ikglp_heap_node_t *global_heap_node,
								  ikglp_donee_heap_node_t *donee_heap_node)
{
	/* resource is not free => must suspend and wait */
	TRACE_TASK(t, "Enqueuing on fq %d.\n",
			   ikglp_get_idx(sem, fq));

	init_waitqueue_entry(wait, t);

	__add_wait_queue_tail_exclusive(&fq->wait, wait);

	++(fq->count);

	// update global list.
	if(likely(global_heap_node)) {
		if(binheap_is_in_heap(&global_heap_node->node)) {
			WARN_ON(1);
			ikglp_del_global_list(sem, t, global_heap_node);
		}
		ikglp_add_global_list(sem, t, global_heap_node);
	}
	// update donor eligibility list.
	if(likely(donee_heap_node)) {
//		if(binheap_is_in_heap(&donee_heap_node->node)) {
//			WARN_ON(1);
//		}
		ikglp_add_donees(sem, fq, t, donee_heap_node);
	}

	if(sem->shortest_fifo_queue == fq) {
		sem->shortest_fifo_queue = ikglp_find_shortest(sem, fq);
	}

#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
	if(sem->aff_obs) {
		sem->aff_obs->ops->notify_enqueue(sem->aff_obs, fq, t);
	}
#endif

	TRACE_TASK(t, "shortest queue is now %d\n", ikglp_get_idx(sem, fq));
}


static void ikglp_enqueue_on_fq(
								struct ikglp_semaphore *sem,
								struct fifo_queue *fq,
								ikglp_wait_state_t *wait,
								unsigned long flags)
{
	/* resource is not free => must suspend and wait */
	TRACE_TASK(wait->task, "queue %d: Resource is not free => must suspend and wait.\n",
			   ikglp_get_idx(sem, fq));

	INIT_BINHEAP_NODE(&wait->global_heap_node.node);
	INIT_BINHEAP_NODE(&wait->donee_heap_node.node);

	__ikglp_enqueue_on_fq(sem, fq, wait->task, &wait->fq_node,
						  &wait->global_heap_node, &wait->donee_heap_node);

	ikglp_refresh_owners_prio_increase(wait->task, fq, sem, flags);  // unlocks sem->lock
}


static void __ikglp_enqueue_on_pq(struct ikglp_semaphore *sem,
								  ikglp_wait_state_t *wait)
{
	TRACE_TASK(wait->task, "goes to PQ.\n");

	wait->pq_node.task = wait->task; // copy over task (little redundant...)

	binheap_add(&wait->pq_node.node, &sem->priority_queue,
				ikglp_heap_node_t, node);
}

static void ikglp_enqueue_on_pq(struct ikglp_semaphore *sem,
								ikglp_wait_state_t *wait)
{
	INIT_BINHEAP_NODE(&wait->global_heap_node.node);
	INIT_BINHEAP_NODE(&wait->donee_heap_node.node);
	INIT_BINHEAP_NODE(&wait->pq_node.node);

	__ikglp_enqueue_on_pq(sem, wait);
}

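/*
 * Called by ikglp_lock() when the FIFO queues are full and 't' is not
 * out-prioritized by the m-th highest request: 't' becomes a priority
 * donor.  It is added to the global list, a donee is selected (the root of
 * sem->donees, or an affinity-advised choice), any donor the donee already
 * had is displaced onto the PQ, and the donation is inserted into the
 * donee's hp_blocked_tasks heap.  If the donee's effective priority rises,
 * the increase is propagated to the donee or its queue owner.  Releases
 * sem->lock.
 */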
static void ikglp_enqueue_on_donor(struct ikglp_semaphore *sem,
								   ikglp_wait_state_t* wait,
								   unsigned long flags)
{
	struct task_struct *t = wait->task;
	ikglp_donee_heap_node_t *donee_node = NULL;
	struct task_struct *donee;

	struct task_struct *old_max_eff_prio;
	struct task_struct *new_max_eff_prio;
	struct task_struct *new_prio = NULL;

	INIT_BINHEAP_NODE(&wait->global_heap_node.node);
	INIT_BINHEAP_NODE(&wait->donee_heap_node.node);
	INIT_BINHEAP_NODE(&wait->pq_node.node);
	INIT_BINHEAP_NODE(&wait->node);

//	TRACE_CUR("Adding %s/%d as donor.\n", t->comm, t->pid);
//	TRACE_CUR("donors Before:\n");
//	print_donors(sem->donors.root, 1);

	// Add donor to the global list.
	ikglp_add_global_list(sem, t, &wait->global_heap_node);

	// Select a donee
#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
	donee_node = (sem->aff_obs) ?
		sem->aff_obs->ops->advise_donee_selection(sem->aff_obs, t) :
		binheap_top_entry(&sem->donees, ikglp_donee_heap_node_t, node);
#else
	donee_node = binheap_top_entry(&sem->donees, ikglp_donee_heap_node_t, node);
#endif

	donee = donee_node->task;

	TRACE_TASK(t, "Donee selected: %s/%d\n", donee->comm, donee->pid);

	TRACE_CUR("Temporarily removing %s/%d to donee list.\n",
			  donee->comm, donee->pid);
//	TRACE_CUR("donees Before:\n");
//	print_donees(sem, sem->donees.root, 1);

	//binheap_delete_root(&sem->donees, ikglp_donee_heap_node_t, node);  // will re-add it shortly
	binheap_delete(&donee_node->node, &sem->donees);

//	TRACE_CUR("donees After:\n");
//	print_donees(sem, sem->donees.root, 1);


	wait->donee_info = donee_node;

	// Add t to donor heap.
	binheap_add(&wait->node, &sem->donors, ikglp_wait_state_t, node);

	// Now adjust the donee's priority.

	// Lock the donee's inheritance heap.
	raw_spin_lock(&tsk_rt(donee)->hp_blocked_tasks_lock);

	old_max_eff_prio = top_priority(&tsk_rt(donee)->hp_blocked_tasks);

	if(donee_node->donor_info) {
		// Steal donation relation.  Evict old donor to PQ.

		// Remove old donor from donor heap
		ikglp_wait_state_t *old_wait = donee_node->donor_info;
		struct task_struct *old_donor = old_wait->task;

		TRACE_TASK(t, "Donee (%s/%d) had donor %s/%d.  Moving old donor to PQ.\n",
				   donee->comm, donee->pid, old_donor->comm, old_donor->pid);

		binheap_delete(&old_wait->node, &sem->donors);

		// Remove donation from donee's inheritance heap.
		binheap_delete(&old_wait->prio_donation.hp_binheap_node,
					   &tsk_rt(donee)->hp_blocked_tasks);
		// WARNING: have not updated inh_prio!

		// Add old donor to PQ.
		__ikglp_enqueue_on_pq(sem, old_wait);

		// Remove old donor from the global heap.
		ikglp_del_global_list(sem, old_donor, &old_wait->global_heap_node);
	}

	// Add back donee's node to the donees heap with increased prio
	donee_node->donor_info = wait;
	INIT_BINHEAP_NODE(&donee_node->node);


	TRACE_CUR("Adding %s/%d back to donee list.\n", donee->comm, donee->pid);
//	TRACE_CUR("donees Before:\n");
//	print_donees(sem, sem->donees.root, 1);

	binheap_add(&donee_node->node, &sem->donees, ikglp_donee_heap_node_t, node);

//	TRACE_CUR("donees After:\n");
//	print_donees(sem, sem->donees.root, 1);

	// Add an inheritance/donation to the donee's inheritance heap.
	wait->prio_donation.lock = (struct litmus_lock*)sem;
	wait->prio_donation.hp_waiter_eff_prio = t;
	wait->prio_donation.hp_waiter_ptr = NULL;
	INIT_BINHEAP_NODE(&wait->prio_donation.hp_binheap_node);

	binheap_add(&wait->prio_donation.hp_binheap_node,
				&tsk_rt(donee)->hp_blocked_tasks,
				struct nested_info, hp_binheap_node);

	new_max_eff_prio = top_priority(&tsk_rt(donee)->hp_blocked_tasks);

	if(new_max_eff_prio != old_max_eff_prio) {
		if ((effective_priority(donee) == old_max_eff_prio) ||
			(litmus->__compare(new_max_eff_prio, BASE, donee, EFFECTIVE))){
			TRACE_TASK(t, "Donation increases %s/%d's effective priority\n",
					   donee->comm, donee->pid);
			new_prio = new_max_eff_prio;
		}
//		else {
//			// should be bug.  donor would not be in top-m.
//			TRACE_TASK(t, "Donation is not greater than base prio of %s/%d?\n", donee->comm, donee->pid);
//			WARN_ON(1);
//		}
//	}
//	else {
//		// should be bug.  donor would not be in top-m.
//		TRACE_TASK(t, "No change in %s/%d's inheritance heap?\n", donee->comm, donee->pid);
//		WARN_ON(1);
	}

	if(new_prio) {
		struct fifo_queue *donee_fq = donee_node->fq;

		if(donee != donee_fq->owner) {
			TRACE_TASK(t, "%s/%d is not the owner. Propagating priority to owner %s/%d.\n",
					   donee->comm, donee->pid,
					   donee_fq->owner->comm, donee_fq->owner->pid);

			raw_spin_unlock(&tsk_rt(donee)->hp_blocked_tasks_lock);
			ikglp_refresh_owners_prio_increase(donee, donee_fq, sem, flags);  // unlocks sem->lock
		}
		else {
			TRACE_TASK(t, "%s/%d is the owner. Progatating priority immediatly.\n",
					   donee->comm, donee->pid);
			litmus->nested_increase_prio(donee, new_prio, &sem->lock, flags);  // unlocks sem->lock and donee's heap lock
		}
	}
	else {
		TRACE_TASK(t, "No change in effective priority (it is %d/%s).  BUG?\n",
				   new_max_eff_prio->comm, new_max_eff_prio->pid);
		raw_spin_unlock(&tsk_rt(donee)->hp_blocked_tasks_lock);
		unlock_fine_irqrestore(&sem->lock, flags);
	}


//	TRACE_CUR("donors After:\n");
//	print_donors(sem->donors.root, 1);
}


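/*
 * Acquire one replica.  The request targets the shortest FIFO queue (or an
 * affinity-advised one).  If that queue has an idle replica the request is
 * granted immediately; otherwise the task enqueues on the FIFO queue, on
 * the priority queue, or as a donor, depending on queue length and its base
 * priority, and suspends until it is granted a replica.  Returns the
 * replica index (or an affinity-mapped resource id when an affinity
 * observer is attached), or -EPERM for non-real-time tasks.
 */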
int ikglp_lock(struct litmus_lock* l)
{
	struct task_struct* t = current;
	struct ikglp_semaphore *sem = ikglp_from_lock(l);
	unsigned long flags = 0, real_flags;
	struct fifo_queue *fq = NULL;
	int replica = -EINVAL;

#ifdef CONFIG_LITMUS_DGL_SUPPORT
	raw_spinlock_t *dgl_lock;
#endif

	ikglp_wait_state_t wait;

	if (!is_realtime(t))
		return -EPERM;

#ifdef CONFIG_LITMUS_DGL_SUPPORT
	dgl_lock = litmus->get_dgl_spinlock(t);
#endif

	raw_spin_lock_irqsave(&sem->real_lock, real_flags);

	lock_global_irqsave(dgl_lock, flags);
	lock_fine_irqsave(&sem->lock, flags);


#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
	fq = (sem->aff_obs) ?
		sem->aff_obs->ops->advise_enqueue(sem->aff_obs, t) :
		sem->shortest_fifo_queue;
#else
	fq = sem->shortest_fifo_queue;
#endif

	if(fq->count == 0) {
		// take available resource
		replica = ikglp_get_idx(sem, fq);

		ikglp_get_immediate(t, fq, sem, flags);  // unlocks sem->lock

		unlock_global_irqrestore(dgl_lock, flags);
		raw_spin_unlock_irqrestore(&sem->real_lock, real_flags);
	}
	else
	{
		// we have to suspend.

		wait.task = t;   // THIS IS CRITICALLY IMPORTANT!!!

		tsk_rt(t)->blocked_lock = (struct litmus_lock*)sem;  // record where we are blocked
		mb();

		/* FIXME: interruptible would be nice some day */
		set_task_state(t, TASK_UNINTERRUPTIBLE);

		if(fq->count < sem->max_fifo_len) {
			// enqueue on fq
			ikglp_enqueue_on_fq(sem, fq, &wait, flags);  // unlocks sem->lock
		}
		else {

			TRACE_CUR("IKGLP fifo queues are full (at least they better be).\n");

			// no room in fifos.  Go to PQ or donors.

			if(litmus->__compare(ikglp_mth_highest(sem), BASE, t, BASE)) {
				// enqueue on PQ
				ikglp_enqueue_on_pq(sem, &wait);
				unlock_fine_irqrestore(&sem->lock, flags);
			}
			else {
				// enqueue as donor
				ikglp_enqueue_on_donor(sem, &wait, flags);	 // unlocks sem->lock
			}
		}

		unlock_global_irqrestore(dgl_lock, flags);
		raw_spin_unlock_irqrestore(&sem->real_lock, real_flags);

		TS_LOCK_SUSPEND;

		schedule();

		TS_LOCK_RESUME;

		fq = ikglp_get_queue(sem, t);
		BUG_ON(!fq);

		replica = ikglp_get_idx(sem, fq);
	}

	TRACE_CUR("Acquired lock %d, queue %d\n",
			  l->ident, replica);

#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
	if(sem->aff_obs) {
		return sem->aff_obs->ops->replica_to_resource(sem->aff_obs, fq);
	}
#endif

	return replica;
}

static void ikglp_move_donor_to_fq(struct ikglp_semaphore *sem,
								   struct fifo_queue *fq,
								   ikglp_wait_state_t *donor_info)
{
	struct task_struct *t = donor_info->task;

	TRACE_CUR("Donor %s/%d being moved to fq %d\n",
			  t->comm,
			  t->pid,
			  ikglp_get_idx(sem, fq));

	binheap_delete(&donor_info->node, &sem->donors);

	__ikglp_enqueue_on_fq(sem, fq, t,
						  &donor_info->fq_node,
						  NULL, // already in global_list, so pass null to prevent adding 2nd time.
						  &donor_info->donee_heap_node);

	// warning:
	// ikglp_update_owners_prio(t, fq, sem, flags) has not been called.
}

static void ikglp_move_pq_to_fq(struct ikglp_semaphore *sem,
								struct fifo_queue *fq,
								ikglp_wait_state_t *wait)
{
	struct task_struct *t = wait->task;

	TRACE_CUR("PQ request %s/%d being moved to fq %d\n",
			  t->comm,
			  t->pid,
			  ikglp_get_idx(sem, fq));

	binheap_delete(&wait->pq_node.node, &sem->priority_queue);

	__ikglp_enqueue_on_fq(sem, fq, t,
						  &wait->fq_node,
						  &wait->global_heap_node,
						  &wait->donee_heap_node);
	// warning:
	// ikglp_update_owners_prio(t, fq, sem, flags) has not been called.
}

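/*
 * Pick a request worth stealing for an empty queue: scan every FIFO queue
 * holding more than one request (owner plus waiters) and select the
 * highest-priority hp_waiter among them, then walk that queue to find the
 * corresponding wait-queue entry so the request can be re-enqueued
 * elsewhere.  Returns NULL if no queue has a waiter to spare.
 */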
static ikglp_wait_state_t* ikglp_find_hp_waiter_to_steal(
	struct ikglp_semaphore* sem)
{
	/* must hold sem->lock */

	struct fifo_queue *fq = NULL;
	struct list_head	*pos;
	struct task_struct 	*queued;
	int i;

	for(i = 0; i < sem->nr_replicas; ++i) {
		if( (sem->fifo_queues[i].count > 1) &&
		   (!fq || litmus->compare(sem->fifo_queues[i].hp_waiter, fq->hp_waiter)) ) {

			TRACE_CUR("hp_waiter on fq %d (%s/%d) has higher prio than hp_waiter on fq %d (%s/%d)\n",
					  ikglp_get_idx(sem, &sem->fifo_queues[i]),
					  sem->fifo_queues[i].hp_waiter->comm,
					  sem->fifo_queues[i].hp_waiter->pid,
					  (fq) ? ikglp_get_idx(sem, fq) : -1,
					  (fq) ? ((fq->hp_waiter) ? fq->hp_waiter->comm : "nil") : "nilXX",
					  (fq) ? ((fq->hp_waiter) ? fq->hp_waiter->pid : -1) : -2);

			fq = &sem->fifo_queues[i];

			WARN_ON(!(fq->hp_waiter));
		}
	}

	if(fq) {
		struct task_struct *max_hp = fq->hp_waiter;
		ikglp_wait_state_t* ret = NULL;

		TRACE_CUR("Searching for %s/%d on fq %d\n",
				  max_hp->comm,
				  max_hp->pid,
				  ikglp_get_idx(sem, fq));

		BUG_ON(!max_hp);

		list_for_each(pos, &fq->wait.task_list) {
			wait_queue_t *wait = list_entry(pos, wait_queue_t, task_list);

			queued  = (struct task_struct*) wait->private;

			TRACE_CUR("fq %d entry: %s/%d\n",
					  ikglp_get_idx(sem, fq),
					  queued->comm,
					  queued->pid);

			/* Compare task prios, find high prio task. */
			if (queued == max_hp) {
				TRACE_CUR("Found it!\n");
				ret = container_of(wait, ikglp_wait_state_t, fq_node);
			}
		}

		WARN_ON(!ret);
		return ret;
	}

	return(NULL);
}

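/*
 * Move 'fq_wait' (the hp_waiter of some longer queue) onto the empty queue
 * 'fq': dequeue it from its old queue, recompute that queue's hp_waiter and
 * the shortest-queue pointer, then enqueue it on 'fq'.  Re-evaluating the
 * inherited priority of the queue it was stolen from is deferred to the
 * caller (see ikglp_unlock()).
 */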
static void ikglp_steal_to_fq(struct ikglp_semaphore *sem,
							  struct fifo_queue *fq,
							  ikglp_wait_state_t *fq_wait)
{
	struct task_struct *t = fq_wait->task;
	struct fifo_queue *fq_steal = fq_wait->donee_heap_node.fq;

	WARN_ON(t != fq_steal->hp_waiter);

	TRACE_CUR("FQ request %s/%d being moved to fq %d\n",
			  t->comm,
			  t->pid,
			  ikglp_get_idx(sem, fq));

	fq_wait->donee_heap_node.fq = fq;  // just to be safe


	__remove_wait_queue(&fq_steal->wait, &fq_wait->fq_node);
	--(fq_steal->count);

#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
	if(sem->aff_obs) {
		sem->aff_obs->ops->notify_dequeue(sem->aff_obs, fq_steal, t);
	}
#endif

	fq_steal->hp_waiter = ikglp_find_hp_waiter(fq_steal, NULL);
	TRACE_TASK(t, "New hp_waiter for fq %d is %s/%d!\n",
			   ikglp_get_idx(sem, fq_steal),
			   (fq_steal->hp_waiter) ? fq_steal->hp_waiter->comm : "nil",
			   (fq_steal->hp_waiter) ? fq_steal->hp_waiter->pid : -1);


	// Update shortest.
	if(fq_steal->count < sem->shortest_fifo_queue->count) {
		sem->shortest_fifo_queue = fq_steal;
	}

	__ikglp_enqueue_on_fq(sem, fq, t,
						  &fq_wait->fq_node,
						  NULL,
						  NULL);

	// warning: We have not checked the priority inheritance of fq's owner yet.
}


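/*
 * The wait state passed in here was allocated on the waiter's stack (see
 * the local 'wait' in ikglp_lock()).  Before that storage can go away, its
 * global and donee heap nodes are copied into the queue's own storage
 * (fq->global_heap_node, fq->donee_heap_node), and the donor cross-link is
 * re-pointed at the new location.
 */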
static void ikglp_migrate_fq_to_owner_heap_nodes(struct ikglp_semaphore *sem,
												 struct fifo_queue *fq,
												 ikglp_wait_state_t *old_wait)
{
	struct task_struct *t = old_wait->task;

	BUG_ON(old_wait->donee_heap_node.fq != fq);

	TRACE_TASK(t, "Migrating wait_state to memory of queue %d.\n",
			   ikglp_get_idx(sem, fq));

	// need to migrate global_heap_node and donee_heap_node off of the stack
	// to the nodes allocated for the owner of this fq.

	// TODO: Enhance binheap() to perform this operation in place.

	ikglp_del_global_list(sem, t, &old_wait->global_heap_node); // remove
	fq->global_heap_node = old_wait->global_heap_node;			// copy
	ikglp_add_global_list(sem, t, &fq->global_heap_node);		// re-add

	binheap_delete(&old_wait->donee_heap_node.node, &sem->donees);  // remove
	fq->donee_heap_node = old_wait->donee_heap_node;  // copy

	if(fq->donee_heap_node.donor_info) {
		// let donor know that our location has changed
		BUG_ON(fq->donee_heap_node.donor_info->donee_info->task != t);	// validate cross-link
		fq->donee_heap_node.donor_info->donee_info = &fq->donee_heap_node;
	}
	INIT_BINHEAP_NODE(&fq->donee_heap_node.node);
	binheap_add(&fq->donee_heap_node.node, &sem->donees,
				ikglp_donee_heap_node_t, node);  // re-add
}

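/*
 * Release the replica held by 'current'.  The owner is removed from the
 * global and donee heaps, then the freed slot is back-filled in priority
 * order: this queue's own donor first, then the donor at the root of
 * sem->donors (or an affinity-advised one), then the head of the priority
 * queue, and finally, if the queue is now empty, a request stolen from
 * another queue's waiters.  All inheritance held by the releasing task is
 * dropped, and the inherited priority of any affected donee or stolen-from
 * queue owner is re-evaluated.
 */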
int ikglp_unlock(struct litmus_lock* l)
{
	struct ikglp_semaphore *sem = ikglp_from_lock(l);
	struct task_struct *t = current;
	struct task_struct *donee = NULL;
	struct task_struct *next = NULL;
	struct task_struct *new_on_fq = NULL;

	ikglp_wait_state_t *other_donor_info = NULL;
	struct fifo_queue *to_steal = NULL;
	struct fifo_queue *fq;

#ifdef CONFIG_LITMUS_DGL_SUPPORT
	raw_spinlock_t *dgl_lock;
#endif

	unsigned long flags = 0, real_flags;

	int err = 0;

#ifdef CONFIG_LITMUS_DGL_SUPPORT
	dgl_lock = litmus->get_dgl_spinlock(t);
#endif
	raw_spin_lock_irqsave(&sem->real_lock, real_flags);

	lock_global_irqsave(dgl_lock, flags);  // TODO: Push this deeper
	lock_fine_irqsave(&sem->lock, flags);


	fq = ikglp_get_queue(sem, t);  // returns NULL if 't' is not owner.

	if (!fq) {
		err = -EINVAL;
		goto out;
	}

	TRACE_TASK(t, "Freeing replica %d.\n", ikglp_get_idx(sem, fq));


	// Remove 't' from the heaps, but data in nodes will still be good.
	ikglp_del_global_list(sem, t, &fq->global_heap_node);
	binheap_delete(&fq->donee_heap_node.node, &sem->donees);

	fq->owner = NULL;  // no longer owned!!
	--(fq->count);
	if(fq->count < sem->shortest_fifo_queue->count) {
		sem->shortest_fifo_queue = fq;
	}

#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
	if(sem->aff_obs) {
		sem->aff_obs->ops->notify_dequeue(sem->aff_obs, fq, t);
		sem->aff_obs->ops->notify_freed(sem->aff_obs, fq, t);
	}
#endif

	// Move the next request into the FQ and update heaps as needed.
	// We defer re-evaluation of priorities to later in the function.
	if(fq->donee_heap_node.donor_info) {  // move my donor to FQ
		ikglp_wait_state_t *donor_info = fq->donee_heap_node.donor_info;

		new_on_fq = donor_info->task;

		TRACE_TASK(t, "Moving MY donor (%s/%d) to fq %d.\n",
				   new_on_fq->comm, new_on_fq->pid,
				   ikglp_get_idx(sem, fq));
		// donor moved to FQ
		donee = t;
		ikglp_move_donor_to_fq(sem, fq, donor_info);
	}
	else if(!binheap_empty(&sem->donors)) {  // No donor, so move any donor to FQ
											 // move other donor to FQ
		// Select a donor
#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
		other_donor_info = (sem->aff_obs) ?
			sem->aff_obs->ops->advise_donor_to_fq(sem->aff_obs, fq) :
			binheap_top_entry(&sem->donors, ikglp_wait_state_t, node);
#else
		other_donor_info = binheap_top_entry(&sem->donors, ikglp_wait_state_t, node);
#endif

		new_on_fq = other_donor_info->task;
		donee = other_donor_info->donee_info->task;

		// update the donee's heap position.
		other_donor_info->donee_info->donor_info = NULL;  // clear the cross-link
		binheap_decrease(&other_donor_info->donee_info->node, &sem->donees);

		TRACE_TASK(t, "Moving a donor (%s/%d) to fq %d.\n",
				   new_on_fq->comm, new_on_fq->pid,
				   ikglp_get_idx(sem, fq));

		ikglp_move_donor_to_fq(sem, fq, other_donor_info);
	}
	else if(!binheap_empty(&sem->priority_queue)) {  // No donors, so move PQ
		ikglp_heap_node_t *pq_node = binheap_top_entry(&sem->priority_queue,
													   ikglp_heap_node_t, node);
		ikglp_wait_state_t *pq_wait = container_of(pq_node, ikglp_wait_state_t,
												   pq_node);

		new_on_fq = pq_wait->task;

		TRACE_TASK(t, "Moving a pq waiter (%s/%d) to fq %d.\n",
				   new_on_fq->comm, new_on_fq->pid,
				   ikglp_get_idx(sem, fq));

		ikglp_move_pq_to_fq(sem, fq, pq_wait);
	}
	else if(fq->count == 0) {  // No PQ and this queue is empty, so steal.
		ikglp_wait_state_t *fq_wait;

		TRACE_TASK(t, "Looking to steal a request for fq %d...\n",
				   ikglp_get_idx(sem, fq));

#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
		fq_wait = (sem->aff_obs) ?
			sem->aff_obs->ops->advise_steal(sem->aff_obs, fq) :
			ikglp_find_hp_waiter_to_steal(sem);
#else
		fq_wait = ikglp_find_hp_waiter_to_steal(sem);
#endif

		if(fq_wait) {
			to_steal = fq_wait->donee_heap_node.fq;

			new_on_fq = fq_wait->task;

			TRACE_TASK(t, "Found %s/%d of fq %d to steal for fq %d...\n",
					   new_on_fq->comm, new_on_fq->pid,
					   ikglp_get_idx(sem, to_steal),
					   ikglp_get_idx(sem, fq));

			ikglp_steal_to_fq(sem, fq, fq_wait);
		}
		else {
			TRACE_TASK(t, "Found nothing to steal for fq %d.\n",
					   ikglp_get_idx(sem, fq));
		}
	}
	else { // move no one
	}

	// 't' must drop all priority and clean up data structures before hand-off.

	// DROP ALL INHERITANCE.  IKGLP MUST BE OUTER-MOST
	raw_spin_lock(&tsk_rt(t)->hp_blocked_tasks_lock);
	{
		int count = 0;
		while(!binheap_empty(&tsk_rt(t)->hp_blocked_tasks)) {
			binheap_delete_root(&tsk_rt(t)->hp_blocked_tasks,
								struct nested_info, hp_binheap_node);
			++count;
		}
		litmus->decrease_prio(t, NULL);
		WARN_ON(count > 2); // should not be greater than 2: only local fq inheritance and a donation are possible.
	}
	raw_spin_unlock(&tsk_rt(t)->hp_blocked_tasks_lock);



	// Now patch up other priorities.
	//
	// At most one of the following:
	//   if(donee && donee != t), decrease prio, propagate to owner, or onward
	//   if(to_steal), update owner's prio (hp_waiter has already been set)
	//

	BUG_ON((other_donor_info != NULL) && (to_steal != NULL));

	if(other_donor_info) {
		struct fifo_queue *other_fq = other_donor_info->donee_info->fq;

		BUG_ON(!donee);
		BUG_ON(donee == t);

		TRACE_TASK(t, "Terminating donation relation of donor %s/%d to donee %s/%d!\n",
				   other_donor_info->task->comm, other_donor_info->task->pid,
				   donee->comm, donee->pid);

		// need to terminate donation relation.
		if(donee == other_fq->owner) {
			TRACE_TASK(t, "Donee %s/%d is an owner of fq %d.\n",
					   donee->comm, donee->pid,
					   ikglp_get_idx(sem, other_fq));

			ikglp_remove_donation_from_owner(&other_donor_info->prio_donation.hp_binheap_node, other_fq, sem, flags);
			lock_fine_irqsave(&sem->lock, flags);  // there should be no contention!!!!
		}
		else {
			TRACE_TASK(t, "Donee %s/%d is blocked in fq %d.\n",
					   donee->comm, donee->pid,
					   ikglp_get_idx(sem, other_fq));

			ikglp_remove_donation_from_fq_waiter(donee, &other_donor_info->prio_donation.hp_binheap_node);
			if(donee == other_fq->hp_waiter) {
				TRACE_TASK(t, "Donee %s/%d was an hp_waiter of fq %d. Rechecking hp_waiter.\n",
						   donee->comm, donee->pid,
						   ikglp_get_idx(sem, other_fq));

				other_fq->hp_waiter = ikglp_find_hp_waiter(other_fq, NULL);
				TRACE_TASK(t, "New hp_waiter for fq %d is %s/%d!\n",
						   ikglp_get_idx(sem, other_fq),
						   (other_fq->hp_waiter) ? other_fq->hp_waiter->comm : "nil",
						   (other_fq->hp_waiter) ? other_fq->hp_waiter->pid : -1);

				ikglp_refresh_owners_prio_decrease(other_fq, sem, flags); // unlocks sem->lock.  reacquire it.
				lock_fine_irqsave(&sem->lock, flags);  // there should be no contention!!!!
			}
		}
	}
	else if(to_steal) {
		TRACE_TASK(t, "Rechecking priority inheritance of fq %d, triggered by stealing.\n",
				   ikglp_get_idx(sem, to_steal));

		ikglp_refresh_owners_prio_decrease(to_steal, sem, flags); // unlocks sem->lock.  reacquire it.
		lock_fine_irqsave(&sem->lock, flags);  // there should be no contention!!!!
	}

	// check for new HP waiter.
	if(new_on_fq) {
		// fq->owner is null, so just update the hp_waiter without locking.

		if(new_on_fq == fq->hp_waiter) {
			TRACE_TASK(t, "new_on_fq (%s/%d) is already hp_waiter.\n",
					   fq->hp_waiter->comm, fq->hp_waiter->pid);
			fq->nest.hp_waiter_eff_prio = effective_priority(fq->hp_waiter);  // set this just to be sure...
		}
		else if(litmus->compare(new_on_fq, fq->hp_waiter)) {
			if(fq->hp_waiter)
				TRACE_TASK(t, "has higher prio than hp_waiter (%s/%d).\n",
						   fq->hp_waiter->comm, fq->hp_waiter->pid);
			else
				TRACE_TASK(t, "has higher prio than hp_waiter (NIL).\n");

			fq->hp_waiter = new_on_fq;
			fq->nest.hp_waiter_eff_prio = effective_priority(fq->hp_waiter);

			TRACE_TASK(t, "New hp_waiter for fq %d is %s/%d!\n",
					   ikglp_get_idx(sem, fq),
					   (fq->hp_waiter) ? fq->hp_waiter->comm : "nil",
					   (fq->hp_waiter) ? fq->hp_waiter->pid : -1);
		}
	}


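	// Hand the replica to the next FIFO waiter, if any: make it the owner,
	// recompute hp_waiter if the new owner was the hp_waiter, set up priority
	// inheritance otherwise, and wake it up.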
	if(waitqueue_active(&fq->wait))
	{
		wait_queue_t *wait = list_entry(fq->wait.task_list.next, wait_queue_t, task_list);
		ikglp_wait_state_t *fq_wait = container_of(wait, ikglp_wait_state_t, fq_node);
		next = (struct task_struct*) wait->private;

		__remove_wait_queue(&fq->wait, wait);

		TRACE_CUR("queue %d: ASSIGNING %s/%d as owner - next\n",
				  ikglp_get_idx(sem, fq),
				  next->comm, next->pid);

		// migrate wait-state to fifo-memory.
		ikglp_migrate_fq_to_owner_heap_nodes(sem, fq, fq_wait);

		/* next becomes the resource holder */
		fq->owner = next;
		tsk_rt(next)->blocked_lock = NULL;

#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
		if(sem->aff_obs) {
			sem->aff_obs->ops->notify_acquired(sem->aff_obs, fq, next);
		}
#endif

		/* determine new hp_waiter if necessary */
		if (next == fq->hp_waiter) {

			TRACE_TASK(next, "was highest-prio waiter\n");
			/* next has the highest priority --- it doesn't need to
			 * inherit.  However, we need to make sure that the
			 * next-highest priority in the queue is reflected in
			 * hp_waiter. */
			fq->hp_waiter = ikglp_find_hp_waiter(fq, NULL);
			TRACE_TASK(next, "New hp_waiter for fq %d is %s/%d!\n",
					   ikglp_get_idx(sem, fq),
					   (fq->hp_waiter) ? fq->hp_waiter->comm : "nil",
					   (fq->hp_waiter) ? fq->hp_waiter->pid : -1);

			fq->nest.hp_waiter_eff_prio = (fq->hp_waiter) ?
								effective_priority(fq->hp_waiter) : NULL;

			if (fq->hp_waiter)
				TRACE_TASK(fq->hp_waiter, "is new highest-prio waiter\n");
			else
				TRACE("no further waiters\n");

			raw_spin_lock(&tsk_rt(next)->hp_blocked_tasks_lock);

//			TRACE_TASK(next, "Heap Before:\n");
//			print_hp_waiters(tsk_rt(next)->hp_blocked_tasks.root, 0);

			binheap_add(&fq->nest.hp_binheap_node,
						&tsk_rt(next)->hp_blocked_tasks,
						struct nested_info,
						hp_binheap_node);

//			TRACE_TASK(next, "Heap After:\n");
//			print_hp_waiters(tsk_rt(next)->hp_blocked_tasks.root, 0);

			raw_spin_unlock(&tsk_rt(next)->hp_blocked_tasks_lock);
		}
		else {
			/* Well, if 'next' is not the highest-priority waiter,
			 * then it (probably) ought to inherit the highest-priority
			 * waiter's priority. */
			TRACE_TASK(next, "is not hp_waiter of replica %d. hp_waiter is %s/%d\n",
					   ikglp_get_idx(sem, fq),
					   (fq->hp_waiter) ? fq->hp_waiter->comm : "nil",
					   (fq->hp_waiter) ? fq->hp_waiter->pid : -1);

			raw_spin_lock(&tsk_rt(next)->hp_blocked_tasks_lock);

			binheap_add(&fq->nest.hp_binheap_node,
						&tsk_rt(next)->hp_blocked_tasks,
						struct nested_info,
						hp_binheap_node);

			/* It is possible that 'next' *should* be the hp_waiter, but isn't
			 * because that update hasn't yet executed (the update operation is
			 * probably blocked on mutex->lock). So only inherit if the top of
			 * 'next's heap is indeed the effective priority of hp_waiter.
			 * (We use fq->nest.hp_waiter_eff_prio instead of
			 * effective_priority(hp_waiter) since the effective priority of
			 * hp_waiter can change and the update may not have reached this
			 * lock yet.)
			 */
			if(likely(top_priority(&tsk_rt(next)->hp_blocked_tasks) ==
												fq->nest.hp_waiter_eff_prio))
			{
				if(fq->nest.hp_waiter_eff_prio)
					litmus->increase_prio(next, fq->nest.hp_waiter_eff_prio);
				else
					WARN_ON(1);
			}

			raw_spin_unlock(&tsk_rt(next)->hp_blocked_tasks_lock);
		}


		// wake up the new resource holder!
		wake_up_process(next);
	}

	unlock_fine_irqrestore(&sem->lock, flags);
	unlock_global_irqrestore(dgl_lock, flags);

	raw_spin_unlock_irqrestore(&sem->real_lock, real_flags);

out:
	return err;
}



int ikglp_close(struct litmus_lock* l)
{
	struct task_struct *t = current;
	struct ikglp_semaphore *sem = ikglp_from_lock(l);
	unsigned long flags;

	int owner = 0;
	int i;

	raw_spin_lock_irqsave(&sem->real_lock, flags);

	for(i = 0; i < sem->nr_replicas; ++i) {
		if(sem->fifo_queues[i].owner == t) {
			owner = 1;
			break;
		}
	}

	raw_spin_unlock_irqrestore(&sem->real_lock, flags);

	if (owner)
		ikglp_unlock(l);

	return 0;
}

void ikglp_free(struct litmus_lock* l)
{
	struct ikglp_semaphore *sem = ikglp_from_lock(l);

	kfree(sem->fifo_queues);
	kfree(sem);
}



struct litmus_lock* ikglp_new(int m,
							  struct litmus_lock_ops* ops,
							  void* __user arg)
{
	struct ikglp_semaphore* sem;
	int nr_replicas = 0;
	int i;

	if(!access_ok(VERIFY_READ, arg, sizeof(nr_replicas)))
	{
		return(NULL);
	}
	if(__copy_from_user(&nr_replicas, arg, sizeof(nr_replicas)))
	{
		return(NULL);
	}
	if(nr_replicas < 1)
	{
		return(NULL);
	}

	sem = kmalloc(sizeof(*sem), GFP_KERNEL);
	if(!sem)
	{
		return NULL;
	}

	sem->fifo_queues = kmalloc(sizeof(struct fifo_queue)*nr_replicas, GFP_KERNEL);
	if(!sem->fifo_queues)
	{
		kfree(sem);
		return NULL;
	}

	sem->litmus_lock.ops = ops;

#ifdef CONFIG_DEBUG_SPINLOCK
	{
		__raw_spin_lock_init(&sem->lock, ((struct litmus_lock*)sem)->cheat_lockdep, &((struct litmus_lock*)sem)->key);
	}
#else
	raw_spin_lock_init(&sem->lock);
#endif

	raw_spin_lock_init(&sem->real_lock);

	sem->nr_replicas = nr_replicas;
	sem->m = m;
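	// max_fifo_len = ceil(m / k): e.g., m = 9 tasks and k = 4 replicas gives
	// 9/4 + (9%4 != 0) = 2 + 1 = 3 slots per FIFO queue (illustrative values).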
	sem->max_fifo_len = (sem->m/nr_replicas) + ((sem->m%nr_replicas) != 0);

	TRACE("New IKGLP Sem: m = %d, k = %d, max fifo_len = %d\n",
		  sem->m,
		  sem->nr_replicas,
		  sem->max_fifo_len);

	for(i = 0; i < nr_replicas; ++i)
	{
		struct fifo_queue* q = &(sem->fifo_queues[i]);

		q->owner = NULL;
		q->hp_waiter = NULL;
		init_waitqueue_head(&q->wait);
		q->count = 0;

		q->global_heap_node.task = NULL;
		INIT_BINHEAP_NODE(&q->global_heap_node.node);

		q->donee_heap_node.task = NULL;
		q->donee_heap_node.donor_info = NULL;
		q->donee_heap_node.fq = NULL;
		INIT_BINHEAP_NODE(&q->donee_heap_node.node);

		q->nest.lock = (struct litmus_lock*)sem;
		q->nest.hp_waiter_eff_prio = NULL;
		q->nest.hp_waiter_ptr = &q->hp_waiter;
		INIT_BINHEAP_NODE(&q->nest.hp_binheap_node);
	}

	sem->shortest_fifo_queue = &sem->fifo_queues[0];

	sem->top_m_size = 0;

	// init heaps
	INIT_BINHEAP_HANDLE(&sem->top_m, ikglp_min_heap_base_priority_order);
	INIT_BINHEAP_HANDLE(&sem->not_top_m, ikglp_max_heap_base_priority_order);
	INIT_BINHEAP_HANDLE(&sem->donees, ikglp_min_heap_donee_order);
	INIT_BINHEAP_HANDLE(&sem->priority_queue, ikglp_max_heap_base_priority_order);
	INIT_BINHEAP_HANDLE(&sem->donors, ikglp_donor_max_heap_base_priority_order);

#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
	sem->aff_obs = NULL;
#endif

	return &sem->litmus_lock;
}





























#if defined(CONFIG_LITMUS_AFFINITY_LOCKING) && defined(CONFIG_LITMUS_NVIDIA)
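
// Replica <-> GPU mapping: replica r is backed by GPU (r % nr_rsrc) + offset.
// For example, with nr_rsrc = 2 GPUs and nr_simult = 2 users per GPU
// (illustrative values), replicas 0 and 2 map to GPU offset+0 while replicas
// 1 and 3 map to GPU offset+1.  gpu_to_base_replica() only strips the offset,
// yielding a base resource index in [0, nr_rsrc).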

static inline int __replica_to_gpu(struct ikglp_affinity* aff, int replica)
{
	int gpu = replica % aff->nr_rsrc;
	return gpu;
}

static inline int replica_to_gpu(struct ikglp_affinity* aff, int replica)
{
	int gpu = __replica_to_gpu(aff, replica) + aff->offset;
	return gpu;
}

static inline int gpu_to_base_replica(struct ikglp_affinity* aff, int gpu)
{
	int replica = gpu - aff->offset;
	return replica;
}


int ikglp_aff_obs_close(struct affinity_observer* obs)
{
	return 0;
}

void ikglp_aff_obs_free(struct affinity_observer* obs)
{
	struct ikglp_affinity *ikglp_aff = ikglp_aff_obs_from_aff_obs(obs);
	kfree(ikglp_aff->nr_cur_users_on_rsrc);
	kfree(ikglp_aff->q_info);
	kfree(ikglp_aff);
}

static struct affinity_observer* ikglp_aff_obs_new(struct affinity_observer_ops* ops,
												   struct ikglp_affinity_ops* ikglp_ops,
												   void* __user args)
{
	struct ikglp_affinity* ikglp_aff;
	struct gpu_affinity_observer_args aff_args;
	struct ikglp_semaphore* sem;
	int i;
	unsigned long flags;

	if(!access_ok(VERIFY_READ, args, sizeof(aff_args))) {
		return(NULL);
	}
	if(__copy_from_user(&aff_args, args, sizeof(aff_args))) {
		return(NULL);
	}

	sem = (struct ikglp_semaphore*) get_lock_from_od(aff_args.obs.lock_od);

	if(sem->litmus_lock.type != IKGLP_SEM) {
		TRACE_CUR("Lock type not supported.  Type = %d\n", sem->litmus_lock.type);
		return(NULL);
	}

	if((aff_args.nr_simult_users <= 0) ||
	   (sem->nr_replicas%aff_args.nr_simult_users != 0)) {
		TRACE_CUR("Lock %d: #replicas (%d) is not evenly divisible by "
				  "#simult_users (%d).\n",
				  sem->litmus_lock.ident,
				  sem->nr_replicas,
				  aff_args.nr_simult_users);
		return(NULL);
	}

	if(aff_args.nr_simult_users > NV_MAX_SIMULT_USERS) {
		TRACE_CUR("System does not support #simult_users > %d. %d requested.\n",
				  NV_MAX_SIMULT_USERS, aff_args.nr_simult_users);
		return(NULL);
	}

	ikglp_aff = kmalloc(sizeof(*ikglp_aff), GFP_KERNEL);
	if(!ikglp_aff) {
		return(NULL);
	}

	ikglp_aff->q_info = kmalloc(sizeof(struct ikglp_queue_info)*sem->nr_replicas, GFP_KERNEL);
	if(!ikglp_aff->q_info) {
		kfree(ikglp_aff);
		return(NULL);
	}

	ikglp_aff->nr_cur_users_on_rsrc = kmalloc(sizeof(int)*(sem->nr_replicas / aff_args.nr_simult_users), GFP_KERNEL);
	if(!ikglp_aff->nr_cur_users_on_rsrc) {
		kfree(ikglp_aff->q_info);
		kfree(ikglp_aff);
		return(NULL);
	}

	affinity_observer_new(&ikglp_aff->obs, ops, &aff_args.obs);

	ikglp_aff->ops = ikglp_ops;
	ikglp_aff->offset = aff_args.replica_to_gpu_offset;
	ikglp_aff->nr_simult = aff_args.nr_simult_users;
	ikglp_aff->nr_rsrc = sem->nr_replicas / ikglp_aff->nr_simult;
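	// e.g., 4 replicas with 2 simultaneous users per GPU -> nr_rsrc = 2 GPUs.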

	memset(ikglp_aff->nr_cur_users_on_rsrc, 0, sizeof(int)*(ikglp_aff->nr_rsrc));

	for(i = 0; i < sem->nr_replicas; ++i) {
		ikglp_aff->q_info[i].q = &sem->fifo_queues[i];
		ikglp_aff->q_info[i].estimated_len = 0;

		// multiple q_info's will point to the same resource (aka GPU) if
		// aff_args.nr_simult_users > 1
		ikglp_aff->q_info[i].nr_cur_users = &ikglp_aff->nr_cur_users_on_rsrc[__replica_to_gpu(ikglp_aff,i)];
	}

	// attach observer to the lock
	raw_spin_lock_irqsave(&sem->real_lock, flags);
	sem->aff_obs = ikglp_aff;
	raw_spin_unlock_irqrestore(&sem->real_lock, flags);

	return &ikglp_aff->obs;
}




static int gpu_replica_to_resource(struct ikglp_affinity* aff,
								   struct fifo_queue* fq) {
	struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
	return(replica_to_gpu(aff, ikglp_get_idx(sem, fq)));
}


// Smart IKGLP Affinity

//static inline struct ikglp_queue_info* ikglp_aff_find_shortest(struct ikglp_affinity* aff)
//{
//	struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
//	struct ikglp_queue_info *shortest = &aff->q_info[0];
//	int i;
//
//	for(i = 1; i < sem->nr_replicas; ++i) {
//		if(aff->q_info[i].estimated_len < shortest->estimated_len) {
//			shortest = &aff->q_info[i];
//		}
//	}
//
//	return(shortest);
//}

struct fifo_queue* gpu_ikglp_advise_enqueue(struct ikglp_affinity* aff, struct task_struct* t)
{
	// advise_enqueue must be careful not to break the IKGLP rules:
	//  * No queue may grow beyond ceil(m/k) in length.  We may return such a
	//    queue anyway, but IKGLP is smart enough to route the request to the
	//    donors or the PQ instead.
	//  * A queue must not be left idle while there are waiting PQ/donor
	//    requests -- this is needed to guarantee parallel progress of waiters.
	//
	// We may be able to relax some of these constraints, but this will have to
	// be carefully evaluated.
	//
	// Heuristic strategy: find the shortest queue that is not full.
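	//
	// Roughly, each candidate queue i is scored as
	//   q_info[i].estimated_len
	//     + get_gpu_estimate(t, gpu_migration_distance(last_gpu, gpu(i)))
	// and the minimum-score queue that is not full wins; ties go to the GPU
	// with the fewest current resource holders.  If every queue is full, the
	// affinity queue is returned anyway.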

	struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
	lt_t min_len;
	int min_nr_users;
	struct ikglp_queue_info *shortest;
	struct fifo_queue *to_enqueue;
	int i;
	int affinity_gpu;

	// simply pick the shortest queue if we have no affinity, or if we have
	// affinity with the shortest queue anyway
	if(unlikely(tsk_rt(t)->last_gpu < 0)) {
		affinity_gpu = aff->offset;  // first gpu
		TRACE_CUR("no affinity\n");
	}
	else {
		affinity_gpu = tsk_rt(t)->last_gpu;
	}

	// all things being equal, let's start with the queue with which we have
	// affinity.  this helps us maintain affinity even when we don't have
	// an estimate for local-affinity execution time (i.e., 2nd time on GPU)
	shortest = &aff->q_info[gpu_to_base_replica(aff, affinity_gpu)];

	//	if(shortest == aff->shortest_queue) {
	//		TRACE_CUR("special case: have affinity with shortest queue\n");
	//		goto out;
	//	}

	min_len = shortest->estimated_len + get_gpu_estimate(t, MIG_LOCAL);
	min_nr_users = *(shortest->nr_cur_users);

	TRACE_CUR("cs is %llu on queue %d (count = %d): est len = %llu\n",
			  get_gpu_estimate(t, MIG_LOCAL),
			  ikglp_get_idx(sem, shortest->q),
			  shortest->q->count,
			  min_len);

	for(i = 0; i < sem->nr_replicas; ++i) {
		if(&aff->q_info[i] != shortest) {
			if(aff->q_info[i].q->count < sem->max_fifo_len) {

				lt_t est_len =
					aff->q_info[i].estimated_len +
					get_gpu_estimate(t,
								gpu_migration_distance(tsk_rt(t)->last_gpu,
													replica_to_gpu(aff, i)));

				// queue is smaller, or they're equal and the other has a
				// smaller number of total users.
				//
				// tie-break on the smallest number of simultaneous users.
				// this only kicks in when there is more than one empty queue.
				if((shortest->q->count >= sem->max_fifo_len) ||	/* 'shortest' is full and i-th queue is not */
				   (est_len < min_len) ||						/* i-th queue has shortest length */
				   ((est_len == min_len) &&						/* equal lengths, but one has fewer over-all users */
					(*(aff->q_info[i].nr_cur_users) < min_nr_users))) {

					shortest = &aff->q_info[i];
					min_len = est_len;
					min_nr_users = *(aff->q_info[i].nr_cur_users);
				}

				TRACE_CUR("cs is %llu on queue %d (count = %d): est len = %llu\n",
						  get_gpu_estimate(t,
								gpu_migration_distance(tsk_rt(t)->last_gpu,
													   replica_to_gpu(aff, i))),
						  ikglp_get_idx(sem, aff->q_info[i].q),
						  aff->q_info[i].q->count,
						  est_len);
			}
			else {
				TRACE_CUR("queue %d is too long.  ineligible for enqueue.\n",
						  ikglp_get_idx(sem, aff->q_info[i].q));
			}
		}
	}

	if(shortest->q->count >= sem->max_fifo_len) {
		TRACE_CUR("selected fq %d is too long, but returning it anyway.\n",
				  ikglp_get_idx(sem, shortest->q));
	}

	to_enqueue = shortest->q;
	TRACE_CUR("enqueue on fq %d (count = %d) (non-aff wanted fq %d)\n",
			  ikglp_get_idx(sem, to_enqueue),
			  to_enqueue->count,
			  ikglp_get_idx(sem, sem->shortest_fifo_queue));

	return to_enqueue;

	//return(sem->shortest_fifo_queue);
}

ikglp_wait_state_t* gpu_ikglp_advise_steal(struct ikglp_affinity* aff,
										   struct fifo_queue* dst)
{
	struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);

	// For now, just steal highest priority waiter
	// TODO: Implement affinity-aware stealing.

	return ikglp_find_hp_waiter_to_steal(sem);
}


static inline int has_donor(wait_queue_t* fq_wait)
{
	ikglp_wait_state_t *wait = container_of(fq_wait, ikglp_wait_state_t, fq_node);
	return(wait->donee_heap_node.donor_info != NULL);
}
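
// pick_donee(): scan 'fq' for a donee candidate.  The owner is considered
// first (*dist_from_head = 0), then the waiters from the head of the queue
// (1, 2, ...).  A candidate qualifies only if it has no donor yet and the
// m-th highest-priority task has a higher base priority than it.  Returns
// NULL and sets *dist_from_head to max_fifo_len + 1 if no one qualifies.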

static ikglp_donee_heap_node_t* pick_donee(struct ikglp_affinity* aff,
					  struct fifo_queue* fq,
					  int* dist_from_head)
{
	struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
	struct task_struct *donee;
	ikglp_donee_heap_node_t *donee_node;
	struct task_struct *mth_highest = ikglp_mth_highest(sem);

	lt_t now = litmus_clock();

//	TRACE_CUR("fq %d: mth_highest: %s/%d, deadline = %d: (donor) = ??? ",
//			  ikglp_get_idx(sem, fq),
//			  mth_highest->comm, mth_highest->pid,
//			  (int)get_deadline(mth_highest) - now);

	if(fq->owner &&
	   fq->donee_heap_node.donor_info == NULL &&
	   mth_highest != fq->owner &&
	   litmus->__compare(mth_highest, BASE, fq->owner, BASE)) {
		donee = fq->owner;
		donee_node = &(fq->donee_heap_node);
		*dist_from_head = 0;

		BUG_ON(donee != donee_node->task);

		TRACE_CUR("picked owner of fq %d as donee\n",
				  ikglp_get_idx(sem, fq));

		goto out;
	}
	else if(waitqueue_active(&fq->wait)) {
		struct list_head	*pos;


//		TRACE_CUR("fq %d: owner: %s/%d, deadline = %d: (donor) = %s/%d "
//				  "(mth_highest != fq->owner) = %d "
//				  "(mth_highest > fq->owner) = %d\n",
//				  ikglp_get_idx(sem, fq),
//				  (fq->owner) ? fq->owner->comm : "nil",
//				  (fq->owner) ? fq->owner->pid : -1,
//				  (fq->owner) ? (int)get_deadline(fq->owner) - now : -999,
//				  (fq->donee_heap_node.donor_info) ? fq->donee_heap_node.donor_info->task->comm : "nil",
//				  (fq->donee_heap_node.donor_info) ? fq->donee_heap_node.donor_info->task->pid : -1,
//				  (mth_highest != fq->owner),
//				  (litmus->__compare(mth_highest, BASE, fq->owner, BASE)));


		*dist_from_head = 1;

		// iterating from the start of the queue is nice since this means
		// the donee will be closer to obtaining a resource.
		list_for_each(pos, &fq->wait.task_list) {
			wait_queue_t *fq_wait = list_entry(pos, wait_queue_t, task_list);
			ikglp_wait_state_t *wait = container_of(fq_wait, ikglp_wait_state_t, fq_node);

//			TRACE_CUR("fq %d: waiter %d: %s/%d, deadline = %d (donor) = %s/%d "
//					  "(mth_highest != wait->task) = %d "
//					  "(mth_highest > wait->task) = %d\n",
//					  ikglp_get_idx(sem, fq),
//					  dist_from_head,
//					  wait->task->comm, wait->task->pid,
//					  (int)get_deadline(wait->task) - now,
//					  (wait->donee_heap_node.donor_info) ? wait->donee_heap_node.donor_info->task->comm : "nil",
//					  (wait->donee_heap_node.donor_info) ? wait->donee_heap_node.donor_info->task->pid : -1,
//					  (mth_highest != wait->task),
//					  (litmus->__compare(mth_highest, BASE, wait->task, BASE)));


			if(!has_donor(fq_wait) &&
			   mth_highest != wait->task &&
			   litmus->__compare(mth_highest, BASE, wait->task, BASE)) {
				donee = (struct task_struct*) fq_wait->private;
				donee_node = &wait->donee_heap_node;

				BUG_ON(donee != donee_node->task);

				TRACE_CUR("picked waiter in fq %d as donee\n",
						  ikglp_get_idx(sem, fq));

				goto out;
			}
			++(*dist_from_head);
		}
	}

	donee = NULL;
	donee_node = NULL;
	*dist_from_head = sem->max_fifo_len + 1;

	TRACE_CUR("Found no one to be donee in fq %d!\n", ikglp_get_idx(sem, fq));

out:

	TRACE_CUR("Candidate donee for fq %d is %s/%d (dist_from_head = %d)\n",
			  ikglp_get_idx(sem, fq),
			  (donee) ? (donee)->comm : "nil",
			  (donee) ? (donee)->pid  : -1,
			  *dist_from_head);

	return donee_node;
}

ikglp_donee_heap_node_t* gpu_ikglp_advise_donee_selection(
											struct ikglp_affinity* aff,
											struct task_struct* donor)
{
	// Heuristic strategy: Find the highest-priority donee that is waiting on
	// a queue closest to our affinity.  (1) The donee CANNOT already have a
	// donor (exception: the donee is the lowest-prio task in the donee heap).
	// (2) Requests in the 'top_m' heap are ineligible.
	//
	// Further strategy: among eligible donees waiting for the same GPU, pick
	// the one closest to the head of the FIFO queue (owners included).
	//
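	// The search below starts at the donor's last-used GPU and walks the GPUs
	// with wrap-around, only descending into a GPU's FIFO queues when doing so
	// could improve on the migration distance of the best donee found so far.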
	struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
	ikglp_donee_heap_node_t *donee_node;
	gpu_migration_dist_t distance;
	int start, i, j;

	ikglp_donee_heap_node_t *default_donee;
	ikglp_wait_state_t *default_donee_donor_info;

	if(tsk_rt(donor)->last_gpu < 0) {
		// no affinity.  just return the min prio, like standard IKGLP
		// TODO: Find something closer to the head of the queue??
		donee_node = binheap_top_entry(&sem->donees,
									   ikglp_donee_heap_node_t,
									   node);
		goto out;
	}


	// Temporarily break any donation relation of the default donee (the
	// lowest-prio task in the FIFO queues) to make it eligible for selection
	// below.
	//
	// NOTE: The original donor relation *must* be restored before returning
	// from this function, even if we end up selecting the default donee
	// through affinity-aware selection, or we screw up our heap ordering.
	// The standard IKGLP algorithm will steal the donor relationship if needed.
	default_donee = binheap_top_entry(&sem->donees, ikglp_donee_heap_node_t, node);
	default_donee_donor_info = default_donee->donor_info;  // back-up donor relation
	default_donee->donor_info = NULL;  // temporarily break any donor relation.

	// initialize our search
	donee_node = NULL;
	distance = MIG_NONE;

	// TODO: The below search logic may work well for locating nodes to steal
	// when an FQ goes idle.  Validate this code and apply it to stealing.

	// begin search with affinity GPU.
	start = gpu_to_base_replica(aff, tsk_rt(donor)->last_gpu);
	i = start;
	do {  // "for each gpu" / "for each aff->nr_rsrc"
		gpu_migration_dist_t temp_distance = gpu_migration_distance(start, i);

		// only interested in queues that will improve our distance
		if(temp_distance < distance || donee_node == NULL) {
			int dist_from_head = sem->max_fifo_len + 1;

			TRACE_CUR("searching for a donee on GPU %d\n", i);

			// visit each queue and pick a donee.  bail as soon as we find
			// one for this class.
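			// Replicas i, i + nr_rsrc, i + 2*nr_rsrc, ... all map to the same
			// GPU (cf. __replica_to_gpu()), so this loop visits every FIFO
			// queue backed by the GPU under consideration.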

			for(j = 0; j < aff->nr_simult; ++j) {
				int temp_dist_from_head;
				ikglp_donee_heap_node_t *temp_donee_node;
				struct fifo_queue *fq;

				fq = &(sem->fifo_queues[i + j*aff->nr_rsrc]);
				temp_donee_node = pick_donee(aff, fq, &temp_dist_from_head);

				if(temp_dist_from_head < dist_from_head)
				{
					// we check all the FQs for this GPU to spread priorities
					// out across the queues.  does this decrease jitter?
					donee_node = temp_donee_node;
					dist_from_head = temp_dist_from_head;
				}
			}

			if(dist_from_head != sem->max_fifo_len + 1) {
				TRACE_CUR("found donee %s/%d; it is the %d-th waiter.\n",
						  donee_node->task->comm, donee_node->task->pid,
						  dist_from_head);
			}
			else {
				TRACE_CUR("found no eligible donees on GPU %d\n", i);
			}
		}
		else {
			TRACE_CUR("skipping GPU %d (distance = %d, best donee "
					  "distance = %d)\n", i, temp_distance, distance);
		}

		i = (i+1 < aff->nr_rsrc) ? i+1 : 0;  // increment with wrap-around
	} while (i != start);


	// restore old donor info state.
	default_donee->donor_info = default_donee_donor_info;

	if(!donee_node) {
		donee_node = default_donee;

		TRACE_CUR("Could not find a donee. We have to steal one.\n");
		WARN_ON(default_donee->donor_info == NULL);
	}

out:

	TRACE_CUR("Selected donee %s/%d on fq %d (GPU %d) for %s/%d with affinity for GPU %d\n",
			  donee_node->task->comm, donee_node->task->pid,
			  ikglp_get_idx(sem, donee_node->fq),
			  replica_to_gpu(aff, ikglp_get_idx(sem, donee_node->fq)),
			  donor->comm, donor->pid, tsk_rt(donor)->last_gpu);

	return(donee_node);
}



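// __find_closest_donor(): pre-order walk over the entire donor heap (it is a
// heap, not a search tree, so every node has to be visited).  Tracks the donor
// whose last-used GPU is closest to target_gpu; distance ties are broken by
// task priority.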
static void __find_closest_donor(int target_gpu,
								 struct binheap_node* donor_node,
								 ikglp_wait_state_t** cur_closest,
								 int* cur_dist)
{
	ikglp_wait_state_t *this_donor =
		binheap_entry(donor_node, ikglp_wait_state_t, node);

	int this_dist =
		gpu_migration_distance(target_gpu, tsk_rt(this_donor->task)->last_gpu);

//	TRACE_CUR("%s/%d: dist from target = %d\n",
//			  this_donor->task->comm,
//			  this_donor->task->pid,
//			  this_dist);

	if(this_dist < *cur_dist) {
		// take this donor
		*cur_dist = this_dist;
		*cur_closest = this_donor;
	}
	else if(this_dist == *cur_dist) {
		// priority tie-break.  Even though this is a pre-order traversal,
		// this is a heap, not a binary search tree, so we still need to do a
		// priority comparison.
		if(!(*cur_closest) ||
		   litmus->compare(this_donor->task, (*cur_closest)->task)) {
			*cur_dist = this_dist;
			*cur_closest = this_donor;
		}
	}

	if(donor_node->left)
		__find_closest_donor(target_gpu, donor_node->left, cur_closest, cur_dist);
	if(donor_node->right)
		__find_closest_donor(target_gpu, donor_node->right, cur_closest, cur_dist);
}

ikglp_wait_state_t* gpu_ikglp_advise_donor_to_fq(struct ikglp_affinity* aff, struct fifo_queue* fq)
{
	// Heuristic strategy: Find the donor with the closest affinity to fq.
	// Tie-break on priority.

	// We need to iterate over all the donors to do this.  Unfortunately,
	// our donors are organized in a heap.  We'll visit each node with a
	// recursive call.  This is relatively safe since there are at most
	// sem->m donors, so we won't recurse deeply enough to worry about the
	// stack.  (Even with 128 CPUs, the nesting depth is at most ~7.)

	struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
	ikglp_wait_state_t *donor = NULL;
	int distance = MIG_NONE;
	int gpu = replica_to_gpu(aff, ikglp_get_idx(sem, fq));
	ikglp_wait_state_t* default_donor = binheap_top_entry(&sem->donors, ikglp_wait_state_t, node);

	__find_closest_donor(gpu, sem->donors.root, &donor, &distance);

	TRACE_CUR("Selected donor %s/%d (distance = %d) to move to fq %d "
			  "(non-aff wanted %s/%d). differs = %d\n",
			  donor->task->comm, donor->task->pid,
			  distance,
			  ikglp_get_idx(sem, fq),
			  default_donor->task->comm, default_donor->task->pid,
			  (donor->task != default_donor->task)
			  );

	return(donor);
}



void gpu_ikglp_notify_enqueue(struct ikglp_affinity* aff, struct fifo_queue* fq, struct task_struct* t)
{
	struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
	int replica = ikglp_get_idx(sem, fq);
	int gpu = replica_to_gpu(aff, replica);
	struct ikglp_queue_info *info = &aff->q_info[replica];
	lt_t est_time;
	lt_t est_len_before;

	if(current == t) {
		tsk_rt(t)->suspend_gpu_tracker_on_block = 1;
	}

	est_len_before = info->estimated_len;
	est_time = get_gpu_estimate(t, gpu_migration_distance(tsk_rt(t)->last_gpu, gpu));
	info->estimated_len += est_time;

	TRACE_CUR("fq %d: q_len (%llu) + est_cs (%llu) = %llu\n",
			  ikglp_get_idx(sem, info->q),
			  est_len_before, est_time,
			  info->estimated_len);

	//	if(aff->shortest_queue == info) {
	//		// we may no longer be the shortest
	//		aff->shortest_queue = ikglp_aff_find_shortest(aff);
	//
	//		TRACE_CUR("shortest queue is fq %d (with %d in queue) has est len %llu\n",
	//				  ikglp_get_idx(sem, aff->shortest_queue->q),
	//				  aff->shortest_queue->q->count,
	//				  aff->shortest_queue->estimated_len);
	//	}
}

void gpu_ikglp_notify_dequeue(struct ikglp_affinity* aff, struct fifo_queue* fq, struct task_struct* t)
{
	struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
	int replica = ikglp_get_idx(sem, fq);
	int gpu = replica_to_gpu(aff, replica);
	struct ikglp_queue_info *info = &aff->q_info[replica];
	lt_t est_time = get_gpu_estimate(t, gpu_migration_distance(tsk_rt(t)->last_gpu, gpu));

	if(est_time > info->estimated_len) {
		WARN_ON(1);
		info->estimated_len = 0;
	}
	else {
		info->estimated_len -= est_time;
	}

	TRACE_CUR("fq %d est len is now %llu\n",
			  ikglp_get_idx(sem, info->q),
			  info->estimated_len);

	// check to see if we're the shortest queue now.
	//	if((aff->shortest_queue != info) &&
	//	   (aff->shortest_queue->estimated_len > info->estimated_len)) {
	//
	//		aff->shortest_queue = info;
	//
	//		TRACE_CUR("shortest queue is fq %d (with %d in queue) has est len %llu\n",
	//				  ikglp_get_idx(sem, info->q),
	//				  info->q->count,
	//				  info->estimated_len);
	//	}
}

void gpu_ikglp_notify_acquired(struct ikglp_affinity* aff,
							   struct fifo_queue* fq,
							   struct task_struct* t)
{
	struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
	int replica = ikglp_get_idx(sem, fq);
	int gpu = replica_to_gpu(aff, replica);

	tsk_rt(t)->gpu_migration = gpu_migration_distance(tsk_rt(t)->last_gpu, gpu);  // record the type of migration

	TRACE_CUR("%s/%d acquired gpu %d.  migration type = %d\n",
			  t->comm, t->pid, gpu, tsk_rt(t)->gpu_migration);

	// count the number of resource holders
	++(*(aff->q_info[replica].nr_cur_users));

	reg_nv_device(gpu, 1, t);  // register

	tsk_rt(t)->suspend_gpu_tracker_on_block = 0;
	reset_gpu_tracker(t);
	start_gpu_tracker(t);
}

void gpu_ikglp_notify_freed(struct ikglp_affinity* aff,
							struct fifo_queue* fq,
							struct task_struct* t)
{
	struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
	int replica = ikglp_get_idx(sem, fq);
	int gpu = replica_to_gpu(aff, replica);
	lt_t est_time;

	stop_gpu_tracker(t);  // stop the tracker before we do anything else.

	est_time = get_gpu_estimate(t, gpu_migration_distance(tsk_rt(t)->last_gpu, gpu));

	tsk_rt(t)->last_gpu = gpu;

	// count the number of resource holders
	--(*(aff->q_info[replica].nr_cur_users));

	reg_nv_device(gpu, 0, t);	// unregister

	// update estimates
	update_gpu_estimate(t, get_gpu_time(t));
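	// The measured hold time feeds the execution-time estimator consulted by
	// advise_enqueue()/notify_enqueue() via get_gpu_estimate() (presumably
	// keyed by the tsk_rt(t)->gpu_migration recorded at acquire time).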

	TRACE_CUR("%s/%d freed gpu %d.  actual time was %llu.  "
			  "estimated was %llu.  diff is %d\n",
			  t->comm, t->pid, gpu,
			  get_gpu_time(t),
			  est_time,
			  (long long)get_gpu_time(t) - (long long)est_time);
}

struct ikglp_affinity_ops gpu_ikglp_affinity =
{
	.advise_enqueue = gpu_ikglp_advise_enqueue,
	.advise_steal = gpu_ikglp_advise_steal,
	.advise_donee_selection = gpu_ikglp_advise_donee_selection,
	.advise_donor_to_fq = gpu_ikglp_advise_donor_to_fq,

	.notify_enqueue = gpu_ikglp_notify_enqueue,
	.notify_dequeue = gpu_ikglp_notify_dequeue,
	.notify_acquired = gpu_ikglp_notify_acquired,
	.notify_freed = gpu_ikglp_notify_freed,

	.replica_to_resource = gpu_replica_to_resource,
};

struct affinity_observer* ikglp_gpu_aff_obs_new(struct affinity_observer_ops* ops,
												void* __user args)
{
	return ikglp_aff_obs_new(ops, &gpu_ikglp_affinity, args);
}








// Simple IKGLP affinity (standard IKGLP with auto-GPU registration)

struct fifo_queue* simple_gpu_ikglp_advise_enqueue(struct ikglp_affinity* aff, struct task_struct* t)
{
	struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
	int min_count;
	int min_nr_users;
	struct ikglp_queue_info *shortest;
	struct fifo_queue *to_enqueue;
	int i;

	//	TRACE_CUR("Simple GPU ikglp advise_enqueue invoked\n");

	shortest = &aff->q_info[0];
	min_count = shortest->q->count;
	min_nr_users = *(shortest->nr_cur_users);

	TRACE_CUR("queue %d: waiters = %d, total holders = %d\n",
			  ikglp_get_idx(sem, shortest->q),
			  shortest->q->count,
			  min_nr_users);

	for(i = 1; i < sem->nr_replicas; ++i) {
		int len = aff->q_info[i].q->count;

		// queue is smaller, or they're equal and the other has a smaller number
		// of total users.
		//
		// tie-break on the smallest number of simultaneous users.  this only
		// kicks in when there is more than one empty queue.
		if((len < min_count) ||
		   ((len == min_count) && (*(aff->q_info[i].nr_cur_users) < min_nr_users))) {
			shortest = &aff->q_info[i];
			min_count = shortest->q->count;
			min_nr_users = *(aff->q_info[i].nr_cur_users);
		}

		TRACE_CUR("queue %d: waiters = %d, total holders = %d\n",
				  ikglp_get_idx(sem, aff->q_info[i].q),
				  aff->q_info[i].q->count,
				  *(aff->q_info[i].nr_cur_users));
	}

	to_enqueue = shortest->q;
	TRACE_CUR("enqueue on fq %d (non-aff wanted fq %d)\n",
			  ikglp_get_idx(sem, to_enqueue),
			  ikglp_get_idx(sem, sem->shortest_fifo_queue));

	return to_enqueue;
}

ikglp_wait_state_t* simple_gpu_ikglp_advise_steal(struct ikglp_affinity* aff,
												  struct fifo_queue* dst)
{
	struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
	//	TRACE_CUR("Simple GPU ikglp advise_steal invoked\n");
	return ikglp_find_hp_waiter_to_steal(sem);
}

ikglp_donee_heap_node_t* simple_gpu_ikglp_advise_donee_selection(struct ikglp_affinity* aff, struct task_struct* donor)
{
	struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
	ikglp_donee_heap_node_t *donee = binheap_top_entry(&sem->donees, ikglp_donee_heap_node_t, node);
	return(donee);
}

ikglp_wait_state_t* simple_gpu_ikglp_advise_donor_to_fq(struct ikglp_affinity* aff, struct fifo_queue* fq)
{
	struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
	ikglp_wait_state_t* donor = binheap_top_entry(&sem->donors, ikglp_wait_state_t, node);
	return(donor);
}

void simple_gpu_ikglp_notify_enqueue(struct ikglp_affinity* aff, struct fifo_queue* fq, struct task_struct* t)
{
	//	TRACE_CUR("Simple GPU ikglp notify_enqueue invoked\n");
}

void simple_gpu_ikglp_notify_dequeue(struct ikglp_affinity* aff, struct fifo_queue* fq, struct task_struct* t)
{
	//	TRACE_CUR("Simple GPU ikglp notify_dequeue invoked\n");
}

void simple_gpu_ikglp_notify_acquired(struct ikglp_affinity* aff, struct fifo_queue* fq, struct task_struct* t)
{
	struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
	int replica = ikglp_get_idx(sem, fq);
	int gpu = replica_to_gpu(aff, replica);

	//	TRACE_CUR("Simple GPU ikglp notify_acquired invoked\n");

	// count the number of resource holders
	++(*(aff->q_info[replica].nr_cur_users));

	reg_nv_device(gpu, 1, t);  // register
}

void simple_gpu_ikglp_notify_freed(struct ikglp_affinity* aff, struct fifo_queue* fq, struct task_struct* t)
{
	struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
	int replica = ikglp_get_idx(sem, fq);
	int gpu = replica_to_gpu(aff, replica);

	//	TRACE_CUR("Simple GPU ikglp notify_freed invoked\n");
	// count the number of resource holders
	--(*(aff->q_info[replica].nr_cur_users));

	reg_nv_device(gpu, 0, t);	// unregister
}

struct ikglp_affinity_ops simple_gpu_ikglp_affinity =
{
	.advise_enqueue = simple_gpu_ikglp_advise_enqueue,
	.advise_steal = simple_gpu_ikglp_advise_steal,
	.advise_donee_selection = simple_gpu_ikglp_advise_donee_selection,
	.advise_donor_to_fq = simple_gpu_ikglp_advise_donor_to_fq,

	.notify_enqueue = simple_gpu_ikglp_notify_enqueue,
	.notify_dequeue = simple_gpu_ikglp_notify_dequeue,
	.notify_acquired = simple_gpu_ikglp_notify_acquired,
	.notify_freed = simple_gpu_ikglp_notify_freed,

	.replica_to_resource = gpu_replica_to_resource,
};

struct affinity_observer* ikglp_simple_gpu_aff_obs_new(struct affinity_observer_ops* ops,
													   void* __user args)
{
	return ikglp_aff_obs_new(ops, &simple_gpu_ikglp_affinity, args);
}

#endif