/* Scheduling policy function that implements a "min thread use, min interference" policy,
 * i.e., find the ready-to-launch kernel that will occupy the smallest number of available
 * GPU threads AND does not fail a test for interference effects.  The test for
 * interference effects requires that the ratio between the number of threads in the
 * kernel under consideration and any kernel already scheduled does not exceed a 
 * threshold (in this implementation, 2.0).  This test is motivated by empirical
 * measurements that have shown interference effects such as 500% or higher for
 * large thread ratios between concurrently executing kernels.  This is thought
 * to be an artifact of the undocumented warp scheduling algorithm in the NVIDIA SMs.
 */

//put any global (static) declarations here:


#define MAX_THREAD_RATIO 2.0  // Threshold ratio between scheduled and new kernel


// Pick the stream whose ready-to-launch kernel needs the fewest GPU threads
// while keeping the thread-count ratio against every already-scheduled kernel
// (in either direction) at or below MAX_THREAD_RATIO.
//
// Returns the index of the stream to schedule, or -1 if no kernel qualifies.
// Must be called with sched_lock held.
int find_best_kernel(void) {
  int scheduled[MAX_STREAMS];  //threads held by the kernel scheduled on each stream
  int chosen = -1;             //stream index selected so far (-1 means none)
  int best_spare = -1;         //threads that would remain free if 'chosen' launches
  int free_threads;            //GPU threads not yet allocated
  int s, other;

  //snapshot the per-stream thread allocations used by the ratio test
  for (s = 0; s < stream_count; s++)
    scheduled[s] = GPU.stream_threads[s];

  free_threads = MAX_GPU_THREADS - GPU.threads_occupied;

  for (s = 0; s < stream_count; s++) {
    int wanted;  //threads this stream's kernel would be allocated
    int spare;   //threads left free if this kernel were launched

    //only streams with a kernel ready to launch are candidates
    if (Stream[s].state != READY_LAUNCH)
      continue;

    //same computation as allocate_gpu_threads(): capped at the GPU maximum
    wanted = min(MAX_GPU_THREADS, Stream[s].blocks * Stream[s].block_threads);
    if (wanted > free_threads)
      continue;  //does not fit in what is currently available

    //a candidate only wins if it leaves strictly more threads free than the
    //best so far, i.e. it has a strictly smaller thread allocation
    //?? should there be a starvation-prevention part of this policy ??
    spare = free_threads - wanted;
    if (spare <= best_spare)
      continue;

    //interference test: compare against every stream that has a kernel scheduled
    for (other = 0; other < stream_count; other++) {
      float sched_to_new, new_to_sched;

      if (scheduled[other] == 0)
        continue;  //nothing scheduled on this stream

      sched_to_new = (float)scheduled[other] / (float)wanted;
      if (sched_to_new > MAX_THREAD_RATIO)
        break;  //scheduled kernel is too large relative to the candidate

      new_to_sched = (float)wanted / (float)scheduled[other];
      if (new_to_sched > MAX_THREAD_RATIO)
        break;  //candidate is too large relative to the scheduled kernel
    }

    //the inner loop ran to completion only if no ratio exceeded the threshold
    if (other == stream_count) {
      chosen = s;
      best_spare = spare;
    }
  }

  if (TRACE_ON) {
    show_gpu_state();
    show_stream_state(chosen);
  }
  return chosen;  //the scheduling decision (stream index or -1)
}

// Utility function to trace GPU state used in scheduling policy decisions.
// Appends one record tagged "GPU" to SchedTrace holding, for every stream slot,
// GPU.streams[i] and GPU.stream_threads[i]; the record's 'next' field is 0.
// Does nothing once MAX_SCHED_TRACE records have been written.
void show_gpu_state(void) {
  int i;

  //Must be called with sched_lock held

  if (trc_idx >= MAX_SCHED_TRACE)
    return;  //trace buffer is full; drop this record

  //record-wide fields: set once rather than on every loop iteration
  SchedTrace[trc_idx].next = 0;
  strcpy(SchedTrace[trc_idx].type, "GPU");

  //per-stream fields
  for (i = 0; i < MAX_STREAMS; i++) {
    SchedTrace[trc_idx].stream[i] = GPU.streams[i];
    SchedTrace[trc_idx].stream_threads[i] = GPU.stream_threads[i];
  }
  trc_idx++;
}

// Utility function to trace stream state used in scheduling policy decisions.
// Appends one record tagged "STR" to SchedTrace holding, for every stream slot,
// Stream[i].thread and the number of GPU threads its kernel would need. The
// thread count is negated when the stream is neither READY_LAUNCH nor LAUNCHED.
// The record's 'next' field is -1 when this_one is -1, otherwise it is
// Stream[this_one].thread.
// Does nothing once MAX_SCHED_TRACE records have been written.
//
// this_one: index of the stream chosen by the policy, or -1 if none was chosen.
void show_stream_state(int this_one) {
  int i;
  int need_threads;

  //Must be called with sched_lock held

  if (trc_idx >= MAX_SCHED_TRACE)
    return;  //trace buffer is full; drop this record

  //record-wide fields: set once rather than on every loop iteration
  if (this_one == -1)
    SchedTrace[trc_idx].next = this_one;
  else
    SchedTrace[trc_idx].next = Stream[this_one].thread;
  strcpy(SchedTrace[trc_idx].type, "STR");

  //per-stream fields
  for (i = 0; i < MAX_STREAMS; i++) {
    //same thread-count computation the scheduling policy uses
    need_threads = min(MAX_GPU_THREADS, Stream[i].blocks * Stream[i].block_threads);
    if ((Stream[i].state != READY_LAUNCH) &&
        (Stream[i].state != LAUNCHED))
      need_threads = -need_threads;  //encode unschedulable state in threads with minus
    SchedTrace[trc_idx].stream[i] = Stream[i].thread;
    SchedTrace[trc_idx].stream_threads[i] = need_threads;
  }
  trc_idx++;
}