/* Scheduling policy function that implements a "min thread use, min interference" policy, * i.e., find the ready-to-launch kernel that will occupy the smallest number of available * GPU threads AND does not fail a test for interference effects. The test for * interference effects requires that ratio between the number of threads in the * kernel under consideration and any kernel already scheduled does not exceed a * threshold (in this implementation, 2.0). This test is motivated by empirical * measurements that have shown interfernce effects such as 500% or higher for * large thread ratios between concurrently executing kernels. This is thought * to be an artifact of the un-documented warp scheduling algorithm in the NVIDIA SMs. */ //put any global (static) declarations here: #define MAX_THREAD_RATIO 2.0 // Threshold ratio between scheduled and new kernel int find_best_kernel(void) { int i; int this_one = -1; //default return value indicating no kernel to launch int need_threads, available_threads, left_over; int k; int occupied_threads[MAX_STREAMS]; //GPU threads allocated to scheduled kernels //Must be called with sched_lock held //record the allocated GPU threads in the kernel scheduled for each stream for (i = 0; i < stream_count; i++) occupied_threads[i] = GPU.stream_threads[i]; //GPU threads available for allocation available_threads = (MAX_GPU_THREADS - GPU.threads_occupied); left_over = -1; //the number of threads left available if a kernel is scheduled for (i = 0; i < stream_count; i++) { //examine all streams if (Stream[i].state == READY_LAUNCH) { //only threads/streams ready to launch are considered //determine how many threads would be allocated for this kernel (see //allocate_gpu_threads() for a description) need_threads = min(MAX_GPU_THREADS, Stream[i].blocks * Stream[i].block_threads); if (need_threads > available_threads) //can't be scheduled continue; // find kernel with smalled thread allocation that does not create thread imbalance //?? should there be a starvation-prevention part of this policy ?? if ((available_threads - need_threads) > left_over) { //found kernel with smallest thread allocation so far //compute and test the ratios of threads between it and all kernels scheduled for (k = 0; k < stream_count; k++) {//examine all streams if (occupied_threads[k] == 0) //stream has no kernel scheduled continue; //if test fails for any already scheduled kernel, this stream can't launch if ((float)(occupied_threads[k] / (float)need_threads) > MAX_THREAD_RATIO) break; if ((float)(need_threads / (float)occupied_threads[k]) > MAX_THREAD_RATIO) break; } //if the test is passed for all scheduled kernels, this stream's kernel can launch if (k == stream_count) { this_one = i; //the final value of this_one is the stream index to schedule (or -1) left_over = available_threads - need_threads; //current smallest thread allocation } } //end test for smaller thread allocation } //end test for stream ready to launch } //end outer for loop if (TRACE_ON) { show_gpu_state(); show_stream_state(this_one); } return this_one; //the scheduling decision (stream index) } // Utility function to trace GPU state used in scheduling policy decisions void show_gpu_state(void) { //Must be called with sched_lock held int i; if (trc_idx >= MAX_SCHED_TRACE) return; for (i = 0; i < MAX_STREAMS; i++) { SchedTrace[trc_idx].stream[i] = GPU.streams[i]; SchedTrace[trc_idx].stream_threads[i] = GPU.stream_threads[i]; SchedTrace[trc_idx].next = 0; strcpy(SchedTrace[trc_idx].type, "GPU"); } trc_idx++; } // Utility function to trace stream state used in scheduling policy decisions void show_stream_state(int this_one) { //Must be called with sched_lock held int i; int need_threads; if (trc_idx >= MAX_SCHED_TRACE) return; for (i = 0; i < MAX_STREAMS; i++) { need_threads = min(MAX_GPU_THREADS, Stream[i].blocks * Stream[i].block_threads); if ((Stream[i].state != READY_LAUNCH) && (Stream[i].state != LAUNCHED)) need_threads = -need_threads; //encode unschedulable state in threads with minus SchedTrace[trc_idx].stream[i] = Stream[i].thread; SchedTrace[trc_idx].stream_threads[i] = need_threads; if (this_one == -1) SchedTrace[trc_idx].next = this_one; else SchedTrace[trc_idx].next = Stream[this_one].thread; strcpy(SchedTrace[trc_idx].type, "STR"); } trc_idx++; }