1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
|
/* Scheduling policy function that implements a "min thread use, min interference" policy,
* i.e., find the ready-to-launch kernel that will occupy the smallest number of available
* GPU threads AND does not fail a test for interference effects. The test for
* interference effects requires that ratio between the number of threads in the
* kernel under consideration and any kernel already scheduled does not exceed a
* threshold (in this implementation, 2.0). This test is motivated by empirical
 * measurements that have shown interference effects such as 500% or higher for
* large thread ratios between concurrently executing kernels. This is thought
* to be an artifact of the un-documented warp scheduling algorithm in the NVIDIA SMs.
*/
//put any global (static) declarations here:
#define MAX_THREAD_RATIO 2.0 // Threshold ratio between scheduled and new kernel
int find_best_kernel(void) {
    //Must be called with sched_lock held.
    //Returns the index of the stream whose READY_LAUNCH kernel should be
    //launched next under the "min thread use, min interference" policy, or
    //-1 when no ready kernel both fits in the available GPU threads and
    //passes the interference-ratio test against every scheduled kernel.
    int i;
    int this_one = -1; //default return value indicating no kernel to launch
    int need_threads, available_threads, left_over;
    int k;
    int occupied_threads[MAX_STREAMS]; //GPU threads allocated to scheduled kernels
    //record the allocated GPU threads in the kernel scheduled for each stream
    for (i = 0; i < stream_count; i++)
        occupied_threads[i] = GPU.stream_threads[i];
    //GPU threads available for allocation
    available_threads = (MAX_GPU_THREADS - GPU.threads_occupied);
    left_over = -1; //the number of threads left available if a kernel is scheduled
    for (i = 0; i < stream_count; i++) { //examine all streams
        if (Stream[i].state == READY_LAUNCH) { //only threads/streams ready to launch are considered
            //determine how many threads would be allocated for this kernel (see
            //allocate_gpu_threads() for a description)
            need_threads = min(MAX_GPU_THREADS, Stream[i].blocks * Stream[i].block_threads);
            if (need_threads > available_threads) //can't be scheduled
                continue;
            //find kernel with smallest thread allocation that does not create thread imbalance
            //?? should there be a starvation-prevention part of this policy ??
            //NOTE: a larger left-over is equivalent to a smaller allocation, so
            //maximizing (available - need) finds the minimum-thread candidate.
            if ((available_threads - need_threads) > left_over) {
                //found kernel with smallest thread allocation so far
                //compute and test the ratios of threads between it and all kernels scheduled
                for (k = 0; k < stream_count; k++) { //examine all streams
                    if (occupied_threads[k] == 0) //stream has no kernel scheduled
                        continue;
                    //if the ratio test fails in either direction for any already
                    //scheduled kernel, this stream's kernel can't launch
                    if (((float)occupied_threads[k] / (float)need_threads) > MAX_THREAD_RATIO)
                        break;
                    if (((float)need_threads / (float)occupied_threads[k]) > MAX_THREAD_RATIO)
                        break;
                }
                //if the test is passed for all scheduled kernels (loop ran to
                //completion without a break), this stream's kernel can launch
                if (k == stream_count) {
                    this_one = i; //the final value of this_one is the stream index to schedule (or -1)
                    left_over = available_threads - need_threads; //current smallest thread allocation
                }
            } //end test for smaller thread allocation
        } //end test for stream ready to launch
    } //end outer for loop
    if (TRACE_ON) {
        show_gpu_state();
        show_stream_state(this_one);
    }
    return this_one; //the scheduling decision (stream index)
}
// Utility function to trace GPU state used in scheduling policy decisions
void show_gpu_state(void) {
    //Must be called with sched_lock held.
    //Snapshots the per-stream GPU thread allocations into the next trace
    //slot; entries are silently dropped once the trace buffer is full.
    int i;
    if (trc_idx >= MAX_SCHED_TRACE)
        return; //trace buffer full
    for (i = 0; i < MAX_STREAMS; i++) {
        SchedTrace[trc_idx].stream[i] = GPU.streams[i];
        SchedTrace[trc_idx].stream_threads[i] = GPU.stream_threads[i];
    }
    //loop-invariant fields: set once instead of once per stream
    SchedTrace[trc_idx].next = 0;
    strcpy(SchedTrace[trc_idx].type, "GPU");
    trc_idx++;
}
// Utility function to trace stream state used in scheduling policy decisions
void show_stream_state(int this_one) {
    //Must be called with sched_lock held.
    //Snapshots each stream's thread demand into the next trace slot.
    //this_one is the stream index chosen by find_best_kernel(), or -1 when
    //no kernel was selected for launch.
    int i;
    int need_threads;
    if (trc_idx >= MAX_SCHED_TRACE)
        return; //trace buffer full
    for (i = 0; i < MAX_STREAMS; i++) {
        need_threads = min(MAX_GPU_THREADS, Stream[i].blocks * Stream[i].block_threads);
        if ((Stream[i].state != READY_LAUNCH) &&
            (Stream[i].state != LAUNCHED))
            need_threads = -need_threads; //encode unschedulable state in threads with minus
        SchedTrace[trc_idx].stream[i] = Stream[i].thread;
        SchedTrace[trc_idx].stream_threads[i] = need_threads;
    }
    //loop-invariant fields: computed once instead of once per stream
    if (this_one == -1)
        SchedTrace[trc_idx].next = this_one; //-1 marks "no kernel launched"
    else
        SchedTrace[trc_idx].next = Stream[this_one].thread;
    strcpy(SchedTrace[trc_idx].type, "STR");
    trc_idx++;
}
|