diff options
author | Joshua Bakita <bakitajoshua@gmail.com> | 2024-02-21 14:05:34 -0500 |
---|---|---|
committer | Joshua Bakita <bakitajoshua@gmail.com> | 2024-02-21 14:05:34 -0500 |
commit | 5c65954998591bc61a4138024ba4895bed64a8a6 (patch) | |
tree | 08a62949fc91a298d0a7e00c6b87c8213f86fe1a | |
parent | 2accc2be54d3f9ad20d15f21bca6397ef6cabf92 (diff) |
Fix timestamping bugs and clean up CPU side of mon_cross_ctx_copies
- Add a more reliable heuristic for how long to run the CPU-side
copy monitor.
- Check that copy sizes are appropriately divisible.
- Fix parsing issue for copies larger than 8192 GiB.
- Clean up, and document bugs in, the time synchronization code.
- Wait for user before logging copies.
-rw-r--r-- | copy_experiments/mon_cross_ctx_copies.cu | 78 |
1 file changed, 57 insertions, 21 deletions
diff --git a/copy_experiments/mon_cross_ctx_copies.cu b/copy_experiments/mon_cross_ctx_copies.cu index e836179..1163b95 100644 --- a/copy_experiments/mon_cross_ctx_copies.cu +++ b/copy_experiments/mon_cross_ctx_copies.cu | |||
@@ -7,9 +7,20 @@ | |||
7 | * - GPU clock ticks at a constant rate | 7 | * - GPU clock ticks at a constant rate |
8 | * - GPU clock ticks even while GPU is idle | 8 | * - GPU clock ticks even while GPU is idle |
9 | * - Different contexts view the same underlying GPU clock | 9 | * - Different contexts view the same underlying GPU clock |
10 | 10 | * - GPU and CPU clocks tick at the same rate | |
11 | WARNING: By default, assumes that GPU and CPU clocks tick at the same rate. | 11 | * |
12 | * BUGS: | ||
13 | * - GPU and CPU clocks *do not* tick at the same rate | ||
14 | * - CPU clocks may be at different offsets and rates on different cores | ||
15 | * TODO: | ||
16 | * - Disable migrations while consistent timestamps are needed | ||
17 | * - When using GPU monitoring, synchronize clocks in each thread, and | ||
18 | * do this serially | ||
19 | * - Support configuring copy direction | ||
12 | * | 20 | * |
21 | * Note that only CPU-side monitoring is used in the RTAS'24 paper, so this | ||
22 | * tool is still correct and known-bug-free for the purposes of artifact | ||
23 | * evaluation. | ||
13 | */ | 24 | */ |
14 | #include "copy_testbench.h" | 25 | #include "copy_testbench.h" |
15 | #include "../task_host_utilities.cu" | 26 | #include "../task_host_utilities.cu" |
@@ -110,7 +121,6 @@ void* copy_thread(void* args_raw) { | |||
110 | 121 | ||
111 | uint64_t dev_ns, dev_ns2; | 122 | uint64_t dev_ns, dev_ns2; |
112 | double host_s, host_s2; | 123 | double host_s, host_s2; |
113 | //GetHostDeviceTimeOffset(dev, &host_s, &dev_ns); | ||
114 | if (GPU_COMPUTE && SKEW_CHECK) | 124 | if (GPU_COMPUTE && SKEW_CHECK) |
115 | InternalReadGPUNanoseconds(dev, &host_s, &dev_ns); | 125 | InternalReadGPUNanoseconds(dev, &host_s, &dev_ns); |
116 | 126 | ||
@@ -147,11 +157,18 @@ void* copy_thread(void* args_raw) { | |||
147 | for (int i = 0; i < GPU_MON_THREADS; i++) | 157 | for (int i = 0; i < GPU_MON_THREADS; i++) |
148 | ready &= barrier[i]; | 158 | ready &= barrier[i]; |
149 | } | 159 | } |
160 | } else { | ||
161 | if ((COPY_SIZE / PG_SZ) % CPU_MON_DIVISOR != 0) { | ||
162 | fprintf(stderr, "copy_size must be divisible by %d when using CPU-monitored copy threads.\n", CPU_MON_DIVISOR); | ||
163 | exit(1); | ||
164 | } | ||
150 | } | 165 | } |
151 | 166 | ||
152 | // Tell our parent we're ready | 167 | // Tell our parent we're ready |
153 | args->is_ready = 1; | 168 | args->is_ready = 1; |
154 | // Wait for our parent to tell us to go | 169 | // Wait for our parent to tell us to go (spinning here should also cause |
170 | // Linux's load-balancing logic to implictly move each monitoring thread to | ||
171 | // a separate core) | ||
155 | while (!READY) | 172 | while (!READY) |
156 | continue; | 173 | continue; |
157 | 174 | ||
@@ -176,8 +193,10 @@ void* copy_thread(void* args_raw) { | |||
176 | // Wait for copy monitor (and hence copy) to complete | 193 | // Wait for copy monitor (and hence copy) to complete |
177 | SAFE(cudaStreamSynchronize(stream2)); | 194 | SAFE(cudaStreamSynchronize(stream2)); |
178 | } else { | 195 | } else { |
179 | // Guess number of needed CPU-only monitoring cycles | 196 | // Guess number of needed CPU-only monitoring cycles (this heuristic |
180 | cpu_copy_mon(COPY_SIZE / PG_SZ, pinned_hostmem); | 197 | // ensures that monitoring runs for roughly the same amount of time, |
198 | // no matter the timestamping granularity). | ||
199 | cpu_copy_mon(CPU_MON_DIVISOR * COPY_SIZE / PG_SZ, pinned_hostmem); | ||
181 | // Make sure that the copy finished in case we guessed small | 200 | // Make sure that the copy finished in case we guessed small |
182 | SAFE(cudaStreamSynchronize(stream1)); | 201 | SAFE(cudaStreamSynchronize(stream1)); |
183 | } | 202 | } |
@@ -229,7 +248,7 @@ static error_t arg_parser(int key, char* arg, struct argp_state *state) { | |||
229 | case 's': | 248 | case 's': |
230 | if (atol(arg) < CPU_MON_DIVISOR * 2) | 249 | if (atol(arg) < CPU_MON_DIVISOR * 2) |
231 | argp_error(state, "Please specify a larger copy size. It must be at least %d pages for accurate tracking.\n", CPU_MON_DIVISOR * 2); | 250 | argp_error(state, "Please specify a larger copy size. It must be at least %d pages for accurate tracking.\n", CPU_MON_DIVISOR * 2); |
232 | COPY_SIZE = atol(arg) * PG_SZ; | 251 | COPY_SIZE = strtoull(arg, NULL, 0) * PG_SZ; |
233 | break; | 252 | break; |
234 | case 'g': | 253 | case 'g': |
235 | if (g_args->num_threads == MAX_THREADS) | 254 | if (g_args->num_threads == MAX_THREADS) |
@@ -254,6 +273,16 @@ int main(int argc, char**argv) { | |||
254 | pthread_t t[MAX_THREADS]; | 273 | pthread_t t[MAX_THREADS]; |
255 | global_args_t g_args = {0}; | 274 | global_args_t g_args = {0}; |
256 | 275 | ||
276 | struct argp argp = {opts, arg_parser, 0, desc}; | ||
277 | argp_parse(&argp, argc, argv, 0, 0, &g_args); | ||
278 | |||
279 | if (g_args.num_threads == 0) { | ||
280 | fprintf(stderr, "At least one copy thread must be specified with --gpu or --cpu arguments.\n"); | ||
281 | return 3; | ||
282 | } | ||
283 | |||
284 | fprintf(stderr, "(%d) Synchronizing clocks and initializing copy threads...\n", getpid()); | ||
285 | |||
257 | // Temporarially initialize CUDA to query device attributes | 286 | // Temporarially initialize CUDA to query device attributes |
258 | SAFE_D(cuInit(0)); | 287 | SAFE_D(cuInit(0)); |
259 | SAFE_D(cuDeviceGet(&dev_itrl, dev)); | 288 | SAFE_D(cuDeviceGet(&dev_itrl, dev)); |
@@ -261,7 +290,7 @@ int main(int argc, char**argv) { | |||
261 | /// XXX: Still seems to work fine if it isn't??? | 290 | /// XXX: Still seems to work fine if it isn't??? |
262 | SAFE_D(cuDeviceGetAttribute(&tmp, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev_itrl)); | 291 | SAFE_D(cuDeviceGetAttribute(&tmp, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev_itrl)); |
263 | if (!tmp) { | 292 | if (!tmp) { |
264 | fprintf(stderr, "Unsupported platform. It must be possible to map host (CPU)" | 293 | fprintf(stderr, "Unsupported GPU. It must be possible to map host (CPU)" |
265 | " DRAM into the GPU virtual address space for accurate clock " | 294 | " DRAM into the GPU virtual address space for accurate clock " |
266 | "synchronization. Exiting...\n"); | 295 | "synchronization. Exiting...\n"); |
267 | return 1; | 296 | return 1; |
@@ -270,22 +299,25 @@ int main(int argc, char**argv) { | |||
270 | // reused in subprocesses | 299 | // reused in subprocesses |
271 | SAFE_D(cuDevicePrimaryCtxRelease(dev_itrl)); | 300 | SAFE_D(cuDevicePrimaryCtxRelease(dev_itrl)); |
272 | 301 | ||
273 | struct argp argp = {opts, arg_parser, 0, desc}; | 302 | double d2h_scale, host_s; |
274 | argp_parse(&argp, argc, argv, 0, 0, &g_args); | ||
275 | |||
276 | if (g_args.num_threads == 0) { | ||
277 | fprintf(stderr, "At least one copy thread must be specified with --gpu or --cpu arguments.\n"); | ||
278 | return 3; | ||
279 | } | ||
280 | |||
281 | // TODO: Rewrite this so that it's all (u)int64_t | ||
282 | // Skip, as this is 1-to-1 on GV100 and GV11B | ||
283 | //double d2h_scale = GetGPUTimerScale(dev); | ||
284 | // Get the offset of GPU time from CPU time | ||
285 | uint64_t dev_ns; | 303 | uint64_t dev_ns; |
286 | double host_s; | 304 | // Get the core-specific offset of GPU time from CPU time, and the |
305 | // core-specific difference in tick rates (typical variance of -13 to 60 | ||
306 | // microseconds per second). | ||
307 | // XXX: This is not sufficient for time synchronization, see "BUGS" at the | ||
308 | // top of this file. | ||
309 | // XXX: This creates an implict context, but should reuse the above. | ||
310 | d2h_scale = InternalGetGPUTimerScale(dev); | ||
287 | InternalReadGPUNanoseconds(dev, &host_s, &dev_ns); | 311 | InternalReadGPUNanoseconds(dev, &host_s, &dev_ns); |
312 | if (d2h_scale == -1 || (host_s == 0 && !dev_ns)) { | ||
313 | fprintf(stderr, "Unabled to synchronize time with the GPU. Aborting...\n"); | ||
314 | return 1; | ||
315 | } | ||
288 | GPU_TIME_OFFSET = dev_ns - s2ns(host_s); | 316 | GPU_TIME_OFFSET = dev_ns - s2ns(host_s); |
317 | // Necessary to synchronize experiments running on different CPU cores | ||
318 | // (as CPU clocks are only semi-synchronized) | ||
319 | fprintf(stderr, "(%d) CPU clock - GPU clock: %ld tick gap\n", getpid(), (long)s2ns(host_s) - dev_ns); | ||
320 | fprintf(stderr, "(%d) 1 CPU tick/1 GPU tick: %.9f\n", getpid(), d2h_scale); | ||
289 | 321 | ||
290 | // Copy buffers are filled with random numbers. Seed the RNG. | 322 | // Copy buffers are filled with random numbers. Seed the RNG. |
291 | srand(0); | 323 | srand(0); |
@@ -295,8 +327,12 @@ int main(int argc, char**argv) { | |||
295 | while (!g_args.thread_args[tid].is_ready) | 327 | while (!g_args.thread_args[tid].is_ready) |
296 | continue; | 328 | continue; |
297 | } | 329 | } |
330 | |||
331 | fprintf(stderr, "(%d) Initialization completed. Press enter to start copies...", getpid()); | ||
332 | getc(stdin); // Wait for user | ||
298 | // Tell children initialization is done and that they can go | 333 | // Tell children initialization is done and that they can go |
299 | READY = 1; | 334 | READY = 1; |
335 | |||
300 | // Wait for threads to finish, and determine the earliest recorded time | 336 | // Wait for threads to finish, and determine the earliest recorded time |
301 | uint64_t smallest = UINT64_MAX; | 337 | uint64_t smallest = UINT64_MAX; |
302 | for (int tid = 0; tid < g_args.num_threads; tid++) { | 338 | for (int tid = 0; tid < g_args.num_threads; tid++) { |