From 0f89bddde73d448511004a60b98b8be042f6ffd6 Mon Sep 17 00:00:00 2001
From: Glenn Elliott
Date: Thu, 2 May 2013 18:02:10 -0400
Subject: randomize job execution time w/ normal distribution

---
 Makefile        |   4 +-
 gpu/budget.cpp  |  32 +++-
 gpu/gpuspin.cu  | 564 ++++++++++++++++++++++++++++++++------------------------
 src/migration.c |   4 +-
 4 files changed, 356 insertions(+), 248 deletions(-)

diff --git a/Makefile b/Makefile
index b91dec5..831c16b 100644
--- a/Makefile
+++ b/Makefile
@@ -30,7 +30,7 @@ flags-api = -D_XOPEN_SOURCE=600 -D_GNU_SOURCE
 flags-misc = -fasynchronous-unwind-tables -fnon-call-exceptions
 
 flags-cu-debug = -g -G -Xcompiler -Wall -Xcompiler -Werror
-flags-cu-optim = -O3 -Xcompiler -march=native
+flags-cu-optim = -O2 -Xcompiler -march=native
 flags-cu-nvcc = --use_fast_math -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30
 flags-cu-misc = -Xcompiler -fasynchronous-unwind-tables -Xcompiler -fnon-call-exceptions -Xcompiler -malign-double -Xcompiler -pthread
 flags-cu-x86_64 = -m64
@@ -299,7 +299,7 @@ lib-budget = -lrt -lm -pthread
 
 vpath %.cu gpu/
 
 objcu-gpuspin = gpuspin.o common.o
-lib-gpuspin = -lrt -lm -lpthread
+lib-gpuspin = -lblitz -lrt -lm -lpthread
 
 # ##############################################################################
 # Build everything that depends on liblitmus.
diff --git a/gpu/budget.cpp b/gpu/budget.cpp
index eebb14e..e08daf7 100644
--- a/gpu/budget.cpp
+++ b/gpu/budget.cpp
@@ -134,7 +134,7 @@ int job(lt_t exec_ns, lt_t budget_ns)
 			for(int i = 0; i < NUM_LOCKS; ++i)
 				litmus_lock(LOCKS[i]);
 		}
-		
+
 		// intentionally overrun via suspension
 		if (OVERRUN_BY_SLEEP)
 			lt_sleep(approx_remaining + overrun_extra);
@@ -146,11 +146,11 @@ int job(lt_t exec_ns, lt_t budget_ns)
 				litmus_dgl_unlock(LOCKS, NUM_LOCKS);
 			else
 				for(int i = NUM_LOCKS-1; i >= 0; --i)
-				litmus_unlock(LOCKS[i]);
+					litmus_unlock(LOCKS[i]);
 			if (NEST_IN_IKGLP)
 				litmus_unlock(IKGLP_LOCK);
 		}
-		
+
 		if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP)
 			unblock_litmus_signals(SIG_BUDGET);
 	}
@@ -165,7 +165,7 @@ int job(lt_t exec_ns, lt_t budget_ns)
 	return 1;
 }
 
-#define OPTSTR "SbosOvzalwqixdn:r:"
+#define OPTSTR "SbosOvzalwqixdn:r:p:"
 
 int main(int argc, char** argv)
 {
@@ -185,9 +185,16 @@ int main(int argc, char** argv)
 	int compute_overrun_rate = 0;
 
 	int once = 1;
+	bool migrate = false;
+	int partition = 0;
+	int partition_sz = 1;
 
 	while ((opt = getopt(argc, argv, OPTSTR)) != -1) {
 		switch(opt) {
+		case 'p':
+			migrate = true;
+			partition = atoi(optarg);
+			break;
 		case 'S':
 			SIGNALS = 1;
 			break;
@@ -261,7 +268,7 @@ int main(int argc, char** argv)
 	assert(NUM_LOCKS > 0);
 	if (LOCK_TYPE == IKGLP || NEST_IN_IKGLP)
 		assert(NUM_REPLICAS >= 1);
-	
+
 	LOCKS = new int[NUM_LOCKS];
 
 	if (compute_overrun_rate) {
@@ -281,7 +288,14 @@ int main(int argc, char** argv)
 		param.budget_policy = PRECISE_ENFORCEMENT;
 	else
 		param.budget_signal_policy = PRECISE_SIGNALS;
+	if (migrate)
+		param.cpu = cluster_to_first_cpu(partition, partition_sz);
 
+	// set up affinity and init litmus
+	if (migrate) {
+		ret = be_migrate_to_cluster(partition, partition_sz);
+		assert(!ret);
+	}
 	init_litmus();
 
 	ret = set_rt_task_param(gettid(), &param);
@@ -309,7 +323,7 @@ int main(int argc, char** argv)
 		}
 		LOCKS[i] = lock;
 	}
-	
+
 	if (NEST_IN_IKGLP) {
 		IKGLP_LOCK = open_ikglp_sem(NAMESPACE, i, NUM_REPLICAS);
 		if (IKGLP_LOCK < 0) {
@@ -318,13 +332,13 @@ int main(int argc, char** argv)
 			}
 		}
 	}
-	
+
 	if (WAIT) {
 		ret = wait_for_ts_release();
 		if (ret < 0)
 			perror("wait_for_ts_release");
 	}
-	
+
 	ret = task_mode(LITMUS_RT_TASK);
 	assert(ret == 0);
@@ -360,6 +374,6 @@ int main(int argc, char** argv)
printf("# Overruns: %d\n", NUM_OVERRUNS); delete[] LOCKS; - + return 0; } diff --git a/gpu/gpuspin.cu b/gpu/gpuspin.cu index 970d6f2..21134f6 100644 --- a/gpu/gpuspin.cu +++ b/gpu/gpuspin.cu @@ -11,6 +11,8 @@ #include #include +#include + #include #include "litmus.h" @@ -18,6 +20,9 @@ using namespace std; using namespace boost::interprocess; +using namespace ranlib; + +#define ms2s(ms) ((ms)*0.001) const char *lock_namespace = "./.gpuspin-locks"; @@ -143,10 +148,10 @@ struct ce_lock_state size_t num_locks; size_t budget_remaining; bool locked; - + ce_lock_state(int device_a, enum cudaMemcpyKind kind, size_t size, int device_b = -1, bool migration = false) { num_locks = (device_a != -1) + (device_b != -1); - + if(device_a != -1) { if (!migration) locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ? @@ -155,15 +160,15 @@ struct ce_lock_state locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ? CE_MIGR_SEND_LOCKS[device_a] : CE_MIGR_RECV_LOCKS[device_a]; } - + if(device_b != -1) { assert(kind == cudaMemcpyDeviceToDevice); - + if (!migration) locks[1] = CE_RECV_LOCKS[device_b]; else locks[1] = CE_MIGR_RECV_LOCKS[device_b]; - + if(locks[1] < locks[0]) { // enforce total order on locking int temp = locks[1]; @@ -174,35 +179,35 @@ struct ce_lock_state else { locks[1] = -1; } - + if(!ENABLE_CHUNKING) budget_remaining = size; else budget_remaining = CHUNK_SIZE; } - + void crash(void) { void *array[50]; int size, i; char **messages; - + size = backtrace(array, 50); messages = backtrace_symbols(array, size); - + fprintf(stderr, "%d: TRIED TO GRAB SAME LOCK TWICE! Lock = %d\n", getpid(), locks[0]); for (i = 1; i < size && messages != NULL; ++i) { fprintf(stderr, "%d: [bt]: (%d) %s\n", getpid(), i, messages[i]); } free(messages); - + assert(false); } - - + + void lock() { if(locks[0] == locks[1]) crash(); - + if(USE_DYNAMIC_GROUP_LOCKS) { litmus_dgl_lock(locks, num_locks); } @@ -215,10 +220,10 @@ struct ce_lock_state } locked = true; } - + void unlock() { if(locks[0] == locks[1]) crash(); - + if(USE_DYNAMIC_GROUP_LOCKS) { litmus_dgl_unlock(locks, num_locks); } @@ -232,15 +237,15 @@ struct ce_lock_state } locked = false; } - + void refresh() { budget_remaining = CHUNK_SIZE; } - + bool budgetIsAvailable(size_t tosend) { return(tosend >= budget_remaining); } - + void decreaseBudget(size_t spent) { budget_remaining -= spent; } @@ -253,28 +258,28 @@ static cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count, { cudaError_t ret = cudaSuccess; int remaining = count; - + char* dst = (char*)a_dst; const char* src = (const char*)a_src; - + // disable chunking, if needed, by setting chunk_size equal to the // amount of data to be copied. int chunk_size = (ENABLE_CHUNKING) ? CHUNK_SIZE : count; int i = 0; - + while(remaining != 0) { int bytesToCopy = std::min(remaining, chunk_size); - + if(state && state->budgetIsAvailable(bytesToCopy) && state->locked) { cudaStreamSynchronize(STREAMS[CUR_DEVICE]); ret = cudaGetLastError(); - + if(ret != cudaSuccess) { break; } - + state->unlock(); state->refresh(); // replentish. // we can only run out of @@ -283,14 +288,14 @@ static cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count, // be set to cover entire memcpy // if chunking were disabled. 
@@ -283,14 +288,14 @@ static cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count,
		}
-		
+
		if(state && !state->locked) {
			state->lock();
		}
-		
+
		//ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind);
		cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, STREAMS[CUR_DEVICE]);
-		
+
		if(state) {
			state->decreaseBudget(bytesToCopy);
		}
@@ -332,9 +337,9 @@ void allocate_locks_litmus(void)
 {
	// allocate k-FMLP lock
	int fd = open(lock_namespace, O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
-	
+
	int base_name = GPU_PARTITION * 1000;
-	
+
	if (GPU_SYNC_MODE == IKGLP_MODE) {
		/* Standard (optimal) IKGLP */
		TOKEN_LOCK = open_gpusync_token_lock(fd,
@@ -390,15 +395,15 @@ void allocate_locks_litmus(void)
		perror("Invalid GPUSync mode specified\n");
		TOKEN_LOCK = -1;
	}
-	
+
	if(TOKEN_LOCK < 0)
		perror("open_token_sem");
-	
+
	if(USE_ENGINE_LOCKS)
	{
		assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2);
		assert((NUM_COPY_ENGINES == 1 && !RESERVED_MIGR_COPY_ENGINE) || NUM_COPY_ENGINES == 2);
-		
+
		// allocate the engine locks.
		for (int i = 0; i < GPU_PARTITION_SIZE; ++i)
		{
@@ -407,27 +412,27 @@ void allocate_locks_litmus(void)
			int ce_0_name = (i+1)*10 + base_name + 1;
			int ce_1_name = (i+1)*10 + base_name + 2;
			int ee_lock = -1, ce_0_lock = -1, ce_1_lock = -1;
-			
+
			open_sem_t openEngineLock = (ENGINE_LOCK_TYPE == FIFO) ?
					open_fifo_sem : open_prioq_sem;
-			
+
			ee_lock = openEngineLock(fd, ee_name);
			if (ee_lock < 0)
				perror("open_*_sem (engine lock)");
-			
+
			ce_0_lock = openEngineLock(fd, ce_0_name);
			if (ce_0_lock < 0)
				perror("open_*_sem (engine lock)");
-			
+
			if (NUM_COPY_ENGINES == 2)
			{
				ce_1_lock = openEngineLock(fd, ce_1_name);
				if (ce_1_lock < 0)
					perror("open_*_sem (engine lock)");
			}
-			
+
			EE_LOCKS[idx] = ee_lock;
-			
+
			if (NUM_COPY_ENGINES == 1)
			{
				// share locks
@@ -439,7 +444,7 @@ void allocate_locks_litmus(void)
			else
			{
				assert(NUM_COPY_ENGINES == 2);
-				
+
				if (RESERVED_MIGR_COPY_ENGINE) {
					// copy engine dedicated to migration operations
					CE_SEND_LOCKS[idx] = ce_0_lock;
@@ -469,15 +474,18 @@ public:
	{
		memset(&pool[0], 0, sizeof(pool[0])*poolSize);
	}
-	
+
	int get(pthread_mutex_t* tex, int preference = -1)
	{
		int which = -1;
-		int last = (preference >= 0) ? preference : 0;
+		// int last = (preference >= 0) ? preference : 0;
+		int last = (ENABLE_AFFINITY) ?
+				(preference >= 0) ?
+					preference : 0 :
+				rand()%poolSize;
		int minIdx = last;
-		
+
		pthread_mutex_lock(tex);
-		
+
		int min = pool[last];
		for(int i = (minIdx+1)%poolSize; i != last; i = (i+1)%poolSize)
		{
@@ -485,21 +493,21 @@ public:
				minIdx = i;
		}
		++pool[minIdx];
-		
+
		pthread_mutex_unlock(tex);
-		
+
		which = minIdx;
-		
+
		return which;
	}
-	
+
	void put(pthread_mutex_t* tex, int which)
	{
		pthread_mutex_lock(tex);
		--pool[which];
		pthread_mutex_unlock(tex);
	}
-	
+
 private:
	int poolSize;
	int pool[NR_GPUS]; // >= gpu_part_size
@@ -508,19 +516,19 @@ private:
 
 static gpu_pool* GPU_LINUX_SEM_POOL = NULL;
 static pthread_mutex_t* GPU_LINUX_MUTEX_POOL = NULL;
 
-static void allocate_locks_linux(int num_gpu_users)
+static void allocate_locks_linux(const int num_gpu_users)
 {
	managed_shared_memory *segment_pool_ptr = NULL;
	managed_shared_memory *segment_mutex_ptr = NULL;
-	
+
	int numGpuPartitions = NR_GPUS/GPU_PARTITION_SIZE;
-	
-	if(num_gpu_users != 0)
+
+	if(num_gpu_users > 0)
	{
		printf("%d creating shared memory for linux semaphores; num pools = %d, pool size = %d\n", getpid(), numGpuPartitions, GPU_PARTITION_SIZE);
		shared_memory_object::remove("linux_mutex_memory");
		shared_memory_object::remove("linux_sem_memory");
-		
+
		segment_mutex_ptr = new managed_shared_memory(create_only, "linux_mutex_memory", 4*1024);
		GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->construct<pthread_mutex_t>("pthread_mutex_t linux_m")[numGpuPartitions]();
		for(int i = 0; i < numGpuPartitions; ++i)
@@ -531,7 +539,7 @@ static void allocate_locks_linux(int num_gpu_users)
			pthread_mutex_init(&(GPU_LINUX_MUTEX_POOL[i]), &attr);
			pthread_mutexattr_destroy(&attr);
		}
-		
+
		segment_pool_ptr = new managed_shared_memory(create_only, "linux_sem_memory", 4*1024);
		GPU_LINUX_SEM_POOL = segment_pool_ptr->construct<gpu_pool>("gpu_pool linux_p")[numGpuPartitions](GPU_PARTITION_SIZE);
	}
@@ -548,7 +556,7 @@ static void allocate_locks_linux(int num_gpu_users)
				sleep(1);
			}
		}while(segment_pool_ptr == NULL);
-		
+
		do
		{
			try
@@ -560,7 +568,7 @@ static void allocate_locks_linux(int num_gpu_users)
			{
				segment_mutex_ptr = new managed_shared_memory(open_only, "linux_mutex_memory");
			}
			catch(...)
			{
				sleep(1);
			}
		}while(segment_mutex_ptr == NULL);
-		
+
		GPU_LINUX_SEM_POOL = segment_pool_ptr->find<gpu_pool>("gpu_pool linux_p").first;
		GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->find<pthread_mutex_t>("pthread_mutex_t linux_m").first;
	}
 }
@@ -569,7 +577,7 @@
 
 
 
-static void allocate_locks(int num_gpu_users, bool linux_mode)
+static void allocate_locks(const int num_gpu_users, bool linux_mode)
 {
	if(!linux_mode)
		allocate_locks_litmus();
@@ -593,14 +601,14 @@
 static pthread_barrier_t *gpu_barrier = NULL;
 static interprocess_mutex *gpu_mgmt_mutexes = NULL;
 static managed_shared_memory *segment_ptr = NULL;
 
-void coordinate_gpu_tasks(int num_gpu_users)
+void coordinate_gpu_tasks(const int num_gpu_users)
 {
-	if(num_gpu_users != 0)
+	if(num_gpu_users > 0)
	{
		printf("%d creating shared memory\n", getpid());
		shared_memory_object::remove("gpu_barrier_memory");
		segment_ptr = new managed_shared_memory(create_only, "gpu_barrier_memory", 4*1024);
-		
+
		printf("%d creating a barrier for %d users\n", getpid(), num_gpu_users);
		gpu_barrier = segment_ptr->construct<pthread_barrier_t>("pthread_barrier_t gpu_barrier")();
		pthread_barrierattr_t battr;
@@ -624,7 +632,7 @@ void coordinate_gpu_tasks(int num_gpu_users)
			sleep(1);
		}
	}while(segment_ptr == NULL);
-	
+
	gpu_barrier = segment_ptr->find<pthread_barrier_t>("pthread_barrier_t gpu_barrier").first;
	gpu_mgmt_mutexes = segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first;
 }
@@ -647,15 +655,16 @@
 char *h_state_data = 0;
 
 unsigned int *h_iteration_count[NR_GPUS] = {0};
 
-static void init_cuda(int num_gpu_users)
+static void init_cuda(const int num_gpu_users)
 {
	const int PAGE_SIZE = 4*1024;
	size_t send_alloc_bytes = SEND_SIZE + (SEND_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
	size_t recv_alloc_bytes = RECV_SIZE + (RECV_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
	size_t state_alloc_bytes = STATE_SIZE + (STATE_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
-	
+
	coordinate_gpu_tasks(num_gpu_users);
-	
+
+#if 1
	switch (CUDA_SYNC_MODE)
	{
		case BLOCKING:
			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
			break;
		case SPIN:
			cudaSetDeviceFlags(cudaDeviceScheduleSpin);
			break;
	}
-	
+#else
+	cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+#endif
+
	for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
	{
		cudaDeviceProp prop;
		int which = GPU_PARTITION*GPU_PARTITION_SIZE + i;
-		
+
		gpu_mgmt_mutexes[which].lock();
-
-		set_cur_gpu(which);
-		cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0);
-		cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0);
-
-		cudaGetDeviceProperties(&prop, which);
-		GPU_HZ[which] = prop.clockRate * 1000; /* khz -> hz */
-		NUM_SM[which] = prop.multiProcessorCount;
-		WARP_SIZE[which] = prop.warpSize;
-
-		// enough to fill the L2 cache exactly.
-		ELEM_PER_THREAD[which] = (prop.l2CacheSize/(NUM_SM[which]*WARP_SIZE[which]*sizeof(spindata_t)));
-
-
-		if (!MIGRATE_VIA_SYSMEM && prop.unifiedAddressing)
+		try
		{
-			for(int j = 0; j < GPU_PARTITION_SIZE; ++j)
+			set_cur_gpu(which);
+			cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0);
+			cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0);
+
+			cudaGetDeviceProperties(&prop, which);
+			GPU_HZ[which] = prop.clockRate * 1000; /* khz -> hz */
+			NUM_SM[which] = prop.multiProcessorCount;
+			WARP_SIZE[which] = prop.warpSize;
+
+			// enough to fill the L2 cache exactly.
+			ELEM_PER_THREAD[which] = (prop.l2CacheSize/(NUM_SM[which]*WARP_SIZE[which]*sizeof(spindata_t)));
+
+
+			if (!MIGRATE_VIA_SYSMEM && prop.unifiedAddressing)
			{
-				if (i != j)
+				for(int j = 0; j < GPU_PARTITION_SIZE; ++j)
				{
-					int other = GPU_PARTITION*GPU_PARTITION_SIZE + j;
-					int canAccess = 0;
-					cudaDeviceCanAccessPeer(&canAccess, which, other);
-					if(canAccess)
+					if (i != j)
					{
-						cudaDeviceEnablePeerAccess(other, 0);
-						p2pMigration[which][other] = true;
+						int other = GPU_PARTITION*GPU_PARTITION_SIZE + j;
+						int canAccess = 0;
+						cudaDeviceCanAccessPeer(&canAccess, which, other);
+						if(canAccess)
+						{
+							cudaDeviceEnablePeerAccess(other, 0);
+							p2pMigration[which][other] = true;
+						}
					}
				}
			}
+
+			cudaStreamCreate(&STREAMS[CUR_DEVICE]);
+
+			cudaMalloc(&d_spin_data[which], prop.l2CacheSize);
+			cudaMemset(d_spin_data[which], 0, prop.l2CacheSize);
+//			cudaMalloc(&d_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int));
+//			cudaHostAlloc(&h_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int), cudaHostAllocPortable | cudaHostAllocMapped);
+
+			if (send_alloc_bytes) {
+				cudaMalloc(&d_send_data[which], send_alloc_bytes);
+				cudaHostAlloc(&h_send_data, send_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped);
+			}
+
+			if (recv_alloc_bytes) {
+				cudaMalloc(&d_recv_data[which], recv_alloc_bytes);
+				cudaHostAlloc(&h_recv_data, recv_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped);
+			}
+
+			if (state_alloc_bytes) {
+				cudaMalloc(&d_state_data[which], state_alloc_bytes);
+
+				if (MIGRATE_VIA_SYSMEM)
+					cudaHostAlloc(&h_state_data, state_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined);
+			}
		}
-
-		cudaStreamCreate(&STREAMS[CUR_DEVICE]);
-
-		cudaMalloc(&d_spin_data[which], prop.l2CacheSize);
-		cudaMemset(&d_spin_data[which], 0, prop.l2CacheSize);
-//		cudaMalloc(&d_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int));
-//
-//		cudaHostAlloc(&h_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int), cudaHostAllocPortable | cudaHostAllocMapped);
-
-		if (send_alloc_bytes) {
-			cudaMalloc(&d_send_data[which], send_alloc_bytes);
-			cudaHostAlloc(&h_send_data, send_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped);
-		}
-
-		if (h_recv_data) {
-			cudaMalloc(&d_recv_data[which], recv_alloc_bytes);
-			cudaHostAlloc(&h_recv_data, recv_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped);
+		catch(std::exception &e)
+		{
+			printf("caught an exception during initialization!: %s\n", e.what());
		}
-
-		if (h_state_data) {
-			cudaMalloc(&d_state_data[which], state_alloc_bytes);
-
-			if (MIGRATE_VIA_SYSMEM)
-				cudaHostAlloc(&h_state_data, state_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined);
+		catch(...)
+		{
+			printf("caught unknown exception.\n");
		}
-
-		gpu_mgmt_mutexes[which].unlock();
+
+		gpu_mgmt_mutexes[which].unlock();
	}
-	
+
	// roll back to first GPU
	set_cur_gpu(GPU_PARTITION*GPU_PARTITION_SIZE);
 }
@@ -772,26 +794,26 @@ static bool MigrateToGPU_SysMem(int from, int to)
	// you should be using speculative migrations.
	// Use PushState() and PullState().
	assert(false); // for now
-	
+
	bool success = true;
-	
+
	set_cur_gpu(from);
	chunkMemcpy(h_state_data, this_gpu(d_state_data), STATE_SIZE,
				cudaMemcpyDeviceToHost, from, useEngineLocks(), -1, true);
-	
+
	set_cur_gpu(to);
	chunkMemcpy(this_gpu(d_state_data), h_state_data, STATE_SIZE,
				cudaMemcpyHostToDevice, to, useEngineLocks(), -1, true);
-	
+
	return success;
 }
 
 static bool MigrateToGPU(int from, int to)
 {
	bool success = false;
-	
+
	if (from != to)
	{
		if(!MIGRATE_VIA_SYSMEM && p2pMigration[to][from])
@@ -804,7 +826,7 @@ static bool MigrateToGPU(int from, int to)
		set_cur_gpu(to);
		success = true;
	}
-	
+
	return success;
 }
@@ -851,9 +873,9 @@ static void catch_exit(int catch_exit)
		{
			int which = GPU_PARTITION*GPU_PARTITION_SIZE + i;
			set_cur_gpu(which);
-			
+
//			cudaDeviceReset();
-			
+
			// try to unlock everything. litmus will prevent bogus calls.
			if(USE_ENGINE_LOCKS)
			{
@@ -883,15 +905,15 @@ static int gpucount = 0;
 __global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned int num_elem, unsigned int cycles)
 {
-	long long int now = clock64();	
+	long long int now = clock64();
	long long unsigned int elapsed = 0;
	long long int last;
-	
+
//	unsigned int iter = 0;
	unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
	unsigned int j = 0;
	bool toggle = true;
-	
+
//	iterations[i] = 0;
	do
	{
@@ -899,7 +921,7 @@ __global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned i
		j = (j + 1 != num_elem) ? j + 1 : 0;
		toggle = !toggle;
//		iter++;
-		
+
		last = now;
		now = clock64();
//		elapsed += (diff > 0) ?
//				diff :
//				now + ((~((long long int)0)<<1)>>1) - last;
-		
+
		// don't count iterations with clock roll-over
		elapsed += max(0ll, now - last);
	}while(elapsed < cycles);
 
//	iterations[i] = iter;
-	
+
	return;
 }
@@ -923,9 +945,11 @@ static void gpu_loop_for(double gpu_sec_time, unsigned int num_kernels, double e
 {
	int next_gpu;
 
+	if (gpu_sec_time <= 0.0)
+		goto out;
	if (emergency_exit && wctime() > emergency_exit)
		goto out;
-	
+
	next_gpu = litmus_lock(TOKEN_LOCK);
	{
		MigrateIfNeeded(next_gpu);
@@ -934,7 +958,7 @@ static void gpu_loop_for(double gpu_sec_time, unsigned int num_kernels, double e
		if(SEND_SIZE > 0)
			chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE,
						cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks());
-		
+
		for(unsigned int i = 0; i < num_kernels; ++i)
		{
			if(useEngineLocks()) litmus_lock(cur_ee());
@@ -943,18 +967,18 @@ static void gpu_loop_for(double gpu_sec_time, unsigned int num_kernels, double e
			cudaStreamSynchronize(cur_stream());
			if(useEngineLocks()) litmus_unlock(cur_ee());
		}
-		
+
		if(RECV_SIZE > 0)
			chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE,
						cudaMemcpyDeviceToHost, CUR_DEVICE, useEngineLocks());
-		
+
		if (MIGRATE_VIA_SYSMEM)
			PullState();
	}
	litmus_unlock(TOKEN_LOCK);
-	
+
	last_gpu() = cur_gpu();
-	
+
 out:
	return;
 }
@@ -964,7 +988,14 @@ static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, do
	static int GPU_OFFSET = GPU_PARTITION * GPU_PARTITION_SIZE;
	static gpu_pool *pool = &GPU_LINUX_SEM_POOL[GPU_PARTITION];
	static pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION];
-	
+
+	int next_gpu;
+
+	if (gpu_sec_time <= 0.0)
+		goto out;
+	if (emergency_exit && wctime() > emergency_exit)
+		goto out;
+
 #ifdef VANILLA_LINUX
	static bool once = false;
	static cudaEvent_t start, end;
@@ -977,21 +1008,16 @@ static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, do
	}
 #endif
 
-	int next_gpu;
-
-	if (emergency_exit && wctime() > emergency_exit)
-		goto out;
-
	next_gpu = pool->get(mutex, cur_gpu() - GPU_OFFSET) + GPU_OFFSET;
	{
		MigrateIfNeeded(next_gpu);
-		
+
		unsigned int numcycles = ((unsigned int)(cur_hz() * gpu_sec_time))/num_kernels;
-		
+
		if(SEND_SIZE > 0)
			chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE,
						cudaMemcpyHostToDevice, cur_gpu(), useEngineLocks());
-		
+
		for(unsigned int i = 0; i < num_kernels; ++i)
		{
			/* one block per sm, one warp per block */
@@ -1004,7 +1030,7 @@ static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, do
			cudaEventSynchronize(end);
 #endif
			cudaStreamSynchronize(cur_stream());
-			
+
 #ifdef VANILLA_LINUX
			cudaEventElapsedTime(&ms, start, end);
			ms_sum += ms;
@@ -1013,18 +1039,18 @@ static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, do
 #ifdef VANILLA_LINUX
		++gpucount;
 #endif
-		
+
		if(RECV_SIZE > 0)
			chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE,
						cudaMemcpyDeviceToHost, cur_gpu(), useEngineLocks());
-		
+
		if (MIGRATE_VIA_SYSMEM)
			PullState();
	}
	pool->put(mutex, cur_gpu() - GPU_OFFSET);
-	
+
	last_gpu() = cur_gpu();
-	
+
 out:
	return;
 }
@@ -1131,15 +1157,20 @@ static int loop_once(void)
 
 static int loop_for(double exec_time, double emergency_exit)
 {
-	double last_loop = 0, loop_start;
	int tmp = 0;
+	double last_loop, loop_start;
+	double start, now;
+
+	if (exec_time <= 0.0)
+		goto out;
 
-	double start = cputime();
-	double now = cputime();
+	start = cputime();
+	now = cputime();
 
	if (emergency_exit && wctime() > emergency_exit)
		goto out;
 
+	last_loop = 0;
	while (now + last_loop < start + exec_time) {
		loop_start = now;
		tmp += loop_once();
@@ -1177,36 +1208,39 @@ static void debug_delay_loop(void)
	}
 }
 
-static int gpu_job(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end)
+typedef bool (*gpu_job_t)(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end);
+typedef bool (*cpu_job_t)(double exec_time, double program_end);
+
+static bool gpu_job(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end)
 {
	double chunk1, chunk2;
 
	if (wctime() > program_end) {
-		return 0;
+		return false;
	}
	else {
		chunk1 = exec_time * drand48();
		chunk2 = exec_time - chunk1;
-		
+
		loop_for(chunk1, program_end + 1);
		gpu_loop_for(gpu_exec_time, num_kernels, program_end + 1);
		loop_for(chunk2, program_end + 1);
-		
+
		sleep_next_period();
	}
-	return 1;
+	return true;
 }
 
-static int job(double exec_time, double program_end)
+static bool job(double exec_time, double program_end)
 {
	if (wctime() > program_end) {
-		return 0;
+		return false;
	}
	else {
		loop_for(exec_time, program_end + 1);
		sleep_next_period();
	}
-	return 1;
+	return true;
 }
 
 /*****************************/
@@ -1254,12 +1288,12 @@ static void init_linux()
	mlockall(MCL_CURRENT | MCL_FUTURE);
 }
 
-static int gpu_job_linux(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end)
+static bool gpu_job_linux(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end)
 {
	double chunk1, chunk2;
-	
+
	if (wctime() > program_end) {
-		return 0;
+		return false;
	}
	else {
		chunk1 = exec_time * drand48();
		chunk2 = exec_time - chunk1;
@@ -1268,22 +1302,22 @@ static int gpu_job_linux(double exec_time, double gpu_exec_time, unsigned int nu
		loop_for(chunk1, program_end + 1);
		gpu_loop_for_linux(gpu_exec_time, num_kernels, program_end + 1);
		loop_for(chunk2, program_end + 1);
-		
+
		sleep_next_period_linux();
	}
-	return 1;
+	return true;
 }
 
-static int job_linux(double exec_time, double program_end)
+static bool job_linux(double exec_time, double program_end)
 {
	if (wctime() > program_end) {
-		return 0;
+		return false;
	}
	else {
		loop_for(exec_time, program_end + 1);
		sleep_next_period_linux();
	}
-	return 1;
+	return true;
 }
 
 /*****************************/
@@ -1296,7 +1330,7 @@ enum eScheduler
 };
 
 #define CPU_OPTIONS "p:z:c:wlveio:f:s:q:X:L:Q:d:"
-#define GPU_OPTIONS "g:y:r:C:E:DG:xS:R:T:Z:aFm:b:MNIk:V"
+#define GPU_OPTIONS "g:y:r:C:E:DG:xS:R:T:Z:aFm:b:MNIk:VW:"
 
 // concat the option strings
 #define OPTSTR CPU_OPTIONS GPU_OPTIONS
@@ -1304,37 +1338,52 @@
 int main(int argc, char** argv)
 {
	int ret;
+
+	struct rt_task param;
+
	lt_t wcet;
	lt_t period;
-	double wcet_ms = -1, gpu_wcet_ms = -1, period_ms = -1;
+	lt_t budget;
+	double wcet_ms = -1.0;
+	double gpu_wcet_ms = 0.0;
+	double period_ms = -1.0;
+	double budget_ms = -1.0;
+
+	unsigned int num_kernels = 1;
+
+	budget_drain_policy_t drain = DRAIN_SIMPLE;
+	bool want_enforcement = false;
+	bool want_signals = false;
+	unsigned int priority = LITMUS_LOWEST_PRIORITY;
+
+	task_class_t cls = RT_CLASS_SOFT;
+
+	eScheduler scheduler = LITMUS;
+
	int num_gpu_users = 0;
	int migrate = 0;
	int cluster = 0;
	int cluster_size = 1;
-	int opt;
+
+	Normal<double> *wcet_dist_ms = NULL;
+	float stdpct = 0.0;
+
+	cpu_job_t cjobfn = NULL;
+	gpu_job_t gjobfn = NULL;
+
	int wait = 0;
+	double scale = 1.0;
	int test_loop = 0;
-	int column = 1;
-	const char *file = NULL;
-	int want_enforcement = 0;
-	int want_signals = 0;
+	double duration = 0, start = 0;
-	double *exec_times = NULL;
-	double scale = 1.0;
-	task_class_t cls = RT_CLASS_SOFT;
	int cur_job = 0, num_jobs = 0;
-	struct rt_task param;
+	int column = 1;
-	double budget_ms = -1.0;
-	lt_t budget;
-
-	int num_gpu_users = 0;
-	unsigned int num_kernels = 1;
+	int opt;
+
+	double *exec_times = NULL;
+	const char *file = NULL;
 
-	budget_drain_policy_t drain = DRAIN_SIMPLE;
-
-	eScheduler scheduler = LITMUS;
-
	/* locking */
//	int lock_od = -1;
//	int resource_id = 0;
@@ -1414,7 +1463,7 @@ int main(int argc, char** argv)
			MIGRATE_VIA_SYSMEM = true;
			break;
		case 'm':
-			num_gpu_users = atoi(optarg);
+			num_gpu_users = (int)atoi(optarg);
			assert(num_gpu_users > 0);
			break;
		case 'k':
@@ -1423,6 +1472,9 @@ int main(int argc, char** argv)
		case 'b':
			budget_ms = atoi(optarg);
			break;
+		case 'W':
+			stdpct = atof(optarg);
+			break;
		case 'N':
			scheduler = LINUX;
			break;
@@ -1438,10 +1490,10 @@ int main(int argc, char** argv)
				usage("Unknown task class.");
			break;
		case 'e':
-			want_enforcement = 1;
+			want_enforcement = true;
			break;
		case 'i':
-			want_signals = 1;
+			want_signals = true;
			break;
		case 'd':
			drain = (budget_drain_policy_t)atoi(optarg);
			break;
@@ -1489,27 +1541,34 @@ int main(int argc, char** argv)
	assert(scheduler != LITMUS);
	assert(!wait);
 #endif
-	
+
+	assert(stdpct >= 0.0);
+
+	if (MIGRATE_VIA_SYSMEM)
+		assert(GPU_PARTITION_SIZE != 1);
+
	// turn off some features to be safe
	if (scheduler != LITMUS)
	{
		RHO = 0;
		USE_ENGINE_LOCKS = false;
		USE_DYNAMIC_GROUP_LOCKS = false;
-		ENABLE_AFFINITY = false;
		RELAX_FIFO_MAX_LEN = false;
-		ENABLE_RT_AUX_THREADS = false;	
+		ENABLE_RT_AUX_THREADS = false;
		budget_ms = -1.0;
-		want_enforcement = 0;
-		want_signals = 0;
-		
+		want_enforcement = false;
+		want_signals = false;
+
+		cjobfn = job_linux;
+		gjobfn = gpu_job_linux;
+
		if (scheduler == RT_LINUX)
		{
			struct sched_param fifoparams;
-			
+
			assert(priority >= sched_get_priority_min(SCHED_FIFO) &&
				   priority <= sched_get_priority_max(SCHED_FIFO));
-			
+
			memset(&fifoparams, 0, sizeof(fifoparams));
			fifoparams.sched_priority = priority;
			assert(0 == sched_setscheduler(getpid(), SCHED_FIFO, &fifoparams));
@@ -1517,16 +1576,19 @@ int main(int argc, char** argv)
	}
	else
	{
+		cjobfn = job;
+		gjobfn = gpu_job;
+
		if (!litmus_is_valid_fixed_prio(priority))
			usage("Invalid priority.");
	}
-	
+
	if (test_loop) {
		debug_delay_loop();
		return 0;
	}
 
-	srand(getpid());
+	srand(time(0));
 
	if (file) {
		get_exec_times(file, column, &num_jobs, &exec_times);
@@ -1548,7 +1610,7 @@ int main(int argc, char** argv)
	}
 
	if (argc - optind == 3) {
-		assert(!GPU_USING);	
+		assert(!GPU_USING);
		wcet_ms = atof(argv[optind + 0]);
		period_ms = atof(argv[optind + 1]);
		duration = atof(argv[optind + 2]);
@@ -1560,7 +1622,7 @@ int main(int argc, char** argv)
		period_ms = atof(argv[optind + 2]);
		duration = atof(argv[optind + 3]);
	}
-	
+
	wcet = ms2ns(wcet_ms);
	period = ms2ns(period_ms);
	if (wcet <= 0)
@@ -1579,7 +1641,29 @@ int main(int argc, char** argv)
		budget = ms2ns(budget_ms);
	else
		budget = wcet;
-	
+
+#if 0
+	// use upscale to determine breakdown utilization
+	// only scaling up CPU time for now.
+	double upscale = (double)period/(double)budget - 1.0;
+	upscale = std::min(std::max(0.0, upscale), 0.6); // at most 60%
+	wcet = wcet + wcet*upscale;
+	budget = budget + wcet*upscale;
+	wcet_ms = wcet_ms + wcet_ms*upscale;
+
+	// guard against floating-point rounding
+	if (budget < wcet)
+		budget = wcet;
+	if (budget > period)
+		budget = period;
+#endif
+
+	// randomize execution time according to a normal distribution
+	// centered around the desired execution time.
+	// standard deviation is a percentage of this average
+	wcet_dist_ms = new Normal<double>(wcet_ms + gpu_wcet_ms, (wcet_ms + gpu_wcet_ms) * stdpct);
+	wcet_dist_ms->seed((unsigned int)time(0));
+
	if (file && num_jobs > 1)
		duration += period_ms * 0.001 * (num_jobs - 1);
@@ -1588,7 +1672,7 @@ int main(int argc, char** argv)
		if (ret < 0)
			bail_out("could not migrate to target partition or cluster.");
	}
-	
+
	if (scheduler != LITMUS)
	{
		// set some variables needed by linux modes
@@ -1612,17 +1696,19 @@ int main(int argc, char** argv)
			PRECISE_SIGNALS : NO_SIGNALS;
	param.drain_policy = drain;
	param.release_policy = PERIODIC;
-	
+
	if (migrate)
		param.cpu = cluster_to_first_cpu(cluster, cluster_size);
	ret = set_rt_task_param(gettid(), &param);
	if (ret < 0)
		bail_out("could not setup rt task params");
 
-	if (scheduler == LITMUS)
+	if (scheduler == LITMUS) {
		init_litmus();
-	else
+	}
+	else {
		init_linux();
+	}
 
	if (want_signals) {
		/* bind default longjmp signal handler to SIG_BUDGET. */
@@ -1640,16 +1726,16 @@ int main(int argc, char** argv)
	if (GPU_USING) {
		allocate_locks(num_gpu_users, scheduler != LITMUS);
-		
+
		signal(SIGABRT, catch_exit);
		signal(SIGTERM, catch_exit);
		signal(SIGQUIT, catch_exit);
		signal(SIGSEGV, catch_exit);
-		
+
		init_cuda(num_gpu_users);
		safetynet = true;
	}
-	
+
	if (scheduler == LITMUS)
	{
		ret = task_mode(LITMUS_RT_TASK);
@@ -1666,7 +1752,7 @@ int main(int argc, char** argv)
		ret = wait_for_ts_release2(&releaseTime);
		if (ret != 0)
			bail_out("wait_for_ts_release2()");
-		
+
		if (scheduler != LITMUS)
			log_release();
	}
@@ -1683,35 +1769,38 @@ int main(int argc, char** argv)
 
	start = wctime();
 
-	if (scheduler == LITMUS)
-	{
-		if (!GPU_USING) {
-			while (job(wcet_ms * 0.001 * scale, start + duration));
-		}
-		else {
-			while (gpu_job(wcet_ms * 0.001 * scale,
-						   gpu_wcet_ms * 0.001 * scale,
-						   num_kernels,
-						   start + duration));
-		}
+	if (!GPU_USING) {
+		bool keepgoing;
+		do
+		{
+			double job_ms = wcet_dist_ms->random();
+			if (job_ms < 0.0)
+				job_ms = 0.0;
+			keepgoing = cjobfn(ms2s(job_ms * scale), start + duration);
+		}while(keepgoing);
	}
-	else
-	{
-		if (!GPU_USING) {
-			while (job_linux(wcet_ms * 0.001 * scale, start + duration));
-		}
-		else {
-			while (gpu_job_linux(wcet_ms * 0.001 * scale,
-								 gpu_wcet_ms * 0.001 * scale,
-								 num_kernels,
-								 start + duration));
-		}
+	else {
+		bool keepgoing;
+		do
+		{
+			double job_ms = wcet_dist_ms->random();
+			if (job_ms < 0.0)
+				job_ms = 0.0;
+
+			double cpu_job_ms = (job_ms/(wcet_ms + gpu_wcet_ms))*wcet_ms;
+			double gpu_job_ms = (job_ms/(wcet_ms + gpu_wcet_ms))*gpu_wcet_ms;
+			keepgoing = gjobfn(
+					ms2s(cpu_job_ms * scale),
+					ms2s(gpu_job_ms * scale),
+					num_kernels,
+					start + duration);
+		}while(keepgoing);
	}
-	
+
	if (GPU_USING && ENABLE_RT_AUX_THREADS)
		if (disable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE) != 0)
			bail_out("disable_aux_rt_tasks() failed");
-	
+
//	if (file) {
//		/* use times read from the CSV file */
//		for (cur_job = 0; cur_job < num_jobs; ++cur_job) {
@@ -1740,7 +1829,10 @@ int main(int argc, char** argv)
//		printf("avg: %f\n", ms_sum/gpucount);
	}
-	
+
+	if (wcet_dist_ms)
+		delete wcet_dist_ms;
+
	if (file)
		free(exec_times);
diff --git a/src/migration.c b/src/migration.c
index 7ac320e..084b68c 100644
--- a/src/migration.c
+++ b/src/migration.c
@@ -66,6 +66,7 @@ int cluster_to_first_cpu(int cluster, int cluster_sz)
 static int setup_numa(pid_t tid, int sz, const cpu_set_t *cpus)
 {
	int nr_nodes;
+	int nr_cpus = num_online_cpus();
	struct bitmask* new_nodes;
	struct bitmask* old_nodes;
	int i;
@@ -78,7 +79,7 @@ static int setup_numa(pid_t tid, int sz, const cpu_set_t *cpus)
	new_nodes = numa_bitmask_alloc(nr_nodes);
	old_nodes = numa_bitmask_alloc(nr_nodes);
	/* map the cpu mask to a numa mask */
-	for (i = 0; i < sz; ++i) {
+	for (i = 0; i < nr_cpus; ++i) {
		if(CPU_ISSET_S(i, sz, cpus)) {
			numa_bitmask_setbit(new_nodes, numa_node_of_cpu(i));
		}
@@ -124,6 +125,7 @@ int be_migrate_thread_to_cpu(pid_t tid, int target_cpu)
 
	cpu_set = CPU_ALLOC(num_cpus);
	sz = CPU_ALLOC_SIZE(num_cpus);
+	CPU_ZERO_S(sz, cpu_set);
	CPU_SET_S(target_cpu, sz, cpu_set);
 
-- 
cgit v1.2.2
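
Appendix (editor's sketch, not part of the patch): drawing the randomized job times

The heart of this change is the switch from a fixed per-job execution time to a draw from ranlib::Normal (Blitz++ — hence the new -lblitz link flag and the new -W option): each job's total time is sampled from a normal distribution centered on wcet_ms + gpu_wcet_ms with standard deviation stdpct * (wcet_ms + gpu_wcet_ms), clamped at zero, and, for GPU-using tasks, split between CPU and GPU work in proportion to their configured shares. The standalone C++ sketch below illustrates the same scheme; it assumes Blitz++'s <random/normal.h> is available and uses only the ranlib calls the patch itself uses (the Normal(mean, stddev) constructor, seed(), and random()); the example budget values are hypothetical.

#include <cstdio>
#include <ctime>
#include <random/normal.h> // Blitz++ ranlib; build with -lblitz

int main(void)
{
	const double wcet_ms     = 8.0;  /* hypothetical CPU share of the budget */
	const double gpu_wcet_ms = 2.0;  /* hypothetical GPU share of the budget */
	const double stdpct      = 0.25; /* like "-W 0.25": sigma is 25% of the mean */

	/* distribution centered on the combined CPU+GPU execution time,
	 * exactly as gpuspin.cu constructs wcet_dist_ms */
	const double mean_ms = wcet_ms + gpu_wcet_ms;
	ranlib::Normal<double> dist(mean_ms, mean_ms * stdpct);
	dist.seed((unsigned int)time(0));

	for (int i = 0; i < 5; ++i) {
		double job_ms = dist.random();
		if (job_ms < 0.0) /* clamp: a normal draw can go negative */
			job_ms = 0.0;

		/* proportional CPU/GPU split, as in gpuspin.cu's main loop */
		double cpu_ms = (job_ms / mean_ms) * wcet_ms;
		double gpu_ms = (job_ms / mean_ms) * gpu_wcet_ms;
		printf("job %d: total=%6.3f ms (cpu=%6.3f, gpu=%6.3f)\n",
		       i, job_ms, cpu_ms, gpu_ms);
	}
	return 0;
}

Note that roughly half of all draws land above the mean, so wcet_ms here acts as an average execution time rather than a true worst case; the clamp only guards against negative draws, not against overrunning the declared budget.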