/*
 * This library implements a transparent extension of the NVIDIA runtime
 * API (libcudart) that is dynamically linked with CUDA programs.  This
 * extension provides a "middleware" scheduling infrastructure that controls
 * CUDA kernel launch requests.  It is designed to control kernel scheduling 
 * for CUDA programs with the following characteristics commonly used for 
 * concurrent GPU sharing:
 *  - A main process that creates multiple threads (pthreads) sharing a 
 *    single process address space (i.e., the conditions under which
 *    kernels can run concurrently on a GPU).
 *  - Each thread creates one user-defined CUDA stream (FIFO queue)
 *    that it manages and uses for invoking GPU operations.  There is a
 *    one-to-one relationship between threads and streams.
 *  - The program is written to launch kernels using the angle-brackets
 *    syntax (<<<.....>>>) and synchronizes the CPU and GPU with at least 
 *    one call to cudaStreamSynchronize() between successive instances of
 *    kernel launches in a given stream.
 *  - The CUDA program is dynamically linked with the CUDA library libcudart
 *
 * In the case of a CUDA program with multiple user-defined streams, the NVIDIA 
 * scheduling rules for choosing among multiple streams with kernels at the
 * top of their FIFO queues are not documented.  This middleware attempts to
 * implement and control some of the scheduling choices that can be made.
 * 
 * The library functions are transparently invoked by "wrapping" calls
 * to certain of the original CUDA API functions (described below) and
 * performing scheduling choices before or after invoking the "real" CUDA
 * code.  Control over which kernel launch requests can be presented to
 * the NVIDIA software and hardware scheduling mechanisms is achieved
 * by blocking and signaling operations on the program threads.
 *
 * The new library functions were designed following the fundamental
 * principle of separation between mechanism and policy.  Most of the library
 * is for implementing the mechanisms that are required for any policy.
 * Many scheduling policies are possible given adequate mechanisms for
 * carrying out a given policy.  The separation of mechanism and policy
 * makes it easy to try out and evaluate different policies.  In the library
 * code, all aspects of policy are implemented in a single function,
 * find_next_kernel(), which returns either an identifier for a stream
 * to launch a kernel or -1 to indicate that no new launch is allowed.
 * The policy functions are intended to be implemented as instances
 * of the find_next_kernel() function each contained in a .h file named
 * in a #include statement. 
 *
 * For a given thread/stream, the basic sequence of actions is:
 *  - The program creates a user-defined stream using one of the calls
 *    cudaStreamCreate(), cudaStreamCreateWithFlags(), or 
 *    cudaStreamCreateWithPriority().  These calls first invoke the
 *    new library function streamInit() that initializes state about the
 *    stream and, on the first call, initializes state about the GPU.  
 *    The "real" cuda runtime code for creating a stream is then executed.
 *
 *  - A CUDA kernel launch on a created stream using the angle-brackets syntax 
 *    (<<<.....>>>) is compiled with nvcc by generating two API calls which 
 *    result in calls to the new library functions:
 *    - cudaConfigureCall() first calls the new library function 
 *      schedConfCall() which records configuration information (e.g., 
 *      number of blocks and threads) about the kernel about to be launched
 *      and then executes the "real" library code for configuring a launch.
 *    - cudaLaunch() first calls the new library function schedLaunch().
 *      This function implements a scheduling decision that determines
 *      which, if any, threads, including the one attempting a launch, should
 *      not be blocked so the actual launch can be allowed to happen.  For those
 *      that are already blocked, the corresponding condition is signaled.
 *      For the current launching thread, the thread is not blocked and it
 *      executes the "real" CUDA launch.  If the current launching thread
 *      should be blocked for later scheduling, the corresponding pthread
 *      condition wait is executed.  
 *       
 *  - The CUDA program synchronizes the CPU thread with a launched kernel
 *    using a cudaStreamSynchronize() call.  The "wrapper" for this function
 *    has a different sequence of operations.  It first executes the "real"
 *    synchronization function which may result in the CPU thread being 
 *    blocked until the kernel completes on the GPU.  When the "real" call
 *    returns (kernel completed), the new library function schedSync() is called.
 *    It implements a scheduling decision that determines whether any blocked 
 *    kernel launches can now be executed and, if so, signals the conditions 
 *    that are blocking the threads attempting to launch a kernel.  The kernel
 *    launch is then handled by the "real" NVIDIA scheduling functions.
 *
 * The limitation inherent in this design is that the underlying Linux scheduler
 * actually determines the order in which blocked threads run when unblocked.
 * In the case that multiple blocked threads are signalled, the actual order of
 * launches depends on how the Linux scheduler orders the thread dispatching.
 * Only in the case where the scheduling algorithm allows only one thread to launch
 * a kernel at a time (essentially eliminating any concurrency) can the launch
 * order be made completely deterministic.  In all other cases, the schedluer
 * can only control the set of kernels that are allowed to run concurrently, 
 * not the specific order in which they start executing.
 *
 * The new scheduling "middleware" is implemented as a library that is compiled 
 * and linked with the cuda wrappers as a dynamic-linked load module (see the Makefile). 
 * A process has one copy of this library program and all threads created by the 
 * process share the global state for the library.
 *
 * IMPORTANT: Assumes that a process will create threads only with the POSIX 
 * API call pthread_create() and not use a system call like clone() directly.
 * Also assumes that there is a one-to-one relationship between threads and
 * streams and that the Linux thread TID is sufficient to identify a stream.
 *
 * Note that all new library calls have a void return.  If a call returns, it
 * can be assumed that the call completed without encountering potential errors.  
 * If any error is identified, the process is terminated.
 *
 * Written by Don Smith, Department of Computer Science,
 * University of North Carolina at Chapel Hill.
 * 2019.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <time.h>
#include <errno.h>
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
	 
#include <pthread.h>

#define TRACE_ON  1   //change to 1 for producing trace of launch decisions, 0 for not
#define MAX_SCHED_TRACE 100000

#define MAX_STREAMS 4 // One per basic ARM core on TX2
#define MAX_GPU_BLOCKS 64  //Max blocks on 2 SM TX2
#define MAX_GPU_THREADS 4096 //Max threads on 2 SM TX2

#define min(a,b) ((a) <= (b) ? (a) : (b))
#define max(a,b) ((a) >= (b) ? (a) : (b))

int trc_idx = 0;
struct schedTrace *tr_ptr;
struct schedTrace {
  int stream[MAX_STREAMS];
  int stream_threads[MAX_STREAMS];
  int next;
  char type[4];
}SchedTrace[MAX_SCHED_TRACE];


int Initialized = 0;  //Only initialize GPU once -- set to 1 the first time
enum st_states {INIT, IDLE, PENDING, READY_LAUNCH, LAUNCHED}; //stream states
enum gpu_states {FREE, BUSY}; //gpu states

// this mutex is required to protect shared stream and GPU states
pthread_mutex_t sched_lock = PTHREAD_MUTEX_INITIALIZER;

struct stream {
  pid_t thread;  //the tid (Linux thread id) of the thread "owning" the stream
  void *stream;  //the CUDA runtime pointer of the stream (not currently used).
  int priority;  //stream priority from cudaCreateStreamPriority() or 0.
  enum st_states state;  //current stream state
  int blocks;        //number of blocks in kernel ready to launch
  int block_threads; //number of threads per block
  int look_count;    //for use in policy algorithms concerned with starvation
  pthread_mutex_t st_lock;  //required for using condition wait/signal on stream
  pthread_cond_t st_cond;   //condition variable for thread/stream block/signal
} Stream[MAX_STREAMS];    //a thread/stream is identified by an index (str_idx) in the array.

int stream_count = 0;  //number of streams that have been created
int next = 0;  //index of stream that can launch the next kernel

struct gpu {
  enum gpu_states GPU_state;  //current GPU state
  int threads_occupied;       //total threads allocated over both SMs
  int kernels_dispatched;     //number of kernels currently dispatched to SMs
  int streams[MAX_STREAMS];   //for kernel executing, its thread/stream tid, else 0
  int stream_threads[MAX_STREAMS];  //for kernel executing, its allocated threads, else 0
};

struct gpu GPU;

void free_gpu_threads(pid_t my_tid, int str_idx);
void gpu_exit(pid_t my_tid, int str_idx);
void alloc_gpu_threads(pid_t my_tid, int str_idx);
void gpu_run(pid_t my_tid, int str_idx);
int find_best_kernel(void);
void ready_launch(pid_t my_tid);
void schedule_next(pid_t my_tid);
void dispatch_next(int this_one, pid_t my_tid);
int get_stream(pid_t my_tid);
void show_gpu_state(void);
void show_stream_state(int this_one);


//Include here the .h file containing the scheduling policy implementation
//in funtion: int find_best_kernel(void)
#include "MinFitMinIntfR2.h"

/* Function called from library wrapper of cudaDeviceReset().
 * If any entries have been made in a trace of scheduling decisions
 * made by find_best_kernel(), they are formatted and written to
 * stdout.
 */
void tracePrint(void)
{
  int i, j;
  for (i = 0; i < trc_idx; i++) {
    fprintf(stderr, "%d %s %d ", i, SchedTrace[i].type, SchedTrace[i].next);
    for (j = 0; j < MAX_STREAMS; j++) {
      fprintf(stderr, "[%d, %d] ", SchedTrace[i].stream[j], SchedTrace[i].stream_threads[j]);
    }
    fprintf(stderr, "\n");
  }
}

/* Function called from library wrapper of cudaStreamCreateXXXX().
 * The stream structure at the current index into the Stream structure
 * array is initialized.  The GPU state for the created stream is 
 * also initialized.  Each invocation creates a new index 
 * by incrementing stream_count. On the first invocation, the part of
 * the GPU structure not specific to a stream is also initialized.
 */
void streamInit(pid_t my_tid, int priority) 
//my_tid is the thread creating a user-defined stream
{
  //WARNING: any flags are ignored.

  //printf("cudaStreamCreate TID %d\n", my_tid);
  //fflush(stdout);

  pthread_mutex_lock(&sched_lock);

    Stream[stream_count].thread = my_tid; //stream identified by tid of creating thread
    Stream[stream_count].priority = priority; 
    Stream[stream_count].state = INIT;
    Stream[stream_count].look_count = 0;

    //stream mutex and condition variable initialized to free
    pthread_mutex_init(&Stream[stream_count].st_lock, NULL);
    pthread_cond_init(&Stream[stream_count].st_cond, NULL);

    //initialize GPU state for this newly created stream
    GPU.streams[stream_count] = 0;  //no kernel from stream running
    GPU.stream_threads[stream_count] = 0; //no threads allocated

    stream_count += 1; //increment stream index

    if (Initialized == 0) { //initialize GPU state on first stream create
      GPU.threads_occupied = 0;
      GPU.kernels_dispatched = 0;
      GPU.GPU_state = FREE;

      //tr_ptr = (struct schedTrace *)mem_ptr;
      Initialized = 1;
    }

  pthread_mutex_unlock(&sched_lock);
}

/* Function called from the library wrapper of cudaConfigureCall()
 * generated from the <<<.....>>> kernel launch statement in the
 * CUDA program.  The stream state for the stream is initialized
 * with the block and threads/block counts for the kernel.
 */
void schedConfCall(pid_t my_tid, void *stream, int blocks, int threads) 
//my_tid is the thread/stream attempting to launch
{
  int str_idx;

  pthread_mutex_lock(&sched_lock);

  //printf("cudaConfigureCall TID %d stream %p blocks %d threads %d\n", 
  //	    my_tid, stream, blocks, threads);
  //fflush(stdout);

     // get the stream array index for the thread that "owns" this stream
     str_idx = get_stream(my_tid);
 
     //initialize state for the kernel that the thread is launching 
     Stream[str_idx].state = PENDING; //call configured but not launched
     Stream[str_idx].blocks = blocks; //total blocks in the kernel
     Stream[str_idx].block_threads = threads; //total threads per block

  pthread_mutex_unlock(&sched_lock);
}

/* Function called from the library wrapper of cudaLaunch()
 * generated from the <<<.....>>> kernel launch statement in the
 * CUDA program.  The stream state for the stream is changed to
 * show that the kernel is ready for launching.  The utility
 * function ready_launch() is called.  On return from ready_launch()
 * this function returns to the wrapper which then invokes the
 * "real" CUDA launch.  The return from ready_launch() is 
 * immediate in the case the scheduler determines that this kernel
 * can be launched.  The call may instead result in blocking 
 * the thread if the scheduler determines that the launch should 
 * be deferred.  When the blocking condition is signalled by the
 * scheduler, ready_launch() then returns to this function.
 */ 
void schedLaunch(pid_t my_tid) {//my_tid is the thread/stream attempting to launch
  int str_idx;

  pthread_mutex_lock(&sched_lock);

  //printf("cudaLaunch TID %d\n", my_tid);
  //fflush(stdout);

     // get the stream array index for the thread that "owns" this stream
     str_idx = get_stream(my_tid);

     Stream[str_idx].state = READY_LAUNCH; //kernel can be considered for scheduling

     //printf("TID %d Ready, Blocks %d Threads %d\n", my_tid, 
     //	    Stream[str_idx].blocks, Stream[str_idx].block_threads);
     //fflush(stdout);

     // ready_launch() is called with the sched_lock still held. The function will
     // either block (and the thread will run when signaled) or will return
     // immediately.  In either case, (a) the lock will have been unlocked, and 
     // (b) the kernal will be launched by the "real" CUDA launch.

     ready_launch(my_tid);   // thread/stream will launch on return or after blocking
}


/* Utility function called from schedLaunch().  It invokes the scheduling policy
 * function, find_best_kernel(), one or more times to determine which, if any,
 * streams have a kernel that is ready to launch and should be launched.  For
 * streams, other than the one that invoked schedLaunch(), having kernels that
 * should launch, the utility function dispatch_next() is called to unblock
 * their owning threads.  If the stream owned by the calling thread has a kernel
 * to launch, the sched_lock is released and the function just returns.  If the
 * kernel in the stream of the calling thread is to be deferred, sched_lock is
 * released and the thread blocks with a pthread_cond_wait on its stream condition.
 */
void ready_launch(pid_t my_tid) {
//my_tid is the thread/stream attempting to launch
  int str_idx, rc;
  int this_one;
  int will_block;  

  //Must be called with sched_lock held.  It must release the lock before
  //returning or blocking and then returning.

     // get the stream array index for the thread that "owns" this stream
     str_idx = get_stream(my_tid);

     will_block = 1; // will not block if a kernel scheduled on this stream 

     do {
       /*       
         if (TRACE_ON) {
            printf("TID %d find new kernel on Launch\n", my_tid);
            fflush(stdout);
         }
       */
	 //call the scheduling policy function.  It returns a stream index for
	 //a stream in the READY_LAUNCH state with a kernel to be launched now 
         //(returns -1 if none found)
         this_one = find_best_kernel();

         if (this_one == str_idx) {//kernel from calling thread can launch
	    will_block = 0;  // no block, just return
            alloc_gpu_threads(my_tid, str_idx);  //set up GPU state to launch
            gpu_run(my_tid, str_idx);
            Stream[str_idx].state = LAUNCHED;  //kernel has been scheduled
         }
         else {//kernel from a different thread/stream should be launched
               if (this_one >= 0)
		  dispatch_next(this_one, my_tid);  //set state and signal
	 }
     } while (this_one >= 0); //  -1 indicates no more kernel launches now
              
     //Must unlock so calling thread can return or block 
  pthread_mutex_unlock(&sched_lock);

     if (will_block == 0)
        return;  //allows launch from calling thread to take place

     // thread/stream must block until scheduler indicates its kernel can launch
     rc = pthread_mutex_lock(&Stream[str_idx].st_lock);
     if (rc != 0) {
         fprintf(stderr, "TID %d Failed - Locking Stream Mutex\n", my_tid);
         exit (-1);
     }
     rc = pthread_cond_wait(&Stream[str_idx].st_cond, &Stream[str_idx].st_lock);
     if (rc != 0) {
         fprintf(stderr, "TID %d Failed - Waiting Stream Condition\n", my_tid);
         exit (-1);
     }
     rc = pthread_mutex_unlock(&Stream[str_idx].st_lock);
     if (rc != 0) {
         fprintf(stderr, "TID %d Failed - Unlocking Stream Mutex\n", my_tid);
         exit (-1);
     }
}

/* Utility function called from ready_launch() and schedule_next() to set
 * state and signal a blocked thread/stream so it can execute the "real"
 * CUDA launch.
 */
void dispatch_next(int this_one, pid_t my_tid) {//my_tid is calling thread/stream
  //this_one is the stream index of the stream to launch a kernel
  pid_t new_tid;
  int rc;

  //Must be called with sched_lock held; will be unlocked by caller

      //new_tid is the thread/stream that has been scheduled for kernel launch
      new_tid = Stream[this_one].thread;

      alloc_gpu_threads(new_tid, this_one); //set up GPU state for launch
      gpu_run(new_tid, this_one);
      Stream[this_one].state = LAUNCHED; //kernel has been scheduled
       
     //signal the blocked thread/stream so it can execute the "real" launch
       rc = pthread_mutex_lock(&Stream[this_one].st_lock);
       if (rc != 0) {
           fprintf(stderr, "TID %d Failed - Locking Stream Mutex\n", my_tid);
           exit (-1);
       }
       rc = pthread_cond_signal(&Stream[this_one].st_cond);
       if (rc != 0) {
           fprintf(stderr, "TID %d Failed - Signaling Stream Condition\n", my_tid);
           exit (-1);
       }
       rc = pthread_mutex_unlock(&Stream[this_one].st_lock);
       if (rc != 0) {
           fprintf(stderr, "TID %d Failed - Unlocking Stream Mutex\n", my_tid);
           exit (-1);
       }
}

/* Function called from library wrapper of cudaStreamSynchronize().
 * This CUDA function provides an essential notification that a kernel has
 * completed execution on the GPU.  The CUDA program must be written so 
 * that it synchronizes the CPU and GPU with at least one call to 
 * cudaStreamSynchronize() between successive instances of kernel
 * launches on a given stream.
 *
 * The function sets the stream and GPU state to reflect the kernel's
 * completion which frees GPU resources for use to execute a new
 * kernel.  It then calls the utility function, schedule_next() to
 * schedule launches of any kernels the scheduling policy determines should
 * be eligible to run now.
 */
void schedSync(pid_t my_tid, void *stream) {
//my_tid is the thread/stream synchronizing the CPU with a GPU kernel completion
  int str_idx;

  pthread_mutex_lock(&sched_lock);

  //printf("cudaStreamSynchronize TID %d stream %p\n", my_tid, stream);
  //fflush(stdout);

     // get the stream array index for the thread that "owns" this stream
     str_idx = get_stream(my_tid);

     // if the stream is idle (does not have a kernel being executed), the
     // call is not related to kernel execution (e.g., is for an asynchronous
     // cudaMemcpy).  It can be ignored.
     if (Stream[str_idx].state == IDLE) {
  // unlock for return
  pthread_mutex_unlock(&sched_lock);
        return;
     }
     // still holding sched_lock here
     free_gpu_threads(my_tid, str_idx); //set up GPU state for kernel completion
     gpu_exit(my_tid, str_idx);
     
     Stream[str_idx].state = IDLE;   //set up stream state for kernel completion
     Stream[str_idx].blocks = 0;
     Stream[str_idx].block_threads = 0;

     // schedule_next is called with sched_lock held.  The function must 
     // release it before returning.
     schedule_next(my_tid); // which, if any, thread/stream should launch now?
}

/* Utility function called from schedSynch().  It invokes the scheduling policy
 * function, find_best_kernel(), one or more times to determine which, if any,
 * streams have a kernel that is ready to launch and should be launched.  For
 * streams having kernels that should launch, the utility function dispatch_next() 
 * is called to unblock their owning threads.  Note that the calling thread/stream
 * cannot have a kernel to schedule until it executes another launch.
 */
void schedule_next(pid_t my_tid) {
//my_tid is the thread/stream synchronizing the CPU with a GPU kernel completion
 
  int this_one;

  //Must be called with sched_lock held.  It must release the lock before
  //returning.

  do {
    /*
      if (TRACE_ON) {
         printf("TID %d find new kernel on Sync\n", my_tid);
         fflush(stdout);
      }
    */
    
      //call the scheduling policy function.  It returns a stream index for 
      //a stream in the READY_LAUNCH state with a kernel to be launched now 
      //(returns -1 if none found)
      this_one = find_best_kernel();

      if (this_one >= 0) 
          dispatch_next(this_one, my_tid);
  } while (this_one >= 0); //  -1 indicates no more kernel launches now
       
  pthread_mutex_unlock(&sched_lock);
}

/* Utility function called from schedSync() to free GPU threads for a
 * completed kernel.
 */
void free_gpu_threads(pid_t my_tid, int str_idx) {
  //str_idx is the stream index of the stream with a completed kernel
  int alloc_threads;

  //Must be called with sched_lock held

     //see alloc_gpu_threads() for a description of thread allocations
     alloc_threads = min(MAX_GPU_THREADS, Stream[str_idx].blocks * Stream[str_idx].block_threads); 
     GPU.threads_occupied -= alloc_threads;
     if (GPU.threads_occupied < 0) {
         fprintf(stderr, "TID %d Failed - GPU Threads < 0\n", my_tid);
         exit (-1);
     }
}

/* Utility function called from schedSync() to set GPU stream state for
 * a completed kernel.
 */
void gpu_exit(pid_t my_tid, int str_idx) {
  //str_idx is the stream index of the stream with a completed kernel

  //Must be called with sched_lock held

  //printf("GPU Kernel End %d Threads\n",GPU.stream_threads[str_idx]);

  GPU.streams[str_idx] = 0;
  GPU.stream_threads[str_idx] = 0;

  GPU.kernels_dispatched -= 1;
  if (GPU.kernels_dispatched < 0) {
      fprintf(stderr, "TID %d Failed - GPU Kernels < 0\n", my_tid);
      exit (-1);
  }

}

/* Utility function called from ready_launch() and dispatch_next() to
 * allocate GPU threads for a kernel scheduled for launching.  The total
 * number of threads required by the kernel is computed as the number
 * of blocks in the kernel * the number of threads per block.  If the
 * total threads is >= MAX_GPU_THREADS, the number of allocated threads
 * on the GPU is set to MAX_GPU_THREADS so all GPU threads are occupied
 * until the kernel completes.  
 * NOTE: Once a kernel is launched that occupies all the GPU threads, 
 * no additional kernels can launch until that kernel completes.  
 * This prevents the GPU from concurrently executing the last blocks of 
 * a current kernel with the first blocks of a newly dispatched kernel.
 */  
void alloc_gpu_threads(pid_t my_tid, int str_idx) {
  //str_idx is the stream index of the stream with a scheduled kernel

  int alloc_threads;

  //Must be called with sched_lock held

     alloc_threads = min(MAX_GPU_THREADS, Stream[str_idx].blocks * Stream[str_idx].block_threads); 
     GPU.threads_occupied += alloc_threads;
     if (GPU.threads_occupied > MAX_GPU_THREADS) {
         fprintf(stderr, "TID %d Failed - GPU Threads Exceeded\n", my_tid);
         exit (-1);
     }
}

/* Utility function called from ready_launch() and dispatch_next() to set
 * stream-related and kernel-dispatch state on the GPU for a kernel
 * scheduled for launching.
 */
void gpu_run(pid_t my_tid, int str_idx) {
  //str_idx is the stream index of the stream with a scheduled kernel


  //Must be called with sched_lock held

  GPU.streams[str_idx] = my_tid;

  //see alloc_gpu_threads for a description of thread allocations
  GPU.stream_threads[str_idx] = min(MAX_GPU_THREADS, 
                                Stream[str_idx].blocks * Stream[str_idx].block_threads);
  /*
  int i;

  if (TRACE_ON) {
     printf("GPU Thread Blocks [ ");
     for (i = 0; i < stream_count; i++) {
         if (GPU.stream_threads[i] != 0) 
             printf("%d ", GPU.stream_threads[i]);
     }
     printf("]\n");
  }
  */
  
  GPU.kernels_dispatched += 1;
  if (GPU.kernels_dispatched > stream_count) {
      fprintf(stderr, "TID %d Failed - GPU Kernels > streams\n", my_tid);
      exit (-1);
  }
}

/* Utility function used in multiple functions to find the index in the
 * stream array for the stream owned by the thread with TID of my_tid.
 * The stream must have been previously created (cudaStreamCreate()).
 */
int get_stream(pid_t my_tid)
{
  //Must be called with sched_lock held
 
  int i;

  for (i = 0; i < MAX_STREAMS; i++) {
    if (Stream[i].thread == my_tid)
        break;
  }
  if (i == MAX_STREAMS) {
      fprintf(stderr, "TID %d Failed - get_stream()", my_tid);
      exit (-1);
  }
  return i;
}