/* Copyright (c) 2019-21, Lawrence Livermore National Security, LLC. and other
 * Goulash project contributors LLNL-CODE-795383, All rights reserved.
 * For details about use and distribution, please read LICENSE and NOTICE from
 * the Goulash project repository: http://github.com/llnl/goulash
 * SPDX-License-Identifier: BSD-3-Clause
 */

/* Designed to allow direct performance comparisons between
 * naively written HIP/CUDA and OpenMP GPU offloading schemes
 * in a variety of coding styles and languages of a
 * parameterized embarrassingly parallel Rush Larsen kernel.
 * Also allows testing build systems (including SPACK) handling
 * of complicated build situations that LLNL cares about.
 *
 * Designed to create several single file test variants
 * where no -D options are required to select the variant
 * and no include files are needed.
 * Almost all code in this file is identical between
 * variants (this is intentional).
 * MPI support is ifdefed out for non-MPI variants.
 *
 * The key computational kernel can be located by
 * searching for: KERNEL
 *
 * Designed to create trivial variants of the same
 * test to be compiled with different compilers for
 * the interoperability tests.  This is why most
 * functions are static and the main kernel test
 * is called rush_larsen_gpu_hip (preprocessor
 * defines this and VARIANT_DESC based on
 * variant configuration).
 *
 * The naming convention of the variant copies is
 * intended to indicate variant abilities:
 *   _cpu_serial         - single threaded, no OpenMP, on CPU
 *   _cpu_omp            - use OpenMP to spawn threads on CPU
 *   _gpu_omp            - uses OpenMP to offload to GPU
 *   _gpu_hip            - uses HIP to offload to AMD or Nvidia GPU
 *   _gpu_lambda_hip     - RAJA-like lambda HIP variant
 *   _gpu_cuda           - uses CUDA to offload to Nvidia GPU
 *   _gpu_lambda_cuda    - RAJA-like lambda CUDA variant
 *   *_mpi               - uses and exercises MPI e.g. _gpu_omp_mpi
 *   *_fort              - Fortran version e.g. _gpu_omp_mpi_fort
 *   *_unified_hipmalloc - Same hipMalloced buffers used on CPU & GPU, no transfer
 *   *_unified_calloc    - Same calloced buffers used on CPU & GPU, no transfer
 *   *_gpu_aware_mpi     - uses and exercises gpu-aware MPI
 *
 * For the interop tests, there is an additional suffix
 * to indicate different copies of the same configuration
 * that are intended to be compiled by different compilers:
 *   _compiler1          - E.g., rush_larsen_gpu_omp_compiler1.cc
 *
 * VARIANT_DESC set by preprocessor directives to
 * the configuration of this file.
 *
 * Recommended that a -DCOMPILERID be set to the compiler used to compile each file:
 *
 *   /opt/rocm-4.0.1/llvm/bin/clang++ -o rush_larsen_cpu_omp -O3 -g "-DCOMPILERID=rocm-4.0.1" -fopenmp rush_larsen_cpu_omp.cc
 *
 * Run with no arguments for suggested arguments, for example:
 *   Usage: ./rush_larsen_cpu_omp  Iterations  Kernel_GBs_used
 *
 *   Measure serial launch overhead:  env OMP_NUM_THREADS=1 ./rush_larsen_cpu_omp 100000 .00000001
 *   Measure thread launch overhead:  ./rush_larsen_cpu_omp 100000 .00000001
 *   Measure kernel performance:      ./rush_larsen_cpu_omp 100 10
 *
 * The Goulash project conceived of and designed by David Richards,
 * Tom Scogland, and John Gyllenhaal at LLNL Oct 2019.
 * Please contact John Gyllenhaal (gyllenhaal1@llnl.gov) with questions.
 *
 * Rush Larsen core CUDA/OpenMP kernels written by Rob Blake (LLNL) Sept 2016.
 * The goulash Rush Larsen tests add benchmarking infrastructure
 * around this incredibly useful compact GPU test kernel.  Thank you Rob!
 *
 * Inline performance measurements added (nvprof not needed)
 * by John Gyllenhaal at LLNL 11/10/20.
 *
 * Command line argument handling, performance difference printing in
 * form easily found with grep, OpenMP thread mapping and initial data
 * sanity checks on just the first array element calculated by kernel
 * by John Gyllenhaal at LLNL 03/22/21
 *
 * Pulled code from print_openmp_mapping.c by John Gyllenhaal at
 * LLNL written June 2020 which was based on mpibind tests
 * (https://github.com/LLNL/mpibind) by Edgar Leon at LLNL
 *
 * RAJA-perf-suite-like (https://github.com/LLNL/RAJAPerf)
 * C++ lambda versions created by Jason Burmark at LLNL 06/16/21
 *
 * C-like Fortran ports (by hand) of C++ version of
 * rush larsen variants by John Gyllenhaal at LLNL 06/28/21.
 *
 * MPI stat aggregation and MPI exercisers written
 * by John Gyllenhaal at LLNL based on previous user issue reproducers.
 * Pulled into rush larsen tests on 07/02/21 by John Gyllenhaal at LLNL.
 *
 * Enhanced data checks of all kernel generated array data, including
 * across MPI ranks by John Gyllenhaal at LLNL 07/03/21
 *
 * Interop versions created by John Gyllenhaal at LLNL 07/14/21
 * to test mixing all the Rush Larsen tests with multiple GPU compilers
 * all in one final executable.
 *
 * Initial test generator from template files, including Makefiles,
 * created by John Gyllenhaal at LLNL 07/21/21 for V2.0RC1 and V2.0
 *
 * Partial preprocessor script, finalize_source, created during hackathon that converts
 * #if defined() heavy benchmark templates into clean source files with all unused variations elided.
 * Only minor changes to templates required to match more restrictive partial preprocessor requirements.
 * The finalize_source script created by John Gyllenhaal at LLNL 08/13/21 and inserted into V2.1 (12/07/21)
 *
 * Rush_larsen tweaks V2.1-V2.6 done by John Gyllenhaal at LLNL, most recent tweaks 07/03/24
 *
 * V2.6    07/03/24 Added runtime estimates and additional status messages every 60 seconds to support longer runs
 *                  Added huge GPU memory support and unified CPU/GPU memory test configurations
 * V2.5    04/19/24 Make fortran rush_larsen versions compatible with flang-new
 * V2.4    01/26/24 Check all kernel launches (required for HIP) and print FILE/LINE for error messages
 * V2.3    09/29/23 Fix nCells bounds check in GPU kernel detected by GPU ASAN
 * V2.2    09/01/22 Tweaked finalize_source support and add mini goulash prototype
 * V2.1    12/07/21 Initial finalize_source support (a partial preprocessor that makes source more human readable)
 * V2.0    09/03/21 Minor fixes, more compiler makedefs
 * V2.0RC1 07/21/21 Added MPI support, interop version, enhanced data checks.
 * V1.2    06/28/21 Fortran and C++ lambda versions added, consistent use of long type
 * V1.1    03/22/21 command line args, perf diffs, maps, checks return codes and answer
 * V1.0    11/10/20 initial release, hard coded inputs, no error checking
 */

/* Allow version to be printed in output */
#define VERSION_STRING "Version 2.6 (7/03/24)"

/* If NO_STATIC defined, make all support routines non-static (visible) */
#ifdef NO_STATIC
#define STATIC
#else
#define STATIC static
#endif

/* Preprocessor macro rushglue(x,y) glues two defined values together */
#define rushglue2(x,y) x##y
#define rushglue(x,y) rushglue2(x,y)

/* Preprocessor macro rushxstr(s) converts value to string */
#define rushxstr2(s) #s
#define rushxstr(s) rushxstr2(s)

/* Preprocessor macro to check return code from HIP calls using local scope err */
#define HIPCHECK(x) {hipError_t err=x; if (err != hipSuccess) punt("ERROR HIP failed: " rushxstr(x) ": %s at %s:%d", hipGetErrorString(err), __FILE__, __LINE__);}

#define HIPCHECK_LASTERROR(msg) \
    do { \
        hipError_t lastErr = hipGetLastError(); \
        if (lastErr != hipSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, hipGetErrorString(lastErr), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

/* For interop version, can #define VARIANT_ID 1, etc. to make different named kernels.
 * If not defined, make empty string */
#ifndef VARIANT_ID
#define VARIANT_ID
#endif

/* Create unique gpu_hip tag based on file #def and #undef
 * settings that is used to create the rush_larsen function
 * call name and to annotate key lines of output */
#undef TARGET_TAG
#define TARGET_TAG gpu_hip

/* Append _mpi to indicate using MPI (not appended in this non-MPI variant) */
#undef RUSH_MPI_TAG
#define RUSH_MPI_TAG TARGET_TAG

/* Append VARIANT_ID (may be empty) to get gpu_hip */
#undef CXX_VARIANT_TAG
#define CXX_VARIANT_TAG rushglue(RUSH_MPI_TAG,VARIANT_ID)

/* Generate VARIANT_DESC string that annotates the end of key output
 * lines spread across this whole file.  Uses C trick that
 *   "omp" " [" "g++" "]"
 * is equivalent to
 *   "omp [g++]"
 * Since I could not figure out how to create one big string
 * with the preprocessor.
 */
#ifdef COMPILERID
#define VARIANT_DESC rushxstr(gpu_hip) " [" rushxstr(COMPILERID) "]"
#else
#define VARIANT_DESC rushxstr(gpu_hip)
#endif
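
/* For illustration only (not used by the code): with the settings above
 * and, say, -DVARIANT_ID=_compiler1 -DCOMPILERID=rocm-4.0.1 on the compile
 * line, the macros roughly expand as:
 *   CXX_VARIANT_TAG -> rushglue(gpu_hip,_compiler1) -> gpu_hip_compiler1
 *   VARIANT_DESC    -> "gpu_hip" " [" "rocm-4.0.1" "]" -> "gpu_hip [rocm-4.0.1]"
 * which is how the interop copies get unique function names and how each
 * output line gets tagged with the variant and compiler that produced it.
 */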
#include "hip/hip_runtime.h"
#include <sys/time.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <math.h>

/* Get raw time in seconds as double (a large number).
 * Returns -1.0 on unexpected error.
 */
STATIC double get_raw_secs( void )
{
    struct timeval ts;
    int status;
    double raw_time;

    /* Get wall-clock time */
    /* status = getclock( CLOCK_REALTIME, &ts ); */
    status = gettimeofday( &ts, NULL );

    /* Return -1.0 on error */
    if( status != 0 )
        return -1.0;

    /* Convert structure to double (in seconds) (a large number) */
    raw_time = (double)ts.tv_sec + (double)ts.tv_usec * 1e-6;

    return (raw_time);
}

/* Returns base time.  If new_time >= 0,
 * sets base_time to new_time before returning.
 * Using this as access method to static variable
 * in a way I can trivially emulate in fortran.
 *
 * Note: Lock shouldn't be needed, since even if multiple
 * threads initialize this, it will be to basically
 * the same value.
 */
STATIC double get_base_time(double new_time)
{
    static double base_time = -1.0;

    /* If passed value >= 0, use as new base_time */
    if (new_time >= 0.0)
        base_time = new_time;

    return(base_time);
}

/* Returns time in seconds (double) since the first call to secs_elapsed
 * (i.e., the first call returns 0.0).
 */
STATIC double secs_elapsed( void )
{
    double new_time;
    double base_time;

    /* Get current raw time (a big number) */
    new_time = get_raw_secs();

    /* Get the offset since first time called (pass -1 to query) */
    base_time = get_base_time(-1.0);

    /* If base time not set (negative), set to current time (pass in positive secs) */
    if (base_time < 0.0)
        base_time = get_base_time(new_time);

    /* Return offset from first time called */
    return (new_time - base_time);
}

/* Works like vfprintf, except prefixes wall-clock time (using secs_elapsed)
 * and the difference since the last vfprintf.
 * Also flushes out after printing so messages appear immediately.
 * Used to implement printf_timestamp, punt, etc.
 */
STATIC double last_printf_timestamp=0.0;
STATIC void vfprintf_timestamp(FILE*out, const char * fmt, va_list args)
{
    char buf[4096];
    int rank = -1;   /* Don't print rank for serial runs */

    /* Get wall-clock time since first call to secs_elapsed */
    double sec = secs_elapsed();
    double diff = sec - last_printf_timestamp;
    last_printf_timestamp=sec;

    /* Print out passed message to big buffer */
    vsnprintf(buf, sizeof(buf), fmt, args);

    /* No MPI case */
    if (rank < 0)
    {
        /* Print out timestamp and diff seconds with buffer */
        fprintf(out, "%7.3f (%05.3fs): %s", sec, diff, buf);
    }
    /* MPI case, add rank */
    else
    {
        /* Print out timestamp and diff seconds and MPI rank with buffer */
        fprintf(out, "%3i: %7.3f (%05.3fs): %s", rank, sec, diff, buf);
    }

    /* Flush out, so message appears immediately */
    fflush(out);
}

/* Prints to stdout for all MPI ranks with timestamps and time diffs */
STATIC void printf_timestamp(const char * fmt, ...)
{
    va_list args;
    va_start(args, fmt);

    /* Use helper routine to actually do print and flush */
    vfprintf_timestamp(stdout,fmt,args);

    va_end(args);
}

/* Prints to stdout for only Rank 0 with timestamps and time diffs.
 * For all other MPI ranks, the message is thrown away.
 */
STATIC void rank0_printf_timestamp(const char * fmt, ...)
{
    int rank = 0;   /* Non-mpi runs always get printed out */
    va_list args;
    va_start(args, fmt);

    /* Only print if rank 0 (or non-MPI program) */
    if (rank == 0)
    {
        /* Use helper routine to actually do print and flush */
        vfprintf_timestamp(stdout,fmt,args);
    }

    va_end(args);
}

/* Prints to stderr (flushes stdout first) with timestamp and exits */
STATIC void punt(const char * fmt, ...)
{
    va_list args;
    va_start(args, fmt);

    /* Flush stdout, so pending message appears before punt message */
    fflush(stdout);

    /* Use helper routine to actually do print and flush */
    vfprintf_timestamp(stderr,fmt,args);

    va_end(args);

    /* Abort the program */
    exit(1);
}

/* The maximum number of threads supported */
#define MAX_SIZE 1024

/* Print kernel runtime stats and aggregate across MPI processes if necessary.
 * Prints one liner if not using MPI
 */
STATIC void print_runtime_stats(long iterations, double kernel_mem_used, double kernel_runtime, double transfer_runtime)
{
    /* NO MPI CASE - print one line */
    /* Print time stats */
    printf_timestamp("RUSHSTATS Rush Larsen %li %.8f %.4lf s %.2lf us/iter %.3f s datatrans %s\n",
                     iterations, kernel_mem_used, kernel_runtime,
                     (double)(kernel_runtime)*1000000.0/(double) iterations,
                     transfer_runtime, VARIANT_DESC);
}
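
/* For illustration: with the formats above, a non-MPI status line looks
 * roughly like
 *     12.345 (0.678s): Starting iteration ...
 * (elapsed seconds since the timer was zeroed, then seconds since the
 * previous message).  The RUSHSTATS one-liner is the line intended to be
 * pulled out with grep, e.g.
 *     grep RUSHSTATS out.log
 * to compare kernel time, us/iteration, and data transfer time across
 * variants and compilers.
 */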
/* Do sanity and consistency checks on all of m_gate.  Including cross-rank if MPI mode.
 * Prints PASS or FAIL based on data check results.
 * If bad data found, will print up to 5 lines of debug info per MPI rank.
 * Returns fail count so it can be returned to caller.
 */
STATIC long data_check(double *m_gate, long iterations, double kernel_mem_used, long nCells)
{
    /* If in MPI mode, in order to prevent MPI hangs on data fails,
     * need to do MPI allreduces even if earlier checks fail.
     * As a bonus, this algorithm allows rank 0 to be able to
     * print out how many data check failures occurred across all ranks.
     */
    long fail_count = 0;

    rank0_printf_timestamp("Starting data check for sanity and consistency\n");

    /* Sanity check that calculation is not giving garbage.
     * Found m_gate[0] to be ~0.506796353074569 after 1 iteration (really 2 with warmup)
     * and it converges to 0.996321172062538 after 100 iterations.  Make sure in those bounds
     * for now, with a little slop (~.000001) for now (not sure how much rounding error to expect).
     */
    if (m_gate[0] < 0.506796)
    {
        printf_timestamp("ERROR Data sanity check m_gate[0]=%.15lf < 0.506796 (0.506796353074569 min expected value) %s\n", m_gate[0], VARIANT_DESC);
        fail_count++;
    }

    if (m_gate[0] > 0.996322)
    {
        printf_timestamp("ERROR Data sanity check m_gate[0]=%.15lf > 0.996322 (0.996321172062538 max expected value) %s\n", m_gate[0], VARIANT_DESC);
        fail_count++;
    }

    /* Every array entry should have the same value as m_gate[0], make sure that is true */
    for (long i = 1; i < nCells; i++)
    {
        if (m_gate[i] != m_gate[0])
        {
            fail_count++;
            /* Only print at most 5 warnings per rank */
            if (fail_count < 5)
            {
                printf_timestamp("ERROR Data consistency check m_gate[%ld]=%.15lf != m_gate[0]=%.15lf %s\n", i, m_gate[i], m_gate[0], VARIANT_DESC);
            }
            if (fail_count == 5)
            {
                printf_timestamp("ERROR Data consistency check REMAINING ERROR MESSAGES SUPPRESSED! %s\n", VARIANT_DESC);
            }
        }
    }

    /* Value looks ok, check all ranks match if using MPI */

    /* Print out summary PASSED or FAILED count from rank 0 only */
    if (fail_count == 0)
    {
        rank0_printf_timestamp("PASSED Data check %ld %.8f m_gate[0]=%.15lf %s\n",
                               iterations, kernel_mem_used, m_gate[0], VARIANT_DESC);
    }
    else
    {
        rank0_printf_timestamp("FAILED Data check %ld %.8f with %li DATA CHECK ERRORS m_gate[0]=%.15lf %s\n",
                               iterations, kernel_mem_used, fail_count, m_gate[0], VARIANT_DESC);
    }

    /* Return the number of data_check failures detected (across all ranks, if MPI mode) */
    return (fail_count);
}

/* Make sure the GPU works with a trivial HIP allocation before doing the test */
STATIC void verify_gpu_hip(int gpu_id)
{
    char mpi_desc[50]="";
    double *first_test_GPU_data;

    rank0_printf_timestamp("Selecting GPU %i as default device%s\n", gpu_id, mpi_desc);

    /* Pick GPU to use to exercise selection call */
    HIPCHECK(hipSetDevice(gpu_id));

    rank0_printf_timestamp("Launching HIP GPU hipMalloc test%s\n", mpi_desc);

    /* Make sure GPU is not totally broken */
    HIPCHECK(hipMalloc(&first_test_GPU_data, sizeof(double)*4));
    HIPCHECK(hipFree(first_test_GPU_data));

    rank0_printf_timestamp("Verified hipMalloc worked on GPU%s\n", mpi_desc);
}

/* Returns secs_elapsed after MPI barrier (if MPI) and printing desc to rank 0 */
STATIC double sync_starttime(const char *desc)
{
    double start_time;

    rank0_printf_timestamp("%s", desc);
    start_time=secs_elapsed();

    return (start_time);
}

/* Returns secs_elapsed before MPI barrier (if MPI) and printing desc to rank 0 */
STATIC double sync_endtime(const char *desc)
{
    double end_time;

    end_time=secs_elapsed();
    rank0_printf_timestamp("%s", desc);

    return (end_time);
}
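
/* Note on the indexing scheme in the kernel below (explanatory only): the
 * launch set up later in rush_larsen_gpu_hip uses a 2D grid, presumably to
 * stay well below per-dimension grid size limits while still covering very
 * large arrays.  Each thread flattens its position into a single 64-bit
 * cell index:
 *     blockId = blockIdx.y * gridDim.x + blockIdx.x
 *     ii      = blockId * blockDim.x + threadIdx.x
 * and the ii >= nCells check discards the excess threads in the last
 * partial block(s), which is the bounds check the V2.3 notes mention.
 */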
STATIC __global__ void rush_larsen_gpu_kernel(double* m_gate, const long nCells, const double* Vm)
{
    /* Now using 2D grid to enable huge memory footprints */
    long blockId = blockIdx.y * gridDim.x + blockIdx.x;
    long ii = blockId*blockDim.x + threadIdx.x;
    if (ii >= nCells)
    {
        return;
    }

    /* Identical contents to the loop body below */
    double sum1,sum2;
    const double x = Vm[ii];
    const int Mhu_l = 10;
    const int Mhu_m = 5;
    const double Mhu_a[] = { 9.9632117206253790e-01,  4.0825738726469545e-02,  6.3401613233199589e-04,  4.4158436861700431e-06,  1.1622058324043520e-08,
                             1.0000000000000000e+00,  4.0568375699663400e-02,  6.4216825832642788e-04,  4.2661664422410096e-06,  1.3559930396321903e-08,
                            -1.3573468728873069e-11, -4.2594802366702580e-13,  7.6779952208246166e-15,  1.4260675804433780e-16, -2.6656212072499249e-18};

    /* Evaluate numerator polynomial (first Mhu_m coefficients) via Horner's rule */
    sum1 = 0;
    for (int j = Mhu_m-1; j >= 0; j--)
        sum1 = Mhu_a[j] + x*sum1;

    /* Evaluate denominator polynomial (remaining Mhu_l coefficients) via Horner's rule */
    sum2 = 0;
    int k = Mhu_m + Mhu_l - 1;
    for (int j = k; j >= Mhu_m; j--)
        sum2 = Mhu_a[j] + x * sum2;
    double mhu = sum1/sum2;

    const int Tau_m = 18;
    const double Tau_a[] = { 1.7765862602413648e+01*0.02,  5.0010202770602419e-02*0.02, -7.8002064070783474e-04*0.02, -6.9399661775931530e-05*0.02,
                             1.6936588308244311e-06*0.02,  5.4629017090963798e-07*0.02, -1.3805420990037933e-08*0.02, -8.0678945216155694e-10*0.02,
                             1.6209833004622630e-11*0.02,  6.5130101230170358e-13*0.02, -6.9931705949674988e-15*0.02, -3.1161210504114690e-16*0.02,
                             5.0166191902609083e-19*0.02,  7.8608831661430381e-20*0.02,  4.3936315597226053e-22*0.02, -7.0535966258003289e-24*0.02,
                            -9.0473475495087118e-26*0.02, -2.9878427692323621e-28*0.02,  1.0000000000000000e+00};

    sum1 = 0;
    for (int j = Tau_m-1; j >= 0; j--)
        sum1 = Tau_a[j] + x*sum1;
    double tauR = sum1;

    m_gate[ii] += (mhu - m_gate[ii])*(1-exp(-tauR));
}
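
/* Background on the kernel math above (explanatory only): the two
 * rational-polynomial fits give the steady-state value mhu = m_inf(Vm)
 * and the scaled rate tauR for the fast sodium m-gate, and the final update
 *     m += (m_inf - m) * (1 - exp(-tauR))
 * is the Rush-Larsen scheme: the exact solution of the linearized gating
 * ODE dm/dt = (m_inf - m)/tau over one time step, which remains stable at
 * much larger time steps than forward Euler.  The polynomials are evaluated
 * with Horner's rule; e.g. for coefficients a[0..2]:
 *     sum = a[2];  sum = a[1] + x*sum;  sum = a[0] + x*sum;
 * computes a[0] + a[1]*x + a[2]*x^2 with one multiply-add per coefficient.
 */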
/* Sets up and runs the doRushLarsen kernel 'iterations' times,
 * allocating CPU arrays and perhaps GPU arrays to consume
 * kernel_mem_used GBs of memory.
 *
 * This polynomial is a fit to the dynamics of a small part of a cardiac
 * myocyte, specifically the fast sodium m-gate described here:
 * https://www.ncbi.nlm.nih.gov/pubmed/16565318
 *
 * Does exactly the same work on every cell.  Can scale from one cell
 * to filling entire memory.  Does use cell's value as input
 * to calculations.
 *
 * Returns number of data check failures, returns 0 if all data checks out.
 */
extern "C" long rush_larsen_gpu_hip(long iterations, double kernel_mem_used)
{
    double kernel_starttime,kernel_endtime, kernel_runtime;
    double transfer_starttime,transfer_endtime, transfer_runtime;
    double last_notify_time = 0.0;
    double warmup_starttime, warmup_endtime;
    double elapsed_secs, avg_secs_per_iter, predicted_secs;
    long nCells;
    long status_point;
    long fail_count =0;

    /* To make interop performance easier to compare,
     * start this file's timers over every time called.
     * Reset this file's secs_elapsed() counter to 0.
     */
    get_base_time(get_raw_secs());

    /* Synchronize printf timestamps across MPI ranks */
    last_printf_timestamp= secs_elapsed();

    /* Print separator before and after output with function name */
    rank0_printf_timestamp("--------------- Begin rush_larsen_" VARIANT_DESC " (timer zeroed) ---------------\n");

    /* For print niceness, make .00000001 lower bound on GB memory */
    if (kernel_mem_used < .00000001)
        kernel_mem_used = .00000001;

    /* Calculate nCells from memory usage target */
    nCells = (long) ((kernel_mem_used * 1024.0 * 1024.0 * 1024.0) / (sizeof(double) * 2));

    /* Must have at least 1 cell */
    if (nCells < 1)
        nCells = 1;

    /* Must have at least 1 iteration */
    if (iterations < 1)
        iterations=1;

    /* Give status every 10% of iterations */
    status_point=iterations/10;
    /* Must be at least 1 to make mod work */
    if (status_point < 1)
        status_point = 1;

    /* Print what we are running */
    rank0_printf_timestamp("START Rush Larsen %ld %.8f cells %ld %s\n",
                           iterations, kernel_mem_used, nCells, VARIANT_DESC);
    rank0_printf_timestamp("%s\n", VERSION_STRING);

    /* Make sure GPU works before doing test */
    verify_gpu_hip(0);

    rank0_printf_timestamp("Allocating and initializing kernel arrays with calloc\n");

    /* Using calloc for CPU memory allocations */
    double* m_gate = (double*)calloc(nCells,sizeof(double));
    if (m_gate == NULL)
    {
        punt("%s failed calloc m_gate",VARIANT_DESC);
    }

    double* Vm = (double*)calloc(nCells,sizeof(double));
    if (Vm == NULL)
    {
        punt("%s failed calloc Vm", VARIANT_DESC);
    }

    /* No data transfer time if not using GPU or if no transfer needed */
    transfer_starttime=0.0;
    transfer_endtime=0.0;

    double *gpu_m_gate, *gpu_Vm;

    /* Using hipMalloc for GPU-only memory allocations */
    sync_starttime("Starting hipMalloc of GPU arrays\n");
    HIPCHECK(hipMalloc(&gpu_m_gate, sizeof(double)*nCells));
    HIPCHECK(hipMalloc(&gpu_Vm, sizeof(double)*nCells));
    sync_endtime("Finished hipMalloc of GPU arrays\n");

    transfer_starttime=sync_starttime("Starting hipMemcpy of CPU arrays to GPU arrays\n");
    HIPCHECK(hipMemcpy(gpu_m_gate, m_gate, sizeof(double)*nCells, hipMemcpyHostToDevice));
    HIPCHECK(hipMemcpy(gpu_Vm, Vm, sizeof(double)*nCells, hipMemcpyHostToDevice));
    transfer_endtime=sync_endtime("Finished hipMemcpy of CPU arrays to GPU arrays\n");

    dim3 gridSize, blockSize;

    /* Switch to 2D grid and larger blocks to enable use of up to 1TB of memory on GPU */
    blockSize.x=1024;
    blockSize.y=1;
    blockSize.z=1;
    gridSize.x = (((nCells+7)/8) + blockSize.x-1) / (blockSize.x);
    gridSize.y=8;
    gridSize.z=1;
    rank0_printf_timestamp("blockSize.x = %i gridSize.x = %i gridSize.y = %i\n",
                           blockSize.x, gridSize.x, gridSize.y);

    transfer_runtime=transfer_endtime-transfer_starttime;
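
    /* Worked example of the sizing math above (illustrative numbers): with
     * Kernel_GBs_used = 10, the two double arrays give
     *     nCells = 10 * 2^30 / (8 * 2) = 671,088,640 cells
     * and the launch geometry becomes
     *     gridSize.x = ceil(ceil(671088640 / 8) / 1024) = 81,920
     *     gridSize.y = 8
     * so 81920 * 8 blocks * 1024 threads/block = 671,088,640 threads,
     * exactly one per cell in this case (any excess threads in the last
     * blocks are discarded by the ii >= nCells check in the kernel).
     */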
    /* Do the iterations asked for plus 1 for warmup */
    for (long itime=0; itime<=iterations; itime++)
    {
        /* Print warmup message for 0th iteration */
        if (itime == 0)
        {
            rank0_printf_timestamp("Launching warmup iteration (not included in kernel timings)\n");
            warmup_starttime=secs_elapsed();
        }
        /* Print status every 10% of iterations and after first iteration */
        else if ((((itime-1) % status_point) == 0) || (itime == 2))
        {
            if (itime==1)
            {
                /* Do not count any MPI tests into warmup time */
                warmup_endtime=secs_elapsed();
            }
            if (itime == 1)
            {
                rank0_printf_timestamp("Starting kernel timings for Rush Larsen %ld %.8f\n", iterations, kernel_mem_used);
                elapsed_secs=0.0;
                avg_secs_per_iter=(double)(warmup_endtime - warmup_starttime);
                predicted_secs=avg_secs_per_iter * (double)iterations;
                rank0_printf_timestamp("Starting iteration %6li (%.3lfm of %.3lfm est max runtime)\n",
                                       itime, (elapsed_secs/60.0), (predicted_secs/60.0));
            }
            else
            {
                elapsed_secs=secs_elapsed() - kernel_starttime;
                avg_secs_per_iter=elapsed_secs/(double)(itime-1);
                predicted_secs=avg_secs_per_iter * (double)iterations;
                rank0_printf_timestamp("Starting iteration %6li (%.3lfm of %.3lfm est runtime)\n",
                                       itime, (elapsed_secs/60.0), (predicted_secs/60.0));
            }
            last_notify_time=secs_elapsed();
        }

        /* Start timer after warm-up iteration 0 */
        if (itime==1)
        {
            kernel_starttime=secs_elapsed();
        }

        /* Print status if more than 1 minute has gone by since last status update */
        if (((secs_elapsed() - last_notify_time) >= 60.0) && (itime > 1))
        {
            elapsed_secs=secs_elapsed() - kernel_starttime;
            avg_secs_per_iter=elapsed_secs/(double)(itime-1);
            predicted_secs=avg_secs_per_iter * (double)iterations;
            rank0_printf_timestamp("Starting iteration %6li (%.3lfm of %.3lfm est runtime)\n",
                                   itime, (elapsed_secs/60.0), (predicted_secs/60.0));
            last_notify_time=secs_elapsed();
        }

        /*
         * RUSH LARSEN KERNEL BEING TIMED START
         */
        /* Launch the rush_larsen_gpu_kernel on the GPU using HIP */
        hipLaunchKernelGGL(rush_larsen_gpu_kernel, dim3(gridSize), dim3(blockSize), 0, 0,
                           gpu_m_gate, nCells, gpu_Vm);
        HIPCHECK_LASTERROR("Kernel execution failure");
        HIPCHECK(hipDeviceSynchronize());
        /*
         * RUSH LARSEN KERNEL BEING TIMED END
         */
    }

    /* Get time after all iterations */
    kernel_endtime=secs_elapsed();

    /* Calculate kernel runtime */
    kernel_runtime = kernel_endtime-kernel_starttime;
    rank0_printf_timestamp("Finished kernel timings for Rush Larsen %ld %.8f\n", iterations, kernel_mem_used);

    /* Print kernel runtime stats, syncs and aggregates across MPI ranks (if MPI mode) */
    print_runtime_stats(iterations, kernel_mem_used, kernel_runtime, transfer_runtime);

    /* Transfer GPU m_gate kernel memory to CPU kernel memory for data checks */
    rank0_printf_timestamp("Starting hipMemcpy of GPU result array to CPU array\n");
    HIPCHECK(hipMemcpy(m_gate, gpu_m_gate, sizeof(double)*nCells, hipMemcpyDeviceToHost));
    rank0_printf_timestamp("Finished hipMemcpy of GPU result array to CPU array\n");

    /* Do sanity and consistency checks on all of m_gate.  Including cross-rank if in MPI mode.
     * Prints PASS or FAIL based on data check results.
     * Returns fail count so it can be returned to caller.
     */
    fail_count = data_check(m_gate, iterations, kernel_mem_used, nCells);

    /* Free kernel GPU memory */
    HIPCHECK(hipFree(gpu_Vm));
    HIPCHECK(hipFree(gpu_m_gate));

    /* Free kernel CPU memory */
    free(Vm);
    free(m_gate);
    rank0_printf_timestamp("DONE Freed memory %s\n", VARIANT_DESC);

    /* Print separator before and after output */
    rank0_printf_timestamp("----------------- End rush_larsen_" VARIANT_DESC " ---------------\n");

    /* Return number of data check failures */
    return (fail_count);
}
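
/* Note on the timing protocol above (explanatory only): iteration 0 is an
 * untimed warmup, so one-time costs (GPU context setup, kernel load, first
 * touch of memory) stay out of the reported kernel time.  The
 * hipDeviceSynchronize() inside the timed region is what makes the
 * wall-clock numbers meaningful: kernel launches are asynchronous, so
 * without it the loop would mostly measure launch overhead rather than
 * kernel execution time.
 */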
/* Main driver (single test) when not being used in interop test */
int main(int argc, char* argv[])
{
    long max_iterations=1;
    double kernel_mem_used=0.0;
    int rank = 0;   /* Rank will be 0 for the no MPI case */
    int fail_count = 0;

    if (argc != 3)
    {
        if (rank == 0)
        {
            printf("Usage: %s Iterations Kernel_GBs_used\n", argv[0]);
            printf("\n");
            printf("Measure serial baseline small: %s 100000 .00000001\n", argv[0]);
            printf("Measure serial baseline large: %s 100 10\n", argv[0]);
            printf("\n");
            printf("%s\n", VERSION_STRING);
            printf("\n");
            printf("RUSH LARSEN VARIANT: rush_larsen_gpu_hip\n");
            printf("VARIANT_DESC: " VARIANT_DESC "\n");
            printf("\n");
            printf("Questions? Contact John Gyllenhaal (gyllenhaal1@llnl.gov)\n");
        }
        exit (1);
    }

    /* Get iteration count and target kernel memory used arguments */
    max_iterations = atol(argv[1]);
    kernel_mem_used = atof(argv[2]);

    /*
     * Do the rush larsen test with the specified configuration
     */
    fail_count = rush_larsen_gpu_hip(max_iterations, kernel_mem_used);

    /* Return 1 if data checks failed */
    if (fail_count == 0)
        return(0);
    else
        return(1);
}
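
/* Illustrative build and run (an untested sketch; adjust the compiler path
 * and version string to your installation, mirroring the cpu_omp example
 * in the header comment):
 *
 *   hipcc -o rush_larsen_gpu_hip -O3 -g "-DCOMPILERID=rocm-4.0.1" rush_larsen_gpu_hip.cc
 *
 *   ./rush_larsen_gpu_hip 100000 .00000001    (measure kernel launch overhead)
 *   ./rush_larsen_gpu_hip 100 10              (measure kernel performance)
 */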