fpga_synthesis merge

2020-06-23 12:41:26 -07:00
parent f034b4c63a c52030dee0
commit b56fb31a6a
1384 changed files with 5533630 additions and 791310 deletions
--- a/driver/Makefile
+++ b/driver/Makefile
@@ -0,0 +1,23 @@
+
+
+# Top-level driver build: every target delegates to the sub-directory of the same name.
+all: stub
+
+stub:
+	$(MAKE) -C stub
+
+opae:
+	$(MAKE) -C opae
+
+rtlsim:
+	$(MAKE) -C rtlsim
+
+simx:
+	$(MAKE) -C simx
+
+# Clean every backend, including the stub backend that `all` builds.
+clean:
+	$(MAKE) clean -C stub
+	$(MAKE) clean -C opae
+	$(MAKE) clean -C rtlsim
+	$(MAKE) clean -C simx
+
+# These targets share their names with directories, so they must be phony;
+# otherwise make sees the existing directory and considers the target up to date.
+.PHONY: all stub opae rtlsim simx clean
--- a/driver/common/vx_utils.cpp
+++ b/driver/common/vx_utils.cpp
@@ -0,0 +1,114 @@
+#include <iostream>
+#include <fstream>
+#include <cstring>
+#include <vortex.h>
+#include <VX_config.h>
+
+// Report a device capability value.
+// caps_id is one of the VX_CAPS_* ids; an unknown id prints a message to stdout
+// and aborts the process.
+extern int vx_dev_caps(int caps_id) {
+  if (VX_CAPS_VERSION == caps_id)
+    return 0;
+  if (VX_CAPS_MAX_CORES == caps_id)
+    return NUM_CORES;
+  if (VX_CAPS_MAX_WARPS == caps_id)
+    return NUM_WARPS;
+  if (VX_CAPS_MAX_THREADS == caps_id)
+    return NUM_THREADS;
+  if (VX_CAPS_CACHE_LINESIZE == caps_id)
+    return 64;
+  if (VX_CAPS_LOCAL_MEM_SIZE == caps_id)
+    return 0xffffffff;
+  if (VX_CAPS_ALLOC_BASE_ADDR == caps_id)
+    return 0x10000000;
+  if (VX_CAPS_KERNEL_BASE_ADDR == caps_id)
+    return 0x80000000;
+
+  // no capability matches this id
+  std::cout << "invalid caps id: " << caps_id << std::endl;
+  std::abort();
+  return 0;
+}
+
+// Copy a kernel image to the device.
+//
+// The `size` bytes at `content` are staged through a temporary 64 KiB shared
+// buffer and written chunk by chunk starting at the kernel base address.
+// Returns 0 on success, -1 for an empty input or a failed buffer allocation,
+// or the error code of the failing vx_copy_to_dev call.
+extern int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_t size) {
+  if (0 == size || NULL == content)
+    return -1;
+
+  const uint32_t staging_size = 65536;
+  const uint32_t base_addr = vx_dev_caps(VX_CAPS_KERNEL_BASE_ADDR);
+
+  // staging buffer shared with the device
+  vx_buffer_h staging;
+  if (vx_alloc_shared_mem(device, staging_size, &staging) != 0)
+    return -1;
+
+  // host view of the staging buffer
+  auto host_mem = (uint8_t*)vx_host_ptr(staging);
+  int status = 0;
+
+#if defined(USE_SIMX)
+  {
+    auto words = (uint32_t*)host_mem;
+
+    // default startup routine
+    words[0] = 0xf1401073;
+    words[1] = 0xf1401073;
+    words[2] = 0x30101073;
+    words[3] = 0x800000b7;
+    words[4] = 0x000080e7;
+    status = vx_copy_to_dev(staging, 0, 5 * 4, 0);
+
+    if (0 == status) {
+      // newlib io simulator trap
+      words[0] = 0x00008067;
+      status = vx_copy_to_dev(staging, 0x70000000, 4, 0);
+    }
+  }
+#endif
+
+  // upload content one staging-buffer-sized chunk at a time, stopping at the first error
+  for (size_t done = 0; 0 == status && done < size; ) {
+    const size_t len = std::min<size_t>(staging_size, size - done);
+    std::memcpy(host_mem, (uint8_t*)content + done, len);
+    status = vx_copy_to_dev(staging, base_addr + done, len, 0);
+    done += len;
+  }
+
+  // the staging buffer is released on every path that allocated it
+  vx_buf_release(staging);
+  return status;
+}
+
+// Read the whole file `filename` and upload its bytes as the kernel image.
+// Returns 0 on success, -1 when the file cannot be opened, is empty or cannot
+// be read completely, or the error code returned by vx_upload_kernel_bytes.
+extern int vx_upload_kernel_file(vx_device_h device, const char* filename) {
+  // binary mode: the image is raw bytes and must not go through text translation
+  std::ifstream ifs(filename, std::ios::binary);
+  if (!ifs) {
+    std::cout << "error: " << filename << " not found" << std::endl;
+    return -1;
+  }
+
+  // determine the file size; tellg() yields -1 on failure, which must never
+  // reach the array allocation below
+  ifs.seekg(0, ifs.end);
+  const std::streamoff size = ifs.tellg();
+  if (size <= 0)
+    return -1;
+  ifs.seekg(0, ifs.beg);
+
+  // read file content
+  auto content = new char [size];
+  int err = -1;
+
+  // upload only when the complete file was read
+  if (ifs.read(content, size))
+    err = vx_upload_kernel_bytes(device, content, static_cast<size_t>(size));
+
+  // release buffer
+  delete[] content;
+
+  return err;
+}
--- a/driver/include/vortex.h
+++ b/driver/include/vortex.h
@@ -0,0 +1,78 @@
+#ifndef __VX_DRIVER_H__
+#define __VX_DRIVER_H__
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void* vx_device_h;
+
+typedef void* vx_buffer_h;
+
+// device caps ids
+#define VX_CAPS_VERSION           0x0 
+#define VX_CAPS_MAX_CORES         0x1
+#define VX_CAPS_MAX_WARPS         0x2
+#define VX_CAPS_MAX_THREADS       0x3
+#define VX_CAPS_CACHE_LINESIZE    0x4
+#define VX_CAPS_LOCAL_MEM_SIZE    0x5
+#define VX_CAPS_ALLOC_BASE_ADDR   0x6
+#define VX_CAPS_KERNEL_BASE_ADDR  0x7
+
+// return device configurations
+int vx_dev_caps(int caps_id);
+
+// open the device and connect to it
+int vx_dev_open(vx_device_h* hdevice);
+
+// Close the device when all the operations are done
+int vx_dev_close(vx_device_h hdevice);
+
+// Allocate shared buffer with device
+int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer);
+
+// Get host pointer address  
+volatile void* vx_host_ptr(vx_buffer_h hbuffer);
+
+// release buffer
+int vx_buf_release(vx_buffer_h hbuffer);
+
+// allocate device memory and return address
+int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr);
+
+// Flush the device caches for the given device memory range
+int vx_flush_caches(vx_device_h hdevice, size_t dev_maddr, size_t size);
+
+// Copy bytes from buffer to device local memory
+int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset);
+
+// Copy bytes from device local memory to buffer
+int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dst_offset);
+
+// Start device execution
+int vx_start(vx_device_h hdevice);
+
+// Wait for device ready with milliseconds timeout
+int vx_ready_wait(vx_device_h hdevice, long long timeout);
+
+// set device constant registers
+int vx_set_regiters(int state, int value);
+
+// get device constant registers
+int vx_get_regiters(int state, int* value);
+
+////////////////////////////// UTILITY FUNCTIONS //////////////////////////////
+
+// upload kernel bytes to device
+int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_t size);
+
+// upload kernel file to device
+int vx_upload_kernel_file(vx_device_h device, const char* filename);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __VX_DRIVER_H__
--- a/driver/opae/Makefile
+++ b/driver/opae/Makefile
@@ -0,0 +1,69 @@
+
+CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors
+
+CXXFLAGS += -I../include -I/tools/opae/1.4.0/include  -I../../hw
+
+LDFLAGS += -L/tools/opae/1.4.0/lib
+
+# stack execution protection
+LDFLAGS +=-z noexecstack
+
+# data relocation and projection
+LDFLAGS +=-z relro -z now
+
+# stack buffer overrun detection
+CXXFLAGS +=-fstack-protector
+
+# Position independent code
+CXXFLAGS += -fPIC
+
+# Enable scope analyzer
+#CXXFLAGS += -DSCOPE
+
+LDFLAGS += -luuid
+
+LDFLAGS += -shared
+
+FPGA_LIBS += -lopae-c
+
+ASE_LIBS += -lopae-c-ase
+
+LIB_DIR=../lib
+
+ASE_DIR = ase
+
+PROJECT = libvortex.so
+
+PROJECT_ASE = $(ASE_DIR)/libvortex.so
+
+AFU_JSON_INFO = vortex_afu.h
+
+SRCS = vortex.cpp scope.cpp ../common/vx_utils.cpp
+
+all: $(PROJECT) $(PROJECT_ASE)
+
+# AFU info from JSON file, including AFU UUID
+$(AFU_JSON_INFO): ../../hw/opae/vortex_afu.json
+	afu_json_mgr json-info --afu-json=$^ --c-hdr=$@
+
+$(PROJECT): $(SRCS)
+	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) $(FPGA_LIBS) -o $@
+
+# ASE build of the driver. The output directory is an order-only prerequisite:
+# it must exist, but its timestamp must not force a rebuild.
+$(PROJECT_ASE): $(SRCS) | $(ASE_DIR)
+	$(CXX) $(CXXFLAGS) -DUSE_ASE $(SRCS) $(LDFLAGS) $(ASE_LIBS) -o $@
+
+vortex.o: vortex.cpp $(AFU_JSON_INFO)
+	$(CXX) $(CXXFLAGS) -c vortex.cpp -o $@
+
+$(ASE_DIR):
+	mkdir -p ase
+
+.depend: $(SRCS) $(AFU_JSON_INFO)
+	$(CXX) $(CXXFLAGS) -MM $(SRCS) > .depend;
+
+clean:
+	rm -rf $(PROJECT) $(PROJECT_ASE) $(AFU_JSON_INFO) *.o .depend 
+
+ifneq ($(MAKECMDGOALS),clean)
+    -include .depend
+endif
--- a/driver/opae/scope.cpp
+++ b/driver/opae/scope.cpp
@@ -0,0 +1,259 @@
+#include <iostream>
+#include <fstream>
+#include <thread>
+#include <chrono>
+#include <vector>
+#include <assert.h>
+#include <VX_config.h>
+#include "scope.h"
+#include "vortex_afu.h"
+
+#define CHECK_RES(_expr)                            \
+   do {                                             \
+     fpga_result res = _expr;                       \
+     if (res == FPGA_OK)                            \
+       break;                                       \
+     printf("OPAE Error: '%s' returned %d, %s!\n",  \
+            #_expr, (int)res, fpgaErrStr(res));     \
+     return -1;                                     \
+   } while (false)
+
+#define MMIO_CSR_SCOPE_CMD      (AFU_IMAGE_MMIO_CSR_SCOPE_CMD * 4)
+#define MMIO_CSR_SCOPE_DATA     (AFU_IMAGE_MMIO_CSR_SCOPE_DATA * 4)
+
+// One entry of the scope signal table: a named signal and its width.
+struct scope_signal_t {
+    int width;          // number of bits the signal occupies in a captured frame
+    const char* name;   // signal name written to the VCD variable declaration
+};
+
+// Compile-time floor(log2(n)) for n >= 1; any n <= 1 yields 0.
+constexpr int ilog2(int n) {
+    return (n <= 1) ? 0 : (ilog2(n / 2) + 1);
+}
+
+static constexpr int NW_BITS = ilog2(NUM_WARPS);
+
+static const scope_signal_t scope_signals[] = {
+
+    { 32, "dram_req_addr" },
+    { 1,  "dram_req_rw" },
+    { 16, "dram_req_byteen" },
+    { 32, "dram_req_data" },
+    { 29, "dram_req_tag" },
+    { 32, "dram_rsp_data" },
+    { 29, "dram_rsp_tag" }, 
+
+    { 32, "snp_req_addr" },
+    { 1,  "snp_req_invalidate" },
+    { 16, "snp_req_tag" },
+    { 16, "snp_rsp_tag" },    
+    
+    { NW_BITS, "icache_req_warp_num" },
+    { 32, "icache_req_addr" },    
+    { NW_BITS, "icache_req_tag" },  
+    { 32, "icache_rsp_data" },    
+    { NW_BITS, "icache_rsp_tag" },
+
+    { NW_BITS, "dcache_req_warp_num" },         
+    { 32, "dcache_req_curr_PC" },
+    { 32, "dcache_req_addr" },
+    { 1,  "dcache_req_rw" },
+    { 4,  "dcache_req_byteen" },
+    { 32, "dcache_req_data" },
+    { NW_BITS, "dcache_req_tag" },
+    { 32, "dcache_rsp_data" },    
+    { NW_BITS, "dcache_rsp_tag" }, 
+    
+    { NW_BITS, "decode_warp_num" },
+    { 32, "decode_curr_PC" },
+    { 1, "decode_is_jal" },
+    { 5, "decode_rs1" },
+    { 5, "decode_rs2" },    
+    
+    { NW_BITS, "execute_warp_num" },
+    { 5,  "execute_rd" },
+    { 32, "execute_a" },
+    { 32, "execute_b" },    
+    
+    { NW_BITS, "writeback_warp_num" },    
+    { 2,  "writeback_wb" },
+    { 5,  "writeback_rd" },
+    { 32, "writeback_data" },    
+
+    ///////////////////////////////////////////////////////////////////////////
+    
+    { 1, "dram_req_valid" },   
+    { 1, "dram_req_ready" },
+    { 1, "dram_rsp_valid" },
+    { 1, "dram_rsp_ready" },
+    
+    { 1, "snp_req_valid" },   
+    { 1, "snp_req_ready" },
+    { 1, "snp_rsp_valid" },
+    { 1, "snp_rsp_ready" },
+
+    { 1, "icache_req_valid" },
+    { 1, "icache_req_ready" },
+    { 1, "icache_rsp_valid" },
+    { 1, "icache_rsp_ready" },
+
+    { NUM_THREADS, "dcache_req_valid" },  
+    { 1, "dcache_req_ready" }, 
+    { NUM_THREADS, "dcache_rsp_valid" }, 
+    { 1, "dcache_rsp_ready" },
+    
+    { NUM_THREADS, "decode_valid" },
+    { NUM_THREADS, "execute_valid" },
+    { NUM_THREADS, "writeback_valid" },    
+    { 1, "schedule_delay" },
+    { 1, "memory_delay" },
+    { 1, "exec_delay" },
+    { 1, "gpr_stage_delay" },
+    { 1, "busy" },
+};
+
+static const int num_signals = sizeof(scope_signals) / sizeof(scope_signal_t);
+
+// Program the scope start delay.
+// A delay of uint64_t(-1) means "no delay given": nothing is written.
+// Returns 0 on success, -1 on a null handle or a failed MMIO write.
+int vx_scope_start(fpga_handle hfpga, uint64_t delay) {
+    if (nullptr == hfpga)
+        return -1;
+
+    if (uint64_t(-1) == delay)
+        return 0;
+
+    // set start delay: command code 4 in the low bits, delay above them
+    uint64_t cmd_delay = ((delay << 3) | 4);
+    CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, cmd_delay));
+    std::cout << "scope start delay: " << delay << std::endl;
+    return 0;
+}
+
+// Stop the scope recording and dump the captured trace to "vx_scope.vcd".
+//
+// When `delay` is not uint64_t(-1), a stop command carrying that delay is sent
+// first. The function then polls once per second until the scope reports valid
+// data, reads the frame width and frame count, and decodes every frame bit by
+// bit into VCD value changes. Returns 0 on success and -1 on a null handle or a
+// failed MMIO access; aborts when the reported frame width differs from the
+// total width of scope_signals.
+int vx_scope_stop(fpga_handle hfpga, uint64_t delay) {
+    if (nullptr == hfpga)
+        return -1;
+
+    if (delay != uint64_t(-1)) {
+        // stop recording
+        uint64_t cmd_stop = ((delay << 3) | 5);
+        CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, cmd_stop));
+        std::cout << "scope stop delay: " << delay << std::endl;
+    }
+
+    std::ofstream ofs("vx_scope.vcd");
+
+    // VCD header: timescale, the clock (id 0), then one variable per signal
+    // (ids 1..num_signals)
+    ofs << "$timescale 1 ns $end" << std::endl;
+    ofs << "$var reg 1 0 clk $end" << std::endl;
+
+    // fwidth accumulates the frame width implied by the signal table
+    int fwidth = 0;
+    for (int i = 0; i < num_signals; ++i) {
+        ofs << "$var reg " << scope_signals[i].width << " " << (i+1) << " " << scope_signals[i].name << " $end" << std::endl;
+        fwidth += scope_signals[i].width;
+    }
+
+    // the VCD keyword is "$enddefinitions"; without the '$' the declaration
+    // section is never terminated
+    ofs << "$enddefinitions $end" << std::endl;
+
+    uint64_t frame_width, max_frames, data_valid;
+
+    // wait for recording to terminate (command 0: the data register then
+    // reads back as the data-valid flag)
+    CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, 0));
+    do {
+        CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_CSR_SCOPE_DATA, &data_valid));
+        if (data_valid)
+            break;
+        std::this_thread::sleep_for(std::chrono::seconds(1));
+    } while (true);
+
+    std::cout << "scope trace dump begin..." << std::endl;
+
+    // command 2: read the frame width
+    CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, 2));
+    CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_CSR_SCOPE_DATA, &frame_width));
+    std::cout << "scope::frame_width=" << std::dec << frame_width << std::endl;
+
+    // command 3: read the number of captured frames
+    CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, 3));
+    CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_CSR_SCOPE_DATA, &max_frames));
+    std::cout << "scope::max_frames=" << std::dec << max_frames << std::endl;
+
+    // command 1: subsequent data reads return the trace stream
+    CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, 1));
+
+    if (fwidth != (int)frame_width) {
+        std::cerr << "invalid frame_width: expecting " << std::dec << fwidth << "!" << std::endl;
+        std::abort();
+    }
+    // scratch string for one signal value; +1 for the terminating null
+    std::vector<char> signal_data(frame_width+1);
+
+    uint64_t frame_offset = 0;   // bit position inside the current frame
+    uint64_t frame_no = 0;       // index of the frame being decoded
+    uint64_t timestamp = 0;      // VCD time; two ticks per clock cycle
+    int signal_id = 0;           // id of the signal being decoded (counts down)
+    int signal_offset = 0;       // bit position inside the current signal
+
+    // Emit one clock cycle, then read the frame's delta word and emit that many
+    // additional idle cycles; finally restart decoding at the last signal.
+    auto print_header = [&] () {
+        ofs << '#' << timestamp++ << std::endl;
+        ofs << "b0 0" << std::endl;
+        ofs << '#' << timestamp++ << std::endl;
+        ofs << "b1 0" << std::endl;
+
+        uint64_t delta;
+        fpga_result res = fpgaReadMMIO64(hfpga, 0, MMIO_CSR_SCOPE_DATA, &delta);
+        assert(res == FPGA_OK);
+
+        while (delta != 0) {
+            ofs << '#' << timestamp++ << std::endl;
+            ofs << "b0 0" << std::endl;
+            ofs << '#' << timestamp++ << std::endl;
+            ofs << "b1 0" << std::endl;
+            --delta;
+        }
+
+        signal_id = num_signals;
+    };
+
+    print_header();
+
+    do {
+        if (frame_no == (max_frames-1)) {
+            // verify last frame is valid
+            CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, 0));
+            CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_CSR_SCOPE_DATA, &data_valid));
+            assert(data_valid == 1);
+            CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, 1));
+        }
+
+        // next 64 bits of the current frame
+        uint64_t word;
+        CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_CSR_SCOPE_DATA, &word));
+
+        do {
+            int signal_width = scope_signals[signal_id-1].width;
+            int word_offset = frame_offset % 64;
+
+            // bits arrive least-significant first; store them reversed so the
+            // string reads most-significant bit first
+            signal_data[signal_width - signal_offset - 1] = ((word >> word_offset) & 0x1) ? '1' : '0';
+
+            ++signal_offset;
+            ++frame_offset;
+
+            if (signal_offset == signal_width) {
+                signal_data[signal_width] = 0; // string null termination
+                ofs << 'b' << signal_data.data() << ' ' << signal_id << std::endl;
+                signal_offset = 0;
+                --signal_id;
+            }
+
+            if (frame_offset == frame_width) {
+                // a frame always ends on a signal boundary
+                assert(0 == signal_offset);
+                frame_offset = 0;
+                ++frame_no;
+                if (frame_no != max_frames) {
+                    print_header();
+                }
+            }
+        } while ((frame_offset % 64) != 0);   // stop when the 64-bit word is consumed or the frame ended
+    } while (frame_no != max_frames);
+
+    std::cout << "scope trace dump done! - " << (timestamp/2) << " cycles" << std::endl;
+
+    // verify data not valid
+    CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, 0));
+    CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_CSR_SCOPE_DATA, &data_valid));
+    assert(data_valid == 0);
+
+    return 0;
+}
--- a/driver/opae/scope.h
+++ b/driver/opae/scope.h
@@ -0,0 +1,7 @@
+#pragma once
+
+#include <opae/fpga.h>
+
+int vx_scope_start(fpga_handle hfpga, uint64_t delay = -1);
+
+int vx_scope_stop(fpga_handle hfpga, uint64_t delay = -1);
--- a/driver/opae/vortex.cpp
+++ b/driver/opae/vortex.cpp
@@ -0,0 +1,408 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <assert.h>
+#include <cmath>
+#include <uuid/uuid.h>
+#include <opae/fpga.h>
+#include <vortex.h>
+#include "vortex_afu.h"
+#ifdef SCOPE
+#include "scope.h"
+#endif
+
+#define CHECK_RES(_expr)                                            \
+   do {                                                             \
+     fpga_result res = _expr;                                       \
+     if (res == FPGA_OK)                                            \
+       break;                                                       \
+     printf("OPAE Error: '%s' returned %d, %s!\n",                  \
+            #_expr, (int)res, fpgaErrStr(res));                     \
+     return -1;                                                     \
+   } while (false)
+
+///////////////////////////////////////////////////////////////////////////////
+
+#define CMD_TYPE_READ           AFU_IMAGE_CMD_TYPE_READ
+#define CMD_TYPE_WRITE          AFU_IMAGE_CMD_TYPE_WRITE
+#define CMD_TYPE_RUN            AFU_IMAGE_CMD_TYPE_RUN
+#define CMD_TYPE_CLFLUSH        AFU_IMAGE_CMD_TYPE_CLFLUSH
+
+#define MMIO_CSR_CMD            (AFU_IMAGE_MMIO_CSR_CMD * 4)
+#define MMIO_CSR_IO_ADDR        (AFU_IMAGE_MMIO_CSR_IO_ADDR * 4)
+#define MMIO_CSR_MEM_ADDR       (AFU_IMAGE_MMIO_CSR_MEM_ADDR * 4)
+#define MMIO_CSR_DATA_SIZE      (AFU_IMAGE_MMIO_CSR_DATA_SIZE * 4)
+#define MMIO_CSR_STATUS         (AFU_IMAGE_MMIO_CSR_STATUS * 4)
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Driver-side state of one opened device.
+typedef struct vx_device_ {
+    fpga_handle fpga;       // OPAE handle returned by fpgaOpen
+    size_t mem_allocation;  // next device address vx_alloc_dev_mem will hand out
+} vx_device_t;
+
+// Driver-side state of one shared host/device buffer.
+typedef struct vx_buffer_ {
+    uint64_t wsid;            // workspace id returned by fpgaPrepareBuffer
+    volatile void* host_ptr;  // host address of the buffer
+    uint64_t io_addr;         // address returned by fpgaGetIOAddress for this buffer
+    vx_device_h hdevice;      // device the buffer was allocated on
+    size_t size;              // buffer size, rounded up to whole cache lines
+} vx_buffer_t;
+
+// Round `size` up to the next multiple of `alignment`; `alignment` must be a power of two.
+inline size_t align_size(size_t size, size_t alignment) {
+    const size_t low_bits = alignment - 1;
+    assert(0 == (alignment & low_bits));
+    return (size + low_bits) & ~low_bits;
+}
+
+// Tell whether `addr` is a multiple of `alignment`; `alignment` must be a power of two.
+inline bool is_aligned(size_t addr, size_t alignment) {
+    const size_t low_bits = alignment - 1;
+    assert(0 == (alignment & low_bits));
+    return (addr & low_bits) == 0;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Locate the accelerator whose GUID is AFU_ACCEL_UUID, open it and return a new
+// device object through *hdevice.
+// Returns 0 on success; -1 (or the scope start error when SCOPE is enabled) on
+// failure. Every OPAE object acquired here is released on the failure paths,
+// and *hdevice is written only on success.
+extern int vx_dev_open(vx_device_h* hdevice) {
+    if (nullptr == hdevice)
+        return  -1;
+
+    // ensure that the block size 64
+    assert(64 == vx_dev_caps(VX_CAPS_CACHE_LINESIZE));
+
+    // Set up a filter that will search for an accelerator
+    fpga_properties filter = nullptr;
+    if (FPGA_OK != fpgaGetProperties(nullptr, &filter))
+        return -1;
+    fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR);
+
+    // Add the desired UUID to the filter
+    fpga_guid guid;
+    uuid_parse(AFU_ACCEL_UUID, guid);
+    fpgaPropertiesSetGUID(filter, guid);
+
+    // Do the search across the available FPGA contexts. num_matches starts at
+    // 0 so a failed enumeration can never be mistaken for a valid token.
+    fpga_token accel_token;
+    uint32_t num_matches = 0;
+    fpga_result res = fpgaEnumerate(&filter, 1, &accel_token, 1, &num_matches);
+
+    // Not needed anymore
+    fpgaDestroyProperties(&filter);
+
+    if (FPGA_OK != res || num_matches < 1) {
+        fprintf(stderr, "Accelerator %s not found!\n", AFU_ACCEL_UUID);
+        return -1;
+    }
+
+    // Open accelerator; the token is released whether or not the open succeeds
+    fpga_handle accel_handle;
+    res = fpgaOpen(accel_token, &accel_handle, 0);
+    fpgaDestroyToken(&accel_token);
+    if (FPGA_OK != res) {
+        return -1;
+    }
+
+    // allocate device object
+    vx_device_t* device = (vx_device_t*)malloc(sizeof(vx_device_t));
+    if (nullptr == device) {
+        fpgaClose(accel_handle);
+        return -1;
+    }
+
+    device->fpga = accel_handle;
+    device->mem_allocation = vx_dev_caps(VX_CAPS_ALLOC_BASE_ADDR);
+
+#ifdef SCOPE
+    {
+        int ret = vx_scope_start(device->fpga, 0);
+        if (ret != 0) {
+            // do not leak the handle or hand out a half-initialized device
+            fpgaClose(accel_handle);
+            free(device);
+            return ret;
+        }
+    }
+#endif
+
+    *hdevice = device;
+
+    return 0;
+}
+
+// Close the accelerator and free the device object.
+// With SCOPE enabled the scope is stopped first. Returns -1 for a null handle,
+// 0 otherwise.
+extern int vx_dev_close(vx_device_h hdevice) {
+    auto device = static_cast<vx_device_t*>(hdevice);
+    if (!device)
+        return -1;
+
+#ifdef SCOPE
+    vx_scope_stop(device->fpga, 0);
+#endif
+
+    fpgaClose(device->fpga);
+    free(device);
+    return 0;
+}
+
+// Reserve `size` bytes of device memory, rounded up to whole cache lines, from
+// a bump allocator and store the start address in *dev_maddr.
+// Returns 0 on success, -1 on invalid arguments or when the region would end
+// beyond the device memory size.
+extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) {
+    if (nullptr == hdevice)
+        return -1;
+    if (nullptr == dev_maddr)
+        return -1;
+    if (0 == size)
+        return -1;
+
+    auto dev = (vx_device_t*)hdevice;
+
+    const int cache_line = vx_dev_caps(VX_CAPS_CACHE_LINESIZE);
+    // NOTE(review): vx_dev_caps returns int, so this limit goes through an
+    // int-to-size_t conversion — confirm it carries the intended value.
+    const size_t mem_limit = vx_dev_caps(VX_CAPS_LOCAL_MEM_SIZE);
+
+    const size_t padded = align_size(size, cache_line);
+    const size_t start = dev->mem_allocation;
+    if (start + padded > mem_limit)
+        return -1;
+
+    // hand out the current mark and advance it
+    *dev_maddr = start;
+    dev->mem_allocation = start + padded;
+    return 0;
+}
+
+// Allocate a host buffer shared with the accelerator.
+// The requested size is rounded up to a whole number of cache lines. On success
+// a new buffer object is stored in *hbuffer and 0 is returned; otherwise -1.
+extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) {
+    if (nullptr == hdevice || nullptr == hbuffer || 0 == size)
+        return -1;
+
+    auto dev = (vx_device_t*)hdevice;
+    const int cache_line = vx_dev_caps(VX_CAPS_CACHE_LINESIZE);
+    const size_t padded = align_size(size, cache_line);
+
+    // ask OPAE for a buffer of the padded size
+    void* mapping;
+    uint64_t workspace_id;
+    if (FPGA_OK != fpgaPrepareBuffer(dev->fpga, padded, &mapping, &workspace_id, 0))
+        return -1;
+
+    // Get the physical address of the buffer in the accelerator
+    uint64_t accel_addr;
+    if (FPGA_OK != fpgaGetIOAddress(dev->fpga, workspace_id, &accel_addr)) {
+        fpgaReleaseBuffer(dev->fpga, workspace_id);
+        return -1;
+    }
+
+    // allocate buffer object
+    auto buf = (vx_buffer_t*)malloc(sizeof(vx_buffer_t));
+    if (nullptr == buf) {
+        fpgaReleaseBuffer(dev->fpga, workspace_id);
+        return -1;
+    }
+
+    buf->wsid     = workspace_id;
+    buf->host_ptr = mapping;
+    buf->io_addr  = accel_addr;
+    buf->hdevice  = hdevice;
+    buf->size     = padded;
+
+    *hbuffer = buf;
+    return 0;
+}
+
+// Return the host address of a shared buffer, or nullptr for a null handle.
+extern volatile void* vx_host_ptr(vx_buffer_h hbuffer) {
+    auto buf = static_cast<vx_buffer_t*>(hbuffer);
+    return buf ? buf->host_ptr : nullptr;
+}
+
+// Release a shared buffer and free its buffer object.
+// Returns -1 for a null handle, 0 otherwise.
+extern int vx_buf_release(vx_buffer_h hbuffer) {
+    auto buf = static_cast<vx_buffer_t*>(hbuffer);
+    if (!buf)
+        return -1;
+
+    // the buffer records the device it was allocated on
+    auto owner = static_cast<vx_device_t*>(buf->hdevice);
+    fpgaReleaseBuffer(owner->fpga, buf->wsid);
+    free(buf);
+    return 0;
+}
+
+// Poll the status CSR until it reads 0 or the timeout expires.
+// `timeout` is in milliseconds; a negative value waits indefinitely.
+// Returns -1 for a null handle or a failed MMIO read. A timeout is reported on
+// stdout and still returns 0, as before.
+extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
+    if (nullptr == hdevice)
+        return -1;
+
+    vx_device_t *device = ((vx_device_t*)hdevice);
+
+    uint64_t data = 0;
+    struct timespec sleep_time;
+
+    // polling interval: 1 s under ASE, 1 ms otherwise
+#if defined(USE_ASE)
+    sleep_time.tv_sec = 1;
+    sleep_time.tv_nsec = 0;
+#else
+    sleep_time.tv_sec = 0;
+    sleep_time.tv_nsec = 1000000;
+#endif
+
+    // to milliseconds
+    long long sleep_time_ms = (sleep_time.tv_sec * 1000) + (sleep_time.tv_nsec / 1000000);
+
+    const bool wait_forever = (timeout < 0);
+
+    for (;;) {
+        CHECK_RES(fpgaReadMMIO64(device->fpga, 0, MMIO_CSR_STATUS, &data));
+        if (0 == data)
+            break;
+        // "<= 0" rather than "== 0": a timeout that is not a multiple of the
+        // polling interval steps past zero and must still expire
+        if (!wait_forever && timeout <= 0) {
+            fprintf(stdout, "ready-wait timed out: status=%llu\n", (unsigned long long)data);
+            break;
+        }
+        nanosleep(&sleep_time, nullptr);
+        if (!wait_forever)
+            timeout -= sleep_time_ms;
+    }
+
+    return 0;
+}
+
+// Copy `size` bytes (rounded up to whole cache lines) from the shared buffer,
+// starting at `src_offset`, to device memory at `dev_maddr`.
+// Returns 0 on success; -1 on invalid arguments, misalignment, an out-of-range
+// transfer or an MMIO failure.
+extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) {
+    if (nullptr == hbuffer || 0 == size)
+        return -1;
+
+    auto buffer = (vx_buffer_t*)hbuffer;
+    auto device = (vx_device_t*)buffer->hdevice;
+
+    const int line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE);
+    const size_t dev_mem_size = vx_dev_caps(VX_CAPS_LOCAL_MEM_SIZE);
+    const size_t asize = align_size(size, line_size);
+
+    // both the device address and the buffer's I/O address must be line aligned
+    const bool aligned = is_aligned(dev_maddr, line_size)
+                      && is_aligned(buffer->io_addr + src_offset, line_size);
+    // the transfer must fit in the shared buffer and in device memory
+    const bool in_range = (src_offset + asize <= buffer->size)
+                       && (dev_maddr + asize <= dev_mem_size);
+    if (!aligned || !in_range)
+        return -1;
+
+    // Ensure ready for new command
+    if (vx_ready_wait(buffer->hdevice, -1) != 0)
+        return -1;
+
+    // addresses and sizes are programmed in units of cache lines
+    auto ls_shift = (int)std::log2(line_size);
+
+    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_IO_ADDR, (buffer->io_addr + src_offset) >> ls_shift));
+    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_MEM_ADDR, (dev_maddr >> ls_shift) ));
+    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA_SIZE, asize >> ls_shift));
+    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_WRITE));
+
+    // Wait for the write operation to finish
+    return (vx_ready_wait(buffer->hdevice, -1) != 0) ? -1 : 0;
+}
+
+// Copy `size` bytes (rounded up to whole cache lines) from device memory at
+// `dev_maddr` into the shared buffer, starting at `dest_offset`.
+// Returns 0 on success; -1 on invalid arguments, misalignment, an out-of-range
+// transfer or an MMIO failure.
+extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) {
+    if (nullptr == hbuffer || 0 == size)
+        return -1;
+
+    auto buffer = (vx_buffer_t*)hbuffer;
+    auto device = (vx_device_t*)buffer->hdevice;
+
+    const int line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE);
+    const size_t dev_mem_size = vx_dev_caps(VX_CAPS_LOCAL_MEM_SIZE);
+    const size_t asize = align_size(size, line_size);
+
+    // both the device address and the buffer's I/O address must be line aligned
+    const bool aligned = is_aligned(dev_maddr, line_size)
+                      && is_aligned(buffer->io_addr + dest_offset, line_size);
+    // the transfer must fit in the shared buffer and in device memory
+    const bool in_range = (dest_offset + asize <= buffer->size)
+                       && (dev_maddr + asize <= dev_mem_size);
+    if (!aligned || !in_range)
+        return -1;
+
+    // Ensure ready for new command
+    if (vx_ready_wait(buffer->hdevice, -1) != 0)
+        return -1;
+
+    // addresses and sizes are programmed in units of cache lines
+    auto ls_shift = (int)std::log2(line_size);
+
+    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_IO_ADDR, (buffer->io_addr + dest_offset) >> ls_shift));
+    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_MEM_ADDR, (dev_maddr) >> ls_shift));
+    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA_SIZE, asize >> ls_shift));
+    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_READ));
+
+    // Wait for the read operation to finish
+    return (vx_ready_wait(buffer->hdevice, -1) != 0) ? -1 : 0;
+}
+
+// Issue a cache flush command for `size` bytes (rounded up to whole cache
+// lines) of device memory starting at `dev_maddr`.
+// Returns 0 on success; -1 on invalid arguments, a misaligned address or an
+// MMIO failure.
+extern int vx_flush_caches(vx_device_h hdevice, size_t dev_maddr, size_t size) {
+    if (nullptr == hdevice || 0 == size)
+        return -1;
+
+    auto device = (vx_device_t*)hdevice;
+
+    const int line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE);
+    const size_t asize = align_size(size, line_size);
+
+    // check alignment
+    if (!is_aligned(dev_maddr, line_size))
+        return -1;
+
+    // Ensure ready for new command
+    if (vx_ready_wait(hdevice, -1) != 0)
+        return -1;
+
+    // address and size are programmed in units of cache lines
+    auto ls_shift = (int)std::log2(line_size);
+
+    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_MEM_ADDR, dev_maddr >> ls_shift));
+    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA_SIZE, asize >> ls_shift));
+    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_CLFLUSH));
+
+    // Wait for the flush operation to finish
+    return (vx_ready_wait(hdevice, -1) != 0) ? -1 : 0;
+}
+
+// Start execution on the device once it is ready for a new command.
+// Returns 0 when the run command was written; -1 for a null handle, a failed
+// ready-wait or a failed MMIO write.
+extern int vx_start(vx_device_h hdevice) {
+    // Ensure ready for new command (skipped entirely for a null handle)
+    if (nullptr == hdevice || vx_ready_wait(hdevice, -1) != 0)
+        return -1;
+
+    // start execution
+    auto device = (vx_device_t*)hdevice;
+    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_RUN));
+    return 0;
+}
--- a/driver/rtlsim/.gitignore
+++ b/driver/rtlsim/.gitignore
@@ -0,0 +1,2 @@
+obj_dir
+*.so
--- a/driver/rtlsim/Makefile
+++ b/driver/rtlsim/Makefile
@@ -0,0 +1,76 @@
+#CFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -Wfatal-errors
+CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors
+
+CFLAGS += -I../../include -I../../../hw/simulate -I../../../hw
+
+# control RTL debug print states
+DBG_PRINT_FLAGS = -DDBG_PRINT_CORE_ICACHE \
+				  -DDBG_PRINT_CORE_DCACHE \
+			      -DDBG_PRINT_CACHE_BANK  \
+				  -DDBG_PRINT_CACHE_SNP   \
+				  -DDBG_PRINT_CACHE_MSRQ  \
+				  -DDBG_PRINT_DRAM        \
+				  -DDBG_PRINT_PIPELINE    \
+				  -DDBG_PRINT_OPAE
+
+#DBG_PRINT=$(DBG_PRINT_FLAGS)
+
+#CONFIGS += -DNUM_CLUSTERS=2 -DNUM_CORES=4
+#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=4
+#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=2
+
+#DEBUG=1
+#AFU=1
+
+CFLAGS += -fPIC
+
+CFLAGS += -DUSE_RTLSIM $(CONFIGS)
+
+LDFLAGS += -shared -pthread
+# LDFLAGS += -dynamiclib -pthread
+
+TOP = Vortex
+
+SRCS = vortex.cpp ../common/vx_utils.cpp ../../hw/simulate/simulator.cpp
+
+RTL_INCLUDE = -I../../hw/rtl -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../../hw/rtl/pipe_regs -I../../hw/rtl/cache
+
+VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS)
+VL_FLAGS += -Wno-DECLFILENAME
+VL_FLAGS += --x-initial unique
+VL_FLAGS += --x-assign unique
+
+# Enable Verilator multithreaded simulation
+#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
+#VL_FLAGS += --threads $(THREADS)
+
+# Debugigng
+ifdef DEBUG
+	VL_FLAGS += --trace -DVCD_OUTPUT $(DBG_PRINT)
+	CFLAGS   += -DVCD_OUTPUT $(DBG_PRINT)
+	#VL_FLAGS += -DDBG_CORE_REQ_INFO
+	#CFLAGS   += -DDBG_CORE_REQ_INFO
+else
+    CFLAGS   += -DNDEBUG
+	VL_FLAGS += -DNDEBUG
+endif
+
+# AFU
+ifdef AFU
+    TOP = vortex_afu_sim
+	VL_FLAGS += -DNOPAE -DSCOPE
+	CFLAGS += -DNOPAE -DSCOPE
+	RTL_INCLUDE += -I../../hw/opae  -I../../hw/opae/ccip
+endif
+
+PROJECT = libvortex.so
+# PROJECT = libvortex.dylib
+
+all: $(PROJECT)
+	
+# Verilate the RTL together with the C++ driver sources, then build the
+# generated makefile. $(MAKE) is used for the recursive build so the sub-make
+# is the same make program and inherits the command-line flags.
+$(PROJECT): $(SRCS)
+	verilator --exe --cc $(TOP) $(RTL_INCLUDE) $(VL_FLAGS) $(SRCS) -CFLAGS '$(CFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT)
+	$(MAKE) -j -C obj_dir -f V$(TOP).mk
+
+clean:
+	rm -rf $(PROJECT) obj_dir
--- a/driver/rtlsim/ram.h
+++ b/driver/rtlsim/ram.h
@@ -0,0 +1,64 @@
+#pragma once
+
+#include <stdio.h>
+#include <stdint.h>
+
+// Sparse model of a 4 GiB byte-addressable memory.
+// The address space is split into 4096 blocks of 1 MiB; a block is allocated,
+// zero-filled, the first time any byte in it is accessed (reads included).
+class RAM {
+private:
+
+  // block table; mutable because const accessors allocate blocks on demand
+  mutable uint8_t *mem_[(1 << 12)];
+
+  // Return the address of the byte at `address`, allocating its block if needed.
+  uint8_t *get(uint32_t address) const {
+    uint32_t block_addr   = address >> 20;
+    uint32_t block_offset = address & 0x000FFFFF;
+    if (mem_[block_addr] == NULL) {
+      // value-initialized so bytes that were never written read as 0
+      mem_[block_addr] = new uint8_t[(1 << 20)]();
+    }
+    return mem_[block_addr] + block_offset;
+  }
+
+public:
+
+  RAM() {
+    for (uint32_t i = 0; i < (1u << 12); i++) {
+      mem_[i] = NULL;
+    }
+  }
+
+  ~RAM() {
+    this->clear();
+  }
+
+  // Size of the modelled address space in bytes.
+  size_t size() const {
+    return (1ull << 32);
+  }
+
+  // Free every allocated block; afterwards all bytes read as 0 again.
+  void clear() {
+    for (uint32_t i = 0; i < (1u << 12); i++) {
+      // array form to match new[]; deleting a null pointer is a no-op
+      delete[] mem_[i];
+      mem_[i] = NULL;
+    }
+  }
+
+  // Copy `length` bytes starting at `address` into `data`.
+  void read(uint32_t address, uint32_t length, uint8_t *data) const {
+    for (unsigned i = 0; i < length; i++) {
+      data[i] = *this->get(address + i);
+    }
+  }
+
+  // Copy `length` bytes from `data` into memory starting at `address`.
+  void write(uint32_t address, uint32_t length, const uint8_t *data) {
+    for (unsigned i = 0; i < length; i++) {
+      *this->get(address + i) = data[i];
+    }
+  }
+
+  uint8_t& operator[](uint32_t address) {
+    return *get(address);
+  }
+
+  const uint8_t& operator[](uint32_t address) const {
+    return *get(address);
+  }
+};
--- a/driver/rtlsim/simulator.cpp
+++ b/driver/rtlsim/simulator.cpp
@@ -0,0 +1,70 @@
+#include "simulator.h"
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+
+uint64_t timestamp = 0;
+
+// Returns the current value of the global simulation timestamp as a double.
+double sc_time_stamp() {
+  return static_cast<double>(timestamp);
+}
+
+// Passes fixed runtime arguments to Verilator, instantiates the
+// Vvortex_afu_sim model and, when VCD_OUTPUT is defined, opens "trace.vcd"
+// for waveform dumping.
+Simulator::Simulator() {    
+  // force random values for uninitialized signals
+  const char* args[] = {"", "+verilator+rand+reset+2", "+verilator+seed+50"};
+  Verilated::commandArgs(3, args);
+
+  vortex_ = new Vvortex_afu_sim();
+
+#ifdef VCD_OUTPUT
+  Verilated::traceEverOn(true);
+  trace_ = new VerilatedVcdC;
+  vortex_->trace(trace_, 99);
+  trace_->open("trace.vcd");
+#endif  
+}
+
+// Closes and releases the VCD trace (when enabled), then releases the model.
+Simulator::~Simulator() {
+#ifdef VCD_OUTPUT
+  trace_->close();
+  delete trace_; // allocated with new in the constructor
+#endif
+  delete vortex_;
+}
+
+// Holds the design's reset input high for one step(), releases it, and
+// drops any entries queued in dram_rsp_vec_.
+void Simulator::reset() {     
+#ifndef NDEBUG
+  std::cout << timestamp << ": [sim] reset()" << std::endl;
+#endif 
+  vortex_->reset = 1;
+  this->step();  
+  vortex_->reset = 0;
+
+  dram_rsp_vec_.clear();
+}
+
+// Advances the design by one clock cycle: evaluates with clk low, then with
+// clk high, and finally runs the AVS and CCIP drivers.
+void Simulator::step() {
+  vortex_->clk = 0;
+  this->eval();
+
+  vortex_->clk = 1;
+  this->eval();
+
+  avs_driver();
+  ccip_driver();
+}
+
+// Evaluates the model once, dumps the current state to the VCD trace when
+// VCD_OUTPUT is defined, and increments the global timestamp.
+void Simulator::eval() {
+  vortex_->eval();
+#ifdef VCD_OUTPUT
+  trace_->dump(timestamp);
+#endif
+  ++timestamp;
+}
+
+// Placeholder called at the end of every step(); currently does nothing.
+void Simulator::avs_driver() {
+  //--
+}
+
+ // Placeholder called at the end of every step(); currently does nothing.
+ void Simulator::ccip_driver() {
+   //--
+ }
--- a/driver/rtlsim/simulator.h
+++ b/driver/rtlsim/simulator.h
@@ -0,0 +1,59 @@
+#pragma once
+
+#include "Vvortex_afu_sim.h"
+#include "Vvortex_afu_sim__Syms.h"
+#include "verilated.h"
+
+#ifdef VCD_OUTPUT
+#include <verilated_vcd_c.h>
+#endif
+
+#include <VX_config.h>
+#include "ram.h"
+
+#include <ostream>
+#include <vector>
+
+#define ENABLE_DRAM_STALLS
+#define DRAM_LATENCY 100
+#define DRAM_RQ_SIZE 16
+#define DRAM_STALLS_MODULO 16
+
+// Record describing one pending DRAM request.
+// NOTE(review): nothing in this header shows how the fields are consumed;
+// the field meanings below are inferred from their names - confirm.
+struct dram_req_t {
+  int cycles_left;   // presumably the remaining latency in cycles
+  uint8_t *data;     // presumably the payload of the request
+  unsigned tag;      // presumably the request identifier
+};
+
+// Wrapper around the Verilator model Vvortex_afu_sim.
+// NOTE(review): mmio_read() and mmio_write() are declared here but no
+// definition appears in simulator.cpp in this change, and
+// driver/rtlsim/vortex.cpp calls attach_ram(), is_busy() and flush_caches()
+// on this class although none of them is declared here - confirm the
+// intended interface.
+class Simulator {
+public:
+  
+  Simulator();
+  virtual ~Simulator();
+
+  // Pulses the design's reset input for one step.
+  void reset();
+  
+  // Advances the design by one clock cycle.
+  void step();  
+
+  int mmio_read(uint64_t addr, uint64_t* value);
+
+  int mmio_write(uint64_t addr, uint64_t value);
+  
+private:  
+
+  // Evaluates the model once and advances the global timestamp.
+  void eval(); 
+
+  void avs_driver();
+
+  void ccip_driver(); 
+  
+  // Cleared by reset().
+  std::vector<dram_req_t> dram_rsp_vec_;
+
+  RAM ram_;
+  Vvortex_afu_sim *vortex_;   // created in the constructor, deleted in the destructor
+  
+
+#ifdef VCD_OUTPUT
+  VerilatedVcdC *trace_;      // waveform writer; only present in VCD builds
+#endif
+};
--- a/driver/rtlsim/vortex.cpp
+++ b/driver/rtlsim/vortex.cpp
@@ -0,0 +1,281 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <iostream>
+#include <future>
+#include <chrono>
+
+#include <vortex.h>
+#include <ram.h>
+#include <simulator.h>
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Rounds `size` up to the next multiple of the device cache line size.
+static size_t align_size(size_t size) {
+    const uint32_t line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE);
+    const size_t num_lines = (size + line_size - 1) / line_size;
+    return line_size * num_lines;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+class vx_device;
+
+// Host-side staging buffer tied to a device.
+// The storage is rounded up to a whole number of cache lines, while size()
+// keeps reporting the size the caller asked for. data() is null when the
+// allocation failed.
+class vx_buffer {
+public:
+    vx_buffer(size_t size, vx_device* device) 
+        : size_(size)
+        , device_(device)
+        , data_(malloc(align_size(size))) {}
+
+    ~vx_buffer() {
+        free(data_); // free() accepts a null pointer
+    }
+
+    vx_device* device() const { return device_; }
+
+    size_t size() const { return size_; }
+
+    void* data() const { return data_; }
+
+private:
+    size_t size_;        // requested size in bytes
+    vx_device* device_;  // owning device (not owned by the buffer)
+    void* data_;         // malloc'ed storage
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Device object of the rtlsim driver: owns the simulated RAM and the RTL
+// simulator, and runs the simulation on a background task.
+class vx_device {
+public:
+    vx_device() {
+        mem_allocation_ = vx_dev_caps(VX_CAPS_ALLOC_BASE_ADDR);
+        simulator_.attach_ram(&ram_);
+    }
+
+    ~vx_device() {
+        // make sure the background run no longer touches this object
+        if (future_.valid()) {
+            future_.wait();
+        }
+    }
+
+    // Reserves `size` bytes (rounded up to a cache line) of device memory with
+    // a simple bump allocator and stores the start address in `dev_maddr`.
+    // Returns 0 on success, -1 when the device memory limit would be exceeded.
+    int alloc_local_mem(size_t size, size_t* dev_maddr) {
+        size_t asize = align_size(size);
+        // vx_dev_caps() returns int, so the 0xffffffff limit typically comes
+        // back negative; go through uint32_t so that it is not widened to
+        // SIZE_MAX, which would disable the check below.
+        size_t dev_mem_size = static_cast<uint32_t>(vx_dev_caps(VX_CAPS_LOCAL_MEM_SIZE));
+        if (asize > dev_mem_size
+         || mem_allocation_ > dev_mem_size - asize)
+            return -1;
+        *dev_maddr = mem_allocation_;
+        mem_allocation_ += asize;
+        return 0;
+    }
+
+    // Copies `size` bytes (rounded up to a cache line) from `src + src_offset`
+    // into device memory at `dest_addr`. Returns 0 on success, -1 when the
+    // range does not fit in RAM.
+    // NOTE(review): the rounded-up length is read from `src`; callers must
+    // make sure the host buffer is large enough past `src_offset`.
+    int upload(void* src, size_t dest_addr, size_t size, size_t src_offset) {
+        size_t asize = align_size(size);
+        if (dest_addr + asize > ram_.size())
+            return -1;
+
+        /*printf("VXDRV: upload %d bytes to 0x%x\n", size, dest_addr);
+        for (int i = 0; i < size; i += 4) {
+            printf("mem-write: 0x%x <- 0x%x\n", uint32_t(dest_addr + i), *(uint32_t*)((uint8_t*)src + src_offset + i));
+        }*/
+
+        ram_.write(dest_addr, asize, (uint8_t*)src + src_offset);
+        return 0;
+    }
+
+    // Copies `size` bytes (rounded up to a cache line) from device memory at
+    // `src_addr` into `dest + dest_offset`. Returns 0 on success, -1 when the
+    // range does not fit in RAM.
+    // NOTE(review): `dest` is declared const but is written to, and the
+    // rounded-up length is written; callers must size the buffer accordingly.
+    int download(const void* dest, size_t src_addr, size_t size, size_t dest_offset) {
+        size_t asize = align_size(size);
+        if (src_addr + asize > ram_.size())
+            return -1;
+
+        ram_.read(src_addr, asize, (uint8_t*)dest + dest_offset);
+
+        /*printf("VXDRV: download %d bytes from 0x%x\n", size, src_addr);
+        for (int i = 0; i < size; i += 4) {
+            printf("mem-read: 0x%x -> 0x%x\n", uint32_t(src_addr + i), *(uint32_t*)((uint8_t*)dest + dest_offset + i));
+        }*/
+
+        return 0;
+    }
+
+    // Resets the simulator and runs it on a background task until it is no
+    // longer busy. Blocks first if a previous run is still in flight.
+    int start() {
+        if (future_.valid()) {
+            future_.wait(); // ensure prior run completed
+        }
+        // the task only uses members of this object, which outlives it
+        // because the destructor waits on future_
+        future_ = std::async(std::launch::async, [this]{
+            simulator_.reset();
+            while (simulator_.is_busy()) {
+                simulator_.step();
+            }
+        });
+        return 0;
+    }
+
+    // Waits for the current run. `timeout` is divided by 1000 to obtain a
+    // number of one-second polls; a negative value keeps polling until the
+    // run completes. Always returns 0, also when the timeout expires.
+    int wait(long long timeout) {
+        if (!future_.valid())
+            return 0;
+        auto timeout_sec = (timeout < 0) ? timeout : (timeout / 1000);
+        std::chrono::seconds wait_time(1);
+        for (;;) {
+            auto status = future_.wait_for(wait_time); // wait for 1 sec and check status
+            if (status == std::future_status::ready
+             || 0 == timeout_sec--)
+                break;
+        }
+        return 0;
+    }
+
+    // Asks the simulator to flush the caches for the given range and steps it
+    // until it is idle again. Blocks first if a run is still in flight.
+    int flush_caches(size_t dev_maddr, size_t size) {
+        if (future_.valid()) {
+            future_.wait(); // ensure prior run completed
+        }
+        simulator_.flush_caches(dev_maddr, size);
+        while (simulator_.is_busy()) {
+            simulator_.step();
+        }
+        return 0;
+    }
+
+private:
+
+    size_t mem_allocation_;     // next free device address (bump pointer)
+    RAM ram_;
+    Simulator simulator_;
+    std::future<void> future_;  // declared last so it is destroyed first
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Creates a new device and returns its handle through `hdevice`.
+// Returns 0 on success, -1 when `hdevice` is null.
+extern int vx_dev_open(vx_device_h* hdevice) {
+    if (hdevice != nullptr) {
+        *hdevice = new vx_device();
+        return 0;
+    }
+    return -1;
+}
+
+// Destroys the device behind `hdevice`.
+// Returns 0 on success, -1 when the handle is null.
+extern int vx_dev_close(vx_device_h hdevice) {
+    if (hdevice == nullptr)
+        return -1;
+
+    delete (vx_device*)hdevice;
+    return 0;
+}
+
+// Allocates `size` bytes of device memory and stores the device address in
+// `dev_maddr`. Returns -1 on a null handle, null output pointer, zero size,
+// or when the device allocator fails.
+extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) {
+    const bool bad_args = (hdevice == nullptr)
+                       || (dev_maddr == nullptr)
+                       || (size == 0);
+    if (bad_args)
+        return -1;
+
+    return ((vx_device*)hdevice)->alloc_local_mem(size, dev_maddr);
+}
+
+// Flushes the device caches for `size` bytes starting at `dev_maddr`.
+// Returns -1 on a null handle or zero size.
+extern int vx_flush_caches(vx_device_h hdevice, size_t dev_maddr, size_t size) {
+    if (hdevice == nullptr || size == 0)
+        return -1;
+
+    auto dev = (vx_device*)hdevice;
+    return dev->flush_caches(dev_maddr, size);
+}
+
+
+// Allocates a host buffer of `size` bytes bound to the device and returns
+// its handle through `hbuffer`. Returns -1 on invalid arguments or when the
+// host allocation fails.
+extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) {
+    if (hdevice == nullptr || size == 0 || hbuffer == nullptr)
+        return -1;
+
+    vx_buffer* new_buffer = new vx_buffer(size, (vx_device*)hdevice);
+    if (new_buffer->data() == nullptr) {
+        // the storage could not be allocated; do not hand out the buffer
+        delete new_buffer;
+        return -1;
+    }
+
+    *hbuffer = new_buffer;
+    return 0;
+}
+
+// Returns the host address of the buffer's storage, or null for a null handle.
+extern volatile void* vx_host_ptr(vx_buffer_h hbuffer) {
+    if (hbuffer != nullptr) {
+        return ((vx_buffer*)hbuffer)->data();
+    }
+    return nullptr;
+}
+
+// Destroys the buffer behind `hbuffer`.
+// Returns 0 on success, -1 when the handle is null.
+extern int vx_buf_release(vx_buffer_h hbuffer) {
+    if (hbuffer == nullptr)
+        return -1;
+
+    delete (vx_buffer*)hbuffer;
+    return 0;
+}
+
+// Copies `size` bytes starting at `src_offset` inside the host buffer to the
+// device address `dev_maddr`. Returns -1 on a null handle, zero size, or
+// when the requested range does not lie inside the buffer.
+extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) {
+    if (nullptr == hbuffer 
+     || 0 == size)
+        return -1;
+
+    auto buffer = (vx_buffer*)hbuffer;
+
+    // same check as "size + src_offset > buffer->size()", written so that
+    // the sum cannot wrap around for very large arguments
+    if (src_offset > buffer->size()
+     || size > buffer->size() - src_offset)
+        return -1;
+
+    return buffer->device()->upload(buffer->data(), dev_maddr, size, src_offset);
+}
+
+// Copies `size` bytes from the device address `dev_maddr` into the host
+// buffer at `dest_offset`. Returns -1 on a null handle, zero size, or when
+// the requested range does not lie inside the buffer.
+extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) {
+    if (nullptr == hbuffer 
+     || 0 == size)
+        return -1;
+
+    auto buffer = (vx_buffer*)hbuffer;
+
+    // same check as "size + dest_offset > buffer->size()", written so that
+    // the sum cannot wrap around for very large arguments
+    if (dest_offset > buffer->size()
+     || size > buffer->size() - dest_offset)
+        return -1;
+
+    return buffer->device()->download(buffer->data(), dev_maddr, size, dest_offset);
+}
+
+// Starts execution on the device. Returns -1 when the handle is null.
+extern int vx_start(vx_device_h hdevice) {
+    auto dev = (vx_device*)hdevice;
+    return (nullptr == hdevice) ? -1 : dev->start();
+}
+
+// Waits for the device to finish, forwarding `timeout` to the device.
+// Returns -1 when the handle is null.
+extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
+    auto dev = (vx_device*)hdevice;
+    return (nullptr == hdevice) ? -1 : dev->wait(timeout);
+}
--- a/driver/simx/.gitignore
+++ b/driver/simx/.gitignore
@@ -0,0 +1,2 @@
+obj_dir
+libvortex.so
--- a/driver/simx/Makefile
+++ b/driver/simx/Makefile
@@ -0,0 +1,32 @@
+CFLAGS += -std=c++11 -O3 -Wall -Wextra -pedantic -Wfatal-errors
+#CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -pedantic -Wfatal-errors
+
+CFLAGS += -I../../include -I../../../simX/include -I../../../hw
+
+CFLAGS += -fPIC
+
+CFLAGS += -DUSE_SIMX 
+
+LDFLAGS += -shared -pthread
+
+SRCS = vortex.cpp ../common/vx_utils.cpp ../../simX/args.cpp ../../simX/mem.cpp ../../simX/core.cpp ../../simX/instruction.cpp ../../simX/enc.cpp ../../simX/util.cpp
+
+RTL_TOP = ../../simX/cache_simX.v
+
+RTL_INCLUDE = -I../../hw/old_rtl -I../../hw/old_rtl/interfaces -I../../hw/old_rtl/cache -I../../hw/old_rtl/shared_memory
+
+THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
+VL_FLAGS += --threads $(THREADS)
+
+VL_FLAGS += -Wno-UNOPTFLAT -Wno-WIDTH 
+
+PROJECT = libvortex.so
+
+# Default goal: build the simX driver library.
+all: $(PROJECT)
+
+# Verilate the cache model together with the simX sources, then build the
+# makefile that verilator generates in obj_dir. $(MAKE) is used for the
+# nested build so that make flags are passed down to the sub-make.
+$(PROJECT): $(SRCS) 
+	verilator --exe --cc $(RTL_TOP) $(RTL_INCLUDE) $(VL_FLAGS) $(SRCS) -CFLAGS '$(CFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT)
+	$(MAKE) -j -C obj_dir -f Vcache_simX.mk
+
+# Remove the library and the verilator output directory.
+clean:
+	rm -rf $(PROJECT) obj_dir
+
+# "all" and "clean" do not name files.
+.PHONY: all clean
--- a/driver/simx/vortex.cpp
+++ b/driver/simx/vortex.cpp
@@ -0,0 +1,318 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <iostream>
+#include <thread>
+#include <mutex>
+#include <chrono>
+
+#include <vortex.h>
+#include <core.h>
+#include <VX_config.h>
+
+#define PAGE_SIZE 4096
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Rounds `size` up to the next multiple of the device cache line size.
+static size_t align_size(size_t size) {
+    const uint32_t line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE);
+    const size_t num_lines = (size + line_size - 1) / line_size;
+    return line_size * num_lines;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+class vx_device;
+
+// Host-side staging buffer tied to a device.
+// The storage is rounded up to a whole number of cache lines, while size()
+// keeps reporting the size the caller asked for. data() is null when the
+// allocation failed.
+class vx_buffer {
+public:
+    vx_buffer(size_t size, vx_device* device) 
+        : size_(size)
+        , device_(device)
+        , data_(malloc(align_size(size))) {}
+
+    ~vx_buffer() {
+        free(data_); // free() accepts a null pointer
+    }
+
+    vx_device* device() const { return device_; }
+
+    size_t size() const { return size_; }
+
+    void* data() const { return data_; }
+
+private:
+    size_t size_;        // requested size in bytes
+    vx_device* device_;  // owning device (not owned by the buffer)
+    void* data_;         // malloc'ed storage
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Device object of the simX driver: owns the simulated RAM and a worker
+// thread that runs the simX core whenever start() is called.
+class vx_device {
+public:
+    // The worker thread is the last member to be initialized, so every member
+    // it uses (flags, mutex, RAM) already exists when thread_proc() starts.
+    vx_device()
+        : is_done_(false)
+        , is_running_(false)
+        , mem_allocation_(vx_dev_caps(VX_CAPS_ALLOC_BASE_ADDR))
+        , thread_(__thread_proc__, this) {
+    }
+
+    // Asks the worker thread to exit and waits for it.
+    ~vx_device() {
+        {
+            std::lock_guard<std::mutex> lock(mutex_);
+            is_done_ = true;
+        }
+        thread_.join();
+    }
+
+    // Reserves `size` bytes (rounded up to a cache line) of device memory with
+    // a simple bump allocator and stores the start address in `dev_maddr`.
+    // Returns 0 on success, -1 when the device memory limit would be exceeded.
+    int alloc_local_mem(size_t size, size_t* dev_maddr) {
+        auto asize = align_size(size);
+        // vx_dev_caps() returns int, so the 0xffffffff limit typically comes
+        // back negative; go through uint32_t so that it is not widened to
+        // SIZE_MAX, which would disable the check below.
+        size_t dev_mem_size = static_cast<uint32_t>(vx_dev_caps(VX_CAPS_LOCAL_MEM_SIZE));
+        if (asize > dev_mem_size
+         || mem_allocation_ > dev_mem_size - asize)
+            return -1;
+        *dev_maddr = mem_allocation_;
+        mem_allocation_ += asize;
+        return 0;
+    }
+
+    // Copies `size` bytes (rounded up to a cache line) from `src + src_offset`
+    // into device memory at `dest_addr`. Returns 0 on success, -1 when the
+    // range does not fit in RAM.
+    // NOTE(review): the rounded-up length is read from `src`; callers must
+    // make sure the host buffer is large enough past `src_offset`.
+    int upload(void* src, size_t dest_addr, size_t size, size_t src_offset) {
+        auto asize = align_size(size);
+        if (dest_addr + asize > ram_.size())
+            return -1;
+
+        /*printf("VXDRV: upload %d bytes to 0x%x\n", size, dest_addr);
+        for (int i = 0; i < size; i += 4) {
+            printf("mem-write: 0x%x <- 0x%x\n", dest_addr + i, *(uint32_t*)((uint8_t*)src + src_offset + i));
+        }*/
+
+        ram_.write(dest_addr, asize, (uint8_t*)src + src_offset);
+        return 0;
+    }
+
+    // Copies `size` bytes (rounded up to a cache line) from device memory at
+    // `src_addr` into `dest + dest_offset`. Returns 0 on success, -1 when the
+    // range does not fit in RAM.
+    // NOTE(review): `dest` is declared const but is written to, and the
+    // rounded-up length is written; callers must size the buffer accordingly.
+    int download(const void* dest, size_t src_addr, size_t size, size_t dest_offset) {
+        size_t asize = align_size(size);
+        if (src_addr + asize > ram_.size())
+            return -1;
+
+        ram_.read(src_addr, asize, (uint8_t*)dest + dest_offset);
+
+        /*printf("VXDRV: download %d bytes from 0x%x\n", size, src_addr);
+        for (int i = 0; i < size; i += 4) {
+            printf("mem-read: 0x%x -> 0x%x\n", src_addr + i, *(uint32_t*)((uint8_t*)dest + dest_offset + i));
+        }*/
+
+        return 0;
+    }
+
+    // Signals the worker thread to run the core. Always returns 0.
+    int start() {
+        std::lock_guard<std::mutex> lock(mutex_);
+        is_running_ = true;
+        return 0;
+    }
+
+    // Polls once per second until the core has stopped. `timeout` is divided
+    // by 1000 to obtain the number of polls; a negative value keeps polling
+    // until the run completes. Always returns 0, also when the timeout expires.
+    int wait(long long timeout) {
+        auto timeout_sec = (timeout < 0) ? timeout : (timeout / 1000);
+        for (;;) {
+            bool is_running;
+            {
+                std::lock_guard<std::mutex> lock(mutex_);
+                is_running = is_running_;
+            }
+
+            if (!is_running || 0 == timeout_sec--)
+                break;
+
+            std::this_thread::sleep_for(std::chrono::seconds(1));
+        }
+        return 0;
+    }
+
+private:
+
+    // Builds a fresh core on top of ram_ and steps it until it stops.
+    void run() {
+        Harp::ArchDef arch("rv32i", NUM_WARPS, NUM_THREADS);
+        Harp::WordDecoder dec(arch);
+        Harp::MemoryUnit mu(PAGE_SIZE, arch.getWordSize(), true);
+        Harp::Core core(arch, dec, mu);
+        mu.attach(ram_, 0);
+
+        while (core.running()) {
+            core.step();
+        }
+        core.printStats();
+    }
+
+    // Worker loop: waits for start(), runs the core, clears the running flag,
+    // and exits once the destructor sets is_done_.
+    void thread_proc() {
+        std::cout << "Device ready..." << std::endl;
+
+        for (;;) {
+            bool is_done;
+            bool is_running;
+            {
+                std::lock_guard<std::mutex> lock(mutex_);
+                is_done = is_done_;
+                is_running = is_running_;
+            }
+
+            if (is_done)
+                break;
+
+            if (!is_running) {
+                // nothing to do yet: let other threads run instead of spinning
+                std::this_thread::yield();
+                continue;
+            }
+
+            std::cout << "Device running..." << std::endl;
+
+            this->run();
+
+            {
+                std::lock_guard<std::mutex> lock(mutex_);
+                is_running_ = false;
+            }
+
+            std::cout << "Device ready..." << std::endl;
+        }
+
+        std::cout << "Device shutdown..." << std::endl;
+    }
+
+    static void __thread_proc__(vx_device* device) {
+        device->thread_proc();
+    }
+
+    bool is_done_;           // set by the destructor to stop the worker
+    bool is_running_;        // set by start(), cleared by the worker
+    size_t mem_allocation_;  // next free device address (bump pointer)
+    Harp::RAM ram_;
+    std::mutex mutex_;       // guards is_done_ and is_running_
+    std::thread thread_;     // must stay the last member: it starts running
+                             // as soon as it is constructed
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Creates a new device and returns its handle through `hdevice`.
+// Returns 0 on success, -1 when `hdevice` is null.
+extern int vx_dev_open(vx_device_h* hdevice) {
+    if (hdevice != nullptr) {
+        *hdevice = new vx_device();
+        return 0;
+    }
+    return -1;
+}
+
+// Destroys the device behind `hdevice`.
+// Returns 0 on success, -1 when the handle is null.
+extern int vx_dev_close(vx_device_h hdevice) {
+    if (hdevice == nullptr)
+        return -1;
+
+    delete (vx_device*)hdevice;
+    return 0;
+}
+
+// Allocates `size` bytes of device memory and stores the device address in
+// `dev_maddr`. Returns -1 on a null handle, null output pointer, zero size,
+// or when the device allocator fails.
+extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) {
+    const bool bad_args = (hdevice == nullptr)
+                       || (dev_maddr == nullptr)
+                       || (size == 0);
+    if (bad_args)
+        return -1;
+
+    return ((vx_device*)hdevice)->alloc_local_mem(size, dev_maddr);
+}
+
+// Validates the arguments and otherwise does nothing.
+// Returns -1 on a null handle or zero size, 0 otherwise.
+extern int vx_flush_caches(vx_device_h hdevice, size_t /*dev_maddr*/, size_t size) {
+    // this functionality is not needed by simX
+    const bool valid = (hdevice != nullptr) && (size != 0);
+    return valid ? 0 : -1;
+}
+
+// Allocates a host buffer of `size` bytes bound to the device and returns
+// its handle through `hbuffer`. Returns -1 on invalid arguments or when the
+// host allocation fails.
+extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) {
+    if (hdevice == nullptr || size == 0 || hbuffer == nullptr)
+        return -1;
+
+    vx_buffer* new_buffer = new vx_buffer(size, (vx_device*)hdevice);
+    if (new_buffer->data() == nullptr) {
+        // the storage could not be allocated; do not hand out the buffer
+        delete new_buffer;
+        return -1;
+    }
+
+    *hbuffer = new_buffer;
+    return 0;
+}
+
+// Returns the host address of the buffer's storage, or null for a null handle.
+extern volatile void* vx_host_ptr(vx_buffer_h hbuffer) {
+    if (hbuffer != nullptr) {
+        return ((vx_buffer*)hbuffer)->data();
+    }
+    return nullptr;
+}
+
+// Destroys the buffer behind `hbuffer`.
+// Returns 0 on success, -1 when the handle is null.
+extern int vx_buf_release(vx_buffer_h hbuffer) {
+    if (hbuffer == nullptr)
+        return -1;
+
+    delete (vx_buffer*)hbuffer;
+    return 0;
+}
+
+// Copies `size` bytes starting at `src_offset` inside the host buffer to the
+// device address `dev_maddr`. Returns -1 on a null handle, zero size, or
+// when the requested range does not lie inside the buffer.
+extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) {
+    if (nullptr == hbuffer 
+     || 0 == size)
+        return -1;
+
+    auto buffer = (vx_buffer*)hbuffer;
+
+    // same check as "size + src_offset > buffer->size()", written so that
+    // the sum cannot wrap around for very large arguments
+    if (src_offset > buffer->size()
+     || size > buffer->size() - src_offset)
+        return -1;
+
+    return buffer->device()->upload(buffer->data(), dev_maddr, size, src_offset);
+}
+
+// Copies `size` bytes from the device address `dev_maddr` into the host
+// buffer at `dest_offset`. Returns -1 on a null handle, zero size, or when
+// the requested range does not lie inside the buffer.
+extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) {
+    if (nullptr == hbuffer 
+     || 0 == size)
+        return -1;
+
+    auto buffer = (vx_buffer*)hbuffer;
+
+    // same check as "size + dest_offset > buffer->size()", written so that
+    // the sum cannot wrap around for very large arguments
+    if (dest_offset > buffer->size()
+     || size > buffer->size() - dest_offset)
+        return -1;
+
+    return buffer->device()->download(buffer->data(), dev_maddr, size, dest_offset);
+}
+
+// Starts execution on the device. Returns -1 when the handle is null.
+extern int vx_start(vx_device_h hdevice) {
+    auto dev = (vx_device*)hdevice;
+    return (nullptr == hdevice) ? -1 : dev->start();
+}
+
+// Waits for the device to finish, forwarding `timeout` to the device.
+// Returns -1 when the handle is null.
+extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
+    auto dev = (vx_device*)hdevice;
+    return (nullptr == hdevice) ? -1 : dev->wait(timeout);
+}
--- a/driver/stub/Makefile
+++ b/driver/stub/Makefile
@@ -0,0 +1,20 @@
+CXXFLAGS += -std=c++11 -O3 -Wall -Wextra -pedantic -Wfatal-errors
+#CXXFLAGS += -std=c++11 -g -O0 -Wall -Wextra -pedantic -Wfatal-errors
+
+CXXFLAGS += -I../include -I../../runtime
+
+CXXFLAGS += -fPIC
+
+LDFLAGS += -shared -pthread
+
+SRCS = vortex.cpp ../common/vx_utils.cpp
+
+PROJECT = libvortex.so
+
+# Default goal: build the stub driver library.
+all: $(PROJECT)
+
+# Compile and link all sources in a single step.
+$(PROJECT): $(SRCS) 
+	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
+
+clean:
+	rm -rf $(PROJECT) obj_dir
+
+# "all" and "clean" do not name files.
+.PHONY: all clean
--- a/driver/stub/vortex.cpp
+++ b/driver/stub/vortex.cpp
@@ -0,0 +1,45 @@
+#include <vortex.h>
+
+// Stub implementation: ignores its argument and always fails with -1.
+extern int vx_dev_open(vx_device_h* /*hdevice*/) {
+    return -1;
+}
+
+// Stub implementation: ignores its argument and always fails with -1.
+extern int vx_dev_close(vx_device_h /*hdevice*/) {
+    return -1;
+}
+
+// Stub implementation: ignores its arguments and always fails with -1.
+extern int vx_alloc_dev_mem(vx_device_h /*hdevice*/, size_t /*size*/, size_t* /*dev_maddr*/) {
+    return -1;
+}
+
+// Stub implementation: ignores its arguments and always fails with -1.
+extern int vx_flush_caches(vx_device_h /*hdevice*/, size_t /*dev_maddr*/, size_t /*size*/) {
+    return -1;
+}
+
+// Stub implementation: ignores its arguments and always fails with -1.
+extern int vx_alloc_shared_mem(vx_device_h /*hdevice*/, size_t /*size*/, vx_buffer_h* /*hbuffer*/) {
+    return -1;
+}
+
+// Stub implementation: ignores its argument and always returns null.
+extern volatile void* vx_host_ptr(vx_buffer_h /*hbuffer*/) {
+    return nullptr;
+}
+
+// Stub implementation: ignores its argument and always fails with -1.
+extern int vx_buf_release(vx_buffer_h /*hbuffer*/) {
+    return -1;
+}
+
+// Stub implementation: ignores its arguments and always fails with -1.
+extern int vx_copy_to_dev(vx_buffer_h /*hbuffer*/, size_t /*dev_maddr*/, size_t /*size*/, size_t /*src_offset*/) {
+    return -1;
+}
+
+// Stub implementation: ignores its arguments and always fails with -1.
+extern int vx_copy_from_dev(vx_buffer_h /*hbuffer*/, size_t /*dev_maddr*/, size_t /*size*/, size_t /*dest_offset*/) {
+     return -1;
+}
+
+// Stub implementation: ignores its argument and always fails with -1.
+extern int vx_start(vx_device_h /*hdevice*/) {
+    return -1;
+}
+
+// Stub implementation: ignores its arguments and always fails with -1.
+extern int vx_ready_wait(vx_device_h /*hdevice*/, long long /*timeout*/) {
+    return -1;
+}
--- a/driver/tests/basic/Makefile
+++ b/driver/tests/basic/Makefile
@@ -0,0 +1,69 @@
+RISCV_TOOL_PATH ?= $(wildcard ~/dev/riscv-gnu-toolchain/drops)
+# RISCV_TOOL_PATH ?= /opt/riscv-new/drops
+VX_RT_PATH ?= $(wildcard ../../../runtime)
+
+VX_CC  = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
+VX_CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
+VX_DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
+VX_CPY = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
+
+VX_NEWLIB = $(VX_RT_PATH)/newlib/newlib.c
+VX_STR = $(VX_RT_PATH)/startup/vx_start.S
+VX_INT = $(VX_RT_PATH)/intrinsics/vx_intrinsics.S
+VX_IO  = $(VX_RT_PATH)/io/vx_io.S $(VX_RT_PATH)/io/vx_io.c
+VX_API = $(VX_RT_PATH)/vx_api/vx_api.c
+VX_FIO = $(VX_RT_PATH)/fileio/fileio.S
+
+VX_CFLAGS = -march=rv32im -mabi=ilp32 -O3 -Wl,-Bstatic,-T,$(VX_RT_PATH)/startup/vx_link.ld -ffreestanding -nostartfiles -Wl,--gc-sections
+VX_CFLAGS += -I../../../hw 
+
+VX_SRCS = kernel.c
+
+CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors
+
+CXXFLAGS += -I../../include
+
+LDFLAGS += 
+
+PROJECT = basic
+
+SRCS = basic.cpp
+
+all: $(PROJECT) kernel.bin kernel.dump
+
+kernel.dump:  kernel.elf
+	$(VX_DMP) -D  kernel.elf > kernel.dump
+
+kernel.bin:  kernel.elf
+	$(VX_CPY) -O binary kernel.elf kernel.bin
+
+# The device kernel is built from $(VX_SRCS); depend on those files so that
+# editing the kernel source triggers a rebuild ($(SRCS) is the host program).
+kernel.elf: $(VX_SRCS)
+	$(VX_CC) $(VX_CFLAGS) $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_SRCS) -I$(VX_RT_PATH) -o kernel.elf
+
+$(PROJECT): $(SRCS)
+	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -L../../stub -lvortex -o $@
+
+run-fpga: $(PROJECT)
+	LD_LIBRARY_PATH=../../opae:$(LD_LIBRARY_PATH) ./$(PROJECT) -n 256
+
+run-ase: $(PROJECT)
+	ASE_LOG=0 LD_LIBRARY_PATH=../../opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) -n 256
+
+run-rtlsim: $(PROJECT)
+	LD_LIBRARY_PATH=../../rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) -n 256
+
+run-simx: $(PROJECT)
+	LD_LIBRARY_PATH=../../simx:$(LD_LIBRARY_PATH) ./$(PROJECT) -n 256
+
+.depend: $(SRCS)
+	$(CXX) $(CXXFLAGS) -MM $^ > .depend;
+
+clean:
+	rm -rf $(PROJECT) *.o .depend
+
+clean-all:
+	rm -rf $(PROJECT) *.o *.elf *.bin *.dump .depend
+
+ifneq ($(MAKECMDGOALS),clean)
+    -include .depend
+endif
--- a/driver/tests/basic/basic.cpp
+++ b/driver/tests/basic/basic.cpp
@@ -0,0 +1,241 @@
+#include <iostream>
+#include <unistd.h>
+#include <string.h>
+#include <vortex.h>
+#include "common.h"
+
+// Evaluates `_expr` once; when the result is non-zero, prints the expression
+// text and its value, calls cleanup() and terminates the process with
+// exit(-1). The expression text is part of the printed message.
+#define RT_CHECK(_expr)                                         \
+   do {                                                         \
+     int _ret = _expr;                                          \
+     if (0 == _ret)                                             \
+       break;                                                   \
+     printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);   \
+	 cleanup();			                                              \
+     exit(-1);                                                  \
+   } while (false)
+
+const char* kernel_file = "kernel.bin";
+int test = -1;
+uint32_t count = 0;
+
+vx_device_h device = nullptr;
+vx_buffer_h buffer = nullptr;
+
+// Prints the program banner and the supported command line options.
+static void show_usage() {
+   std::cout << "Vortex Driver Test." << std::endl
+             << "Usage: [-t testno][-k: kernel][-n words][-h: help]" << std::endl;
+}
+
+// Parses the command line into the globals `count` (-n), `test` (-t) and
+// `kernel_file` (-k). -h and '?' print the usage and exit with 0; any other
+// value returned by getopt prints the usage and exits with -1.
+static void parse_args(int argc, char **argv) {
+  for (int opt = getopt(argc, argv, "n:t:k:h?");
+       opt != -1;
+       opt = getopt(argc, argv, "n:t:k:h?")) {
+    if (opt == 'n') {
+      count = atoi(optarg);
+    } else if (opt == 't') {
+      test = atoi(optarg);
+    } else if (opt == 'k') {
+      kernel_file = optarg;
+    } else {
+      const bool help_requested = (opt == 'h' || opt == '?');
+      show_usage();
+      exit(help_requested ? 0 : -1);
+    }
+  }
+}
+
+// Releases the global buffer first and then the global device, skipping
+// whichever handle is still null.
+void cleanup() {
+  if (buffer != nullptr)
+    vx_buf_release(buffer);
+
+  if (device != nullptr)
+    vx_dev_close(device);
+}
+
+// Derives a per-index test pattern from `value`: the value shifted left by
+// `i` bits, with its own low `i` bits filled back in.
+// The shift amount is reduced modulo 64 and the mask is built in 64 bits,
+// so large indices no longer shift by more than the operand width; results
+// for 0 <= i < 31 are unchanged.
+uint64_t shuffle(int i, uint64_t value) {
+  const unsigned shift = static_cast<unsigned>(i) & 63u;
+  const uint64_t low_mask = (uint64_t(1) << shift) - 1;
+  return (value << shift) | (value & low_mask);
+}
+
+// Round-trip test: fills the shared buffer with a pattern, copies it to
+// `dev_addr`, clears the buffer, copies the data back and compares it with
+// the pattern. Returns 0 on success and 1 when any word differs.
+int run_memcopy_test(uint32_t dev_addr, uint64_t value, int num_blocks) {
+  const int num_words = (64 * num_blocks) / 8;  // 64-byte blocks, 8-byte words
+  int mismatches = 0;
+
+  // update source buffer
+  {
+    auto words = (uint64_t*)vx_host_ptr(buffer);
+    for (int w = 0; w < num_words; ++w) {
+      words[w] = shuffle(w, value);
+    }
+  }
+
+  // write buffer to local memory
+  std::cout << "write buffer to local memory" << std::endl;
+  RT_CHECK(vx_copy_to_dev(buffer, dev_addr, 64 * num_blocks, 0));
+
+  // clear destination buffer
+  {
+    auto words = (uint64_t*)vx_host_ptr(buffer);
+    for (int w = 0; w < num_words; ++w) {
+      words[w] = 0;
+    }
+  }
+
+  // read buffer from local memory
+  std::cout << "read buffer from local memory" << std::endl;
+  RT_CHECK(vx_copy_from_dev(buffer, dev_addr, 64 * num_blocks, 0));
+
+  // verify result
+  std::cout << "verify result" << std::endl;
+  {
+    auto words = (uint64_t*)vx_host_ptr(buffer);
+    for (int w = 0; w < num_words; ++w) {
+      const uint64_t actual = words[w];
+      const uint64_t expected = shuffle(w, value);
+      if (actual == expected)
+        continue;
+      std::cout << "error at 0x" << std::hex << (dev_addr + 8 * w)
+                << ": actual 0x" << actual << ", expected 0x" << expected << std::endl;
+      ++mismatches;
+    }
+  }
+
+  if (mismatches == 0)
+    return 0;
+
+  std::cout << "Found " << std::dec << mismatches << " errors!" << std::endl;
+  std::cout << "FAILED!" << std::endl;
+  return 1;
+}
+
+// Kernel test: uploads an ascending sequence to the kernel's source range,
+// pre-fills the destination range with 0xffffffff, runs the device, flushes
+// the caches, reads the destination back and expects the same ascending
+// sequence. Returns 0 on success and 1 when any value differs.
+int run_kernel_test(const kernel_arg_t& kernel_arg, 
+                    uint32_t buf_size, 
+                    uint32_t num_points) {
+  int mismatches = 0;
+
+  // update source buffer
+  {
+    auto values = (int32_t*)vx_host_ptr(buffer);
+    for (uint32_t p = 0; p != num_points; ++p) {
+      values[p] = p;
+    }
+  }
+  std::cout << "upload source buffer" << std::endl;
+  RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src_ptr, buf_size, 0));
+
+  // clear destination buffer
+  {
+    auto values = (int32_t*)vx_host_ptr(buffer);
+    for (uint32_t p = 0; p != num_points; ++p) {
+      values[p] = 0xffffffff;
+    }
+  }
+  std::cout << "clear destination buffer" << std::endl;
+  RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_ptr, buf_size, 0));
+
+  // start device
+  std::cout << "start device" << std::endl;
+  RT_CHECK(vx_start(device));
+
+  // wait for completion
+  std::cout << "wait for completion" << std::endl;
+  RT_CHECK(vx_ready_wait(device, -1));
+
+  // flush the caches
+  std::cout << "flush the caches" << std::endl;
+  RT_CHECK(vx_flush_caches(device, kernel_arg.dst_ptr, buf_size));
+
+  // read buffer from local memory
+  std::cout << "read buffer from local memory" << std::endl;
+  RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0));
+
+  // verify result
+  std::cout << "verify result" << std::endl;
+  {
+    auto values = (int32_t*)vx_host_ptr(buffer);
+    for (uint32_t p = 0; p != num_points; ++p) {
+      const int32_t actual = values[p];
+      const int32_t expected = p;
+      if (actual == expected)
+        continue;
+      std::cout << "error at value " << p
+                << ": actual 0x" << actual << ", expected 0x" << expected << std::endl;
+      ++mismatches;
+    }
+  }
+
+  if (mismatches == 0)
+    return 0;
+
+  std::cout << "Found " << std::dec << mismatches << " errors!" << std::endl;
+  std::cout << "FAILED!" << std::endl;
+  return 1;
+}
+
+// Test driver: parses the options, opens the device, allocates a source and
+// a destination range in device memory plus one shared host buffer, runs
+// the selected tests (-t 0: memcopy, -t 1: kernel, default: both) and
+// cleans up. Any failing step exits the process through RT_CHECK.
+int main(int argc, char *argv[]) {
+  size_t value; 
+  kernel_arg_t kernel_arg;
+
+  // parse command arguments
+  parse_args(argc, argv);
+
+  // default to one word per core when -n was not given (or was 0)
+  if (count == 0) {
+    count = 1;
+  }
+
+  // size the buffers in whole 64-byte blocks
+  uint32_t max_cores = vx_dev_caps(VX_CAPS_MAX_CORES);
+  uint32_t num_points = max_cores * count;
+  uint32_t num_blocks = (num_points * sizeof(uint32_t) + 63) / 64;
+  uint32_t buf_size = num_blocks * 64;
+
+  std::cout << "number of points: " << num_points << std::endl;
+  std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
+
+  // open device connection
+  std::cout << "open device connection" << std::endl;
+  RT_CHECK(vx_dev_open(&device));
+
+  // allocate device memory
+  RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value));
+  kernel_arg.src_ptr = value;
+  RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value));
+  kernel_arg.dst_ptr = value;
+
+  kernel_arg.count = count;
+
+  // note: std::hex stays in effect on std::cout after these lines
+  std::cout << "dev_src=" << std::hex << kernel_arg.src_ptr << std::endl;
+  std::cout << "dev_dst=" << std::hex << kernel_arg.dst_ptr << std::endl;
+
+  // allocate shared memory  
+  std::cout << "allocate shared memory" << std::endl;
+  uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
+  RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &buffer));
+
+  // run tests  
+  if (0 == test || -1 == test) {
+    std::cout << "run memcopy test" << std::endl;
+    RT_CHECK(run_memcopy_test(kernel_arg.src_ptr, 0x0badf00d00ff00ff, 1));
+    RT_CHECK(run_memcopy_test(kernel_arg.src_ptr, 0x0badf00d40ff40ff, num_blocks));
+  }
+
+  if (1 == test || -1 == test) {
+    // upload program
+    std::cout << "upload program" << std::endl;  
+    RT_CHECK(vx_upload_kernel_file(device, kernel_file));
+
+    // upload kernel argument
+    std::cout << "upload kernel argument" << std::endl;
+    {
+      // stage the argument struct in the shared buffer, then copy it to the
+      // fixed device address the kernel reads it from
+      auto buf_ptr = (void*)vx_host_ptr(buffer);
+      memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
+      RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
+    }
+
+    std::cout << "run kernel test" << std::endl;
+    RT_CHECK(run_kernel_test(kernel_arg, buf_size, num_points));
+  }
+
+  // cleanup
+  std::cout << "cleanup" << std::endl;  
+  cleanup();
+
+  std::cout << "Test PASSED" << std::endl;
+
+  return 0;
+}
--- a/driver/tests/basic/common.h
+++ b/driver/tests/basic/common.h
@@ -0,0 +1,12 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+// uint32_t is used below; include it here so that the header does not
+// depend on what the including file happened to include before it.
+#include <stdint.h>
+
+// Device address at which the host stores the kernel argument block and
+// from which the kernel reads it.
+#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
+
+// Argument block shared between the host test and the device kernel.
+struct kernel_arg_t {
+  uint32_t count;    // number of words handled by each core
+  uint32_t src_ptr;  // device address of the source range
+  uint32_t dst_ptr;  // device address of the destination range
+};
+
+#endif
--- a/driver/tests/basic/kernel.bin
+++ b/driver/tests/basic/kernel.bin
--- a/driver/tests/basic/kernel.c
+++ b/driver/tests/basic/kernel.c
@@ -0,0 +1,17 @@
+#include <stdint.h>
+#include <VX_config.h>
+#include "intrinsics/vx_intrinsics.h"
+#include "common.h"
+
+// Kernel entry point: reads the argument block from its fixed address and
+// copies `count` words from the source range to the destination range,
+// starting at an offset derived from this core's id.
+void main() {
+	struct kernel_arg_t* args = (struct kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
+	uint32_t num_words = args->count;
+	int32_t* src = (int32_t*)args->src_ptr;
+	int32_t* dst = (int32_t*)args->dst_ptr;
+
+	// each core works on its own slice of num_words elements
+	uint32_t first = vx_core_id() * num_words;
+	uint32_t last  = first + num_words;
+
+	for (uint32_t k = first; k != last; ++k) {
+		dst[k] = src[k];
+	}
+}
--- a/driver/tests/demo/Makefile
+++ b/driver/tests/demo/Makefile
@@ -0,0 +1,66 @@
+RISCV_TOOL_PATH ?= $(wildcard ~/dev/riscv-gnu-toolchain/drops)
+VX_RT_PATH ?= $(wildcard ../../../runtime)
+
+VX_CC  = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
+VX_CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
+VX_DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
+VX_CPY = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
+
+#VX_NEWLIB = $(VX_RT_PATH)/newlib/newlib.c
+VX_STR = $(VX_RT_PATH)/startup/vx_start.S
+VX_INT = $(VX_RT_PATH)/intrinsics/vx_intrinsics.S
+#VX_IO  = $(VX_RT_PATH)/io/vx_io.S $(VX_RT_PATH)/io/vx_io.c
+VX_API = $(VX_RT_PATH)/vx_api/vx_api.c
+#VX_FIO = $(VX_RT_PATH)/fileio/fileio.S
+
+VX_CFLAGS = -march=rv32im -mabi=ilp32 -O3 -Wl,-Bstatic,-T,$(VX_RT_PATH)/startup/vx_link.ld -ffreestanding -nostartfiles -Wl,--gc-sections
+VX_CFLAGS += -I../../../hw 
+
+VX_SRCS = kernel.c
+
+CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors
+
+CXXFLAGS += -I../../include
+
+PROJECT = demo
+
+SRCS = demo.cpp
+
+all: $(PROJECT) kernel.bin kernel.dump
+ 
+kernel.dump:  kernel.elf
+	$(VX_DMP) -D  kernel.elf > kernel.dump
+
+kernel.bin:  kernel.elf
+	$(VX_CPY) -O binary kernel.elf kernel.bin
+
+# The device kernel is built from $(VX_SRCS); depend on those files so that
+# editing the kernel source triggers a rebuild ($(SRCS) is the host program).
+kernel.elf: $(VX_SRCS)
+	$(VX_CC) $(VX_CFLAGS) $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_SRCS) -I$(VX_RT_PATH) -o kernel.elf
+
+$(PROJECT): $(SRCS)
+	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -L../../stub -lvortex -o $@
+
+run-fpga: $(PROJECT)
+	LD_LIBRARY_PATH=../../opae:$(LD_LIBRARY_PATH) ./$(PROJECT) -n 16
+
+run-ase: $(PROJECT)
+	ASE_LOG=0 LD_LIBRARY_PATH=../../opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) -n 16
+
+run-rtlsim: $(PROJECT)
+	LD_LIBRARY_PATH=../../rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) -n 16
+	
+run-simx: $(PROJECT)
+	LD_LIBRARY_PATH=../../simx:$(LD_LIBRARY_PATH) ./$(PROJECT) -n 16
+
+.depend: $(SRCS)
+	$(CXX) $(CXXFLAGS) -MM $^ > .depend;
+
+clean:
+	rm -rf $(PROJECT) *.o .depend
+
+clean-all:
+	rm -rf $(PROJECT) *.o *.elf *.bin *.dump .depend
+
+ifneq ($(MAKECMDGOALS),clean)
+    -include .depend
+endif
--- a/driver/tests/demo/common.h
+++ b/driver/tests/demo/common.h
@@ -0,0 +1,13 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+// uint32_t is used below; include it here so this header does not depend on
+// what the including file happened to include first.
+#include <stdint.h>
+
+// Device address at which the host writes the kernel_arg_t block and from
+// which the kernel reads it.
+#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
+
+// Argument block shared between the host program and the device kernel.
+struct kernel_arg_t {
+  uint32_t count;     // number of elements each thread processes
+  uint32_t src0_ptr;  // device address of the first source buffer
+  uint32_t src1_ptr;  // device address of the second source buffer
+  uint32_t dst_ptr;   // device address of the destination buffer
+};
+
+#endif
--- a/driver/tests/demo/demo.cpp
+++ b/driver/tests/demo/demo.cpp
@@ -0,0 +1,201 @@
+#include <algorithm>
+#include <cerrno>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <unistd.h>
+#include <string.h>
+#include <vortex.h>
+#include "common.h"
+
+// Evaluates a runtime call that returns 0 on success. For any other result
+// the failing expression and its value are printed, cleanup() is called to
+// release the global buffer/device handles, and the process exits with -1.
+// The argument is parenthesized so any expression can be passed safely.
+#define RT_CHECK(_expr)                                         \
+   do {                                                         \
+     int _ret = (_expr);                                        \
+     if (0 != _ret) {                                           \
+       printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
+       cleanup();                                               \
+       exit(-1);                                                \
+     }                                                          \
+   } while (false)
+
+// Path of the kernel image uploaded to the device; replaced by the -k option.
+const char* kernel_file = "kernel.bin";
+// Value of the -n option; main() substitutes 1 when it is still 0.
+uint32_t count = 0;
+
+// Handles released by cleanup(); they stay null until main() acquires them.
+vx_device_h device = nullptr;
+vx_buffer_h buffer = nullptr;
+
+// Writes the program banner and the accepted command-line options to stdout.
+static void show_usage() {
+  std::cout << "Vortex Driver Test." << std::endl
+            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
+}
+
+// Parses the command line into the globals `count` (-n) and `kernel_file`
+// (-k). -h and -? print the usage text and exit with status 0; any other
+// unexpected getopt() result prints the usage text and exits with -1.
+//
+// The -n value must be a plain decimal number that fits in 32 bits. Anything
+// else is reported and ends the program with -1; the previous atoi() call
+// silently turned such input into 0 or a wrapped unsigned value.
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
+    switch (c) {
+    case 'n': {
+      char* end = nullptr;
+      errno = 0;
+      unsigned long value = strtoul(optarg, &end, 10);
+      // strtoul() accepts a leading '-' and negates the result, so a minus
+      // sign has to be rejected explicitly.
+      bool valid = (end != optarg) && ('\0' == *end) && (0 == errno)
+                && (nullptr == strchr(optarg, '-'))
+                && (value <= 0xfffffffful);
+      if (!valid) {
+        std::cout << "invalid value for -n: " << optarg << std::endl;
+        show_usage();
+        exit(-1);
+      }
+      count = (uint32_t)value;
+    } break;
+    case 'k':
+      kernel_file = optarg;
+      break;
+    case 'h':
+    case '?': {
+      show_usage();
+      exit(0);
+    } break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+}
+
+// Releases the shared buffer and closes the device if they were acquired.
+// Each handle is reset to null afterwards, so calling cleanup() again is a
+// no-op instead of releasing the same handle twice.
+void cleanup() {
+  if (buffer) {
+    vx_buf_release(buffer);
+    buffer = nullptr;
+  }
+  if (device) {
+    vx_dev_close(device);
+    device = nullptr;
+  }
+}
+
+// Starts the device, waits for it to finish, copies the destination buffer
+// back through the global shared `buffer` and compares it with the expected
+// values.
+//
+//   kernel_arg  argument block; only dst_ptr is used here
+//   buf_size    size in bytes of the destination buffer
+//   num_points  number of 32-bit elements to verify
+//
+// Returns 0 when every element matches and 1 when mismatches were found.
+// A failing runtime call ends the process through RT_CHECK.
+int run_test(const kernel_arg_t& kernel_arg,
+             uint32_t buf_size, 
+             uint32_t num_points) {
+  // start device
+  std::cout << "start device" << std::endl;
+  RT_CHECK(vx_start(device));
+
+  // wait for completion
+  std::cout << "wait for completion" << std::endl;
+  RT_CHECK(vx_ready_wait(device, -1));
+
+  // flush the destination buffer caches
+  std::cout << "flush the destination buffer caches" << std::endl;
+  RT_CHECK(vx_flush_caches(device, kernel_arg.dst_ptr, buf_size));
+
+  // download destination buffer
+  std::cout << "download destination buffer" << std::endl;
+  RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0));
+
+  // verify result
+  std::cout << "verify result" << std::endl;  
+  {
+    int errors = 0;
+    auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
+    for (uint32_t i = 0; i < num_points; ++i) {
+      // main() fills the sources with i-1 and i+1, so the sum is 2*i.
+      int ref = i + i; 
+      int cur = buf_ptr[i];
+      if (cur != ref) {
+        // Select the number base explicitly: the values follow a "0x" prefix
+        // and must be hexadecimal, the index is meant to be decimal. Relying
+        // on whatever base the stream was left in made both ambiguous.
+        std::cout << "error at value " << std::dec << i
+                  << ": actual 0x" << std::hex << cur
+                  << ", expected 0x" << ref << std::dec << std::endl;
+        ++errors;
+      }
+    }
+    if (errors != 0) {
+      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
+      std::cout << "FAILED!" << std::endl;
+      return 1;  
+    }
+  }
+
+  return 0;
+}
+
+// Host program of the demo. It sizes the problem from -n and the device
+// capabilities, opens the device, uploads the kernel image, allocates three
+// device buffers, uploads the argument block and the source data, runs the
+// kernel through run_test() and releases the handles.
+//
+// Returns 0 on success and -1 when the requested size does not fit in
+// 32 bits. Any failing runtime call or a failed verification ends the
+// process through RT_CHECK.
+int main(int argc, char *argv[]) {
+  size_t value; 
+  kernel_arg_t kernel_arg;
+  
+  // parse command arguments
+  parse_args(argc, argv);
+
+  if (count == 0) {
+    count = 1;
+  }
+
+  uint32_t max_cores   = vx_dev_caps(VX_CAPS_MAX_CORES);
+  uint32_t max_warps   = vx_dev_caps(VX_CAPS_MAX_WARPS);
+  uint32_t max_threads = vx_dev_caps(VX_CAPS_MAX_THREADS);
+
+  // Compute the sizes in 64 bits first so that a large -n is reported
+  // instead of silently wrapping the 32-bit element count and byte size.
+  uint64_t total_points = (uint64_t)count * max_cores * max_warps * max_threads;
+  uint64_t total_bytes  = total_points * sizeof(uint32_t);
+  if (total_bytes > 0xffffffffull) {
+    std::cout << "Error: buffer size of " << total_bytes
+              << " bytes does not fit in 32 bits" << std::endl;
+    return -1;
+  }
+
+  uint32_t num_points = (uint32_t)total_points;
+  uint32_t buf_size = (uint32_t)total_bytes;
+
+  std::cout << "number of points: " << num_points << std::endl;
+  std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
+
+  // open device connection
+  std::cout << "open device connection" << std::endl;  
+  RT_CHECK(vx_dev_open(&device));
+
+  // upload program
+  std::cout << "upload program" << std::endl;  
+  RT_CHECK(vx_upload_kernel_file(device, kernel_file));
+
+  // allocate device memory
+  std::cout << "allocate device memory" << std::endl;  
+
+  RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value));
+  kernel_arg.src0_ptr = value;
+  RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value));
+  kernel_arg.src1_ptr = value;
+  RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value));
+  kernel_arg.dst_ptr = value;
+
+  kernel_arg.count = count;
+
+  // std::hex is sticky, so switch back to decimal after the addresses;
+  // otherwise every later number written to std::cout would be hexadecimal.
+  std::cout << "dev_src0=" << std::hex << kernel_arg.src0_ptr << std::endl;
+  std::cout << "dev_src1=" << std::hex << kernel_arg.src1_ptr << std::endl;
+  std::cout << "dev_dst=" << std::hex << kernel_arg.dst_ptr << std::dec << std::endl;
+  
+  // allocate shared memory  
+  // The same staging buffer carries the argument block and the data buffers,
+  // so it must be large enough for the bigger of the two.
+  std::cout << "allocate shared memory" << std::endl;    
+  uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
+  RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &buffer));
+  
+  // upload kernel argument
+  std::cout << "upload kernel argument" << std::endl;
+  {
+    auto buf_ptr = (int*)vx_host_ptr(buffer);
+    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
+    RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
+  }
+
+  // upload source buffer0
+  {
+    auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
+    for (uint32_t i = 0; i < num_points; ++i) {
+      buf_ptr[i] = i-1;
+    }
+  }
+  std::cout << "upload source buffer0" << std::endl;      
+  RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src0_ptr, buf_size, 0));
+
+  // upload source buffer1
+  {
+    auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
+    for (uint32_t i = 0; i < num_points; ++i) {
+      buf_ptr[i] = i+1;
+    }
+  }
+  std::cout << "upload source buffer1" << std::endl;      
+  RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src1_ptr, buf_size, 0));
+
+  // clear destination buffer
+  // Every element is set to all ones so that stale data cannot be mistaken
+  // for a kernel result.
+  {
+    auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
+    for (uint32_t i = 0; i < num_points; ++i) {
+      buf_ptr[i] = 0xffffffff;
+    }
+  }
+  std::cout << "clear destination buffer" << std::endl;      
+  RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_ptr, buf_size, 0));  
+
+  // run tests
+  std::cout << "run tests" << std::endl;
+  RT_CHECK(run_test(kernel_arg, buf_size, num_points));
+
+  // cleanup
+  std::cout << "cleanup" << std::endl;  
+  cleanup();
+
+  std::cout << "PASSED!" << std::endl;
+
+  return 0;
+}
--- a/driver/tests/demo/kernel.bin
+++ b/driver/tests/demo/kernel.bin
--- a/driver/tests/demo/kernel.c
+++ b/driver/tests/demo/kernel.c
@@ -0,0 +1,30 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include "intrinsics/vx_intrinsics.h"
+#include "vx_api/vx_api.h"
+#include "common.h"
+
+// Work done by one thread: adds `count` consecutive elements of the two
+// source buffers into the destination buffer, starting at the element index
+// vx_thread_gid() * count. `arg` points to a struct kernel_arg_t.
+void kernel_body(void* arg) {
+	const struct kernel_arg_t* params = (const struct kernel_arg_t*)arg;
+	const uint32_t count = params->count;
+	const uint32_t first = vx_thread_gid() * count;
+
+	// Position all three cursors at this thread's slice.
+	const int32_t* lhs = (const int32_t*)params->src0_ptr + first;
+	const int32_t* rhs = (const int32_t*)params->src1_ptr + first;
+	int32_t* out       = (int32_t*)params->dst_ptr + first;
+
+	for (uint32_t remaining = count; remaining != 0; --remaining) {
+		*out++ = *lhs++ + *rhs++;
+	}
+}
+
+// Kernel entry point: takes the argument block from the fixed address
+// KERNEL_ARG_DEV_MEM_ADDR and hands it, together with kernel_body, to
+// vx_spawn_warps() using the counts reported by vx_num_warps() and
+// vx_num_threads().
+void main() {
+	struct kernel_arg_t* arg = (struct kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
+	/* Debug dump of the argument block (disabled):
+	printf("count=%d\n", arg->count);
+	printf("src0_ptr=0x%x\n", arg->src0_ptr);
+	printf("src1_ptr=0x%x\n", arg->src1_ptr);
+	printf("dst_ptr=0x%x\n", arg->dst_ptr);*/
+	int num_warps = vx_num_warps();
+	int num_threads = vx_num_threads();
+	vx_spawn_warps(num_warps, num_threads, kernel_body, arg);
+}
--- a/driver/tests/demo/kernel.elf
+++ b/driver/tests/demo/kernel.elf
--- a/driver/tests/dogfood/Memcpy/hw/rtl/_hdr
+++ b/driver/tests/dogfood/Memcpy/hw/rtl/_hdr
@@ -0,0 +1,603 @@
+//
+// Copyright (c) 2017, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// Redistributions of source code must retain the above copyright notice, this
+// list of conditions and the following disclaimer.
+//
+// Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// Neither the name of the Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+
+// Read from the memory locations first and then write to the memory locations
+
+`include "platform_if.vh"
+`include "afu_json_info.vh"
+
+
+module ccip_std_afu
+   (
+    // CCI-P Clocks and Resets
+    input           logic             pClk,              // 400MHz - CCI-P clock domain. Primary interface clock
+    input           logic             pClkDiv2,          // 200MHz - CCI-P clock domain.
+    input           logic             pClkDiv4,          // 100MHz - CCI-P clock domain.
+    input           logic             uClk_usr,          // User clock domain. Refer to clock programming guide  ** Currently provides fixed 300MHz clock **
+    input           logic             uClk_usrDiv2,      // User clock domain. Half the programmed frequency  ** Currently provides fixed 150MHz clock **
+    input           logic             pck_cp2af_softReset,      // CCI-P ACTIVE HIGH Soft Reset
+    input           logic [1:0]       pck_cp2af_pwrState,       // CCI-P AFU Power State
+    input           logic             pck_cp2af_error,          // CCI-P Protocol Error Detected
+
+    // Interface structures
+    input           t_if_ccip_Rx      pck_cp2af_sRx,        // CCI-P Rx Port
+    output          t_if_ccip_Tx      pck_af2cp_sTx         // CCI-P Tx Port
+    );
+
+
+    //
+    // Run the entire design at the standard CCI-P frequency (400 MHz).
+    //
+    logic clk;
+    assign clk = pClk;
+
+    logic reset;
+    assign reset = pck_cp2af_softReset;
+
+    logic [511:0] wr_data;
+    logic [511:0] rd_data;
+
+    logic get_write_addr;
+    logic do_update;
+    logic rd_end_of_list;
+    logic rd_needed;
+    logic wr_needed;
+    logic [15:0] cnt_list_length;
+
+    // =========================================================================
+    //
+    //   Register requests.
+    //
+    // =========================================================================
+
+    //
+    // The incoming pck_cp2af_sRx and outgoing pck_af2cp_sTx must both be
+    // registered.  Here we register pck_cp2af_sRx and assign it to sRx.
+    // We also assign pck_af2cp_sTx to sTx here but don't register it.
+    // The code below never uses combinational logic to write sTx.
+    //
+
+    t_if_ccip_Rx sRx;
+    always_ff @(posedge clk)
+    begin
+        sRx <= pck_cp2af_sRx;
+    end
+
+    t_if_ccip_Tx sTx;
+    assign pck_af2cp_sTx = sTx;
+
+
+    // =========================================================================
+    //
+    //   CSR (MMIO) handling.
+    //
+    // =========================================================================
+
+    // The AFU ID is a unique ID for a given program.  Here we generated
+    // one with the "uuidgen" program and stored it in the AFU's JSON file.
+    // ASE and synthesis setup scripts automatically invoke afu_json_mgr
+    // to extract the UUID into afu_json_info.vh.
+    logic [127:0] afu_id = `AFU_ACCEL_UUID;
+
+    //
+    // A valid AFU must implement a device feature list, starting at MMIO
+    // address 0.  Every entry in the feature list begins with 5 64-bit
+    // words: a device feature header, two AFU UUID words and two reserved
+    // words.
+    //
+
+    // Is a CSR read request active this cycle?
+    logic is_csr_read;
+    assign is_csr_read = sRx.c0.mmioRdValid;
+
+    // Is a CSR write request active this cycle?
+    logic is_csr_write;
+    assign is_csr_write = sRx.c0.mmioWrValid;
+
+    // The MMIO request header is overlayed on the normal c0 memory read
+    // response data structure.  Cast the c0Rx header to an MMIO request
+    // header.
+    t_ccip_c0_ReqMmioHdr mmio_req_hdr;
+    assign mmio_req_hdr = t_ccip_c0_ReqMmioHdr'(sRx.c0.hdr);
+
+
+    //
+    // Implement the device feature list by responding to MMIO reads.
+    //
+
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+        begin
+            sTx.c2.mmioRdValid <= 1'b0;
+        end
+        else
+        begin
+            // Always respond with something for every read request
+            sTx.c2.mmioRdValid <= is_csr_read;
+
+            // The unique transaction ID matches responses to requests
+            sTx.c2.hdr.tid <= mmio_req_hdr.tid;
+
+            // Addresses are of 32-bit objects in MMIO space.  Addresses
+            // of 64-bit objects are thus multiples of 2.
+            case (mmio_req_hdr.address)
+              0: // AFU DFH (device feature header)
+                begin
+                    // Here we define a trivial feature list.  In this
+                    // example, our AFU is the only entry in this list.
+                    sTx.c2.data <= t_ccip_mmioData'(0);
+                    // Feature type is AFU
+                    sTx.c2.data[63:60] <= 4'h1;
+                    // End of list (last entry in list)
+                    sTx.c2.data[40] <= 1'b1;
+                end
+
+              // AFU_ID_L
+              2: sTx.c2.data <= afu_id[63:0];
+
+              // AFU_ID_H
+              4: sTx.c2.data <= afu_id[127:64];
+
+              // DFH_RSVD0
+              6: sTx.c2.data <= t_ccip_mmioData'(0);
+
+              // DFH_RSVD1
+              8: sTx.c2.data <= t_ccip_mmioData'(0);
+
+              default: sTx.c2.data <= t_ccip_mmioData'(0);
+            endcase
+        end
+    end
+
+
+    //
+    // CSR write handling.  Host software must tell the AFU the memory address
+    // to which it should be writing.  The address is set by writing a CSR.
+    //
+
+    // We use MMIO address 0 to set the memory address.  The read and
+    // write MMIO spaces are logically separate so we are free to use
+    // whatever we like.  This may not be good practice for cleanly
+    // organizing the MMIO address space, but it is legal.
+    logic is_mem_addr_csr_write;
+    assign is_mem_addr_csr_write = get_write_addr && is_csr_write &&
+                                   (mmio_req_hdr.address == t_ccip_mmioAddr'(0));
+
+    // Memory address to which this AFU will write.
+    t_ccip_clAddr write_mem_addr;
+
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+        begin
+            get_write_addr <= 1'b1;
+        end
+	else if (is_mem_addr_csr_write)
+        begin
+            write_mem_addr <= t_ccip_clAddr'(sRx.c0.data);
+	    get_write_addr <= 1'b0;
+        end
+    end
+    
+
+    // We use MMIO address 0 to set the memory address for reading data.
+    logic is_mem_addr_csr_read;
+    assign is_mem_addr_csr_read = !get_write_addr && is_csr_write &&
+                                   (mmio_req_hdr.address == t_ccip_mmioAddr'(0));
+
+    // Memory address from which this AFU will read.
+    logic start_read;
+    t_ccip_clAddr read_mem_addr;
+
+    //logic start_traversal = 'b0;
+    //t_ccip_clAddr start_traversal_addr;
+
+    //
+    // States of the AFU state machine.  They are declared ahead of the
+    // start_read register because that register is cleared from STATE_READ.
+    //
+    typedef enum logic [1:0]
+    {
+        STATE_IDLE,
+        STATE_READ,
+        STATE_UPDATE,
+        STATE_WRITE
+    }
+    t_state;
+
+    t_state state;
+
+    //
+    // Read-address capture and start_read control.
+    //
+    // start_read is driven from this block only.  It used to be set here and
+    // cleared inside the state machine, i.e. assigned from two always_ff
+    // procedures, which is not allowed for a variable written by always_ff.
+    // The clear condition is the one the state machine applied before.
+    //
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+        begin
+            start_read <= 1'b0;
+        end
+        else if (is_mem_addr_csr_read)
+        begin
+            read_mem_addr <= t_ccip_clAddr'(sRx.c0.data);
+            start_read <= 1'b1;
+        end
+        else if ((state == STATE_READ) && rd_needed)
+        begin
+            // The read has been picked up by the request logic.
+            start_read <= 1'b0;
+        end
+    end
+
+
+    // =========================================================================
+    //
+    //   Main AFU logic
+    //
+    // =========================================================================
+
+    //
+    // State machine
+    //
+    // rd_end_of_list is not reset here: the read-address logic assigns it on
+    // every clock from cnt_list_length, and a second assignment from this
+    // block would give the register two drivers.
+    //
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+        begin
+            state <= STATE_IDLE;
+        end
+        else
+        begin
+            case (state)
+              STATE_IDLE:
+                begin
+                    // Traversal begins once the read address has been
+                    // written, which sets start_read.
+                    if (start_read)
+                    begin
+                        state <= STATE_READ;
+                        $display("AFU starting traversal at 0x%x", t_ccip_clAddr'(read_mem_addr));
+                    end
+                end
+
+              STATE_READ:
+                begin
+                    // Move on once a read request is pending.
+                    if (rd_needed)
+                    begin
+                        state <= STATE_UPDATE;
+                        $display("AFU reading data and pointing to next read address...");
+                    end
+                end
+
+              STATE_UPDATE:
+                begin
+                    // Update the read value to be written back
+                    if (do_update)
+                    begin
+                        state <= STATE_WRITE;
+                        $display("AFU performing comutations on the read values...");
+                    end
+                end
+
+              STATE_WRITE:
+                begin
+                    // Return to IDLE at the end of the list; otherwise go
+                    // back to READ once a write is pending.
+                    if (rd_end_of_list)
+                    begin
+                        state <= STATE_IDLE;
+                        $display("AFU done...");
+                    end
+                    else if (wr_needed)
+                    begin
+                        state <= STATE_READ;
+                        $display("AFU reading again from read address...");
+                    end
+                end
+            endcase
+        end
+    end
+
+
+    // =========================================================================
+    //
+    //   Read logic.
+    //
+    // =========================================================================
+
+    //
+    // READ REQUEST
+    //
+
+    // Did a write response just arrive
+    logic addr_next_valid;
+
+    // Next read address
+    t_ccip_clAddr addr_next;
+
+    always_ff @(posedge clk)
+    begin
+	// Next read address is valid when we have got the write response back
+	// and channel is not full
+        //addr_next_valid <= sRx.c0TxAlmFull; 
+        addr_next_valid <= sRx.c1.rspValid;
+
+        // Next address is current address plus address length
+	// Apurve 
+        //addr_next <= addr_next + addr_size;
+        addr_next <= addr_next + 0;
+
+        // End of list is flagged when the read-request count equals 'h10,
+        // i.e. 16 requests (not 10)
+        rd_end_of_list <= (cnt_list_length == 'h10);
+    end
+
+    //
+    // Since back pressure may prevent an immediate read request, we must
+    // record whether a read is needed and hold it until the request can
+    // be sent to the FIU.
+    //
+    t_ccip_clAddr rd_addr;
+
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+        begin
+            rd_needed <= 1'b0;
+        end
+        else
+        begin
+            // If reads are allowed this cycle then we can safely clear
+            // any previously requested reads.  This simple AFU has only
+            // one read in flight at a time since it is walking a pointer
+            // chain.
+            if (rd_needed)
+            begin
+                rd_needed <= sRx.c0TxAlmFull;
+            end
+            else
+            begin
+                // Need a read under two conditions:
+                //   - Starting a new walk
+                //   - A read response just arrived from a line containing
+                //     a next pointer.
+                rd_needed <= (start_read || (addr_next_valid && ! rd_end_of_list));
+                rd_addr <= (start_read ? read_mem_addr : addr_next);
+            end
+        end
+    end
+
+    //
+    // Emit read requests to the FIU.
+    //
+
+    // Read header defines the request to the FIU
+    t_cci_c0_ReqMemHdr rd_hdr;
+
+    always_comb
+    begin
+        rd_hdr = t_cci_c0_ReqMemHdr'(0);
+
+        // Read request type
+        rd_hdr.req_type = eREQ_RDLINE_I;
+        // Virtual address (MPF virtual addressing is enabled)
+        rd_hdr.address = rd_addr;
+        // Let the FIU pick the channel
+        rd_hdr.vc_sel = eVC_VA;
+        // Read 4 lines (the size of an entry in the list)
+        rd_hdr.cl_len = eCL_LEN_4;
+    end
+
+    // Send read requests to the FIU
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+        begin
+            sTx.c0.valid <= 1'b0;
+            cnt_list_length <= 0;
+        end
+        else
+        begin
+            // Generate a read request when needed and the FIU isn't full
+            sTx.c0.valid <= (rd_needed && ! sRx.c0TxAlmFull);
+            sTx.c0.hdr <= rd_hdr;
+
+            if (rd_needed && ! sRx.c0TxAlmFull)
+            begin
+                cnt_list_length <= cnt_list_length + 1;
+                //$display("  Reading from VA 0x%x", clAddrToByteAddr(rd_addr));
+                $display("Incrementing read count...");
+            end
+        end
+    end
+
+    //
+    // READ RESPONSE HANDLING
+    //
+
+    //
+    // Receive data (read responses).
+    //
+    always_ff @(posedge clk)
+    begin
+	if (reset)
+	begin
+            do_update <= 1'b0;
+        end
+	else
+	begin
+	    if (state == STATE_READ)
+	    begin
+                rd_data <= sRx.c0.data;
+                do_update <= 1'b1;
+            end
+	    if (state == STATE_UPDATE)
+	    begin
+	        // Update the read data and put it in the write data to be written
+                wr_data <= rd_data + 1;
+                do_update <= 1'b0;
+            end
+        end
+    end
+
+
+    // =========================================================================
+    //
+    //   Write logic.
+    //
+    // =========================================================================
+
+
+    //
+    // WRITE REQUEST
+    //
+
+    // Did a write response just arrive
+    logic wr_addr_next_valid;
+
+    // Next write address
+    t_ccip_clAddr wr_addr_next;
+
+    always_ff @(posedge clk)
+    begin
+        // Next write address is valid when we have got the read response back
+        // and channel is not full
+        //wr_addr_next_valid <= sRx.c1TxAlmFull; 
+        wr_addr_next_valid <= sRx.c0.rspValid;
+
+        // Next address is current address plus address length
+        // Apurve 
+        //wr_addr_next <= wr_addr_next + addr_size;
+        wr_addr_next <= wr_addr_next + 0;
+    end
+
+    //
+    // Since back pressure may prevent an immediate write request, we must
+    // record whether a write is needed and hold it until the request can
+    // be sent to the FIU.
+    //
+    t_ccip_clAddr wr_addr;
+
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+        begin
+            wr_needed <= 1'b0;
+        end
+        else
+        begin
+            // If writes are allowed this cycle then we can safely clear
+            // any previously requested writes.  This simple AFU has only
+            // one write in flight at a time since it is walking a pointer
+            // chain.
+            if (wr_needed)
+            begin
+                wr_needed <= sRx.c1TxAlmFull;
+            end
+            else
+            begin
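+                // Need a write under two conditions:
+                //   - start_write is asserted
+                //     NOTE(review): start_write is not declared or driven
+                //     anywhere in this module - confirm its intended source.
+                //   - wr_addr_next_valid is set, i.e. a read response
+                //     (sRx.c0.rspValid) arrived in the previous cycle.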
+                //wr_needed <= (start_write || (wr_addr_next_valid && ! rd_end_of_list));
+                wr_needed <= (start_write || wr_addr_next_valid);
+                wr_addr <= (start_write ? write_mem_addr : wr_addr_next);
+            end
+        end
+    end
+
+    //
+    // Emit write requests to the FIU.
+    //
+
+    // Write header defines the request to the FIU
+    t_ccip_c1_ReqMemHdr wr_hdr;
+
+    always_comb
+    begin
+        wr_hdr = t_cci_c1_ReqMemHdr'(0);
+
+        // Write request type.  This header is sent on channel 1 (sTx.c1),
+        // so it needs a write opcode; the read opcode eREQ_RDLINE_I was
+        // used here before.
+        wr_hdr.req_type = eREQ_WRLINE_I;
+        // Address of the line to write
+        wr_hdr.address = wr_addr;
+        // Let the FIU pick the channel
+        wr_hdr.vc_sel = eVC_VA;
+        // Request length of 4 lines
+        // NOTE(review): only one data beat (wr_data) is sent per request
+        // while sop is always set - confirm whether eCL_LEN_1 was intended.
+        wr_hdr.cl_len = eCL_LEN_4;
+        // Start of packet
+        wr_hdr.sop = 1'b1;
+    end
+
+    // Send write requests to the FIU
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+        begin
+            sTx.c1.valid <= 1'b0;
+            //cnt_list_length <= 0;
+        end
+        else
+        begin
+            // Generate a write request when needed and the FIU isn't full
+            sTx.c1.valid <= (wr_needed && ! sRx.c1TxAlmFull);
+            sTx.c1.hdr <= wr_hdr;
+            // Non-blocking like the other registers in this block; the
+            // blocking "=" used before does not belong in always_ff.
+            sTx.c1.data <= t_ccip_clData'(wr_data);
+
+            //if (wr_needed && ! sRx.c1TxAlmFull)
+            //begin
+            //    cnt_list_length <= cnt_list_length + 1;
+            //    //$display("  Writing from VA 0x%x", clAddrToByteAddr(rd_addr));
+            //    $display("Incrementing write count...");
+            //end
+        end
+    end
+
+    //
+    // WRITE RESPONSE HANDLING
+    //
+
+    // Apurve: Check if a signal is to be sent to read to start reading in case
+    // write response does not work
+    //
+    // Send data (write requests).
+    //
+    //always_ff @(posedge clk)
+    //begin
+    //    if (state == STATE_WRITE)
+    //    begin
+    //        rd_data <= sRx.c0.data;
+    //    end
+    //    if (state == STATE_UPDATE)
+    //    begin
+    //        // Update the write data and put it in the write data to be written
+    //        wr_data <= rd_data + 1;
+    //    end
+    //end
+
+endmodule
--- a/driver/tests/dogfood/Memcpy/hw/rtl/cci_hello.json
+++ b/driver/tests/dogfood/Memcpy/hw/rtl/cci_hello.json
@@ -0,0 +1,18 @@
+{
+   "version": 1,
+   "afu-image": {
+      "power": 0,
+      "afu-top-interface":
+         {
+            "name": "ccip_std_afu"
+         },
+      "accelerator-clusters":
+         [
+            {
+               "name": "cci_hello",
+               "total-contexts": 1,
+               "accelerator-type-uuid": "c6aa954a-9b91-4a37-abc1-1d9f0709dcc3"
+            }
+         ]
+   }
+}
--- a/driver/tests/dogfood/Memcpy/hw/rtl/cci_hello_afu.sv
+++ b/driver/tests/dogfood/Memcpy/hw/rtl/cci_hello_afu.sv
@@ -0,0 +1,653 @@
+//
+// Copyright (c) 2017, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// Redistributions of source code must retain the above copyright notice, this
+// list of conditions and the following disclaimer.
+//
+// Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// Neither the name of the Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+
+// Read from the memory locations first and then write to the memory locations
+
+`include "platform_if.vh"
+`include "afu_json_info.vh"
+
+
+module ccip_std_afu
+   (
+    // CCI-P Clocks and Resets
+    input           logic             pClk,              // 400MHz - CCI-P clock domain. Primary interface clock
+    input           logic             pClkDiv2,          // 200MHz - CCI-P clock domain.
+    input           logic             pClkDiv4,          // 100MHz - CCI-P clock domain.
+    input           logic             uClk_usr,          // User clock domain. Refer to clock programming guide  ** Currently provides fixed 300MHz clock **
+    input           logic             uClk_usrDiv2,      // User clock domain. Half the programmed frequency  ** Currently provides fixed 150MHz clock **
+    input           logic             pck_cp2af_softReset,      // CCI-P ACTIVE HIGH Soft Reset
+    input           logic [1:0]       pck_cp2af_pwrState,       // CCI-P AFU Power State
+    input           logic             pck_cp2af_error,          // CCI-P Protocol Error Detected
+
+    // Interface structures
+    input           t_if_ccip_Rx      pck_cp2af_sRx,        // CCI-P Rx Port
+    output          t_if_ccip_Tx      pck_af2cp_sTx         // CCI-P Tx Port
+    );
+
+
+    //
+    // Run the entire design at the standard CCI-P frequency (400 MHz).
+    //
+    logic clk;
+    assign clk = pClk;
+
+    logic reset;
+    assign reset = pck_cp2af_softReset;
+
+    logic [511:0] wr_data;
+    logic [511:0] rd_data;
+
+    logic do_update;
+    logic start_read;
+    logic start_write;
+    logic wr_addr_next_valid;
+    logic addr_next_valid;
+    logic rd_end_of_list;
+    logic rd_needed;
+    logic wr_needed;
+    logic read_req;
+    logic write_req;
+    logic [15:0] cnt_list_length;
+    t_ccip_clAddr rd_addr;
+    t_ccip_clAddr wr_addr;
+    t_ccip_clAddr addr_next;
+    t_ccip_clAddr wr_addr_next;
+
+    // =========================================================================
+    //
+    //   Register requests.
+    //
+    // =========================================================================
+
+    //
+    // The incoming pck_cp2af_sRx and outgoing pck_af2cp_sTx must both be
+    // registered.  Here we register pck_cp2af_sRx and assign it to sRx.
+    // We also assign pck_af2cp_sTx to sTx here but don't register it.
+    // The code below never uses combinational logic to write sTx.
+    //
+
+    // Rx port sampled on every rising clk edge (no reset term).
+    t_if_ccip_Rx sRx;
+    always_ff @(posedge clk)
+        sRx <= pck_cp2af_sRx;
+
+    // Tx port image, forwarded to the module output by continuous assignment.
+    t_if_ccip_Tx sTx;
+    assign pck_af2cp_sTx = sTx;
+
+
+    // =========================================================================
+    //
+    //   CSR (MMIO) handling.
+    //
+    // =========================================================================
+
+    // The AFU ID is a unique ID for a given program.  Here we generated
+    // one with the "uuidgen" program and stored it in the AFU's JSON file.
+    // ASE and synthesis setup scripts automatically invoke afu_json_mgr
+    // to extract the UUID into afu_json_info.vh.
+    logic [127:0] afu_id = `AFU_ACCEL_UUID;
+
+    //
+    // A valid AFU must implement a device feature list, starting at MMIO
+    // address 0.  Every entry in the feature list begins with 5 64-bit
+    // words: a device feature header, two AFU UUID words and two reserved
+    // words.
+    //
+
+    // Is a CSR read request active this cycle?
+    logic is_csr_read;
+    assign is_csr_read = sRx.c0.mmioRdValid;
+
+    // Is a CSR write request active this cycle?
+    logic is_csr_write;
+    assign is_csr_write = sRx.c0.mmioWrValid;
+
+    // The MMIO request header is overlaid on the normal c0 memory read
+    // response data structure.  Cast the c0Rx header to an MMIO request
+    // header.
+    t_ccip_c0_ReqMmioHdr mmio_req_hdr;
+    assign mmio_req_hdr = t_ccip_c0_ReqMmioHdr'(sRx.c0.hdr);
+
+
+    //
+    // Implement the device feature list by responding to MMIO reads.
+    //
+
+    // MMIO read responder.  Out of reset, mmioRdValid is the registered
+    // is_csr_read, the response tid is copied from the request header and
+    // the response data is selected by the request address.
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+            sTx.c2.mmioRdValid <= 1'b0;
+        else
+        begin
+            // MMIO addresses index 32-bit objects, so the 64-bit registers
+            // decoded here sit at even addresses.
+            case (mmio_req_hdr.address)
+              // AFU DFH (device feature header): a trivial feature list in
+              // which this AFU is the only entry.
+              0:
+                begin
+                    sTx.c2.data <= t_ccip_mmioData'(0);
+                    sTx.c2.data[63:60] <= 4'h1;    // feature type is AFU
+                    sTx.c2.data[40] <= 1'b1;       // end of list
+                end
+
+              2: sTx.c2.data <= afu_id[63:0];      // AFU_ID_L
+              4: sTx.c2.data <= afu_id[127:64];    // AFU_ID_H
+
+              // Exposes start_read (added to check fpgaReadMMIO)
+              10: sTx.c2.data <= t_ccip_mmioData'(start_read);
+
+              // DFH_RSVD0 (6), DFH_RSVD1 (8) and every other address read 0
+              default: sTx.c2.data <= t_ccip_mmioData'(0);
+            endcase
+
+            // The transaction ID pairs this response with its request
+            sTx.c2.hdr.tid <= mmio_req_hdr.tid;
+
+            // Every read request is answered with something
+            sTx.c2.mmioRdValid <= is_csr_read;
+        end
+    end
+
+
+    //
+    // CSR write handling.  Host software must tell the AFU the memory address
+    // to which it should be writing.  The address is set by writing a CSR.
+    //
+
+    // We use MMIO address 0 to set the memory address.  The read and
+    // write MMIO spaces are logically separate so we are free to use
+    // whatever we like.  This may not be good practice for cleanly
+    // organizing the MMIO address space, but it is legal.
+    // High when the registered Rx port carries an MMIO write whose address
+    // field is 0.
+    logic is_mem_addr_csr_write;
+    assign is_mem_addr_csr_write = is_csr_write &&
+                                   (mmio_req_hdr.address == t_ccip_mmioAddr'(0));
+
+    // Memory address to which this AFU will write.
+    t_ccip_clAddr write_mem_addr;
+
+    // Capture the write address from the MMIO write data and raise
+    // start_write.  write_mem_addr has no reset value here; only
+    // start_write is cleared in reset.
+    // NOTE(review): start_write is also cleared in the STATE_WRITE branch
+    // of the state machine, i.e. from a second always_ff procedure --
+    // confirm the tool flow accepts a variable written by two always_ff
+    // blocks.
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+        begin
+	    start_write <= 1'b0;
+        end
+	else if (is_mem_addr_csr_write)
+        begin
+            write_mem_addr <= t_ccip_clAddr'(sRx.c0.data);
+	    start_write <= 1'b1;
+            //$display("Write mem address is 0x%x", t_ccip_clAddr'(write_mem_addr));
+        end
+    end
+    
+
+    // We use MMIO address 2 to set the memory address for reading data.
+    // MMIO addresses index 32-bit objects, so this is byte offset 8.
+    logic is_mem_addr_csr_read;
+    assign is_mem_addr_csr_read = is_csr_write &&
+                                   (mmio_req_hdr.address == t_ccip_mmioAddr'(2));
+
+    // Memory address from which this AFU will read.
+    t_ccip_clAddr read_mem_addr;
+
+    //logic start_traversal = 'b0;
+    //t_ccip_clAddr start_traversal_addr;
+
+    // Capture the read address from the MMIO write data and raise
+    // start_read, which moves the state machine out of STATE_IDLE.
+    // read_mem_addr has no reset value here.
+    // NOTE(review): start_read is also cleared by the read-response
+    // always_ff (after the first update), i.e. from a second always_ff
+    // procedure -- confirm the tool flow accepts this.
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+        begin
+	    start_read <= 1'b0;
+        end
+        else if (is_mem_addr_csr_read)
+        begin
+            read_mem_addr <= t_ccip_clAddr'(sRx.c0.data);
+	    start_read <= 1'b1;
+            //$display("Read mem address is 0x%x", t_ccip_clAddr'(read_mem_addr));
+        end
+    end
+
+
+    // =========================================================================
+    //
+    //   Main AFU logic
+    //
+    // =========================================================================
+
+    //
+    // States in our simple example.
+    //
+    //typedef enum logic [0:0]
+    typedef enum logic [1:0]
+    {
+	STATE_IDLE,
+        STATE_READ,
+        STATE_UPDATE,
+        STATE_WRITE
+    }
+    t_state;
+
+    t_state state;
+
+    //
+    // State machine
+    //
+    // Sequencer: IDLE -> READ -> UPDATE -> WRITE, then back to READ or, once
+    // rd_end_of_list is set, to IDLE.  Reset returns to STATE_IDLE and
+    // clears rd_end_of_list.
+    // NOTE(review): besides state, this procedure writes wr_needed,
+    // start_write, write_req and rd_end_of_list, each of which is also
+    // written by another always_ff in this module -- confirm the intended
+    // owner of each signal.
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+        begin
+            state <= STATE_IDLE;
+	    rd_end_of_list <= 1'b0;
+        end
+        else
+        begin
+            case (state)
+              STATE_IDLE:
+                begin
+                    // Traversal begins when the read-address CSR (MMIO
+                    // address 2) is written, which sets start_read
+                    if (start_read)
+                    begin
+                        state <= STATE_READ;
+                        $display("AFU starting traversal at 0x%x", t_ccip_clAddr'(read_mem_addr));
+                    end
+                end
+
+              STATE_READ:
+                begin
+                    // The $display calls below run on every clock spent in
+                    // this state.
+                    $display("AFU in READ...");
+                    $display("do_update is %d...",do_update);
+                    $display("addr_next_valid is %d...",addr_next_valid);
+                    $display("rd_needed is %d...",rd_needed);
+                    // Leave once no read is pending and read data has been
+                    // captured (do_update set by the read-response logic)
+                    if (!rd_needed && do_update)
+                    begin
+		    	state <= STATE_UPDATE;
+                        $display("AFU moving to UPDATE...");
+                    end
+                end
+
+              STATE_UPDATE:
+                begin
+		    // Update the read value to be written back
+                    $display("AFU in UPDATE...");
+                    // do_update drops once wr_data has been produced
+                    if (!do_update)
+		    begin
+		    	state <= STATE_WRITE;
+			wr_needed <= 1'b1; 
+                        $display("AFU moving to WRITE...");
+		    end
+                end
+
+              STATE_WRITE:
+                begin
+		    // Write the updated value to the address
+		    // Point to new address after that
+		    // if done then point to IDLE; else read new values 
+                    $display("AFU in WRITE...");
+                    if (rd_end_of_list)
+		    begin
+			state <= STATE_IDLE;
+			$display("AFU done...");
+		    end
+                    // wr_needed is dropped by the write tracker when a c1
+                    // response is seen
+                    else if (!wr_needed)
+		    begin
+			state <= STATE_READ;
+			$display("AFU moving to READ from WRITE...");
+		    	start_write <= 1'b0;
+			write_req <= 1'b0;
+		    end
+                end
+            endcase
+        end
+    end
+
+
+    // =========================================================================
+    //
+    //   Read logic.
+    //
+    // =========================================================================
+
+    //
+    // READ REQUEST
+    //
+
+    // Did a write response just arrive
+
+    // Next read address
+
+    // Updates the next-read bookkeeping whenever a c1 (write) response is
+    // seen on the registered Rx port.  There is no reset branch, so
+    // addr_next_valid and addr_next keep their power-up value until the
+    // first write response.
+    // NOTE(review): addr_next_valid is also cleared by the read-request
+    // always_ff and rd_end_of_list is also reset by the state machine --
+    // both are written from more than one always_ff.
+    always_ff @(posedge clk)
+    begin
+	// Next read address is valid when we have got the write response back
+	if (sRx.c1.rspValid)
+    	begin
+            // Inside this branch rspValid is 1, so this sets the flag
+            addr_next_valid <= sRx.c1.rspValid;
+
+	    //if (state == STATE_READ && !rd_needed)
+    	    //begin
+                // Apurve: Next address is current address plus address length
+                //addr_next <= addr_next + addr_size;
+            // Both arms yield rd_addr (the stride is 0), so the same line
+            // is re-read on the next iteration
+            addr_next <= (addr_next_valid ? rd_addr + 0 : rd_addr);
+
+                // End of list reached if we have read 5 times
+            rd_end_of_list <= (cnt_list_length == 'h5);
+    	    //end
+    	end	
+    end
+
+    //
+    // Since back pressure may prevent an immediate read request, we must
+    // record whether a read is needed and hold it until the request can
+    // be sent to the FIU.
+    //
+
+    // Pending-read tracker.  rd_needed can only be raised while the state
+    // machine is in STATE_READ and is then held until a c0 read response is
+    // seen; rd_addr is latched in the same cycle the decision is made.
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+            rd_needed <= 1'b0;
+        else if (rd_needed)
+            // A read is pending: keep the flag until its response arrives.
+            // Only one read is in flight at a time.
+            rd_needed <= !sRx.c0.rspValid;
+        else if (state == STATE_READ)
+        begin
+            // First read of a walk uses the CSR-supplied address; later
+            // reads use the computed next address.
+            rd_addr <= (start_read ? read_mem_addr : addr_next);
+
+            // A read is needed when a walk is starting, or when a next
+            // address is valid, the list has not ended and the c0 request
+            // channel is not almost full.
+            rd_needed <= (start_read ||
+                          (addr_next_valid && !rd_end_of_list && !sRx.c0TxAlmFull));
+        end
+    end
+
+    //
+    // Emit read requests to the FIU.
+    //
+
+    // Read header defines the request to the FIU
+    // Header for c0 read requests.  Everything except the address is left
+    // at zero; per the original notes that selects req_type 0 (read, no
+    // intention to cache), vc_sel 0 (channel VA) and cl_len 0 (one 64-byte
+    // cache line).
+    t_ccip_c0_ReqMemHdr rd_hdr;
+
+    always_comb
+    begin
+        // Start from an all-zero header, then fill in the only non-zero field
+        rd_hdr = t_ccip_c0_ReqMemHdr'(0);
+
+        // Virtual address (MPF virtual addressing is enabled)
+        rd_hdr.address = rd_addr;
+    end
+
+    // Send read requests to the FIU
+    // Drives the c0 (read) request channel.  sTx.c0.valid is only updated
+    // while in STATE_READ; a request is issued when a read is pending, the
+    // FIU is not almost full and no request has been issued yet for this
+    // iteration (read_req low).  cnt_list_length counts issued requests.
+    // NOTE(review): read_req is cleared by the read-response always_ff and
+    // addr_next_valid is set by the next-address always_ff -- both are
+    // written from more than one always_ff.
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+        begin
+            sTx.c0.valid <= 1'b0;
+            cnt_list_length <= 0;
+	    read_req <= 1'b0;
+        end
+        else
+        begin
+            // Generate a read request when needed and the FIU isn't full
+	    if (state == STATE_READ)
+            begin
+            	sTx.c0.valid <= (rd_needed && !sRx.c0TxAlmFull && !read_req);
+
+            	if (rd_needed && !sRx.c0TxAlmFull && !read_req)
+            	begin
+	    	    sTx.c0.hdr <= rd_hdr;
+            	    cnt_list_length <= cnt_list_length + 1;
+		    // Blocks further requests until the update step clears it
+		    read_req <= 1'b1;
+            	    // Prints the count before this cycle's increment takes effect
+            	    $display("Incrementing read count...%d",cnt_list_length);
+            	    $display("Read address is 0x%x...",rd_hdr.address);
+		    addr_next_valid <= 1'b0;
+		    // Apurve: Add something to stop read once this section has been accessed
+		    //rd_needed <= 1'b0; 
+            	end
+            end
+        end
+    end
+
+    //
+    // READ RESPONSE HANDLING
+    //
+
+    //
+    // Receive data (read responses).
+    //
+    // Captures read data and produces the value to write back.
+    //  - On a c0 response with do_update low: latch the line into rd_data
+    //    and raise do_update.
+    //  - In STATE_UPDATE with do_update high: wr_data becomes rd_data + 2,
+    //    do_update is dropped, and read_req / start_read are cleared.
+    // NOTE(review): read_req and start_read are also written by other
+    // always_ff procedures (read-request emitter, read-address CSR).
+    always_ff @(posedge clk)
+    begin
+	if (reset)
+	begin
+            do_update <= 1'b0;
+        end
+	else
+	begin
+	    if (!do_update && sRx.c0.rspValid)
+	    begin
+                rd_data <= sRx.c0.data;
+                do_update <= 1'b1;
+	        // Shows rd_data before the nonblocking update above lands
+	        $display("rd data is %d...",rd_data);
+            end
+
+	    if ((state == STATE_UPDATE) && (do_update == 1'b1))
+	    begin
+	        // Update the read data and put it in the write data to be written
+                wr_data <= rd_data + 2;
+                do_update <= 1'b0;
+		read_req <= 1'b0;
+	        // Shows wr_data before the nonblocking update above lands
+	        $display("write data is %d...",wr_data);
+
+		// First read done. Next reads should be from the updated addresses
+		start_read <= 1'b0; 
+            end
+        end
+    end
+
+
+    // =========================================================================
+    //
+    //   Write logic.
+    //
+    // =========================================================================
+
+
+    //
+    // WRITE REQUEST
+    //
+
+    // Did a write response just arrive
+
+    // Next write address
+
+    // Updates the next-write bookkeeping whenever a c0 (read) response is
+    // seen on the registered Rx port.  There is no reset branch.
+    // NOTE(review): wr_addr_next_valid is also cleared by the write-request
+    // always_ff, so it is written from more than one always_ff.
+    always_ff @(posedge clk)
+    begin
+	if (sRx.c0.rspValid)
+    	begin
+            // Next write address is valid when we have got the read response back
+            wr_addr_next_valid <= sRx.c0.rspValid;
+            //wr_addr_next_valid <= (!start_write && sRx.c0.rspValid);
+
+	    //if (state == STATE_WRITE && !wr_needed)
+	    //begin
+                // Apurve: Next address is current address plus address length
+                //wr_addr_next <= wr_addr + 0;
+                // Both arms yield wr_addr (the stride is 0)
+                wr_addr_next <= (wr_addr_next_valid ? wr_addr + 0 : wr_addr);
+	    //end
+	end
+    end
+
+    //
+    // Since back pressure may prevent an immediate write request, we must
+    // record whether a write is needed and hold it until the request can
+    // be sent to the FIU.
+    //
+
+    // Pending-write tracker.  Once raised, wr_needed is held until a c1
+    // write response is seen; while it is low, the write address and the
+    // need for a write are re-evaluated every cycle, in any state.
+    // NOTE(review): the state machine also sets wr_needed when it leaves
+    // STATE_UPDATE, so this variable is written from two always_ff blocks.
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+            wr_needed <= 1'b0;
+        else if (wr_needed)
+            // A write is pending: keep the flag until its response arrives.
+            // Only one write is in flight at a time.
+            wr_needed <= !sRx.c1.rspValid;
+        else
+        begin
+            // First write of a walk uses the CSR-supplied address; later
+            // writes use the computed next address.
+            wr_addr <= (start_write ? write_mem_addr : wr_addr_next);
+
+            // A write is needed when a walk is starting, or when a next
+            // write address is valid and the c1 request channel is not
+            // almost full.
+            wr_needed <= (start_write ||
+                          (wr_addr_next_valid && !sRx.c1TxAlmFull));
+        end
+    end
+
+    //
+    // Emit write requests to the FIU.
+    //
+
+    // Write header defines the request to the FIU
+    // Header for c1 write requests.  Apart from address and sop, every
+    // field is left at zero; per the original notes that covers req_type 0
+    // and cl_len 0 (one 64-byte cache line), with vc_sel not overridden.
+    t_ccip_c1_ReqMemHdr wr_hdr;
+
+    always_comb
+    begin
+        // Start from an all-zero header
+        wr_hdr = t_ccip_c1_ReqMemHdr'(0);
+
+        // Start of packet is true (single line write)
+        wr_hdr.sop = 1'b1;
+
+        // Virtual address (MPF virtual addressing is enabled)
+        wr_hdr.address = wr_addr;
+    end
+
+    // Send write requests to the FIU
+    // Drives the c1 (write) request channel.  sTx.c1.valid is only updated
+    // while in STATE_WRITE; a request is issued when a write is pending,
+    // the FIU is not almost full and no request has been issued yet for
+    // this iteration (write_req low).
+    // NOTE(review): write_req is also cleared by the state machine and
+    // wr_addr_next_valid is also set by the next-write-address always_ff --
+    // both are written from more than one always_ff.
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+        begin
+            sTx.c1.valid <= 1'b0;
+            write_req <= 1'b0;
+        end
+        else
+        begin
+            // Generate a write request when needed and the FIU isn't full
+	    if (state == STATE_WRITE)
+            begin
+            	sTx.c1.valid <= (wr_needed && !sRx.c1TxAlmFull && !write_req);
+		if (wr_needed && !sRx.c1TxAlmFull && !write_req)
+		begin
+            	    sTx.c1.hdr <= wr_hdr;
+	    	    sTx.c1.data <= t_ccip_clData'(wr_data);
+		    // Blocks further requests until the state machine clears it
+		    write_req <= 1'b1;
+		    wr_addr_next_valid <= 1'b0;
+		    $display("Write address is 0x%x...", wr_hdr.address);
+            	end
+            end
+        end
+    end
+
+
+    //
+    // WRITE RESPONSE HANDLING
+    //
+
+    // Apurve: Check if a signal is to be sent to read to start reading in case
+    // write response does not work
+    //
+    // Send data (write requests).
+    //
+    //always_ff @(posedge clk)
+    //begin
+    //    if (state == STATE_WRITE)
+    //    begin
+    //        rd_data <= sRx.c0.data;
+    //    end
+    //    if (state == STATE_UPDATE)
+    //    begin
+    //        // Update the write data and put it in the write data to be written
+    //        wr_data <= rd_data + 1;
+    //    end
+    //end
+
+endmodule
--- a/driver/tests/dogfood/Memcpy/hw/rtl/cci_hello_afu_working.sv
+++ b/driver/tests/dogfood/Memcpy/hw/rtl/cci_hello_afu_working.sv
@@ -0,0 +1,621 @@
+//
+// Copyright (c) 2017, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// Redistributions of source code must retain the above copyright notice, this
+// list of conditions and the following disclaimer.
+//
+// Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// Neither the name of the Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+
+// Read from the memory locations first and then write to the memory locations
+
+`include "platform_if.vh"
+`include "afu_json_info.vh"
+
+
+module ccip_std_afu
+   (
+    // CCI-P Clocks and Resets
+    input           logic             pClk,              // 400MHz - CCI-P clock domain. Primary interface clock
+    input           logic             pClkDiv2,          // 200MHz - CCI-P clock domain.
+    input           logic             pClkDiv4,          // 100MHz - CCI-P clock domain.
+    input           logic             uClk_usr,          // User clock domain. Refer to clock programming guide  ** Currently provides fixed 300MHz clock **
+    input           logic             uClk_usrDiv2,      // User clock domain. Half the programmed frequency  ** Currently provides fixed 150MHz clock **
+    input           logic             pck_cp2af_softReset,      // CCI-P ACTIVE HIGH Soft Reset
+    input           logic [1:0]       pck_cp2af_pwrState,       // CCI-P AFU Power State
+    input           logic             pck_cp2af_error,          // CCI-P Protocol Error Detected
+
+    // Interface structures
+    input           t_if_ccip_Rx      pck_cp2af_sRx,        // CCI-P Rx Port
+    output          t_if_ccip_Tx      pck_af2cp_sTx         // CCI-P Tx Port
+    );
+
+
+    //
+    // Run the entire design at the standard CCI-P frequency (400 MHz).
+    //
+    logic clk;
+    assign clk = pClk;
+
+    logic reset;
+    assign reset = pck_cp2af_softReset;
+
+    logic [511:0] wr_data;
+    logic [511:0] rd_data;
+
+    logic do_update;
+    logic start_read;
+    logic start_write;
+    logic wr_addr_next_valid;
+    logic addr_next_valid;
+    logic rd_end_of_list;
+    logic rd_needed;
+    logic wr_needed;
+    logic [15:0] cnt_list_length;
+    t_ccip_clAddr rd_addr;
+    t_ccip_clAddr wr_addr;
+    t_ccip_clAddr addr_next;
+    t_ccip_clAddr wr_addr_next;
+
+    // =========================================================================
+    //
+    //   Register requests.
+    //
+    // =========================================================================
+
+    //
+    // The incoming pck_cp2af_sRx and outgoing pck_af2cp_sTx must both be
+    // registered.  Here we register pck_cp2af_sRx and assign it to sRx.
+    // We also assign pck_af2cp_sTx to sTx here but don't register it.
+    // The code below never uses combinational logic to write sTx.
+    //
+
+    t_if_ccip_Rx sRx;
+    always_ff @(posedge clk)
+    begin
+        sRx <= pck_cp2af_sRx;
+    end
+
+    t_if_ccip_Tx sTx;
+    assign pck_af2cp_sTx = sTx;
+
+
+    // =========================================================================
+    //
+    //   CSR (MMIO) handling.
+    //
+    // =========================================================================
+
+    // The AFU ID is a unique ID for a given program.  Here we generated
+    // one with the "uuidgen" program and stored it in the AFU's JSON file.
+    // ASE and synthesis setup scripts automatically invoke afu_json_mgr
+    // to extract the UUID into afu_json_info.vh.
+    logic [127:0] afu_id = `AFU_ACCEL_UUID;
+
+    //
+    // A valid AFU must implement a device feature list, starting at MMIO
+    // address 0.  Every entry in the feature list begins with 5 64-bit
+    // words: a device feature header, two AFU UUID words and two reserved
+    // words.
+    //
+
+    // Is a CSR read request active this cycle?
+    logic is_csr_read;
+    assign is_csr_read = sRx.c0.mmioRdValid;
+
+    // Is a CSR write request active this cycle?
+    logic is_csr_write;
+    assign is_csr_write = sRx.c0.mmioWrValid;
+
+    // The MMIO request header is overlayed on the normal c0 memory read
+    // response data structure.  Cast the c0Rx header to an MMIO request
+    // header.
+    t_ccip_c0_ReqMmioHdr mmio_req_hdr;
+    assign mmio_req_hdr = t_ccip_c0_ReqMmioHdr'(sRx.c0.hdr);
+
+
+    //
+    // Implement the device feature list by responding to MMIO reads.
+    //
+
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+        begin
+            sTx.c2.mmioRdValid <= 1'b0;
+        end
+        else
+        begin
+            // Always respond with something for every read request
+            sTx.c2.mmioRdValid <= is_csr_read;
+
+            // The unique transaction ID matches responses to requests
+            sTx.c2.hdr.tid <= mmio_req_hdr.tid;
+
+            // Addresses are of 32-bit objects in MMIO space.  Addresses
+            // of 64-bit objects are thus multiples of 2.
+            case (mmio_req_hdr.address)
+              0: // AFU DFH (device feature header)
+                begin
+                    // Here we define a trivial feature list.  In this
+                    // example, our AFU is the only entry in this list.
+                    sTx.c2.data <= t_ccip_mmioData'(0);
+                    // Feature type is AFU
+                    sTx.c2.data[63:60] <= 4'h1;
+                    // End of list (last entry in list)
+                    sTx.c2.data[40] <= 1'b1;
+                end
+
+              // AFU_ID_L
+              2: sTx.c2.data <= afu_id[63:0];
+
+              // AFU_ID_H
+              4: sTx.c2.data <= afu_id[127:64];
+
+              // DFH_RSVD0
+              6: sTx.c2.data <= t_ccip_mmioData'(0);
+
+              // DFH_RSVD1
+              8: sTx.c2.data <= t_ccip_mmioData'(0);
+
+	      // Updated by apurve to check fpgaReadMMIO
+              10: sTx.c2.data <= t_ccip_mmioData'(start_read);
+
+              default: sTx.c2.data <= t_ccip_mmioData'(0);
+            endcase
+        end
+    end
+
+
+    //
+    // CSR write handling.  Host software must tell the AFU the memory address
+    // to which it should be writing.  The address is set by writing a CSR.
+    //
+
+    // We use MMIO address 0 to set the memory address.  The read and
+    // write MMIO spaces are logically separate so we are free to use
+    // whatever we like.  This may not be good practice for cleanly
+    // organizing the MMIO address space, but it is legal.
+    logic is_mem_addr_csr_write;
+    assign is_mem_addr_csr_write = is_csr_write &&
+                                   (mmio_req_hdr.address == t_ccip_mmioAddr'(0));
+
+    // Memory address to which this AFU will write.
+    t_ccip_clAddr write_mem_addr;
+
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+        begin
+	    start_write <= 1'b0;
+        end
+	else if (is_mem_addr_csr_write)
+        begin
+            write_mem_addr <= t_ccip_clAddr'(sRx.c0.data);
+	    start_write <= 1'b1;
+            //$display("Write mem address is 0x%x", t_ccip_clAddr'(write_mem_addr));
+        end
+    end
+    
+
+    // We use MMIO address 8 to set the memory address for reading data.
+    logic is_mem_addr_csr_read;
+    assign is_mem_addr_csr_read = is_csr_write &&
+                                   (mmio_req_hdr.address == t_ccip_mmioAddr'(2));
+
+    // Memory address from which this AFU will read.
+    t_ccip_clAddr read_mem_addr;
+
+    //logic start_traversal = 'b0;
+    //t_ccip_clAddr start_traversal_addr;
+
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+        begin
+	    start_read <= 1'b0;
+        end
+        else if (is_mem_addr_csr_read)
+        begin
+            read_mem_addr <= t_ccip_clAddr'(sRx.c0.data);
+	    start_read <= 1'b1;
+            //$display("Read mem address is 0x%x", t_ccip_clAddr'(read_mem_addr));
+        end
+    end
+
+
+    // =========================================================================
+    //
+    //   Main AFU logic
+    //
+    // =========================================================================
+
+    //
+    // States in our simple example.
+    //
+    //typedef enum logic [0:0]
+    typedef enum logic [1:0]
+    {
+	STATE_IDLE,
+        STATE_READ,
+        STATE_UPDATE,
+        STATE_WRITE
+    }
+    t_state;
+
+    t_state state;
+
+    // Main state machine.  Sequences IDLE -> READ -> UPDATE -> WRITE, then
+    // returns to READ for the next item or to IDLE at the end of the list.
+    // Requests and data are handled by the other always_ff blocks.
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+        begin
+            state <= STATE_IDLE;
+	    rd_end_of_list <= 1'b0; // NOTE(review): also assigned by the next-read-address always_ff (second driver)
+        end
+        else
+        begin
+            case (state)
+              STATE_IDLE:
+                begin
+                    // Traversal begins when CSR 1 is written
+                    if (start_read)
+                    begin
+                        state <= STATE_READ;
+                        $display("AFU starting traversal at 0x%x", t_ccip_clAddr'(read_mem_addr));
+                    end
+                end
+
+              STATE_READ:
+                begin
+                    $display("AFU in READ..."); // printed on every cycle spent in READ
+                    if (!rd_needed && do_update) // no read pending and a response has been captured
+                    begin
+		    	state <= STATE_UPDATE;
+                        $display("AFU moving to UPDATE...");
+                    end
+                end
+
+              STATE_UPDATE:
+                begin
+		    // do_update is cleared by the read-response block while in this state
+                    $display("AFU in UPDATE...");
+                    if (!do_update)
+		    begin
+		    	state <= STATE_WRITE;
+			wr_needed <= 1'b1; // NOTE(review): wr_needed is also assigned by the write-request always_ff (second driver)
+                        $display("AFU moving to WRITE...");
+		    end
+                end
+
+              STATE_WRITE:
+                begin
+		    // At the end of the list return to IDLE; otherwise, once no
+		    // write is pending (wr_needed clear), go back to READ.
+		    // NOTE(review): the IDLE branch does not wait for wr_needed to clear.
+                    $display("AFU in WRITE...");
+                    if (rd_end_of_list)
+		    begin
+			state <= STATE_IDLE;
+			$display("AFU done...");
+		    end
+                    else if (!wr_needed)
+		    begin
+			state <= STATE_READ;
+			$display("AFU moving to READ from WRITE...");
+		    	start_write <= 1'b0; // NOTE(review): presumably set elsewhere -- confirm it has a single driver
+		    end
+                end
+            endcase
+        end
+    end
+
+
+    // =========================================================================
+    //
+    //   Read logic.
+    //
+    // =========================================================================
+
+    //
+    // READ REQUEST
+    //
+
+    // Registers the write-response strobe, the next read address and the
+    // end-of-list flag.  Nothing is reset in this block.
+    // NOTE(review): rd_end_of_list is also assigned in the state machine's
+    // reset branch, so it has two always_ff drivers -- confirm and consolidate.
+    always_ff @(posedge clk)
+    begin
+	// Next read address is valid one cycle after a write response (c1) arrives
+        addr_next_valid <= sRx.c1.rspValid;
+
+        // Apurve: the increment is currently 0, so the next read targets the
+        // same address as the previous one (was: addr_next + addr_size).
+        addr_next <= rd_addr + 0;
+
+        // End of list once the read-request counter equals 5
+        rd_end_of_list <= (cnt_list_length == 'h5);
+    end
+
+    //
+    // Since back pressure may prevent an immediate read request, we must
+    // record whether a read is needed and hold it until the request can
+    // be sent to the FIU.
+    //
+
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+        begin
+            rd_needed <= 1'b0;
+        end
+        else
+        begin
+            // A pending read stays pending only while channel 0 reports
+            // almost-full; rd_addr is held while a read is pending.
+            // NOTE(review): the flag is cleared regardless of state, but the
+            // request sender only emits in STATE_READ -- confirm intended.
+            if (rd_needed)
+            begin
+                rd_needed <= sRx.c0TxAlmFull;
+            end
+            else
+            begin
+                // Need a read under two conditions:
+                //   - Starting a new walk (start_read)
+                //   - A write response arrived (addr_next_valid), channel 0
+                //     is not almost full and the end of the list is not set.
+                rd_needed <= (start_read || (!sRx.c0TxAlmFull && (addr_next_valid && ! rd_end_of_list)));
+                rd_addr <= (start_read ? read_mem_addr : addr_next);
+            	//$display("rd_addr is 0x%x",  t_ccip_clAddr'(rd_addr));
+            	//$display("read mem addr is 0x%x",  t_ccip_clAddr'(read_mem_addr));
+            	//$display("start read is %d", start_read);
+            end
+        end
+    end
+
+    //
+    // Emit read requests to the FIU.
+    //
+
+    // Read request header for the FIU, rebuilt combinationally from rd_addr
+    t_ccip_c0_ReqMemHdr rd_hdr;
+
+    always_comb
+    begin
+        rd_hdr = t_ccip_c0_ReqMemHdr'(0);
+        // Every field except the address keeps the zero assigned above.
+        // Read request type (No intention to cache)
+        //rd_hdr.req_type = 4'h0;
+
+        // Address to read.  NOTE(review): original comment said MPF virtual addressing is enabled -- confirm
+        rd_hdr.address = rd_addr;
+
+        // Read over channel VA 
+        //rd_hdr.vc_sel = 2'h0;
+
+        // Read one cache line (64 bytes) 
+        //rd_hdr.cl_len = 2'h0;
+    end
+
+    // Send read requests to the FIU and count them in cnt_list_length
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+        begin
+            sTx.c0.valid <= 1'b0;
+            cnt_list_length <= 0;
+        end
+        else
+        begin
+            // Generate a read request when needed and the FIU isn't full.  Outside STATE_READ sTx.c0.valid keeps its value.
+	    if (state == STATE_READ)
+            begin
+            	sTx.c0.valid <= (rd_needed && !sRx.c0TxAlmFull);
+                // The header and the counter change only when a request is sent
+            	if (rd_needed && !sRx.c0TxAlmFull)
+            	begin
+	    	    sTx.c0.hdr <= rd_hdr;
+            	    cnt_list_length <= cnt_list_length + 1;
+            	    $display("Incrementing read count...%d",cnt_list_length); // shows the pre-increment count (non-blocking assignment)
+            	    $display("Read address is 0x%x...",rd_hdr.address);
+		    // Apurve: Add something to stop read once this section has been accessed
+            	end
+            end
+        end
+    end
+
+    //
+    // READ RESPONSE HANDLING
+    //
+
+    // Capture read responses and derive the write-back data.
+    // do_update is set when a response arrives and cleared in STATE_UPDATE,
+    // where wr_data is loaded with rd_data + 2.
+    always_ff @(posedge clk)
+    begin
+	if (reset)
+	begin
+            do_update <= 1'b0;
+        end
+	else
+	begin
+	    if (sRx.c0.rspValid)
+	    begin
+                rd_data <= sRx.c0.data;
+                do_update <= 1'b1;
+	        //$display("rd data is %d...",rd_data);
+            end
+            // If both conditions hold in one cycle, the later do_update assignment wins
+	    if (state == STATE_UPDATE)
+	    begin
+	        // Update the read data and put it in the write data to be written
+                wr_data <= rd_data + 2;
+                do_update <= 1'b0;
+	        $display("write data is %d...",wr_data); // shows the previous wr_data (non-blocking assignment)
+
+		// First read done; start_read is dropped so later reads use addr_next
+		start_read <= 1'b0; // NOTE(review): presumably set elsewhere -- confirm it has a single driver
+            end
+        end
+    end
+
+
+    // =========================================================================
+    //
+    //   Write logic.
+    //
+    // =========================================================================
+
+
+    //
+    // WRITE REQUEST
+    //
+
+    // Registers the read-response strobe and the next write address.
+    // Nothing is reset in this block.
+    //
+    // Next write address
+    always_ff @(posedge clk)
+    begin
+        // Next write address is valid one cycle after a read response (c0) arrives
+        wr_addr_next_valid <= sRx.c0.rspValid;
+
+        // Apurve: the increment is currently 0, so the write address does not advance
+        wr_addr_next <= wr_addr + 0;
+
+    end
+
+    //
+    // Since back pressure may prevent an immediate write request, we must
+    // record whether a write is needed and hold it until the request can
+    // be sent to the FIU.
+    //
+
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+        begin
+            wr_needed <= 1'b0;
+        end
+        else
+        begin
+            // A pending write stays pending only while channel 1 reports
+            // almost-full; wr_addr is held while a write is pending.
+            // NOTE(review): the state machine also assigns wr_needed when it
+            // leaves STATE_UPDATE (two always_ff drivers) -- confirm.
+            if (wr_needed)
+            begin
+                wr_needed <= sRx.c1TxAlmFull;
+            end
+            else
+            begin
+                // Need a write under two conditions:
+                //   - start_write is set
+                //   - A read response arrived (wr_addr_next_valid) and
+                //     channel 1 is not almost full.
+                wr_needed <= (start_write || (!sRx.c1TxAlmFull && wr_addr_next_valid));
+                wr_addr <= (start_write ? write_mem_addr : wr_addr_next);
+            	//$display("Write mem address later is 0x%x", t_ccip_clAddr'(write_mem_addr));
+            end
+        end
+    end
+
+    //
+    // Emit write requests to the FIU.
+    //
+
+    // Write request header for the FIU, rebuilt combinationally from wr_addr
+    t_ccip_c1_ReqMemHdr wr_hdr;
+
+    always_comb
+    begin
+        wr_hdr = t_ccip_c1_ReqMemHdr'(0);
+        // Fields not assigned below keep the zero assigned above.
+        // Write request type
+        //wr_hdr.req_type = 4'h0;
+
+        // Address to write.  NOTE(review): original comment said MPF virtual addressing is enabled -- confirm
+        wr_hdr.address = wr_addr;
+
+        // Let the FIU pick the channel (disabled: vc_sel stays 0, not 2'h2)
+        //wr_hdr.vc_sel = 2'h2;
+
+        // Write 1 cache line (64 bytes) 
+        //wr_hdr.cl_len = 2'h0;
+
+        // Start of packet is true (single line write)
+        wr_hdr.sop = 1'b1;
+    end
+
+    // Send write requests (header plus wr_data) to the FIU
+    always_ff @(posedge clk)
+    begin
+        if (reset)
+        begin
+            sTx.c1.valid <= 1'b0;
+        end
+        else
+        begin
+            // Generate a write request when needed and the FIU isn't full.  Outside STATE_WRITE sTx.c1.valid keeps its value.
+	    if (state == STATE_WRITE)
+            begin
+            	sTx.c1.valid <= (wr_needed && !sRx.c1TxAlmFull);
+		if (wr_needed && !sRx.c1TxAlmFull) // header and data change only when a request is sent
+		begin
+            	    sTx.c1.hdr <= wr_hdr;
+	    	    sTx.c1.data <= t_ccip_clData'(wr_data);
+            	end
+            end
+        end
+    end
+
+
+    //
+    // WRITE RESPONSE HANDLING
+    //
+
+    // Apurve: Check if a signal is to be sent to read to start reading in case
+    // write response does not work
+    //
+    // Send data (write requests).
+    //
+    //always_ff @(posedge clk)
+    //begin
+    //    if (state == STATE_WRITE)
+    //    begin
+    //        rd_data <= sRx.c0.data;
+    //    end
+    //    if (state == STATE_UPDATE)
+    //    begin
+    //        // Update the write data and put it in the write data to be written
+    //        wr_data <= rd_data + 1;
+    //    end
+    //end
+
+endmodule
--- a/driver/tests/dogfood/Memcpy/hw/rtl/sources.txt
+++ b/driver/tests/dogfood/Memcpy/hw/rtl/sources.txt
@@ -0,0 +1,2 @@
+cci_hello.json
+cci_hello_afu.sv
--- a/driver/tests/dogfood/Memcpy/hw/sim/setup_ase
+++ b/driver/tests/dogfood/Memcpy/hw/sim/setup_ase
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+##
+## Setup ASE environment using ../rtl/sources.txt.
+## Extra command-line arguments are forwarded unchanged to afu_sim_setup.
+
+# Absolute path to this script
+SCRIPT=$(readlink -f "$0")
+SCRIPT_PATH=$(dirname "$SCRIPT")
+
+afu_sim_setup --sources="${SCRIPT_PATH}/../rtl/sources.txt" "$@"
--- a/driver/tests/dogfood/Memcpy/sw/Makefile
+++ b/driver/tests/dogfood/Memcpy/sw/Makefile
@@ -0,0 +1,41 @@
+include ../../common/sw/common_include.mk
+
+# Primary test name
+TEST = cci_hello
+
+# Build directory
+OBJDIR = obj
+CFLAGS += -I./$(OBJDIR)
+CPPFLAGS += -I./$(OBJDIR)
+
+# Files and folders
+SRCS = $(TEST).c
+OBJS = $(addprefix $(OBJDIR)/,$(patsubst %.c,%.o,$(SRCS)))
+
+# Targets: $(TEST) links against FPGA_LIBS, $(TEST)_ase against ASE_LIBS
+all: $(TEST) $(TEST)_ase
+
+# AFU info from JSON file, including AFU UUID
+AFU_JSON_INFO = $(OBJDIR)/afu_json_info.h
+
+$(AFU_JSON_INFO): ../hw/rtl/$(TEST).json | objdir
+	afu_json_mgr json-info --afu-json=$^ --c-hdr=$@
+
+$(OBJS): $(AFU_JSON_INFO)
+
+$(TEST): $(OBJS)
+	$(CC) -o $@ $^ $(LDFLAGS) $(FPGA_LIBS)
+
+$(TEST)_ase: $(OBJS)
+	$(CC) -o $@ $^ $(LDFLAGS) $(ASE_LIBS)
+
+$(OBJDIR)/%.o: %.c | objdir
+	$(CC) $(CFLAGS) -c $< -o $@
+
+clean:
+	rm -rf $(TEST) $(TEST)_ase $(OBJDIR)
+
+objdir:
+	@mkdir -p $(OBJDIR)
+
+.PHONY: all clean objdir
--- a/driver/tests/dogfood/Memcpy/sw/cci_hello.c
+++ b/driver/tests/dogfood/Memcpy/sw/cci_hello.c
@@ -0,0 +1,210 @@
+//
+// Copyright (c) 2017, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// Redistributions of source code must retain the above copyright notice, this
+// list of conditions and the following disclaimer.
+//
+// Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// Neither the name of the Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <assert.h>
+#include <uuid/uuid.h>
+
+#include <opae/fpga.h>
+
+// State from the AFU's JSON file, extracted using OPAE's afu_json_mgr script
+#include "afu_json_info.h"
+
+#define CACHELINE_BYTES 64
+#define CL(x) ((x) * CACHELINE_BYTES)
+
+
+//
+// Search for an accelerator matching the requested UUID and connect to it.
+// Returns the open handle, or NULL if the UUID is malformed, no matching
+// accelerator exists or an OPAE call fails.
+//
+static fpga_handle connect_to_accel(const char *accel_uuid)
+{
+    fpga_properties filter = NULL;
+    fpga_guid guid;
+    fpga_token accel_token = NULL;
+    uint32_t num_matches = 0;
+    fpga_handle accel_handle = NULL;
+    fpga_result r;
+
+    // Reject a malformed UUID rather than filtering on an undefined GUID
+    if (uuid_parse(accel_uuid, guid) < 0)
+    {
+        fprintf(stderr, "Invalid accelerator UUID %s\n", accel_uuid);
+        return NULL;
+    }
+
+    // Set up a filter that matches accelerators with the desired UUID
+    if (FPGA_OK != fpgaGetProperties(NULL, &filter)) return NULL;
+    r = fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR);
+    if (FPGA_OK == r) r = fpgaPropertiesSetGUID(filter, guid);
+
+    // Do the search across the available FPGA contexts
+    if (FPGA_OK == r)
+        r = fpgaEnumerate(&filter, 1, &accel_token, 1, &num_matches);
+
+    // Not needed anymore
+    fpgaDestroyProperties(&filter);
+
+    if (FPGA_OK != r || num_matches < 1)
+    {
+        fprintf(stderr, "Accelerator %s not found!\n", accel_uuid);
+        return NULL;
+    }
+
+    // Open accelerator; the token is released whether or not this succeeds
+    r = fpgaOpen(accel_token, &accel_handle, 0);
+    fpgaDestroyToken(&accel_token);
+    assert(FPGA_OK == r);
+    return (FPGA_OK == r) ? accel_handle : NULL;
+}
+
+
+//
+// Allocate a buffer in I/O memory, shared with the FPGA.  Returns its host
+// address, or NULL on failure; *wsid and *io_addr are valid only on success.
+//
+static volatile void* alloc_buffer(fpga_handle accel_handle,
+                                   ssize_t size,
+                                   uint64_t *wsid,
+                                   uint64_t *io_addr)
+{
+    fpga_result r;
+    void *buf = NULL;
+
+    if (size <= 0) return NULL;
+    r = fpgaPrepareBuffer(accel_handle, (uint64_t)size, &buf, wsid, 0);
+    if (FPGA_OK != r) return NULL;
+    // Get the buffer's I/O address; release the buffer if that fails
+    if (FPGA_OK == fpgaGetIOAddress(accel_handle, *wsid, io_addr)) return buf;
+    fpgaReleaseBuffer(accel_handle, *wsid);
+    return NULL;
+}
+
+
+int main(int argc, char *argv[])
+{
+    fpga_handle accel_handle;
+    volatile char *buf;
+    volatile char *buf_r;
+    uint64_t wsid1;
+    uint64_t wsid2;
+    uint64_t buf_pa;
+    uint64_t buf_rpa;
+    uint64_t ret_buf_rpa = 0;
+    fpga_result r;
+
+    // Command-line arguments are not used by this test
+    (void)argc;
+    (void)argv;
+
+    // Find and connect to the accelerator
+    accel_handle = connect_to_accel(AFU_ACCEL_UUID);
+    if (NULL == accel_handle)
+    {
+        fprintf(stderr, "Unable to open the accelerator\n");
+        return 1;
+    }
+
+    // Allocate the shared buffer the accelerator writes to (4 pages)
+    buf = (volatile char*)alloc_buffer(accel_handle, 4 * getpagesize(),
+                                       &wsid1, &buf_pa);
+    if (NULL == buf)
+    {
+        fprintf(stderr, "Unable to allocate the write buffer\n");
+        fpgaClose(accel_handle);
+        return 1;
+    }
+
+    // Allocate the shared buffer the accelerator reads from (4 pages)
+    buf_r = (volatile char*)alloc_buffer(accel_handle, 4 * getpagesize(),
+                                         &wsid2, &buf_rpa);
+    if (NULL == buf_r)
+    {
+        fprintf(stderr, "Unable to allocate the read buffer\n");
+        fpgaReleaseBuffer(accel_handle, wsid1);
+        fpgaClose(accel_handle);
+        return 1;
+    }
+
+    // Seed the low byte of both buffers with 5.  The wait loops below watch
+    // buf[0] for the values written by the FPGA.
+    buf[0] = 5;
+    buf_r[0] = 5;
+
+    // Tell the accelerator the address of the write buffer using cache line
+    // addresses (MMIO byte offset 0).
+    r = fpgaWriteMMIO64(accel_handle, 0, 0, buf_pa / CL(1));
+    printf("Write address is %08llx\n", (unsigned long long)buf_pa);
+    printf("Write address div 64 is %08llx\n",
+           (unsigned long long)(buf_pa / CL(1)));
+    assert(FPGA_OK == r);
+
+    // Read back the 64-bit MMIO register at index 5 to exercise
+    // fpgaReadMMIO64.  The status is checked before the value is printed.
+    r = fpgaReadMMIO64(accel_handle, 0, 5 * sizeof(uint64_t), &ret_buf_rpa);
+    assert(FPGA_OK == r);
+    printf("Returned read at 10 is %08llx\n", (unsigned long long)ret_buf_rpa);
+
+    // Tell the accelerator the address of the read buffer using cache line
+    // addresses (MMIO byte offset 8, i.e. 64-bit register 1).
+    // NOTE(review): presumably this write starts the AFU traversal -- confirm.
+    r = fpgaWriteMMIO64(accel_handle, 0, sizeof(uint64_t), buf_rpa / CL(1));
+    printf("Read address is %08llx\n", (unsigned long long)buf_rpa);
+    printf("Read address div64 is %08llx\n",
+           (unsigned long long)(buf_rpa / CL(1)));
+    assert(FPGA_OK == r);
+
+    // Spin, waiting for the FPGA to replace the seed value in buf[0].
+    while (5 == buf[0])
+    {
+        // A well-behaved program would use _mm_pause(), nanosleep() or
+        // equivalent to save power here.
+    }
+
+    // Print the value written by the FPGA
+    printf("%d\n", buf[0]);
+
+    // Keep spinning until buf[0] reads 10 (presumably the AFU's final value)
+    do {
+        //printf("%d\n", buf[0]);
+    } while (10 != buf[0]);
+
+    // Done
+    fpgaReleaseBuffer(accel_handle, wsid1);
+    fpgaReleaseBuffer(accel_handle, wsid2);
+    fpgaClose(accel_handle);
+
+    return 0;
+}
--- a/driver/tests/dogfood/Memcpy/sw/obj/afu_json_info.h
+++ b/driver/tests/dogfood/Memcpy/sw/obj/afu_json_info.h
@@ -0,0 +1,13 @@
+//
+// Generated by afu_json_mgr from ../hw/rtl/cci_hello.json
+// Do not edit by hand; the sw Makefile regenerates this file from the JSON.
+
+#ifndef __AFU_JSON_INFO__
+#define __AFU_JSON_INFO__
+
+#define AFU_ACCEL_NAME "cci_hello"
+#define AFU_ACCEL_UUID "C6AA954A-9B91-4A37-ABC1-1D9F0709DCC3"
+#define AFU_IMAGE_POWER 0
+#define AFU_TOP_IFC "ccip_std_afu"
+
+#endif // __AFU_JSON_INFO__
--- a/driver/tests/dogfood/Memcpy/sw/obj/cci_hello.o
+++ b/driver/tests/dogfood/Memcpy/sw/obj/cci_hello.o