vx_spawn_warps redesign using opencl's style scheduler
This commit is contained in:
@@ -8,9 +8,9 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef void (*func_t)(void *);
|
||||
typedef void (*pfn_callback)(int task_id, void *arg);
|
||||
|
||||
void vx_spawn_warps(int num_warps, int num_threads, func_t func_ptr , void * args);
|
||||
void vx_spawn_tasks(int num_tasks, pfn_callback callback , void * args);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
@@ -8,59 +8,103 @@ extern "C" {
|
||||
|
||||
#define NUM_CORES_MAX 16
|
||||
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
|
||||
typedef struct {
|
||||
func_t function;
|
||||
void * arguments;
|
||||
int nthreads;
|
||||
} spawn_t;
|
||||
pfn_callback callback;
|
||||
void * args;
|
||||
int offset;
|
||||
int N;
|
||||
int R;
|
||||
} wspawn_args_t;
|
||||
|
||||
spawn_t* g_spawn[NUM_CORES_MAX];
|
||||
wspawn_args_t* g_wspawn_args[NUM_CORES_MAX];
|
||||
|
||||
void spawn_warp_all() {
|
||||
// active all threads
|
||||
int num_threads = vx_num_threads();
|
||||
vx_tmc(num_threads);
|
||||
void spawn_tasks_callback() {
|
||||
vx_tmc(vx_num_threads());
|
||||
|
||||
int core_id = vx_core_id();
|
||||
spawn_t* p_spawn = g_spawn[core_id];
|
||||
int core_id = vx_core_id();
|
||||
int wid = vx_warp_id();
|
||||
int tid = vx_thread_id();
|
||||
int NT = vx_num_threads();
|
||||
|
||||
wspawn_args_t* p_wspawn_args = g_wspawn_args[core_id];
|
||||
|
||||
// call user routine
|
||||
p_spawn->function(p_spawn->arguments);
|
||||
int wK = (p_wspawn_args->N * wid) + MIN(p_wspawn_args->R, wid);
|
||||
int tK = p_wspawn_args->N + (wid < p_wspawn_args->R);
|
||||
int offset = p_wspawn_args->offset + (wK * NT) + (tid * tK);
|
||||
|
||||
// resume single-warp execution on exit
|
||||
int wid = vx_warp_id();
|
||||
unsigned tmask = (0 == wid) ? 0x1 : 0x0;
|
||||
vx_tmc(tmask);
|
||||
for (int task_id = offset, N = task_id + tK; task_id < N; ++task_id) {
|
||||
(p_wspawn_args->callback)(task_id, p_wspawn_args->args);
|
||||
}
|
||||
|
||||
vx_tmc(0 == wid);
|
||||
}
|
||||
|
||||
void spawn_warp_threads(int num_threads) {
|
||||
// active all threads
|
||||
vx_tmc(num_threads);
|
||||
void spawn_remaining_tasks_callback(int nthreads) {
|
||||
vx_tmc(nthreads);
|
||||
|
||||
int core_id = vx_core_id();
|
||||
spawn_t* p_spawn = g_spawn[core_id];
|
||||
int core_id = vx_core_id();
|
||||
int tid = vx_thread_gid();
|
||||
|
||||
// call user routine
|
||||
p_spawn->function(p_spawn->arguments);
|
||||
wspawn_args_t* p_wspawn_args = g_wspawn_args[core_id];
|
||||
|
||||
// resume single-warp execution on exit
|
||||
int wid = vx_warp_id();
|
||||
unsigned tmask = (0 == wid) ? 0x1 : 0x0;
|
||||
vx_tmc(tmask);
|
||||
int task_id = p_wspawn_args->offset + tid;
|
||||
(p_wspawn_args->callback)(task_id, p_wspawn_args->args);
|
||||
|
||||
vx_tmc(1);
|
||||
}
|
||||
|
||||
void vx_spawn_warps(int num_warps, int num_threads, func_t func_ptr , void * args) {
|
||||
int core_id = vx_core_id();
|
||||
if (core_id >= NUM_CORES_MAX)
|
||||
return;
|
||||
|
||||
spawn_t spawn = { func_ptr, args, num_threads };
|
||||
g_spawn[core_id] = &spawn;
|
||||
void vx_spawn_tasks(int num_tasks, pfn_callback callback , void * args) {
|
||||
// device specs
|
||||
int NC = vx_num_cores();
|
||||
int NW = vx_num_warps();
|
||||
int NT = vx_num_threads();
|
||||
|
||||
if (num_warps > 1) {
|
||||
vx_wspawn(num_warps, (unsigned)spawn_warp_all);
|
||||
}
|
||||
spawn_warp_threads(num_threads);
|
||||
// current core id
|
||||
int core_id = vx_core_id();
|
||||
if (core_id >= NUM_CORES_MAX)
|
||||
return;
|
||||
|
||||
// calculate necessary active cores
|
||||
int WT = NW * NT;
|
||||
int nC = (num_tasks > WT) ? (num_tasks / WT) : 1;
|
||||
int nc = MIN(nC, NC);
|
||||
if (core_id >= nc)
|
||||
return; // terminate unused cores
|
||||
|
||||
// number of tasks per core
|
||||
int tasks_per_core = num_tasks / nc;
|
||||
int tasks_per_core0 = tasks_per_core;
|
||||
if (core_id == (NC-1)) {
|
||||
int QC_r = num_tasks - (nc * tasks_per_core0);
|
||||
tasks_per_core0 += QC_r; // last core executes remaining tasks
|
||||
}
|
||||
|
||||
// number of tasks per warp
|
||||
int nW = tasks_per_core0 / NT; // total warps per core
|
||||
int rT = tasks_per_core0 - (nW * NT); // remaining threads
|
||||
int fW = (nW >= NW) ? (nW / NW) : 0; // full warps iterations
|
||||
int rW = (fW != 0) ? (nW - fW * NW) : 0; // remaining warps
|
||||
if (0 == fW)
|
||||
fW = 1;
|
||||
|
||||
//--
|
||||
wspawn_args_t wspawn_args = { callback, args, core_id * tasks_per_core, fW, rW };
|
||||
g_wspawn_args[core_id] = &wspawn_args;
|
||||
|
||||
//--
|
||||
if (nW > 1) {
|
||||
int nw = MIN(nW, NW);
|
||||
vx_wspawn(nw, (unsigned)&spawn_tasks_callback);
|
||||
spawn_tasks_callback();
|
||||
}
|
||||
|
||||
//--
|
||||
if (rT != 0) {
|
||||
wspawn_args.offset = tasks_per_core0 - rT;
|
||||
spawn_remaining_tasks_callback(rT);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@@ -33,21 +33,10 @@ unsigned z[] = {0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
0, 0, 0, 0};
|
||||
|
||||
void mat_add_kernel(void * void_arguments)
|
||||
void mat_add_kernel(int task_id, void * void_arguments)
|
||||
{
|
||||
mat_add_args_t * arguments = (mat_add_args_t *) void_arguments;
|
||||
|
||||
unsigned wid = vx_warp_id();
|
||||
unsigned tid = vx_thread_id();
|
||||
|
||||
bool valid = (wid < arguments->numRows) && (tid < arguments->numColums);
|
||||
|
||||
__if (valid)
|
||||
{
|
||||
unsigned index = (wid * arguments->numColums) + tid;
|
||||
arguments->z[index] = arguments->x[index] + arguments->y[index];
|
||||
}
|
||||
__endif
|
||||
arguments->z[task_id] = arguments->x[task_id] + arguments->y[task_id];
|
||||
}
|
||||
|
||||
void vx_print_mat(unsigned * matPtr, int numRows, int numCols)
|
||||
@@ -62,15 +51,11 @@ void vx_print_mat(unsigned * matPtr, int numRows, int numCols)
|
||||
}
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
// Main is called with all threads active of warp 0
|
||||
vx_tmc(1);
|
||||
|
||||
int main() {
|
||||
// void * hellp = malloc(4);
|
||||
vx_printf("Confirm Dev Main\n");
|
||||
|
||||
vx_printf("vx_spawn_warps\n");
|
||||
vx_printf("vx_spawn_tasks\n");
|
||||
|
||||
mat_add_args_t arguments;
|
||||
arguments.x = x;
|
||||
@@ -79,12 +64,8 @@ int main()
|
||||
arguments.numColums = 4;
|
||||
arguments.numRows = 4;
|
||||
|
||||
|
||||
int numWarps = 4;
|
||||
int numThreads = 4;
|
||||
|
||||
// First kernel call
|
||||
vx_spawn_warps(numWarps, numThreads, mat_add_kernel, &arguments);
|
||||
vx_spawn_tasks(arguments.numRows * arguments.numColums, mat_add_kernel, &arguments);
|
||||
vx_print_mat(z, arguments.numRows, arguments.numColums);
|
||||
|
||||
|
||||
@@ -95,8 +76,9 @@ int main()
|
||||
arguments.numRows = 4;
|
||||
|
||||
// Second Kernel Call
|
||||
vx_spawn_warps(numWarps, numThreads, mat_add_kernel, &arguments);
|
||||
vx_spawn_tasks(arguments.numRows * arguments.numColums, mat_add_kernel, &arguments);
|
||||
vx_print_mat(z, arguments.numRows, arguments.numColums);
|
||||
|
||||
vx_prints("Passed!\n");
|
||||
|
||||
return 0;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -68,7 +68,7 @@ Disassembly of section .text:
|
||||
800000c0: 7f000117 auipc sp,0x7f000
|
||||
800000c4: f4010113 addi sp,sp,-192 # ff000000 <__stack_top>
|
||||
800000c8: 40000593 li a1,1024
|
||||
800000cc: cc202673 csrr a2,0xcc2
|
||||
800000cc: cc102673 csrr a2,0xcc1
|
||||
800000d0: 02c585b3 mul a1,a1,a2
|
||||
800000d4: 40b10133 sub sp,sp,a1
|
||||
800000d8: cc3026f3 csrr a3,0xcc3
|
||||
|
||||
Binary file not shown.
@@ -11,7 +11,7 @@
|
||||
:1000900093070000638807003705008013054513A8
|
||||
:1000A0006F00C00467800000130500006B000500AE
|
||||
:1000B000732500FC6B0005009761010093810175B9
|
||||
:1000C0001701007F130101F493050040732620CC33
|
||||
:1000C0001701007F130101F493050040732610CC43
|
||||
:1000D000B385C5023301B140F32630CC63860600F8
|
||||
:1000E000130500006B000500678000009305050004
|
||||
:1000F0009306000013060000130500006F30400156
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -28,21 +28,9 @@ unsigned z[] = {0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
0, 0, 0, 0};
|
||||
|
||||
void mat_add_kernel(void * void_arguments) {
|
||||
void mat_add_kernel(int task_id, void * void_arguments) {
|
||||
mat_add_args_t * arguments = (mat_add_args_t *) void_arguments;
|
||||
|
||||
unsigned wid = vx_warp_id();
|
||||
unsigned tid = vx_thread_id();
|
||||
|
||||
bool valid = (wid < arguments->numRows) && (tid < arguments->numColums);
|
||||
|
||||
// __if (valid)
|
||||
// {
|
||||
unsigned index = (wid * arguments->numColums) + tid;
|
||||
unsigned val = arguments->x[index] + arguments->y[index];
|
||||
arguments->z[index] = val;
|
||||
// }
|
||||
// __endif
|
||||
arguments->z[task_id] = arguments->x[task_id] + arguments->y[task_id];
|
||||
}
|
||||
|
||||
int main() {
|
||||
@@ -98,7 +86,7 @@ int main() {
|
||||
ptr++;
|
||||
}
|
||||
|
||||
vx_printf("vx_spawn_warps mat_add_kernel\n");
|
||||
vx_printf("vx_spawn_tasks mat_add_kernel\n");
|
||||
|
||||
mat_add_args_t arguments;
|
||||
arguments.x = x;
|
||||
@@ -107,16 +95,13 @@ int main() {
|
||||
arguments.numColums = 4;
|
||||
arguments.numRows = 4;
|
||||
|
||||
int numWarps = 4;
|
||||
int numThreads = 4;
|
||||
|
||||
vx_spawn_warps(numWarps, numThreads, mat_add_kernel, &arguments);
|
||||
vx_spawn_tasks(arguments.numRows * arguments.numColums, mat_add_kernel, &arguments);
|
||||
|
||||
vx_printf("Waiting to ensure other warps are done... (Takes a while)\n");
|
||||
for (int i = 0; i < 5000; i++) {}
|
||||
|
||||
for (int i = 0; i < numWarps; i++) {
|
||||
for (int j = 0; j < numThreads; j++) {
|
||||
for (int i = 0; i < arguments.numRows; i++) {
|
||||
for (int j = 0; j < arguments.numColums; j++) {
|
||||
unsigned index = (i * arguments.numColums) + j;
|
||||
vx_printf("0x%x ", z[index]);
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user