Redesign vx_spawn_warps using an OpenCL-style scheduler

This commit is contained in:
Blaise Tine
2021-01-01 14:13:48 -05:00
parent 138db29310
commit 30d950ada2
35 changed files with 81204 additions and 81014 deletions

View File

@@ -8,59 +8,103 @@ extern "C" {
// Upper bound on the core ids handled here: it sizes the per-core pointer
// tables, and the spawn entry points return early when vx_core_id() is not
// below it.
#define NUM_CORES_MAX 16
// Minimum of two values. This is a function-like macro, so the selected
// argument is evaluated twice; do not pass expressions with side effects.
#define MIN(a, b) ((a) < (b) ? (a) : (b))
// NOTE(review): this span looks like a rendered diff hunk without +/- markers.
// The spawn_t members appear to be the removed struct and the wspawn_args_t
// members its replacement; both share the single "typedef struct {" line, so
// the text is not compilable as shown.
typedef struct {
// spawn_t: routine to run, its opaque argument, and a thread count.
func_t function;
void * arguments;
int nthreads;
} spawn_t;
// wspawn_args_t: per-core description of a task launch, read by the callbacks.
// callback is invoked as callback(task_id, args).
pfn_callback callback;
// Opaque pointer forwarded unchanged to callback.
void * args;
// Base task id added to every task id computed by the callbacks.
int offset;
// Tasks each thread of a warp executes before the extra one from R is added.
int N;
// Warps whose id is below R execute one additional task per thread.
int R;
} wspawn_args_t;
// Per-core launch descriptors, indexed by vx_core_id().
// NOTE(review): g_spawn appears to be the removed (pre-commit) table and
// g_wspawn_args its replacement; the diff view shows both.
spawn_t* g_spawn[NUM_CORES_MAX];
// The spawning function stores the address of a stack-local wspawn_args_t in
// its core's slot, so the pointer is only meaningful while that call is still
// on the stack.
wspawn_args_t* g_wspawn_args[NUM_CORES_MAX];
// NOTE(review): this span looks like a rendered diff hunk without +/- markers.
// Lines of the old spawn_warp_all() (p_spawn, tmask) are interleaved with the
// new spawn_tasks_callback(), so core_id and wid appear to be declared twice
// and the text is not compilable as shown.
//
// spawn_tasks_callback: warp entry point for the bulk of a core's tasks. Each
// thread derives a contiguous range of task ids from its warp id, thread id
// and the core's wspawn_args_t, then calls the user callback once per id.
void spawn_warp_all() {
// activate all threads
int num_threads = vx_num_threads();
vx_tmc(num_threads);
void spawn_tasks_callback() {
// activate all threads of this warp
vx_tmc(vx_num_threads());
int core_id = vx_core_id();
spawn_t* p_spawn = g_spawn[core_id];
int core_id = vx_core_id();
int wid = vx_warp_id();
int tid = vx_thread_id();
int NT = vx_num_threads();
// launch descriptor published for this core by the spawning function
wspawn_args_t* p_wspawn_args = g_wspawn_args[core_id];
// call user routine
p_spawn->function(p_spawn->arguments);
// wK: per-thread task count summed over all warps with a lower id
// (N each, plus one extra for each of the first R warps).
int wK = (p_wspawn_args->N * wid) + MIN(p_wspawn_args->R, wid);
// tK: number of tasks every thread of this warp executes.
int tK = p_wspawn_args->N + (wid < p_wspawn_args->R);
// First task id of this thread: lower warps consumed wK * NT ids, and
// lower threads of this warp consumed tK ids each.
int offset = p_wspawn_args->offset + (wK * NT) + (tid * tK);
// resume single-warp execution on exit
int wid = vx_warp_id();
unsigned tmask = (0 == wid) ? 0x1 : 0x0;
vx_tmc(tmask);
// Run tK consecutive task ids. The local N is the exclusive end id and is
// unrelated to p_wspawn_args->N.
for (int task_id = offset, N = task_id + tK; task_id < N; ++task_id) {
(p_wspawn_args->callback)(task_id, p_wspawn_args->args);
}
// Passes 1 to vx_tmc on warp 0 and 0 on every other warp, the same value the
// older tmask lines above compute ("resume single-warp execution on exit").
vx_tmc(0 == wid);
}
// NOTE(review): this span looks like a rendered diff hunk without +/- markers.
// Lines of the old spawn_warp_threads() (p_spawn, tmask) are interleaved with
// the new spawn_remaining_tasks_callback(), so core_id appears to be declared
// twice and the text is not compilable as shown.
//
// spawn_remaining_tasks_callback: runs one task per thread for the leftover
// tasks that do not fill a whole warp. nthreads is forwarded to vx_tmc().
void spawn_warp_threads(int num_threads) {
// activate the requested threads
vx_tmc(num_threads);
void spawn_remaining_tasks_callback(int nthreads) {
vx_tmc(nthreads);
int core_id = vx_core_id();
spawn_t* p_spawn = g_spawn[core_id];
int core_id = vx_core_id();
// NOTE(review): this uses vx_thread_gid() whereas spawn_tasks_callback uses
// vx_thread_id(). Presumably a global thread id; confirm that offset + tid
// still yields the intended task id on cores other than core 0.
int tid = vx_thread_gid();
// call user routine
p_spawn->function(p_spawn->arguments);
// launch descriptor published for this core by the spawning function
wspawn_args_t* p_wspawn_args = g_wspawn_args[core_id];
// resume single-warp execution on exit
int wid = vx_warp_id();
unsigned tmask = (0 == wid) ? 0x1 : 0x0;
vx_tmc(tmask);
// exactly one task per thread
int task_id = p_wspawn_args->offset + tid;
(p_wspawn_args->callback)(task_id, p_wspawn_args->args);
// hand vx_tmc the value 1 on exit
vx_tmc(1);
}
// NOTE(review): this span looks like a rendered diff hunk without +/- markers.
// The vx_spawn_warps() lines (signature, spawn_t setup, and the
// num_warps/num_threads launch further down) appear to be the removed
// pre-commit code and are kept verbatim; the fixes touch only vx_spawn_tasks().
void vx_spawn_warps(int num_warps, int num_threads, func_t func_ptr , void * args) {
int core_id = vx_core_id();
if (core_id >= NUM_CORES_MAX)
return;
spawn_t spawn = { func_ptr, args, num_threads };
g_spawn[core_id] = &spawn;
// vx_spawn_tasks: distribute num_tasks task ids over cores, warps and threads
// and invoke callback(task_id, args) for each id assigned to the calling core.
//
//   num_tasks  total number of tasks to execute
//   callback   routine invoked as callback(task_id, args)
//   args       opaque pointer forwarded to callback
//
// Every core is expected to call this. A core returns immediately when its id
// is >= NUM_CORES_MAX or when it is not needed for this task count. Each
// active core gets num_tasks / nc tasks starting at core_id * (num_tasks / nc);
// the last active core additionally takes the division remainder. Whole
// warp-widths of tasks go through spawn_tasks_callback, and the final partial
// warp (rT tasks) goes through spawn_remaining_tasks_callback.
void vx_spawn_tasks(int num_tasks, pfn_callback callback , void * args) {
// device specs
int NC = vx_num_cores();
int NW = vx_num_warps();
int NT = vx_num_threads();
if (num_warps > 1) {
vx_wspawn(num_warps, (unsigned)spawn_warp_all);
}
spawn_warp_threads(num_threads);
// current core id
int core_id = vx_core_id();
if (core_id >= NUM_CORES_MAX)
return;
// calculate necessary active cores: one core per full WT tasks, at least one,
// at most NC
int WT = NW * NT;
int nC = (num_tasks > WT) ? (num_tasks / WT) : 1;
int nc = MIN(nC, NC);
if (core_id >= nc)
return; // terminate unused cores
// number of tasks per core
int tasks_per_core = num_tasks / nc;
int tasks_per_core0 = tasks_per_core;
// The remainder must go to the last *active* core (nc-1). Comparing against
// NC-1 dropped it whenever nc < NC, because cores >= nc have already returned.
if (core_id == (nc-1)) {
int QC_r = num_tasks - (nc * tasks_per_core0);
tasks_per_core0 += QC_r; // last core executes remaining tasks
}
// number of tasks per warp
int nW = tasks_per_core0 / NT; // total warps per core
int rT = tasks_per_core0 - (nW * NT); // remaining threads
int fW = (nW >= NW) ? (nW / NW) : 0; // full warps iterations
int rW = (fW != 0) ? (nW - fW * NW) : 0; // remaining warps
// fewer warp-widths than warps: each launched warp runs one task per thread
if (0 == fW)
fW = 1;
//--
// Publish the launch descriptor for this core. It lives on this stack frame,
// so it is only valid until this function returns.
wspawn_args_t wspawn_args = { callback, args, core_id * tasks_per_core, fW, rW };
g_wspawn_args[core_id] = &wspawn_args;
//--
// Launch whenever there is at least one full warp-width of tasks. The previous
// "nW > 1" test skipped the nW == 1 case, so those NT tasks never ran.
// NOTE(review): presumably vx_wspawn(1, ...) activates no warp beyond the
// current one; confirm against the intrinsic's documentation.
if (nW >= 1) {
int nw = MIN(nW, NW);
vx_wspawn(nw, (unsigned)&spawn_tasks_callback);
spawn_tasks_callback();
}
//--
if (rT != 0) {
// NOTE(review): this offset omits the core_id * tasks_per_core base used
// above, and the callback adds vx_thread_gid(); verify the resulting task ids
// on cores other than core 0.
wspawn_args.offset = tasks_per_core0 - rT;
spawn_remaining_tasks_callback(rT);
}
}
#ifdef __cplusplus