uti: Add/Modify test programs

Change-Id: I27a39d6b11af5243f93d07c31c2ef80f6727dd53
This commit is contained in:
Masamichi Takagi
2018-09-03 15:09:15 +09:00
parent 52afbbbc98
commit 4438f994dc
97 changed files with 17368 additions and 0 deletions

216
test/uti/mpi/001.c Executable file
View File

@@ -0,0 +1,216 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <mpi.h>
#include <unistd.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
#include <sched.h>
//#define DEBUG
#ifdef DEBUG
#define dprintf printf
#else
#define dprintf {}
#endif
#define SZENTRY_DEFAULT (65536) /* Size of one slot */
#define NENTRY_DEFAULT 10000 /* Number of slots */
#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec))
/*
 * Print which CPU the calling thread last ran on, taken both from field 39
 * of /proc/<pid>/task/<tid>/stat and from sched_getcpu(), together with the
 * PMI_RANK environment variable (-1 when it is not set) and the thread id.
 *
 * Returns 0 on success and -1 when the stat file cannot be opened, read or
 * parsed, when the read buffer cannot be allocated, or when sched_getcpu()
 * fails.
 *
 * NOTE(review): fields are split on single spaces, so a command name
 * containing spaces would shift the field index.
 */
static int print_cpu_last_executed_on() {
	enum { STAT_BUF_SIZE = 65536, STAT_CPU_FIELD = 39 };
	char fn[256];
	char *result = NULL; /* NULL so that the shared exit path can always free() it */
	char *next_delim;
	char *field = NULL;
	const char *rank_str;
	pid_t tid = syscall(SYS_gettid);
	int fd = -1;
	int offset = 0;
	int amount;
	int cpu;
	int i;
	int mpi_errno = 0;

	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", (int)getpid(), (int)tid);
	fd = open(fn, O_RDONLY);
	if (fd == -1) {
		printf("open() failed\n");
		goto fn_fail;
	}

	result = malloc(STAT_BUF_SIZE);
	if (result == NULL) {
		printf("malloc() failed");
		goto fn_fail;
	}

	/* Read until EOF without ever writing past the buffer; keep one byte for the NUL */
	while (offset < STAT_BUF_SIZE - 1) {
		amount = read(fd, result + offset, STAT_BUF_SIZE - 1 - offset);
		if (amount == -1) {
			printf("read() failed");
			goto fn_fail;
		}
		if (amount == 0) {
			break;
		}
		offset += amount;
	}
	result[offset] = '\0'; /* strsep() needs a terminated string */

	/* After the loop, field points at the STAT_CPU_FIELD-th space separated token */
	next_delim = result;
	for (i = 0; i < STAT_CPU_FIELD; i++) {
		field = strsep(&next_delim, " ");
	}
	if (field == NULL) {
		printf("stat has fewer than %d fields\n", STAT_CPU_FIELD);
		goto fn_fail;
	}

	cpu = sched_getcpu();
	if (cpu == -1) {
		printf("sched_getcpu() failed\n");
		goto fn_fail;
	}

	rank_str = getenv("PMI_RANK");
	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n",
	       rank_str ? atoi(rank_str) : -1, atoi(field), cpu, (int)tid);
	fflush(stdout);

fn_exit:
	if (fd != -1) {
		close(fd);
	}
	free(result);
	return mpi_errno;
fn_fail:
	mpi_errno = -1;
	goto fn_exit;
}
/*
 * Exchange nentry messages of szentry bytes each.
 *
 * Rank 1 posts one MPI_Isend per slot of sendv towards dest; every other
 * rank posts one MPI_Irecv per slot of recvv from src, sleeps for usec
 * microseconds and only then waits. Both sides finish with MPI_Waitall on
 * reqs/status. When nentry > 10 a progress character is printed for every
 * tenth of the slots.
 */
void sendrecv(int rank, int nentry, char **sendv, char **recvv, int szentry, int src, int dest, MPI_Request* reqs, MPI_Status* status, double usec) {
	/* 0 disables the progress marks (nentry <= 10) */
	const int progress_step = (nentry > 10) ? nentry / 10 : 0;
	int slot;

	if (rank != 1) {
		/* receiving side */
		for (slot = 0; slot < nentry; ++slot) {
			MPI_Irecv(recvv[slot], szentry, MPI_CHAR, src, 0, MPI_COMM_WORLD, &reqs[slot]);
			if (progress_step != 0 && slot % progress_step == 0) {
				printf("r");
				fflush(stdout);
			}
		}
		usleep(usec);
		MPI_Waitall(nentry, reqs, status);
		printf("W\n");
		fflush(stdout);
		return;
	}

	/* sending side */
	for (slot = 0; slot < nentry; ++slot) {
		MPI_Isend(sendv[slot], szentry, MPI_CHAR, dest, 0, MPI_COMM_WORLD, &reqs[slot]);
		if (progress_step != 0 && slot % progress_step == 0) {
			printf("s");
			fflush(stdout);
		}
	}
	MPI_Waitall(nentry, reqs, status);
	printf("w\n");
	fflush(stdout);
}
/*
 * Usage: 001 [szentry nentry]
 *
 * Maps nentry send and nentry receive buffers of szentry bytes each and
 * runs sendrecv() twice between barriers. Rank 0 times both rounds; the
 * duration of the first round (in usec) is passed to the second round as
 * the delay before the wait.
 *
 * Returns 0 on success and 1 when arguments are invalid or an allocation
 * or mapping fails.
 */
int main(int argc, char **argv) {
	int my_rank = -1, size = -1;
	int i;
	int rc = 0;
	int actual;
	char **sendv = NULL, **recvv = NULL;
	MPI_Status *status;
	MPI_Request *reqs;
	long szentry;
	long nentry;
	int src, dest;
	struct timespec start, end;
	double diffusec = 0; /* only rank 0 measures it; other ranks must not pass garbage */

	if (argc == 3) {
		szentry = atoi(argv[1]);
		nentry = atoi(argv[2]);
	} else {
		szentry = SZENTRY_DEFAULT;
		nentry = NENTRY_DEFAULT;
	}
	printf("szentry=%ld,nentry=%ld\n", szentry, nentry);
	if (szentry <= 0 || nentry <= 0) {
		printf("szentry and nentry must be positive\n");
		return 1;
	}

	/* MPI is not initialised yet, so bail out directly instead of via fn_fail */
	status = malloc(sizeof(MPI_Status) * nentry);
	reqs = malloc(sizeof(MPI_Request) * nentry);
	if (!status || !reqs) {
		printf("malloc failed");
		free(status);
		free(reqs);
		return 1;
	}

	MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
	printf("Thread support level is %d\n", actual);
	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);
	src = (size + my_rank - 1) % size;
	dest = (my_rank + 1) % size;
	printf("rank=%d, size=%d, src=%d, dest=%d\n", my_rank, size, src, dest);

	sendv = malloc(sizeof(char *) * nentry);
	if(!sendv) { printf("malloc failed"); goto fn_fail; }
	for (i = 0; i < nentry; i++) {
#if 0
		int fd;
		fd = open("./file", O_RDWR);
		if(fd == -1) { printf("open failed\n"); goto fn_fail; }
		sendv[i] = (char*)mmap(0, szentry, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
#else
		sendv[i] = (char*)mmap(0, szentry, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
#endif
		if(sendv[i] == MAP_FAILED) { printf("mmap failed"); goto fn_fail; }
		dprintf("[%d] sendv[%d]=%p\n", my_rank, i, (void *)sendv[i]);
		memset(sendv[i], 0xaa, szentry);
	}

	recvv = malloc(sizeof(char *) * nentry);
	if(!recvv) { printf("malloc failed"); goto fn_fail; }
	for (i = 0; i < nentry; i++) {
#if 0
		int fd;
		fd = open("./file", O_RDWR);
		if(fd == -1) { printf("open failed\n"); goto fn_fail; }
		recvv[i] = (char*)mmap(0, szentry, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
#else
		recvv[i] = (char*)mmap(0, szentry, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
#endif
		if(recvv[i] == MAP_FAILED) { printf("mmap failed"); goto fn_fail; }
		dprintf("[%d] recvv[%d]=%p\n", my_rank, i, (void *)recvv[i]);
		memset(recvv[i], 0, szentry);
	}
	printf("after memset\n");

	print_cpu_last_executed_on();

	for (i = 0; i < 1; i++) {
		/* Round 1: wait immediately after posting the requests */
		MPI_Barrier(MPI_COMM_WORLD);
		if(my_rank == 0) {
			clock_gettime(CLOCK_REALTIME, &start);
		}
		sendrecv(my_rank, nentry, sendv, recvv, szentry, src, dest, reqs, status, 0);
		MPI_Barrier(MPI_COMM_WORLD);
		if(my_rank == 0) {
			clock_gettime(CLOCK_REALTIME, &end);
			diffusec = DIFFNSEC(end, start) / (double)1000;
			printf("%4.4f sec\n", DIFFNSEC(end, start) / (double)1000000000); fflush(stdout);
		}

		/* Round 2: receivers sleep for the duration of round 1 before waiting */
		MPI_Barrier(MPI_COMM_WORLD);
		if(my_rank == 0) {
			clock_gettime(CLOCK_REALTIME, &start);
		}
		sendrecv(my_rank, nentry, sendv, recvv, szentry, src, dest, reqs, status, diffusec);
		MPI_Barrier(MPI_COMM_WORLD);
		if(my_rank == 0) {
			clock_gettime(CLOCK_REALTIME, &end);
			printf("%4.4f sec\n", DIFFNSEC(end, start) / (double)1000000000); fflush(stdout);
		}
	}

fn_exit:
	MPI_Finalize();
	/* The mapped slots themselves are released when the process exits */
	free(recvv);
	free(sendv);
	free(reqs);
	free(status);
	return rc;
fn_fail:
	rc = 1;
	goto fn_exit;
}

127
test/uti/mpi/002.c Executable file
View File

@@ -0,0 +1,127 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <mpi.h>
#include <unistd.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
#include <sched.h>
//#define DEBUG
#ifdef DEBUG
#define dprintf printf
#else
#define dprintf {}
#endif
#define SZENTRY_DEFAULT (65536) /* Size of one slot */
#define NENTRY_DEFAULT 10000 /* Number of slots */
#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec))
/*
 * Print which CPU the calling thread last ran on, taken both from field 39
 * of /proc/<pid>/task/<tid>/stat and from sched_getcpu(), together with the
 * PMI_RANK environment variable (-1 when it is not set) and the thread id.
 *
 * Returns 0 on success and -1 when the stat file cannot be opened, read or
 * parsed, when the read buffer cannot be allocated, or when sched_getcpu()
 * fails.
 *
 * NOTE(review): fields are split on single spaces, so a command name
 * containing spaces would shift the field index.
 */
static int print_cpu_last_executed_on() {
	enum { STAT_BUF_SIZE = 65536, STAT_CPU_FIELD = 39 };
	char fn[256];
	char *result = NULL; /* NULL so that the shared exit path can always free() it */
	char *next_delim;
	char *field = NULL;
	const char *rank_str;
	pid_t tid = syscall(SYS_gettid);
	int fd = -1;
	int offset = 0;
	int amount;
	int cpu;
	int i;
	int mpi_errno = 0;

	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", (int)getpid(), (int)tid);
	fd = open(fn, O_RDONLY);
	if (fd == -1) {
		printf("open() failed\n");
		goto fn_fail;
	}

	result = malloc(STAT_BUF_SIZE);
	if (result == NULL) {
		printf("malloc() failed");
		goto fn_fail;
	}

	/* Read until EOF without ever writing past the buffer; keep one byte for the NUL */
	while (offset < STAT_BUF_SIZE - 1) {
		amount = read(fd, result + offset, STAT_BUF_SIZE - 1 - offset);
		if (amount == -1) {
			printf("read() failed");
			goto fn_fail;
		}
		if (amount == 0) {
			break;
		}
		offset += amount;
	}
	result[offset] = '\0'; /* strsep() needs a terminated string */

	/* After the loop, field points at the STAT_CPU_FIELD-th space separated token */
	next_delim = result;
	for (i = 0; i < STAT_CPU_FIELD; i++) {
		field = strsep(&next_delim, " ");
	}
	if (field == NULL) {
		printf("stat has fewer than %d fields\n", STAT_CPU_FIELD);
		goto fn_fail;
	}

	cpu = sched_getcpu();
	if (cpu == -1) {
		printf("sched_getcpu() failed\n");
		goto fn_fail;
	}

	rank_str = getenv("PMI_RANK");
	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n",
	       rank_str ? atoi(rank_str) : -1, atoi(field), cpu, (int)tid);
	fflush(stdout);

fn_exit:
	if (fd != -1) {
		close(fd);
	}
	free(result);
	return mpi_errno;
fn_fail:
	mpi_errno = -1;
	goto fn_exit;
}
/*
 * Usage: 002 <nloop>
 *
 * Runs nloop MPI_Barrier() calls on MPI_COMM_WORLD and lets rank 0 print
 * the elapsed wall-clock time.
 *
 * Returns 0 on success and 1 when the nloop argument is missing.
 */
int main(int argc, char **argv) {
	int my_rank = -1, size = -1;
	int i;
	int nloop;
	int actual;
	struct timespec start, end;

	/* argv[1] used to be dereferenced unconditionally */
	if (argc < 2) {
		printf("usage: %s <nloop>\n", argc > 0 ? argv[0] : "002");
		return 1;
	}
	nloop = atoi(argv[1]); /* parsed once instead of on every loop iteration */
	printf("nloop=%d\n", nloop);

	MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
	printf("Thread support level is %d\n", actual);
	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);

	print_cpu_last_executed_on();

	printf("Before 1st barrier\n"); fflush(stdout);
	MPI_Barrier(MPI_COMM_WORLD);
	printf("Before 2nd barrier\n"); fflush(stdout);

	if(my_rank == 0) {
		clock_gettime(CLOCK_REALTIME, &start);
	}
	for (i = 0; i < nloop; i++) {
		MPI_Barrier(MPI_COMM_WORLD);
	}
	if(my_rank == 0) {
		clock_gettime(CLOCK_REALTIME, &end);
		printf("%4.4f sec\n", DIFFNSEC(end, start) / (double)1000000000); fflush(stdout);
	}

	/*
	 * NOTE(review): MPI_Finalize() was already disabled in this test and
	 * is kept disabled; presumably exiting without finalizing is part of
	 * what is exercised - confirm before re-enabling.
	 */
	//MPI_Finalize();
	usleep(100000);
	return 0;
}

188
test/uti/mpi/003.c Executable file
View File

@@ -0,0 +1,188 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <mpi.h>
#include <unistd.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
#include <sched.h>
//#define DEBUG
#ifdef DEBUG
#define dprintf printf
#else
#define dprintf {}
#endif
#define SZENTRY_DEFAULT (65536) /* Size of one slot */
#define NENTRY_DEFAULT 10000 /* Number of slots */
#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec))
/*
 * Print which CPU the calling thread last ran on, taken both from field 39
 * of /proc/<pid>/task/<tid>/stat and from sched_getcpu(), together with the
 * PMI_RANK environment variable (-1 when it is not set) and the thread id.
 *
 * Returns 0 on success and -1 when the stat file cannot be opened, read or
 * parsed, when the read buffer cannot be allocated, or when sched_getcpu()
 * fails.
 *
 * NOTE(review): fields are split on single spaces, so a command name
 * containing spaces would shift the field index.
 */
static int print_cpu_last_executed_on() {
	enum { STAT_BUF_SIZE = 65536, STAT_CPU_FIELD = 39 };
	char fn[256];
	char *result = NULL; /* NULL so that the shared exit path can always free() it */
	char *next_delim;
	char *field = NULL;
	const char *rank_str;
	pid_t tid = syscall(SYS_gettid);
	int fd = -1;
	int offset = 0;
	int amount;
	int cpu;
	int i;
	int mpi_errno = 0;

	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", (int)getpid(), (int)tid);
	fd = open(fn, O_RDONLY);
	if (fd == -1) {
		printf("open() failed\n");
		goto fn_fail;
	}

	result = malloc(STAT_BUF_SIZE);
	if (result == NULL) {
		printf("malloc() failed");
		goto fn_fail;
	}

	/* Read until EOF without ever writing past the buffer; keep one byte for the NUL */
	while (offset < STAT_BUF_SIZE - 1) {
		amount = read(fd, result + offset, STAT_BUF_SIZE - 1 - offset);
		if (amount == -1) {
			printf("read() failed");
			goto fn_fail;
		}
		if (amount == 0) {
			break;
		}
		offset += amount;
	}
	result[offset] = '\0'; /* strsep() needs a terminated string */

	/* After the loop, field points at the STAT_CPU_FIELD-th space separated token */
	next_delim = result;
	for (i = 0; i < STAT_CPU_FIELD; i++) {
		field = strsep(&next_delim, " ");
	}
	if (field == NULL) {
		printf("stat has fewer than %d fields\n", STAT_CPU_FIELD);
		goto fn_fail;
	}

	cpu = sched_getcpu();
	if (cpu == -1) {
		printf("sched_getcpu() failed\n");
		goto fn_fail;
	}

	rank_str = getenv("PMI_RANK");
	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n",
	       rank_str ? atoi(rank_str) : -1, atoi(field), cpu, (int)tid);
	fflush(stdout);

fn_exit:
	if (fd != -1) {
		close(fd);
	}
	free(result);
	return mpi_errno;
fn_fail:
	mpi_errno = -1;
	goto fn_exit;
}
/*
 * Post nentry non-blocking transfers of szentry bytes that all use the
 * first slot only (sendv[0] / recvv[0]).
 *
 * Rank 1 posts MPI_Isend towards dest; every other rank posts MPI_Irecv
 * from src and sleeps usec microseconds before waiting. Both sides finish
 * with MPI_Waitall on reqs/status. When nentry > 10 a progress character
 * is printed for every tenth of the requests.
 */
void sendrecv(int rank, int nentry, char **sendv, char **recvv, int szentry, int src, int dest, MPI_Request* reqs, MPI_Status* status, double usec) {
	int i;
	if(rank == 1) {
		for(i = 0; i < nentry; i++) {
			/* nentry / 10 is 0 for nentry < 10; the guard avoids the modulo by zero */
			if (nentry > 10 && i % (nentry / 10) == 0) {
				printf("s"); fflush(stdout);
			}
			MPI_Isend(sendv[0], szentry, MPI_CHAR, dest, 0, MPI_COMM_WORLD, &reqs[i]);
		}
		printf("\n"); fflush(stdout);
		MPI_Waitall(nentry, reqs, status);
	} else {
		for(i = 0; i < nentry; i++) {
			if (nentry > 10 && i % (nentry / 10) == 0) {
				printf("r"); fflush(stdout);
			}
			MPI_Irecv(recvv[0], szentry, MPI_CHAR, src, 0, MPI_COMM_WORLD, &reqs[i]);
		}
		usleep(usec);
		MPI_Waitall(nentry, reqs, status);
	}
}
/*
 * Usage: 003 [szentry nentry]
 *
 * Maps a single send and a single receive buffer of szentry bytes, runs
 * sendrecv() once between two barriers and lets rank 0 print the elapsed
 * wall-clock time.
 *
 * Returns 0 on success and 1 when arguments are invalid or an allocation
 * or mapping fails.
 */
int main(int argc, char **argv) {
	int my_rank = -1, size = -1;
	int i;
	int rc = 0;
	int actual;
	char **sendv = NULL, **recvv = NULL;
	MPI_Status *status;
	MPI_Request *reqs;
	long szentry;
	long nentry;
	int src, dest;
	struct timespec start, end;

	if (argc == 3) {
		szentry = atoi(argv[1]);
		nentry = atoi(argv[2]);
	} else {
		szentry = SZENTRY_DEFAULT;
		nentry = NENTRY_DEFAULT;
	}
	if (szentry <= 0 || nentry <= 0) {
		printf("szentry and nentry must be positive\n");
		return 1;
	}

	/* MPI is not initialised yet, so bail out directly instead of via fn_fail */
	status = malloc(sizeof(MPI_Status) * nentry);
	reqs = malloc(sizeof(MPI_Request) * nentry);
	if (!status || !reqs) {
		printf("malloc failed");
		free(status);
		free(reqs);
		return 1;
	}

	MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
	printf("Thread support level is %d\n", actual);
	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);
	src = (size + my_rank - 1) % size;
	dest = (my_rank + 1) % size;
	printf("rank=%d, size=%d, src=%d, dest=%d\n", my_rank, size, src, dest);

	/* Only slot 0 is mapped: sendrecv() in this file reuses it for every request */
	sendv = malloc(sizeof(char *) * nentry);
	if(!sendv) { printf("malloc failed"); goto fn_fail; }
	for (i = 0; i < 1; i++) {
		sendv[i] = (char*)mmap(0, szentry, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
		if(sendv[i] == MAP_FAILED) { printf("mmap failed"); goto fn_fail; }
		dprintf("[%d] sendv[%d]=%p\n", my_rank, i, (void *)sendv[i]);
		memset(sendv[i], 0xaa, szentry);
	}

	recvv = malloc(sizeof(char *) * nentry);
	if(!recvv) { printf("malloc failed"); goto fn_fail; }
	for (i = 0; i < 1; i++) {
		recvv[i] = (char*)mmap(0, szentry, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
		if(recvv[i] == MAP_FAILED) { printf("mmap failed"); goto fn_fail; }
		dprintf("[%d] recvv[%d]=%p\n", my_rank, i, (void *)recvv[i]);
		memset(recvv[i], 0, szentry);
	}
	printf("after memset\n");

	print_cpu_last_executed_on();

	printf("Before 1st barrier\n"); fflush(stdout);
	MPI_Barrier(MPI_COMM_WORLD);
	if(my_rank == 0) {
		clock_gettime(CLOCK_REALTIME, &start);
	}
	sendrecv(my_rank, nentry, sendv, recvv, szentry, src, dest, reqs, status, 0);
	printf("Before 2nd barrier\n"); fflush(stdout);
	MPI_Barrier(MPI_COMM_WORLD);
	if(my_rank == 0) {
		clock_gettime(CLOCK_REALTIME, &end);
		printf("%4.4f sec\n", DIFFNSEC(end, start) / (double)1000000000); fflush(stdout);
	}

fn_exit:
	MPI_Finalize();
	/* The mapped slots themselves are released when the process exits */
	free(recvv);
	free(sendv);
	free(reqs);
	free(status);
	return rc;
fn_fail:
	rc = 1;
	goto fn_exit;
}

281
test/uti/mpi/004.c Executable file
View File

@@ -0,0 +1,281 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <mpi.h>
#include <unistd.h>
#include <getopt.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
#include <sched.h>
//#define DEBUG
#ifdef DEBUG
#define dprintf printf
#else
#define dprintf {}
#endif
#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec))
/*
 * One unit of busy work: a register-only x86-64 loop that clears rcx and
 * keeps adding 1 while rcx <= 99, i.e. 100 add/compare/branch rounds.
 * "volatile" keeps the compiler from dropping or merging the statement;
 * rcx and the condition codes are declared as clobbered.
 */
static inline void fixed_size_work() {
asm volatile(
"movq $0, %%rcx\n\t"
"1:\t"
"addq $1, %%rcx\n\t"
"cmpq $99, %%rcx\n\t"
"jle 1b\n\t"
:
:
: "rcx", "cc");
}
/* Run fixed_size_work() n times back to back. */
static inline void bulk_fsw(unsigned long n) {
	/* Same type as n: an int counter would overflow before reaching n > INT_MAX */
	unsigned long j;

	for (j = 0; j < n; j++) {
		fixed_size_work();
	}
}
/* Calibration results written by fwq_init() */
double nspw; /* nsec per work */
unsigned long nsec; /* thread CPU time (nsec) spent on the N_INIT calibration units */

#define N_INIT 10000000
/*
 * Calibrate the busy-work loop: time N_INIT calls of fixed_size_work() on
 * the calling thread's CPU clock and store the total in nsec and the
 * per-unit cost in nspw.
 */
void fwq_init() {
	struct timespec t_begin, t_end;

	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_begin);
	bulk_fsw(N_INIT);
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_end);

	nsec = DIFFNSEC(t_end, t_begin);
	nspw = (double)nsec / N_INIT;
}
#if 1
/*
 * Burn roughly delay_nsec nanoseconds of CPU time by running
 * delay_nsec / nspw work units, where nspw is the per-unit cost measured
 * by fwq_init(). Negative delays and a missing calibration are no-ops.
 */
void fwq(long delay_nsec) {
	if (delay_nsec < 0) {
		return;
	}
	/* Without a calibration the quotient below is not a finite count */
	if (nspw <= 0) {
		return;
	}
	bulk_fsw(delay_nsec / nspw);
}
#else /* For machines with large core-to-core performance variation (e.g. OFP) */
/*
 * Burn delay_nsec nanoseconds of CPU time by polling the thread CPU clock
 * between small batches of work units. Negative delays are a no-op.
 */
void fwq(long delay_nsec) {
	struct timespec start, end;

	/* DIFFNSEC() is unsigned, so a negative delay would compare as a huge value */
	if (delay_nsec < 0) {
		return;
	}
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
	while (1) {
		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
		if (DIFFNSEC(end, start) >= delay_nsec) {
			break;
		}
		bulk_fsw(2); /* ~150 ns per iteration on OFP */
	}
}
#endif
/*
 * Print which CPU the calling thread last ran on, taken both from field 39
 * of /proc/<pid>/task/<tid>/stat and from sched_getcpu(), together with the
 * PMI_RANK environment variable (-1 when it is not set), the process id and
 * the thread id.
 *
 * Returns 0 on success and -1 when the stat file cannot be opened, read or
 * parsed, when the read buffer cannot be allocated, or when sched_getcpu()
 * fails.
 *
 * NOTE(review): fields are split on single spaces, so a command name
 * containing spaces would shift the field index.
 */
static int print_cpu_last_executed_on() {
	enum { STAT_BUF_SIZE = 65536, STAT_CPU_FIELD = 39 };
	char fn[256];
	char *result = NULL; /* NULL so that the shared exit path can always free() it */
	char *next_delim;
	char *field = NULL;
	const char *rank_str;
	pid_t tid = syscall(SYS_gettid);
	int fd = -1;
	int offset = 0;
	int amount;
	int cpu;
	int i;
	int mpi_errno = 0;

	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", (int)getpid(), (int)tid);
	fd = open(fn, O_RDONLY);
	if (fd == -1) {
		printf("open() failed\n");
		goto fn_fail;
	}

	result = malloc(STAT_BUF_SIZE);
	if (result == NULL) {
		printf("malloc() failed");
		goto fn_fail;
	}

	/* Read until EOF without ever writing past the buffer; keep one byte for the NUL */
	while (offset < STAT_BUF_SIZE - 1) {
		amount = read(fd, result + offset, STAT_BUF_SIZE - 1 - offset);
		if (amount == -1) {
			printf("read() failed");
			goto fn_fail;
		}
		if (amount == 0) {
			break;
		}
		offset += amount;
	}
	result[offset] = '\0'; /* strsep() needs a terminated string */

	/* After the loop, field points at the STAT_CPU_FIELD-th space separated token */
	next_delim = result;
	for (i = 0; i < STAT_CPU_FIELD; i++) {
		field = strsep(&next_delim, " ");
	}
	if (field == NULL) {
		printf("stat has fewer than %d fields\n", STAT_CPU_FIELD);
		goto fn_fail;
	}

	cpu = sched_getcpu();
	if (cpu == -1) {
		printf("sched_getcpu() failed\n");
		goto fn_fail;
	}

	rank_str = getenv("PMI_RANK");
	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,pid=%d,tid=%d\n",
	       rank_str ? atoi(rank_str) : -1, atoi(field), cpu, (int)getpid(), (int)tid);
	fflush(stdout);

fn_exit:
	if (fd != -1) {
		close(fd);
	}
	free(result);
	return mpi_errno;
fn_fail:
	mpi_errno = -1;
	goto fn_exit;
}
/*
 * Return non-zero when ranks me and you fall into the same group of ppn
 * consecutive ranks (integer division of each rank by ppn).
 */
static inline int on_same_node(int ppn, int me, int you) {
	const int my_node = me / ppn;
	const int your_node = you / ppn;

	return my_node == your_node;
}
/*
 * isend-calc-wait
 *
 * For every rank that is not on this rank's node, post one MPI_Irecv into
 * the next ndoubles-sized chunk of rbuf and one MPI_Isend from the next
 * chunk of sbuf, then burn calc_nsec of CPU time with fwq() and finally
 * wait for all posted requests.
 */
void my_send(int nproc, int ppn, int rank, double *sbuf, double *rbuf, int ndoubles, MPI_Request* reqs, long calc_nsec) {
	int peer;
	int chunk = 0; /* index of the next unused send/receive chunk */
	int nreqs = 0; /* number of requests stored in reqs so far */

	for (peer = 0; peer < nproc; peer++) {
		if (on_same_node(ppn, rank, peer)) {
			continue;
		}
		MPI_Irecv(rbuf + chunk * ndoubles, ndoubles, MPI_DOUBLE, peer, 0, MPI_COMM_WORLD, &reqs[nreqs]);
		nreqs++;
		MPI_Isend(sbuf + chunk * ndoubles, ndoubles, MPI_DOUBLE, peer, 0, MPI_COMM_WORLD, &reqs[nreqs]);
		nreqs++;
		chunk++;
	}

	fwq(calc_nsec);
	MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE);
}
/* Long options for getopt_long(): "--ppn <arg>" is reported as 'P'. */
static struct option options[] = {
	/* name, has_arg, flag, val */
	{ "ppn", required_argument, NULL, 'P' },
	/* end */
	{ NULL, 0, NULL, 0 },
};
/*
 * Usage: 004 -d <log2 of doubles per message> --ppn <processes per node>
 *
 * Measures the average time of my_send() without computation (t_pure) and
 * with t_pure nanoseconds of computation between posting and waiting
 * (t_overall), takes the maximum over all ranks and lets rank 0 print the
 * resulting overlap percentage.
 *
 * Returns 0 on success and 1 when MPI_THREAD_MULTIPLE is not granted or an
 * allocation fails; invalid options terminate the process with status 1.
 */
int main(int argc, char **argv) {
	int rc = 0;
	int actual;
	int ppn = -1;
	int nproc;
	int ndoubles = -1;
	int my_rank = -1;
	int i;
	double *sbuf = NULL, *rbuf = NULL;
	MPI_Request *reqs = NULL;
	struct timespec start, end;
	long t_pure_l, t_overall_l;
	long t_pure, t_overall;
	size_t bufsz;
	int opt;

	fwq_init();

	while ((opt = getopt_long(argc, argv, "+d:P:", options, NULL)) != -1) {
		switch (opt) {
		case 'd': {
			/* Shifting by a negative or too large amount is undefined */
			int shift = atoi(optarg);
			if (shift < 0 || shift > 30) {
				printf("-d expects an exponent in the range 0..30\n");
				exit(1);
			}
			ndoubles = 1 << shift;
			break;
		}
		case 'P':
			ppn = atoi(optarg);
			break;
		default: /* '?' */
			printf("unknown option %c\n", optopt);
			exit(1);
		}
	}
	if (ndoubles == -1 || ppn == -1) {
		printf("specify ndoubles with -d and ppn with --ppn");
		exit(1);
	}
	/* on_same_node() divides by ppn */
	if (ppn <= 0) {
		printf("ppn must be positive\n");
		exit(1);
	}

	MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
	if (actual != MPI_THREAD_MULTIPLE) {
		printf("ERROR: Thread support level is %d (it should be %d)\n", actual, MPI_THREAD_MULTIPLE);
		goto fn_fail;
	}
	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
	MPI_Comm_size(MPI_COMM_WORLD, &nproc);
	if (my_rank == 0) {
		/* syscall() returns long; cast to match %d */
		printf("tid=%d,pid=%d,ndoubles=%d,nproc=%d\n", (int)syscall(__NR_gettid), (int)getpid(), ndoubles, nproc);
		printf("nsec=%lu, nspw=%f\n", nsec, nspw);
	}

	reqs = (MPI_Request*)malloc(sizeof(MPI_Request) * nproc * 2);
	if(!reqs) { printf("malloc failed"); goto fn_fail; }

	/* One ndoubles-sized chunk per peer; clear all of them, not just the first */
	bufsz = sizeof(double) * ndoubles * nproc;
	sbuf = malloc(bufsz);
	if(!sbuf) { printf("malloc failed"); goto fn_fail; }
	memset(sbuf, 0, bufsz);
	printf("tid=%d,pid=%d,sbuf=%p\n", (int)syscall(__NR_gettid), (int)getpid(), (void *)sbuf);
	rbuf = malloc(bufsz);
	if(!rbuf) { printf("malloc failed"); goto fn_fail; }
	memset(rbuf, 0, bufsz);
	printf("tid=%d,pid=%d,rbuf=%p\n", (int)syscall(__NR_gettid), (int)getpid(), (void *)rbuf);

	print_cpu_last_executed_on();

	/* Measure isend-wait time */
	MPI_Barrier(MPI_COMM_WORLD);
#define NSKIP 5
#define NPURE 30
	for (i = 0; i < NPURE + NSKIP; i++) {
		/* The first NSKIP rounds are warm-up and are not timed */
		if (i == NSKIP) {
			clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
		}
		my_send(nproc, ppn, my_rank, sbuf, rbuf, ndoubles, reqs, 0);
	}
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
	t_pure_l = DIFFNSEC(end, start) / NPURE;
	MPI_Allreduce(&t_pure_l, &t_pure, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
	if (my_rank == 0) printf("t_pure (max): %ld usec\n", t_pure / 1000UL);

	/* Measure isend-calc-wait time */
	MPI_Barrier(MPI_COMM_WORLD);
#define NOVERALL 30
	for (i = 0; i < NOVERALL + NSKIP; i++) {
		if (i == NSKIP) {
			clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
		}
		my_send(nproc, ppn, my_rank, sbuf, rbuf, ndoubles, reqs, t_pure);
	}
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
	t_overall_l = DIFFNSEC(end, start) / NOVERALL;
	MPI_Allreduce(&t_overall_l, &t_overall, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
	if (my_rank == 0) printf("t_overall (max): %ld usec\n", t_overall / 1000UL);

	if (my_rank == 0) {
		long t_abs = (t_pure * 2) - t_overall;
		printf("overlap: %.2f %%\n", (t_abs * 100) / (double)t_pure);
	}

fn_exit:
	free(rbuf);
	free(sbuf);
	free(reqs);
	MPI_Finalize();
	return rc;
fn_fail:
	rc = 1;
	goto fn_exit;
}

338
test/uti/mpi/005.c Executable file
View File

@@ -0,0 +1,338 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <mpi.h>
#include <unistd.h>
#include <getopt.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
#include <sched.h>
//#define DEBUG
#ifdef DEBUG
#define dprintf printf
#else
#define dprintf {}
#endif
#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec))
/*
 * RMA epoch helpers. The active variant opens and closes an epoch with
 * MPI_Win_fence() and makes BAR_EPOCH a no-op; the disabled variant uses
 * MPI_Win_lock_all()/MPI_Win_unlock_all() and makes BAR_EPOCH a barrier
 * on MPI_COMM_WORLD.
 */
#if 1
#define BEGIN_EPOCH(win) do { MPI_Win_fence(0, win); } while(0)
#define END_EPOCH(win) do { MPI_Win_fence(0, win); } while(0)
#define BAR_EPOCH do { } while(0)
#else
#define BEGIN_EPOCH(win) do { MPI_Win_lock_all(0, win); } while(0)
#define END_EPOCH(win) do { MPI_Win_unlock_all(win); } while(0)
#define BAR_EPOCH do { MPI_Barrier(MPI_COMM_WORLD); } while(0)
#endif
/*
 * One unit of busy work: a register-only x86-64 loop that clears rcx and
 * keeps adding 1 while rcx <= 99, i.e. 100 add/compare/branch rounds.
 * "volatile" keeps the compiler from dropping or merging the statement;
 * rcx and the condition codes are declared as clobbered.
 */
static inline void fixed_size_work() {
asm volatile(
"movq $0, %%rcx\n\t"
"1:\t"
"addq $1, %%rcx\n\t"
"cmpq $99, %%rcx\n\t"
"jle 1b\n\t"
:
:
: "rcx", "cc");
}
/* Run fixed_size_work() n times back to back. */
static inline void bulk_fsw(unsigned long n) {
	/* Same type as n: an int counter would overflow before reaching n > INT_MAX */
	unsigned long j;

	for (j = 0; j < n; j++) {
		fixed_size_work();
	}
}
/* Calibration results written by fwq_init() */
double nspw; /* nsec per work */
unsigned long nsec; /* thread CPU time (nsec) spent on the N_INIT calibration units */

#define N_INIT 10000000
/*
 * Calibrate the busy-work loop: time N_INIT calls of fixed_size_work() on
 * the calling thread's CPU clock and store the total in nsec and the
 * per-unit cost in nspw.
 */
void fwq_init() {
	struct timespec t_begin, t_end;

	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_begin);
	bulk_fsw(N_INIT);
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_end);

	nsec = DIFFNSEC(t_end, t_begin);
	nspw = (double)nsec / N_INIT;
}
#if 1
/*
 * Burn roughly delay_nsec nanoseconds of CPU time by running
 * delay_nsec / nspw work units, where nspw is the per-unit cost measured
 * by fwq_init(). Negative delays and a missing calibration are no-ops.
 */
void fwq(long delay_nsec) {
	if (delay_nsec < 0) {
		return;
	}
	/* Without a calibration the quotient below is not a finite count */
	if (nspw <= 0) {
		return;
	}
	bulk_fsw(delay_nsec / nspw);
}
#else /* For machines with large core-to-core performance variation (e.g. OFP) */
/*
 * Burn delay_nsec nanoseconds of CPU time by polling the thread CPU clock
 * between small batches of work units. Negative delays are a no-op.
 */
void fwq(long delay_nsec) {
	struct timespec start, end;

	/* DIFFNSEC() is unsigned, so a negative delay would compare as a huge value */
	if (delay_nsec < 0) {
		return;
	}
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
	while (1) {
		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
		if (DIFFNSEC(end, start) >= delay_nsec) {
			break;
		}
		bulk_fsw(2); /* ~150 ns per iteration on OFP */
	}
}
#endif
/*
 * Print which CPU the calling thread last ran on, taken both from field 39
 * of /proc/<pid>/task/<tid>/stat and from sched_getcpu(), together with the
 * PMI_RANK environment variable (-1 when it is not set) and the thread id.
 *
 * Returns 0 on success and -1 when the stat file cannot be opened, read or
 * parsed, when the read buffer cannot be allocated, or when sched_getcpu()
 * fails.
 *
 * NOTE(review): fields are split on single spaces, so a command name
 * containing spaces would shift the field index.
 */
static int print_cpu_last_executed_on() {
	enum { STAT_BUF_SIZE = 65536, STAT_CPU_FIELD = 39 };
	char fn[256];
	char *result = NULL; /* NULL so that the shared exit path can always free() it */
	char *next_delim;
	char *field = NULL;
	const char *rank_str;
	pid_t tid = syscall(SYS_gettid);
	int fd = -1;
	int offset = 0;
	int amount;
	int cpu;
	int i;
	int mpi_errno = 0;

	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", (int)getpid(), (int)tid);
	fd = open(fn, O_RDONLY);
	if (fd == -1) {
		printf("open() failed\n");
		goto fn_fail;
	}

	result = malloc(STAT_BUF_SIZE);
	if (result == NULL) {
		printf("malloc() failed");
		goto fn_fail;
	}

	/* Read until EOF without ever writing past the buffer; keep one byte for the NUL */
	while (offset < STAT_BUF_SIZE - 1) {
		amount = read(fd, result + offset, STAT_BUF_SIZE - 1 - offset);
		if (amount == -1) {
			printf("read() failed");
			goto fn_fail;
		}
		if (amount == 0) {
			break;
		}
		offset += amount;
	}
	result[offset] = '\0'; /* strsep() needs a terminated string */

	/* After the loop, field points at the STAT_CPU_FIELD-th space separated token */
	next_delim = result;
	for (i = 0; i < STAT_CPU_FIELD; i++) {
		field = strsep(&next_delim, " ");
	}
	if (field == NULL) {
		printf("stat has fewer than %d fields\n", STAT_CPU_FIELD);
		goto fn_fail;
	}

	cpu = sched_getcpu();
	if (cpu == -1) {
		printf("sched_getcpu() failed\n");
		goto fn_fail;
	}

	rank_str = getenv("PMI_RANK");
	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n",
	       rank_str ? atoi(rank_str) : -1, atoi(field), cpu, (int)tid);
	fflush(stdout);

fn_exit:
	if (fd != -1) {
		close(fd);
	}
	free(result);
	return mpi_errno;
fn_fail:
	mpi_errno = -1;
	goto fn_exit;
}
/*
 * Return non-zero when ranks me and you fall into the same group of ppn
 * consecutive ranks (integer division of each rank by ppn).
 */
static inline int on_same_node(int ppn, int me, int you) {
	const int my_node = me / ppn;
	const int your_node = you / ppn;

	return my_node == your_node;
}
/*
 * fence-accumulate-calc-fence
 *
 * Inside one epoch on win: for every rank that is not on this rank's node,
 * issue one single-double MPI_Accumulate (MPI_SUM) per element, taking the
 * origin element from rbuf at offset target * ndoubles + elem and using the
 * same offset as the target displacement. Then burn calc_nsec of CPU time
 * with fwq() before the epoch is closed. wbuf is not referenced here.
 */
void accumulate(int nproc, int ppn, int rank, double *wbuf, double *rbuf, int ndoubles, MPI_Win win, long calc_nsec) {
	int target;
	int elem;

	BEGIN_EPOCH(win);
	for (target = 0; target < nproc; target++) {
		if (on_same_node(ppn, rank, target)) {
			continue;
		}
		for (elem = 0; elem < ndoubles; elem++) {
			MPI_Accumulate(rbuf + target * ndoubles + elem, 1, MPI_DOUBLE, target, target * ndoubles + elem, 1, MPI_DOUBLE, MPI_SUM, win);
		}
	}
	fwq(calc_nsec);
	END_EPOCH(win);
}
/* Long options for getopt_long(): "--ppn <arg>" is reported as 'P'. */
static struct option options[] = {
	/* name, has_arg, flag, val */
	{ "ppn", required_argument, NULL, 'P' },
	/* end */
	{ NULL, 0, NULL, 0 },
};
int main(int argc, char **argv) {
int rc;
int actual;
int ppn = -1;
int nproc;
int ndoubles = -1;
int my_rank = -1, size = -1;
int i, j;
double *wbuf, *rbuf;
MPI_Win win;
struct timespec start, end;
long t_fence_l, t_pure_l, t_overall_l;
long t_fence, t_pure, t_overall;
int opt;
fwq_init();
while ((opt = getopt_long(argc, argv, "+d:P:", options, NULL)) != -1) {
switch (opt) {
case 'd':
ndoubles = (1ULL << atoi(optarg));
break;
case 'P':
ppn = atoi(optarg);
break;
default: /* '?' */
printf("unknown option %c\n", optopt);
exit(1);
}
}
if (ndoubles == -1 || ppn == -1) {
printf("specify ndoubles with -d and ppn with --ppn");
exit(1);
}
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
if (actual != 3) {
printf("ERROR: MPI_THREAD_MULTIPLE not available (level was set to %d)\n", actual);
exit(1);
}
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &nproc);
if (my_rank == 0) {
printf("ndoubles=%d,nproc=%d\n", ndoubles, nproc);
printf("nsec=%ld, nspw=%f\n", nsec, nspw);
}
/* write-to buffer */
wbuf = malloc(sizeof(double) * ndoubles * nproc);
if(!wbuf) { printf("malloc failed"); goto fn_fail; }
memset(wbuf, 0, sizeof(double) * ndoubles * nproc);
/* read-from buffer */
rbuf = malloc(sizeof(double) * ndoubles * nproc);
if(!rbuf) { printf("malloc failed"); goto fn_fail; }
memset(rbuf, 0, sizeof(double) * ndoubles * nproc);
if (rc = MPI_Win_create(wbuf, sizeof(double) * ndoubles * nproc, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win)) {
printf("MPI_Win_create failed,rc=%d\n", rc);
}
print_cpu_last_executed_on();
for (i = 0; i < nproc; i++) {
for (j = 0; j < ndoubles; j++) {
wbuf[i * ndoubles + j] = i + 1 + j;
rbuf[i * ndoubles + j] = (i + 1) * 2 + j;
}
}
#if 0
for (i = 0; i < nproc; i++) {
for (j = 0; j < ndoubles; j++) {
printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]);
printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]);
}
}
#endif
/* Measure fence-fence time */
MPI_Barrier(MPI_COMM_WORLD);
#define NSKIP 5
#define NFENCE 30
for (i = 0; i < NFENCE + NSKIP; i++) {
if (i == NSKIP) {
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
}
BEGIN_EPOCH(win);
END_EPOCH(win);
}
BAR_EPOCH;
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
t_fence_l = DIFFNSEC(end, start) / NFENCE;
//printf("t_fence (local): %ld usec\n", t_fence_l / 1000UL);
MPI_Allreduce(&t_fence_l, &t_fence, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
if (my_rank == 0) printf("t_fence (max): %ld usec\n", t_fence / 1000UL);
/* Measure fence-acc-fence time */
MPI_Barrier(MPI_COMM_WORLD);
#define NPURE 30
for (i = 0; i < NPURE + NSKIP; i++) {
if (i == NSKIP) {
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
}
accumulate(nproc, ppn, my_rank, wbuf, rbuf, ndoubles, win, 0);
}
BAR_EPOCH;
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
t_pure_l = DIFFNSEC(end, start) / NPURE;
//printf("t_pure (local): %ld usec\n", t_pure_l / 1000UL);
MPI_Allreduce(&t_pure_l, &t_pure, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
if (my_rank == 0) printf("t_pure (max): %ld usec\n", t_pure / 1000UL);
#if 0
for (i = 0; i < nproc; i++) {
for (j = 0; j < ndoubles; j++) {
printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]);
printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]);
}
}
#endif
/* Measure fenc-acc-calc-fence time */
MPI_Barrier(MPI_COMM_WORLD);
#define NOVERALL 30
for (i = 0; i < NOVERALL + NSKIP; i++) {
if (i == NSKIP) {
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
}
accumulate(nproc, ppn, my_rank, wbuf, rbuf, ndoubles, win, t_pure - t_fence);
}
BAR_EPOCH;
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
t_overall_l = DIFFNSEC(end, start) / NOVERALL;
//printf("t_overall (local): %ld usec\n", t_overall_l / 1000UL);
MPI_Allreduce(&t_overall_l, &t_overall, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
if (my_rank == 0) printf("t_overall (max): %ld usec\n", t_overall / 1000UL);
if (my_rank == 0) {
long t_abs = (t_pure * 2) - t_overall;
printf("overlap: %.2f %%\n", (t_abs * 100) / (double)t_pure);
}
fn_exit:
MPI_Finalize();
return 0;
fn_fail:
goto fn_exit;
}

127
test/uti/mpi/005.sh Executable file
View File

@@ -0,0 +1,127 @@
#!/usr/bin/bash
#!/usr/bin/bash -x
#
# Driver for the ./005 test: optionally stops and/or reboots McKernel,
# writes job.sh into ${UTI_MPI_TOP} and, with -g, builds the test and runs
# the generated job.sh.
#
# Options:
#   -s      stop McKernel (effective only together with -m)
#   -r      reboot McKernel (effective only together with -m)
#   -g      build ./005 and run the generated job.sh
#   -a      set async=1 (written to the job as I_MPI_ASYNC_PROGRESS)
#   -c CMD  store CMD in cmdline
#   -n N    store N in ndoubles
#   -m      select the McKernel configuration
#   -d      export DISABLE_UTI=1
#   -l N    store N in nloops
#   -P N    processes per node, used for the mcexec options
#   -o N    thread count used for I_MPI_PIN_DOMAIN
#
# mck_dir, uti_thread_rank, use_hfi, async and async_progress_pin may be
# preset in the environment.
#
# NOTE(review): cmdline, ndoubles, nloops, mcexec and mcexecopt are computed
# but the generated job.sh still runs a fixed command line - confirm whether
# the job should be built from these variables.

MYHOME=$HOME
UTI_MPI_TOP=${MYHOME}/project/os/mckernel/test/uti/mpi
MCK=${MYHOME}/project/os/install

unset DISABLE_UTI

cmdline="./005"

stop=0
reboot=0
go=0
mck=0
nloops=1
ppn=1

# Defaults for variables that are compared numerically or expanded below;
# values already present in the environment are kept.
async=${async:-0}
use_hfi=${use_hfi:-0}
omp_num_threads=${omp_num_threads:-0}
mck_dir=${mck_dir:-${MCK}}

while getopts srgac:n:mdl:P:o: OPT
do
	case ${OPT} in
	s) stop=1
		;;
	r) reboot=1
		;;
	g) go=1
		;;
	a) async=1
		;;
	c) cmdline=$OPTARG
		;;
	n) ndoubles=$OPTARG
		;;
	m) mck=1
		;;
	d) export DISABLE_UTI=1
		;;
	l) nloops=$OPTARG
		;;
	P) ppn=$OPTARG
		;;
	o) omp_num_threads=$OPTARG
		;;
	*) echo "invalid option -${OPT}" >&2
		exit 1
	esac
done

if [ "${mck}" -eq 1 ]; then
	mcexec="${mck_dir}/bin/mcexec"
	mcexecopt="--enable-uti --uti-thread-rank=$uti_thread_rank"
	if [ "${use_hfi}" -eq 1 ]; then
		mcexecopt="--enable-hfi1 $mcexecopt"
	fi
	mcexecopt="-n $ppn -t $((256 / ppn + 4)) -m 1 $mcexecopt"
else
	mcexec=
	mcexecopt=
fi

if [ "${mck}" -eq 1 ]; then
	i_mpi_pin=off
else
	i_mpi_pin=on
fi

if [ "$i_mpi_pin" == on ] ; then
	i_mpi_pin_domain="export I_MPI_PIN_DOMAIN=$((omp_num_threads + 1)):scatter"
else
	i_mpi_pin_domain=
fi

if [ "${async}" -eq 0 ] || [ "$async_progress_pin" == "" ] ; then
	i_mpi_async_progress_pin=
else
	i_mpi_async_progress_pin="export I_MPI_ASYNC_PROGRESS_PIN=$async_progress_pin"
fi

if [ "${stop}" -eq 1 ]; then
	if [ "${mck}" -eq 1 ]; then
		sudo ${MCK}/sbin/mcstop+release.sh
	else
		:
	fi
fi

if [ "${reboot}" -eq 1 ]; then
	if [ "${mck}" -eq 1 ]; then
		if hostname | grep ofp &>/dev/null; then
			sudo ${MCK}/sbin/mcreboot.sh -s -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
		else
			sudo ${MCK}/sbin/mcreboot.sh -s -c 1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 1-15:0+65-79:64+129-143:128+193-207:192+17-31:16+81-95:80+145-159:144+209-223:208+33-47:32+97-111:96+161-175:160+225-239:224+49-63:48+113-127:112+177-191:176+241-255:240 -m 12G@0,12G@1,12G@2,12G@3,3920M@4,3920M@5,3920M@6,3920M@7
		fi
	else
		:
	fi
fi

# job.sh must be written into, and started from, the test directory
cd "${UTI_MPI_TOP}" || exit 1

# The here-document is unquoted on purpose: variables are expanded now, at
# generation time, not when job.sh runs.
(
cat <<EOF
#!/bin/sh
export I_MPI_DEBUG=4
export I_MPI_HYDRA_DEBUG=on
export PSM2_RCVTHREAD=0
export I_MPI_PIN=$i_mpi_pin
$i_mpi_pin_domain
export KMP_AFFINITY=granularity=thread,scatter
export I_MPI_ASYNC_PROGRESS=$async
$i_mpi_async_progress_pin
${MCK}/bin/mcexec taskset -c 3 ./005 --ppn 16
EOF
) > ./job.sh
# ./job.sh is executed directly below, so it needs the execute bit
chmod +x ./job.sh

if [ "${go}" -eq 1 ]; then
	cd "${UTI_MPI_TOP}" || exit 1
	# Build the binary the job actually runs (this used to build 008)
	make CC=gcc 005
	./job.sh
fi

625
test/uti/mpi/006.c Executable file
View File

@@ -0,0 +1,625 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <mpi.h>
#include <unistd.h>
#include <getopt.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
#include <sched.h>
#include <errno.h>
#include <psm2.h> /* required for core PSM2 functions */
#include <psm2_mq.h> /* required for PSM2 MQ functions (send, recv, etc) */
//#define DEBUG
#ifdef DEBUG
#define dprintf printf
#else
#define dprintf {}
#endif
#define BUFFER_LENGTH 8000000
#define CONNECT_ARRAY_SIZE 8
/*
 * Report msg followed by the numeric code rc on stderr.
 * Despite its name this function returns to the caller; it does not
 * terminate the process.
 */
void die(char *msg, int rc) {
	FILE *err_stream = stderr;

	fprintf(err_stream, "%s: %d\n", msg, rc);
}
#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec))
/*
 * One unit of busy work: a register-only x86-64 loop that clears rcx and
 * keeps adding 1 while rcx <= 99, i.e. 100 add/compare/branch rounds.
 * "volatile" keeps the compiler from dropping or merging the statement;
 * rcx and the condition codes are declared as clobbered.
 */
static inline void fixed_size_work() {
asm volatile(
"movq $0, %%rcx\n\t"
"1:\t"
"addq $1, %%rcx\n\t"
"cmpq $99, %%rcx\n\t"
"jle 1b\n\t"
:
:
: "rcx", "cc");
}
/* Run fixed_size_work() n times back to back. */
static inline void bulk_fsw(unsigned long n) {
	/* Same type as n: an int counter would overflow before reaching n > INT_MAX */
	unsigned long j;

	for (j = 0; j < n; j++) {
		fixed_size_work();
	}
}
/* Calibration results written by fwq_init() */
double nspw; /* nsec per work */
unsigned long nsec; /* thread CPU time (nsec) spent on the N_INIT calibration units */

#define N_INIT 10000000
/*
 * Calibrate the busy-work loop: time N_INIT calls of fixed_size_work() on
 * the calling thread's CPU clock and store the total in nsec and the
 * per-unit cost in nspw.
 */
void fwq_init() {
	struct timespec t_begin, t_end;

	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_begin);
	bulk_fsw(N_INIT);
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_end);

	nsec = DIFFNSEC(t_end, t_begin);
	nspw = (double)nsec / N_INIT;
}
#if 1
/*
 * Burn roughly delay_nsec nanoseconds of CPU time by running
 * delay_nsec / nspw work units, where nspw is the per-unit cost measured
 * by fwq_init(). Negative delays and a missing calibration are no-ops.
 */
void fwq(long delay_nsec) {
	if (delay_nsec < 0) {
		return;
	}
	/* Without a calibration the quotient below is not a finite count */
	if (nspw <= 0) {
		return;
	}
	bulk_fsw(delay_nsec / nspw);
}
#else /* For machines with large core-to-core performance variation (e.g. OFP) */
/*
 * Burn delay_nsec nanoseconds of CPU time by polling the thread CPU clock
 * between small batches of work units. Negative delays are a no-op.
 */
void fwq(long delay_nsec) {
	struct timespec start, end;

	/* DIFFNSEC() is unsigned, so a negative delay would compare as a huge value */
	if (delay_nsec < 0) {
		return;
	}
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
	while (1) {
		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
		if (DIFFNSEC(end, start) >= delay_nsec) {
			break;
		}
		bulk_fsw(2); /* ~150 ns per iteration on OFP */
	}
}
#endif
/*
 * Print which CPU the calling thread last ran on, taken both from field 39
 * of /proc/<pid>/task/<tid>/stat and from sched_getcpu(), together with the
 * PMI_RANK environment variable (-1 when it is not set), the process id and
 * the thread id.
 *
 * Returns 0 on success and -1 when the stat file cannot be opened, read or
 * parsed, when the read buffer cannot be allocated, or when sched_getcpu()
 * fails.
 *
 * NOTE(review): fields are split on single spaces, so a command name
 * containing spaces would shift the field index.
 */
static int print_cpu_last_executed_on() {
	enum { STAT_BUF_SIZE = 65536, STAT_CPU_FIELD = 39 };
	char fn[256];
	char *result = NULL; /* NULL so that the shared exit path can always free() it */
	char *next_delim;
	char *field = NULL;
	const char *rank_str;
	pid_t tid = syscall(SYS_gettid);
	int fd = -1;
	int offset = 0;
	int amount;
	int cpu;
	int i;
	int mpi_errno = 0;

	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", (int)getpid(), (int)tid);
	fd = open(fn, O_RDONLY);
	if (fd == -1) {
		printf("open() failed\n");
		goto fn_fail;
	}

	result = malloc(STAT_BUF_SIZE);
	if (result == NULL) {
		printf("malloc() failed");
		goto fn_fail;
	}

	/* Read until EOF without ever writing past the buffer; keep one byte for the NUL */
	while (offset < STAT_BUF_SIZE - 1) {
		amount = read(fd, result + offset, STAT_BUF_SIZE - 1 - offset);
		if (amount == -1) {
			printf("read() failed");
			goto fn_fail;
		}
		if (amount == 0) {
			break;
		}
		offset += amount;
	}
	result[offset] = '\0'; /* strsep() needs a terminated string */

	/* After the loop, field points at the STAT_CPU_FIELD-th space separated token */
	next_delim = result;
	for (i = 0; i < STAT_CPU_FIELD; i++) {
		field = strsep(&next_delim, " ");
	}
	if (field == NULL) {
		printf("stat has fewer than %d fields\n", STAT_CPU_FIELD);
		goto fn_fail;
	}

	cpu = sched_getcpu();
	if (cpu == -1) {
		printf("sched_getcpu() failed\n");
		goto fn_fail;
	}

	rank_str = getenv("PMI_RANK");
	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,pid=%d,tid=%d\n",
	       rank_str ? atoi(rank_str) : -1, atoi(field), cpu, (int)getpid(), (int)tid);
	fflush(stdout);

fn_exit:
	if (fd != -1) {
		close(fd);
	}
	free(result);
	return mpi_errno;
fn_fail:
	mpi_errno = -1;
	goto fn_exit;
}
/*
 * Return non-zero when ranks `me` and `you` fall into the same block of
 * `ppn` consecutive ranks, i.e. when me / ppn equals you / ppn.
 * ppn must not be 0.
 */
static inline int on_same_node(int ppn, int me, int you) {
	const int my_block = me / ppn;
	const int your_block = you / ppn;

	return my_block == your_block;
}
/*
 * isend-calc-wait: post one MPI_Irecv and one MPI_Isend of `ndoubles`
 * doubles for every rank that is not on this rank's node, burn `calc_nsec`
 * nanoseconds with fwq(), then wait for all posted requests.
 *
 * The k-th remote peer uses the k-th ndoubles-sized slot of sbuf and rbuf,
 * and `reqs` needs room for two requests per remote peer.
 */
void my_send(int nproc, int ppn, int rank, double *sbuf, double *rbuf, int ndoubles, MPI_Request* reqs, long calc_nsec) {
	int i;
	int req = 0;
	size_t slot = 0; /* size_t: slot * ndoubles can exceed INT_MAX */

	for (i = 0; i < nproc; i++) {
		if (on_same_node(ppn, rank, i)) {
			continue;
		}
		MPI_Irecv(rbuf + slot * (size_t)ndoubles, ndoubles, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, &reqs[req]);
		req++;
		MPI_Isend(sbuf + slot * (size_t)ndoubles, ndoubles, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, &reqs[req]);
		req++;
		slot++;
	}
	fwq(calc_nsec);
	MPI_Waitall(req, reqs, MPI_STATUSES_IGNORE);
}
/* Helper functions to find the server's PSM2 endpoint identifier (epid). */

/*
 * Poll once per second for the file "psm2-demo-server-epid-<rank>" and
 * return the hexadecimal epid stored in it. Blocks until a value has been
 * read successfully.
 */
psm2_epid_t find_server(int rank) {
	FILE *fp;
	psm2_epid_t server_epid = 0;
	char fn[256];
	int parsed = 0;

	snprintf(fn, sizeof(fn), "psm2-demo-server-epid-%d", rank);
	printf("PSM2 client waiting for epid mapping file to appear...\n");
	while (!parsed) {
		sleep(1);
		fp = fopen(fn, "r");
		if (!fp) {
			continue;
		}
		/*
		 * The file can already exist while still empty (the writer has
		 * opened it but not flushed yet); retry instead of returning 0.
		 */
		parsed = (fscanf(fp, "%lx", &server_epid) == 1);
		fclose(fp);
	}
	printf("PSM2 client found server epid = 0x%lx\n", server_epid);
	return server_epid;
}
/*
 * Publish this process's epid as "0x<hex>" in the file
 * "psm2-demo-server-epid-<rank>" in the current directory, where
 * find_server() looks for it. On failure to create the file the error is
 * reported and nothing is written.
 */
void write_epid_to_file(int rank, psm2_epid_t myepid) {
	FILE *fp;
	char fn[256];

	snprintf(fn, sizeof(fn), "psm2-demo-server-epid-%d", rank);
	fp = fopen(fn, "w");
	if (!fp) {
		int err = errno; /* fprintf() below may overwrite errno */

		fprintf(stderr,
			"Exiting, couldn't write server's epid mapping file: ");
		die(strerror(err), err);
		/* die() returns in this file, so stop before touching the NULL stream */
		return;
	}
	fprintf(fp, "0x%lx", myepid);
	fclose(fp);
	printf("PSM2 server wrote epid = 0x%lx to file.\n", myepid);
}
/*
 * Run one PSM2 client/server exchange for this rank.
 *
 * The rank equal to `receiver` acts as server: it publishes its endpoint id
 * with write_epid_to_file(), posts a receive for BUFFER_LENGTH bytes and
 * prints the message. Any other rank acts as client: it obtains the
 * server's endpoint id with find_server(), connects and sends BUFFER_LENGTH
 * bytes with PSM2_MQ_FLAG_SENDSYNC. `sender` is not used.
 *
 * Returns 0 on success and -1 as soon as a PSM2 call fails.
 * NOTE(review): the early-return paths leave the MQ and the endpoint open.
 */
int psm2_sendrecv(int rank, int sender, int receiver) {
	struct psm2_ep_open_opts o;
	psm2_uuid_t uuid; /* 16 byte */
	psm2_ep_t myep;
	psm2_epid_t myepid;
	psm2_epid_t server_epid = 0;
	psm2_epid_t epid_array[CONNECT_ARRAY_SIZE];
	int epid_array_mask[CONNECT_ARRAY_SIZE];
	psm2_error_t epid_connect_errors[CONNECT_ARRAY_SIZE];
	psm2_epaddr_t epaddr_array[CONNECT_ARRAY_SIZE];
	int rc;
	int ver_major = PSM2_VERNO_MAJOR;
	int ver_minor = PSM2_VERNO_MINOR;
	/*
	 * Static storage: BUFFER_LENGTH (8,000,000 bytes) as an automatic array
	 * would take almost a whole default-sized thread stack. The function is
	 * therefore not reentrant; it is only called from the progress thread.
	 */
	static char msgbuf[BUFFER_LENGTH];
	psm2_mq_t q;
	psm2_mq_req_t req_mq;
	int uuid_seed;
	int is_server;

	(void)sender;

	/* Zero UUID with the first sizeof(int) bytes taken from rand() */
	memset(uuid, 0, sizeof(psm2_uuid_t));
	uuid_seed = rand();
	memcpy(uuid, &uuid_seed, sizeof(uuid_seed));

	/*
	 * Try to initialize PSM2 with the requested library version.
	 * In this example, given the use of the PSM2_VERNO_MAJOR and MINOR
	 * as defined in the PSM2 headers, ensure that we are linking with
	 * the same version of PSM2 as we compiled against.
	 */
	if ((rc = psm2_init(&ver_major, &ver_minor)) != PSM2_OK) {
		die("couldn't init", rc);
		return -1;
	}
	printf("PSM2 init done.\n");

	/* Setup the endpoint options struct */
	if ((rc = psm2_ep_open_opts_get_defaults(&o)) != PSM2_OK) {
		die("couldn't set default opts", rc);
		return -1;
	}
	printf("PSM2 opts_get_defaults done.\n");

	/* Attempt to open a PSM2 endpoint. This allocates hardware resources. */
	if ((rc = psm2_ep_open(uuid, &o, &myep, &myepid)) != PSM2_OK) {
		die("couldn't psm2_ep_open()", rc);
		return -1;
	}
	printf("PSM2 endpoint open done.\n");

	is_server = (rank == receiver) ? 1 : 0;
	if (is_server) {
		write_epid_to_file(rank, myepid);
	} else {
		server_epid = find_server(receiver);
	}

	if (is_server) {
		/*
		 * Server does nothing here. A connection does not have to be
		 * established to receive messages.
		 */
		printf("PSM2 server up.\n");
	} else {
		/*
		 * Setup connection request info.
		 * PSM2 can connect to a single epid per request, or an
		 * arbitrary number of epids in a single connect call.
		 * For this example, use part of an array of connection requests.
		 */
		memset(epid_array_mask, 0, sizeof(int) * CONNECT_ARRAY_SIZE);
		epid_array[0] = server_epid;
		epid_array_mask[0] = 1;

		/*
		 * Begin the connection process.
		 * Note that if a requested epid is not responding, the connect
		 * call will still return OK. The errors array will contain the
		 * state of individual connection requests.
		 */
		if ((rc = psm2_ep_connect(myep,
					  CONNECT_ARRAY_SIZE,
					  epid_array,
					  epid_array_mask,
					  epid_connect_errors,
					  epaddr_array,
					  0 /* no timeout */
					  )) != PSM2_OK) {
			die("couldn't ep_connect", rc);
			return -1;
		}
		printf("PSM2 connect request processed.\n");

		/* Now check if our connection to the server is ready */
		if (epid_connect_errors[0] != PSM2_OK) {
			die("couldn't connect to server",
			    epid_connect_errors[0]);
			return -1;
		}
		printf("PSM2 client-server connection established.\n");
	}

	/* Setup our PSM2 message queue */
	if ((rc = psm2_mq_init(myep, PSM2_MQ_ORDERMASK_NONE, NULL, 0, &q))
	    != PSM2_OK) {
		die("couldn't initialize PSM2 MQ", rc);
		return -1;
	}
	printf("PSM2 MQ init done.\n");

	if (is_server) {
		char fn[256];
		psm2_mq_tag_t t = {0xABCD};
		psm2_mq_tag_t tm = {-1};

		/* Post the receive request */
		if ((rc = psm2_mq_irecv2(q, PSM2_MQ_ANY_ADDR,
					 &t, /* message tag */
					 &tm, /* message tag mask */
					 0, /* no flags */
					 msgbuf, BUFFER_LENGTH,
					 NULL, /* no context to add */
					 &req_mq /* track irecv status */
					 )) != PSM2_OK) {
			die("couldn't post psm2_mq_irecv()", rc);
			return -1;
		}
		printf("PSM2 MQ irecv() posted\n");

		/* Wait until the message arrives */
		if ((rc = psm2_mq_wait(&req_mq, NULL)) != PSM2_OK) {
			die("couldn't wait for the irecv", rc);
			return -1;
		}
		printf("PSM2 MQ wait() done.\n");
		printf("Message from client:\n");
		printf("%s", msgbuf);

		/*
		 * Remove the file write_epid_to_file() created for this rank so
		 * that a later run cannot pick up a stale epid.
		 */
		snprintf(fn, sizeof(fn), "psm2-demo-server-epid-%d", rank);
		unlink(fn);
	} else {
		psm2_mq_tag_t t = {0xABCD};

		/* Say hello */
		snprintf(msgbuf, BUFFER_LENGTH,
			 "Hello world from epid=0x%lx, pid=%d.\n",
			 myepid, getpid());
		if ((rc = psm2_mq_send2(q,
					epaddr_array[0], /* destination epaddr */
					PSM2_MQ_FLAG_SENDSYNC, /* synchronous send */
					&t, /* tag */
					msgbuf, BUFFER_LENGTH
					)) != PSM2_OK) {
			die("couldn't post psm2_mq_isend", rc);
			return -1;
		}
		printf("PSM2 MQ send() done.\n");
	}

	/* Close down the MQ */
	if ((rc = psm2_mq_finalize(q)) != PSM2_OK) {
		die("couldn't psm2_mq_finalize()", rc);
		return -1;
	}
	printf("PSM2 MQ finalized.\n");

	/*
	 * Close our ep, releasing all hardware resources.
	 * Try to close all connections properly.
	 */
	if ((rc = psm2_ep_close(myep, PSM2_EP_CLOSE_GRACEFUL,
				0 /* no timeout */)) != PSM2_OK) {
		die("couldn't psm2_ep_close()", rc);
		return -1;
	}
	printf("PSM2 ep closed.\n");

	/* Release all local PSM2 resources */
	if ((rc = psm2_finalize()) != PSM2_OK) {
		die("couldn't psm2_finalize()", rc);
		return -1;
	}
	printf("PSM2 shut down, exiting.\n");
	return 0;
}
/* Long options accepted by getopt_long() in main() */
static struct option options[] = {
	/* --ppn <value>: reported as 'P', the same code as the short option -P */
	{ "ppn", required_argument, NULL, 'P' },
	/* end */
	{ NULL, 0, NULL, 0 },
};
/*
 * State shared between main() and the thread it starts with
 * pthread_create(): a two-party barrier plus a copy of the job layout.
 */
struct thr_arg {
volatile int bar_count; /* Number of threads currently inside the barrier; changed only with bar_lock held */
pthread_mutex_t bar_lock; /* Protects bar_count */
pthread_cond_t bar_cond; /* Broadcast when bar_count reaches 2 (entry barrier) or 0 (exit barrier) */
pthread_t pthread; /* Thread handle filled in by pthread_create() in main() */
int rank; /* Copy of main()'s my_rank */
int ppn; /* Copy of main()'s ppn */
int nproc; /* Copy of main()'s nproc */
};
/* The single instance; main() passes its address to progress_fn() */
struct thr_arg thr_arg;
/*
 * Entry point of the thread created by main(); `arg` is a struct thr_arg *.
 *
 * The thread first issues syscall number 732 and prints whether the result
 * was -1 ("running on Linux OK") or something else ("running on McKernel
 * NG"). It then meets main() at the entry barrier (bar_count counted up to
 * 2) and at the exit barrier (bar_count counted back down to 0). The PSM2
 * exchange between the two barriers is compiled out with #if 0.
 * Always returns NULL.
 */
void *progress_fn(void *arg) {
	struct thr_arg *ctx = (struct thr_arg *)arg;
	int ret;
	int i;

	ret = syscall(732);
	if (ret != -1) {
		fprintf(stdout, "CT09100 progress_fn running on McKernel NG (%d)\n", ret);
	} else {
		fprintf(stdout, "CT09100 progress_fn running on Linux OK\n");
	}

	printf("progress,enter\n");

	/* entry barrier: the second arrival wakes the first */
	pthread_mutex_lock(&ctx->bar_lock);
	ctx->bar_count++;
	if (ctx->bar_count == 2) {
		ret = pthread_cond_broadcast(&ctx->bar_cond);
		if (ret) {
			printf("pthread_cond_broadcast failed,rc=%d\n", ret);
		}
	}
	while (ctx->bar_count != 2) {
		ret = pthread_cond_wait(&ctx->bar_cond, &ctx->bar_lock);
		if (ret) {
			printf("pthread_cond_wait failed,rc=%d\n", ret);
		}
	}
	pthread_mutex_unlock(&ctx->bar_lock);

#if 0
	printf("progress,after barrier\n");
	for (i = 0; i < ctx->nproc; i++) {
		if (!on_same_node(ctx->ppn, ctx->rank, i)) {
			if (ctx->rank < i) {
				psm2_sendrecv(ctx->rank, ctx->rank, i);
			} else {
				psm2_sendrecv(ctx->rank, i, ctx->rank);
			}
		}
	}
#endif

	/* exit barrier: the last one to leave wakes the other */
	pthread_mutex_lock(&ctx->bar_lock);
	ctx->bar_count--;
	if (ctx->bar_count == 0) {
		ret = pthread_cond_broadcast(&ctx->bar_cond);
		if (ret) {
			printf("pthread_cond_broadcast failed,rc=%d\n", ret);
		}
	}
	while (ctx->bar_count != 0) {
		ret = pthread_cond_wait(&ctx->bar_cond, &ctx->bar_lock);
		if (ret) {
			printf("pthread_cond_wait failed,rc=%d\n", ret);
		}
	}
	pthread_mutex_unlock(&ctx->bar_lock);

	printf("progress,exit\n");
	return NULL;
}
/*
 * Measure how much of an MPI isend/irecv/waitall exchange overlaps with
 * computation while a second thread (progress_fn) exists.
 *
 * Options: -d <k> sets the message size to 2^k doubles, -P / --ppn <n> sets
 * the number of processes per node; both are required.
 *
 * Steps: calibrate fwq(), initialise MPI with MPI_THREAD_MULTIPLE, issue
 * syscall 731 unless DISABLE_UTI is set to a non-zero value, start the
 * progress thread and meet it at the entry barrier, time NPURE exchanges
 * without computation (t_pure), time NOVERALL exchanges with t_pure nsec of
 * computation each (t_overall), print the overlap on rank 0, then release
 * and join the progress thread.
 *
 * Returns 0 on success and 1 when a buffer allocation fails.
 */
int main(int argc, char **argv) {
	int rc;
	int ret = 0;
	int actual;
	int nproc;
	int ppn = -1;
	int ndoubles = -1;
	int my_rank = -1;
	int i;
	double *sbuf = NULL, *rbuf = NULL;
	MPI_Request *reqs = NULL;
	struct timespec start, end;
	long t_pure_l, t_overall_l;
	long t_pure, t_overall;
	int opt;
	pthread_condattr_t condattr;
	pthread_mutexattr_t mutexattr;

	fwq_init();

	while ((opt = getopt_long(argc, argv, "+d:P:", options, NULL)) != -1) {
		switch (opt) {
		case 'd': {
			int shift = atoi(optarg);

			/* 1 << shift has to fit into the int ndoubles */
			if (shift < 0 || shift > 30) {
				printf("-d must be between 0 and 30\n");
				exit(1);
			}
			ndoubles = 1 << shift;
			break;
		}
		case 'P':
			ppn = atoi(optarg);
			break;
		default: /* '?' */
			printf("unknown option %c\n", optopt);
			exit(1);
		}
	}

	if (ndoubles == -1 || ppn == -1) {
		printf("specify ndoubles with -d and ppn with --ppn");
		exit(1);
	}
	/* on_same_node() divides by ppn */
	if (ppn <= 0) {
		printf("ppn must be positive\n");
		exit(1);
	}

	MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
	if (actual != MPI_THREAD_MULTIPLE) {
		printf("ERROR: Thread support level is %d (it should be 3)\n", actual);
		exit(1);
	}

	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
	MPI_Comm_size(MPI_COMM_WORLD, &nproc);

	if (my_rank == 0) {
		/* syscall() returns long; cast to match %d */
		printf("tid=%d,pid=%d,ndoubles=%d,nproc=%d\n", (int)syscall(__NR_gettid), getpid(), ndoubles, nproc);
		printf("nsec=%lu, nspw=%f\n", nsec, nspw);
	}

	/* Spawn a thread */
	thr_arg.rank = my_rank;
	thr_arg.ppn = ppn;
	thr_arg.nproc = nproc;
	thr_arg.bar_count = 0;

	pthread_condattr_init(&condattr);
	pthread_cond_init(&thr_arg.bar_cond, &condattr);

	pthread_mutexattr_init(&mutexattr);
	pthread_mutex_init(&thr_arg.bar_lock, &mutexattr);

	char *uti_str = getenv("DISABLE_UTI");
	int uti_val = uti_str ? atoi(uti_str) : 0;
	if (!uti_val) {
		rc = syscall(731, 1, NULL);
		if (rc) {
			fprintf(stdout, "CT09003 INFO: uti not available (rc=%d)\n", rc);
		} else {
			fprintf(stdout, "CT09003 INFO: uti available\n");
		}
	} else {
		fprintf(stdout, "CT09003 INFO: uti disabled\n");
	}

	rc = pthread_create(&thr_arg.pthread, NULL, progress_fn, &thr_arg);
	if (rc){
		fprintf(stdout, "pthread_create: %d\n", rc);
		exit(1);
	}

	/* barrier */
	pthread_mutex_lock(&thr_arg.bar_lock);
	thr_arg.bar_count++;
	if (thr_arg.bar_count == 2) {
		if ((rc = pthread_cond_broadcast(&thr_arg.bar_cond))) {
			printf("pthread_cond_broadcast failed,rc=%d\n", rc);
		}
	}
	while (thr_arg.bar_count != 2) {
		if ((rc = pthread_cond_wait(&thr_arg.bar_cond, &thr_arg.bar_lock))) {
			printf("pthread_cond_wait failed,rc=%d\n", rc);
		}
	}
	pthread_mutex_unlock(&thr_arg.bar_lock);

	printf("parent,after barrier\n");

	/* my_send() posts at most one receive and one send per rank */
	reqs = malloc(sizeof(MPI_Request) * nproc * 2);
	if(!reqs) { printf("malloc failed"); goto fn_fail; }

	/* One zeroed ndoubles-sized slot per rank in each buffer */
	sbuf = calloc((size_t)ndoubles * nproc, sizeof(double));
	if(!sbuf) { printf("malloc failed"); goto fn_fail; }
	printf("tid=%d,pid=%d,sbuf=%p\n", (int)syscall(__NR_gettid), getpid(), (void *)sbuf);

	rbuf = calloc((size_t)ndoubles * nproc, sizeof(double));
	if(!rbuf) { printf("malloc failed"); goto fn_fail; }
	printf("tid=%d,pid=%d,rbuf=%p\n", (int)syscall(__NR_gettid), getpid(), (void *)rbuf);

	print_cpu_last_executed_on();

	/* Measure isend-wait time */
	MPI_Barrier(MPI_COMM_WORLD);
#define NSKIP 5
#define NPURE 30
	for (i = 0; i < NPURE + NSKIP; i++) {
		/* the first NSKIP iterations are warm-up and not timed */
		if (i == NSKIP) {
			clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
		}
		my_send(nproc, ppn, my_rank, sbuf, rbuf, ndoubles, reqs, 0);
	}
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
	t_pure_l = DIFFNSEC(end, start) / NPURE;
	//printf("t_pure (local): %ld usec\n", t_pure_l / 1000UL);
	MPI_Allreduce(&t_pure_l, &t_pure, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
	if (my_rank == 0) printf("t_pure (max): %ld usec\n", t_pure / 1000);

	/* Measure isend-calc-wait time */
	MPI_Barrier(MPI_COMM_WORLD);
#define NOVERALL 30
	for (i = 0; i < NOVERALL + NSKIP; i++) {
		if (i == NSKIP) {
			clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
		}
		my_send(nproc, ppn, my_rank, sbuf, rbuf, ndoubles, reqs, t_pure);
	}
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
	t_overall_l = DIFFNSEC(end, start) / NOVERALL;
	//printf("t_overall (local): %ld usec\n", t_overall_l / 1000UL);
	MPI_Allreduce(&t_overall_l, &t_overall, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
	if (my_rank == 0) printf("t_overall (max): %ld usec\n", t_overall / 1000);
	if (my_rank == 0) {
		/* 100 % when t_overall == t_pure, 0 % when t_overall == 2 * t_pure */
		long t_abs = (t_pure * 2) - t_overall;
		printf("overlap: %.2f %%\n", (t_abs * 100) / (double)t_pure);
	}

	/* barrier */
	pthread_mutex_lock(&thr_arg.bar_lock);
	thr_arg.bar_count--;
	if (thr_arg.bar_count == 0) {
		if ((rc = pthread_cond_broadcast(&thr_arg.bar_cond))) {
			printf("pthread_cond_broadcast failed,rc=%d\n", rc);
		}
	}
	while (thr_arg.bar_count != 0) {
		if ((rc = pthread_cond_wait(&thr_arg.bar_cond, &thr_arg.bar_lock))) {
			printf("pthread_cond_wait failed,rc=%d\n", rc);
		}
	}
	pthread_mutex_unlock(&thr_arg.bar_lock);

	pthread_join(thr_arg.pthread, NULL);

fn_exit:
	free(rbuf);
	free(sbuf);
	free(reqs);
	MPI_Finalize();
	return ret;
fn_fail:
	ret = 1;
	goto fn_exit;
}

563
test/uti/mpi/007.c Executable file
View File

@@ -0,0 +1,563 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <mpi.h>
#include <unistd.h>
#include <getopt.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
#include <sched.h>
#include <errno.h>
#include <psm2.h> /* required for core PSM2 functions */
#include <psm2_mq.h> /* required for PSM2 MQ functions (send, recv, etc) */
//#define DEBUG
/*
 * Debug tracing. dprintf(...) forwards to printf() when DEBUG is defined and
 * expands to exactly one empty statement otherwise, so it is safe inside an
 * unbraced if/else.
 */
#ifdef DEBUG
#define dprintf(...) printf(__VA_ARGS__)
#else
#define dprintf(...) do { } while (0)
#endif
#define BUFFER_LENGTH 8000000
#define CONNECT_ARRAY_SIZE 8
/*
 * Report a fatal error as "<msg>: <rc>" on stderr and terminate the process
 * with exit status 1. Does not return.
 */
void die(const char *msg, int rc) {
	fprintf(stderr, "%s: %d\n", msg, rc);
	exit(1);
}
/* Nanoseconds elapsed from struct timespec `start` to struct timespec `end` */
#define DIFFNSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000000UL + ((end).tv_nsec - (start).tv_nsec))
/*
 * One fixed unit of busy work, written as x86-64 inline assembly so the
 * compiler cannot shorten it: rcx is cleared and then incremented until it
 * is greater than 99, i.e. the add/compare/jump loop body runs 100 times.
 * rcx and the condition codes are declared as clobbered.
 */
static inline void fixed_size_work() {
asm volatile(
"movq $0, %%rcx\n\t"
"1:\t"
"addq $1, %%rcx\n\t"
"cmpq $99, %%rcx\n\t"
"jle 1b\n\t"
:
:
: "rcx", "cc");
}
/*
 * Execute fixed_size_work() n times back to back.
 * The counter has the same type as n: an int counter would overflow
 * (undefined behaviour) before reaching counts above INT_MAX.
 */
static inline void bulk_fsw(unsigned long n) {
	unsigned long j;

	for (j = 0; j < n; j++) {
		fixed_size_work();
	}
}
double nspw; /* nsec per work */
unsigned long nsec; /* thread CPU time, in nsec, consumed by the calibration run in fwq_init() */

/* Number of fixed_size_work() units executed by the calibration run */
#define N_INIT 10000000

/*
 * Calibrate the busy-work loop: time N_INIT units of fixed_size_work() with
 * the calling thread's CPU-time clock, then store the total in nsec and the
 * per-unit cost in nspw. fwq() divides by nspw, so call this first.
 */
void fwq_init() {
	struct timespec start, end;

	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
	bulk_fsw(N_INIT);
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
	nsec = DIFFNSEC(end, start);
	nspw = nsec / (double)N_INIT;
}
#if 1
/*
 * Burn roughly delay_nsec nanoseconds of CPU by running
 * delay_nsec / nspw units of fixed_size_work().
 * Non-positive delays do nothing.
 */
void fwq(long delay_nsec) {
	/*
	 * Without a positive calibration value the division below would yield
	 * inf or NaN, and converting that to an integer count is undefined.
	 */
	if (delay_nsec <= 0 || nspw <= 0.0) {
		return;
	}
	bulk_fsw((unsigned long)(delay_nsec / nspw));
}
#else /* For machines with large core-to-core performance variation (e.g. OFP) */
/*
 * Burn CPU until at least delay_nsec nanoseconds of thread CPU time have
 * passed, re-reading the clock after every two units of work.
 */
void fwq(long delay_nsec) {
	struct timespec start, end;

	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
	while (1) {
		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
		if (DIFFNSEC(end, start) >= delay_nsec) {
			break;
		}
		bulk_fsw(2); /* ~150 ns per iteration on OFP */
	}
}
#endif
/*
 * Print one line describing where the calling thread runs: the PMI_RANK
 * environment value (-1 when unset), the 39th space-separated field of
 * /proc/<pid>/task/<tid>/stat, the result of sched_getcpu(), the pid and
 * the tid.
 * NOTE(review): proc(5) documents field 39 as "processor"; the split is on
 * spaces only, so a thread name containing a space would shift the fields.
 *
 * Returns 0 on success and -1 if the stat file cannot be opened or read,
 * memory cannot be allocated, the field is missing or sched_getcpu() fails.
 */
static int print_cpu_last_executed_on() {
	enum { STAT_BUF_SIZE = 65536, STAT_CPU_FIELD = 39 };
	char fn[256];
	char *result = NULL; /* NULL so the shared exit path can always free() it */
	char *next_delim;
	char *field = NULL;
	const char *rank_str;
	pid_t tid = syscall(SYS_gettid);
	int fd = -1;
	int offset = 0;
	int amount;
	int cpu;
	int i;
	int mpi_errno = 0;

	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid);
	fd = open(fn, O_RDONLY);
	if (fd == -1) {
		printf("open() failed\n");
		goto fn_fail;
	}

	result = malloc(STAT_BUF_SIZE);
	if (result == NULL) {
		printf("malloc() failed");
		goto fn_fail;
	}

	/* Read to EOF but never past the buffer; one byte is kept for the NUL */
	while (offset < STAT_BUF_SIZE - 1) {
		amount = read(fd, result + offset, STAT_BUF_SIZE - 1 - offset);
		if (amount == -1) {
			printf("read() failed");
			goto fn_fail;
		}
		if (amount == 0) {
			break;
		}
		offset += amount;
	}
	result[offset] = '\0'; /* strsep() needs a terminated string */

	/* strsep() returns NULL once the input is exhausted */
	next_delim = result;
	for (i = 0; i < STAT_CPU_FIELD; i++) {
		field = strsep(&next_delim, " ");
	}
	if (field == NULL) {
		printf("stat has fewer than %d fields\n", STAT_CPU_FIELD);
		goto fn_fail;
	}

	cpu = sched_getcpu();
	if (cpu == -1) {
		printf("sched_getcpu() failed\n");
		goto fn_fail;
	}

	rank_str = getenv("PMI_RANK");
	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,pid=%d,tid=%d\n", rank_str ? atoi(rank_str) : -1, atoi(field), cpu, getpid(), tid); fflush(stdout);

fn_exit:
	if (fd != -1) {
		close(fd);
	}
	free(result);
	return mpi_errno;
fn_fail:
	mpi_errno = -1;
	goto fn_exit;
}
/*
 * Return non-zero when ranks `me` and `you` fall into the same block of
 * `ppn` consecutive ranks, i.e. when me / ppn equals you / ppn.
 * ppn must not be 0.
 */
static inline int on_same_node(int ppn, int me, int you) {
	const int my_block = me / ppn;
	const int your_block = you / ppn;

	return my_block == your_block;
}
/*
 * isend-calc-wait: post one MPI_Irecv and one MPI_Isend of `ndoubles`
 * doubles for every rank that is not on this rank's node, burn `calc_nsec`
 * nanoseconds with fwq(), then wait for all posted requests.
 *
 * The k-th remote peer uses the k-th ndoubles-sized slot of sbuf and rbuf,
 * and `reqs` needs room for two requests per remote peer.
 */
void my_send(int nproc, int ppn, int rank, double *sbuf, double *rbuf, int ndoubles, MPI_Request* reqs, long calc_nsec) {
	int i;
	int req = 0;
	size_t slot = 0; /* size_t: slot * ndoubles can exceed INT_MAX */

	for (i = 0; i < nproc; i++) {
		if (on_same_node(ppn, rank, i)) {
			continue;
		}
		MPI_Irecv(rbuf + slot * (size_t)ndoubles, ndoubles, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, &reqs[req]);
		req++;
		MPI_Isend(sbuf + slot * (size_t)ndoubles, ndoubles, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, &reqs[req]);
		req++;
		slot++;
	}
	fwq(calc_nsec);
	MPI_Waitall(req, reqs, MPI_STATUSES_IGNORE);
}
/* Helper functions to find the server's PSM2 endpoint identifier (epid). */

/*
 * Poll once per second for the file "psm2-demo-server-epid-<rank>" and
 * return the hexadecimal epid stored in it. Blocks until a value has been
 * read successfully.
 */
psm2_epid_t find_server(int rank) {
	FILE *fp;
	psm2_epid_t server_epid = 0;
	char fn[256];
	int parsed = 0;

	snprintf(fn, sizeof(fn), "psm2-demo-server-epid-%d", rank);
	printf("PSM2 client waiting for epid mapping file to appear...\n");
	while (!parsed) {
		sleep(1);
		fp = fopen(fn, "r");
		if (!fp) {
			continue;
		}
		/*
		 * The file can already exist while still empty (the writer has
		 * opened it but not flushed yet); retry instead of returning 0.
		 */
		parsed = (fscanf(fp, "%lx", &server_epid) == 1);
		fclose(fp);
	}
	printf("PSM2 client found server epid = 0x%lx\n", server_epid);
	return server_epid;
}
/*
 * Publish this process's epid as "0x<hex>" in the file
 * "psm2-demo-server-epid-<rank>" in the current directory, where
 * find_server() looks for it. Failure to create the file is reported
 * through die().
 */
void write_epid_to_file(int rank, psm2_epid_t myepid) {
	FILE *fp;
	char fn[256];

	snprintf(fn, sizeof(fn), "psm2-demo-server-epid-%d", rank);
	fp = fopen(fn, "w");
	if (!fp) {
		int err = errno; /* fprintf() below may overwrite errno */

		fprintf(stderr,
			"Exiting, couldn't write server's epid mapping file: ");
		die(strerror(err), err);
		return; /* not reached while die() exits; protects the NULL stream otherwise */
	}
	fprintf(fp, "0x%lx", myepid);
	fclose(fp);
	printf("PSM2 server wrote epid = 0x%lx to file.\n", myepid);
}
/*
 * Run one PSM2 client/server exchange for this rank.
 *
 * The rank equal to `receiver` acts as server: it publishes its endpoint id
 * with write_epid_to_file(), posts a receive for BUFFER_LENGTH bytes and
 * prints the message. Any other rank acts as client: it obtains the
 * server's endpoint id with find_server(), connects and sends BUFFER_LENGTH
 * bytes with PSM2_MQ_FLAG_SENDSYNC. `sender` is not used.
 *
 * Every PSM2 failure is passed to die(). Returns 0.
 */
int psm2_sendrecv(int rank, int sender, int receiver) {
	struct psm2_ep_open_opts o;
	psm2_uuid_t uuid;
	psm2_ep_t myep;
	psm2_epid_t myepid;
	psm2_epid_t server_epid = 0;
	psm2_epid_t epid_array[CONNECT_ARRAY_SIZE];
	int epid_array_mask[CONNECT_ARRAY_SIZE];
	psm2_error_t epid_connect_errors[CONNECT_ARRAY_SIZE];
	psm2_epaddr_t epaddr_array[CONNECT_ARRAY_SIZE];
	int rc;
	int ver_major = PSM2_VERNO_MAJOR;
	int ver_minor = PSM2_VERNO_MINOR;
	/*
	 * Static storage: BUFFER_LENGTH (8,000,000 bytes) as an automatic array
	 * would take almost a whole default-sized thread stack, and this
	 * function runs on the progress thread. It is therefore not reentrant.
	 */
	static char msgbuf[BUFFER_LENGTH];
	psm2_mq_t q;
	psm2_mq_req_t req_mq;
	int is_server;

	(void)sender;

	memset(uuid, 0, sizeof(psm2_uuid_t)); /* Use a UUID of zero */

	/*
	 * Try to initialize PSM2 with the requested library version.
	 * In this example, given the use of the PSM2_VERNO_MAJOR and MINOR
	 * as defined in the PSM2 headers, ensure that we are linking with
	 * the same version of PSM2 as we compiled against.
	 */
	if ((rc = psm2_init(&ver_major, &ver_minor)) != PSM2_OK) {
		die("couldn't init", rc);
	}
	printf("PSM2 init done.\n");

	/* Setup the endpoint options struct */
	if ((rc = psm2_ep_open_opts_get_defaults(&o)) != PSM2_OK) {
		die("couldn't set default opts", rc);
	}
	printf("PSM2 opts_get_defaults done.\n");

	/* Attempt to open a PSM2 endpoint. This allocates hardware resources. */
	if ((rc = psm2_ep_open(uuid, &o, &myep, &myepid)) != PSM2_OK) {
		die("couldn't psm2_ep_open()", rc);
	}
	printf("PSM2 endpoint open done.\n");

	is_server = (rank == receiver) ? 1 : 0;
	if (is_server) {
		write_epid_to_file(rank, myepid);
	} else {
		server_epid = find_server(receiver);
	}

	if (is_server) {
		/*
		 * Server does nothing here. A connection does not have to be
		 * established to receive messages.
		 */
		printf("PSM2 server up.\n");
	} else {
		/*
		 * Setup connection request info.
		 * PSM2 can connect to a single epid per request, or an
		 * arbitrary number of epids in a single connect call.
		 * For this example, use part of an array of connection requests.
		 */
		memset(epid_array_mask, 0, sizeof(int) * CONNECT_ARRAY_SIZE);
		epid_array[0] = server_epid;
		epid_array_mask[0] = 1;

		/*
		 * Begin the connection process.
		 * Note that if a requested epid is not responding, the connect
		 * call will still return OK. The errors array will contain the
		 * state of individual connection requests.
		 */
		if ((rc = psm2_ep_connect(myep,
					  CONNECT_ARRAY_SIZE,
					  epid_array,
					  epid_array_mask,
					  epid_connect_errors,
					  epaddr_array,
					  0 /* no timeout */
					  )) != PSM2_OK) {
			die("couldn't ep_connect", rc);
		}
		printf("PSM2 connect request processed.\n");

		/* Now check if our connection to the server is ready */
		if (epid_connect_errors[0] != PSM2_OK) {
			die("couldn't connect to server",
			    epid_connect_errors[0]);
		}
		printf("PSM2 client-server connection established.\n");
	}

	/* Setup our PSM2 message queue */
	if ((rc = psm2_mq_init(myep, PSM2_MQ_ORDERMASK_NONE, NULL, 0, &q))
	    != PSM2_OK) {
		die("couldn't initialize PSM2 MQ", rc);
	}
	printf("PSM2 MQ init done.\n");

	if (is_server) {
		char fn[256];
		psm2_mq_tag_t t = {0xABCD};
		psm2_mq_tag_t tm = {-1};

		/* Post the receive request */
		if ((rc = psm2_mq_irecv2(q, PSM2_MQ_ANY_ADDR,
					 &t, /* message tag */
					 &tm, /* message tag mask */
					 0, /* no flags */
					 msgbuf, BUFFER_LENGTH,
					 NULL, /* no context to add */
					 &req_mq /* track irecv status */
					 )) != PSM2_OK) {
			die("couldn't post psm2_mq_irecv()", rc);
		}
		printf("PSM2 MQ irecv() posted\n");

		/* Wait until the message arrives */
		if ((rc = psm2_mq_wait(&req_mq, NULL)) != PSM2_OK) {
			die("couldn't wait for the irecv", rc);
		}
		printf("PSM2 MQ wait() done.\n");
		printf("Message from client:\n");
		printf("%s", msgbuf);

		/*
		 * Remove the file write_epid_to_file() created for this rank so
		 * that a later run cannot pick up a stale epid.
		 */
		snprintf(fn, sizeof(fn), "psm2-demo-server-epid-%d", rank);
		unlink(fn);
	} else {
		psm2_mq_tag_t t = {0xABCD};

		/* Say hello */
		snprintf(msgbuf, BUFFER_LENGTH,
			 "Hello world from epid=0x%lx, pid=%d.\n",
			 myepid, getpid());
		if ((rc = psm2_mq_send2(q,
					epaddr_array[0], /* destination epaddr */
					PSM2_MQ_FLAG_SENDSYNC, /* synchronous send */
					&t, /* tag */
					msgbuf, BUFFER_LENGTH
					)) != PSM2_OK) {
			die("couldn't post psm2_mq_isend", rc);
		}
		printf("PSM2 MQ send() done.\n");
	}

	/* Close down the MQ */
	if ((rc = psm2_mq_finalize(q)) != PSM2_OK) {
		die("couldn't psm2_mq_finalize()", rc);
	}
	printf("PSM2 MQ finalized.\n");

	/*
	 * Close our ep, releasing all hardware resources.
	 * Try to close all connections properly.
	 */
	if ((rc = psm2_ep_close(myep, PSM2_EP_CLOSE_GRACEFUL,
				0 /* no timeout */)) != PSM2_OK) {
		die("couldn't psm2_ep_close()", rc);
	}
	printf("PSM2 ep closed.\n");

	/* Release all local PSM2 resources */
	if ((rc = psm2_finalize()) != PSM2_OK) {
		die("couldn't psm2_finalize()", rc);
	}
	printf("PSM2 shut down, exiting.\n");
	return 0;
}
/* Long options accepted by getopt_long() in main() */
static struct option options[] = {
	/* --ppn <value>: reported as 'P', the same code as the short option -P */
	{ "ppn", required_argument, NULL, 'P' },
	/* end */
	{ NULL, 0, NULL, 0 },
};
/*
 * State shared between main() and the thread it starts with
 * pthread_create(): a two-party barrier plus a copy of the job layout.
 */
struct thr_arg {
volatile int bar_count; /* Number of threads currently inside the barrier; changed only with bar_lock held */
pthread_mutex_t bar_lock; /* Protects bar_count */
pthread_cond_t bar_cond; /* Broadcast when bar_count reaches 2 (entry barrier) or 0 (exit barrier) */
pthread_t pthread; /* Thread handle filled in by pthread_create() in main() */
int rank; /* Copy of main()'s my_rank */
int ppn; /* Copy of main()'s ppn */
int nproc; /* Copy of main()'s nproc */
};
/* The single instance; main() passes its address to progress_fn() */
struct thr_arg thr_arg;
/*
 * Entry point of the thread created by main(); `arg` is a struct thr_arg *.
 *
 * The thread first issues syscall number 732 and prints whether the result
 * was -1 ("running on Linux OK") or something else ("running on McKernel
 * NG"). It then meets main() at the entry barrier (bar_count counted up to
 * 2), runs psm2_sendrecv() once for every rank that is not on this rank's
 * node (the lower rank of each pair is passed as sender, the higher one as
 * receiver), and finally meets main() at the exit barrier (bar_count
 * counted back down to 0). Always returns NULL.
 */
void *progress_fn(void *arg) {
	struct thr_arg *ctx = (struct thr_arg *)arg;
	int ret;
	int i;

	ret = syscall(732);
	if (ret != -1) {
		fprintf(stdout, "CT09100 progress_fn running on McKernel NG (%d)\n", ret);
	} else {
		fprintf(stdout, "CT09100 progress_fn running on Linux OK\n");
	}

	printf("progress,enter\n");

	/* entry barrier: the second arrival wakes the first */
	pthread_mutex_lock(&ctx->bar_lock);
	ctx->bar_count++;
	if (ctx->bar_count == 2) {
		ret = pthread_cond_broadcast(&ctx->bar_cond);
		if (ret) {
			printf("pthread_cond_broadcast failed,rc=%d\n", ret);
		}
	}
	while (ctx->bar_count != 2) {
		ret = pthread_cond_wait(&ctx->bar_cond, &ctx->bar_lock);
		if (ret) {
			printf("pthread_cond_wait failed,rc=%d\n", ret);
		}
	}
	pthread_mutex_unlock(&ctx->bar_lock);

	printf("progress,after barrier\n");
#if 1
	for (i = 0; i < ctx->nproc; i++) {
		if (on_same_node(ctx->ppn, ctx->rank, i)) {
			continue;
		}
		if (ctx->rank < i) {
			psm2_sendrecv(ctx->rank, ctx->rank, i);
		} else {
			psm2_sendrecv(ctx->rank, i, ctx->rank);
		}
	}
#endif

	/* exit barrier: the last one to leave wakes the other */
	pthread_mutex_lock(&ctx->bar_lock);
	ctx->bar_count--;
	if (ctx->bar_count == 0) {
		ret = pthread_cond_broadcast(&ctx->bar_cond);
		if (ret) {
			printf("pthread_cond_broadcast failed,rc=%d\n", ret);
		}
	}
	while (ctx->bar_count != 0) {
		ret = pthread_cond_wait(&ctx->bar_cond, &ctx->bar_lock);
		if (ret) {
			printf("pthread_cond_wait failed,rc=%d\n", ret);
		}
	}
	pthread_mutex_unlock(&ctx->bar_lock);

	printf("progress,exit\n");
	return NULL;
}
/*
 * Start a progress thread that performs the PSM2 exchange while the main
 * thread only reports which CPU it runs on.
 *
 * Options: -d <k> (2^k doubles; only printed here) and -P / --ppn <n>
 * (processes per node); both are required. The rank is taken from the
 * PMI_RANK environment variable and the process count is fixed at 2.
 *
 * Steps: calibrate fwq(), issue syscall 731 unless DISABLE_UTI is set to a
 * non-zero value, create the progress thread, meet it at the entry
 * barrier, print the CPU information, meet it at the exit barrier and
 * join it. Returns 0.
 */
int main(int argc, char **argv) {
	int rc;
	int nproc;
	int ppn = -1;
	int ndoubles = -1;
	int my_rank = -1;
	int opt;
	pthread_condattr_t condattr;
	pthread_mutexattr_t mutexattr;

	fwq_init();

	while ((opt = getopt_long(argc, argv, "+d:P:", options, NULL)) != -1) {
		switch (opt) {
		case 'd': {
			int shift = atoi(optarg);

			/* 1 << shift has to fit into the int ndoubles */
			if (shift < 0 || shift > 30) {
				printf("-d must be between 0 and 30\n");
				exit(1);
			}
			ndoubles = 1 << shift;
			break;
		}
		case 'P':
			ppn = atoi(optarg);
			break;
		default: /* '?' */
			printf("unknown option %c\n", optopt);
			exit(1);
		}
	}

	if (ndoubles == -1 || ppn == -1) {
		printf("specify ndoubles with -d and ppn with --ppn");
		exit(1);
	}
	/* on_same_node() divides by ppn */
	if (ppn <= 0) {
		printf("ppn must be positive\n");
		exit(1);
	}

	char *rank_str = getenv("PMI_RANK");
	if (!rank_str) {
		printf("getenv failed\n");
		exit(1);
	}
	my_rank = atoi(rank_str);
	nproc = 2;

	if (my_rank == 0) {
		/* syscall() returns long; cast to match %d */
		printf("tid=%d,pid=%d,ndoubles=%d,nproc=%d\n", (int)syscall(__NR_gettid), getpid(), ndoubles, nproc);
		printf("nsec=%lu, nspw=%f\n", nsec, nspw);
	}

	/* Spawn a thread */
	thr_arg.rank = my_rank;
	thr_arg.ppn = ppn;
	thr_arg.nproc = nproc;
	thr_arg.bar_count = 0;

	pthread_condattr_init(&condattr);
	pthread_cond_init(&thr_arg.bar_cond, &condattr);

	pthread_mutexattr_init(&mutexattr);
	pthread_mutex_init(&thr_arg.bar_lock, &mutexattr);

	char *uti_str = getenv("DISABLE_UTI");
	int uti_val = uti_str ? atoi(uti_str) : 0;
	if (!uti_val) {
		rc = syscall(731, 1, NULL);
		if (rc) {
			fprintf(stdout, "CT09003 INFO: uti not available (rc=%d)\n", rc);
		} else {
			fprintf(stdout, "CT09003 INFO: uti available\n");
		}
	} else {
		fprintf(stdout, "CT09003 INFO: uti disabled\n");
	}

	rc = pthread_create(&thr_arg.pthread, NULL, progress_fn, &thr_arg);
	if (rc){
		fprintf(stdout, "pthread_create: %d\n", rc);
		exit(1);
	}

	/* barrier */
	pthread_mutex_lock(&thr_arg.bar_lock);
	thr_arg.bar_count++;
	if (thr_arg.bar_count == 2) {
		if ((rc = pthread_cond_broadcast(&thr_arg.bar_cond))) {
			printf("pthread_cond_broadcast failed,rc=%d\n", rc);
		}
	}
	while (thr_arg.bar_count != 2) {
		if ((rc = pthread_cond_wait(&thr_arg.bar_cond, &thr_arg.bar_lock))) {
			printf("pthread_cond_wait failed,rc=%d\n", rc);
		}
	}
	pthread_mutex_unlock(&thr_arg.bar_lock);

	printf("parent,after barrier\n");

	print_cpu_last_executed_on();

	/* barrier */
	pthread_mutex_lock(&thr_arg.bar_lock);
	thr_arg.bar_count--;
	if (thr_arg.bar_count == 0) {
		if ((rc = pthread_cond_broadcast(&thr_arg.bar_cond))) {
			printf("pthread_cond_broadcast failed,rc=%d\n", rc);
		}
	}
	while (thr_arg.bar_count != 0) {
		if ((rc = pthread_cond_wait(&thr_arg.bar_cond, &thr_arg.bar_lock))) {
			printf("pthread_cond_wait failed,rc=%d\n", rc);
		}
	}
	pthread_mutex_unlock(&thr_arg.bar_lock);

	pthread_join(thr_arg.pthread, NULL);

	return 0;
}

589
test/uti/mpi/008.c Executable file
View File

@@ -0,0 +1,589 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>
#include <getopt.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
#include <sched.h>
#include <errno.h>
#include <psm2.h> /* required for core PSM2 functions */
#include <psm2_mq.h> /* required for PSM2 MQ functions (send, recv, etc) */
//#define DEBUG
/*
 * Debug tracing. dprintf(...) forwards to printf() when DEBUG is defined and
 * expands to exactly one empty statement otherwise, so it is safe inside an
 * unbraced if/else.
 */
#ifdef DEBUG
#define dprintf(...) printf(__VA_ARGS__)
#else
#define dprintf(...) do { } while (0)
#endif
#define BUFFER_LENGTH /*8000000*/(1ULL<<12)
#define CONNECT_ARRAY_SIZE 8
/*
 * Report a failure as "<msg>: <rc>" on stderr and flush the stream.
 * Despite its name this function returns to the caller; it does not
 * terminate the process.
 */
void die(const char *msg, int rc) {
	fprintf(stderr, "%s: %d\n", msg, rc);
	fflush(stderr);
}
/* Nanoseconds elapsed from struct timespec `start` to struct timespec `end` */
#define DIFFNSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000000UL + ((end).tv_nsec - (start).tv_nsec))
/*
 * One fixed unit of busy work, written as x86-64 inline assembly so the
 * compiler cannot shorten it: rcx is cleared and then incremented until it
 * is greater than 99, i.e. the add/compare/jump loop body runs 100 times.
 * rcx and the condition codes are declared as clobbered.
 */
static inline void fixed_size_work() {
asm volatile(
"movq $0, %%rcx\n\t"
"1:\t"
"addq $1, %%rcx\n\t"
"cmpq $99, %%rcx\n\t"
"jle 1b\n\t"
:
:
: "rcx", "cc");
}
/*
 * Execute fixed_size_work() n times back to back.
 * The counter has the same type as n: an int counter would overflow
 * (undefined behaviour) before reaching counts above INT_MAX.
 */
static inline void bulk_fsw(unsigned long n) {
	unsigned long j;

	for (j = 0; j < n; j++) {
		fixed_size_work();
	}
}
double nspw; /* nsec per work */
unsigned long nsec; /* thread CPU time, in nsec, consumed by the calibration run in fwq_init() */

/* Number of fixed_size_work() units executed by the calibration run */
#define N_INIT 10000000

/*
 * Calibrate the busy-work loop: time N_INIT units of fixed_size_work() with
 * the calling thread's CPU-time clock, then store the total in nsec and the
 * per-unit cost in nspw. fwq() divides by nspw, so call this first.
 */
void fwq_init() {
	struct timespec start, end;

	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
	bulk_fsw(N_INIT);
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
	nsec = DIFFNSEC(end, start);
	nspw = nsec / (double)N_INIT;
}
#if 1
/*
 * Burn roughly delay_nsec nanoseconds of CPU by running
 * delay_nsec / nspw units of fixed_size_work().
 * Non-positive delays do nothing.
 */
void fwq(long delay_nsec) {
	/*
	 * Without a positive calibration value the division below would yield
	 * inf or NaN, and converting that to an integer count is undefined.
	 */
	if (delay_nsec <= 0 || nspw <= 0.0) {
		return;
	}
	bulk_fsw((unsigned long)(delay_nsec / nspw));
}
#else /* For machines with large core-to-core performance variation (e.g. OFP) */
/*
 * Burn CPU until at least delay_nsec nanoseconds of thread CPU time have
 * passed, re-reading the clock after every two units of work.
 */
void fwq(long delay_nsec) {
	struct timespec start, end;

	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
	while (1) {
		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
		if (DIFFNSEC(end, start) >= delay_nsec) {
			break;
		}
		bulk_fsw(2); /* ~150 ns per iteration on OFP */
	}
}
#endif
/*
 * Print one line describing where the calling thread runs: the PMI_RANK
 * environment value (-1 when unset), the 39th space-separated field of
 * /proc/<pid>/task/<tid>/stat, the result of sched_getcpu(), the pid and
 * the tid.
 * NOTE(review): proc(5) documents field 39 as "processor"; the split is on
 * spaces only, so a thread name containing a space would shift the fields.
 *
 * Returns 0 on success and -1 if the stat file cannot be opened or read,
 * memory cannot be allocated, the field is missing or sched_getcpu() fails.
 */
static int print_cpu_last_executed_on() {
	enum { STAT_BUF_SIZE = 65536, STAT_CPU_FIELD = 39 };
	char fn[256];
	char *result = NULL; /* NULL so the shared exit path can always free() it */
	char *next_delim;
	char *field = NULL;
	const char *rank_str;
	pid_t tid = syscall(SYS_gettid);
	int fd = -1;
	int offset = 0;
	int amount;
	int cpu;
	int i;
	int mpi_errno = 0;

	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid);
	fd = open(fn, O_RDONLY);
	if (fd == -1) {
		printf("open() failed\n");
		goto fn_fail;
	}

	result = malloc(STAT_BUF_SIZE);
	if (result == NULL) {
		printf("malloc() failed");
		goto fn_fail;
	}

	/* Read to EOF but never past the buffer; one byte is kept for the NUL */
	while (offset < STAT_BUF_SIZE - 1) {
		amount = read(fd, result + offset, STAT_BUF_SIZE - 1 - offset);
		if (amount == -1) {
			printf("read() failed");
			goto fn_fail;
		}
		if (amount == 0) {
			break;
		}
		offset += amount;
	}
	result[offset] = '\0'; /* strsep() needs a terminated string */

	/* strsep() returns NULL once the input is exhausted */
	next_delim = result;
	for (i = 0; i < STAT_CPU_FIELD; i++) {
		field = strsep(&next_delim, " ");
	}
	if (field == NULL) {
		printf("stat has fewer than %d fields\n", STAT_CPU_FIELD);
		goto fn_fail;
	}

	cpu = sched_getcpu();
	if (cpu == -1) {
		printf("sched_getcpu() failed\n");
		goto fn_fail;
	}

	rank_str = getenv("PMI_RANK");
	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,pid=%d,tid=%d\n", rank_str ? atoi(rank_str) : -1, atoi(field), cpu, getpid(), tid); fflush(stdout);

fn_exit:
	if (fd != -1) {
		close(fd);
	}
	free(result);
	return mpi_errno;
fn_fail:
	mpi_errno = -1;
	goto fn_exit;
}
/*
 * Return non-zero when ranks `me` and `you` fall into the same block of
 * `ppn` consecutive ranks, i.e. when me / ppn equals you / ppn.
 * ppn must not be 0.
 */
static inline int on_same_node(int ppn, int me, int you) {
	const int my_block = me / ppn;
	const int your_block = you / ppn;

	return my_block == your_block;
}
/* Helper functions to find the server's PSM2 endpoint identifier (epid). */

/*
 * Poll every 250 ms for the file "psm2-demo-server-epid-<rank>" and return
 * the hexadecimal epid stored in it. Blocks until a value has been read
 * successfully.
 */
psm2_epid_t find_server(int rank) {
	FILE *fp;
	psm2_epid_t server_epid = 0;
	char fn[256];
	int parsed = 0;

	printf("%s: enter\n", __FUNCTION__); fflush(stdout);
	snprintf(fn, sizeof(fn), "psm2-demo-server-epid-%d", rank);
	printf("PSM2 client waiting for epid mapping file to appear...\n"); fflush(stdout);
	while (!parsed) {
		usleep(250*1000);
		fp = fopen(fn, "r");
		if (!fp) {
			continue;
		}
		/*
		 * The file can already exist while still empty (the writer has
		 * opened it but not flushed yet); retry instead of returning 0.
		 */
		parsed = (fscanf(fp, "%lx", &server_epid) == 1);
		fclose(fp);
	}
	printf("PSM2 client found server epid = 0x%lx\n", server_epid);
	return server_epid;
}
/*
 * Publish this process's epid as "0x<hex>" in the file
 * "psm2-demo-server-epid-<rank>" in the current directory, where
 * find_server() looks for it. On failure to create the file the error is
 * reported and nothing is written.
 */
void write_epid_to_file(int rank, psm2_epid_t myepid) {
	FILE *fp;
	char fn[256];

	printf("%s: enter\n", __FUNCTION__);
	snprintf(fn, sizeof(fn), "psm2-demo-server-epid-%d", rank);
	fp = fopen(fn, "w");
	if (!fp) {
		int err = errno; /* fprintf() below may overwrite errno */

		fprintf(stderr,
			"Exiting, couldn't write server's epid mapping file: ");
		die(strerror(err), err);
		/* die() returns in this file, so stop before touching the NULL stream */
		return;
	}
	fprintf(fp, "0x%lx", myepid);
	fclose(fp);
	printf("PSM2 server wrote epid = 0x%lx to file.\n", myepid);
}
/* PSM2 state shared by my_psm2_init() and my_psm2_connect() */
psm2_uuid_t uuid; /* zeroed in my_psm2_init() and passed to psm2_ep_open() */
psm2_ep_t myep; /* endpoint handle filled in by psm2_ep_open() */
psm2_epid_t myepid; /* this endpoint's id, filled in by psm2_ep_open() */
psm2_epid_t server_epid; /* set from find_server() on the client side of my_psm2_connect() */
/* Arguments of psm2_ep_connect(); only slot 0 is populated and selected by the mask */
psm2_epid_t epid_array[CONNECT_ARRAY_SIZE];
int epid_array_mask[CONNECT_ARRAY_SIZE];
psm2_error_t epid_connect_errors[CONNECT_ARRAY_SIZE];
psm2_epaddr_t epaddr_array[CONNECT_ARRAY_SIZE];
/*
 * Initialise the PSM2 library and open this process's endpoint with an
 * all-zero UUID; the results are stored in the globals uuid, myep and
 * myepid. Neither parameter is used.
 *
 * Returns 0 on success and -1 as soon as a PSM2 call fails. die() only
 * reports in this file, so without the early return later calls would run
 * on state that was never set up.
 */
int my_psm2_init(int my_rank, int server_rank) {
	struct psm2_ep_open_opts o;
	int rc;
	int ver_major = PSM2_VERNO_MAJOR;
	int ver_minor = PSM2_VERNO_MINOR;

	(void)my_rank;
	(void)server_rank;

	memset(uuid, 0, sizeof(psm2_uuid_t)); /* Use a UUID of zero */

	/*
	 * Try to initialize PSM2 with the requested library version.
	 * In this example, given the use of the PSM2_VERNO_MAJOR and MINOR
	 * as defined in the PSM2 headers, ensure that we are linking with
	 * the same version of PSM2 as we compiled against.
	 */
	if ((rc = psm2_init(&ver_major, &ver_minor)) != PSM2_OK) {
		die("couldn't init", rc);
		return -1;
	}
	printf("PSM2 init done.\n");

	/* Setup the endpoint options struct */
	if ((rc = psm2_ep_open_opts_get_defaults(&o)) != PSM2_OK) {
		die("couldn't set default opts", rc);
		return -1;
	}
	printf("PSM2 opts_get_defaults done.\n");

	/* Attempt to open a PSM2 endpoint. This allocates hardware resources. */
	if ((rc = psm2_ep_open(uuid, &o, &myep, &myepid)) != PSM2_OK) {
		die("couldn't psm2_ep_open()", rc);
		return -1;
	}
	printf("PSM2 endpoint open done.\n");
	return 0;
}
psm2_mq_t q; /* message queue created at the end of my_psm2_connect() */

/*
 * Exchange endpoint ids through the file system, connect the client to the
 * server and create the message queue `q` on the endpoint opened by
 * my_psm2_init().
 *
 * The rank equal to server_rank only writes its epid file. Every other
 * rank reads the server's epid and calls psm2_ep_connect() until it
 * succeeds, sleeping 0.5 s after each failure and giving up after 31
 * failed attempts.
 *
 * Returns 0 on success, -1 if connecting or creating the queue fails.
 */
int my_psm2_connect(int my_rank, int server_rank) {
	int rc;
	int is_server = (my_rank == server_rank) ? 1 : 0;

	printf("%s: enter\n", __FUNCTION__); fflush(stdout);
	if (is_server) {
		write_epid_to_file(my_rank, myepid);
	} else {
		server_epid = find_server(server_rank);
	}
	printf("%s: epid exchange done\n", __FUNCTION__); fflush(stdout);

	if (is_server) {
		/*
		 * Server does nothing here. A connection does not have to be
		 * established to receive messages.
		 */
		printf("PSM2 server up.\n");
	} else {
		int count = 0;

		/*
		 * Setup connection request info.
		 * PSM2 can connect to a single epid per request, or an
		 * arbitrary number of epids in a single connect call.
		 * For this example, use part of an array of connection requests.
		 */
		memset(epid_array_mask, 0, sizeof(int) * CONNECT_ARRAY_SIZE);
		epid_array[0] = server_epid;
		epid_array_mask[0] = 1;

		/*
		 * Begin the connection process.
		 * Note that if a requested epid is not responding, the connect
		 * call will still return OK. The errors array will contain the
		 * state of individual connection requests.
		 *
		 * NOTE(review): the value passed as timeout is 1. The original
		 * comment called it a 0.5 sec timeout; if the PSM2 API takes
		 * nanoseconds the call times out almost at once and the pacing
		 * comes from the nanosleep() below. Confirm the intended unit.
		 */
		printf("calling ep_connect\n");
		while ((rc = psm2_ep_connect(myep,
					     CONNECT_ARRAY_SIZE,
					     epid_array,
					     epid_array_mask,
					     epid_connect_errors,
					     epaddr_array,
					     1
					     )) != PSM2_OK) {
			struct timespec ts = { .tv_sec = 0, .tv_nsec = 500*1000*1000 };

			nanosleep(&ts, NULL);
			printf("."); fflush(stdout);
			count++;
			if (count > 30) {
				break;
			}
		}
		if (rc != PSM2_OK) {
			printf("psm2_ep_connect timed-out\n");
			return -1;
		}
		printf("PSM2 connect request processed.\n");

		/* Now check if our connection to the server is ready */
		if (epid_connect_errors[0] != PSM2_OK) {
			die("couldn't connect to server", epid_connect_errors[0]);
			return -1;
		}
		printf("PSM2 client-server connection established.\n");
	}

	/* Setup our PSM2 message queue */
	if ((rc = psm2_mq_init(myep, PSM2_MQ_ORDERMASK_NONE, NULL, 0, &q))
	    != PSM2_OK) {
		die("couldn't initialize PSM2 MQ", rc);
		/* die() returns in this file; do not report success without a queue */
		return -1;
	}
	printf("PSM2 MQ init done.\n");
	return 0;
}
/* Message buffer used for both the receive and the send below. */
char msgbuf[BUFFER_LENGTH];
/*
 * Exchange one message over the global MQ `q`, then shut PSM2 down.
 *
 * rank:     rank of the calling process
 * sender:   not used by this function
 * receiver: the caller takes the receiving ("server") role when
 *           rank == receiver, the sending role otherwise
 *
 * Receiver: posts a receive for tag 0xABCD and polls for completion up to
 * three times, sleeping 0.5 s after each unsuccessful poll; on completion
 * it prints the message and removes its epid mapping file.
 * Sender: posts a synchronous-mode isend of msgbuf and polls for
 * completion up to 31 times with the same 0.5 s pause.
 * Both roles then finalize the MQ, close the endpoint and finalize PSM2.
 *
 * Always returns 0.
 * NOTE(review): none of the die() calls is followed by a return, so
 * execution continues after a failure unless die() terminates the
 * process -- confirm against die()'s definition.
 */
int my_psm2_sendrecv(int rank, int sender, int receiver) {
int is_server = (rank == receiver) ? 1 : 0;
int rc;
psm2_mq_req_t req_mq;
//char msgbuf[BUFFER_LENGTH];
/* Diagnostic only: print the stack pointer next to the buffer address. */
register long rsp asm ("rsp");
printf("rsp=%lx.msgbuf=%p\n", rsp, msgbuf); fflush(stdout);
memset(msgbuf, 0, BUFFER_LENGTH);
if (is_server) {
psm2_mq_tag_t t = {0xABCD};
psm2_mq_tag_t tm = {-1};
/* Post the receive request */
if ((rc = psm2_mq_irecv2(q, PSM2_MQ_ANY_ADDR,
&t, /* message tag */
&tm, /* message tag mask */
0, /* no flags */
msgbuf, BUFFER_LENGTH,
NULL, /* no context to add */
&req_mq /* track irecv status */
)) != PSM2_OK) {
die("couldn't post psm2_mq_irecv()", rc);
}
printf("PSM2 MQ irecv() posted\n");
#if 0
/* Disabled variant: block until the message arrives */
if ((rc = psm2_mq_wait(&req_mq, NULL)) != PSM2_OK) {
die("couldn't wait for the irecv", rc);
}
printf("PSM2 MQ wait() done.\n");
printf("Message from client:\n");
printf("%s", msgbuf);
if (is_server) {
char fn[256];
sprintf(fn, "psm2-demo-server-epid-%d", rank);
unlink(fn);
}
#else
/* Active variant: poll for a completed request instead of blocking. */
int count = 0;
while ((rc = psm2_mq_ipeek(q, &req_mq, NULL)) != PSM2_OK) {
struct timespec ts = { .tv_sec = 0, .tv_nsec = 500*1000*1000 };
nanosleep(&ts, NULL);
printf("."); fflush(stdout);
count++;
/* Give up after the third unsuccessful poll. */
if (count > 2) {
break;
}
}
if (rc == PSM2_OK) {
if ((rc = psm2_mq_test(&req_mq, NULL)) != PSM2_OK) {
printf("psm2_mq_test failed\n");
} else {
printf("PSM2 MQ test() done.\n");
printf("Message from client:\n");
printf("%s", msgbuf);
}
/* Remove the epid mapping file of this rank. */
char fn[256];
sprintf(fn, "psm2-demo-server-epid-%d", rank);
unlink(fn);
} else {
printf("PSM2 MQ test() timed-out.\n");
}
#endif
} else {
/* Say hello */
snprintf(msgbuf, BUFFER_LENGTH,
"Hello world from epid=0x%lx, pid=%d.\n",
myepid, getpid());
psm2_mq_tag_t t = {0xABCD};
#if 0
/* Disabled variant: blocking send */
if ((rc = psm2_mq_send2(q,
epaddr_array[0], /* destination epaddr */
PSM2_MQ_FLAG_SENDSYNC, /* send flags */
&t, /* tag */
msgbuf, BUFFER_LENGTH
)) != PSM2_OK) {
die("couldn't post psm2_mq_isend", rc);
}
printf("PSM2 MQ send() done.\n");
#else
/* Active variant: non-blocking send followed by polling. */
if ((rc = psm2_mq_isend2(q,
epaddr_array[0], /* destination epaddr */
PSM2_MQ_FLAG_SENDSYNC, /* send flags */
&t, /* tag */
msgbuf, BUFFER_LENGTH,
NULL, /* no context to add */
&req_mq /* request handle for the isend */
)) != PSM2_OK) {
die("couldn't post psm2_mq_isend", rc);
}
printf("PSM2 MQ isend() posted\n");
int count = 0;
while ((rc = psm2_mq_ipeek2(q, &req_mq, NULL)) != PSM2_OK) {
struct timespec ts = { .tv_sec = 0, .tv_nsec = 500*1000*1000 };
nanosleep(&ts, NULL);
printf("."); fflush(stdout);
count++;
/* Give up after the 31st unsuccessful poll. */
if (count > 30) {
break;
}
}
if (rc == PSM2_OK) {
if ((rc = psm2_mq_test2(&req_mq, NULL)) != PSM2_OK) {
printf("PSM2 MQ test() failed.\n");
} else {
printf("PSM2 MQ test() done.\n");
}
} else {
printf("PSM2 MQ test() timeout.\n");
}
#endif
}
/* Close down the MQ */
if ((rc = psm2_mq_finalize(q)) != PSM2_OK) {
die("couldn't psm2_mq_finalize()", rc);
}
printf("PSM2 MQ finalized.\n");
/* Close our ep, releasing all hardware resources.
 * Try to close all connections properly */
if ((rc = psm2_ep_close(myep, PSM2_EP_CLOSE_GRACEFUL,
0 /* no timeout */)) != PSM2_OK) {
die("couldn't psm2_ep_close()", rc);
}
printf("PSM2 ep closed.\n");
/* Release all local PSM2 resources */
if ((rc = psm2_finalize()) != PSM2_OK) {
die("couldn't psm2_finalize()", rc);
}
printf("PSM2 shut down, exiting.\n");
return 0;
}
/* Long-option table for getopt_long(): "--ppn <n>" is reported as 'P'. */
static struct option options[] = {
	{ "ppn", required_argument, NULL, 'P' },
	{ NULL, 0, NULL, 0 },	/* end marker */
};
/* Arguments shared between main() and the progress thread. */
struct thr_arg {
pthread_barrier_t bar; /* barrier that main() and progress_fn() both wait on */
pthread_t pthread; /* thread handle written by pthread_create() in main() */
int rank; /* rank of this process */
int ppn; /* value given with --ppn */
int nproc; /* number of ranks progress_fn() iterates over */
};
/* The single instance handed to progress_fn(). */
struct thr_arg thr_arg;
/*
 * Body of the progress thread.
 *
 * arg points to a struct thr_arg.  The thread reports the result of
 * syscall 732 (a result of -1 is printed as "running on Linux OK"),
 * waits on the barrier, exchanges a message with every rank that is not
 * on the same node, waits on the barrier again and returns NULL.
 */
void *progress_fn(void *arg) {
	struct thr_arg *ctx = arg;
	int peer;
	int rc = syscall(732);

	if (rc != -1) {
		fprintf(stdout, "CT09100 progress_fn running on McKernel NG (%d)\n", rc);
	} else {
		fprintf(stdout, "CT09100 progress_fn running on Linux OK\n");
	}

	printf("progress,enter\n");
	pthread_barrier_wait(&ctx->bar);
#if 1
	for (peer = 0; peer < ctx->nproc; peer++) {
		int lower_is_me;

		if (on_same_node(ctx->ppn, ctx->rank, peer)) {
			continue;
		}
		/* The lower rank of the pair is passed as the sender. */
		lower_is_me = ctx->rank < peer;
		my_psm2_sendrecv(ctx->rank,
				 lower_is_me ? ctx->rank : peer,
				 lower_is_me ? peer : ctx->rank);
	}
#endif
	pthread_barrier_wait(&ctx->bar);
#if 0
	printf("progress,entering infinite loop\n");
	while(1) { }
#endif
	printf("progress,returning\n");
	return NULL;
}
/*
 * Test driver.
 *
 * Parses --ppn/-P, reads the rank from the PMI_RANK environment
 * variable, sets up PSM2, asks for the progress thread to be created
 * through syscall 731 unless DISABLE_UTI is set to a non-zero value,
 * then runs progress_fn() in a thread and synchronizes with it through
 * two barrier waits.
 *
 * Exits with status 1 on a usage error, a missing PMI_RANK, a failed
 * PSM2 setup or a failed pthread_create(); returns 0 otherwise.
 */
int main(int argc, char **argv) {
	int rc;
	int nproc;
	int ppn = -1;
	int my_rank = -1;
	int opt;
	pthread_barrierattr_t barrierattr;

	fwq_init();

	while ((opt = getopt_long(argc, argv, "+P:", options, NULL)) != -1) {
		switch (opt) {
		case 'P':
			ppn = atoi(optarg);
			break;
		default: /* '?' */
			printf("unknown option %c\n", optopt);
			exit(1);
		}
	}

	/* ppn is used as a divisor below, so zero must be rejected too. */
	if (ppn <= 0) {
		printf("specify ppn with --ppn");
		exit(1);
	}

	char *rank_str = getenv("PMI_RANK");
	if (!rank_str) {
		printf("getenv failed\n");
		exit(1);
	}
	my_rank = atoi(rank_str);
	printf("my_rank=%d\n", my_rank); fflush(stdout);

	nproc = 2;

	if (my_rank == 0) {
		/* syscall() returns long; convert explicitly for %d. */
		printf("tid=%d,pid=%d,nproc=%d\n", (int)syscall(__NR_gettid), (int)getpid(), nproc);
		printf("nsec=%lu, nspw=%f\n", nsec, nspw);
	}

	int server_rank = ppn + (my_rank % ppn);

	/* The progress thread relies on the endpoint and the MQ. */
	if (my_psm2_init(my_rank, server_rank) != 0 ||
	    my_psm2_connect(my_rank, server_rank) != 0) {
		printf("PSM2 setup failed\n");
		exit(1);
	}

	/* Spawn a thread */
	thr_arg.rank = my_rank;
	thr_arg.ppn = ppn;
	thr_arg.nproc = nproc;

	/* The barrier count is nproc (2); the two waiters are this thread
	 * and the progress thread. */
	pthread_barrierattr_init(&barrierattr);
	pthread_barrier_init(&thr_arg.bar, &barrierattr, nproc);

	char *uti_str = getenv("DISABLE_UTI");
	int uti_val = uti_str ? atoi(uti_str) : 0;
	if (!uti_val) {
		rc = syscall(731, 1, NULL);
		if (rc) {
			fprintf(stdout, "CT09003 INFO: uti not available (rc=%d)\n", rc);
		} else {
			fprintf(stdout, "CT09003 INFO: uti available\n");
		}
	} else {
		fprintf(stdout, "CT09003 INFO: uti disabled\n");
	}

	rc = pthread_create(&thr_arg.pthread, NULL, progress_fn, &thr_arg);
	if (rc){
		fprintf(stdout, "pthread_create: %d\n", rc);
		exit(1);
	}

	pthread_barrier_wait(&thr_arg.bar);
	pthread_barrier_wait(&thr_arg.bar);
	pthread_join(thr_arg.pthread, NULL);

	return 0;
}

89
test/uti/mpi/008.sh Executable file
View File

@@ -0,0 +1,89 @@
#!/usr/bin/bash
#!/usr/bin/bash -x
# Driver for test program 008.
#
# Options:
#   -s  stop step (only acts together with -m)
#   -r  reboot step (only acts together with -m)
#   -g  build 008 and run it
#   -m  enable the McKernel-specific stop/reboot steps
#   -d  export DISABLE_UTI=1
#   -l <n>  number of run iterations (default 1)
#   -a, -c <cmd>, -n <n>  parsed, but the values are not used below
MYHOME=$HOME
UTI_MPI_TOP=${MYHOME}/project/os/mckernel/test/uti/mpi
MCK=${MYHOME}/project/os/install
unset DISABLE_UTI
cmdline="./008"
stop=0
reboot=0
go=0
mck=0
nloops=1
while getopts srgac:n:mdl: OPT
do
case ${OPT} in
s) stop=1
;;
r) reboot=1
;;
g) go=1
;;
a) async=1
;;
c) cmdline=$OPTARG
;;
n) ndoubles=$OPTARG
;;
m)
mck=1
;;
d) export DISABLE_UTI=1
;;
l) nloops=$OPTARG
;;
*) echo "invalid option -${OPT}" >&2
exit 1
esac
done
# MCEXEC is set here but the run step below spells out the mcexec path itself.
if [ ${mck} -eq 1 ]; then
MCEXEC="${MCK}/bin/mcexec"
else
MCEXEC=
fi
# Stop step: release McKernel.
if [ ${stop} -eq 1 ]; then
if [ ${mck} -eq 1 ]; then
sudo ${MCK}/sbin/mcstop+release.sh
else
:
fi
fi
# Reboot step: the CPU and memory lists differ depending on whether the
# host name contains "ofp".
if [ ${reboot} -eq 1 ]; then
if [ ${mck} -eq 1 ]; then
if hostname | grep ofp &>/dev/null; then
sudo ${MCK}/sbin/mcreboot.sh -s -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
else
sudo ${MCK}/sbin/mcreboot.sh -s -c 1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 1-15:0+65-79:64+129-143:128+193-207:192+17-31:16+81-95:80+145-159:144+209-223:208+33-47:32+97-111:96+161-175:160+225-239:224+49-63:48+113-127:112+177-191:176+241-255:240 -m 12G@0,12G@1,12G@2,12G@3,3920M@4,3920M@5,3920M@6,3920M@7
fi
else
:
fi
fi
# Run step: build 008, then run it nloops times.
if [ ${go} -eq 1 ]; then
cd ${UTI_MPI_TOP}
make CC=gcc 008
for i in `seq 1 ${nloops}`; do
# Remove epid mapping files left over from an earlier run.
rm -f psm2-demo-server-epid-*
#PSM2_RCVTHREAD=0 PMI_RANK=0 DISABLE_UTI=1 ${MCK}/bin/mcexec --enable-uti taskset -c 2 ./008 --ppn 1 &
# NOTE(review): DISABLE_UTI=0 on this command line takes precedence over
# the value exported by -d, and only the PMI_RANK=1 process is started --
# confirm both are intended.
PSM2_RCVTHREAD=0 PMI_RANK=1 DISABLE_UTI=0 ${MCK}/bin/mcexec --enable-uti taskset -c 3 ./008 --ppn 1
#wait
echo =====;
echo $i;
# The increment below does not change the iteration count: "for" assigns
# i from the seq list on every pass.
echo =====; i=$((i+1));
#sleep 2
done
fi

537
test/uti/mpi/009.c Executable file
View File

@@ -0,0 +1,537 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <mpi.h>
#include <unistd.h>
#include <getopt.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
#include <sched.h>
#include <errno.h>
#include <psm2.h> /* required for core PSM2 functions */
#include <psm2_mq.h> /* required for PSM2 MQ functions (send, recv, etc) */
//#define DEBUG
/*
 * Debug trace macro: forwards to printf() when DEBUG is defined and is a
 * no-op statement otherwise.  The function-like form keeps
 * "if (x) dprintf(...); else ..." well-formed and drops the arguments
 * when tracing is off.
 */
#ifdef DEBUG
#define dprintf(...) printf(__VA_ARGS__)
#else
#define dprintf(...) do { } while (0)
#endif
/* Size in bytes of the message buffer (4 KiB). */
#define BUFFER_LENGTH /*8000000*/(1ULL<<12)
/* Number of slots in the psm2_ep_connect() argument arrays. */
#define CONNECT_ARRAY_SIZE 8
/*
 * Report a failure as "<msg>: <rc>" on stderr and flush the stream.
 *
 * Despite its name this function returns to the caller; it does not
 * terminate the process.
 */
void die(const char *msg, int rc) {
	fprintf(stderr, "%s: %d\n", msg, rc);
	fflush(stderr);
}
/*
 * Nanoseconds from struct timespec `start` to `end`.  The arguments are
 * parenthesized so that expressions such as *p or arr[i] can be passed.
 */
#define DIFFNSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000000UL + ((end).tv_nsec - (start).tv_nsec))
/*
 * One unit of busy work: a register-only counting loop written in inline
 * assembly.  rcx is cleared and incremented until it exceeds 99, so the
 * increment executes 100 times per call.  "volatile" keeps the compiler
 * from removing the statement; rcx and the flags are declared clobbered.
 */
static inline void fixed_size_work() {
asm volatile(
"movq $0, %%rcx\n\t" /* rcx = 0 */
"1:\t"
"addq $1, %%rcx\n\t" /* rcx++ */
"cmpq $99, %%rcx\n\t"
"jle 1b\n\t" /* loop while rcx <= 99 */
:
:
: "rcx", "cc");
}
/*
 * Run fixed_size_work() n times.
 *
 * The counter has the same type as n, so counts above INT_MAX no longer
 * overflow a signed int.
 */
static inline void bulk_fsw(unsigned long n) {
	unsigned long j;

	for (j = 0; j < n; j++) {
		fixed_size_work();
	}
}
double nspw; /* nsec per work */
unsigned long nsec; /* thread CPU time in ns consumed by the calibration run */

/* Number of work units executed by the calibration run. */
#define N_INIT 10000000

/*
 * Calibrate the busy-work loop: time N_INIT work units on the calling
 * thread's CPU clock and store the total in nsec and the per-unit cost
 * in nspw.
 */
void fwq_init() {
	struct timespec start, end;

	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
	bulk_fsw(N_INIT);
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
	nsec = DIFFNSEC(end, start);
	nspw = nsec / (double)N_INIT;
}
#if 1
/*
 * Spend roughly delay_nsec nanoseconds doing busy work, using the
 * per-unit cost stored in nspw.  A negative delay does nothing.
 */
void fwq(long delay_nsec) {
	if (delay_nsec >= 0) {
		bulk_fsw(delay_nsec / nspw);
	}
}
#else /* For machines with large core-to-core performance variation (e.g. OFP) */
/*
 * Variant that re-reads the thread CPU clock between small bursts of
 * work until delay_nsec nanoseconds have elapsed.
 */
void fwq(long delay_nsec) {
	struct timespec t0, now;

	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t0);
	for (;;) {
		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now);
		if (DIFFNSEC(now, t0) >= delay_nsec) {
			return;
		}
		bulk_fsw(2); /* ~150 ns per iteration on OFP */
	}
}
#endif
/*
 * Print the CPU the calling thread last ran on, taken from two sources:
 * field 39 of /proc/<pid>/task/<tid>/stat and sched_getcpu().
 *
 * Returns 0 on success and -1 when PMI_RANK is not set, the stat file
 * cannot be opened, read or parsed, or sched_getcpu() fails.
 *
 * NOTE(review): fields are split on single spaces, so a thread name that
 * contains a space shifts the field count -- confirm this cannot happen
 * for the programs using this helper.
 */
static int print_cpu_last_executed_on() {
	enum { STAT_BUF_SIZE = 65536, STAT_CPU_FIELD = 39 };
	char fn[256];
	char *result = NULL;	/* NULL so the common exit path can free() it */
	char *next_delim;
	char *field = NULL;
	const char *rank_str;
	pid_t tid = syscall(SYS_gettid);
	int fd = -1;
	int offset = 0;
	int cpu;
	int i;
	int mpi_errno = 0;

	rank_str = getenv("PMI_RANK");
	if (rank_str == NULL) {
		printf("getenv() failed\n");
		goto fn_fail;
	}

	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid);
	fd = open(fn, O_RDONLY);
	if (fd == -1) {
		printf("open() failed\n");
		goto fn_fail;
	}

	result = malloc(STAT_BUF_SIZE);
	if (result == NULL) {
		printf("malloc() failed");
		goto fn_fail;
	}

	/* Read at most STAT_BUF_SIZE - 1 bytes so the terminator always fits. */
	while (offset < STAT_BUF_SIZE - 1) {
		ssize_t amount = read(fd, result + offset,
				      STAT_BUF_SIZE - 1 - offset);

		if (amount == -1) {
			printf("read() failed");
			goto fn_fail;
		}
		if (amount == 0) {
			break;
		}
		offset += amount;
	}
	result[offset] = '\0';	/* strsep() needs a terminated string */

	next_delim = result;
	for (i = 0; i < STAT_CPU_FIELD; i++) {
		field = strsep(&next_delim, " ");
	}
	if (field == NULL) {
		printf("stat parse failed\n");
		goto fn_fail;
	}

	cpu = sched_getcpu();
	if (cpu == -1) {
		printf("sched_getcpu() failed\n");
		goto fn_fail;
	}

	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,pid=%d,tid=%d\n", atoi(rank_str), atoi(field), cpu, (int)getpid(), (int)tid); fflush(stdout);

fn_exit:
	if (fd != -1) {
		close(fd);
	}
	free(result);
	return mpi_errno;
fn_fail:
	mpi_errno = -1;
	goto fn_exit;
}
/*
 * Return non-zero when ranks `me` and `you` fall into the same group of
 * `ppn` consecutive ranks.  ppn must not be zero.
 */
static inline int on_same_node(int ppn, int me, int you) {
	int my_node = me / ppn;
	int your_node = you / ppn;

	return my_node == your_node;
}
/*
 * isend-calc-wait: post one receive and one send per rank that is not on
 * the caller's node, run calc_nsec of busy work, then wait for all
 * posted requests.
 *
 * sbuf/rbuf are used in consecutive chunks of ndoubles elements, one
 * chunk per off-node rank; reqs receives two requests per off-node rank
 * (receive first, then send).
 */
void my_send(int nproc, int ppn, int rank, double *sbuf, double *rbuf, int ndoubles, MPI_Request* reqs, long calc_nsec) {
	int peer;
	int npairs = 0;	/* off-node ranks handled so far */

	for (peer = 0; peer < nproc; peer++) {
		if (on_same_node(ppn, rank, peer)) {
			continue;
		}
		MPI_Irecv(rbuf + npairs * ndoubles, ndoubles, MPI_DOUBLE, peer, 0, MPI_COMM_WORLD, &reqs[2 * npairs]);
		MPI_Isend(sbuf + npairs * ndoubles, ndoubles, MPI_DOUBLE, peer, 0, MPI_COMM_WORLD, &reqs[2 * npairs + 1]);
		npairs++;
	}

	fwq(calc_nsec);
	MPI_Waitall(2 * npairs, reqs, MPI_STATUSES_IGNORE);
}
/*
 * Helper to find the server's PSM2 endpoint identifier (epid).
 *
 * Polls once per second for the file "psm2-demo-server-epid-<rank>" in
 * the current directory and parses the hexadecimal epid stored in it.
 * A file that exists but does not hold a parsable value yet is retried
 * instead of being reported as epid 0.  Blocks until a value is read.
 */
psm2_epid_t find_server(int rank) {
	unsigned long parsed = 0;	/* matches the %lx conversion exactly */
	psm2_epid_t server_epid;
	char fn[256];
	int found = 0;

	printf("%s: enter\n", __FUNCTION__); fflush(stdout);
	snprintf(fn, sizeof(fn), "psm2-demo-server-epid-%d", rank);
	printf("PSM2 client waiting for epid mapping file to appear...\n"); fflush(stdout);

	while (!found) {
		FILE *fp;

		sleep(1);
		fp = fopen(fn, "r");
		if (!fp) {
			continue;
		}
		found = (fscanf(fp, "%lx", &parsed) == 1);
		fclose(fp);
	}

	server_epid = parsed;
	printf("PSM2 client found server epid = 0x%lx\n", parsed);
	return server_epid;
}
/*
 * Publish an endpoint ID in the file "psm2-demo-server-epid-<rank>" in
 * the current directory.
 *
 * rank:   rank used to build the file name
 * myepid: endpoint ID, written as "0x<hex>"
 *
 * If the file cannot be opened the error is reported through die() and
 * nothing is written.
 */
void write_epid_to_file(int rank, psm2_epid_t myepid) {
	FILE *fp;
	char fn[256];

	printf("%s: enter\n", __FUNCTION__);
	snprintf(fn, sizeof(fn), "psm2-demo-server-epid-%d", rank);
	fp = fopen(fn, "w");
	if (!fp) {
		/* Save errno: the fprintf() below is allowed to change it. */
		int saved_errno = errno;

		fprintf(stderr,
			"Exiting, couldn't write server's epid mapping file: ");
		die(strerror(saved_errno), saved_errno);
		/* Never reach fprintf()/fclose() with a NULL stream. */
		return;
	}
	fprintf(fp, "0x%lx", myepid);
	fclose(fp);
	printf("PSM2 server wrote epid = 0x%lx to file.\n", myepid);
}
/* PSM2 state shared by the helper functions in this file. */
psm2_uuid_t uuid; /* key passed to psm2_ep_open(); zeroed in my_psm2_init() */
psm2_ep_t myep; /* local endpoint handle filled in by psm2_ep_open() */
psm2_epid_t myepid; /* local endpoint ID filled in by psm2_ep_open() */
psm2_epid_t server_epid; /* server endpoint ID as returned by find_server() */
/* Argument arrays for psm2_ep_connect(); only slot 0 is populated. */
psm2_epid_t epid_array[CONNECT_ARRAY_SIZE];
int epid_array_mask[CONNECT_ARRAY_SIZE];
psm2_error_t epid_connect_errors[CONNECT_ARRAY_SIZE];
psm2_epaddr_t epaddr_array[CONNECT_ARRAY_SIZE];
/*
 * Initialize the PSM2 library and open the local endpoint, filling in
 * the globals myep and myepid.
 *
 * my_rank, server_rank: only printed for tracing.
 *
 * Returns 0 on success and -1 as soon as one PSM2 call fails, so that a
 * failed step is never followed by calls that depend on it.
 */
int my_psm2_init(int my_rank, int server_rank) {
	struct psm2_ep_open_opts o;
	int rc;
	int ver_major = PSM2_VERNO_MAJOR;
	int ver_minor = PSM2_VERNO_MINOR;

	printf("%s: my_rank=%d,server_rank=%d\n", __FUNCTION__, my_rank, server_rank); fflush(stdout);
	memset(uuid, 0, sizeof(psm2_uuid_t)); /* Use a UUID of zero */

	/* Try to initialize PSM2 with the requested library version.
	 * In this example, given the use of the PSM2_VERNO_MAJOR and MINOR
	 * as defined in the PSM2 headers, ensure that we are linking with
	 * the same version of PSM2 as we compiled against. */
	if ((rc = psm2_init(&ver_major, &ver_minor)) != PSM2_OK) {
		die("couldn't init", rc);
		return -1;
	}
	printf("PSM2 init done.\n");

	/* Setup the endpoint options struct */
	if ((rc = psm2_ep_open_opts_get_defaults(&o)) != PSM2_OK) {
		die("couldn't set default opts", rc);
		return -1;
	}
	printf("PSM2 opts_get_defaults done.\n");

	/* Attempt to open a PSM2 endpoint. This allocates hardware resources. */
	if ((rc = psm2_ep_open(uuid, &o, &myep, &myepid)) != PSM2_OK) {
		die("couldn't psm2_ep_open()", rc);
		return -1;
	}
	printf("PSM2 endpoint open done.\n");
	return 0;
}
/*
 * Exchange endpoint IDs through the file system and, on the client side,
 * connect to the server endpoint.
 *
 * The caller acts as server when my_rank == server_rank: it publishes
 * its epid and returns.  Any other rank looks up the server's epid and
 * issues a single connect request for it.
 *
 * Returns 0 on success, -1 when the connect call or the per-endpoint
 * connection status reports an error.
 */
int my_psm2_connect(int my_rank, int server_rank) {
	const int is_server = (my_rank == server_rank);
	int rc;

	printf("%s: my_rank=%d,server_rank=%d\n", __FUNCTION__, my_rank, server_rank); fflush(stdout);

	if (!is_server) {
		server_epid = find_server(server_rank);
	} else {
		write_epid_to_file(my_rank, myepid);
	}
	printf("%s: epid exchange done\n", __FUNCTION__); fflush(stdout);

	if (is_server) {
		/* A connection does not have to be established to receive
		 * messages, so the server is done here. */
		printf("PSM2 server up.\n");
		return 0;
	}

	/* Only the first slot of the request arrays is used. */
	memset(epid_array_mask, 0, sizeof(int) * CONNECT_ARRAY_SIZE);
	epid_array[0] = server_epid;
	epid_array_mask[0] = 1;

	/* The connect call can return OK even if the requested epid is not
	 * responding; the per-request state is in the errors array. */
	rc = psm2_ep_connect(myep,
			     CONNECT_ARRAY_SIZE,
			     epid_array,
			     epid_array_mask,
			     epid_connect_errors,
			     epaddr_array,
			     0 /* no timeout */);
	if (rc != PSM2_OK) {
		die("couldn't ep_connect", rc);
		return -1;
	}
	printf("PSM2 connect request processed.\n");

	/* Check whether the connection to the server is ready. */
	if (epid_connect_errors[0] != PSM2_OK) {
		die("couldn't connect to server", epid_connect_errors[0]);
		return -1;
	}
	printf("PSM2 client-server connection established.\n");
	return 0;
}
/* Message buffer used for both the receive and the send below. */
char msgbuf[BUFFER_LENGTH];
/*
 * Create a message queue on the global endpoint, exchange one message,
 * then shut PSM2 down.
 *
 * rank:     rank of the calling process
 * sender:   not used by this function
 * receiver: the caller takes the receiving ("server") role when
 *           rank == receiver, the sending role otherwise
 *
 * Receiver: posts a receive for tag 0xABCD, blocks until it completes,
 * prints the message and removes its epid mapping file.
 * Sender: formats a greeting into msgbuf and sends it with a blocking
 * send.
 * Both roles then finalize the MQ, close the endpoint and finalize PSM2.
 *
 * Always returns 0.  die() only prints, and none of the die() calls is
 * followed by a return, so execution continues after a failed PSM2 call.
 */
int my_psm2_sendrecv(int rank, int sender, int receiver) {
int is_server = (rank == receiver) ? 1 : 0;
int rc;
psm2_mq_t q;
psm2_mq_req_t req_mq;
//char msgbuf[BUFFER_LENGTH];
/* Diagnostic only: print the stack pointer next to the buffer address. */
register long rsp asm ("rsp");
printf("rsp=%lx.msgbuf=%p\n", rsp, msgbuf); fflush(stdout);
memset(msgbuf, 0, BUFFER_LENGTH);
/* Setup our PSM2 message queue */
if ((rc = psm2_mq_init(myep, PSM2_MQ_ORDERMASK_NONE, NULL, 0, &q))
!= PSM2_OK) {
die("couldn't initialize PSM2 MQ", rc);
}
printf("PSM2 MQ init done.\n");
if (is_server) {
psm2_mq_tag_t t = {0xABCD};
psm2_mq_tag_t tm = {-1};
/* Post the receive request */
if ((rc = psm2_mq_irecv2(q, PSM2_MQ_ANY_ADDR,
&t, /* message tag */
&tm, /* message tag mask */
0, /* no flags */
msgbuf, BUFFER_LENGTH,
NULL, /* no context to add */
&req_mq /* track irecv status */
)) != PSM2_OK) {
die("couldn't post psm2_mq_irecv()", rc);
}
printf("PSM2 MQ irecv() posted\n");
/* Wait until the message arrives */
if ((rc = psm2_mq_wait(&req_mq, NULL)) != PSM2_OK) {
die("couldn't wait for the irecv", rc);
}
printf("PSM2 MQ wait() done.\n");
printf("Message from client:\n");
printf("%s", msgbuf);
/* Remove the epid mapping file of this rank (the inner test is always
 * true inside this branch). */
if (is_server) {
char fn[256];
sprintf(fn, "psm2-demo-server-epid-%d", rank);
unlink(fn);
}
} else {
/* Say hello */
snprintf(msgbuf, BUFFER_LENGTH,
"Hello world from epid=0x%lx, pid=%d.\n",
myepid, getpid());
psm2_mq_tag_t t = {0xABCD};
if ((rc = psm2_mq_send2(q,
epaddr_array[0], /* destination epaddr */
PSM2_MQ_FLAG_SENDSYNC, /* send flags */
&t, /* tag */
msgbuf, BUFFER_LENGTH
)) != PSM2_OK) {
die("couldn't post psm2_mq_isend", rc);
}
printf("PSM2 MQ send() done.\n");
}
/* Close down the MQ */
if ((rc = psm2_mq_finalize(q)) != PSM2_OK) {
die("couldn't psm2_mq_finalize()", rc);
}
printf("PSM2 MQ finalized.\n");
/* Close our ep, releasing all hardware resources.
 * Try to close all connections properly */
if ((rc = psm2_ep_close(myep, PSM2_EP_CLOSE_GRACEFUL,
0 /* no timeout */)) != PSM2_OK) {
die("couldn't psm2_ep_close()", rc);
}
printf("PSM2 ep closed.\n");
/* Release all local PSM2 resources */
if ((rc = psm2_finalize()) != PSM2_OK) {
die("couldn't psm2_finalize()", rc);
}
printf("PSM2 shut down, exiting.\n");
return 0;
}
/* Long-option table for getopt_long(): "--ppn <n>" is reported as 'P'. */
static struct option options[] = {
	{ "ppn", required_argument, NULL, 'P' },
	{ NULL, 0, NULL, 0 },	/* end marker */
};
/* Arguments shared between main() and the progress thread. */
struct thr_arg {
pthread_barrier_t bar; /* barrier that main() and progress_fn() both wait on */
pthread_t pthread; /* thread handle written by pthread_create() in main() */
int rank; /* rank of this process */
int ppn; /* value given with --ppn */
int nproc; /* number of ranks progress_fn() iterates over */
int server_rank; /* rank acting as PSM2 server, computed in main() */
};
/* The single instance handed to progress_fn(). */
struct thr_arg thr_arg;
/*
 * Body of the progress thread.
 *
 * arg points to a struct thr_arg.  The thread reports the result of
 * syscall 732 (a result of -1 is printed as "running on Linux OK"),
 * waits on the barrier, sets up PSM2, exchanges a message with every
 * rank that is not on the same node, waits on the barrier again and
 * returns NULL.
 */
void *progress_fn(void *arg) {
	struct thr_arg *ctx = arg;
	int peer;
	int rc = syscall(732);

	if (rc != -1) {
		fprintf(stdout, "CT09100 progress_fn running on McKernel NG (%d)\n", rc);
	} else {
		fprintf(stdout, "CT09100 progress_fn running on Linux OK\n");
	}

	printf("progress,enter\n");
	pthread_barrier_wait(&ctx->bar);
#if 1
	my_psm2_init(ctx->rank, ctx->server_rank);
	my_psm2_connect(ctx->rank, ctx->server_rank);

	for (peer = 0; peer < ctx->nproc; peer++) {
		int lower_is_me;

		if (on_same_node(ctx->ppn, ctx->rank, peer)) {
			continue;
		}
		/* The lower rank of the pair is passed as the sender. */
		lower_is_me = ctx->rank < peer;
		my_psm2_sendrecv(ctx->rank,
				 lower_is_me ? ctx->rank : peer,
				 lower_is_me ? peer : ctx->rank);
	}
#endif
	pthread_barrier_wait(&ctx->bar);
	printf("progress,exit\n");
	return NULL;
}
/*
 * Test driver.
 *
 * Parses -d <log2 of ndoubles> and --ppn/-P, reads the rank from the
 * PMI_RANK environment variable, asks for the progress thread to be
 * created through syscall 731 unless DISABLE_UTI is set to a non-zero
 * value, then runs progress_fn() in a thread and synchronizes with it
 * through two barrier waits.
 *
 * Exits with status 1 on a usage error, a missing PMI_RANK or a failed
 * pthread_create(); returns 0 otherwise.
 */
int main(int argc, char **argv) {
	int rc;
	int nproc;
	int ppn = -1;
	int ndoubles = -1;
	int my_rank = -1;
	int opt;
	pthread_barrierattr_t barrierattr;

	fwq_init();

	while ((opt = getopt_long(argc, argv, "+d:P:", options, NULL)) != -1) {
		switch (opt) {
		case 'd': {
			int shift = atoi(optarg);

			/* Keep the shift count valid for an int result. */
			if (shift < 0 || shift > 30) {
				printf("invalid -d value %s\n", optarg);
				exit(1);
			}
			ndoubles = 1 << shift;
			break;
		}
		case 'P':
			ppn = atoi(optarg);
			break;
		default: /* '?' */
			printf("unknown option %c\n", optopt);
			exit(1);
		}
	}

	/* ppn is used as a divisor below, so zero must be rejected too. */
	if (ndoubles == -1 || ppn <= 0) {
		printf("specify ndoubles with -d and ppn with --ppn");
		exit(1);
	}

	char *rank_str = getenv("PMI_RANK");
	if (!rank_str) {
		printf("getenv failed\n");
		exit(1);
	}
	my_rank = atoi(rank_str);
	printf("my_rank=%d\n", my_rank); fflush(stdout);

	nproc = 2;

	if (my_rank == 0) {
		/* syscall() returns long; convert explicitly for %d. */
		printf("tid=%d,pid=%d,ndoubles=%d,nproc=%d\n", (int)syscall(__NR_gettid), (int)getpid(), ndoubles, nproc);
		printf("nsec=%lu, nspw=%f\n", nsec, nspw);
	}

	/* Spawn a thread */
	thr_arg.rank = my_rank;
	thr_arg.ppn = ppn;
	thr_arg.nproc = nproc;
	thr_arg.server_rank = ppn + (my_rank % ppn);

	/* The barrier count is nproc (2); the two waiters are this thread
	 * and the progress thread. */
	pthread_barrierattr_init(&barrierattr);
	pthread_barrier_init(&thr_arg.bar, &barrierattr, nproc);

	char *uti_str = getenv("DISABLE_UTI");
	int uti_val = uti_str ? atoi(uti_str) : 0;
	if (!uti_val) {
		rc = syscall(731, 1, NULL);
		if (rc) {
			fprintf(stdout, "CT09003 INFO: uti not available (rc=%d)\n", rc);
		} else {
			fprintf(stdout, "CT09003 INFO: uti available\n");
		}
	} else {
		fprintf(stdout, "CT09003 INFO: uti disabled\n");
	}

	rc = pthread_create(&thr_arg.pthread, NULL, progress_fn, &thr_arg);
	if (rc){
		fprintf(stdout, "pthread_create: %d\n", rc);
		exit(1);
	}

	pthread_barrier_wait(&thr_arg.bar);
	pthread_barrier_wait(&thr_arg.bar);
	pthread_join(thr_arg.pthread, NULL);

	return 0;
}

508
test/uti/mpi/010.c Executable file
View File

@@ -0,0 +1,508 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>
#include <getopt.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
#include <sched.h>
#include <errno.h>
#include <psm2.h> /* required for core PSM2 functions */
#include <psm2_mq.h> /* required for PSM2 MQ functions (send, recv, etc) */
//#define DEBUG
/*
 * Debug trace macro: forwards to printf() when DEBUG is defined and is a
 * no-op statement otherwise.  The function-like form keeps
 * "if (x) dprintf(...); else ..." well-formed and drops the arguments
 * when tracing is off.
 */
#ifdef DEBUG
#define dprintf(...) printf(__VA_ARGS__)
#else
#define dprintf(...) do { } while (0)
#endif
/* Size in bytes of the message buffer (4 KiB). */
#define BUFFER_LENGTH /*8000000*/(1ULL<<12)
/* Number of slots in the psm2_ep_connect() argument arrays. */
#define CONNECT_ARRAY_SIZE 8
/*
 * Report a failure as "<msg>: <rc>" on stderr and flush the stream.
 *
 * Despite its name this function returns to the caller; it does not
 * terminate the process.
 */
void die(const char *msg, int rc) {
	fprintf(stderr, "%s: %d\n", msg, rc);
	fflush(stderr);
}
/*
 * Nanoseconds from struct timespec `start` to `end`.  The arguments are
 * parenthesized so that expressions such as *p or arr[i] can be passed.
 */
#define DIFFNSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000000UL + ((end).tv_nsec - (start).tv_nsec))
/*
 * One unit of busy work: a register-only counting loop written in inline
 * assembly.  rcx is cleared and incremented until it exceeds 99, so the
 * increment executes 100 times per call.  "volatile" keeps the compiler
 * from removing the statement; rcx and the flags are declared clobbered.
 */
static inline void fixed_size_work() {
asm volatile(
"movq $0, %%rcx\n\t" /* rcx = 0 */
"1:\t"
"addq $1, %%rcx\n\t" /* rcx++ */
"cmpq $99, %%rcx\n\t"
"jle 1b\n\t" /* loop while rcx <= 99 */
:
:
: "rcx", "cc");
}
/*
 * Run fixed_size_work() n times.
 *
 * The counter has the same type as n, so counts above INT_MAX no longer
 * overflow a signed int.
 */
static inline void bulk_fsw(unsigned long n) {
	unsigned long j;

	for (j = 0; j < n; j++) {
		fixed_size_work();
	}
}
double nspw; /* nsec per work */
unsigned long nsec; /* thread CPU time in ns consumed by the calibration run */

/* Number of work units executed by the calibration run. */
#define N_INIT 10000000

/*
 * Calibrate the busy-work loop: time N_INIT work units on the calling
 * thread's CPU clock and store the total in nsec and the per-unit cost
 * in nspw.
 */
void fwq_init() {
	struct timespec start, end;

	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
	bulk_fsw(N_INIT);
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
	nsec = DIFFNSEC(end, start);
	nspw = nsec / (double)N_INIT;
}
#if 1
/*
 * Spend roughly delay_nsec nanoseconds doing busy work, using the
 * per-unit cost stored in nspw.  A negative delay does nothing.
 */
void fwq(long delay_nsec) {
	if (delay_nsec >= 0) {
		bulk_fsw(delay_nsec / nspw);
	}
}
#else /* For machines with large core-to-core performance variation (e.g. OFP) */
/*
 * Variant that re-reads the thread CPU clock between small bursts of
 * work until delay_nsec nanoseconds have elapsed.
 */
void fwq(long delay_nsec) {
	struct timespec t0, now;

	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t0);
	for (;;) {
		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now);
		if (DIFFNSEC(now, t0) >= delay_nsec) {
			return;
		}
		bulk_fsw(2); /* ~150 ns per iteration on OFP */
	}
}
#endif
/*
 * Print the CPU the calling thread last ran on, taken from two sources:
 * field 39 of /proc/<pid>/task/<tid>/stat and sched_getcpu().
 *
 * Returns 0 on success and -1 when PMI_RANK is not set, the stat file
 * cannot be opened, read or parsed, or sched_getcpu() fails.
 *
 * NOTE(review): fields are split on single spaces, so a thread name that
 * contains a space shifts the field count -- confirm this cannot happen
 * for the programs using this helper.
 */
static int print_cpu_last_executed_on() {
	enum { STAT_BUF_SIZE = 65536, STAT_CPU_FIELD = 39 };
	char fn[256];
	char *result = NULL;	/* NULL so the common exit path can free() it */
	char *next_delim;
	char *field = NULL;
	const char *rank_str;
	pid_t tid = syscall(SYS_gettid);
	int fd = -1;
	int offset = 0;
	int cpu;
	int i;
	int mpi_errno = 0;

	rank_str = getenv("PMI_RANK");
	if (rank_str == NULL) {
		printf("getenv() failed\n");
		goto fn_fail;
	}

	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid);
	fd = open(fn, O_RDONLY);
	if (fd == -1) {
		printf("open() failed\n");
		goto fn_fail;
	}

	result = malloc(STAT_BUF_SIZE);
	if (result == NULL) {
		printf("malloc() failed");
		goto fn_fail;
	}

	/* Read at most STAT_BUF_SIZE - 1 bytes so the terminator always fits. */
	while (offset < STAT_BUF_SIZE - 1) {
		ssize_t amount = read(fd, result + offset,
				      STAT_BUF_SIZE - 1 - offset);

		if (amount == -1) {
			printf("read() failed");
			goto fn_fail;
		}
		if (amount == 0) {
			break;
		}
		offset += amount;
	}
	result[offset] = '\0';	/* strsep() needs a terminated string */

	next_delim = result;
	for (i = 0; i < STAT_CPU_FIELD; i++) {
		field = strsep(&next_delim, " ");
	}
	if (field == NULL) {
		printf("stat parse failed\n");
		goto fn_fail;
	}

	cpu = sched_getcpu();
	if (cpu == -1) {
		printf("sched_getcpu() failed\n");
		goto fn_fail;
	}

	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,pid=%d,tid=%d\n", atoi(rank_str), atoi(field), cpu, (int)getpid(), (int)tid); fflush(stdout);

fn_exit:
	if (fd != -1) {
		close(fd);
	}
	free(result);
	return mpi_errno;
fn_fail:
	mpi_errno = -1;
	goto fn_exit;
}
/*
 * Return non-zero when ranks `me` and `you` fall into the same group of
 * `ppn` consecutive ranks.  ppn must not be zero.
 */
static inline int on_same_node(int ppn, int me, int you) {
	int my_node = me / ppn;
	int your_node = you / ppn;

	return my_node == your_node;
}
/*
 * Helper to find the server's PSM2 endpoint identifier (epid).
 *
 * Polls once per second for the file "psm2-demo-server-epid-<rank>" in
 * the current directory and parses the hexadecimal epid stored in it.
 * A file that exists but does not hold a parsable value yet is retried
 * instead of being reported as epid 0.  Blocks until a value is read.
 */
psm2_epid_t find_server(int rank) {
	unsigned long parsed = 0;	/* matches the %lx conversion exactly */
	psm2_epid_t server_epid;
	char fn[256];
	int found = 0;

	printf("%s: enter\n", __FUNCTION__); fflush(stdout);
	snprintf(fn, sizeof(fn), "psm2-demo-server-epid-%d", rank);
	printf("PSM2 client waiting for epid mapping file to appear...\n"); fflush(stdout);

	while (!found) {
		FILE *fp;

		sleep(1);
		fp = fopen(fn, "r");
		if (!fp) {
			continue;
		}
		found = (fscanf(fp, "%lx", &parsed) == 1);
		fclose(fp);
	}

	server_epid = parsed;
	printf("PSM2 client found server epid = 0x%lx\n", parsed);
	return server_epid;
}
/*
 * Publish an endpoint ID in the file "psm2-demo-server-epid-<rank>" in
 * the current directory.
 *
 * rank:   rank used to build the file name
 * myepid: endpoint ID, written as "0x<hex>"
 *
 * If the file cannot be opened the error is reported through die() and
 * nothing is written.
 */
void write_epid_to_file(int rank, psm2_epid_t myepid) {
	FILE *fp;
	char fn[256];

	printf("%s: enter\n", __FUNCTION__);
	snprintf(fn, sizeof(fn), "psm2-demo-server-epid-%d", rank);
	fp = fopen(fn, "w");
	if (!fp) {
		/* Save errno: the fprintf() below is allowed to change it. */
		int saved_errno = errno;

		fprintf(stderr,
			"Exiting, couldn't write server's epid mapping file: ");
		die(strerror(saved_errno), saved_errno);
		/* Never reach fprintf()/fclose() with a NULL stream. */
		return;
	}
	fprintf(fp, "0x%lx", myepid);
	fclose(fp);
	printf("PSM2 server wrote epid = 0x%lx to file.\n", myepid);
}
/* PSM2 state shared by the helper functions in this file. */
psm2_uuid_t uuid; /* key passed to psm2_ep_open(); zeroed in my_psm2_init() */
psm2_ep_t myep; /* local endpoint handle filled in by psm2_ep_open() */
psm2_epid_t myepid; /* local endpoint ID filled in by psm2_ep_open() */
psm2_epid_t server_epid; /* server endpoint ID as returned by find_server() */
/* Argument arrays for psm2_ep_connect(); only slot 0 is populated. */
psm2_epid_t epid_array[CONNECT_ARRAY_SIZE];
int epid_array_mask[CONNECT_ARRAY_SIZE];
psm2_error_t epid_connect_errors[CONNECT_ARRAY_SIZE];
psm2_epaddr_t epaddr_array[CONNECT_ARRAY_SIZE];
/*
 * Initialize the PSM2 library and open the local endpoint, filling in
 * the globals myep and myepid.
 *
 * my_rank, server_rank: only printed for tracing.
 *
 * Returns 0 on success and -1 as soon as one PSM2 call fails, so that a
 * failed step is never followed by calls that depend on it.
 */
int my_psm2_init(int my_rank, int server_rank) {
	struct psm2_ep_open_opts o;
	int rc;
	int ver_major = PSM2_VERNO_MAJOR;
	int ver_minor = PSM2_VERNO_MINOR;

	printf("%s: my_rank=%d,server_rank=%d\n", __FUNCTION__, my_rank, server_rank); fflush(stdout);
	memset(uuid, 0, sizeof(psm2_uuid_t)); /* Use a UUID of zero */

	/* Try to initialize PSM2 with the requested library version.
	 * In this example, given the use of the PSM2_VERNO_MAJOR and MINOR
	 * as defined in the PSM2 headers, ensure that we are linking with
	 * the same version of PSM2 as we compiled against. */
	if ((rc = psm2_init(&ver_major, &ver_minor)) != PSM2_OK) {
		die("couldn't init", rc);
		return -1;
	}
	printf("PSM2 init done.\n");

	/* Setup the endpoint options struct */
	if ((rc = psm2_ep_open_opts_get_defaults(&o)) != PSM2_OK) {
		die("couldn't set default opts", rc);
		return -1;
	}
	printf("PSM2 opts_get_defaults done.\n");

	/* Attempt to open a PSM2 endpoint. This allocates hardware resources. */
	if ((rc = psm2_ep_open(uuid, &o, &myep, &myepid)) != PSM2_OK) {
		die("couldn't psm2_ep_open()", rc);
		return -1;
	}
	printf("PSM2 endpoint open done.\n");
	return 0;
}
/*
 * Exchange endpoint IDs through the file system and, on the client side,
 * connect to the server endpoint.
 *
 * The caller acts as server when my_rank == server_rank: it publishes
 * its epid and returns.  Any other rank looks up the server's epid and
 * issues a single connect request for it.
 *
 * Returns 0 on success, -1 when the connect call or the per-endpoint
 * connection status reports an error.
 */
int my_psm2_connect(int my_rank, int server_rank) {
	const int is_server = (my_rank == server_rank);
	int rc;

	printf("%s: my_rank=%d,server_rank=%d\n", __FUNCTION__, my_rank, server_rank); fflush(stdout);

	if (!is_server) {
		server_epid = find_server(server_rank);
	} else {
		write_epid_to_file(my_rank, myepid);
	}
	printf("%s: epid exchange done\n", __FUNCTION__); fflush(stdout);

	if (is_server) {
		/* A connection does not have to be established to receive
		 * messages, so the server is done here. */
		printf("PSM2 server up.\n");
		return 0;
	}

	/* Only the first slot of the request arrays is used. */
	memset(epid_array_mask, 0, sizeof(int) * CONNECT_ARRAY_SIZE);
	epid_array[0] = server_epid;
	epid_array_mask[0] = 1;

	/* The connect call can return OK even if the requested epid is not
	 * responding; the per-request state is in the errors array. */
	rc = psm2_ep_connect(myep,
			     CONNECT_ARRAY_SIZE,
			     epid_array,
			     epid_array_mask,
			     epid_connect_errors,
			     epaddr_array,
			     0 /* no timeout */);
	if (rc != PSM2_OK) {
		die("couldn't ep_connect", rc);
		return -1;
	}
	printf("PSM2 connect request processed.\n");

	/* Check whether the connection to the server is ready. */
	if (epid_connect_errors[0] != PSM2_OK) {
		die("couldn't connect to server", epid_connect_errors[0]);
		return -1;
	}
	printf("PSM2 client-server connection established.\n");
	return 0;
}
/* Message buffer used for both the receive and the send below. */
char msgbuf[BUFFER_LENGTH];
/*
 * Create a message queue on the global endpoint, exchange one message,
 * then shut PSM2 down.
 *
 * rank:     rank of the calling process
 * sender:   not used by this function
 * receiver: the caller takes the receiving ("server") role when
 *           rank == receiver, the sending role otherwise
 *
 * Receiver: posts a receive for tag 0xABCD, blocks until it completes,
 * prints the message and removes its epid mapping file.
 * Sender: formats a greeting into msgbuf and sends it with a blocking
 * send.
 * Both roles then finalize the MQ, close the endpoint and finalize PSM2.
 *
 * Always returns 0.  die() only prints, and none of the die() calls is
 * followed by a return, so execution continues after a failed PSM2 call.
 */
int my_psm2_sendrecv(int rank, int sender, int receiver) {
int is_server = (rank == receiver) ? 1 : 0;
int rc;
psm2_mq_t q;
psm2_mq_req_t req_mq;
//char msgbuf[BUFFER_LENGTH];
/* Diagnostic only: print the stack pointer next to the buffer address. */
register long rsp asm ("rsp");
printf("rsp=%lx.msgbuf=%p\n", rsp, msgbuf); fflush(stdout);
memset(msgbuf, 0, BUFFER_LENGTH);
/* Setup our PSM2 message queue */
if ((rc = psm2_mq_init(myep, PSM2_MQ_ORDERMASK_NONE, NULL, 0, &q))
!= PSM2_OK) {
die("couldn't initialize PSM2 MQ", rc);
}
printf("PSM2 MQ init done.\n");
if (is_server) {
psm2_mq_tag_t t = {0xABCD};
psm2_mq_tag_t tm = {-1};
/* Post the receive request */
if ((rc = psm2_mq_irecv2(q, PSM2_MQ_ANY_ADDR,
&t, /* message tag */
&tm, /* message tag mask */
0, /* no flags */
msgbuf, BUFFER_LENGTH,
NULL, /* no context to add */
&req_mq /* track irecv status */
)) != PSM2_OK) {
die("couldn't post psm2_mq_irecv()", rc);
}
printf("PSM2 MQ irecv() posted\n");
/* Wait until the message arrives */
if ((rc = psm2_mq_wait(&req_mq, NULL)) != PSM2_OK) {
die("couldn't wait for the irecv", rc);
}
printf("PSM2 MQ wait() done.\n");
printf("Message from client:\n");
printf("%s", msgbuf);
/* Remove the epid mapping file of this rank (the inner test is always
 * true inside this branch). */
if (is_server) {
char fn[256];
sprintf(fn, "psm2-demo-server-epid-%d", rank);
unlink(fn);
}
} else {
/* Say hello */
snprintf(msgbuf, BUFFER_LENGTH,
"Hello world from epid=0x%lx, pid=%d.\n",
myepid, getpid());
psm2_mq_tag_t t = {0xABCD};
if ((rc = psm2_mq_send2(q,
epaddr_array[0], /* destination epaddr */
PSM2_MQ_FLAG_SENDSYNC, /* send flags */
&t, /* tag */
msgbuf, BUFFER_LENGTH
)) != PSM2_OK) {
die("couldn't post psm2_mq_isend", rc);
}
printf("PSM2 MQ send() done.\n");
}
/* Close down the MQ */
if ((rc = psm2_mq_finalize(q)) != PSM2_OK) {
die("couldn't psm2_mq_finalize()", rc);
}
printf("PSM2 MQ finalized.\n");
/* Close our ep, releasing all hardware resources.
 * Try to close all connections properly */
if ((rc = psm2_ep_close(myep, PSM2_EP_CLOSE_GRACEFUL,
0 /* no timeout */)) != PSM2_OK) {
die("couldn't psm2_ep_close()", rc);
}
printf("PSM2 ep closed.\n");
/* Release all local PSM2 resources */
if ((rc = psm2_finalize()) != PSM2_OK) {
die("couldn't psm2_finalize()", rc);
}
printf("PSM2 shut down, exiting.\n");
return 0;
}
/* Long-option table for getopt_long(): "--ppn <n>" is reported as 'P'. */
static struct option options[] = {
    { "ppn", required_argument, NULL, 'P' },
    /* all-zero entry terminates the table */
    { .name = NULL, .has_arg = 0, .flag = NULL, .val = 0 },
};
/* State shared between main() and the helper thread running progress_fn(). */
struct thr_arg {
pthread_barrier_t bar; /* barrier on which main() and progress_fn() rendezvous */
pthread_t pthread; /* thread handle filled in by pthread_create() in main() */
int rank; /* rank of this process, read from PMI_RANK in main() */
int ppn; /* value of the --ppn option */
int nproc; /* set to 2 in main(); also used as the barrier count */
int server_rank; /* computed in main() as ppn + (rank % ppn) */
};
/* Single file-scope instance; main() passes its address to pthread_create(). */
struct thr_arg thr_arg;
/*
 * Start routine of the helper thread created by main().
 *
 * Issues syscall 732 and reports the outcome (a return of -1 is printed as
 * "running on Linux OK", anything else as "running on McKernel NG"), then
 * waits twice on the barrier stored in the struct thr_arg passed as `arg`.
 * Always returns NULL.
 */
void *progress_fn(void *arg) {
    struct thr_arg *ctx = (struct thr_arg *)arg;
    int probe = syscall(732);

    if (probe != -1) {
        fprintf(stdout, "CT09100 progress_fn running on McKernel NG (%d)\n", probe);
    } else {
        fprintf(stdout, "CT09100 progress_fn running on Linux OK\n");
    }

    printf("progress,enter\n");
    /* First wait pairs with main() before its PSM2 work, second with the one after it. */
    pthread_barrier_wait(&ctx->bar);
    pthread_barrier_wait(&ctx->bar);
    printf("progress,exit\n");

    return NULL;
}
/*
 * Entry point: parse --ppn, read the rank from PMI_RANK, start the helper
 * thread running progress_fn(), then run the PSM2 init/connect/sendrecv
 * sequence between two barrier rendezvous with that thread.
 *
 * Exits with status 1 on a usage error, a missing PMI_RANK, or a failure to
 * set up the barrier or the thread; returns 0 otherwise.
 */
int main(int argc, char **argv) {
    int rc;
    int nproc;
    int ppn = -1;
    int my_rank = -1;
    int i;
    int opt;
    pthread_barrierattr_t barrierattr;

    fwq_init();

    while ((opt = getopt_long(argc, argv, "+P:", options, NULL)) != -1) {
        switch (opt) {
        case 'P':
            ppn = atoi(optarg);
            break;
        default: /* '?' */
            printf("unknown option %c\n", optopt);
            exit(1);
        }
    }

    /* ppn is used as a divisor below, so zero (and nonsensical negatives) must be rejected. */
    if (ppn <= 0) {
        printf("specify ppn with --ppn");
        exit(1);
    }

    char *rank_str = getenv("PMI_RANK");
    if (!rank_str) {
        printf("getenv failed\n");
        exit(1);
    }
    my_rank = atoi(rank_str);
    printf("my_rank=%d\n", my_rank); fflush(stdout);

    /* Barrier participants: this thread and the helper thread. */
    nproc = 2;
    if (my_rank == 0) {
        /* syscall() returns long; convert explicitly to match %d. */
        printf("tid=%d,pid=%d,nproc=%d\n", (int)syscall(__NR_gettid), (int)getpid(), nproc);
        printf("nsec=%ld, nspw=%f\n", nsec, nspw);
    }

    /* Spawn a thread */
    thr_arg.rank = my_rank;
    thr_arg.ppn = ppn;
    thr_arg.nproc = nproc;
    thr_arg.server_rank = ppn + (my_rank % ppn);

    pthread_barrierattr_init(&barrierattr);
    rc = pthread_barrier_init(&thr_arg.bar, &barrierattr, nproc);
    if (rc) {
        fprintf(stdout, "pthread_barrier_init: %d\n", rc);
        exit(1);
    }

    /* Unless DISABLE_UTI is set to a non-zero value, issue syscall 731 before creating the thread. */
    char *uti_str = getenv("DISABLE_UTI");
    int uti_val = uti_str ? atoi(uti_str) : 0;
    if (!uti_val) {
        rc = syscall(731, 1, NULL);
        if (rc) {
            fprintf(stdout, "CT09003 INFO: uti not available (rc=%d)\n", rc);
        } else {
            fprintf(stdout, "CT09003 INFO: uti available\n");
        }
    } else {
        fprintf(stdout, "CT09003 INFO: uti disabled\n");
    }

    rc = pthread_create(&thr_arg.pthread, NULL, progress_fn, &thr_arg);
    if (rc) {
        fprintf(stdout, "pthread_create: %d\n", rc);
        exit(1);
    }

    /* First rendezvous: the helper thread has started. */
    pthread_barrier_wait(&thr_arg.bar);

    my_psm2_init(thr_arg.rank, thr_arg.server_rank);
    my_psm2_connect(thr_arg.rank, thr_arg.server_rank);

    /* Exchange with every rank on another node; the lower rank is passed first. */
    for (i = 0; i < thr_arg.nproc; i++) {
        if (!on_same_node(thr_arg.ppn, thr_arg.rank, i)) {
            if (thr_arg.rank < i) {
                my_psm2_sendrecv(thr_arg.rank, thr_arg.rank, i);
            } else {
                my_psm2_sendrecv(thr_arg.rank, i, thr_arg.rank);
            }
        }
    }

    /* Second rendezvous releases the helper thread so it can exit. */
    pthread_barrier_wait(&thr_arg.bar);
    pthread_join(thr_arg.pthread, NULL);

    return 0;
}

220
test/uti/mpi/011.c Executable file
View File

@@ -0,0 +1,220 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <mpi.h>
#include <unistd.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
#include <sched.h>
/* Uncomment to turn dprintf() calls into real printf() calls. */
//#define DEBUG
#ifdef DEBUG
#define dprintf printf
#else
/* Without DEBUG, "dprintf(...);" expands to an empty block followed by a
 * parenthesized expression statement, so nothing is printed. */
#define dprintf {}
#endif
#define SZENTRY_DEFAULT (65536) /* Size of one slot */
#define NENTRY_DEFAULT 10000 /* Number of slots */
/* Nanoseconds elapsed from `start` to `end` (both struct timespec values).
 * Arguments are parenthesized so expressions such as *ptr expand correctly. */
#define DIFFNSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000000UL + ((end).tv_nsec - (start).tv_nsec))
/*
 * Print which CPU the calling thread last ran on, from two sources: the
 * 39th space-separated field of /proc/<pid>/task/<tid>/stat and
 * sched_getcpu(). The PMI_RANK environment variable is printed alongside
 * (-1 when it is not set).
 *
 * Returns 0 on success, -1 when the stat file cannot be opened or read,
 * the buffer cannot be allocated, the stat line has too few fields, or
 * sched_getcpu() fails.
 */
static int print_cpu_last_executed_on() {
    const size_t bufsz = 65536;
    const int cpu_field = 39;
    char fn[256];
    char *result = NULL; /* NULL so the shared cleanup path can always free() it */
    char *next_delim;
    char *field = NULL;
    const char *rank_str;
    pid_t tid = syscall(SYS_gettid);
    int fd = -1;
    size_t offset = 0;
    int mpi_errno = 0;
    int cpu;
    int i;

    snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", (int)getpid(), (int)tid);

    fd = open(fn, O_RDONLY);
    if (fd == -1) {
        printf("open() failed\n");
        goto fn_fail;
    }

    result = malloc(bufsz);
    if (result == NULL) {
        printf("malloc() failed");
        goto fn_fail;
    }

    /* Read until EOF, never past the buffer, keeping one byte for the terminator. */
    while (offset < bufsz - 1) {
        ssize_t amount = read(fd, result + offset, bufsz - 1 - offset);
        if (amount == -1) {
            printf("read() failed");
            goto fn_fail;
        }
        if (amount == 0) {
            break;
        }
        offset += (size_t)amount;
    }
    result[offset] = '\0'; /* strsep() below needs a NUL-terminated string */

    /* Walk to the 39th field.
     * NOTE(review): a command name containing spaces would shift the field
     * positions - confirm this cannot happen for the test binaries. */
    next_delim = result;
    for (i = 0; i < cpu_field; i++) {
        field = strsep(&next_delim, " ");
    }
    if (field == NULL) {
        printf("stat has fewer than %d fields\n", cpu_field);
        goto fn_fail;
    }

    cpu = sched_getcpu();
    if (cpu == -1) {
        printf("sched_getcpu() failed\n");
        goto fn_fail;
    }

    rank_str = getenv("PMI_RANK");
    printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n",
           rank_str ? atoi(rank_str) : -1, atoi(field), cpu, (int)tid);
    fflush(stdout);

fn_exit:
    if (fd != -1) {
        close(fd);
    }
    free(result);
    return mpi_errno;
fn_fail:
    mpi_errno = -1;
    goto fn_exit;
}
/*
 * One round of non-blocking traffic over `nentry` slots of `szentry` bytes.
 *
 * Rank 1 posts an MPI_Isend() per slot to `dest` and waits for all of them.
 * Every other rank posts an MPI_Irecv() per slot from `src`, sleeps `usec`
 * microseconds, and only then waits. When nentry > 10 a progress letter
 * ("s" or "r") is printed about every tenth slot; "w" / "W" marks the end
 * of the wait on the sending / receiving side.
 */
void sendrecv(int rank, int nentry, char **sendv, char **recvv, int szentry, int src, int dest, MPI_Request* reqs, MPI_Status* status, double usec) {
    const int is_sender = (rank == 1);
    int slot;

    for (slot = 0; slot < nentry; slot++) {
        if (is_sender) {
            MPI_Isend(sendv[slot], szentry, MPI_CHAR, dest, 0, MPI_COMM_WORLD, &reqs[slot]);
        } else {
            MPI_Irecv(recvv[slot], szentry, MPI_CHAR, src, 0, MPI_COMM_WORLD, &reqs[slot]);
        }

        if (nentry > 10 && slot % (nentry / 10) == 0) {
            if (is_sender) {
                printf("s");
            } else {
                printf("r");
            }
            fflush(stdout);
        }
    }

    if (!is_sender) {
        usleep(usec);
    }
    MPI_Waitall(nentry, reqs, status);

    if (is_sender) {
        printf("w\n");
    } else {
        printf("W\n");
    }
    fflush(stdout);
}
/*
 * Entry point of the Isend/Irecv test.
 *
 * Usage: <prog> [szentry nentry]. With exactly two arguments they give the
 * slot size and slot count; otherwise SZENTRY_DEFAULT / NENTRY_DEFAULT are
 * used. Allocates send and receive slots with anonymous mmap(), then times
 * two sendrecv() rounds: the first without delay, the second with the
 * receivers sleeping for the duration measured in the first round.
 *
 * Returns 0 on success and 1 when an allocation fails.
 */
int main(int argc, char **argv) {
    int my_rank = -1, size = -1;
    int i;
    int ret = 0;
    char **sendv, **recvv;
    MPI_Status* status;
    MPI_Request* reqs;
    long szentry;
    long nentry;
    int src, dest;
    struct timespec start, end;
    double diffusec;
    int actual;

    if (argc == 3) {
        szentry = atoi(argv[1]);
        nentry = atoi(argv[2]);
    } else {
        szentry = SZENTRY_DEFAULT;
        nentry = NENTRY_DEFAULT;
    }
    printf("szentry=%ld,nentry=%ld\n", szentry, nentry);

    status = (MPI_Status*)malloc(sizeof(MPI_Status) * nentry);
    reqs = (MPI_Request*)malloc(sizeof(MPI_Request) * nentry);
    if (!status || !reqs) {
        /* MPI is not initialized yet, so bail out without MPI_Finalize(). */
        printf("malloc failed\n");
        return 1;
    }

    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
    printf("Thread support level is %d\n", actual);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Ring neighbours: receive from the previous rank, send to the next. */
    src = (size + my_rank - 1) % size;
    dest = (my_rank + 1) % size;
    printf("rank=%d, size=%d, src=%d, dest=%d\n", my_rank, size, src, dest);

    sendv = malloc(sizeof(char *) * nentry);
    if(!sendv) { printf("malloc failed"); goto fn_fail; }
    for (i = 0; i < nentry; i++) {
#if 0
        int fd;
        fd = open("./file", O_RDWR);
        if(fd == -1) { printf("open failed\n"); goto fn_fail; }
        sendv[i] = (char*)mmap(0, szentry, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
#else
        sendv[i] = (char*)mmap(0, szentry, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
#endif
        if(sendv[i] == MAP_FAILED) { printf("mmap failed"); goto fn_fail; }
        dprintf("[%d] sendv[%d]=%p\n", my_rank, i, sendv[i]);
        memset(sendv[i], 0xaa, szentry);
    }

    recvv = malloc(sizeof(char *) * nentry);
    if(!recvv) { printf("malloc failed"); goto fn_fail; }
    for (i = 0; i < nentry; i++) {
#if 0
        int fd;
        fd = open("./file", O_RDWR);
        if(fd == -1) { printf("open failed\n"); goto fn_fail; }
        recvv[i] = (char*)mmap(0, szentry, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
#else
        recvv[i] = (char*)mmap(0, szentry, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
#endif
        if(recvv[i] == MAP_FAILED) { printf("mmap failed"); goto fn_fail; }
        dprintf("[%d] recvv[%d]=%p\n", my_rank, i, recvv[i]);
        memset(recvv[i], 0, szentry);
    }
    printf("after memset\n");

    print_cpu_last_executed_on();

    /* NOTE(review): omp_get_num_threads() is used as the loop bound here;
     * confirm this prints one line per OpenMP thread as intended. */
#pragma omp parallel for
    for (i = 0; i < omp_get_num_threads(); i++) {
        /* syscall() returns long; convert explicitly to match %d. */
        printf("thread_num=%d,tid=%d\n", i, (int)syscall(SYS_gettid));
    }

    for (i = 0; i < 1; i++) {
        /* Round 1: no artificial delay; rank 0 measures the elapsed time. */
        MPI_Barrier(MPI_COMM_WORLD);
        if(my_rank == 0) {
            clock_gettime(CLOCK_REALTIME, &start);
        }
        sendrecv(my_rank, nentry, sendv, recvv, szentry, src, dest, reqs, status, 0);
        MPI_Barrier(MPI_COMM_WORLD);
        if(my_rank == 0) {
            clock_gettime(CLOCK_REALTIME, &end);
            diffusec = DIFFNSEC(end, start) / (double)1000;
            printf("%4.4f sec\n", DIFFNSEC(end, start) / (double)1000000000); fflush(stdout);
        }

        /* Round 2: receivers sleep for diffusec microseconds before waiting. */
        MPI_Barrier(MPI_COMM_WORLD);
        if(my_rank == 0) {
            clock_gettime(CLOCK_REALTIME, &start);
        }
        sendrecv(my_rank, nentry, sendv, recvv, szentry, src, dest, reqs, status, diffusec);
        MPI_Barrier(MPI_COMM_WORLD);
        if(my_rank == 0) {
            clock_gettime(CLOCK_REALTIME, &end);
            printf("%4.4f sec\n", DIFFNSEC(end, start) / (double)1000000000); fflush(stdout);
        }
    }

fn_exit:
    MPI_Finalize();
    return ret;
fn_fail:
    /* Report the failure through the exit status instead of returning 0. */
    ret = 1;
    goto fn_exit;
}

338
test/uti/mpi/012.c Executable file
View File

@@ -0,0 +1,338 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <mpi.h>
#include <unistd.h>
#include <getopt.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
#include <sched.h>
/* Uncomment to turn dprintf() calls into real printf() calls. */
//#define DEBUG
#ifdef DEBUG
#define dprintf printf
#else
/* Without DEBUG, "dprintf(...);" expands to an empty block followed by a
 * parenthesized expression statement, so nothing is printed. */
#define dprintf {}
#endif
/* Nanoseconds elapsed from `start` to `end` (both struct timespec values).
 * Arguments are parenthesized so expressions such as *ptr expand correctly. */
#define DIFFNSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000000UL + ((end).tv_nsec - (start).tv_nsec))
/* Open / close a passive-target access epoch on every rank of the window. */
#define BEGIN_EPOCH(win) do { MPI_Win_lock_all(0, win); } while(0)
#define END_EPOCH(win) do { MPI_Win_unlock_all(win); } while(0)
/* Flush all outstanding operations on the window via MPI_Win_flush_local_all(). */
#define FLUSH(win) do { MPI_Win_flush_local_all(win); } while(0)
/*
 * One fixed unit of busy work: an x86-64 inline-assembly loop that counts
 * %rcx up from 0 and repeats while the counter is <= 99. It has no inputs
 * or outputs and declares rcx and the condition codes as clobbered.
 */
static inline void fixed_size_work() {
asm volatile(
"movq $0, %%rcx\n\t" /* counter = 0 */
"1:\t"
"addq $1, %%rcx\n\t" /* counter++ */
"cmpq $99, %%rcx\n\t"
"jle 1b\n\t" /* loop while counter <= 99 */
:
:
: "rcx", "cc");
}
/*
 * Run fixed_size_work() `n` times.
 * The counter has the same type as `n`, so counts above INT_MAX no longer
 * overflow a signed int.
 */
static inline void bulk_fsw(unsigned long n) {
    unsigned long j;

    for (j = 0; j < n; j++) {
        fixed_size_work();
    }
}
double nspw; /* nsec per work */
unsigned long nsec; /* thread CPU time taken by the N_INIT calibration units */

/*
 * Calibrate the busy-work loop: time N_INIT units of fixed_size_work() on
 * the calling thread's CPU clock and store the total in `nsec` and the
 * per-unit cost in `nspw`.
 */
void fwq_init() {
#define N_INIT 10000000
    struct timespec t_begin, t_end;

    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_begin);
    bulk_fsw(N_INIT);
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_end);

    nsec = DIFFNSEC(t_end, t_begin);
    nspw = nsec / (double)N_INIT;
}
#if 0
/* Disabled variant: derive the iteration count from the calibrated nspw. */
void fwq(long delay_nsec) {
    if (delay_nsec >= 0) {
        bulk_fsw(delay_nsec / nspw);
    }
}
#else /* For machines with large core-to-core performance variation (e.g. OFP) */
/*
 * Busy-wait until at least `delay_nsec` nanoseconds of this thread's CPU
 * time have elapsed. A negative delay returns immediately.
 */
void fwq(long delay_nsec) {
    struct timespec t_begin, t_now;

    if (delay_nsec < 0) {
        return;
    }

    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_begin);
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_now);
    while (DIFFNSEC(t_now, t_begin) < delay_nsec) {
        bulk_fsw(2); /* ~150 ns per iteration on OFP */
        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_now);
    }
}
#endif
/*
 * Print which CPU the calling thread last ran on, from two sources: the
 * 39th space-separated field of /proc/<pid>/task/<tid>/stat and
 * sched_getcpu(). The PMI_RANK environment variable is printed alongside
 * (-1 when it is not set).
 *
 * Returns 0 on success, -1 when the stat file cannot be opened or read,
 * the buffer cannot be allocated, the stat line has too few fields, or
 * sched_getcpu() fails.
 */
static int print_cpu_last_executed_on() {
    const size_t bufsz = 65536;
    const int cpu_field = 39;
    char fn[256];
    char *result = NULL; /* NULL so the shared cleanup path can always free() it */
    char *next_delim;
    char *field = NULL;
    const char *rank_str;
    pid_t tid = syscall(SYS_gettid);
    int fd = -1;
    size_t offset = 0;
    int mpi_errno = 0;
    int cpu;
    int i;

    snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", (int)getpid(), (int)tid);

    fd = open(fn, O_RDONLY);
    if (fd == -1) {
        printf("open() failed\n");
        goto fn_fail;
    }

    result = malloc(bufsz);
    if (result == NULL) {
        printf("malloc() failed");
        goto fn_fail;
    }

    /* Read until EOF, never past the buffer, keeping one byte for the terminator. */
    while (offset < bufsz - 1) {
        ssize_t amount = read(fd, result + offset, bufsz - 1 - offset);
        if (amount == -1) {
            printf("read() failed");
            goto fn_fail;
        }
        if (amount == 0) {
            break;
        }
        offset += (size_t)amount;
    }
    result[offset] = '\0'; /* strsep() below needs a NUL-terminated string */

    /* Walk to the 39th field.
     * NOTE(review): a command name containing spaces would shift the field
     * positions - confirm this cannot happen for the test binaries. */
    next_delim = result;
    for (i = 0; i < cpu_field; i++) {
        field = strsep(&next_delim, " ");
    }
    if (field == NULL) {
        printf("stat has fewer than %d fields\n", cpu_field);
        goto fn_fail;
    }

    cpu = sched_getcpu();
    if (cpu == -1) {
        printf("sched_getcpu() failed\n");
        goto fn_fail;
    }

    rank_str = getenv("PMI_RANK");
    printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n",
           rank_str ? atoi(rank_str) : -1, atoi(field), cpu, (int)tid);
    fflush(stdout);

fn_exit:
    if (fd != -1) {
        close(fd);
    }
    free(result);
    return mpi_errno;
fn_fail:
    mpi_errno = -1;
    goto fn_exit;
}
/*
 * Return non-zero when ranks `me` and `you` fall into the same group of
 * `ppn` consecutive ranks (same quotient when divided by ppn).
 * `ppn` must be non-zero.
 */
static inline int on_same_node(int ppn, int me, int you) {
    const int my_node = me / ppn;
    const int your_node = you / ppn;

    return my_node == your_node;
}
/*
 * get_acc-calc-flush_local
 *
 * For every rank that is not on the caller's node, issue one
 * MPI_Get_accumulate() (MPI_SUM) per double: element idx of `rbuf` is the
 * origin, element idx of `result` receives the fetched value, and idx is
 * also the target displacement, with idx = target * ndoubles + k.
 * Afterwards spin for `calc_nsec` nanoseconds and flush the window.
 * `wbuf` is not referenced here.
 */
void rma(int nproc, int ppn, int rank, double *wbuf, double *rbuf, double *result, int ndoubles, MPI_Win win, long calc_nsec) {
    int target, k;

    for (target = 0; target < nproc; target++) {
        if (on_same_node(ppn, rank, target)) {
            continue;
        }
        for (k = 0; k < ndoubles; k++) {
            const int idx = target * ndoubles + k;

            MPI_Get_accumulate(&rbuf[idx], 1, MPI_DOUBLE,
                               &result[idx], 1, MPI_DOUBLE,
                               target, idx, 1, MPI_DOUBLE,
                               MPI_SUM, win);
        }
    }

    fwq(calc_nsec);
    FLUSH(win);
}
/* Long-option table for getopt_long(): "--ppn <n>" is reported as 'P'. */
static struct option options[] = {
    { "ppn", required_argument, NULL, 'P' },
    /* all-zero entry terminates the table */
    { .name = NULL, .has_arg = 0, .flag = NULL, .val = 0 },
};
/*
 * Entry point of the MPI_Get_accumulate overlap benchmark.
 *
 * Options: -d <k> sets ndoubles to 2^k, --ppn/-P <n> gives the number of
 * processes per node. Three phases are timed on the thread CPU clock and
 * reduced with MPI_MAX: flush only, get_acc + flush ("pure"), and
 * get_acc + calc + flush ("overall", calc lasting t_pure - t_flush).
 * Rank 0 prints the timings and the resulting overlap percentage.
 *
 * Returns 0 on success and 1 on an MPI or allocation failure; usage
 * errors exit with status 1 before MPI is initialized.
 */
int main(int argc, char **argv) {
    int rc;
    int ret = 0;
    int actual;
    int ppn = -1;
    int nproc;
    int ndoubles = -1;
    int my_rank = -1;
    int i, j;
    int win_created = 0;
    double *wbuf = NULL, *rbuf = NULL, *result = NULL;
    MPI_Win win;
    struct timespec start, end;
    long t_flush_l, t_pure_l, t_overall_l;
    long t_flush, t_pure, t_overall;
    int opt;

    fwq_init();

    while ((opt = getopt_long(argc, argv, "+d:P:", options, NULL)) != -1) {
        switch (opt) {
        case 'd': {
            /* Bound the exponent so the shift is defined and fits in an int. */
            int shift = atoi(optarg);
            if (shift < 0 || shift > 30) {
                printf("-d must be between 0 and 30\n");
                exit(1);
            }
            ndoubles = 1 << shift;
            break;
        }
        case 'P':
            ppn = atoi(optarg);
            break;
        default: /* '?' */
            printf("unknown option %c\n", optopt);
            exit(1);
        }
    }

    /* ppn is a divisor in on_same_node(), so zero and negatives are rejected too. */
    if (ndoubles == -1 || ppn <= 0) {
        printf("specify ndoubles with -d and ppn with --ppn");
        exit(1);
    }

    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
    if (actual != MPI_THREAD_MULTIPLE) {
        printf("ERROR: MPI_THREAD_MULTIPLE not available (level was set to %d)\n", actual);
        goto fn_fail;
    }

    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

    if (my_rank == 0) {
        printf("ndoubles=%d,nproc=%d\n", ndoubles, nproc);
        printf("nsec=%lu, nspw=%f\n", nsec, nspw);
    }

    /* accumulate-to buffer */
    wbuf = malloc(sizeof(double) * ndoubles * nproc);
    if(!wbuf) { printf("malloc failed"); goto fn_fail; }
    memset(wbuf, 0, sizeof(double) * ndoubles * nproc);

    /* read-from buffer */
    rbuf = malloc(sizeof(double) * ndoubles * nproc);
    if(!rbuf) { printf("malloc failed"); goto fn_fail; }
    memset(rbuf, 0, sizeof(double) * ndoubles * nproc);

    /* fetch-to buffer */
    result = malloc(sizeof(double) * ndoubles * nproc);
    if(!result) { printf("malloc failed"); goto fn_fail; }
    memset(result, 0, sizeof(double) * ndoubles * nproc);

    /* Expose accumulate-to buffer*/
    rc = MPI_Win_create(wbuf, sizeof(double) * ndoubles * nproc, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win);
    if (rc) {
        /* Without a window none of the measurements below can run. */
        printf("MPI_Win_create failed,rc=%d\n", rc);
        goto fn_fail;
    }
    win_created = 1;

    //print_cpu_last_executed_on();

    for (i = 0; i < nproc; i++) {
        for (j = 0; j < ndoubles; j++) {
            wbuf[i * ndoubles + j] = (i + 1) * 1000 + (j + 1);
            rbuf[i * ndoubles + j] = (i + 1) * 10000 + (j + 1);
            result[i * ndoubles + j] = (i + 1) * 100000 + (j + 1);
        }
    }

#if 0
    for (i = 0; i < nproc; i++) {
        for (j = 0; j < ndoubles; j++) {
            printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]);
            printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]);
            printf("result,proc=%d,j=%d,val=%f\n", i, j, result[i * ndoubles + j]);
        }
    }
#endif

    /* Measure flush time */
    MPI_Barrier(MPI_COMM_WORLD);
#define NFENCE 10
    BEGIN_EPOCH(win);
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
    for (i = 0; i < NFENCE; i++) {
        FLUSH(win);
    }
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
    END_EPOCH(win);
    t_flush_l = DIFFNSEC(end, start) / NFENCE;
    //printf("t_flush (local): %ld usec\n", t_flush_l / 1000UL);
    MPI_Allreduce(&t_flush_l, &t_flush, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
    if (my_rank == 0) printf("t_flush (max): %ld usec\n", t_flush / 1000UL);

    /* Measure get_acc-flush time */
    MPI_Barrier(MPI_COMM_WORLD);
#define NPURE 10
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
    for (i = 0; i < NPURE; i++) {
        BEGIN_EPOCH(win);
        rma(nproc, ppn, my_rank, wbuf, rbuf, result, ndoubles, win, 0);
        END_EPOCH(win);
    }
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
    t_pure_l = DIFFNSEC(end, start) / NPURE;
    //printf("t_pure (local): %ld usec\n", t_pure_l / 1000UL);
    MPI_Allreduce(&t_pure_l, &t_pure, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
    if (my_rank == 0) printf("t_pure (max): %ld usec\n", t_pure / 1000UL);

#if 0
    for (i = 0; i < nproc; i++) {
        for (j = 0; j < ndoubles; j++) {
            printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]);
            printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]);
            printf("result,proc=%d,j=%d,val=%f\n", i, j, result[i * ndoubles + j]);
        }
    }
#endif

    /* Measure get_acc-calc-flush time */
    MPI_Barrier(MPI_COMM_WORLD);
#define NOVERALL 10
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
    for (i = 0; i < NOVERALL; i++) {
        BEGIN_EPOCH(win);
        rma(nproc, ppn, my_rank, wbuf, rbuf, result, ndoubles, win, t_pure - t_flush);
        END_EPOCH(win);
    }
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
    t_overall_l = DIFFNSEC(end, start) / NOVERALL;
    //printf("t_overall (local): %ld usec\n", t_overall_l / 1000UL);
    MPI_Allreduce(&t_overall_l, &t_overall, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
    if (my_rank == 0) printf("t_overall (max): %ld usec\n", t_overall / 1000UL);

    if (my_rank == 0) {
        /* Time hidden by overlapping calc with communication, relative to t_pure. */
        long t_abs = (t_pure * 2) - t_overall;
        printf("overlap: %.2f %%\n", (t_abs * 100) / (double)t_pure);
    }

fn_exit:
    /* The window is only freed on the path where every rank created it. */
    if (win_created) {
        MPI_Win_free(&win);
    }
    free(result);
    free(rbuf);
    free(wbuf);
    MPI_Finalize();
    return ret;
fn_fail:
    ret = 1;
    goto fn_exit;
}

174
test/uti/mpi/012.sh Executable file
View File

@@ -0,0 +1,174 @@
#!/usr/bin/bash
#!/usr/bin/bash -x
# Driver for the 012 test: optionally stops/reboots McKernel, writes ./job.sh
# that launches the test with mpiexec.hydra, and optionally builds and runs it.
# Site-specific installation paths.
MYHOME=/work/gg10/e29005
UTI_MPI_TOP=${MYHOME}/project/os/mckernel/test/uti/mpi
MCK=${MYHOME}/project/os/install
# DISABLE_UTI is only exported again when -d is given.
unset DISABLE_UTI
# Defaults for the command-line flags parsed below.
stop=0
reboot=0
go=0
async=0
mck=0
nnodes=2
# Number of the last node; the host list is built as c<N> names ending here.
LASTNODE=8200
# Passed to the test program as "-d".
ndoubles=10 #12-15
omp_num_threads=1
ppn=1 #16
async_progress_pin=64,132,200,268,65,133,201,269,66,134,202,270,67,135,203,271
lpp=4 # logical-per-physical
ncpu_mt=256 # number of CPUs for main-thread
# Parse the command-line flags:
#   -s stop, -r reboot, -g build and run, -a <n> async progress setting,
#   -n <n> ndoubles, -m use McKernel, -d export DISABLE_UTI=1,
#   -N <n> node count, -P <n> processes per node, -o <n> OpenMP threads.
# Anything else (including -c and -l, which have no handler) is rejected.
while getopts srga:c:n:mdl:N:P:o: OPT; do
    case ${OPT} in
        s) stop=1 ;;
        r) reboot=1 ;;
        g) go=1 ;;
        a) async=$OPTARG ;;
        n) ndoubles=$OPTARG ;;
        m) mck=1 ;;
        d) export DISABLE_UTI=1 ;;
        N) nnodes=$OPTARG ;;
        P) ppn=$OPTARG ;;
        o) omp_num_threads=$OPTARG ;;
        *)
            echo "invalid option -${OPT}" >&2
            exit 1
            ;;
    esac
done
# Total process count and the comma-separated host list (the last $nnodes
# nodes up to c$LASTNODE).
nprocs=$((ppn * nnodes))
nodes=`echo $(seq -s ",c" $(($LASTNODE + 1 - $nnodes)) $LASTNODE) | sed 's/^/c/'`
echo nprocs=$nprocs nnodes=$nnodes ppn=$ppn nodes=$nodes

# Build the mcexec prefix used when running on McKernel (-m).
# mck_dir, uti_thread_rank and use_hfi are not set anywhere in this script, so
# they are treated as optional environment overrides with safe fallbacks:
#   mck_dir          defaults to $MCK
#   uti_thread_rank  adds --uti-thread-rank only when set
#   use_hfi          defaults to 0 (no --enable-hfi1)
if [ ${mck} -eq 1 ]; then
    mcexec="${mck_dir:-${MCK}}/bin/mcexec"
    nmcexecthr=$((omp_num_threads + 4))
    mcexecopt=
    if [ -n "${uti_thread_rank:-}" ]; then
        mcexecopt="--uti-thread-rank=${uti_thread_rank}"
    fi
    if [ "${use_hfi:-0}" -eq 1 ]; then
        mcexecopt="--enable-hfi1 $mcexecopt"
    fi
    mcexecopt="-n $ppn -t $nmcexecthr $mcexecopt"
else
    mcexec=
    mcexecopt=
fi
# Intel MPI pinning: off under McKernel, otherwise one pin domain per process.
# Let each domain have all logical cores and use KMP_AFFINITY=scatter if you
# want to use only physical cores.
i_mpi_pin_domain=
i_mpi_pin_order=
if [ ${mck} -eq 1 ]; then
    i_mpi_pin=off
else
    i_mpi_pin=on
    if [ $((omp_num_threads * lpp * ppn)) -gt $ncpu_mt ]; then
        # Use logical as well
        domain=$((ncpu_mt / ppn))
    else
        # Prefer physical but adjacent physicals share L1
        domain=$((omp_num_threads * lpp))
    fi
    i_mpi_pin_domain="export I_MPI_PIN_DOMAIN=$domain"
    i_mpi_pin_order="export I_MPI_PIN_ORDER=compact"
fi

# The async-progress pin list is exported only when async progress is
# requested and a list is configured.
if [ $async -eq 0 ] || [ -z "$async_progress_pin" ]; then
    i_mpi_async_progress_pin=
else
    i_mpi_async_progress_pin="export I_MPI_ASYNC_PROGRESS_PIN=$async_progress_pin"
fi
# -s: mount /work on all nodes, then stop and release McKernel when -m is given.
if [ ${stop} -eq 1 ]; then
PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo mount /work
if [ ${mck} -eq 1 ]; then
sudo ${MCK}/sbin/mcstop+release.sh
else
:
fi
fi
# -r: mount /work on all nodes, then (with -m) boot McKernel. The CPU and
# memory arguments differ depending on whether the hostname contains "ofp".
if [ ${reboot} -eq 1 ]; then
PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo mount /work
if [ ${mck} -eq 1 ]; then
if hostname | grep ofp &>/dev/null; then
sudo ${MCK}/sbin/mcreboot.sh -s -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
else
sudo ${MCK}/sbin/mcreboot.sh -s -c 1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 1-15:0+65-79:64+129-143:128+193-207:192+17-31:16+81-95:80+145-159:144+209-223:208+33-47:32+97-111:96+161-175:160+225-239:224+49-63:48+113-127:112+177-191:176+241-255:240 -m 12G@0,12G@1,12G@2,12G@3,3920M@4,3920M@5,3920M@6,3920M@7
fi
else
:
fi
fi
# Generate ./job.sh in the test directory. The heredoc is unquoted, so every
# $variable and the `pwd` substitution are expanded now, at generation time.
cd ${UTI_MPI_TOP}
(
cat <<EOF
#!/bin/sh
export I_MPI_HYDRA_BOOTSTRAP_EXEC=/usr/bin/ssh
export I_MPI_HYDRA_BOOTSTRAP=ssh
export OMP_NUM_THREADS=$omp_num_threads
#export OMP_STACKSIZE=64M
export KMP_BLOCKTIME=1
export PSM2_RCVTHREAD=0
export I_MPI_PIN=$i_mpi_pin
$i_mpi_pin_domain
$i_mpi_pin_order
export HFI_NO_CPUAFFINITY=1
export I_MPI_COLL_INTRANODE_SHM_THRESHOLD=4194304
export I_MPI_FABRICS=shm:tmi
export PSM2_RCVTHREAD=0
export I_MPI_TMI_PROVIDER=psm2
export I_MPI_FALLBACK=0
export PSM2_MQ_RNDV_HFI_WINDOW=4194304
export PSM2_MQ_EAGER_SDMA_SZ=65536
export PSM2_MQ_RNDV_HFI_THRESH=200000
export MCKERNEL_RLIMIT_STACK=32M,16G
export KMP_STACKSIZE=64m
export KMP_AFFINITY=granularity=thread,scatter
#export KMP_HW_SUBSET=64c,1t
export I_MPI_ASYNC_PROGRESS=$async
$i_mpi_async_progress_pin
#export I_MPI_STATS=native:20,ipm
export I_MPI_STATS=ipm
export I_MPI_DEBUG=4
#export I_MPI_HYDRA_DEBUG=on
mpiexec.hydra -l -n $nprocs -ppn $ppn -hosts $nodes $ilpopt $mcexec $mcexecopt `pwd`/012 --ppn $ppn -d $ndoubles
EOF
) > ./job.sh
chmod u+x ./job.sh
# -g: rebuild the test (with CC=mpiicc when not using McKernel) and run the job.
if [ ${go} -eq 1 ]; then
cd ${UTI_MPI_TOP}
if [ $mck -eq 1 ]; then
make clean && make 012
else
make clean && make CC=mpiicc 012
fi
./job.sh
fi

335
test/uti/mpi/013.c Executable file
View File

@@ -0,0 +1,335 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <mpi.h>
#include <unistd.h>
#include <getopt.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
#include <sched.h>
/* Uncomment to turn dprintf() calls into real printf() calls. */
//#define DEBUG
#ifdef DEBUG
#define dprintf printf
#else
/* Without DEBUG, "dprintf(...);" expands to an empty block followed by a
 * parenthesized expression statement, so nothing is printed. */
#define dprintf {}
#endif
/* Nanoseconds elapsed from `start` to `end` (both struct timespec values).
 * Arguments are parenthesized so expressions such as *ptr expand correctly. */
#define DIFFNSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000000UL + ((end).tv_nsec - (start).tv_nsec))
/* Open / close a passive-target access epoch on every rank of the window. */
#define BEGIN_EPOCH(win) do { MPI_Win_lock_all(0, win); } while(0)
#define END_EPOCH(win) do { MPI_Win_unlock_all(win); } while(0)
/*
 * One fixed unit of busy work: an x86-64 inline-assembly loop that counts
 * %rcx up from 0 and repeats while the counter is <= 99. It has no inputs
 * or outputs and declares rcx and the condition codes as clobbered.
 */
static inline void fixed_size_work() {
asm volatile(
"movq $0, %%rcx\n\t" /* counter = 0 */
"1:\t"
"addq $1, %%rcx\n\t" /* counter++ */
"cmpq $99, %%rcx\n\t"
"jle 1b\n\t" /* loop while counter <= 99 */
:
:
: "rcx", "cc");
}
/*
 * Run fixed_size_work() `n` times.
 * The counter has the same type as `n`, so counts above INT_MAX no longer
 * overflow a signed int.
 */
static inline void bulk_fsw(unsigned long n) {
    unsigned long j;

    for (j = 0; j < n; j++) {
        fixed_size_work();
    }
}
double nspw; /* nsec per work */
unsigned long nsec; /* thread CPU time taken by the N_INIT calibration units */

/*
 * Calibrate the busy-work loop: time N_INIT units of fixed_size_work() on
 * the calling thread's CPU clock and store the total in `nsec` and the
 * per-unit cost in `nspw`.
 */
void fwq_init() {
#define N_INIT 10000000
    struct timespec t_begin, t_end;

    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_begin);
    bulk_fsw(N_INIT);
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_end);

    nsec = DIFFNSEC(t_end, t_begin);
    nspw = nsec / (double)N_INIT;
}
#if 0
/* Disabled variant: derive the iteration count from the calibrated nspw. */
void fwq(long delay_nsec) {
    if (delay_nsec >= 0) {
        bulk_fsw(delay_nsec / nspw);
    }
}
#else /* For machines with large core-to-core performance variation (e.g. OFP) */
/*
 * Busy-wait until at least `delay_nsec` nanoseconds of this thread's CPU
 * time have elapsed. A negative delay returns immediately.
 */
void fwq(long delay_nsec) {
    struct timespec t_begin, t_now;

    if (delay_nsec < 0) {
        return;
    }

    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_begin);
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_now);
    while (DIFFNSEC(t_now, t_begin) < delay_nsec) {
        bulk_fsw(2); /* ~150 ns per iteration on OFP */
        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_now);
    }
}
#endif
/*
 * Print which CPU the calling thread last ran on, from two sources: the
 * 39th space-separated field of /proc/<pid>/task/<tid>/stat and
 * sched_getcpu(). The PMI_RANK environment variable is printed alongside
 * (-1 when it is not set).
 *
 * Returns 0 on success, -1 when the stat file cannot be opened or read,
 * the buffer cannot be allocated, the stat line has too few fields, or
 * sched_getcpu() fails.
 */
static int print_cpu_last_executed_on() {
    const size_t bufsz = 65536;
    const int cpu_field = 39;
    char fn[256];
    char *result = NULL; /* NULL so the shared cleanup path can always free() it */
    char *next_delim;
    char *field = NULL;
    const char *rank_str;
    pid_t tid = syscall(SYS_gettid);
    int fd = -1;
    size_t offset = 0;
    int mpi_errno = 0;
    int cpu;
    int i;

    snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", (int)getpid(), (int)tid);

    fd = open(fn, O_RDONLY);
    if (fd == -1) {
        printf("open() failed\n");
        goto fn_fail;
    }

    result = malloc(bufsz);
    if (result == NULL) {
        printf("malloc() failed");
        goto fn_fail;
    }

    /* Read until EOF, never past the buffer, keeping one byte for the terminator. */
    while (offset < bufsz - 1) {
        ssize_t amount = read(fd, result + offset, bufsz - 1 - offset);
        if (amount == -1) {
            printf("read() failed");
            goto fn_fail;
        }
        if (amount == 0) {
            break;
        }
        offset += (size_t)amount;
    }
    result[offset] = '\0'; /* strsep() below needs a NUL-terminated string */

    /* Walk to the 39th field.
     * NOTE(review): a command name containing spaces would shift the field
     * positions - confirm this cannot happen for the test binaries. */
    next_delim = result;
    for (i = 0; i < cpu_field; i++) {
        field = strsep(&next_delim, " ");
    }
    if (field == NULL) {
        printf("stat has fewer than %d fields\n", cpu_field);
        goto fn_fail;
    }

    cpu = sched_getcpu();
    if (cpu == -1) {
        printf("sched_getcpu() failed\n");
        goto fn_fail;
    }

    rank_str = getenv("PMI_RANK");
    printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n",
           rank_str ? atoi(rank_str) : -1, atoi(field), cpu, (int)tid);
    fflush(stdout);

fn_exit:
    if (fd != -1) {
        close(fd);
    }
    free(result);
    return mpi_errno;
fn_fail:
    mpi_errno = -1;
    goto fn_exit;
}
/*
 * Return non-zero when ranks `me` and `you` fall into the same group of
 * `ppn` consecutive ranks (same quotient when divided by ppn).
 * `ppn` must be non-zero.
 */
static inline int on_same_node(int ppn, int me, int you) {
    const int my_node = me / ppn;
    const int your_node = you / ppn;

    return my_node == your_node;
}
/*
 * accumulate-flush_local-calc
 *
 * For every rank that is not on the caller's node and for each of the
 * `ndoubles` elements: unless `flush_only` is set, MPI_Accumulate()
 * (MPI_SUM) element idx of `rbuf` into displacement idx of the target's
 * window, with idx = target * ndoubles + k; then call
 * MPI_Win_flush_local() for that target. Finally spin for `calc_nsec`
 * nanoseconds. `wbuf` is not referenced here.
 */
void rma(int nproc, int ppn, int rank, double *wbuf, double *rbuf, int ndoubles, MPI_Win win, long calc_nsec, int flush_only) {
    int target, k;

    for (target = 0; target < nproc; target++) {
        if (on_same_node(ppn, rank, target)) {
            continue;
        }
        for (k = 0; k < ndoubles; k++) {
            if (!flush_only) {
                const int idx = target * ndoubles + k;

                MPI_Accumulate(&rbuf[idx], 1, MPI_DOUBLE,
                               target, idx, 1, MPI_DOUBLE,
                               MPI_SUM, win);
            }
            MPI_Win_flush_local(target, win);
        }
    }

    fwq(calc_nsec);
}
/* Long-option table for getopt_long(): "--ppn <n>" is reported as 'P'. */
static struct option options[] = {
    { "ppn", required_argument, NULL, 'P' },
    /* all-zero entry terminates the table */
    { .name = NULL, .has_arg = 0, .flag = NULL, .val = 0 },
};
/*
 * Entry point of the MPI_Accumulate overlap benchmark.
 *
 * Options: -d <k> sets ndoubles to 2^k, --ppn/-P <n> gives the number of
 * processes per node. Three phases are timed on the thread CPU clock and
 * reduced with MPI_MAX: flush only, accumulate + flush ("pure"), and
 * accumulate + flush + calc ("overall", calc lasting t_pure - t_flush).
 * Rank 0 prints the timings and the resulting overlap percentage.
 *
 * Returns 0 on success and 1 on an MPI or allocation failure; usage
 * errors exit with status 1 before MPI is initialized.
 */
int main(int argc, char **argv) {
    int rc;
    int ret = 0;
    int actual;
    int ppn = -1;
    int nproc;
    int ndoubles = -1;
    int my_rank = -1;
    int i, j;
    int win_created = 0;
    double *wbuf = NULL, *rbuf = NULL;
    MPI_Win win;
    struct timespec start, end;
    long t_flush_l, t_pure_l, t_overall_l;
    long t_flush, t_pure, t_overall;
    int opt;

    fwq_init();

    while ((opt = getopt_long(argc, argv, "+d:P:", options, NULL)) != -1) {
        switch (opt) {
        case 'd': {
            /* Bound the exponent so the shift is defined and fits in an int. */
            int shift = atoi(optarg);
            if (shift < 0 || shift > 30) {
                printf("-d must be between 0 and 30\n");
                exit(1);
            }
            ndoubles = 1 << shift;
            break;
        }
        case 'P':
            ppn = atoi(optarg);
            break;
        default: /* '?' */
            printf("unknown option %c\n", optopt);
            exit(1);
        }
    }

    /* ppn is a divisor in on_same_node(), so zero and negatives are rejected too. */
    if (ndoubles == -1 || ppn <= 0) {
        printf("specify ndoubles with -d and ppn with --ppn");
        exit(1);
    }

    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
    if (actual != MPI_THREAD_MULTIPLE) {
        printf("ERROR: MPI_THREAD_MULTIPLE not available (level was set to %d)\n", actual);
        goto fn_fail;
    }

    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

    if (my_rank == 0) {
        printf("ndoubles=%d,nproc=%d\n", ndoubles, nproc);
        printf("nsec=%lu, nspw=%f\n", nsec, nspw);
    }

    /* accumulate-to buffer */
    wbuf = malloc(sizeof(double) * ndoubles * nproc);
    if(!wbuf) { printf("malloc failed"); goto fn_fail; }
    memset(wbuf, 0, sizeof(double) * ndoubles * nproc);

    /* read-from buffer */
    rbuf = malloc(sizeof(double) * ndoubles * nproc);
    if(!rbuf) { printf("malloc failed"); goto fn_fail; }
    memset(rbuf, 0, sizeof(double) * ndoubles * nproc);

    /* Expose accumulate-to buffer*/
    rc = MPI_Win_create(wbuf, sizeof(double) * ndoubles * nproc, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win);
    if (rc) {
        /* Without a window none of the measurements below can run. */
        printf("MPI_Win_create failed,rc=%d\n", rc);
        goto fn_fail;
    }
    win_created = 1;

    //print_cpu_last_executed_on();

    for (i = 0; i < nproc; i++) {
        for (j = 0; j < ndoubles; j++) {
            wbuf[i * ndoubles + j] = (i + 1) * 1000 + (j + 1);
            rbuf[i * ndoubles + j] = (i + 1) * 10000 + (j + 1);
        }
    }

#if 0
    for (i = 0; i < nproc; i++) {
        for (j = 0; j < ndoubles; j++) {
            printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]);
            printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]);
        }
    }
#endif

    /* Measure flush time */
    MPI_Barrier(MPI_COMM_WORLD);
#define NFENCE 10
    BEGIN_EPOCH(win);
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
    for (i = 0; i < NFENCE; i++) {
        rma(nproc, ppn, my_rank, wbuf, rbuf, ndoubles, win, 0, 1);
    }
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
    END_EPOCH(win);
    t_flush_l = DIFFNSEC(end, start) / NFENCE;
    //printf("t_flush (local): %ld usec\n", t_flush_l / 1000UL);
    MPI_Allreduce(&t_flush_l, &t_flush, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
    if (my_rank == 0) printf("t_flush (max): %ld usec\n", t_flush / 1000UL);

    /* Measure accumulate-flush time */
    MPI_Barrier(MPI_COMM_WORLD);
#define NPURE 10
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
    //MPI_Pcontrol(1, "rma");
    for (i = 0; i < NPURE; i++) {
        BEGIN_EPOCH(win);
        rma(nproc, ppn, my_rank, wbuf, rbuf, ndoubles, win, 0, 0);
        END_EPOCH(win);
    }
    //MPI_Pcontrol(-1, "rma");
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
    t_pure_l = DIFFNSEC(end, start) / NPURE;
    //printf("t_pure (local): %ld usec\n", t_pure_l / 1000UL);
    MPI_Allreduce(&t_pure_l, &t_pure, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
    if (my_rank == 0) printf("t_pure (max): %ld usec\n", t_pure / 1000UL);

#if 0
    for (i = 0; i < nproc; i++) {
        for (j = 0; j < ndoubles; j++) {
            printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]);
            printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]);
        }
    }
#endif

    /* Measure accumulate-flush-calc time */
    MPI_Barrier(MPI_COMM_WORLD);
#define NOVERALL 10
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
    //MPI_Pcontrol(1, "rma-calc");
    for (i = 0; i < NOVERALL; i++) {
        BEGIN_EPOCH(win);
        rma(nproc, ppn, my_rank, wbuf, rbuf, ndoubles, win, t_pure - t_flush, 0);
        END_EPOCH(win);
    }
    //MPI_Pcontrol(-1, "rma-calc");
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
    t_overall_l = DIFFNSEC(end, start) / NOVERALL;
    //printf("t_overall (local): %ld usec\n", t_overall_l / 1000UL);
    MPI_Allreduce(&t_overall_l, &t_overall, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
    if (my_rank == 0) printf("t_overall (max): %ld usec\n", t_overall / 1000UL);

    if (my_rank == 0) {
        /* Time hidden by overlapping calc with communication, relative to t_pure. */
        long t_abs = (t_pure * 2) - t_overall;
        printf("overlap: %.2f %%\n", (t_abs * 100) / (double)t_pure);
    }

fn_exit:
    /* The window is only freed on the path where every rank created it. */
    if (win_created) {
        MPI_Win_free(&win);
    }
    free(rbuf);
    free(wbuf);
    MPI_Finalize();
    return ret;
fn_fail:
    ret = 1;
    goto fn_exit;
}

176
test/uti/mpi/013.sh Executable file
View File

@@ -0,0 +1,176 @@
#!/usr/bin/bash
#!/usr/bin/bash -x
# Driver for the test named after this script: optionally stops/reboots
# McKernel, writes ./job.sh that launches the test with mpiexec.hydra, and
# optionally builds and runs it.
# Site-specific installation paths.
MYHOME=/work/gg10/e29005
UTI_MPI_TOP=${MYHOME}/project/os/mckernel/test/uti/mpi
MCK=${MYHOME}/project/os/install
# DISABLE_UTI is only exported again when -d is given.
unset DISABLE_UTI
# Defaults for the command-line flags parsed below.
stop=0
reboot=0
go=0
async=0
mck=0
nnodes=4
# Number of the last node; the host list is built as c<N> names ending here.
LASTNODE=8200
# Passed to the test program as "-d".
ndoubles=10 #12-15
omp_num_threads=1
ppn=16 #16
async_progress_pin=64,132,200,268,65,133,201,269,66,134,202,270,67,135,203,271
lpp=4 # logical-per-physical
ncpu_mt=256 # number of CPUs for main-thread
# Test executable name: this script's file name without the ".sh" suffix.
exe=`basename $0 | sed 's/\.sh$//'`
# Parse the command-line flags:
#   -s stop, -r reboot, -g build and run, -a <n> async progress setting,
#   -n <n> ndoubles, -m use McKernel, -d export DISABLE_UTI=1,
#   -N <n> node count, -P <n> processes per node, -o <n> OpenMP threads.
# Anything else (including -c and -l, which have no handler) is rejected.
while getopts srga:c:n:mdl:N:P:o: OPT; do
    case ${OPT} in
        s) stop=1 ;;
        r) reboot=1 ;;
        g) go=1 ;;
        a) async=$OPTARG ;;
        n) ndoubles=$OPTARG ;;
        m) mck=1 ;;
        d) export DISABLE_UTI=1 ;;
        N) nnodes=$OPTARG ;;
        P) ppn=$OPTARG ;;
        o) omp_num_threads=$OPTARG ;;
        *)
            echo "invalid option -${OPT}" >&2
            exit 1
            ;;
    esac
done
# Total process count and the comma-separated host list (the last $nnodes
# nodes up to c$LASTNODE).
nprocs=$((ppn * nnodes))
nodes=`echo $(seq -s ",c" $(($LASTNODE + 1 - $nnodes)) $LASTNODE) | sed 's/^/c/'`
echo nprocs=$nprocs nnodes=$nnodes ppn=$ppn nodes=$nodes

# Build the mcexec prefix used when running on McKernel (-m).
# mck_dir, uti_thread_rank and use_hfi are not set anywhere in this script, so
# they are treated as optional environment overrides with safe fallbacks:
#   mck_dir          defaults to $MCK
#   uti_thread_rank  adds --uti-thread-rank only when set
#   use_hfi          defaults to 0 (no --enable-hfi1)
if [ ${mck} -eq 1 ]; then
    mcexec="${mck_dir:-${MCK}}/bin/mcexec"
    nmcexecthr=$((omp_num_threads + 4))
    mcexecopt=
    if [ -n "${uti_thread_rank:-}" ]; then
        mcexecopt="--uti-thread-rank=${uti_thread_rank}"
    fi
    if [ "${use_hfi:-0}" -eq 1 ]; then
        mcexecopt="--enable-hfi1 $mcexecopt"
    fi
    mcexecopt="-n $ppn -t $nmcexecthr $mcexecopt"
else
    mcexec=
    mcexecopt=
fi
# Intel MPI pinning: off under McKernel, otherwise one pin domain per process.
# Let each domain have all logical cores and use KMP_AFFINITY=scatter if you
# want to use only physical cores.
i_mpi_pin_domain=
i_mpi_pin_order=
if [ ${mck} -eq 1 ]; then
    i_mpi_pin=off
else
    i_mpi_pin=on
    if [ $((omp_num_threads * lpp * ppn)) -gt $ncpu_mt ]; then
        # Use logical as well
        domain=$((ncpu_mt / ppn))
    else
        # Prefer physical but adjacent physicals share L1
        domain=$((omp_num_threads * lpp))
    fi
    i_mpi_pin_domain="export I_MPI_PIN_DOMAIN=$domain"
    i_mpi_pin_order="export I_MPI_PIN_ORDER=compact"
fi

# The async-progress pin list is exported only when async progress is
# requested and a list is configured.
if [ $async -eq 0 ] || [ -z "$async_progress_pin" ]; then
    i_mpi_async_progress_pin=
else
    i_mpi_async_progress_pin="export I_MPI_ASYNC_PROGRESS_PIN=$async_progress_pin"
fi
# -s: mount /work on all nodes, then stop and release McKernel when -m is given.
if [ ${stop} -eq 1 ]; then
PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo mount /work
if [ ${mck} -eq 1 ]; then
sudo ${MCK}/sbin/mcstop+release.sh
else
:
fi
fi
# -r: mount /work on all nodes, then (with -m) boot McKernel. The CPU and
# memory arguments differ depending on whether the hostname contains "ofp".
if [ ${reboot} -eq 1 ]; then
PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo mount /work
if [ ${mck} -eq 1 ]; then
if hostname | grep ofp &>/dev/null; then
sudo ${MCK}/sbin/mcreboot.sh -s -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
else
sudo ${MCK}/sbin/mcreboot.sh -s -c 1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 1-15:0+65-79:64+129-143:128+193-207:192+17-31:16+81-95:80+145-159:144+209-223:208+33-47:32+97-111:96+161-175:160+225-239:224+49-63:48+113-127:112+177-191:176+241-255:240 -m 12G@0,12G@1,12G@2,12G@3,3920M@4,3920M@5,3920M@6,3920M@7
fi
else
:
fi
fi
# Generate ./job.sh in the test directory. The heredoc is unquoted, so every
# $variable and the `pwd` substitution are expanded now, at generation time.
cd ${UTI_MPI_TOP}
(
cat <<EOF
#!/bin/sh
export I_MPI_HYDRA_BOOTSTRAP_EXEC=/usr/bin/ssh
export I_MPI_HYDRA_BOOTSTRAP=ssh
export OMP_NUM_THREADS=$omp_num_threads
#export OMP_STACKSIZE=64M
export KMP_BLOCKTIME=1
export PSM2_RCVTHREAD=0
export I_MPI_PIN=$i_mpi_pin
$i_mpi_pin_domain
$i_mpi_pin_order
export HFI_NO_CPUAFFINITY=1
export I_MPI_COLL_INTRANODE_SHM_THRESHOLD=4194304
export I_MPI_FABRICS=shm:tmi
export PSM2_RCVTHREAD=0
export I_MPI_TMI_PROVIDER=psm2
export I_MPI_FALLBACK=0
export PSM2_MQ_RNDV_HFI_WINDOW=4194304
export PSM2_MQ_EAGER_SDMA_SZ=65536
export PSM2_MQ_RNDV_HFI_THRESH=200000
export MCKERNEL_RLIMIT_STACK=32M,16G
export KMP_STACKSIZE=64m
export KMP_AFFINITY=granularity=thread,scatter
#export KMP_HW_SUBSET=64c,1t
export I_MPI_ASYNC_PROGRESS=$async
$i_mpi_async_progress_pin
#export I_MPI_STATS=native:20,ipm
export I_MPI_STATS=ipm
#export I_MPI_DEBUG=4
#export I_MPI_HYDRA_DEBUG=on
mpiexec.hydra -l -n $nprocs -ppn $ppn -hosts $nodes $ilpopt $mcexec $mcexecopt `pwd`/$exe --ppn $ppn -d $ndoubles
EOF
) > ./job.sh
chmod u+x ./job.sh
# -g: rebuild the test (with CC=mpiicc when not using McKernel) and run the job.
if [ ${go} -eq 1 ]; then
cd ${UTI_MPI_TOP}
if [ $mck -eq 1 ]; then
make clean && make $exe
else
make clean && make CC=mpiicc $exe
fi
./job.sh
fi

242
test/uti/mpi/014.c Executable file
View File

@@ -0,0 +1,242 @@
#define _GNU_SOURCE /* See feature_test_macros(7) */
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <mpi.h>
#include <unistd.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
#include <getopt.h>
#include <sched.h>
#include <sys/time.h>
#include <sys/resource.h>
#include "async_progress.h"
#include "util.h"
//#define DEBUG
/*
 * Debug trace macro: forwards to printf() when DEBUG is defined and expands
 * to an empty statement otherwise.
 *
 * It is function-like and wrapped in do { } while (0) in the disabled case so
 * that "if (x) dprintf(...); else ..." parses correctly and the arguments are
 * not evaluated when tracing is off.
 */
#ifdef DEBUG
#define dprintf(...) printf(__VA_ARGS__)
#else
#define dprintf(...) do { } while (0)
#endif
/*
 * Long-option table handed to getopt_long(). This program defines no long
 * options, so the table holds only the all-zero terminator entry.
 */
static struct option options[] = {
	{ .name = NULL, .has_arg = 0, .flag = NULL, .val = 0 },
};
int main(int argc, char **argv) {
int rc;
int actual;
int nproc;
int nsamples = -1;
int my_rank = -1, size = -1;
int i, j, k, l, m;
double *wbuf, *rbuf, *result;
MPI_Win win;
long start, end;
long t_pure_l, t_pure, t_pure0 = 0;
int opt;
int szbuf = 8;
struct rusage ru_start, ru_end;
struct timeval tv_start, tv_end;
fwq_init();
while ((opt = getopt_long(argc, argv, "+n:", options, NULL)) != -1) {
switch (opt) {
case 'n':
nsamples = atoi(optarg);
break;
default: /* '?' */
printf("unknown option %c\n", optopt);
exit(1);
}
}
if (nsamples == -1) {
printf("specify nsamples with -n");
exit(1);
}
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
if (actual != 3) {
printf("ERROR: MPI_THREAD_MULTIPLE not available (level was set to %d)\n", actual);
exit(1);
}
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &nproc);
if (my_rank == 0) {
printf("nsamples=%d,nproc=%d\n", nsamples, nproc);
}
/* accumulate-to buffer */
wbuf = malloc(sizeof(double) * szbuf);
if(!wbuf) { printf("malloc failed"); goto fn_fail; }
memset(wbuf, 0, sizeof(double) * szbuf);
/* read-from buffer */
rbuf = malloc(sizeof(double) * szbuf);
if(!rbuf) { printf("malloc failed"); goto fn_fail; }
memset(rbuf, 0, sizeof(double) * szbuf);
/* fetch-to buffer */
result = malloc(sizeof(double) * szbuf);
if(!result) { printf("malloc failed"); goto fn_fail; }
memset(result, 0, sizeof(double) * szbuf);
/* Expose accumulate-to buffer*/
if (rc = MPI_Win_create(wbuf, sizeof(double) * szbuf, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win)) {
printf("MPI_Win_create failed,rc=%d\n", rc);
}
for (j = 0; j < szbuf; j++) {
wbuf[j] = j + 1;
rbuf[j] = 10000 + j + 1;
result[j] = 100000 + j + 1;
}
#if 0
for (j = 0; j < szbuf; j++) {
printf("wbuf,j=%d,val=%f\n", j, wbuf[j]);
printf("rbuf,j=%d,val=%f\n", j, rbuf[j]);
printf("result,j=%d,val=%f\n", j, result[j]);
}
}
#endif
for (k = 0; k < 2; k++) {
if (k == 1) {
print_cpu_last_executed_on("main");
INIT_ASYNC_THREAD_();
if ((rc = getrusage(RUSAGE_THREAD, &ru_start))) {
printf("%s: ERROR: getrusage failed (%d)\n", __FUNCTION__, rc);
}
if ((rc = gettimeofday(&tv_start, NULL))) {
printf("%s: ERROR: gettimeofday failed (%d)\n", __FUNCTION__, rc);
}
syscall(701, 1 | 2 | 0x80000000);
}
for (m = 0; m < 3; m++) {
for (l = 0; l <= 10; l++) {
long calc_cyc = /*(k == 1 && l == 0) ? (double)t_pure0 * 0.1 :*/ t_pure0 / 10 * l;
MPI_Barrier(MPI_COMM_WORLD);
MPI_Win_lock_all(0, win);
//clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
start = rdtsc_light();
for (j = 0; j < nsamples; j++) {
for (i = 0; i < nproc; i++) {
int target = j % nproc;
if (target == my_rank) {
continue;
}
#if 0
MPI_Get_accumulate(rbuf + j % szbuf, 1, MPI_DOUBLE,
result + j % szbuf, 1, MPI_DOUBLE,
i,
j % szbuf, 1, MPI_DOUBLE,
MPI_SUM, win);
#endif
#if 1
MPI_Get_accumulate(rbuf, szbuf, MPI_DOUBLE,
result, szbuf, MPI_DOUBLE,
i,
0, szbuf, MPI_DOUBLE,
MPI_SUM, win);
#endif
#if 0
MPI_Accumulate(rbuf, szbuf, MPI_DOUBLE,
i,
0, szbuf, MPI_DOUBLE,
MPI_SUM, win);
#endif
#if 0
MPI_Get(rbuf + j % szbuf, 1, MPI_DOUBLE,
i,
j % szbuf, 1, MPI_DOUBLE,
win);
#endif
}
}
fwq(calc_cyc * nsamples);
MPI_Win_flush_local_all(win);
end = rdtsc_light();
//clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
MPI_Win_unlock_all(win);
MPI_Barrier(MPI_COMM_WORLD);
t_pure_l = (end - start) / nsamples;
//t_pure_l = DIFFNSEC(end, start) / nsamples;
if (1||m == 2) {
MPI_Allreduce(&t_pure_l, &t_pure, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
if (my_rank == 0) {
if (l == 0) {
printf("async: %d, trial: %d\n", k, m);
}
if (k == 0) {
printf("%ld\t%ld\n", calc_cyc, t_pure);
} else {
printf("%ld\n", t_pure);
}
}
}
if (k == 0 && l == 0) {
t_pure0 = t_pure;
}
#if 0
for (i = 0; i < nproc; i++) {
for (j = 0; j < sbuf; j++) {
printf("wbuf,j=%d,val=%f\n", j, wbuf[j]);
printf("rbuf,j=%d,val=%f\n", j, rbuf[j]);
printf("result,j=%d,val=%f\n", j, result[j]);
}
}
#endif
}
}
if (k == 1) {
FINALIZE_ASYNC_THREAD_();
#if 0
if ((rc = getrusage(RUSAGE_THREAD, &ru_end))) {
printf("%s: ERROR: getrusage failed (%d)\n", __FUNCTION__, rc);
}
if ((rc = gettimeofday(&tv_end, NULL))) {
printf("%s: ERROR: gettimeofday failed (%d)\n", __FUNCTION__, rc);
}
printf("%s: wall: %ld, user: %ld, sys: %ld\n", __FUNCTION__,
DIFFUSEC(tv_end, tv_start),
DIFFUSEC(ru_end.ru_utime, ru_start.ru_utime),
DIFFUSEC(ru_end.ru_stime, ru_start.ru_stime));
syscall(701, 4 | 8 | 0x80000000);
#endif
}
}
fn_exit:
MPI_Finalize();
return 0;
fn_fail:
goto fn_exit;
}

191
test/uti/mpi/014.sh Executable file
View File

@@ -0,0 +1,191 @@
#!/usr/bin/bash
#!/usr/bin/bash -x
# Launcher for the MPI test whose name matches this script (the executable
# name is this script's basename with ".sh" removed).
# The values below are defaults; those listed here are overridden by the
# getopts loop that follows:
#   -s stop=1          -r reboot=1        -g go=1         -m mck=1
#   -h use_hfi=1       -a <v> async       -n <v> nsamples
#   -N <v> nnodes      -P <v> ppn         -o <v> omp_num_threads
#   -A <v> myasync     -M <v> async_in_mck
# MYHOME, UTI_MPI_TOP and mck_dir are site-specific absolute paths.
# LASTNODE is the number of the last node used; the node list is built as
# c<LASTNODE+1-nnodes> ... c<LASTNODE>.
MYHOME=/work/gg10/e29005
UTI_MPI_TOP=${MYHOME}/project/os/mckernel/test/uti/mpi
mck_dir=${MYHOME}/project/os/install
exe=`basename $0 | sed 's/\.sh//'`
stop=0
reboot=0
go=0
async=0
mck=0
nnodes=2
LASTNODE=8200
nsamples=100 #2^12-15
use_hfi=0
omp_num_threads=1
ppn=4
lpp=4 # logical-per-physical
ncpu_mt=256 # number of CPUs for main-thread
myasync=1
async_in_mck=0
# Command-line parsing: each recognized flag overrides one of the defaults
# set above. -c and -l are accepted by the option string but have no arm of
# their own, so they end up in the catch-all arm together with unknown flags.
while getopts srga:c:n:ml:N:P:ho:A:M: OPT
do
	case ${OPT} in
		s) stop=1 ;;
		r) reboot=1 ;;
		g) go=1 ;;
		a) async=$OPTARG ;;
		n) nsamples=$OPTARG ;;
		m) mck=1 ;;
		N) nnodes=$OPTARG ;;
		P) ppn=$OPTARG ;;
		h) use_hfi=1 ;;
		o) omp_num_threads=$OPTARG ;;
		A) myasync=$OPTARG ;;
		M) async_in_mck=$OPTARG ;;
		*)
			echo "invalid option -${OPT}" >&2
			exit 1
			;;
	esac
done
# Total rank count and comma-separated host list c<first>,...,c<LASTNODE>.
nprocs=$((ppn * nnodes))
nodes=`echo $(seq -s ",c" $(($LASTNODE + 1 - $nnodes)) $LASTNODE) | sed 's/^/c/'`
echo nprocs=$nprocs nnodes=$nnodes ppn=$ppn nodes=$nodes
# On every node: run "sudo mount /work" unless /etc/mtab already lists /work.
PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes bash -c \'if \[ \"\`cat /etc/mtab \| while read line\; do cut -d\" \" -f 2\; done \| grep /work\`\" == \"\" \]\; then sudo mount /work\; fi\'
# DISABLE_UTI=1 is exported for runs without McKernel and for McKernel runs
# with async_in_mck=1; otherwise it is removed from the environment.
if [ $mck -eq 0 ] || [ $async_in_mck -eq 1 ]; then
export DISABLE_UTI=1
else
unset DISABLE_UTI
fi
# CPU list for I_MPI_ASYNC_PROGRESS_PIN: a fixed list without McKernel,
# otherwise one CPU per local rank, i * (ncpu_mt / ppn) + 1 for i = 0..ppn-1.
if [ $mck -eq 0 ]; then
async_progress_pin=64,65,66,67,132,133,134,135,200,201,202,203,268,269,270,271
else
async_progress_pin=`(for ((i=0;i<ppn;i++)) do printf "%d," $((i * (ncpu_mt / ppn) +1)); done) | sed 's/,$//'`
# same tile, different physical core
fi
echo async_progress_pin=$async_progress_pin
# mcexec command and options (McKernel runs only); mcexec gets
# omp_num_threads + 4 as its -t value.
# NOTE(review): uti_thread_rank is not assigned anywhere in this script, so
# --uti-thread-rank= receives an empty value unless the variable is inherited
# from the caller's environment.
if [ ${mck} -eq 1 ]; then
mcexec="${mck_dir}/bin/mcexec"
nmcexecthr=$((omp_num_threads + 4))
mcexecopt="--uti-thread-rank=$uti_thread_rank"
if [ ${use_hfi} -eq 1 ]; then
mcexecopt="--enable-hfi1 $mcexecopt"
fi
mcexecopt="-n $ppn -t $nmcexecthr $mcexecopt"
else
mcexec=
mcexecopt=
fi
# Intel MPI pinning: switched off for McKernel runs, otherwise on with a
# per-rank domain size computed below.
if [ ${mck} -eq 1 ]; then
i_mpi_pin=off
i_mpi_pin_domain=
i_mpi_pin_order=
else
# Let each domain have all logical cores and use KMP_AFFINITY=scatter if you want to use only physical cores
i_mpi_pin=on
if [ $((omp_num_threads * lpp * ppn)) -le $ncpu_mt ]; then
domain=$((omp_num_threads * lpp)) # Prefer physical but adjacent physicals share L1
else
domain=$((ncpu_mt / ppn)) # Use logical as well
fi
i_mpi_pin_domain="export I_MPI_PIN_DOMAIN=$domain"
i_mpi_pin_order="export I_MPI_PIN_ORDER=compact"
fi
# The pin list is exported when async is 1 and the list is non-empty, or
# whenever myasync is 1.
if [[ ($async -eq 1 && "$async_progress_pin" != "" ) || $myasync -eq 1 ]]; then
i_mpi_async_progress_pin="export I_MPI_ASYNC_PROGRESS_PIN=$async_progress_pin"
else
i_mpi_async_progress_pin=
fi
# stop=1 with mck=1: run mcstop+release.sh with sudo on every node via pdsh.
if [ ${stop} -eq 1 ]; then
if [ ${mck} -eq 1 ]; then
PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
sudo ${mck_dir}/sbin/mcstop+release.sh
else
:
fi
fi
# reboot=1 with mck=1: run mcreboot.sh with sudo on every node via pdsh.
# The -c/-r/-m argument lists differ depending on whether the local hostname
# contains "ofp".
if [ ${reboot} -eq 1 ]; then
if [ ${mck} -eq 1 ]; then
if hostname | grep ofp &>/dev/null; then
PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
sudo ${mck_dir}/sbin/mcreboot.sh -s -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
else
PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
sudo ${mck_dir}/sbin/mcreboot.sh -s -c 1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 1-15:0+65-79:64+129-143:128+193-207:192+17-31:16+81-95:80+145-159:144+209-223:208+33-47:32+97-111:96+161-175:160+225-239:224+49-63:48+113-127:112+177-191:176+241-255:240 -m 12G@0,12G@1,12G@2,12G@3,3920M@4,3920M@5,3920M@6,3920M@7
fi
else
:
fi
fi
# Generate ./job.sh in ${UTI_MPI_TOP}. The here-document is unquoted, so every
# $variable below is expanded now, while job.sh is being written.
# The $i_mpi_pin_domain, $i_mpi_pin_order and $i_mpi_async_progress_pin lines
# expand either to a complete "export ..." statement or to an empty line.
# PSM2_RCVTHREAD=0 is exported twice with the same value.
# NOTE(review): $ilpopt is not assigned anywhere in this script.
cd ${UTI_MPI_TOP}
(
cat <<EOF
#!/bin/sh
export I_MPI_HYDRA_BOOTSTRAP_EXEC=/usr/bin/ssh
export I_MPI_HYDRA_BOOTSTRAP=ssh
export OMP_NUM_THREADS=$omp_num_threads
#export OMP_STACKSIZE=64M
export KMP_BLOCKTIME=1
export PSM2_RCVTHREAD=0
export I_MPI_PIN=$i_mpi_pin
$i_mpi_pin_domain
$i_mpi_pin_order
export HFI_NO_CPUAFFINITY=1
export I_MPI_COLL_INTRANODE_SHM_THRESHOLD=4194304
export I_MPI_FABRICS=shm:tmi
export PSM2_RCVTHREAD=0
export I_MPI_TMI_PROVIDER=psm2
export I_MPI_FALLBACK=0
export PSM2_MQ_RNDV_HFI_WINDOW=4194304
export PSM2_MQ_EAGER_SDMA_SZ=65536
export PSM2_MQ_RNDV_HFI_THRESH=200000
export MCKERNEL_RLIMIT_STACK=32M,16G
export KMP_STACKSIZE=64m
export KMP_AFFINITY=granularity=thread,scatter
#export KMP_HW_SUBSET=64c,1t
export I_MPI_ASYNC_PROGRESS=$async
$i_mpi_async_progress_pin
export MY_ASYNC_PROGRESS=$myasync
#export I_MPI_STATS=native:20,ipm
#export I_MPI_STATS=ipm
#export I_MPI_DEBUG=4
#export I_MPI_HYDRA_DEBUG=on
mpiexec.hydra -l -n $nprocs -ppn $ppn -hosts $nodes $ilpopt $mcexec $mcexecopt ./$exe -n $nsamples
EOF
) > ./job.sh
chmod u+x ./job.sh
# go=1: load the Intel compiler environment into this shell, build the test
# program, then run job.sh from a fresh bash that loads the same environment.
if [ ${go} -eq 1 ]; then
. /home/opt/local/cores/intel/compilers_and_libraries_2018.1.163/linux/bin/compilervars.sh intel64
cd ${UTI_MPI_TOP}
make ./$exe
bash -c '. /home/opt/local/cores/intel/compilers_and_libraries_2018.1.163/linux/bin/compilervars.sh intel64; ./job.sh'
fi

346
test/uti/mpi/015.c Executable file
View File

@@ -0,0 +1,346 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <mpi.h>
#include <unistd.h>
#include <getopt.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
#include <sched.h>
#include "async_progress.h"
//#define DEBUG
/*
 * Debug trace macro: forwards to printf() when DEBUG is defined and expands
 * to an empty statement otherwise.
 *
 * It is function-like and wrapped in do { } while (0) in the disabled case so
 * that "if (x) dprintf(...); else ..." parses correctly and the arguments are
 * not evaluated when tracing is off.
 */
#ifdef DEBUG
#define dprintf(...) printf(__VA_ARGS__)
#else
#define dprintf(...) do { } while (0)
#endif
/*
 * Nanoseconds elapsed from `start` to `end`; both arguments are expressions
 * with tv_sec and tv_nsec members (e.g. struct timespec). The arguments are
 * parenthesized so that expressions such as *ptr can be passed.
 */
#define DIFFNSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000000UL + ((end).tv_nsec - (start).tv_nsec))
/* Open / close a shared passive-target access epoch to all ranks of `win`. */
#define BEGIN_EPOCH(win) do { MPI_Win_lock_all(0, (win)); } while(0)
#define END_EPOCH(win) do { MPI_Win_unlock_all((win)); } while(0)
/*
 * Read the x86 time-stamp counter with RDTSCP and return it as one 64-bit
 * value. RDTSCP is used because it does not execute ahead of earlier
 * instructions; the "memory" clobber additionally keeps the compiler from
 * moving memory accesses across the read.
 */
static inline uint64_t rdtsc_light(void )
{
	uint32_t tsc_lo, tsc_hi;

	/* RDTSCP returns the counter in EDX:EAX and also overwrites ECX */
	__asm__ __volatile__("rdtscp"
			     : "=a"(tsc_lo), "=d"(tsc_hi)
			     :
			     : "%rcx", "memory");
	return ((uint64_t)tsc_hi << 32) | tsc_lo;
}
/*
 * One unit of busy work with a fixed instruction count: sets %rcx to 0, then
 * repeats "add 1, compare with 99, loop while <= 99", i.e. 100 passes of the
 * add/cmp/jle sequence. It is written in assembly and marked volatile so the
 * compiler cannot shorten or remove the loop. Touches only %rcx and the flags.
 */
static inline void fixed_size_work() {
asm volatile(
"movq $0, %%rcx\n\t"
"1:\t"
"addq $1, %%rcx\n\t"
"cmpq $99, %%rcx\n\t"
"jle 1b\n\t"
:
:
: "rcx", "cc");
}
/*
 * Run fixed_size_work() n times.
 *
 * The counter has the same type as n: an int counter would overflow
 * (undefined behavior) before reaching counts above INT_MAX.
 */
static inline void bulk_fsw(unsigned long n) {
	unsigned long j;

	for (j = 0; j < n; j++) {
		fixed_size_work();
	}
}
/*
 * Calibration results written by fwq_init():
 *   cyc   - cycles taken by N_INIT units of fixed_size_work()
 *   cycpw - cycles per unit of work (cyc / N_INIT, truncated to long)
 */
long cyc, cycpw; /* cycles per work */

/* Number of work units executed during calibration */
#define N_INIT 10000000

/* Time N_INIT work units with the time-stamp counter and store cyc and cycpw. */
void fwq_init() {
	long t_begin, t_end;

	t_begin = rdtsc_light();
	bulk_fsw(N_INIT);
	t_end = rdtsc_light();

	cyc = t_end - t_begin;
	cycpw = cyc / (double)N_INIT;
}
/*
 * fwq(delay_cyc): busy-wait for about delay_cyc time-stamp-counter cycles.
 * A negative delay returns immediately.
 *
 * Two implementations exist. The disabled one converts the delay into a
 * number of work units using the calibrated cycpw. The active one polls the
 * time-stamp counter between small bursts of work, so it does not depend on
 * the calibration.
 */
#if 0
void fwq(long delay_cyc) {
	if (delay_cyc < 0) {
		return;
		//printf("%s: delay_cyc < 0\n", __FUNCTION__);
	}
	bulk_fsw(delay_cyc / cycpw);
}
#else /* For machines with large core-to-core performance variation (e.g. OFP) */
void fwq(long delay_cyc) {
	long t_begin;

	if (delay_cyc < 0) {
		return;
	}

	t_begin = rdtsc_light();
	/* The elapsed time is checked before every burst of two work units
	   (~150 ns per iteration on OFP, per the original note) */
	while ((long)rdtsc_light() - t_begin < delay_cyc) {
		bulk_fsw(2);
	}
}
#endif
/*
 * Print one line with the CPU the calling thread last ran on, taken from
 * field 39 of /proc/<pid>/task/<tid>/stat, next to the value returned by
 * sched_getcpu(), the PMI_RANK environment variable and the thread id.
 * pmi_rank is printed as -1 when PMI_RANK is not set.
 *
 * NOTE(review): the stat line is split on single spaces counted from the
 * start, so the field index is only right while the command name (field 2)
 * contains no space.
 *
 * Returns 0 on success and -1 on failure.
 */
static int print_cpu_last_executed_on() {
	enum { STAT_BUFSZ = 65536, STAT_NFIELDS = 39 };
	char fn[256];
	char *result = NULL; /* NULL so the cleanup path can always free() it */
	char *next_delim;
	char *field = NULL;
	const char *rank_str;
	pid_t tid = syscall(SYS_gettid);
	int fd = -1;
	int offset = 0;
	int amount;
	int cpu;
	int i;
	int mpi_errno = 0;

	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid);
	//printf("fn=%s\n", fn);

	fd = open(fn, O_RDONLY);
	if(fd == -1) {
		printf("open() failed\n");
		goto fn_fail;
	}

	result = malloc(STAT_BUFSZ);
	if(result == NULL) {
		printf("malloc() failed");
		goto fn_fail;
	}

	/* Read until EOF, never asking for more than the space left and always
	   keeping one byte for the terminating NUL */
	while (offset < STAT_BUFSZ - 1) {
		amount = read(fd, result + offset, STAT_BUFSZ - 1 - offset);
		// printf("amount=%d\n", amount);
		if(amount == -1) {
			printf("read() failed");
			goto fn_fail;
		}
		if(amount == 0) {
			break;
		}
		offset += amount;
	}
	result[offset] = '\0'; /* strsep() needs a terminated string */
	//printf("result:%s\n", result);

	/* After the loop, field points at the 39th space-separated token, or is
	   NULL when the line has fewer tokens */
	next_delim = result;
	for(i = 0; i < STAT_NFIELDS; i++) {
		field = strsep(&next_delim, " ");
	}
	if(field == NULL) {
		printf("unexpected format of %s\n", fn);
		goto fn_fail;
	}

	cpu = sched_getcpu();
	if(cpu == -1) {
		printf("sched_getcpu() failed\n");
		goto fn_fail;
	}

	rank_str = getenv("PMI_RANK");
	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n", rank_str ? atoi(rank_str) : -1, atoi(field), cpu, tid); fflush(stdout);

fn_exit:
	if(fd != -1) {
		close(fd);
	}
	free(result);
	return mpi_errno;

fn_fail:
	mpi_errno = -1;
	goto fn_exit;
}
/* ga_acc per rank:ga_sync=40:1 */
/*
 * For every rank except the caller, accumulate (MPI_SUM) ndoubles values into
 * that rank's window, one double per MPI_Accumulate, flushing locally after
 * each operation. The slice starting at element peer * ndoubles is used both
 * as the source offset in rbuf and as the target displacement. Afterwards
 * spend calc_nsec in fwq().
 *
 * wbuf is not used here. NOTE(review): the last parameter is named calc_nsec
 * but is passed unchanged to fwq(), whose parameter is named delay_cyc.
 */
void rma(int nproc, int my_rank, double *wbuf, double *rbuf, int ndoubles, MPI_Win win, long calc_nsec) {
	int peer, k;

	for (peer = 0; peer < nproc; peer++) {
		if (peer == my_rank) {
			continue;
		}
		for (k = 0; k < ndoubles; k++) {
			const int elem = peer * ndoubles + k;

			MPI_Accumulate(rbuf + elem, 1, MPI_DOUBLE,
				       peer, elem, 1, MPI_DOUBLE,
				       MPI_SUM, win);
			MPI_Win_flush_local(peer, win); /* ga_acc() calls flush_local() immediately */
		}
	}

	fwq(calc_nsec);
}
/*
 * Long-option table for getopt_long(): "--ppn <value>" is reported as the
 * short option 'P'. The all-zero entry terminates the table.
 */
static struct option options[] = {
	/* name, has_arg, flag, val */
	{ "ppn", required_argument, NULL, 'P' },
	/* end */
	{ NULL, 0, NULL, 0 },
};
int main(int argc, char **argv) {
int rc;
int actual;
int ppn = -1;
int nproc;
int ndoubles = -1;
double add_rate = 1.0;
int my_rank = -1, size = -1;
int i, j, k, l;
double *wbuf, *rbuf, *result;
MPI_Win win;
long start, end;
//struct timespec start, end;
long t_pure_l, t_overall_l;
long t_pure, t_overall;
int opt;
fwq_init();
while ((opt = getopt_long(argc, argv, "+d:P:R:", options, NULL)) != -1) {
switch (opt) {
case 'd':
ndoubles = atoi(optarg);
break;
case 'P':
ppn = atoi(optarg);
break;
case 'R':
add_rate = atof(optarg);
break;
default: /* '?' */
printf("unknown option %c\n", optopt);
exit(1);
}
}
if (ndoubles == -1 || ppn == -1) {
printf("specify ndoubles with -d and ppn with --ppn");
exit(1);
}
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
if (actual != 3) {
printf("ERROR: MPI_THREAD_MULTIPLE not available (level was set to %d)\n", actual);
exit(1);
}
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &nproc);
if (my_rank == 0) {
printf("ndoubles=%d,nproc=%d,add_rate=%f\n", ndoubles, nproc, add_rate);
printf("cyc=%ld, cycpw=%ld\n", cyc, cycpw);
}
/* accumulate-to buffer */
wbuf = malloc(sizeof(double) * ndoubles * nproc);
if(!wbuf) { printf("malloc failed"); goto fn_fail; }
memset(wbuf, 0, sizeof(double) * ndoubles * nproc);
/* read-from buffer */
rbuf = malloc(sizeof(double) * ndoubles * nproc);
if(!rbuf) { printf("malloc failed"); goto fn_fail; }
memset(rbuf, 0, sizeof(double) * ndoubles * nproc);
/* Expose accumulate-to buffer*/
if (rc = MPI_Win_create(wbuf, sizeof(double) * ndoubles * nproc, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win)) {
printf("MPI_Win_create failed,rc=%d\n", rc);
}
//print_cpu_last_executed_on();
for (i = 0; i < nproc; i++) {
for (j = 0; j < ndoubles; j++) {
wbuf[i * ndoubles + j] = (i + 1) * 1000 + (j + 1);
rbuf[i * ndoubles + j] = (i + 1) * 10000 + (j + 1);
}
}
#if 0
for (i = 0; i < nproc; i++) {
for (j = 0; j < ndoubles; j++) {
printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]);
printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]);
}
}
#endif
for (k = 0; k < 2; k++) {
if (k == 1) {
INIT_ASYNC_THREAD_();
}
/* Measure get_acc-flush time */
MPI_Barrier(MPI_COMM_WORLD);
#define NPURE 10
//clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
start = rdtsc_light();
MPI_Pcontrol(1, "rma");
syscall(701, 1);
syscall(701, 2);
for (i = 0; i < NPURE; i++) {
BEGIN_EPOCH(win);
rma(nproc, my_rank, wbuf, rbuf, ndoubles, win, 0);
END_EPOCH(win);
}
MPI_Pcontrol(-1, "rma");
syscall(701, 4);
syscall(701, 8);
end = rdtsc_light();
//clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
MPI_Barrier(MPI_COMM_WORLD);
t_pure_l = (end - start) / NPURE;
//t_pure_l = DIFFNSEC(end, start) / NPURE;
//printf("t_pure (local): %ld usec\n", t_pure_l / 1000UL);
MPI_Allreduce(&t_pure_l, &t_pure, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
if (my_rank == 0) printf("t_pure (max): %ld cycles\n", t_pure);
#if 1
for (l = 1; l <= 10; l++) {
MPI_Barrier(MPI_COMM_WORLD);
#define NOVERALL 10
start = rdtsc_light();
for (i = 0; i < NOVERALL; i++) {
BEGIN_EPOCH(win);
rma(nproc, my_rank, wbuf, rbuf, ndoubles, win, 100UL * 1000000 * l);
END_EPOCH(win);
}
end = rdtsc_light();
MPI_Barrier(MPI_COMM_WORLD);
t_overall_l = (end - start) / NOVERALL;
MPI_Allreduce(&t_overall_l, &t_overall, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
if (my_rank == 0) printf("t_overall (max): %ld cycle\n", t_overall);
}
#endif
if (k == 1) {
FINALIZE_ASYNC_THREAD_();
}
#if 0
for (i = 0; i < nproc; i++) {
for (j = 0; j < ndoubles; j++) {
printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]);
printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]);
printf("result,proc=%d,j=%d,val=%f\n", i, j, result[i * ndoubles + j]);
}
}
#endif
}
fn_exit:
MPI_Finalize();
return 0;
fn_fail:
goto fn_exit;
}

189
test/uti/mpi/015.sh Executable file
View File

@@ -0,0 +1,189 @@
#!/usr/bin/bash
#!/usr/bin/bash -x
# Launcher for the MPI test whose name matches this script (the executable
# name is this script's basename with ".sh" removed).
# The values below are defaults; those listed here are overridden by the
# getopts loop that follows:
#   -s stop=1          -r reboot=1        -g go=1         -m mck=1
#   -a <v> async       -n <v> ndoubles    -d <v> disable_uti
#   -N <v> nnodes      -P <v> ppn         -o <v> omp_num_threads
#   -A <v> myasync     -R <v> add_rate
# use_hfi and async_progress_pin have no command-line option in this script.
# MYHOME, UTI_MPI_TOP and mck_dir are site-specific absolute paths.
# LASTNODE is the number of the last node used; the node list is built as
# c<LASTNODE+1-nnodes> ... c<LASTNODE>.
MYHOME=/work/gg10/e29005
UTI_MPI_TOP=${MYHOME}/project/os/mckernel/test/uti/mpi
mck_dir=${MYHOME}/project/os/install
exe=`basename $0 | sed 's/\.sh//'`
stop=0
reboot=0
go=0
async=0
mck=0
nnodes=2
LASTNODE=8200
ndoubles=16 #2^12-15
add_rate="1.0"
disable_uti=0
omp_num_threads=1
ppn=16 #16
async_progress_pin=64,132,200,268,65,133,201,269,66,134,202,270,67,135,203,271
lpp=4 # logical-per-physical
ncpu_mt=256 # number of CPUs for main-thread
myasync=1
use_hfi=0
# Command-line parsing: each recognized flag overrides one of the defaults
# set above. -c and -l are accepted by the option string but have no arm of
# their own, so they end up in the catch-all arm together with unknown flags.
while getopts srga:c:n:md:l:N:P:o:A:R: OPT
do
	case ${OPT} in
		s) stop=1 ;;
		r) reboot=1 ;;
		g) go=1 ;;
		a) async=$OPTARG ;;
		n) ndoubles=$OPTARG ;;
		m) mck=1 ;;
		d) disable_uti=$OPTARG ;;
		N) nnodes=$OPTARG ;;
		P) ppn=$OPTARG ;;
		o) omp_num_threads=$OPTARG ;;
		A) myasync=$OPTARG ;;
		R) add_rate=$OPTARG ;;
		*)
			echo "invalid option -${OPT}" >&2
			exit 1
			;;
	esac
done
# Total rank count and comma-separated host list c<first>,...,c<LASTNODE>.
nprocs=$((ppn * nnodes))
nodes=`echo $(seq -s ",c" $(($LASTNODE + 1 - $nnodes)) $LASTNODE) | sed 's/^/c/'`
echo nprocs=$nprocs nnodes=$nnodes ppn=$ppn nodes=$nodes
# On every node: run "sudo mount /work" unless /etc/mtab already lists /work.
PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes bash -c \'if \[ \"\`cat /etc/mtab \| while read line\; do cut -d\" \" -f 2\; done \| grep /work\`\" == \"\" \]\; then sudo mount /work\; fi\'
# DISABLE_UTI=1 is exported only when requested with -d 1; otherwise it is
# removed from the environment.
if [ $disable_uti -eq 1 ]; then
export DISABLE_UTI=1
else
unset DISABLE_UTI
fi
# mcexec command and options (McKernel runs only); mcexec gets
# omp_num_threads + 4 as its -t value.
# NOTE(review): uti_thread_rank is not assigned anywhere in this script, so
# --uti-thread-rank= receives an empty value unless the variable is inherited
# from the caller's environment.
if [ ${mck} -eq 1 ]; then
mcexec="${mck_dir}/bin/mcexec"
nmcexecthr=$((omp_num_threads + 4))
mcexecopt="--uti-thread-rank=$uti_thread_rank"
if [ ${use_hfi} -eq 1 ]; then
mcexecopt="--enable-hfi1 $mcexecopt"
fi
mcexecopt="-n $ppn -t $nmcexecthr $mcexecopt"
else
mcexec=
mcexecopt=
fi
# Intel MPI pinning: switched off for McKernel runs, otherwise on with a
# per-rank domain size computed below.
if [ ${mck} -eq 1 ]; then
i_mpi_pin=off
i_mpi_pin_domain=
i_mpi_pin_order=
else
# Let each domain have all logical cores and use KMP_AFFINITY=scatter if you want to use only physical cores
i_mpi_pin=on
if [ $((omp_num_threads * lpp * ppn)) -le $ncpu_mt ]; then
domain=$((omp_num_threads * lpp)) # Prefer physical but adjacent physicals share L1
else
domain=$((ncpu_mt / ppn)) # Use logical as well
fi
i_mpi_pin_domain="export I_MPI_PIN_DOMAIN=$domain"
i_mpi_pin_order="export I_MPI_PIN_ORDER=compact"
fi
# The pin list is exported when async is 1 and the list is non-empty, or
# whenever myasync is 1.
if [[ ($async -eq 1 && "$async_progress_pin" != "" ) || $myasync -eq 1 ]]; then
i_mpi_async_progress_pin="export I_MPI_ASYNC_PROGRESS_PIN=$async_progress_pin"
else
i_mpi_async_progress_pin=
fi
# stop=1 with mck=1: run mcstop+release.sh with sudo on every node via pdsh.
if [ ${stop} -eq 1 ]; then
if [ ${mck} -eq 1 ]; then
PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo ${mck_dir}/sbin/mcstop+release.sh
else
:
fi
fi
# reboot=1 with mck=1: run mcreboot.sh with sudo on every node via pdsh.
# The -c/-r/-m argument lists differ depending on whether the local hostname
# contains "ofp".
if [ ${reboot} -eq 1 ]; then
if [ ${mck} -eq 1 ]; then
if hostname | grep ofp &>/dev/null; then
PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo ${mck_dir}/sbin/mcreboot.sh -s -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
else
PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo ${mck_dir}/sbin/mcreboot.sh -s -c 1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 1-15:0+65-79:64+129-143:128+193-207:192+17-31:16+81-95:80+145-159:144+209-223:208+33-47:32+97-111:96+161-175:160+225-239:224+49-63:48+113-127:112+177-191:176+241-255:240 -m 12G@0,12G@1,12G@2,12G@3,3920M@4,3920M@5,3920M@6,3920M@7
fi
else
:
fi
fi
# Generate ./job.sh in ${UTI_MPI_TOP}. The here-document is unquoted, so every
# $variable below is expanded now, while job.sh is being written.
# The $i_mpi_pin_domain, $i_mpi_pin_order and $i_mpi_async_progress_pin lines
# expand either to a complete "export ..." statement or to an empty line.
# PSM2_RCVTHREAD=0 is exported twice with the same value.
# NOTE(review): $ilpopt is not assigned anywhere in this script.
cd ${UTI_MPI_TOP}
(
cat <<EOF
#!/bin/sh
export I_MPI_HYDRA_BOOTSTRAP_EXEC=/usr/bin/ssh
export I_MPI_HYDRA_BOOTSTRAP=ssh
export OMP_NUM_THREADS=$omp_num_threads
#export OMP_STACKSIZE=64M
export KMP_BLOCKTIME=1
export PSM2_RCVTHREAD=0
export I_MPI_PIN=$i_mpi_pin
$i_mpi_pin_domain
$i_mpi_pin_order
export HFI_NO_CPUAFFINITY=1
export I_MPI_COLL_INTRANODE_SHM_THRESHOLD=4194304
export I_MPI_FABRICS=shm:tmi
export PSM2_RCVTHREAD=0
export I_MPI_TMI_PROVIDER=psm2
export I_MPI_FALLBACK=0
export PSM2_MQ_RNDV_HFI_WINDOW=4194304
export PSM2_MQ_EAGER_SDMA_SZ=65536
export PSM2_MQ_RNDV_HFI_THRESH=200000
export MCKERNEL_RLIMIT_STACK=32M,16G
export KMP_STACKSIZE=64m
export KMP_AFFINITY=granularity=thread,scatter
#export KMP_HW_SUBSET=64c,1t
export I_MPI_ASYNC_PROGRESS=$async
$i_mpi_async_progress_pin
export MY_ASYNC_PROGRESS=$myasync
#export I_MPI_STATS=native:20,ipm
#export I_MPI_STATS=ipm
#export I_MPI_DEBUG=4
#export I_MPI_HYDRA_DEBUG=on
mpiexec.hydra -l -n $nprocs -ppn $ppn -hosts $nodes $ilpopt $mcexec $mcexecopt ./$exe --ppn $ppn -d $ndoubles -R $add_rate
EOF
) > ./job.sh
chmod u+x ./job.sh
# go=1: build the test program (for non-McKernel runs after loading the Intel
# compiler environment and with CC=mpiicc), then run the generated job script.
if [ ${go} -eq 1 ]; then
cd ${UTI_MPI_TOP}
if [ $mck -eq 1 ]; then
make $exe
else
. /home/opt/local/cores/intel/compilers_and_libraries_2018.1.163/linux/bin/compilervars.sh intel64
make CC=mpiicc $exe
fi
./job.sh
fi

349
test/uti/mpi/016.c Executable file
View File

@@ -0,0 +1,349 @@
#define _GNU_SOURCE /* See feature_test_macros(7) */
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <mpi.h>
#include <unistd.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
#include <getopt.h>
#include <sched.h>
#include <sys/time.h>
#include <sys/resource.h>
#include "async_progress.h"
#include "util.h"
/* Unit label printed next to reported times */
#define MYTIME_UNIT "usec"
/* Factors applied to times in seconds to obtain microseconds / nanoseconds */
#define MYTIME_TOUSEC 1000000
#define MYTIME_TONSEC 1000000000
/* Rows of the result table: one per calculation amount l = 0 .. NROW - 1,
   where the calculation time is l tenths of the communication-only time */
#define NROW 16 /* 0%, 10%, ..., 150% */
/* Columns of the result table: calculation time plus one column per progress mode (0, 1, 2) */
#define NCOL 4
/* Leading samples of every measurement that are excluded from the average */
#define NSAMPLES_DROP 5/*10*/
/* Samples averaged for the communication-only measurement */
#define NSAMPLES_COMM 10/*20*/
/* Samples averaged for each table entry */
#define NSAMPLES_TOTAL 10/*20*/
/* Repetitions of the sweep over all other ranks inside one rma() call */
#define NSAMPLES_INNER 5
/* When defined, rma() calls progress_start()/progress_stop() around its calculation phase */
#define PROGRESS_CALC_PHASE_ONLY
/*
 * Timestamp source for all measurements in this file: the value of
 * MPI_Wtime(). rdtsc_light() is the alternative that was used before.
 */
static inline double mytime() {
	const double now = /*rdtsc_light()*/MPI_Wtime();

	return now;
}

/* Value of the -p option; -1 until the option has been parsed
   (presumably processes per node - confirm against the launcher script) */
static int ppn = -1;
/*
 * Fill the first szbuf elements of the three buffers with values that encode
 * where they came from, so buffer dumps can be told apart:
 *   origin_buf[j] = (rank + 1) * 100 + (j + 1)
 *   result[j]     = (id + 1) * 100000000 + (rank + 1) * 10000 + (j + 1)
 *   target_buf[j] = (rank + 1) * 1000000 + (j + 1)
 * Nothing is written when szbuf is 0 or negative.
 */
void init_buf(double *origin_buf, double *result, double *target_buf, int szbuf, int rank, int id) {
	const double origin_base = (rank + 1) * 100.0;
	const double result_base = (id + 1) * 100000000.0 + (rank + 1) * 10000.0;
	const double target_base = (rank + 1) * 1000000.0;
	int idx = 0;

	while (idx < szbuf) {
		const int ordinal = idx + 1; /* element numbers start at 1 */

		origin_buf[idx] = origin_base + ordinal;
		result[idx] = result_base + ordinal;
		target_buf[idx] = target_base + ordinal;
		idx++;
	}
}
/*
 * Dump the three buffers of every rank through pr_debug(), one rank per
 * round. In round `turn` all ranks meet at a barrier; the rank whose number
 * equals `turn` prints its buffers while every other rank sleeps 100 ms.
 */
void pr_buf(double *origin_buf, double *result, double *target_buf, int szbuf, int rank, int nproc) {
	int turn;

	for (turn = 0; turn < nproc; turn++) {
		MPI_Barrier(MPI_COMM_WORLD);

		if (turn == rank) {
			int j;

			for (j = 0; j < szbuf; j++) {
				pr_debug("[%d] origin_buf,j=%d,val=%f\n", rank, j, origin_buf[j]);
				pr_debug("[%d] result,j=%d,val=%f\n", rank, j, result[j]);
				pr_debug("[%d] target_buf,j=%d,val=%f\n", rank, j, target_buf[j]);
			}
		} else {
			usleep(100000);
		}
	}
}
#define MAX2(x,y) ((x) > (y) ? (x) : (y))

/*
 * One measured unit of work:
 *   1. NSAMPLES_INNER sweeps issuing one MPI_Get_accumulate (MPI_NO_OP) of
 *      szbuf doubles to every other rank, starting with rank + 1;
 *   2. nsec_calc of ndelay(), bracketed by progress_start()/progress_stop()
 *      when async_progress is non-zero and PROGRESS_CALC_PHASE_ONLY is defined;
 *   3. a series of MPI_Iprobe calls: NSAMPLES_INNER * (nproc - 1) of them
 *      without async progress, otherwise that count scaled by
 *      (1.0 - pct_calc) but at least nproc - 1;
 *   4. MPI_Win_flush_local_all().
 * sync_progress is only referenced by the disabled code path.
 */
void rma(int rank, int nproc, MPI_Win win, double *origin_buf, double *result, int szbuf, long nsec_calc, int async_progress, int sync_progress, double pct_calc) {
	int rep, hop, peer, probe;
	int completed, ret;
	double nprobes;

	for (rep = 0; rep < NSAMPLES_INNER; rep++) {
		for (hop = 1; hop < nproc; hop++) {
			peer = (rank + hop) % nproc;

			MPI_Get_accumulate(origin_buf, szbuf, MPI_DOUBLE,
					   result, szbuf, MPI_DOUBLE,
					   peer,
					   0, szbuf, MPI_DOUBLE,
					   MPI_NO_OP, win);
#if 0
			if (sync_progress) {
				if ((ret = MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &completed, MPI_STATUS_IGNORE)) != MPI_SUCCESS) {
					pr_err("%s: error: MPI_Iprobe: %d\n", __func__, ret);
				}
			}
#endif
		}
	}

#ifdef PROGRESS_CALC_PHASE_ONLY
	if (async_progress) {
		progress_start();
	}
#endif

	ndelay(nsec_calc);

#ifdef PROGRESS_CALC_PHASE_ONLY
	if (async_progress) {
		progress_stop();
	}
#endif

#if 1
	/* iprobe is 10 times faster than win_flush_local_all,
	   20679 usec / (8*63*5) messages for 8-ppn 8-node case */
	if (async_progress) {
		nprobes = MAX2(NSAMPLES_INNER * (nproc - 1) * (1.0 - pct_calc), nproc - 1);
	} else {
		nprobes = NSAMPLES_INNER * (nproc - 1);
	}

	for (probe = 0; probe < nprobes; probe++) {
		ret = MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &completed, MPI_STATUS_IGNORE);
		if (ret != MPI_SUCCESS) {
			pr_err("%s: error: MPI_Iprobe: %d\n", __func__, ret);
		}
	}
#endif

	MPI_Win_flush_local_all(win);
}
/*
 * Run rma() nsamples_drop + nsamples times and return the average time of
 * the last nsamples runs; the first nsamples_drop runs are executed but not
 * counted. Every run is bracketed by barriers and a lock_all/unlock_all
 * epoch on win, and its time is the maximum over all ranks (MPI_Allreduce
 * with MPI_MAX) of the mytime() difference around rma(). ndelay_init(0) is
 * called before each run. target_buf is not used here.
 */
double measure(int rank, int nproc, MPI_Win win, double *origin_buf, double* result, double *target_buf, int szbuf, long nsec_calc, int async_progress, int sync_progress, int nsamples, int nsamples_drop, double pct_calc) {
	const int nrounds = nsamples + nsamples_drop;
	double acc = 0;
	int round;

	for (round = 0; round < nrounds; round++) {
		double t_begin, t_end, local, global;

		MPI_Barrier(MPI_COMM_WORLD);
		MPI_Win_lock_all(0, win);

		/* Set parameter based on current IPC and frequency */
		ndelay_init(0);

		t_begin = mytime();
		rma(rank, nproc, win, origin_buf, result, szbuf, nsec_calc, async_progress, sync_progress, pct_calc);
		t_end = mytime();

		MPI_Win_unlock_all(win);
		MPI_Barrier(MPI_COMM_WORLD);

		local = t_end - t_begin;
		MPI_Allreduce(&local, &global, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);

		/* Only runs after the dropped ones contribute to the average */
		if (round >= nsamples_drop) {
			acc += global;
		}
	}

	return acc / nsamples;
}
int main(int argc, char **argv)
{
int ret;
int actual;
int rank = -1;
int nproc;
int i, j, progress, l, m;
double *target_buf, *origin_buf, *result;
MPI_Win win;
double t_comm_l, t_comm_g, t_comm_sum, t_comm_ave;
double t_total_l, t_total_g, t_total_sum, t_total_ave;
double t_table[NROW][NCOL];
int opt;
int szbuf = 1; /* Number of doubles to send */
struct rusage ru_start, ru_end;
struct timeval tv_start, tv_end;
int disable_syscall_intercept = 0;
cpu_set_t cpuset;
//test_set_loglevel(TEST_LOGLEVEL_WARN);
ndelay_init(1);
while ((opt = getopt(argc, argv, "+p:I:")) != -1) {
switch (opt) {
case 'p':
ppn = atoi(optarg);
break;
case 'I':
disable_syscall_intercept = atoi(optarg);
break;
default: /* '?' */
printf("unknown option %c\n", optopt);
ret = -1;
goto out;
}
}
if (ppn == -1) {
pr_err("Error: Specify processes-per-rank with -p");
ret = -1;
goto out;
}
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
if (actual != MPI_THREAD_MULTIPLE) {
pr_err("Error: MPI_THREAD_MULTIPLE is not available\n");
ret = -1;
goto out;
}
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &nproc);
if (rank == 0) {
printf("ndoubles=%d,nproc=%d\n", szbuf, nproc);
#pragma omp parallel
{
//printf("%d cpu\n", sched_getcpu());
if (omp_get_thread_num() == 0) {
printf("#threads=%d\n", omp_get_num_threads());
}
}
}
/* accumulate-to buffer */
target_buf = malloc(sizeof(double) * szbuf);
if (!target_buf) {
pr_err("Error: allocating target_buf");
ret = -1;
goto out;
}
memset(target_buf, 0, sizeof(double) * szbuf);
/* read-from buffer */
origin_buf = malloc(sizeof(double) * szbuf);
if (!origin_buf) {
pr_err("Error: alloacting origin_buf");
ret = -1;
goto out;
}
memset(origin_buf, 0, sizeof(double) * szbuf);
/* fetch-to buffer */
result = malloc(sizeof(double) * szbuf);
if (!result) {
pr_err("Error: allocating result");
ret = -1;
goto out;
}
memset(result, 0, sizeof(double) * szbuf);
/* Expose accumulate-to buffer*/
ret = MPI_Win_create(target_buf, sizeof(double) * szbuf, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win);
if (ret != 0) {
pr_err("Error: MPI_Win_create returned %d\n", ret);
ret = -1;
goto out;
}
/* Measure RMA-only time */
init_buf(origin_buf, result, target_buf, szbuf, rank, 99);
t_comm_ave = measure(rank, nproc, win, origin_buf, result, target_buf, szbuf, 0, 0, 1, NSAMPLES_COMM, NSAMPLES_DROP, 0);
if (rank == 0) {
printf("t_comm_ave: %.0f %s\n", t_comm_ave * MYTIME_TOUSEC, MYTIME_UNIT);
}
#ifdef PROFILE
syscall(701, 1 | 2 | 0x80000000); /* syscall profile start */
#endif
/* 0: no progress, 1: progress, no uti, 2: progress, uti */
for (progress = 0; progress <= (disable_syscall_intercept ? 0 : 2); progress += 1) {
if (progress == 1) {
setenv("DISABLE_UTI", "1", 1); /* Don't use uti_attr and pin to Linux/McKernel CPUs */
progress_init();
} else if (progress == 2) {
progress_finalize();
unsetenv("DISABLE_UTI");
progress_init();
}
if (progress == 1 || progress == 2) {
#ifndef PROGRESS_CALC_PHASE_ONLY
//progress_start();
#endif
}
/* RMA-start, compute for T_{RMA} * l / 10, RMA-flush */
for (l = 0; l <= NROW - 1; l += 1) {
long nsec_calc = (t_comm_ave * MYTIME_TONSEC * l) / 10;
init_buf(origin_buf, result, target_buf, szbuf, rank, l);
//pr_buf(origin_buf, result, target_buf, szbuf, rank, nproc);
t_total_ave = measure(rank, nproc, win, origin_buf, result, target_buf, szbuf, nsec_calc, progress, 0, NSAMPLES_TOTAL, NSAMPLES_DROP, l / 10.0);
//pr_buf(origin_buf, result, target_buf, szbuf, rank, nproc);
if (rank == 0) {
if (l == 0) {
pr_debug("progress=%d\n", progress);
if (progress == 0) {
pr_debug("calc\ttotal\n");
} else {
pr_debug("total\n");
}
}
t_table[l][0] = nsec_calc * (MYTIME_TOUSEC / (double)MYTIME_TONSEC);
if (progress == 0) {
pr_debug("%.0f\t%.0f\n", nsec_calc * (MYTIME_TOUSEC / (double)MYTIME_TONSEC), t_total_ave * MYTIME_TOUSEC);
t_table[l][progress + 1] = t_total_ave * MYTIME_TOUSEC;
} else {
pr_debug("%.0f\n", t_total_ave * MYTIME_TOUSEC);
t_table[l][progress + 1] = t_total_ave * MYTIME_TOUSEC;
}
}
}
if (progress == 1 || progress == 2) {
#ifndef PROGRESS_CALC_PHASE_ONLY
//progress_stop();
#endif
}
}
#ifdef PROFILE
syscall(701, 4 | 8 | 0x80000000); /* syscall profile report */
#endif
if (rank == 0) {
printf("calc,no prog,prog and no uti, prog and uti\n");
for (l = 0; l <= NROW - 1; l++) {
for (i = 0; i < NCOL; i++) {
if (i > 0) {
printf(",");
}
printf("%.0f", t_table[l][i]);
}
printf("\n");
}
}
MPI_Barrier(MPI_COMM_WORLD);
if (progress >= 1) {
progress_finalize();
}
MPI_Finalize();
ret = 0;
out:
return ret;
}

272
test/uti/mpi/016.sh Executable file
View File

@@ -0,0 +1,272 @@
#!/usr/bin/bash
# To trace execution, use this interpreter line instead: #!/usr/bin/bash -x
#
# Driver for the test program that has the same base name as this script.
# It writes ./job.sh and, depending on the options, stops leftover processes,
# reboots McKernel, builds the program and runs or submits the job.

# Site-specific installation paths.
MYHOME=/home/e29005
test_dir=`pwd -P`
mck_dir=${MYHOME}/project/os/install
uti_dir_lin=${MYHOME}/project/uti/install_linux
uti_dir_mck=${MYHOME}/project/uti/install_mckernel
# Name of the executable: this script's file name without ".sh".
exe=`basename $0 | sed 's/\.sh//'`
# Option defaults (see the getopts loop below for the matching flags).
stop=0
reboot=0
go=0
interactive=0
pjsub=0
gdb=0
disable_syscall_intercept=0
mck=0
nnodes=2
LASTNODE=8196
use_hfi=0
omp_num_threads=32
ppn=4
# -s stop, -r reboot, -g go (build and run), -m use McKernel, -N <nodes>,
# -P <processes per node>, -o <OMP threads>, -h enable hfi1, -G run under gdb,
# -I <0|1> disable syscall intercept, -i interactive, -p submit with pjsub,
# -L <number of the last node>.
# NOTE(review): the option string also accepts "c:" and "l:", but there is no
# matching case branch, so -c/-l end up in the "invalid option" branch.
while getopts srgc:ml:N:P:o:hGI:ipL: OPT
do
case ${OPT} in
s) stop=1
;;
r) reboot=1
;;
g) go=1
;;
m) mck=1
;;
N) nnodes=$OPTARG
;;
P) ppn=$OPTARG
;;
o) omp_num_threads=$OPTARG
;;
h) use_hfi=1
;;
G) gdb=1
;;
I) disable_syscall_intercept=$OPTARG
;;
i) interactive=1
;;
p) pjsub=1
;;
L) LASTNODE=$OPTARG
;;
*) echo "invalid option -${OPT}" >&2
exit 1
esac
done
# Total number of MPI processes.
nprocs=$((ppn * nnodes))
# Comma-separated host list "c<N>,c<N+1>,..." of the $nnodes nodes that end at c$LASTNODE.
nodes=`echo $(seq -s ",c" $(($LASTNODE + 1 - $nnodes)) $LASTNODE) | sed 's/^/c/'`
# vertical cut, excluding phys loaded with Linux tasks
uti_cpu_set_lin=1,69,137,205,18-19,86-87,154-155,222-223
exclude_list=0-1,68-69,136-137,204-205,18-19,86-87,154-155,222-223
#64-67,132-135,200-203,268-271
uti_cpu_set_mck=1,69,137,205,18-19,86-87,154-155,222-223
# horizontal cut, excluding phys loaded with Linux tasks for mckernel
#uti_cpu_set_lin=204-271
#uti_cpu_set_mck=1-67
# The strings built below are complete "export ..." lines (or empty) that are
# pasted verbatim into the generated job.sh.
if [ $mck -eq 0 ]; then
uti_cpu_set_str="export UTI_CPU_SET=$uti_cpu_set_lin"
i_mpi_pin_processor_exclude_list="export I_MPI_PIN_PROCESSOR_EXCLUDE_LIST=$exclude_list"
else
uti_cpu_set_str="export UTI_CPU_SET=$uti_cpu_set_mck"
i_mpi_pin_processor_exclude_list=
fi
# Pinning: off for McKernel runs, Intel MPI domain pinning for Linux runs.
if [ ${mck} -eq 1 ]; then
i_mpi_pin=off
i_mpi_pin_domain=
i_mpi_pin_order=
# if [ $omp_num_threads -eq 1 ]; then
# # Avoid binding main thread and uti thread to one CPU
kmp_affinity="export KMP_AFFINITY=disabled"
# else
# # Bind rank to OMP_NUM_THREAD-sized CPU-domain
# kmp_affinity="export KMP_AFFINITY=granularity=thread,scatter"
# fi
else
i_mpi_pin=on
domain=$omp_num_threads # Use 32 when you want to match mck's -n division
i_mpi_pin_domain="export I_MPI_PIN_DOMAIN=$domain"
i_mpi_pin_order="export I_MPI_PIN_ORDER=compact"
kmp_affinity="export KMP_AFFINITY=granularity=thread,scatter"
fi
# NOTE: $domain is only assigned in the Linux branch above, so it prints empty with -m.
echo nprocs=$nprocs nnodes=$nnodes ppn=$ppn nodes=$nodes domain=$domain
# Choose the uti installation to build against and, for McKernel runs, the
# mcexec command line and the PJM directives pasted into job.sh.
if [ ${mck} -eq 1 ]; then
	makeopt="UTI_DIR=$uti_dir_mck"
	use_mck="#PJM -x MCK=$mck_dir"
	mck_mem="#PJM -x MCK_MEM=32G@0,8G@1"
	mcexec="${mck_dir}/bin/mcexec"
	nmcexecthr=$((omp_num_threads + 4))
	mcexecopt="-n $ppn --uti-use-last-cpu" # -t $nmcexecthr
	if [ ${use_hfi} -eq 1 ]; then
		mcexecopt="--enable-hfi1 $mcexecopt"
	fi
	if [ $disable_syscall_intercept -eq 0 ]; then
		mcexecopt="--enable-uti $mcexecopt"
	fi
else
	# A Linux run needs every CPU online; abort if lscpu reports offline CPUs.
	offline=`PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes lscpu \| grep Off 2>&1 | dshbak -c | grep Off`
	if [ "$offline" != "" ]; then
		# Report on stderr and exit non-zero so callers can detect the failure
		# (a bare "exit" would return the status of the echo, i.e. 0).
		echo "Error: Some CPUs are offline: $offline" >&2
		exit 1
	fi
	makeopt="UTI_DIR=$uti_dir_lin"
	use_mck=
	mck_mem=
	mcexec=
	mcexecopt=
fi
# With -G every rank is started inside gdb in its own xterm.
# enable_x and gdbcmd stay unset (and expand to nothing) otherwise.
if [ $gdb -eq 1 ]; then
enable_x="-enable-x"
gdbcmd="xterm -display localhost:11 -hold -e gdb -ex run --args"
fi
# Interactive mode (-i): no ssh bootstrap and no explicit host list.
# Otherwise the job is launched over ssh from the last node, c$LASTNODE.
if [ $interactive -eq 1 ]; then
i_mpi_hydra_bootstrap_exec=
i_mpi_hydra_bootstrap=
hosts=
opt_dir=/opt/intel
ssh=
else
# PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes bash -c \'if \[ \"\`cat /etc/mtab \| while read line\; do cut -d\" \" -f 2\; done \| grep /work\`\" == \"\" \]\; then sudo mount /work\; fi\'
i_mpi_hydra_bootstrap_exec="export I_MPI_HYDRA_BOOTSTRAP_EXEC=/usr/bin/ssh"
i_mpi_hydra_bootstrap="export I_MPI_HYDRA_BOOTSTRAP=ssh"
hosts="-hosts $nodes"
opt_dir=/home/opt/local/cores/intel
ssh="ssh -A c$LASTNODE"
fi
# If using ssh
# Latest versions are: 1.163, 2.199, 3.222
# The compiler environment is sourced inside job.sh only for the ssh launch path
# (neither pjsub nor interactive).
if [ $pjsub -eq 0 ] && [ $interactive -eq 0 ]; then
compilervars=". ${opt_dir}/compilers_and_libraries_2018.2.199/linux/bin/compilervars.sh intel64"
else
compilervars=
fi
# -s: kill leftover processes on every node; for McKernel runs also kill
# mcexec first and shut McKernel down afterwards.
if [ ${stop} -eq 1 ]; then
	if [ ${mck} -eq 1 ]; then
		PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
			/usr/sbin/pidof mcexec \| xargs -r sudo kill -9
	fi

	# The test executable is killed in both the Linux and the McKernel case.
	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
		/usr/sbin/pidof $exe \| xargs -r sudo kill -9

	if [ ${mck} -eq 1 ]; then
		PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
			sudo ${mck_dir}/sbin/mcstop+release.sh
	fi
fi
# -r: (re)boot McKernel on every node. Only hosts whose name contains "ofp"
# are supported; nothing is done for Linux runs.
if [ ${reboot} -eq 1 ]; then
if [ ${mck} -eq 1 ]; then
if hostname | grep ofp &>/dev/null; then
# -h: Hide idle thread to prevent KNL CPU from mux-ing resource and halving throughput
# -c lists the CPUs handed to McKernel, -r is a '+'-separated list of
# "<cpu list>:<cpu>" groups and -m gives memory per NUMA node
# (exact option semantics: see mcreboot.sh -- TODO confirm).
PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
sudo ${mck_dir}/sbin/mcreboot.sh -h -O -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
# Alternative layout (disabled). The -r argument is the output of this one-liner:
# perl -e 'for ($i=0;$i<68;$i++){if($i>0){print "+";}printf("%d,%d,%d:%d", $i+68,$i+136,$i+204,$i);}'
# PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
# sudo ${mck_dir}/sbin/mcreboot.sh -O -c 68-271 -r <output of the perl one-liner above> -m 32G@0,12G@1
else
# NOTE(review): the message below misspells "unknown".
echo "unkwon host type"
exit 1
fi
else
:
fi
fi
# Generate ./job.sh. Everything between "cat <<EOF" and "EOF" is written to the
# file after shell variable expansion, so nothing may be added inside it
# without changing the generated job.
# NOTE(review): $rg, $elapse and $ilpopt are not assigned anywhere in this
# script, so they expand to nothing unless they come from the environment.
# NOTE(review): PSM2_RCVTHREAD is exported twice with the same value.
(
cat <<EOF
#!/bin/sh
#PJM -L rscgrp=$rg
#PJM -L node=$nnodes
#PJM --mpi proc=$nprocs
#PJM -L elapse=$elapse
#PJM -L proc-crproc=16384
#PJM -g gg10
#PJM -j
#PJM -s
$use_mck
$mck_mem
$i_mpi_hydra_bootstrap_exec
$i_mpi_hydra_bootstrap
export OMP_NUM_THREADS=$omp_num_threads
#export OMP_STACKSIZE=64M
export KMP_BLOCKTIME=1
export PSM2_RCVTHREAD=0
$uti_cpu_set_str
export I_MPI_PIN=$i_mpi_pin
$i_mpi_pin_processor_exclude_list
$i_mpi_pin_domain
$i_mpi_pin_order
$kmp_affinity
export HFI_NO_CPUAFFINITY=1
export I_MPI_COLL_INTRANODE_SHM_THRESHOLD=4194304
export I_MPI_FABRICS=shm:tmi
export PSM2_RCVTHREAD=0
export I_MPI_TMI_PROVIDER=psm2
export I_MPI_FALLBACK=0
export PSM2_MQ_RNDV_HFI_WINDOW=4194304
export PSM2_MQ_EAGER_SDMA_SZ=65536
export PSM2_MQ_RNDV_HFI_THRESH=200000
export MCKERNEL_RLIMIT_STACK=32M,16G
export KMP_STACKSIZE=64m
#export KMP_HW_SUBSET=64c,1t
export I_MPI_ASYNC_PROGRESS=off
#export I_MPI_STATS=native:20,ipm
#export I_MPI_STATS=ipm
#export I_MPI_DEBUG=4
#export I_MPI_HYDRA_DEBUG=on
ulimit -c unlimited
$compilervars
mpiexec.hydra -n $nprocs -ppn $ppn $hosts $ilpopt $enable_x $gdbcmd $mcexec $mcexecopt ${test_dir}/$exe -p $ppn -I $disable_syscall_intercept
#-l
EOF
) > ./job.sh
# The job script is executed directly (possibly over ssh), so make it executable.
chmod u+x ./job.sh
# -g: either submit job.sh with pjsub, or rebuild the test program and run
# job.sh directly (through ssh unless interactive).
if [ ${go} -eq 1 ]; then
	if [ $pjsub -eq 1 ]; then
		pjsub ./job.sh
	else
		if [ $interactive -eq 0 ]; then
			. ${opt_dir}/compilers_and_libraries_2018.2.199/linux/bin/compilervars.sh intel64
		fi
		# Force a rebuild; -f keeps a clean tree from producing an error.
		rm -f ./$exe
		# Do not launch the job if the build failed.
		make $makeopt ./$exe || exit 1
		# Make sure no instance from a previous run is still alive on the nodes.
		PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
			/usr/sbin/pidof $exe \| xargs -r sudo kill -9
		$ssh ${test_dir}/job.sh
	fi
fi

56
test/uti/mpi/Makefile Executable file
View File

@@ -0,0 +1,56 @@
.SUFFIXES: # Clear suffixes

# "all" and "clean" are commands, not files: keep them working even if a file
# with one of these names appears in the directory.
.PHONY: all clean

MYHOME=/home/e29005
# Specify it via 016.sh
UTI_DIR=${MYHOME}/project/uti/install_linux

CC=mpiicc
LD=$(CC)
CFLAGS = -g -O0 -Wall
LDFLAGS = -lpthread -lpsm2 -L$(UTI_DIR)/lib -Wl,-rpath -Wl,$(UTI_DIR)/lib -luti

# Every .c file in this directory is built into an executable of the same name.
SRCS = $(shell ls *.c)
OBJS = $(SRCS:.c=.o)
EXES = $(SRCS:.c=)
TMPFILES = $(shell ls psm2-demo-* 2>/dev/null)

all: $(EXES) file

# Remove leftover psm2-demo-* files and create a 1 MiB zero-filled ./file.
file: $(TMPFILES)
	rm -f $(TMPFILES)
	dd if=/dev/zero of=./file bs=1M count=1

# Objects that need non-default flags (uti headers, OpenMP).
async_progress.o:: async_progress.c util.h
	$(CC) $(CFLAGS) -I$(UTI_DIR)/include -c $<

util.o:: util.c util.h
	$(CC) $(CFLAGS) -qopenmp -c $<

# Executables that link extra objects and/or need OpenMP.
014: 014.o async_progress.o util.o
	$(LD) -o $@ $^ $(LDFLAGS)

015: 015.o async_progress.o
	$(LD) -o $@ $^ $(LDFLAGS)

016: 016.o async_progress.o util.o
	$(LD) -o $@ $^ $(LDFLAGS) -qopenmp

016.o::016.c
	$(CC) $(CFLAGS) -qopenmp -c $<

011: 011.o
	$(LD) -o $@ $^ $(LDFLAGS) -qopenmp

011.o::011.c
	$(CC) $(CFLAGS) -qopenmp -c $<

# Default rules for all other tests.
%: %.o
	$(LD) -o $@ $^ $(LDFLAGS)

%.o::%.c
	$(CC) $(CFLAGS) -c $<

# NOTE(review): DSRCS is not defined in this Makefile and expands to nothing.
clean:
	rm -f core $(EXES) $(OBJS) $(DSRCS)

25
test/uti/mpi/README Normal file
View File

@@ -0,0 +1,25 @@
001 isend 送受信に使用するバッファは毎回異なる
002 barrier
003 isend 送受信に使用するバッファは一つ、waitの前にsleepしない
004 isend-calc-wait, all-to-all
005 lockall-accumulate-calc-unlockall, all-to-all
006 parent isend-calc-wait, child does nothing --> crash
007 parent isend-calc-wait, child psm2 send/recv --> one ep per process
008 parent psm2-init and psm2-connect, child psm2-send/recv --> receiver side crash
009 parent does nothing, child psm2-init, psm2-connect, psm2-send/recv --> receiver side crash
010 parent psm2-init, psm2-connect, psm2-send/recv, child does nothing
011 001にopenmpスレッドを追加
012 get_acc-calc-flush_local_all, all-to-all. Execute ./012.sh
013 acc-flush_local-calc, all-to-all, acc:flush_local=1:1
014 012 + async progress thread.
015 013 + async progress thread
016 MPI_Get_accumulate()のオーバーラップ
* 通信パターンは全対全、
* CPUはいくつかをprogress thread専用に割く
* ステップは以下の通り
(1) MPI_Get_accumulate()
(2) MPI_Get_accumulate()とMPI_Win_flush_local_all()だけを行った場合の
  時間の0.i倍の計算を実行
(3) MPI_Win_flush_local_all()

View File

@@ -0,0 +1,530 @@
#define _GNU_SOURCE /* See feature_test_macros(7) */
#include <unistd.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
#include <sched.h>
#include <pthread.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <uti.h>
#include "util.h"
#include "async_progress.h"
//#define PROFILE
#define STOP_BY_MPI 0
#define STOP_BY_MEM 1
#define STOP_TYPE STOP_BY_MEM/*STOP_BY_MPI*/
#define POLL_BY_PROBE 0
#define POLL_BY_WAIT 1
#define POLL_BY_TEST 2
#define POLL_TYPE POLL_BY_PROBE/*POLL_BY_WAIT*/
static int progress_rank, progress_world_rank, progress_world_nproc;
static pthread_t progress_thr;
static pthread_mutex_t progress_mutex;
static pthread_cond_t progress_cond_down;
static volatile int progress_flag_up, progress_flag_down;
static enum progress_state progress_state;
static int progress_stop_flag;
static MPI_Comm progress_comm;
static int progress_refc;
#define WAKE_TAG 100
#define NROW_STAT 10
#define NRANK_STAT 1

/*
 * Add the interval (end - start) to array[count] and advance count, as long
 * as fewer than NROW_STAT samples have been recorded.
 * count must be a modifiable lvalue; it is evaluated more than once.
 * All arguments are parenthesized so that expressions such as "a + b" can be
 * passed for end/start without changing the result.
 */
#define RECORD_STAT(count, array, end, start) do { \
	if ((count) < NROW_STAT) { \
		(array)[(count)++] += ((end) - (start)); \
	} \
} while(0)
static int cyc_prog1_count, cyc_prog2_count, cyc_init1_count, cyc_init2_count, cyc_start_count, cyc_stop1_count, cyc_stop2_count, cyc_stop3_count, cyc_finalize_count;
static unsigned long cyc_prog1[NROW_STAT];
static unsigned long cyc_prog2[NROW_STAT];
static unsigned long cyc_init1[NROW_STAT];
static unsigned long cyc_init2[NROW_STAT];
static unsigned long cyc_start[NROW_STAT];
static unsigned long cyc_stop1[NROW_STAT];
static unsigned long cyc_stop2[NROW_STAT];
static unsigned long cyc_stop3[NROW_STAT];
static unsigned long cyc_finalize[NROW_STAT];
#define MIN2(x,y) ((x) < (y) ? (x) : (y))

/*
 * Print one comma-separated line of recorded samples at debug level,
 * prefixed with the world rank and the series name.
 *
 * name:  label of the series
 * count: number of samples recorded; at most NROW_STAT entries are printed
 * array: sample storage holding at least MIN2(count, NROW_STAT) entries
 */
void pr_stat(const char *name, int count, unsigned long *array) {
	int i;
	int nprint = MIN2(count, NROW_STAT);

	pr_debug("[%d] %s: ", progress_world_rank, name);
	for (i = 0; i < nprint; i++) {
		if (i > 0) {
			pr_debug(",");
		}
		/* The samples are unsigned long, so print them with %lu. */
		pr_debug("%lu", array[i]);
	}
	pr_debug("\n");
}
/*
 * Body of the progress thread created by progress_init().
 *
 * The thread loops between two phases:
 *  1. Sleep on progress_cond_down until progress_flag_down is raised, then
 *     read progress_state: PROGRESS_FINALIZE (or an unexpected state) leaves
 *     the loop, PROGRESS_START enters the polling phase.
 *  2. Poll MPI until it is told to stop, then set progress_state back to
 *     PROGRESS_INIT and raise progress_flag_up, which do_progress_stop()
 *     spins on.
 *
 * How the thread polls and how it is stopped is selected at compile time by
 * STOP_TYPE and POLL_TYPE.
 *
 * data is unused; the function always returns NULL.
 */
static void *progress_fn(void* data)
{
int ret;
MPI_Request req;
struct rusage ru_start, ru_end;
struct timeval tv_start, tv_end;
unsigned long start, end;
#if 0
ret = syscall(732);
if (ret == -1) {
pr_debug("Progress is running on Linux\n");
} else {
pr_debug("Progress is running on McKernel\n");
}
if ((ret = getrusage(RUSAGE_THREAD, &ru_start))) {
pr_err("%s: error: getrusage failed (%d)\n", __func__, ret);
}
if ((ret = gettimeofday(&tv_start, NULL))) {
pr_err("%s: error: gettimeofday failed (%d)\n", __func__, ret);
}
#endif
#if STOP_TYPE == STOP_BY_MEM && POLL_TYPE == POLL_BY_TEST
/* A receive that is posted once and only tested, to drive MPI progress */
if ((ret = MPI_Irecv(NULL, 0, MPI_CHAR, progress_rank, WAKE_TAG, progress_comm, &req)) != MPI_SUCCESS) {
pr_err("%s: error: MPI_Irecv: %d\n", __func__, ret);
}
#endif
init:
#ifdef PROFILE
start = rdtsc_light();
#endif
/* Wait for state transition */
pthread_mutex_lock(&progress_mutex);
while (!progress_flag_down) {
pthread_cond_wait(&progress_cond_down, &progress_mutex);
}
/* Consume the request before acting on it */
progress_flag_down = 0;
if (progress_state == PROGRESS_FINALIZE) {
pthread_mutex_unlock(&progress_mutex);
goto finalize;
}
if (progress_state != PROGRESS_START) {
pr_err("%s: error: unexpected state: %d\n", __func__, progress_state);
pthread_mutex_unlock(&progress_mutex);
goto finalize;
}
pthread_mutex_unlock(&progress_mutex);
#ifdef PROFILE
end = rdtsc_light();
RECORD_STAT(cyc_prog1_count, cyc_prog1, end, start);
#endif
//if (progress_world_rank < 2) pr_debug("[%d] poll,cpu=%d\n", progress_world_rank, sched_getcpu());
#ifdef PROFILE
start = rdtsc_light();
#endif
#if STOP_TYPE == STOP_BY_MEM
#if POLL_TYPE == POLL_BY_PROBE
/* Keep calling into MPI until do_progress_stop() sets progress_stop_flag */
int completed = 0;
while (!progress_stop_flag) {
if ((ret = MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &completed, MPI_STATUS_IGNORE)) != MPI_SUCCESS) {
pr_err("%s: error: MPI_Iprobe: %d\n", __func__, ret);
break;
}
//usleep(1);
}
#elif POLL_TYPE == POLL_BY_TEST
int completed = 0;
while (!completed && !progress_stop_flag) {
/* NOTE(review): the message below names MPI_Iprobe although the failing call is MPI_Test */
if ((ret = MPI_Test(&req, &completed, MPI_STATUS_IGNORE)) != MPI_SUCCESS) {
pr_err("%s: error: MPI_Iprobe: %d\n", __func__, ret);
break;
}
//usleep(1);
}
#endif /* POLL_TYPE */
#elif STOP_TYPE == STOP_BY_MPI
#if POLL_TYPE == POLL_BY_WAIT
/* Block until do_progress_stop() sends the zero-byte WAKE_TAG message */
if ((ret = MPI_Irecv(NULL, 0, MPI_CHAR, progress_rank, WAKE_TAG, progress_comm, &req)) != MPI_SUCCESS) {
pr_err("%s: error: MPI_Irecv: %d\n", __func__, ret);
}
if ((ret = MPI_Wait(&req, MPI_STATUS_IGNORE)) != MPI_SUCCESS) {
pr_err("%s: error: MPI_Wait failed (%d)\n", __func__, ret);
}
#elif POLL_TYPE == POLL_BY_PROBE
int completed = 0;
while (!completed) {
if ((ret = MPI_Iprobe(progress_rank, WAKE_TAG, progress_comm, &completed, MPI_STATUS_IGNORE)) != MPI_SUCCESS) {
pr_err("%s: error: MPI_Iprobe: %d\n", __func__, ret);
break;
}
usleep(1);
}
/* Consume the wake-up message that the probe detected */
/* NOTE(review): the message below names MPI_Irecv although the failing call is MPI_Recv */
if ((ret = MPI_Recv(NULL, 0, MPI_CHAR, progress_rank, WAKE_TAG, progress_comm, MPI_STATUS_IGNORE)) != MPI_SUCCESS) {
pr_err("%s: error: MPI_Irecv: %d\n", __func__, ret);
}
#endif /* POLL_TYPE */
#endif /* STOP_TYPE */
/* Publish the new state before raising the flag the stopping thread spins on */
progress_state = PROGRESS_INIT;
__sync_synchronize(); /* st-st barrier */
progress_flag_up = 1;
#ifdef PROFILE
end = rdtsc_light();
RECORD_STAT(cyc_prog2_count, cyc_prog2, end, start);
#endif
goto init;
finalize:
/* ru_end/tv_end are only consumed by the disabled report below */
if ((ret = getrusage(RUSAGE_THREAD, &ru_end))) {
pr_err("%s: error: getrusage failed (%d)\n", __func__, ret);
}
if ((ret = gettimeofday(&tv_end, NULL))) {
pr_err("%s: error: gettimeofday failed (%d)\n", __func__, ret);
}
#if 0
pr_debug("%s: wall: %ld, user: %ld, sys: %ld\n", __func__,
DIFFUSEC(tv_end, tv_start),
DIFFUSEC(ru_end.ru_utime, ru_start.ru_utime),
DIFFUSEC(ru_end.ru_stime, ru_start.ru_stime));
#endif
/* Tell progress_finalize(), which spins on progress_flag_up, that the thread is leaving */
progress_state = PROGRESS_INIT;
__sync_synchronize(); /* st-st barrier */
progress_flag_up = 1;
return NULL;
}
/*
 * Create the progress thread and the objects it needs (a duplicate of
 * MPI_COMM_SELF, the mutex and the condition variable).
 *
 * The call is a no-op when the module is already initialized
 * (progress_refc == 1). On failure everything created so far is released and
 * progress_refc is dropped back to 0 so that a later call can retry.
 */
void progress_init()
{
	int ret = 0;
	int comm_created = 0;
	int mutex_inited = 0;
	int cond_inited = 0;
	int attr_inited = 0;
	pthread_attr_t pthread_attr;
	uti_attr_t uti_attr;
#ifdef PROFILE
	unsigned long start, end;

	start = rdtsc_light();
#endif

	MPI_Comm_rank(MPI_COMM_WORLD, &progress_world_rank);
	MPI_Comm_size(MPI_COMM_WORLD, &progress_world_nproc);

	/* Only the caller that moves the count from 0 to 1 performs the setup */
	if (__sync_val_compare_and_swap(&progress_refc, 0, 1) == 1) {
		return;
	}

	/* printf costs much in MPI */
	uti_set_loglevel(UTI_LOGLEVEL_ERR);

	if ((ret = MPI_Comm_dup(MPI_COMM_SELF, &progress_comm))) {
		pr_err("%s: error: MPI_Comm_dup failed (%d)\n", __func__, ret);
		goto out;
	}
	comm_created = 1;

	MPI_Comm_rank(progress_comm, &progress_rank);

	if ((ret = pthread_mutex_init(&progress_mutex, NULL))) {
		pr_err("%s: error: pthread_mutex_init failed (%d)\n", __func__, ret);
		goto out;
	}
	mutex_inited = 1;

	if ((ret = pthread_cond_init(&progress_cond_down, NULL))) {
		pr_err("%s: error: pthread_cond_init failed (%d)\n", __func__, ret);
		goto out;
	}
	cond_inited = 1;

	if ((ret = pthread_attr_init(&pthread_attr))) {
		pr_err("%s: error: pthread_attr_init failed (%d)\n", __func__, ret);
		goto out;
	}
	attr_inited = 1;

	if ((ret = uti_attr_init(&uti_attr))) {
		pr_err("%s: error: uti_attr_init failed (%d)\n", __func__, ret);
		goto out;
	}

#if 0
	if ((ret = UTI_ATTR_SAME_L1(&uti_attr))) {
		pr_err("%s: error: UTI_ATTR_SAME_L1 failed\n", __func__);
	}
#endif

#if 1 /* Expecting round-robin binding */
	/* A failure here is only reported; thread creation is still attempted */
	if ((ret = UTI_ATTR_CPU_INTENSIVE(&uti_attr))) {
		pr_err("%s: error: UTI_ATTR_CPU_INTENSIVE failed\n", __func__);
	}
#endif

#ifdef PROFILE
	end = rdtsc_light();
	RECORD_STAT(cyc_init1_count, cyc_init1, end, start);
#endif

#ifdef PROFILE
	start = rdtsc_light();
#endif

	if ((ret = uti_pthread_create(&progress_thr, &pthread_attr, progress_fn, NULL, &uti_attr))) {
		pr_err("%s: error: uti_pthread_create failed (%d)\n", __func__, ret);
		goto out;
	}

	ret = 0;
out:
	/* The attribute object is not needed once thread creation was attempted */
	if (attr_inited) {
		pthread_attr_destroy(&pthread_attr);
	}

	if (ret) {
		/* Undo the partial setup so that a retry starts from a clean state */
		if (cond_inited) {
			pthread_cond_destroy(&progress_cond_down);
		}
		if (mutex_inited) {
			pthread_mutex_destroy(&progress_mutex);
		}
		if (comm_created) {
			MPI_Comm_free(&progress_comm);
		}
		__sync_fetch_and_sub(&progress_refc, 1);
	}

#ifdef PROFILE
	end = rdtsc_light();
	RECORD_STAT(cyc_init2_count, cyc_init2, end, start);
#endif
}
/*
 * Ask the progress thread to start polling.
 *
 * Initializes the module first if that has not happened yet. The request is
 * only issued from PROGRESS_INIT: it is ignored in PROGRESS_START, ignored
 * with a warning in PROGRESS_FINALIZE, and any other state is reported as an
 * error.
 */
void progress_start()
{
	unsigned long t_begin, t_end;

	if (progress_refc == 0) {
		progress_init();
	}

#ifdef PROFILE
	t_begin = rdtsc_light();
#endif

	pthread_mutex_lock(&progress_mutex);

	switch (progress_state) {
	case PROGRESS_FINALIZE:
		pr_warn("%s: warning: FINALIZE\n", __func__);
		pthread_mutex_unlock(&progress_mutex);
		return;
	case PROGRESS_START:
		/* Already polling, nothing to do */
		//pr_warn("%s: warning: START\n", __func__);
		pthread_mutex_unlock(&progress_mutex);
		return;
	case PROGRESS_INIT:
		break;
	default:
		pr_err("%s: error: unexpected state: %d\n", __func__, progress_state);
		pthread_mutex_unlock(&progress_mutex);
		return;
	}

	progress_state = PROGRESS_START;
#if STOP_TYPE == STOP_BY_MEM
	/* Re-arm the stop flag that the polling loop watches */
	progress_stop_flag = 0;
#endif
	__sync_synchronize(); /* memory barrier instruction */

	/* Wake the progress thread sleeping on the condition variable */
	progress_flag_down = 1;
	pthread_cond_signal(&progress_cond_down);
	pthread_mutex_unlock(&progress_mutex);

#ifdef PROFILE
	t_end = rdtsc_light();
	RECORD_STAT(cyc_start_count, cyc_start, t_end, t_begin);
#endif
}
/*
 * Tell the polling progress thread to stop and wait until it acknowledges.
 *
 * The stop request is either a store to progress_stop_flag (STOP_BY_MEM) or
 * a zero-byte WAKE_TAG message on progress_comm (STOP_BY_MPI). The caller
 * then busy-waits on progress_flag_up, which the progress thread raises after
 * it has set the state back to PROGRESS_INIT.
 */
void do_progress_stop()
{
	int ret;
	unsigned long t_begin, t_end;

	//if (progress_world_rank < 2) pr_debug("[%d] stop,cpu=%d\n", progress_world_rank, sched_getcpu());

#ifdef PROFILE
	t_begin = rdtsc_light();
#endif

#if STOP_TYPE == STOP_BY_MEM
	progress_stop_flag = 1;
	__sync_synchronize(); /* st-st barrier */
#elif STOP_TYPE == STOP_BY_MPI
	ret = MPI_Send(NULL, 0, MPI_CHAR, progress_rank, WAKE_TAG, progress_comm);
	if (ret != MPI_SUCCESS) {
		pr_err("%s: error: MPI_Send failed (%d)\n", __func__, ret);
		return;
	}
#endif /* STOP_TYPE */

#ifdef PROFILE
	t_end = rdtsc_light();
	RECORD_STAT(cyc_stop2_count, cyc_stop2, t_end, t_begin);
	t_begin = rdtsc_light();
#endif

	/* Make sure the following command will observe INIT */
	for (;;) {
		if (progress_flag_up) {
			break;
		}
	}
	progress_flag_up = 0;

#ifdef PROFILE
	t_end = rdtsc_light();
	RECORD_STAT(cyc_stop3_count, cyc_stop3, t_end, t_begin);
#endif
}
/*
 * Stop the progress thread if it is currently polling.
 *
 * Does nothing when the module is not initialized or when the state is
 * PROGRESS_INIT or PROGRESS_FINALIZE; any state other than PROGRESS_START is
 * reported as an error.
 */
void progress_stop()
{
	unsigned long t_begin, t_end;

#ifdef PROFILE
	t_begin = rdtsc_light();
#endif

	if (progress_refc == 0) {
		return;
	}

	pthread_mutex_lock(&progress_mutex);

	switch (progress_state) {
	case PROGRESS_INIT:
	case PROGRESS_FINALIZE:
		/* Not polling: nothing to stop */
		pthread_mutex_unlock(&progress_mutex);
		return;
	case PROGRESS_START:
		break;
	default:
		pr_err("%s: error: unexpected state: %d\n", __func__, progress_state);
		pthread_mutex_unlock(&progress_mutex);
		return;
	}

	pthread_mutex_unlock(&progress_mutex);

#ifdef PROFILE
	t_end = rdtsc_light();
	RECORD_STAT(cyc_stop1_count, cyc_stop1, t_end, t_begin);
#endif

	do_progress_stop();
}
/*
 * Terminate the progress thread and release everything progress_init()
 * created, so that progress_init() may be called again afterwards.
 *
 * A polling thread is stopped first. The call does nothing when the module
 * is not initialized or already finalizing.
 */
void progress_finalize()
{
	int ret;
#ifdef PROFILE
	int j;
	unsigned long start, end;

	start = rdtsc_light();
#endif

	if (progress_refc == 0) {
		return;
	}

retry:
	pthread_mutex_lock(&progress_mutex);

	if (progress_state == PROGRESS_START) {
		/* Bring the thread back to INIT, then re-evaluate the state */
		pthread_mutex_unlock(&progress_mutex);
		do_progress_stop();
		goto retry;
	}

	if (progress_state == PROGRESS_FINALIZE) {
		pthread_mutex_unlock(&progress_mutex);
		return;
	}

	if (progress_state != PROGRESS_INIT) {
		pr_err("%s: error: unexpected state: %d\n", __func__, progress_state);
		pthread_mutex_unlock(&progress_mutex);
		return;
	}

	progress_state = PROGRESS_FINALIZE;
	__sync_synchronize(); /* st-st barrier */

	/* Wake the progress thread so that it sees FINALIZE and exits */
	progress_flag_down = 1;
	pthread_cond_signal(&progress_cond_down);
	pthread_mutex_unlock(&progress_mutex);

	/* Make sure the following command will observe INIT */
	while (!progress_flag_up) {
	}
	progress_flag_up = 0;

	if ((ret = pthread_join(progress_thr, NULL))) {
		pr_err("%s: error: pthread_join failed (%d)\n", __func__, ret);
	}

	/*
	 * progress_init() initializes the mutex and the condition variable
	 * again on the next use; POSIX leaves re-initializing a live object
	 * undefined, so destroy them now that the thread is gone.
	 */
	if ((ret = pthread_cond_destroy(&progress_cond_down))) {
		pr_err("%s: error: pthread_cond_destroy failed (%d)\n", __func__, ret);
	}
	if ((ret = pthread_mutex_destroy(&progress_mutex))) {
		pr_err("%s: error: pthread_mutex_destroy failed (%d)\n", __func__, ret);
	}

	if ((ret = MPI_Comm_free(&progress_comm)) != MPI_SUCCESS) {
		pr_err("%s: error: MPI_Comm_free failed (%d)\n", __func__, ret);
	}

	/*
	 * The thread has exited, so mark the module uninitialized even if the
	 * communicator could not be freed; otherwise a later progress_init()
	 * would return early without creating a new thread.
	 */
	progress_refc = 0;

#ifdef PROFILE
	end = rdtsc_light();
	RECORD_STAT(cyc_finalize_count, cyc_finalize, end, start);

	/* Ranks below NRANK_STAT print their statistics one after another */
	for (j = 0; j < NRANK_STAT; j++) {
		MPI_Barrier(MPI_COMM_WORLD);
		if (j != progress_world_rank) {
			usleep(1000000);
			continue;
		}
		pr_stat("cyc_prog1", cyc_prog1_count, cyc_prog1);
		pr_stat("cyc_prog2", cyc_prog2_count, cyc_prog2);
		pr_stat("cyc_init1", cyc_init1_count, cyc_init1);
		pr_stat("cyc_init2", cyc_init2_count, cyc_init2);
		pr_stat("cyc_start", cyc_start_count, cyc_start);
		pr_stat("cyc_stop1", cyc_stop1_count, cyc_stop1);
		pr_stat("cyc_stop2", cyc_stop2_count, cyc_stop2);
		pr_stat("cyc_stop3", cyc_stop3_count, cyc_stop3);
		pr_stat("cyc_finalize", cyc_finalize_count, cyc_finalize);
	}
#endif
}

View File

@@ -0,0 +1,15 @@
#ifndef _ASYNC_PROGRESS_INCLUDED_
#define _ASYNC_PROGRESS_INCLUDED_

/* State of the asynchronous progress thread. */
enum progress_state {
	PROGRESS_INIT = 0,	/* thread exists and is idle */
	PROGRESS_START,		/* thread has been asked to poll MPI */
	PROGRESS_FINALIZE	/* thread has been asked to exit */
};

/*
 * The functions take no arguments; "(void)" makes that a real prototype so
 * the compiler rejects calls that pass arguments.
 */

/* Create the progress thread; no-op if it already exists. */
void progress_init(void);

/* Make the progress thread start polling (initializes on first use). */
void progress_start(void);

/* Make a polling progress thread go back to idle. */
void progress_stop(void);

/* Stop the progress thread if needed, terminate it and release its resources. */
void progress_finalize(void);

#endif

17
test/uti/mpi/env_intel.sh Normal file
View File

@@ -0,0 +1,17 @@
# Environment for runs with Intel MPI.
# mpi_progress.pl copies the file named env_<mpi>.sh into the generated job
# script in place of the template's env placeholder line.

# Process launcher settings
export HYDRA_BOOTSTRAP_EXEC=/bin/pjrsh
export HYDRA_BOOTSTRAP=rsh
export HYDRA_PROXY_RETRY_COUNT=30
#export HYDRA_BRANCH_COUNT=4

# CPU binding
export I_MPI_PIN=off
export HFI_NO_CPUAFFINITY=1
export KMP_AFFINITY=granularity=thread,scatter

# Fabric selection and PSM2 settings
export I_MPI_COLL_INTRANODE_SHM_THRESHOLD=4194304
export I_MPI_FABRICS=shm:tmi
export PSM2_RCVTHREAD=0
export I_MPI_TMI_PROVIDER=psm2
export I_MPI_FALLBACK=0
export PSM2_MQ_RNDV_HFI_WINDOW=4194304
export PSM2_MQ_EAGER_SDMA_SZ=65536
export PSM2_MQ_RNDV_HFI_THRESH=200000

View File

@@ -0,0 +1,5 @@
# Environment for runs with MPICH.
# NOTE(review): presumably this is env_mpich.sh, the counterpart that
# mpi_progress.pl inlines for the non-Intel case -- confirm the file name.

# Process launcher settings
export HYDRA_BOOTSTRAP_EXEC=/bin/pjrsh
export HYDRA_BOOTSTRAP=rsh
export HYDRA_PROXY_RETRY_COUNT=30
# Use the psm2 provider
export MPIR_CVAR_OFI_USE_PROVIDER=psm2

22
test/uti/mpi/filter.pl Executable file
View File

@@ -0,0 +1,22 @@
#!/usr/bin/perl
# Counts, per first word following the progress thread's tid, how often lines
# starting with that tid occur in the input.
# The tid is taken from a line containing "progress_fn,enter,tid=<n>".
#
# NOTE(review): this script looks unfinished:
#  - open() is called without a file handle or a path,
#  - both loops read from the same <> stream, so the inner loop consumes all
#    remaining input during the first outer iteration,
#  - $hostname is never assigned,
#  - the final loop prints $freq{$key}, which is a hash reference because the
#    counters are stored in $freq{word}{hostname}.
# Confirm the intended per-host input before relying on the output.
while(<>) { # For each line of hostfile
open();
$found = 0;
while(<>) {
# Remember the tid announced by the progress thread
if($_ =~ /progress_fn,enter,tid=(\d+)/) {
$tid = $1;
$found = 1;
# print 'tid='.$tid."\n"
}
# Count lines that start with that tid, keyed by the word following it
if($found == 1 && $_ =~ /^$tid/) {
if($_ =~ /^$tid\s(\w+)/) {
# print $1."\n";
$freq{$1}{$hostname}++;
}
}
}
}
# One "<word>,<value>" line per distinct word, in sorted order
foreach $key (sort(keys(%freq))) {
print $key.",".$freq{$key}."\n";
}

View File

@@ -0,0 +1,100 @@
#!/usr/bin/perl
# Usage ./mpi_progress.pl <#procs> <#nnodes> (mck|lin) (mpich|intel)
#
# Builds the tests, creates a time-stamped work directory, instantiates the
# job template "<script name>.sh.in" as job.sh inside it and submits the job
# with pjsub.

use File::Basename;
use File::Copy "cp";

# All four arguments are required; $nnodes is used as a divisor below.
die "Usage: $0 <#procs> <#nnodes> (mck|lin) (mpich|intel)\n"
    unless @ARGV == 4 && $ARGV[1] > 0;

($nprocs, $nnodes, $os, $mpi) = @ARGV;
$ppn = $nprocs / $nnodes;

# The template file name is derived from this script's name.
@command = split /\s+/, basename($0);
@fn = split /\./, $command[0];

# Resource group by job size
if($nnodes <= 16) {
    $rg = 'MCK-FLAT-QUADRANT';
} elsif($nnodes <= 128) {
    $rg = 'debug-flat';
} else {
    $rg = 'regular-flat';
}

# Elapsed-time limit by number of nodes
%elapse = (
    '1', '00:10:00',
    '2', '00:10:00',
    '4', '00:10:00',
    '8', '00:10:00',
    '16', '00:10:00',
    '32', '00:10:00',
    '64', '00:05:00',
    '128', '00:05:00',
    '256', '00:10:00',
    '512', '00:15:00',
    '1024', '00:15:00',
    '2048', '00:30:00',
    );

# Without an entry the generated job script would carry an empty limit.
die "no elapse limit defined for $nnodes nodes\n"
    unless exists $elapse{$nnodes};

if ($os eq 'lin') {
    $use_mck = '';
    $mck_mem = '';
    $mcexec = '';
    $mcexecopt = '';
} else {
    $path_to_mck = '/work/gg10/e29005/project/os/install';
    $use_mck = '#PJM -x MCK='.$path_to_mck;
    $mck_mem = '#PJM -x MCK_MEM=32G@0,8G@1';
    $mcexec = $path_to_mck.'/bin/mcexec';
    $mcexecopt = '-n '.$ppn;
}

if ($mpi eq 'intel') {
    $cc = 'mpiicc';
    $mpiexec = 'mpiexec';
    $genv = '';
    $progress = '-genv I_MPI_ASYNC_PROGRESS 1'; # -genv I_MPI_ASYNC_PROGRESS_PIN 1
} else {
    $mpi_lib = '/work/gg10/e29005/project/mpich/install';
    $cc = $mpi_lib.'/bin/mpicc';
    $mpiexec = $mpi_lib.'/bin/mpiexec';
    $genv = '-genv LD_LIBRARY_PATH '.$mpi_lib.'/lib:$LD_LIBRARY_PATH';
    $progress = '-genv MPIR_CVAR_ASYNC_PROGRESS 1';
}

system("make clean; make CC=$cc") == 0 or die "build failed\n";

# Work directory: <os>_<nprocs>_<nnodes>_<timestamp>
$dir=$os.'_'.$nprocs.'_'.$nnodes.'_'.`date +%Y%m%d_%H%M%S`;
chomp($dir);
print 'less '.$dir.'/job.sh.o*'."\n";
mkdir $dir or die "mkdir $dir failed: $!\n";
chdir $dir or die "chdir $dir failed: $!\n";

cp('../001', './001') or die 'copy failed';

# Instantiate the template: replace every @name@ placeholder, and replace the
# whole line containing @env@ with the contents of env_<mpi>.sh.
open(IN, "../$fn[0].sh.in") or die "cannot open ../$fn[0].sh.in: $!\n";
open(OUT, ">./job.sh") or die "cannot create ./job.sh: $!\n";
while(<IN>) {
    s/\@rg@/$rg/g;
    s/\@nnodes@/$nnodes/g;
    s/\@nprocs@/$nprocs/g;
    s/\@elapse@/$elapse{$nnodes}/g;
    s/\@use_mck@/$use_mck/g;
    s/\@mck_mem@/$mck_mem/g;
    s/\@progress@/$progress/g;
    s/\@genv@/$genv/g;
    s/\@mpiexec@/$mpiexec/g;
    s/\@mcexec@/$mcexec/g;
    s/\@mcexecopt@/$mcexecopt/g;
    if(/\@env@/) {
        open(INCL, "../env_$mpi.sh") or die "cannot open ../env_$mpi.sh: $!\n";
        while(my $line = <INCL>) {
            print OUT $line;
        }
        close(INCL);
        next;
    }
    print OUT $_;
}
close(IN);
close(OUT) or die "cannot write ./job.sh: $!\n";

$cmd = 'PJM_MCK_AVAILABLE=1 pjsub ./job.sh';
#print $cmd."\n";
exec($cmd);

View File

@@ -0,0 +1,16 @@
#!/bin/sh
#PJM -L rscgrp=@rg@
#PJM -L node=@nnodes@
#PJM --mpi proc=@nprocs@
#PJM -L elapse=@elapse@
#PJM -L proc-crproc=16384
#PJM -g gg10
#PJM -j
#PJM -s
@use_mck@
@mck_mem@
@env@
# Launch line. The arguments of ./001 are fixed here; every placeholder in
# this file is filled in by mpi_progress.pl when it writes job.sh.
@mpiexec@ @genv@ @progress@ -np @nprocs@ -machinefile ${PJM_O_NODEINF} @mcexec@ @mcexecopt@ ./001 1048576 1000

186
test/uti/mpi/util.c Normal file
View File

@@ -0,0 +1,186 @@
#define _GNU_SOURCE /* See feature_test_macros(7) */
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdint.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
#include <sched.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/time.h> /* gettimeofday(), struct timeval */
#include <fcntl.h>
#include <string.h>
#include <time.h>
#include "util.h"
/* Messaging */
enum test_loglevel test_loglevel = TEST_LOGLEVEL_DEBUG;
/* Calculation */
/*
 * Busy-loop used as one unit of emulated computation.
 * Executes n rounds of a 100-iteration counting loop written in x86-64
 * assembly so that the compiler cannot optimize the work away.
 */
static inline void asmloop(unsigned long n) {
	/* Same type as n: an int counter would overflow for n > INT_MAX */
	unsigned long j;

	for (j = 0; j < n; j++) {
		__asm__ __volatile__(
			"movq $0, %%rcx\n\t"
			"1:\t"
			"addq $1, %%rcx\n\t"
			"cmpq $99, %%rcx\n\t"
			"jle 1b\n\t"
			:
			:
			: "rcx", "cc");
	}
}
#define N_INIT 10000000

double nspw; /* nsec per work */

/*
 * Calibrate ndelay(): measure the wall-clock time of asmloop(N_INIT) run
 * inside an OpenMP parallel region and store the nanoseconds per asmloop
 * round in nspw.
 *
 * verbose: when non-zero, print the calibration result at debug level
 */
void ndelay_init(int verbose) {
	struct timeval start, end;

	//clock_gettime(TIMER_KIND, &start);
	gettimeofday(&start, NULL);

#pragma omp parallel
	{
		asmloop(N_INIT);
	}

	//clock_gettime(TIMER_KIND, &end);
	gettimeofday(&end, NULL);

	nspw = DIFFUSEC(end, start) * 1000 / (double)N_INIT;
	if (verbose) {
		pr_debug("nspw=%f\n", nspw);
	}
}

#if 1
/*
 * Emulate a computation of about delay_nsec nanoseconds in every OpenMP
 * thread. Requires a prior successful ndelay_init().
 */
void ndelay(long delay_nsec) {
	if (delay_nsec < 0) {
		printf("delay_nsec < 0\n");
		return;
	}

	/*
	 * Without calibration nspw is zero: the division below would yield
	 * infinity, and converting that to the unsigned long argument of
	 * asmloop() is undefined behavior.
	 */
	if (!(nspw > 0.0)) {
		printf("ndelay: not calibrated, call ndelay_init() first\n");
		return;
	}

#pragma omp parallel
	{
		asmloop(delay_nsec / nspw);
	}
}
#else /* For machines with large core-to-core performance variation (e.g. OFP) */
void ndelay(long delay_nsec) {
	struct timespec start, end;

	if (delay_nsec < 0) { return; }
	clock_gettime(TIMER_KIND, &start);

	while (1) {
		clock_gettime(TIMER_KIND, &end);
		if (DIFFNSEC(end, start) >= delay_nsec) {
			break;
		}
		asmloop(2); /* ~150 ns per iteration on OFP */
	}
}
#endif
double cycpw; /* cyc per work */

#ifndef N_INIT
#define N_INIT 10000000
#endif

/*
 * Calibrate the cycle-based delay: measure how many TSC cycles one asmloop
 * round takes and store the result in cycpw.
 * This is the name declared in util.h.
 */
void cdelay_init() {
	unsigned long start, end;

	start = rdtsc_light();
	asmloop(N_INIT);
	end = rdtsc_light();
	cycpw = (end - start) / (double)N_INIT;
}

/*
 * Original, misspelled name of cdelay_init(); kept so that existing callers
 * continue to link.
 */
void cdlay_init() {
	cdelay_init();
}
#if 0
/* Calibrated variant: convert the cycle count into asmloop rounds. */
void cdelay(long delay_cyc) {
	if (delay_cyc >= 0) {
		asmloop(delay_cyc / cycpw);
	}
}
#else /* For machines with large core-to-core performance variation (e.g. OFP) */
/*
 * Spin until at least delay_cyc TSC cycles have passed, doing a small amount
 * of work between two timestamp reads. Negative delays return immediately.
 */
void cdelay(long delay_cyc) {
	unsigned long t0;

	if (delay_cyc < 0) {
		return;
	}

	t0 = rdtsc_light();
	/* The elapsed count is unsigned, so the limit is compared as unsigned too */
	while (rdtsc_light() - t0 < (unsigned long)delay_cyc) {
		asmloop(2);
	}
}
#endif
/*
 * Print where the calling thread runs: the CPU recorded in field 39 of
 * /proc/<pid>/task/<tid>/stat, the CPU reported by sched_getcpu(), the
 * thread id, the PMI rank (-1 if PMI_RANK is unset) and whether syscall 732
 * is available ("mck") or not ("lin").
 *
 * name: label printed at the start of the line
 *
 * Returns 0 on success and -1 if the stat file cannot be read or parsed or
 * sched_getcpu() fails.
 *
 * NOTE(review): the fields are split at single spaces, so a thread name that
 * contains a space would shift the field positions.
 */
int print_cpu_last_executed_on(const char *name) {
	enum { STAT_BUF_SIZE = 65536, STAT_CPU_FIELD = 39 };
	char fn[256];
	char *result = NULL; /* NULL so that the common exit path may free it */
	char *next_delim;
	char *field = NULL;
	const char *pmi_rank;
	pid_t tid = syscall(SYS_gettid);
	int fd = -1;
	int offset = 0;
	int mpi_errno = 0;
	int rc;
	int cpu;
	int i;

	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid);
	//printf("fn=%s\n", fn);

	fd = open(fn, O_RDONLY);
	if (fd == -1) {
		printf("open() failed\n");
		goto fn_fail;
	}

	result = malloc(STAT_BUF_SIZE);
	if (result == NULL) {
		printf("malloc() failed\n");
		goto fn_fail;
	}

	/* Read the whole file, never past the buffer, leaving room for the NUL */
	while (offset < STAT_BUF_SIZE - 1) {
		ssize_t amount = read(fd, result + offset, STAT_BUF_SIZE - 1 - offset);

		if (amount == -1) {
			printf("read() failed\n");
			goto fn_fail;
		}
		if (amount == 0) {
			break;
		}
		offset += amount;
	}
	result[offset] = '\0'; /* strsep() needs a terminated string */
	//printf("result:%s\n", result);

	/* Walk to the field that holds the CPU number */
	next_delim = result;
	for (i = 0; i < STAT_CPU_FIELD; i++) {
		field = strsep(&next_delim, " ");
		if (field == NULL) {
			printf("unexpected format of %s\n", fn);
			goto fn_fail;
		}
	}

	cpu = sched_getcpu();
	if (cpu == -1) {
		printf("sched_getcpu() failed\n");
		goto fn_fail;
	}

	rc = syscall(732);
	pmi_rank = getenv("PMI_RANK");

	printf("%s: pmi_rank=%02d,os=%s,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n",
	       name, pmi_rank ? atoi(pmi_rank) : -1, rc == -1 ? "lin" : "mck",
	       atoi(field), cpu, (int)tid);
	fflush(stdout);

fn_exit:
	if (fd != -1) {
		close(fd);
	}
	free(result);
	return mpi_errno;

fn_fail:
	mpi_errno = -1;
	goto fn_exit;
}

73
test/uti/mpi/util.h Normal file
View File

@@ -0,0 +1,73 @@
#ifndef __UTIL_H_INCLUDED__
#define __UTIL_H_INCLUDED__
#include <stdint.h>
/* Messaging */
/* Verbosity levels; a message is printed when its level is <= test_loglevel. */
enum test_loglevel {
TEST_LOGLEVEL_ERR = 0,
TEST_LOGLEVEL_WARN,
TEST_LOGLEVEL_DEBUG
};
/* Current verbosity; defined in util.c */
extern enum test_loglevel test_loglevel;
/* Set the verbosity used by pr_err(), pr_warn() and pr_debug(). */
static inline void test_set_loglevel(enum test_loglevel level)
{
test_loglevel = level;
}
/*
 * Print a printf-style message to stdout if the current verbosity is at
 * least `level`. Note that error messages go to stdout as well.
 * The header uses fprintf()/printf() but does not include <stdio.h> itself,
 * so the including file has to.
 */
#define pr_level(level, fmt, args...) do { \
if (test_loglevel >= level) { \
fprintf(stdout, fmt, ##args); \
} \
} while (0)
/* Convenience wrappers, one per level */
#define pr_err(fmt, args...) pr_level(TEST_LOGLEVEL_ERR, fmt, ##args)
#define pr_warn(fmt, args...) pr_level(TEST_LOGLEVEL_WARN, fmt, ##args)
#define pr_debug(fmt, args...) pr_level(TEST_LOGLEVEL_DEBUG, fmt, ##args)
/*
 * Test-result reporting.
 *
 * _OKNG(verb, jump, cond, fmt, ...):
 *   cond true  -> prints "[ OK ] " followed by the message, only if verb is set
 *   cond false -> prints "[ NG ] " followed by the message; if jump is set it
 *                 also assigns ret = -1 and jumps to the label `out`
 *
 * When jump is set, the calling function must therefore declare a variable
 * `ret` and provide an `out:` label.
 */
#define _OKNG(verb, jump, cond, fmt, args...) do { \
if (cond) { \
if (verb) \
printf("[ OK ] " fmt, ##args); \
} else { \
printf("[ NG ] " fmt, ##args); \
if (jump) { \
ret = -1; \
goto out; \
} \
} \
} while (0)
/* OKNG: report success and failure, bail out on failure */
#define OKNG(args...) _OKNG(1, 1, ##args)
/* NG: report only failure, bail out on failure */
#define NG(args...) _OKNG(0, 1, ##args)
/* OKNGNOJUMP: report success and failure, never bail out */
#define OKNGNOJUMP(args...) _OKNG(1, 0, ##args)
/* Time */

/*
 * Read the x86-64 time-stamp counter with RDTSCP and return it as one 64-bit
 * value (RDX holds the upper half, RAX the lower half).
 *
 * The function is "static inline" because it is defined in a header: a plain
 * "inline" definition provides no external definition in C99 and later, so a
 * call that is not inlined (e.g. at -O0) would fail to link.
 */
static inline uint64_t rdtsc_light(void)
{
	uint64_t x;

	__asm__ __volatile__("rdtscp;" /* rdtscp don't jump over earlier instructions */
			     "shl $32, %%rdx;"
			     "or %%rdx, %%rax" :
			     "=a"(x) :
			     :
			     "%rcx", "%rdx", "memory");
	return x;
}
/*
 * Elapsed time between two struct timeval (DIFFUSEC, microseconds) or two
 * struct timespec (DIFFNSEC, nanoseconds) values, as unsigned long.
 * The arguments are parenthesized so that expressions such as *ptr work.
 */
#define DIFFUSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000UL + ((end).tv_usec - (start).tv_usec))
#define DIFFNSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000000UL + ((end).tv_nsec - (start).tv_nsec))
/* Clock passed to clock_gettime() by the timing code */
#define TIMER_KIND CLOCK_MONOTONIC_RAW /* CLOCK_THREAD_CPUTIME_ID */
/* Calculation emulation */
/*
 * NOTE(review): these declarations have empty parameter lists, so argument
 * types are not checked. util.c defines ndelay_init(int verbose); it also
 * defines cdlay_init() while cdelay_init() is declared here -- confirm which
 * spelling callers use.
 */
/* Calibrate ndelay() */
void ndelay_init();
/* Emulate a computation of about delay_nsec nanoseconds */
void ndelay(long delay_nsec);
/* Calibrate the cycle-based delay */
void cdelay_init();
/* Spin for at least delay_cyc time-stamp-counter cycles */
void cdelay(long delay_cyc);
/* CPU location */
/* util.c defines this as print_cpu_last_executed_on(const char *name) */
int print_cpu_last_executed_on();
#endif