Revert "Detect hang of McKernel in mcexec"
Change-Id: Ie8a0cf725f84a2f5d85da8b8fb15b30a826ddfcb
This commit is contained in:
@@ -15,7 +15,7 @@ set_property(TARGET libmcexec PROPERTY POSITION_INDEPENDENT_CODE ON)
|
|||||||
|
|
||||||
add_executable(mcexec mcexec.c)
|
add_executable(mcexec mcexec.c)
|
||||||
target_link_libraries(mcexec
|
target_link_libraries(mcexec
|
||||||
libmcexec ihklib ${LIBRT} ${LIBNUMA} $<$<BOOL:ENABLE_QLMPI>:${LIBMPI}> pthread)
|
libmcexec ${LIBRT} ${LIBNUMA} $<$<BOOL:ENABLE_QLMPI>:${LIBMPI}> pthread)
|
||||||
target_include_directories(mcexec PUBLIC "${KERNEL_DIR}")
|
target_include_directories(mcexec PUBLIC "${KERNEL_DIR}")
|
||||||
set_property(TARGET mcexec PROPERTY POSITION_INDEPENDENT_CODE ON)
|
set_property(TARGET mcexec PROPERTY POSITION_INDEPENDENT_CODE ON)
|
||||||
set_property(TARGET mcexec PROPERTY LINK_FLAGS "-fPIE -pie")
|
set_property(TARGET mcexec PROPERTY LINK_FLAGS "-fPIE -pie")
|
||||||
|
|||||||
@@ -83,8 +83,6 @@
|
|||||||
#include <sys/un.h>
|
#include <sys/un.h>
|
||||||
#include "../include/pmi.h"
|
#include "../include/pmi.h"
|
||||||
#include "../include/qlmpi.h"
|
#include "../include/qlmpi.h"
|
||||||
#include <ihk/ihklib.h>
|
|
||||||
#include <sys/epoll.h>
|
|
||||||
#include <sys/xattr.h>
|
#include <sys/xattr.h>
|
||||||
#include "../../lib/include/list.h"
|
#include "../../lib/include/list.h"
|
||||||
|
|
||||||
@@ -1061,89 +1059,6 @@ pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
|
|||||||
pthread_barrier_t init_ready;
|
pthread_barrier_t init_ready;
|
||||||
pthread_barrier_t uti_init_ready;
|
pthread_barrier_t uti_init_ready;
|
||||||
|
|
||||||
pthread_attr_t watchdog_thread_attr;
|
|
||||||
pthread_t watchdog_thread;
|
|
||||||
|
|
||||||
/* Detects hang of McKernel */
|
|
||||||
static void *watchdog_thread_func(void *arg) {
|
|
||||||
int ret = 0;
|
|
||||||
int evfd = -1;
|
|
||||||
int epfd = -1;
|
|
||||||
struct epoll_event event_in;
|
|
||||||
struct epoll_event event_out;
|
|
||||||
|
|
||||||
if ((evfd = ihk_os_get_eventfd(0, IHK_OS_EVENTFD_TYPE_STATUS)) < 0) {
|
|
||||||
fprintf(stderr, "%s: Error: geteventfd failed (%d)\n", __FUNCTION__, evfd);
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((epfd = epoll_create(1)) == -1) {
|
|
||||||
fprintf(stderr, "%s: Error: epoll_create failed (%d)\n", __FUNCTION__, epfd);
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
memset(&event_in, 0, sizeof(struct epoll_event));
|
|
||||||
event_in.events = EPOLLIN;
|
|
||||||
event_in.data.fd = evfd;
|
|
||||||
if ((ret = epoll_ctl(epfd, EPOLL_CTL_ADD, evfd, &event_in)) != 0) {
|
|
||||||
fprintf(stderr, "%s: Error: epoll_ctl failed (%d)\n", __FUNCTION__, ret);
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
do {
|
|
||||||
int nfd;
|
|
||||||
uint64_t counter;
|
|
||||||
ssize_t nread;
|
|
||||||
|
|
||||||
nfd = epoll_wait(epfd, &event_out, 1, -1);
|
|
||||||
if (nfd == -1) {
|
|
||||||
if (errno == EINTR) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
fprintf(stderr, "%s: Error: epoll_wait failed (%s)\n", __FUNCTION__, strerror(errno));
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (nfd == 0) {
|
|
||||||
fprintf(stderr, "%s: Error: epoll_wait timed out unexpectedly\n", __FUNCTION__);
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (nfd > 1) {
|
|
||||||
fprintf(stderr, "%s: Error: Too many (%d) events\n", __FUNCTION__, nfd);
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (event_out.data.fd != evfd) {
|
|
||||||
fprintf(stderr, "%s: Error: Unknown event (fd:%d)\n", __FUNCTION__, event_out.data.fd);
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
nread = read(evfd, &counter, sizeof(counter));
|
|
||||||
if (nread == 0) {
|
|
||||||
fprintf(stderr, "%s: Error: read got EOF\n", __FUNCTION__);
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (nread == -1) {
|
|
||||||
fprintf(stderr, "%s: Error: read failed (%s)\n", __FUNCTION__, strerror(errno));
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf(stderr, "mcexec detected hang of McKernel\n");
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
} while (1);
|
|
||||||
|
|
||||||
out:
|
|
||||||
if (evfd != -1) {
|
|
||||||
close(evfd);
|
|
||||||
}
|
|
||||||
if (epfd != -1) {
|
|
||||||
close(epfd);
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void *main_loop_thread_func(void *arg)
|
static void *main_loop_thread_func(void *arg)
|
||||||
{
|
{
|
||||||
struct thread_data_s *td = (struct thread_data_s *)arg;
|
struct thread_data_s *td = (struct thread_data_s *)arg;
|
||||||
@@ -2703,26 +2618,6 @@ int main(int argc, char **argv)
|
|||||||
|
|
||||||
init_sigaction();
|
init_sigaction();
|
||||||
|
|
||||||
/* Initialize watchdog thread which detects hang of McKernel */
|
|
||||||
|
|
||||||
if ((error = pthread_attr_init(&watchdog_thread_attr))) {
|
|
||||||
fprintf(stderr, "Error: pthread_attr_init failed (%d)\n", error);
|
|
||||||
close(fd);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((error = pthread_attr_setdetachstate(&watchdog_thread_attr, PTHREAD_CREATE_DETACHED))) {
|
|
||||||
fprintf(stderr, "Error: pthread_attr_getdetachstate failed (%d)\n", error);
|
|
||||||
close(fd);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((error = pthread_create(&watchdog_thread, &watchdog_thread_attr, watchdog_thread_func, NULL))) {
|
|
||||||
fprintf(stderr, "Error: pthread_create failed (%d)\n", error);
|
|
||||||
close(fd);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((error = init_worker_threads(fd)) != 0) {
|
if ((error = init_worker_threads(fd)) != 0) {
|
||||||
fprintf(stderr, "%s: Error: creating worker threads: %s\n",
|
fprintf(stderr, "%s: Error: creating worker threads: %s\n",
|
||||||
__func__, strerror(-error));
|
__func__, strerror(-error));
|
||||||
|
|||||||
Reference in New Issue
Block a user