rusage and ihklib: Fix out-of-memory reporting and cleanup

1. Fix OOM: Count memory usage only when allocation succeeded
2. Fix OOM: Make user allocation fail when memory is running out
3. Fix OOM: Move rusage_init() before numa_init()
4. Cleanup: Rename ihkconfig/ihkosctl functions
5. Cleanup: Pass event type to eventfd()
6. Cleanup: arch/.../rusage.h --> arch/.../arch_rusage.h
This commit is contained in:
Masamichi Takagi
2017-08-23 19:39:46 +09:00
parent a1af7edd6e
commit daa7526127
18 changed files with 688 additions and 151 deletions

View File

@@ -3,6 +3,8 @@
#ifndef __RUSAGE_H
#define __RUSAGE_H
//#define RUSAGE_DEBUG
#define IHK_MAX_NUM_PGSIZES 4
#define IHK_MAX_NUM_NUMA_NODES 1024
#define IHK_MAX_NUM_CPUS 1024
@@ -13,21 +15,28 @@ struct rusage_percpu {
};
struct rusage_global {
/* Memory usage accounting */
long memory_stat_rss[IHK_MAX_NUM_PGSIZES];
long memory_stat_mapped_file[IHK_MAX_NUM_PGSIZES];
long rss_current; /* anon && user, used only for memory_max_usage */
unsigned long memory_max_usage;
unsigned long max_num_threads;
unsigned long num_threads;
long rss_current;
unsigned long memory_kmem_usage;
unsigned long memory_kmem_max_usage;
unsigned long memory_numa_stat[IHK_MAX_NUM_NUMA_NODES];
/* CPU usage accounting */
struct rusage_percpu cpu[IHK_MAX_NUM_CPUS]; /* clv[i].monitor = &cpu[i] */
/* OOM monitoring */
unsigned long total_memory;
unsigned long total_memory_usage;
unsigned long total_memory_max_usage;
#ifdef RUSAGE_DEBUG
unsigned long total_memory_max_usage_old; /* debug */
#endif
/* Used for translating results into struct mckernel_rusage */
unsigned long num_numa_nodes;
unsigned long num_processors;
unsigned long ns_per_tsc;

View File

@@ -8,18 +8,25 @@
#include <ihk/atomic.h>
#include <memobj.h>
#include <rusage.h>
#include <arch/rusage.h>
#include <ihk/ihk_monitor.h>
#include <arch_rusage.h>
#ifdef ENABLE_RUSAGE
#define RUSAGE_MEM_LIMIT (2 * 1024 * 1024) // 2MB
#define RUSAGE_OOM_MARGIN (2 * 1024 * 1024) // 2MB
extern void eventfd();
extern void eventfd(int type);
/*
 * rusage_total_memory_add(): add @size bytes to the running total of
 * physical memory under management (rusage->total_memory).  Called
 * while memory regions are being registered.  The update is a plain,
 * non-atomic read-modify-write (unlike the __sync_* accounting
 * helpers), so callers are presumably single-threaded at this point
 * in boot — TODO confirm.
 */
static inline void
rusage_total_memory_add(unsigned long size)
{
#ifdef RUSAGE_DEBUG
kprintf("%s: total_memory=%ld,size=%ld\n", __FUNCTION__, rusage->total_memory, size);
#endif
rusage->total_memory += size;
#ifdef RUSAGE_DEBUG
kprintf("%s: total_memory=%ld\n", __FUNCTION__, rusage->total_memory);
#endif
}
static inline void
@@ -220,6 +227,22 @@ rusage_numa_sub(int numa_id, unsigned long size)
__sync_sub_and_fetch(rusage->memory_numa_stat + numa_id, size);
}
/*
 * rusage_check_oom(): report — and, for user-space requests, refuse —
 * an allocation that would leave less than RUSAGE_OOM_MARGIN of the
 * managed memory free.
 *
 * @numa_id: target NUMA node (currently unused; kept for interface
 *           symmetry with rusage_page_add()/rusage_page_sub())
 * @pages:   number of PAGE_SIZE pages about to be allocated
 * @is_user: non-zero for a user allocation; only user allocations are
 *           denied, kernel allocations merely raise the OOM event
 *
 * Returns -ENOMEM when a user allocation must fail, 0 otherwise.
 */
static inline int
rusage_check_oom(int numa_id, unsigned long pages, int is_user)
{
	unsigned long size = pages * PAGE_SIZE;
	unsigned long limit;

	/*
	 * rusage->total_memory is accumulated by rusage_total_memory_add()
	 * during NUMA initialization; until then it can be smaller than
	 * RUSAGE_OOM_MARGIN.  The previous expression
	 * "total_memory - RUSAGE_OOM_MARGIN" wrapped around in that window,
	 * which silently disabled the check.  Make the early-boot bypass
	 * explicit instead of relying on unsigned wraparound.
	 */
	if (rusage->total_memory < RUSAGE_OOM_MARGIN) {
		return 0;
	}
	limit = rusage->total_memory - RUSAGE_OOM_MARGIN;

	if (rusage->total_memory_usage + size > limit) {
		kprintf("%s: memory used:%ld available:%ld\n", __FUNCTION__, rusage->total_memory_usage, rusage->total_memory);
		/* Notify the Linux side so it can take OOM action */
		eventfd(IHK_OS_EVENTFD_TYPE_OOM);
		if (is_user) {
			return -ENOMEM;
		}
	}
	return 0;
}
static inline void
rusage_page_add(int numa_id, unsigned long pages, int is_user)
{
@@ -228,6 +251,12 @@ rusage_page_add(int numa_id, unsigned long pages, int is_user)
unsigned long oldval;
unsigned long retval;
#ifdef RUSAGE_DEBUG
if (numa_id < 0 || numa_id >= rusage->num_numa_nodes) {
kprintf("%s: Error: invalid numa_id=%d\n", __FUNCTION__, numa_id);
return;
}
#endif
if (is_user)
rusage_numa_add(numa_id, size);
else
@@ -239,10 +268,12 @@ rusage_page_add(int numa_id, unsigned long pages, int is_user)
retval = __sync_val_compare_and_swap(&rusage->total_memory_max_usage,
oldval, newval);
if (retval == oldval) {
if (rusage->total_memory - newval <
RUSAGE_MEM_LIMIT) {
eventfd();
#ifdef RUSAGE_DEBUG
if (rusage->total_memory_max_usage > rusage->total_memory_max_usage_old + (1 * (1ULL << 30))) {
kprintf("%s: max(%ld) > old + 1GB,numa_id=%d\n", __FUNCTION__, rusage->total_memory_max_usage, numa_id);
rusage->total_memory_max_usage_old = rusage->total_memory_max_usage;
}
#endif
break;
}
oldval = retval;
@@ -253,7 +284,15 @@ static inline void
rusage_page_sub(int numa_id, unsigned long pages, int is_user)
{
unsigned long size = pages * PAGE_SIZE;
#ifdef RUSAGE_DEBUG
if (numa_id < 0 || numa_id >= rusage->num_numa_nodes) {
kprintf("%s: Error: invalid numa_id=%d\n", __FUNCTION__, numa_id);
return;
}
if (rusage->total_memory_usage < size) {
kprintf("%s: Error, total_memory_usage=%ld,size=%ld\n", __FUNCTION__, rusage->total_memory_max_usage, size);
}
#endif
__sync_sub_and_fetch(&rusage->total_memory_usage, size);
if (is_user)
@@ -343,9 +382,15 @@ rusage_numa_sub(int numa_id, unsigned long size)
{
}
/*
 * ENABLE_RUSAGE is off: OOM checking is compiled out, so every
 * allocation request is allowed to proceed.
 */
static inline int
rusage_check_oom(int numa_id, unsigned long pages, int is_user)
{
	(void)numa_id;
	(void)pages;
	(void)is_user;

	return 0;
}
/*
 * ENABLE_RUSAGE is off: per-page accounting is a no-op.
 */
static inline void
rusage_page_add(int numa_id, unsigned long size, int is_user)
{
	(void)numa_id;
	(void)size;
	(void)is_user;
}
static inline void

View File

@@ -268,6 +268,11 @@ struct ikc_scd_packet {
enum mcctrl_os_cpu_operation op;
void *resp;
};
/* SCD_MSG_EVENTFD */
struct {
int eventfd_type;
};
};
char padding[12];
};

View File

@@ -31,7 +31,6 @@
#include <cls.h>
#include <syscall.h>
#include <sysfs.h>
#include <rusage_private.h>
#include <ihk/monitor.h>
//#define IOCTL_FUNC_EXTENSION
@@ -287,21 +286,6 @@ static void monitor_init()
#endif /* POSTK_DEBUG_TEMP_FIX_73 */
}
/*
 * rusage_init(): allocate the global rusage accounting area, fill in
 * the static translation fields (CPU count, NUMA node count,
 * ns-per-TSC) and register its physical address with IHK via
 * ihk_set_rusage() so the host side can read the statistics.
 */
static void rusage_init()
{
int npages;
unsigned long phys;
/* Round sizeof(struct rusage_global) up to whole pages */
npages = (sizeof(struct rusage_global) + PAGE_SIZE -1) >> PAGE_SHIFT;
/* IHK_MC_AP_CRITICAL: result is used unchecked — presumably the
 * allocator guarantees success or halts; TODO confirm */
rusage = ihk_mc_alloc_pages(npages, IHK_MC_AP_CRITICAL);
memset(rusage, 0, npages * PAGE_SIZE);
rusage->num_processors = num_processors;
rusage->num_numa_nodes = ihk_mc_get_nr_numa_nodes();
rusage->ns_per_tsc = ihk_mc_get_ns_per_tsc();
phys = virt_to_phys(rusage);
ihk_set_rusage(phys, sizeof(struct rusage_global));
}
int nmi_mode;
static void nmi_init()
@@ -326,7 +310,6 @@ static void rest_init(void)
#ifndef POSTK_DEBUG_TEMP_FIX_73 /* NULL access for *monitor fix */
monitor_init();
#endif /* !POSTK_DEBUG_TEMP_FIX_73 */
rusage_init();
cpu_local_var_init();
nmi_init();
time_init();

View File

@@ -80,6 +80,8 @@ static void *___ihk_mc_alloc_aligned_pages_node(int npages,
static void *___ihk_mc_alloc_pages(int npages, ihk_mc_ap_flag flag, int is_user);
static void ___ihk_mc_free_pages(void *p, int npages, int is_user);
extern unsigned long ihk_mc_get_ns_per_tsc(void);
/*
* Page allocator tracking routines
*/
@@ -571,14 +573,22 @@ static void *mckernel_allocate_aligned_pages_node(int npages, int p2align,
if (pref_node > -1 && pref_node < ihk_mc_get_nr_numa_nodes()) {
#ifdef IHK_RBTREE_ALLOCATOR
{
pa = ihk_numa_alloc_pages(&memory_nodes[pref_node], npages, p2align);
if (rusage_check_oom(pref_node, npages, is_user) == -ENOMEM) {
pa = 0;
} else {
pa = ihk_numa_alloc_pages(&memory_nodes[pref_node], npages, p2align);
}
#else
list_for_each_entry(pa_allocator,
&memory_nodes[pref_node].allocators, list) {
pa = ihk_pagealloc_alloc(pa_allocator, npages, p2align);
if (rusage_check_oom(pref_node, npages, is_user) == -ENOMEM) {
pa = 0;
} else {
pa = ihk_pagealloc_alloc(pa_allocator, npages, p2align);
}
#endif
if (pa) {
rusage_page_add(pref_node, npages, is_user);
dkprintf("%s: explicit (node: %d) CPU @ node %d allocated "
"%d pages from node %d\n",
__FUNCTION__,
@@ -586,8 +596,6 @@ static void *mckernel_allocate_aligned_pages_node(int npages, int p2align,
ihk_mc_get_numa_id(),
npages, node);
rusage_page_add(pref_node, npages, is_user);
return phys_to_virt(pa);
}
else {
@@ -617,23 +625,30 @@ static void *mckernel_allocate_aligned_pages_node(int npages, int p2align,
numa_id = memory_nodes[node].nodes_by_distance[i].id;
#ifdef IHK_RBTREE_ALLOCATOR
{
pa = ihk_numa_alloc_pages(&memory_nodes[memory_nodes[node].
nodes_by_distance[i].id], npages, p2align);
if (rusage_check_oom(numa_id, npages, is_user) == -ENOMEM) {
pa = 0;
} else {
pa = ihk_numa_alloc_pages(&memory_nodes[memory_nodes[node].
nodes_by_distance[i].id], npages, p2align);
}
#else
list_for_each_entry(pa_allocator,
&memory_nodes[numa_id].allocators, list) {
pa = ihk_pagealloc_alloc(pa_allocator, npages, p2align);
if (rusage_check_oom(numa_id, npages, is_user) == -ENOMEM) {
pa = 0;
} else {
pa = ihk_pagealloc_alloc(pa_allocator, npages, p2align);
}
#endif
if (pa) {
rusage_page_add(numa_id, npages, is_user);
dkprintf("%s: policy: CPU @ node %d allocated "
"%d pages from node %d\n",
__FUNCTION__,
ihk_mc_get_numa_id(),
npages, node);
rusage_page_add(numa_id, npages,
is_user);
break;
}
@@ -674,22 +689,31 @@ distance_based:
#ifdef IHK_RBTREE_ALLOCATOR
{
pa = ihk_numa_alloc_pages(&memory_nodes[memory_nodes[node].
nodes_by_distance[i].id], npages, p2align);
if (rusage_check_oom(numa_id, npages, is_user) == -ENOMEM) {
pa = 0;
} else {
pa = ihk_numa_alloc_pages(&memory_nodes[memory_nodes[node].
nodes_by_distance[i].id], npages, p2align);
}
#else
list_for_each_entry(pa_allocator,
&memory_nodes[numa_id].allocators, list) {
pa = ihk_pagealloc_alloc(pa_allocator, npages, p2align);
if (rusage_check_oom(numa_id, npages, is_user) == -ENOMEM) {
pa = 0;
} else {
pa = ihk_pagealloc_alloc(pa_allocator, npages, p2align);
}
#endif
if (pa) {
rusage_page_add(numa_id, npages, is_user);
dkprintf("%s: distance: CPU @ node %d allocated "
"%d pages from node %d\n",
__FUNCTION__,
ihk_mc_get_numa_id(),
npages,
memory_nodes[node].nodes_by_distance[i].id);
rusage_page_add(numa_id, npages, is_user);
break;
}
}
@@ -708,13 +732,22 @@ order_based:
numa_id = (node + i) % ihk_mc_get_nr_numa_nodes();
#ifdef IHK_RBTREE_ALLOCATOR
{
pa = ihk_numa_alloc_pages(&memory_nodes[(node + i) %
ihk_mc_get_nr_numa_nodes()], npages, p2align);
if (rusage_check_oom(numa_id, npages, is_user) == -ENOMEM) {
pa = 0;
} else {
pa = ihk_numa_alloc_pages(&memory_nodes[(node + i) %
ihk_mc_get_nr_numa_nodes()], npages, p2align);
}
#else
list_for_each_entry(pa_allocator,
&memory_nodes[numa_id].allocators, list) {
pa = ihk_pagealloc_alloc(pa_allocator, npages, p2align);
if (rusage_check_oom(numa_id, npages, is_user) == -ENOMEM) {
pa = 0;
} else {
pa = ihk_pagealloc_alloc(pa_allocator, npages, p2align);
}
#endif
if (pa) {
rusage_page_add(numa_id, npages, is_user);
break;
@@ -730,6 +763,7 @@ order_based:
if(flag != IHK_MC_AP_NOWAIT)
panic("Not enough space\n");
*/
dkprintf("OOM\n", __FUNCTION__);
return NULL;
}
@@ -1256,13 +1290,13 @@ static void numa_init(void)
#endif
#ifdef IHK_RBTREE_ALLOCATOR
dkprintf("Physical memory: 0x%lx - 0x%lx, %lu bytes, %d pages available @ NUMA: %d\n",
kprintf("Physical memory: 0x%lx - 0x%lx, %lu bytes, %d pages available @ NUMA: %d\n",
start, end,
end - start,
(end - start) >> PAGE_SHIFT,
numa_id);
#else
dkprintf("Physical memory: 0x%lx - 0x%lx, %lu bytes, %d pages available @ NUMA: %d\n",
kprintf("Physical memory: 0x%lx - 0x%lx, %lu bytes, %d pages available @ NUMA: %d\n",
start, end,
ihk_pagealloc_count(allocator) * PAGE_SIZE,
ihk_pagealloc_count(allocator),
@@ -1659,6 +1693,22 @@ void ihk_mc_clean_micpa(void){
}
#endif
/*
 * rusage_init(): allocate and zero the global rusage accounting area,
 * fill in the static translation fields (CPU count, NUMA node count,
 * ns-per-TSC) and register its physical address with IHK via
 * ihk_set_rusage().  Must run before numa_init(): the rusage pointer
 * has to be valid when numa_init() starts accumulating
 * rusage->total_memory.
 */
static void rusage_init()
{
int npages;
unsigned long phys;
/* Round sizeof(struct rusage_global) up to whole pages */
npages = (sizeof(struct rusage_global) + PAGE_SIZE -1) >> PAGE_SHIFT;
/* IHK_MC_AP_CRITICAL: result is used unchecked — presumably the
 * allocator guarantees success or halts; TODO confirm */
rusage = ihk_mc_alloc_pages(npages, IHK_MC_AP_CRITICAL);
memset(rusage, 0, npages * PAGE_SIZE);
rusage->num_processors = num_processors;
rusage->num_numa_nodes = ihk_mc_get_nr_numa_nodes();
rusage->ns_per_tsc = ihk_mc_get_ns_per_tsc();
phys = virt_to_phys(rusage);
/* Hand the physical address to IHK so the host can read the stats */
ihk_set_rusage(phys, sizeof(struct rusage_global));
/* total_memory is still 0 here; numa_init() fills it in */
dkprintf("%s: rusage->total_memory=%ld\n", __FUNCTION__, rusage->total_memory);
}
#ifdef POSTK_DEBUG_TEMP_FIX_73 /* NULL access for *monitor fix */
extern void monitor_init(void);
#endif /* POSTK_DEBUG_TEMP_FIX_73 */
@@ -1667,6 +1717,10 @@ void mem_init(void)
#ifdef POSTK_DEBUG_TEMP_FIX_73 /* NULL access for *monitor fix */
monitor_init();
#endif /* !POSTK_DEBUG_TEMP_FIX_73 */
/* It must precede numa_init() because rusage->total_memory is initialized in numa_init() */
rusage_init();
/* Initialize NUMA information and memory allocator bitmaps */
numa_init();

View File

@@ -2638,6 +2638,19 @@ void release_thread(struct thread *thread)
release_process_vm(vm);
rusage_num_threads_dec();
#ifdef RUSAGE_DEBUG
if (rusage->num_threads == 0) {
int i;
kprintf("total_memory_usage=%ld\n", rusage->total_memory_usage);
for(i = 0; i < IHK_MAX_NUM_PGSIZES; i++) {
kprintf("memory_stat_rss[%d]=%ld\n", i, rusage->memory_stat_rss[i]);
}
for(i = 0; i < IHK_MAX_NUM_PGSIZES; i++) {
kprintf("memory_stat_mapped_file[%d]=%ld\n", i, rusage->memory_stat_mapped_file[i]);
}
}
#endif
}
void cpu_set(int cpu, cpu_set_t *cpu_set, ihk_spinlock_t *lock)
@@ -3397,6 +3410,18 @@ void runq_add_thread(struct thread *thread, int cpu_id)
procfs_create_thread(thread);
rusage_num_threads_inc();
#ifdef RUSAGE_DEBUG
if (rusage->num_threads == 1) {
int i;
kprintf("total_memory_usage=%ld\n", rusage->total_memory_usage);
for(i = 0; i < IHK_MAX_NUM_PGSIZES; i++) {
kprintf("memory_stat_rss[%d]=%ld\n", i, rusage->memory_stat_rss[i]);
}
for(i = 0; i < IHK_MAX_NUM_PGSIZES; i++) {
kprintf("memory_stat_mapped_file[%d]=%ld\n", i, rusage->memory_stat_mapped_file[i]);
}
}
#endif
/* Kick scheduler */
#ifdef POSTK_DEBUG_ARCH_DEP_8 /* arch depend hide */

View File

@@ -1063,14 +1063,6 @@ void terminate(int rc, int sig)
dkprintf("terminate,pid=%d\n", proc->pid);
/* rusage debug */
for(i = 0; i < IHK_MAX_NUM_PGSIZES; i++) {
dkprintf("memory_stat_rss[%d]=%ld\n", i, rusage->memory_stat_rss[i]);
}
for(i = 0; i < IHK_MAX_NUM_PGSIZES; i++) {
dkprintf("memory_stat_mapped_file[%d]=%ld\n", i, rusage->memory_stat_mapped_file[i]);
}
#ifdef DCFA_KMOD
do_mod_exit(rc);
#endif
@@ -1147,8 +1139,7 @@ terminate_host(int pid)
do_kill(cpu_local_var(current), pid, -1, SIGKILL, NULL, 0);
}
void
eventfd()
void eventfd(int type)
{
struct ihk_ikc_channel_desc *syscall_channel;
struct ikc_scd_packet pckt;
@@ -1156,6 +1147,7 @@ eventfd()
syscall_channel = get_cpu_local_var(0)->ikc2linux;
memset(&pckt, '\0', sizeof pckt);
pckt.msg = SCD_MSG_EVENTFD;
pckt.eventfd_type = type;
ihk_ikc_send(syscall_channel, &pckt, 0);
}