This replaces the chained list used to keep track of all memory ranges of a process with a standard rbtree (no need for an interval tree here because ranges do not overlap). Accesses that previously went directly through vm_range_list now go through lookup_process_memory_range, even full list scans (e.g. coredump). The full scans will thus be less efficient because calls to rb_next() will not be inlined, but these calls are rare enough that they can probably afford this in exchange for code simplicity. The only reference to the actual backing structure left outside of process.c is a call to rb_erase in xpmem_free_process_memory_range.

v2: fix lookup_process_memory_range with small start addresses
v3: make vm_range_insert error out properly; a panic does not lead to easy debugging, so all error paths are handled to just return something on error
v4: fix lookup_process_memory_range (again); optimistically going left was a more serious bug than just the last iteration: we could pass by a match and continue down the tree if the match was not a leaf
v5: some users actually needed the leftmost match, so restore that behavior without the breakage (hopefully)
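As a reference for the v4/v5 discussion, a leftmost-match range lookup over an rbtree can be sketched as follows. This is a minimal sketch only: the rb_node/rb_entry helpers are the Linux-style ones, and the vm_range/vm_rb names are illustrative assumptions, not the actual process.c definitions.

/*
 * Sketch of a leftmost-match lookup of the range overlapping [start, end)
 * in an rbtree ordered by range start. Illustrative names (vm_range,
 * vm_rb); not the actual process.c implementation.
 */
static struct vm_range *lookup_leftmost_range(struct rb_root *root,
		unsigned long start, unsigned long end)
{
	struct rb_node *node = root->rb_node;
	struct vm_range *match = NULL;

	while (node) {
		struct vm_range *range = rb_entry(node, struct vm_range, vm_rb);

		if (range->end <= start) {
			/* entirely below the query: look right */
			node = node->rb_right;
		} else if (range->start >= end) {
			/* entirely above the query: look left */
			node = node->rb_left;
		} else {
			/*
			 * Overlap: remember it, but keep descending left;
			 * an earlier overlapping range may still exist
			 * below a non-leaf match (the v4 breakage).
			 */
			match = range;
			node = node->rb_left;
		}
	}
	return match;
}

The key point is the last branch: on an overlap the walk records the match and keeps descending left, so a match that is not a leaf cannot hide an earlier overlapping range.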
/*
 * \file pager.c
 * License details are found in the file LICENSE.
 * \brief
 * paging system
 * \author Yutaka Ishikawa <ishikawa@riken.jp>
 */
/*
 * HISTORY:
 */
#include <types.h>
#include <kmsg.h>
#include <ihk/cpu.h>
#include <cpulocal.h>
#include <ihk/mm.h>
#include <ihk/debug.h>
#include <ihk/ikc.h>
#include <errno.h>
#include <cls.h>
#include <syscall.h>
#include <kmalloc.h>
#include <process.h>
#include <swapfmt.h>

#define O_RDONLY	00000000
#define O_WRONLY	00000001
#define O_RDWR		00000002
#define O_CREAT		00000100
#define O_TRUNC		00001000
#define SEEK_SET	0	/* from include/uapi/linux/fs.h in Linux */
#define SEEK_CUR	1	/* from include/uapi/linux/fs.h in Linux */
#define IS_TEXT(start, region)	((start) == (region)->text_start)
#define IS_DATA(start, region)	((start) == (region)->data_start)
#define IS_STACK(start, region)	((start) == (region)->stack_start)
#define IS_INVALID_USERADDRESS(addr, region)		\
	((((unsigned long) addr) < region->user_start)	\
	 || ((unsigned long) addr) >= region->user_end)
#define IS_INVALID_LENGTH(len, region) \
	((len) > (region->user_end - region->user_start))
#define IS_READONLY(flag)	(((flag)&VR_PROT_WRITE) == 0)
#define IS_NOTUSER(flag)	(((flag)&VR_AP_USER) == 0)

//#define DEBUG_PRINT_PROCESS

#ifdef DEBUG_PRINT_PROCESS
#define dkprintf(...) kprintf(__VA_ARGS__)
#define ekprintf(...) kprintf(__VA_ARGS__)
#else
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#define ekprintf(...) kprintf(__VA_ARGS__)
#endif

/*
 * Contiguous pages are represented by the "addrpair" structure.
 * - The swap_area, whose type is "struct arealist", keeps swappable pages
 *   using "areaent" structures, each of which holds an array of "addrpair"
 *   structures.
 * - The mlock_area is also a "struct arealist", keeping pages locked by
 *   both McKernel and Linux.
 * - The mlock_container, of type "struct mlockcntnr", is a cursor over
 *   the mlock_area list.
 */
/*
 * The page areas are independently managed by McKernel and Linux.
 * Pages locked by the Linux kernel are not known to McKernel. To get that
 * information, the mlockcntnr structure is used: it keeps a cursor into
 * the list of address pairs reported as locked by Linux.
 */
#define MLOCKADDRS_SIZE	128
struct addrpair {
	unsigned long	start;
	unsigned long	end;
	unsigned long	flag;
};
struct areaent {
	struct areaent	*next;
	int		count;
	struct addrpair	pair[MLOCKADDRS_SIZE];
};

struct arealist {
	struct areaent	*head;
	struct areaent	*tail;
	int		count;
};

struct mlockcntnr {
	struct areaent	*from;
	int		ccount;
	struct areaent	*cur;
};

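/*
 * On-disk layout of the swap file, as written by do_pageout and read
 * back by do_pagein:
 *	struct swap_header
 *	struct swap_areainfo[count_sarea]	(swapped-out areas)
 *	struct swap_areainfo[count_marea]	(mlocked areas)
 *	page contents of each swapped-out area, in order
 */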
struct swapinfo {
	struct swap_header	*swphdr;
	struct swap_areainfo	*swap_info, *mlock_info;

	struct arealist		swap_area;
	struct arealist		mlock_area;
	struct mlockcntnr	mlock_container;
#define UDATA_BUFSIZE	(8*1024)
	char	*swapfname;
	char	*udata_buf;	/* to read/store data from Linux into user space */

	void	*user_buf;
	size_t	ubuf_size, ubuf_alloced;
};

static void
area_print(struct vm_regions *region)
{
	dkprintf("text %016lx:%016lx\n", region->text_start, region->text_end);
	dkprintf("data %016lx:%016lx\n", region->data_start, region->data_end);
	dkprintf("brk %016lx:%016lx\n", region->brk_start, region->brk_end);
	dkprintf("map %016lx:%016lx\n", region->map_start, region->map_end);
	dkprintf("stack %016lx:%016lx\n", region->stack_start, region->stack_end);
	dkprintf("user %016lx:%016lx\n", region->user_start, region->user_end);
}

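/*
 * Working memory for the pager is carved out of a user-supplied buffer
 * that myalloc_init pins with mlock. myalloc is a simple bump allocator
 * over that buffer; individual allocations are never returned, so myfree
 * is a no-op and everything is released at once by myalloc_finalize.
 */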
static int
myalloc_init(struct swapinfo *si, void *p, size_t sz)
{
	extern SYSCALL_DECLARE(mlock);
	ihk_mc_user_context_t ctx0;
	int cc;

	/* pin the buffer down on the McKernel side */
	ihk_mc_syscall_arg0(&ctx0) = (uintptr_t) p;
	ihk_mc_syscall_arg1(&ctx0) = sz;
	cc = sys_mlock(__NR_mlock, &ctx0);
	if (cc < 0) return cc;
	/* init */
	si->user_buf = p;
	si->ubuf_size = sz;
	si->ubuf_alloced = 0;
	dkprintf("myalloc_init: buffer(%p) size(0x%lx)\n", si->user_buf, si->ubuf_size);
	return 0;
}

void
myalloc_finalize(struct swapinfo *si)
{
	extern SYSCALL_DECLARE(munlock);
	ihk_mc_user_context_t ctx0;

	/* unpin the buffer on the McKernel side */
	ihk_mc_syscall_arg0(&ctx0) = (uintptr_t) si->user_buf;
	ihk_mc_syscall_arg1(&ctx0) = si->ubuf_size;
	sys_munlock(__NR_munlock, &ctx0);
}

void *
myalloc(struct swapinfo *si, size_t sz)
{
	void *p = NULL;

	if ((si->ubuf_alloced + sz) < si->ubuf_size) {
		p = (void*) &((char*)si->user_buf)[si->ubuf_alloced];
		si->ubuf_alloced += sz;
	}
	return p;
}

void
myfree(void *p)
{
	/* nothing so far; memory is reclaimed wholesale in myalloc_finalize */
}

static int
linux_open(char *fname, int flag, int mode)
{
	ihk_mc_user_context_t ctx0;
	int fd;

	ihk_mc_syscall_arg0(&ctx0) = (uintptr_t) fname;
	ihk_mc_syscall_arg1(&ctx0) = flag;
	ihk_mc_syscall_arg2(&ctx0) = mode;
	fd = syscall_generic_forwarding(__NR_open, &ctx0);
	return fd;
}

static int
linux_unlink(char *fname)
{
	ihk_mc_user_context_t ctx0;

	ihk_mc_syscall_arg0(&ctx0) = (uintptr_t) fname;
	return syscall_generic_forwarding(__NR_unlink, &ctx0);
}

/* forward a read to Linux, retrying on EINTR and continuing after short reads */
static ssize_t
linux_read(int fd, void *buf, size_t count)
{
	ihk_mc_user_context_t ctx0;
	ssize_t sz;
	size_t count0 = count;

	ihk_mc_syscall_arg0(&ctx0) = fd;
	sz = 0;
	for (;;) {
		ssize_t sz0;

		ihk_mc_syscall_arg1(&ctx0) = (uintptr_t) buf;
		ihk_mc_syscall_arg2(&ctx0) = count;
		sz0 = syscall_generic_forwarding(__NR_read, &ctx0);
		if (sz0 == -EINTR)
			continue;
		if (sz0 <= 0) {
			if (sz == 0)
				sz = sz0;
			break;
		}
		sz += sz0;
		if (sz == count0)
			break;
		count -= sz0;
		buf = (char *)buf + sz0;
	}
	return sz;
}

/* forward a write to Linux, retrying on EINTR and continuing after short writes */
static ssize_t
linux_write(int fd, void *buf, size_t count)
{
	ihk_mc_user_context_t ctx0;
	ssize_t sz;
	size_t count0 = count;

	ihk_mc_syscall_arg0(&ctx0) = fd;
	sz = 0;
	for (;;) {
		ssize_t sz0;

		ihk_mc_syscall_arg1(&ctx0) = (uintptr_t) buf;
		ihk_mc_syscall_arg2(&ctx0) = count;
		sz0 = syscall_generic_forwarding(__NR_write, &ctx0);
		if (sz0 == -EINTR)
			continue;
		if (sz0 <= 0) {
			if (sz == 0)
				sz = sz0;
			break;
		}
		sz += sz0;
		if (sz == count0)
			break;
		count -= sz0;
		buf = (char *)buf + sz0;
	}
	return sz;
}

static off_t
linux_lseek(int fd, off_t off, int whence)
{
	ihk_mc_user_context_t ctx0;
	off_t cc;

	ihk_mc_syscall_arg0(&ctx0) = fd;
	ihk_mc_syscall_arg1(&ctx0) = off;
	ihk_mc_syscall_arg2(&ctx0) = whence;
	cc = syscall_generic_forwarding(__NR_lseek, &ctx0);
	return cc;
}

static int
linux_close(int fd)
{
	ihk_mc_user_context_t ctx0;
	int cc;

	ihk_mc_syscall_arg0(&ctx0) = fd;
	cc = syscall_generic_forwarding(__NR_close, &ctx0);
	return cc;
}

/*
 * The munmap syscall from McKernel is handled by the mcctrl module.
 * An extra argument, flag, requests setting up a new remote page table
 * when it is non-zero.
 */
static int
linux_munmap(void *addr, size_t len, int flag)
{
	ihk_mc_user_context_t ctx0;
	int cc;

	ihk_mc_syscall_arg0(&ctx0) = (uintptr_t) addr;
	ihk_mc_syscall_arg1(&ctx0) = len;
	ihk_mc_syscall_arg2(&ctx0) = flag;
	cc = syscall_generic_forwarding(__NR_munmap, &ctx0);
	return cc;
}

static int
pager_open(struct swapinfo *si, char *fname, int flag, int mode)
{
	int fd;

	/* copy the file name into pinned memory before forwarding to Linux */
	strcpy(si->udata_buf, fname);
	fd = linux_open(si->udata_buf, flag, mode);
	return fd;
}

static int
pager_unlink(struct swapinfo *si, char *fname)
{
	strcpy(si->udata_buf, fname);
	return linux_unlink(si->udata_buf);
}

static ssize_t
pager_read(struct swapinfo *si, int fd, void *start, size_t size)
{
	ssize_t off, sz, rs;

	kprintf("pager_read: %p (%lx)\n", start, size);
	for (off = 0; off < size; off += sz) {
		sz = size - off;
		sz = (sz > UDATA_BUFSIZE) ? UDATA_BUFSIZE : sz;
		rs = linux_read(fd, si->udata_buf, sz);
		if (rs != sz) return rs;
		copy_to_user(start + off, si->udata_buf, sz);
	}
	return off;
}

static ssize_t
pager_write(int fd, void *start, size_t size)
{
	ssize_t sz;

	sz = linux_write(fd, start, size);
	return sz;
}

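/*
 * Ask the Linux side for the list of pages it has locked in the range
 * [start, end). The request is forwarded through the mmap syscall
 * channel with PAGER_REQ_MLOCK_LIST as the first argument; at most nent
 * addrpair entries are filled in, and the caller accounts the returned
 * count via arealist_update.
 */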
static int
mlocklist_req(unsigned long start, unsigned long end, struct addrpair *addr, int nent)
{
	ihk_mc_user_context_t ctx0;
	int cc;

#define PAGER_REQ_MLOCK_LIST	0x0008
	ihk_mc_syscall_arg0(&ctx0) = PAGER_REQ_MLOCK_LIST;
	ihk_mc_syscall_arg1(&ctx0) = start;
	ihk_mc_syscall_arg2(&ctx0) = end;
	ihk_mc_syscall_arg3(&ctx0) = (unsigned long) addr;
	ihk_mc_syscall_arg4(&ctx0) = nent;
	cc = syscall_generic_forwarding(__NR_mmap, &ctx0);
	return cc;
}

/*
 * If the start of the addrpair entry just past the last counted one is
 * -1, more pages locked by Linux exist; its end field then holds the
 * address at which to resume the query.
 */
static int
mlocklist_morereq(struct swapinfo *si, unsigned long *start)
{
	struct areaent *ent = si->mlock_area.tail;

	dkprintf("mlocklist_morereq: start = %ld end = %ld\n",
		 ent->pair[ent->count].start, ent->pair[ent->count].end);
	if (ent->pair[ent->count].start != (unsigned long) -1) {
		return 0;
	}
	*start = ent->pair[ent->count].end;
	return 1;
}

static int
arealist_alloc(struct swapinfo *si, struct arealist *areap)
{
	areap->head = areap->tail = myalloc(si, sizeof(struct areaent));
	if (areap->head == NULL) return -ENOMEM;
	memset(areap->head, 0, sizeof(struct areaent));
	return 0;
}

static int
arealist_init(struct swapinfo *si)
{
	int cc;

	if ((cc = arealist_alloc(si, &si->swap_area)) < 0) return cc;
	cc = arealist_alloc(si, &si->mlock_area);
	return cc;
}

static void
arealist_free(struct arealist *area)
{
	struct areaent *tmp, *next;

	/* fetch the next pointer before freeing the current entry */
	for (tmp = area->head; tmp != NULL; tmp = next) {
		next = tmp->next;
		myfree(tmp);
	}
	memset(area, 0, sizeof(struct arealist));
	return;
}

/*
 * Returns a pointer to the next free addrpair slot of the tail entry
 * (allocating a new areaent when fewer than two slots remain) and the
 * number of free slots.
 */
static int
arealist_get(struct swapinfo *si, struct addrpair **pair, struct arealist *area)
{
	struct areaent *tmp;
	struct areaent *tail = area->tail;

	if (tail->count < MLOCKADDRS_SIZE - 1) { /* at least two entries are needed */
		if (pair) *pair = &tail->pair[tail->count];
		return MLOCKADDRS_SIZE - tail->count;
	}
	tmp = myalloc(si, sizeof(struct areaent));
	if (tmp == NULL) {
		return -1;
	}
	memset(tmp, 0, sizeof(struct areaent));
	area->tail->next = tmp;
	area->tail = tmp;
	if (pair) *pair = area->tail->pair;
	return MLOCKADDRS_SIZE;
}

static void
arealist_update(int cnt, struct arealist *area)
{
	area->tail->count += cnt;
	area->count += cnt;
}

static int
arealist_add(struct swapinfo *si, unsigned long start, unsigned long end,
	     unsigned long flag, struct arealist *area)
{
	int cc;
	struct addrpair *addr;

	cc = arealist_get(si, &addr, area);
	if (cc < 0) return -1;
	addr->start = start; addr->end = end; addr->flag = flag;
	arealist_update(1, area);
	return 0;
}

static int
arealist_preparewrite(struct arealist *areap, struct swap_areainfo *info,
		      ssize_t off, struct process_vm *vm, int flag)
{
	struct areaent *ent;
	int count = 0;
	ssize_t totsz = 0;
	struct page_table *pt = vm->address_space->page_table;

	for (ent = areap->head; ent != NULL; ent = ent->next) {
		int i;
		for (i = 0; i < ent->count; i++, count++) {
			ssize_t sz = ent->pair[i].end - ent->pair[i].start;
			info[count].start = ent->pair[i].start;
			info[count].end = ent->pair[i].end;
			info[count].flag = ent->pair[i].flag;
			if (flag) { /* position in file */
				info[count].pos = off + totsz;
			} else { /* physical memory */
				if (ihk_mc_pt_virt_to_phys(pt,
						(void*) ent->pair[i].start,
						&info[count].pos)) {
					kprintf("Cannot get phys\n");
				}
			}
			totsz += sz;
		}
	}
	return count;
}

static ssize_t
arealist_write(int fd, struct swap_areainfo *info, int count)
{
	ssize_t sz;

	sz = linux_write(fd, info, sizeof(struct swap_areainfo)*count);
	if (sz != sizeof(struct swap_areainfo)*count) return -1;
	return 0;
}

static void
arealist_print(char *msg, struct arealist *areap, int count)
{
	struct areaent *ent;

	kprintf("%s: %d\n", msg, count);
	for (ent = areap->head; ent != NULL; ent = ent->next) {
		int i;
		for (i = 0; i < ent->count; i++) {
			kprintf("\t%p -- %p\n",
				(void*) ent->pair[i].start, (void*) ent->pair[i].end);
		}
	}
}

/*
 * Remember the current tail of mlock_area as the starting point of the
 * Linux-locked entries collected for the range being processed.
 */
static int
mlockcntnr_sethead(struct swapinfo *si)
{
	int cnt;

	cnt = arealist_get(si, 0, &si->mlock_area); /* adjust arealist */
	if (cnt < 0) return -1;
	si->mlock_container.from = si->mlock_container.cur = si->mlock_area.tail;
	si->mlock_container.ccount = si->mlock_area.tail->count;
	return 0;
}

static int
mlockcntnr_isempty(struct swapinfo *si)
{
	return si->mlock_container.from == si->mlock_area.tail
		&& si->mlock_container.ccount == si->mlock_area.tail->count;
}

static int
mlockcntnr_addrent(struct swapinfo *si, struct addrpair *laddr)
{
	if (si->mlock_container.ccount == si->mlock_container.cur->count) {
		struct areaent *tmp = si->mlock_container.cur->next;
		if (tmp == 0) return 0;
		si->mlock_container.cur = tmp;
		si->mlock_container.ccount = 1;
	}
	*laddr = si->mlock_container.cur->pair[si->mlock_container.ccount - 1];
	si->mlock_container.ccount++;
	return 1;
}

static void
print_area(char *label, unsigned long start, unsigned long sz,
	   struct vm_regions *region)
{
	char *type;

	if (start == region->text_start) {
		type = "text";
	} else if (start == region->data_start) {
		type = "data";
	} else if (start == region->brk_start) {
		type = "brk";
	} else if (start == region->stack_start) {
		type = "stack";
	} else if (start == region->user_start) {
		type = "user";
	} else if (start >= region->map_start
		   && start <= region->stack_start) {
		type = "map";
	} else {
		type = "other";
	}
	kprintf("%s: %s write(%p, %ld)\n", label, type, (void *) start, sz);
}

void
print_region(char *msg, struct process_vm *vm)
{
	struct vm_range *range, *next;

	kprintf("%s:\n", msg);
	next = lookup_process_memory_range(vm, 0, -1);
	while ((range = next)) {
		next = next_process_memory_range(vm, range);
		if (range->memobj != NULL) continue;
		kprintf("\t%016lx:%016lx (%lx)\n",
			range->start, range->end, range->flag);
	}
}

static void
debug_dump(char *msg, unsigned char *p)
{
	kprintf("%s-> %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x"
		":%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
		msg, p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
		p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
}

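/*
 * Page the process back in from the swap file: open and immediately
 * unlink the file, seek past the header and the two swap_areainfo
 * arrays (which stayed resident in memory), then read every swapped-out
 * area back into user space and release the swapinfo working memory.
 */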
int
do_pagein(int flag)
{
	struct thread *thread = cpu_local_var(current);
	struct process_vm *vm = thread->vm;
	int fd, i;
	ssize_t pos, sz, rs;
	struct swapinfo *si = vm->swapinfo;

	dkprintf("do_pagein: flag(%d) currss(%lx)\n", flag, vm->currss);
	fd = pager_open(si, si->swapfname, O_RDONLY, 0);
	pager_unlink(si, si->swapfname);
	if (fd < 0) {
		kprintf("do_pagein: Cannot open file: %s\n", si->swapfname);
		return fd;
	}
	/*
	 * In the current implementation, the following working areas remain
	 * in the physical memory area:
	 *	swphdr, swap_info and mlock_info
	 */
	pos = sizeof(struct swap_header);
	pos += sizeof(struct swap_areainfo)*si->swphdr->count_sarea;
	pos += sizeof(struct swap_areainfo)*si->swphdr->count_marea;
	rs = linux_lseek(fd, pos, SEEK_SET);
	for (i = 0; i < si->swphdr->count_sarea; i++) {
		extern int ihk_mc_pt_print_pte(struct page_table *pt, void *virt);
		sz = si->swap_info[i].end - si->swap_info[i].start;
		dkprintf("pagein: %016lx:%016lx sz(%lx)\n", si->swap_info[i].start, si->swap_info[i].end, sz);
		rs = pager_read(si, fd, (void*) si->swap_info[i].start, sz);
		if (rs != sz) goto err;
		// ihk_mc_pt_print_pte(vm->address_space->page_table, (void*) si->swap_info[i].start);
	}
	linux_close(fd);
	print_region("after pagein", vm);
	kprintf("do_pagein: done, currss(%lx)\n", vm->currss);
	vm->swapinfo = NULL;
	kfree(si->swapfname);
	kfree(si);
	return 0;
err:
	linux_close(fd);
	ekprintf("pagein: read error: return(%lx) size(%lx)\n", rs, sz);
	vm->swapinfo = NULL;
	kfree(si->swapfname);
	kfree(si);
	return -1;
}

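/*
 * Page the process out to the swap file named fname, using buf (size
 * bytes, pinned for the duration) as working memory. Anonymous,
 * writable user ranges not locked by either McKernel or Linux are
 * written to the file together with a header and the area lists; their
 * physical pages are then freed and the corresponding Linux-side
 * mappings removed. On success the swapinfo is kept in vm->swapinfo for
 * the matching do_pagein. Bit 0x04 of flag skips the removal steps.
 */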
int
do_pageout(char *fname, void *buf, size_t size, int flag)
{
	struct thread *thread = cpu_local_var(current);
	struct process_vm *vm = thread->vm;
	struct vm_regions *region = &vm->region;
	struct vm_range *range, *next;
	struct addrpair *addr;
	int i, fd;
	long cc;
	unsigned long start, end;
	ssize_t pos, sz;
	struct swapinfo *si;

	fd = -1;
	dkprintf("do_pageout: buf(%p) size(%lu) flag(%d) currss(%lx)\n",
		 buf, size, flag, vm->currss);
	if (IS_INVALID_USERADDRESS(fname, region)
	    || IS_INVALID_USERADDRESS(buf, region)
	    || IS_INVALID_LENGTH(size, region)) {
		return -EINVAL;
	}
	if (!(si = kmalloc(sizeof(struct swapinfo), IHK_MC_AP_NOWAIT))) {
		ekprintf("do_pageout: Cannot allocate working memory in kmalloc\n");
		return -ENOMEM;
	}
	memset(si, '\0', sizeof(struct swapinfo));
	cc = myalloc_init(si, buf, size);
	if (cc < 0) {
		kfree(si);
		ekprintf("do_pageout: Cannot pin buf (%p) down\n", buf);
		return cc;
	}
	si->udata_buf = myalloc(si, UDATA_BUFSIZE);
	if (si->udata_buf == NULL) {
		kfree(si);
		ekprintf("do_pageout: user buffer too small for working data\n");
		return -ENOMEM;
	}
	si->swapfname = kmalloc(strlen(fname) + 1, IHK_MC_AP_NOWAIT);
	if (si->swapfname == NULL) {
		kfree(si);
		ekprintf("do_pageout: Cannot allocate working memory in kmalloc\n");
		return -ENOMEM;
	}
	if (strcpy_from_user(si->swapfname, fname)) {
		cc = -EFAULT;
		goto err;
	}
	cc = arealist_init(si);
	if (cc < 0) {
		ekprintf("do_pageout: user buffer area must be larger than %lu bytes\n",
			 UDATA_BUFSIZE + sizeof(struct areaent)*2);
		goto err;
	}

	fd = linux_open(fname, O_RDWR|O_CREAT|O_TRUNC, 0600);
	if (fd < 0) {
		ekprintf("do_pageout: Cannot open/create file: %s\n", fname);
		cc = fd;
		goto err;
	}
	area_print(region);

	/* walk anonymous ranges, skipping text, stack, read-only, and non-user ones */
	next = lookup_process_memory_range(vm, 0, -1);
	while ((range = next)) {
		next = next_process_memory_range(vm, range);
		if (range->memobj != NULL) continue;
		if (IS_TEXT(range->start, region)
		    || IS_STACK(range->start, region)
		    || IS_INVALID_USERADDRESS(range->start, region)
		    || IS_READONLY(range->flag)
		    || IS_NOTUSER(range->flag)) continue;
		if (range->flag & VR_LOCKED) {
			/* this range is locked by McKernel */
			cc = arealist_add(si, range->start, range->end,
					  range->flag, &si->mlock_area);
			if (cc < 0) goto nomem;
			continue;
		}
		start = range->start; end = range->end;
		if ((cc = mlockcntnr_sethead(si)) < 0) goto nomem;
		/* Request the mlock list from the Linux kernel. We do not know
		 * how many addrpair entries are needed. The Linux side stores
		 * -1 in the last entry of addrpair to signal that more entries
		 * exist; the mlocklist_morereq function checks this condition. */
		do {
			if ((cc = arealist_get(si, &addr, &si->mlock_area)) < 0) goto nomem;
			cc = mlocklist_req(start, end, addr, cc);
			arealist_update(cc, &si->mlock_area);
		} while (mlocklist_morereq(si, &start));
		if (mlockcntnr_isempty(si)) { /* whole range is going to swap */
			cc = arealist_add(si, range->start, range->end,
					  range->flag, &si->swap_area);
			if (cc < 0) goto nomem;
		} else { /* partial range is going to swap */
			for (start = range->start; start < range->end;) {
				struct addrpair laddr;
				if (mlockcntnr_addrent(si, &laddr) == 0) {
					/* no more entries locked by Linux */
					cc = arealist_add(si, start, range->end,
							  range->flag,
							  &si->swap_area);
					if (cc < 0) goto nomem;
					break;
				}
				if (start < laddr.start) {
					/* swap range from start to laddr.start */
					cc = arealist_add(si, start, laddr.start,
							  range->flag,
							  &si->swap_area);
					if (cc < 0) goto nomem;
				}
				/* skip the Linux-locked part and continue */
				start = laddr.end;
				kprintf("do_pageout: start(%ld) range->end(%ld)\n",
					start, range->end);
			}
		}
	}
arealist_print("SWAP", &si->swap_area, si->swap_area.count);
|
|
arealist_print("MLOCK", &si->mlock_area, si->mlock_area.count);
|
|
si->swap_info = myalloc(si, sizeof(struct swap_areainfo)* si->swap_area.count);
|
|
si->mlock_info = myalloc(si, sizeof(struct swap_areainfo)* si->mlock_area.count);
|
|
if (si->swap_info == NULL || si->mlock_info == NULL) goto nomem;
|
|
|
|
/* preparing page store */
|
|
si->swphdr = myalloc(si, sizeof(struct swap_header));
|
|
strncpy(si->swphdr->magic, MCKERNEL_SWAP, SWAP_HLEN);
|
|
strncpy(si->swphdr->version, MCKERNEL_SWAP_VERSION, SWAP_HLEN);
|
|
si->swphdr->count_sarea = si->swap_area.count;
|
|
si->swphdr->count_marea = si->mlock_area.count;
|
|
if ((cc = pager_write(fd, si->swphdr, sizeof(struct swap_header)))
|
|
!= sizeof(struct swap_header)) {
|
|
if (cc >= 0)
|
|
cc = -EIO;
|
|
goto err;
|
|
}
|
|
pos = linux_lseek(fd, 0, SEEK_CUR);
|
|
pos += sizeof(struct swap_areainfo)*(si->swap_area.count+si->mlock_area.count);
|
|
cc = arealist_preparewrite(&si->swap_area, si->swap_info, pos, vm, 1);
|
|
if (cc != si->swap_area.count) {
|
|
ekprintf("do_pageout: ERROR file ent(%d) != list ent(%d) in swap_area\n",
|
|
cc, si->swap_area.count);
|
|
}
|
|
cc = arealist_preparewrite(&si->mlock_area, si->mlock_info, 0, vm, 0);
|
|
if (cc != si->mlock_area.count) {
|
|
ekprintf("do_pageout: ERROR file ent(%d) != list ent(%d) in swap_area\n",
|
|
cc, si->mlock_area.count);
|
|
}
|
|
/* arealists are stored */
|
|
if ((cc = arealist_write(fd, si->swap_info, si->swap_area.count)) < 0) goto err;
|
|
if ((cc = arealist_write(fd, si->mlock_info, si->mlock_area.count)) < 0) goto err;
|
|
/* now pages are stored */
|
|
for (i = 0; i < si->swap_area.count; i++) {
|
|
sz = si->swap_info[i].end - si->swap_info[i].start;
|
|
if ((cc = pager_write(fd, (void*) si->swap_info[i].start, sz)) != sz) {
|
|
if (cc >= 0)
|
|
cc = -EIO;
|
|
goto err;
|
|
}
|
|
}
|
|
	if (flag & 0x04) {
		kprintf("skipping physical memory removal\n");
		goto free_exit;
	}
	kprintf("removing physical memory\n");
	for (i = 0; i < si->swap_area.count; i++) {
		cc = ihk_mc_pt_free_range(vm->address_space->page_table,
					  vm,
					  (void*) si->swap_info[i].start,
					  (void*) si->swap_info[i].end, NULL);
		if (cc < 0) {
			kprintf("ihk_mc_pt_free_range returns: %ld\n", cc);
		}
	}
#if 0
	range->flag |= VR_PAGEOUT;
#endif
	cc = linux_close(fd);
	fd = -1;
	/*
	 * Unmap McKernel's user virtual spaces on the Linux side.
	 * From here until do_pagein completes, the non-locked user spaces,
	 * except for TEXT, STACK, and read-only pages, are invalid.
	 */
	for (i = 0; i < si->swap_area.count; i++) {
		sz = si->swap_info[i].end - si->swap_info[i].start;
		cc = linux_munmap((void*) si->swap_info[i].start, sz, 0);
		if (cc < 0) {
			kprintf("do_pageout: Cannot munmap: %lx len(%lx)\n",
				si->swap_info[i].start, sz);
		}
	}
	cc = 0;
	goto free_exit;
err:
	ekprintf("do_pageout: write error: %ld\n", cc);
	goto free_exit;
nomem:
	ekprintf("do_pageout: cannot allocate working memory\n");
	cc = -ENOMEM;
free_exit:
	if (fd >= 0)
		linux_close(fd);
	dkprintf("do_pageout: done, currss(%lx)\n", vm->currss);
	arealist_free(&si->mlock_area); arealist_free(&si->swap_area);
	if (cc != 0) {
		pager_unlink(si, si->swapfname);
		kfree(si->swapfname);
		kfree(si);
	} else {
		vm->swapinfo = si;
	}
	return cc;
}