xpmem: support large pages

1. Try to use the largest possible page size on attach
2. Pre-map resident remote pages on attach

Change-Id: I5580682a4199e94085a9bad9ce3958a0f14cdcea
This commit is contained in:
Yoshihisa Morizumi
2021-01-08 16:56:32 +09:00
committed by Masamichi Takagi
parent 3aaa5350f0
commit d2db639853
68 changed files with 4878 additions and 244 deletions

View File

@@ -142,7 +142,7 @@ int prepare_process_ranges_args_envs(struct thread *thread,
if ((error = add_process_memory_range(vm, s, e, NOPHYS, flags, NULL, 0,
pn->sections[i].len > LARGE_PAGE_SIZE ?
LARGE_PAGE_SHIFT : PAGE_SHIFT,
&range)) != 0) {
NULL, &range)) != 0) {
kprintf("ERROR: adding memory range for ELF section %i\n", i);
goto err;
}
@@ -289,7 +289,7 @@ int prepare_process_ranges_args_envs(struct thread *thread,
dkprintf("%s: args_envs: %d pages\n",
__func__, argenv_page_count);
if ((error = add_process_memory_range(vm, addr, e, args_envs_p,
flags, NULL, 0, PAGE_SHIFT, NULL)) != 0) {
flags, NULL, 0, PAGE_SHIFT, NULL, NULL)) != 0){
ihk_mc_free_pages_user(args_envs, argenv_page_count);
kprintf("ERROR: adding memory range for args/envs\n");
goto err;

View File

@@ -72,4 +72,5 @@ extern int anon_on_demand;
#ifdef ENABLE_FUGAKU_HACKS
extern int hugetlbfs_on_demand;
#endif
extern int xpmem_page_in_remote_on_attach;
#endif

View File

@@ -55,6 +55,7 @@
#define VR_MEMTYPE_MASK 0x0f000000
#define VR_PAGEOUT 0x10000000
#define VR_DONTDUMP 0x20000000
#define VR_XPMEM 0x40000000
#define VR_WIPEONFORK 0x80000000
#define PROT_TO_VR_FLAG(prot) (((unsigned long)(prot) << 16) & VR_PROT_MASK)
@@ -850,7 +851,7 @@ int add_process_memory_range(struct process_vm *vm,
unsigned long start, unsigned long end,
unsigned long phys, unsigned long flag,
struct memobj *memobj, off_t offset,
int pgshift, struct vm_range **rp);
int pgshift, void *private_data, struct vm_range **rp);
int remove_process_memory_range(struct process_vm *vm, unsigned long start,
unsigned long end, int *ro_freedp);
int split_process_memory_range(struct process_vm *vm,
@@ -875,6 +876,9 @@ struct vm_range *previous_process_memory_range(
int extend_up_process_memory_range(struct process_vm *vm,
struct vm_range *range, uintptr_t newend);
int page_fault_process_memory_range(struct process_vm *vm,
struct vm_range *range,
uintptr_t fault_addr, uint64_t reason);
int page_fault_process_vm(struct process_vm *fault_vm, void *fault_addr,
uint64_t reason);
int remove_process_region(struct process_vm *vm,

View File

@@ -535,7 +535,7 @@ enum set_cputime_mode {
void set_cputime(enum set_cputime_mode mode);
int do_munmap(void *addr, size_t len, int holding_memory_range_lock);
intptr_t do_mmap(uintptr_t addr0, size_t len0, int prot, int flags, int fd,
off_t off0);
off_t off0, const int vrf0, void *private_data);
void clear_host_pte(uintptr_t addr, size_t len, int holding_memory_range_lock);
typedef int32_t key_t;
int do_shmget(key_t key, size_t size, int shmflg);

View File

@@ -27,6 +27,8 @@ int xpmem_remove_process_memory_range(struct process_vm *vm,
struct vm_range *vmr);
int xpmem_fault_process_memory_range(struct process_vm *vm,
struct vm_range *vmr, unsigned long vaddr, uint64_t reason);
int xpmem_update_process_page_table(struct process_vm *vm,
struct vm_range *vmr);
struct xpmem_attachment {
ihk_rwspinlock_t at_lock; /* att lock */

View File

@@ -248,11 +248,12 @@ static void xpmem_clear_PTEs_of_att(struct xpmem_attachment *, unsigned long,
static int xpmem_remap_pte(struct process_vm *, struct vm_range *,
unsigned long, uint64_t, struct xpmem_segment *, unsigned long);
static int xpmem_ensure_valid_page(struct xpmem_segment *, unsigned long);
static int xpmem_ensure_valid_page(struct xpmem_segment *, unsigned long,
int);
static pte_t * xpmem_vaddr_to_pte(struct process_vm *, unsigned long,
size_t *pgsize);
static int xpmem_pin_page(struct xpmem_thread_group *, struct thread *,
struct process_vm *, unsigned long);
struct process_vm *, unsigned long, int);
static void xpmem_unpin_pages(struct xpmem_segment *, struct process_vm *,
unsigned long, size_t);
@@ -406,7 +407,7 @@ static inline void xpmem_tg_ref(
DBUG_ON(ihk_atomic_read(&tg->refcnt) <= 0);
ihk_atomic_inc(&tg->refcnt);
XPMEM_DEBUG("return: tg->refcnt=%d", tg->refcnt);
//XPMEM_DEBUG("return: tg->refcnt=%d", tg->refcnt);
}
static inline void xpmem_seg_ref(
@@ -415,7 +416,7 @@ static inline void xpmem_seg_ref(
DBUG_ON(ihk_atomic_read(&seg->refcnt) <= 0);
ihk_atomic_inc(&seg->refcnt);
XPMEM_DEBUG("return: seg->refcnt=%d", seg->refcnt);
//XPMEM_DEBUG("return: seg->refcnt=%d", seg->refcnt);
}
static inline void xpmem_ap_ref(
@@ -424,7 +425,7 @@ static inline void xpmem_ap_ref(
DBUG_ON(ihk_atomic_read(&ap->refcnt) <= 0);
ihk_atomic_inc(&ap->refcnt);
XPMEM_DEBUG("return: ap->refcnt=%d", ap->refcnt);
//XPMEM_DEBUG("return: ap->refcnt=%d", ap->refcnt);
}
static inline void xpmem_att_ref(
@@ -433,7 +434,7 @@ static inline void xpmem_att_ref(
DBUG_ON(ihk_atomic_read(&att->refcnt) <= 0);
ihk_atomic_inc(&att->refcnt);
XPMEM_DEBUG("return: att->refcnt=%d", att->refcnt);
//XPMEM_DEBUG("return: att->refcnt=%d", att->refcnt);
}
static inline int xpmem_is_private_data(

View File

@@ -66,6 +66,7 @@ int anon_on_demand = 0;
#ifdef ENABLE_FUGAKU_HACKS
int hugetlbfs_on_demand;
#endif
int xpmem_page_in_remote_on_attach;
int sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
static struct ihk_mc_pa_ops *pa_ops;
@@ -2144,6 +2145,11 @@ void mem_init(void)
kprintf("Demand paging on ANONYMOUS mappings enabled.\n");
anon_on_demand = 1;
}
if (find_command_line("xpmem_page_in_remote_on_attach")) {
kprintf("Demand paging on XPMEM remote mappings enabled.\n");
xpmem_page_in_remote_on_attach = 1;
}
#ifdef ENABLE_FUGAKU_HACKS
if (find_command_line("hugetlbfs_on_demand")) {

View File

@@ -1465,7 +1465,7 @@ int add_process_memory_range(struct process_vm *vm,
unsigned long start, unsigned long end,
unsigned long phys, unsigned long flag,
struct memobj *memobj, off_t offset,
int pgshift, struct vm_range **rp)
int pgshift, void *private_data, struct vm_range **rp)
{
dkprintf("%s: start=%lx,end=%lx,phys=%lx,flag=%lx\n", __FUNCTION__, start, end, phys, flag);
struct vm_range *range;
@@ -1493,7 +1493,7 @@ int add_process_memory_range(struct process_vm *vm,
range->memobj = memobj;
range->objoff = offset;
range->pgshift = pgshift;
range->private_data = NULL;
range->private_data = private_data;
range->straight_start = 0;
#ifdef ENABLE_TOFU
INIT_LIST_HEAD(&range->tofu_stag_list);
@@ -1517,6 +1517,10 @@ int add_process_memory_range(struct process_vm *vm,
else if (flag & VR_IO_NOCACHE) {
rc = update_process_page_table(vm, range, phys, PTATTR_UNCACHABLE);
}
else if (flag & VR_XPMEM) {
range->memobj->flags |= MF_XPMEM;
// xpmem_update_process_page_table() is called in do_mmap()
}
else if (flag & VR_DEMAND_PAGING) {
dkprintf("%s: range: 0x%lx - 0x%lx is demand paging\n",
__FUNCTION__, range->start, range->end);
@@ -1539,7 +1543,8 @@ int add_process_memory_range(struct process_vm *vm,
}
/* Clear content! */
if (phys != NOPHYS && !(flag & (VR_REMOTE | VR_DEMAND_PAGING))
if (phys != NOPHYS
&& !(flag & (VR_REMOTE | VR_DEMAND_PAGING | VR_XPMEM))
&& ((flag & VR_PROT_MASK) != VR_PROT_NONE)) {
if (!zero_at_free) {
@@ -2074,7 +2079,9 @@ out:
return error;
}
static int page_fault_process_memory_range(struct process_vm *vm, struct vm_range *range, uintptr_t fault_addr, uint64_t reason)
int page_fault_process_memory_range(struct process_vm *vm,
struct vm_range *range,
uintptr_t fault_addr, uint64_t reason)
{
int error;
pte_t *ptep;
@@ -2621,7 +2628,8 @@ int init_process_stack(struct thread *thread, struct program_load_desc *pn,
vrflag |= VR_MAXPROT_READ | VR_MAXPROT_WRITE | VR_MAXPROT_EXEC;
#define NOPHYS ((uintptr_t)-1)
if ((rc = add_process_memory_range(thread->vm, start, end, NOPHYS,
vrflag, NULL, 0, USER_STACK_PAGE_SHIFT, &range)) != 0) {
vrflag, NULL, 0, USER_STACK_PAGE_SHIFT,
NULL, &range)) != 0) {
ihk_mc_free_pages_user(stack, minsz >> PAGE_SHIFT);
kprintf("%s: error addding process memory range: %d\n", rc);
return rc;
@@ -2795,7 +2803,7 @@ unsigned long extend_process_region(struct process_vm *vm,
if ((rc = add_process_memory_range(vm, end_allocated, new_end_allocated,
(p == 0 ? 0 : virt_to_phys(p)), flag, NULL, 0,
align_shift, NULL)) != 0) {
align_shift, NULL, NULL)) != 0) {
ihk_mc_free_pages_user(p, npages);
return end_allocated;
}

View File

@@ -1772,6 +1772,12 @@ static int search_free_space(size_t len, int pgshift, uintptr_t *addrp)
/* try given addr first */
addr = *addrp;
if (addr != 0) {
if ((region->user_end <= addr)
|| ((region->user_end - len) < addr)) {
error = -ENOMEM;
goto out;
}
range = lookup_process_memory_range(thread->vm, addr, addr+len);
if (range == NULL)
goto out;
@@ -1807,7 +1813,8 @@ out:
intptr_t
do_mmap(const uintptr_t addr0, const size_t len0, const int prot,
const int flags, const int fd, const off_t off0)
const int flags, const int fd, const off_t off0,
const int vrf0, void *private_data)
{
struct thread *thread = cpu_local_var(current);
struct vm_regions *region = &thread->vm->region;
@@ -1815,11 +1822,11 @@ do_mmap(const uintptr_t addr0, const size_t len0, const int prot,
size_t len = len0;
size_t populate_len = 0;
off_t off;
int error;
intptr_t npages;
int error = 0;
intptr_t npages = 0;
int p2align;
void *p = NULL;
int vrflags;
int vrflags = VR_NONE;
uintptr_t phys;
intptr_t straight_phys;
struct memobj *memobj = NULL;
@@ -1941,7 +1948,7 @@ do_mmap(const uintptr_t addr0, const size_t len0, const int prot,
if (add_process_memory_range(proc->vm, (unsigned long)proc->straight_va,
(unsigned long)proc->straight_va + proc->straight_len,
NOPHYS, vrflags, NULL, 0,
PAGE_SHIFT + p2align, &range) != 0) {
PAGE_SHIFT + p2align, private_data, &range) != 0) {
kprintf("%s: error: adding straight memory range \n",
__FUNCTION__);
proc->straight_va = 0;
@@ -2076,7 +2083,8 @@ straight_out:
}
p2align = pgshift - PAGE_SHIFT;
}
else if ((flags & MAP_PRIVATE) && (flags & MAP_ANONYMOUS)
else if ((((flags & MAP_PRIVATE) && (flags & MAP_ANONYMOUS))
|| (vrf0 & VR_XPMEM))
&& !proc->thp_disable) {
pgshift = 0; /* transparent huge page */
p2align = PAGE_P2ALIGN;
@@ -2116,22 +2124,24 @@ straight_out:
}
else if (flags & MAP_ANONYMOUS) {
/* Obtain mapping address */
error = search_free_space(len, PAGE_SHIFT + p2align, &addr);
error = search_free_space(len,
PAGE_SHIFT + p2align, &addr);
if (error) {
ekprintf("do_mmap:search_free_space(%lx,%lx,%d) failed. %d\n",
len, region->map_end, p2align, error);
kprintf("%s: error: search_free_space(%lx,%lx,%lx) failed. %d\n",
__func__, len, PAGE_SHIFT + p2align, addr, error);
goto out;
}
}
/* do the map */
vrflags = VR_NONE;
vrflags |= vrf0;
vrflags |= PROT_TO_VR_FLAG(prot);
vrflags |= (flags & MAP_PRIVATE)? VR_PRIVATE: 0;
vrflags |= (flags & MAP_LOCKED)? VR_LOCKED: 0;
vrflags |= VR_DEMAND_PAGING;
if (flags & MAP_ANONYMOUS) {
if (!anon_on_demand && (flags & MAP_PRIVATE)) {
if (flags & MAP_ANONYMOUS && !anon_on_demand) {
if (flags & MAP_PRIVATE) {
vrflags &= ~VR_DEMAND_PAGING;
}
}
@@ -2276,6 +2286,7 @@ straight_out:
}
/* Prepopulated ANONYMOUS mapping */
else if (!(vrflags & VR_DEMAND_PAGING)
&& !(flags & MAP_SHARED)
&& ((vrflags & VR_PROT_MASK) != VR_PROT_NONE)) {
npages = len >> PAGE_SHIFT;
/* Small allocations mostly benefit from closest RAM,
@@ -2379,7 +2390,7 @@ straight_out:
}
error = add_process_memory_range(thread->vm, addr, addr+len, phys,
vrflags, memobj, off, pgshift, &range);
vrflags, memobj, off, pgshift, private_data, &range);
if (error) {
kprintf("%s: add_process_memory_range failed for 0x%lx:%lu"
" flags: %lx, vrflags: %lx, pgshift: %d, error: %d\n",
@@ -2467,6 +2478,19 @@ out:
}
ihk_rwspinlock_write_unlock_noirq(&thread->vm->memory_range_lock);
ihk_rwspinlock_read_lock_noirq(&thread->vm->memory_range_lock);
if (!error && range && range->memobj &&
(range->memobj->flags & MF_XPMEM)) {
error = xpmem_update_process_page_table(thread->vm, range);
if (error) {
ekprintf("%s: xpmem_update_process_page_table(): "
"vm: %p, range: %lx-%lx failed %d\n",
__func__, thread->vm,
range->start, range->end, error);
}
}
ihk_rwspinlock_read_unlock_noirq(&thread->vm->memory_range_lock);
if (!error && populated_mapping &&
!((vrflags & VR_PROT_MASK) == VR_PROT_NONE) && !range->straight_start) {
error = populate_process_memory(thread->vm,
@@ -2496,7 +2520,7 @@ out:
}
}
if (p) {
if (p && npages > 0) {
ihk_mc_free_pages_user(p, npages);
}
if (memobj) {
@@ -5009,7 +5033,7 @@ perf_mmap(struct mckfd *sfd, ihk_mc_user_context_t *ctx)
flags |= MAP_ANONYMOUS;
prot |= PROT_WRITE;
rc = do_mmap(addr0, len0, prot, flags, fd, off0);
rc = do_mmap(addr0, len0, prot, flags, fd, off0, 0, NULL);
// setup perf_event_mmap_page
page = (struct perf_event_mmap_page *)rc;
@@ -6542,7 +6566,7 @@ SYSCALL_DECLARE(shmat)
}
error = add_process_memory_range(vm, addr, addr+len, -1,
vrflags, &obj->memobj, 0, obj->pgshift, NULL);
vrflags, &obj->memobj, 0, obj->pgshift, NULL, NULL);
if (error) {
if (!(prot & PROT_WRITE)) {
(void)set_host_vma(addr, len, PROT_READ | PROT_WRITE | PROT_EXEC, 1/* holding memory_range_lock */);
@@ -9441,7 +9465,7 @@ SYSCALL_DECLARE(mremap)
error = add_process_memory_range(thread->vm, newstart, newend, -1,
range->flag, range->memobj,
range->objoff + (oldstart - range->start),
range->pgshift, NULL);
0, NULL, NULL);
if (error) {
ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):"
"add failed. %d\n",

View File

@@ -25,6 +25,7 @@
#include <kmalloc.h>
#include <limits.h>
#include <memobj.h>
#include <process.h>
#include <mman.h>
#include <page.h>
#include <string.h>
@@ -222,7 +223,8 @@ static int xpmem_ioctl(
attach_info.fd, attach_info.flags,
&at_vaddr);
if (ret != 0) {
XPMEM_DEBUG("return: cmd=0x%x, ret=%d", cmd, ret);
XPMEM_DEBUG("return: at_vaddr: %lx, cmd=0x%x, ret=%d",
at_vaddr, cmd, ret);
return ret;
}
@@ -233,7 +235,8 @@ static int xpmem_ioctl(
return -EFAULT;
}
XPMEM_DEBUG("return: cmd=0x%x, ret=%d", cmd, ret);
XPMEM_DEBUG("XPMEM_CMD_ATTACH: return: at_vaddr: %lx, cmd=0x%x, ret=%d",
at_vaddr, cmd, ret);
return ret;
}
@@ -447,8 +450,8 @@ static int xpmem_make(
* multiple of pages in size.
*/
if (offset_in_page(vaddr) != 0 ||
/* Special treatment of -1UL */
(offset_in_page(size) != 0 && size != 0xffffffffffffffff)) {
/* Special treatment of -1UL */
(offset_in_page(size) != 0 && size != 0xffffffffffffffff)) {
xpmem_tg_deref(seg_tg);
XPMEM_DEBUG("return: ret=%d", -EINVAL);
return -EINVAL;
@@ -1011,7 +1014,6 @@ static int xpmem_attach(
struct xpmem_segment *seg;
struct xpmem_attachment *att;
unsigned long at_lock;
struct vm_range *vmr;
struct process_vm *vm = cpu_local_var(current)->vm;
XPMEM_DEBUG("call: apid=0x%lx, offset=0x%lx, size=0x%lx, vaddr=0x%lx, "
@@ -1137,37 +1139,18 @@ static int xpmem_attach(
XPMEM_DEBUG("do_mmap(): vaddr=0x%lx, size=0x%lx, prot_flags=0x%lx, "
"flags=0x%lx, fd=%d, offset=0x%lx",
vaddr, size, prot_flags, flags, mckfd->fd, offset);
/* The new range uses on-demand paging and is associated with shmobj because of
MAP_ANONYMOUS && !MAP_PRIVATE && MAP_SHARED */
at_vaddr = do_mmap(vaddr, size, prot_flags, flags, mckfd->fd, offset);
/* The new range is associated with shmobj because of
* MAP_ANONYMOUS && !MAP_PRIVATE && MAP_SHARED. Note that MAP_FIXED
* support prevents us from reusing segment vm_range when segment vm
* and attach vm is the same.
*/
at_vaddr = do_mmap(vaddr, size, prot_flags, flags, mckfd->fd,
offset, VR_XPMEM, att);
if (IS_ERR((void *)(uintptr_t)at_vaddr)) {
ret = at_vaddr;
goto out_2;
}
XPMEM_DEBUG("at_vaddr=0x%lx", at_vaddr);
att->at_vaddr = at_vaddr;
ihk_rwspinlock_read_lock_noirq(&vm->memory_range_lock);
vmr = lookup_process_memory_range(vm, at_vaddr, at_vaddr + 1);
/* To identify pages of XPMEM attachment for rusage accounting */
if(vmr->memobj) {
vmr->memobj->flags |= MF_XPMEM;
} else {
ekprintf("%s: vmr->memobj equals to NULL\n", __FUNCTION__);
}
ihk_rwspinlock_read_unlock_noirq(&vm->memory_range_lock);
if (!vmr) {
ret = -ENOENT;
goto out_2;
}
vmr->private_data = att;
att->at_vmr = vmr;
*at_vaddr_p = at_vaddr + offset_in_page(att->vaddr);
@@ -1193,7 +1176,6 @@ out_1:
return ret;
}
static int xpmem_detach(
unsigned long at_vaddr)
{
@@ -1757,27 +1739,33 @@ out:
}
int xpmem_fault_process_memory_range(
static int _xpmem_fault_process_memory_range(
struct process_vm *vm,
struct vm_range *vmr,
unsigned long vaddr,
uint64_t reason)
uint64_t reason,
int page_in_remote)
{
int ret = 0;
unsigned long seg_vaddr = 0;
pte_t *pte = NULL;
pte_t *old_pte = NULL;
unsigned long seg_vaddr;
struct xpmem_thread_group *ap_tg;
struct xpmem_thread_group *seg_tg;
struct xpmem_access_permit *ap;
struct xpmem_attachment *att;
struct xpmem_segment *seg;
size_t pgsize;
unsigned long at_lock;
int att_locked = 0;
pte_t *att_pte;
void *att_pgaddr;
size_t att_pgsize;
int att_p2align;
pte_t *seg_pte;
size_t seg_pgsize;
uintptr_t seg_phys;
uintptr_t seg_phys_plus_off;
uintptr_t seg_phys_aligned;
enum ihk_mc_pt_attribute att_attr;
XPMEM_DEBUG("call: vmr=0x%p, vaddr=0x%lx, reason=0x%lx",
vmr, vaddr, reason);
XPMEM_DEBUG("call: vmr=0x%p, vaddr=0x%lx, reason=0x%lx, page_in_remote: %d",
vmr, vaddr, reason, page_in_remote);
att = (struct xpmem_attachment *)vmr->private_data;
if (att == NULL) {
@@ -1804,70 +1792,169 @@ int xpmem_fault_process_memory_range(
seg_tg = seg->tg;
xpmem_tg_ref(seg_tg);
at_lock = ihk_rwspinlock_write_lock(&att->at_lock);
att_locked = 1;
if ((seg->flags & XPMEM_FLAG_DESTROYING) ||
(seg_tg->flags & XPMEM_FLAG_DESTROYING)) {
ret = -ENOENT;
goto out_2;
ret = -EFAULT;
goto out;
}
if ((att->flags & XPMEM_FLAG_DESTROYING) ||
(ap_tg->flags & XPMEM_FLAG_DESTROYING) ||
(seg_tg->flags & XPMEM_FLAG_DESTROYING)) {
goto out_2;
kprintf("%s: XPMEM_FLAG_DESTROYING\n",
__func__);
ret = -EFAULT;
goto out;
}
if (vaddr < att->at_vaddr || vaddr + 1 > att->at_vaddr + att->at_size) {
goto out_2;
kprintf("%s: vaddr: %lx, att->at_vaddr: %lx, att->at_size: %lx\n",
__func__, vaddr, att->at_vaddr, att->at_size);
ret = -EFAULT;
goto out;
}
seg_vaddr = (att->vaddr & PAGE_MASK) + (vaddr - att->at_vaddr);
/* page-in remote pages on page-fault or (on attach and
* xpmem_page_in_remote_on_attach isn't specified)
*/
seg_vaddr = att->vaddr + (vaddr - att->at_vaddr);
XPMEM_DEBUG("vaddr=%lx, seg_vaddr=%lx", vaddr, seg_vaddr);
ret = xpmem_ensure_valid_page(seg, seg_vaddr);
ret = xpmem_ensure_valid_page(seg, seg_vaddr, page_in_remote);
if (ret != 0) {
goto out_2;
goto out;
}
pte = xpmem_vaddr_to_pte(seg_tg->vm, seg_vaddr, &pgsize);
if (is_remote_vm(seg_tg->vm)) {
ihk_rwspinlock_read_lock_noirq(&seg_tg->vm->memory_range_lock);
}
if (seg_tg->vm->proc->straight_va &&
seg_vaddr >= (unsigned long)seg_tg->vm->proc->straight_va &&
seg_vaddr < ((unsigned long)seg_tg->vm->proc->straight_va +
seg_tg->vm->proc->straight_len)) {
seg_phys = (((unsigned long)seg_vaddr & PAGE_MASK) -
(unsigned long)seg_tg->vm->proc->straight_va) +
seg_tg->vm->proc->straight_pa;
seg_pgsize = (1UL << 29);
XPMEM_DEBUG("seg_vaddr: 0x%lx in PID %d is straight -> phys: 0x%lx",
(unsigned long)seg_vaddr & PAGE_MASK,
seg_tg->tgid, seg_phys);
}
else {
seg_pte = xpmem_vaddr_to_pte(seg_tg->vm, seg_vaddr, &seg_pgsize);
/* map only resident remote pages on attach and
* xpmem_page_in_remote_on_attach is specified
*/
if (!seg_pte || pte_is_null(seg_pte)) {
ret = page_in_remote ? -EFAULT : 0;
if (is_remote_vm(seg_tg->vm)) {
ihk_rwspinlock_read_unlock_noirq(&seg_tg->vm->memory_range_lock);
}
goto out;
}
seg_phys = pte_get_phys(seg_pte);
}
/* clear lower bits of the contiguous-PTE tail entries */
seg_phys_plus_off = (seg_phys & ~(seg_pgsize - 1)) |
(seg_vaddr & (seg_pgsize - 1));
XPMEM_DEBUG("seg_vaddr: %lx, seg_phys: %lx, seg_phys_plus_off: %lx, seg_pgsize: %lx",
seg_vaddr, seg_phys, seg_phys_plus_off, seg_pgsize);
if (is_remote_vm(seg_tg->vm)) {
ihk_rwspinlock_read_unlock_noirq(&seg_tg->vm->memory_range_lock);
}
/* find largest page-size fitting vm range and segment page */
att_pte = ihk_mc_pt_lookup_pte(vm->address_space->page_table,
(void *)vaddr, vmr->pgshift, &att_pgaddr, &att_pgsize,
&att_p2align);
while ((unsigned long)att_pgaddr < vmr->start ||
vmr->end < (uintptr_t)att_pgaddr + att_pgsize ||
att_pgsize > seg_pgsize) {
att_pte = NULL;
ret = arch_get_smaller_page_size(NULL, att_pgsize,
&att_pgsize, &att_p2align);
if (ret) {
kprintf("%s: arch_get_smaller_page_size failed: "
" range: %lx-%lx, pgsize: %lx, ret: %d\n",
__func__, vmr->start, vmr->end, att_pgsize,
ret);
goto out;
}
att_pgaddr = (void *)(vaddr & ~(att_pgsize - 1));
}
arch_adjust_allocate_page_size(vm->address_space->page_table,
vaddr, att_pte, &att_pgaddr,
&att_pgsize);
seg_phys_aligned = seg_phys_plus_off & ~(att_pgsize - 1);
XPMEM_DEBUG("att_pte=%p, att_pgaddr=0x%p, att_pgsize=%lu, "
"att_p2align=%d",
att_pte, att_pgaddr, att_pgsize, att_p2align);
/* last arg is not used */
att_attr = arch_vrflag_to_ptattr(vmr->flag, reason, NULL);
XPMEM_DEBUG("att_attr=0x%lx", att_attr);
if (att_pte && !pte_is_null(att_pte)) {
unsigned long att_phys = pte_get_phys(att_pte);
if (att_phys != seg_phys_aligned) {
ret = -EFAULT;
ekprintf("%s: ERROR: pte mismatch: "
"0x%lx != 0x%lx\n",
__func__, att_phys, seg_phys_aligned);
}
if (page_in_remote) {
ihk_atomic_dec(&seg->tg->n_pinned);
}
goto out;
}
XPMEM_DEBUG("att_pgaddr: %lx, att_pgsize: %lx, "
"seg_vaddr: %lx, seg_pgsize: %lx, "
"seg_phys_aligned: %lx\n",
att_pgaddr, att_pgsize, seg_vaddr,
seg_pgsize, seg_phys_aligned);
if (att_pte && !pgsize_is_contiguous(att_pgsize)) {
ret = ihk_mc_pt_set_pte(vm->address_space->page_table,
att_pte, att_pgsize,
seg_phys_aligned,
att_attr);
if (ret) {
ret = -EFAULT;
ekprintf("%s: ERROR: ihk_mc_pt_set_pte() failed %d\n",
__func__, ret);
goto out;
}
}
else {
ret = ihk_mc_pt_set_range(vm->address_space->page_table, vm,
att_pgaddr, att_pgaddr + att_pgsize,
seg_phys_aligned,
att_attr, vmr->pgshift, vmr, 1);
if (ret) {
ret = -EFAULT;
ekprintf("%s: ERROR: ihk_mc_pt_set_range() failed %d\n",
__func__, ret);
goto out;
}
}
att->flags |= XPMEM_FLAG_VALIDPTEs;
out_2:
xpmem_ap_deref(ap);
xpmem_tg_deref(ap_tg);
if (pte && !pte_is_null(pte)) {
old_pte = xpmem_vaddr_to_pte(cpu_local_var(current)->vm, vaddr,
&pgsize);
if (old_pte && !pte_is_null(old_pte)) {
if (*old_pte != *pte) {
ret = -EFAULT;
ekprintf("%s: ERROR: pte mismatch: "
"0x%lx != 0x%lx\n",
__FUNCTION__, *old_pte, *pte);
}
ihk_atomic_dec(&seg->tg->n_pinned);
goto out_1;
}
ret = xpmem_remap_pte(vm, vmr, vaddr, reason, seg, seg_vaddr);
if (ret) {
ekprintf("%s: ERROR: xpmem_remap_pte() failed %d\n",
__FUNCTION__, ret);
}
}
flush_tlb_single(vaddr);
out_1:
if (att_locked) {
ihk_rwspinlock_write_unlock(&att->at_lock, at_lock);
}
out:
xpmem_ap_deref(ap);
xpmem_tg_deref(ap_tg);
xpmem_tg_deref(seg_tg);
xpmem_seg_deref(seg);
xpmem_att_deref(att);
@@ -1877,125 +1964,124 @@ out_1:
return ret;
}
static int xpmem_remap_pte(
int xpmem_fault_process_memory_range(
struct process_vm *vm,
struct vm_range *vmr,
unsigned long vaddr,
uint64_t reason,
struct xpmem_segment *seg,
unsigned long seg_vaddr)
uint64_t reason)
{
int ret;
struct xpmem_thread_group *seg_tg = seg->tg;
struct vm_range *seg_vmr;
pte_t *seg_pte;
void *seg_pgaddr;
size_t seg_pgsize;
int seg_p2align;
uintptr_t seg_phys;
pte_t *att_pte;
void *att_pgaddr;
size_t att_pgsize;
int att_p2align;
enum ihk_mc_pt_attribute att_attr;
unsigned long at_lock;
struct xpmem_attachment *att;
XPMEM_DEBUG("call: vmr=0x%p, vaddr=0x%lx, reason=0x%lx, segid=0x%lx, "
"seg_vaddr=0x%lx",
vmr, vaddr, reason, seg->segid, seg_vaddr);
att = (struct xpmem_attachment *)vmr->private_data;
if (att == NULL) {
return -EFAULT;
}
at_lock = ihk_rwspinlock_read_lock(&att->at_lock);
ret = _xpmem_fault_process_memory_range(vm, vmr, vaddr, reason, 1);
ihk_rwspinlock_read_unlock(&att->at_lock, at_lock);
return ret;
}
if (is_remote_vm(seg_tg->vm)) {
ihk_rwspinlock_read_lock_noirq(&seg_tg->vm->memory_range_lock);
int xpmem_update_process_page_table(
struct process_vm *vm, struct vm_range *vmr)
{
int ret = 0;
unsigned long vaddr;
pte_t *pte;
size_t pgsize;
struct xpmem_thread_group *ap_tg;
struct xpmem_thread_group *seg_tg;
struct xpmem_access_permit *ap;
struct xpmem_attachment *att;
struct xpmem_segment *seg;
XPMEM_DEBUG("call: vmr=0x%p", vmr);
att = (struct xpmem_attachment *)vmr->private_data;
if (att == NULL) {
return -EFAULT;
}
seg_vmr = lookup_process_memory_range(seg_tg->vm, seg_vaddr,
seg_vaddr + 1);
xpmem_att_ref(att);
ap = att->ap;
xpmem_ap_ref(ap);
ap_tg = ap->tg;
xpmem_tg_ref(ap_tg);
if (!seg_vmr) {
if ((ap->flags & XPMEM_FLAG_DESTROYING) ||
(ap_tg->flags & XPMEM_FLAG_DESTROYING)) {
ret = -EFAULT;
ekprintf("%s: ERROR: lookup_process_memory_range() failed\n",
__FUNCTION__);
goto out;
goto out_1;
}
if (seg_tg->vm->proc->straight_va &&
seg_vaddr >= (unsigned long)seg_tg->vm->proc->straight_va &&
seg_vaddr < ((unsigned long)seg_tg->vm->proc->straight_va +
seg_tg->vm->proc->straight_len)) {
seg_phys = (((unsigned long)seg_vaddr & PAGE_MASK) -
(unsigned long)seg_tg->vm->proc->straight_va) +
seg_tg->vm->proc->straight_pa;
dkprintf("%s: 0x%lx in PID %d is straight -> phys: 0x%lx\n",
__func__, (unsigned long)seg_vaddr & PAGE_MASK,
seg_tg->tgid, seg_phys);
}
else {
DBUG_ON(cpu_local_var(current)->proc->pid != ap_tg->tgid);
DBUG_ON(ap->mode != XPMEM_RDWR);
seg_pte = ihk_mc_pt_lookup_pte(seg_tg->vm->address_space->page_table,
(void *)seg_vaddr, seg_vmr->pgshift, &seg_pgaddr, &seg_pgsize,
&seg_p2align);
if (!seg_pte) {
ret = -EFAULT;
ekprintf("%s: ERROR: ihk_mc_pt_lookup_pte() failed\n",
__FUNCTION__);
goto out;
}
XPMEM_DEBUG("seg_pte=0x%016lx, seg_pgaddr=0x%p, seg_pgsize=%lu, "
"seg_p2align=%d",
*seg_pte, seg_pgaddr, seg_pgsize, seg_p2align);
seg = ap->seg;
xpmem_seg_ref(seg);
seg_tg = seg->tg;
xpmem_tg_ref(seg_tg);
seg_phys = pte_get_phys(seg_pte);
XPMEM_DEBUG("seg_phys=0x%lx", seg_phys);
if ((seg->flags & XPMEM_FLAG_DESTROYING) ||
(seg_tg->flags & XPMEM_FLAG_DESTROYING)) {
ret = -ENOENT;
goto out_2;
}
att_pte = ihk_mc_pt_lookup_pte(vm->address_space->page_table,
(void *)vaddr, vmr->pgshift, &att_pgaddr, &att_pgsize,
&att_p2align);
XPMEM_DEBUG("att_pte=%p, att_pgaddr=0x%p, att_pgsize=%lu, "
"att_p2align=%d",
att_pte, att_pgaddr, att_pgsize, att_p2align);
att->at_vaddr = vmr->start;
att->at_vmr = vmr;
att_attr = arch_vrflag_to_ptattr(vmr->flag, reason, att_pte);
XPMEM_DEBUG("att_attr=0x%lx", att_attr);
if ((att->flags & XPMEM_FLAG_DESTROYING) ||
(ap_tg->flags & XPMEM_FLAG_DESTROYING) ||
(seg_tg->flags & XPMEM_FLAG_DESTROYING)) {
goto out_2;
}
if (att_pte) {
ret = ihk_mc_pt_set_pte(vm->address_space->page_table, att_pte,
att_pgsize, seg_phys, att_attr);
for (vaddr = vmr->start; vaddr < vmr->end; vaddr += pgsize) {
XPMEM_DEBUG("vmr: %lx-%lx, vaddr: %lx",
vmr->start, vmr->end, vaddr);
ret = _xpmem_fault_process_memory_range(vm, vmr, vaddr,
0,
xpmem_page_in_remote_on_attach);
if (ret) {
ret = -EFAULT;
ekprintf("%s: ERROR: ihk_mc_pt_set_pte() failed %d\n",
__FUNCTION__, ret);
goto out;
ekprintf("%s: ERROR: "
"_xpmem_fault_process_memory_range() "
"failed %d\n", __func__, ret);
}
// memory_stat_rss_add() is called by the process hosting the memory area
}
else {
ret = ihk_mc_pt_set_range(vm->address_space->page_table, vm,
att_pgaddr, att_pgaddr + att_pgsize, seg_phys, att_attr,
vmr->pgshift, vmr, 0);
if (ret) {
ret = -EFAULT;
ekprintf("%s: ERROR: ihk_mc_pt_set_range() failed %d\n",
__FUNCTION__, ret);
goto out;
pte = ihk_mc_pt_lookup_pte(vm->address_space->page_table,
(void *)vaddr, vmr->pgshift,
NULL, &pgsize, NULL);
/* when segment page is not resident and
* xpmem_page_in_remote_on_attach is specified
*/
if (!pte || pte_is_null(pte)) {
pgsize = PAGE_SIZE;
}
// memory_stat_rss_add() is called by the process hosting the memory area
}
out:
if (is_remote_vm(seg_tg->vm)) {
ihk_rwspinlock_read_unlock_noirq(&seg_tg->vm->memory_range_lock);
}
out_2:
xpmem_tg_deref(seg_tg);
xpmem_seg_deref(seg);
out_1:
xpmem_att_deref(att);
xpmem_ap_deref(ap);
xpmem_tg_deref(ap_tg);
XPMEM_DEBUG("return: ret=%d", ret);
return ret;
}
static int xpmem_ensure_valid_page(
struct xpmem_segment *seg,
unsigned long vaddr)
unsigned long vaddr,
int page_in)
{
int ret;
struct xpmem_thread_group *seg_tg = seg->tg;
@@ -2005,7 +2091,8 @@ static int xpmem_ensure_valid_page(
if (seg->flags & XPMEM_FLAG_DESTROYING)
return -ENOENT;
ret = xpmem_pin_page(seg_tg, seg_tg->group_leader, seg_tg->vm, vaddr);
ret = xpmem_pin_page(seg_tg, seg_tg->group_leader, seg_tg->vm, vaddr,
page_in);
XPMEM_DEBUG("return: ret=%d", ret);
@@ -2043,8 +2130,7 @@ static pte_t * xpmem_vaddr_to_pte(
}
out:
return pte;
return pte;
}
@@ -2052,21 +2138,26 @@ static int xpmem_pin_page(
struct xpmem_thread_group *tg,
struct thread *src_thread,
struct process_vm *src_vm,
unsigned long vaddr)
unsigned long vaddr,
int page_in)
{
int ret;
int ret = 0;
struct vm_range *range;
XPMEM_DEBUG("call: tgid=%d, vaddr=0x%lx", tg->tgid, vaddr);
retry:
ihk_rwspinlock_read_lock_noirq(&src_vm->memory_range_lock);
if (is_remote_vm(src_vm)) {
ihk_rwspinlock_read_lock_noirq(&src_vm->memory_range_lock);
}
range = lookup_process_memory_range(src_vm, vaddr, vaddr + 1);
ihk_rwspinlock_read_unlock_noirq(&src_vm->memory_range_lock);
if (!range || range->start > vaddr) {
if (is_remote_vm(src_vm)) {
ihk_rwspinlock_read_unlock_noirq(&src_vm->memory_range_lock);
}
/*
* Grow the stack if address falls into stack region
* so that we can lookup range successfully.
@@ -2085,21 +2176,31 @@ retry:
}
if (xpmem_is_private_data(range)) {
return -ENOENT;
ret = -ENOENT;
goto out;
}
ret = page_fault_process_vm(src_vm, (void *)vaddr,
PF_POPULATE | PF_WRITE | PF_USER);
if (!ret) {
/* Page-in remote area */
if (page_in) {
/* skip read lock for the case src_vm is local
* because write lock is taken in do_mmap.
*/
ret = page_fault_process_memory_range(src_vm, range,
vaddr,
PF_POPULATE | PF_WRITE |
PF_USER);
if (ret) {
goto out;
}
ihk_atomic_inc(&tg->n_pinned);
}
else {
return -ENOENT;
out:
if (is_remote_vm(src_vm)) {
ihk_rwspinlock_read_unlock_noirq(&src_vm->memory_range_lock);
}
XPMEM_DEBUG("return: ret=%d", ret);
return ret;
return ret;
}
@@ -2109,30 +2210,27 @@ static void xpmem_unpin_pages(
unsigned long vaddr,
size_t size)
{
int n_pgs = (((offset_in_page(vaddr) + (size)) + (PAGE_SIZE - 1)) >>
PAGE_SHIFT);
int n_pgs_unpinned = 0;
size_t vsize = 0;
unsigned long end = vaddr + size;
pte_t *pte = NULL;
XPMEM_DEBUG("call: segid=0x%lx, vaddr=0x%lx, size=0x%lx",
seg->segid, vaddr, size);
XPMEM_DEBUG("n_pgs=%d", n_pgs);
vaddr &= PAGE_MASK;
while (n_pgs > 0) {
/* attachment can't be straight-mapped because it's mapped
* with MAP_SHARED
*/
while (vaddr < end) {
pte = xpmem_vaddr_to_pte(vm, vaddr, &vsize);
if (pte && !pte_is_null(pte)) {
n_pgs_unpinned++;
vaddr += PAGE_SIZE;
n_pgs--;
vaddr += vsize;
}
else {
vsize = ((vaddr + vsize) & (~(vsize - 1)));
n_pgs -= (vsize - vaddr) / PAGE_SIZE;
vaddr = vsize;
vaddr = ((vaddr + vsize) & (~(vsize - 1)));
}
}
@@ -2196,8 +2294,8 @@ static void xpmem_tg_deref(
{
DBUG_ON(ihk_atomic_read(&tg->refcnt) <= 0);
if (ihk_atomic_dec_return(&tg->refcnt) != 0) {
XPMEM_DEBUG("return: tg->refcnt=%d, tg->n_pinned=%d",
tg->refcnt, tg->n_pinned);
/*XPMEM_DEBUG("return: tg->refcnt=%d, tg->n_pinned=%d",
tg->refcnt, tg->n_pinned);*/
return;
}
@@ -2236,7 +2334,7 @@ static void xpmem_seg_deref(struct xpmem_segment *seg)
{
DBUG_ON(ihk_atomic_read(&seg->refcnt) <= 0);
if (ihk_atomic_dec_return(&seg->refcnt) != 0) {
XPMEM_DEBUG("return: seg->refcnt=%d", seg->refcnt);
//XPMEM_DEBUG("return: seg->refcnt=%d", seg->refcnt);
return;
}
@@ -2282,7 +2380,7 @@ static void xpmem_ap_deref(struct xpmem_access_permit *ap)
{
DBUG_ON(ihk_atomic_read(&ap->refcnt) <= 0);
if (ihk_atomic_dec_return(&ap->refcnt) != 0) {
XPMEM_DEBUG("return: ap->refcnt=%d", ap->refcnt);
//XPMEM_DEBUG("return: ap->refcnt=%d", ap->refcnt);
return;
}
@@ -2297,7 +2395,7 @@ static void xpmem_att_deref(struct xpmem_attachment *att)
{
DBUG_ON(ihk_atomic_read(&att->refcnt) <= 0);
if (ihk_atomic_dec_return(&att->refcnt) != 0) {
XPMEM_DEBUG("return: att->refcnt=%d", att->refcnt);
//XPMEM_DEBUG("return: att->refcnt=%d", att->refcnt);
return;
}