Compare commits


50 Commits
1.1.1 ... 1.1.2

Author SHA1 Message Date
Yoichi Umezawa
8d21846562 mcoverlayfs: supported Linux kernel 4.0 or rhel kernel 3.10.0-327
add mcoverlayfs(linux-3.10.0-327.36.1.el7 base)
2016-09-30 14:55:36 +09:00
Yoichi Umezawa
3e1367caa1 mcoverlayfs: move mcoverlayfs(linux-4.0.9 base) to executer/kernel/mcoverlayfs/linux-4.0.9 2016-09-30 13:48:55 +09:00
Ken Sato
02536b7724 Merge remote-tracking branch 'remotes/origin/ikc2'
Conflicts:
	executer/kernel/mcctrl/syscall.c
It is resolved.
2016-09-27 11:48:12 +09:00
Tomoki Shirasawa
e28725884f fix debug print 2016-09-19 17:29:41 +09:00
Masamichi Takagi
c2b3fb7236 Modify interrupt load balancing policy on reboot/stop
* Fix the timing of stopping irqbalance when booting McKernel
2016-09-16 19:07:07 +09:00
Masamichi Takagi
2f95f7cda8 Modify interrupt load balancing policy on reboot/stop
When rebooting:
1. Stop irqbalance
2. Modify /proc/irq/*/smp_affinity so that McKernel cores are not
   included
3. Start irqbalance with McKernel cores and IHK IRQ banned from
   load balancing

When stopping:
1. Stop irqbalance
2. Restore /proc/irq/*/smp_affinity
3. Restart irqbalance with the system default settings

refs #760
2016-09-16 13:04:24 +09:00
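The steps in the commit message above amount to saving and rewriting the smp_affinity masks around an irqbalance restart. A minimal shell sketch of the reboot-time sequence follows; the mask, IRQ number, and backup path are illustrative placeholders, not values from the repository — the actual logic is in the mcreboot-smp-x86.sh and mcstop+release-smp-x86.sh diffs further down this page.

#!/bin/sh
# Sketch only: MCK_MASK, IHK_IRQ and BACKUP are made-up example values.
MCK_MASK=f0                    # hex bitmask of McKernel cores (e.g. CPUs 4-7)
IHK_IRQ=72                     # IRQ registered by IHK-SMP (example value)
BACKUP=/tmp/smp_affinity.bak

# 1. Stop irqbalance
systemctl stop irqbalance.service

# 2. Save each /proc/irq/*/smp_affinity, then drop the McKernel bits
mkdir -p "$BACKUP"
for f in /proc/irq/*/smp_affinity; do
	cp "$f" "$BACKUP/irq$(basename "$(dirname "$f")")"
	# (the real script computes and writes the inverted mask per IRQ)
done

# 3. Restart irqbalance with McKernel cores and the IHK IRQ banned
IRQBALANCE_BANNED_CPUS=$MCK_MASK /usr/sbin/irqbalance --banirq=$IHK_IRQ

On stop, the saved files are copied back and irqbalance is restarted with its default settings, which is what mcstop+release-smp-x86.sh does below.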
Tomoki Shirasawa
e551aa17ed execve: do not search command PATH 2016-09-14 22:22:18 +09:00
Tomoki Shirasawa
e6d4c160cd mcexec: fix how to look for command
refs #754
2016-09-13 15:56:58 +09:00
Tomoki Shirasawa
9390fe5d2c signal: send signal to thread using thread-id. not cpu-id 2016-09-12 15:43:29 +09:00
Tomoki Shirasawa
419f5e495b set*[ug]id: propagate credentials to thread pool 2016-09-12 15:40:33 +09:00
Tomoki Shirasawa
673deadf37 fix syscall return type 2016-09-12 15:40:06 +09:00
Tomoki Shirasawa
20ea65b38c fix some vDSO bugs.
- vDSO sometimes becomes invalid.
- vDSO is not succeeded for child process.
- vDSO becomes invalid when execve.
refs #744
2016-09-04 23:13:00 +09:00
Balazs Gerofi
84665ff699 do_page_fault_process_vm(): fix error msg format that could cause another PF 2016-09-04 10:59:50 +09:00
Balazs Gerofi
bfbc94dfb0 mcctrl+mcexec: fix per-proc data allocation for fork() 2016-09-02 15:08:00 +09:00
Balazs Gerofi
d550bced78 kmalloc(): use macros to define size alignment 2016-08-19 12:51:28 +09:00
Balazs Gerofi
a7ee3f531b sched_setaffinity(): error handling for invalid input 2016-08-19 11:52:44 +09:00
Balazs Gerofi
b9439947a7 kmalloc(): re-implementation of memory leak tracking 2016-08-19 11:52:00 +09:00
Balazs Gerofi
3b60a95f13 kmalloc()/kfree() re-implementation 2016-08-18 21:51:36 +09:00
Balazs Gerofi
82ae6d7458 query_free_mem_interrupt_handler(): report number of free pages as kmsg 2016-08-18 14:52:05 +09:00
Balazs Gerofi
7ebc34ddcc do_fork(): fix tids memory leak; additional sanity checks 2016-08-18 14:31:52 +09:00
Balazs Gerofi
bd6a2c2311 sys_mmap(): correct initial address check 2016-08-18 07:32:31 +09:00
Balazs Gerofi
5fd68eae54 PF handler: fix up various error msgs 2016-08-18 07:31:25 +09:00
Balazs Gerofi
f5857cfc9e MM: use ihk_mc_{alloc/free}_pages() everywhere and fix free_pages() on kmalloc()ed object bug 2016-08-17 18:02:05 +09:00
Balazs Gerofi
01d2ea1605 do_munmap(): do TLB flush per address in remote_tlb_flush_cpu_mask() 2016-08-17 15:08:30 +09:00
Balazs Gerofi
9efd568e07 do_mmap(): simplify demand paging flags; avoid zeroobj and allocate pages directly 2016-08-17 14:00:05 +09:00
Balazs Gerofi
1a207e19c2 clean up a couple of debug messages 2016-08-17 13:55:36 +09:00
Balazs Gerofi
73cf93727b clone(): use CAS for TID allocation 2016-08-16 14:18:58 +09:00
Balazs Gerofi
4410e702d9 devobj: fix memory leak for device file mapping 2016-08-16 14:17:59 +09:00
Balazs Gerofi
f584e2ec25 increase kernel stack size and eliminate unused waitq declaration in do_syscall() 2016-08-16 09:20:55 +09:00
Balazs Gerofi
3aa06444f4 do_syscall(): allow descheduling threads in offloaded syscalls if CPU core oversubscribed 2016-08-16 08:58:22 +09:00
Balazs Gerofi
c897a56c34 __notify_syscall_requester(): use CAS or IKC to notify syscall completion 2016-08-16 08:56:05 +09:00
Balazs Gerofi
5e9957da0f syscall_response: introduction of req_thread_status field 2016-08-16 08:53:41 +09:00
Balazs Gerofi
6ff2d4abe7 mcctrl: store per-process data in hash table 2016-08-15 13:47:57 +09:00
Balazs Gerofi
e4239f1885 mcexec: use 16 threads initially in offload handler pool 2016-08-14 14:29:10 +09:00
Balazs Gerofi
fbbaaf5b54 mcctrl: use GFP_ATOMIC in atomic context 2016-08-14 14:28:21 +09:00
Balazs Gerofi
3fa3920bb3 fix a couple of debug msgs 2016-08-14 11:30:17 +09:00
Balazs Gerofi
45e51fcc07 mcctrl: fix padding for 128bytes SCD message 2016-08-14 11:29:02 +09:00
Balazs Gerofi
0884e3d543 IHK-IKC: map queue in McKernel as cacheable 2016-08-14 11:16:40 +09:00
Balazs Gerofi
e3c7c9b890 mcctrl: separate waiting threads and pending requests 2016-08-12 21:52:13 +09:00
Balazs Gerofi
f4155cc9e8 mcstop+release-smp-x86.sh: fix OS instance discovery bug 2016-08-12 12:27:04 +09:00
Balazs Gerofi
a01ae91051 mcctrl: use IKC packet pools 2016-08-12 12:26:14 +09:00
Balazs Gerofi
daca522d25 mcctrl: move kmalloc/kfree of wait queue head out of fast path 2016-08-12 10:18:58 +09:00
Balazs Gerofi
ec521feb15 do_syscall(): remove invalid reference 2016-08-09 17:16:47 +09:00
Balazs Gerofi
d7bc947a02 mcctrl: redesign mcctrl_channels for IKC packet based syscall offloading 2016-08-09 16:49:42 +09:00
Balazs Gerofi
fb84d4ef11 mcctrl: thread pool based system call offload handling 2016-08-08 19:43:05 +09:00
Balazs Gerofi
5fbeee953a mcctrl: clean up syscall offload wait code 2016-08-07 20:55:36 +09:00
Balazs Gerofi
4cefb4333f mcctrl: use atomic malloc in IRQ context 2016-08-06 08:54:55 +09:00
Balazs Gerofi
689da07ac6 ihk_mc_ikc_init_first_local(): hold ref to master channel 2016-08-06 08:52:14 +09:00
Balazs Gerofi
76981bcc18 mcctrl: move procfs TID processing into dedicated work queue 2016-08-04 15:22:40 +09:00
Balazs Gerofi
6aae35cb3d process: transfer TIDs in bulk and reuse them locally 2016-08-02 16:59:04 +09:00
51 changed files with 5790 additions and 1324 deletions

View File

@@ -1,5 +1,6 @@
 TARGET = @TARGET@
 SBINDIR = @SBINDIR@
+ETCDIR = @ETCDIR@
 MANDIR = @MANDIR@

 all::
@@ -48,6 +49,9 @@ install::
		mkdir -p -m 755 $(SBINDIR); \
		install -m 755 arch/x86/tools/mcreboot-smp-x86.sh $(SBINDIR)/mcreboot.sh; \
		install -m 755 arch/x86/tools/mcstop+release-smp-x86.sh $(SBINDIR)/mcstop+release.sh; \
+		mkdir -p -m 755 $(ETCDIR); \
+		install -m 644 arch/x86/tools/irqbalance_mck.service $(ETCDIR)/irqbalance_mck.service; \
+		install -m 644 arch/x86/tools/irqbalance_mck.in $(ETCDIR)/irqbalance_mck.in; \
		mkdir -p -m 755 $(MANDIR)/man1; \
		install -m 644 arch/x86/tools/mcreboot.1 $(MANDIR)/man1/mcreboot.1; \
		;; \

View File

@@ -1054,9 +1054,8 @@ unhandled_page_fault(struct thread *thread, void *fault_addr, void *regs)
	unsigned long error = ((struct x86_user_context *)regs)->gpr.error;

	irqflags = kprintf_lock();
-	dkprintf("[%d] Page fault for 0x%lX\n",
-		ihk_mc_get_processor_id(), address);
-	dkprintf("%s for %s access in %s mode (reserved bit %s set), "
+	__kprintf("Page fault for 0x%lx\n", address);
+	__kprintf("%s for %s access in %s mode (reserved bit %s set), "
		"it %s an instruction fetch\n",
		(error & PF_PROT ? "protection fault" : "no page found"),
		(error & PF_WRITE ? "write" : "read"),
@@ -1068,14 +1067,14 @@ unhandled_page_fault(struct thread *thread, void *fault_addr, void *regs)
	list_for_each_entry(range, &vm->vm_range_list, list) {
		if (range->start <= address && range->end > address) {
			found = 1;
-			dkprintf("address is in range, flag: 0x%X! \n",
+			__kprintf("address is in range, flag: 0x%lx\n",
				range->flag);
			ihk_mc_pt_print_pte(vm->address_space->page_table, (void*)address);
			break;
		}
	}
	if (!found) {
-		dkprintf("address is out of range! \n");
+		__kprintf("address is out of range! \n");
	}

	kprintf_unlock(irqflags);

View File

@@ -318,5 +318,5 @@ extern unsigned long ap_trampoline;
 #define AP_TRAMPOLINE_SIZE	0x2000

 /* Local is cachable */
-#define IHK_IKC_QUEUE_PT_ATTR (PTATTR_NO_EXECUTE | PTATTR_WRITABLE | PTATTR_UNCACHABLE)
+#define IHK_IKC_QUEUE_PT_ATTR (PTATTR_NO_EXECUTE | PTATTR_WRITABLE)

 #endif

View File

@@ -23,6 +23,7 @@
 #include <process.h>
 #include <page.h>
 #include <cls.h>
+#include <kmalloc.h>

 #define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
 #define ekprintf(...) kprintf(__VA_ARGS__)
@@ -84,20 +85,22 @@ void ihk_mc_free_pages(void *p, int npages)
	pa_ops->free_page(p, npages);
 }

-void *ihk_mc_allocate(int size, enum ihk_mc_ap_flag flag)
+void *ihk_mc_allocate(int size, int flag)
 {
-	if (pa_ops && pa_ops->alloc)
-		return pa_ops->alloc(size, flag);
-	else
-		return ihk_mc_alloc_pages(1, flag);
+	if (!cpu_local_var(kmalloc_initialized)) {
+		kprintf("%s: error, kmalloc not yet initialized\n", __FUNCTION__);
+		return NULL;
+	}
+	return kmalloc(size, IHK_MC_AP_NOWAIT);
 }

 void ihk_mc_free(void *p)
 {
-	if (pa_ops && pa_ops->free)
-		return pa_ops->free(p);
-	else
-		return ihk_mc_free_pages(p, 1);
+	if (!cpu_local_var(kmalloc_initialized)) {
+		kprintf("%s: error, kmalloc not yet initialized\n", __FUNCTION__);
+		return;
+	}
+	kfree(p);
 }

 void *get_last_early_heap(void)
@@ -1111,6 +1114,7 @@ static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base,
	if (!(old & PFL1_FILEOFF) && args->free_physical) {
		if (page && page_unmap(page)) {
			ihk_mc_free_pages(phys_to_virt(phys), 1);
+			dkprintf("%s: freeing regular page at 0x%lx\n", __FUNCTION__, base);
		}
		args->vm->currss -= PTL1_SIZE;
	}
@@ -1159,6 +1163,7 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base,
	if (!(old & PFL2_FILEOFF) && args->free_physical) {
		if (page && page_unmap(page)) {
			ihk_mc_free_pages(phys_to_virt(phys), PTL2_SIZE/PTL1_SIZE);
+			dkprintf("%s: freeing large page at 0x%lx\n", __FUNCTION__, base);
		}
		args->vm->currss -= PTL2_SIZE;
	}
@@ -2273,6 +2278,9 @@ int read_process_vm(struct process_vm *vm, void *kdst, const void *usrc, size_t
	reason = PF_USER; /* page not present */

	for (addr = ustart & PAGE_MASK; addr < uend; addr += PAGE_SIZE) {
+		if (!addr)
+			return -EINVAL;
+
		error = page_fault_process_vm(vm, (void *)addr, reason);
		if (error) {
			kprintf("%s: error: PF for %p failed\n", __FUNCTION__, addr);

View File

@@ -38,7 +38,7 @@ int ihk_mc_ikc_init_first_local(struct ihk_ikc_channel_desc *channel,
	arch_master_channel_packet_handler = packet_handler;

	ihk_ikc_init_desc(channel, IKC_OS_HOST, 0, rq, wq,
-			ihk_ikc_master_channel_packet_handler);
+			ihk_ikc_master_channel_packet_handler, channel);
	ihk_ikc_enable_channel(channel);

	/* Set boot parameter */

View File

@@ -105,7 +105,7 @@ static int set_perfctr_x86_direct(int counter, int mode, unsigned int value)
	wrmsr(MSR_IA32_PERFEVTSEL0 + counter, value);

	//kprintf("wrmsr: %d <= %x\n", MSR_PERF_GLOBAL_CTRL, 0);
-	kprintf("wrmsr: %d <= %x\n", MSR_IA32_PERFEVTSEL0 + counter, value);
+	//kprintf("wrmsr: %d <= %x\n", MSR_IA32_PERFEVTSEL0 + counter, value);

	return 0;
 }

View File

@@ -293,7 +293,7 @@ SYSCALL_DECLARE(rt_sigreturn)
 extern struct cpu_local_var *clv;
 extern unsigned long do_kill(struct thread *thread, int pid, int tid, int sig, struct siginfo *info, int ptracecont);
-extern void interrupt_syscall(int all, int pid);
+extern void interrupt_syscall(int pid, int tid);
 extern int num_processors;

 #define RFLAGS_MASK (RFLAGS_CF | RFLAGS_PF | RFLAGS_AF | RFLAGS_ZF | \
@@ -1290,7 +1290,7 @@ done:
	cpu_restore_interrupt(irqstate);

	if (doint && !(mask & tthread->sigmask.__val[0])) {
-		int cpuid = tthread->cpu_id;
+		int tid = tthread->tid;
		int pid = tproc->pid;
		int status = tthread->status;
@@ -1301,7 +1301,7 @@ done:
		}

		if(!tthread->proc->nohost)
-			interrupt_syscall(pid, cpuid);
+			interrupt_syscall(pid, tid);

		if (status != PS_RUNNING) {
			if(sig == SIGKILL){
@@ -1437,9 +1437,8 @@ SYSCALL_DECLARE(mmap)
		goto out;
	}

-	if ((addr < region->user_start)
-			|| (region->user_end <= addr)
-			|| ((region->user_end - addr) < len)) {
+	if ((flags & MAP_FIXED) && ((addr < region->user_start)
+			|| (region->user_end <= addr))) {
		ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):ENOMEM\n",
				addr0, len0, prot, flags0, fd, off0);
		error = -ENOMEM;
@@ -1563,6 +1562,7 @@ static int vdso_get_vdso_info(void)
	struct ihk_ikc_channel_desc *ch = cpu_local_var(syscall_channel);

	dkprintf("vdso_get_vdso_info()\n");
+	memset(&vdso, '\0', sizeof vdso);
	vdso.busy = 1;
	vdso.vdso_npages = 0;

View File

@@ -0,0 +1,28 @@
+# irqbalance is a daemon process that distributes interrupts across
+# CPUS on SMP systems. The default is to rebalance once every 10
+# seconds. This is the environment file that is specified to systemd via the
+# EnvironmentFile key in the service unit file (or via whatever method the init
+# system you're using has.
+#
+# ONESHOT=yes
+# after starting, wait for a minute, then look at the interrupt
+# load and balance it once; after balancing exit and do not change
+# it again.
+#IRQBALANCE_ONESHOT=
+#
+# IRQBALANCE_BANNED_CPUS
+# 64 bit bitmask which allows you to indicate which cpu's should
+# be skipped when reblancing irqs. Cpu numbers which have their
+# corresponding bits set to one in this mask will not have any
+# irq's assigned to them on rebalance
+#
+IRQBALANCE_BANNED_CPUS=%mask%
+#
+# IRQBALANCE_ARGS
+# append any args here to the irqbalance daemon as documented in the man page
+#
+IRQBALANCE_ARGS=--banirq=%banirq%

View File

@@ -0,0 +1,10 @@
+[Unit]
+Description=irqbalance daemon
+After=syslog.target
+
+[Service]
+EnvironmentFile=@ETCDIR@/irqbalance_mck
+ExecStart=/usr/sbin/irqbalance --foreground $IRQBALANCE_ARGS
+
+[Install]
+WantedBy=multi-user.target

View File

@@ -15,6 +15,7 @@
 prefix="@prefix@"
 BINDIR="${prefix}/bin"
 SBINDIR="${prefix}/sbin"
+ETCDIR=@ETCDIR@
 KMODDIR="${prefix}/kmod"
 KERNDIR="${prefix}/@TARGET@/kernel"

 ENABLE_MCOVERLAYFS="@ENABLE_MCOVERLAYFS@"
@@ -27,6 +28,12 @@ LOGMODE=0
 facility="LOG_LOCAL6"
 chown_option=`logname 2> /dev/null`

+if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" -o "`systemctl status irqbalance.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
+	irqbalance_used="yes"
+else
+	irqbalance_used="no"
+fi
+
 while getopts :i:k:c:m:o:f: OPT
 do
	case ${OPT} in
@@ -78,10 +85,17 @@ patch=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/'`
 linux_version_code=`expr \( ${major} \* 65536 \) + \( ${minor} \* 256 \) + ${patch}`
 rhel_release=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/'`
 if [ "${release}" == "${rhel_release}" ]; then rhel_release=""; fi
+enable_mcoverlay="no"
 if [ "${ENABLE_MCOVERLAYFS}" == "yes" ]; then
-	enable_mcoverlay=`if ( [ ${linux_version_code} -ge 262144 ] && [ ${linux_version_code} -lt 262400 ] ); then echo "yes"; else echo "no"; fi`
-else
-	enable_mcoverlay=no
+	if [ "${rhel_release}" == "" ]; then
+		if [ ${linux_version_code} -ge 262144 -a ${linux_version_code} -lt 262400 ]; then
+			enable_mcoverlay="yes"
+		fi
+	else
+		if [ ${linux_version_code} -eq 199168 -a ${rhel_release} -ge 327 ]; then
+			enable_mcoverlay="yes"
+		fi
+	fi
 fi

 if [ "$cpus" == "" ]; then
@@ -106,6 +120,12 @@ if [ "$enable_mcoverlay" == "yes" ]; then
	fi
 fi

+# Stop irqbalance
+if [ "${irqbalance_used}" == "yes" ]; then
+	systemctl stop irqbalance_mck.service 2>/dev/null
+	if ! systemctl stop irqbalance.service 2>/dev/null ; then echo "error: stopping irqbalance" >&2; exit 1; fi;
+fi
+
 # Load IHK if not loaded
 if [ "`lsmod | grep ihk`" == "" ]; then
	if ! insmod ${KMODDIR}/ihk.ko; then echo "error: loading ihk" >&2; exit 1; fi;
@@ -210,3 +230,21 @@ then
	pkill mcklogd
	SBINDIR=${SBINDIR} ${SBINDIR}/mcklogd -i ${INTERVAL} -f ${facility}
 fi
+
+# Start irqbalance with CPUs and IRQ for McKernel banned
+if [ "${irqbalance_used}" == "yes" ]; then
+	if ! etcdir=@ETCDIR@ perl -e 'use File::Copy qw(copy); $etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "/proc/irq/*/smp_affinity"; foreach $file (@files) { $rel = substr($file, 1); $dir=substr($rel, 0, length($rel)-length("/smp_affinity")); if(0) { print "cp $file $etcdir/$rel\n";} if(system("mkdir -p $etcdir/$dir")){ exit 1;} if(!copy($file,"$etcdir/$rel")){ exit 1;} }' ; then echo "error: saving /proc/irq/*/smp_affinity" >&2; exit 1; fi;
+	ncpus=`lscpu | grep -E '^CPU\(s\):' | awk '{print $2}'`
+	smp_affinity_mask=`echo $cpus | ncpus=$ncpus perl -e 'while(<>){@tokens = split /,/;foreach $token (@tokens) {@nums = split /-/,$token; for($num = $nums[0]; $num <= $nums[$#nums]; $num++) {$ndx=int($num/32); $mask[$ndx] |= (1<<($num % 32))}}} $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if($j != $nint32s - 1){print ",";} $nblks = $j == $nint32s - 1 ? int(($ENV{'ncpus'} % 32)/4) : 8; for($i = $nblks - 1;$i >= 0;$i--){ printf("%01x",($mask[$j] >> ($i*4)) & 0xf);}}'`
+	if ! ncpus=$ncpus smp_affinity_mask=$smp_affinity_mask perl -e '@dirs = grep { -d } glob "/proc/irq/*"; foreach $dir (@dirs) { $hit = 0; $affinity_str = `cat $dir/smp_affinity`; chomp $affinity_str; @int32strs = split /,/, $affinity_str; @int32strs_mask=split /,/, $ENV{'smp_affinity_mask'}; for($i=0;$i <= $#int32strs_mask; $i++) { $int32strs_inv[$i] = sprintf("%08x",hex($int32strs_mask[$i])^0xffffffff); if($i == 0) { $len = int((($ENV{'ncpus'}%32)+3)/4); $int32strs_inv[$i] = substr($int32strs_inv[$i], -$len, $len); } } $inv = join(",", @int32strs_inv); $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if(hex($int32strs[$nint32s - 1 - $j]) & hex($int32strs_mask[$nint32s - 1 - $j])) { $hit = 1; }} if($hit == 1) { $cmd = "echo $inv > $dir/smp_affinity 2>/dev/null"; system $cmd;}}'; then echo "error: modifying /proc/irq/*/smp_affinity" >&2; exit 1; fi;
+	banirq=`cat /proc/interrupts| perl -e 'while(<>) { if(/^\s*(\d+).*IHK\-SMP\s*$/) {print $1;}}'`
+	sed "s/%mask%/$smp_affinity_mask/g" $ETCDIR/irqbalance_mck.in | sed "s/%banirq%/$banirq/g" > $ETCDIR/irqbalance_mck
+	if ! systemctl link $ETCDIR/irqbalance_mck.service >/dev/null 2>/dev/null; then echo "error: linking irqbalance_mck" >&2; exit 1; fi;
+	if ! systemctl start irqbalance_mck.service 2>/dev/null ; then echo "error: starting irqbalance_mck" >&2; exit 1; fi;
+#	echo cpus=$cpus mask=$smp_affinity_mask banirq=$banirq
+fi

View File

@@ -10,6 +10,7 @@
 prefix="@prefix@"
 BINDIR="@BINDIR@"
 SBINDIR="@SBINDIR@"
+ETCDIR=@ETCDIR@
 KMODDIR="@KMODDIR@"
 KERNDIR="@KERNDIR@"
@@ -20,10 +21,12 @@ cpus=""
 if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then exit 0; fi

 # Destroy all LWK instances
+if ls /dev/mcos* 1>/dev/null 2>&1; then
 for i in /dev/mcos*; do
	ind=`echo $i|cut -c10-`;
	if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then echo "error: destroying LWK instance $ind failed" >&2; exit 1; fi
 done
+fi

 # Query IHK-SMP resources and release them
 if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
@@ -51,3 +54,11 @@ fi
 # Stop mcklogd
 pkill mcklogd
+
+# Start irqbalance with the original settings
+if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
+	if ! systemctl stop irqbalance_mck.service 2>/dev/null ; then echo "error: stopping irqbalance_mck" >&2; exit 1; fi;
+	if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then echo "error: disabling irqbalance_mck" >&2; exit 1; fi;
+	if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }' ; then echo "error: restoring /proc/irq/*/smp_affinity" >&2; exit 1; fi;
+	if ! systemctl start irqbalance.service; then echo "error: starting irqbalance" >&2; exit 1; fi;
+fi

configure (vendored)
View File

@@ -632,6 +632,7 @@ ENABLE_MCOVERLAYFS
 MANDIR
 KERNDIR
 KMODDIR
+ETCDIR
 SBINDIR
 BINDIR
 TARGET
@@ -3031,6 +3032,9 @@ case $WITH_TARGET in
		if test "X$SBINDIR" = X; then
			SBINDIR="$prefix/sbin"
		fi
+		if test "X$ETCDIR" = X; then
+			ETCDIR="$prefix/etc"
+		fi
		if test "X$KMODDIR" = X; then
			KMODDIR="$prefix/kmod"
		fi
@@ -3882,11 +3886,12 @@ fi
 ac_config_headers="$ac_config_headers executer/config.h"

-ac_config_files="$ac_config_files Makefile executer/user/Makefile executer/kernel/mcctrl/Makefile executer/kernel/mcctrl/arch/x86_64/Makefile executer/kernel/mcoverlayfs/Makefile kernel/Makefile kernel/Makefile.build arch/x86/tools/mcreboot-attached-mic.sh arch/x86/tools/mcshutdown-attached-mic.sh arch/x86/tools/mcreboot-builtin-x86.sh arch/x86/tools/mcreboot-smp-x86.sh arch/x86/tools/mcstop+release-smp-x86.sh arch/x86/tools/mcshutdown-builtin-x86.sh arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in"
+ac_config_files="$ac_config_files Makefile executer/user/Makefile executer/kernel/mcctrl/Makefile executer/kernel/mcctrl/arch/x86_64/Makefile executer/kernel/mcoverlayfs/Makefile executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile executer/kernel/mcoverlayfs/linux-4.0.9/Makefile kernel/Makefile kernel/Makefile.build arch/x86/tools/mcreboot-attached-mic.sh arch/x86/tools/mcshutdown-attached-mic.sh arch/x86/tools/mcreboot-builtin-x86.sh arch/x86/tools/mcreboot-smp-x86.sh arch/x86/tools/mcstop+release-smp-x86.sh arch/x86/tools/mcshutdown-builtin-x86.sh arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in arch/x86/tools/irqbalance_mck.service arch/x86/tools/irqbalance_mck.in"

 if test "x$enable_dcfa" = xyes; then :
@@ -4590,6 +4595,8 @@ do
	"executer/kernel/mcctrl/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcctrl/Makefile" ;;
	"executer/kernel/mcctrl/arch/x86_64/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcctrl/arch/x86_64/Makefile" ;;
	"executer/kernel/mcoverlayfs/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcoverlayfs/Makefile" ;;
+	"executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile" ;;
+	"executer/kernel/mcoverlayfs/linux-4.0.9/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcoverlayfs/linux-4.0.9/Makefile" ;;
	"kernel/Makefile") CONFIG_FILES="$CONFIG_FILES kernel/Makefile" ;;
	"kernel/Makefile.build") CONFIG_FILES="$CONFIG_FILES kernel/Makefile.build" ;;
	"arch/x86/tools/mcreboot-attached-mic.sh") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcreboot-attached-mic.sh" ;;
@@ -4599,6 +4606,8 @@ do
	"arch/x86/tools/mcstop+release-smp-x86.sh") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcstop+release-smp-x86.sh" ;;
	"arch/x86/tools/mcshutdown-builtin-x86.sh") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcshutdown-builtin-x86.sh" ;;
	"arch/x86/tools/mcreboot.1") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in" ;;
+	"arch/x86/tools/irqbalance_mck.service") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/irqbalance_mck.service" ;;
+	"arch/x86/tools/irqbalance_mck.in") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/irqbalance_mck.in" ;;
	"kernel/Makefile.dcfa") CONFIG_FILES="$CONFIG_FILES kernel/Makefile.dcfa" ;;

	*) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;;

View File

@@ -146,6 +146,9 @@ case $WITH_TARGET in
		if test "X$SBINDIR" = X; then
			SBINDIR="$prefix/sbin"
		fi
+		if test "X$ETCDIR" = X; then
+			ETCDIR="$prefix/etc"
+		fi
		if test "X$KMODDIR" = X; then
			KMODDIR="$prefix/kmod"
		fi
@@ -278,6 +281,7 @@ AC_SUBST(KDIR)
 AC_SUBST(TARGET)
 AC_SUBST(BINDIR)
 AC_SUBST(SBINDIR)
+AC_SUBST(ETCDIR)
 AC_SUBST(KMODDIR)
 AC_SUBST(KERNDIR)
 AC_SUBST(MANDIR)
@@ -298,6 +302,8 @@ AC_CONFIG_FILES([
	executer/kernel/mcctrl/Makefile
	executer/kernel/mcctrl/arch/x86_64/Makefile
	executer/kernel/mcoverlayfs/Makefile
+	executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile
+	executer/kernel/mcoverlayfs/linux-4.0.9/Makefile
	kernel/Makefile
	kernel/Makefile.build
	arch/x86/tools/mcreboot-attached-mic.sh
@@ -307,6 +313,8 @@ AC_CONFIG_FILES([
	arch/x86/tools/mcstop+release-smp-x86.sh
	arch/x86/tools/mcshutdown-builtin-x86.sh
	arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in
+	arch/x86/tools/irqbalance_mck.service
+	arch/x86/tools/irqbalance_mck.in
 ])

 AS_IF([test "x$enable_dcfa" = xyes], [
AS_IF([test "x$enable_dcfa" = xyes], [ AS_IF([test "x$enable_dcfa" = xyes], [

View File

@@ -110,6 +110,13 @@ struct program_load_desc {
 };

 struct syscall_request {
+	/* TID of requesting thread */
+	int rtid;
+	/*
+	 * TID of target thread. Remote page fault response needs to designate the
+	 * thread that must serve the request, 0 indicates any thread from the pool
+	 */
+	int ttid;
	unsigned long valid;
	unsigned long number;
	unsigned long args[6];
@@ -128,8 +135,17 @@ struct syscall_load_desc {
	unsigned long size;
 };

+#define IHK_SCD_REQ_THREAD_SPINNING 0
+#define IHK_SCD_REQ_THREAD_TO_BE_WOKEN 1
+#define IHK_SCD_REQ_THREAD_DESCHEDULED 2
+
 struct syscall_response {
+	/* TID of the thread that requested the service */
+	int ttid;
+	/* TID of the mcexec thread that is serving or has served the request */
+	int stid;
	unsigned long status;
+	unsigned long req_thread_status;
	long ret;
	unsigned long fault_address;
	unsigned long fault_reason;

View File

@@ -100,8 +100,6 @@ void get_vdso_info(ihk_os_t os, long vdso_rpa)
	vdso_pa = ihk_device_map_memory(dev, vdso_rpa, sizeof(*vdso));
	vdso = ihk_device_map_virtual(dev, vdso_pa, sizeof(*vdso), NULL, 0);

-	memset(vdso, 0, sizeof(*vdso));
-
	/* VDSO pages */
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
	size = vdso_image->size;

View File

@@ -33,6 +33,7 @@
 #include <linux/file.h>
 #include <linux/version.h>
 #include <linux/semaphore.h>
+#include <linux/interrupt.h>
 #include <asm/uaccess.h>
 #include <asm/delay.h>
 #include <asm/io.h>
@@ -81,7 +82,6 @@ static long mcexec_prepare_image(ihk_os_t os,
	void *args, *envs;
	long ret = 0;
	struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
-	unsigned long flags;
	struct mcctrl_per_proc_data *ppd = NULL;

	if (copy_from_user(&desc, udesc,
@@ -124,52 +124,48 @@
	}
	pdesc->args = (void*)virt_to_phys(args);
-	printk("args: 0x%lX\n", (unsigned long)pdesc->args);
-	printk("argc: %ld\n", *(long *)args);
+	dprintk("args: 0x%lX\n", (unsigned long)pdesc->args);
+	dprintk("argc: %ld\n", *(long *)args);
	pdesc->envs = (void*)virt_to_phys(envs);
-	printk("envs: 0x%lX\n", (unsigned long)pdesc->envs);
-	printk("envc: %ld\n", *(long *)envs);
+	dprintk("envs: 0x%lX\n", (unsigned long)pdesc->envs);
+	dprintk("envc: %ld\n", *(long *)envs);

	isp.msg = SCD_MSG_PREPARE_PROCESS;
	isp.ref = pdesc->cpu;
	isp.arg = virt_to_phys(pdesc);

-	printk("# of sections: %d\n", pdesc->num_sections);
-	printk("%p (%lx)\n", pdesc, isp.arg);
+	dprintk("# of sections: %d\n", pdesc->num_sections);
+	dprintk("%p (%lx)\n", pdesc, isp.arg);

	pdesc->status = 0;
	mcctrl_ikc_send(os, pdesc->cpu, &isp);

-	wait_event_interruptible(usrdata->wq_prepare, pdesc->status);
+	while (wait_event_interruptible(usrdata->wq_prepare, pdesc->status) != 0);

	if(pdesc->err < 0){
		ret = pdesc->err;
		goto free_out;
	}

-	ppd = kmalloc(sizeof(*ppd), GFP_ATOMIC);
+	ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
	if (!ppd) {
-		printk("ERROR: allocating per process data\n");
-		ret = -ENOMEM;
+		printk("ERROR: no per process data for PID %d\n", task_tgid_vnr(current));
+		ret = -EINVAL;
		goto free_out;
	}

-	ppd->pid = pdesc->pid;
+	/* Update rpgtable */
	ppd->rpgtable = pdesc->rpgtable;
-
-	flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock);
-	list_add_tail(&ppd->list, &usrdata->per_proc_list);
-	ihk_ikc_spinlock_unlock(&usrdata->per_proc_list_lock, flags);
-
-	dprintk("pid %d, rpgtable: 0x%lx added\n",
-		ppd->pid, ppd->rpgtable);

	if (copy_to_user(udesc, pdesc, sizeof(struct program_load_desc) +
		sizeof(struct program_image_section) * desc.num_sections)) {
		ret = -EFAULT;
		goto free_out;
	}

+	dprintk("%s: pid %d, rpgtable: 0x%lx added\n",
+		__FUNCTION__, ppd->pid, ppd->rpgtable);
+
	ret = 0;

 free_out:
@@ -417,19 +413,200 @@ static long mcexec_get_cpu(ihk_os_t os)
	return info->n_cpus;
 }

-int mcexec_syscall(struct mcctrl_channel *c, int pid, unsigned long arg)
+int mcctrl_add_per_proc_data(struct mcctrl_usrdata *ud, int pid,
+	struct mcctrl_per_proc_data *ppd)
+{
+	struct mcctrl_per_proc_data *ppd_iter;
+	int hash = (pid & MCCTRL_PER_PROC_DATA_HASH_MASK);
+	int ret = 0;
+	unsigned long flags;
+
+	/* Check if data for this thread exists and add if not */
+	write_lock_irqsave(&ud->per_proc_data_hash_lock[hash], flags);
+	list_for_each_entry(ppd_iter, &ud->per_proc_data_hash[hash], hash) {
+		if (ppd_iter->pid == pid) {
+			ret = -EBUSY;
+			goto out;
+		}
+	}
+
+	list_add_tail(&ppd->hash, &ud->per_proc_data_hash[hash]);
+
+out:
+	write_unlock_irqrestore(&ud->per_proc_data_hash_lock[hash], flags);
+	return ret;
+}
+
+int mcctrl_delete_per_proc_data(struct mcctrl_usrdata *ud, int pid)
+{
+	struct mcctrl_per_proc_data *ppd_iter, *ppd = NULL;
+	int hash = (pid & MCCTRL_PER_PROC_DATA_HASH_MASK);
+	int ret = 0;
+	unsigned long flags;
+
+	write_lock_irqsave(&ud->per_proc_data_hash_lock[hash], flags);
+	list_for_each_entry(ppd_iter, &ud->per_proc_data_hash[hash], hash) {
+		if (ppd_iter->pid == pid) {
+			ppd = ppd_iter;
+			break;
+		}
+	}
+
+	if (!ppd) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	list_del(&ppd->hash);
+
+out:
+	write_unlock_irqrestore(&ud->per_proc_data_hash_lock[hash], flags);
+	return ret;
+}
+
+inline struct mcctrl_per_proc_data *mcctrl_get_per_proc_data(
+	struct mcctrl_usrdata *ud, int pid)
+{
+	struct mcctrl_per_proc_data *ppd_iter, *ppd = NULL;
+	int hash = (pid & MCCTRL_PER_PROC_DATA_HASH_MASK);
+	unsigned long flags;
+
+	/* Check if data for this process exists and return it */
+	read_lock_irqsave(&ud->per_proc_data_hash_lock[hash], flags);
+	list_for_each_entry(ppd_iter, &ud->per_proc_data_hash[hash], hash) {
+		if (ppd_iter->pid == pid) {
+			ppd = ppd_iter;
+			break;
+		}
+	}
+	read_unlock_irqrestore(&ud->per_proc_data_hash_lock[hash], flags);
+
+	return ppd;
+}
+
+/*
+ * Called indirectly from the IKC message handler.
+ */
+int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet)
 {
	struct wait_queue_head_list_node *wqhln = NULL;
	struct wait_queue_head_list_node *wqhln_iter;
+	struct wait_queue_head_list_node *wqhln_alloc = NULL;
+	int pid = packet->pid;
	unsigned long flags;
+	struct mcctrl_per_proc_data *ppd;
+
+	/* Look up per-process structure */
+	ppd = mcctrl_get_per_proc_data(ud, pid);
+	if (unlikely(!ppd)) {
+		kprintf("%s: ERROR: no per-process structure for PID %d??\n",
+			__FUNCTION__, task_tgid_vnr(current));
+		return 0;
+	}
+
+	dprintk("%s: (packet_handler) rtid: %d, ttid: %d, sys nr: %d\n",
+		__FUNCTION__,
+		packet->req.rtid,
+		packet->req.ttid,
+		packet->req.number);
+	/*
+	 * Three scenarios are possible:
+	 * - Find the designated thread if req->ttid is specified.
+	 * - Find any available thread if req->ttid is zero.
+	 * - Add a request element if no threads are available.
+	 */
+	flags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
+
+	/* Is this a request for a specific thread? See if it's waiting */
+	if (unlikely(packet->req.ttid)) {
+		list_for_each_entry(wqhln_iter, &ppd->wq_list_exact, list) {
+			if (packet->req.ttid != task_pid_vnr(wqhln_iter->task))
+				continue;

-	/* Look up per-process wait queue head with pid */
-	flags = ihk_ikc_spinlock_lock(&c->wq_list_lock);
-	list_for_each_entry(wqhln_iter, &c->wq_list, list) {
-		if (wqhln_iter->pid == pid) {
			wqhln = wqhln_iter;
			break;
		}
+
+		if (!wqhln) {
+			printk("%s: WARNING: no target thread found for exact request??\n",
+				__FUNCTION__);
+		}
+	}
+	/* Is there any thread available? */
+	else {
+		list_for_each_entry(wqhln_iter, &ppd->wq_list, list) {
+			if (wqhln_iter->task && !wqhln_iter->req) {
+				wqhln = wqhln_iter;
+				break;
+			}
+		}
+	}
+
+	/* If no match found, add request to pending request list */
+	if (unlikely(!wqhln)) {
+retry_alloc:
+		wqhln_alloc = kmalloc(sizeof(*wqhln), GFP_ATOMIC);
+		if (!wqhln_alloc) {
+			printk("WARNING: coudln't alloc wait queue head, retrying..\n");
+			goto retry_alloc;
+		}
+
+		wqhln = wqhln_alloc;
+		wqhln->req = 0;
+		wqhln->task = NULL;
+		init_waitqueue_head(&wqhln->wq_syscall);
+		list_add_tail(&wqhln->list, &ppd->wq_req_list);
+	}
+
+	wqhln->packet = packet;
+	wqhln->req = 1;
+	wake_up(&wqhln->wq_syscall);
+	ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, flags);
+
+	return 0;
+}
+
+/*
+ * Called from an mcexec thread via ioctl().
+ */
+int mcexec_wait_syscall(ihk_os_t os, struct syscall_wait_desc *__user req)
+{
+	struct ikc_scd_packet *packet;
+	struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
+	struct wait_queue_head_list_node *wqhln = NULL;
+	struct wait_queue_head_list_node *wqhln_iter;
+	int ret = 0;
+	unsigned long irqflags;
+	struct mcctrl_per_proc_data *ppd;
+
+	/* Look up per-process structure */
+	ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
+	if (unlikely(!ppd)) {
+		kprintf("%s: ERROR: no per-process structure for PID %d??\n",
+			__FUNCTION__, task_tgid_vnr(current));
+		return -EINVAL;
+	}
+
+	packet = (struct ikc_scd_packet *)mcctrl_get_per_thread_data(ppd, current);
+	if (packet) {
+		printk("%s: ERROR: packet %p is already registered for thread %d\n",
+			__FUNCTION__, packet, task_pid_vnr(current));
+		return -EBUSY;
+	}
+
+retry:
+	/* Prepare per-thread wait queue head or find a valid request */
+	irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
+	/* First see if there is a valid request already that is not yet taken */
+	list_for_each_entry(wqhln_iter, &ppd->wq_req_list, list) {
+		if (wqhln_iter->task == NULL && wqhln_iter->req) {
+			wqhln = wqhln_iter;
+			wqhln->task = current;
+			list_del(&wqhln->list);
+			break;
+		}
	}

	if (!wqhln) {
@@ -440,180 +617,86 @@
			goto retry_alloc;
		}

-		wqhln->pid = pid;
+		wqhln->task = current;
		wqhln->req = 0;
		init_waitqueue_head(&wqhln->wq_syscall);
-		list_add_tail(&wqhln->list, &c->wq_list);
+
+		/* Wait for a request.. */
+		list_add(&wqhln->list, &ppd->wq_list);
+		ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);
+
+		ret = wait_event_interruptible(wqhln->wq_syscall, wqhln->req);
+
+		/* Remove per-thread wait queue head */
+		irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
+		list_del(&wqhln->list);
	}
+	ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);

-	wqhln->req = 1;
-	wake_up(&wqhln->wq_syscall);
-	ihk_ikc_spinlock_unlock(&c->wq_list_lock, flags);
-	return 0;
-}
-
-#ifndef DO_USER_MODE
-// static int remaining_job, base_cpu, job_pos;
-#endif
-// extern int num_channels;
-// extern int mcctrl_dma_abort;
-
-int mcexec_wait_syscall(ihk_os_t os, struct syscall_wait_desc *__user req)
-{
-	struct syscall_wait_desc swd;
-	struct mcctrl_channel *c;
-	struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
-	struct wait_queue_head_list_node *wqhln;
-	struct wait_queue_head_list_node *wqhln_iter;
-	int ret = 0;
-	unsigned long irqflags;
-#ifndef DO_USER_MODE
-	unsigned long s, w, d;
-#endif
-
-	//printk("mcexec_wait_syscall swd=%p req=%p size=%d\n", &swd, req, sizeof(swd.cpu));
-	if (copy_from_user(&swd, req, sizeof(swd))) {
-		return -EFAULT;
-	}
-
-	if (swd.cpu >= usrdata->num_channels)
-		return -EINVAL;
-
-	c = get_peer_channel(usrdata, current);
-	if (c) {
-		printk("mcexec_wait_syscall:already registered. task %p ch %p\n",
-			current, c);
-		return -EBUSY;
-	}
-
-	c = usrdata->channels + swd.cpu;
-#ifdef DO_USER_MODE
-retry:
-	/* Prepare per-process wait queue head */
-retry_alloc:
-	wqhln = kmalloc(sizeof(*wqhln), GFP_KERNEL);
-	if (!wqhln) {
-		printk("WARNING: coudln't alloc wait queue head, retrying..\n");
-		goto retry_alloc;
-	}
-
-	wqhln->pid = swd.pid;
-	wqhln->req = 0;
-	init_waitqueue_head(&wqhln->wq_syscall);
-
-	irqflags = ihk_ikc_spinlock_lock(&c->wq_list_lock);
-	/* First see if there is one wait queue already */
-	list_for_each_entry(wqhln_iter, &c->wq_list, list) {
-		if (wqhln_iter->pid == task_tgid_vnr(current)) {
-			kfree(wqhln);
-			wqhln = wqhln_iter;
-			list_del(&wqhln->list);
-			break;
-		}
-	}
-
-	list_add_tail(&wqhln->list, &c->wq_list);
-	ihk_ikc_spinlock_unlock(&c->wq_list_lock, irqflags);
-
-	ret = wait_event_interruptible(wqhln->wq_syscall, wqhln->req);
-
-	/* Remove per-process wait queue head */
-	irqflags = ihk_ikc_spinlock_lock(&c->wq_list_lock);
-	list_del(&wqhln->list);
-	ihk_ikc_spinlock_unlock(&c->wq_list_lock, irqflags);

	if (ret && !wqhln->req) {
		kfree(wqhln);
+		wqhln = NULL;
		return -EINTR;
	}

+	packet = wqhln->packet;
	kfree(wqhln);
+	wqhln = NULL;

-	if (c->param.request_va->number == 61 &&
-		c->param.request_va->args[0] == swd.pid) {
-		dprintk("pid: %d, tid: %d: SC %d, swd.cpu: %d, WARNING: wait4() for self?\n",
-			task_tgid_vnr(current),
-			task_pid_vnr(current);
-			c->param.request_va->number,
-			swd.cpu);
-		return -EINTR;
-	}
-
-#if 1
+	dprintk("%s: tid: %d request from CPU %d\n",
+		__FUNCTION__, task_pid_vnr(current), packet->ref);
	mb();
-	if (!c->param.request_va->valid) {
-		printk("mcexec_wait_syscall:stray wakeup\n");
+	if (!packet->req.valid) {
+		printk("%s: ERROR: stray wakeup pid: %d, tid: %d: SC %lu\n",
+			__FUNCTION__,
+			task_tgid_vnr(current),
+			task_pid_vnr(current),
+			packet->req.number);
+		ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet,
+			(usrdata->channels + packet->ref)->c);
		goto retry;
	}
-#endif
-#else
-	while (1) {
-		c = usrdata->channels + swd.cpu;
-		ihk_get_tsc(s);
-
-		if (!usrdata->remaining_job) {
-			while (!(*c->param.doorbell_va)) {
-				mb();
-				cpu_relax();
-				ihk_get_tsc(w);
-				if (w > s + 1024UL * 1024 * 1024 * 10) {
-					return -EINTR;
-				}
-			}
-			d = (*c->param.doorbell_va) - 1;
-			*c->param.doorbell_va = 0;
-
-			if (d < 0 || d >= usrdata->num_channels) {
-				d = 0;
-			}
-			usrdata->base_cpu = d;
-			usrdata->job_pos = 0;
-			usrdata->remaining_job = 1;
-		} else {
-			usrdata->job_pos++;
-		}
-
-		for (; usrdata->job_pos < usrdata->num_channels; usrdata->job_pos++) {
-			if (base_cpu + job_pos >= num_channels) {
-				c = usrdata->channels +
-					(usrdata->base_cpu + usrdata->job_pos - usrdata->num_channels);
-			} else {
-				c = usrdata->channels + usrdata->base_cpu + usrdata->job_pos;
-			}
-
-			if (!c) {
-				continue;
-			}
-
-			if (c->param.request_va &&
-				c->param.request_va->valid) {
-#endif
-				c->param.request_va->valid = 0; /* ack */
-				dprintk("SC #%lx, %lx\n",
-					c->param.request_va->number,
-					c->param.request_va->args[0]);
-
-				register_peer_channel(usrdata, current, c);
-
-				if (__do_in_kernel_syscall(os, c, c->param.request_va)) {
-					if (copy_to_user(&req->sr, c->param.request_va,
-						sizeof(struct syscall_request))) {
-						deregister_peer_channel(usrdata, current, c);
-						return -EFAULT;
-					}
-					return 0;
-				}
-
-				deregister_peer_channel(usrdata, current, c);
-#ifdef DO_USER_MODE
-				goto retry;
-#endif
-#ifndef DO_USER_MODE
-				if (usrdata->mcctrl_dma_abort) {
-					return -2;
-				}
-			}
-		}
-		usrdata->remaining_job = 0;
-	}
-#endif
-	return 0;
+	packet->req.valid = 0; /* ack */
+	dprintk("%s: system call: %d, args[0]: %lu, args[1]: %lu, args[2]: %lu, "
+		"args[3]: %lu, args[4]: %lu, args[5]: %lu\n",
+		__FUNCTION__,
+		packet->req.number,
+		packet->req.args[0],
+		packet->req.args[1],
+		packet->req.args[2],
+		packet->req.args[3],
+		packet->req.args[4],
+		packet->req.args[5]);
+
+	if (mcctrl_add_per_thread_data(ppd, current, packet) < 0) {
+		kprintf("%s: error adding per-thread data\n", __FUNCTION__);
+		return -EINVAL;
+	}
+
+	if (__do_in_kernel_syscall(os, packet)) {
+		if (copy_to_user(&req->sr, &packet->req,
+			sizeof(struct syscall_request))) {
+			if (mcctrl_delete_per_thread_data(ppd, current) < 0) {
+				kprintf("%s: error deleting per-thread data\n", __FUNCTION__);
+				return -EINVAL;
+			}
+			return -EFAULT;
+		}
+
+		return 0;
+	}
+
+	ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet,
+		(usrdata->channels + packet->ref)->c);
+
+	if (mcctrl_delete_per_thread_data(ppd, current) < 0) {
+		kprintf("%s: error deleting per-thread data\n", __FUNCTION__);
+		return -EINVAL;
+	}
+	goto retry;
 }

 long mcexec_pin_region(ihk_os_t os, unsigned long *__user arg)
@@ -696,33 +779,6 @@ long mcexec_load_syscall(ihk_os_t os, struct syscall_load_desc *__user arg)
 #endif

	ihk_device_unmap_memory(ihk_os_to_dev(os), phys, desc.size);

-	/*
-	ihk_dma_channel_t channel;
-	struct ihk_dma_request request;
-	unsigned long dma_status = 0;
-
-	channel = ihk_device_get_dma_channel(ihk_os_to_dev(os), 0);
-	if (!channel) {
-		return -EINVAL;
-	}
-
-	memset(&request, 0, sizeof(request));
-	request.src_os = os;
-	request.src_phys = desc.src;
-	request.dest_os = NULL;
-	request.dest_phys = desc.dest;
-	request.size = desc.size;
-	request.notify = (void *)virt_to_phys(&dma_status);
-	request.priv = (void *)1;
-
-	ihk_dma_request(channel, &request);
-
-	while (!dma_status) {
-		mb();
-		udelay(1);
-	}
-	*/
-
	return 0;
 }
@@ -730,74 +786,60 @@ long mcexec_load_syscall(ihk_os_t os, struct syscall_load_desc *__user arg)
 long mcexec_ret_syscall(ihk_os_t os, struct syscall_ret_desc *__user arg)
 {
	struct syscall_ret_desc ret;
-	struct mcctrl_channel *mc;
+	struct ikc_scd_packet *packet;
	struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
-#if 0
-	ihk_dma_channel_t channel;
-	struct ihk_dma_request request;
-
-	channel = ihk_device_get_dma_channel(ihk_os_to_dev(os), 0);
-	if (!channel) {
-		return -EINVAL;
-	}
-#endif
+	struct mcctrl_per_proc_data *ppd;

	if (copy_from_user(&ret, arg, sizeof(struct syscall_ret_desc))) {
		return -EFAULT;
	}

-	mc = usrdata->channels + ret.cpu;
-	if (!mc) {
+	/* Look up per-process structure */
+	ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
+	if (!ppd) {
+		kprintf("%s: ERROR: no per-process structure for PID %d??\n",
+			__FUNCTION__, task_tgid_vnr(current));
		return -EINVAL;
	}

-	deregister_peer_channel(usrdata, current, mc);
-	mc->param.response_va->ret = ret.ret;
+	packet = (struct ikc_scd_packet *)mcctrl_get_per_thread_data(ppd, current);
+	if (!packet) {
+		kprintf("%s: ERROR: no packet registered for TID %d\n",
+			__FUNCTION__, task_pid_vnr(current));
+		return -EINVAL;
+	}
+
+	mcctrl_delete_per_thread_data(ppd, current);

	if (ret.size > 0) {
		/* Host => Accel. Write is fast. */
		unsigned long phys;
		void *rpm;

-		phys = ihk_device_map_memory(ihk_os_to_dev(os), ret.dest,
-			ret.size);
+		phys = ihk_device_map_memory(ihk_os_to_dev(os), ret.dest, ret.size);
 #ifdef CONFIG_MIC
		rpm = ioremap_wc(phys, ret.size);
 #else
		rpm = ihk_device_map_virtual(ihk_os_to_dev(os), phys,
			ret.size, NULL, 0);
 #endif
		if (copy_from_user(rpm, (void *__user)ret.src, ret.size)) {
			return -EFAULT;
		}

-		mb();
-		mc->param.response_va->status = 1;
-
 #ifdef CONFIG_MIC
		iounmap(rpm);
 #else
		ihk_device_unmap_virtual(ihk_os_to_dev(os), rpm, ret.size);
 #endif
		ihk_device_unmap_memory(ihk_os_to_dev(os), phys, ret.size);
+	}

-		/*
-		memset(&request, 0, sizeof(request));
-		request.src_os = NULL;
-		request.src_phys = ret.src;
-		request.dest_os = os;
-		request.dest_phys = ret.dest;
-		request.size = ret.size;
-		request.notify_os = os;
-		request.notify = (void *)mc->param.response_rpa;
-		request.priv = (void *)1;
-
-		ihk_dma_request(channel, &request);
-		*/
-	} else {
-		mb();
-		mc->param.response_va->status = 1;
-	}
+	__return_syscall(os, packet, ret.ret, task_pid_vnr(current));
+
+	/* Free packet */
+	ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet,
+		(usrdata->channels + packet->ref)->c);

	return 0;
 }
@@ -862,14 +904,53 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename)
	int retval;
	int os_ind = ihk_host_os_get_index(os);
	char *pathbuf, *fullpath;
+	struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
+	struct mcctrl_per_proc_data *ppd = NULL;
+	int i;

	if (os_ind < 0) {
		return EINVAL;
	}

+	ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
+	if (!ppd) {
+		ppd = kmalloc(sizeof(*ppd), GFP_KERNEL);
+		if (!ppd) {
+			printk("ERROR: allocating per process data\n");
+			return -ENOMEM;
+		}
+
+		ppd->pid = task_tgid_vnr(current);
+		/*
+		 * XXX: rpgtable will be updated in __do_in_kernel_syscall()
+		 * under case __NR_munmap
+		 */
+
+		INIT_LIST_HEAD(&ppd->wq_list);
+		INIT_LIST_HEAD(&ppd->wq_req_list);
+		INIT_LIST_HEAD(&ppd->wq_list_exact);
+		spin_lock_init(&ppd->wq_list_lock);
+
+		for (i = 0; i < MCCTRL_PER_THREAD_DATA_HASH_SIZE; ++i) {
+			INIT_LIST_HEAD(&ppd->per_thread_data_hash[i]);
+			rwlock_init(&ppd->per_thread_data_hash_lock[i]);
+		}
+
+		if (mcctrl_add_per_proc_data(usrdata, ppd->pid, ppd) < 0) {
+			printk("%s: error adding per process data\n", __FUNCTION__);
+			retval = EINVAL;
+			goto out_free_ppd;
+		}
+	}
+	else {
+		/* Only deallocate in case of an error if we added it above */
+		ppd = NULL;
+	}
+
	pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
	if (!pathbuf) {
-		return ENOMEM;
+		retval = ENOMEM;
+		goto out_error_drop_ppd;
	}

	file = open_exec(filename);
@@ -901,7 +982,7 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename)
			break;
		}
	}

	/* Add new exec file to the list */
	mcef->os = os;
	mcef->pid = task_tgid_vnr(current);
@@ -918,12 +999,15 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename)
	kfree(pathbuf);

	return 0;

 out_put_file:
	fput(file);

 out_error_free:
	kfree(pathbuf);

+out_error_drop_ppd:
+	if (ppd) mcctrl_delete_per_proc_data(usrdata, ppd->pid);
+
+out_free_ppd:
+	if (ppd) kfree(ppd);
+
	return -retval;
 }
@@ -933,6 +1017,23 @@ int mcexec_close_exec(ihk_os_t os)
	struct mckernel_exec_file *mcef = NULL;
	int found = 0;
	int os_ind = ihk_host_os_get_index(os);
+	struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
+	struct mcctrl_per_proc_data *ppd = NULL;
+
+	ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
+	if (ppd) {
+		mcctrl_delete_per_proc_data(usrdata, ppd->pid);
+
+		dprintk("pid: %d, tid: %d: rpgtable for %d (0x%lx) removed\n",
+			task_tgid_vnr(current), current->pid, ppd->pid, ppd->rpgtable);
+
+		kfree(ppd);
+	}
+	else {
+		printk("WARNING: no per process data for pid %d ?\n",
+			task_tgid_vnr(current));
+	}

	if (os_ind < 0) {
		return EINVAL;

View File

@@ -27,6 +27,7 @@
 #include <linux/miscdevice.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/interrupt.h>
 #include "mcctrl.h"

 #ifdef ATTACHED_MIC
 #include <sysdeps/mic/mic/micconst.h>
@@ -40,16 +41,18 @@
 void mcexec_prepare_ack(ihk_os_t os, unsigned long arg, int err);
 static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ihk_ikc_channel_desc *c);
-int mcexec_syscall(struct mcctrl_channel *c, int pid, unsigned long arg);
+int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet);
 void sig_done(unsigned long arg, int err);

+/* XXX: this runs in atomic context! */
 static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
                                   void *__packet, void *__os)
 {
	struct ikc_scd_packet *pisp = __packet;
	struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(__os);
+	int msg = pisp->msg;

-	switch (pisp->msg) {
+	switch (msg) {
	case SCD_MSG_INIT_CHANNEL:
		mcctrl_ikc_init(__os, pisp->ref, pisp->arg, c);
		break;
@@ -63,7 +66,7 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
		break;

	case SCD_MSG_SYSCALL_ONESIDE:
-		mcexec_syscall(usrdata->channels + pisp->ref, pisp->pid, pisp->arg);
+		mcexec_syscall(usrdata, pisp);
		break;

	case SCD_MSG_PROCFS_ANSWER:
@@ -88,11 +91,8 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
		break;

	case SCD_MSG_PROCFS_TID_CREATE:
-		add_tid_entry(ihk_host_os_get_index(__os), pisp->pid, pisp->arg);
-		break;
-
	case SCD_MSG_PROCFS_TID_DELETE:
-		delete_tid_entry(ihk_host_os_get_index(__os), pisp->pid, pisp->arg);
+		procfsm_packet_handler(__os, pisp->msg, pisp->pid, pisp->arg);
		break;

	case SCD_MSG_GET_VDSO_INFO:
@@ -110,6 +110,14 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
			pisp->err, pisp->arg);
		break;
	}

+	/*
+	 * SCD_MSG_SYSCALL_ONESIDE holds the packet and frees is it
+	 * mcexec_ret_syscall(), for the rest, free it here.
+	 */
+	if (msg != SCD_MSG_SYSCALL_ONESIDE) {
+		ihk_ikc_release_packet((struct ihk_ikc_free_packet *)__packet, c);
+	}
+
	return 0;
 }
@@ -146,8 +154,6 @@ int mcctrl_ikc_set_recv_cpu(ihk_os_t os, int cpu)
	ihk_ikc_channel_set_cpu(usrdata->channels[cpu].c,
		ihk_ikc_get_processor_id());
-	kprintf("Setting the target to %d\n",
-		ihk_ikc_get_processor_id());

	return 0;
 }
@@ -193,12 +199,13 @@ static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ih
 #endif
	pmc->param.request_va =
-		(void *)__get_free_pages(GFP_KERNEL,
+		(void *)__get_free_pages(in_interrupt() ? GFP_ATOMIC : GFP_KERNEL,
			REQUEST_SHIFT - PAGE_SHIFT);
	pmc->param.request_pa = virt_to_phys(pmc->param.request_va);
	pmc->param.doorbell_va = usrdata->mcctrl_doorbell_va;
	pmc->param.doorbell_pa = usrdata->mcctrl_doorbell_pa;
-	pmc->param.post_va = (void *)__get_free_page(GFP_KERNEL);
+	pmc->param.post_va = (void *)__get_free_page(in_interrupt() ?
+		GFP_ATOMIC : GFP_KERNEL);
	pmc->param.post_pa = virt_to_phys(pmc->param.post_va);

	memset(pmc->param.doorbell_va, 0, PAGE_SIZE);
	memset(pmc->param.request_va, 0, PAGE_SIZE);
@@ -218,8 +225,9 @@ static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ih
PAGE_SIZE, NULL, 0); PAGE_SIZE, NULL, 0);
#endif #endif
pmc->dma_buf = (void *)__get_free_pages(GFP_KERNEL, pmc->dma_buf = (void *)__get_free_pages(in_interrupt() ?
DMA_PIN_SHIFT - PAGE_SHIFT); GFP_ATOMIC : GFP_KERNEL,
DMA_PIN_SHIFT - PAGE_SHIFT);
rpm->request_page = pmc->param.request_pa; rpm->request_page = pmc->param.request_pa;
rpm->doorbell_page = pmc->param.doorbell_pa; rpm->doorbell_page = pmc->param.doorbell_pa;
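These allocation sites now run both from process context (boot-time setup) and from the atomic IKC callback, hence the in_interrupt() checks. A tiny helper, not part of the patch, that captures the same rule:

#include <linux/gfp.h>
#include <linux/interrupt.h>

/* GFP_ATOMIC must be used where sleeping is forbidden (interrupt or
 * atomic context); GFP_KERNEL may sleep and is preferred otherwise. */
static inline gfp_t mcctrl_gfp_flags(void)
{
	return in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;
}

/* usage: __get_free_pages(mcctrl_gfp_flags(), REQUEST_SHIFT - PAGE_SHIFT); */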
@@ -265,9 +273,6 @@ static int connect_handler(struct ihk_ikc_channel_info *param)
} }
param->packet_handler = syscall_packet_handler; param->packet_handler = syscall_packet_handler;
INIT_LIST_HEAD(&usrdata->channels[cpu].wq_list);
spin_lock_init(&usrdata->channels[cpu].wq_list_lock);
usrdata->channels[cpu].c = c; usrdata->channels[cpu].c = c;
kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c); kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
@@ -286,9 +291,6 @@ static int connect_handler2(struct ihk_ikc_channel_info *param)
param->packet_handler = syscall_packet_handler; param->packet_handler = syscall_packet_handler;
INIT_LIST_HEAD(&usrdata->channels[cpu].wq_list);
spin_lock_init(&usrdata->channels[cpu].wq_list_lock);
usrdata->channels[cpu].c = c; usrdata->channels[cpu].c = c;
kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c); kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
@@ -315,7 +317,7 @@ int prepare_ikc_channels(ihk_os_t os)
{ {
struct ihk_cpu_info *info; struct ihk_cpu_info *info;
struct mcctrl_usrdata *usrdata; struct mcctrl_usrdata *usrdata;
int error; int i;
usrdata = kzalloc(sizeof(struct mcctrl_usrdata), GFP_KERNEL); usrdata = kzalloc(sizeof(struct mcctrl_usrdata), GFP_KERNEL);
usrdata->mcctrl_doorbell_va = (void *)__get_free_page(GFP_KERNEL); usrdata->mcctrl_doorbell_va = (void *)__get_free_page(GFP_KERNEL);
@@ -347,17 +349,14 @@ int prepare_ikc_channels(ihk_os_t os)
memcpy(&usrdata->listen_param2, &listen_param2, sizeof listen_param2); memcpy(&usrdata->listen_param2, &listen_param2, sizeof listen_param2);
ihk_ikc_listen_port(os, &usrdata->listen_param2); ihk_ikc_listen_port(os, &usrdata->listen_param2);
INIT_LIST_HEAD(&usrdata->per_proc_list); for (i = 0; i < MCCTRL_PER_PROC_DATA_HASH_SIZE; ++i) {
spin_lock_init(&usrdata->per_proc_list_lock); INIT_LIST_HEAD(&usrdata->per_proc_data_hash[i]);
rwlock_init(&usrdata->per_proc_data_hash_lock[i]);
}
INIT_LIST_HEAD(&usrdata->cpu_topology_list); INIT_LIST_HEAD(&usrdata->cpu_topology_list);
INIT_LIST_HEAD(&usrdata->node_topology_list); INIT_LIST_HEAD(&usrdata->node_topology_list);
error = init_peer_channel_registry(usrdata);
if (error) {
return error;
}
return 0; return 0;
} }
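prepare_ikc_channels() now initializes a fixed-size hash table with one rwlock per bucket instead of the old flat per-process list. A sketch of the corresponding lookup, assuming the bucket index is simply pid & MCCTRL_PER_PROC_DATA_HASH_MASK (the real mcctrl_get_per_proc_data() lives in syscall.c):

static struct mcctrl_per_proc_data *ppd_lookup_sketch(
		struct mcctrl_usrdata *ud, int pid)
{
	int hash = pid & MCCTRL_PER_PROC_DATA_HASH_MASK;
	struct mcctrl_per_proc_data *ppd = NULL, *iter;
	unsigned long flags;

	/* Readers only contend with writers on the same bucket. */
	read_lock_irqsave(&ud->per_proc_data_hash_lock[hash], flags);
	list_for_each_entry(iter, &ud->per_proc_data_hash[hash], hash) {
		if (iter->pid == pid) {
			ppd = iter;
			break;
		}
	}
	read_unlock_irqrestore(&ud->per_proc_data_hash_lock[hash], flags);
	return ppd;
}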
@@ -396,7 +395,6 @@ void destroy_ikc_channels(ihk_os_t os)
} }
free_page((unsigned long)usrdata->mcctrl_doorbell_va); free_page((unsigned long)usrdata->mcctrl_doorbell_va);
destroy_peer_channel_registry(usrdata);
kfree(usrdata->channels); kfree(usrdata->channels);
kfree(usrdata); kfree(usrdata);
} }


@@ -41,6 +41,7 @@
#include <ikc/master.h> #include <ikc/master.h>
#include <ihk/msr.h> #include <ihk/msr.h>
#include <linux/semaphore.h> #include <linux/semaphore.h>
#include <linux/rwlock.h>
#include <linux/threads.h> #include <linux/threads.h>
#include "sysfs.h" #include "sysfs.h"
@@ -48,6 +49,7 @@
#define SCD_MSG_PREPARE_PROCESS_ACKED 0x2 #define SCD_MSG_PREPARE_PROCESS_ACKED 0x2
#define SCD_MSG_PREPARE_PROCESS_NACKED 0x7 #define SCD_MSG_PREPARE_PROCESS_NACKED 0x7
#define SCD_MSG_SCHEDULE_PROCESS 0x3 #define SCD_MSG_SCHEDULE_PROCESS 0x3
#define SCD_MSG_WAKE_UP_SYSCALL_THREAD 0x14
#define SCD_MSG_INIT_CHANNEL 0x5 #define SCD_MSG_INIT_CHANNEL 0x5
#define SCD_MSG_INIT_CHANNEL_ACKED 0x6 #define SCD_MSG_INIT_CHANNEL_ACKED 0x6
@@ -110,8 +112,9 @@ struct ikc_scd_packet {
int ref; int ref;
int osnum; int osnum;
int pid; int pid;
int padding;
unsigned long arg; unsigned long arg;
struct syscall_request req;
unsigned long resp_pa;
}; };
/* for SCD_MSG_SYSFS_* */ /* for SCD_MSG_SYSFS_* */
@@ -120,7 +123,13 @@ struct ikc_scd_packet {
long sysfs_arg2; long sysfs_arg2;
long sysfs_arg3; long sysfs_arg3;
}; };
/* SCD_MSG_SCHEDULE_THREAD */
struct {
int ttid;
};
}; };
char padding[12];
}; };
struct mcctrl_priv { struct mcctrl_priv {
@@ -154,8 +163,11 @@ struct syscall_params {
struct wait_queue_head_list_node { struct wait_queue_head_list_node {
struct list_head list; struct list_head list;
wait_queue_head_t wq_syscall; wait_queue_head_t wq_syscall;
int pid; struct task_struct *task;
/* Denotes an exclusive wait for requester TID rtid */
int rtid;
int req; int req;
struct ikc_scd_packet *packet;
}; };
struct mcctrl_channel { struct mcctrl_channel {
@@ -163,15 +175,30 @@ struct mcctrl_channel {
struct syscall_params param; struct syscall_params param;
struct ikc_scd_init_param init; struct ikc_scd_init_param init;
void *dma_buf; void *dma_buf;
struct list_head wq_list;
ihk_spinlock_t wq_list_lock;
}; };
struct mcctrl_per_thread_data {
struct list_head hash;
struct task_struct *task;
void *data;
};
#define MCCTRL_PER_THREAD_DATA_HASH_SHIFT 8
#define MCCTRL_PER_THREAD_DATA_HASH_SIZE (1 << MCCTRL_PER_THREAD_DATA_HASH_SHIFT)
#define MCCTRL_PER_THREAD_DATA_HASH_MASK (MCCTRL_PER_THREAD_DATA_HASH_SIZE - 1)
struct mcctrl_per_proc_data { struct mcctrl_per_proc_data {
struct list_head list; struct list_head hash;
int pid; int pid;
unsigned long rpgtable; /* per process, not per OS */ unsigned long rpgtable; /* per process, not per OS */
struct list_head wq_list;
struct list_head wq_req_list;
struct list_head wq_list_exact;
ihk_spinlock_t wq_list_lock;
struct list_head per_thread_data_hash[MCCTRL_PER_THREAD_DATA_HASH_SIZE];
rwlock_t per_thread_data_hash_lock[MCCTRL_PER_THREAD_DATA_HASH_SIZE];
}; };
struct sysfsm_req { struct sysfsm_req {
@@ -230,6 +257,10 @@ struct node_topology {
#define CPU_LONGS (((NR_CPUS) + (BITS_PER_LONG) - 1) / (BITS_PER_LONG)) #define CPU_LONGS (((NR_CPUS) + (BITS_PER_LONG) - 1) / (BITS_PER_LONG))
#define MCCTRL_PER_PROC_DATA_HASH_SHIFT 7
#define MCCTRL_PER_PROC_DATA_HASH_SIZE (1 << MCCTRL_PER_PROC_DATA_HASH_SHIFT)
#define MCCTRL_PER_PROC_DATA_HASH_MASK (MCCTRL_PER_PROC_DATA_HASH_SIZE - 1)
struct mcctrl_usrdata { struct mcctrl_usrdata {
struct ihk_ikc_listen_param listen_param; struct ihk_ikc_listen_param listen_param;
struct ihk_ikc_listen_param listen_param2; struct ihk_ikc_listen_param listen_param2;
@@ -245,8 +276,9 @@ struct mcctrl_usrdata {
unsigned long last_thread_exec; unsigned long last_thread_exec;
wait_queue_head_t wq_prepare; wait_queue_head_t wq_prepare;
struct list_head per_proc_list; struct list_head per_proc_data_hash[MCCTRL_PER_PROC_DATA_HASH_SIZE];
ihk_spinlock_t per_proc_list_lock; rwlock_t per_proc_data_hash_lock[MCCTRL_PER_PROC_DATA_HASH_SIZE];
void **keys; void **keys;
struct sysfsm_data sysfsm_data; struct sysfsm_data sysfsm_data;
unsigned long cpu_online[CPU_LONGS]; unsigned long cpu_online[CPU_LONGS];
@@ -273,12 +305,22 @@ int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu);
ihk_os_t osnum_to_os(int n); ihk_os_t osnum_to_os(int n);
/* syscall.c */ /* syscall.c */
int init_peer_channel_registry(struct mcctrl_usrdata *ud); int __do_in_kernel_syscall(ihk_os_t os, struct ikc_scd_packet *packet);
void destroy_peer_channel_registry(struct mcctrl_usrdata *ud); int mcctrl_add_per_proc_data(struct mcctrl_usrdata *ud, int pid,
int register_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch); struct mcctrl_per_proc_data *ppd);
int deregister_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch); int mcctrl_delete_per_proc_data(struct mcctrl_usrdata *ud, int pid);
struct mcctrl_channel *get_peer_channel(struct mcctrl_usrdata *ud, void *key); inline struct mcctrl_per_proc_data *mcctrl_get_per_proc_data(
int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall_request *sc); struct mcctrl_usrdata *ud, int pid);
int mcctrl_add_per_thread_data(struct mcctrl_per_proc_data* ppd,
struct task_struct *task, void *data);
int mcctrl_delete_per_thread_data(struct mcctrl_per_proc_data* ppd,
struct task_struct *task);
inline void *mcctrl_get_per_thread_data(
struct mcctrl_per_proc_data *ppd, struct task_struct *task);
void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet,
long ret, int stid);
#define PROCFS_NAME_MAX 1000 #define PROCFS_NAME_MAX 1000
@@ -301,6 +343,7 @@ struct procfs_file {
}; };
void procfs_answer(unsigned int arg, int err); void procfs_answer(unsigned int arg, int err);
int procfsm_packet_handler(void *os, int msg, int pid, unsigned long arg);
void add_tid_entry(int osnum, int pid, int tid); void add_tid_entry(int osnum, int pid, int tid);
void add_pid_entry(int osnum, int pid); void add_pid_entry(int osnum, int pid);
void delete_tid_entry(int osnum, int pid, int tid); void delete_tid_entry(int osnum, int pid, int tid);


@@ -17,6 +17,7 @@
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/resource.h> #include <linux/resource.h>
#include <linux/interrupt.h>
#include "mcctrl.h" #include "mcctrl.h"
#include <linux/version.h> #include <linux/version.h>
#include <linux/semaphore.h> #include <linux/semaphore.h>
@@ -713,6 +714,57 @@ mckernel_procfs_lseek(struct file *file, loff_t offset, int orig)
return file->f_pos; return file->f_pos;
} }
struct procfs_work {
void *os;
int msg;
int pid;
unsigned long arg;
struct work_struct work;
};
static void procfsm_work_main(struct work_struct *work0)
{
struct procfs_work *work = container_of(work0, struct procfs_work, work);
switch (work->msg) {
case SCD_MSG_PROCFS_TID_CREATE:
add_tid_entry(ihk_host_os_get_index(work->os), work->pid, work->arg);
break;
case SCD_MSG_PROCFS_TID_DELETE:
delete_tid_entry(ihk_host_os_get_index(work->os), work->pid, work->arg);
break;
default:
printk("%s: unknown work: msg: %d, pid: %d, arg: %lu)\n",
__FUNCTION__, work->msg, work->pid, work->arg);
break;
}
kfree(work);
return;
}
int procfsm_packet_handler(void *os, int msg, int pid, unsigned long arg)
{
struct procfs_work *work = NULL;
work = kzalloc(sizeof(*work), GFP_ATOMIC);
if (!work) {
printk("%s: kzalloc failed\n", __FUNCTION__);
return -1;
}
work->os = os;
work->msg = msg;
work->pid = pid;
work->arg = arg;
INIT_WORK(&work->work, &procfsm_work_main);
schedule_work(&work->work);
return 0;
}
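procfsm_packet_handler() runs in the atomic IKC callback, where the TID bookkeeping routines cannot safely be called, so it captures the arguments and defers the work to process context. Stripped of the procfs specifics, the general pattern looks like this (names hypothetical):

#include <linux/workqueue.h>
#include <linux/slab.h>

struct deferred_work {
	struct work_struct work;
	/* parameters captured in atomic context go here */
};

static void deferred_main(struct work_struct *w)
{
	struct deferred_work *dw = container_of(w, struct deferred_work, work);
	/* process context: sleeping calls are allowed here */
	kfree(dw);
}

static int defer_from_atomic(void)
{
	struct deferred_work *dw = kzalloc(sizeof(*dw), GFP_ATOMIC);
	if (!dw)
		return -ENOMEM;
	INIT_WORK(&dw->work, deferred_main);
	schedule_work(&dw->work);
	return 0;
}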
static const struct file_operations mckernel_forward_ro = { static const struct file_operations mckernel_forward_ro = {
.llseek = mckernel_procfs_lseek, .llseek = mckernel_procfs_lseek,
.read = mckernel_procfs_read, .read = mckernel_procfs_read,


@@ -40,6 +40,7 @@
#include <linux/cred.h> #include <linux/cred.h>
#include <linux/capability.h> #include <linux/capability.h>
#include <linux/semaphore.h> #include <linux/semaphore.h>
#include <linux/spinlock.h>
#include <linux/mount.h> #include <linux/mount.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
#include <asm/delay.h> #include <asm/delay.h>
@@ -84,88 +85,96 @@ static void print_dma_lastreq(void)
} }
#endif #endif
int init_peer_channel_registry(struct mcctrl_usrdata *ud) int mcctrl_add_per_thread_data(struct mcctrl_per_proc_data* ppd,
struct task_struct *task, void *data)
{ {
ud->keys = kzalloc(sizeof(void *) * ud->num_channels, GFP_KERNEL); struct mcctrl_per_thread_data *ptd_iter, *ptd = NULL;
if (!ud->keys) { struct mcctrl_per_thread_data *ptd_alloc = NULL;
printk("Error: cannot allocate usrdata.keys[].\n"); int hash = (((uint64_t)task >> 4) & MCCTRL_PER_THREAD_DATA_HASH_MASK);
return -ENOMEM; int ret = 0;
unsigned long flags;
ptd_alloc = kmalloc(sizeof(*ptd), GFP_ATOMIC);
if (!ptd_alloc) {
kprintf("%s: error allocate per thread data\n", __FUNCTION__);
ret = -ENOMEM;
goto out_noalloc;
} }
return 0; /* Check if data for this thread exists and add if not */
} write_lock_irqsave(&ppd->per_thread_data_hash_lock[hash], flags);
list_for_each_entry(ptd_iter, &ppd->per_thread_data_hash[hash], hash) {
void destroy_peer_channel_registry(struct mcctrl_usrdata *ud) if (ptd_iter->task == task) {
{ ptd = ptd_iter;
kfree(ud->keys); break;
ud->keys = NULL;
return;
}
int register_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch)
{
int cpu;
cpu = ch - ud->channels;
if ((cpu < 0) || (ud->num_channels <= cpu)) {
printk("register_peer_channel(%p,%p,%p):"
"not a syscall channel. cpu=%d\n",
ud, key, ch, cpu);
return -EINVAL;
}
if (ud->keys[cpu] != NULL) {
printk("register_peer_channel(%p,%p,%p):"
"already registered. cpu=%d\n",
ud, key, ch, cpu);
/*
* When mcexec receives a signal,
* it may be finished without doing deregister_peer_channel().
* Therefore a substitute registration is necessary.
*/
#if 0
return -EBUSY;
#endif
}
ud->keys[cpu] = key;
return 0;
}
int deregister_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch)
{
int cpu;
cpu = ch - ud->channels;
if ((cpu < 0) || (ud->num_channels <= cpu)) {
printk("deregister_peer_channel(%p,%p,%p):"
"not a syscall channel. cpu=%d\n",
ud, key, ch, cpu);
return -EINVAL;
}
if (ud->keys[cpu] && (ud->keys[cpu] != key)) {
printk("deregister_peer_channel(%p,%p,%p):"
"not registered. cpu=%d\n",
ud, key, ch, cpu);
return -EBUSY;
}
ud->keys[cpu] = NULL;
return 0;
}
struct mcctrl_channel *get_peer_channel(struct mcctrl_usrdata *ud, void *key)
{
int cpu;
for (cpu = 0; cpu < ud->num_channels; ++cpu) {
if (ud->keys[cpu] == key) {
return &ud->channels[cpu];
} }
} }
return NULL; if (unlikely(ptd)) {
ret = -EBUSY;
kfree(ptd_alloc);
goto out;
}
ptd = ptd_alloc;
ptd->task = task;
ptd->data = data;
list_add_tail(&ptd->hash, &ppd->per_thread_data_hash[hash]);
out:
write_unlock_irqrestore(&ppd->per_thread_data_hash_lock[hash], flags);
out_noalloc:
return ret;
}
int mcctrl_delete_per_thread_data(struct mcctrl_per_proc_data* ppd,
struct task_struct *task)
{
struct mcctrl_per_thread_data *ptd_iter, *ptd = NULL;
int hash = (((uint64_t)task >> 4) & MCCTRL_PER_THREAD_DATA_HASH_MASK);
int ret = 0;
unsigned long flags;
/* Check if data for this thread exists and delete it */
write_lock_irqsave(&ppd->per_thread_data_hash_lock[hash], flags);
list_for_each_entry(ptd_iter, &ppd->per_thread_data_hash[hash], hash) {
if (ptd_iter->task == task) {
ptd = ptd_iter;
break;
}
}
if (!ptd) {
ret = -EINVAL;
goto out;
}
list_del(&ptd->hash);
kfree(ptd);
out:
write_unlock_irqrestore(&ppd->per_thread_data_hash_lock[hash], flags);
return ret;
}
void *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd, struct task_struct *task)
{
struct mcctrl_per_thread_data *ptd_iter, *ptd = NULL;
int hash = (((uint64_t)task >> 4) & MCCTRL_PER_THREAD_DATA_HASH_MASK);
unsigned long flags;
/* Check if data for this thread exists and return it */
read_lock_irqsave(&ppd->per_thread_data_hash_lock[hash], flags);
list_for_each_entry(ptd_iter, &ppd->per_thread_data_hash[hash], hash) {
if (ptd_iter->task == task) {
ptd = ptd_iter;
break;
}
}
read_unlock_irqrestore(&ppd->per_thread_data_hash_lock[hash], flags);
return ptd ? ptd->data : NULL;
} }
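The per-thread table hashes the task_struct pointer itself; shifting right by 4 first discards the low bits, which are largely constant due to allocation alignment. A standalone toy showing just the bucket computation (userspace; constants are hypothetical but match the shift/mask style above):

#include <stdint.h>
#include <stdio.h>

#define HASH_SHIFT 8
#define HASH_SIZE (1 << HASH_SHIFT)
#define HASH_MASK (HASH_SIZE - 1)

/* Same idea as MCCTRL_PER_THREAD_DATA_HASH_MASK: drop the alignment
 * bits of the pointer, then mask down to the table size. */
static unsigned bucket_of(const void *task)
{
	return ((uint64_t)(uintptr_t)task >> 4) & HASH_MASK;
}

int main(void)
{
	int a, b;
	printf("%u %u\n", bucket_of(&a), bucket_of(&b));
	return 0;
}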
#if 1 /* x86 depend, host OS side */ #if 1 /* x86 depend, host OS side */
@@ -232,80 +241,156 @@ out:
} }
#endif #endif
static int __notify_syscall_requester(ihk_os_t os, struct ikc_scd_packet *packet,
struct syscall_response *res)
{
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct ihk_ikc_channel_desc *c = (usrdata->channels + packet->ref)->c;
struct ikc_scd_packet r_packet;
int ret = 0;
/* If spinning, no need for IKC message */
if (__sync_bool_compare_and_swap(&res->req_thread_status,
IHK_SCD_REQ_THREAD_SPINNING,
IHK_SCD_REQ_THREAD_TO_BE_WOKEN)) {
dprintk("%s: no need to send IKC message for PID %d\n",
__FUNCTION__, packet->pid);
return ret;
}
/* The thread is not spinning any more, make sure it's descheduled */
if (!__sync_bool_compare_and_swap(&res->req_thread_status,
IHK_SCD_REQ_THREAD_DESCHEDULED,
IHK_SCD_REQ_THREAD_TO_BE_WOKEN)) {
printk("%s: WARNING: inconsistent requester status, "
"pid: %d, req status: %lu, syscall nr: %lu\n",
__FUNCTION__, packet->pid,
res->req_thread_status, packet->req.number);
dump_stack();
return -EINVAL;
}
r_packet.msg = SCD_MSG_WAKE_UP_SYSCALL_THREAD;
r_packet.ttid = packet->req.rtid;
ret = ihk_ikc_send(c, &r_packet, 0);
return ret;
}
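The wake-up handshake is two CAS attempts over the requester's status word: if the McKernel thread is still spinning, flipping it to TO_BE_WOKEN is enough and no IKC message is sent; if it has already descheduled, an SCD_MSG_WAKE_UP_SYSCALL_THREAD message is required; anything else is an inconsistency. The same protocol as a runnable userspace toy (state names illustrative):

#include <stdio.h>

enum { SPINNING, DESCHEDULED, TO_BE_WOKEN };

/* 0: peer spins and will see the flag on its own; 1: send a wake-up
 * message; -1: inconsistent state. */
static int notify_sketch(unsigned long *status)
{
	if (__sync_bool_compare_and_swap(status, SPINNING, TO_BE_WOKEN))
		return 0;
	if (__sync_bool_compare_and_swap(status, DESCHEDULED, TO_BE_WOKEN))
		return 1;
	return -1;
}

int main(void)
{
	unsigned long st = SPINNING;
	printf("%d\n", notify_sketch(&st));	/* 0 */
	st = DESCHEDULED;
	printf("%d\n", notify_sketch(&st));	/* 1 */
	return 0;
}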
static int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr, uint64_t reason) static int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr, uint64_t reason)
{ {
struct mcctrl_channel *channel; struct ikc_scd_packet *packet;
struct syscall_request *req; struct syscall_request *req;
struct syscall_response *resp; struct syscall_response *resp;
int error; int error;
struct wait_queue_head_list_node *wqhln;
unsigned long irqflags;
struct mcctrl_per_proc_data *ppd;
unsigned long phys;
dprintk("remote_page_fault(%p,%p,%llx)\n", usrdata, fault_addr, reason); dprintk("%s: tid: %d, fault_addr: %lu, reason: %lu\n",
__FUNCTION__, task_pid_vnr(current), fault_addr, reason);
/* Look up per-process structure */
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
channel = get_peer_channel(usrdata, current); if (!ppd) {
if (!channel) { kprintf("%s: ERROR: no per-process structure for PID %d??\n",
error = -ENOENT; __FUNCTION__, task_tgid_vnr(current));
printk("remote_page_fault(%p,%p,%llx):channel not found. %d\n", return -EINVAL;
usrdata, fault_addr, reason, error);
goto out;
} }
req = channel->param.request_va; packet = (struct ikc_scd_packet *)mcctrl_get_per_thread_data(ppd, current);
resp = channel->param.response_va; if (!packet) {
error = -ENOENT;
printk("%s: no packet registered for TID %d\n",
__FUNCTION__, task_pid_vnr(current));
goto out_no_unmap;
}
/* request page fault */ req = &packet->req;
/* Map response structure */
phys = ihk_device_map_memory(ihk_os_to_dev(usrdata->os),
packet->resp_pa, sizeof(*resp));
resp = ihk_device_map_virtual(ihk_os_to_dev(usrdata->os),
phys, sizeof(*resp), NULL, 0);
retry_alloc:
wqhln = kmalloc(sizeof(*wqhln), GFP_ATOMIC);
if (!wqhln) {
printk("WARNING: coudln't alloc wait queue head, retrying..\n");
goto retry_alloc;
}
/* Prepare per-thread wait queue head */
wqhln->task = current;
wqhln->req = 0;
init_waitqueue_head(&wqhln->wq_syscall);
irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
/* Add to exact list */
list_add_tail(&wqhln->list, &ppd->wq_list_exact);
ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);
/* Request page fault */
resp->ret = -EFAULT; resp->ret = -EFAULT;
resp->fault_address = (unsigned long)fault_addr; resp->fault_address = (unsigned long)fault_addr;
resp->fault_reason = reason; resp->fault_reason = reason;
resp->stid = task_pid_vnr(current);
#define STATUS_PAGER_COMPLETED 1 #define STATUS_PAGER_COMPLETED 1
#define STATUS_PAGE_FAULT 3 #define STATUS_PAGE_FAULT 3
req->valid = 0; req->valid = 0;
if (__notify_syscall_requester(usrdata->os, packet, resp) < 0) {
printk("%s: WARNING: failed to notify PID %d\n",
__FUNCTION__, packet->pid);
}
mb(); mb();
resp->status = STATUS_PAGE_FAULT; resp->status = STATUS_PAGE_FAULT;
for (;;) { for (;;) {
struct wait_queue_head_list_node *wqhln; dprintk("%s: tid: %d, fault_addr: %p SLEEPING\n",
struct wait_queue_head_list_node *wqhln_iter; __FUNCTION__, task_pid_vnr(current), fault_addr);
unsigned long irqflags;
retry_alloc:
wqhln = kmalloc(sizeof(*wqhln), GFP_KERNEL);
if (!wqhln) {
printk("WARNING: coudln't alloc wait queue head, retrying..\n");
goto retry_alloc;
}
/* Prepare per-process wait queue head */
wqhln->pid = task_tgid_vnr(current);
wqhln->req = 0;
init_waitqueue_head(&wqhln->wq_syscall);
irqflags = ihk_ikc_spinlock_lock(&channel->wq_list_lock);
/* First see if there is a wait queue already */
list_for_each_entry(wqhln_iter, &channel->wq_list, list) {
if (wqhln_iter->pid == task_tgid_vnr(current)) {
kfree(wqhln);
wqhln = wqhln_iter;
list_del(&wqhln->list);
break;
}
}
list_add_tail(&wqhln->list, &channel->wq_list);
ihk_ikc_spinlock_unlock(&channel->wq_list_lock, irqflags);
/* wait for response */ /* wait for response */
error = wait_event_interruptible(wqhln->wq_syscall, wqhln->req); error = wait_event_interruptible(wqhln->wq_syscall, wqhln->req);
/* Remove per-process wait queue head */ /* Remove per-thread wait queue head */
irqflags = ihk_ikc_spinlock_lock(&channel->wq_list_lock); irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
list_del(&wqhln->list); list_del(&wqhln->list);
ihk_ikc_spinlock_unlock(&channel->wq_list_lock, irqflags); ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);
kfree(wqhln);
dprintk("%s: tid: %d, fault_addr: %p WOKEN UP\n",
__FUNCTION__, task_pid_vnr(current), fault_addr);
if (error) { if (error) {
kfree(wqhln);
printk("remote_page_fault:interrupted. %d\n", error); printk("remote_page_fault:interrupted. %d\n", error);
goto out; goto out;
} }
else {
/* Update packet reference */
packet = wqhln->packet;
req = &packet->req;
{
unsigned long phys2;
struct syscall_response *resp2;
phys2 = ihk_device_map_memory(ihk_os_to_dev(usrdata->os),
packet->resp_pa, sizeof(*resp));
resp2 = ihk_device_map_virtual(ihk_os_to_dev(usrdata->os),
phys2, sizeof(*resp), NULL, 0);
if (resp != resp2) {
resp = resp2;
phys = phys2;
printk("%s: updated new remote PA for resp\n", __FUNCTION__);
}
}
}
if (!req->valid) { if (!req->valid) {
printk("remote_page_fault:not valid\n"); printk("remote_page_fault:not valid\n");
} }
@@ -321,23 +406,37 @@ retry_alloc:
#define PAGER_REQ_RESUME 0x0101 #define PAGER_REQ_RESUME 0x0101
else if (req->args[0] != PAGER_REQ_RESUME) { else if (req->args[0] != PAGER_REQ_RESUME) {
resp->ret = pager_call(usrdata->os, (void *)req); resp->ret = pager_call(usrdata->os, (void *)req);
if (__notify_syscall_requester(usrdata->os, packet, resp) < 0) {
printk("%s: WARNING: failed to notify PID %d\n",
__FUNCTION__, packet->pid);
}
mb(); mb();
resp->status = STATUS_PAGER_COMPLETED; resp->status = STATUS_PAGER_COMPLETED;
continue; break;
} }
else { else {
error = req->args[1]; error = req->args[1];
if (error) { if (error) {
printk("remote_page_fault:response %d\n", error); printk("remote_page_fault:response %d\n", error);
kfree(wqhln);
goto out; goto out;
} }
} }
break; break;
} }
kfree(wqhln);
error = 0; error = 0;
out: out:
dprintk("remote_page_fault(%p,%p,%llx): %d\n", usrdata, fault_addr, reason, error); ihk_device_unmap_virtual(ihk_os_to_dev(usrdata->os), resp, sizeof(*resp));
ihk_device_unmap_memory(ihk_os_to_dev(usrdata->os), phys, sizeof(*resp));
out_no_unmap:
dprintk("%s: tid: %d, fault_addr: %lu, reason: %lu, error: %d\n",
__FUNCTION__, task_pid_vnr(current), fault_addr, reason, error);
return error; return error;
} }
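The sleeping side mirrors the notifier: publish a wait node where the responder can find it, sleep until the req flag is set, then unlink and free. The skeleton of that discipline, with generic names standing in for ppd->wq_list_exact and ppd->wq_list_lock:

struct waiter_sketch {
	struct list_head list;
	wait_queue_head_t wq;
	int done;	/* set by the responder before wake_up() */
};

static int wait_for_response_sketch(struct list_head *head, spinlock_t *lock)
{
	struct waiter_sketch w = { .done = 0 };
	unsigned long flags;
	int error;

	init_waitqueue_head(&w.wq);
	spin_lock_irqsave(lock, flags);
	list_add_tail(&w.list, head);	/* now visible to the responder */
	spin_unlock_irqrestore(lock, flags);

	error = wait_event_interruptible(w.wq, w.done);

	spin_lock_irqsave(lock, flags);
	list_del(&w.list);
	spin_unlock_irqrestore(lock, flags);
	return error;
}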
@@ -389,8 +488,9 @@ static int rus_page_hash_insert(struct page *page)
{ {
int ret = 0; int ret = 0;
struct rus_page *rp; struct rus_page *rp;
unsigned long flags;
spin_lock(&rus_page_hash_lock); spin_lock_irqsave(&rus_page_hash_lock, flags);
rp = _rus_page_hash_lookup(page); rp = _rus_page_hash_lookup(page);
if (!rp) { if (!rp) {
@@ -417,7 +517,7 @@ static int rus_page_hash_insert(struct page *page)
out: out:
spin_unlock(&rus_page_hash_lock); spin_unlock_irqrestore(&rus_page_hash_lock, flags);
return ret; return ret;
} }
@@ -426,8 +526,9 @@ void rus_page_hash_put_pages(void)
int i; int i;
struct rus_page *rp_iter; struct rus_page *rp_iter;
struct rus_page *rp_iter_next; struct rus_page *rp_iter_next;
unsigned long flags;
spin_lock(&rus_page_hash_lock); spin_lock_irqsave(&rus_page_hash_lock, flags);
for (i = 0; i < RUS_PAGE_HASH_SIZE; ++i) { for (i = 0; i < RUS_PAGE_HASH_SIZE; ++i) {
@@ -440,7 +541,7 @@ void rus_page_hash_put_pages(void)
} }
} }
spin_unlock(&rus_page_hash_lock); spin_unlock_irqrestore(&rus_page_hash_lock, flags);
} }
@@ -472,27 +573,22 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
#if USE_VM_INSERT_PFN #if USE_VM_INSERT_PFN
size_t pix; size_t pix;
#endif #endif
struct mcctrl_per_proc_data *ppd, *ppd_iter; struct mcctrl_per_proc_data *ppd;
unsigned long flags;
dprintk("mcctrl:page fault:flags %#x pgoff %#lx va %p page %p\n", dprintk("mcctrl:page fault:flags %#x pgoff %#lx va %p page %p\n",
vmf->flags, vmf->pgoff, vmf->virtual_address, vmf->page); vmf->flags, vmf->pgoff, vmf->virtual_address, vmf->page);
ppd = NULL; /* Look up per-process structure */
flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock); ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
if (!ppd) {
list_for_each_entry(ppd_iter, &usrdata->per_proc_list, list) { ppd = mcctrl_get_per_proc_data(usrdata, vma->vm_mm->owner->pid);
if (ppd_iter->pid == task_tgid_vnr(current) ||
ppd_iter->pid == vma->vm_mm->owner->pid) {
ppd = ppd_iter;
break;
}
} }
ihk_ikc_spinlock_unlock(&usrdata->per_proc_list_lock, flags);
if (!ppd) { if (!ppd) {
printk("ERROR: no per process data for pid %d\n", task_tgid_vnr(current)); kprintf("%s: ERROR: no per-process structure for PID %d??\n",
return VM_FAULT_SIGBUS; __FUNCTION__, task_tgid_vnr(current));
return VM_FAULT_SIGBUS;
} }
for (try = 1; ; ++try) { for (try = 1; ; ++try) {
@@ -626,237 +722,6 @@ reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, u
return start; return start;
} }
//unsigned long last_thread_exec = 0;
#ifndef DO_USER_MODE
static struct {
long (*do_sys_open)(int, const char __user *, int, int);
long (*sys_lseek)(unsigned int, off_t, unsigned int);
long (*sys_read)(unsigned int, char __user *, size_t);
long (*sys_write)(unsigned int, const char __user *, size_t);
} syscalls;
void
mcctrl_syscall_init(void)
{
printk("mcctrl_syscall_init\n");
syscalls.do_sys_open = (void *)kallsyms_lookup_name("do_sys_open");
syscalls.sys_lseek = (void *)kallsyms_lookup_name("sys_lseek");
syscalls.sys_read = (void *)kallsyms_lookup_name("sys_read");
syscalls.sys_write = (void *)kallsyms_lookup_name("sys_write");
printk("syscalls.do_sys_open=%lx\n", (long)syscalls.do_sys_open);
printk("syscalls.sys_lseek=%lx\n", (long)syscalls.sys_lseek);
printk("syscalls.sys_read=%lx\n", (long)syscalls.sys_read);
printk("syscalls.sys_write=%lx\n", (long)syscalls.sys_write);
}
static int do_async_copy(ihk_os_t os, unsigned long dest, unsigned long src,
unsigned long size, unsigned int inbound)
{
struct ihk_dma_request request;
ihk_dma_channel_t channel;
unsigned long asize = ALIGN_WAIT_BUF(size);
channel = ihk_device_get_dma_channel(ihk_os_to_dev(os), 0);
if (!channel) {
return -EINVAL;
}
memset(&request, 0, sizeof(request));
request.src_os = inbound ? os : NULL;
request.src_phys = src;
request.dest_os = inbound ? NULL : os;
request.dest_phys = dest;
request.size = size;
request.notify = (void *)(inbound ? dest + asize : src + asize);
request.priv = (void *)1;
*(unsigned long *)phys_to_virt((unsigned long)request.notify) = 0;
#ifdef SC_DEBUG
last_request = request;
#endif
ihk_dma_request(channel, &request);
return 0;
}
//int mcctrl_dma_abort;
static void async_wait(ihk_os_t os, unsigned char *p, int size)
{
int asize = ALIGN_WAIT_BUF(size);
unsigned long long s, w;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
rdtscll(s);
while (!p[asize]) {
mb();
cpu_relax();
rdtscll(w);
if (w > s + 1024UL * 1024 * 1024 * 10) {
printk("DMA Timed out : %p (%p + %d) => %d\n",
p + asize, p, size, p[asize]);
#ifdef SC_DEBUG
print_dma_lastreq();
#endif
usrdata->mcctrl_dma_abort = 1;
return;
}
}
}
static void clear_wait(unsigned char *p, int size)
{
//int asize = ALIGN_WAIT_BUF(size);
p[size] = 0;
}
static unsigned long translate_remote_va(struct mcctrl_channel *c,
unsigned long rva)
{
int i, n;
struct syscall_post *p;
p = c->param.post_va;
n = (int)p->v[0];
if (n < 0 || n >= PAGE_SIZE / sizeof(struct syscall_post)) {
return -EINVAL;
}
for (i = 0; i < n; i++) {
if (p[i + 1].v[0] != 1) {
continue;
}
if (rva >= p[i + 1].v[1] && rva < p[i + 1].v[2]) {
return p[i + 1].v[3] + (rva - p[i + 1].v[1]);
}
}
return -EFAULT;
}
//extern struct mcctrl_channel *channels;
#if 0
int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c,
struct syscall_request *sc)
{
int ret;
mm_segment_t fs;
unsigned long pa;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
switch (sc->number) {
case 0: /* read */
case 1024:
if (sc->number & 1024) {
sc->args[1] = translate_remote_va(c, sc->args[1]);
if ((long)sc->args[1] < 0) {
__return_syscall(c, -EFAULT);
return 0;
}
}
clear_wait(c->dma_buf, sc->args[2]);
fs = get_fs();
set_fs(KERNEL_DS);
ret = syscalls.sys_read(sc->args[0], c->dma_buf, sc->args[2]);
if (ret > 0) {
do_async_copy(os, sc->args[1], virt_to_phys(c->dma_buf),
sc->args[2], 0);
set_fs(fs);
async_wait(os, c->dma_buf, sc->args[2]);
}
__return_syscall(c, ret);
return 0;
case 1: /* write */
case 1025:
if (sc->number & 1024) {
sc->args[1] = translate_remote_va(c, sc->args[1]);
if ((long)sc->args[1] < 0) {
__return_syscall(c, -EFAULT);
return 0;
}
}
clear_wait(c->dma_buf, sc->args[2]);
do_async_copy(os, virt_to_phys(c->dma_buf), sc->args[1],
sc->args[2], 1);
fs = get_fs();
set_fs(KERNEL_DS);
async_wait(os, c->dma_buf, sc->args[2]);
ret = syscalls.sys_write(sc->args[0], c->dma_buf, sc->args[2]);
set_fs(fs);
__return_syscall(c, ret);
return 0;
case 2: /* open */
case 1026:
if (sc->number & 1024) {
sc->args[0] = translate_remote_va(c, sc->args[0]);
if ((long)sc->args[0] < 0) {
__return_syscall(c, -EFAULT);
return 0;
}
}
clear_wait(c->dma_buf, 256);
do_async_copy(os, virt_to_phys(c->dma_buf), sc->args[0],
256, 1);
fs = get_fs();
set_fs(KERNEL_DS);
async_wait(os, c->dma_buf, 256);
ret = syscalls.do_sys_open(AT_FDCWD, c->dma_buf, sc->args[1],
sc->args[2]);
set_fs(fs);
__return_syscall(c, ret);
return 0;
case 3: /* Close */
ret = sys_close(sc->args[0]);
__return_syscall(c, ret);
return 0;
case 8: /* lseek */
ret = syscalls.sys_lseek(sc->args[0], sc->args[1], sc->args[2]);
__return_syscall(c, ret);
return 0;
case 56: /* Clone */
usrdata->last_thread_exec++;
if (mcctrl_ikc_is_valid_thread(usrdata->last_thread_exec)) {
printk("Clone notification: %lx\n", sc->args[0]);
if (channels[usrdata->last_thread_exec].param.post_va) {
memcpy(usrdata->channels[usrdata->last_thread_exec].param.post_va,
c->param.post_va, PAGE_SIZE);
}
mcctrl_ikc_send_msg(usrdata->last_thread_exec,
SCD_MSG_SCHEDULE_PROCESS,
usrdata->last_thread_exec, sc->args[0]);
}
__return_syscall(c, 0);
return 0;
default:
if (sc->number & 1024) {
__return_syscall(c, -EFAULT);
return 0;
} else {
return -ENOSYS;
}
}
}
#endif
#endif /* !DO_USER_MODE */
struct pager { struct pager {
struct list_head list; struct list_head list;
struct inode * inode; struct inode * inode;
@@ -967,7 +832,7 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
up(&pager_sem); up(&pager_sem);
newpager = kzalloc(sizeof(*newpager), GFP_KERNEL); newpager = kzalloc(sizeof(*newpager), GFP_ATOMIC);
if (!newpager) { if (!newpager) {
error = -ENOMEM; error = -ENOMEM;
printk("pager_req_create(%d,%lx):kzalloc failed. %d\n", fd, (long)result_pa, error); printk("pager_req_create(%d,%lx):kzalloc failed. %d\n", fd, (long)result_pa, error);
@@ -1223,7 +1088,7 @@ static int pager_req_map(ihk_os_t os, int fd, size_t len, off_t off,
uintptr_t phys; uintptr_t phys;
dprintk("pager_req_map(%p,%d,%lx,%lx,%lx)\n", os, fd, len, off, result_rpa); dprintk("pager_req_map(%p,%d,%lx,%lx,%lx)\n", os, fd, len, off, result_rpa);
pager = kzalloc(sizeof(*pager), GFP_KERNEL); pager = kzalloc(sizeof(*pager), GFP_ATOMIC);
if (!pager) { if (!pager) {
error = -ENOMEM; error = -ENOMEM;
printk("pager_req_map(%p,%d,%lx,%lx,%lx):kzalloc failed. %d\n", os, fd, len, off, result_rpa, error); printk("pager_req_map(%p,%d,%lx,%lx,%lx):kzalloc failed. %d\n", os, fd, len, off, result_rpa, error);
@@ -1475,11 +1340,31 @@ static long pager_call(ihk_os_t os, struct syscall_request *req)
return ret; return ret;
} }
static void __return_syscall(struct mcctrl_channel *c, int ret) void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet,
long ret, int stid)
{ {
c->param.response_va->ret = ret; unsigned long phys;
struct syscall_response *res;
phys = ihk_device_map_memory(ihk_os_to_dev(os),
packet->resp_pa, sizeof(*res));
res = ihk_device_map_virtual(ihk_os_to_dev(os),
phys, sizeof(*res), NULL, 0);
/* Map response structure and notify offloading thread */
res->ret = ret;
res->stid = stid;
if (__notify_syscall_requester(os, packet, res) < 0) {
printk("%s: WARNING: failed to notify PID %d\n",
__FUNCTION__, packet->pid);
}
mb(); mb();
c->param.response_va->status = 1; res->status = 1;
ihk_device_unmap_virtual(ihk_os_to_dev(os), res, sizeof(*res));
ihk_device_unmap_memory(ihk_os_to_dev(os), phys, sizeof(*res));
} }
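Because the response structure lives in McKernel-owned physical memory, every host-side access follows the same bracket: map the physical range, obtain a kernel virtual mapping, write, then unmap in reverse order. Isolated into a sketch that reuses the IHK calls shown above:

/* Sketch: write the return value of a remote syscall_response located
 * at physical address resp_pa, then tear the mappings down again. */
static void poke_response_sketch(ihk_os_t os, unsigned long resp_pa, long val)
{
	struct syscall_response *res;
	unsigned long phys;

	phys = ihk_device_map_memory(ihk_os_to_dev(os), resp_pa, sizeof(*res));
	res = ihk_device_map_virtual(ihk_os_to_dev(os), phys, sizeof(*res),
			NULL, 0);
	res->ret = val;
	ihk_device_unmap_virtual(ihk_os_to_dev(os), res, sizeof(*res));
	ihk_device_unmap_memory(ihk_os_to_dev(os), phys, sizeof(*res));
}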
static int remap_user_space(uintptr_t rva, size_t len, int prot) static int remap_user_space(uintptr_t rva, size_t len, int prot)
@@ -1668,13 +1553,14 @@ fail:
#define SCHED_CHECK_SAME_OWNER 0x01 #define SCHED_CHECK_SAME_OWNER 0x01
#define SCHED_CHECK_ROOT 0x02 #define SCHED_CHECK_ROOT 0x02
int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall_request *sc) int __do_in_kernel_syscall(ihk_os_t os, struct ikc_scd_packet *packet)
{ {
struct syscall_request *sc = &packet->req;
int error; int error;
long ret = -1; long ret = -1;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os); struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
dprintk("__do_in_kernel_syscall(%p,%p,%ld %lx)\n", os, c, sc->number, sc->args[0]); dprintk("%s: system call: %d\n", __FUNCTION__, sc->args[0]);
switch (sc->number) { switch (sc->number) {
case __NR_mmap: case __NR_mmap:
ret = pager_call(os, sc); ret = pager_call(os, sc);
@@ -1683,25 +1569,19 @@ int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall
case __NR_munmap: case __NR_munmap:
/* Set new remote page table if not zero */ /* Set new remote page table if not zero */
if (sc->args[2]) { if (sc->args[2]) {
unsigned long flags;
struct mcctrl_per_proc_data *ppd = NULL; struct mcctrl_per_proc_data *ppd = NULL;
ppd = kmalloc(sizeof(*ppd), GFP_ATOMIC); ppd = mcctrl_get_per_proc_data(usrdata, sc->args[3]);
if (!ppd) { if (unlikely(!ppd)) {
printk("ERROR: allocating per process data\n"); kprintf("%s: ERROR: no per-process structure for PID %d??\n",
error = -ENOMEM; __FUNCTION__, (int)sc->args[3]);
goto out; return -1;
} }
ppd->pid = task_tgid_vnr(current);
ppd->rpgtable = sc->args[2]; ppd->rpgtable = sc->args[2];
flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock); dprintk("%s: pid: %d, rpgtable: 0x%lx updated\n",
list_add_tail(&ppd->list, &usrdata->per_proc_list); __FUNCTION__, ppd->pid, ppd->rpgtable);
ihk_ikc_spinlock_unlock(&usrdata->per_proc_list_lock, flags);
dprintk("pid: %d, rpgtable: 0x%lx added\n",
ppd->pid, ppd->rpgtable);
} }
ret = clear_pte_range(sc->args[0], sc->args[1]); ret = clear_pte_range(sc->args[0], sc->args[1]);
@@ -1712,33 +1592,6 @@ int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall
break; break;
case __NR_exit_group: { case __NR_exit_group: {
unsigned long flags;
struct mcctrl_per_proc_data *ppd = NULL, *ppd_iter;
ppd = NULL;
flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock);
list_for_each_entry(ppd_iter, &usrdata->per_proc_list, list) {
if (ppd_iter->pid == task_tgid_vnr(current)) {
ppd = ppd_iter;
break;
}
}
if (ppd) {
list_del(&ppd->list);
dprintk("pid: %d, tid: %d: rpgtable for %d (0x%lx) removed\n",
task_tgid_vnr(current), current->pid, ppd->pid, ppd->rpgtable);
kfree(ppd);
}
else {
printk("WARNING: no per process data for pid %d ?\n",
task_tgid_vnr(current));
}
ihk_ikc_spinlock_unlock(&usrdata->per_proc_list_lock, flags);
/* Make sure the user space handler will be called as well */ /* Make sure the user space handler will be called as well */
error = -ENOSYS; error = -ENOSYS;
@@ -1821,10 +1674,11 @@ sched_setparam_out:
break; break;
} }
__return_syscall(c, ret); __return_syscall(os, packet, ret, 0);
error = 0; error = 0;
out: out:
dprintk("__do_in_kernel_syscall(%p,%p,%ld %lx): %d %ld\n", os, c, sc->number, sc->args[0], error, ret); dprintk("%s: system call: %d, error: %d, ret: %ld\n",
__FUNCTION__, sc->number, sc->args[0], error, ret);
return error; return error;
} }


@@ -14,6 +14,7 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/device.h> #include <linux/device.h>
#include <linux/version.h> #include <linux/version.h>
#include <linux/interrupt.h>
#include "mcctrl.h" #include "mcctrl.h"
#include "sysfs_msg.h" #include "sysfs_msg.h"


@@ -1,7 +1,3 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
KMODDIR=@KMODDIR@
src = @abs_srcdir@
ENABLE_MCOVERLAYFS=@ENABLE_MCOVERLAYFS@ ENABLE_MCOVERLAYFS=@ENABLE_MCOVERLAYFS@
RELEASE=$(shell uname -r) RELEASE=$(shell uname -r)
@@ -9,31 +5,36 @@ MAJOR=$(shell echo ${RELEASE} | sed -e 's/^\([0-9]*\).*/\1/')
MINOR=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/') MINOR=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/')
PATCH=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/') PATCH=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/')
LINUX_VERSION_CODE=$(shell expr \( ${MAJOR} \* 65536 \) + \( ${MINOR} \* 256 \) + ${PATCH}) LINUX_VERSION_CODE=$(shell expr \( ${MAJOR} \* 65536 \) + \( ${MINOR} \* 256 \) + ${PATCH})
RHEL_RELEASE=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/') RHEL_RELEASE_TMP=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/')
RHEL_RELEASE=$(shell if [ "${RELEASE}" == "${RHEL_RELEASE}" ]; then echo ""; else echo ${RHEL_RELEASE}; fi) RHEL_RELEASE=$(shell if [ "${RELEASE}" == "${RHEL_RELEASE_TMP}" ]; then echo ""; else echo ${RHEL_RELEASE_TMP}; fi)
BUILD_MODULE_TMP=$(shell if [ "${RHEL_RELEASE}" == "" ]; then echo "org"; else echo "rhel"; fi)
BUILD_MODULE=none
ifeq ($(ENABLE_MCOVERLAYFS),yes) ifeq ($(ENABLE_MCOVERLAYFS),yes)
ENABLE_BUILD=$(shell if ( [ ${LINUX_VERSION_CODE} -ge 262144 ] && [ ${LINUX_VERSION_CODE} -lt 262400 ] ); then echo "yes"; else echo "no"; fi) ifeq ($(BUILD_MODULE_TMP),org)
else ifeq ($(BUILD_MODULE),none)
ENABLE_BUILD=no BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -ge 262144 -a ${LINUX_VERSION_CODE} -lt 262400 ]; then echo "linux-4.0.9"; else echo "none"; fi)
endif
endif
ifeq ($(BUILD_MODULE_TMP),rhel)
ifeq ($(BUILD_MODULE),none)
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -eq 199168 -a ${RHEL_RELEASE} -eq 327 ]; then echo "linux-3.10.0-327.36.1.el7"; else echo "none"; fi)
endif
endif
endif endif
obj-m += mcoverlay.o
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o
.PHONY: clean install modules .PHONY: clean install modules
modules: modules:
ifeq ($(ENABLE_BUILD),yes) ifneq ($(BUILD_MODULE),none)
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules @(cd $(BUILD_MODULE); make modules)
endif endif
clean: clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp* @(cd linux-3.10.0-327.36.1.el7; make clean)
@(cd linux-4.0.9; make clean)
install: install:
ifeq ($(ENABLE_BUILD),yes) ifneq ($(BUILD_MODULE),none)
mkdir -p -m 755 $(KMODDIR) @(cd $(BUILD_MODULE); make install)
install -m 644 mcoverlay.ko $(KMODDIR)
endif endif


@@ -0,0 +1,21 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
KMODDIR = @KMODDIR@
src = @abs_srcdir@
obj-m += mcoverlay.o
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o
.PHONY: clean install modules
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcoverlay.ko $(KMODDIR)


@@ -0,0 +1,461 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/fdtable.h>
#include <linux/ratelimit.h>
#include "overlayfs.h"
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
static unsigned ovl_check_copy_up = 1;
module_param_named(check_copy_up, ovl_check_copy_up, uint,
S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(ovl_check_copy_up,
"Warn on copy-up when causing process also has a R/O fd open");
static int ovl_check_fd(const void *data, struct file *f, unsigned fd)
{
const struct dentry *dentry = data;
if (f->f_path.dentry == dentry)
pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
f, fd, current->pid, current->comm);
return 0;
}
/*
* Check the fds open by this process and warn if something like the following
* scenario is about to occur:
*
* fd1 = open("foo", O_RDONLY);
* fd2 = open("foo", O_RDWR);
*/
static void ovl_do_check_copy_up(struct dentry *dentry)
{
if (ovl_check_copy_up)
iterate_fd(current->files, 0, ovl_check_fd, dentry);
}
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
ssize_t list_size, size, value_size = 0;
char *buf, *name, *value = NULL;
int uninitialized_var(error);
if (!old->d_inode->i_op->getxattr ||
!new->d_inode->i_op->getxattr)
return 0;
list_size = vfs_listxattr(old, NULL, 0);
if (list_size <= 0) {
if (list_size == -EOPNOTSUPP)
return 0;
return list_size;
}
buf = kzalloc(list_size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
list_size = vfs_listxattr(old, buf, list_size);
if (list_size <= 0) {
error = list_size;
goto out;
}
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
retry:
size = vfs_getxattr(old, name, value, value_size);
if (size == -ERANGE)
size = vfs_getxattr(old, name, NULL, 0);
if (size < 0) {
error = size;
break;
}
if (size > value_size) {
void *new;
new = krealloc(value, size, GFP_KERNEL);
if (!new) {
error = -ENOMEM;
break;
}
value = new;
value_size = size;
goto retry;
}
error = vfs_setxattr(new, name, value, size, 0);
if (error)
break;
}
kfree(value);
out:
kfree(buf);
return error;
}
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
struct file *old_file;
struct file *new_file;
loff_t old_pos = 0;
loff_t new_pos = 0;
int error = 0;
if (len == 0)
return 0;
old_file = ovl_path_open(old, O_RDONLY);
if (IS_ERR(old_file))
return PTR_ERR(old_file);
new_file = ovl_path_open(new, O_WRONLY);
if (IS_ERR(new_file)) {
error = PTR_ERR(new_file);
goto out_fput;
}
/* FIXME: copy up sparse files efficiently */
while (len) {
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
long bytes;
if (len < this_len)
this_len = len;
if (signal_pending_state(TASK_KILLABLE, current)) {
error = -EINTR;
break;
}
bytes = do_splice_direct(old_file, &old_pos,
new_file, &new_pos,
this_len, SPLICE_F_MOVE);
if (bytes <= 0) {
error = bytes;
break;
}
WARN_ON(old_pos != new_pos);
len -= bytes;
}
fput(new_file);
out_fput:
fput(old_file);
return error;
}
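ovl_copy_up_data() moves the file in OVL_COPY_UP_CHUNK_SIZE (1 MiB) pieces so that a copy-up of a large file remains killable between chunks. The same loop shape as a plain userspace analog, with read/write standing in for do_splice_direct (illustrative only):

#include <unistd.h>

#define CHUNK (1 << 20)

/* Copy len bytes in CHUNK-sized pieces; a caller could test for
 * cancellation between iterations, as the kernel loop does with
 * signal_pending_state(TASK_KILLABLE, current). */
static int copy_chunked(int in_fd, int out_fd, long len)
{
	static char buf[CHUNK];

	while (len > 0) {
		long this_len = len < CHUNK ? len : CHUNK;
		ssize_t n = read(in_fd, buf, this_len);
		if (n <= 0)
			return n ? -1 : 0;	/* error or early EOF */
		if (write(out_fd, buf, n) != n)
			return -1;
		len -= n;
	}
	return 0;
}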
static char *ovl_read_symlink(struct dentry *realdentry)
{
int res;
char *buf;
struct inode *inode = realdentry->d_inode;
mm_segment_t old_fs;
res = -EINVAL;
if (!inode->i_op->readlink)
goto err;
res = -ENOMEM;
buf = (char *) __get_free_page(GFP_KERNEL);
if (!buf)
goto err;
old_fs = get_fs();
set_fs(get_ds());
/* The cast to a user pointer is valid due to the set_fs() */
res = inode->i_op->readlink(realdentry,
(char __user *)buf, PAGE_SIZE - 1);
set_fs(old_fs);
if (res < 0) {
free_page((unsigned long) buf);
goto err;
}
buf[res] = '\0';
return buf;
err:
return ERR_PTR(res);
}
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
.ia_valid =
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
return notify_change(upperdentry, &attr, NULL);
}
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
int err = 0;
if (!S_ISLNK(stat->mode)) {
struct iattr attr = {
.ia_valid = ATTR_MODE,
.ia_mode = stat->mode,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = stat->uid,
.ia_gid = stat->gid,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err)
ovl_set_timestamps(upperdentry, stat);
return err;
}
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
struct dentry *dentry, struct path *lowerpath,
struct kstat *stat, struct iattr *attr,
const char *link)
{
struct inode *wdir = workdir->d_inode;
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry = NULL;
struct dentry *upper = NULL;
umode_t mode = stat->mode;
int err;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out1;
/* Can't properly set mode on creation because of the umask */
stat->mode &= S_IFMT;
err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
stat->mode = mode;
if (err)
goto out2;
if (S_ISREG(stat->mode)) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
BUG_ON(upperpath.dentry != NULL);
upperpath.dentry = newdentry;
err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
if (err)
goto out_cleanup;
}
err = ovl_copy_xattr(lowerpath->dentry, newdentry);
if (err)
goto out_cleanup;
mutex_lock(&newdentry->d_inode->i_mutex);
err = ovl_set_attr(newdentry, stat);
if (!err && attr)
err = notify_change(newdentry, attr, NULL);
mutex_unlock(&newdentry->d_inode->i_mutex);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
ovl_dentry_update(dentry, newdentry);
newdentry = NULL;
/*
 * Non-directories become opaque when copied up.
*/
if (!S_ISDIR(stat->mode))
ovl_dentry_set_opaque(dentry, true);
out2:
dput(upper);
out1:
dput(newdentry);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out;
}
/*
* Copy up a single dentry
*
* Directory renames only allowed on "pure upper" (already created on
* upper filesystem, never copied up). Directories which are on lower or
* are merged may not be renamed. For these -EXDEV is returned and
* userspace has to deal with it. This means, when copying up a
* directory we can rely on it and ancestors being stable.
*
* Non-directory renames start with copy up of source if necessary. The
* actual rename will only proceed once the copy up was successful. Copy
* up uses upper parent i_mutex for exclusion. Since rename can change
* d_parent it is possible that the copy up will lock the old parent. At
* that point the file will have already been copied up anyway.
*/
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr)
{
struct dentry *workdir = ovl_workdir(dentry);
int err;
struct kstat pstat;
struct path parentpath;
struct dentry *upperdir;
struct dentry *upperdentry;
const struct cred *old_cred;
struct cred *override_cred;
char *link = NULL;
if (WARN_ON(!workdir))
return -EROFS;
ovl_do_check_copy_up(lowerpath->dentry);
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
err = vfs_getattr(&parentpath, &pstat);
if (err)
return err;
if (S_ISLNK(stat->mode)) {
link = ovl_read_symlink(lowerpath->dentry);
if (IS_ERR(link))
return PTR_ERR(link);
}
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_free_link;
override_cred->fsuid = stat->uid;
override_cred->fsgid = stat->gid;
/*
* CAP_SYS_ADMIN for copying up extended attributes
* CAP_DAC_OVERRIDE for create
* CAP_FOWNER for chmod, timestamp update
* CAP_FSETID for chmod
* CAP_CHOWN for chown
* CAP_MKNOD for mknod
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
cap_raise(override_cred->cap_effective, CAP_MKNOD);
old_cred = override_creds(override_cred);
err = -EIO;
if (lock_rename(workdir, upperdir) != NULL) {
pr_err("overlayfs: failed to lock workdir+upperdir\n");
goto out_unlock;
}
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
unlock_rename(workdir, upperdir);
err = 0;
/* Raced with another copy-up? Do the setattr here */
if (attr) {
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
}
goto out_put_cred;
}
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
stat, attr, link);
if (!err) {
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, &pstat);
}
out_unlock:
unlock_rename(workdir, upperdir);
out_put_cred:
revert_creds(old_cred);
put_cred(override_cred);
out_free_link:
if (link)
free_page((unsigned long) link);
return err;
}
int ovl_copy_up(struct dentry *dentry)
{
int err;
err = 0;
while (!err) {
struct dentry *next;
struct dentry *parent;
struct path lowerpath;
struct kstat stat;
enum ovl_path_type type = ovl_path_type(dentry);
if (OVL_TYPE_UPPER(type))
break;
next = dget(dentry);
/* find the topmost dentry not yet copied up */
for (;;) {
parent = dget_parent(next);
type = ovl_path_type(parent);
if (OVL_TYPE_UPPER(type))
break;
dput(next);
next = parent;
}
ovl_path_lower(next, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (!err)
err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
dput(parent);
dput(next);
}
return err;
}


@@ -0,0 +1,972 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
int err;
dget(wdentry);
if (S_ISDIR(wdentry->d_inode->i_mode))
err = ovl_do_rmdir(wdir, wdentry);
else
err = ovl_do_unlink(wdir, wdentry);
dput(wdentry);
if (err) {
pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
wdentry, err);
}
}
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
{
struct dentry *temp;
char name[20];
snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
temp = lookup_one_len(name, workdir, strlen(name));
if (!IS_ERR(temp) && temp->d_inode) {
pr_err("overlayfs: workdir/%s already exists\n", name);
dput(temp);
temp = ERR_PTR(-EIO);
}
return temp;
}
/* caller holds i_mutex on workdir */
static struct dentry *ovl_whiteout(struct dentry *workdir,
struct dentry *dentry)
{
int err;
struct dentry *whiteout;
struct inode *wdir = workdir->d_inode;
whiteout = ovl_lookup_temp(workdir, dentry);
if (IS_ERR(whiteout))
return whiteout;
err = ovl_do_whiteout(wdir, whiteout);
if (err) {
dput(whiteout);
whiteout = ERR_PTR(err);
}
return whiteout;
}
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug)
{
int err;
if (newdentry->d_inode)
return -ESTALE;
if (hardlink) {
err = ovl_do_link(hardlink, dir, newdentry, debug);
} else {
switch (stat->mode & S_IFMT) {
case S_IFREG:
err = ovl_do_create(dir, newdentry, stat->mode, debug);
break;
case S_IFDIR:
err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
break;
case S_IFCHR:
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
err = ovl_do_mknod(dir, newdentry,
stat->mode, stat->rdev, debug);
break;
case S_IFLNK:
err = ovl_do_symlink(dir, newdentry, link, debug);
break;
default:
err = -EPERM;
}
}
if (!err && WARN_ON(!newdentry->d_inode)) {
/*
* Not quite sure if non-instantiated dentry is legal or not.
* VFS doesn't seem to care so check and warn here.
*/
err = -ENOENT;
}
return err;
}
static int ovl_set_opaque(struct dentry *upperdentry)
{
return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
}
static void ovl_remove_opaque(struct dentry *upperdentry)
{
int err;
err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
if (err) {
pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
upperdentry->d_name.name, err);
}
}
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
int err;
enum ovl_path_type type;
struct path realpath;
type = ovl_path_real(dentry, &realpath);
err = vfs_getattr(&realpath, stat);
if (err)
return err;
stat->dev = dentry->d_sb->s_dev;
stat->ino = dentry->d_inode->i_ino;
/*
* It's probably not worth it to count subdirs to get the
* correct link count. nlink=1 seems to pacify 'find' and
* other utilities.
*/
if (OVL_TYPE_MERGE(type))
stat->nlink = 1;
return 0;
}
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry;
int err;
mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
if (err)
goto out_dput;
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput:
dput(newdentry);
out_unlock:
mutex_unlock(&udir->i_mutex);
return err;
}
static int ovl_lock_rename_workdir(struct dentry *workdir,
struct dentry *upperdir)
{
/* Workdir should not be the same as upperdir */
if (workdir == upperdir)
goto err;
/* Workdir should not be subdir of upperdir and vice versa */
if (lock_rename(workdir, upperdir) != NULL)
goto err_unlock;
return 0;
err_unlock:
unlock_rename(workdir, upperdir);
err:
pr_err("overlayfs: failed to lock workdir+upperdir\n");
return -EIO;
}
static struct dentry *ovl_clear_empty(struct dentry *dentry,
struct list_head *list)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct path upperpath;
struct dentry *upper;
struct dentry *opaquedir;
struct kstat stat;
int err;
if (WARN_ON(!workdir))
return ERR_PTR(-EROFS);
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
ovl_path_upper(dentry, &upperpath);
err = vfs_getattr(&upperpath, &stat);
if (err)
goto out_unlock;
err = -ESTALE;
if (!S_ISDIR(stat.mode))
goto out_unlock;
upper = upperpath.dentry;
if (upper->d_parent->d_inode != udir)
goto out_unlock;
opaquedir = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out_unlock;
err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
if (err)
goto out_dput;
err = ovl_copy_xattr(upper, opaquedir);
if (err)
goto out_cleanup;
err = ovl_set_opaque(opaquedir);
if (err)
goto out_cleanup;
mutex_lock(&opaquedir->d_inode->i_mutex);
err = ovl_set_attr(opaquedir, &stat);
mutex_unlock(&opaquedir->d_inode->i_mutex);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup_whiteouts(upper, list);
ovl_cleanup(wdir, upper);
unlock_rename(workdir, upperdir);
/* dentry's upper doesn't match now, get rid of it */
d_drop(dentry);
return opaquedir;
out_cleanup:
ovl_cleanup(wdir, opaquedir);
out_dput:
dput(opaquedir);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return ERR_PTR(err);
}
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
int err;
struct dentry *ret = NULL;
LIST_HEAD(list);
err = ovl_check_empty_dir(dentry, &list);
if (err)
ret = ERR_PTR(err);
else {
/*
* If no upperdentry then skip clearing whiteouts.
*
* Can race with copy-up, since we don't hold the upperdir
* mutex. Doesn't matter, since copy-up can't create a
* non-empty directory from an empty one.
*/
if (ovl_dentry_upper(dentry))
ret = ovl_clear_empty(dentry, &list);
}
ovl_cache_free(&list);
return ret;
}
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *upper;
struct dentry *newdentry;
int err;
if (WARN_ON(!workdir))
return -EROFS;
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_dput;
err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
if (err)
goto out_dput2;
if (S_ISDIR(stat->mode)) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper,
RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup(wdir, upper);
} else {
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
}
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput2:
dput(upper);
out_dput:
dput(newdentry);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out_dput2;
}
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
const char *link, struct dentry *hardlink)
{
int err;
struct inode *inode;
struct kstat stat = {
.mode = mode,
.rdev = rdev,
};
err = -ENOMEM;
inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
if (!inode)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_iput;
if (!ovl_dentry_is_opaque(dentry)) {
err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_iput;
/*
* CAP_SYS_ADMIN for setting opaque xattr
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
old_cred = override_creds(override_cred);
err = ovl_create_over_whiteout(dentry, inode, &stat, link,
hardlink);
revert_creds(old_cred);
put_cred(override_cred);
}
if (!err)
inode = NULL;
out_iput:
iput(inode);
out:
return err;
}
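/*
 * Editorial outline of the credential-override pattern used above (the
 * calls are the kernel cred API exactly as this function invokes them):
 *
 *   override_cred = prepare_creds();            // private copy of creds
 *   cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
 *   old_cred = override_creds(override_cred);   // install copy, save old
 *   ... privileged work: create in workdir, rename, set opaque xattr ...
 *   revert_creds(old_cred);                     // restore original creds
 *   put_cred(override_cred);                    // drop our reference
 */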
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
const char *link)
{
int err;
err = ovl_want_write(dentry);
if (!err) {
err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
ovl_drop_write(dentry);
}
return err;
}
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
}
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
}
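/*
 * Editorial sketch (hypothetical userspace program, not part of this
 * file): how the "(mode & 07777) | S_IFREG/S_IFDIR" composition used by
 * ovl_create() and ovl_mkdir() above splits into permission and type bits.
 */
#include <stdio.h>
#include <sys/stat.h>
int main(void)
{
	mode_t req = 04644;                   /* caller-supplied mode */
	mode_t reg = (req & 07777) | S_IFREG; /* what ovl_create() builds */
	printf("type bits: 0%o\n", (unsigned)(reg & S_IFMT)); /* 0100000 */
	printf("perm bits: 0%o\n", (unsigned)(reg & 07777));  /* 04644 */
	printf("S_ISREG:   %d\n", S_ISREG(reg));              /* 1 */
	return 0;
}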
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
/* Don't allow creation of "whiteout" on overlay */
if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
return -EPERM;
return ovl_create_object(dentry, mode, rdev, NULL);
}
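/*
 * Editorial sketch (hypothetical userspace program): the WHITEOUT_DEV
 * check above exists because a whiteout is encoded as a character device
 * with device number 0:0, which is also what ovl_is_whiteout() tests on
 * the upper layer.
 */
#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
static int looks_like_whiteout(const char *path)
{
	struct stat st;
	if (lstat(path, &st) != 0)
		return 0;
	return S_ISCHR(st.st_mode) &&
	       major(st.st_rdev) == 0 && minor(st.st_rdev) == 0;
}
int main(int argc, char **argv)
{
	if (argc > 1)
		printf("%s: %swhiteout\n", argv[1],
		       looks_like_whiteout(argv[1]) ? "" : "not a ");
	return 0;
}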
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
const char *link)
{
return ovl_create_object(dentry, S_IFLNK, 0, link);
}
static int ovl_link(struct dentry *old, struct inode *newdir,
struct dentry *new)
{
int err;
struct dentry *upper;
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
upper = ovl_dentry_upper(old);
err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
out_drop_write:
ovl_drop_write(old);
out:
return err;
}
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *whiteout;
struct dentry *upper;
struct dentry *opaquedir = NULL;
int err;
int flags = 0;
if (WARN_ON(!workdir))
return -EROFS;
if (is_dir) {
if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
opaquedir = ovl_check_empty_and_clear(dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out;
} else {
LIST_HEAD(list);
/*
* When removing an empty opaque directory, it makes no sense
* to replace it with an exact replica of itself. But emptiness
* still needs to be checked.
*/
err = ovl_check_empty_dir(dentry, &list);
ovl_cache_free(&list);
if (err)
goto out;
}
}
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out_dput;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
err = -ESTALE;
if ((opaquedir && upper != opaquedir) ||
(!opaquedir && ovl_dentry_upper(dentry) &&
upper != ovl_dentry_upper(dentry))) {
goto out_dput_upper;
}
whiteout = ovl_whiteout(workdir, dentry);
err = PTR_ERR(whiteout);
if (IS_ERR(whiteout))
goto out_dput_upper;
if (d_is_dir(upper))
flags = RENAME_EXCHANGE;
err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
if (err)
goto kill_whiteout;
if (flags)
ovl_cleanup(wdir, upper);
ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
d_drop(dentry);
dput(whiteout);
out_dput_upper:
dput(upper);
out_unlock:
unlock_rename(workdir, upperdir);
out_dput:
dput(opaquedir);
out:
return err;
kill_whiteout:
ovl_cleanup(wdir, whiteout);
goto out_d_drop;
}
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *dir = upperdir->d_inode;
struct dentry *upper;
int err;
mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
err = -ESTALE;
if (upper == ovl_dentry_upper(dentry)) {
if (is_dir)
err = vfs_rmdir(dir, upper);
else
err = vfs_unlink(dir, upper, NULL);
ovl_dentry_version_inc(dentry->d_parent);
}
dput(upper);
/*
* Keeping this dentry hashed would mean having to release
* upperpath/lowerpath, which could only be done if we are the
* sole user of this dentry. Too tricky... Just unhash for
* now.
*/
if (!err)
d_drop(dentry);
out_unlock:
mutex_unlock(&dir->i_mutex);
return err;
}
static inline int ovl_check_sticky(struct dentry *dentry)
{
struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
struct inode *inode = ovl_dentry_real(dentry)->d_inode;
if (check_sticky(dir, inode))
return -EPERM;
return 0;
}
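/*
 * Editorial sketch (hypothetical userspace program): the sticky rule
 * enforced above is the usual S_ISVTX semantics -- in a sticky directory
 * only the entry's owner, the directory's owner, or a CAP_FOWNER holder
 * may remove an entry.
 */
#include <stdio.h>
#include <sys/stat.h>
int main(void)
{
	struct stat st;
	if (stat("/tmp", &st) == 0)
		printf("/tmp sticky: %d\n", (st.st_mode & S_ISVTX) != 0);
	return 0;
}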
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
enum ovl_path_type type;
int err;
err = ovl_check_sticky(dentry);
if (err)
goto out;
err = ovl_want_write(dentry);
if (err)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_drop_write;
type = ovl_path_type(dentry);
if (OVL_TYPE_PURE_UPPER(type)) {
err = ovl_remove_upper(dentry, is_dir);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
err = ovl_remove_and_whiteout(dentry, is_dir);
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, false);
}
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, true);
}
static int ovl_rename2(struct inode *olddir, struct dentry *old,
struct inode *newdir, struct dentry *new,
unsigned int flags)
{
int err;
enum ovl_path_type old_type;
enum ovl_path_type new_type;
struct dentry *old_upperdir;
struct dentry *new_upperdir;
struct dentry *olddentry;
struct dentry *newdentry;
struct dentry *trap;
bool old_opaque;
bool new_opaque;
bool new_create = false;
bool cleanup_whiteout = false;
bool overwrite = !(flags & RENAME_EXCHANGE);
bool is_dir = S_ISDIR(old->d_inode->i_mode);
bool new_is_dir = false;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
struct cred *override_cred = NULL;
err = -EINVAL;
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
goto out;
flags &= ~RENAME_NOREPLACE;
err = ovl_check_sticky(old);
if (err)
goto out;
/* Don't copy up directory trees */
old_type = ovl_path_type(old);
err = -EXDEV;
if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
goto out;
if (new->d_inode) {
err = ovl_check_sticky(new);
if (err)
goto out;
if (S_ISDIR(new->d_inode->i_mode))
new_is_dir = true;
new_type = ovl_path_type(new);
err = -EXDEV;
if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
goto out;
err = 0;
if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_lower(old)->d_inode ==
ovl_dentry_lower(new)->d_inode)
goto out;
}
if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_upper(old)->d_inode ==
ovl_dentry_upper(new)->d_inode)
goto out;
}
} else {
if (ovl_dentry_is_opaque(new))
new_type = __OVL_PATH_UPPER;
else
new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
}
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
err = ovl_copy_up(new->d_parent);
if (err)
goto out_drop_write;
if (!overwrite) {
err = ovl_copy_up(new);
if (err)
goto out_drop_write;
}
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
if (old_opaque || new_opaque) {
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
}
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
opaquedir = ovl_check_empty_and_clear(new);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir)) {
opaquedir = NULL;
goto out_revert_creds;
}
}
if (overwrite) {
if (old_opaque) {
if (new->d_inode || !new_opaque) {
/* Whiteout source */
flags |= RENAME_WHITEOUT;
} else {
/* Switch whiteouts */
flags |= RENAME_EXCHANGE;
}
} else if (is_dir && !new->d_inode && new_opaque) {
flags |= RENAME_EXCHANGE;
cleanup_whiteout = true;
}
}
old_upperdir = ovl_dentry_upper(old->d_parent);
new_upperdir = ovl_dentry_upper(new->d_parent);
trap = lock_rename(new_upperdir, old_upperdir);
olddentry = lookup_one_len(old->d_name.name, old_upperdir,
old->d_name.len);
err = PTR_ERR(olddentry);
if (IS_ERR(olddentry))
goto out_unlock;
err = -ESTALE;
if (olddentry != ovl_dentry_upper(old))
goto out_dput_old;
newdentry = lookup_one_len(new->d_name.name, new_upperdir,
new->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_dput_old;
err = -ESTALE;
if (ovl_dentry_upper(new)) {
if (opaquedir) {
if (newdentry != opaquedir)
goto out_dput;
} else {
if (newdentry != ovl_dentry_upper(new))
goto out_dput;
}
} else {
new_create = true;
if (!d_is_negative(newdentry) &&
(!new_opaque || !ovl_is_whiteout(newdentry)))
goto out_dput;
}
if (olddentry == trap)
goto out_dput;
if (newdentry == trap)
goto out_dput;
if (is_dir && !old_opaque && new_opaque) {
err = ovl_set_opaque(olddentry);
if (err)
goto out_dput;
}
if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_dput;
}
if (old_opaque || new_opaque) {
err = ovl_do_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
flags);
} else {
/* No debug for the plain case */
BUG_ON(flags & ~RENAME_EXCHANGE);
err = vfs_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
NULL, flags);
}
if (err) {
if (is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(newdentry);
goto out_dput;
}
if (is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
ovl_dentry_set_opaque(new, old_opaque);
}
if (cleanup_whiteout)
ovl_cleanup(old_upperdir->d_inode, newdentry);
ovl_dentry_version_inc(old->d_parent);
ovl_dentry_version_inc(new->d_parent);
out_dput:
dput(newdentry);
out_dput_old:
dput(olddentry);
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
if (old_opaque || new_opaque) {
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(old);
out:
dput(opaquedir);
return err;
}
static int ovl_rename(struct inode *olddir, struct dentry *old,
struct inode *newdir, struct dentry *new)
{
return ovl_rename2(olddir, old, newdir, new, 0);
}
const struct inode_operations_wrapper ovl_dir_inode_operations = {
.ops = {
.lookup = ovl_lookup,
.mkdir = ovl_mkdir,
.symlink = ovl_symlink,
.unlink = ovl_unlink,
.rmdir = ovl_rmdir,
.rename = ovl_rename,
.link = ovl_link,
.setattr = ovl_setattr,
.create = ovl_create,
.mknod = ovl_mknod,
.permission = ovl_permission,
.getattr = ovl_dir_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
},
.rename2 = ovl_rename2,
};


@@ -0,0 +1,442 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include "overlayfs.h"
static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
bool no_data)
{
int err;
struct dentry *parent;
struct kstat stat;
struct path lowerpath;
parent = dget_parent(dentry);
err = ovl_copy_up(parent);
if (err)
goto out_dput_parent;
ovl_path_lower(dentry, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (err)
goto out_dput_parent;
if (no_data)
stat.size = 0;
err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
out_dput_parent:
dput(parent);
return err;
}
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
int err;
struct dentry *upperdentry;
err = ovl_want_write(dentry);
if (err)
goto out;
err = ovl_copy_up(dentry);
if (!err) {
upperdentry = ovl_dentry_upper(dentry);
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
}
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct path realpath;
ovl_path_real(dentry, &realpath);
return vfs_getattr(&realpath, stat);
}
int ovl_permission(struct inode *inode, int mask)
{
struct ovl_entry *oe;
struct dentry *alias = NULL;
struct inode *realinode;
struct dentry *realdentry;
bool is_upper;
int err;
if (S_ISDIR(inode->i_mode)) {
oe = inode->i_private;
} else if (mask & MAY_NOT_BLOCK) {
return -ECHILD;
} else {
/*
* For non-directories find an alias and get the info
* from there.
*/
alias = d_find_any_alias(inode);
if (WARN_ON(!alias))
return -ENOENT;
oe = alias->d_fsdata;
}
realdentry = ovl_entry_real(oe, &is_upper);
/* Careful in RCU walk mode */
realinode = ACCESS_ONCE(realdentry->d_inode);
if (!realinode) {
WARN_ON(!(mask & MAY_NOT_BLOCK));
err = -ENOENT;
goto out_dput;
}
if (mask & MAY_WRITE) {
umode_t mode = realinode->i_mode;
/*
* Writes will always be redirected to upper layer, so
* ignore lower layer being read-only.
*
* If the overlay itself is read-only then proceed
* with the permission check, don't return EROFS.
* This will only happen if this is the lower layer of
* another overlayfs.
*
* If upper fs becomes read-only after the overlay was
* constructed return EROFS to prevent modification of
* upper layer.
*/
err = -EROFS;
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
goto out_dput;
}
err = __inode_permission(realinode, mask);
out_dput:
dput(alias);
return err;
}
struct ovl_link_data {
struct dentry *realdentry;
void *cookie;
};
static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
{
void *ret;
struct dentry *realdentry;
struct inode *realinode;
struct ovl_link_data *data = NULL;
realdentry = ovl_dentry_real(dentry);
realinode = realdentry->d_inode;
if (WARN_ON(!realinode->i_op->follow_link))
return ERR_PTR(-EPERM);
if (realinode->i_op->put_link) {
data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
if (!data)
return ERR_PTR(-ENOMEM);
data->realdentry = realdentry;
}
ret = realinode->i_op->follow_link(realdentry, nd);
if (IS_ERR(ret)) {
kfree(data);
return ret;
}
if (data)
data->cookie = ret;
return data;
}
static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
{
struct inode *realinode;
struct ovl_link_data *data = c;
if (!data)
return;
realinode = data->realdentry->d_inode;
realinode->i_op->put_link(data->realdentry, nd, data->cookie);
kfree(data);
}
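/*
 * Editorial outline of the cookie hand-off between the two functions
 * above: ovl_follow_link() stashes the real filesystem's cookie (plus
 * the real dentry) in an ovl_link_data, and ovl_put_link() later
 * forwards both to the real put_link():
 *
 *   data->realdentry = realdentry;
 *   data->cookie     = realinode->i_op->follow_link(realdentry, nd);
 *   ...
 *   realinode->i_op->put_link(data->realdentry, nd, data->cookie);
 */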
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
struct path realpath;
struct inode *realinode;
ovl_path_real(dentry, &realpath);
realinode = realpath.dentry->d_inode;
if (!realinode->i_op->readlink)
return -EINVAL;
touch_atime(&realpath);
return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
}
static bool ovl_is_private_xattr(const char *name)
{
return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
}
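/*
 * Editorial sketch (hypothetical userspace program): the prefix match
 * above, spelled out with the OVL_XATTR_PRE_* values from overlayfs.h
 * ("trusted.overlay." is exactly 16 bytes, hence OVL_XATTR_PRE_LEN).
 */
#include <stdio.h>
#include <string.h>
#define PRE "trusted.overlay."
static int is_private(const char *name)
{
	return strncmp(name, PRE, sizeof(PRE) - 1) == 0;
}
int main(void)
{
	printf("%d\n", is_private("trusted.overlay.opaque")); /* 1 */
	printf("%d\n", is_private("user.comment"));           /* 0 */
	return 0;
}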
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err;
struct dentry *upperdentry;
err = ovl_want_write(dentry);
if (err)
goto out;
err = -EPERM;
if (ovl_is_private_xattr(name))
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
upperdentry = ovl_dentry_upper(dentry);
err = vfs_setxattr(upperdentry, name, value, size, flags);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static bool ovl_need_xattr_filter(struct dentry *dentry,
enum ovl_path_type type)
{
if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
return S_ISDIR(dentry->d_inode->i_mode);
else
return false;
}
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
return -ENODATA;
return vfs_getxattr(realpath.dentry, name, value, size);
}
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
ssize_t res;
int off;
res = vfs_listxattr(realpath.dentry, list, size);
if (res <= 0 || size == 0)
return res;
if (!ovl_need_xattr_filter(dentry, type))
return res;
/* filter out private xattrs */
for (off = 0; off < res;) {
char *s = list + off;
size_t slen = strlen(s) + 1;
BUG_ON(off + slen > res);
if (ovl_is_private_xattr(s)) {
res -= slen;
memmove(s, s + slen, res - off);
} else {
off += slen;
}
}
return res;
}
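/*
 * Editorial sketch (hypothetical userspace program): the in-place
 * squeeze performed by ovl_listxattr() above -- the xattr list is a run
 * of NUL-terminated names, and private entries are removed with
 * memmove() so the remaining names stay contiguous.
 */
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
static ssize_t filter_private(char *list, ssize_t res)
{
	ssize_t off;
	for (off = 0; off < res; ) {
		char *s = list + off;
		size_t slen = strlen(s) + 1;
		if (strncmp(s, "trusted.overlay.", 16) == 0) {
			res -= slen;
			memmove(s, s + slen, res - off);
		} else {
			off += slen;
		}
	}
	return res;
}
int main(void)
{
	char list[] = "user.a\0trusted.overlay.opaque\0user.b";
	ssize_t res = filter_private(list, sizeof(list));
	ssize_t off;
	for (off = 0; off < res; off += strlen(list + off) + 1)
		printf("%s\n", list + off); /* user.a, user.b */
	return 0;
}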
int ovl_removexattr(struct dentry *dentry, const char *name)
{
int err;
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
err = ovl_want_write(dentry);
if (err)
goto out;
err = -ENODATA;
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
goto out_drop_write;
if (!OVL_TYPE_UPPER(type)) {
err = vfs_getxattr(realpath.dentry, name, NULL, 0);
if (err < 0)
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
struct dentry *realdentry)
{
if (OVL_TYPE_UPPER(type))
return false;
if (special_file(realdentry->d_inode->i_mode))
return false;
if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
return false;
return true;
}
static int ovl_dentry_open(struct dentry *dentry, struct file *file,
const struct cred *cred)
{
int err;
struct path realpath;
enum ovl_path_type type;
bool want_write = false;
type = ovl_path_real(dentry, &realpath);
if (!ovl_is_nocopyupw(dentry)) {
if (ovl_open_need_copy_up(file->f_flags, type,
realpath.dentry)) {
want_write = true;
err = ovl_want_write(dentry);
if (err)
goto out;
if (file->f_flags & O_TRUNC)
err = ovl_copy_up_last(dentry, NULL, true);
else
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
}
err = vfs_open(&realpath, file, cred);
out_drop_write:
if (want_write)
ovl_drop_write(dentry);
out:
return err;
}
static const struct inode_operations_wrapper ovl_file_inode_operations = {
.ops = {
.setattr = ovl_setattr,
.permission = ovl_permission,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
},
.dentry_open = ovl_dentry_open,
};
static const struct inode_operations ovl_symlink_inode_operations = {
.setattr = ovl_setattr,
.follow_link = ovl_follow_link,
.put_link = ovl_put_link,
.readlink = ovl_readlink,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe)
{
struct inode *inode;
inode = new_inode(sb);
if (!inode)
return NULL;
mode &= S_IFMT;
inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_flags |= S_NOATIME | S_NOCMTIME;
switch (mode) {
case S_IFDIR:
inode->i_private = oe;
inode->i_op = &ovl_dir_inode_operations.ops;
inode->i_fop = &ovl_dir_operations;
inode->i_flags |= S_IOPS_WRAPPER;
break;
case S_IFLNK:
inode->i_op = &ovl_symlink_inode_operations;
break;
case S_IFREG:
case S_IFSOCK:
case S_IFBLK:
case S_IFCHR:
case S_IFIFO:
inode->i_op = &ovl_file_inode_operations.ops;
inode->i_flags |= S_IOPS_WRAPPER;
break;
default:
WARN(1, "illegal file type: %i\n", mode);
iput(inode);
inode = NULL;
}
return inode;
}


@@ -0,0 +1,200 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/kernel.h>
struct ovl_entry;
enum ovl_path_type {
__OVL_PATH_PURE = (1 << 0),
__OVL_PATH_UPPER = (1 << 1),
__OVL_PATH_MERGE = (1 << 2),
};
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
#define OVL_TYPE_MERGE_OR_LOWER(type) \
(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
#define OVL_XATTR_PRE_NAME "trusted.overlay."
#define OVL_XATTR_PRE_LEN 16
#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
int err = vfs_rmdir(dir, dentry);
pr_debug("rmdir(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
int err = vfs_unlink(dir, dentry, NULL);
pr_debug("unlink(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *new_dentry, bool debug)
{
int err = vfs_link(old_dentry, dir, new_dentry, NULL);
if (debug) {
pr_debug("link(%pd2, %pd2) = %i\n",
old_dentry, new_dentry, err);
}
return err;
}
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_create(dir, dentry, mode, true);
if (debug)
pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_mkdir(dir, dentry, mode);
if (debug)
pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t dev, bool debug)
{
int err = vfs_mknod(dir, dentry, mode, dev);
if (debug) {
pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
dentry, mode, dev, err);
}
return err;
}
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
const char *oldname, bool debug)
{
int err = vfs_symlink(dir, dentry, oldname);
if (debug)
pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
return err;
}
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err = vfs_setxattr(dentry, name, value, size, flags);
pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
dentry, name, (int) size, (char *) value, flags, err);
return err;
}
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
int err = vfs_removexattr(dentry, name);
pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
return err;
}
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
struct inode *newdir, struct dentry *newdentry,
unsigned int flags)
{
int err;
pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
olddentry, newdentry, flags);
err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
if (err) {
pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
olddentry, newdentry, err);
}
return err;
}
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
int err = vfs_whiteout(dir, dentry);
pr_debug("whiteout(%pd2) = %i\n", dentry, err);
return err;
}
bool ovl_is_nocopyupw(struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
struct file *ovl_path_open(struct path *path, int flags);
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
struct kstat *stat, const char *link);
/* readdir.c */
extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe);
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
to->i_uid = from->i_uid;
to->i_gid = from->i_gid;
}
/* dir.c */
extern const struct inode_operations_wrapper ovl_dir_inode_operations;
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug);
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);


@@ -0,0 +1,588 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
struct ovl_cache_entry {
unsigned int len;
unsigned int type;
u64 ino;
struct list_head l_node;
struct rb_node node;
struct ovl_cache_entry *next_maybe_whiteout;
bool is_whiteout;
char name[];
};
struct ovl_dir_cache {
long refcount;
u64 version;
struct list_head entries;
};
struct dir_context {
const filldir_t actor;
//loff_t pos;
};
struct ovl_readdir_data {
struct dir_context ctx;
bool is_merge;
struct rb_root root;
struct list_head *list;
struct list_head middle;
struct ovl_cache_entry *first_maybe_whiteout;
int count;
int err;
};
struct ovl_dir_file {
bool is_real;
bool is_upper;
struct ovl_dir_cache *cache;
struct list_head *cursor;
struct file *realfile;
struct file *upperfile;
};
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
return container_of(n, struct ovl_cache_entry, node);
}
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
const char *name, int len)
{
struct rb_node *node = root->rb_node;
int cmp;
while (node) {
struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
cmp = strncmp(name, p->name, len);
if (cmp > 0)
node = p->node.rb_right;
else if (cmp < 0 || len < p->len)
node = p->node.rb_left;
else
return p;
}
return NULL;
}
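/*
 * Editorial note: the comparisons above and in ovl_cache_entry_add_rb()
 * order names by strncmp() over the probe's length, with a shorter name
 * sorting before any longer entry it is a prefix of. As a hypothetical
 * standalone comparator:
 */
static inline int ovl_cache_cmp_sketch(const char *name, int len,
				       const char *ename, int elen)
{
	int cmp = strncmp(name, ename, len);
	if (cmp > 0)
		return 1;	/* descend right */
	if (cmp < 0 || len < elen)
		return -1;	/* descend left */
	return 0;		/* match */
}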
static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
const char *name, int len,
u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
p = kmalloc(size, GFP_KERNEL);
if (!p)
return NULL;
memcpy(p->name, name, len);
p->name[len] = '\0';
p->len = len;
p->type = d_type;
p->ino = ino;
p->is_whiteout = false;
if (d_type == DT_CHR) {
p->next_maybe_whiteout = rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p;
}
return p;
}
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
const char *name, int len, u64 ino,
unsigned int d_type)
{
struct rb_node **newp = &rdd->root.rb_node;
struct rb_node *parent = NULL;
struct ovl_cache_entry *p;
while (*newp) {
int cmp;
struct ovl_cache_entry *tmp;
parent = *newp;
tmp = ovl_cache_entry_from_node(*newp);
cmp = strncmp(name, tmp->name, len);
if (cmp > 0)
newp = &tmp->node.rb_right;
else if (cmp < 0 || len < tmp->len)
newp = &tmp->node.rb_left;
else
return 0;
}
p = ovl_cache_entry_new(rdd, name, len, ino, d_type);
if (p == NULL)
return -ENOMEM;
list_add_tail(&p->l_node, rdd->list);
rb_link_node(&p->node, parent, newp);
rb_insert_color(&p->node, &rdd->root);
return 0;
}
static int ovl_fill_lower(struct ovl_readdir_data *rdd,
const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
p = ovl_cache_entry_find(&rdd->root, name, namelen);
if (p) {
list_move_tail(&p->l_node, &rdd->middle);
} else {
p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
if (p == NULL)
rdd->err = -ENOMEM;
else
list_add_tail(&p->l_node, &rdd->middle);
}
return rdd->err;
}
void ovl_cache_free(struct list_head *list)
{
struct ovl_cache_entry *p;
struct ovl_cache_entry *n;
list_for_each_entry_safe(p, n, list, l_node)
kfree(p);
INIT_LIST_HEAD(list);
}
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
struct ovl_dir_cache *cache = od->cache;
WARN_ON(cache->refcount <= 0);
cache->refcount--;
if (!cache->refcount) {
if (ovl_dir_cache(dentry) == cache)
ovl_set_dir_cache(dentry, NULL);
ovl_cache_free(&cache->entries);
kfree(cache);
}
}
static int ovl_fill_merge(void *buf, const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct dir_context *ctx = buf;
struct ovl_readdir_data *rdd =
container_of(ctx, struct ovl_readdir_data, ctx);
rdd->count++;
if (!rdd->is_merge)
return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
else
return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
}
static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
{
int err;
struct ovl_cache_entry *p;
struct dentry *dentry;
const struct cred *old_cred;
struct cred *override_cred;
override_cred = prepare_creds();
if (!override_cred)
return -ENOMEM;
/*
* CAP_DAC_OVERRIDE for lookup
*/
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
old_cred = override_creds(override_cred);
err = mutex_lock_killable(&dir->d_inode->i_mutex);
if (!err) {
while (rdd->first_maybe_whiteout) {
p = rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p->next_maybe_whiteout;
dentry = lookup_one_len(p->name, dir, p->len);
if (!IS_ERR(dentry)) {
p->is_whiteout = ovl_is_whiteout(dentry);
dput(dentry);
}
}
mutex_unlock(&dir->d_inode->i_mutex);
}
revert_creds(old_cred);
put_cred(override_cred);
return err;
}
static inline int ovl_dir_read(struct path *realpath,
struct ovl_readdir_data *rdd)
{
struct file *realfile;
int err;
realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
if (IS_ERR(realfile))
return PTR_ERR(realfile);
rdd->first_maybe_whiteout = NULL;
//rdd->ctx.pos = 0;
do {
rdd->count = 0;
rdd->err = 0;
err = vfs_readdir(realfile, rdd->ctx.actor, rdd);
if (err >= 0)
err = rdd->err;
} while (!err && rdd->count);
if (!err && rdd->first_maybe_whiteout)
err = ovl_check_whiteouts(realpath->dentry, rdd);
fput(realfile);
return err;
}
static void ovl_dir_reset(struct file *file)
{
struct ovl_dir_file *od = file->private_data;
struct ovl_dir_cache *cache = od->cache;
struct dentry *dentry = file->f_path.dentry;
enum ovl_path_type type = ovl_path_type(dentry);
if (cache && ovl_dentry_version_get(dentry) != cache->version) {
ovl_cache_put(od, dentry);
od->cache = NULL;
od->cursor = NULL;
}
WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
if (od->is_real && OVL_TYPE_MERGE(type))
od->is_real = false;
}
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
int err;
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
.list = list,
.root = RB_ROOT,
.is_merge = false,
};
int idx, next;
for (idx = 0; idx != -1; idx = next) {
next = ovl_path_next(idx, dentry, &realpath);
if (next != -1) {
err = ovl_dir_read(&realpath, &rdd);
if (err)
break;
} else {
/*
* Insert lowest layer entries before upper ones; this keeps
* offsets reasonably constant.
*/
list_add(&rdd.middle, rdd.list);
rdd.is_merge = true;
err = ovl_dir_read(&realpath, &rdd);
list_del(&rdd.middle);
}
}
return err;
}
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
struct list_head *p;
loff_t off = 0;
list_for_each(p, &od->cache->entries) {
if (off >= pos)
break;
off++;
}
/* Cursor is safe since the cache is stable */
od->cursor = p;
}
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
int res;
struct ovl_dir_cache *cache;
cache = ovl_dir_cache(dentry);
if (cache && ovl_dentry_version_get(dentry) == cache->version) {
cache->refcount++;
return cache;
}
ovl_set_dir_cache(dentry, NULL);
cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
if (!cache)
return ERR_PTR(-ENOMEM);
cache->refcount = 1;
INIT_LIST_HEAD(&cache->entries);
res = ovl_dir_read_merged(dentry, &cache->entries);
if (res) {
ovl_cache_free(&cache->entries);
kfree(cache);
return ERR_PTR(res);
}
cache->version = ovl_dentry_version_get(dentry);
ovl_set_dir_cache(dentry, cache);
return cache;
}
static int ovl_readdir(struct file *file, void *buf, filldir_t filler)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct ovl_cache_entry *p;
int res;
if (!file->f_pos)
ovl_dir_reset(file);
if (od->is_real) {
res = vfs_readdir(od->realfile, filler, buf);
file->f_pos = od->realfile->f_pos;
return res;
}
if (!od->cache) {
struct ovl_dir_cache *cache;
cache = ovl_cache_get(dentry);
if (IS_ERR(cache))
return PTR_ERR(cache);
od->cache = cache;
ovl_seek_cursor(od, file->f_pos);
}
while (od->cursor != &od->cache->entries) {
p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
if (!p->is_whiteout)
if (filler(buf, p->name, p->len, file->f_pos, p->ino, p->type))
break;
od->cursor = p->l_node.next;
file->f_pos++;
}
return 0;
}
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
loff_t res;
struct ovl_dir_file *od = file->private_data;
mutex_lock(&file_inode(file)->i_mutex);
if (!file->f_pos)
ovl_dir_reset(file);
if (od->is_real) {
res = vfs_llseek(od->realfile, offset, origin);
file->f_pos = od->realfile->f_pos;
} else {
res = -EINVAL;
switch (origin) {
case SEEK_CUR:
offset += file->f_pos;
break;
case SEEK_SET:
break;
default:
goto out_unlock;
}
if (offset < 0)
goto out_unlock;
if (offset != file->f_pos) {
file->f_pos = offset;
if (od->cache)
ovl_seek_cursor(od, offset);
}
res = offset;
}
out_unlock:
mutex_unlock(&file_inode(file)->i_mutex);
return res;
}
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct file *realfile = od->realfile;
/*
* Need to check if we started out being a lower dir, but got copied up
*/
if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
struct inode *inode = file_inode(file);
realfile = lockless_dereference(od->upperfile);
if (!realfile) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
realfile = ovl_path_open(&upperpath, O_RDONLY);
smp_mb__before_spinlock();
mutex_lock(&inode->i_mutex);
if (!od->upperfile) {
if (IS_ERR(realfile)) {
mutex_unlock(&inode->i_mutex);
return PTR_ERR(realfile);
}
od->upperfile = realfile;
} else {
/* somebody has beaten us to it */
if (!IS_ERR(realfile))
fput(realfile);
realfile = od->upperfile;
}
mutex_unlock(&inode->i_mutex);
}
}
return vfs_fsync_range(realfile, start, end, datasync);
}
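/*
 * Editorial note: the od->upperfile publication above is double-checked
 * locking -- read it locklessly first, then re-check under i_mutex and
 * either install the freshly opened file or drop it when another thread
 * has already installed one:
 *
 *   realfile = lockless_dereference(od->upperfile);   // fast path
 *   if (!realfile) {
 *           realfile = ovl_path_open(&upperpath, O_RDONLY);
 *           mutex_lock(&inode->i_mutex);
 *           if (!od->upperfile)
 *                   od->upperfile = realfile;         // we won the race
 *           else {                                    // somebody beat us
 *                   fput(realfile);
 *                   realfile = od->upperfile;
 *           }
 *           mutex_unlock(&inode->i_mutex);
 *   }
 */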
static int ovl_dir_release(struct inode *inode, struct file *file)
{
struct ovl_dir_file *od = file->private_data;
if (od->cache) {
mutex_lock(&inode->i_mutex);
ovl_cache_put(od, file->f_path.dentry);
mutex_unlock(&inode->i_mutex);
}
fput(od->realfile);
if (od->upperfile)
fput(od->upperfile);
kfree(od);
return 0;
}
static int ovl_dir_open(struct inode *inode, struct file *file)
{
struct path realpath;
struct file *realfile;
struct ovl_dir_file *od;
enum ovl_path_type type;
od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
if (!od)
return -ENOMEM;
type = ovl_path_real(file->f_path.dentry, &realpath);
realfile = ovl_path_open(&realpath, file->f_flags);
if (IS_ERR(realfile)) {
kfree(od);
return PTR_ERR(realfile);
}
od->realfile = realfile;
od->is_real = !OVL_TYPE_MERGE(type);
od->is_upper = OVL_TYPE_UPPER(type);
file->private_data = od;
return 0;
}
const struct file_operations ovl_dir_operations = {
.read = generic_read_dir,
.open = ovl_dir_open,
.readdir = ovl_readdir,
.llseek = ovl_dir_llseek,
.fsync = ovl_dir_fsync,
.release = ovl_dir_release,
};
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
int err;
struct ovl_cache_entry *p;
err = ovl_dir_read_merged(dentry, list);
if (err)
return err;
err = 0;
list_for_each_entry(p, list, l_node) {
if (p->is_whiteout)
continue;
if (p->name[0] == '.') {
if (p->len == 1)
continue;
if (p->len == 2 && p->name[1] == '.')
continue;
}
err = -ENOTEMPTY;
break;
}
return err;
}
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
struct ovl_cache_entry *p;
mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
list_for_each_entry(p, list, l_node) {
struct dentry *dentry;
if (!p->is_whiteout)
continue;
dentry = lookup_one_len(p->name, upper, p->len);
if (IS_ERR(dentry)) {
pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
upper->d_name.name, p->len, p->name,
(int) PTR_ERR(dentry));
continue;
}
ovl_cleanup(upper->d_inode, dentry);
dput(dentry);
}
mutex_unlock(&upper->d_inode->i_mutex);
}

[File diff suppressed because it is too large.]


@@ -0,0 +1,21 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
KMODDIR = @KMODDIR@
src = @abs_srcdir@
obj-m += mcoverlay.o
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o
.PHONY: clean install modules
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcoverlay.ko $(KMODDIR)
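# Editorial usage sketch (paths are illustrative, not from the source):
#   ./configure ...                                    # fills in @KDIR@ etc.
#   make KDIR=/lib/modules/$(uname -r)/build modules   # builds mcoverlay.ko
#   make install                                       # copies it into $(KMODDIR)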


@@ -101,6 +101,19 @@ int __glob_argc = -1;
 char **__glob_argv = 0;
 #endif
+#ifdef ENABLE_MCOVERLAYFS
+#undef ENABLE_MCOVERLAYFS
+#ifndef RHEL_RELEASE_CODE
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) && LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
+#define ENABLE_MCOVERLAYFS 1
+#endif // LINUX_VERSION_CODE == 4.0
+#else
+#if RHEL_RELEASE_CODE == RHEL_RELEASE_VERSION(7,2)
+#define ENABLE_MCOVERLAYFS 1
+#endif // RHEL_RELEASE_CODE == 7.2
+#endif // RHEL_RELEASE_CODE
+#endif // ENABLE_MCOVERLAYFS
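/*
 * Editorial sketch (hypothetical userspace program): how the version
 * gate added above evaluates. KERNEL_VERSION(a,b,c) packs a release into
 * ((a)<<16) + ((b)<<8) + (c), so the [4.0, 4.1) window matches the
 * linux-4.0.9-based mcoverlayfs port.
 */
#include <stdio.h>
#define KERNEL_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + (c))
int main(void)
{
	int code = KERNEL_VERSION(4, 0, 9); /* e.g. linux-4.0.9 */
	printf("in [4.0, 4.1): %d\n",
	       code >= KERNEL_VERSION(4, 0, 0) &&
	       code <  KERNEL_VERSION(4, 1, 0)); /* 1 */
	return 0;
}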
 typedef unsigned char cc_t;
 typedef unsigned int speed_t;
 typedef unsigned int tcflag_t;
@@ -375,7 +388,7 @@ struct program_load_desc *load_interp(struct program_load_desc *desc0, FILE *fp)
 unsigned char *dma_buf;
-int lookup_exec_path(char *filename, char *path, int max_len)
+int lookup_exec_path(char *filename, char *path, int max_len, int execvp)
 {
 	int found;
 	int error;
@@ -393,28 +406,27 @@ retry:
 		char *token, *string, *tofree;
 		char *PATH = getenv("COKERNEL_PATH");
-		if (!PATH) {
+		if (!execvp) {
+			if (strlen(filename) + 1 > max_len) {
+				return ENAMETOOLONG;
+			}
+			strcpy(path, filename);
+			error = access(path, X_OK);
+			if (error) {
+				return errno;
+			}
+			found = 1;
+			break;
+		}
+		if (!(PATH = getenv("COKERNEL_PATH"))) {
 			PATH = getenv("PATH");
 		}
 		if (strlen(filename) >= 255) {
 			return ENAMETOOLONG;
 		}
-		/* See first whether file is available in current working dir */
-		error = access(filename, X_OK);
-		if (error == 0) {
-			__dprintf("lookup_exec_path(): found %s in cwd\n", filename);
-			error = snprintf(path, max_len, "%s", filename);
-			if (error < 0 || error >= max_len) {
-				fprintf(stderr, "lookup_exec_path(): array too small?\n");
-				return ENOMEM;
-			}
-			found = 1;
-			break;
-		}
 		__dprintf("PATH: %s\n", PATH);
@@ -442,6 +454,9 @@ retry:
 		}
 		free(tofree);
+		if(!found){
+			return ENOENT;
+		}
 		break;
 	}
@@ -868,7 +883,10 @@ struct thread_data_s {
 	pthread_mutex_t *lock;
 	pthread_barrier_t *init_ready;
 } *thread_data;
 int ncpu;
+int n_threads;
 pid_t master_tid;
 pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
@@ -879,7 +897,7 @@ static void *main_loop_thread_func(void *arg)
 	struct thread_data_s *td = (struct thread_data_s *)arg;
 	td->tid = gettid();
-	td->remote_tid = (int)td->tid;
+	td->remote_tid = -1;
 	pthread_barrier_wait(&init_ready);
 	td->ret = main_loop(td->fd, td->cpu, td->lock);
@@ -1106,9 +1124,9 @@ void init_worker_threads(int fd)
 	int i;
 	pthread_mutex_init(&lock, NULL);
-	pthread_barrier_init(&init_ready, NULL, ncpu + 2);
-	for (i = 0; i <= ncpu; ++i) {
+	pthread_barrier_init(&init_ready, NULL, n_threads + 2);
+	for (i = 0; i <= n_threads; ++i) {
 		int ret;
 		thread_data[i].fd = fd;
@@ -1129,7 +1147,6 @@ void init_worker_threads(int fd)
 }
 #ifdef ENABLE_MCOVERLAYFS
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) && LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
 #define READ_BUFSIZE 1024
 static int isunshare(void)
 {
@@ -1201,7 +1218,6 @@ static int isunshare(void)
 	__dprintf("err=%d\n", err);
 	return err;
 }
-#endif
 #endif // ENABLE_MCOVERLAYFS
 #define MCK_RLIMIT_AS 0
@@ -1391,7 +1407,6 @@ int main(int argc, char **argv)
 	}
 #ifdef ENABLE_MCOVERLAYFS
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) && LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
 	__dprintf("mcoverlay enable\n");
 	char mcos_procdir[PATH_MAX];
 	char mcos_sysdir[PATH_MAX];
@@ -1439,12 +1454,11 @@ int main(int argc, char **argv)
 	} else if (error == -1) {
 		return 1;
 	}
-#endif
 #else
 	__dprintf("mcoverlay disable\n");
 #endif // ENABLE_MCOVERLAYFS
-	if (lookup_exec_path(argv[optind], path, sizeof(path)) != 0) {
+	if (lookup_exec_path(argv[optind], path, sizeof(path), 1) != 0) {
 		fprintf(stderr, "error: finding file: %s\n", argv[optind]);
 		return 1;
 	}
@@ -1456,7 +1470,7 @@ int main(int argc, char **argv)
 	/* Check whether shell script */
 	if (shell) {
-		if (lookup_exec_path(shell, shell_path, sizeof(shell_path)) != 0) {
+		if (lookup_exec_path(shell, shell_path, sizeof(shell_path), 0) != 0) {
 			fprintf(stderr, "error: finding file: %s\n", shell);
 			return 1;
 		}
@@ -1518,6 +1532,19 @@ int main(int argc, char **argv)
 		return 1;
 	}
+	n_threads = ncpu;
+	if (ncpu > 16) {
+		n_threads = 16;
+	}
+	/*
+	 * XXX: keep thread_data ncpu sized despite that there are only
+	 * n_threads worker threads in the pool so that signaling code
+	 * keeps working.
+	 *
+	 * TODO: fix signaling code to be independent of TIDs.
+	 * TODO: implement dynamic thread pool resizing.
+	 */
 	thread_data = (struct thread_data_s *)malloc(sizeof(struct thread_data_s) * (ncpu + 1));
 	memset(thread_data, '\0', sizeof(struct thread_data_s) * (ncpu + 1));
@@ -1602,7 +1629,7 @@ int main(int argc, char **argv)
 		return 1;
 	}
-	for (i = 0; i <= ncpu; ++i) {
+	for (i = 0; i <= n_threads; ++i) {
 		pthread_join(thread_data[i].thread_id, NULL);
 	}
@@ -1664,16 +1691,14 @@ do_generic_syscall(
 }
 static void
-kill_thread(unsigned long cpu)
+kill_thread(unsigned long tid)
 {
-	if(cpu >= 0 && cpu < ncpu){
-		pthread_kill(thread_data[cpu].thread_id, LOCALSIG);
-	}
-	else{
-		int i;
-		for (i = 0; i < ncpu; ++i) {
+	int i;
+	for (i = 0; i < n_threads; ++i) {
+		if(thread_data[i].remote_tid == tid){
 			pthread_kill(thread_data[i].thread_id, LOCALSIG);
+			break;
 		}
 	}
 }
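/*
 * Editorial sketch (hypothetical standalone rendition): the reworked
 * kill_thread() above resolves a McKernel thread id to a pool worker by
 * scanning remote_tid instead of indexing thread_data[] by CPU.
 */
#include <pthread.h>
struct worker {
	int remote_tid;
	pthread_t thread_id;
};
static void kill_worker(struct worker *pool, int n, int tid, int sig)
{
	int i;
	for (i = 0; i < n; i++) {
		if (pool[i].remote_tid == tid) {
			pthread_kill(pool[i].thread_id, sig);
			break;
		}
	}
}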
@@ -1779,9 +1804,7 @@ char *
 chgpath(char *in, char *buf)
 {
 #ifdef ENABLE_MCOVERLAYFS
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) && LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
 	return in;
-#endif
 #endif // ENABLE_MCOVERLAYFS
 	char *fn = in;
 	struct stat sb;
@@ -1832,6 +1855,8 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
 		//pthread_mutex_lock(lock);
+		thread_data[cpu].remote_tid = w.sr.rtid;
 		switch (w.sr.number) {
 		case __NR_open:
 			ret = do_strncpy_from_user(fd, pathbuf, (void *)w.sr.args[0], PATH_MAX);
@@ -1870,13 +1895,13 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
 			sig = 0;
 			term = 0;
+			do_syscall_return(fd, cpu, 0, 0, 0, 0, 0);
 			/* Drop executable file */
 			if ((ret = ioctl(fd, MCEXEC_UP_CLOSE_EXEC)) != 0) {
 				fprintf(stderr, "WARNING: close_exec() couldn't find exec file?\n");
 			}
-			do_syscall_return(fd, cpu, 0, 0, 0, 0, 0);
 			__dprintf("__NR_exit/__NR_exit_group: %ld (cpu_id: %d)\n",
 				w.sr.args[0], cpu);
 			if(w.sr.number == __NR_exit_group){
@@ -1944,6 +1969,39 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
 				thread_data[oldcpuid].remote_tid = wtid;
 			}
+			/*
+			 * Number of TIDs and the remote physical address where TIDs are
+			 * expected are passed in arg 4 and 5, respectively.
+			 */
+			if (w.sr.args[4] > 0) {
+				struct remote_transfer trans;
+				int i = 0;
+				int *tids = malloc(sizeof(int) * w.sr.args[4]);
+				if (!tids) {
+					fprintf(stderr, "__NR_gettid(): error allocating TIDs\n");
+					goto gettid_out;
+				}
+				for (i = 0; i < ncpu && i < w.sr.args[4]; ++i) {
+					tids[i] = thread_data[i].tid;
+				}
+				for (; i < ncpu; ++i) {
+					tids[i] = 0;
+				}
+				trans.userp = (void*)tids;
+				trans.rphys = w.sr.args[5];
+				trans.size = sizeof(int) * w.sr.args[4];
+				trans.direction = MCEXEC_UP_TRANSFER_TO_REMOTE;
+				if (ioctl(fd, MCEXEC_UP_TRANSFER, &trans) != 0) {
+					fprintf(stderr, "__NR_gettid(): error transferring TIDs\n");
+				}
+				free(tids);
+			}
+gettid_out:
 			do_syscall_return(fd, cpu, thread_data[newcpuid].remote_tid, 0, 0, 0, 0);
 			break;
 		}
@@ -2039,7 +2097,6 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
 			/* Reinit signals and syscall threads */
 			init_sigaction();
-			init_worker_threads(fd);
 			__dprintf("pid(%d): signals and syscall threads OK\n",
 				getpid());
@@ -2053,6 +2110,8 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
 				goto fork_child_sync_pipe;
 			}
+			init_worker_threads(fd);
 fork_child_sync_pipe:
 			sem_post(&fs->sem);
 			if (fs->status)
@@ -2163,7 +2222,7 @@ fork_err:
 			shell = NULL;
 			filename = (char *)w.sr.args[1];
-			if ((ret = lookup_exec_path(filename, path, sizeof(path)))
+			if ((ret = lookup_exec_path(filename, path, sizeof(path), 0))
 			    != 0) {
 				goto return_execve1;
 			}
@@ -2177,7 +2236,7 @@ fork_err:
 			/* Check whether shell script */
 			if (shell) {
 				if ((ret = lookup_exec_path(shell, shell_path,
-						sizeof(shell_path))) != 0) {
+						sizeof(shell_path), 0)) != 0) {
 					fprintf(stderr, "execve(): error: finding file: %s\n", shell);
 					goto return_execve1;
 				}
@@ -2198,6 +2257,7 @@ fork_err:
 				strcpy(desc->shell_path, shell_path);
 			}
+			desc->enable_vdso = enable_vdso;
 			__dprintf("execve(): load_elf_desc() for %s OK, num sections: %d\n",
 				path, desc->num_sections);
@@ -2310,6 +2370,53 @@ return_execve2:
 			do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
 			break;
+		case __NR_setresuid:
+			ret = setresuid(w.sr.args[0], w.sr.args[1], w.sr.args[2]);
+			if(ret == -1)
+				ret = -errno;
+			do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
+			break;
+		case __NR_setreuid:
+			ret = setreuid(w.sr.args[0], w.sr.args[1]);
+			if(ret == -1)
+				ret = -errno;
+			do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
+			break;
+		case __NR_setuid:
+			ret = setuid(w.sr.args[0]);
+			if(ret == -1)
+				ret = -errno;
+			do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
+			break;
+		case __NR_setresgid:
+			ret = setresgid(w.sr.args[0], w.sr.args[1], w.sr.args[2]);
+			if(ret == -1)
+				ret = -errno;
+			do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
+			break;
+		case __NR_setregid:
+			ret = setregid(w.sr.args[0], w.sr.args[1]);
+			if(ret == -1)
+				ret = -errno;
+			do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
+			break;
+		case __NR_setgid:
+			ret = setgid(w.sr.args[0]);
+			if(ret == -1)
+				ret = -errno;
+			do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
+			break;
+		case __NR_setfsgid:
+			ret = setfsgid(w.sr.args[0]);
+			do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
+			break;
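/*
 * Editorial note: unlike the other set*id cases added above, the
 * setfsgid handler performs no errno translation; setfsgid(2) has no
 * error return -- it simply reports the previous filesystem GID.
 */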
 		case __NR_close:
 			if(w.sr.args[0] == fd)
 				ret = -EBADF;
@@ -2343,7 +2450,9 @@ return_execve2:
 			break;
 		}
+		thread_data[cpu].remote_tid = -1;
 		//pthread_mutex_unlock(lock);
 	}
 	__dprint("timed out.\n");


@@ -110,6 +110,7 @@ int __kprintf(const char *format, ...)
 	char buf[KPRINTF_LOCAL_BUF_LEN];
 	/* Copy into the local buf */
+	len = sprintf(buf, "[%3d]: ", ihk_mc_get_processor_id());
 	va_start(va, format);
 	len += vsnprintf(buf + len, KPRINTF_LOCAL_BUF_LEN - len - 2, format, va);
 	va_end(va);


@@ -99,7 +99,7 @@ int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxp
 	}
 	memset(obj, 0, sizeof(*obj));
-	obj->pfn_table = allocate_pages(pfn_npages, IHK_MC_AP_NOWAIT);
+	obj->pfn_table = ihk_mc_alloc_pages(pfn_npages, IHK_MC_AP_NOWAIT);
 	if (!obj->pfn_table) {
 		error = -ENOMEM;
 		kprintf("%s: error: fd: %d, len: %lu, off: %lu allocating PFN failed.\n",
@@ -141,7 +141,7 @@ int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxp
 out:
 	if (obj) {
 		if (obj->pfn_table) {
-			free_pages(obj->pfn_table, pfn_npages);
+			ihk_mc_free_pages(obj->pfn_table, pfn_npages);
 		}
 		kfree(obj);
 	}
@@ -166,6 +166,8 @@ static void devobj_release(struct memobj *memobj)
 	struct devobj *obj = to_devobj(memobj);
 	struct devobj *free_obj = NULL;
 	uintptr_t handle;
+	const size_t pfn_npages =
+		(obj->npages / (PAGE_SIZE / sizeof(uintptr_t))) + 1;
 	dkprintf("devobj_release(%p %lx)\n", obj, obj->handle);
@@ -194,7 +196,7 @@ static void devobj_release(struct memobj *memobj)
 	}
 	if (obj->pfn_table) {
-		free_pages(obj->pfn_table, 1);
+		ihk_mc_free_pages(obj->pfn_table, pfn_npages);
 	}
 	kfree(free_obj);
 }


@@ -332,6 +332,9 @@ int prepare_process_ranges_args_envs(struct thread *thread,
 			goto err;
 		}
 	}
+	else {
+		vm->vdso_addr = NULL;
+	}
 	p->rprocess = (unsigned long)thread;
 	p->rpgtable = virt_to_phys(as->page_table);
@@ -373,10 +376,16 @@ static int process_msg_prepare_process(unsigned long rphys)
 	}
 	n = p->num_sections;
+	if (n > 16) {
+		kprintf("%s: ERROR: more ELF sections than 16??\n",
+			__FUNCTION__);
+		return -ENOMEM;
+	}
 	dkprintf("# of sections: %d\n", n);
-	if((pn = ihk_mc_allocate(sizeof(struct program_load_desc)
-		+ sizeof(struct program_image_section) * n, IHK_MC_AP_NOWAIT)) == NULL){
+	if((pn = kmalloc(sizeof(struct program_load_desc)
+		+ sizeof(struct program_image_section) * n,
+		IHK_MC_AP_NOWAIT)) == NULL){
 		ihk_mc_unmap_virtual(p, npages, 0);
 		ihk_mc_unmap_memory(NULL, phys, sz);
 		return -ENOMEM;
@@ -385,7 +394,7 @@ static int process_msg_prepare_process(unsigned long rphys)
 		+ sizeof(struct program_image_section) * n);
 	if((thread = create_thread(p->entry)) == NULL){
-		ihk_mc_free(pn);
+		kfree(pn);
 		ihk_mc_unmap_virtual(p, npages, 1);
 		ihk_mc_unmap_memory(NULL, phys, sz);
 		return -ENOMEM;
@@ -435,7 +444,7 @@ static int process_msg_prepare_process(unsigned long rphys)
 	dkprintf("new process : %p [%d] / table : %p\n", proc, proc->pid,
 		vm->address_space->page_table);
-	ihk_mc_free(pn);
+	kfree(pn);
 	ihk_mc_unmap_virtual(p, npages, 1);
 	ihk_mc_unmap_memory(NULL, phys, sz);
@@ -443,7 +452,7 @@ static int process_msg_prepare_process(unsigned long rphys)
 	return 0;
 err:
-	ihk_mc_free(pn);
+	kfree(pn);
 	ihk_mc_unmap_virtual(p, npages, 1);
 	ihk_mc_unmap_memory(NULL, phys, sz);
 	destroy_thread(thread);
@@ -452,7 +461,7 @@ err:
 static void process_msg_init(struct ikc_scd_init_param *pcp, struct syscall_params *lparam)
 {
-	lparam->response_va = allocate_pages(RESPONSE_PAGE_COUNT, 0);
+	lparam->response_va = ihk_mc_alloc_pages(RESPONSE_PAGE_COUNT, 0);
 	lparam->response_pa = virt_to_phys(lparam->response_va);
 	pcp->request_page = 0;
@@ -521,12 +530,7 @@ static void syscall_channel_send(struct ihk_ikc_channel_desc *c,
 }
 extern unsigned long do_kill(struct thread *, int, int, int, struct siginfo *, int ptracecont);
-extern void settid(struct thread *proc, int mode, int newcpuid, int oldcpuid);
 extern void process_procfs_request(unsigned long rarg);
-extern int memcheckall();
-extern int freecheck(int runcount);
-extern int runcount;
 extern void terminate_host(int pid);
 extern void debug_log(long);
@@ -561,6 +565,7 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
 	struct ikc_scd_packet *packet = __packet;
 	struct ikc_scd_packet pckt;
 	int rc;
+	struct mcs_rwlock_node_irqsave lock;
 	struct thread *thread;
 	struct process *proc;
 	struct mcctrl_signal {
@@ -572,22 +577,17 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
 	} *sp, info;
 	unsigned long pp;
 	int cpuid;
+	int ret = 0;
 	switch (packet->msg) {
 	case SCD_MSG_INIT_CHANNEL_ACKED:
 		dkprintf("SCD_MSG_INIT_CHANNEL_ACKED\n");
 		process_msg_init_acked(c, packet->arg);
-		return 0;
+		ret = 0;
+		break;
 	case SCD_MSG_PREPARE_PROCESS:
-		if (find_command_line("memdebug")) {
-			memcheckall();
-			if (runcount)
-				freecheck(runcount);
-			runcount++;
-		}
 		if((rc = process_msg_prepare_process(packet->arg)) == 0){
 			pckt.msg = SCD_MSG_PREPARE_PROCESS_ACKED;
 			pckt.err = 0;
@@ -600,19 +600,21 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
 		pckt.arg = packet->arg;
 		syscall_channel_send(c, &pckt);
-		return 0;
+		ret = 0;
+		break;
 	case SCD_MSG_SCHEDULE_PROCESS:
 		cpuid = obtain_clone_cpuid();
 		if(cpuid == -1){
 			kprintf("No CPU available\n");
-			return -1;
+			ret = -1;
+			break;
 		}
 		dkprintf("SCD_MSG_SCHEDULE_PROCESS: %lx\n", packet->arg);
 		thread = (struct thread *)packet->arg;
 		proc = thread->proc;
-		settid(thread, 0, cpuid, -1);
+		settid(thread, 0, cpuid, -1, 0, NULL);
 		proc->status = PS_RUNNING;
 		thread->status = PS_RUNNING;
 		chain_thread(thread);
@@ -620,7 +622,29 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
 		runq_add_thread(thread, cpuid);
 		//cpu_local_var(next) = (struct thread *)packet->arg;
-		return 0;
+		ret = 0;
+		break;
+	/*
+	 * Used for syscall offload reply message to explicitly schedule in
+	 * the waiting thread
+	 */
+	case SCD_MSG_WAKE_UP_SYSCALL_THREAD:
+		thread = find_thread(0, packet->ttid, &lock);
+		if (!thread) {
+			kprintf("%s: WARNING: no thread for SCD reply? TID: %d\n",
+				__FUNCTION__, packet->ttid);
+			ret = -EINVAL;
+			break;
+		}
+		thread_unlock(thread, &lock);
+		dkprintf("%s: SCD_MSG_WAKE_UP_SYSCALL_THREAD: waking up tid %d\n",
+			__FUNCTION__, packet->ttid);
+		waitq_wakeup(&thread->scd_wq);
+		ret = 0;
+		break;
 	case SCD_MSG_SEND_SIGNAL:
 		pp = ihk_mc_map_memory(NULL, packet->arg, sizeof(struct mcctrl_signal));
 		sp = (struct mcctrl_signal *)ihk_mc_map_virtual(pp, 1, PTATTR_WRITABLE | PTATTR_ACTIVE);
@@ -635,18 +659,25 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
 		rc = do_kill(NULL, info.pid, info.tid, info.sig, &info.info, 0);
 		kprintf("SCD_MSG_SEND_SIGNAL: do_kill(pid=%d, tid=%d, sig=%d)=%d\n", info.pid, info.tid, info.sig, rc);
-		return 0;
+		ret = 0;
+		break;
 	case SCD_MSG_PROCFS_REQUEST:
 		process_procfs_request(packet->arg);
-		return 0;
+		ret = 0;
+		break;
 	case SCD_MSG_CLEANUP_PROCESS:
 		dkprintf("SCD_MSG_CLEANUP_PROCESS pid=%d\n", packet->pid);
 		terminate_host(packet->pid);
-		return 0;
+		ret = 0;
+		break;
 	case SCD_MSG_DEBUG_LOG:
 		dkprintf("SCD_MSG_DEBUG_LOG code=%lx\n", packet->arg);
 		debug_log(packet->arg);
-		return 0;
+		ret = 0;
+		break;
 	case SCD_MSG_SYSFS_REQ_SHOW:
 	case SCD_MSG_SYSFS_REQ_STORE:
@@ -654,7 +685,8 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
 		sysfss_packet_handler(c, packet->msg, packet->err,
 			packet->sysfs_arg1, packet->sysfs_arg2,
 			packet->sysfs_arg3);
-		return 0;
+		ret = 0;
+		break;
 	case SCD_MSG_GET_CPU_MAPPING:
 		req_get_cpu_mapping(packet->arg);
@@ -662,17 +694,21 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
 		pckt.msg = SCD_MSG_REPLY_GET_CPU_MAPPING;
 		pckt.arg = packet->arg;
 		syscall_channel_send(c, &pckt);
-		return 0;
+		ret = 0;
+		break;
 	default:
 		kprintf("syscall_pakcet_handler:unknown message "
 			"(%d.%d.%d.%d.%d.%#lx)\n",
 			packet->msg, packet->ref, packet->osnum,
 			packet->pid, packet->err, packet->arg);
-		return 0;
+		ret = 0;
+		break;
 	}
-	return 0;
+	ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet, c);
+	return ret;
 }
 void init_host_syscall_channel(void)
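
Every case in syscall_packet_handler() now records its result in ret and breaks instead of returning, so control always reaches the single ihk_ikc_release_packet() call at the bottom; the old early returns skipped the release. A compilable sketch of the single-exit pattern (all names here are illustrative, not the mcctrl API):

#include <errno.h>
#include <stddef.h>

struct packet { int msg; };

enum { MSG_A = 1 };

static int do_a(struct packet *p) { (void)p; return 0; }
static void release_packet(struct packet *p) { (void)p; }

int handle(struct packet *p)
{
	int ret;

	switch (p->msg) {
	case MSG_A:
		ret = do_a(p);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	release_packet(p); /* never skipped by an early return */
	return ret;
}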

View File

@@ -19,11 +19,13 @@
  * CPU Local Storage (cls)
  */
-struct malloc_header {
-	unsigned int check;
+struct kmalloc_header {
+	unsigned int front_magic;
 	unsigned int cpu_id;
-	struct malloc_header *next;
-	unsigned long size;
+	struct list_head list;
+	int size; /* The size of this chunk without the header */
+	unsigned int end_magic;
+	/* 32 bytes */
 };
 #include <ihk/lock.h>
@@ -38,8 +40,9 @@ extern ihk_spinlock_t cpu_status_lock;
 struct cpu_local_var {
 	/* malloc */
-	struct malloc_header free_list;
-	struct malloc_header *remote_free_list;
+	struct list_head free_list;
+	struct list_head remote_free_list;
+	ihk_spinlock_t remote_free_list_lock;
 	struct thread idle;
 	struct process idle_proc;
@@ -73,6 +76,7 @@ struct cpu_local_var {
 	int in_interrupt;
 	int no_preempt;
 	int timer_enabled;
+	int kmalloc_initialized;
 } __attribute__((aligned(64)));
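
With two 4-byte magic words, a 4-byte cpu_id, a 4-byte size and a 16-byte list_head, the new header comes to the 32 bytes its comment claims on LP64 targets, so payloads begin 32 bytes past the chunk base. A compile-time check of that assumption (C11, with a Linux-style two-pointer list_head):

struct list_head { struct list_head *next, *prev; };

struct kmalloc_header {
	unsigned int front_magic;
	unsigned int cpu_id;
	struct list_head list;
	int size;
	unsigned int end_magic;
};

/* 4 + 4 + 16 + 4 + 4 = 32 bytes on LP64 targets */
_Static_assert(sizeof(struct kmalloc_header) == 32,
	"kmalloc_header must stay 32 bytes");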

View File

@@ -32,11 +32,10 @@ void *_kmalloc(int size, enum ihk_mc_ap_flag flag, char *file, int line);
 void _kfree(void *ptr, char *file, int line);
 void *__kmalloc(int size, enum ihk_mc_ap_flag flag);
 void __kfree(void *ptr);
-void *___kmalloc(int size, enum ihk_mc_ap_flag flag);
-void ___kfree(void *ptr);
 int _memcheck(void *ptr, char *msg, char *file, int line, int free);
 int memcheckall();
 int freecheck(int runcount);
+void kmalloc_consolidate_free_list(void);
 #endif

View File

@@ -161,7 +161,7 @@
 #endif
 #define USER_STACK_NR_PAGES 8192
-#define KERNEL_STACK_NR_PAGES 25
+#define KERNEL_STACK_NR_PAGES 32
 #define NOPHYS ((uintptr_t)-1)
@@ -349,6 +349,11 @@ struct sig_pending {
 typedef void pgio_func_t(void *arg);
+struct mcexec_tid {
+	int tid;
+	struct thread *thread;
+};
 /* Represents a node in the process fork tree, it may exist even after the
  * corresponding process exited due to references from the parent and/or
  * children and is used for implementing wait/waitpid without having a
@@ -363,6 +368,9 @@ struct process {
 	// threads and children
 	struct list_head threads_list;
 	mcs_rwlock_lock_t threads_lock; // lock for threads_list
+	/* TID set of proxy process */
+	struct mcexec_tid *tids;
+	int nr_tids;
 	/* The ptracing process behave as the parent of the ptraced process
 	   after using PTRACE_ATTACH except getppid. So we save it here. */
@@ -559,6 +567,9 @@ struct thread {
 	struct itimerval itimer_prof;
 	struct timespec itimer_virtual_value;
 	struct timespec itimer_prof_value;
+	/* Syscall offload wait queue head */
+	struct waitq scd_wq;
 };
 struct process_vm {
@@ -679,5 +690,7 @@ void chain_thread(struct thread *);
 void proc_init();
 void set_timer();
 struct sig_pending *hassigpending(struct thread *thread);
+void settid(struct thread *thread, int mode, int newcpuid, int oldcpuid,
+	int nr_tids, int *tids);
 #endif
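
struct process now tracks the proxy process' thread pool: mcexec pre-creates host threads, and each mcexec_tid slot binds one of them to a McKernel thread, a NULL thread pointer marking a free slot. A hypothetical sketch of slot acquisition, the counterpart of the __release_tid() walk added in process.c below (this helper is not part of the commit; the caller would hold threads_lock):

struct thread;

struct mcexec_tid {
	int tid;
	struct thread *thread;
};

/* Hypothetical: claim a free proxy TID slot; -1 if the pool is
 * exhausted. Caller must hold proc->threads_lock. */
static int tid_slot_acquire(struct mcexec_tid *tids, int nr_tids,
		struct thread *thread)
{
	int i;

	for (i = 0; i < nr_tids; ++i) {
		if (tids[i].thread)
			continue;
		tids[i].thread = thread;
		return tids[i].tid;
	}
	return -1;
}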

View File

@@ -31,6 +31,7 @@
 #define SCD_MSG_PREPARE_PROCESS_ACKED	0x2
 #define SCD_MSG_PREPARE_PROCESS_NACKED	0x7
 #define SCD_MSG_SCHEDULE_PROCESS	0x3
+#define SCD_MSG_WAKE_UP_SYSCALL_THREAD	0x14
 #define SCD_MSG_INIT_CHANNEL	0x5
 #define SCD_MSG_INIT_CHANNEL_ACKED	0x6
@@ -117,28 +118,6 @@ struct user_desc {
 	unsigned int lm:1;
 };
-struct ikc_scd_packet {
-	int msg;
-	int err;
-	union {
-		/* for traditional SCD_MSG_* */
-		struct {
-			int ref;
-			int osnum;
-			int pid;
-			int padding;
-			unsigned long arg;
-		};
-		/* for SCD_MSG_SYSFS_* */
-		struct {
-			long sysfs_arg1;
-			long sysfs_arg2;
-			long sysfs_arg3;
-		};
-	};
-};
 struct program_image_section {
 	unsigned long vaddr;
 	unsigned long len;
@@ -210,13 +189,58 @@ struct ikc_scd_init_param {
 };
 struct syscall_request {
+	/* TID of requesting thread */
+	int rtid;
+	/*
+	 * TID of target thread. Remote page fault response needs to designate the
+	 * thread that must serve the request, 0 indicates any thread from the pool
+	 */
+	int ttid;
 	unsigned long valid;
 	unsigned long number;
 	unsigned long args[6];
 };
+struct ikc_scd_packet {
+	int msg;
+	int err;
+	union {
+		/* for traditional SCD_MSG_* */
+		struct {
+			int ref;
+			int osnum;
+			int pid;
+			unsigned long arg;
+			struct syscall_request req;
+			unsigned long resp_pa;
+		};
+		/* for SCD_MSG_SYSFS_* */
+		struct {
+			long sysfs_arg1;
+			long sysfs_arg2;
+			long sysfs_arg3;
+		};
+		/* SCD_MSG_SCHEDULE_THREAD */
+		struct {
+			int ttid;
+		};
+	};
+	char padding[12];
+};
+#define IHK_SCD_REQ_THREAD_SPINNING		0
+#define IHK_SCD_REQ_THREAD_TO_BE_WOKEN		1
+#define IHK_SCD_REQ_THREAD_DESCHEDULED		2
 struct syscall_response {
+	/* TID of the thread that requested the service */
+	int ttid;
+	/* TID of the mcexec thread that is serving the request */
+	int stid;
 	unsigned long status;
+	unsigned long req_thread_status;
 	long ret;
 	unsigned long fault_address;
 	unsigned long fault_reason;
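
The request now rides inside the IKC packet itself (req plus resp_pa) instead of the old per-CPU request page, and the response gains a small requester state machine: req_thread_status moves between SPINNING, TO_BE_WOKEN and DESCHEDULED so the host can tell whether writing status is enough or an explicit SCD_MSG_WAKE_UP_SYSCALL_THREAD packet is needed. A user-space sketch of one plausible host-side decision; the real logic lives in mcctrl on the Linux side and is not shown in this diff:

#include <stdatomic.h>

#define IHK_SCD_REQ_THREAD_SPINNING	0
#define IHK_SCD_REQ_THREAD_TO_BE_WOKEN	1
#define IHK_SCD_REQ_THREAD_DESCHEDULED	2

/* Illustrative completion step: returns 1 when the waiter already
 * descheduled itself and a wake-up packet must be sent. */
static int needs_wakeup(_Atomic unsigned long *req_thread_status)
{
	unsigned long expected = IHK_SCD_REQ_THREAD_SPINNING;

	/* Still spinning? Flipping the state is enough, the waiter will
	 * observe the reply on its own. */
	if (atomic_compare_exchange_strong(req_thread_status, &expected,
				IHK_SCD_REQ_THREAD_TO_BE_WOKEN))
		return 0;

	/* Otherwise it parked: send SCD_MSG_WAKE_UP_SYSCALL_THREAD. */
	return 1;
}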

View File

@@ -371,7 +371,7 @@ int main(void)
 	}
 	kmsg_init(mode);
-	kputs("MCK started.\n");
+	kputs("IHK/McKernel started.\n");
 	arch_init();
@@ -393,7 +393,7 @@ int main(void)
 	futex_init();
-	kputs("MCK/IHK booted.\n");
+	kputs("IHK/McKernel booted.\n");
 #ifdef DCFA_KMOD
 	mc_cmd_client_init();

View File

@@ -156,13 +156,17 @@ void sbox_write(int offset, unsigned int value);
 static void query_free_mem_interrupt_handler(void *priv)
 {
-#ifdef ATTACHED_MIC
-	dkprintf("query free mem handler!\n");
 	int pages = ihk_pagealloc_query_free(pa_allocator);
-	dkprintf("free pages: %d\n", pages);
+	kprintf("McKernel free pages: %d\n", pages);
+	if (find_command_line("memdebug")) {
+		extern void kmalloc_memcheck(void);
+		kmalloc_memcheck();
+	}
+#ifdef ATTACHED_MIC
 	sbox_write(SBOX_SCRATCH0, pages);
 	sbox_write(SBOX_SCRATCH1, 1);
 #endif
@@ -265,6 +269,13 @@ void remote_flush_tlb_cpumask(struct process_vm *vm,
 	unsigned long tsc;
 	tsc = rdtsc() + 12884901888; /* 1.2GHz =>10 sec */
 #endif
+	if (flush_entry->addr) {
+		flush_tlb_single(flush_entry->addr & PAGE_MASK);
+	}
+	/* Zero address denotes full TLB flush */
+	else {
+		flush_tlb();
+	}
 	/* Wait for all cores */
 	while (ihk_atomic_read(&flush_entry->pending) != 0) {
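
The shootdown IPI handler now invalidates just the faulting page when the flush entry carries an address, falling back to a full flush only for the zero sentinel; this is what lets do_munmap() in syscall.c below drop its unconditional flush_tlb(). An illustrative x86_64/GCC sketch of the same decision (privileged instructions, kernel context assumed; not the IHK implementation):

#define PAGE_MASK (~(4096UL - 1))

/* Per-address invalidation when possible, full flush otherwise
 * (address 0 is the "flush everything" sentinel). */
static void tlb_flush_entry(unsigned long addr)
{
	if (addr) {
		asm volatile("invlpg (%0)" :: "r"(addr & PAGE_MASK) : "memory");
	}
	else {
		unsigned long cr3;
		/* Reloading CR3 flushes all non-global entries */
		asm volatile("mov %%cr3, %0" : "=r"(cr3));
		asm volatile("mov %0, %%cr3" :: "r"(cr3) : "memory");
	}
}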
@@ -335,10 +346,9 @@ static void page_fault_handler(void *fault_addr, uint64_t reason, void *regs)
 		// no return
 	}
-	kprintf("[%d]page_fault_handler(%p,%lx,%p):"
-		"fault vm failed. %d, TID: %d\n",
-		ihk_mc_get_processor_id(), fault_addr,
-		reason, regs, error, thread->tid);
+	kprintf("%s fault VM failed for TID: %d, addr: 0x%lx, "
+		"reason: %d, error: %d\n", __FUNCTION__,
+		thread->tid, fault_addr, reason, error);
 	unhandled_page_fault(thread, fault_addr, regs);
 	preempt_enable();
 	memset(&info, '\0', sizeof info);
@@ -425,8 +435,9 @@ static void page_allocator_init(void)
 	ihk_mc_reserve_arch_pages(pa_start, pa_end, reserve_pages);
-	kprintf("Available pages: %ld pages\n",
-		ihk_pagealloc_count(pa_allocator));
+	kprintf("Available memory: %ld bytes in %ld pages\n",
+		(ihk_pagealloc_count(pa_allocator) * PAGE_SIZE),
+		ihk_pagealloc_count(pa_allocator));
 	/* Notify the ihk to use my page allocator */
 	ihk_mc_set_page_allocator(&allocator);
@@ -507,6 +518,9 @@ static void page_init(void)
 static char *memdebug = NULL;
+static void *___kmalloc(int size, enum ihk_mc_ap_flag flag);
+static void ___kfree(void *ptr);
 void register_kmalloc(void)
 {
 	if(memdebug){
@@ -636,60 +650,100 @@ void mem_init(void)
 	}
 }
-struct location {
-	struct location *next;
-	int line;
-	int cnt;
-	char file[0];
-};
-struct alloc {
-	struct alloc *next;
-	struct malloc_header *p;
-	struct location *loc;
-	int size;
-	int runcount;
-};
-#define HASHNUM 129
-static struct alloc *allochash[HASHNUM];
-static struct location *lochash[HASHNUM];
-static ihk_spinlock_t alloclock;
-int runcount;
-static unsigned char *page;
-static int space;
-static void *dalloc(unsigned long size)
-{
-	void *r;
-	static int pos = 0;
-	unsigned long irqstate;
-	irqstate = ihk_mc_spinlock_lock(&alloclock);
-	size = (size + 7) & 0xfffffffffffffff8L;
-	if (pos + size > space) {
-		page = allocate_pages(1, IHK_MC_AP_NOWAIT);
-		space = 4096;
-		pos = 0;
-	}
-	r = page + pos;
-	pos += size;
-	ihk_mc_spinlock_unlock(&alloclock, irqstate);
-	return r;
-}
+#define KMALLOC_TRACK_HASH_SHIFT	(8)
+#define KMALLOC_TRACK_HASH_SIZE	(1 << KMALLOC_TRACK_HASH_SHIFT)
+#define KMALLOC_TRACK_HASH_MASK	(KMALLOC_TRACK_HASH_SIZE - 1)
+struct list_head kmalloc_track_hash[KMALLOC_TRACK_HASH_SIZE];
+ihk_spinlock_t kmalloc_track_hash_locks[KMALLOC_TRACK_HASH_SIZE];
+struct list_head kmalloc_addr_hash[KMALLOC_TRACK_HASH_SIZE];
+ihk_spinlock_t kmalloc_addr_hash_locks[KMALLOC_TRACK_HASH_SIZE];
+int kmalloc_track_initialized = 0;
+int kmalloc_runcount = 0;
+struct kmalloc_track_addr_entry {
+	void *addr;
+	int runcount;
+	struct list_head list; /* track_entry's list */
+	struct kmalloc_track_entry *entry;
+	struct list_head hash; /* address hash */
+};
+struct kmalloc_track_entry {
+	char *file;
+	int line;
+	int size;
+	ihk_atomic_t alloc_count;
+	struct list_head hash;
+	struct list_head addr_list;
+	ihk_spinlock_t addr_list_lock;
+};
+void kmalloc_init(void)
+{
+	struct cpu_local_var *v = get_this_cpu_local_var();
+	register_kmalloc();
+	INIT_LIST_HEAD(&v->free_list);
+	INIT_LIST_HEAD(&v->remote_free_list);
+	ihk_mc_spinlock_init(&v->remote_free_list_lock);
+	v->kmalloc_initialized = 1;
+	if (!kmalloc_track_initialized) {
+		int i;
+		memdebug = find_command_line("memdebug");
+		kmalloc_track_initialized = 1;
+		for (i = 0; i < KMALLOC_TRACK_HASH_SIZE; ++i) {
+			ihk_mc_spinlock_init(&kmalloc_track_hash_locks[i]);
+			INIT_LIST_HEAD(&kmalloc_track_hash[i]);
+			ihk_mc_spinlock_init(&kmalloc_addr_hash_locks[i]);
+			INIT_LIST_HEAD(&kmalloc_addr_hash[i]);
+		}
+	}
+}
+/* NOTE: Hash lock must be held */
+struct kmalloc_track_entry *__kmalloc_track_find_entry(
+		int size, char *file, int line)
+{
+	struct kmalloc_track_entry *entry_iter, *entry = NULL;
+	int hash = (strlen(file) + line + size) & KMALLOC_TRACK_HASH_MASK;
+	list_for_each_entry(entry_iter, &kmalloc_track_hash[hash], hash) {
+		if (!strcmp(entry_iter->file, file) &&
+				entry_iter->size == size &&
+				entry_iter->line == line) {
+			entry = entry_iter;
+			break;
+		}
+	}
+	if (entry) {
+		dkprintf("%s found entry %s:%d size: %d\n", __FUNCTION__,
+			file, line, size);
+	}
+	else {
+		dkprintf("%s couldn't find entry %s:%d size: %d\n", __FUNCTION__,
+			file, line, size);
+	}
+	return entry;
+}
+/* Top level routines called from macro */
 void *_kmalloc(int size, enum ihk_mc_ap_flag flag, char *file, int line)
 {
-	char *r = ___kmalloc(size, flag);
-	struct malloc_header *h;
-	unsigned long hash;
-	char *t;
-	struct location *lp;
-	struct alloc *ap;
-	unsigned long alcsize;
-	unsigned long chksize;
+	unsigned long irqflags;
+	struct kmalloc_track_entry *entry;
+	struct kmalloc_track_addr_entry *addr_entry;
+	int hash, addr_hash;
+	void *r = ___kmalloc(size, flag);
 	if (!memdebug)
 		return r;
@@ -697,177 +751,177 @@ void *_kmalloc(int size, enum ihk_mc_ap_flag flag, char *file, int line)
 	if (!r)
 		return r;
-	h = ((struct malloc_header *)r) - 1;
-	alcsize = h->size * sizeof(struct malloc_header);
-	chksize = alcsize - size;
-	memset(r + size, '\x5a', chksize);
-	for (hash = 0, t = file; *t; t++) {
-		hash <<= 1;
-		hash += *t;
-	}
-	hash += line;
-	hash %= HASHNUM;
-	for (lp = lochash[hash]; lp; lp = lp->next)
-		if (lp->line == line &&
-		    !strcmp(lp->file, file))
-			break;
-	if (!lp) {
-		lp = dalloc(sizeof(struct location) + strlen(file) + 1);
-		memset(lp, '\0', sizeof(struct location));
-		lp->line = line;
-		strcpy(lp->file, file);
-		do {
-			lp->next = lochash[hash];
-		} while (!compare_and_swap(lochash + hash, (unsigned long)lp->next, (unsigned long)lp));
-	}
-	hash = (unsigned long)h % HASHNUM;
-	do {
-		for (ap = allochash[hash]; ap; ap = ap->next)
-			if (!ap->p)
-				break;
-	} while (ap && !compare_and_swap(&ap->p, 0UL, (unsigned long)h));
-	if (!ap) {
-		ap = dalloc(sizeof(struct alloc));
-		memset(ap, '\0', sizeof(struct alloc));
-		ap->p = h;
-		do {
-			ap->next = allochash[hash];
-		} while (!compare_and_swap(allochash + hash, (unsigned long)ap->next, (unsigned long)ap));
-	}
-	ap->loc = lp;
-	ap->size = size;
-	ap->runcount = runcount;
-	return r;
-}
-int _memcheck(void *ptr, char *msg, char *file, int line, int flags)
-{
-	struct malloc_header *h = ((struct malloc_header *)ptr) - 1;
-	struct malloc_header *next;
-	unsigned long hash = (unsigned long)h % HASHNUM;
-	struct alloc *ap;
-	static unsigned long check = 0x5a5a5a5a5a5a5a5aUL;
-	unsigned long alcsize;
-	unsigned long chksize;
-	if (h->check != 0x5a5a5a5a) {
-		int i;
-		unsigned long max = 0;
-		unsigned long cur = (unsigned long)h;
-		struct alloc *maxap = NULL;
-		for (i = 0; i < HASHNUM; i++)
-			for (ap = allochash[i]; ap; ap = ap->next)
-				if ((unsigned long)ap->p < cur &&
-				    (unsigned long)ap->p > max) {
-					max = (unsigned long)ap->p;
-					maxap = ap;
-				}
-		kprintf("%s: detect buffer overrun, alc=%s:%d size=%ld h=%p, s=%ld\n", msg, maxap->loc->file, maxap->loc->line, maxap->size, maxap->p, maxap->p->size);
-		kprintf("broken header: h=%p next=%p size=%ld cpu_id=%d\n", h, h->next, h->size, h->cpu_id);
-	}
-	for (ap = allochash[hash]; ap; ap = ap->next)
-		if (ap->p == h)
-			break;
-	if (!ap) {
-		if(file)
-			kprintf("%s: address not found, %s:%d p=%p\n", msg, file, line, ptr);
-		else
-			kprintf("%s: address not found p=%p\n", msg, ptr);
-		return 1;
-	}
-	alcsize = h->size * sizeof(struct malloc_header);
-	chksize = alcsize - ap->size;
-	if (chksize > 8)
-		chksize = 8;
-	next = (struct malloc_header *)((char *)ptr + alcsize);
-	if (next->check != 0x5a5a5a5a ||
-	    memcmp((char *)ptr + ap->size, &check, chksize)) {
-		unsigned long buf = 0x5a5a5a5a5a5a5a5aUL;
-		unsigned char *p;
-		unsigned char *q;
-		memcpy(&buf, (char *)ptr + ap->size, chksize);
-		p = (unsigned char *)&(next->check);
-		q = (unsigned char *)&buf;
-		if (file)
-			kprintf("%s: broken, %s:%d alc=%s:%d %02x%02x%02x%02x%02x%02x%02x%02x %02x%02x%02x%02x size=%ld\n", msg, file, line, ap->loc->file, ap->loc->line, q[0], q[1], q[2], q[3], q[4], q[5], q[6], q[7], p[0], p[1], p[2], p[3], ap->size);
-		else
-			kprintf("%s: broken, alc=%s:%d %02x%02x%02x%02x%02x%02x%02x%02x %02x%02x%02x%02x size=%ld\n", msg, ap->loc->file, ap->loc->line, q[0], q[1], q[2], q[3], q[4], q[5], q[6], q[7], p[0], p[1], p[2], p[3], ap->size);
-		if (next->check != 0x5a5a5a5a)
-			kprintf("next->HEADER: next=%p size=%ld cpu_id=%d\n", next->next, next->size, next->cpu_id);
-		return 1;
-	}
-	if(flags & 1){
-		ap->p = NULL;
-		ap->loc = NULL;
-		ap->size = 0;
-	}
-	return 0;
-}
-int memcheckall()
-{
-	int i;
-	struct alloc *ap;
-	int r = 0;
-	for(i = 0; i < HASHNUM; i++)
-		for(ap = allochash[i]; ap; ap = ap->next)
-			if(ap->p)
-				r |= _memcheck(ap->p + 1, "memcheck", NULL, 0, 2);
-	return r;
-}
-int freecheck(int runcount)
-{
-	int i;
-	struct alloc *ap;
-	struct location *lp;
-	int r = 0;
-	for (i = 0; i < HASHNUM; i++)
-		for (lp = lochash[i]; lp; lp = lp->next)
-			lp->cnt = 0;
-	for (i = 0; i < HASHNUM; i++)
-		for (ap = allochash[i]; ap; ap = ap->next)
-			if (ap->p && ap->runcount == runcount) {
-				ap->loc->cnt++;
-				r++;
-			}
-	if (r) {
-		kprintf("memory leak?\n");
-		for (i = 0; i < HASHNUM; i++)
-			for (lp = lochash[i]; lp; lp = lp->next)
-				if (lp->cnt)
-					kprintf(" alc=%s:%d cnt=%d\n", lp->file, lp->line, lp->cnt);
-	}
-	return r;
-}
+	hash = (strlen(file) + line + size) & KMALLOC_TRACK_HASH_MASK;
+	irqflags = ihk_mc_spinlock_lock(&kmalloc_track_hash_locks[hash]);
+	entry = __kmalloc_track_find_entry(size, file, line);
+	if (!entry) {
+		entry = ___kmalloc(sizeof(*entry), IHK_MC_AP_NOWAIT);
+		if (!entry) {
+			kprintf("%s: ERROR: allocating tracking entry\n");
+			goto out;
+		}
+		entry->line = line;
+		entry->size = size;
+		ihk_atomic_set(&entry->alloc_count, 0);
+		ihk_mc_spinlock_init(&entry->addr_list_lock);
+		INIT_LIST_HEAD(&entry->addr_list);
+		entry->file = ___kmalloc(strlen(file) + 1, IHK_MC_AP_NOWAIT);
+		if (!entry->file) {
+			kprintf("%s: ERROR: allocating file string\n");
+			___kfree(entry);
+			ihk_mc_spinlock_unlock(&kmalloc_track_hash_locks[hash], irqflags);
+			goto out;
+		}
+		strcpy(entry->file, file);
+		entry->file[strlen(file)] = 0;
+		list_add(&entry->hash, &kmalloc_track_hash[hash]);
+		dkprintf("%s entry %s:%d size: %d added\n", __FUNCTION__,
+			file, line, size);
+	}
+	ihk_mc_spinlock_unlock(&kmalloc_track_hash_locks[hash], irqflags);
+	ihk_atomic_inc(&entry->alloc_count);
+	/* Add new addr entry for this allocation entry */
+	addr_entry = ___kmalloc(sizeof(*addr_entry), IHK_MC_AP_NOWAIT);
+	if (!addr_entry) {
+		kprintf("%s: ERROR: allocating addr entry\n");
+		goto out;
+	}
+	addr_entry->addr = r;
+	addr_entry->runcount = kmalloc_runcount;
+	addr_entry->entry = entry;
+	irqflags = ihk_mc_spinlock_lock(&entry->addr_list_lock);
+	list_add(&addr_entry->list, &entry->addr_list);
+	ihk_mc_spinlock_unlock(&entry->addr_list_lock, irqflags);
+	/* Add addr entry to address hash */
+	addr_hash = ((unsigned long)r >> 5) & KMALLOC_TRACK_HASH_MASK;
+	irqflags = ihk_mc_spinlock_lock(&kmalloc_addr_hash_locks[addr_hash]);
+	list_add(&addr_entry->hash, &kmalloc_addr_hash[addr_hash]);
+	ihk_mc_spinlock_unlock(&kmalloc_addr_hash_locks[addr_hash], irqflags);
+	dkprintf("%s addr_entry %p added\n", __FUNCTION__, r);
+out:
+	return r;
+}
 void _kfree(void *ptr, char *file, int line)
 {
-	if (memdebug)
-		_memcheck(ptr, "KFREE", file, line, 1);
+	unsigned long irqflags;
+	struct kmalloc_track_entry *entry;
+	struct kmalloc_track_addr_entry *addr_entry_iter, *addr_entry = NULL;
+	int hash;
+	if (!memdebug) {
+		goto out;
+	}
+	hash = ((unsigned long)ptr >> 5) & KMALLOC_TRACK_HASH_MASK;
+	irqflags = ihk_mc_spinlock_lock(&kmalloc_addr_hash_locks[hash]);
+	list_for_each_entry(addr_entry_iter,
+			&kmalloc_addr_hash[hash], hash) {
+		if (addr_entry_iter->addr == ptr) {
+			addr_entry = addr_entry_iter;
+			break;
+		}
+	}
+	if (addr_entry) {
+		list_del(&addr_entry->hash);
+	}
+	ihk_mc_spinlock_unlock(&kmalloc_addr_hash_locks[hash], irqflags);
+	if (!addr_entry) {
+		kprintf("%s: ERROR: kfree()ing invalid pointer\n", __FUNCTION__);
+		panic("panic");
+	}
+	entry = addr_entry->entry;
+	irqflags = ihk_mc_spinlock_lock(&entry->addr_list_lock);
+	list_del(&addr_entry->list);
+	ihk_mc_spinlock_unlock(&entry->addr_list_lock, irqflags);
+	dkprintf("%s addr_entry %p removed\n", __FUNCTION__, addr_entry->addr);
+	___kfree(addr_entry);
+	/* Do we need to remove tracking entry as well? */
+	if (!ihk_atomic_dec_and_test(&entry->alloc_count)) {
+		goto out;
+	}
+	hash = (strlen(entry->file) + entry->line + entry->size) &
+		KMALLOC_TRACK_HASH_MASK;
+	irqflags = ihk_mc_spinlock_lock(&kmalloc_track_hash_locks[hash]);
+	list_del(&entry->hash);
+	ihk_mc_spinlock_unlock(&kmalloc_track_hash_locks[hash], irqflags);
+	dkprintf("%s entry %s:%d size: %d removed\n", __FUNCTION__,
+		entry->file, entry->line, entry->size);
+	___kfree(entry->file);
+	___kfree(entry);
+out:
 	___kfree(ptr);
 }
+void kmalloc_memcheck(void)
+{
+	int i;
+	unsigned long irqflags;
+	struct kmalloc_track_entry *entry = NULL;
+	for (i = 0; i < KMALLOC_TRACK_HASH_SIZE; ++i) {
+		irqflags = ihk_mc_spinlock_lock(&kmalloc_track_hash_locks[i]);
+		list_for_each_entry(entry, &kmalloc_track_hash[i], hash) {
+			struct kmalloc_track_addr_entry *addr_entry = NULL;
+			int cnt = 0;
+			ihk_mc_spinlock_lock_noirq(&entry->addr_list_lock);
+			list_for_each_entry(addr_entry, &entry->addr_list, list) {
+				dkprintf("%s memory leak: %p @ %s:%d size: %d runcount: %d\n",
+					__FUNCTION__,
+					addr_entry->addr,
+					entry->file,
+					entry->line,
+					entry->size,
+					addr_entry->runcount);
+				if (kmalloc_runcount != addr_entry->runcount)
+					continue;
+				cnt++;
+			}
+			ihk_mc_spinlock_unlock_noirq(&entry->addr_list_lock);
+			if (!cnt)
+				continue;
+			kprintf("%s memory leak: %s:%d size: %d cnt: %d, runcount: %d\n",
+				__FUNCTION__,
+				entry->file,
+				entry->line,
+				entry->size,
+				cnt,
+				kmalloc_runcount);
+		}
+		ihk_mc_spinlock_unlock(&kmalloc_track_hash_locks[i], irqflags);
+	}
+	++kmalloc_runcount;
+}
+/* Redirection routines registered in alloc structure */
 void *__kmalloc(int size, enum ihk_mc_ap_flag flag)
 {
 	return kmalloc(size, flag);
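
The rewritten tracker replaces the 129-bucket CAS chains with two 256-bucket hashes: one keyed by allocation site (strlen(file) + line + size) holding a refcounted entry per call site, and one keyed by address (ptr >> 5) holding an entry per live allocation, so _kfree() can find its record in one chain walk and now panics on an unknown pointer. The two keys, extracted into a standalone sketch:

#include <string.h>

#define KMALLOC_TRACK_HASH_SHIFT 8
#define KMALLOC_TRACK_HASH_SIZE (1 << KMALLOC_TRACK_HASH_SHIFT)
#define KMALLOC_TRACK_HASH_MASK (KMALLOC_TRACK_HASH_SIZE - 1)

/* Call-site bucket: same file/line/size always lands in one chain */
static int site_hash(const char *file, int line, int size)
{
	return (int)((strlen(file) + line + size) & KMALLOC_TRACK_HASH_MASK);
}

/* Address bucket: >> 5 discards the 32-byte header granularity */
static int addr_hash(const void *ptr)
{
	return (int)(((unsigned long)ptr >> 5) & KMALLOC_TRACK_HASH_MASK);
}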
@@ -878,160 +932,199 @@ void __kfree(void *ptr)
 	kfree(ptr);
 }
-void kmalloc_init(void)
-{
-	struct cpu_local_var *v = get_this_cpu_local_var();
-	struct malloc_header *h = &v->free_list;
-	int i;
-	h->check = 0x5a5a5a5a;
-	h->next = &v->free_list;
-	h->size = 0;
-	register_kmalloc();
-	memdebug = find_command_line("memdebug");
-	for (i = 0; i < HASHNUM; i++) {
-		allochash[i] = NULL;
-		lochash[i] = NULL;
-	}
-	page = allocate_pages(16, IHK_MC_AP_NOWAIT);
-	space = 16 * 4096;
-	ihk_mc_spinlock_init(&alloclock);
-}
-void ____kfree(struct cpu_local_var *v, struct malloc_header *p)
-{
-	struct malloc_header *h = &v->free_list;
-	int combined = 0;
-	h = h->next;
-	while ((p < h || p > h->next) && h != &v->free_list) {
-		h = h->next;
-	}
-	if (h + h->size + 1 == p && h->size != 0) {
-		combined = 1;
-		h->size += p->size + 1;
-		h->check = 0x5a5a5a5a;
-	}
-	if (h->next == p + p->size + 1 && h->next->size != 0) {
-		if (combined) {
-			h->check = 0x5a5a5a5a;
-			h->size += h->next->size + 1;
-			h->next = h->next->next;
-		} else {
-			p->check = 0x5a5a5a5a;
-			p->size += h->next->size + 1;
-			p->next = h->next->next;
-			h->next = p;
-		}
-	} else if (!combined) {
-		p->next = h->next;
-		h->next = p;
-	}
-}
-void *___kmalloc(int size, enum ihk_mc_ap_flag flag)
-{
-	struct cpu_local_var *v = get_this_cpu_local_var();
-	struct malloc_header *h = &v->free_list, *prev, *p;
-	int u, req_page;
-	p = (struct malloc_header *)xchg8((unsigned long *)&v->remote_free_list, 0L);
-	while(p){
-		struct malloc_header *n = p->next;
-		____kfree(v, p);
-		p = n;
-	}
-	if (size >= PAGE_SIZE * 4) {
-		return NULL;
-	}
-	u = (size + sizeof(*h) - 1) / sizeof(*h);
-	prev = h;
-	h = h->next;
-	while (1) {
-		if (h == &v->free_list) {
-			req_page = ((u + 2) * sizeof(*h) + PAGE_SIZE - 1)
-				>> PAGE_SHIFT;
-			h = allocate_pages(req_page, flag);
-			if(h == NULL) {
-				kprintf("kmalloc(%#x,%#x): out of memory\n", size, flag);
-				return NULL;
-			}
-			h->check = 0x5a5a5a5a;
-			prev->next = h;
-			h->size = (req_page * PAGE_SIZE) / sizeof(*h) - 2;
-			/* Guard entry */
-			p = h + h->size + 1;
-			p->check = 0x5a5a5a5a;
-			p->next = &v->free_list;
-			p->size = 0;
-			h->next = p;
-		}
-		if (h->size >= u) {
-			if (h->size == u || h->size == u + 1) {
-				prev->next = h->next;
-				h->cpu_id = ihk_mc_get_processor_id();
-				return h + 1;
-			} else { /* Divide */
-				h->size -= u + 1;
-				p = h + h->size + 1;
-				p->check = 0x5a5a5a5a;
-				p->size = u;
-				p->cpu_id = ihk_mc_get_processor_id();
-				return p + 1;
-			}
-		}
-		prev = h;
-		h = h->next;
-	}
-}
-void ___kfree(void *ptr)
-{
-	struct malloc_header *p = (struct malloc_header *)ptr;
-	struct cpu_local_var *v = get_cpu_local_var((--p)->cpu_id);
-	if(p->cpu_id == ihk_mc_get_processor_id()){
-		____kfree(v, p);
-	}
-	else{
-		unsigned long oldval;
-		unsigned long newval;
-		unsigned long rval;
-		do{
-			p->next = v->remote_free_list;
-			oldval = (unsigned long)p->next;
-			newval = (unsigned long)p;
-			rval = atomic_cmpxchg8(
-				(unsigned long *)&v->remote_free_list,
-				oldval, newval);
-		}while(rval != oldval);
-	}
-}
-void print_free_list(void)
-{
-	struct cpu_local_var *v = get_this_cpu_local_var();
-	struct malloc_header *h = &v->free_list;
-	h = h->next;
-	kprintf("free_list : \n");
-	while (h != &v->free_list) {
-		kprintf(" %p : %p, %d ->\n", h, h->next, h->size);
-		h = h->next;
-	}
-	kprintf("\n");
-}
+static void ___kmalloc_insert_chunk(struct list_head *free_list,
+		struct kmalloc_header *chunk)
+{
+	struct kmalloc_header *chunk_iter, *next_chunk = NULL;
+	/* Find out where to insert */
+	list_for_each_entry(chunk_iter, free_list, list) {
+		if ((void *)chunk < (void *)chunk_iter) {
+			next_chunk = chunk_iter;
+			break;
+		}
+	}
+	/* Add in front of next */
+	if (next_chunk) {
+		list_add_tail(&chunk->list, &next_chunk->list);
+	}
+	/* Add after the head */
+	else {
+		list_add(&chunk->list, free_list);
+	}
+	return;
+}
+static void ___kmalloc_init_chunk(struct kmalloc_header *h, int size)
+{
+	h->size = size;
+	h->front_magic = 0x5c5c5c5c;
+	h->end_magic = 0x6d6d6d6d;
+	h->cpu_id = ihk_mc_get_processor_id();
+}
+static void ___kmalloc_consolidate_list(struct list_head *list)
+{
+	struct kmalloc_header *chunk_iter, *chunk, *next_chunk;
+reiterate:
+	chunk_iter = NULL;
+	chunk = NULL;
+	list_for_each_entry(next_chunk, list, list) {
+		if (chunk_iter && (((void *)chunk_iter + sizeof(struct kmalloc_header)
+					+ chunk_iter->size) == (void *)next_chunk)) {
+			chunk = chunk_iter;
+			break;
+		}
+		chunk_iter = next_chunk;
+	}
+	if (!chunk) {
+		return;
+	}
+	chunk->size += (next_chunk->size + sizeof(struct kmalloc_header));
+	list_del(&next_chunk->list);
+	goto reiterate;
+}
+void kmalloc_consolidate_free_list(void)
+{
+	struct kmalloc_header *chunk, *tmp;
+	unsigned long irqflags =
+		ihk_mc_spinlock_lock(&cpu_local_var(remote_free_list_lock));
+	/* Clean up remotely deallocated chunks */
+	list_for_each_entry_safe(chunk, tmp,
+			&cpu_local_var(remote_free_list), list) {
+		list_del(&chunk->list);
+		___kmalloc_insert_chunk(&cpu_local_var(free_list), chunk);
+	}
+	/* Free list lock ensures IRQs are disabled */
+	___kmalloc_consolidate_list(&cpu_local_var(free_list));
+	ihk_mc_spinlock_unlock(&cpu_local_var(remote_free_list_lock), irqflags);
+}
+#define KMALLOC_MIN_SHIFT	(5)
+#define KMALLOC_MIN_SIZE	(1 << KMALLOC_TRACK_HASH_SHIFT)
+#define KMALLOC_MIN_MASK	(KMALLOC_MIN_SIZE - 1)
+/* Actual low-level allocation routines */
+static void *___kmalloc(int size, enum ihk_mc_ap_flag flag)
+{
+	struct kmalloc_header *chunk_iter;
+	struct kmalloc_header *chunk = NULL;
+	int npages;
+	unsigned long kmalloc_irq_flags = cpu_disable_interrupt_save();
+	/* KMALLOC_MIN_SIZE bytes aligned size. */
+	if (size & KMALLOC_MIN_MASK) {
+		size = ((size + KMALLOC_MIN_SIZE - 1) & ~(KMALLOC_MIN_MASK));
+	}
+	chunk = NULL;
+	/* Find a chunk that is big enough */
+	list_for_each_entry(chunk_iter, &cpu_local_var(free_list), list) {
+		if (chunk_iter->size >= size) {
+			chunk = chunk_iter;
+			break;
+		}
+	}
+split_and_return:
+	/* Did we find one? */
+	if (chunk) {
+		/* Do we need to split it? Only if there is enough space for
+		 * another header and some actual content */
+		if (chunk->size > (size + sizeof(struct kmalloc_header))) {
+			struct kmalloc_header *leftover;
+			leftover = (struct kmalloc_header *)
+				((void *)chunk + sizeof(struct kmalloc_header) + size);
+			___kmalloc_init_chunk(leftover,
+				(chunk->size - size - sizeof(struct kmalloc_header)));
+			list_add(&leftover->list, &chunk->list);
+			chunk->size = size;
+		}
+		list_del(&chunk->list);
+		cpu_restore_interrupt(kmalloc_irq_flags);
+		return ((void *)chunk + sizeof(struct kmalloc_header));
+	}
+	/* Allocate new memory and add it to free list */
+	npages = (size + sizeof(struct kmalloc_header) + (PAGE_SIZE - 1))
+		>> PAGE_SHIFT;
+	chunk = ihk_mc_alloc_pages(npages, flag);
+	if (!chunk) {
+		cpu_restore_interrupt(kmalloc_irq_flags);
+		return NULL;
+	}
+	___kmalloc_init_chunk(chunk,
+		(npages * PAGE_SIZE - sizeof(struct kmalloc_header)));
+	___kmalloc_insert_chunk(&cpu_local_var(free_list), chunk);
+	goto split_and_return;
+}
+static void ___kfree(void *ptr)
+{
+	struct kmalloc_header *chunk =
+		(struct kmalloc_header*)(ptr - sizeof(struct kmalloc_header));
+	unsigned long kmalloc_irq_flags = cpu_disable_interrupt_save();
+	/* Sanity check */
+	if (chunk->front_magic != 0x5c5c5c5c || chunk->end_magic != 0x6d6d6d6d) {
+		kprintf("%s: memory corruption at address 0x%p\n", __FUNCTION__, ptr);
+		panic("panic");
+	}
+	/* Does this chunk belong to this CPU? */
+	if (chunk->cpu_id == ihk_mc_get_processor_id()) {
+		___kmalloc_insert_chunk(&cpu_local_var(free_list), chunk);
+		___kmalloc_consolidate_list(&cpu_local_var(free_list));
+	}
+	else {
+		struct cpu_local_var *v = get_cpu_local_var(chunk->cpu_id);
+		unsigned long irqflags;
+		irqflags = ihk_mc_spinlock_lock(&v->remote_free_list_lock);
+		list_add(&chunk->list, &v->remote_free_list);
+		ihk_mc_spinlock_unlock(&v->remote_free_list_lock, irqflags);
+	}
+	cpu_restore_interrupt(kmalloc_irq_flags);
+}
+void ___kmalloc_print_free_list(struct list_head *list)
+{
+	struct kmalloc_header *chunk_iter;
+	unsigned long irqflags = kprintf_lock();
+	__kprintf("%s: [ \n", __FUNCTION__);
+	list_for_each_entry(chunk_iter, &cpu_local_var(free_list), list) {
+		__kprintf("%s: 0x%lx:%d (VA PFN: %lu, off: %lu)\n", __FUNCTION__,
+			(unsigned long)chunk_iter,
+			chunk_iter->size,
+			(unsigned long)chunk_iter >> PAGE_SHIFT,
+			(unsigned long)chunk_iter % PAGE_SIZE);
+	}
+	__kprintf("%s: ] \n", __FUNCTION__);
+	kprintf_unlock(irqflags);
+}
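
The new low-level allocator keeps a per-CPU, address-ordered free list: ___kmalloc() takes the first chunk that fits and splits off the tail when a header plus some payload still fit, ___kfree() reinserts in address order so ___kmalloc_consolidate_list() can merge physically adjacent chunks, and cross-CPU frees are parked on the owner's remote_free_list for later folding. A self-contained user-space model of the split step (an 8-byte header stands in for the real 32-byte one):

#include <stdio.h>
#include <stdlib.h>

struct chunk { size_t size; };	/* stand-in for kmalloc_header */

#define HDR sizeof(struct chunk)

/* Split `c` so the first part holds `want` bytes; returns the leftover
 * chunk, or NULL when the remainder could not hold a header + data. */
static struct chunk *split(struct chunk *c, size_t want)
{
	struct chunk *left;

	if (c->size <= want + HDR)
		return NULL;
	left = (struct chunk *)((char *)(c + 1) + want);
	left->size = c->size - want - HDR;
	c->size = want;
	return left;
}

int main(void)
{
	struct chunk *c = malloc(HDR + 1024);
	struct chunk *rest;

	c->size = 1024;
	rest = split(c, 256);
	printf("first: %zu, leftover: %zu\n", c->size, rest ? rest->size : 0);
	free(c);
	return 0;
}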

View File

@@ -53,7 +53,6 @@ static int copy_user_ranges(struct process_vm *vm, struct process_vm *orgvm);
 extern void release_fp_regs(struct thread *proc);
 extern void save_fp_regs(struct thread *proc);
 extern void restore_fp_regs(struct thread *proc);
-void settid(struct thread *proc, int mode, int newcpuid, int oldcpuid);
 extern void __runq_add_proc(struct thread *proc, int cpu_id);
 extern void terminate_host(int pid);
 extern void lapic_timer_enable(unsigned int clocks);
@@ -387,6 +386,8 @@ clone_thread(struct thread *org, unsigned long pc, unsigned long sp,
 		goto err_free_proc;
 	}
+	thread->vm->vdso_addr = org->vm->vdso_addr;
+	thread->vm->vvar_addr = org->vm->vvar_addr;
 	thread->proc->maxrss = org->proc->maxrss;
 	thread->vm->currss = org->vm->currss;
@@ -743,7 +744,7 @@ int join_process_memory_range(struct process_vm *vm,
 		memobj_release(merging->memobj);
 	}
 	list_del(&merging->list);
-	ihk_mc_free(merging);
+	kfree(merging);
 	error = 0;
 out:
@@ -839,8 +840,9 @@ int free_process_memory_range(struct process_vm *vm, struct vm_range *range)
 	if (range->memobj) {
 		memobj_release(range->memobj);
 	}
 	list_del(&range->list);
-	ihk_mc_free(range);
+	kfree(range);
 	dkprintf("free_process_memory_range(%p,%lx-%lx): 0\n",
 		vm, start0, end0);
@@ -966,7 +968,6 @@ enum ihk_mc_pt_attribute common_vrflag_to_ptattr(unsigned long flag, uint64_t fa
 	return attr;
 }
-/* XXX: indentation needs to be aligned */
 int add_process_memory_range(struct process_vm *vm,
 		unsigned long start, unsigned long end,
 		unsigned long phys, unsigned long flag,
@@ -1537,6 +1538,8 @@ retry:
 		kprintf("page_fault_process_memory_range(%p,%lx-%lx %lx,%lx,%lx):cannot allocate new page. %d\n", vm, range->start, range->end, range->flag, fault_addr, reason, error);
 		goto out;
 	}
+	dkprintf("%s: clearing 0x%lx:%lu\n",
+		__FUNCTION__, pgaddr, pgsize);
 	memset(virt, 0, pgsize);
 	phys = virt_to_phys(virt);
 	page_map(phys_to_page(phys));
@@ -1569,6 +1572,8 @@ retry:
 		kprintf("page_fault_process_memory_range(%p,%lx-%lx %lx,%lx,%lx):cannot allocate copy page. %d\n", vm, range->start, range->end, range->flag, fault_addr, reason, error);
 		goto out;
 	}
+	dkprintf("%s: copying 0x%lx:%lu\n",
+		__FUNCTION__, pgaddr, pgsize);
 	memcpy(virt, phys_to_virt(phys), pgsize);
 	phys = virt_to_phys(virt);
@@ -1649,7 +1654,7 @@ static int do_page_fault_process_vm(struct process_vm *vm, void *fault_addr0, ui
 		"access denied. %d\n",
 		ihk_mc_get_processor_id(), vm,
 		fault_addr0, reason, error);
-	kprintf("%s: reason: %s%s%s%s%s%s%s%s\n", __FUNCTION__,
+	kprintf("%s: reason: %s%s%s%s%s%s%s\n", __FUNCTION__,
 		(reason & PF_PROT) ? "PF_PROT " : "",
 		(reason & PF_WRITE) ? "PF_WRITE " : "",
 		(reason & PF_USER) ? "PF_USER " : "",
@@ -1888,14 +1893,14 @@ unsigned long extend_process_region(struct process_vm *vm,
 	aligned_end = (aligned_end + (LARGE_PAGE_SIZE - 1)) & LARGE_PAGE_MASK;
 	/* Fill in the gap between old_aligned_end and aligned_end
 	 * with regular pages */
-	if((p = allocate_pages((aligned_end - old_aligned_end) >> PAGE_SHIFT,
+	if((p = ihk_mc_alloc_pages((aligned_end - old_aligned_end) >> PAGE_SHIFT,
 		IHK_MC_AP_NOWAIT)) == NULL){
 		return end;
 	}
 	if((rc = add_process_memory_range(vm, old_aligned_end,
 		aligned_end, virt_to_phys(p), flag,
 		LARGE_PAGE_SHIFT)) != 0){
-		free_pages(p, (aligned_end - old_aligned_end) >> PAGE_SHIFT);
+		ihk_mc_free_pages(p, (aligned_end - old_aligned_end) >> PAGE_SHIFT);
 		return end;
 	}
@@ -1908,7 +1913,7 @@ unsigned long extend_process_region(struct process_vm *vm,
 		(LARGE_PAGE_SIZE - 1)) & LARGE_PAGE_MASK;
 	address = aligned_new_end;
-	if((p = allocate_pages((aligned_new_end - aligned_end + LARGE_PAGE_SIZE) >> PAGE_SHIFT,
+	if((p = ihk_mc_alloc_pages((aligned_new_end - aligned_end + LARGE_PAGE_SIZE) >> PAGE_SHIFT,
 		IHK_MC_AP_NOWAIT)) == NULL){
 		return end;
 	}
@@ -1916,16 +1921,16 @@ unsigned long extend_process_region(struct process_vm *vm,
 	p_aligned = ((unsigned long)p + (LARGE_PAGE_SIZE - 1)) & LARGE_PAGE_MASK;
 	if (p_aligned > (unsigned long)p) {
-		free_pages(p, (p_aligned - (unsigned long)p) >> PAGE_SHIFT);
+		ihk_mc_free_pages(p, (p_aligned - (unsigned long)p) >> PAGE_SHIFT);
 	}
-	free_pages(
+	ihk_mc_free_pages(
 		(void *)(p_aligned + aligned_new_end - aligned_end),
 		(LARGE_PAGE_SIZE - (p_aligned - (unsigned long)p)) >> PAGE_SHIFT);
 	if((rc = add_process_memory_range(vm, aligned_end,
 		aligned_new_end, virt_to_phys((void *)p_aligned),
 		flag, LARGE_PAGE_SHIFT)) != 0){
-		free_pages(p, (aligned_new_end - aligned_end + LARGE_PAGE_SIZE) >> PAGE_SHIFT);
+		ihk_mc_free_pages(p, (aligned_new_end - aligned_end + LARGE_PAGE_SIZE) >> PAGE_SHIFT);
 		return end;
 	}
@@ -1943,7 +1948,7 @@ unsigned long extend_process_region(struct process_vm *vm,
 		p=0;
 	}else{
-		p = allocate_pages((aligned_new_end - aligned_end) >> PAGE_SHIFT, IHK_MC_AP_NOWAIT);
+		p = ihk_mc_alloc_pages((aligned_new_end - aligned_end) >> PAGE_SHIFT, IHK_MC_AP_NOWAIT);
 		if (!p) {
 			return end;
@@ -1952,7 +1957,7 @@ unsigned long extend_process_region(struct process_vm *vm,
 	if((rc = add_process_memory_range(vm, aligned_end, aligned_new_end,
 		(p==0?0:virt_to_phys(p)), flag, NULL, 0,
 		PAGE_SHIFT)) != 0){
-		free_pages(p, (aligned_new_end - aligned_end) >> PAGE_SHIFT);
+		ihk_mc_free_pages(p, (aligned_new_end - aligned_end) >> PAGE_SHIFT);
 		return end;
 	}
@@ -2065,6 +2070,7 @@ release_process(struct process *proc)
 		mcs_rwlock_writer_unlock(&parent->children_lock, &lock);
 	}
+	if (proc->tids) kfree(proc->tids);
 	kfree(proc);
 }
@@ -2170,6 +2176,23 @@ release_sigcommon(struct sig_common *sigcommon)
 	kfree(sigcommon);
 }
+/*
+ * Release the TID from the process' TID set corresponding to this thread.
+ * NOTE: threads_lock must be held.
+ */
+void __release_tid(struct process *proc, struct thread *thread) {
+	int i;
+	for (i = 0; i < proc->nr_tids; ++i) {
+		if (proc->tids[i].thread != thread) continue;
+		proc->tids[i].thread = NULL;
+		dkprintf("%s: tid %d has been released by %p\n",
+			__FUNCTION__, thread->tid, thread);
+		break;
+	}
+}
 void destroy_thread(struct thread *thread)
 {
 	struct sig_pending *pending;
@@ -2186,6 +2209,7 @@ void destroy_thread(struct thread *thread)
 	mcs_rwlock_writer_lock(&proc->threads_lock, &lock);
 	list_del(&thread->siblings_list);
+	__release_tid(proc, thread);
 	mcs_rwlock_writer_unlock(&proc->threads_lock, &lock);
 	cpu_clear(thread->cpu_id, &thread->vm->address_space->cpu_set,
@@ -2323,6 +2347,8 @@ static void idle(void)
 	}
 	if (v->status == CPU_STATUS_IDLE ||
 	    v->status == CPU_STATUS_RESERVED) {
+		/* No work to do? Consolidate the kmalloc free list */
+		kmalloc_consolidate_free_list();
 		cpu_safe_halt();
 	}
 	else {
@@ -2525,7 +2551,7 @@ static void do_migrate(void)
 		v->flags |= CPU_FLAG_NEED_RESCHED;
 		ihk_mc_interrupt_cpu(get_x86_cpu_local_variable(cpu_id)->apic_id, 0xd1);
 		double_rq_unlock(cur_v, v, irqstate);
-		settid(req->thread, 2, cpu_id, old_cpu_id);
+		//settid(req->thread, 2, cpu_id, old_cpu_id, 0, NULL);
ack:
 		waitq_wakeup(&req->wq);
@@ -2561,13 +2587,8 @@ void schedule(void)
 	struct thread *last;
 	if (cpu_local_var(no_preempt)) {
-		dkprintf("no schedule() while no preemption! \n");
-		return;
-	}
-	if (cpu_local_var(current)
-	    && cpu_local_var(current)->in_syscall_offload) {
-		dkprintf("no schedule() while syscall offload!\n");
+		kprintf("%s: WARNING can't schedule() while no preemption, cnt: %d\n",
+			__FUNCTION__, cpu_local_var(no_preempt));
 		return;
 	}
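
destroy_thread() hands the slot back via __release_tid() while threads_lock is still held, and release_process() frees the array itself; the idle loop additionally consolidates the kmalloc free list whenever the core has nothing to run. A tiny user-space test of the same release walk (mirrors __release_tid() above, locking omitted):

#include <assert.h>
#include <stddef.h>

struct thread { int tid; };

struct mcexec_tid {
	int tid;
	struct thread *thread;
};

/* Same walk as __release_tid(): clear the slot bound to this thread */
static void release_tid(struct mcexec_tid *tids, int nr, struct thread *t)
{
	int i;

	for (i = 0; i < nr; ++i) {
		if (tids[i].thread != t)
			continue;
		tids[i].thread = NULL;
		break;
	}
}

int main(void)
{
	struct thread t = { 42 };
	struct mcexec_tid tids[2] = { { 10, NULL }, { 11, &t } };

	release_tid(tids, 2, &t);
	assert(tids[1].thread == NULL);
	return 0;
}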

View File

@@ -127,11 +127,9 @@ int prepare_process_ranges_args_envs(struct thread *thread,
 static void do_mod_exit(int status);
 #endif
-static void send_syscall(struct syscall_request *req, int cpu, int pid)
+static void send_syscall(struct syscall_request *req, int cpu, int pid, struct syscall_response *res)
 {
-	struct ikc_scd_packet packet;
-	struct syscall_response *res;
-	struct syscall_params *scp;
+	struct ikc_scd_packet packet IHK_DMA_ALIGN;
 	struct ihk_ikc_channel_desc *syscall_channel;
 	int ret;
@@ -140,7 +138,6 @@ static void send_syscall(struct syscall_request *req, int cpu, int pid)
 	   req->number == __NR_kill){ // interrupt syscall
 		extern int num_processors;
-		scp = &get_cpu_local_var(0)->scp2;
 		syscall_channel = get_cpu_local_var(0)->syscall_channel2;
 		/* XXX: is this really going to work if multiple processes
@@ -152,34 +149,22 @@ static void send_syscall(struct syscall_request *req, int cpu, int pid)
 		pid = req->args[1];
 	}
 	else{
-		scp = &get_cpu_local_var(cpu)->scp;
 		syscall_channel = get_cpu_local_var(cpu)->syscall_channel;
 	}
-	res = scp->response_va;
 	res->status = 0;
 	req->valid = 0;
-#ifdef USE_DMA
-	memcpy_async(scp->request_pa,
-		virt_to_phys(req), sizeof(*req), 0, &fin);
-	memcpy_async_wait(&scp->post_fin);
-	scp->post_va->v[0] = scp->post_idx;
-	memcpy_async_wait(&fin);
-#else
-	memcpy(scp->request_va, req, sizeof(*req));
-#endif
+	memcpy(&packet.req, req, sizeof(*req));
 	barrier();
-	scp->request_va->valid = 1;
-	*(unsigned int *)scp->doorbell_va = cpu + 1;
+	packet.req.valid = 1;
 #ifdef SYSCALL_BY_IKC
 	packet.msg = SCD_MSG_SYSCALL_ONESIDE;
 	packet.ref = cpu;
 	packet.pid = pid ? pid : cpu_local_var(current)->proc->pid;
-	packet.arg = scp->request_rpa;
+	packet.resp_pa = virt_to_phys(res);
 	dkprintf("send syscall, nr: %d, pid: %d\n", req->number, packet.pid);
 	ret = ihk_ikc_send(syscall_channel, &packet, 0);
@@ -193,9 +178,8 @@ ihk_spinlock_t syscall_lock;
 long do_syscall(struct syscall_request *req, int cpu, int pid)
 {
-	struct syscall_response *res;
+	struct syscall_response res;
 	struct syscall_request req2 IHK_DMA_ALIGN;
-	struct syscall_params *scp;
 	int error;
 	long rc;
 	int islock = 0;
@@ -206,6 +190,9 @@ long do_syscall(struct syscall_request *req, int cpu, int pid)
 	dkprintf("SC(%d)[%3d] sending syscall\n",
 		ihk_mc_get_processor_id(),
 		req->number);
+	irqstate = 0; /* for avoidance of warning */
+	barrier();
 	if(req->number != __NR_exit_group){
 		if(proc->nohost && // host is down
@@ -215,20 +202,18 @@ long do_syscall(struct syscall_request *req, int cpu, int pid)
 		++thread->in_syscall_offload;
 	}
-	irqstate = 0; /* for avoidance of warning */
 	if(req->number == __NR_exit_group ||
 	   req->number == __NR_gettid ||
 	   req->number == __NR_kill){ // interrupt syscall
-		scp = &get_cpu_local_var(0)->scp2;
 		islock = 1;
 		irqstate = ihk_mc_spinlock_lock(&syscall_lock);
 	}
-	else{
-		scp = &get_cpu_local_var(cpu)->scp;
-	}
-	res = scp->response_va;
-	send_syscall(req, cpu, pid);
+	/* The current thread is the requester and any thread from
+	 * the pool may serve the request */
+	req->rtid = cpu_local_var(current)->tid;
+	req->ttid = 0;
+	res.req_thread_status = IHK_SCD_REQ_THREAD_SPINNING;
+	send_syscall(req, cpu, pid, &res);
 	dkprintf("%s: syscall num: %d waiting for Linux.. \n",
 		__FUNCTION__, req->number);
@@ -236,60 +221,83 @@ long do_syscall(struct syscall_request *req, int cpu, int pid)
 #define STATUS_IN_PROGRESS	0
 #define STATUS_COMPLETED	1
 #define STATUS_PAGE_FAULT	3
-	while (res->status != STATUS_COMPLETED) {
-		while (res->status == STATUS_IN_PROGRESS) {
+	while (res.status != STATUS_COMPLETED) {
+		while (res.status == STATUS_IN_PROGRESS) {
 			struct cpu_local_var *v;
-			int call_schedule = 0;
+			int do_schedule = 0;
 			long runq_irqstate;
+			unsigned long flags;
+			DECLARE_WAITQ_ENTRY(scd_wq_entry, cpu_local_var(current));
 			cpu_pause();
-			/* XXX: Intel MPI + Intel OpenMP situation:
-			 * While the MPI helper thread waits in a poll() call the OpenMP master
-			 * thread is iterating through the CPU cores using setaffinity().
-			 * Unless we give a chance to it on this core the two threads seem to
-			 * hang in deadlock. If the new thread would make a system call on this
-			 * core we would be in trouble. For now, allow it, but in the future
-			 * we should have syscall channels for each thread instead of per core,
-			 * or we should multiplex syscall threads in mcexec */
+			/* Spin if not preemptable */
+			if (cpu_local_var(no_preempt) || !thread->tid) {
+				continue;
+			}
+			/* Spin by default, but if re-schedule is requested let
+			 * the other thread run */
 			runq_irqstate =
 				ihk_mc_spinlock_lock(&(get_this_cpu_local_var()->runq_lock));
 			v = get_this_cpu_local_var();
 			if (v->flags & CPU_FLAG_NEED_RESCHED) {
-				call_schedule = 1;
-				--thread->in_syscall_offload;
+				do_schedule = 1;
 			}
 			ihk_mc_spinlock_unlock(&v->runq_lock, runq_irqstate);
-			if (call_schedule) {
-				schedule();
-				++thread->in_syscall_offload;
+			if (!do_schedule) {
+				continue;
 			}
+			flags = cpu_disable_interrupt_save();
+			/* Try to sleep until notified */
+			if (__sync_bool_compare_and_swap(&res.req_thread_status,
+						IHK_SCD_REQ_THREAD_SPINNING,
+						IHK_SCD_REQ_THREAD_DESCHEDULED)) {
+				dkprintf("%s: tid %d waiting for syscall reply...\n",
+					__FUNCTION__, thread->tid);
+				waitq_init(&thread->scd_wq);
+				waitq_prepare_to_wait(&thread->scd_wq, &scd_wq_entry,
+					PS_INTERRUPTIBLE);
+				cpu_restore_interrupt(flags);
+				schedule();
+				waitq_finish_wait(&thread->scd_wq, &scd_wq_entry);
+			}
+			cpu_restore_interrupt(flags);
 		}
-		if (res->status == STATUS_PAGE_FAULT) {
+		if (res.status == STATUS_PAGE_FAULT) {
 			dkprintf("STATUS_PAGE_FAULT in syscall, pid: %d\n",
 				cpu_local_var(current)->proc->pid);
 			error = page_fault_process_vm(thread->vm,
-				(void *)res->fault_address,
-				res->fault_reason|PF_POPULATE);
+				(void *)res.fault_address,
+				res.fault_reason|PF_POPULATE);
 			/* send result */
 			req2.number = __NR_mmap;
#define PAGER_RESUME_PAGE_FAULT	0x0101
 			req2.args[0] = PAGER_RESUME_PAGE_FAULT;
 			req2.args[1] = error;
+			/* The current thread is the requester and only the waiting thread
+			 * may serve the request */
+			req2.rtid = cpu_local_var(current)->tid;
+			req2.ttid = res.stid;
-			send_syscall(&req2, cpu, pid);
+			res.req_thread_status = IHK_SCD_REQ_THREAD_SPINNING;
+			send_syscall(&req2, cpu, pid, &res);
 		}
 	}
 	dkprintf("%s: syscall num: %d got host reply: %d \n",
-		__FUNCTION__, req->number, res->ret);
+		__FUNCTION__, req->number, res.ret);
-	rc = res->ret;
+	rc = res.ret;
 	if(islock){
 		ihk_mc_spinlock_unlock(&syscall_lock, irqstate);
 	}
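
do_syscall() now spins on a stack-allocated response and only parks on thread->scd_wq after atomically moving req_thread_status from SPINNING to DESCHEDULED; the host performs the mirror-image CAS, so exactly one side wins and a reply can never slip through between deciding to sleep and sleeping. A user-space model of that handshake, with pthreads standing in for the waitq (the replier models the assumed host behavior sketched after syscall.h above):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

enum { SPINNING, TO_BE_WOKEN, DESCHEDULED };

static _Atomic int state = SPINNING;
static atomic_int done = 0;
static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;

static void *replier(void *arg)
{
	int expected = SPINNING;

	(void)arg;
	atomic_store(&done, 1);		/* publish the "reply" */
	/* If the waiter still spins, it will notice `done` by itself... */
	if (!atomic_compare_exchange_strong(&state, &expected, TO_BE_WOKEN)) {
		/* ...otherwise it parked: deliver an explicit wake-up */
		pthread_mutex_lock(&mtx);
		pthread_cond_signal(&cv);
		pthread_mutex_unlock(&mtx);
	}
	return NULL;
}

int main(void)
{
	pthread_t th;
	int expected = SPINNING;

	pthread_create(&th, NULL, replier, NULL);
	/* Waiter side: claim the right to sleep, then wait for the reply */
	if (atomic_compare_exchange_strong(&state, &expected, DESCHEDULED)) {
		pthread_mutex_lock(&mtx);
		while (!atomic_load(&done))
			pthread_cond_wait(&cv, &mtx);
		pthread_mutex_unlock(&mtx);
	}
	/* else: the replier already answered; no sleep needed */
	pthread_join(th, NULL);
	printf("reply seen: %d\n", atomic_load(&done));
	return 0;
}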
@@ -820,7 +828,8 @@ terminate(int rc, int sig)
release_thread(mythread); release_thread(mythread);
release_process_vm(vm); release_process_vm(vm);
schedule(); schedule();
// no return kprintf("%s: ERROR: returned from terminate() -> schedule()\n", __FUNCTION__);
panic("panic");
} }
void void
@@ -838,14 +847,15 @@ terminate_host(int pid)
 }

 void
-interrupt_syscall(int pid, int cpuid)
+interrupt_syscall(int pid, int tid)
 {
-	dkprintf("interrupt_syscall,target pid=%d,target cpuid=%d\n", pid, cpuid);
+	dkprintf("interrupt_syscall,target pid=%d,target tid=%d\n", pid, tid);
 	ihk_mc_user_context_t ctx;
 	long lerror;

+	kprintf("interrupt_syscall pid=%d tid=%d\n", pid, tid);
 	ihk_mc_syscall_arg0(&ctx) = pid;
-	ihk_mc_syscall_arg1(&ctx) = cpuid;
+	ihk_mc_syscall_arg1(&ctx) = tid;

 	lerror = syscall_generic_forwarding(__NR_kill, &ctx);
 	if (lerror) {
@@ -908,8 +918,6 @@ static int do_munmap(void *addr, size_t len)
 	begin_free_pages_pending();
 	error = remove_process_memory_range(cpu_local_var(current)->vm,
 			(intptr_t)addr, (intptr_t)addr+len, &ro_freed);
-	// XXX: TLB flush
-	flush_tlb();
 	if (error || !ro_freed) {
 		clear_host_pte((uintptr_t)addr, len);
 	}
@@ -921,6 +929,8 @@ static int do_munmap(void *addr, size_t len)
 		}
 	}
 	finish_free_pages_pending();
+	dkprintf("%s: 0x%lx:%lu, error: %ld\n",
+		__FUNCTION__, addr, len, error);
 	return error;
 }
@@ -1068,25 +1078,18 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot,
 	vrflags |= PROT_TO_VR_FLAG(prot);
 	vrflags |= (flags & MAP_PRIVATE)? VR_PRIVATE: 0;
 	vrflags |= (flags & MAP_LOCKED)? VR_LOCKED: 0;
+	vrflags |= VR_DEMAND_PAGING;
 	if (flags & MAP_ANONYMOUS) {
-		if (0) {
-			/* dummy */
+		if (!anon_on_demand) {
+			populated_mapping = 1;
 		}
 #ifdef USE_NOCACHE_MMAP
 #define X_MAP_NOCACHE MAP_32BIT
 		else if (flags & X_MAP_NOCACHE) {
+			vrflags &= ~VR_DEMAND_PAGING;
 			vrflags |= VR_IO_NOCACHE;
 		}
 #endif
-		else {
-			vrflags |= VR_DEMAND_PAGING;
-			if (!anon_on_demand) {
-				populated_mapping = 1;
-			}
-		}
-	}
-	else {
-		vrflags |= VR_DEMAND_PAGING;
 	}

 	if (flags & (MAP_POPULATE | MAP_LOCKED)) {
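
With this restructuring, VR_DEMAND_PAGING is set unconditionally up front and only cleared again for nocache I/O mappings, while anonymous mappings request up-front population unless anon_on_demand is enabled. A condensed sketch of the resulting decision follows; the flag values are made up for illustration (the real VR_* and MAP_* constants live in the McKernel and Linux headers).

#include <assert.h>

/* Made-up flag values, for illustration only. */
#define VR_DEMAND_PAGING	0x1UL
#define VR_IO_NOCACHE		0x2UL
#define MAP_ANONYMOUS		0x20
#define X_MAP_NOCACHE		0x40	/* MAP_32BIT, reused under USE_NOCACHE_MMAP */

static int anon_on_demand;

/* Mirrors the rewritten logic: demand paging is the default for every
 * mapping; anonymous mappings are populated up front unless
 * anon_on_demand is set; and, because of the inherited else-if chain,
 * the nocache branch (which drops demand paging again) is only
 * reachable when anon_on_demand is set. */
static unsigned long vr_flags(int flags, int *populated)
{
	unsigned long vrflags = VR_DEMAND_PAGING;

	*populated = 0;
	if (flags & MAP_ANONYMOUS) {
		if (!anon_on_demand)
			*populated = 1;
		else if (flags & X_MAP_NOCACHE) {
			vrflags &= ~VR_DEMAND_PAGING;
			vrflags |= VR_IO_NOCACHE;
		}
	}
	return vrflags;
}

int main(void)
{
	int populated;

	assert(vr_flags(MAP_ANONYMOUS, &populated) & VR_DEMAND_PAGING);
	assert(populated == 1);	/* anonymous mappings populated by default */
	return 0;
}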
@@ -1162,6 +1165,8 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot,
 		error = -ENOMEM;
 		goto out;
 	}
+	dkprintf("%s: 0x%x:%lu allocated %d pages, p2align: %lx\n",
+		__FUNCTION__, addr, len, npages, p2align);
 	phys = virt_to_phys(p);
 }
 else if (flags & MAP_SHARED) {
@@ -1197,10 +1202,10 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot,
 	error = add_process_memory_range(thread->vm, addr, addr+len, phys,
 			vrflags, memobj, off, pgshift);
 	if (error) {
-		ekprintf("do_mmap:add_process_memory_range"
-			"(%p,%lx,%lx,%lx,%lx,%d) failed %d\n",
-			thread->vm, addr, addr+len,
-			virt_to_phys(p), vrflags, pgshift, error);
+		kprintf("%s: add_process_memory_range failed for 0x%lx:%lu"
+			" flags: %lx, vrflags: %lx, pgshift: %d, error: %d\n",
+			__FUNCTION__, addr, addr+len,
+			flags, vrflags, pgshift, error);
 		goto out;
 	}
@@ -1246,8 +1251,12 @@ out:
 	if (memobj) {
 		memobj_release(memobj);
 	}
-	dkprintf("do_mmap(%lx,%lx,%x,%x,%d,%lx): %ld %lx\n",
-		addr0, len0, prot, flags, fd, off0, error, addr);
+	dkprintf("%s: 0x%lx:%8lu, (req: 0x%lx:%lu), prot: %x, flags: %x, "
+		"fd: %d, off: %lu, error: %ld, addr: 0x%lx\n",
+		__FUNCTION__,
+		addr, len, addr0, len0, prot, flags,
+		fd, off0, error, addr);
 	return (!error)? addr: error;
 }
@@ -1478,8 +1487,8 @@ SYSCALL_DECLARE(getppid)
 	return thread->proc->ppid_parent->pid;
 }

-void
-settid(struct thread *thread, int mode, int newcpuid, int oldcpuid)
+void settid(struct thread *thread, int mode, int newcpuid, int oldcpuid,
+	int nr_tids, int *tids)
 {
 	struct syscall_request request IHK_DMA_ALIGN;
 	unsigned long rc;
@@ -1489,6 +1498,12 @@ settid(struct thread *thread, int mode, int newcpuid, int oldcpuid)
 	request.args[1] = thread->proc->pid;
 	request.args[2] = newcpuid;
 	request.args[3] = oldcpuid;
+	/*
+	 * If nr_tids is non-zero, tids should point to an array of ints
+	 * where the thread ids of the mcexec process are expected.
+	 */
+	request.args[4] = nr_tids;
+	request.args[5] = virt_to_phys(tids);

 	rc = do_syscall(&request, ihk_mc_get_processor_id(), thread->proc->pid);
 	if (mode != 2) {
 		thread->tid = rc;
@@ -1893,7 +1908,61 @@ unsigned long do_fork(int clone_flags, unsigned long newsp,
 			&new->vm->address_space->cpu_set_lock);

 	if (clone_flags & CLONE_VM) {
-		settid(new, 1, cpuid, -1);
+		int *tids = NULL;
+		int i;
+		struct mcs_rwlock_node_irqsave lock;
+
+		mcs_rwlock_writer_lock(&newproc->threads_lock, &lock);
+
+		/* Obtain mcexec TIDs if not known yet */
+		if (!newproc->nr_tids) {
+			tids = kmalloc(sizeof(int) * num_processors, IHK_MC_AP_NOWAIT);
+			if (!tids) {
+				mcs_rwlock_writer_unlock(&newproc->threads_lock, &lock);
+				release_cpuid(cpuid);
+				return -ENOMEM;
+			}
+
+			newproc->tids = kmalloc(sizeof(struct mcexec_tid) * num_processors, IHK_MC_AP_NOWAIT);
+			if (!newproc->tids) {
+				mcs_rwlock_writer_unlock(&newproc->threads_lock, &lock);
+				kfree(tids);
+				release_cpuid(cpuid);
+				return -ENOMEM;
+			}
+
+			settid(new, 1, cpuid, -1, num_processors, tids);
+
+			for (i = 0; (i < num_processors) && tids[i]; ++i) {
+				dkprintf("%s: tid[%d]: %d\n", __FUNCTION__, i, tids[i]);
+				newproc->tids[i].tid = tids[i];
+				newproc->tids[i].thread = NULL;
+				++newproc->nr_tids;
+			}
+			kfree(tids);
+		}
+
+		/* Find an unused TID */
+retry_tid:
+		for (i = 0; i < newproc->nr_tids; ++i) {
+			if (!newproc->tids[i].thread) {
+				if (!__sync_bool_compare_and_swap(
+						&newproc->tids[i].thread, NULL, new)) {
+					goto retry_tid;
+				}
+				new->tid = newproc->tids[i].tid;
+				dkprintf("%s: tid %d assigned to %p\n", __FUNCTION__, new->tid, new);
+				break;
+			}
+		}
+
+		/* TODO: spawn more mcexec threads */
+		if (!new->tid) {
+			kprintf("%s: no more TIDs available\n", __FUNCTION__);
+			panic("");
+		}
+
+		mcs_rwlock_writer_unlock(&newproc->threads_lock, &lock);
 	}
 	/* fork() a new process on the host */
 	else {
@@ -1913,7 +1982,7 @@ unsigned long do_fork(int clone_flags, unsigned long newsp,
 	}

 	/* In a single threaded process TID equals to PID */
-	settid(new, 0, cpuid, -1);
+	new->tid = newproc->pid;
 	new->vm->address_space->pids[0] = new->proc->pid;

 	dkprintf("fork(): new pid: %d\n", new->proc->pid);
@@ -5712,6 +5781,10 @@ SYSCALL_DECLARE(sched_setaffinity)
 	int empty_set = 1;
 	extern int num_processors;

+	if (!u_cpu_set) {
+		return -EINVAL;
+	}
+
 	if (sizeof(k_cpu_set) > len) {
 		memset(&k_cpu_set, 0, sizeof(k_cpu_set));
 	}
@@ -5719,7 +5792,7 @@ SYSCALL_DECLARE(sched_setaffinity)
 	len = MIN2(len, sizeof(k_cpu_set));
 	if (copy_from_user(&k_cpu_set, u_cpu_set, len)) {
-		kprintf("%s: error: copy_from_user failed for %p:%d\n", __FUNCTION__, u_cpu_set, len);
+		dkprintf("%s: error: copy_from_user failed for %p:%d\n", __FUNCTION__, u_cpu_set, len);
 		return -EFAULT;
 	}
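
Together, the two sched_setaffinity() hunks form the usual pattern for reading a variable-length user buffer: reject a NULL pointer, zero-fill the kernel copy when the user buffer is shorter, clamp the copy length, and return -EFAULT if the copy itself faults. A sketch of that sequence, with copy_from_user() stubbed by memcpy and an illustrative set type (neither is from the source):

#include <errno.h>
#include <stddef.h>
#include <string.h>

#define MIN2(x, y) ((x) < (y) ? (x) : (y))

typedef struct { unsigned long __bits[16]; } k_cpu_set_t;	/* illustrative */

/* Stub standing in for the kernel's copy_from_user(); 0 on success. */
static int copy_from_user(void *dst, const void *src, size_t len)
{
	memcpy(dst, src, len);
	return 0;
}

/* The validation sequence added to sched_setaffinity(): reject NULL,
 * zero-fill when the user buffer is shorter than the kernel set,
 * clamp the copy length, and report -EFAULT on a failed copy. */
static long read_cpu_set(k_cpu_set_t *kset, const void *u_cpu_set, size_t len)
{
	if (!u_cpu_set)
		return -EINVAL;
	if (sizeof(*kset) > len)
		memset(kset, 0, sizeof(*kset));
	len = MIN2(len, sizeof(*kset));
	if (copy_from_user(kset, u_cpu_set, len))
		return -EFAULT;
	return 0;
}

int main(void)
{
	k_cpu_set_t kset;
	unsigned long user_mask = 0x1;	/* CPU 0 only */

	return read_cpu_set(&kset, &user_mask, sizeof(user_mask)) ? 1 : 0;
}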


@@ -75,7 +75,7 @@ sysfs_createf(struct sysfs_ops *ops, void *instance, int mode,
dkprintf("sysfs_createf(%p,%p,%#o,%s,...)\n", dkprintf("sysfs_createf(%p,%p,%#o,%s,...)\n",
ops, instance, mode, fmt); ops, instance, mode, fmt);
param = allocate_pages(1, IHK_MC_AP_NOWAIT); param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);
if (!param) { if (!param) {
error = -ENOMEM; error = -ENOMEM;
ekprintf("sysfs_createf:allocate_pages failed. %d\n", error); ekprintf("sysfs_createf:allocate_pages failed. %d\n", error);
@@ -134,7 +134,7 @@ sysfs_createf(struct sysfs_ops *ops, void *instance, int mode,
 	error = 0;
 out:
 	if (param) {
-		free_pages(param, 1);
+		ihk_mc_free_pages(param, 1);
 	}
 	if (error) {
 		ekprintf("sysfs_createf(%p,%p,%#o,%s,...): %d\n",
@@ -156,7 +156,7 @@ sysfs_mkdirf(sysfs_handle_t *dirhp, const char *fmt, ...)
dkprintf("sysfs_mkdirf(%p,%s,...)\n", dirhp, fmt); dkprintf("sysfs_mkdirf(%p,%s,...)\n", dirhp, fmt);
param = allocate_pages(1, IHK_MC_AP_NOWAIT); param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);
if (!param) { if (!param) {
error = -ENOMEM; error = -ENOMEM;
ekprintf("sysfs_mkdirf:allocate_pages failed. %d\n", error); ekprintf("sysfs_mkdirf:allocate_pages failed. %d\n", error);
@@ -208,7 +208,7 @@ sysfs_mkdirf(sysfs_handle_t *dirhp, const char *fmt, ...)
 out:
 	if (param) {
-		free_pages(param, 1);
+		ihk_mc_free_pages(param, 1);
 	}
 	if (error) {
 		ekprintf("sysfs_mkdirf(%p,%s,...): %d\n", dirhp, fmt, error);
@@ -229,7 +229,7 @@ sysfs_symlinkf(sysfs_handle_t targeth, const char *fmt, ...)
dkprintf("sysfs_symlinkf(%#lx,%s,...)\n", targeth.handle, fmt); dkprintf("sysfs_symlinkf(%#lx,%s,...)\n", targeth.handle, fmt);
param = allocate_pages(1, IHK_MC_AP_NOWAIT); param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);
if (!param) { if (!param) {
error = -ENOMEM; error = -ENOMEM;
ekprintf("sysfs_symlinkf:allocate_pages failed. %d\n", error); ekprintf("sysfs_symlinkf:allocate_pages failed. %d\n", error);
@@ -279,7 +279,7 @@ sysfs_symlinkf(sysfs_handle_t targeth, const char *fmt, ...)
 	error = 0;
 out:
 	if (param) {
-		free_pages(param, 1);
+		ihk_mc_free_pages(param, 1);
 	}
 	if (error) {
 		ekprintf("sysfs_symlinkf(%#lx,%s,...): %d\n",
@@ -301,7 +301,7 @@ sysfs_lookupf(sysfs_handle_t *objhp, const char *fmt, ...)
dkprintf("sysfs_lookupf(%p,%s,...)\n", objhp, fmt); dkprintf("sysfs_lookupf(%p,%s,...)\n", objhp, fmt);
param = allocate_pages(1, IHK_MC_AP_NOWAIT); param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);
if (!param) { if (!param) {
error = -ENOMEM; error = -ENOMEM;
ekprintf("sysfs_lookupf:allocate_pages failed. %d\n", error); ekprintf("sysfs_lookupf:allocate_pages failed. %d\n", error);
@@ -353,7 +353,7 @@ sysfs_lookupf(sysfs_handle_t *objhp, const char *fmt, ...)
 out:
 	if (param) {
-		free_pages(param, 1);
+		ihk_mc_free_pages(param, 1);
 	}
 	if (error) {
 		ekprintf("sysfs_lookupf(%p,%s,...): %d\n", objhp, fmt, error);
@@ -374,7 +374,7 @@ sysfs_unlinkf(int flags, const char *fmt, ...)
dkprintf("sysfs_unlinkf(%#x,%s,...)\n", flags, fmt); dkprintf("sysfs_unlinkf(%#x,%s,...)\n", flags, fmt);
param = allocate_pages(1, IHK_MC_AP_NOWAIT); param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);
if (!param) { if (!param) {
error = -ENOMEM; error = -ENOMEM;
ekprintf("sysfs_unlinkf:allocate_pages failed. %d\n", error); ekprintf("sysfs_unlinkf:allocate_pages failed. %d\n", error);
@@ -423,7 +423,7 @@ sysfs_unlinkf(int flags, const char *fmt, ...)
 	error = 0;
 out:
 	if (param) {
-		free_pages(param, 1);
+		ihk_mc_free_pages(param, 1);
 	}
 	if (error) {
 		ekprintf("sysfs_unlinkf(%#x,%s,...): %d\n", flags, fmt, error);
@@ -601,14 +601,14 @@ sysfs_init(void)
 	}

 	sysfs_data_bufsize = PAGE_SIZE;
-	sysfs_data_buf = allocate_pages(1, IHK_MC_AP_NOWAIT);
+	sysfs_data_buf = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);
 	if (!sysfs_data_buf) {
 		error = -ENOMEM;
 		ekprintf("sysfs_init:allocate_pages(buf) failed. %d\n", error);
 		goto out;
 	}

-	param = allocate_pages(1, IHK_MC_AP_NOWAIT);
+	param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);
 	if (!param) {
 		error = -ENOMEM;
 		ekprintf("sysfs_init:allocate_pages(param) failed. %d\n",
@@ -644,7 +644,7 @@ sysfs_init(void)
 	error = 0;
 out:
 	if (param) {
-		free_pages(param, 1);
+		ihk_mc_free_pages(param, 1);
 	}
 	if (error) {
 		ekprintf("sysfs_init(): %d\n", error);


@@ -172,6 +172,10 @@ static int zeroobj_get_page(struct memobj *memobj, off_t off, int p2align,
 	struct zeroobj *obj = to_zeroobj(memobj);
 	struct page *page;

+	/* Don't bother about zero page, page fault handler will
+	 * allocate and clear pages */
+	return 0;
+
 	dkprintf("zeroobj_get_page(%p,%#lx,%d,%p)\n",
 			memobj, off, p2align, physp);
 	if (off & ~PAGE_MASK) {


@@ -103,7 +103,7 @@ void ihk_mc_clean_micpa(void);
 void *ihk_mc_alloc_aligned_pages(int npages, int p2align, enum ihk_mc_ap_flag flag);
 void *ihk_mc_alloc_pages(int npages, enum ihk_mc_ap_flag flag);
 void ihk_mc_free_pages(void *p, int npages);
-void *ihk_mc_allocate(int size, enum ihk_mc_ap_flag flag);
+void *ihk_mc_allocate(int size, int flag);
 void ihk_mc_free(void *p);
 void *arch_alloc_page(enum ihk_mc_ap_flag flag);